Skip to content

Commit 04d5135

Browse files
authored
Write ASpace XLSX by default
2 parents 5c57138 + 81da3cb commit 04d5135

File tree

9 files changed

+257
-9
lines changed

9 files changed

+257
-9
lines changed

README.md

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,17 @@
22

33
Analyze disk images and/or create ready-to-ingest SIPs from a directory of disk images and related files.
44

5-
Version: 1.1.1
5+
Version: 1.2.0
6+
7+
## Breaking Changes
8+
9+
Starting in v1.2.0, diskimageprocessor.py and the Processing mode of the GUI populate an ArchivesSpace description import XLSX instead of the previous ISAD-based CSV.
10+
11+
To have Disk Image Processor create the original ISAD-based description CSV instead, use the `-c` or `--csv` option. (GUI support has not yet been added; for now, use version 1.1.0 or before from the Releases tab for a GUI that writes the description CSV.)
612

713
## Usage
814

9-
Disk Image Processor has two modes: Analysis and Processing. Each mode can be run from the GUI interface or as a separate CLI utility by calling the underlying Python 3 script.
15+
Disk Image Processor has two modes: Analysis and Processing. Each mode can be run from the GUI interface or as a separate CLI utility by calling the underlying Python 3 script.
1016

1117
### Analysis
1218

@@ -48,7 +54,12 @@ For HFS file systems, files are exported from the disk image using CLI version o
4854

4955
For UDF file systems, files are copied from the mounted disk image and `walk_to_dfxml.py` is used to generate DFXML.
5056

51-
When complete, a "description.csv" spreadsheet is created containing some pre-populated archival description:
57+
When complete, a description spreadsheet will be created containing some pre-populated archival description.
58+
59+
From v1.2.0, Disk Image Processor will write this information into an ArchivesSpace description XLSX spreadsheet.
60+
61+
In previous versions or if the `"-c"/"--csv"` option is passed in v1.2.0+, a description.csv file will be created instead, containing the following columns:
62+
5263
* Date statement
5364
* Date begin
5465
* Date end
29.1 KB
Binary file not shown.

diskimageprocessor.py

Lines changed: 187 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,10 @@
2929
import datetime
3030
import itertools
3131
import logging
32+
import openpyxl
3233
import os
3334
import shutil
35+
import stat
3436
import subprocess
3537
import sys
3638
import time
@@ -256,6 +258,174 @@ def create_spreadsheet(args, sips, volumes, logger):
256258
logger.info("Description CSV created.")
257259

258260

261+
def create_aspace_excel_sheet(args, sips, volumes, logger):
    """Create new copy of ASpace XLSX and append rows describing disk images.

    The bundled ArchivesSpace import template is copied to
    ``<args.destination>/description.xlsx`` and one row is appended to its
    "Data" worksheet for every SIP directory found in ``sips``.

    Args:
        args: Parsed CLI arguments; ``args.destination`` and ``args.bagfiles``
            are read here.
        sips: Path of the directory that contains one sub-directory per SIP.
        volumes: Mapping of SIP directory name to a list of volume dicts,
            each of which has a ``"file_system"`` key.
        logger: Logger used for progress and error messages.

    Returns:
        None. If the template cannot be copied the error is logged and the
        function returns without writing anything, so that a stale
        description.xlsx left over from an earlier run is never appended to.
    """
    # Zero-based column positions within a row of the template's "Data"
    # worksheet, and the total number of columns in that template.
    TEMPLATE_COLUMN_COUNT = 173
    INDEX_FILENAME = 6
    INDEX_LEVEL_OF_DESCRIPTION = 8
    INDEX_DATE_START = 23
    INDEX_DATE_END = 24
    INDEX_EXTENT_NUMBER = 34
    INDEX_EXTENT_TYPE = 35
    INDEX_SIZE = 36
    INDEX_SCOPE_CONTENTS = 170

    xlsx_path = os.path.abspath(os.path.join(args.destination, "description.xlsx"))
    template_path = os.path.abspath(
        os.path.join(THIS_DIR, "aspace_template", "aspace_import_template.xlsx")
    )

    try:
        shutil.copyfile(template_path, xlsx_path)
    except OSError as err:
        logger.error(f"Unable to copy ASpace template to destination: {err}")
        # Without a fresh copy of the template there is nothing valid to
        # populate, so stop here rather than failing later on a missing or
        # stale workbook.
        return

    # Set ASpace file permissions (rw-rw-r--). Failure is not fatal: the
    # copied file is still writable by the current user.
    try:
        os.chmod(
            xlsx_path,
            stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IWGRP | stat.S_IROTH,
        )
    except OSError as err:
        logger.error(f"Error setting permissions: {err}")

    workbook = openpyxl.load_workbook(filename=xlsx_path)
    worksheet = workbook["Data"]

    # When SIPs are bagged, the metadata directory sits under "data/".
    metadata_parts = ("data", "metadata") if args.bagfiles else ("metadata",)

    # TODO: Deduplicate with create_spreadsheet
    # Maybe create separate method that creates dict with info, and handle
    # opening/writing csv or xlsx separately
    for item in sorted(os.listdir(sips)):
        sip_path = os.path.join(sips, item)

        if not os.path.isdir(sip_path):
            continue

        disk_volumes = volumes[item]
        number_volumes = len(disk_volumes)

        subdoc_dir = os.path.join(
            sip_path, *metadata_parts, "submissionDocumentation"
        )

        # Get and sum information from all DFXML files generated
        dfxml_files = [
            os.path.join(root, file)
            for root, _, files in os.walk(subdoc_dir)
            for file in files
            if file.startswith("dfxml")
        ]

        dfxml_files_info = []
        for dfxml_file in dfxml_files:
            dfxml_info = _parse_dfxml(dfxml_file, logger)
            if not dfxml_info:
                logger.warning(
                    "No fileobjects in DFXML file {} - possibly file system fiwalk doesn't recognize".format(
                        dfxml_file
                    )
                )
                continue
            dfxml_files_info.append(dfxml_info)

        file_count = sum(dfxml_info["files"] for dfxml_info in dfxml_files_info)
        total_bytes = sum(dfxml_info["bytes"] for dfxml_info in dfxml_files_info)

        # Deduplicate file systems while preserving first-seen order
        file_systems = list(
            dict.fromkeys(volume["file_system"] for volume in disk_volumes)
        )
        file_systems_str = ", ".join(file_systems)

        date_earliest = ""
        date_latest = ""
        for dfxml_info in dfxml_files_info:
            if not date_earliest or dfxml_info["date_earliest"] < date_earliest:
                date_earliest = dfxml_info["date_earliest"]
            if not date_latest or dfxml_info["date_latest"] > date_latest:
                date_latest = dfxml_info["date_latest"]

        # One empty string for each of the template's columns
        row_to_write = [""] * TEMPLATE_COLUMN_COUNT

        # Fields that are always constant
        row_to_write[INDEX_FILENAME] = item
        row_to_write[INDEX_LEVEL_OF_DESCRIPTION] = "File"

        if file_count == 0:
            # Still record the SIP so the gap is visible in the spreadsheet
            row_to_write[
                INDEX_SCOPE_CONTENTS
            ] = "Error gathering statistics from SIP directory"
            worksheet.append(row_to_write)
            logger.error("Unable to read DFXML files for {}".format(sip_path))
            continue

        # Get file formats from Brunnhilde
        file_formats = []
        file_format_csv = os.path.join(
            subdoc_dir, "brunnhilde", "csv_reports", "formats.csv"
        )

        try:
            with open(file_format_csv, "r") as f:
                reader = csv.reader(f)
                next(reader)  # skip header row
                # Only the first five data rows of the report are listed
                for row in itertools.islice(reader, 5):
                    file_formats.append(row[0])
        except (OSError, UnicodeError, csv.Error, StopIteration, IndexError):
            # Missing, unreadable, empty or malformed report: describe the
            # SIP anyway and flag the problem in the scope and content note.
            file_formats.append(
                "ERROR! No Brunnhilde formats.csv file to pull formats from."
            )

        file_formats = [element or "Unidentified" for element in file_formats]
        file_formats_str = ", ".join(file_formats)

        if number_volumes > 1:
            scope_content = "Files exported from {} volumes with file systems: {}. File formats: {}".format(
                number_volumes, file_systems_str, file_formats_str
            )
        else:
            scope_content = (
                "Files exported from {} file system volume. File formats: {}".format(
                    disk_volumes[0]["file_system"], file_formats_str
                )
            )

        # Only the first four characters (the year) of each date are written
        row_to_write[INDEX_DATE_START] = str(date_earliest[:4])
        row_to_write[INDEX_DATE_END] = str(date_latest[:4])
        row_to_write[INDEX_EXTENT_NUMBER] = str(file_count)
        row_to_write[INDEX_EXTENT_TYPE] = "digital files"
        row_to_write[INDEX_SIZE] = str(human_readable_size(total_bytes))
        row_to_write[INDEX_SCOPE_CONTENTS] = scope_content

        worksheet.append(row_to_write)

        logger.info("Described %s successfully." % (sip_path))

    workbook.save(filename=xlsx_path)
    workbook.close()

    logger.info("ArchivesSpace description XLSX created.")
427+
428+
259429
def _parse_dfxml(dfxml_path, logger, export_all=False):
260430
"""Parse DFXML and return dict of information for spreadsheet."""
261431
volume_info = {
@@ -423,6 +593,12 @@ def _make_parser():
423593
help="Export AppleDouble resource forks from HFS-formatted disks",
424594
action="store_true",
425595
)
596+
parser.add_argument(
597+
"-c",
598+
"--csv",
599+
help="Write description CSV (old default) instead of ArchivesSpace XLSX",
600+
action="store_true",
601+
)
426602
parser.add_argument("--quiet", action="store_true", help="Write only errors to log")
427603
parser.add_argument(
428604
"source", help="Source directory containing disk images (and related files)"
@@ -563,8 +739,17 @@ def main():
563739
shell=True,
564740
)
565741

566-
# write description CSV
567-
create_spreadsheet(args, sips, volumes, logger)
742+
# write description
743+
if args.csv:
744+
try:
745+
create_spreadsheet(args, sips, volumes, logger)
746+
except Exception as err:
747+
logger.error(f"Error creating description csv: {err}")
748+
else:
749+
try:
750+
create_aspace_excel_sheet(args, sips, volumes, logger)
751+
except Exception as err:
752+
logger.error(f"Error creating ArchivesSpace description xlsx: {err}")
568753

569754
# print unprocessed list
570755
if unprocessed:

install-bc2-ubuntu18.sh

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,11 @@ sudo cp LICENSE $dip_dir
4141
sudo cp README.md $dip_dir
4242
sudo cp -r disk_image_toolkit/ $dip_dir
4343

44+
if [ ! -d $dip_dir/aspace_template ]; then
45+
sudo mkdir $dip_dir/aspace_template
46+
fi
47+
sudo cp aspace_template/aspace_import_template.xlsx $dip_dir/aspace_template
48+
4449
if [ ! -d $dip_dir/disk_image_toolkit/dfxml ]; then
4550
sudo mkdir $dip_dir/disk_image_toolkit/dfxml/
4651
fi

install.sh

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@
22

33
### Install script for CCA Disk Image Processor in Bitcurator 4/Ubuntu 22
44

5+
sudo python3 -m pip install pyqt5
6+
sudo python3 -m pip install -r requirements/base.txt
7+
58
if [ ! -d /usr/share/ccatools ]; then
69
sudo mkdir /usr/share/ccatools
710
fi
@@ -30,6 +33,11 @@ sudo cp LICENSE $dip_dir
3033
sudo cp README.md $dip_dir
3134
sudo cp -r disk_image_toolkit/ $dip_dir
3235

36+
if [ ! -d $dip_dir/aspace_template ]; then
37+
sudo mkdir $dip_dir/aspace_template
38+
fi
39+
sudo cp aspace_template/aspace_import_template.xlsx $dip_dir/aspace_template
40+
3341
if [ ! -d $dip_dir/disk_image_toolkit/dfxml ]; then
3442
sudo mkdir $dip_dir/disk_image_toolkit/dfxml/
3543
fi

main.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ def about_dialog(self):
4545
QMessageBox.information(
4646
self,
4747
"About",
48-
"Disk Image Processor v1.1.1\nCanadian Centre for Architecture\nDeveloper: Tessa Walsh\n2018-2023\nMIT License\nhttps://github.com/CCA-Public/cca-diskimageprocessor",
48+
"Disk Image Processor v1.2.0\nCanadian Centre for Architecture\nDeveloper: Tessa Walsh\n2018-2023\nMIT License\nhttps://github.com/CCA-Public/cca-diskimageprocessor",
4949
)
5050

5151
def browse_analysis_source(self):

requirements/base.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
bagit>=1.7.0
22
brunnhilde>=1.9.1
3+
openpyxl>=3.1.2

test-install.sh

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,11 @@ sudo cp disk_image_toolkit/dfxml/dfxml.py /usr/share/ccatools/diskimageprocessor
3535
sudo cp disk_image_toolkit/dfxml/objects.py /usr/share/ccatools/diskimageprocessor
3636
sudo cp disk_image_toolkit/dfxml/walk_to_dfxml.py /usr/share/ccatools/diskimageprocessor
3737

38+
if [ ! -d /usr/share/ccatools/diskimageprocessor/aspace_template ]; then
39+
sudo mkdir /usr/share/ccatools/diskimageprocessor/aspace_template
40+
fi
41+
sudo cp aspace_template/aspace_import_template.xlsx /usr/share/ccatools/diskimageprocessor/aspace_template
42+
3843
sudo cp disk_image_toolkit/dfxml/dfxml.py .
3944
sudo cp disk_image_toolkit/dfxml/objects.py .
4045
sudo cp disk_image_toolkit/dfxml/walk_to_dfxml.py .

0 commit comments

Comments
 (0)