Write ASpace XLSX by default

tw4l · web-flow · commit 04d51356f385 · 2023-08-06T12:27:25.000-04:00
diff --git a/README.md b/README.md
@@ -2,11 +2,17 @@
 
 Analyze disk images and/or create ready-to-ingest SIPs from a directory of disk images and related files.  
 
-Version: 1.1.1
+Version: 1.2.0
+
+## Breaking Changes
+
+Starting in v1.2.0, diskimageprocessor.py and the Processing mode of the GUI populate an ArchivesSpace description import XLSX instead of the previous ISAD-based CSV.
+
+To have Disk Image Processor create the original ISAD-based description CSV instead, use the `-c` or `--csv` option. (GUI support for this option has not yet been added; for now, use version 1.1.0 or earlier from the Releases tab for a GUI that writes the description CSV.)
 
 ## Usage
 
-Disk Image Processor has two modes: Analysis and Processing. Each mode can be run from the GUI interface or as a separate CLI utility by calling the underlying Python 3 script.  
+Disk Image Processor has two modes: Analysis and Processing. Each mode can be run from the GUI interface or as a separate CLI utility by calling the underlying Python 3 script.
 
 ### Analysis
 
@@ -48,7 +54,12 @@ For HFS file systems, files are exported from the disk image using CLI version o
 
 For UDF file systems, files are copied from the mounted disk image and `walk_to_dfxml.py` is used to generate DFXML.
 
-When complete, a "description.csv" spreadsheet is created containing some pre-populated archival description:  
+When complete, a description spreadsheet will be created containing some pre-populated archival description.
+
+From v1.2.0, Disk Image Processor will write this information into an ArchivesSpace description XLSX spreadsheet.
+
+In previous versions or if the `"-c"/"--csv"` option is passed in v1.2.0+, a description.csv file will be created instead, containing the following columns:
+
 * Date statement  
 * Date begin  
 * Date end  
diff --git a/aspace_template/aspace_import_template.xlsx b/aspace_template/aspace_import_template.xlsx
diff --git a/diskimageprocessor.py b/diskimageprocessor.py
@@ -29,8 +29,10 @@
 import datetime
 import itertools
 import logging
+import openpyxl
 import os
 import shutil
+import stat
 import subprocess
 import sys
 import time
@@ -256,6 +258,174 @@ def create_spreadsheet(args, sips, volumes, logger):
     logger.info("Description CSV created.")
 
 
+def create_aspace_excel_sheet(args, sips, volumes, logger):
+    """Create new copy of ASpace XLSX and append rows describing disk images."""
+    xlsx_path = os.path.abspath(os.path.join(args.destination, "description.xlsx"))
+    template_path = os.path.abspath(
+        os.path.join(THIS_DIR, "aspace_template", "aspace_import_template.xlsx")
+    )
+
+    try:
+        shutil.copyfile(template_path, xlsx_path)
+    except OSError as err:
+        logger.error(f"Unable to copy ASpace template to destination: {err}")
+
+    # Set ASpace file permissions
+    try:
+        os.chmod(
+            xlsx_path,
+            stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IWGRP | stat.S_IROTH,
+        )
+    except OSError as err:
+        logger.error(f"Error setting permissions: {err}")
+
+    workbook = openpyxl.load_workbook(filename=xlsx_path)
+    worksheet = workbook["Data"]
+
+    # TODO: Deduplicate with create_spreadsheet
+    # Maybe create separate method that creates dict with info, and handle
+    # opening/writing csv or xlsx separately
+    for item in sorted(os.listdir(sips)):
+        sip_path = os.path.join(sips, item)
+
+        if not os.path.isdir(sip_path):
+            continue
+
+        disk_volumes = volumes[item]
+        number_volumes = len(disk_volumes)
+
+        date_earliest = ""
+        date_latest = ""
+
+        # Get and sum information from all DFXML files generated
+        dfxml_files = []
+        subdoc_dir = os.path.join(sip_path, "metadata", "submissionDocumentation")
+        if args.bagfiles:
+            subdoc_dir = os.path.join(
+                sip_path, "data", "metadata", "submissionDocumentation"
+            )
+        for root, _, files in os.walk(subdoc_dir):
+            for file in files:
+                if file.startswith("dfxml"):
+                    dfxml_files.append(os.path.join(root, file))
+
+        dfxml_files_info = []
+        for dfxml_file in dfxml_files:
+            dfxml_info = _parse_dfxml(dfxml_file, logger)
+            if not dfxml_info:
+                logger.warning(
+                    "No fileobjects in DFXML file {} - possibly file system fiwalk doesn't recognize".format(
+                        dfxml_file
+                    )
+                )
+                continue
+            dfxml_files_info.append(dfxml_info)
+
+        file_count = sum([dfxml_info["files"] for dfxml_info in dfxml_files_info])
+        total_bytes = sum([dfxml_info["bytes"] for dfxml_info in dfxml_files_info])
+        file_systems = [volume["file_system"] for volume in disk_volumes]
+        # Deduplicate list
+        file_systems = list(dict.fromkeys(file_systems))
+        file_systems_str = ", ".join(file_systems)
+
+        for dfxml_info in dfxml_files_info:
+            if not date_earliest or dfxml_info["date_earliest"] < date_earliest:
+                date_earliest = dfxml_info["date_earliest"]
+            if not date_latest or dfxml_info["date_latest"] > date_latest:
+                date_latest = dfxml_info["date_latest"]
+
+        # Create list with empty string for each of template's columns
+        row_to_write = []
+        for _ in range(173):
+            row_to_write.append("")
+
+        # Row indices for fields to write
+        INDEX_FILENAME = 6
+        INDEX_LEVEL_OF_DESCRIPTION = 8
+        INDEX_DATE_START = 23
+        INDEX_DATE_END = 24
+        INDEX_EXTENT_NUMBER = 34
+        INDEX_EXTENT_TYPE = 35
+        INDEX_SIZE = 36
+        INDEX_SCOPE_CONTENTS = 170
+
+        # Fields that are always constant
+        row_to_write[INDEX_FILENAME] = item
+        row_to_write[INDEX_LEVEL_OF_DESCRIPTION] = "File"
+
+        if file_count == 0:
+            row_to_write[
+                INDEX_SCOPE_CONTENTS
+            ] = "Error gathering statistics from SIP directory"
+
+            worksheet.append(row_to_write)
+
+            logger.error("Unable to read DFXML files for {}".format(sip_path))
+            continue
+
+        # Get file formats from Brunnhilde
+        file_formats = []
+        file_format_csv = os.path.join(
+            sip_path,
+            "metadata",
+            "submissionDocumentation",
+            "brunnhilde",
+            "csv_reports",
+            "formats.csv",
+        )
+        if args.bagfiles:
+            file_format_csv = os.path.join(
+                sip_path,
+                "data",
+                "metadata",
+                "submissionDocumentation",
+                "brunnhilde",
+                "csv_reports",
+                "formats.csv",
+            )
+
+        try:
+            with open(file_format_csv, "r") as f:
+                reader = csv.reader(f)
+                next(reader)
+                for row in itertools.islice(reader, 5):
+                    file_formats.append(row[0])
+        except:
+            file_formats.append(
+                "ERROR! No Brunnhilde formats.csv file to pull formats from."
+            )
+
+        file_formats = [element or "Unidentified" for element in file_formats]
+        file_formats_str = ", ".join(file_formats)
+
+        if number_volumes > 1:
+            scope_content = "Files exported from {} volumes with file systems: {}. File formats: {}".format(
+                number_volumes, file_systems_str, file_formats_str
+            )
+        else:
+            scope_content = (
+                "Files exported from {} file system volume. File formats: {}".format(
+                    disk_volumes[0]["file_system"], file_formats_str
+                )
+            )
+
+        row_to_write[INDEX_DATE_START] = str(date_earliest[:4])
+        row_to_write[INDEX_DATE_END] = str(date_latest[:4])
+        row_to_write[INDEX_EXTENT_NUMBER] = str(file_count)
+        row_to_write[INDEX_EXTENT_TYPE] = "digital files"
+        row_to_write[INDEX_SIZE] = str(human_readable_size(total_bytes))
+        row_to_write[INDEX_SCOPE_CONTENTS] = scope_content
+
+        worksheet.append(row_to_write)
+
+        logger.info("Described %s successfully." % (sip_path))
+
+    workbook.save(filename=xlsx_path)
+    workbook.close()
+
+    logger.info("ArchivesSpace description XLSX created.")
+
+
 def _parse_dfxml(dfxml_path, logger, export_all=False):
     """Parse DFXML and return dict of information for spreadsheet."""
     volume_info = {
@@ -423,6 +593,12 @@ def _make_parser():
         help="Export AppleDouble resource forks from HFS-formatted disks",
         action="store_true",
     )
+    parser.add_argument(
+        "-c",
+        "--csv",
+        help="Write description CSV (old default) instead of ArchivesSpace XLSX",
+        action="store_true",
+    )
     parser.add_argument("--quiet", action="store_true", help="Write only errors to log")
     parser.add_argument(
         "source", help="Source directory containing disk images (and related files)"
@@ -563,8 +739,17 @@ def main():
                 shell=True,
             )
 
-    # write description CSV
-    create_spreadsheet(args, sips, volumes, logger)
+    # write description
+    if args.csv:
+        try:
+            create_spreadsheet(args, sips, volumes, logger)
+        except Exception as err:
+            logger.error(f"Error creating description csv: {err}")
+    else:
+        try:
+            create_aspace_excel_sheet(args, sips, volumes, logger)
+        except Exception as err:
+            logger.error(f"Error creating ArchivesSpace description xlsx: {err}")
 
     # print unprocessed list
     if unprocessed:
diff --git a/install-bc2-ubuntu18.sh b/install-bc2-ubuntu18.sh
@@ -41,6 +41,11 @@ sudo cp LICENSE $dip_dir
 sudo cp README.md $dip_dir
 sudo cp -r disk_image_toolkit/ $dip_dir
 
+if [ ! -d $dip_dir/aspace_template ]; then
+  sudo mkdir $dip_dir/aspace_template
+fi
+sudo cp aspace_template/aspace_import_template.xlsx $dip_dir/aspace_template
+
 if [ ! -d $dip_dir/disk_image_toolkit/dfxml ]; then
   sudo mkdir $dip_dir/disk_image_toolkit/dfxml/
 fi
diff --git a/install.sh b/install.sh
@@ -2,6 +2,9 @@
 
 ### Install script for CCA Disk Image Processor in Bitcurator 4/Ubuntu 22
 
+sudo python3 -m pip install pyqt5
+sudo python3 -m pip install -r requirements/base.txt
+
 if [ ! -d /usr/share/ccatools ]; then
   sudo mkdir /usr/share/ccatools
 fi
@@ -30,6 +33,11 @@ sudo cp LICENSE $dip_dir
 sudo cp README.md $dip_dir
 sudo cp -r disk_image_toolkit/ $dip_dir
 
+if [ ! -d $dip_dir/aspace_template ]; then
+  sudo mkdir $dip_dir/aspace_template
+fi
+sudo cp aspace_template/aspace_import_template.xlsx $dip_dir/aspace_template
+
 if [ ! -d $dip_dir/disk_image_toolkit/dfxml ]; then
   sudo mkdir $dip_dir/disk_image_toolkit/dfxml/
 fi
diff --git a/main.py b/main.py
@@ -45,7 +45,7 @@ def about_dialog(self):
         QMessageBox.information(
             self,
             "About",
-            "Disk Image Processor v1.1.1\nCanadian Centre for Architecture\nDeveloper: Tessa Walsh\n2018-2023\nMIT License\nhttps://github.com/CCA-Public/cca-diskimageprocessor",
+            "Disk Image Processor v1.2.0\nCanadian Centre for Architecture\nDeveloper: Tessa Walsh\n2018-2023\nMIT License\nhttps://github.com/CCA-Public/cca-diskimageprocessor",
         )
 
     def browse_analysis_source(self):
diff --git a/requirements/base.txt b/requirements/base.txt
@@ -1,2 +1,3 @@
 bagit>=1.7.0
 brunnhilde>=1.9.1
+openpyxl>=3.1.2
diff --git a/test-install.sh b/test-install.sh
@@ -35,6 +35,11 @@ sudo cp disk_image_toolkit/dfxml/dfxml.py /usr/share/ccatools/diskimageprocessor
 sudo cp disk_image_toolkit/dfxml/objects.py /usr/share/ccatools/diskimageprocessor
 sudo cp disk_image_toolkit/dfxml/walk_to_dfxml.py /usr/share/ccatools/diskimageprocessor
 
+if [ ! -d /usr/share/ccatools/diskimageprocessor/aspace_template ]; then
+  sudo mkdir /usr/share/ccatools/diskimageprocessor/aspace_template
+fi
+sudo cp aspace_template/aspace_import_template.xlsx /usr/share/ccatools/diskimageprocessor/aspace_template
+
 sudo cp disk_image_toolkit/dfxml/dfxml.py .
 sudo cp disk_image_toolkit/dfxml/objects.py .
 sudo cp disk_image_toolkit/dfxml/walk_to_dfxml.py .
diff --git a/tests/test_integration.py b/tests/test_integration.py

Original file line number	Diff line number	Diff line change
`@@ -45,7 +45,7 @@ def about_dialog(self):`
`45`	`45`	`QMessageBox.information(`
`46`	`46`	`self,`
`47`	`47`	`"About",`
`48`		`- "Disk Image Processor v1.1.1\nCanadian Centre for Architecture\nDeveloper: Tessa Walsh\n2018-2023\nMIT License\nhttps://github.com/CCA-Public/cca-diskimageprocessor",`
	`48`	`+ "Disk Image Processor v1.2.0\nCanadian Centre for Architecture\nDeveloper: Tessa Walsh\n2018-2023\nMIT License\nhttps://github.com/CCA-Public/cca-diskimageprocessor",`
`49`	`49`	`)`
`50`	`50`
`51`	`51`	`def browse_analysis_source(self):`
Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,3 @@`
`1`	`1`	`bagit>=1.7.0`
`2`	`2`	`brunnhilde>=1.9.1`
	`3`	`+openpyxl>=3.1.2`