|
29 | 29 | import datetime |
30 | 30 | import itertools |
31 | 31 | import logging |
| 32 | +import openpyxl |
32 | 33 | import os |
33 | 34 | import shutil |
| 35 | +import stat |
34 | 36 | import subprocess |
35 | 37 | import sys |
36 | 38 | import time |
@@ -256,6 +258,174 @@ def create_spreadsheet(args, sips, volumes, logger): |
256 | 258 | logger.info("Description CSV created.") |
257 | 259 |
|
258 | 260 |
|
| 261 | +def create_aspace_excel_sheet(args, sips, volumes, logger): |
| 262 | + """Create a new copy of the ASpace XLSX template and append rows describing the disk images.""" |
| 263 | + xlsx_path = os.path.abspath(os.path.join(args.destination, "description.xlsx")) |
| 264 | + template_path = os.path.abspath( |
| 265 | + os.path.join(THIS_DIR, "aspace_template", "aspace_import_template.xlsx") |
| 266 | + ) |
| 267 | + |
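| | + # Work on a copy of the bundled template rather than the template itself |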
| 268 | + try: |
| 269 | + shutil.copyfile(template_path, xlsx_path) |
| 270 | + except OSError as err: |
| 271 | + logger.error(f"Unable to copy ASpace template to destination: {err}") |
| | + raise # the workbook load below cannot succeed without the copy |
| 272 | + |
| 273 | + # Set ASpace file permissions |
| 274 | + try: |
| 275 | + os.chmod( |
| 276 | + xlsx_path, |
| 277 | + stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IWGRP | stat.S_IROTH, |
| 278 | + ) |
| 279 | + except OSError as err: |
| 280 | + logger.error(f"Error setting permissions: {err}") |
| 281 | + |
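| | + # Open the copied workbook; rows are appended to its "Data" sheet |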
| 282 | + workbook = openpyxl.load_workbook(filename=xlsx_path) |
| 283 | + worksheet = workbook["Data"] |
| 284 | + |
| 285 | + # TODO: Deduplicate with create_spreadsheet. Maybe create a separate |
| 286 | + # method that gathers the per-SIP info into a dict, and handle |
| 287 | + # opening/writing the CSV or XLSX separately. |
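| | + # A hypothetical shape for that refactor (helper names are placeholders): |
| | + #     info = _gather_sip_info(args, sip_path, disk_volumes, logger) |
| | + #     row = _format_aspace_row(info)  # or _format_csv_row(info) for CSV |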
| 288 | + for item in sorted(os.listdir(sips)): |
| 289 | + sip_path = os.path.join(sips, item) |
| 290 | + |
| 291 | + if not os.path.isdir(sip_path): |
| 292 | + continue |
| 293 | + |
| 294 | + disk_volumes = volumes[item] |
| 295 | + number_volumes = len(disk_volumes) |
| 296 | + |
| 297 | + date_earliest = "" |
| 298 | + date_latest = "" |
| 299 | + |
| 300 | + # Get and sum information from all DFXML files generated |
| 301 | + dfxml_files = [] |
| 302 | + subdoc_dir = os.path.join(sip_path, "metadata", "submissionDocumentation") |
| 303 | + if args.bagfiles: |
| 304 | + subdoc_dir = os.path.join( |
| 305 | + sip_path, "data", "metadata", "submissionDocumentation" |
| 306 | + ) |
| 307 | + for root, _, files in os.walk(subdoc_dir): |
| 308 | + for file in files: |
| 309 | + if file.startswith("dfxml"): |
| 310 | + dfxml_files.append(os.path.join(root, file)) |
| 311 | + |
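| | + # Parse each DFXML file, skipping files with no fileobjects |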
| 312 | + dfxml_files_info = [] |
| 313 | + for dfxml_file in dfxml_files: |
| 314 | + dfxml_info = _parse_dfxml(dfxml_file, logger) |
| 315 | + if not dfxml_info: |
| 316 | + logger.warning( |
| 317 | + "No fileobjects in DFXML file {} - possibly a file system that fiwalk does not recognize".format( |
| 318 | + dfxml_file |
| 319 | + ) |
| 320 | + ) |
| 321 | + continue |
| 322 | + dfxml_files_info.append(dfxml_info) |
| 323 | + |
| 324 | + file_count = sum(dfxml_info["files"] for dfxml_info in dfxml_files_info) |
| 325 | + total_bytes = sum(dfxml_info["bytes"] for dfxml_info in dfxml_files_info) |
| 326 | + file_systems = [volume["file_system"] for volume in disk_volumes] |
| 327 | + # Deduplicate while preserving order (dict.fromkeys keeps the first occurrence) |
| 328 | + file_systems = list(dict.fromkeys(file_systems)) |
| 329 | + file_systems_str = ", ".join(file_systems) |
| 330 | + |
| 331 | + for dfxml_info in dfxml_files_info: |
| 332 | + if not date_earliest or dfxml_info["date_earliest"] < date_earliest: |
| 333 | + date_earliest = dfxml_info["date_earliest"] |
| 334 | + if not date_latest or dfxml_info["date_latest"] > date_latest: |
| 335 | + date_latest = dfxml_info["date_latest"] |
| 336 | + |
| 337 | + # Create list with an empty string for each of the template's 173 columns |
| 338 | + row_to_write = [""] * 173 |
| 341 | + |
| 342 | + # Column indices (0-based) of the fields to write |
| 343 | + INDEX_FILENAME = 6 |
| 344 | + INDEX_LEVEL_OF_DESCRIPTION = 8 |
| 345 | + INDEX_DATE_START = 23 |
| 346 | + INDEX_DATE_END = 24 |
| 347 | + INDEX_EXTENT_NUMBER = 34 |
| 348 | + INDEX_EXTENT_TYPE = 35 |
| 349 | + INDEX_SIZE = 36 |
| 350 | + INDEX_SCOPE_CONTENTS = 170 |
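| | + # (0-based positions; worksheet.append() maps index 0 to column A) |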
| 351 | + |
| 352 | + # Fields that are always constant |
| 353 | + row_to_write[INDEX_FILENAME] = item |
| 354 | + row_to_write[INDEX_LEVEL_OF_DESCRIPTION] = "File" |
| 355 | + |
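| | + # If DFXML stats could not be gathered, write a stub row flagging the error |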
| 356 | + if file_count == 0: |
| 357 | + row_to_write[ |
| 358 | + INDEX_SCOPE_CONTENTS |
| 359 | + ] = "Error gathering statistics from SIP directory" |
| 360 | + |
| 361 | + worksheet.append(row_to_write) |
| 362 | + |
| 363 | + logger.error("Unable to read DFXML files for {}".format(sip_path)) |
| 364 | + continue |
| 365 | + |
| 366 | + # Get file formats from Brunnhilde |
| 367 | + file_formats = [] |
| 368 | + file_format_csv = os.path.join( |
| 369 | + sip_path, |
| 370 | + "metadata", |
| 371 | + "submissionDocumentation", |
| 372 | + "brunnhilde", |
| 373 | + "csv_reports", |
| 374 | + "formats.csv", |
| 375 | + ) |
| 376 | + if args.bagfiles: |
| 377 | + file_format_csv = os.path.join( |
| 378 | + sip_path, |
| 379 | + "data", |
| 380 | + "metadata", |
| 381 | + "submissionDocumentation", |
| 382 | + "brunnhilde", |
| 383 | + "csv_reports", |
| 384 | + "formats.csv", |
| 385 | + ) |
| 386 | + |
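| | + # Keep the first five formats from Brunnhilde's formats.csv (header skipped) |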
| 387 | + try: |
| 388 | + with open(file_format_csv, "r") as f: |
| 389 | + reader = csv.reader(f) |
| 390 | + next(reader) # skip the header row |
| 391 | + for row in itertools.islice(reader, 5): |
| 392 | + file_formats.append(row[0]) |
| 393 | + except Exception: |
| 394 | + file_formats.append( |
| 395 | + "ERROR! No Brunnhilde formats.csv file to pull formats from." |
| 396 | + ) |
| 397 | + |
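| | + # Replace blank format names with "Unidentified" |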
| 398 | + file_formats = [element or "Unidentified" for element in file_formats] |
| 399 | + file_formats_str = ", ".join(file_formats) |
| 400 | + |
| 401 | + if number_volumes > 1: |
| 402 | + scope_content = "Files exported from {} volumes with file systems: {}. File formats: {}".format( |
| 403 | + number_volumes, file_systems_str, file_formats_str |
| 404 | + ) |
| 405 | + else: |
| 406 | + scope_content = ( |
| 407 | + "Files exported from one {} file system volume. File formats: {}".format( |
| 408 | + disk_volumes[0]["file_system"], file_formats_str |
| 409 | + ) |
| 410 | + ) |
| 411 | + |
| 412 | + row_to_write[INDEX_DATE_START] = str(date_earliest[:4]) |
| 413 | + row_to_write[INDEX_DATE_END] = str(date_latest[:4]) |
| 414 | + row_to_write[INDEX_EXTENT_NUMBER] = str(file_count) |
| 415 | + row_to_write[INDEX_EXTENT_TYPE] = "digital files" |
| 416 | + row_to_write[INDEX_SIZE] = str(human_readable_size(total_bytes)) |
| 417 | + row_to_write[INDEX_SCOPE_CONTENTS] = scope_content |
| 418 | + |
| 419 | + worksheet.append(row_to_write) |
| 420 | + |
| 421 | + logger.info("Described {} successfully.".format(sip_path)) |
| 422 | + |
| 423 | + workbook.save(filename=xlsx_path) |
| 424 | + workbook.close() |
| 425 | + |
| 426 | + logger.info("ArchivesSpace description XLSX created.") |
| 427 | + |
| 428 | + |
259 | 429 | def _parse_dfxml(dfxml_path, logger, export_all=False): |
260 | 430 | """Parse DFXML and return dict of information for spreadsheet.""" |
261 | 431 | volume_info = { |
@@ -423,6 +593,12 @@ def _make_parser(): |
423 | 593 | help="Export AppleDouble resource forks from HFS-formatted disks", |
424 | 594 | action="store_true", |
425 | 595 | ) |
| 596 | + parser.add_argument( |
| 597 | + "-c", |
| 598 | + "--csv", |
| 599 | + help="Write description CSV (the previous default) instead of ArchivesSpace XLSX", |
| 600 | + action="store_true", |
| 601 | + ) |
426 | 602 | parser.add_argument("--quiet", action="store_true", help="Write only errors to log") |
427 | 603 | parser.add_argument( |
428 | 604 | "source", help="Source directory containing disk images (and related files)" |
@@ -563,8 +739,17 @@ def main(): |
563 | 739 | shell=True, |
564 | 740 | ) |
565 | 741 |
|
566 | | - # write description CSV |
567 | | - create_spreadsheet(args, sips, volumes, logger) |
| 742 | + # write description |
| 743 | + if args.csv: |
| 744 | + try: |
| 745 | + create_spreadsheet(args, sips, volumes, logger) |
| 746 | + except Exception as err: |
| 747 | + logger.error(f"Error creating description CSV: {err}") |
| 748 | + else: |
| 749 | + try: |
| 750 | + create_aspace_excel_sheet(args, sips, volumes, logger) |
| 751 | + except Exception as err: |
| 752 | + logger.error(f"Error creating ArchivesSpace description XLSX: {err}") |
568 | 753 |
|
569 | 754 | # print unprocessed list |
570 | 755 | if unprocessed: |
|