Source code for ppa.archive.management.commands.hathi_rsync
import csv
import os.path
import tempfile
from datetime import datetime
from django.core.management.base import BaseCommand
from django.template.defaultfilters import pluralize
from pairtree import path2id
from ppa.archive.import_util import HathiImporter
from ppa.archive.models import DigitizedWork
[docs]
class Command(BaseCommand):
"""Update HathiTrust pairtree data via rsync"""
help = __doc__
#: normal verbosity level
v_normal = 1
verbosity = v_normal
[docs]
def add_arguments(self, parser):
parser.add_argument(
"htids",
nargs="*",
help="Optional list HathiTrust ids to synchronize",
)
[docs]
def handle(self, *args, **kwargs):
self.verbosity = kwargs.get("verbosity", self.v_normal)
self.options = kwargs
# use ids specified via command line when present
htids = kwargs.get("htids", [])
# by default, sync data for all non-suppressed hathi source ids
digworks = DigitizedWork.objects.filter(
status=DigitizedWork.PUBLIC, source=DigitizedWork.HATHI
)
# if htids are specified via parameter, use them to filter
# the queryset, to ensure we only sync records that are
# in the database and not suppressed
if htids:
digworks = digworks.filter(source_id__in=htids)
# generate a list of unique source ids from the queryset
working_htids = digworks.values_list("source_id", flat=True).distinct()
# if htids were explicitly specified, report if any are skipped
if htids:
skipped_htids = set(htids) - set(working_htids)
if skipped_htids:
self.stdout.write(
self.style.NOTICE(
f"{len(skipped_htids)} id{pluralize(skipped_htids)} "
+ "not found in public HathiTrust volumes; "
+ f"skipping {' '.join(skipped_htids)}"
)
)
# bail out if there's nothing to do
# (e.g., explicit htids only and none valid)
if not working_htids:
self.stdout.write("No records to synchronize; stopping")
return
self.stdout.write(
f"Synchronizing data for {len(working_htids)} record{pluralize(working_htids)}"
)
# create a tempdir for rsync logfile; will automatically be cleaned up
output_dir = tempfile.TemporaryDirectory(prefix="ppa-rsync_")
# we always want itemized rsync output, so we can report
# on which htids have updated content
htimporter = HathiImporter(
source_ids=working_htids, rsync_output=True, output_dir=output_dir.name
)
logfile = htimporter.rsync_data()
# read the rsync itemized output to identify and report on changes
updated_files = []
with open(logfile) as rsync_output:
for line in rsync_output:
# check for a line indicating that a file was updated
if " >f" in line:
# rsync itemized output is white-space delimited
parts = line.split()
# last element is the filename that was updated
filename = parts[-1]
# itemized info flags precede the filename
flags = parts[-2]
# we only care about zip files and mets.xml files
if not filename.endswith(".zip") and not filename.endswith(".xml"):
continue
# reconstruct the hathi id from the filepath
ht_prefix, pairtree_dir = filename.split("/pairtree_root/", 1)
# get the directory one level up from the updated file
pairtree_id = os.path.dirname(os.path.dirname(pairtree_dir))
# use pairtree to determine the id based on the path
# (handles special characters like those used in ARKs)
htid = f"{ht_prefix}.{path2id(pairtree_id)}"
updated_files.append(
{
"htid": htid,
"filename": os.path.basename(filename),
# rsync itemized flags look like >f.st....
# or >f+++++++ for new files
"size_changed": flags[3] == "s",
"modification_time": flags[4] == "t",
"rsync_flags": flags,
}
)
# should this behavior only be when updating all?
# if specific htids are specified on the command line, maybe report on them only?
if updated_files:
outfilename = "ppa_rsync_changes_{time}.csv".format(
time=datetime.now().strftime("%Y%m%d-%H%M%S")
)
# use keys from the first row to populate csv header row
fields = updated_files[0].keys()
with open(outfilename, "w") as outfile:
csvwriter = csv.DictWriter(outfile, fieldnames=fields)
csvwriter.writeheader()
csvwriter.writerows(updated_files)
updated_htids = set([i["htid"] for i in updated_files])
success_msg = (
f"Updated {len(updated_files)} files for {len(updated_htids)} volumes; "
+ f"full details in {outfilename}"
)
else:
success_msg = "rsync completed; no changes to report"
self.stdout.write(self.style.SUCCESS(success_msg))