Source code for ppa.archive.management.commands.gale_import

"""
**gale_import** is a custom manage command for bulk import of Gale
materials into the local database for management.  It takes either
a list of Gale item ids or a path to a CSV file.

Items are imported into the database for management and also indexed into
Solr as part of this import script (both works and pages).

Example usage::

    # import from a csv file
    python manage.py gale_import -c path/to/import.csv
    # import specific items
    python manage.py gale_import galeid1 galeid2 galeid3

When using a CSV file for import, it *must* include an **ID** field;
it may also include **NOTES** (any contents will be imported into private notes),
an **Item Type** field (Excerpt, Article, or Full work) with an optional
**Digital Page Range**, and fields to indicate collection membership
to be set on import (see the example below).
These are the supported collection abbreviations:

- OB: Original Bibliography
- LIT: Literary
- MUS: Music
- TYP: Typographically Unique
- LING: Linguistic
- DIC: Dictionaries
- WL: Word Lists
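
For example, a minimal import CSV might look like this (the id and field
values below are illustrative only; any non-empty value in a collection
column marks membership)::

    ID,NOTES,OB,LIT,Item Type,Digital Page Range
    CW0123456789,checked against first edition,x,x,Full work,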

"""
import csv
import logging
from collections import Counter

from django.conf import settings
from django.contrib.admin.models import ADDITION, LogEntry
from django.contrib.auth.models import User
from django.contrib.contenttypes.models import ContentType
from django.core.exceptions import ImproperlyConfigured
from django.core.management.base import BaseCommand, CommandError
from django.template.defaultfilters import pluralize, truncatechars
from parasolr.django.signals import IndexableSignalHandler

from ppa.archive.gale import GaleAPI, GaleAPIError, MARCRecordNotFound, get_marc_record
from ppa.archive.import_util import GaleImporter
from ppa.archive.models import Collection, DigitizedWork, Page

logger = logging.getLogger(__name__)


class Command(BaseCommand):
    """Import Gale content into PPA for management and search"""

    help = __doc__
    stats = None

    #: normal verbosity level
    v_normal = 1
    verbosity = v_normal

    # input spreadsheets use the following codes as field names
    # to indicate collection membership
    collection_codes = {
        "OB": "Original Bibliography",
        "LIT": "Literary",
        "MUS": "Music",
        "TYP": "Typographically Unique",
        "LING": "Linguistic",
        "DIC": "Dictionaries",
        "WL": "Word Lists",
    }

    # item type lookup for supported types; adapted from hathi excerpt script
    item_type = {
        "Excerpt": DigitizedWork.EXCERPT,
        "Article": DigitizedWork.ARTICLE,
        "Full work": DigitizedWork.FULL,
    }

    def add_arguments(self, parser):
        parser.add_argument(
            "ids",
            nargs="*",
            help="Optional list of specific items to import by Gale id.",
        )
        parser.add_argument(
            "-c", "--csv", type=str, help="CSV file with items to be imported."
        )

    # NOTE: no support for updating records for now, since Gale/ECCO records
    # will not change.

    def handle(self, *args, **kwargs):
        if not (kwargs["ids"] or kwargs["csv"]):
            raise CommandError("A list of IDs or a CSV file is required for import")
        # error handling in case user forgets to specify csv file correctly
        if (
            "ids" in kwargs
            and len(kwargs["ids"]) == 1
            and kwargs["ids"][0].endswith(".csv")
        ):
            self.stdout.write(
                self.style.WARNING(
                    "%s is not a valid id; did you forget to specify -c/--csv?"
                    % kwargs["ids"][0]
                )
            )
            return
        self.verbosity = kwargs.get("verbosity", self.v_normal)

        # disconnect signal-based indexing to avoid unnecessary indexing
        IndexableSignalHandler.disconnect()

        # api initialization will error if username is not in settings;
        # catch and output error as command error for readability
        try:
            self.gale_api = GaleAPI()
        except ImproperlyConfigured as err:
            raise CommandError(str(err))

        self.stats = Counter()

        # if ids are specified on the command line, create a list
        # of dictionaries so import will look similar to csv
        if kwargs["ids"]:
            to_import = [{"ID": gale_id} for gale_id in kwargs["ids"]]
        # when csv is specified, load rows into a list of dicts
        elif kwargs["csv"]:
            to_import = self.load_csv(kwargs["csv"])
            # load collections when importing from CSV
            self.load_collections()

        # total is needed for progress bar (if we add it)
        self.stats["total"] = len(to_import)

        # initialize importer and run pre-steps needed before importing
        self.importer = GaleImporter()
        self.importer.add_item_prep()

        for item in to_import:
            if self.verbosity >= self.v_normal:
                # include title in output if present, but truncate since many are long
                self.stdout.write(
                    " ".join([item["ID"], truncatechars(item.get("Title", ""), 55)])
                )
            # send extra details to import method
            # to handle notes and collection membership from CSV
            item_info = item.copy()
            del item_info["ID"]  # don't send ID twice
            self.import_record(item["ID"], **item_info)

        summary = (
            "\nProcessed {:,d} item{} for import."
            + "\nImported {:,d}; {:,d} missing MARC record{}; "
            + "skipped {:,d}; {:,d} error{}; imported {:,d} page{}."
        )
        summary = summary.format(
            self.stats["total"],
            pluralize(self.stats["total"]),
            self.stats["imported"],
            self.stats["no_marc"],
            pluralize(self.stats["no_marc"]),
            self.stats["skipped"],
            self.stats["error"],
            pluralize(self.stats["error"]),
            self.stats["pages"],
            pluralize(self.stats["pages"]),
        )
        self.stdout.write(summary)

    collections = {}

    def load_collections(self):
        """Load :class:`~ppa.archive.models.Collection` records from the
        database and create a lookup based on the codes used in the
        spreadsheet."""
        collections = {c.name: c for c in Collection.objects.all()}
        for code, name in self.collection_codes.items():
            self.collections[code] = collections[name]

    def load_csv(self, path):
        """Load a CSV file with items to be imported."""
        try:
            with open(path, encoding="utf-8-sig") as csvfile:
                csvreader = csv.DictReader(csvfile)
                data = [row for row in csvreader]
        except FileNotFoundError:
            raise CommandError("Error loading the specified CSV file: %s" % path)

        if "ID" not in data[0].keys():
            raise CommandError("ID column is required in CSV file")
        return data
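
    # Illustration only: load_csv parses each row into a dict keyed by
    # column header, e.g. (hypothetical values):
    #   {"ID": "CW0123456789", "NOTES": "", "OB": "x", "LIT": "",
    #    "Item Type": "Excerpt", "Digital Page Range": "5-10"}
    # The ID is passed to import_record positionally; the remaining
    # columns are passed as keyword arguments.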

    def import_record(self, gale_id, **kwargs):
        """Import a single work into the database.
        Retrieves record data from Gale API."""
        # check if an item with this source id + page range exists
        # (check local db first because API call is slow for large items);
        # use an unsaved digitized work to parse the page range (if any)
        # for the queryset filter to check for duplicates
        dw_pages = DigitizedWork(
            pages_digital=kwargs.get("Digital Page Range", "").replace(";", ",")
        )
        if DigitizedWork.objects.filter(
            source_id=gale_id, pages_digital=dw_pages.pages_digital
        ).exists():
            self.stderr.write("%s is already in the database; skipping" % gale_id)
            self.stats["skipped"] += 1
            return

        # determine collection membership based on spreadsheet columns
        digwork_collections = [
            collection
            for code, collection in self.collections.items()
            if kwargs.get(code)
        ]

        # translate item type in spreadsheet to digitized work item type code;
        # strip whitespace in case any was added in the spreadsheet
        kwargs["item_type"] = self.item_type.get(kwargs.get("Item Type", "").strip())

        digwork = self.importer.import_digitizedwork(
            gale_id, collections=digwork_collections, **kwargs
        )
        # if import failed, check status
        if not digwork:
            if isinstance(self.importer.results[gale_id], GaleAPIError):
                self.stderr.write("Error getting item information for %s" % gale_id)
                self.stats["error"] += 1
                return
            # check for marc record not found error
            if isinstance(self.importer.results[gale_id], MARCRecordNotFound):
                self.stats["no_marc"] += 1
                self.stderr.write(
                    self.style.WARNING("MARC record not found for %s" % gale_id)
                )

        # if record was created successfully, update stats
        if digwork:
            self.stats["imported"] += 1
            self.stats["pages"] += digwork.page_count
        return digwork