"""
**gale_import** is a custom manage command for bulk import of Gale
materials into the local database for management. It takes either
a list of Gale item ids or a path to a CSV file.
Items are imported into the database and also indexed into Solr
(both works and pages) as part of this import script.
Example usage::
# import from a csv file
python manage.py gale_import -c path/to/import.csv
# import specific items
    python manage.py gale_import galeid1 galeid2 galeid3
When using a CSV file for import, it *must* include an **ID** field;
it may also include **NOTES** (any contents will be imported into private notes),
and fields to indicate collection membership to be set on import.
These are the supported collection abbreviations:
- OB: Original Bibliography
- LIT: Literary
- MUS: Music
- TYP: Typographically Unique
- LING: Linguistic
- DIC: Dictionaries
- WL: Word Lists
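
For example, an import CSV might look like this (ids and values are
illustrative)::

    ID,NOTES,OB,LIT
    CW0123456789,purchase requested,x,
    CW0987654321,,x,x

Any non-empty value in a collection column marks the item as belonging
to that collection.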
"""
import csv
import logging
from collections import Counter
from django.conf import settings
from django.contrib.admin.models import ADDITION, LogEntry
from django.contrib.auth.models import User
from django.contrib.contenttypes.models import ContentType
from django.core.exceptions import ImproperlyConfigured
from django.core.management.base import BaseCommand, CommandError
from django.template.defaultfilters import pluralize, truncatechars
from parasolr.django.signals import IndexableSignalHandler
from ppa.archive.gale import GaleAPI, GaleAPIError, MARCRecordNotFound, get_marc_record
from ppa.archive.import_util import GaleImporter
from ppa.archive.models import Collection, DigitizedWork, Page
logger = logging.getLogger(__name__)
class Command(BaseCommand):
"""Import Gale content into PPA for management and search"""
help = __doc__
stats = None
#: normal verbosity level
v_normal = 1
verbosity = v_normal
# input spreadsheets use the following codes as field names
# to indicate collection membership
collection_codes = {
"OB": "Original Bibliography",
"LIT": "Literary",
"MUS": "Music",
"TYP": "Typographically Unique",
"LING": "Linguistic",
"DIC": "Dictionaries",
"WL": "Word Lists",
}
    # item type lookup for supported types (adapted from the hathi excerpt script)
item_type = {
"Excerpt": DigitizedWork.EXCERPT,
"Article": DigitizedWork.ARTICLE,
"Full work": DigitizedWork.FULL,
}
def add_arguments(self, parser):
parser.add_argument(
"ids",
nargs="*",
help="Optional list of specific items to import by Gale id.",
)
parser.add_argument(
"-c", "--csv", type=str, help="CSV file with items to import be imported."
)
# NOTE: no support for updating records for now, since Gale/ECCO records
# will not change.
def handle(self, *args, **kwargs):
if not (kwargs["ids"] or kwargs["csv"]):
raise CommandError("A list of IDs or CSV file for is required for import")
# error handling in case user forgets to specify csv file correctly
if (
"ids" in kwargs
and len(kwargs["ids"]) == 1
and kwargs["ids"][0].endswith(".csv")
):
self.stdout.write(
self.style.WARNING(
"%s is not a valid id; did you forget to specify -c/--csv?"
% kwargs["ids"][0]
)
)
return
self.verbosity = kwargs.get("verbosity", self.v_normal)
# disconnect signal-based indexing to avoid unnecessary indexing
IndexableSignalHandler.disconnect()
# api initialization will error if username is not in settings
# catch and output error as command error for readability
try:
self.gale_api = GaleAPI()
except ImproperlyConfigured as err:
raise CommandError(str(err))
self.stats = Counter()
# if ids are specified on the command line, create a list
# of dictionaries so import will look similar to csv
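        # e.g. ids ["CW0123456789"] (illustrative) become [{"ID": "CW0123456789"}]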
if kwargs["ids"]:
to_import = [{"ID": gale_id} for gale_id in kwargs["ids"]]
        # when csv is specified, load rows into a list of dicts
elif kwargs["csv"]:
to_import = self.load_csv(kwargs["csv"])
# load collections when importing from CSV
self.load_collections()
        # total is needed for progress bar (if we add it)
self.stats["total"] = len(to_import)
        # initialize importer and run any prep steps needed before importing
self.importer = GaleImporter()
self.importer.add_item_prep()
for item in to_import:
if self.verbosity >= self.v_normal:
# include title in output if present, but truncate since many are long
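                # (truncatechars limits the title to 55 characters, ending with an ellipsis)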
self.stdout.write(
" ".join([item["ID"], truncatechars(item.get("Title", ""), 55)])
)
# send extra details to import method
# to handle notes and collection membership from CSV
item_info = item.copy()
del item_info["ID"] # don't send ID twice
self.import_record(item["ID"], **item_info)
summary = (
"\nProcessed {:,d} item{} for import."
+ "\nImported {:,d}; {:,d} missing MARC record{}; "
+ "skipped {:,d}; {:,d} error{}; imported {:,d} page{}."
)
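        # illustrative output:
        #   Processed 2 items for import.
        #   Imported 1; 1 missing MARC record; skipped 0; 0 errors; imported 50 pages.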
summary = summary.format(
self.stats["total"],
pluralize(self.stats["total"]),
self.stats["imported"],
self.stats["no_marc"],
pluralize(self.stats["no_marc"]),
self.stats["skipped"],
self.stats["error"],
pluralize(self.stats["error"]),
self.stats["pages"],
pluralize(self.stats["pages"]),
)
self.stdout.write(summary)
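    #: lookup of Collection objects keyed by spreadsheet code;
    #: populated by :meth:`load_collections`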
collections = {}
def load_collections(self):
"""Load :class:`~ppa.archive.model.Collection` records from the
database and create a lookup based on the codes used in the spreadsheet."""
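        # e.g. after this runs, self.collections["OB"] is the
        # Collection named "Original Bibliography"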
collections = {c.name: c for c in Collection.objects.all()}
for code, name in self.collection_codes.items():
self.collections[code] = collections[name]
def load_csv(self, path):
"""Load a CSV file with items to be imported."""
try:
with open(path, encoding="utf-8-sig") as csvfile:
csvreader = csv.DictReader(csvfile)
data = [row for row in csvreader]
except FileNotFoundError:
raise CommandError("Error loading the specified CSV file: %s" % path)
if "ID" not in data[0].keys():
raise CommandError("ID column is required in CSV file")
return data
def import_record(self, gale_id, **kwargs):
"""Import a single work into the database.
Retrieves record data from Gale API."""
# check if an item with this source id + page range exists
# (check local db first because API call is slow for large items)
# use an unsaved digitized work to parse the page range (if any)
# for queryset filter to check for duplicates
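        # spreadsheet ranges are semicolon-delimited (e.g. "10-25;30"); convert
        # to the comma-delimited form used by pages_digital (e.g. "10-25,30")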
dw_pages = DigitizedWork(
pages_digital=kwargs.get("Digital Page Range", "").replace(";", ",")
)
if DigitizedWork.objects.filter(
source_id=gale_id, pages_digital=dw_pages.pages_digital
).exists():
self.stderr.write("%s is already in the database; skipping" % gale_id)
self.stats["skipped"] += 1
return
# determine collection membership based on spreadsheet columns
digwork_collections = [
collection
for code, collection in self.collections.items()
if kwargs.get(code)
]
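        # e.g. truthy values in the OB and LIT columns select those two collections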
# translate item type in spreadsheet to digitized work item type code
# strip whitespace in case any was added in the spreadsheet
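        # e.g. "Excerpt " maps to DigitizedWork.EXCERPT; unrecognized or blank
        # values map to None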
kwargs["item_type"] = self.item_type.get(kwargs.get("Item Type", "").strip())
digwork = self.importer.import_digitizedwork(
gale_id, collections=digwork_collections, **kwargs
)
        # if import failed, check the error recorded in importer results
if not digwork:
if isinstance(self.importer.results[gale_id], GaleAPIError):
self.stderr.write("Error getting item information for %s" % gale_id)
self.stats["error"] += 1
return
# check for marc record not found error
if isinstance(self.importer.results[gale_id], MARCRecordNotFound):
self.stats["no_marc"] += 1
self.stderr.write(
self.style.WARNING("MARC record not found for %s" % (gale_id))
)
# if record was created successfully, update stats
if digwork:
self.stats["imported"] += 1
self.stats["pages"] += digwork.page_count
return digwork