Source code for ppa.archive.models

import logging
import re
import time
from zipfile import ZipFile

from cached_property import cached_property
from django.conf import settings
from django.contrib.admin.models import ADDITION, CHANGE, LogEntry
from django.contrib.auth.models import User
from django.contrib.contenttypes.models import ContentType
from django.core.exceptions import ValidationError
from django.db import models
from django.urls import reverse
from flags import Flags
from intspan import ParseError as IntSpanParseError
from intspan import intspan
from pairtree import storage_exceptions
from parasolr.django import SolrQuerySet
from parasolr.django.indexing import ModelIndexable
from parasolr.indexing import Indexable
from wagtail.admin.panels import FieldPanel
from wagtail.fields import RichTextField
from wagtail.snippets.models import register_snippet

from ppa.archive.gale import GaleAPI, MARCRecordNotFound, get_marc_record
from ppa.archive.hathi import HathiBibliographicAPI, HathiObject

logger = logging.getLogger(__name__)


#: label to use for items that are not in a collection
NO_COLLECTION_LABEL = "Uncategorized"


class TrackChangesModel(models.Model):
    """:class:`django.db.models.Model` mixin that keeps a copy of initial data
    in order to check if fields have been changed. Change detection only works
    on the current instance of an object."""

    class Meta:
        abstract = True

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # store a copy of model data to allow for checking if
        # it has changed
        self.__initial = self.__dict__.copy()
    def save(self, *args, **kwargs):
        """Saves data and resets copy of initial data."""
        super().save(*args, **kwargs)
        # update copy of initial data to reflect saved state
        self.__initial = self.__dict__.copy()
    def has_changed(self, field):
        """check if a field has been changed"""
        # Only consider the field changed if the object has been saved
        return self.pk and getattr(self, field) != self.__initial[field]
    def initial_value(self, field):
        """return the initial value for a field"""
        return self.__initial[field]

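# Illustrative usage sketch (not part of the original module): change tracking
# only reports a difference once the instance has a primary key, i.e. after it
# has been saved at least once.
#
#   coll = Collection.objects.create(name="Literary")
#   coll.name = "Linguistic"
#   coll.has_changed("name")      # True
#   coll.initial_value("name")    # "Literary"
#   coll.save()
#   coll.has_changed("name")      # False again after save
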
@register_snippet
class Collection(TrackChangesModel):
    """A collection of :class:`ppa.archive.models.DigitizedWork` instances."""

    #: the name of the collection
    name = models.CharField(max_length=255)
    #: a RichText description of the collection
    description = RichTextField(blank=True)
    #: flag to indicate collections to be excluded by default in
    #: public search
    exclude = models.BooleanField(
        default=False, help_text="Exclude by default on public search."
    )

    # configure for editing in wagtail admin
    panels = [
        FieldPanel("name"),
        FieldPanel("description"),
    ]

    class Meta:
        ordering = ("name",)

    def __str__(self):
        return self.name

    @property
    def name_changed(self):
        """check if name has been changed (only works on current instance)"""
        return self.has_changed("name")
    @staticmethod
    def stats():
        """Collection counts and date ranges, based on what is in Solr.
        Returns a dictionary where the keys are collection names and
        values are a dictionary with count and dates.
        """
        # NOTE: if we *only* want counts, could just do a regular facet
        sqs = (
            SolrQuerySet()
            .stats("{!tag=piv1 min=true max=true}pub_date")
            .facet(pivot="{!stats=piv1}collections_exact")
        )
        facet_pivot = sqs.get_facets().facet_pivot

        # simplify the pivot stat data for display
        stats = {}
        for collection in facet_pivot.collections_exact:
            pub_date_stats = collection.stats.stats_fields.pub_date
            stats[collection.value] = {
                "count": collection.count,
                "dates": "%(min)d–%(max)d" % pub_date_stats
                if pub_date_stats.max != pub_date_stats.min
                else "%d" % (pub_date_stats.min or 0,),
            }
        return stats

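    # Illustrative return shape for stats() (collection names and values are
    # hypothetical, not taken from real index data):
    #
    #   {
    #       "Literary": {"count": 1542, "dates": "1559–1922"},
    #       "Linguistic": {"count": 210, "dates": "1700"},
    #   }
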
class Cluster(TrackChangesModel):
    """A model to collect groups of works such as reprints or editions that
    should be collapsed in the main archive search and accessible together."""

    cluster_id = models.CharField(
        "Cluster ID",
        help_text="Unique identifier for a cluster of digitized works",
        unique=True,
        max_length=255,
    )

    class Meta:
        ordering = ("cluster_id",)

    def __str__(self):
        return self.cluster_id

    def __repr__(self):
        return "<cluster %s>" % str(self)
class ProtectedWorkFieldFlags(Flags):
    """:class:`flags.Flags` instance to indicate which :class:`DigitizedWork`
    fields should be protected if edited in the admin."""

    #: title
    title = ()
    #: subtitle
    subtitle = ()
    #: sort title
    sort_title = ()
    #: enumcron
    enumcron = ()
    #: author
    author = ()
    #: place of publication
    pub_place = ()
    #: publisher
    publisher = ()
    #: publication date
    pub_date = ()
    @classmethod
    def deconstruct(cls):
        """Give Django information needed to make
        :class:`ProtectedWorkFieldFlags.no_flags` default in migration."""
        # (import path, [args], kwargs)
        return ("ppa.archive.models.ProtectedWorkFieldFlags", ["no_flags"], {})
    def __str__(self):
        return ", ".join(sorted(self.to_simple_str().split("|")))

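# Illustrative sketch (assumption, not part of the original module): py-flags
# values combine with bitwise OR, and the __str__ above renders them as a
# sorted, comma-separated list.
#
#   protected = ProtectedWorkFieldFlags.title | ProtectedWorkFieldFlags.pub_date
#   str(protected)                              # "pub_date, title"
#   ProtectedWorkFieldFlags.title in protected  # True
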
class ProtectedWorkField(models.Field):
    """PositiveSmallIntegerField subclass that returns a
    :class:`ProtectedWorkFieldFlags` object and stores as integer."""

    description = (
        "A field that stores an instance of :class:`ProtectedWorkFieldFlags` "
        "as an integer."
    )

    def __init__(self, verbose_name=None, name=None, **kwargs):
        """Make the field unnullable; by default, not allowed to be blank."""
        if "blank" not in kwargs:
            kwargs["blank"] = False
        super().__init__(verbose_name, name, null=False, **kwargs)
    def from_db_value(self, value, expression, connection):
        """Always return an instance of :class:`ProtectedWorkFieldFlags`"""
        return ProtectedWorkFieldFlags(value)
    def get_internal_type(self):
        "Preserve type as PositiveSmallIntegerField"
        return "PositiveSmallIntegerField"
    def get_prep_value(self, value):
        if value == "":
            return 0
        return int(value)
    def to_python(self, value):
        """Always return an instance of :class:`ProtectedWorkFieldFlags`"""
        return ProtectedWorkFieldFlags(value)

class SignalHandlers:
    """Signal handlers for indexing :class:`DigitizedWork` records when
    :class:`Collection` or :class:`Cluster` records are saved or deleted."""
    @staticmethod
    def collection_save(sender, instance, **kwargs):
        """signal handler for collection save; reindex associated digitized works"""
        # only reindex if collection name has changed
        # and if collection has already been saved
        if instance.pk and instance.name_changed:
            # if the collection has any works associated
            works = instance.digitizedwork_set.all()
            if works.exists():
                logger.debug(
                    "collection save, reindexing %d related works", works.count()
                )
                DigitizedWork.index_items(works)
    @staticmethod
    def collection_delete(sender, instance, **kwargs):
        """signal handler for collection delete; clear associated digitized
        works and reindex"""
        # get a list of ids for collected works before clearing them
        digwork_ids = instance.digitizedwork_set.values_list("id", flat=True)
        # find the items based on the list of ids to reindex
        digworks = DigitizedWork.objects.filter(id__in=list(digwork_ids))
        logger.debug("collection delete, reindexing %d works" % len(digworks))
        # NOTE: this sends pre/post clear signal, but it's not obvious
        # how to take advantage of that
        instance.digitizedwork_set.clear()
        DigitizedWork.index_items(digworks)
    @staticmethod
    def cluster_save(sender, instance, **kwargs):
        """signal handler for cluster save; reindex pages for associated
        digitized works"""
        # only reindex if cluster id has changed
        # and if object has already been saved to the db
        if instance.pk and instance.has_changed("cluster_id"):
            # if the cluster has any works associated
            works = instance.digitizedwork_set.all()
            if works.exists():
                # get a total of page count for affected works
                page_count = works.aggregate(page_count=models.Sum("page_count"))
                logger.debug(
                    "cluster id has changed, reindexing %d works and %d pages",
                    works.count(),
                    page_count.get("page_count", 0),
                )
                DigitizedWork.index_items(works)
                # reindex pages (this may be slow...)
                for work in works:
                    work.index_items(Page.page_index_data(work))
    @staticmethod
    def cluster_delete(sender, instance, **kwargs):
        """signal handler for cluster delete; clear associated digitized
        works and reindex"""
        # get a list of ids for collected works before clearing them
        digwork_ids = instance.digitizedwork_set.values_list("id", flat=True)
        # find the items based on the list of ids to reindex
        digworks = DigitizedWork.objects.filter(id__in=list(digwork_ids))
        # get a total of page count for affected works
        page_count = digworks.aggregate(page_count=models.Sum("page_count"))
        logger.debug(
            "cluster delete, reindexing %d works and %d pages",
            digworks.count(),
            page_count["page_count"],
        )
        # NOTE: this sends pre/post clear signal, but it's not obvious
        # how to take advantage of that for reindexing
        instance.digitizedwork_set.clear()
        DigitizedWork.index_items(digworks)
        # reindex pages (this may be slow...)
        for work in digworks:
            work.index_items(Page.page_index_data(work))
    @staticmethod
    def handle_digwork_cluster_change(sender, instance, **kwargs):
        """when a :class:`DigitizedWork` is saved, reindex pages if
        cluster id has changed"""
        if isinstance(instance, DigitizedWork) and instance.has_changed("cluster_id"):
            logger.debug(
                "Cluster changed for %s; indexing %d pages",
                instance,
                instance.page_count,
            )
            instance.index_items(Page.page_index_data(instance))

def validate_page_range(value):
    """Ensure page range can be parsed as an integer span"""
    try:
        intspan(value)
    except IntSpanParseError as err:
        raise ValidationError(
            "Parse error: %(message)s",
            params={"message": err},
        )

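# Illustrative sketch (assumption, not part of the original module): intspan
# parses comma-separated numeric ranges, which is what this validator accepts.
#
#   list(intspan("10-12,15"))        # [10, 11, 12, 15]
#   validate_page_range("10-12,15")  # passes silently
#   validate_page_range("10-a")      # raises ValidationError
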
class DigitizedWorkQuerySet(models.QuerySet):
    def by_first_page_orig(self, start_page):
        "find records based on first page in original page range"
        # use a raw f-string so \b is a regex word boundary, not a backspace
        return self.filter(pages_orig__regex=rf"^{start_page}([,-]|\b|$)")

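    # Illustrative usage sketch (source_id value is hypothetical, not from the
    # original module): find excerpts from one source whose original page
    # range starts at page 273.
    #
    #   DigitizedWork.objects.filter(
    #       source_id="mdp.39015012345678"
    #   ).by_first_page_orig(273)
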
class DigitizedWork(ModelIndexable, TrackChangesModel):
    """
    Record to manage digitized works included in PPA and store their basic
    metadata.
    """

    HATHI = "HT"
    GALE = "G"
    OTHER = "O"
    SOURCE_CHOICES = (
        (HATHI, "HathiTrust"),
        (GALE, "Gale"),
        (OTHER, "Other"),
    )
    #: source of the record, HathiTrust or elsewhere
    source = models.CharField(
        max_length=2,
        choices=SOURCE_CHOICES,
        default=HATHI,
        help_text="Source of the record.",
    )
    #: source identifier; hathi id for HathiTrust materials
    source_id = models.CharField(
        max_length=255,
        verbose_name="Source ID",
        help_text="Source identifier. Must be unique when combined with page range; "
        + "used for site URL. (HT id for HathiTrust materials.)",
    )
    #: source url where the original can be accessed
    source_url = models.URLField(
        max_length=255,
        verbose_name="Source URL",
        blank=True,
        help_text="URL where the source item can be accessed",
    )
    #: record id; for Hathi materials, used for different copies of
    #: the same work or for different editions/volumes of a work
    record_id = models.CharField(
        max_length=255,
        blank=True,
        help_text="For HathiTrust materials, record id (use to aggregate "
        + "copies or volumes); for Gale materials, ESTC id.",
    )
    #: title of the work; using TextField to allow for long titles
    title = models.TextField(help_text="Main title")
    #: subtitle of the work; using TextField to allow for long titles
    subtitle = models.TextField(
        blank=True, default="", help_text="Subtitle, if any (optional)"
    )
    #: sort title: title without leading non-sort characters, from marc
    sort_title = models.TextField(
        default="",
        help_text="Sort title from MARC record or title without leading article",
    )
    #: enumeration/chronology (hathi-specific; contains volume or version)
    enumcron = models.CharField(
        "Enumeration/Chronology/Volume",
        max_length=255,
        blank=True,
        help_text="Enumcron for HathiTrust material; volume for Gale material",
    )
    # NOTE: may eventually convert to foreign key
    author = models.CharField(
        max_length=255,
        blank=True,
        help_text="Authorized name of the author, last name first.",
    )
    #: place of publication
    pub_place = models.CharField("Place of Publication", max_length=255, blank=True)
    #: publisher
    publisher = models.TextField(blank=True)
    # Needs to be integer to allow aggregating max/min, filtering by date
    pub_date = models.PositiveIntegerField("Publication Date", null=True, blank=True)
    #: number of pages in the work (or page range, for an excerpt)
    page_count = models.PositiveIntegerField(
        null=True,
        blank=True,
        help_text="Automatically calculated on import; "
        + "recalculated on save when digital page range changes",
    )
    #: public notes field for this work
    public_notes = models.TextField(
        blank=True,
        default="",
        help_text="Notes on edition or other details (displayed on public site)",
    )
    #: internal team notes, not displayed on the public facing site
    notes = models.TextField(
        blank=True,
        default="",
        help_text="Internal curation notes (not displayed on public site)",
    )
    #: :class:`ProtectedWorkField` instance to indicate metadata fields
    #: that should be preserved from bulk updates because they have been
    #: modified in Django admin.
    protected_fields = ProtectedWorkField(
        default=ProtectedWorkFieldFlags,
        blank=True,  # required for save as new, where we make editable to copy
        help_text="Fields protected from HathiTrust bulk "
        "update because they have been manually edited in the "
        "Django admin.",
    )
    #: collections that this work is part of
    collections = models.ManyToManyField(Collection, blank=True)
    #: optional cluster for aggregating works
    cluster = models.ForeignKey(
        Cluster, blank=True, null=True, on_delete=models.SET_NULL
    )
    #: date added to the archive
    added = models.DateTimeField(auto_now_add=True)
    #: date of last modification of the local record
    updated = models.DateTimeField(auto_now=True)

    PUBLIC = "P"
    SUPPRESSED = "S"
    STATUS_CHOICES = (
        (PUBLIC, "Public"),
        (SUPPRESSED, "Suppressed"),
    )
    #: status of record; currently choices are public or suppressed
    status = models.CharField(
        max_length=2,
        choices=STATUS_CHOICES,
        default=PUBLIC,
        help_text="Changing status to suppressed will remove rsync data "
        + "for that volume and remove from the public index. This is "
        + "currently not reversible; use with caution.",
    )

    FULL = "F"
    EXCERPT = "E"
    ARTICLE = "A"
    ITEMTYPE_CHOICES = (
        (FULL, "Full work"),
        (EXCERPT, "Excerpt"),
        (ARTICLE, "Article"),
    )
    #: type of record, whether excerpt, article, or full; defaults to full
    item_type = models.CharField(
        max_length=1,
        choices=ITEMTYPE_CHOICES,
        default=FULL,
        help_text="Portion of the work that is included; "
        + "used to determine icon for public display.",
    )
    #: book or journal title for excerpt or article
    book_journal = models.TextField(
        "Book/Journal title",
        help_text="title of the book or journal that includes "
        + "this content (excerpt/article only)",
        blank=True,
    )
    pages_orig = models.CharField(
        "Page range (original)",
        max_length=255,
        help_text="Page range in the original work (for display and citation).",
        blank=True,
    )
    pages_digital = models.CharField(
        "Page range (digital edition)",
        max_length=255,
        help_text="Sequence of pages in the digital edition. "
        + "Use full digits for start and end separated by a dash (##-##); "
        + "for multiple sequences, separate ranges by a comma (##-##, ##-##). "
        + "NOTE: removing page range may have unexpected results.",
        blank=True,
        validators=[validate_page_range],
    )
    old_workid = models.CharField(
        "Old Work ID",
        max_length=255,
        help_text="past work id; used for excerpts previously "
        + "identified by start of digital page range",
        blank=True,
    )

    # use custom queryset
    objects = DigitizedWorkQuerySet.as_manager()

    class Meta:
        ordering = ("sort_title",)
        # require unique combination of source id + page range,
        # since we need to allow multiple excerpts from the same source
        constraints = [
            models.UniqueConstraint(
                fields=["source_id", "pages_digital"], name="unique_sourceid_pagerange"
            ),
            # we are now using original page range for unique id,
            # so require source id + pages_orig to be unique
            models.UniqueConstraint(
                fields=["source_id", "pages_orig"], name="unique_sourceid_pages_orig"
            ),
        ]
    def get_absolute_url(self):
        """
        Return object's url for
        :class:`ppa.archive.views.DigitizedWorkDetailView`
        """
        url_opts = {"source_id": self.source_id}
        # start page must be specified if set but must not be included if empty
        if self.pages_orig:
            url_opts["start_page"] = self.first_page()
        return reverse("archive:detail", kwargs=url_opts)
    def __str__(self):
        """Default string display. Uses :attr:`source_id` and
        :attr:`pages_orig` if any"""
        if self.pages_orig:
            return "%s (%s)" % (self.source_id, self.pages_orig)
        return self.source_id

    @property
    def index_cluster_id(self):
        """
        Convenience property to get a string representation of the cluster
        (or of this work if it has no cluster). Reduces redundancy elsewhere.
        """
        return str(self.cluster) if self.cluster else self.index_id()
    def clean_fields(self, exclude=None):
        if not exclude or "pages_digital" not in exclude:
            # normalize whitespace before applying regex validation
            self.pages_digital = " ".join(self.pages_digital.strip().split())
        super().clean_fields(exclude=exclude)
    @property
    def is_suppressed(self):
        """Item has been suppressed (based on :attr:`status`)."""
        return self.status == self.SUPPRESSED
    def display_title(self):
        """admin display title to allow displaying title but sorting on sort_title"""
        return self.title
    display_title.short_description = "title"
    display_title.admin_order_field = "sort_title"
    def is_public(self):
        """admin display field indicating if record is public or suppressed"""
        return self.status == self.PUBLIC
    is_public.short_description = "Public"
    is_public.boolean = True
    is_public.admin_order_field = "status"

    #: regular expression for cleaning preliminary text from publisher names
    printed_by_re = (
        r"^(Printed)?( and )?(Pub(.|lished|lisht)?)?( and sold)? (by|for|at)( the)? ?"
    )
    # Printed by/for (the); Printed and sold by; Printed and published by;
    # Pub./Published/Publisht at/by/for the

    pubyear_re = re.compile(r"(?P<year>\d{4})")

    @property
    def has_fulltext(self):
        """Checks if an item has full text (currently only items from
        HathiTrust or Gale)."""
        return self.source in [self.HATHI, self.GALE]

    @cached_property
    def hathi(self):
        """:class:`ppa.archive.hathi.HathiObject` for HathiTrust records,
        for working with data in HathiTrust pairtree data structure."""
        if self.source == self.HATHI:
            return HathiObject(self.source_id)
        return None

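    # Illustrative sketch (assumption, not part of the original module): how
    # printed_by_re strips leading imprint boilerplate from publisher names.
    #
    #   re.sub(DigitizedWork.printed_by_re, "", "Printed for the Author",
    #          flags=re.IGNORECASE)   # -> "Author"
    #   re.sub(DigitizedWork.printed_by_re, "", "Published by J. Johnson",
    #          flags=re.IGNORECASE)   # -> "J. Johnson"
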
    def save(self, *args, **kwargs):
        # if status has changed so that object is now suppressed,
        # do some cleanup
        if self.has_changed("status") and self.status == DigitizedWork.SUPPRESSED:
            # remove indexed page content from Solr using index id
            # (i.e., if excerpt, should only remove content for this excerpt,
            # not all excerpts in this volume)
            self.solr.update.delete_by_query('group_id_s:"%s"' % self.index_id())
            # if this is a HathiTrust item, remove pairtree data
            if self.source == DigitizedWork.HATHI:
                # if this is a full work (not excerpted), remove;
                # if this is an excerpt, should only remove if there are no other
                # public excerpts from this volume
                if (
                    self.item_type == DigitizedWork.FULL
                    or not DigitizedWork.objects.filter(
                        status=DigitizedWork.PUBLIC, source_id=self.source_id
                    )
                    .exclude(pk=self.pk)
                    .exists()
                ):
                    self.hathi.delete_pairtree_data()

        # Solr identifier is based on combination of source id and first page;
        # if either changes, remove the old record from Solr before saving
        # with the new identifier
        if self.has_changed("source_id") or self.has_changed("pages_digital"):
            # store the updated values
            new_source_id = self.source_id
            new_pages_digital = self.pages_digital
            # temporarily revert to previous value to remove from index
            self.source_id = self.initial_value("source_id")
            self.pages_digital = self.initial_value("pages_digital")
            self.remove_from_index()
            # restore new values
            self.source_id = new_source_id
            self.pages_digital = new_pages_digital

        # if excerpt page range has changed
        # OR this is a new record with a page range
        if self.has_changed("pages_digital") or (
            self.pk is None and self.pages_digital
        ):
            # update the page count if possible (i.e., not a Gale record)
            self.page_count = self.count_pages()
            # if page range changed on existing record, clear out old index
            if self.pages_digital and self.pk is not None:
                # update index to remove all pages that are no longer in range
                self.solr.update.delete_by_query(
                    'source_id:"%s" AND item_type:page NOT order:(%s)'
                    % (self.source_id, " OR ".join(str(p) for p in self.page_span))
                )
            # any page range change requires reindexing (potentially slow)
            if self.pk is None:
                logger.debug("Indexing pages for new excerpt %s", self)
            else:
                logger.debug("Reindexing pages for %s after change to page range", self)
            self.index_items(Page.page_index_data(self))
            # NOTE: removing a page range may not work as expected
            # (does not recalculate page count; cannot recalculate for Gale items)

        super().save(*args, **kwargs)
    def clean(self):
        """Add custom validation to trigger a save error in the admin
        if someone tries to unsuppress a record that has been suppressed
        (not yet supported)."""
        if self.has_changed("status") and self.status != self.SUPPRESSED:
            raise ValidationError("Unsuppressing records not yet supported.")

        # should not be editable in admin, but add a validation check
        # just in case
        if self.has_changed("source_id") and self.source == self.HATHI:
            raise ValidationError(
                "Changing source ID for HathiTrust records is not supported"
            )

        # if original page range is set, check that first page is unique
        if self.pages_orig:
            first_page = self.first_page_original()
            # check for other excerpts in this work with the same first page
            other_excerpts = DigitizedWork.objects.filter(
                source_id=self.source_id
            ).by_first_page_orig(first_page)
            # if this record has already been saved, exclude it when checking
            if self.pk:
                other_excerpts = other_excerpts.exclude(pk=self.pk)
            if other_excerpts.exists():
                raise ValidationError(
                    {
                        "pages_orig": f"First page {first_page} is not unique for this source",
                    }
                )
    def compare_protected_fields(self, db_obj):
        """Compare protected fields in a
        :class:`ppa.archive.models.DigitizedWork` instance and return
        those that are changed.

        :param object db_obj: Database instance of a
            :class:`~ppa.archive.models.DigitizedWork`.
        """
        changed_fields = []
        # if a field has changed, append to changed fields
        for field in ProtectedWorkFieldFlags.all_flags:
            # field is in format of ProtectedWorkFieldFlags.title
            field_name = str(field)
            # if obj has a different value for a protected field
            # than its db counterpart
            if getattr(self, field_name) != getattr(db_obj, field_name):
                # append as a now protected field
                changed_fields.append(field_name)
        return changed_fields
    def populate_fields(self, field_data):
        """Conditionally update fields as protected by flags using Hathi
        bibdata information.

        :param dict field_data: A dictionary of fields updated from a
            :class:`ppa.archive.hathi.HathiBibliographicRecord` instance.
        """
        protected_fields = [str(field) for field in self.protected_fields]
        for field, value in field_data.items():
            if field not in protected_fields:
                setattr(self, field, value)

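    # Illustrative sketch (assumption, not from the original module): fields
    # flagged as protected are skipped by populate_fields.
    #
    #   work.protected_fields = ProtectedWorkFieldFlags.title
    #   work.populate_fields({"title": "New Title", "pub_date": 1850})
    #   # work.title is unchanged; work.pub_date is now 1850
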
    def metadata_from_marc(self, marc_record, populate=True):
        """Get metadata from MARC record and return a dictionary of the data.
        When populate is True, calls :meth:`populate_fields` to set values."""
        # create dictionary to store bibliographic information
        field_data = {}
        # set title and subtitle from marc if possible
        # - clean title: strip trailing space & slash and initial bracket
        field_data["title"] = marc_record["245"]["a"].rstrip(" /").lstrip("[")

        # according to PUL CAMS,
        # 245 subfield b contains the subtitle *if* the preceding field
        # ends with a colon. (Otherwise could be a parallel title,
        # e.g. title in another language).
        # HOWEVER: metadata from Hathi doesn't seem to follow this
        # pattern (possibly due to records being older?)

        # subfields is a list of code, value, code, value
        # iterate in paired steps of two starting with first and second
        # for code, value in zip(marc_record['245'].subfields[0::2],
        #                        marc_record['245'].subfields[1::2]):
        #     if code == 'b':
        #         break
        #     preceding_character = value[-1:]
        # if preceding_character == ':':
        #     self.subtitle = marc_record['245']['b'] or ''
        # NOTE: skipping preceding character check for now
        field_data["subtitle"] = marc_record["245"]["b"] or ""
        # strip trailing space & slash from subtitle
        field_data["subtitle"] = field_data["subtitle"].rstrip(" /")

        # indicator 2 provides the number of characters to be
        # skipped when sorting (could be 0)
        try:
            non_sort = int(marc_record["245"].indicators[1])
        except ValueError:
            # at least one record has a space here instead of a number
            # probably a data error, but handle it
            # - assuming no non-sort characters
            non_sort = 0

        # strip whitespace, since a small number of records have a
        # nonsort value that doesn't include a space after a
        # definite article.
        # Also strip punctuation, since MARC only includes it in
        # non-sort count when there is a definite article.
        field_data["sort_title"] = marc_record.title()[non_sort:].strip(' "[')
        field_data["author"] = marc_record.author() or ""
        # remove a note present on some records and strip whitespace
        field_data["author"] = (
            field_data["author"].replace("[from old catalog]", "").strip()
        )
        # removing trailing period, except when it is part of an
        # initial or known abbreviation (i.e., Esq.)
        # Look for single initial, but support initials with no spaces
        if field_data["author"].endswith(".") and not re.search(
            r"( ([A-Z]\.)*[A-Z]| Esq)\.$", field_data["author"]
        ):
            field_data["author"] = field_data["author"].rstrip(".")

        # field 260 includes publication information
        if "260" in marc_record:
            # strip trailing punctuation from publisher and pub place
            # subfield $a is place of publication
            field_data["pub_place"] = marc_record["260"]["a"] or ""
            field_data["pub_place"] = field_data["pub_place"].rstrip(";:,")
            # if place is marked as unknown ("sine loco"), leave empty
            if field_data["pub_place"].lower() == "[s.l.]":
                field_data["pub_place"] = ""
            # subfield $b is name of publisher
            field_data["publisher"] = marc_record["260"]["b"] or ""
            field_data["publisher"] = field_data["publisher"].rstrip(";:,")
            # if publisher is marked as unknown ("sine nomine"), leave empty
            if field_data["publisher"].lower() == "[s.n.]":
                field_data["publisher"] = ""

            # remove printed by statement before publisher name,
            # then strip any remaining whitespace
            field_data["publisher"] = re.sub(
                self.printed_by_re, "", field_data["publisher"], flags=re.IGNORECASE
            ).strip()

        # Gale/ECCO dates may include non-numeric text, e.g. MDCCLXXXVIII. [1788]
        # try as numeric first, then extract year with regex
        pubdate = marc_record.pubyear()
        # at least one case returns None here,
        # which results in a TypeError on attempted conversion to integer
        if pubdate:
            try:
                field_data["pub_date"] = int(pubdate)
            except ValueError:
                yearmatch = self.pubyear_re.search(pubdate)
                if yearmatch:
                    field_data["pub_date"] = int(yearmatch.groupdict()["year"])

        # remove brackets around inferred publishers, place of publication
        # *only* if they wrap the whole text
        for field in ["publisher", "pub_place"]:
            if field in field_data:
                field_data[field] = re.sub(
                    r"^\[(.*)\]$", r"\1", field_data[field]
                ).strip()

        if populate:
            # conditionally update fields that are protected (or not)
            self.populate_fields(field_data)

        return field_data

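    # Illustrative sketch (assumption, not from the original module): the
    # fallback year extraction for non-numeric imprint dates.
    #
    #   int("MDCCLXXXVIII. [1788]")   # raises ValueError
    #   DigitizedWork.pubyear_re.search("MDCCLXXXVIII. [1788]").groupdict()
    #   # -> {"year": "1788"}
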
    def populate_from_bibdata(self, bibdata):
        """Update record fields based on Hathi bibdata information.
        Full record is required in order to set all fields.

        :param bibdata: bibliographic data returned from HathiTrust
            as instance of :class:`ppa.archive.hathi.HathiBibliographicRecord`
        """
        # create dictionary to store bibliographic information
        field_data = {}
        # store hathi record id
        field_data["record_id"] = bibdata.record_id

        # set fields from marc if available, since it has more details
        if bibdata.marcxml:
            # get metadata from marcxml, but don't save it yet
            field_data.update(self.metadata_from_marc(bibdata.marcxml, populate=False))
        else:
            # fallback behavior, if marc is not available
            # use dublin core title
            field_data["title"] = bibdata.title
            # could guess at non-sort, but hopefully unnecessary
            # pub date returned in api JSON is list; use first for now (if available)
            if bibdata.pub_dates:
                field_data["pub_date"] = bibdata.pub_dates[0]

        copy_details = bibdata.copy_details(self.source_id)
        # hathi version/volume information for this specific copy of a work
        field_data["enumcron"] = copy_details["enumcron"] or ""
        # hathi source url can currently be inferred from htid, but is
        # included in the bibdata in case it changes - so let's just store it
        field_data["source_url"] = copy_details["itemURL"]

        # should also consider storing:
        # - last update, rights code / rights string, item url
        #   (maybe solr only?)

        # conditionally update fields that are protected (or not)
        self.populate_fields(field_data)
    index_depends_on = {
        "collections": {
            "post_save": SignalHandlers.collection_save,
            "pre_delete": SignalHandlers.collection_delete,
        },
        "cluster": {
            "post_save": SignalHandlers.cluster_save,
            "pre_delete": SignalHandlers.cluster_delete,
        },
        "archive.DigitizedWork": {
            "post_save": SignalHandlers.handle_digwork_cluster_change
        },
    }
    def first_page(self):
        """Number of the first page in range, if this is an excerpt
        (first of original page range, not digital)"""
        return self.first_page_original()
    def first_page_digital(self):
        """Number of the first page in range (digital pages / page index),
        if this is an excerpt.

        :return: first page number for digital page range; None if no page range
        :rtype: int, None
        """
        if self.pages_digital:
            return list(self.page_span)[0]
    def first_page_original(self):
        """Number of the first page in range (original page numbering),
        if this is an excerpt.

        :return: first page number for original page range; None if no page range
        :rtype: str, None
        """
        # use regex since it handles all cases (intspan only works for a subset)
        match = re.match(r"([\da-z]+)([,-]|\b)", self.pages_orig)
        if match:
            return match.group(1)

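    # Illustrative sketch (assumption, not from the original module): the
    # original page range may include non-numeric folio numbering, which is
    # why a regex is used here instead of intspan.
    #
    #   re.match(r"([\da-z]+)([,-]|\b)", "233-244").group(1)    # "233"
    #   re.match(r"([\da-z]+)([,-]|\b)", "xvi, 3-10").group(1)  # "xvi"
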
    def index_id(self):
        """use source id + first page in range (if any) as solr identifier"""
        first_page = self.first_page()
        if first_page:
            return "%s-p%s" % (self.source_id, first_page)
        return self.source_id
    @classmethod
    def index_item_type(cls):
        """override index item type label to just work"""
        return "work"
    @classmethod
    def items_to_index(cls):
        """Queryset of works for indexing everything; excludes suppressed works."""
        return (
            DigitizedWork.objects.exclude(status=cls.SUPPRESSED)
            .select_related("cluster")
            .prefetch_related("collections")
        )
    # NOTE: prefetch_related is ignored when used with Iterator,
    # which parasolr indexing does
    # specify chunk size; using previous django iterator default
    index_chunk_size = 2000
    @classmethod
    def prep_index_chunk(cls, chunk):
        # prefetch collections when indexing in chunks
        # (method modifies queryset in place)
        models.prefetch_related_objects(chunk, "collections")
        return chunk
    def index_data(self):
        """data for indexing in Solr"""
        # When an item has been suppressed, return id only.
        # This will blank out any previously indexed values, and the item
        # will not be findable by any public searchable fields.
        if self.status == self.SUPPRESSED:
            return {"id": self.source_id}

        index_id = self.index_id()
        return {
            "id": index_id,
            "source_id": self.source_id,
            "first_page_s": self.first_page(),
            "group_id_s": index_id,  # for grouping pages by work or excerpt
            "source_t": self.get_source_display(),
            "source_url": self.source_url,
            "title": self.title,
            "subtitle": self.subtitle,
            "sort_title": self.sort_title,
            "pub_date": self.pub_date,
            "pub_place": self.pub_place,
            "publisher": self.publisher,
            "enumcron": self.enumcron,
            "author": self.author,
            # set default value to simplify queries to find uncollected items
            # (not set in Solr schema because needs to be works only)
            "collections": [collection.name for collection in self.collections.all()]
            if self.collections.exists()
            else [NO_COLLECTION_LABEL],
            "cluster_id_s": self.index_cluster_id,
            # public notes field for display on site
            "notes": self.public_notes,
            # hard-coded to distinguish from & sort with pages
            "item_type": "work",
            "order": "0",
            "work_type_s": self.get_item_type_display()
            .lower()
            .replace(" ", "-"),  # full-work, excerpt, or article
            "book_journal_s": self.book_journal,
        }
    def remove_from_index(self):
        """Remove the current work and associated pages from Solr index"""
        # Default parasolr logic only removes current item record;
        # we need to remove associated pages as well
        logger.debug(
            "Deleting DigitizedWork and associated pages from index with group_id %s",
            self.index_id(),
        )
        self.solr.update.delete_by_query('group_id_s:("%s")' % self.index_id())
    def count_pages(self, ptree_client=None):
        """Count the number of pages for a digitized work. If pages are
        specified for an excerpt or article, page count is determined based
        on the number of pages in the combined ranges. Otherwise, page count
        is based on the number of files in the zipfile within the pairtree
        content (Hathi-specific). Raises
        :class:`pairtree.storage_exceptions.ObjectNotFoundException` if the
        data is not found in the pairtree storage. Returns page count found;
        updates the `page_count` attribute on the current instance, but does
        NOT save the object."""
        # if this item has a page span defined, calculate number of pages
        # based on the number of pages across all spans
        if self.page_span:
            return len(self.page_span)

        if not self.source == DigitizedWork.HATHI:
            raise storage_exceptions.ObjectNotFoundException(
                "Using Hathi-specific page count for non-Hathi item"
            )

        if not ptree_client:
            ptree_client = self.hathi.pairtree_client()

        # count the files in the zipfile
        start = time.time()
        # could raise pairtree exception, but allow calling context to catch
        with ZipFile(self.hathi.zipfile_path(ptree_client)) as ht_zip:
            # some aggregate packages retrieved from Data API
            # include jp2 and xml files as well as txt; only count text
            page_count = len(
                [
                    filename
                    for filename in ht_zip.namelist()
                    if filename.endswith(".txt")
                ]
            )
        logger.debug(
            "Counted %d pages in zipfile in %f sec", page_count, time.time() - start
        )
        # NOTE: could also count pages via mets file, but that's slower
        # than counting via zipfile name list

        # update page count on the instance, but don't save changes
        self.page_count = page_count
        # return the total
        return page_count
    @property
    def page_span(self):
        # TODO: relabel to make it explicit that this is digital pages?
        # convert the specified page numbers into an intspan;
        # if empty, returns an empty set
        return intspan(self.pages_digital)

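    # Illustrative sketch (assumption, not from the original module):
    # intspan gives cheap membership tests and a combined page count.
    #
    #   span = intspan("10-12,20-21")
    #   len(span)    # 5
    #   11 in span   # True
    #   15 in span   # False
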
    def get_metadata(self, metadata_format):
        """Get metadata for this item in the specified format.
        Currently only supports marc."""
        if metadata_format == "marc":
            # get metadata from hathi bib api and serialize
            # as binary marc
            if self.source == DigitizedWork.HATHI:
                bib_api = HathiBibliographicAPI()
                bibdata = bib_api.record("htid", self.source_id)
                return bibdata.marcxml.as_marc()
            if self.source == DigitizedWork.GALE:
                # get record from local marc pairtree storage using ESTC id
                # (stored as record id)
                try:
                    record = get_marc_record(self.record_id)
                    # specify encoding to avoid errors
                    record.force_utf8 = True
                    return record.as_marc()
                except MARCRecordNotFound:
                    logger.warning(
                        "MARC record for %s/%s not found"
                        % (self.source_id, self.record_id)
                    )
                    return ""
            return ""

        # error for unknown
        raise ValueError("Unsupported format %s" % metadata_format)
    @staticmethod
    def add_from_hathi(htid, bib_api=None, update=False, log_msg_src=None, user=None):
        """Add or update a HathiTrust work in the database.
        Retrieves bibliographic data from the Hathi api, retrieves or creates
        a :class:`DigitizedWork` record, and populates the metadata if this is
        a new record, if the Hathi metadata has changed, or if update is
        requested. Creates an admin log entry to document record creation or
        update. Raises :class:`ppa.archive.hathi.HathiItemNotFound` for an
        invalid id.

        Returns the new or updated
        :class:`~ppa.archive.models.DigitizedWork`.

        :param htid: HathiTrust record identifier
        :param bib_api: optional :class:`~ppa.archive.hathi.HathiBibliographicAPI`
            instance, to allow for shared sessions in scripts
        :param update: update bibliographic metadata even if the hathitrust
            record is not newer than the local database record (default: False)
        :param log_msg_src: source of the change, to be included in log entry
            messages (optional). Will be used as "Created/updated [log_msg_src]".
        :param user: optional user responsible for the change, to be associated
            with the :class:`~django.contrib.admin.models.LogEntry` record
        """
        # initialize new bibliographic API if none is passed in
        bib_api = bib_api or HathiBibliographicAPI()
        # set a default log message source if not specified
        log_msg_src = log_msg_src or "from HathiTrust bibliographic data"

        # get bibliographic data for this record from Hathi api
        # - needed to check if update is required for existing records,
        #   and to populate metadata for new records
        # could raise HathiItemNotFound for invalid id
        bibdata = bib_api.record("htid", htid)

        # if hathi id is valid and we have bibliographic data, create
        # a new record
        # find existing record or create a new one
        # @NOTE @BUG: This is sometimes returning >1 entry and failing.
        # Need to find why
        digwork, created = DigitizedWork.objects.get_or_create(source_id=htid)

        # get configured script user for log entries if no user passed in
        if not user:
            user = User.objects.get(username=settings.SCRIPT_USERNAME)

        # if this is an existing record, check if updates are needed
        source_updated = None
        if not created and not update:
            source_updated = bibdata.copy_last_updated(htid)
            if digwork.updated.date() > source_updated:
                # local copy is newer than last source modification date
                # and update is not requested; return unmodified
                return digwork

        # populate digitized item in the database
        digwork.populate_from_bibdata(bibdata)
        digwork.save()

        # create a log entry to document record creation or change
        # if created, action is addition and message is creation
        log_change_message = "Created %s" % log_msg_src
        log_action = ADDITION
        # if this was not a new record, log as an update
        if not created:
            # create log entry for updating an existing record
            # include details about why the update happened if possible
            if update:
                msg_detail = " (forced update)"
            else:
                msg_detail = "; source record last updated %s" % source_updated
            log_change_message = "Updated %s%s" % (log_msg_src, msg_detail)
            log_action = CHANGE

        # create log entry for record creation or update
        LogEntry.objects.log_action(
            user_id=user.id,
            content_type_id=ContentType.objects.get_for_model(digwork).pk,
            object_id=digwork.pk,
            object_repr=str(digwork),
            change_message=log_change_message,
            action_flag=log_action,
        )

        return digwork

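    # Illustrative usage sketch (htid and log message values are hypothetical,
    # not from the original module):
    #
    #   digwork = DigitizedWork.add_from_hathi(
    #       "mdp.39015012345678", update=True, log_msg_src="via import script"
    #   )
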
class Page(Indexable):
    """Indexable for pages to make page data available for indexing with
    parasolr index manage command."""

    index_chunk_size = 2000
    @classmethod
    def items_to_index(cls):
        """Return a generator of page data to be indexed, with data for
        pages for each work returned by :meth:`Page.page_index_data`
        """
        for work in DigitizedWork.items_to_index():
            for page_data in Page.page_index_data(work):
                yield page_data
    @classmethod
    def total_to_index(cls):
        """Calculate the total number of pages to be indexed by aggregating
        page count of items to index in the database."""
        return (
            DigitizedWork.items_to_index().aggregate(
                total_pages=models.Sum("page_count")
            )["total_pages"]
            or 0
        )
    @classmethod
    def index_item_type(cls):
        """index item type for parasolr indexing script"""
        return "page"
    @classmethod
    def page_index_data(cls, digwork):
        """Get page content for the specified digitized work from Hathi
        pairtree and return data to be indexed in solr."""
        # TODO: how to share common fields/logic across sources?
        # Only index pages for items that are not suppressed
        if not digwork.is_suppressed:
            # get index page data based on the source
            if digwork.source == digwork.HATHI:
                return cls.hathi_page_index_data(digwork)
            if digwork.source == digwork.GALE:
                return cls.gale_page_index_data(digwork)
        # return an empty list for anything else
        return []
    @classmethod
    def hathi_page_index_data(cls, digwork):
        """Get page content for the specified digitized work from Hathi
        pairtree and return data to be indexed in solr."""
        # load mets record to pull metadata about the images
        try:
            mmets = digwork.hathi.mets_xml()
        except storage_exceptions.ObjectNotFoundException:
            logger.error(
                "Pairtree data for %s not found but status is %s",
                digwork.source_id,
                digwork.get_status_display(),
            )
            return

        # get page span from digitized work
        page_span = digwork.page_span
        digwork_index_id = digwork.index_id()
        # digwork index id is fallback for cluster, since it is used
        # to collapse works and pages that belong together

        # read zipfile contents in place, without unzipping
        try:
            zpath = digwork.hathi.zipfile_path()
        except storage_exceptions.PartNotFoundException:
            # missing file inside pairtree for this work
            logger.error(f"Missing pairtree data for: {digwork}")
            return

        with ZipFile(zpath) as ht_zip:
            # yield a generator of index data for each page; iterate
            # over pages in METS structmap
            for i, page in enumerate(mmets.structmap_pages, 1):
                # if the document has a page range defined, skip any pages not in range
                if page_span and i not in page_span:
                    continue
                # zipfile spec uses / for path regardless of OS
                pagefilename = "/".join(
                    [digwork.hathi.content_dir, page.text_file_location]
                )
                try:
                    with ht_zip.open(pagefilename) as pagefile:
                        try:
                            yield {
                                "id": "%s.%s"
                                % (digwork_index_id, page.text_file.sequence),
                                "source_id": digwork.source_id,
                                # for grouping with work record
                                "group_id_s": digwork_index_id,
                                # for grouping with cluster
                                "cluster_id_s": digwork.index_cluster_id,
                                "content": pagefile.read().decode("utf-8"),
                                "order": page.order,
                                "label": page.display_label,
                                "tags": page.label.split(", ") if page.label else [],
                                "item_type": "page",
                            }
                        except StopIteration:
                            return
                except KeyError:
                    # we know of one HathiTrust work (uc1.$b31619) where
                    # the METS references pages that are not present in the zip file;
                    # they are at the end of the document and don't have any
                    # page content, so log a warning but don't treat as an error
                    logger.warning(
                        "Indexing %s pages: "
                        + "%s referenced in METS but not found in zip file",
                        digwork,
                        pagefilename,
                    )
    @classmethod
    def gale_page_index_data(cls, digwork, gale_record=None):
        """Get page content for the specified digitized work from Gale API
        and return data to be indexed in solr. Takes an optional gale_record
        parameter (item record as returned by Gale API), to avoid making an
        extra API call if data is already available."""
        if gale_record is None:
            gale_record = GaleAPI().get_item(digwork.source_id)

        # get page span from digitized work
        page_span = digwork.page_span
        digwork_index_id = digwork.index_id()
        # digwork index id is fallback for cluster, since it is used
        # to collapse works and pages that belong together

        for i, page in enumerate(gale_record["pageResponse"]["pages"], 1):
            page_number = page["pageNumber"]
            # folio number not yet set for all volumes; fallback to page number
            page_label = page.get("folioNumber", int(page_number))
            # if the document has a page range defined, skip any pages not in range
            if page_span and i not in page_span:
                continue
            yield {
                "id": "%s.%s" % (digwork_index_id, page_number),
                "source_id": digwork.source_id,
                "group_id_s": digwork_index_id,  # for grouping with work record
                "cluster_id_s": digwork.index_cluster_id,  # for grouping with cluster
                "content": page.get("ocrText"),  # some pages have no text
                "order": i,
                "label": page_label,
                "item_type": "page",
                # image id needed for thumbnail url; use solr dynamic field
                "image_id_s": page["image"]["id"],
            }