Source code for ppa.archive.hathi

"""
Utilities for working with HathiTrust materials and APIs.
"""
import io
import logging
import os.path
import time
from datetime import datetime
from zipfile import ZipFile

import pymarc
import requests
from cached_property import cached_property
from django.conf import settings
from eulxml import xmlmap
from pairtree import pairtree_client, pairtree_path, storage_exceptions

from ppa import __version__ as ppa_version

logger = logging.getLogger(__name__)


class HathiItemNotFound(Exception):
    """Signals that an item was not found in the bibliographic or data API."""
class HathiItemForbidden(Exception):
    """Signals that permission was denied to access an item in the data API."""
class HathiBaseAPI:
    """Base client class for HathiTrust APIs.

    Creates a :class:`requests.Session` with a project-specific
    ``User-Agent`` header (and a ``From`` header when
    ``settings.TECHNICAL_CONTACT`` is set) and provides
    :meth:`_make_request` for GET requests relative to :attr:`api_root`.
    """

    #: base api URL for all requests
    api_root = ""

    def __init__(self):
        # create a request session, for request pooling
        self.session = requests.Session()
        # set a user-agent header, but preserve requests version information
        headers = {
            "User-Agent": "ppa-django/%s (%s)"
            % (ppa_version, self.session.headers["User-Agent"])
        }
        # include technical contact as From header, if set
        tech_contact = getattr(settings, "TECHNICAL_CONTACT", None)
        if tech_contact:
            headers["From"] = tech_contact
        self.session.headers.update(headers)

    def __del__(self):
        # close the request session; the attribute is missing when
        # __init__ failed or was bypassed, and __del__ must not raise then
        session = getattr(self, "session", None)
        if session is not None:
            session.close()

    def _make_request(self, url, params=None):
        """Make a GET request with the configured session.

        :param url: url relative to :attr:`api_root`
        :param params: optional dictionary of query parameters
        :returns: the response for status 200 OK; ``None`` for any status
            other than 200, 404, or 403 (a warning is logged)
        :raises: :class:`HathiItemNotFound` for 404
        :raises: :class:`HathiItemForbidden` for 403
        """
        url = "%s/%s" % (self.api_root, url)
        rqst_opts = {}
        if params:
            rqst_opts["params"] = params
        start = time.time()
        resp = self.session.get(url, **rqst_opts)
        logger.debug("get %s %s: %f sec", url, resp.status_code, time.time() - start)
        if resp.status_code == requests.codes.ok:
            return resp
        if resp.status_code == requests.codes.not_found:
            raise HathiItemNotFound
        if resp.status_code == requests.codes.forbidden:
            raise HathiItemForbidden
        # any other status previously fell through without a trace;
        # keep returning None for compatibility, but leave a record
        logger.warning("get %s returned unexpected status %s", url, resp.status_code)
        return None
class HathiBibliographicAPI(HathiBaseAPI):
    """Wrapper for HathiTrust Bibliographic API.

    https://www.hathitrust.org/bib_api
    """

    api_root = "http://catalog.hathitrust.org/api"

    def _get_record(self, mode, id_type, id_value):
        """Request a single record and wrap the JSON result.

        :param mode: record mode used in the url; ``brief`` or ``full``
            as passed by :meth:`brief_record` and :meth:`record`
        :param id_type: type of identifier, used as a url path segment
        :param id_value: identifier value, used as a url path segment
        :returns: :class:`HathiBibliographicRecord`
        :raises: :class:`HathiItemNotFound` when the response contains
            no records
        """
        url = "volumes/%(mode)s/%(id_type)s/%(id_value)s.json" % {
            "mode": mode,
            "id_type": id_type,
            "id_value": id_value,  # NOTE: / in ark ids is *not* escaped
        }
        resp = self._make_request(url)
        # parse the response body once and reuse it for both the
        # emptiness check and the record wrapper
        data = resp.json()
        # for an invalid id, hathi seems to return a 200 ok
        # but json has no records
        if not data.get("records", None):
            raise HathiItemNotFound
        return HathiBibliographicRecord(data)

    def brief_record(self, id_type, id_value):
        """Get brief record by id type and value.

        :returns: :class:`HathiBibliographicRecord`
        :raises: :class:`HathiItemNotFound`
        """
        return self._get_record("brief", id_type, id_value)

    def record(self, id_type, id_value):
        """Get full record by id type and value.

        :returns: :class:`HathiBibliographicRecord`
        :raises: :class:`HathiItemNotFound`
        """
        return self._get_record("full", id_type, id_value)

    # also possible: get multiple records at once
class HathiBibliographicRecord:
    """Representation of a HathiTrust bibliographic record.

    :param data: parsed JSON from a bibliographic API lookup; must have
        a non-empty ``records`` mapping. The ``items`` list is read by
        :meth:`copy_details`.
    """

    def __init__(self, data):
        self._data = data
        # for a single bib api json result, we only want the first item
        self.record_id = list(data["records"].keys())[0]
        self.info = list(data["records"].values())[0]

    @property
    def title(self):
        """First title (standard title)"""
        # returns list of titles - standard title; could also have
        # title without leading article and other language titles
        return self.info["titles"][0]

    @property
    def pub_dates(self):
        """list of available publication dates"""
        return self.info["publishDates"]

    def copy_details(self, htid):
        """Details for a specific copy identified by hathi id.

        :returns: the matching entry from the ``items`` list, or ``None``
            if no item has the requested htid
        """
        for item in self._data["items"]:
            if item["htid"] == htid:
                return item
        return None

    def copy_last_updated(self, htid):
        """Return last update date for a specific copy identified by
        hathi id.

        :returns: :class:`datetime.date`, or ``None`` if the record has
            no copy with the requested htid
        """
        # get last update from copy details; an unknown htid has no
        # details, so there is no date to report
        details = self.copy_details(htid)
        if details is None:
            return None
        # use datetime to parse, then return just the date
        return datetime.strptime(details["lastUpdate"], "%Y%m%d").date()

    @cached_property
    def marcxml(self):
        """Record marcxml if included (full records only), as an instance of
        :class:`pymarc.Record`; ``None`` when the record has no marc-xml."""
        marcxml = self._data["records"][self.record_id].get("marc-xml", None)
        if marcxml:
            return pymarc.parse_xml_to_array(io.StringIO(marcxml))[0]
        return None
class _METS(xmlmap.XmlObject):
    """Base :class:`~eulxml.xmlmap.XmlObject` with the METS namespace
    configured under the ``m`` prefix."""

    ROOT_NAMESPACES = {"m": "http://www.loc.gov/METS/"}
class StructMapPage(_METS):
    """Single logical page within a METS StructMap"""

    #: page order
    order = xmlmap.IntegerField("@ORDER")
    #: page label
    label = xmlmap.StringField("@LABEL")
    #: order label
    orderlabel = xmlmap.StringField("@ORDERLABEL")
    #: identifier for a text or ocr file, from a file pointer
    text_file_id = xmlmap.StringField(
        'm:fptr/@FILEID[contains(., "TXT") or contains(. , "OCR")]'
    )

    ## example struct map page
    """<METS:div ORDER="1" LABEL="FRONT_COVER, IMAGE_ON_PAGE, IMPLICIT_PAGE_NUMBER" TYPE="page">
    <METS:fptr FILEID="HTML00000001"/>
    <METS:fptr FILEID="TXT00000001"/>
    <METS:fptr FILEID="IMG00000001"/>
    <METS:file SIZE="1003" ID="HTML00000496" MIMETYPE="text/html" CREATED="2017-03-20T10:40:21Z" CHECKSUM="f0a326c10b2a6dc9ae5e3ede261c9897" SEQ="00000496" CHECKSUMTYPE="MD5">
    """  # noqa: E501

    @cached_property
    def display_label(self):
        """page display label; the order label when it is present,
        otherwise the page order as a string"""
        preferred = self.orderlabel
        return preferred if preferred else str(self.order)

    @cached_property
    def text_file(self):
        """:class:`METSFile` corresponding to the text file pointer
        for this page"""
        # look up the file element anywhere in the document whose ID
        # matches this page's text file pointer; use the first match
        matches = self.node.xpath(
            f'//m:file[@ID="{self.text_file_id}"]',
            namespaces=self.ROOT_NAMESPACES,
        )
        return METSFile(matches[0])

    @cached_property
    def text_file_location(self):
        """location for the text file"""
        text_file = self.text_file
        return text_file.location
class METSFile(_METS):
    """File location information within a METS document."""

    #: xml identifier
    id = xmlmap.StringField("@ID")
    #: sequence attribute
    sequence = xmlmap.StringField("@SEQ")
    #: file location
    # NOTE(review): the xlink prefix is not listed in ROOT_NAMESPACES;
    # presumably it is resolved from the document's own namespace
    # declarations -- confirm
    location = xmlmap.StringField("m:FLocat/@xlink:href")

    # example file
    """<METS:file SIZE="1" ID="TXT00000001" MIMETYPE="text/plain" CREATED="2016-06-24T09:04:15Z" CHECKSUM="68b329da9893e34099c7d8ad5cb9c940" SEQ="00000001" CHECKSUMTYPE="MD5">
    """  # noqa: E501
class MinimalMETS(_METS):
    """Minimal :class:`~eulxml.xmlmap.XmlObject` for METS, mapping only
    what is needed to support page indexing for :mod:`ppa`."""

    #: list of struct map pages as :class:`StructMapPage`; page divs
    #: found anywhere under the physical structMap
    structmap_pages = xmlmap.NodeListField(
        'm:structMap[@TYPE="physical"]//m:div[@TYPE="page"]',
        StructMapPage,
    )
class HathiObject:
    """An object for working with a HathiTrust item with data in a
    locally configured pairtree datastore.

    :param hathi_id: HathiTrust record id in the form ``[lib_id].[vol_id]``
    """

    # Pairtree version statement used by pairtree package
    pairtree_version_stmt = (
        "This directory conforms to Pairtree Version 0.1. Updated spec: "
        + "http://www.cdlib.org/inside/diglib/pairtree/pairtreespec.html"
    )

    def __init__(self, hathi_id):
        # HathiTrust record id
        self.hathi_id = hathi_id
        # Identifiers for owning institution and volume which form the overall
        # HathiTrust record id: [lib_id].[vol_id]
        self.lib_id, self.vol_id = hathi_id.split(".", 1)
        # Pairtree prefix
        self.pairtree_prefix = f"{self.lib_id}."
        # Content directory for this work within the appropriate pairtree
        # which is based on a pairtree encoded version of the volume id
        self.content_dir = pairtree_path.id_encode(self.vol_id)

    def pairtree_client(self):
        """Initialize a pairtree client for the pairtree datastore this
        object belongs to, based on its HathiTrust record id.

        When the store directory (``settings.HATHI_DATA`` joined with the
        library id) already exists, the ``pairtree_prefix`` and
        ``pairtree_version0_1`` files are created in it if missing.

        :returns: :class:`pairtree_client.PairtreeStorageClient`
        """
        store_dir = os.path.join(settings.HATHI_DATA, self.lib_id)

        # Check if store_dir exists, check if pairtree files exist
        if os.path.isdir(store_dir):
            # Check if "pairtree_prefix" file exists. If not, create it.
            pairtree_prefix_fn = os.path.join(store_dir, "pairtree_prefix")
            if not os.path.isfile(pairtree_prefix_fn):
                with open(pairtree_prefix_fn, mode="w") as writer:
                    writer.write(self.pairtree_prefix)
            # Check if "pairtree_version0_1" file exists. If not, create it.
            # Note: Mimicking pairtree package's behavior. File contents are
            # not actually verified
            pairtree_vn_fn = os.path.join(store_dir, "pairtree_version0_1")
            if not os.path.isfile(pairtree_vn_fn):
                with open(pairtree_vn_fn, mode="w") as writer:
                    writer.write(self.pairtree_version_stmt)

        return pairtree_client.PairtreeStorageClient(
            self.pairtree_prefix,
            store_dir,
        )

    def pairtree_object(self, ptree_client=None, create=False):
        """get a pairtree object for this record

        :param ptree_client: optional
            :class:`pairtree_client.PairtreeStorageClient` if one has
            already been initialized, to avoid repeated initialization
            (currently used in hathi_import manage command)
        :param create: passed to the client as ``create_if_doesnt_exist``
        """
        if ptree_client is None:
            # get pairtree client if not passed in
            ptree_client = self.pairtree_client()

        # return the pairtree object for current work
        return ptree_client.get_object(self.vol_id, create_if_doesnt_exist=create)

    def delete_pairtree_data(self):
        """Delete pairtree object from the pairtree datastore."""
        logger.info("Deleting pairtree data for %s", self.hathi_id)
        try:
            self.pairtree_client().delete_object(self.vol_id)
        except storage_exceptions.ObjectNotFoundException:
            # data is already gone; warn, but not an error
            logger.warning(
                "Pairtree deletion failed; object not found %s", self.hathi_id
            )

    def _content_path(self, ext, ptree_client=None):
        """path to the first file with the requested extension within
        the hathi contents for this work

        :param ext: filename suffix to match (compared with ``str.endswith``)
        :param ptree_client: optional pairtree client, passed through to
            :meth:`pairtree_object`
        :raises: :class:`storage_exceptions.PartNotFoundException` if no
            part in the content directory ends with ``ext``
        """
        pairtree_obj = self.pairtree_object(ptree_client=ptree_client)
        # - expect a mets file and a zip file
        # - don't rely on them being returned in the same order on every machine
        parts = pairtree_obj.list_parts(self.content_dir)
        # find the first matching file in the list (should only be one)
        filepaths = [part for part in parts if part.endswith(ext)]
        if not filepaths:
            # An error has occurred -- there is no matching file in parts
            raise storage_exceptions.PartNotFoundException
        return os.path.join(
            pairtree_obj.id_to_dirpath(), self.content_dir, filepaths[0]
        )

    def zipfile_path(self, ptree_client=None):
        """path to zipfile within the hathi contents for this work"""
        return self._content_path("zip", ptree_client=ptree_client)

    def metsfile_path(self, ptree_client=None):
        """path to mets xml file within the hathi contents for this work"""
        return self._content_path(".mets.xml", ptree_client=ptree_client)

    def mets_xml(self) -> "MinimalMETS":
        """load METS xml file from pairtree and initialize as an instance
        of :class:`MinimalMETS`

        :rtype: :class:`MinimalMETS`
        :raises: :class:`storage_exceptions.ObjectNotFoundException` if the
            object is not found in pairtree storage
        :raises: :class:`storage_exceptions.PartNotFoundException` if the
            mets.xml file is not found in pairtree storage for this object
        """
        return xmlmap.load_xmlobject_from_file(self.metsfile_path(), MinimalMETS)

    def page_data(self):
        """Return a generator of page content for this HathiTrust work
        based on pairtree and METS data, for indexing pages in Solr.

        Each yielded dictionary has the keys ``page_id``, ``content``,
        ``order``, ``label``, and ``tags``. The generator ends without
        yielding anything if the pairtree object or the zip file is
        missing (an error is logged). Pages referenced in the METS but
        absent from the zip file are skipped with a warning.
        """
        # load mets record to pull metadata about the images
        # NOTE(review): a missing mets.xml raises PartNotFoundException from
        # metsfile_path, which is not handled here and propagates
        try:
            mmets = self.mets_xml()
        except storage_exceptions.ObjectNotFoundException:
            logger.error(f"Pairtree data for {self.hathi_id} not found")
            return

        # read zipfile contents in place, without unzipping
        try:
            zpath = self.zipfile_path()
        except storage_exceptions.PartNotFoundException:
            # missing file inside pairtree if this error occurs;
            # report through the module logger, like every other message here
            logger.error(f"Missing pairtree data for: {self.hathi_id}")
            return

        with ZipFile(zpath) as ht_zip:
            # yield a generator of index data for each page; iterate
            # over pages in METS structmap
            for page in mmets.structmap_pages:
                # zipfile spec uses / for path regardless of OS
                pagefilename = "/".join([self.content_dir, page.text_file_location])
                try:
                    with ht_zip.open(pagefilename) as pagefile:
                        try:
                            yield {
                                "page_id": page.text_file.sequence,
                                "content": pagefile.read().decode("utf-8"),
                                "order": page.order,
                                "label": page.display_label,
                                "tags": page.label.split(", ") if page.label else [],
                            }
                        except StopIteration:
                            # NOTE(review): kept from the original; ends the
                            # generator cleanly if StopIteration surfaces
                            # here -- purpose unclear, confirm before removing
                            return
                except KeyError:
                    # we know of one HathiTrust work (uc1.$b31619) where
                    # the METS references pages that are not present in the zip file;
                    # they are at the end of the document and don't have any
                    # page content, so log a warning but don't treat as an error
                    logger.warning(
                        "Indexing %s pages: "
                        + "%s referenced in METS but not found in zip file",
                        self.hathi_id,
                        pagefilename,
                    )