Source code for ppa.archive.hathi

"""
Utilities for working with HathiTrust materials and APIs.
"""
import io
import logging
import os.path
import time
from datetime import datetime

import pymarc
import requests
from cached_property import cached_property
from django.conf import settings
from eulxml import xmlmap
from pairtree import pairtree_client, pairtree_path, storage_exceptions

from ppa import __version__ as ppa_version

logger = logging.getLogger(__name__)


class HathiItemNotFound(Exception):
    """Raised when an item is not found in the bibliographic or data API."""
class HathiItemForbidden(Exception):
    """Raised when permission is denied to access an item in the data API."""
class HathiBaseAPI:
    """Base client class for HathiTrust APIs.

    Creates a single :class:`requests.Session` per client instance (for
    request pooling) with a PPA-specific ``User-Agent`` header and, when
    ``settings.TECHNICAL_CONTACT`` is set, a ``From`` header.
    """

    #: base api URL for all requests
    api_root = ""

    def __init__(self):
        # create a request session, for request pooling
        self.session = requests.Session()
        # set a user-agent header, but preserve requests version information
        headers = {
            "User-Agent": "ppa-django/%s (%s)"
            % (ppa_version, self.session.headers["User-Agent"])
        }
        # include technical contact as From header, if set
        tech_contact = getattr(settings, "TECHNICAL_CONTACT", None)
        if tech_contact:
            headers["From"] = tech_contact
        self.session.headers.update(headers)

    def __del__(self):
        # close the request session; the attribute is absent if __init__
        # failed before the session was created
        session = getattr(self, "session", None)
        if session is not None:
            session.close()

    def _make_request(self, url, params=None):
        """Make a GET request with the configured session.

        :param url: url relative to :attr:`api_root`
        :param params: optional dictionary of query parameters
        :returns: the response for status 200 OK; ``None`` for any other
            status that is not explicitly handled (a warning is logged)
        :raises: :class:`HathiItemNotFound` for 404 and
            :class:`HathiItemForbidden` for 403
        """
        url = "%s/%s" % (self.api_root, url)
        # only pass params through when there are any
        rqst_opts = {}
        if params:
            rqst_opts["params"] = params
        start = time.time()
        resp = self.session.get(url, **rqst_opts)
        logger.debug("get %s %s: %f sec", url, resp.status_code, time.time() - start)
        if resp.status_code == requests.codes.ok:
            return resp
        if resp.status_code == requests.codes.not_found:
            raise HathiItemNotFound
        if resp.status_code == requests.codes.forbidden:
            raise HathiItemForbidden
        # any other status previously fell through silently; keep returning
        # None for compatibility, but make the failure visible in the logs
        logger.warning(
            "Unexpected response status %s for %s", resp.status_code, url
        )
        return None
class HathiBibliographicAPI(HathiBaseAPI):
    """Wrapper for HathiTrust Bibliographic API.

    https://www.hathitrust.org/bib_api
    """

    api_root = "http://catalog.hathitrust.org/api"

    def _get_record(self, mode, id_type, id_value):
        """Request a single record and wrap it as a bibliographic record.

        :param mode: record mode used in the url; this class calls it
            with ``brief`` or ``full``
        :param id_type: type of identifier, used as a url path segment
        :param id_value: identifier value, used as the final url segment
        :returns: :class:`HathiBibliographicRecord`
        :raises: :class:`HathiItemNotFound` if the response contains
            no records
        """
        url = "volumes/%(mode)s/%(id_type)s/%(id_value)s.json" % {
            "mode": mode,
            "id_type": id_type,
            "id_value": id_value,  # NOTE: / in ark ids is *not* escaped
        }
        resp = self._make_request(url)
        # decode the response body once and reuse it for both the
        # emptiness check and the record wrapper
        data = resp.json()
        # for an invalid id, hathi seems to return a 200 ok
        # but json has no records
        if not data.get("records", None):
            raise HathiItemNotFound
        return HathiBibliographicRecord(data)

    def brief_record(self, id_type, id_value):
        """Get brief record by id type and value.

        :returns: :class:`HathiBibliographicRecord`
        :raises: :class:`HathiItemNotFound`
        """
        return self._get_record("brief", id_type, id_value)

    def record(self, id_type, id_value):
        """Get full record by id type and value.

        :returns: :class:`HathiBibliographicRecord`
        :raises: :class:`HathiItemNotFound`
        """
        return self._get_record("full", id_type, id_value)

    # also possible: get multiple records at once
class HathiBibliographicRecord:
    """Representation of a HathiTrust bibliographic record."""

    def __init__(self, data):
        self._data = data
        # a single bib api json result may list several records;
        # only the first one is of interest here
        records = data["records"]
        self.record_id = list(records)[0]
        self.info = list(records.values())[0]

    @property
    def title(self):
        """First title (standard title)"""
        # the record holds a list of titles: the standard title comes
        # first; the rest may be the title without leading article or
        # titles in other languages
        titles = self.info["titles"]
        return titles[0]

    @property
    def pub_dates(self):
        """List of available publication dates"""
        return self.info["publishDates"]

    def copy_details(self, htid):
        """Details for a specific copy identified by hathi id.

        Returns the first matching item, or ``None`` when no item
        has the requested id.
        """
        matches = (item for item in self._data["items"] if item["htid"] == htid)
        return next(matches, None)

    def copy_last_updated(self, htid):
        """Return last update date for a specific copy identified by
        hathi id. Returns as :class:`datetime.date`"""
        details = self.copy_details(htid)
        # parse the YYYYMMDD value as a datetime, then keep just the date
        parsed = datetime.strptime(details["lastUpdate"], "%Y%m%d")
        return parsed.date()

    @cached_property
    def marcxml(self):
        """Record marcxml if included (full records only), as an instance
        of :class:`pymarc.Record`"""
        record = self._data["records"][self.record_id]
        xml_content = record.get("marc-xml", None)
        if not xml_content:
            return None
        return pymarc.parse_xml_to_array(io.StringIO(xml_content))[0]
class _METS(xmlmap.XmlObject):
    """Shared :class:`~eulxml.xmlmap.XmlObject` base class with the
    METS namespace configured under the ``m`` prefix."""

    ROOT_NAMESPACES = {
        "m": "http://www.loc.gov/METS/",
    }
class StructMapPage(_METS):
    """Single logical page within a METS StructMap"""

    #: page order
    order = xmlmap.IntegerField("@ORDER")
    #: page label
    label = xmlmap.StringField("@LABEL")
    #: order label
    orderlabel = xmlmap.StringField("@ORDERLABEL")
    #: identifier for a text or ocr file, from a file pointer
    text_file_id = xmlmap.StringField(
        'm:fptr/@FILEID[contains(., "TXT") or contains(. , "OCR")]'
    )

    # example struct map page:
    #
    #   <METS:div ORDER="1" LABEL="FRONT_COVER, IMAGE_ON_PAGE, IMPLICIT_PAGE_NUMBER" TYPE="page">
    #     <METS:fptr FILEID="HTML00000001"/>
    #     <METS:fptr FILEID="TXT00000001"/>
    #     <METS:fptr FILEID="IMG00000001"/>
    #
    # example file entry:
    #
    #   <METS:file SIZE="1003" ID="HTML00000496" MIMETYPE="text/html"
    #     CREATED="2017-03-20T10:40:21Z"
    #     CHECKSUM="f0a326c10b2a6dc9ae5e3ede261c9897" SEQ="00000496"
    #     CHECKSUMTYPE="MD5">

    @cached_property
    def display_label(self):
        """Page display label; the order label when present, otherwise
        the page order as a string"""
        label = self.orderlabel
        if label:
            return label
        return str(self.order)

    @cached_property
    def text_file(self):
        """:class:`METSFile` corresponding to the text file pointer
        for this page"""
        # look up the file element whose ID matches this page's text
        # file pointer
        matches = self.node.xpath(
            '//m:file[@ID="%s"]' % self.text_file_id,
            namespaces=self.ROOT_NAMESPACES,
        )
        return METSFile(matches[0])

    @cached_property
    def text_file_location(self):
        """Location for the text file"""
        text_file = self.text_file
        return text_file.location
class METSFile(_METS):
    """File location information within a METS document."""

    #: xml identifier
    id = xmlmap.StringField("@ID")
    #: sequence attribute
    sequence = xmlmap.StringField("@SEQ")
    #: file location
    location = xmlmap.StringField("m:FLocat/@xlink:href")

    # example file:
    #
    #   <METS:file SIZE="1" ID="TXT00000001" MIMETYPE="text/plain"
    #     CREATED="2016-06-24T09:04:15Z"
    #     CHECKSUM="68b329da9893e34099c7d8ad5cb9c940" SEQ="00000001"
    #     CHECKSUMTYPE="MD5">
class MinimalMETS(_METS):
    """Minimal :class:`~eulxml.xmlmap.XmlObject` for METS, mapping only
    what is needed to support page indexing for :mod:`ppa`."""

    #: list of struct map pages as :class:`StructMapPage`
    structmap_pages = xmlmap.NodeListField(
        'm:structMap[@TYPE="physical"]//m:div[@TYPE="page"]',
        StructMapPage,
    )
class HathiObject:
    """An object for working with a HathiTrust item with data in a
    locally configured pairtree datastore."""

    hathi_id = None

    def __init__(self, hathi_id):
        self.hathi_id = hathi_id

    @cached_property
    def pairtree_prefix(self):
        """pairtree prefix (first portion of the hathi id, short-form
        identifier for owning institution)"""
        return self.hathi_id.split(".", 1)[0]

    @cached_property
    def pairtree_id(self):
        """pairtree identifier (second portion of source id)"""
        return self.hathi_id.split(".", 1)[1]

    @cached_property
    def content_dir(self):
        """content directory for this work within the appropriate pairtree"""
        # contents are stored in a directory named based on a
        # pairtree encoded version of the id
        return pairtree_path.id_encode(self.pairtree_id)

    def pairtree_client(self):
        """Initialize a pairtree client for the pairtree datastore this
        object belongs to, based on its Hathi prefix id."""
        return pairtree_client.PairtreeStorageClient(
            self.pairtree_prefix,
            os.path.join(settings.HATHI_DATA, self.pairtree_prefix),
        )

    def pairtree_object(self, ptree_client=None, create=False):
        """get a pairtree object for this record

        :param ptree_client: optional
            :class:`pairtree_client.PairtreeStorageClient` if one has
            already been initialized, to avoid repeated initialization
            (currently used in hathi_import manage command)
        :param create: passed to the client as ``create_if_doesnt_exist``
        """
        if ptree_client is None:
            # get pairtree client if not passed in
            ptree_client = self.pairtree_client()

        # return the pairtree object for current work
        return ptree_client.get_object(self.pairtree_id, create_if_doesnt_exist=create)

    def delete_pairtree_data(self):
        """Delete pairtree object from the pairtree datastore."""
        logger.info("Deleting pairtree data for %s", self.hathi_id)
        try:
            self.pairtree_client().delete_object(self.pairtree_id)
        except storage_exceptions.ObjectNotFoundException:
            # data is already gone; warn, but not an error
            logger.warning(
                "Pairtree deletion failed; object not found %s", self.hathi_id
            )

    def _content_path(self, ext, ptree_client=None):
        """Path to the first file whose name ends with `ext` within the
        hathi contents for this work.

        :param ext: filename suffix to match, including the leading dot
        :param ptree_client: optional pairtree client, passed through to
            :meth:`pairtree_object`
        :raises: :class:`storage_exceptions.PartNotFoundException` if no
            part ends with `ext`
        """
        pairtree_obj = self.pairtree_object(ptree_client=ptree_client)
        # - expect a mets file and a zip file
        # - don't rely on them being returned in the same order on every machine
        parts = pairtree_obj.list_parts(self.content_dir)
        # take the first part with a matching suffix (should only be one)
        filename = next((part for part in parts if part.endswith(ext)), None)
        if filename is None:
            # An error has occurred -- no part with this suffix is present
            raise storage_exceptions.PartNotFoundException
        return os.path.join(pairtree_obj.id_to_dirpath(), self.content_dir, filename)

    def zipfile_path(self, ptree_client=None):
        """path to zipfile within the hathi contents for this work"""
        # match the full extension, dot included, so that a name that
        # merely ends in "zip" (e.g. "notes.gzip") is not selected
        return self._content_path(".zip", ptree_client=ptree_client)

    def metsfile_path(self, ptree_client=None):
        """path to mets xml file within the hathi contents for this work"""
        return self._content_path(".mets.xml", ptree_client=ptree_client)