Source code for ppa.archive.hathi

'''
Utilities for working with HathiTrust materials and APIs.
'''
from datetime import datetime
import logging
import os.path
import io
import time

from cached_property import cached_property
from django.conf import settings
from django.core.exceptions import ImproperlyConfigured
from eulxml import xmlmap
from pairtree import pairtree_path, pairtree_client, storage_exceptions
import pymarc
import requests
from requests_oauthlib import OAuth1


from ppa import __version__ as ppa_version


logger = logging.getLogger(__name__)


class HathiItemNotFound(Exception):
    '''Raised when a requested item cannot be found in either the
    bibliographic API or the data API.'''
class HathiItemForbidden(Exception):
    '''Raised when the data API denies permission to access an item.'''
class HathiBaseAPI:
    '''Base client class for HathiTrust APIs.

    Holds a single :class:`requests.Session` that is reused for every
    request made through the client, with identifying headers set once
    at initialization.
    '''

    #: base api URL for all requests
    api_root = ''

    def __init__(self):
        # create a request session, for request pooling
        self.session = requests.Session()
        # set a user-agent header, but preserve requests version information
        headers = {
            'User-Agent': 'ppa-django/%s (%s)' % (
                ppa_version, self.session.headers['User-Agent'])
        }
        # include technical contact as From header, if set
        tech_contact = getattr(settings, 'TECHNICAL_CONTACT', None)
        if tech_contact:
            headers['From'] = tech_contact
        self.session.headers.update(headers)

    def _make_request(self, url, params=None):
        '''Make a GET request with the configured session.

        :param url: url relative to :attr:`api_root`
        :param params: optional dictionary of query string parameters
        :returns: the response for status 200 OK; ``None`` for any
            status other than 200, 403 or 404 (a warning is logged)
        :raises: :class:`HathiItemNotFound` for 404
        :raises: :class:`HathiItemForbidden` for 403
        '''
        url = '%s/%s' % (self.api_root, url)
        # only pass params through when there are any, so that requests
        # without parameters are made with the url alone
        rqst_opts = {}
        if params:
            rqst_opts['params'] = params
        start = time.time()
        resp = self.session.get(url, **rqst_opts)
        logger.debug('get %s %s: %f sec', url, resp.status_code,
                     time.time() - start)
        if resp.status_code == requests.codes.ok:
            return resp
        if resp.status_code == requests.codes.not_found:
            raise HathiItemNotFound
        if resp.status_code == requests.codes.forbidden:
            raise HathiItemForbidden
        # any other status is not handled; make that visible in the logs
        # rather than silently falling through
        logger.warning('Unexpected status %s for %s', resp.status_code, url)
        return None
class HathiBibliographicAPI(HathiBaseAPI):
    '''Wrapper for HathiTrust Bibliographic API.

    https://www.hathitrust.org/bib_api
    '''

    api_root = 'http://catalog.hathitrust.org/api'

    def _get_record(self, mode, id_type, id_value):
        '''Request a single record and wrap the JSON result.

        :param mode: record mode used in the request url
            (``brief`` or ``full``)
        :param id_type: type of identifier being looked up
        :param id_value: identifier value
        :returns: :class:`HathiBibliographicRecord`
        :raises: :class:`HathiItemNotFound` if the response contains
            no records
        '''
        url = 'volumes/%(mode)s/%(id_type)s/%(id_value)s.json' % {
            'mode': mode,
            'id_type': id_type,
            'id_value': id_value  # NOTE: / in ark ids is *not* escaped
        }
        resp = self._make_request(url)
        # parse the response body once and reuse the result
        data = resp.json()
        # for an invalid id, hathi seems to return a 200 ok
        # but json has no records
        if not data.get('records', None):
            raise HathiItemNotFound
        return HathiBibliographicRecord(data)

    def brief_record(self, id_type, id_value):
        '''Get brief record by id type and value.

        :returns: :class:`HathiBibliographicRecord`
        :raises: :class:`HathiItemNotFound`
        '''
        return self._get_record('brief', id_type, id_value)

    def record(self, id_type, id_value):
        '''Get full record by id type and value.

        :returns: :class:`HathiBibliographicRecord`
        :raises: :class:`HathiItemNotFound`
        '''
        return self._get_record('full', id_type, id_value)
# also possible: get multiple records at once
class HathiBibliographicRecord:
    '''Representation of a HathiTrust bibliographic record.

    :param data: parsed JSON result from the bibliographic API, with
        ``records`` and ``items`` entries
    '''

    def __init__(self, data):
        self._data = data
        # for a single bib api json result, we only want the first item
        self.record_id = list(data['records'].keys())[0]
        self.info = list(data['records'].values())[0]

    @property
    def title(self):
        '''First title (standard title)'''
        # returns list of titles - standard title; could also have
        # title without leading article and other language titles
        return self.info['titles'][0]

    @property
    def pub_dates(self):
        '''list of available publication dates'''
        return self.info['publishDates']

    def copy_details(self, htid):
        '''Details for a specific copy identified by hathi id.

        :returns: the matching item dictionary, or ``None`` if no copy
            in this record has the requested id
        '''
        for item in self._data['items']:
            if item['htid'] == htid:
                return item
        return None

    def copy_last_updated(self, htid):
        '''Return last update date for a specific copy identified by
        hathi id.

        :returns: :class:`datetime.date`, or ``None`` if no copy in this
            record has the requested id
        '''
        # get last update from copy details
        details = self.copy_details(htid)
        if details is None:
            # unknown copy: nothing to parse
            return None
        # use datetime to parse, then return just the date
        return datetime.strptime(details['lastUpdate'], '%Y%m%d').date()

    @cached_property
    def marcxml(self):
        '''Record marcxml if included (full records only), as an instance
        of :class:`pymarc.Record`; ``None`` when the record has no
        marc-xml.'''
        marcxml = self._data['records'][self.record_id].get('marc-xml', None)
        if marcxml:
            # parse_xml_to_array reads from a file-like object; use the
            # first parsed record
            return pymarc.parse_xml_to_array(io.StringIO(marcxml))[0]
        return None
class HathiDataAPI(HathiBaseAPI):
    '''Client for the HathiTrust Data API. OAuth credentials are read
    from Django settings (``HATHITRUST_OAUTH_KEY`` and
    ``HATHITRUST_OAUTH_SECRET``) when the client is initialized.'''

    api_root = 'https://babel.hathitrust.org/cgi/htd'

    cfg_err = 'OAuth key and secret configuration required for HathiDataAPI use'

    def __init__(self):
        super().__init__()
        # look up both credentials; either one missing is a
        # configuration error
        key = getattr(settings, 'HATHITRUST_OAUTH_KEY', None)
        secret = getattr(settings, 'HATHITRUST_OAUTH_SECRET', None)
        if not (key and secret):
            raise ImproperlyConfigured(self.cfg_err)

        # sign requests via the query string and attach to the session
        self.oauth = OAuth1(
            client_key=key,
            client_secret=secret,
            signature_type='query',
        )
        self.session.auth = self.oauth

    # NOTE: response headers include filename in content-disposition,
    # so the whole response is returned for aggregate and structure
    # requests

    def get_aggregate(self, htid):
        '''Request the aggregate data package for a HathiTrust record,
        identified by hathi id. Returns the full response.'''
        # http[s]://babel.hathitrust.org/cgi/htd/aggregate/:ID?v=2
        request_path = 'aggregate/%s' % htid
        return self._make_request(request_path, params={'v': 2})

    def get_structure(self, htid, fmt='xml'):
        '''Request structure information for a HathiTrust record,
        identified by hathi id, in the requested format (xml by
        default). Returns the full response.'''
        # http[s]://babel.hathitrust.org/cgi/htd/structure/:ID?format=xml&v=2
        request_path = 'structure/%s' % htid
        query = {'v': 2, 'format': fmt}
        return self._make_request(request_path, params=query)
class _METS(xmlmap.XmlObject):
    '''Shared base :class:`~eulxml.xmlmap.XmlObject` for METS content;
    binds the ``m`` prefix to the METS namespace.'''

    ROOT_NAMESPACES = {'m': 'http://www.loc.gov/METS/'}
class StructMapPage(_METS):
    '''A single page div within the METS structMap.'''

    # example struct map page:
    #
    #   <METS:div ORDER="1" LABEL="FRONT_COVER, IMAGE_ON_PAGE, IMPLICIT_PAGE_NUMBER" TYPE="page">
    #       <METS:fptr FILEID="HTML00000001"/>
    #       <METS:fptr FILEID="TXT00000001"/>
    #       <METS:fptr FILEID="IMG00000001"/>
    #
    # and an example of a file entry that a file pointer refers to:
    #
    #   <METS:file SIZE="1003" ID="HTML00000496" MIMETYPE="text/html"
    #       CREATED="2017-03-20T10:40:21Z"
    #       CHECKSUM="f0a326c10b2a6dc9ae5e3ede261c9897"
    #       SEQ="00000496" CHECKSUMTYPE="MD5">

    #: page order
    order = xmlmap.IntegerField('@ORDER')
    #: page label
    label = xmlmap.StringField('@LABEL')
    #: order label
    orderlabel = xmlmap.StringField('@ORDERLABEL')
    #: identifier for a text or ocr file, from a file pointer
    text_file_id = xmlmap.StringField(
        'm:fptr/@FILEID[contains(., "TXT") or contains(. , "OCR")]')

    @property
    def display_label(self):
        '''label to display for the page: the order label when there
        is one, otherwise the page order as a string'''
        order_label = self.orderlabel
        if order_label:
            return order_label
        return str(self.order)

    @property
    def text_file(self):
        ''':class:`METSFile` for the file that this page's text file
        pointer refers to'''
        # look up the file entry by id anywhere in the document
        file_xpath = '//m:file[@ID="%s"]' % self.text_file_id
        matches = self.node.xpath(file_xpath,
                                  namespaces=self.ROOT_NAMESPACES)
        return METSFile(matches[0])

    @property
    def text_file_location(self):
        '''location of the text file for this page'''
        mets_file = self.text_file
        return mets_file.location
class METSFile(_METS):
    '''A file entry in a METS document, with its location.'''

    # example file:
    #
    #   <METS:file SIZE="1" ID="TXT00000001" MIMETYPE="text/plain"
    #       CREATED="2016-06-24T09:04:15Z"
    #       CHECKSUM="68b329da9893e34099c7d8ad5cb9c940"
    #       SEQ="00000001" CHECKSUMTYPE="MD5">

    #: xml identifier
    id = xmlmap.StringField('@ID')
    #: sequence attribute
    sequence = xmlmap.StringField('@SEQ')
    #: file location
    location = xmlmap.StringField('m:FLocat/@xlink:href')
class MinimalMETS(_METS):
    '''Pared-down :class:`~eulxml.xmlmap.XmlObject` for METS, mapping
    only the parts required for page indexing in :mod:`ppa`.'''

    #: list of struct map pages as :class:`StructMapPage`
    structmap_pages = xmlmap.NodeListField(
        'm:structMap[@TYPE="physical"]//m:div[@TYPE="page"]',
        StructMapPage,
    )
class HathiObject:
    '''An object for working with a HathiTrust item with data in a
    locally configured pairtree datastore.

    :param hathi_id: HathiTrust identifier, in the form
        ``prefix.identifier``
    '''

    hathi_id = None

    def __init__(self, hathi_id):
        self.hathi_id = hathi_id

    @cached_property
    def pairtree_prefix(self):
        '''pairtree prefix (first portion of the hathi id, short-form
        identifier for owning institution)'''
        return self.hathi_id.split('.', 1)[0]

    @cached_property
    def pairtree_id(self):
        '''pairtree identifier (second portion of source id)'''
        return self.hathi_id.split('.', 1)[1]

    @cached_property
    def content_dir(self):
        '''content directory for this work within the appropriate
        pairtree'''
        # contents are stored in a directory named based on a
        # pairtree encoded version of the id
        return pairtree_path.id_encode(self.pairtree_id)

    def pairtree_client(self):
        '''Initialize a pairtree client for the pairtree datastore this
        object belongs to, based on its Hathi prefix id.'''
        return pairtree_client.PairtreeStorageClient(
            self.pairtree_prefix,
            os.path.join(settings.HATHI_DATA, self.pairtree_prefix))

    def pairtree_object(self, ptree_client=None, create=False):
        '''get a pairtree object for this record

        :param ptree_client: optional
            :class:`pairtree_client.PairtreeStorageClient` if one has
            already been initialized, to avoid repeated initialization
            (currently used in hathi_import manage command)
        :param create: passed to the pairtree client as
            ``create_if_doesnt_exist``; defaults to False
        '''
        if ptree_client is None:
            # get pairtree client if not passed in
            ptree_client = self.pairtree_client()

        # return the pairtree object for current work
        return ptree_client.get_object(self.pairtree_id,
                                       create_if_doesnt_exist=create)

    def delete_pairtree_data(self):
        '''Delete pairtree object from the pairtree datastore.'''
        logger.info('Deleting pairtree data for %s', self.hathi_id)
        try:
            self.pairtree_client().delete_object(self.pairtree_id)
        except storage_exceptions.ObjectNotFoundException:
            # data is already gone; warn, but not an error
            logger.warning('Pairtree deletion failed; object not found %s',
                           self.hathi_id)

    def _content_path(self, ext, ptree_client=None):
        '''path to the first file with the requested extension within
        the hathi contents for this work

        :param ext: filename ending to match (e.g. ``zip``, ``.mets.xml``)
        :param ptree_client: optional pairtree client, passed through to
            :meth:`pairtree_object`
        :raises: :class:`IndexError` if no file in the content directory
            ends with the requested extension
        '''
        pairtree_obj = self.pairtree_object(ptree_client=ptree_client)
        # - expect a mets file and a zip file
        # - don't rely on them being returned in the same order on
        #   every machine
        parts = pairtree_obj.list_parts(self.content_dir)
        # find the first matching file in the list (should only be one)
        filepath = next((part for part in parts if part.endswith(ext)), None)
        if filepath is None:
            # same exception type as indexing an empty list of matches,
            # but with a message that identifies what is missing
            raise IndexError('No %s file found in pairtree content for %s'
                             % (ext, self.hathi_id))

        return os.path.join(pairtree_obj.id_to_dirpath(), self.content_dir,
                            filepath)

    def zipfile_path(self, ptree_client=None):
        '''path to zipfile within the hathi contents for this work'''
        return self._content_path('zip', ptree_client=ptree_client)

    def metsfile_path(self, ptree_client=None):
        '''path to mets xml file within the hathi contents for this work'''
        return self._content_path('.mets.xml', ptree_client=ptree_client)