import logging
import time
import pymarc
import requests
from django.conf import settings
from django.core.exceptions import ImproperlyConfigured
from pairtree import PairtreeStorageFactory, storage_exceptions
from ppa import __version__ as ppa_version
logger = logging.getLogger(__name__)
[docs]
class GaleAPIError(Exception):
    """Root of the exception hierarchy for problems reported by the Gale API."""
[docs]
class GaleItemForbidden(GaleAPIError):
    """The Gale API denied permission to access the requested item."""
[docs]
class GaleUnauthorized(GaleAPIError):
    """The request was not authorized for Gale API access."""
[docs]
class GaleItemNotFound(GaleAPIError):
    """The requested item does not exist in the Gale API."""
[docs]
class GaleAPI:
    """Minimal Gale API client with functionality needed for PPA import.

    Requires **GALE_API_USERNAME** configured in Django settings. Automatically
    uses the configured username to retrieve an API key when needed, and has
    logic to refresh the API key when it expires (30 minutes).

    If **TECHNICAL_CONTACT** is configured in Django settings, it will
    be included in request headers when making API calls.

    Implemented as a singleton; instantiating the class will return the
    same shared instance every time.
    """

    #: base URL for all API requests
    api_root = "https://api.gale.com/api"

    #: shared singleton instance; populated on first instantiation
    instance = None

    #: request session shared by all API calls; created on first initialization
    session = None

    #: cached API key; populated on demand by :attr:`api_key`
    _api_key = None

    def __new__(cls):
        # implement as a singleton
        # adapted from https://softwareengineering.stackexchange.com/a/333710

        # if no instance has been initialized, create and store on the class
        if cls.instance is None:
            cls.instance = super().__new__(cls)
        # return the instance
        return cls.instance

    def __init__(self):
        # NOTE: copied from hathi.py base api class; should be generalized
        # into a common base class if/when we add a third provider

        # first make sure we have a username configured
        try:
            self.username = settings.GALE_API_USERNAME
        except AttributeError as err:
            raise ImproperlyConfigured(
                "GALE_API_USERNAME configuration is required for Gale API"
            ) from err

        # Python runs __init__ on every ``GaleAPI()`` call, even though
        # __new__ hands back the shared instance. Keep the session that is
        # already there so pooled connections are reused rather than being
        # replaced by a new, never-closed session on each instantiation.
        if self.session is not None:
            return

        # create a request session, for request pooling
        session = requests.Session()
        # set a user-agent header, but preserve requests version information
        headers = {
            "User-Agent": "ppa-django/%s (%s)"
            % (ppa_version, session.headers["User-Agent"])
        }
        # include technical contact as From header, if set
        tech_contact = getattr(settings, "TECHNICAL_CONTACT", None)
        if tech_contact:
            headers["From"] = tech_contact
        session.headers.update(headers)
        self.session = session

    @staticmethod
    def _forbidden_message(resp):
        """Return the ``message`` from a 403 response body, falling back to
        the HTTP reason phrase when the body is not JSON in that shape."""
        try:
            return resp.json()["message"]
        except (ValueError, KeyError, TypeError):
            # body was not JSON, or JSON without a "message" entry
            return resp.reason

    def _make_request(
        self, url, params=None, requires_api_key=True, stream=False, retry=0
    ):
        """Make a GET request with the configured session.

        :param url: path relative to :attr:`api_root`
        :param params: optional dictionary of query parameters; copied,
            never modified
        :param requires_api_key: when true, the current :attr:`api_key` is
            added to the query parameters
        :param stream: passed through to the session ``get`` call
        :param retry: number of retries already attempted for this request
        :returns: the response, for status 200 OK
        :raises GaleItemNotFound: for 404
        :raises GaleUnauthorized: for 401, after one retry with a fresh
            API key when the request requires a key
        :raises GaleItemForbidden: for 403
        :raises GaleAPIError: for any other status, with the status code
        """
        # NOTE: also copied from hathi.py
        rqst_url = "%s/%s" % (self.api_root, url)
        rqst_opts = {}
        if params:
            rqst_opts["params"] = params.copy()
        # add api key to parameters if needed for this request
        if requires_api_key:
            rqst_opts.setdefault("params", {})["api_key"] = self.api_key

        resp = self.session.get(rqst_url, stream=stream, **rqst_opts)
        logger.debug(
            "get %s %s: %f sec",
            rqst_url,
            resp.status_code,
            resp.elapsed.total_seconds(),
        )
        status = resp.status_code
        if status == requests.codes.ok:
            return resp

        # Everything below is an error path. Read what we need from the
        # body first, then close the response: a streamed response whose
        # body is never consumed would otherwise keep its connection out
        # of the pool.
        forbidden_message = None
        if status == requests.codes.forbidden:
            # forbidden results return a message
            forbidden_message = self._forbidden_message(resp)
        resp.close()

        if status == requests.codes.not_found:
            raise GaleItemNotFound

        # when api key expires, API returns:
        # HTTP Status 401 - Authentication Failed: Invalid or Expired API key
        if status == requests.codes.unauthorized:
            # If we get a 401 on a request that requires an api key,
            # get a fresh key and then try the same request again (once)
            if requires_api_key and retry < 1:
                self.refresh_api_key()
                return self._make_request(
                    url,
                    params=params,
                    requires_api_key=requires_api_key,
                    stream=stream,
                    retry=retry + 1,
                )
            # response is html error, not json; could try
            # extracting h1, but not sure it's worth parsing
            raise GaleUnauthorized()

        if status == requests.codes.forbidden:
            # NOTE that item requests for invalid ids may return 403
            raise GaleItemForbidden(forbidden_message)

        # raise anything else as a generic error with status code
        # getting 406 not acceptable in some cases
        # (attempt to access item with invalid item id)
        raise GaleAPIError(status)

    def get_api_key(self):
        """Get a new API key to use for requests in the next 30 minutes."""
        # GALE API requires use of an API key, which lasts for 30 minutes
        # request a new one when needed using configured username
        response = self._make_request(
            "tools/generate_key", {"user": self.username}, requires_api_key=False
        )
        return response.json()["apiKey"]

    @property
    def api_key(self):
        """Property for current api key. Uses :meth:`get_api_key` to
        request a new one when needed."""
        if self._api_key is None:
            self._api_key = self.get_api_key()
        return self._api_key

    def refresh_api_key(self):
        """Clear the cached api key and request a new one."""
        self._api_key = None
        # fetch explicitly rather than via ``assert``, which is stripped
        # when Python runs with -O
        self._api_key = self.get_api_key()

    def get_item(self, item_id):
        """Get the full record for a single item, as parsed JSON."""
        # full id looks like GALE|CW###### or GALE|CB#######
        # using streaming makes a *significant* difference in response time,
        # especially for larger results
        response = self._make_request("v1/item/GALE%%7C%s" % item_id, stream=True)
        if response:
            return response.json()
# MARC records needed for import and metadata are stored in a local pairtree.
# currently used for Gale/ECCO content
[docs]
def get_marc_storage():
    """Open the pairtree store that holds local MARC records.

    The store directory is read from the **MARC_DATA** Django setting.
    """
    factory = PairtreeStorageFactory()
    store_options = {"store_dir": settings.MARC_DATA, "uri_base": "info:local/"}
    return factory.get_store(**store_options)
[docs]
class MARCRecordNotFound(Exception):
    """No record for the requested id exists in local MARC record storage."""
[docs]
def get_marc_record(marc_id):
    """Get a MARC record from the pairtree storage by Gale ESTC id.

    Only the first record in the stored ``marc.dat`` file is read and
    returned.

    :param marc_id: identifier of the pairtree object holding the record
    :returns: the first record yielded by :class:`pymarc.MARCReader`
    :raises MARCRecordNotFound: if the object has no ``marc.dat`` part, or
        the file contains no records
    """
    start_time = time.time()
    # sentinel so an empty file can be told apart from whatever the reader yields
    no_record = object()
    try:
        marc_object = get_marc_storage().get_object(marc_id)
        with marc_object.get_bytestream("marc.dat", streamable=True) as marcfile:
            reader = pymarc.MARCReader(
                marcfile, to_unicode=True, file_encoding="utf-8"
            )
            # only the first record is wanted; stop reading after it
            # instead of parsing the whole file into a list
            record = next(iter(reader), no_record)
    except storage_exceptions.PartNotFoundException as err:
        raise MARCRecordNotFound(marc_id) from err

    if record is no_record:
        # file exists but holds no records
        raise MARCRecordNotFound(marc_id)

    # pass arguments to the logger so formatting only happens when
    # debug logging is enabled
    logger.debug(
        "Loaded MARC record for %s in %.5fs", marc_id, time.time() - start_time
    )
    return record