# methods to convert historical dates to standard dates
# will be used for reporting and automatic conversion in admin
import calendar
import re
from datetime import date

import convertdate
from django.core.exceptions import ValidationError
from django.core.validators import RegexValidator
from django.db import models
from django.utils.formats import date_format
from django.utils.html import format_html
from django.utils.safestring import mark_safe
from unidecode import unidecode

from geniza.common.models import TrackChangesModel
[docs]
class Calendar:
    """Short codes identifying each supported historical calendar,
    plus the constants needed when converting between them."""

    #: Islamic calendar (Hijri)
    HIJRI = "h"
    #: Kharaji calendar
    KHARAJI = "k"
    #: Hebrew calendar (Anno Mundi)
    ANNO_MUNDI = "am"
    #: Seleucid calendar
    SELEUCID = "s"
    #: number of years added to a Seleucid year to get the
    #: equivalent Anno Mundi year (Seleucid = Anno Mundi - 3449)
    SELEUCID_OFFSET = 3449

    #: codes of the calendars that can be converted to Julian/Gregorian
    can_convert = [ANNO_MUNDI, HIJRI, SELEUCID]
[docs]
class PartialDate:
    """Simple partial date object to handle parsing and display of
    dates in the format YYYY, YYYY-MM, or YYYY-MM-DD. Display format
    is based on known precision of year, month, or day.

    The parsed value is stored as a :class:`datetime.date` on :attr:`date`
    (unknown month/day default to 1) together with a :attr:`precision`
    of ``"year"``, ``"month"`` or ``"day"``.
    """

    #: supported precisions, ordered so that the index is the number
    #: of date parts supplied minus one
    available_precision = ["year", "month", "day"]

    #: public display format based on date precision
    display_format = {
        "year": "Y",
        "month": "F Y",
        "day": "DATE_FORMAT",  # honors locale formatting
    }

    #: ISO format based on date precision
    iso_format = {
        "year": "%Y",
        "month": "%Y-%m",
        "day": "%Y-%m-%d",
    }

    #: numeric format for indexing and sorting
    num_fmt = "%Y%m%d"

    def __init__(self, str):
        """Parse a ``YYYY``, ``YYYY-MM`` or ``YYYY-MM-DD`` string.

        :param str: date string to parse (the parameter name shadows the
            builtin; it is kept unchanged for backward compatibility)
        :raises ValueError: if there are more than three date parts, a part
            is not an integer, or the parts do not form a valid date
        """
        # TODO: probably still need more validation/error handling here
        # for real world use
        date_parts = str.split("-")
        if len(date_parts) > 3:
            raise ValueError(f"Error parsing standard date {str}")

        # since we don't currently support unknown year,
        # precision can be determined by number of date parts
        self.precision = self.available_precision[len(date_parts) - 1]
        # fill in unknown month/day as 1
        date_parts += [1] * (3 - len(date_parts))
        # cast to integer and convert to datetime.date
        self.date = date(*[int(p) for p in date_parts])

    def __str__(self):
        # format the date based on known precision
        return date_format(
            self.date, format=self.display_format[self.precision], use_l10n=True
        )

    def __repr__(self) -> str:
        return f"PartialDate({self.isoformat()})"

    def __eq__(self, other):
        if not isinstance(other, PartialDate):
            # don't attempt to compare against unrelated types
            return NotImplemented
        # equivalent if date and precision are the same
        return self.date == other.date and self.precision == other.precision

    def __hash__(self):
        # defined alongside __eq__ so instances stay usable in sets and as
        # dict keys; hashes the same fields that equality compares
        return hash((self.date, self.precision))

    def isoformat(self):
        """Return the date in ISO format, showing only the known precision
        (``YYYY``, ``YYYY-MM`` or ``YYYY-MM-DD``)."""
        formatted = self.date.strftime(self.iso_format[self.precision])
        # strftime does not zero-pad years before 1000 on every platform;
        # the year is always the first component, so pad it explicitly
        year, separator, remainder = formatted.partition("-")
        return f"{year.zfill(4)}{separator}{remainder}"

    def numeric_format(self, mode="min"):
        """Return the date as an eight-digit ``YYYYMMDD`` string for
        indexing and sorting.

        :param mode: ``"min"`` (default) uses the earliest day covered by
            this partial date; ``"max"`` uses the latest day (31 December
            for year precision, last day of the month for month precision)
        """
        value = self.date
        if mode == "max":
            if self.precision == "year":
                value = value.replace(month=12, day=31)
            elif self.precision == "month":
                last_day = calendar.monthrange(value.year, value.month)[1]
                value = value.replace(day=last_day)
        # zero-pad to a fixed width so years before 1000 are consistent
        return value.strftime(self.num_fmt).zfill(8)

    @staticmethod
    def get_date_range(old_range, new_range):
        """Compute the union (widest possible date range) between two PartialDate ranges.

        :param old_range: two-item sequence of current ``[min, max]``;
            either item may be ``None`` when not yet set
        :param new_range: two-item sequence ``[start, end]`` of
            :class:`PartialDate` objects to merge in
        :return: new two-item list ``[min, max]``; ``old_range`` itself is
            left unmodified
        """
        earliest, latest = old_range[0], old_range[1]
        [start, end] = new_range

        # use numeric format to compare to current min, replace if smaller
        start_numeric = int(start.numeric_format(mode="min"))
        if earliest is None or start_numeric < int(
            earliest.numeric_format(mode="min")
        ):
            # store as PartialDate, not numeric format
            earliest = start

        # use numeric format to compare to current max, replace if larger
        end_numeric = int(end.numeric_format(mode="max"))
        if latest is None or end_numeric > int(latest.numeric_format(mode="max")):
            # store as PartialDate, not numeric format
            latest = end

        return [earliest, latest]
[docs]
class DocumentDateMixin(TrackChangesModel):
    """Mixin for document date fields (original and standardized),
    and related logic for displaying, converting, and validating dates."""

    doc_date_original = models.CharField(
        "Date on document",
        help_text="Explicit date on the document, in original format",
        blank=True,
        max_length=255,
    )
    CALENDAR_CHOICES = (
        (Calendar.HIJRI, "Hijrī"),
        (Calendar.KHARAJI, "Kharājī"),
        (Calendar.SELEUCID, "Seleucid"),
        (Calendar.ANNO_MUNDI, "Anno Mundi"),
    )
    doc_date_calendar = models.CharField(
        "Calendar",
        max_length=2,
        choices=CALENDAR_CHOICES,
        help_text="Calendar according to which the document gives a date: "
        + "Hijrī (AH); Kharājī (rare - mostly for fiscal docs); "
        + "\nSeleucid (sometimes listed as Minyan Shetarot); Anno Mundi (Hebrew calendar)",
        blank=True,
    )
    # supports YYYY, YYYY-MM, YYYY-MM-DD and same formats as ranges YYYY/YYYY
    re_date_format = re.compile(
        r"^\d{3,4}(-[01]\d(-[0-3]\d)?)?(/\d{3,4}(-[01]\d(-[0-3]\d)?)?)?$"
    )
    standard_date_helptext = str(
        "Convert to Julian before 1582, Gregorian after 1582. "
        + "\nUse YYYY, YYYY-MM, YYYY-MM-DD format or YYYY-MM-DD/YYYY-MM-DD for date ranges.",
    )
    doc_date_standard = models.CharField(
        "CE date",
        help_text=f"{standard_date_helptext} \nLeave blank or clear out to automatically "
        + "calculate standardized date for supported calendars.",
        blank=True,
        max_length=255,
        validators=[RegexValidator(re_date_format)],
    )

    class Meta:
        abstract = True

    @property
    def original_date(self):
        """Generate formatted display for the document's original/historical date"""
        # combine date and calendar or return empty string
        return " ".join(
            [self.doc_date_original, self.get_doc_date_calendar_display()]
        ).strip()

    @property
    def document_date(self):
        """Generate formatted display of combined original and standardized dates.

        Returns HTML-safe markup when both dates are present, the plain
        standardized date when only that is set, and otherwise the
        original date (which may be an empty string).
        """
        if not self.doc_date_standard:
            # if there's no standardized date, just display the historical date
            return self.original_date

        standardized_date = standard_date_display(self.doc_date_standard)
        # add parentheses to standardized date if original date is also present
        if self.original_date:
            # NOTE: we want no-wrap for individual dates when displaying as html
            # may want to split out formatted/unformatted versions.
            # format_html escapes the interpolated values: the original
            # date is free text and must not be emitted as raw markup
            return format_html(
                "<span>{}</span> <span>({})</span>",
                self.original_date,
                standardized_date,
            )
        # should we always use spans, or only when both dates are present?
        return standardized_date

    def clean(self):
        """
        Require doc_date_original and doc_date_calendar to be set
        if either one is present.

        :raises ValidationError: if only one of the two fields is set
        """
        if self.doc_date_calendar and not self.doc_date_original:
            raise ValidationError("Original date is required when calendar is set")
        if self.doc_date_original and not self.doc_date_calendar:
            raise ValidationError("Calendar is required when original date is set")

    def standardize_date(self, update=False):
        """
        Convert the document's original date to a standardized date, if possible.
        If update is requested, will store the converted value on :attr:`doc_date_standard`

        :return: converted date (or date range) in ISO format, or None if
            the date is unset, the calendar is unsupported, or the original
            date could not be parsed
        """
        # if original date is set and conversion is supported for this calendar,
        # generate the standardized date
        if self.doc_date_original and self.doc_date_calendar in Calendar.can_convert:
            # NOTE: inside a method this name resolves to the module-level
            # function, not to this method
            std_date = standardize_date(self.doc_date_original, self.doc_date_calendar)
            if std_date:
                converted_date = display_date_range(*std_date)
                # if update is requested and standard date is unset, save the converted date
                if update and not self.doc_date_standard:
                    self.doc_date_standard = converted_date
                return converted_date
        return None

    # class-level default for the parse cache; never mutated in place,
    # only replaced per instance by assignment in parsed_date
    _parsed_date = {}

    @property
    def parsed_date(self):
        """Parse standard date (if set) and return as dictionary
        of start/end :class:`PartialDate` objects; empty dictionary
        if the standard date is unset or cannot be parsed."""
        # for efficiency, parse and cache standard date into
        # a dictionary of start/end partial dates.
        # recalculate if not set or standard date has changed
        if not self._parsed_date or self.has_changed("doc_date_standard"):
            # start from an empty result so that a cleared or unparseable
            # value never leaves previously parsed dates in the cache
            parsed = {}
            if self.doc_date_standard:
                try:
                    date_parts = self.doc_date_standard.split("/")
                    start = PartialDate(date_parts[0])
                    # if a single date instead of a range, start and end are the same
                    if len(date_parts) == 1:
                        end = start
                    else:
                        end = PartialDate(date_parts[1])
                    parsed = {"start": start, "end": end}
                except ValueError:
                    # ignore if it can't be parsed (records before validation added)
                    pass
            self._parsed_date = parsed
        return self._parsed_date

    @property
    def start_date(self):
        """
        Return the start date of the document's standardized date or date range, if set.
        """
        return self.parsed_date.get("start")

    @property
    def end_date(self):
        """
        Return the end date of the document's standardized date or date range, if set.
        """
        return self.parsed_date.get("end")

    def solr_date_range(self):
        """
        Return a Solr date range for the document's standardized date,
        or None if the standardized date is unset or fails validation.
        """
        # only convert if standardized document date is set and passes validation
        if self.doc_date_standard and self.re_date_format.match(self.doc_date_standard):
            date_parts = self.doc_date_standard.split("/")
            # if we have a single date, return it as is
            if len(date_parts) == 1:
                return date_parts[0]
            # if there's more than one date, return as a range
            return "[%s TO %s]" % tuple(date_parts)
        return None
# Julian Thursday, 4 October 1582, being followed by Gregorian Friday, 15 October
# cut off between gregorian/julian dates, in julian days:
# Julian 5 October 1582 is the first day that is treated as Gregorian;
# converted dates with an earlier julian day are returned as Julian dates
gregorian_start_jd = convertdate.julianday.from_julian(1582, 10, 5)

# PGP month names don't match exactly those used in convertdate;
# add local aliases to map them
# (keys are the local spellings; values are the names looked up
# in convertdate.hebrew.MONTHS)
hebrew_month_aliases = {
    "Tevet": "Teveth",
    "Iyar": "Iyyar",
    "Tishrei": "Tishri",
    "Adar I": "Adar",
    "Adar II": "Adar Bet",
}
[docs]
def get_hebrew_month(month_name):
    """Look up the number of a Hebrew month from its name.

    Accents are ignored and locally-used alternate spellings
    are mapped to the names used by `convertdate`.

    :param month_name: string month name
    :return int: month number (1-based position in the convertdate list)
    :raises ValueError: if the name is not a known Hebrew month
    """
    # strip accents so accented and unaccented spellings match
    plain_name = unidecode(month_name)
    # swap in the convertdate spelling if this is a local alias
    canonical_name = hebrew_month_aliases.get(plain_name, plain_name)
    position = convertdate.hebrew.MONTHS.index(canonical_name)
    # list index is zero-based; month numbers start at one
    return position + 1
#: regular expression for extracting information from original date string;
#: named groups, in order: optional ``weekday`` (a word ending in "day",
#: optionally followed by a comma), optional numeric ``day``, optional
#: ``month``, and a required three- or four-digit ``year``
re_original_date = re.compile(
    # for months, match any non-numeric characters, since some month names are multi-word
    r"(?:(?P<weekday>\w+day),? )?(?:(?P<day>\d+) )?(?:(?P<month>[^\d]+( I{1,2})?) )?(?P<year>\d{3,4})",
    flags=re.UNICODE,
)

# characters to remove before applying regex to historical date
# (translation table that deletes square brackets and parentheses)
ignorechars = str.maketrans({val: "" for val in "[]()"})
[docs]
def get_calendar_month(convertdate_module, month):
    """Convert month name to month number for the specified calendar.

    :param convertdate_module: `convertdate` calendar module to use
    :param month: string month name
    :return int: month number; None if the module is neither the
        Hebrew nor the Islamic `convertdate` calendar
    """
    # pair each supported calendar module with its month-name lookup
    month_lookups = (
        (convertdate.hebrew, get_hebrew_month),
        (convertdate.islamic, get_islamic_month),
    )
    for calendar_module, lookup in month_lookups:
        if convertdate_module == calendar_module:
            return lookup(month)
    return None
[docs]
def convert_hebrew_date(historic_date):
    """Standardize a Hebrew (Anno Mundi) calendar date string to the
    Julian or Gregorian calendar; shortcut for :func:`standardize_date`."""
    return standardize_date(historic_date, calendar=Calendar.ANNO_MUNDI)
[docs]
def convert_seleucid_date(historic_date):
    """Standardize a Greek Seleucid calendar date string to the
    Julian or Gregorian calendar; shortcut for :func:`standardize_date`."""
    return standardize_date(historic_date, calendar=Calendar.SELEUCID)
[docs]
def convert_islamic_date(historic_date):
    """Standardize an Islamic (Hijri) calendar date string to the
    Julian or Gregorian calendar; shortcut for :func:`standardize_date`."""
    return standardize_date(historic_date, calendar=Calendar.HIJRI)
#: mapping between supported calendars and corresponding convertdate module;
#: keyed on :class:`Calendar` codes. Kharaji has no entry, so looking it up
#: raises a KeyError.
calendar_converter = {
    Calendar.ANNO_MUNDI: convertdate.hebrew,
    Calendar.HIJRI: convertdate.islamic,
    # NOTE: Seleucid years cannot be passed directly into convertdate.hebrew; instead,
    # convert them to AM using the Seleucid offset first, in order to handle leap years
    Calendar.SELEUCID: convertdate.hebrew,
}
[docs]
def standardize_date(historic_date, calendar):
    """
    Convert a historic date in text format to a standard date range.

    :param historic_date: date string, e.g. day, month name and year;
        brackets and parentheses are ignored
    :param calendar: :class:`Calendar` code for the calendar the date is in;
        must be a key of ``calendar_converter``
    :return: tuple of (earliest, latest) :class:`datetime.date`, or None
        if the date string does not match the expected pattern
    """
    # some historic dates include brackets or parentheses; ignore them
    cleaned_date = historic_date.translate(ignorechars)
    # get the convertdate calendar module for the specified calendar
    converter = calendar_converter[calendar]

    parsed = re_original_date.match(cleaned_date)
    # may want to log or debug regex mismatch for dev or reporting
    if not parsed:
        return None

    year = int(parsed.group("year"))
    if calendar == Calendar.SELEUCID:
        # Seleucid years are converted as Anno Mundi years
        year += Calendar.SELEUCID_OFFSET

    month = parsed.group("month")
    if month:
        # ignore seasons for now
        is_season = month.lower() in ["spring", "summer", "fall", "winter"]
        month = None if is_season else get_calendar_month(converter, month)

    day_text = parsed.group("day")
    day = int(day_text) if day_text else None

    # NOTE: weekday is captured by the regex but not currently checked
    converted = get_calendar_date(converter, year, month, day)
    # a single date is returned as a pair for consistency with ranges
    if isinstance(converted, date):
        return (converted, converted)
    return converted
[docs]
def get_calendar_date(converter, year, month=None, day=None, mode=None):
    """
    Convert a date from a supported calendar and return
    as a :class:`datetime.date` or tuple of dates for a date range,
    when the conversion is ambiguous. Takes year and optional month and day.

    :param converter: `convertdate` calendar module for the source calendar
    :param year: year in the source calendar
    :param month: month number, if known
    :param day: day of the month, if known (ignored when month is unknown)
    :param mode: when ``"latest"`` and only the month is known, return just
        the last day of the month instead of a (first, last) tuple
    """
    # NOTE: this raises an error if conversion is out of range

    if month and day:
        # year/month/day all known: convert via julian days
        julian_day = converter.to_jd(year, month, day)
        # before the start of the gregorian calendar, convert to julian;
        # otherwise, convert to gregorian
        if julian_day < gregorian_start_jd:
            year_month_day = convertdate.julian.from_jd(julian_day)
        else:
            year_month_day = convertdate.gregorian.from_jd(julian_day)
        # convert tuple of year, month, day to datetime.date
        return date(*year_month_day)

    if month:
        # month known but not day: find the length of the month
        # convertdate is inconsistent; should be fixed in 2.4.1
        if hasattr(converter, "month_days"):
            # hebrew calendar has month_days method
            last_day = converter.month_days(year, month)
        else:
            # islamic calendar has month_length
            last_day = converter.month_length(year, month)

        # when mode is latest, only return the last day of the month
        if mode == "latest":
            return get_calendar_date(converter, year, month, last_day)
        # otherwise, return first and last day of the month
        return (
            get_calendar_date(converter, year, month, 1),
            get_calendar_date(converter, year, month, last_day),
        )

    # month unknown: calculate the earliest and latest dates in the year
    if converter == convertdate.hebrew:
        # hebrew calendar civil year starts in Tishri;
        # Elul is the month before Tishri
        first_month = convertdate.hebrew.TISHRI
        last_month = convertdate.hebrew.ELUL
    else:
        # fall back to the number of months;
        # In Islamic calendar, does not vary by year
        first_month = 1
        last_month = len(converter.MONTHS)

    # first day of the first month and last day of the last month
    return (
        get_calendar_date(converter, year, first_month, 1),
        get_calendar_date(converter, year, last_month, mode="latest"),
    )
[docs]
def display_date_range(earliest, latest):
    """
    Format a pair of dates as an ISO date string: a single ISO date
    when both are the same, otherwise ``earliest/latest``.
    """
    # identical dates are only displayed once
    if earliest == latest:
        return earliest.isoformat()
    return "/".join(bound.isoformat() for bound in (earliest, latest))
## islamic calendar
# generate unicode-decoded version of month list for better matching
# (same order as convertdate.islamic.MONTHS, so list position + 1
# is the month number)
islamic_months = [unidecode(m) for m in convertdate.islamic.MONTHS]
# local overrides for months
# (keys are the local spellings; values must match entries
# in islamic_months exactly)
islamic_month_aliases = {
    "Muharram": "al-Muharram",
    "Rabi' I": "Rabi' al-`Awwal",
    # NOTE: trailing commas are typos in convertdate; will be fixed in 2.4.1
    "Rabi' II": "Rabi' ath-Thani,",
    "Jumada I": "Jumada al-`Awwal,",
    "Jumada II": "Jumada ath-Thaniyah,",
    "Dhu l-Qa'da": "Zu al-Qa'dah",
    "Dhu l-Hijja": "Zu al-Hijjah",
}
[docs]
def get_islamic_month(month_name):
    """Look up the number of an Islamic month from its name.

    Accents are ignored and local month-name overrides are supported.

    :param month_name: string month name
    :return int: month number (1-based position in the month list)
    :raises ValueError: if the name is not a known Islamic month
    """
    # strip accents so accented and unaccented spellings match
    plain_name = unidecode(month_name)
    # swap in the convertdate spelling if this is a local override
    canonical_name = islamic_month_aliases.get(plain_name, plain_name)
    # list index is zero-based; month numbers start at one
    return islamic_months.index(canonical_name) + 1
[docs]
def standard_date_display(standard_date):
    """Format a standardized CE date (or date range) for human readers.

    :param standard_date: ISO-format date string, with a slash
        separating start and end if it is a date range
    :return: display string ending in "CE"; None if nothing to display
    """
    # nothing to display
    if not standard_date:
        return None

    # stored in isoformat, with slash if a date range, so there is
    # always at least one part when a date is set
    date_strings = standard_date.split("/")
    try:
        # partial dates give precision-aware string formatting
        formatted = [str(PartialDate(date_string)) for date_string in date_strings]
    except ValueError:
        # dates entered before validation was applied may not parse
        # as fallback, display as is
        return "%s CE" % standard_date

    # join with en-dash if more than one;
    # add CE to the end to make calendar system explicit
    return "%s CE" % " – ".join(formatted)