# Source code for mep.books.utils
'''
Utility methods for generating slugs for works. Used in Work
model save method and model migration.
'''
import re
from string import punctuation
from django.utils.text import slugify
import stop_words
from unidecode import unidecode
# combined list of English and French stop words, English first
STOP_WORDS = (
    stop_words.get_stop_words('en')
    + stop_words.get_stop_words('fr')
)
def nonstop_words(text):
    '''Break text into words and return the words that are not stopwords.

    Punctuation is dropped, including apostrophes inside words. If every
    word turns out to be a stopword, the complete word list is returned
    instead of an empty list.

    :param text: string to split into words
    :returns: list of words
    '''
    # drop a French L' at the start of a word (any case), then drop every
    # remaining apostrophe so contractions and possessives stay one word
    without_elision = re.sub(r'\bL\'', '', text, flags=re.IGNORECASE)
    cleaned = without_elision.replace("'", "")
    # break on runs of whitespace/punctuation; discard empty strings
    tokens = list(filter(None, re.split(r'[\s\W]+', cleaned)))
    # a word counts as a stopword when its slugified form is in STOP_WORDS
    kept = [token for token in tokens if slugify(token) not in STOP_WORDS]
    # fall back to the unfiltered words when nothing survives
    # (i.e., title "Car" which is a French stopword)
    if kept:
        return kept
    return tokens
def creator_lastname(work):
    '''Return the last name of the first creator on this work: the first
    author, or the first editor when there is no author.

    :param work: object whose ``creator_set`` is queried
    :returns: last name string; empty string when the work has no
        creators, or none with type Author or Editor
    '''
    all_creators = work.creator_set.all()
    if not all_creators.exists():
        return ''

    # prefer the first author; otherwise fall back to the first editor
    first_creator = (
        all_creators.filter(creator_type__name='Author').first()
        or all_creators.filter(creator_type__name='Editor').first()
    )
    if not first_creator:
        return ''

    # based on logic from person short name property: keep the text
    # before the first comma, then before the first open parenthesis
    before_comma = first_creator.person.sort_name.partition(',')[0]
    return before_comma.partition('(')[0].strip()
def work_slug(work, max_words=3):
    '''Generate a slug for a work. Uses last name of first author (or
    first editor if no author), and first few non-stopwords in the title.

    :param work: work to generate a slug for; its ``title`` is read here
        and the work itself is passed to :func:`creator_lastname`
    :param max_words: maximum number of title words to include
        (default: 3)
    :returns: slugified string of the last name and title words
    '''
    lastname = creator_lastname(work)
    # title with stop words removed; by default, use at most the first
    # three words
    title_words = nonstop_words(work.title)[:max_words]
    # join the words explicitly instead of formatting the list itself,
    # which would put the list repr (brackets, quotes, commas) into the
    # text handed to slugify
    slug_text = '%s %s' % (lastname, ' '.join(title_words))
    return slugify(unidecode(slug_text))
def generate_sort_title(title):
    '''Generate sort title based on title. Removes leading punctuation
    and whitespace, then a leading English or French article.

    :param title: title string
    :returns: title without leading punctuation/whitespace; when the
        first word is an article and more text follows it, the article
        is removed as well
    '''
    # imported locally so the function carries its own dependency
    from unicodedata import category

    # english & french definite/indefinite articles
    non_sort = ('the', 'a', 'an', 'la', 'le', 'les', 'l')

    def ignorable(char):
        # ASCII punctuation, plus whitespace and any other Unicode
        # punctuation (curly quotes, guillemets, ...), so that none of
        # them hides a leading article or ends up in the sort title
        return (char in punctuation or char.isspace()
                or category(char).startswith('P'))

    # skip leading punctuation and whitespace (quotes, brackets, etc)
    start = 0
    while start < len(title) and ignorable(title[start]):
        start += 1
    sort_title = title[start:]

    # split on punctuation or whitespace to get the first word
    title_parts = [part for part in
                   re.split(r'[\s\W]+', sort_title, maxsplit=1)
                   if part]  # skip blank string
    # if more than one word and first word is an article, skip it
    if len(title_parts) > 1 and title_parts[0].lower() in non_sort:
        return title_parts[1]
    return sort_title