# la_core_web_sm / functions.py
import re
import unicodedata
from typing import List

import spacy
from spacy.language import Language
from spacy.tokenizer import Tokenizer
from spacy.util import compile_suffix_regex, registry
# ----- que_exceptions ----- #
que_exceptions: List[str] = []
# quisque / quique
que_exceptions += [
"quisque",
"quidque",
"quicque",
"quodque",
"cuiusque",
"cuique",
"quemque",
"quamque",
"quoque",
"quaque",
"quique",
"quaeque",
"quorumque",
"quarumque",
"quibusque",
"quosque",
"quasque",
]
# uterque
que_exceptions += [
"uterque",
"utraque",
"utrumque",
"utriusque",
"utrique",
"utrumque",
"utramque",
"utroque",
"utraque",
"utrique",
"utraeque",
"utrorumque",
"utrarumque",
"utrisque",
"utrosque",
"utrasque",
]
# quicumque
que_exceptions += [
"quicumque",
"quidcumque",
"quodcumque",
"cuiuscumque",
"cuicumque",
"quemcumque",
"quamcumque",
"quocumque",
"quacumque",
"quicumque",
"quaecumque",
"quorumcumque",
"quarumcumque",
"quibuscumque",
"quoscumque",
"quascumque",
]
# unusquisque
que_exceptions += [
"unusquisque",
"unaquaeque",
"unumquodque",
"unumquidque",
"uniuscuiusque",
"unicuique",
"unumquemque",
"unamquamque",
"unoquoque",
"unaquaque",
]
# plerusque
que_exceptions += [
"plerusque",
"pleraque",
"plerumque",
"plerique",
"pleraeque",
"pleroque",
"pleramque",
"plerorumque",
"plerarumque",
"plerisque",
"plerosque",
"plerasque",
]
# misc
que_exceptions += [
"absque",
"abusque",
"adaeque",
"adusque",
"aeque",
"antique",
"atque",
"circumundique",
"conseque",
"cumque",
"cunque",
"denique",
"deque",
"donique",
"hucusque",
"inique",
"inseque",
"itaque",
"longinque",
"namque",
"neque",
"oblique",
"peraeque",
"praecoque",
"propinque",
"qualiscumque",
"quandocumque",
"quandoque",
"quantuluscumque",
"quantumcumque",
"quantuscumque",
"quinque",
"quocumque",
"quomodocumque",
"quomque",
"quotacumque",
"quotcumque",
"quotienscumque",
"quotiensque",
"quotusquisque",
"quousque",
"relinque",
"simulatque",
"torque",
"ubicumque",
"ubique",
"undecumque",
"undique",
"usque",
"usquequaque",
"utcumque",
"utercumque",
"utique",
"utrimque",
"utrique",
"utriusque",
"utrobique",
"utrubique",
]
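# These forms end in "-que" as part of the word (or of a fixed compound), so they
# are registered as tokenizer special cases in create_latin_tokenizer below:
# "quisque" and "atque" stay single tokens, while a true enclitic such as
# "senatusque" is still split into "senatus" + "que" by the suffix rule.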
# ---------- #
# ----- lookup_lemmatizer ----- #
from spacy.language import Language
from spacy.lookups import load_lookups
from spacy.tokens import Token
import string

# Load the "lemma_lookup" table from the Latin lookups data
blank_nlp = spacy.blank("la")
lookups_data = load_lookups(lang=blank_nlp.vocab.lang, tables=["lemma_lookup"])
LOOKUPS = lookups_data.get_table("lemma_lookup")

# Stash the statistical lemmatizer's prediction before the lookup overrides it
Token.set_extension("predicted_lemma", default=None, force=True)  # TODO: test that this works
@Language.component(name="lookup_lemmatizer")
def make_lookup_lemmatizer_function(doc):
for token in doc:
token._.predicted_lemma = token.lemma_
# Handle punctuation
if token.text in string.punctuation:
token.lemma_ = token.text
token.pos_ = "PUNCT"
token.tag_ = "punc"
# Handle "que" enclitics
if token.text == "que" and (
token.pos_ == "CCONJ" or token.tag_ == "conjunction"
):
token.lemma_ = token.text
# Lookup lemmatizer
token.lemma_ = LOOKUPS.get(token.text, token.lemma_)
# Better handle capitalization
if token.text[0].isupper() and token.text not in LOOKUPS:
token.lemma_ = LOOKUPS.get(token.text.lower(), token.lemma_)
return doc
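# Usage sketch (assumes the packaged "la_core_web_sm" pipeline is installed and
# does not already include this component):
#
#     nlp = spacy.load("la_core_web_sm")
#     nlp.add_pipe("lookup_lemmatizer", last=True)
#     doc = nlp("arma uirumque cano")
#     print([(t.text, t.lemma_, t._.predicted_lemma) for t in doc])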
# ---------- #
# ----- trf_vectors ----- #
from spacy.language import Language
from spacy.tokens import Doc
import numpy as np
@Language.factory("trf_vectors")
class TrfContextualVectors:
"""
Spacy pipeline which add transformer vectors to each token based on user hooks.
https://spacy.io/usage/processing-pipelines#custom-components-user-hooks
https://github.com/explosion/spaCy/discussions/6511
"""
    def __init__(self, nlp: Language, name: str):
        self.name = name
        self._nlp = nlp  # kept so __call__ can process raw strings
        Doc.set_extension("trf_token_vecs", default=None, force=True)
def __call__(self, sdoc):
        # accept raw strings as well as already-processed Docs
        if isinstance(sdoc, str):
            sdoc = self._nlp(sdoc)
# pre-calculate all vectors for every token:
# calculate groups for spacy token boundaries in the trf vectors
vec_idx_splits = np.cumsum(sdoc._.trf_data.align.lengths)
        # get transformer vectors and reshape them into one large continuous tensor
trf_vecs = sdoc._.trf_data.tensors[0].reshape(-1, 768)
# calculate mapping groups from spacy tokens to transformer vector indices
vec_idxs = np.split(sdoc._.trf_data.align.dataXd, vec_idx_splits)
# take sum of mapped transformer vector indices for spacy vectors
vecs = np.stack([trf_vecs[idx].sum(0) for idx in vec_idxs[:-1]])
sdoc._.trf_token_vecs = vecs
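        # inject hooks from this class so token.vector and token.has_vector
        # read from the per-token transformer vectors computed above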
sdoc.user_token_hooks["vector"] = self.vector
sdoc.user_token_hooks["has_vector"] = self.has_vector
return sdoc
def vector(self, token):
return token.doc._.trf_token_vecs[token.i]
def has_vector(self, token):
return True
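# Note: this component reads doc._.trf_data (set by a spacy-transformers
# "transformer" component), and the reshape above assumes 768-dimensional
# hidden states. A hedged usage sketch, assuming such a transformer pipeline:
#
#     nlp.add_pipe("trf_vectors", after="transformer")
#     doc = nlp("arma uirumque cano")
#     print(doc[0].vector.shape)  # summed wordpiece vectors for the first token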
# ---------- #
# ----- normer ----- #
from spacy.language import Language
@Language.component("normer")
def normer(doc):
def norm(text):
return (
text.replace("v", "u").replace("j", "i").replace("V", "U").replace("J", "I")
)
for token in doc:
token.norm_ = norm(token.norm_)
return doc
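# e.g. the norm of "vidit" becomes "uidit" and the norm of "jam" becomes "iam";
# token.text itself is left unchanged.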
# ---------- #
# ----- remorpher ----- #
from spacy.language import Language
from spacy.tokens import Token
Token.set_extension("remorph", default=None, force=True)
@Language.component("remorpher")
def remorpher(doc):
for token in doc:
token._.remorph = token.morph
morph = token.morph.to_dict()
if morph.get("Tense"):
if morph["Tense"] == "Perf" or morph["Tense"] == "Imp":
morph["Tense"] = "Past"
elif morph["Tense"] == "FutPerf":
morph["Tense"] = "Fut"
token.set_morph(morph)
return doc
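# e.g. Tense=Perf and Tense=Imp are collapsed to Tense=Past, and Tense=FutPerf
# to Tense=Fut; the original morphological analysis is preserved in
# token._.remorph.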
# ---------- #
# ----- customize_tokenizer ----- #
@registry.tokenizers("latin_core_tokenizer")
def create_latin_tokenizer():
def create_tokenizer(nlp):
tokenizer = LatinTokenizer(nlp.vocab)
# Add que-splitting
suffixes = nlp.Defaults.suffixes + ["que", "qve"]
suffix_regex = compile_suffix_regex(suffixes)
tokenizer.suffix_search = suffix_regex.search
# Add special cases
for item in que_exceptions:
tokenizer.add_special_case(item, [{"ORTH": item}])
tokenizer.add_special_case(item.lower(), [{"ORTH": item.lower()}])
tokenizer.add_special_case(item.title(), [{"ORTH": item.title()}])
tokenizer.add_special_case(item.upper(), [{"ORTH": item.upper()}])
return tokenizer
return create_tokenizer
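# The registered tokenizer is referenced from the pipeline config, e.g.:
#
#     [nlp.tokenizer]
#     @tokenizers = "latin_core_tokenizer"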
class LatinTokenizer(Tokenizer):
def separate_ligatures(self, text: str) -> str:
"""Convert ligatures while preserving case"""
result = text
result = result.replace("Æ", "Ae").replace("Œ", "Oe")
result = result.replace("æ", "ae").replace("œ", "oe")
return result
def remove_macrons(self, text: str) -> str:
"""Remove macrons while preserving case"""
macron_map = str.maketrans("āēīōūȳĀĒĪŌŪȲ", "aeiouyAEIOUY")
return text.translate(macron_map)
def remove_accents(self, text: str) -> str:
"""Remove diacritical marks"""
return "".join(
c
for c in unicodedata.normalize("NFD", text)
if unicodedata.category(c) != "Mn"
)
def norm_spacing(self, text: str) -> str:
"""Normalize spacing and strip whitespace"""
return re.sub(r"\s+", " ", text).strip()
def preprocess(self, text: str) -> str:
"""Apply all preprocessing steps in sequence"""
text = self.separate_ligatures(text)
text = self.remove_macrons(text)
text = self.remove_accents(text)
text = self.norm_spacing(text)
return text
def __call__(self, text):
"""Process text before tokenization"""
processed_text = self.preprocess(text)
return super().__call__(processed_text)
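# Preprocessing example: "Æneās  fīlius" -> "Aeneas filius" (ligatures expanded,
# macrons and other combining diacritics stripped, whitespace collapsed) before
# regular tokenization runs. Because the text is rewritten, character offsets in
# the resulting Doc refer to the normalized string rather than the raw input.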
# ---------- #
if __name__ == "__main__":
pass
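    # Minimal smoke test on a blank Latin pipeline, using only the components
    # defined above (assumes spacy-lookups-data is available, which the lookup
    # table loaded above already requires).
    nlp = spacy.blank("la")
    nlp.tokenizer = create_latin_tokenizer()(nlp)
    nlp.add_pipe("normer")
    nlp.add_pipe("lookup_lemmatizer")
    doc = nlp("Arma virumque cano, Troiae quī prīmus ab ōrīs")
    print([(t.text, t.norm_, t.lemma_) for t in doc])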