import spacy from spacy.language import Language from typing import List from spacy.util import registry, compile_suffix_regex from spacy.tokenizer import Tokenizer from spacy.util import registry import unicodedata from typing import Dict, Any from spacy.language import Language import re # ----- que_exceptions ----- # que_exceptions = [] # type: List[str] # quisque / quique que_exceptions += [ "quisque", "quidque", "quicque", "quodque", "cuiusque", "cuique", "quemque", "quamque", "quoque", "quaque", "quique", "quaeque", "quorumque", "quarumque", "quibusque", "quosque", "quasque", ] # uterque que_exceptions += [ "uterque", "utraque", "utrumque", "utriusque", "utrique", "utrumque", "utramque", "utroque", "utraque", "utrique", "utraeque", "utrorumque", "utrarumque", "utrisque", "utrosque", "utrasque", ] # quiscumque que_exceptions += [ "quicumque", "quidcumque", "quodcumque", "cuiuscumque", "cuicumque", "quemcumque", "quamcumque", "quocumque", "quacumque", "quicumque", "quaecumque", "quorumcumque", "quarumcumque", "quibuscumque", "quoscumque", "quascumque", ] # unuscumque que_exceptions += [ "unusquisque", "unaquaeque", "unumquodque", "unumquidque", "uniuscuiusque", "unicuique", "unumquemque", "unamquamque", "unoquoque", "unaquaque", ] # plerusque que_exceptions += [ "plerusque", "pleraque", "plerumque", "plerique", "pleraeque", "pleroque", "pleramque", "plerorumque", "plerarumque", "plerisque", "plerosque", "plerasque", ] # misc que_exceptions += [ "absque", "abusque", "adaeque", "adusque", "aeque", "antique", "atque", "circumundique", "conseque", "cumque", "cunque", "denique", "deque", "donique", "hucusque", "inique", "inseque", "itaque", "longinque", "namque", "neque", "oblique", "peraeque", "praecoque", "propinque", "qualiscumque", "quandocumque", "quandoque", "quantuluscumque", "quantumcumque", "quantuscumque", "quinque", "quocumque", "quomodocumque", "quomque", "quotacumque", "quotcumque", "quotienscumque", "quotiensque", "quotusquisque", "quousque", "relinque", "simulatque", "torque", "ubicumque", "ubique", "undecumque", "undique", "usque", "usquequaque", "utcumque", "utercumque", "utique", "utrimque", "utrique", "utriusque", "utrobique", "utrubique", ] # ---------- # # ----- lookup_lemmatizer ----- # from spacy.language import Language from spacy.lookups import load_lookups from spacy.tokens import Token from spacy.lookups import Lookups import string blank_nlp = spacy.blank("la") lookups = Lookups() lookups_data = load_lookups(lang=blank_nlp.vocab.lang, tables=["lemma_lookup"]) LOOKUPS = lookups_data.get_table("lemma_lookup") Token.set_extension( "predicted_lemma", default=None, force=True ) # TODO: test that this works @Language.component(name="lookup_lemmatizer") def make_lookup_lemmatizer_function(doc): for token in doc: token._.predicted_lemma = token.lemma_ # Handle punctuation if token.text in string.punctuation: token.lemma_ = token.text token.pos_ = "PUNCT" token.tag_ = "punc" # Handle "que" enclitics if token.text == "que" and ( token.pos_ == "CCONJ" or token.tag_ == "conjunction" ): token.lemma_ = token.text # Lookup lemmatizer token.lemma_ = LOOKUPS.get(token.text, token.lemma_) # Better handle capitalization if token.text[0].isupper() and token.text not in LOOKUPS: token.lemma_ = LOOKUPS.get(token.text.lower(), token.lemma_) return doc # ---------- # # ----- trf_vectors ----- # from spacy.language import Language from spacy.tokens import Doc import numpy as np @Language.factory("trf_vectors") class TrfContextualVectors: """ Spacy pipeline which add transformer vectors to each token based on user hooks. https://spacy.io/usage/processing-pipelines#custom-components-user-hooks https://github.com/explosion/spaCy/discussions/6511 """ def __init__(self, nlp: Language, name: str): self.name = name Doc.set_extension("trf_token_vecs", default=None) def __call__(self, sdoc): # inject hooks from this class into the pipeline if type(sdoc) == str: sdoc = self._nlp(sdoc) # pre-calculate all vectors for every token: # calculate groups for spacy token boundaries in the trf vectors vec_idx_splits = np.cumsum(sdoc._.trf_data.align.lengths) # get transformer vectors and reshape them into one large continous tensor trf_vecs = sdoc._.trf_data.tensors[0].reshape(-1, 768) # calculate mapping groups from spacy tokens to transformer vector indices vec_idxs = np.split(sdoc._.trf_data.align.dataXd, vec_idx_splits) # take sum of mapped transformer vector indices for spacy vectors vecs = np.stack([trf_vecs[idx].sum(0) for idx in vec_idxs[:-1]]) sdoc._.trf_token_vecs = vecs sdoc.user_token_hooks["vector"] = self.vector sdoc.user_token_hooks["has_vector"] = self.has_vector return sdoc def vector(self, token): return token.doc._.trf_token_vecs[token.i] def has_vector(self, token): return True # ---------- # # ----- normer ----- # import unicodedata from spacy.language import Language import spacy @Language.component("normer") def normer(doc): def norm(text): return ( text.replace("v", "u").replace("j", "i").replace("V", "U").replace("J", "I") ) for token in doc: token.norm_ = norm(token.norm_) return doc # ---------- # # ----- remorpher ----- # from spacy.language import Language from spacy.tokens import Token, MorphAnalysis Token.set_extension("remorph", default=None, force=True) @Language.component("remorpher") def remorpher(doc): for token in doc: token._.remorph = token.morph morph = token.morph.to_dict() if morph.get("Tense"): if morph["Tense"] == "Perf" or morph["Tense"] == "Imp": morph["Tense"] = "Past" elif morph["Tense"] == "FutPerf": morph["Tense"] = "Fut" token.set_morph(morph) return doc # ---------- # # ----- customize_tokenizer ----- # @registry.tokenizers("latin_core_tokenizer") def create_latin_tokenizer(): def create_tokenizer(nlp): tokenizer = LatinTokenizer(nlp.vocab) # Add que-splitting suffixes = nlp.Defaults.suffixes + ["que", "qve"] suffix_regex = compile_suffix_regex(suffixes) tokenizer.suffix_search = suffix_regex.search # Add special cases for item in que_exceptions: tokenizer.add_special_case(item, [{"ORTH": item}]) tokenizer.add_special_case(item.lower(), [{"ORTH": item.lower()}]) tokenizer.add_special_case(item.title(), [{"ORTH": item.title()}]) tokenizer.add_special_case(item.upper(), [{"ORTH": item.upper()}]) return tokenizer return create_tokenizer class LatinTokenizer(Tokenizer): def separate_ligatures(self, text: str) -> str: """Convert ligatures while preserving case""" result = text result = result.replace("Æ", "Ae").replace("Œ", "Oe") result = result.replace("æ", "ae").replace("œ", "oe") return result def remove_macrons(self, text: str) -> str: """Remove macrons while preserving case""" macron_map = str.maketrans("āēīōūȳĀĒĪŌŪȲ", "aeiouyAEIOUY") return text.translate(macron_map) def remove_accents(self, text: str) -> str: """Remove diacritical marks""" return "".join( c for c in unicodedata.normalize("NFD", text) if unicodedata.category(c) != "Mn" ) def norm_spacing(self, text: str) -> str: """Normalize spacing and strip whitespace""" return re.sub(r"\s+", " ", text).strip() def preprocess(self, text: str) -> str: """Apply all preprocessing steps in sequence""" text = self.separate_ligatures(text) text = self.remove_macrons(text) text = self.remove_accents(text) text = self.norm_spacing(text) return text def __call__(self, text): """Process text before tokenization""" processed_text = self.preprocess(text) return super().__call__(processed_text) # ---------- # if __name__ == "__main__": pass