|
import re
import string
import unicodedata

import numpy as np
import spacy
from spacy.language import Language
from spacy.lookups import load_lookups
from spacy.tokenizer import Tokenizer
from spacy.tokens import Doc, Token
from spacy.util import compile_suffix_regex, registry

# Words ending in "-que" that are not enclitic compounds: the tokenizer
# must keep these whole, while true enclitics (e.g. "arma uirumque") are
# split into "uirum" + "que".
que_exceptions = []
|
|
|
|
|
# quisque ("each") paradigm
que_exceptions += [
    "quisque",
    "quidque",
    "quicque",
    "quodque",
    "cuiusque",
    "cuique",
    "quemque",
    "quamque",
    "quoque",
    "quaque",
    "quique",
    "quaeque",
    "quorumque",
    "quarumque",
    "quibusque",
    "quosque",
    "quasque",
]
|
|
|
|
|
# uterque ("each of two") paradigm
que_exceptions += [
    "uterque",
    "utraque",
    "utrumque",
    "utriusque",
    "utrique",
    "utrumque",
    "utramque",
    "utroque",
    "utraque",
    "utrique",
    "utraeque",
    "utrorumque",
    "utrarumque",
    "utrisque",
    "utrosque",
    "utrasque",
]
|
|
|
|
|
# quicumque ("whoever") paradigm
que_exceptions += [
    "quicumque",
    "quidcumque",
    "quodcumque",
    "cuiuscumque",
    "cuicumque",
    "quemcumque",
    "quamcumque",
    "quocumque",
    "quacumque",
    "quicumque",
    "quaecumque",
    "quorumcumque",
    "quarumcumque",
    "quibuscumque",
    "quoscumque",
    "quascumque",
]
|
|
|
|
|
# unusquisque ("each one") paradigm
que_exceptions += [
    "unusquisque",
    "unaquaeque",
    "unumquodque",
    "unumquidque",
    "uniuscuiusque",
    "unicuique",
    "unumquemque",
    "unamquamque",
    "unoquoque",
    "unaquaque",
]
|
|
|
|
|
# plerusque ("most, the greater part") paradigm
que_exceptions += [
    "plerusque",
    "pleraque",
    "plerumque",
    "plerique",
    "pleraeque",
    "pleroque",
    "pleramque",
    "plerorumque",
    "plerarumque",
    "plerisque",
    "plerosque",
    "plerasque",
]
|
|
|
|
|
# adverbs, conjunctions, and other fixed forms in -que
que_exceptions += [
    "absque",
    "abusque",
    "adaeque",
    "adusque",
    "aeque",
    "antique",
    "atque",
    "circumundique",
    "conseque",
    "cumque",
    "cunque",
    "denique",
    "deque",
    "donique",
    "hucusque",
    "inique",
    "inseque",
    "itaque",
    "longinque",
    "namque",
    "neque",
    "oblique",
    "peraeque",
    "praecoque",
    "propinque",
    "qualiscumque",
    "quandocumque",
    "quandoque",
    "quantuluscumque",
    "quantumcumque",
    "quantuscumque",
    "quinque",
    "quocumque",
    "quomodocumque",
    "quomque",
    "quotacumque",
    "quotcumque",
    "quotienscumque",
    "quotiensque",
    "quotusquisque",
    "quousque",
    "relinque",
    "simulatque",
    "torque",
    "ubicumque",
    "ubique",
    "undecumque",
    "undique",
    "usque",
    "usquequaque",
    "utcumque",
    "utercumque",
    "utique",
    "utrimque",
    "utrique",
    "utriusque",
    "utrobique",
    "utrubique",
]
|
|
|
|
|
|
|
|
|
# Lookup lemmatizer: overwrite statistical lemmas with entries from the
# "lemma_lookup" table in the Latin lookups data.
blank_nlp = spacy.blank("la")

lookups_data = load_lookups(lang=blank_nlp.vocab.lang, tables=["lemma_lookup"])
LOOKUPS = lookups_data.get_table("lemma_lookup")

# Preserve the statistical lemmatizer's prediction on a custom attribute.
Token.set_extension("predicted_lemma", default=None, force=True)
|
|
|
|
|
@Language.component(name="lookup_lemmatizer")
def make_lookup_lemmatizer_function(doc):
    for token in doc:
        # Keep the statistical lemmatizer's output before overwriting.
        token._.predicted_lemma = token.lemma_

        # Punctuation lemmatizes to itself; this case is final, so skip
        # the table lookup below.
        if token.text in string.punctuation:
            token.lemma_ = token.text
            token.pos_ = "PUNCT"
            token.tag_ = "punc"
            continue

        # An enclitic "que" split off by the tokenizer keeps its own form;
        # also final, so skip the table lookup below.
        if token.text == "que" and (
            token.pos_ == "CCONJ" or token.tag_ == "conjunction"
        ):
            token.lemma_ = token.text
            continue

        # Prefer the lookup-table lemma, falling back to the prediction.
        token.lemma_ = LOOKUPS.get(token.text, token.lemma_)

        # Retry lowercased for capitalized (e.g. sentence-initial) forms.
        if token.text[0].isupper() and token.text not in LOOKUPS:
            token.lemma_ = LOOKUPS.get(token.text.lower(), token.lemma_)
    return doc
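
# A minimal usage sketch (assumes this module has been imported so the
# component is registered; the sentence is illustrative):
#
#     nlp = spacy.blank("la")
#     nlp.add_pipe("lookup_lemmatizer")
#     doc = nlp("arma uirumque cano")
#     print([(t.text, t.lemma_, t._.predicted_lemma) for t in doc])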
|
|
|
|
|
|
|
|
|
|
|
|
|
# Contextual token vectors from the transformer, exposed through user
# hooks. See
# https://spacy.io/usage/processing-pipelines#custom-components-user-hooks
# and https://github.com/explosion/spaCy/discussions/6511.
@Language.factory("trf_vectors")
class TrfContextualVectors:
    """spaCy pipeline component that adds transformer vectors to each
    token, based on user hooks."""
|
|
|
    def __init__(self, nlp: Language, name: str):
        self.name = name
        self._nlp = nlp  # kept so bare strings can be processed in __call__
        Doc.set_extension("trf_token_vecs", default=None, force=True)

    def __call__(self, sdoc):
        # Accept a raw string as well as a Doc.
        if isinstance(sdoc, str):
            sdoc = self._nlp(sdoc)

        # Boundaries of each token's span of wordpiece indices.
        vec_idx_splits = np.cumsum(sdoc._.trf_data.align.lengths)

        # Flatten the transformer output to (n_wordpieces, hidden_size);
        # a hidden size of 768 is assumed here.
        trf_vecs = sdoc._.trf_data.tensors[0].reshape(-1, 768)

        # Wordpiece indices aligned to each spaCy token.
        vec_idxs = np.split(sdoc._.trf_data.align.dataXd, vec_idx_splits)

        # Pool (sum) the wordpiece vectors belonging to each token.
        vecs = np.stack([trf_vecs[idx].sum(0) for idx in vec_idxs[:-1]])
        sdoc._.trf_token_vecs = vecs

        sdoc.user_token_hooks["vector"] = self.vector
        sdoc.user_token_hooks["has_vector"] = self.has_vector
        return sdoc

    def vector(self, token):
        return token.doc._.trf_token_vecs[token.i]

    def has_vector(self, token):
        return True
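
# A minimal usage sketch (assumes a Latin transformer pipeline; the
# model name is illustrative, not a guaranteed package):
#
#     nlp = spacy.load("la_core_web_trf")
#     nlp.add_pipe("trf_vectors")
#     doc = nlp("arma uirumque cano")
#     doc[0].vector  # summed wordpiece vectors via the "vector" hook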
|
|
|
|
|
|
|
|
|
|
|
# Normalize token norms to standard orthography: v -> u and j -> i,
# preserving case.
@Language.component("normer")
def normer(doc):
    def norm(text):
        return (
            text.replace("v", "u").replace("j", "i").replace("V", "U").replace("J", "I")
        )

    for token in doc:
        token.norm_ = norm(token.norm_)

    return doc
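
# A minimal usage sketch:
#
#     nlp = spacy.blank("la")
#     nlp.add_pipe("normer")
#     doc = nlp("veni vidi vici")
#     print([t.norm_ for t in doc])  # ['ueni', 'uidi', 'uici']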
|
|
|
|
|
|
|
|
|
|
|
|
|
# Remap fine-grained tense values to coarser ones, keeping the original
# morphological analysis on a custom attribute.
Token.set_extension("remorph", default=None, force=True)
|
|
|
|
|
@Language.component("remorpher") |
|
def remorpher(doc): |
|
for token in doc: |
|
token._.remorph = token.morph |
|
morph = token.morph.to_dict() |
|
if morph.get("Tense"): |
|
if morph["Tense"] == "Perf" or morph["Tense"] == "Imp": |
|
morph["Tense"] = "Past" |
|
elif morph["Tense"] == "FutPerf": |
|
morph["Tense"] = "Fut" |
|
token.set_morph(morph) |
|
return doc |
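
# A minimal usage sketch (assumes a pipeline with a morphologizer that
# sets Tense; model name and outputs are illustrative):
#
#     nlp = spacy.load("la_core_web_sm")
#     nlp.add_pipe("remorpher")
#     tok = nlp("erat")[0]
#     tok.morph.get("Tense")      # ['Past']
#     tok._.remorph.get("Tense")  # ['Imp']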
|
|
|
|
|
|
|
|
|
|
|
|
|
@registry.tokenizers("latin_core_tokenizer")
def create_latin_tokenizer():
    def create_tokenizer(nlp):
        tokenizer = LatinTokenizer(nlp.vocab)

        # Split enclitic -que (also spelled -qve) as a suffix.
        suffixes = nlp.Defaults.suffixes + ["que", "qve"]
        suffix_regex = compile_suffix_regex(suffixes)
        tokenizer.suffix_search = suffix_regex.search

        # Register each non-enclitic -que form, in every casing, so that
        # it is never split.
        for item in que_exceptions:
            tokenizer.add_special_case(item, [{"ORTH": item}])
            tokenizer.add_special_case(item.lower(), [{"ORTH": item.lower()}])
            tokenizer.add_special_case(item.title(), [{"ORTH": item.title()}])
            tokenizer.add_special_case(item.upper(), [{"ORTH": item.upper()}])

        return tokenizer

    return create_tokenizer
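
# To use the registered tokenizer from a training config (sketch):
#
#     [nlp.tokenizer]
#     @tokenizers = "latin_core_tokenizer"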
|
|
|
|
|
class LatinTokenizer(Tokenizer):
    def separate_ligatures(self, text: str) -> str:
        """Convert ligatures while preserving case"""
        result = text
        result = result.replace("Æ", "Ae").replace("Œ", "Oe")
        result = result.replace("æ", "ae").replace("œ", "oe")
        return result

    def remove_macrons(self, text: str) -> str:
        """Remove macrons while preserving case"""
        macron_map = str.maketrans("āēīōūȳĀĒĪŌŪȲ", "aeiouyAEIOUY")
        return text.translate(macron_map)

    def remove_accents(self, text: str) -> str:
        """Remove diacritical marks"""
        return "".join(
            c
            for c in unicodedata.normalize("NFD", text)
            if unicodedata.category(c) != "Mn"
        )

    def norm_spacing(self, text: str) -> str:
        """Normalize spacing and strip whitespace"""
        return re.sub(r"\s+", " ", text).strip()

    def preprocess(self, text: str) -> str:
        """Apply all preprocessing steps in sequence"""
        text = self.separate_ligatures(text)
        text = self.remove_macrons(text)
        text = self.remove_accents(text)
        text = self.norm_spacing(text)
        return text

    def __call__(self, text):
        """Process text before tokenization"""
        processed_text = self.preprocess(text)
        return super().__call__(processed_text)
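
# Preprocessing sketch (illustrative inputs):
#
#     "Æneās"  -> "Aeneas"   (ligature separated, macron removed)
#     "vênit"  -> "venit"    (combining accent stripped via NFD)
#     "a   b"  -> "a b"      (whitespace collapsed)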
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    pass
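
    # A hedged smoke test: build a blank Latin pipeline around the custom
    # tokenizer; enclitic -que splits off unless the form is listed in
    # que_exceptions.
    nlp = spacy.blank("la")
    nlp.tokenizer = create_latin_tokenizer()(nlp)
    print([t.text for t in nlp("arma uirumque cano")])  # expected: ['arma', 'uirum', 'que', 'cano']
    print([t.text for t in nlp("quisque")])  # expected: ['quisque'], kept whole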
|
|