import re
from typing import List

from phonemizer import backend


class Tokenizer:
    def __init__(self):
        self.VOCAB = self._get_vocab()
        self.phonemizers = {
            'en-us': backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True),
            'en-gb': backend.EspeakBackend(language='en-gb', preserve_punctuation=True, with_stress=True),
        }
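
    # Note: phonemizer's EspeakBackend relies on the system espeak-ng library being
    # installed; the two backends above differ only in the English accent they use.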

    def _get_vocab(self):
        """
        Generates a mapping of symbols to integer indices for tokenization.

        Returns:
            dict: A dictionary where keys are symbols and values are unique integer indices.
        """
        # Define the symbols
        _pad = "$"
        _punctuation = ';:,.!?¡¿—…"«»“” '
        _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
        _letters_ipa = (
            "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
        )
        symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)
        # Create a dictionary mapping each symbol to its index
        return {symbol: index for index, symbol in enumerate(symbols)}
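
    # With the symbol order above, the pad token '$' maps to index 0, followed by the
    # punctuation, Latin letters, and IPA symbols (e.g. VOCAB['$'] == 0, VOCAB[';'] == 1).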

    def split_num(self, num: re.Match) -> str:
        """
        Processes numeric strings, formatting them as time, years, or other representations.

        Args:
            num (re.Match): A regex match object representing the numeric string.

        Returns:
            str: A formatted string based on the numeric input.
        """
        num = num.group()
        # Leave decimal numbers untouched; they are handled later by point_num
        if '.' in num:
            return num
        # Handle time (e.g., "12:30")
        if ':' in num:
            hours, minutes = map(int, num.split(':'))
            if minutes == 0:
                return f"{hours} o'clock"
            elif minutes < 10:
                return f'{hours} oh {minutes}'
            return f'{hours} {minutes}'
        # Handle years or general numeric cases
        year = int(num[:4])
        if year < 1100 or year % 1000 < 10:
            return num
        left, right = num[:2], int(num[2:4])
        suffix = 's' if num.endswith('s') else ''
        # Format years
        if 100 <= year % 1000 <= 999:
            if right == 0:
                return f'{left} hundred{suffix}'
            elif right < 10:
                return f'{left} oh {right}{suffix}'
        return f'{left} {right}{suffix}'
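
    # Illustrative behaviour of split_num, following the rules above:
    #   "2:00"  -> "2 o'clock"    "2:05"  -> "2 oh 5"       "12:30" -> "12 30"
    #   "1984"  -> "19 84"        "1900s" -> "19 hundreds"
    #   "2012"  -> "20 12"        "2003"  -> "2003" (last digits < 10, left as-is)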

    def flip_money(self, m: re.Match) -> str:
        """
        Converts monetary values to a textual representation.

        Args:
            m (re.Match): A regex match object representing the monetary value.

        Returns:
            str: A formatted string describing the monetary value.
        """
        m = m.group()
        currency = 'dollar' if m[0] == '$' else 'pound'
        # Handle amounts followed by a scale word (e.g., "$3 million")
        if m[-1].isalpha():
            return f'{m[1:]} {currency}s'
        # Handle whole amounts (e.g., "$10", "£20")
        if '.' not in m:
            suffix = '' if m[1:] == '1' else 's'
            return f'{m[1:]} {currency}{suffix}'
        # Handle amounts with decimals (e.g., "$10.50", "£5.25")
        whole, cents = m[1:].split('.')
        suffix = '' if whole == '1' else 's'
        cents = int(cents.ljust(2, '0'))  # Pad so ".5" is read as 50 cents, not 5
        coins = f"cent{'' if cents == 1 else 's'}" if m[0] == '$' else ('penny' if cents == 1 else 'pence')
        return f'{whole} {currency}{suffix} and {cents} {coins}'
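
    # Illustrative behaviour of flip_money (given the patterns fed to it by normalize_text):
    #   "$1"    -> "1 dollar"                  "£20"   -> "20 pounds"
    #   "$5.50" -> "5 dollars and 50 cents"    "£1.01" -> "1 pound and 1 penny"
    #   "$3 million" -> "3 million dollars"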

    def point_num(self, match: re.Match) -> str:
        """Spells out a decimal number, reading the digits after the point one by one."""
        whole, fractional = match.group().split('.')
        return ' point '.join([whole, ' '.join(fractional)])
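
    # Illustrative behaviour of point_num: "3.14" -> "3 point 1 4"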

    def normalize_text(self, text: str) -> str:
        """
        Normalizes input text by replacing special characters, punctuation, and applying custom transformations.

        Args:
            text (str): Input text to normalize.

        Returns:
            str: Normalized text.
        """
        # Replace specific characters with standardized versions
        replacements = {
            chr(8216): "'",    # Left single quotation mark
            chr(8217): "'",    # Right single quotation mark
            '«': chr(8220),    # Left double angle quotation mark to left double quotation mark
            '»': chr(8221),    # Right double angle quotation mark to right double quotation mark
            chr(8220): '"',    # Left double quotation mark
            chr(8221): '"',    # Right double quotation mark
            '(': '«',          # Replace parentheses with angle quotation marks
            ')': '»',
        }
        for old, new in replacements.items():
            text = text.replace(old, new)
        # Replace full-width punctuation with its ASCII equivalent plus a space
        punctuation_replacements = {
            '、': ',',
            '。': '.',
            '！': '!',
            '，': ',',
            '：': ':',
            '；': ';',
            '？': '?',
        }
        for old, new in punctuation_replacements.items():
            text = text.replace(old, new + ' ')
        # Apply regex-based whitespace replacements
        text = re.sub(r'[^\S\n]', ' ', text)
        text = re.sub(r' +', ' ', text)
        text = re.sub(r'(?<=\n) +(?=\n)', '', text)
        # Expand abbreviations and handle special cases
        abbreviation_patterns = [
            (r'\bD[Rr]\.(?= [A-Z])', 'Doctor'),
            (r'\b(?:Mr\.|MR\.(?= [A-Z]))', 'Mister'),
            (r'\b(?:Ms\.|MS\.(?= [A-Z]))', 'Miss'),
            (r'\b(?:Mrs\.|MRS\.(?= [A-Z]))', 'Mrs'),
            (r'\betc\.(?! [A-Z])', 'etc'),
            (r'(?i)\b(y)eah?\b', r"\1e'a"),
        ]
        for pattern, replacement in abbreviation_patterns:
            text = re.sub(pattern, replacement, text)
        # Handle numbers and monetary values
        text = re.sub(r'\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)', self.split_num, text)
        text = re.sub(r'(?<=\d),(?=\d)', '', text)  # Remove commas from numbers
        text = re.sub(
            r'(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b',
            self.flip_money,
            text
        )
        text = re.sub(r'\d*\.\d+', self.point_num, text)
        text = re.sub(r'(?<=\d)-(?=\d)', ' to ', text)
        # Handle possessives and specific letter cases
        text = re.sub(r'(?<=\d)S', ' S', text)
        text = re.sub(r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b", "'S", text)
        text = re.sub(r"(?<=X')S\b", 's', text)
        # Handle abbreviations with dots
        text = re.sub(r'(?:[A-Za-z]\.){2,} [a-z]', lambda m: m.group().replace('.', '-'), text)
        text = re.sub(r'(?i)(?<=[A-Z])\.(?=[A-Z])', '-', text)
        return text.strip()
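
    # Example of normalize_text, assuming the rules above:
    #   "Dr. Smith paid $5.50 at 12:30."
    #   -> "Doctor Smith paid 5 dollars and 50 cents at 12 30."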

    def tokenize(self, phonemes: str) -> List[int]:
        """
        Tokenizes a given string into a list of indices based on VOCAB.

        Args:
            phonemes (str): Input phoneme string to tokenize.

        Returns:
            list: A list of integer indices corresponding to the characters in the input string.
        """
        return [self.VOCAB[x] for x in phonemes if x in self.VOCAB]
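
    # Note: characters that are not in VOCAB are silently dropped, so e.g.
    # tokenize("həlˈoʊ") yields one index per phoneme symbol and nothing else.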

    def phonemize(self, text: str, lang: str = 'en-us', normalize: bool = True) -> str:
        """
        Converts text to phonemes using the specified language phonemizer and applies normalization.

        Args:
            text (str): Input text to be phonemized.
            lang (str): Language identifier ('en-us' or 'en-gb') for selecting the phonemizer.
            normalize (bool): Whether to normalize the text before phonemization.

        Returns:
            str: A processed string of phonemes.
        """
        # Normalize text if required
        if normalize:
            text = self.normalize_text(text)
        # Generate phonemes using the specified phonemizer
        if lang not in self.phonemizers:
            print(f"Language '{lang}' not supported. Defaulting to 'en-us'.")
            lang = 'en-us'
        phonemes = self.phonemizers[lang].phonemize([text])
        phonemes = phonemes[0] if phonemes else ''
        # Apply custom phoneme replacements
        replacements = {
            'kəkˈoːɹoʊ': 'kˈoʊkəɹoʊ',
            'kəkˈɔːɹəʊ': 'kˈəʊkəɹəʊ',
            'ʲ': 'j',
            'r': 'ɹ',
            'x': 'k',
            'ɬ': 'l',
        }
        for old, new in replacements.items():
            phonemes = phonemes.replace(old, new)
        # Apply regex-based replacements
        phonemes = re.sub(r'(?<=[a-zɹː])(?=hˈʌndɹɪd)', ' ', phonemes)
        phonemes = re.sub(r' z(?=[;:,.!?¡¿—…"«»“” ]|$)', 'z', phonemes)
        # Additional language-specific rules (American English flaps the "t" in "ninety")
        if lang == 'en-us':
            phonemes = re.sub(r'(?<=nˈaɪn)ti(?!ː)', 'di', phonemes)
        # Filter out characters not in VOCAB
        phonemes = ''.join(filter(lambda p: p in self.VOCAB, phonemes))
        return phonemes.strip()
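

# Minimal usage sketch; assumes the `phonemizer` package and the system
# espeak-ng library are available.
if __name__ == '__main__':
    tokenizer = Tokenizer()
    phonemes = tokenizer.phonemize("Dr. Smith paid $5.50 at 12:30.", lang='en-us')
    token_ids = tokenizer.tokenize(phonemes)
    print(phonemes)
    print(token_ids)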