from polyglot.detect import Detector from polyglot.text import Text from difflib import Differ from icecream import ic from patch import * from llama_index.core.node_parser import SentenceSplitter def lang_detector(text): min_chars = 5 if len(text) < min_chars: return "Input text too short" try: detector = Detector(text).language lang_info = str(detector) code = re.search(r"name: (\w+)", lang_info).group(1) return code except Exception as e: return f"ERROR:{str(e)}" def tokenize(text): # Use polyglot to tokenize the text polyglot_text = Text(text) words = polyglot_text.words # Check if the text contains spaces if ' ' in text: # Create a list of words and spaces tokens = [] for word in words: tokens.append(word) tokens.append(' ') # Add space after each word return tokens[:-1] # Remove the last space else: return words def diff_texts(text1, text2): tokens1 = tokenize(text1) tokens2 = tokenize(text2) d = Differ() diff_result = list(d.compare(tokens1, tokens2)) highlighted_text = [] for token in diff_result: word = token[2:] category = None if token[0] == '+': category = 'added' elif token[0] == '-': category = 'removed' elif token[0] == '?': continue # Ignore the hints line highlighted_text.append((word, category)) return highlighted_text #modified from src.translaation-agent.utils.tranlsate def translator( source_lang, target_lang, source_text, country, max_tokens=MAX_TOKENS_PER_CHUNK ): """Translate the source_text from source_lang to target_lang.""" num_tokens_in_text = num_tokens_in_string(source_text) ic(num_tokens_in_text) if num_tokens_in_text < max_tokens: ic("Translating text as single chunk") #Note: use yield from B() if put yield in function B() init_translation = one_chunk_initial_translation( source_lang, target_lang, source_text ) reflection = one_chunk_reflect_on_translation( source_lang, target_lang, source_text, init_translation, country ) final_translation = one_chunk_improve_translation( source_lang, target_lang, source_text, init_translation, reflection ) return init_translation, reflection, final_translation else: ic("Translating text as multiple chunks") token_size = calculate_chunk_size( token_count=num_tokens_in_text, token_limit=max_tokens ) ic(token_size) #using sentence splitter text_parser = SentenceSplitter( chunk_size=token_size, ) source_text_chunks = text_parser.split_text(source_text) translation_1_chunks = multichunk_initial_translation( source_lang, target_lang, source_text_chunks ) init_translation = "".join(translation_1_chunks) reflection_chunks = multichunk_reflect_on_translation( source_lang, target_lang, source_text_chunks, translation_1_chunks, country, ) reflection = "".join(reflection_chunks) translation_2_chunks = multichunk_improve_translation( source_lang, target_lang, source_text_chunks, translation_1_chunks, reflection_chunks, ) final_translation = "".join(translation_2_chunks) return init_translation, reflection, final_translation