Text-to-Speech
English

Kokoro can also speak languages it was not trained on?! (Examples: Greek and Armenian.)

#105
by vahanAI - opened

In the model card, I noticed that the user can provide Kokoro with the exact IPA text for audio export. After some trial and error, I created two simple draft Python files that convert Greek and Armenian text into tokens that Kokoro can understand and turn into speech. Of course, the results are not perfect, because the output inherits the tone of the original speaker (the em_santa voice gives better results, hoho MC XD). If there is a better implementation that gives a more natural tone, that would be awesome. @hexgrad, do you have any suggestions?
Please note that the code can seamlessly handle text that mixes Greek and Armenian.
I also provide a simple GUI (created with LLM) for the community to play with, hoping for some improvements.
Finally, I provide an audio example.

Put the following files in the same directory:
greek.py
armenian.py
inference.py
simple_gui.py

and have fun :)

-------------------------------------------------------------------------------------------------------------------

greek.py

class GreekToIPA:
    """Rule-based Greek grapheme-to-phoneme converter.

    Words containing Greek letters are rewritten as ``[word](/ipa/)`` markup;
    everything else is passed through untouched.  Before conversion, the
    digits 0-10 and a few whole-hour ``HH:00`` times are spelled out in Greek.

    Line breaks in the input are preserved, so a caller that later splits the
    result on newlines still sees one segment per original line.
    """

    def __init__(self):
        # Basic Greek to IPA mapping (single letters).  Some values ('gh',
        # 'th') are ASCII approximations rather than strict IPA; they are kept
        # exactly as the author tuned them.
        self.greek_to_ipa = {
            'α': 'a',
            'ά': 'ˈa',
            'β': 'v',
            'γ': 'gh',
            'δ': 'ð',
            'ε': 'ɛ',
            'έ': 'ˈɛ',
            'ζ': 'z',
            'η': 'i',
            'ή': 'ˈi',
            'θ': 'th',
            'ι': 'i',
            'ί': 'ˈi',
            'κ': 'k',
            'λ': 'l',
            'μ': 'm',
            'ν': 'n',
            'ξ': 'ks',
            'ο': 'o',
            'ό': 'ˈo',
            'π': 'p',
            'ρ': 'r',
            'σ': 's',
            'ς': 's',
            'τ': 't',
            'υ': 'i',
            'ύ': 'ˈi',
            'φ': 'f',
            'χ': 'x',
            'ψ': 'ps',
            'ω': 'o',
            'ώ': 'ˈo',
            'ϊ': 'i',
            'ΐ': 'ˈi',
            'ϋ': 'i',
            'ΰ': 'ˈi'
        }

        # Special combinations.  These are substituted before the single-letter
        # pass; their values contain no Greek letters, so they are not mapped
        # a second time.
        self.digraphs = {
            'αι': 'ɛ',
            'άι': 'ˈai',
            # Stress mark goes before the vowel, like every other stressed entry.
            'αί': 'ˈɛ',
            'ει': 'i',
            'εί': 'ˈi',
            'οι': 'i',
            'οί': 'ˈi',
            'ού': 'ˈu',
            'ου': 'u',
            'μπ': 'b',
            'ντ': 'd',
            'γκ': 'ghk',
            'γγ': 'ŋg',
            'αυ': 'av',
            'άυ': 'ˈav',
            'ευ': 'ev',
            'έυ': 'ˈev',
            'εύ': 'ˈev'
        }

        # Spelled-out numbers 0-10
        self.numbers = {
            '0': 'μηδέν',
            '1': 'ένα',
            '2': 'δύο',
            '3': 'τρία',
            '4': 'τέσσερα',
            '5': 'πέντε',
            '6': 'έξι',
            '7': 'επτά',
            '8': 'οκτώ',
            '9': 'εννέα',
            '10': 'δέκα'
        }

        # Spelled-out whole-hour times
        self.time_formats = {
            '00:00': 'μεσάνυχτα',
            '12:00': 'μεσημέρι',
            '13:00': 'μία',
            '14:00': 'δύο',
            '15:00': 'τρεις',
            '16:00': 'τέσσερις',
            '17:00': 'πέντε',
            '18:00': 'έξι',
            '19:00': 'επτά',
            '20:00': 'οκτώ',
            '21:00': 'εννέα',
            '22:00': 'δέκα',
            '23:00': 'έντεκα'
        }

    def _preprocess_line(self, line):
        """Spell out known times and numbers in one line (no newlines)."""
        processed_words = []
        for word in line.split():
            replacement = self.time_formats.get(word)
            if replacement is None and word.isdigit():
                replacement = self.numbers.get(word)
            processed_words.append(word if replacement is None else replacement)
        return ' '.join(processed_words)

    def preprocess_text(self, text):
        """Convert numbers and time formats to Greek text before IPA conversion.

        Whitespace inside a line is normalised to single spaces; line breaks
        are kept (as ``\\n``) and leading/trailing whitespace is dropped.
        """
        return '\n'.join(
            self._preprocess_line(line) for line in text.strip().splitlines()
        )

    def convert_word(self, greek_word):
        """Convert a single Greek word to IPA.

        Non-Greek characters (punctuation, Latin letters) are copied through
        unchanged.  Returns ``''`` for an empty word.
        """
        if not greek_word:
            return ''

        word = greek_word.lower()

        # Digraphs first, so that e.g. 'ου' becomes 'u' rather than 'o' + 'i'.
        for digraph, ipa in self.digraphs.items():
            word = word.replace(digraph, ipa)

        return ''.join(self.greek_to_ipa.get(char, char) for char in word)

    def _convert_line(self, line):
        """Wrap every Greek-containing word of one line in ``[word](/ipa/)``."""
        converted = []
        for word in line.split():
            if any(c in self.greek_to_ipa for c in word.lower()):
                ipa = self.convert_word(word)
                converted.append(f'[{word}](/{ipa}/)')
            else:
                converted.append(word)
        return ' '.join(converted)

    def convert_text(self, text):
        """Convert Greek text to IPA with formatting.

        Returns the text with each Greek word replaced by ``[word](/ipa/)``.
        Words without Greek letters are left as they are, and the original
        line structure is preserved.
        """
        preprocessed_text = self.preprocess_text(text)
        return '\n'.join(
            self._convert_line(line) for line in preprocessed_text.split('\n')
        )

# # Example usage
# if __name__ == "__main__":
#     converter = GreekToIPA()
#     test_text = "Γεια σας, είμαι το Κόκορο και μπορώ να μιλήσω Ελληνικά."
#     result = converter.convert_text(test_text)
#     print(result)

-------------------------------------------------------------------------------------------------------------------

armenian.py

class ArmenianToIPA:
    """Rule-based Armenian grapheme-to-phoneme converter.

    Words containing Armenian letters are rewritten as ``[word](/ipa/)``
    markup; everything else is passed through untouched.  Before conversion,
    the digits 0-10 and a few whole-hour ``HH:00`` times are spelled out in
    Armenian.

    Line breaks in the input are preserved, so a caller that later splits the
    result on newlines still sees one segment per original line.
    """

    def __init__(self):
        # Basic Armenian to IPA mapping (single letters).  Several values
        # ('kə', 'ɾr', 'th', ...) are hand-tuned approximations rather than
        # strict IPA and are kept exactly as written.
        self.armenian_to_ipa = {
            'ա': 'ɑ',
            'բ': 'b',
            'գ': 'ɡ',
            'դ': 'd',
            'ե': 'ɛ',
            'զ': 'z',
            'է': 'ɛ',
            'ը': 'ə',
            'թ': 'th',
            'ժ': 'ʒ',
            'ի': 'i',
            'լ': 'l',
            'խ': 'x',
            'ծ': 'ts',
            'կ': 'kə',
            'հ': 'h',
            'ձ': 'dz',
            'ղ': 'ʁ',
            'ճ': 'tʃ',
            'մ': 'm',
            'յ': 'j',
            'ն': 'n',
            'շ': 'ʃ',
            'ո': 'o',
            'չ': 'tʃh',
            'պ': 'p',
            'ջ': 'dʒ',
            'ռ': 'rr',
            'ս': 's',
            'վ': 'v',
            'տ': 't',
            'ր': 'ɾr',
            'ց': 'tsh',
            'ւ': 'v',
            'փ': 'ph',
            'ք': 'kh',
            'օ': 'o',
            'ֆ': 'f',
            'և': 'jev'
        }

        # Spelled-out numbers 0-10
        self.numbers = {
            '0': 'զրո',
            '1': 'մեկ',
            '2': 'երկու',
            '3': 'երեք',
            '4': 'չորս',
            '5': 'հինգ',
            '6': 'վեց',
            '7': 'յոթ',
            '8': 'ութ',
            '9': 'ինը',
            '10': 'տասը'
        }

        # Spelled-out whole-hour times
        self.time_formats = {
            '00:00': 'կեսգիշեր',
            '12:00': 'կեսօր',
            '13:00': 'ժամը մեկ',
            '14:00': 'ժամը երկու',
            '15:00': 'ժամը երեք',
            '16:00': 'ժամը չորս',
            '17:00': 'ժամը հինգ',
            '18:00': 'ժամը վեց',
            '19:00': 'ժամը յոթ',
            '20:00': 'ժամը ութ',
            '21:00': 'ժամը ինը',
            '22:00': 'ժամը տասը',
            '23:00': 'ժամը տասնմեկ'
        }

        # Letter pairs substituted before the single-letter pass.
        self.digraphs = {
            'ու': 'u',
            'իւ': 'ju',
            'եա': 'ja',
            'եո': 'jo',
            'եւ': 'ev',
        }

    def _preprocess_line(self, line):
        """Spell out known times and numbers in one line (no newlines)."""
        processed_words = []
        for word in line.split():
            replacement = self.time_formats.get(word)
            if replacement is None and word.isdigit():
                replacement = self.numbers.get(word)
            processed_words.append(word if replacement is None else replacement)
        return ' '.join(processed_words)

    def preprocess_text(self, text):
        """Convert numbers and time formats to Armenian text before IPA conversion.

        Whitespace inside a line is normalised to single spaces; line breaks
        are kept (as ``\\n``) and leading/trailing whitespace is dropped.
        """
        return '\n'.join(
            self._preprocess_line(line) for line in text.strip().splitlines()
        )

    def convert_word(self, armenian_word):
        """Convert a single Armenian word to IPA.

        Characters that are not Armenian letters are copied through unchanged.
        Returns ``''`` for an empty word.
        """
        if not armenian_word:
            return ''

        word = armenian_word.lower()

        # Handle digraphs first
        for digraph, ipa in self.digraphs.items():
            word = word.replace(digraph, ipa)

        last_index = len(word) - 1
        pieces = []
        # Positions below refer to the string after digraph substitution.
        for i, char in enumerate(word):
            if char == 'ե':
                # Word-initial 'ե' gets a leading glide.
                pieces.append('jɛ' if i == 0 else 'ɛ')
            elif char == 'կ':
                # No trailing schwa on a word-final 'կ'.
                pieces.append('k' if i == last_index else 'kə')
            elif char == 'ո':
                # Word-initial 'ո' is rendered 'vo'.
                pieces.append('vo' if i == 0 else 'o')
            else:
                pieces.append(self.armenian_to_ipa.get(char, char))

        return ''.join(pieces)

    def _convert_line(self, line):
        """Wrap every Armenian-containing word of one line in ``[word](/ipa/)``."""
        converted = []
        for word in line.split():
            # Lower-case before testing: the mapping only has lower-case keys,
            # so an all-caps word would otherwise be skipped.
            if any(c in self.armenian_to_ipa for c in word.lower()):
                ipa = self.convert_word(word)
                converted.append(f'[{word}](/{ipa}/)')
            else:
                converted.append(word)
        return ' '.join(converted)

    def convert_text(self, text):
        """Convert Armenian text to IPA with formatting.

        Returns the text with each Armenian word replaced by ``[word](/ipa/)``.
        Words without Armenian letters are left as they are, and the original
        line structure is preserved.
        """
        preprocessed_text = self.preprocess_text(text)
        return '\n'.join(
            self._convert_line(line) for line in preprocessed_text.split('\n')
        )

# # Example usage
# if __name__ == "__main__":
#     converter = ArmenianToIPA()
    
#     # Test examples
#     test_text = "Բարև, ես Կոկորոն եմ և կարող եմ հայերեն խոսել:"
#     result = converter.convert_text(test_text)
#     print(result)

-------------------------------------------------------------------------------------------------------------------

inference.py

from kokoro import KPipeline
import soundfile as sf
import numpy as np
from greek import GreekToIPA
from armenian import ArmenianToIPA

# Synthesis pipeline, created with the 'a' language code.
pipeline = KPipeline(lang_code='a')

# Demo input: one Greek line and one Armenian line.
text = """
Γεια σας, είμαι το Κόκορο και μπορώ να μιλήσω Ελληνικά.
Բարև, ես Կոկորոն եմ և կարող եմ հայերեն խոսել:
"""

gr_converter = GreekToIPA()
arm_converter = ArmenianToIPA()

# Toggles for the two conversion passes.
greek_check = True
armenian_check = True

# Run each enabled converter in turn (Greek first, then Armenian) and print
# the intermediate markup after every pass.
for enabled, converter in (
    (greek_check, gr_converter),
    (armenian_check, arm_converter),
):
    if enabled:
        text = converter.convert_text(text)
        print(text)

generator = pipeline(
    text,
    voice='em_santa',
    speed=1.0,
    split_pattern=r'\n+',
)

# Keep only the audio element of every (graphemes, phonemes, audio) triple.
all_audio = [audio for _, _, audio in generator]

# Join all segments into one waveform and write it out at 24 kHz.
combined_audio = np.concatenate(all_audio)
sf.write('combined_output.wav', combined_audio, 24000)

###-------------------------------------------------------------------------------------------------------------------

simple_gui.py

import tkinter as tk
from tkinter import ttk, scrolledtext, messagebox
from kokoro import KPipeline
import soundfile as sf
import numpy as np
import wave
import pyaudio
import tempfile
import threading
import queue
from armenian import ArmenianToIPA
from greek import GreekToIPA

# Module-level converter instances, created once at import time.
# TTSApp.generate_audio runs the input text through both of them in turn.
gr_converter = GreekToIPA()
hy_converter = ArmenianToIPA()

class AudioPlayer:
    """Small WAV player on top of PyAudio with play / pause / stop controls.

    Playback runs on a daemon thread started by :meth:`play_file`.  The
    ``is_playing`` and ``is_paused`` flags are the only communication with
    that thread: clearing ``is_playing`` ends playback, setting ``is_paused``
    suspends it.
    """

    def __init__(self):
        self.pyaudio = pyaudio.PyAudio()
        self.stream = None
        self.is_playing = False
        self.is_paused = False
        self.audio_thread = None
        self.current_file = None

    def play_file(self, filename, chunk_size=1024):
        """Play ``filename`` (a WAV file) or resume it if it is paused.

        Args:
            filename: Path of the WAV file to play.
            chunk_size: Number of frames read and written per iteration.
        """
        # Function-scope import so the class does not depend on the file header.
        import time

        thread = self.audio_thread
        thread_alive = thread is not None and thread.is_alive()

        # Resume only when a paused playback thread for this file still
        # exists; a stale pause flag must not swallow the play request.
        if self.is_paused and self.current_file == filename and thread_alive:
            self.is_paused = False
            self.is_playing = True
            return

        if self.is_playing:
            self.stop()

        self.current_file = filename
        self.is_playing = True
        self.is_paused = False

        def stream_audio():
            wf = None
            stream = None
            try:
                wf = wave.open(filename, 'rb')
                stream = self.pyaudio.open(
                    format=self.pyaudio.get_format_from_width(wf.getsampwidth()),
                    channels=wf.getnchannels(),
                    rate=wf.getframerate(),
                    output=True
                )
                self.stream = stream

                data = wf.readframes(chunk_size)
                while data and self.is_playing:
                    if self.is_paused:
                        # Sleep instead of spinning while paused.
                        time.sleep(0.05)
                        continue
                    stream.write(data)
                    data = wf.readframes(chunk_size)
            finally:
                # Release the device and file even if opening or writing failed,
                # and always leave the flags in the "idle" state.
                if stream is not None:
                    stream.stop_stream()
                    stream.close()
                if wf is not None:
                    wf.close()
                self.is_playing = False
                self.is_paused = False

        self.audio_thread = threading.Thread(target=stream_audio)
        self.audio_thread.daemon = True
        self.audio_thread.start()

    def pause(self):
        """Suspend playback; :meth:`play_file` with the same file resumes it."""
        self.is_paused = True

    def stop(self):
        """Ask the playback thread to finish and wait up to one second for it."""
        self.is_playing = False
        if self.audio_thread:
            self.audio_thread.join(timeout=1)

    def __del__(self):
        self.stop()
        # __init__ may have failed before the PyAudio handle was created.
        backend = getattr(self, 'pyaudio', None)
        if backend is not None:
            backend.terminate()

class TTSApp:
    """Tkinter front end: text box, voice/language/speed controls, playback.

    Generation runs on a worker thread.  All widget values are read on the
    Tk main thread and handed to the worker, and the worker reports back via
    ``root.after`` so that widget updates happen on the main thread.
    """

    def __init__(self, root):
        self.root = root
        self.root.title("TTS Generator")
        self.audio_player = AudioPlayer()

        self.voices = [
            'af_alloy', 'af_aoede', 'af_bella', 'af_heart', 'af_jessica',
            'af_kore', 'af_nicole', 'af_nova', 'af_river', 'af_sarah', 'af_sky',
            'am_adam', 'am_echo', 'am_eric', 'am_fenrir', 'am_liam', 'am_michael',
            'am_onyx', 'am_puck', 'am_santa', 'bf_alice', 'bf_emma', 'bf_isabella',
            'bf_lily', 'bm_daniel', 'bm_fable', 'bm_george', 'bm_lewis', 'ef_dora',
            'em_alex', 'em_santa', 'ff_siwis', 'hf_alpha', 'hf_beta', 'hm_omega',
            'hm_psi', 'if_sara', 'im_nicola', 'jf_alpha', 'jf_gongitsune',
            'jf_nezumi', 'jf_tebukuro', 'jm_kumo', 'pf_dora', 'pm_alex',
            'pm_santa', 'zf_xiaobei', 'zf_xiaoni', 'zf_xiaoxiao', 'zf_xiaoyi',
            'zm_yunjian', 'zm_yunxi', 'zm_yunxia', 'zm_yunyang'
        ]
        self.lang_codes = ['a', 'b', 'j', 'z']

        self.lang_code = tk.StringVar(value='a')
        self.voice = tk.StringVar(value='em_santa')
        self.speed = tk.DoubleVar(value=1.0)
        self.speed_label = tk.StringVar(value="Speed: 1.0")
        self.current_audio = None   # last generated waveform, or None
        self.temp_file = None       # temp WAV backing current_audio, or None
        self.last_error = None      # message of the last failed generation
        self.is_processing = False

        self.create_widgets()
        self.create_loading_indicator()

    def update_speed_label(self, *args):
        """Refresh the "Speed: x.y" label from the speed variable."""
        self.speed_label.set(f"Speed: {self.speed.get():.1f}")

    def create_widgets(self):
        """Build the text input, parameter controls and button row."""
        main_frame = ttk.Frame(self.root)
        main_frame.pack(fill=tk.BOTH, expand=True, padx=10, pady=5)

        # Text input with label
        ttk.Label(main_frame, text="Input Text:").pack(anchor='w')
        self.text_input = scrolledtext.ScrolledText(main_frame, height=10)
        self.text_input.pack(fill=tk.BOTH, expand=True, pady=(0, 10))

        # Parameters frame
        params_frame = ttk.LabelFrame(main_frame, text="Parameters")
        params_frame.pack(fill=tk.X, pady=5)

        # Grid layout for parameters
        for i, (label, var, values) in enumerate([
            ("Language Code:", self.lang_code, self.lang_codes),
            ("Voice:", self.voice, self.voices)
        ]):
            ttk.Label(params_frame, text=label).grid(row=i, column=0, padx=5, pady=2)
            combo = ttk.Combobox(params_frame, textvariable=var, values=values, state='readonly')
            combo.grid(row=i, column=1, padx=5, pady=2, sticky='ew')

        # Speed control
        ttk.Label(params_frame, textvariable=self.speed_label).grid(row=2, column=0, padx=5, pady=2)
        speed_scale = ttk.Scale(params_frame, from_=0.5, to=2.0, variable=self.speed,
                              orient=tk.HORIZONTAL, command=lambda x: self.update_speed_label())
        speed_scale.grid(row=2, column=1, padx=5, pady=2, sticky='ew')

        # Configure grid columns
        params_frame.grid_columnconfigure(1, weight=1)

        # Buttons frame
        buttons_frame = ttk.Frame(main_frame)
        buttons_frame.pack(pady=10)

        # Generate button
        self.generate_btn = ttk.Button(buttons_frame, text="Generate",
                                     command=self.start_generation)
        self.generate_btn.pack(side=tk.LEFT, padx=5)

        # Audio control buttons
        self.play_btn = ttk.Button(buttons_frame, text="Play",
                                 command=self.play_audio, state='disabled')
        self.play_btn.pack(side=tk.LEFT, padx=5)

        self.pause_btn = ttk.Button(buttons_frame, text="Pause",
                                  command=self.pause_audio, state='disabled')
        self.pause_btn.pack(side=tk.LEFT, padx=5)

        self.stop_btn = ttk.Button(buttons_frame, text="Stop",
                                 command=self.stop_audio, state='disabled')
        self.stop_btn.pack(side=tk.LEFT, padx=5)

        self.save_btn = ttk.Button(buttons_frame, text="Save",
                                 command=self.save_audio, state='disabled')
        self.save_btn.pack(side=tk.LEFT, padx=5)

    def create_loading_indicator(self):
        """Create the four labels that form the text spinner (not yet placed)."""
        self.loading_frames = []
        chars = ["|", "/", "-", "\\"]
        for char in chars:
            label = ttk.Label(self.root, text=char, font=('Courier', 24))
            self.loading_frames.append(label)
        self.current_frame = 0

    def animate_loading(self):
        """Advance the spinner every 100 ms while a generation is running."""
        if self.is_processing:
            self.loading_frames[self.current_frame].place_forget()
            self.current_frame = (self.current_frame + 1) % len(self.loading_frames)
            self.loading_frames[self.current_frame].place(relx=0.5, rely=0.5, anchor='center')
            self.root.after(100, self.animate_loading)
        else:
            self.loading_frames[self.current_frame].place_forget()

    def start_generation(self):
        """Disable the buttons, start the spinner and launch the worker thread."""
        self.is_processing = True
        self.generate_btn.configure(state='disabled')
        self.play_btn.configure(state='disabled')
        self.pause_btn.configure(state='disabled')
        self.stop_btn.configure(state='disabled')
        self.save_btn.configure(state='disabled')
        self.animate_loading()

        # Snapshot every widget value here, on the Tk main thread, so the
        # worker never has to touch Tk objects.
        text = self.text_input.get("1.0", tk.END).strip()
        args = (text, self.lang_code.get(), self.voice.get(), self.speed.get())

        thread = threading.Thread(target=self.generate_audio, args=args)
        thread.daemon = True
        thread.start()

    def generate_audio(self, text=None, lang_code=None, voice=None, speed=None):
        """Synthesize ``text`` and store the result (worker-thread entry point).

        Any argument left as ``None`` is read from the corresponding widget,
        which keeps the original no-argument call form working.

        On success ``current_audio`` and ``temp_file`` are replaced together;
        on failure they keep their previous values and ``last_error`` holds
        the error message.  ``generation_complete`` is always scheduled.
        """
        if text is None:
            text = self.text_input.get("1.0", tk.END).strip()
        if lang_code is None:
            lang_code = self.lang_code.get()
        if voice is None:
            voice = self.voice.get()
        if speed is None:
            speed = self.speed.get()

        self.last_error = None
        if text:
            try:
                pipeline = KPipeline(lang_code=lang_code)
                text = gr_converter.convert_text(text)
                text = hy_converter.convert_text(text)
                print(text)
                generator = pipeline(
                    text, voice=voice,
                    speed=speed, split_pattern=r'\n+'
                )
                all_audio = [audio for _, _, audio in generator]
                audio = np.concatenate(all_audio)

                # Reserve a path, then close the handle before soundfile
                # opens the same path for writing.
                temp_file = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
                temp_file.close()
                sf.write(temp_file.name, audio, 24000)

                # Publish both only after the file was written successfully.
                self.current_audio = audio
                self.temp_file = temp_file
            except Exception as e:
                print("Error", str(e))
                # Keep the message as a plain string: the exception variable
                # is unbound once this except block ends.
                self.last_error = str(e)

        self.is_processing = False
        self.root.after(0, self.generation_complete)

    def generation_complete(self):
        """Re-enable the buttons and report a failed generation, if any."""
        self.generate_btn.configure(state='normal')

        # Playback and save only make sense once some audio exists.
        audio_state = 'normal' if self.current_audio is not None else 'disabled'
        for button in (self.play_btn, self.pause_btn, self.stop_btn, self.save_btn):
            button.configure(state=audio_state)

        error = getattr(self, 'last_error', None)
        if error:
            self.last_error = None
            messagebox.showerror("Error", error)

    def play_audio(self):
        """Play (or resume) the last generated audio."""
        if self.current_audio is not None:
            self.audio_player.play_file(self.temp_file.name)
            self.play_btn.configure(state='disabled')
            self.pause_btn.configure(state='normal')
            self.stop_btn.configure(state='normal')

    def pause_audio(self):
        """Pause playback and make Play available again."""
        self.audio_player.pause()
        self.play_btn.configure(state='normal')
        self.pause_btn.configure(state='disabled')

    def stop_audio(self):
        """Stop playback and reset the playback buttons."""
        self.audio_player.stop()
        self.play_btn.configure(state='normal')
        self.pause_btn.configure(state='disabled')
        self.stop_btn.configure(state='disabled')

    def save_audio(self):
        """Write the last generated audio to ``output_audio.wav``."""
        if self.current_audio is not None:
            sf.write('output_audio.wav', self.current_audio, 24000)
            messagebox.showinfo("Success", "Audio saved as 'output_audio.wav'")

    def __del__(self):
        if hasattr(self, 'audio_player'):
            self.audio_player.stop()
# Script entry point: create the Tk root window, build the app on it and
# hand control to the Tk event loop.
if __name__ == "__main__":
    root = tk.Tk()
    app = TTSApp(root)
    root.mainloop()

I used your methodology to test Spanish.
The voice with the least accent, similar to what you found, was "pm_santa".
It does not do badly, but let's hope weights trained on other languages are released soon.



# 1️⃣ Instalar Kokoro (>=0.7.11) y soundfile
!pip install -q kokoro>=0.7.11 soundfile
# 2️⃣ Instalar espeak-ng (usado para fallback en algunos idiomas)
!apt-get -qq -y install espeak-ng > /dev/null 2>&1

#############################################
# Importaciones y definición del pipeline
#############################################
import numpy as np
import soundfile as sf
import tempfile
import ipywidgets as widgets
from IPython.display import display, Audio, clear_output
import time

# Importamos KPipeline desde kokoro
from kokoro import KPipeline

#############################################
# (Opcional) Clase para conversión a IPA
# Se utiliza únicamente para mostrar el texto convertido a IPA como referencia.
#############################################
class SpanishToIPA:
    """Rule-based Spanish grapheme-to-phoneme converter.

    Words containing Spanish letters are rewritten as ``[word](/ipa/)``
    markup; other tokens are passed through.  Before conversion, the digits
    0-10 and a few whole-hour ``HH:00`` times are spelled out in Spanish.
    Line breaks in the input are preserved.

    Each word is scanned left to right, and multi-letter spellings are emitted
    directly as IPA, so their output is never mapped a second time.
    """

    _LETTERS = "aábcdeéfghiíjklmnñoópqrstuúüvwxyz"
    # Vowels that soften a preceding 'c' / 'g' and make the 'u' of 'gu' silent.
    _FRONT_VOWELS = frozenset("eéií")
    _VOWELS = frozenset("aáeéiíoóuúü")

    def __init__(self):
        # Single-letter mapping.  'c', 'g', 'r' and 'y' depend on context and
        # are handled in convert_word.
        self.basic_mapping = {
            'a': 'a', 'á': 'ˈa', 'b': 'b', 'd': 'd', 'e': 'e', 'é': 'ˈe',
            'f': 'f', 'i': 'i', 'í': 'ˈi', 'j': 'x', 'k': 'k', 'l': 'l',
            'm': 'm', 'n': 'n', 'ñ': 'ɲ', 'o': 'o', 'ó': 'ˈo', 'p': 'p',
            's': 's', 't': 't', 'u': 'u', 'ú': 'ˈu', 'ü': 'u', 'v': 'b',
            'w': 'w', 'x': 'ks', 'z': 'θ',
            'h': '',   # 'h' is silent outside the digraph 'ch'
            'q': 'k',  # stray 'q' not followed by 'u'
        }
        # Two-letter spellings and their IPA output.  'gue/gui/güe/güi' need
        # three letters of context and are handled in convert_word.
        self.digraphs = {
            'ch': 'tʃ', 'll': 'ʝ', 'rr': 'r', 'qu': 'k',
        }
        self.numbers = {
            '0': 'cero', '1': 'uno', '2': 'dos', '3': 'tres', '4': 'cuatro',
            '5': 'cinco', '6': 'seis', '7': 'siete', '8': 'ocho', '9': 'nueve',
            '10': 'diez'
        }
        self.time_formats = {
            '00:00': 'medianoche', '12:00': 'mediodía', '13:00': 'la una',
            '14:00': 'las dos', '15:00': 'las tres', '16:00': 'las cuatro',
            '17:00': 'las cinco', '18:00': 'las seis', '19:00': 'las siete',
            '20:00': 'las ocho', '21:00': 'las nueve', '22:00': 'las diez',
            '23:00': 'las once'
        }

    def _preprocess_line(self, line):
        """Spell out known times and numbers in one line (no newlines)."""
        processed_words = []
        for word in line.split():
            replacement = self.time_formats.get(word)
            if replacement is None and word.isdigit():
                replacement = self.numbers.get(word)
            processed_words.append(word if replacement is None else replacement)
        return ' '.join(processed_words)

    def preprocess_text(self, text):
        """Replace known numbers and times with Spanish words.

        Whitespace inside a line is normalised to single spaces; line breaks
        are kept (as ``\\n``) and leading/trailing whitespace is dropped.
        """
        return '\n'.join(
            self._preprocess_line(line) for line in text.strip().splitlines()
        )

    def convert_word(self, spanish_word):
        """Convert a single Spanish word to IPA.

        Characters that are not Spanish letters are copied through unchanged.
        Returns ``''`` for an empty word.
        """
        if not spanish_word:
            return ''
        word = spanish_word.lower()
        length = len(word)
        pieces = []
        i = 0
        while i < length:
            char = word[i]
            following = word[i + 1] if i + 1 < length else ''
            after_next = word[i + 2] if i + 2 < length else ''

            # 'gue'/'gui': silent u.  'güe'/'güi': the u is pronounced (gw).
            if char == 'g' and following in ('u', 'ü') and after_next in self._FRONT_VOWELS:
                pieces.append('g' if following == 'u' else 'gw')
                i += 2
                continue

            # Emit digraphs directly so that e.g. the trill from 'rr' is not
            # turned into a tap by the single-'r' rule below.
            pair = char + following
            if pair in self.digraphs:
                pieces.append(self.digraphs[pair])
                i += 2
                continue

            if char == 'c':
                pieces.append('θ' if following in self._FRONT_VOWELS else 'k')
            elif char == 'g':
                pieces.append('x' if following in self._FRONT_VOWELS else 'g')
            elif char == 'r':
                # Trill at the very start of the word, tap elsewhere.
                pieces.append('r' if i == 0 else 'ɾ')
            elif char == 'y':
                # Consonant before a vowel ("yo"), vowel otherwise ("y", "hoy").
                pieces.append('ʝ' if following in self._VOWELS else 'i')
            else:
                pieces.append(self.basic_mapping.get(char, char))
            i += 1
        return ''.join(pieces)

    def _convert_line(self, line):
        """Wrap every letter-containing word of one line in ``[word](/ipa/)``."""
        converted = []
        for word in line.split():
            if any(c in self._LETTERS for c in word.lower()):
                ipa_word = self.convert_word(word)
                converted.append(f'[{word}](/{ipa_word}/)')
            else:
                converted.append(word)
        return ' '.join(converted)

    def convert_text(self, text):
        """Convert Spanish text to ``[word](/ipa/)`` markup, keeping line breaks."""
        preprocessed_text = self.preprocess_text(text)
        return '\n'.join(
            self._convert_line(line) for line in preprocessed_text.split('\n')
        )

#############################################
# Interfaz en Colab con ipywidgets
#############################################
class TTSAppColab:
    """ipywidgets front end for Spanish synthesis in a Colab notebook.

    Builds and displays the UI on construction.  The raw text is what gets
    synthesized; the IPA conversion is only printed for reference.
    """

    def __init__(self):
        # The IPA converter is used only to show the conversion (reference)
        self.spanish_converter = SpanishToIPA()
        self.current_audio = None
        self.temp_file = None
        # Created on first use and reused for later clicks.
        self._pipeline = None

        # Voice names offered in the dropdown
        voices = [
            'af_alloy', 'af_aoede', 'af_bella', 'af_heart', 'af_jessica',
            'af_kore', 'af_nicole', 'af_nova', 'af_river', 'af_sarah', 'af_sky',
            'am_adam', 'am_echo', 'am_eric', 'am_fenrir', 'am_liam', 'am_michael',
            'am_onyx', 'am_puck', 'am_santa', 'bf_alice', 'bf_emma', 'bf_isabella',
            'bf_lily', 'bm_daniel', 'bm_fable', 'bm_george', 'bm_lewis', 'ef_dora',
            'em_alex', 'em_santa', 'ff_siwis', 'hf_alpha', 'hf_beta', 'hm_omega',
            'hm_psi', 'if_sara', 'im_nicola', 'jf_alpha', 'jf_gongitsune',
            'jf_nezumi', 'jf_tebukuro', 'jm_kumo', 'pf_dora', 'pm_alex',
            'pm_santa', 'zf_xiaobei', 'zf_xiaoni', 'zf_xiaoxiao', 'zf_xiaoyi',
            'zm_yunjian', 'zm_yunxi', 'zm_yunxia', 'zm_yunyang'
        ]

        # Interface widgets
        self.text_area = widgets.Textarea(
            value='El cielo sobre el puerto tenía el color de un televisor sintonizado en un canal muerto.',
            placeholder='Ingresa texto en español...',
            description='Texto:',
            layout=widgets.Layout(width='100%', height='150px')
        )
        self.voice_dropdown = widgets.Dropdown(
            options=voices,
            value='em_santa',  # recommended default voice
            description='Voz:'
        )
        self.speed_slider = widgets.FloatSlider(
            value=1.0,
            min=0.5,
            max=2.0,
            step=0.1,
            description='Velocidad:',
            continuous_update=True,
            readout_format='.1f'
        )
        self.generate_button = widgets.Button(
            description='Generar Audio',
            button_style='success'
        )
        self.save_button = widgets.Button(
            description='Guardar Audio',
            button_style='info',
            disabled=True
        )
        self.loading_label = widgets.Label(value='')
        self.audio_output = widgets.Output()

        # Wire up the (synchronous) click handlers
        self.generate_button.on_click(self.on_generate_clicked)
        self.save_button.on_click(self.on_save_clicked)

        # Lay out and display the interface
        self.ui = widgets.VBox([
            self.text_area,
            self.voice_dropdown,
            self.speed_slider,
            self.generate_button,
            self.loading_label,
            self.audio_output,
            self.save_button
        ])
        display(self.ui)

    def on_generate_clicked(self, b):
        """Click handler: reset the output area and run synthesis synchronously."""
        self.audio_output.clear_output()
        self.loading_label.value = "Generando audio..."
        self.generate_button.disabled = True
        self.save_button.disabled = True

        # Synchronous call to the synthesis
        self.generate_audio()

    def generate_audio(self):
        """Synthesize the text area's content and show an audio player.

        On success the waveform is stored in ``current_audio``, written to a
        temporary WAV file (``temp_file``) and the save button is enabled.
        Errors are reported in the status label; the generate button is
        re-enabled in every case.
        """
        text = self.text_area.value.strip()
        if not text:
            self.loading_label.value = "Por favor, ingresa algún texto."
            self.generate_button.disabled = False
            return

        try:
            # Show the IPA-converted text for reference (not used for synthesis)
            ipa_text = self.spanish_converter.convert_text(text)
            print("Texto en IPA (solo referencia):", ipa_text)

            # Build the Spanish TTS pipeline (lang_code='e') once and reuse it;
            # the language code never changes for this app.
            if self._pipeline is None:
                self._pipeline = KPipeline(lang_code='e')
            pipeline = self._pipeline

            all_audio = []
            voice_value = self.voice_dropdown.value.strip()
            print("Llamando a pipeline con -> Texto:", text)
            print("Voz:", voice_value, "Velocidad:", self.speed_slider.value)
            generator = pipeline(
                text,
                voice=voice_value,
                speed=self.speed_slider.value,
                split_pattern=r'\n+'
            )
            for chunk_count, (gs, ps, audio) in enumerate(generator):
                print("Chunk", chunk_count, "generado.")
                print("Graphemes:", gs)
                print("Phonemes:", ps)
                all_audio.append(audio)

            if not all_audio:
                self.loading_label.value = "No se generó audio. Verifica la voz o el texto."
            else:
                self.current_audio = np.concatenate(all_audio)
                # Reserve a path, then close the handle before soundfile
                # opens the same path for writing.
                temp_file = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
                temp_file.close()
                self.temp_file = temp_file
                sf.write(self.temp_file.name, self.current_audio, 24000)
                with self.audio_output:
                    clear_output()
                    display(Audio(self.current_audio, rate=24000))
                self.loading_label.value = "Audio generado con éxito."
                self.save_button.disabled = False
        except Exception as e:
            self.loading_label.value = f"Error: {e}"
            print("Error:", e)
        finally:
            self.generate_button.disabled = False

    def on_save_clicked(self, b):
        """Click handler: write ``output_audio.wav`` and try a Colab download."""
        if self.current_audio is not None:
            filename = 'output_audio.wav'
            sf.write(filename, self.current_audio, 24000)
            self.loading_label.value = f"Audio guardado como {filename}."
            try:
                from google.colab import files
                files.download(filename)
            except Exception:
                # Best effort: outside Colab, or if the download fails, the
                # file is still on disk and can be fetched manually.
                self.loading_label.value += " (Descarga manual requerida)"
        else:
            self.loading_label.value = "No hay audio para guardar."

# Instantiate the application; the constructor builds and displays the UI.
app = TTSAppColab()

Somehow, I missed the OP. I can confirm that the model did not see any Greek or Armenian training data. My guess is that these languages might be "close enough" to languages that were trained on, but I only know English so it's difficult to evaluate.

Sign up or log in to comment