Kokoro can also speak untrained languages?! (examples: Greek and Armenian).
In the model card I noticed that you can give Kokoro the exact IPA transcription of a word and it will speak it. After some trial and error, I put together two simple draft Python files that convert Greek and Armenian text into tokens Kokoro can understand and turn into speech. The results are not perfect, of course, since the output inherits the accent of the original speaker (em_santa gives the best results, ho ho ho, MC XD). If there is a better implementation for a natural tone, that would be awesome.
@hexgrad, do you have any suggestions?
Please note that the code can seamlessly handle Greek and Armenian text in the same input.
I also provide a simple GUI (created with an LLM) for the community to play with, hoping for some improvements.
Finally, I provide an audio example.
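For anyone skimming: the whole trick is the phoneme-override syntax from the model card, where a word written as [word](/ipa/) is spoken with the supplied IPA instead of going through G2P. Here is a minimal sketch of that syntax on its own (the Greek word and its IPA are just an illustrative example):

from kokoro import KPipeline
import soundfile as sf

pipeline = KPipeline(lang_code='a')  # English pipeline used only as a carrier
text = '[Καλημέρα](/kalimˈɛra/)'     # explicit IPA override, no English G2P for this word
for _, _, audio in pipeline(text, voice='em_santa'):
    sf.write('kalimera.wav', audio, 24000)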
Put the following files in the same directory:
greek.py
armenian.py
inference.py
simple_gui.py
and have fun :)
-------------------------------------------------------------------------------------------------------------------
greek.py
class GreekToIPA:
def __init__(self):
# Basic Greek to IPA mapping
self.greek_to_ipa = {
'α': 'a',
'ά': 'ˈa',
'β': 'v',
'γ': 'gh',
'δ': 'ð',
'ε': 'ɛ',
'έ': 'ˈɛ',
'ζ': 'z',
'η': 'i',
'ή': 'ˈi',
'θ': 'th',
'ι': 'i',
'ί': 'ˈi',
'κ': 'k',
'λ': 'l',
'μ': 'm',
'ν': 'n',
'ξ': 'ks',
'ο': 'o',
'ό': 'ˈo',
'π': 'p',
'ρ': 'r',
'σ': 's',
'ς': 's',
'τ': 't',
'υ': 'i',
'ύ': 'ˈi',
'φ': 'f',
'χ': 'x',
'ψ': 'ps',
'ω': 'o',
'ώ': 'ˈo',
'ϊ': 'i',
'ΐ': 'ˈi',
'ϋ': 'i',
'ΰ': 'ˈi'
}
# Special combinations
self.digraphs = {
'αι': 'ɛ',
'άι': 'ˈai',
'αί': 'ɛˈ',
'ει': 'i',
'εί': 'ˈi',
'οι': 'i',
'οί': 'ˈi',
'ού': 'ˈu',
'ου': 'u',
'μπ': 'b',
'ντ': 'd',
'γκ': 'ghk',
'γγ': 'ŋg',
'αυ': 'av',
'άυ': 'ˈav',
'ευ': 'ev',
'έυ': 'ˈev',
'εύ': 'ˈev'
}
# New dictionary for numbers 0-10
self.numbers = {
'0': 'μηδέν',
'1': 'ένα',
'2': 'δύο',
'3': 'τρία',
'4': 'τέσσερα',
'5': 'πέντε',
'6': 'έξι',
'7': 'επτά',
'8': 'οκτώ',
'9': 'εννέα',
'10': 'δέκα'
}
# Time format dictionary
self.time_formats = {
'00:00': 'μεσάνυχτα',
'12:00': 'μεσημέρι',
'13:00': 'μία',
'14:00': 'δύο',
'15:00': 'τρεις',
'16:00': 'τέσσερις',
'17:00': 'πέντε',
'18:00': 'έξι',
'19:00': 'επτά',
'20:00': 'οκτώ',
'21:00': 'εννέα',
'22:00': 'δέκα',
'23:00': 'έντεκα'
}
def preprocess_text(self, text):
"""Convert numbers and time formats to Greek text before IPA conversion."""
words = text.split()
processed_words = []
for word in words:
# Check for time format (HH:MM)
if ':' in word and len(word) == 5:
if word in self.time_formats:
processed_words.append(self.time_formats[word])
continue
# Check for numbers
if word.isdigit():
if word in self.numbers:
processed_words.append(self.numbers[word])
continue
processed_words.append(word)
return ' '.join(processed_words)
def convert_word(self, greek_word):
"""Convert a single Greek word to IPA."""
if not greek_word:
return ''
word = greek_word.lower()
for digraph, ipa in self.digraphs.items():
word = word.replace(digraph, ipa)
ipa = ''
i = 0
while i < len(word):
if word[i] in self.greek_to_ipa:
ipa += self.greek_to_ipa[word[i]]
else:
ipa += word[i]
i += 1
return ipa
def convert_text(self, text):
"""Convert Greek text to IPA with formatting."""
# First preprocess the text
preprocessed_text = self.preprocess_text(text)
words = preprocessed_text.split()
converted = []
for word in words:
if any(c in self.greek_to_ipa for c in word.lower()):
ipa = self.convert_word(word)
converted.append(f'[{word}](/{ipa}/)')
else:
converted.append(word)
return ' '.join(converted)
# # Example usage
# if __name__ == "__main__":
# converter = GreekToIPA()
# test_text = "Γεια σας, είμαι το Κόκορο και μπορώ να μιλήσω Ελληνικά."
# result = converter.convert_text(test_text)
# print(result)
-------------------------------------------------------------------------------------------------------------------
armenian.py
class ArmenianToIPA:
def __init__(self):
# Basic Armenian to IPA mapping
self.armenian_to_ipa = {
'ա': 'ɑ',
'բ': 'b',
'գ': 'ɡ',
'դ': 'd',
'ե': 'ɛ',
'զ': 'z',
'է': 'ɛ',
'ը': 'ə',
'թ': 'th',
'ժ': 'ʒ',
'ի': 'i',
'լ': 'l',
'խ': 'x',
'ծ': 'ts',
'կ': 'kə',
'հ': 'h',
'ձ': 'dz',
'ղ': 'ʁ',
'ճ': 'tʃ',
'մ': 'm',
'յ': 'j',
'ն': 'n',
'շ': 'ʃ',
'ո': 'o',
'չ': 'tʃh',
'պ': 'p',
'ջ': 'dʒ',
'ռ': 'rr',
'ս': 's',
'վ': 'v',
'տ': 't',
'ր': 'ɾr',
'ց': 'tsh',
'ւ': 'v',
'փ': 'ph',
'ք': 'kh',
'օ': 'o',
'ֆ': 'f',
'և': 'jev'
}
# Rest of the dictionaries remain the same
self.numbers = {
'0': 'զրո',
'1': 'մեկ',
'2': 'երկու',
'3': 'երեք',
'4': 'չորս',
'5': 'հինգ',
'6': 'վեց',
'7': 'յոթ',
'8': 'ութ',
'9': 'ինը',
'10': 'տասը'
}
self.time_formats = {
'00:00': 'կեսգիշեր',
'12:00': 'կեսօր',
'13:00': 'ժամը մեկ',
'14:00': 'ժամը երկու',
'15:00': 'ժամը երեք',
'16:00': 'ժամը չորս',
'17:00': 'ժամը հինգ',
'18:00': 'ժամը վեց',
'19:00': 'ժամը յոթ',
'20:00': 'ժամը ութ',
'21:00': 'ժամը ինը',
'22:00': 'ժամը տասը',
'23:00': 'ժամը տասնմեկ'
}
self.digraphs = {
'ու': 'u',
'իւ': 'ju',
'եա': 'ja',
'եո': 'jo',
'եւ': 'ev',
}
def preprocess_text(self, text):
"""Convert numbers and time formats to Armenian text before IPA conversion."""
words = text.split()
processed_words = []
for word in words:
if ':' in word and len(word) == 5:
if word in self.time_formats:
processed_words.append(self.time_formats[word])
continue
if word.isdigit():
if word in self.numbers:
processed_words.append(self.numbers[word])
continue
processed_words.append(word)
return ' '.join(processed_words)
def convert_word(self, armenian_word):
"""Convert a single Armenian word to IPA."""
if not armenian_word:
return ''
word = armenian_word.lower()
# Handle digraphs first
for digraph, ipa in self.digraphs.items():
word = word.replace(digraph, ipa)
ipa = ''
for i, char in enumerate(word):
if char == 'ե':
ipa += 'jɛ' if i == 0 else 'ɛ'
elif char == 'կ':
ipa += 'k' if i == len(word) - 1 else 'kə'
elif char == 'ո':
ipa += 'vo' if i == 0 else 'o'
elif char in self.armenian_to_ipa:
ipa += self.armenian_to_ipa[char]
else:
ipa += char
return ipa
def convert_text(self, text):
"""Convert Armenian text to IPA with formatting."""
preprocessed_text = self.preprocess_text(text)
words = preprocessed_text.split()
converted = []
for word in words:
            # Lowercase so capitalized Armenian words are detected too
            if any(c in self.armenian_to_ipa for c in word.lower()):
ipa = self.convert_word(word)
converted.append(f'[{word}](/{ipa}/)')
else:
converted.append(word)
return ' '.join(converted)
# # Example usage
# if __name__ == "__main__":
# converter = ArmenianToIPA()
# # Test examples
# test_text = "Բարև, ես Կոկորոն եմ և կարող եմ հայերեն խոսել:"
# result = converter.convert_text(test_text)
# print(result)
-------------------------------------------------------------------------------------------------------------------
inference.py
from kokoro import KPipeline
import soundfile as sf
import numpy as np
from greek import GreekToIPA
from armenian import ArmenianToIPA
# The English pipeline ('a') is only a carrier here: every Greek/Armenian word below is
# passed with an explicit [word](/ipa/) override, so Kokoro's English G2P is bypassed for it.
pipeline = KPipeline(lang_code='a')
text = """
Γεια σας, είμαι το Κόκορο και μπορώ να μιλήσω Ελληνικά.
Բարև, ես Կոկորոն եմ և կարող եմ հայերեն խոսել:
"""
gr_converter = GreekToIPA()
arm_converter = ArmenianToIPA()
# Convert Greek words to phoneme overrides
greek_check = True
if greek_check:
    text = gr_converter.convert_text(text)
    print(text)
# Convert Armenian words to phoneme overrides
armenian_check = True
if armenian_check:
    text = arm_converter.convert_text(text)
    print(text)
# Create a list to store all audio segments
all_audio = []
generator = pipeline(
text, voice='em_santa',
speed=1.0, split_pattern=r'\n+'
)
# Collect all audio segments
for i, (gs, ps, audio) in enumerate(generator):
all_audio.append(audio)
# Concatenate all audio segments
combined_audio = np.concatenate(all_audio)
# Save the combined audio
sf.write('combined_output.wav', combined_audio, 24000)
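If you run inference.py in a notebook (e.g. Colab), a quick way to sanity-check the result is to report its duration and play it inline. This is just an optional snippet, assuming IPython is available:

# Optional: inspect and play combined_output.wav inside a notebook
import soundfile as sf
from IPython.display import Audio, display

data, sr = sf.read('combined_output.wav')
print(f'{len(data) / sr:.1f} s at {sr} Hz')
display(Audio(data, rate=sr))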
-------------------------------------------------------------------------------------------------------------------
simple_gui.py
import tkinter as tk
from tkinter import ttk, scrolledtext, messagebox
from kokoro import KPipeline
import soundfile as sf
import numpy as np
import wave
import pyaudio
import tempfile
import threading
import time
from armenian import ArmenianToIPA
from greek import GreekToIPA
gr_converter = GreekToIPA()
hy_converter = ArmenianToIPA()
class AudioPlayer:
def __init__(self):
self.pyaudio = pyaudio.PyAudio()
self.stream = None
self.is_playing = False
self.is_paused = False
self.audio_thread = None
self.current_file = None
def play_file(self, filename, chunk_size=1024):
if self.is_paused and self.current_file == filename:
self.is_paused = False
self.is_playing = True
return
if self.is_playing:
self.stop()
self.current_file = filename
self.is_playing = True
self.is_paused = False
def stream_audio():
wf = wave.open(filename, 'rb')
self.stream = self.pyaudio.open(
format=self.pyaudio.get_format_from_width(wf.getsampwidth()),
channels=wf.getnchannels(),
rate=wf.getframerate(),
output=True
)
            data = wf.readframes(chunk_size)
            while data and self.is_playing:
                if not self.is_paused:
                    self.stream.write(data)
                    data = wf.readframes(chunk_size)
                else:
                    time.sleep(0.05)  # avoid busy-waiting while playback is paused
self.stream.stop_stream()
self.stream.close()
wf.close()
self.is_playing = False
self.is_paused = False
self.audio_thread = threading.Thread(target=stream_audio)
self.audio_thread.daemon = True
self.audio_thread.start()
def pause(self):
self.is_paused = True
def stop(self):
self.is_playing = False
if self.audio_thread:
self.audio_thread.join(timeout=1)
def __del__(self):
self.stop()
self.pyaudio.terminate()
class TTSApp:
def __init__(self, root):
self.root = root
self.root.title("TTS Generator")
self.audio_player = AudioPlayer()
self.voices = [
'af_alloy', 'af_aoede', 'af_bella', 'af_heart', 'af_jessica',
'af_kore', 'af_nicole', 'af_nova', 'af_river', 'af_sarah', 'af_sky',
'am_adam', 'am_echo', 'am_eric', 'am_fenrir', 'am_liam', 'am_michael',
'am_onyx', 'am_puck', 'am_santa', 'bf_alice', 'bf_emma', 'bf_isabella',
'bf_lily', 'bm_daniel', 'bm_fable', 'bm_george', 'bm_lewis', 'ef_dora',
'em_alex', 'em_santa', 'ff_siwis', 'hf_alpha', 'hf_beta', 'hm_omega',
'hm_psi', 'if_sara', 'im_nicola', 'jf_alpha', 'jf_gongitsune',
'jf_nezumi', 'jf_tebukuro', 'jm_kumo', 'pf_dora', 'pm_alex',
'pm_santa', 'zf_xiaobei', 'zf_xiaoni', 'zf_xiaoxiao', 'zf_xiaoyi',
'zm_yunjian', 'zm_yunxi', 'zm_yunxia', 'zm_yunyang'
]
        self.lang_codes = ['a', 'b', 'e', 'f', 'h', 'i', 'j', 'p', 'z']  # match the voice prefixes above
self.lang_code = tk.StringVar(value='a')
self.voice = tk.StringVar(value='em_santa')
self.speed = tk.DoubleVar(value=1.0)
self.speed_label = tk.StringVar(value="Speed: 1.0")
self.current_audio = None
self.is_processing = False
self.create_widgets()
self.create_loading_indicator()
def update_speed_label(self, *args):
self.speed_label.set(f"Speed: {self.speed.get():.1f}")
def create_widgets(self):
main_frame = ttk.Frame(self.root)
main_frame.pack(fill=tk.BOTH, expand=True, padx=10, pady=5)
# Text input with label
ttk.Label(main_frame, text="Input Text:").pack(anchor='w')
self.text_input = scrolledtext.ScrolledText(main_frame, height=10)
self.text_input.pack(fill=tk.BOTH, expand=True, pady=(0, 10))
# Parameters frame
params_frame = ttk.LabelFrame(main_frame, text="Parameters")
params_frame.pack(fill=tk.X, pady=5)
# Grid layout for parameters
for i, (label, var, values) in enumerate([
("Language Code:", self.lang_code, self.lang_codes),
("Voice:", self.voice, self.voices)
]):
ttk.Label(params_frame, text=label).grid(row=i, column=0, padx=5, pady=2)
combo = ttk.Combobox(params_frame, textvariable=var, values=values, state='readonly')
combo.grid(row=i, column=1, padx=5, pady=2, sticky='ew')
# Speed control
ttk.Label(params_frame, textvariable=self.speed_label).grid(row=2, column=0, padx=5, pady=2)
speed_scale = ttk.Scale(params_frame, from_=0.5, to=2.0, variable=self.speed,
orient=tk.HORIZONTAL, command=lambda x: self.update_speed_label())
speed_scale.grid(row=2, column=1, padx=5, pady=2, sticky='ew')
# Configure grid columns
params_frame.grid_columnconfigure(1, weight=1)
# Buttons frame
buttons_frame = ttk.Frame(main_frame)
buttons_frame.pack(pady=10)
# Generate button
self.generate_btn = ttk.Button(buttons_frame, text="Generate",
command=self.start_generation)
self.generate_btn.pack(side=tk.LEFT, padx=5)
# Audio control buttons
self.play_btn = ttk.Button(buttons_frame, text="Play",
command=self.play_audio, state='disabled')
self.play_btn.pack(side=tk.LEFT, padx=5)
self.pause_btn = ttk.Button(buttons_frame, text="Pause",
command=self.pause_audio, state='disabled')
self.pause_btn.pack(side=tk.LEFT, padx=5)
self.stop_btn = ttk.Button(buttons_frame, text="Stop",
command=self.stop_audio, state='disabled')
self.stop_btn.pack(side=tk.LEFT, padx=5)
self.save_btn = ttk.Button(buttons_frame, text="Save",
command=self.save_audio, state='disabled')
self.save_btn.pack(side=tk.LEFT, padx=5)
def create_loading_indicator(self):
self.loading_frames = []
chars = ["|", "/", "-", "\\"]
for char in chars:
label = ttk.Label(self.root, text=char, font=('Courier', 24))
self.loading_frames.append(label)
self.current_frame = 0
def animate_loading(self):
if self.is_processing:
self.loading_frames[self.current_frame].place_forget()
self.current_frame = (self.current_frame + 1) % len(self.loading_frames)
self.loading_frames[self.current_frame].place(relx=0.5, rely=0.5, anchor='center')
self.root.after(100, self.animate_loading)
else:
self.loading_frames[self.current_frame].place_forget()
def start_generation(self):
self.is_processing = True
self.generate_btn.configure(state='disabled')
self.play_btn.configure(state='disabled')
self.pause_btn.configure(state='disabled')
self.stop_btn.configure(state='disabled')
self.save_btn.configure(state='disabled')
self.animate_loading()
thread = threading.Thread(target=self.generate_audio)
thread.daemon = True
thread.start()
def generate_audio(self):
text = self.text_input.get("1.0", tk.END).strip()
if text:
try:
pipeline = KPipeline(lang_code=self.lang_code.get())
all_audio = []
text = gr_converter.convert_text(text)
text = hy_converter.convert_text(text)
print(text)
generator = pipeline(
text, voice=self.voice.get(),
speed=self.speed.get(), split_pattern=r'\n+'
)
for _, _, audio in generator:
all_audio.append(audio)
self.current_audio = np.concatenate(all_audio)
self.temp_file = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
sf.write(self.temp_file.name, self.current_audio, 24000)
except Exception as e:
print("Error", str(e))
# self.root.after(0, lambda: messagebox.showerror("Error", str(e)))
self.is_processing = False
self.root.after(0, self.generation_complete)
def generation_complete(self):
self.generate_btn.configure(state='normal')
self.play_btn.configure(state='normal')
self.pause_btn.configure(state='normal')
self.stop_btn.configure(state='normal')
self.save_btn.configure(state='normal')
def play_audio(self):
if self.current_audio is not None:
self.audio_player.play_file(self.temp_file.name)
self.play_btn.configure(state='disabled')
self.pause_btn.configure(state='normal')
self.stop_btn.configure(state='normal')
def pause_audio(self):
self.audio_player.pause()
self.play_btn.configure(state='normal')
self.pause_btn.configure(state='disabled')
def stop_audio(self):
self.audio_player.stop()
self.play_btn.configure(state='normal')
self.pause_btn.configure(state='disabled')
self.stop_btn.configure(state='disabled')
def save_audio(self):
if self.current_audio is not None:
sf.write('output_audio.wav', self.current_audio, 24000)
messagebox.showinfo("Success", "Audio saved as 'output_audio.wav'")
def __del__(self):
if hasattr(self, 'audio_player'):
self.audio_player.stop()
if __name__ == "__main__":
root = tk.Tk()
app = TTSApp(root)
root.mainloop()
I used your methodology to test Spanish.
The one with the least accent, like yours, was pm_santa.
It doesn't do badly, but let's hope they release weights trained on other languages soon.
# 1️⃣ Install Kokoro (>=0.7.11) and soundfile
!pip install -q "kokoro>=0.7.11" soundfile
# 2️⃣ Install espeak-ng (used as a fallback for some languages)
!apt-get -qq -y install espeak-ng > /dev/null 2>&1
#############################################
# Imports and pipeline definition
#############################################
import numpy as np
import soundfile as sf
import tempfile
import ipywidgets as widgets
from IPython.display import display, Audio, clear_output
import time
# Import KPipeline from kokoro
from kokoro import KPipeline
#############################################
# (Optional) Class for IPA conversion
# Used only to display the IPA-converted text as a reference.
#############################################
class SpanishToIPA:
def __init__(self):
self.basic_mapping = {
'a': 'a', 'á': 'ˈa', 'b': 'b', 'd': 'd', 'e': 'e', 'é': 'ˈe',
'f': 'f', 'i': 'i', 'í': 'ˈi', 'j': 'x', 'k': 'k', 'l': 'l',
'm': 'm', 'n': 'n', 'ñ': 'ɲ', 'o': 'o', 'ó': 'ˈo', 'p': 'p',
's': 's', 't': 't', 'u': 'u', 'ú': 'ˈu', 'ü': 'u', 'v': 'b',
'w': 'w', 'x': 'ks', 'z': 'θ'
}
self.digraphs = {
'ch': 'tʃ', 'll': 'ʝ', 'rr': 'r',
'qu': 'k', 'gue': '§ue', 'gui': '§ui',
'güe': '§üe', 'güi': '§üi'
}
self.numbers = {
'0': 'cero', '1': 'uno', '2': 'dos', '3': 'tres', '4': 'cuatro',
'5': 'cinco', '6': 'seis', '7': 'siete', '8': 'ocho', '9': 'nueve',
'10': 'diez'
}
self.time_formats = {
'00:00': 'medianoche', '12:00': 'mediodía', '13:00': 'la una',
'14:00': 'las dos', '15:00': 'las tres', '16:00': 'las cuatro',
'17:00': 'las cinco', '18:00': 'las seis', '19:00': 'las siete',
'20:00': 'las ocho', '21:00': 'las nueve', '22:00': 'las diez',
'23:00': 'las once'
}
def preprocess_text(self, text):
words = text.split()
processed_words = []
for word in words:
if ':' in word and len(word) == 5 and word in self.time_formats:
processed_words.append(self.time_formats[word])
elif word.isdigit() and word in self.numbers:
processed_words.append(self.numbers[word])
else:
processed_words.append(word)
return ' '.join(processed_words)
def convert_word(self, spanish_word):
if not spanish_word:
return ''
word = spanish_word.lower()
for digraph in sorted(self.digraphs, key=len, reverse=True):
word = word.replace(digraph, self.digraphs[digraph])
ipa = ''
i = 0
spanish_letters = "aábcdeéfghiíjklmnñoópqrstuúüvwxyz"
while i < len(word):
char = word[i]
if char == '§':
if i + 1 < len(word):
if word[i+1] == 'ü':
ipa += 'gw'
i += 2
continue
elif word[i+1] == 'u':
ipa += 'g'
i += 2
continue
ipa += 'g'
i += 1
continue
if char not in spanish_letters:
ipa += char
i += 1
continue
if char == 'c':
if i + 1 < len(word) and word[i+1] in "eéií":
ipa += 'θ'
else:
ipa += 'k'
i += 1
continue
if char == 'g':
if i + 1 < len(word) and word[i+1] in "eéií":
ipa += 'x'
else:
ipa += 'g'
i += 1
continue
if char == 'r':
ipa += 'r' if i == 0 else 'ɾ'
i += 1
continue
if char == 'y':
ipa += 'i' if word == 'y' else self.basic_mapping.get(char, char)
i += 1
continue
ipa += self.basic_mapping.get(char, char)
i += 1
return ipa
def convert_text(self, text):
preprocessed_text = self.preprocess_text(text)
words = preprocessed_text.split()
spanish_letters = "aábcdeéfghiíjklmnñoópqrstuúüvwxyz"
converted = []
for word in words:
if any(c in spanish_letters for c in word.lower()):
ipa_word = self.convert_word(word)
converted.append(f'[{word}](/{ipa_word}/)')
else:
converted.append(word)
return ' '.join(converted)
#############################################
# Colab interface built with ipywidgets
#############################################
class TTSAppColab:
def __init__(self):
        # The IPA converter is used only to display the conversion (for reference)
self.spanish_converter = SpanishToIPA()
self.current_audio = None
self.temp_file = None
        # List of official voices (taken from the Kokoro repository)
voices = [
'af_alloy', 'af_aoede', 'af_bella', 'af_heart', 'af_jessica',
'af_kore', 'af_nicole', 'af_nova', 'af_river', 'af_sarah', 'af_sky',
'am_adam', 'am_echo', 'am_eric', 'am_fenrir', 'am_liam', 'am_michael',
'am_onyx', 'am_puck', 'am_santa', 'bf_alice', 'bf_emma', 'bf_isabella',
'bf_lily', 'bm_daniel', 'bm_fable', 'bm_george', 'bm_lewis', 'ef_dora',
'em_alex', 'em_santa', 'ff_siwis', 'hf_alpha', 'hf_beta', 'hm_omega',
'hm_psi', 'if_sara', 'im_nicola', 'jf_alpha', 'jf_gongitsune',
'jf_nezumi', 'jf_tebukuro', 'jm_kumo', 'pf_dora', 'pm_alex',
'pm_santa', 'zf_xiaobei', 'zf_xiaoni', 'zf_xiaoxiao', 'zf_xiaoyi',
'zm_yunjian', 'zm_yunxi', 'zm_yunxia', 'zm_yunyang'
]
        # Interface widgets
self.text_area = widgets.Textarea(
value='El cielo sobre el puerto tenía el color de un televisor sintonizado en un canal muerto.',
placeholder='Ingresa texto en español...',
description='Texto:',
layout=widgets.Layout(width='100%', height='150px')
)
self.voice_dropdown = widgets.Dropdown(
options=voices,
            value='em_santa',  # recommended default voice
description='Voz:'
)
self.speed_slider = widgets.FloatSlider(
value=1.0,
min=0.5,
max=2.0,
step=0.1,
description='Velocidad:',
continuous_update=True,
readout_format='.1f'
)
self.generate_button = widgets.Button(
description='Generar Audio',
button_style='success'
)
self.save_button = widgets.Button(
description='Guardar Audio',
button_style='info',
disabled=True
)
self.loading_label = widgets.Label(value='')
self.audio_output = widgets.Output()
        # Attach event handlers (synchronous, to keep the session alive)
self.generate_button.on_click(self.on_generate_clicked)
self.save_button.on_click(self.on_save_clicked)
        # Arrange and display the interface
self.ui = widgets.VBox([
self.text_area,
self.voice_dropdown,
self.speed_slider,
self.generate_button,
self.loading_label,
self.audio_output,
self.save_button
])
display(self.ui)
def on_generate_clicked(self, b):
self.audio_output.clear_output()
self.loading_label.value = "Generando audio..."
self.generate_button.disabled = True
self.save_button.disabled = True
        # Synchronous call to the synthesis
self.generate_audio()
def generate_audio(self):
text = self.text_area.value.strip()
if not text:
self.loading_label.value = "Por favor, ingresa algún texto."
self.generate_button.disabled = False
return
        # Show the IPA-converted text for reference (it is not used for synthesis)
ipa_text = self.spanish_converter.convert_text(text)
print("Texto en IPA (solo referencia):", ipa_text)
try:
            # Create the TTS pipeline for Spanish (lang_code='e')
pipeline = KPipeline(lang_code='e')
all_audio = []
voice_value = self.voice_dropdown.value.strip()
print("Llamando a pipeline con -> Texto:", text)
print("Voz:", voice_value, "Velocidad:", self.speed_slider.value)
generator = pipeline(
text,
voice=voice_value,
speed=self.speed_slider.value,
split_pattern=r'\n+'
)
chunk_count = 0
for gs, ps, audio in generator:
print("Chunk", chunk_count, "generado.")
print("Graphemes:", gs)
print("Phonemes:", ps)
all_audio.append(audio)
chunk_count += 1
if chunk_count == 0 or not all_audio:
self.loading_label.value = "No se generó audio. Verifica la voz o el texto."
else:
self.current_audio = np.concatenate(all_audio)
self.temp_file = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
sf.write(self.temp_file.name, self.current_audio, 24000)
with self.audio_output:
clear_output()
display(Audio(self.current_audio, rate=24000))
self.loading_label.value = "Audio generado con éxito."
self.save_button.disabled = False
except Exception as e:
self.loading_label.value = f"Error: {e}"
print("Error:", e)
self.generate_button.disabled = False
def on_save_clicked(self, b):
if self.current_audio is not None:
filename = 'output_audio.wav'
sf.write(filename, self.current_audio, 24000)
self.loading_label.value = f"Audio guardado como {filename}."
try:
from google.colab import files
files.download(filename)
except Exception as e:
self.loading_label.value += " (Descarga manual requerida)"
else:
self.loading_label.value = "No hay audio para guardar."
# Instantiate and display the application
app = TTSAppColab()
Somehow, I missed the OP. I can confirm that the model did not see any Greek or Armenian training data. My guess is that these languages might be "close enough" to the languages it was trained on, but I only know English, so it's difficult to evaluate.