Spaces:

yakhyo
/

kokoro-onnx

Running

App Files Files Community

kokoro-onnx / app.py

yakhyo

Update app.py

1ea9ebb verified 14 days ago

raw

history blame contribute delete

4.12 kB

	import os
	import gradio as gr
	import tempfile
	import soundfile as sf
	from models import Tokenizer, Kokoro

	# Function to fetch available style vectors dynamically


	def get_style_vector_choices(directory: str = "voices") -> list:
	    """Return the names of the ``.pt`` entries in ``directory``, sorted.

	    ``os.listdir`` yields entries in arbitrary order, so the result is
	    sorted to keep the dropdown order (and the default taken from index 0)
	    the same on every run and platform.

	    Args:
	        directory: Directory to scan for style vector files.

	    Returns:
	        Sorted list of entry names (not full paths) ending in ``.pt``.

	    Raises:
	        OSError: If ``directory`` cannot be listed.
	    """
	    return sorted(name for name in os.listdir(directory) if name.endswith(".pt"))


	def get_onnx_models(directory: str = "weights") -> list:
	    """Return the names of the ``.onnx`` entries in ``directory``, sorted.

	    ``os.listdir`` yields entries in arbitrary order, so the result is
	    sorted to keep the dropdown order (and the default taken from index 0)
	    the same on every run and platform.

	    Args:
	        directory: Directory to scan for ONNX model files.

	    Returns:
	        Sorted list of entry names (not full paths) ending in ``.onnx``.

	    Raises:
	        OSError: If ``directory`` cannot be listed.
	    """
	    return sorted(name for name in os.listdir(directory) if name.endswith(".onnx"))

	# Function to perform TTS using your local model


	def local_tts(
	    text: str,
	    model_path: str,
	    style_vector: str,
	    output_file_format: str = "wav",
	    speed: float = 1.0
	) -> str:
	    """Synthesize ``text`` into an audio file and return that file's path.

	    Args:
	        text: Text to convert to speech.
	        model_path: Model file name; joined onto the ``weights`` directory.
	        style_vector: Style vector file name; joined onto ``voices``.
	        output_file_format: Extension used as the temporary file's suffix.
	        speed: Forwarded to ``generate_audio`` as ``speed``.

	    Returns:
	        Path of a temporary file holding the generated audio. The file is
	        created with ``delete=False``, so it outlives this call.

	    Raises:
	        gr.Error: If ``text`` is empty or whitespace-only, or if anything
	            fails while loading the model, generating or writing audio.
	    """
	    # Guard clause: also covers None and whitespace-only input, which
	    # ``len(text) > 0`` either crashed on or let through to the model.
	    if not text or not text.strip():
	        raise gr.Error("Input text cannot be empty.")

	    temp_file_path = None
	    try:
	        tokenizer = Tokenizer()
	        style_vector_path = os.path.join("voices", style_vector)
	        model_path = os.path.join("weights", model_path)

	        inference = Kokoro(model_path, style_vector_path, tokenizer=tokenizer, lang='en-us')

	        audio, sample_rate = inference.generate_audio(text, speed=speed)

	        # Only reserve a unique path here; the handle is closed before the
	        # file is written by name, because a still-open NamedTemporaryFile
	        # cannot be reopened by name on Windows.
	        with tempfile.NamedTemporaryFile(suffix=f".{output_file_format}", delete=False) as temp_file:
	            temp_file_path = temp_file.name
	        sf.write(temp_file_path, audio, sample_rate)

	        return temp_file_path

	    except Exception as e:
	        # Do not leave a half-written temp file behind on failure.
	        if temp_file_path is not None:
	            try:
	                os.remove(temp_file_path)
	            except OSError:
	                pass
	        raise gr.Error(f"An error occurred during TTS inference: {str(e)}") from e


	# Dropdown options, collected once at import time from the local
	# "voices" and "weights" directories.
	style_vector_choices = get_style_vector_choices(directory="voices")
	onnx_models_choices = get_onnx_models(directory="weights")

	# Showcase samples as (title, text, pre-generated audio path) tuples.
	sample_outputs = [
	    (
	        "Educational Note",
	        "Machine learning models rely on large datasets and complex algorithms to identify patterns and make predictions.",
	        "assets/edu_note.wav",
	    ),
	    (
	        "Fun Fact",
	        "Did you know that honey never spoils? Archaeologists have found pots of honey in ancient Egyptian tombs that are over 3,000 years old and still edible!",
	        "assets/fun_fact.wav",
	    ),
	    (
	        "Thanks",
	        "Thank you for listening to this audio. It was generated by the Kokoro TTS model.",
	        "assets/thanks.wav",
	    ),
	]

	# One single-column row per sample text, reusing the texts above.
	example_texts = [[sample_text] for _topic, sample_text, _audio in sample_outputs]

	# Gradio interface: controls row, text box, generate button, audio output,
	# clickable example texts and a gallery of pre-generated samples.
	with gr.Blocks() as demo:
	    # "\\|" keeps the runtime text (backslash + pipe) unchanged while
	    # avoiding the invalid "\|" escape sequence in the source.
	    gr.Markdown("## <center> Kokoro TTS ONNX Inference \\| [GitHub Link](https://github.com/yakhyo/kokoro-onnx) </center>")

	    # Model-specific inputs
	    with gr.Row(variant="panel"):
	        # Fall back to no preselection when a directory has no matching
	        # files, instead of failing with IndexError while building the UI.
	        model_path = gr.Dropdown(
	            choices=onnx_models_choices,
	            label="ONNX Model Path",
	            value=onnx_models_choices[0] if onnx_models_choices else None
	        )
	        style_vector = gr.Dropdown(
	            choices=style_vector_choices,
	            label="Style Vector",
	            value=style_vector_choices[0] if style_vector_choices else None
	        )
	        output_file_format = gr.Dropdown(choices=["wav", "mp3"], label="Output Format", value="wav")
	        speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="Speed")

	    # Text input and output
	    text = gr.Textbox(
	        label="Input Text",
	        placeholder="Enter text to convert to speech."
	    )
	    btn = gr.Button("Generate Speech")
	    output_audio = gr.Audio(label="Generated Audio", type="filepath")

	    # Link inputs and outputs: the input order matches local_tts's parameters
	    btn.click(
	        fn=local_tts,
	        inputs=[text, model_path, style_vector, output_file_format, speed],
	        outputs=output_audio
	    )

	    # Add example texts
	    gr.Examples(
	        examples=example_texts,
	        inputs=[text],
	        label="Click an example to populate the input text"
	    )

	    # Add sample texts next to their pre-generated audio
	    gr.Markdown("### Sample Texts and Audio")
	    for topic, sample_text, sample_audio in sample_outputs:
	        with gr.Row():
	            gr.Textbox(value=sample_text, label=topic, interactive=False)
	            gr.Audio(value=sample_audio, label="Example Audio", type="filepath", interactive=False)

	# "0.0.0.0" makes the server listen on all network interfaces.
	demo.launch(server_name="0.0.0.0")