hivecorp committed on
Commit
454da09
·
verified ·
1 Parent(s): 05a4865

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +48 -21
app.py CHANGED
@@ -1,31 +1,58 @@
 
 
 
 
1
  import gradio as gr
2
- from transformers import pipeline
3
 
4
- # Define a list of available speaker models
5
- SPEAKER_MODELS = {
6
- "Default": "onnx-community/Kokoro-82M-ONNX",
7
- "Speaker 1": "onnx-community/Kokoro-82M-ONNX",
8
- "Speaker 2": "onnx-community/Kokoro-82M-ONNX"
 
 
 
 
9
  }
10
 
11
- def kokoro_tts(text, speaker):
12
- # Initialize the transformers pipeline for text-to-speech with the selected speaker model
13
- tts_pipeline = pipeline("text-to-speech", model=SPEAKER_MODELS[speaker])
14
- # Generate speech from text
15
- speech = tts_pipeline(text)
16
- return speech["audio"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
- # Create a Gradio interface
19
  iface = gr.Interface(
20
- fn=kokoro_tts,
21
  inputs=[
22
- gr.Textbox(lines=2, placeholder="Enter text here..."),
23
- gr.Dropdown(choices=list(SPEAKER_MODELS.keys()), label="Select Speaker")
24
  ],
25
- outputs=gr.Audio(label="Generated Speech"),
26
- title="Kokoro Text-to-Speech",
27
- description="A Text-to-Speech app powered by Hugging Face Transformers.js with multiple speaker options"
28
  )
29
 
30
- if __name__ == "__main__":
31
- iface.launch()
 
1
+ import numpy as np
2
+ import onnxruntime as ort
3
+ import torch
4
+ import scipy.io.wavfile as wav
5
  import gradio as gr
 
6
 
7
# Location of the exported Kokoro ONNX graph, relative to the app's working dir.
model_path = "Kokoro-82M-ONNX/model.onnx"

# One shared inference session, created once at import time and reused
# by every request (sessions are expensive to construct).
ort_session = ort.InferenceSession(model_path)

# Placeholder speaker table: display name -> embedding identifier.
# NOTE(review): the values are placeholder strings — replace them with
# real speaker embeddings before inference will work.
speaker_options = {f"Speaker {i}": f"spk_{i}_embedding" for i in range(1, 4)}
17
 
18
# Function to generate speech
def generate_speech(text, speaker, sample_rate=22050):
    """Synthesize *text* with the selected speaker and return a WAV file path.

    Parameters
    ----------
    text : str
        The text to synthesize.
    speaker : str
        A key of ``speaker_options`` selecting the speaker embedding.
    sample_rate : int, optional
        Sample rate (Hz) written into the WAV header. Defaults to 22050,
        matching the previous hard-coded value.

    Returns
    -------
    str
        Path of the generated WAV file ("output.wav").

    Raises
    ------
    TypeError
        If the configured speaker embedding is still a placeholder string.
    KeyError
        If *speaker* is not a key of ``speaker_options``.
    """
    embedding = speaker_options[speaker]
    # The shipped speaker_options values are placeholder *strings*;
    # np.array(<str>, dtype=np.float32) would raise an opaque ValueError
    # deep inside numpy. Fail fast with an actionable message instead.
    if isinstance(embedding, str):
        raise TypeError(
            f"speaker_options[{speaker!r}] is a placeholder string; "
            "replace it with a numeric embedding vector before running TTS."
        )

    # Preprocess the input text and speaker embedding.
    input_text = np.array([text], dtype=np.str_)
    speaker_embedding = np.array([embedding], dtype=np.float32)

    # Run the ONNX model.
    # NOTE(review): "text" / "speaker_embedding" are assumed to be the
    # graph's input names — confirm against the exported model.
    ort_inputs = {
        "text": input_text,
        "speaker_embedding": speaker_embedding,
    }
    ort_outputs = ort_session.run(None, ort_inputs)

    # Postprocess the output (assuming the first output is the waveform).
    waveform = ort_outputs[0].squeeze()

    # Persist as WAV so Gradio can serve it via type="filepath".
    output_file = "output.wav"
    wav.write(output_file, sample_rate, waveform)

    return output_file
39
+
40
# Gradio callback: delegate synthesis and hand back the WAV file path.
def tts_app(text, speaker):
    """Return the path of the WAV file synthesized for *text* with *speaker*."""
    return generate_speech(text, speaker)
44
 
45
# Create the Gradio app: a text box plus a speaker picker feeding the
# TTS callback; the audio component serves the returned file path.
text_input = gr.Textbox(label="Input Text")
speaker_input = gr.Dropdown(choices=list(speaker_options.keys()), label="Speaker")

iface = gr.Interface(
    fn=tts_app,
    inputs=[text_input, speaker_input],
    outputs=gr.Audio(label="Generated Speech", type="filepath"),
    title="Text-to-Speech with Kokoro-82M-ONNX",
    description="Generate speech from text using the Kokoro-82M-ONNX model with multiple speaker options.",
)

# Launch only when executed as a script (the previous revision of this
# file had this guard; launching at import time breaks importers/tests).
if __name__ == "__main__":
    iface.launch()