Tejas1206 committed on
Commit
fdabac0
·
1 Parent(s): 2335126
app.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import librosa
3
+ import numpy as np
4
+ import torch
5
+
6
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
7
+
8
+
9
# Load the three pieces used for synthesis: the text processor, the
# text-to-speech model, and the vocoder handed to generate_speech().
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("tejas1206/speecht5_tts_technical_en")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")


# Three-letter speaker code -> path of the .npy file holding that speaker's
# embedding. Key order matters: the "Surprise Me!" option indexes into it.
speaker_embeddings = dict(
    BDL="speaker/cmu_us_bdl_arctic-wav-arctic_a0009.npy",
    CLB="speaker/cmu_us_clb_arctic-wav-arctic_a0144.npy",
    KSP="speaker/cmu_us_ksp_arctic-wav-arctic_b0087.npy",
    RMS="speaker/cmu_us_rms_arctic-wav-arctic_b0353.npy",
    SLT="speaker/cmu_us_slt_arctic-wav-arctic_a0508.npy",
)
21
+
22
+
23
def _to_pcm16(waveform):
    """Convert a float waveform (nominally in [-1, 1]) to int16 PCM samples."""
    # Clip before scaling: a sample outside [-1, 1] would otherwise exceed the
    # int16 range and wrap around, which is audible as a loud click.
    return (np.clip(waveform, -1.0, 1.0) * 32767).astype(np.int16)


def predict(text, speaker):
    """Synthesize speech for *text* in the voice selected by *speaker*.

    Args:
        text: The text to speak. ``None``, empty, or whitespace-only text
            yields an empty clip without running the model.
        speaker: Either ``"Surprise Me!"`` (derive a random voice from one of
            the stored embeddings) or a label whose first three characters
            are a key of ``speaker_embeddings``, e.g. ``"BDL (male)"``.

    Returns:
        A ``(sample_rate, samples)`` tuple where ``sample_rate`` is 16000 and
        ``samples`` is a NumPy ``int16`` array.

    Raises:
        KeyError: If *speaker* does not map to a known embedding.
    """
    sample_rate = 16000

    if text is None or not text.strip():
        return (sample_rate, np.zeros(0, dtype=np.int16))

    inputs = processor(text=text, return_tensors="pt")

    # Limit input length to what the model's position embeddings support.
    input_ids = inputs["input_ids"][..., :model.config.max_text_positions]

    if speaker == "Surprise Me!":
        # Start from one of the provided speaker embeddings, picked at random.
        idx = np.random.randint(len(speaker_embeddings))
        key = list(speaker_embeddings)[idx]
        speaker_embedding = np.load(speaker_embeddings[key])

        # Randomly shuffle the elements ...
        np.random.shuffle(speaker_embedding)

        # ... then flip the sign of roughly half of them. The mask is sized
        # from the embedding itself rather than a hard-coded length.
        signs = np.where(np.random.rand(*speaker_embedding.shape) >= 0.5, 1.0, -1.0)
        speaker_embedding *= signs.astype(speaker_embedding.dtype)
    else:
        # Labels look like "BDL (male)"; the first three characters are the key.
        speaker_embedding = np.load(speaker_embeddings[speaker[:3]])

    # generate_speech is given the embedding with a leading batch axis.
    speaker_embedding = torch.tensor(speaker_embedding).unsqueeze(0)

    speech = model.generate_speech(input_ids, speaker_embedding, vocoder=vocoder)

    return (sample_rate, _to_pcm16(speech.numpy()))
57
+
58
+
59
title = "Text-to-Speech App using SpeechT5"

# Build the interface first, then start serving it: a text box and a speaker
# picker feed predict(), whose (sample_rate, samples) result is played back.
demo = gr.Interface(
    fn=predict,
    inputs=[
        gr.Text(label="Input Text"),
        gr.Radio(
            label="Speaker",
            choices=[
                "BDL (male)",
                "CLB (female)",
                "KSP (male)",
                "RMS (male)",
                "SLT (female)",
                "Surprise Me!",
            ],
            value="BDL (male)",
        ),
    ],
    outputs=[gr.Audio(label="Generated Speech", type="numpy")],
    title=title,
)
demo.launch()
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ gradio==5.1.0
2
+ torch==2.4.0
3
+ git+https://github.com/huggingface/transformers.git
4
+ soundfile==0.12.1
5
+ sentencepiece==0.2.0
6
+ samplerate
7
+ librosa
8
+ resampy
speaker/cmu_us_awb_arctic-wav-arctic_a0002.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5db7a684ab490f21cec1628e00d461a184e369fe4eafb1ee441a796faf4ab6ae
3
+ size 2176
speaker/cmu_us_bdl_arctic-wav-arctic_a0009.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:215326eae3a428af8934c385fbe043b36c72849ca17d1d013adeb189e6bd6962
3
+ size 2176
speaker/cmu_us_clb_arctic-wav-arctic_a0144.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf67b36c47edfb1851466a1dff081b436bc6809b5ebc12811d9df0c0d0f28d0e
3
+ size 2176
speaker/cmu_us_ksp_arctic-wav-arctic_b0087.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f6c5c2a38c2e400179019c560a74c4322f4ee13beda22ee601807545edee283e
3
+ size 2176
speaker/cmu_us_rms_arctic-wav-arctic_b0353.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a49dac3e9c3a71a4dbca4c364233c7915ae6e0cb71b2ceaed97296231b95cb50
3
+ size 2176
speaker/cmu_us_slt_arctic-wav-arctic_a0508.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f71ffadda3f3a4de079740a0b34963824dc644d9d5442283bd0a2b0d4f44ff0b
3
+ size 2176