AshwinSankar committed
Commit e9706fe · 1 parent: b47d734

initial commit

Files changed (4)
  1. README.md +10 -7
  2. app.py +257 -0
  3. assets/.gitkeep +0 -0
  4. lang_list.py +64 -0
README.md CHANGED
@@ -1,14 +1,17 @@
 ---
-title: Seamless M4t V2 Large Stt
-emoji:
-colorFrom: indigo
-colorTo: green
+title: Seamless M4T v2
+emoji: 📞
+colorFrom: blue
+colorTo: yellow
 sdk: gradio
-sdk_version: 5.20.0
+sdk_version: 5.7.1
 app_file: app.py
-pinned: false
+pinned: true
 license: cc-by-nc-4.0
-short_description: A Indian speech translation demo
+short_description: A demo of Indic Seamless M4T v2 Large
+suggested_hardware: l4x1
+models:
+- ai4bharat/seamless-m4t-v2-large-stt
 ---

 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
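The new front matter registers the Space under the Gradio SDK and links it to the ai4bharat/seamless-m4t-v2-large-stt model. Once the Space is live, the `s2tt` and `asr` endpoints that app.py (below) registers can also be called programmatically. A minimal sketch with `gradio_client`; the Space id used here is a placeholder assumption, not part of this commit:

```python
# Hedged sketch: the Space id below is a placeholder, not part of this commit.
from gradio_client import Client, handle_file

client = Client("ai4bharat/indic-seamless-m4t-v2")  # placeholder Space id
text = client.predict(
    handle_file("assets/Bengali.wav"),  # input speech (any local audio file)
    "Bengali",                          # source language (informational only)
    "English",                          # target language
    api_name="/s2tt",
)
print(text)
```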
app.py ADDED
@@ -0,0 +1,257 @@
+from __future__ import annotations
+
+import os
+
+import gradio as gr
+import torch
+import torchaudio
+from transformers import (
+    SeamlessM4TFeatureExtractor,
+    SeamlessM4TTokenizer,
+    SeamlessM4Tv2ForSpeechToText,
+)
+
+from lang_list import (
+    ASR_TARGET_LANGUAGE_NAMES,
+    LANGUAGE_NAME_TO_CODE,
+    S2ST_TARGET_LANGUAGE_NAMES,
+    S2TT_TARGET_LANGUAGE_NAMES,
+    T2ST_TARGET_LANGUAGE_NAMES,
+    TEXT_SOURCE_LANGUAGE_NAMES,
+)
+
+DESCRIPTION = """\
+# SeamlessM4T
+
+[SeamlessM4T](https://github.com/facebookresearch/seamless_communication) is designed to provide high-quality
+translation, allowing people from different linguistic communities to communicate effortlessly through speech and text.
+This unified model enables multiple tasks such as Speech-to-Speech (S2ST), Speech-to-Text (S2TT) and Text-to-Speech
+(T2ST) translation without relying on several separate models. The model is also in use on the
+[SeamlessM4T demo website](https://seamless.metademolab.com/m4t?utm_source=huggingface&utm_medium=web&utm_campaign=seamless&utm_content=m4tspace).
+"""
+
+hf_token = os.getenv("HF_TOKEN")
+
+CACHE_EXAMPLES = os.getenv("CACHE_EXAMPLES") == "1" and torch.cuda.is_available()
+
+AUDIO_SAMPLE_RATE = 16000
+MAX_INPUT_AUDIO_LENGTH = 60  # in seconds
+DEFAULT_TARGET_LANGUAGE = "Hindi"
+
+# Pick the device and dtype once and use them everywhere, so the demo also
+# runs (slowly) on CPU instead of failing when CUDA is unavailable.
+if torch.cuda.is_available():
+    device = torch.device("cuda:0")
+    dtype = torch.float16
+else:
+    device = torch.device("cpu")
+    dtype = torch.float32
+
+MODEL_ID = "ai4bharat/seamless-m4t-v2-large-stt"
+model = SeamlessM4Tv2ForSpeechToText.from_pretrained(MODEL_ID, torch_dtype=dtype, token=hf_token).to(device)
+processor = SeamlessM4TFeatureExtractor.from_pretrained(MODEL_ID, token=hf_token)
+tokenizer = SeamlessM4TTokenizer.from_pretrained(MODEL_ID, token=hf_token)
+
+
+def preprocess_audio(input_audio: str) -> None:
+    """Resample the file to 16 kHz in place and truncate it to 60 seconds."""
+    arr, org_sr = torchaudio.load(input_audio)
+    new_arr = torchaudio.functional.resample(arr, orig_freq=org_sr, new_freq=AUDIO_SAMPLE_RATE)
+    max_length = int(MAX_INPUT_AUDIO_LENGTH * AUDIO_SAMPLE_RATE)
+    if new_arr.shape[1] > max_length:
+        new_arr = new_arr[:, :max_length]
+        gr.Warning(f"Input audio is too long. Only the first {MAX_INPUT_AUDIO_LENGTH} seconds is used.")
+    torchaudio.save(input_audio, new_arr, sample_rate=AUDIO_SAMPLE_RATE)
+
+
+def run_s2tt(input_audio: str, source_language: str, target_language: str) -> str:
+    # The model detects the spoken language itself; source_language only
+    # drives the UI dropdown.
+    preprocess_audio(input_audio)
+    target_language_code = LANGUAGE_NAME_TO_CODE[target_language]
+
+    waveform, orig_freq = torchaudio.load(input_audio)
+    waveform = torchaudio.functional.resample(waveform, orig_freq=orig_freq, new_freq=AUDIO_SAMPLE_RATE)
+    audio_inputs = processor(waveform, sampling_rate=AUDIO_SAMPLE_RATE, return_tensors="pt").to(device=device, dtype=dtype)
+
+    # generate() returns integer token ids; decode them back to text.
+    text_out = model.generate(**audio_inputs, tgt_lang=target_language_code)[0].cpu().numpy().squeeze()
+    return tokenizer.decode(text_out, clean_up_tokenization_spaces=True, skip_special_tokens=True)
+
+
+def run_asr(input_audio: str, target_language: str) -> str:
+    # ASR is S2TT with the target language equal to the spoken language.
+    return run_s2tt(input_audio, target_language, target_language)
+
+
+with gr.Blocks() as demo_s2st:
+    with gr.Row():
+        with gr.Column():
+            with gr.Group():
+                input_audio = gr.Audio(label="Input speech", type="filepath")
+                source_language = gr.Dropdown(
+                    label="Source language",
+                    choices=ASR_TARGET_LANGUAGE_NAMES,
+                    value="English",
+                )
+                target_language = gr.Dropdown(
+                    label="Target language",
+                    choices=S2ST_TARGET_LANGUAGE_NAMES,
+                    value=DEFAULT_TARGET_LANGUAGE,
+                )
+            btn = gr.Button("Translate")
+        with gr.Column():
+            with gr.Group():
+                output_audio = gr.Audio(
+                    label="Translated speech",
+                    autoplay=False,
+                    streaming=False,
+                    type="numpy",
+                )
+                output_text = gr.Textbox(label="Translated text")
+
+with gr.Blocks() as demo_s2tt:
+    with gr.Row():
+        with gr.Column():
+            with gr.Group():
+                input_audio = gr.Audio(label="Input speech", type="filepath")
+                source_language = gr.Dropdown(
+                    label="Source language",
+                    choices=ASR_TARGET_LANGUAGE_NAMES,
+                    value="English",
+                )
+                target_language = gr.Dropdown(
+                    label="Target language",
+                    choices=S2TT_TARGET_LANGUAGE_NAMES,
+                    value=DEFAULT_TARGET_LANGUAGE,
+                )
+            btn = gr.Button("Translate")
+        with gr.Column():
+            output_text = gr.Textbox(label="Translated text")
+
+    gr.Examples(
+        examples=[
+            ["assets/Bengali.wav", "Bengali", "English"],
+            ["assets/Gujarati.wav", "Gujarati", "Hindi"],
+            ["assets/Punjabi.wav", "Punjabi", "Hindi"],
+        ],
+        inputs=[input_audio, source_language, target_language],
+        outputs=output_text,
+        fn=run_s2tt,
+        cache_examples=CACHE_EXAMPLES,
+        api_name=False,
+    )
+
+    btn.click(
+        fn=run_s2tt,
+        inputs=[input_audio, source_language, target_language],
+        outputs=output_text,
+        api_name="s2tt",
+    )
+
+with gr.Blocks() as demo_t2st:
+    with gr.Row():
+        with gr.Column():
+            with gr.Group():
+                input_text = gr.Textbox(label="Input text")
+                with gr.Row():
+                    source_language = gr.Dropdown(
+                        label="Source language",
+                        choices=TEXT_SOURCE_LANGUAGE_NAMES,
+                        value="English",
+                    )
+                    target_language = gr.Dropdown(
+                        label="Target language",
+                        choices=T2ST_TARGET_LANGUAGE_NAMES,
+                        value=DEFAULT_TARGET_LANGUAGE,
+                    )
+            btn = gr.Button("Translate")
+        with gr.Column():
+            with gr.Group():
+                output_audio = gr.Audio(
+                    label="Translated speech",
+                    autoplay=False,
+                    streaming=False,
+                    type="numpy",
+                )
+                output_text = gr.Textbox(label="Translated text")
+
+
+with gr.Blocks() as demo_asr:
+    with gr.Row():
+        with gr.Column():
+            with gr.Group():
+                input_audio = gr.Audio(label="Input speech", type="filepath")
+                target_language = gr.Dropdown(
+                    label="Target language",
+                    choices=ASR_TARGET_LANGUAGE_NAMES,
+                    value=DEFAULT_TARGET_LANGUAGE,
+                )
+            btn = gr.Button("Transcribe")
+        with gr.Column():
+            output_text = gr.Textbox(label="Transcribed text")
+
+    # Each example row must match the two inputs (audio, language).
+    gr.Examples(
+        examples=[
+            ["assets/Bengali.wav", "Bengali"],
+            ["assets/Gujarati.wav", "Gujarati"],
+            ["assets/Punjabi.wav", "Punjabi"],
+        ],
+        inputs=[input_audio, target_language],
+        outputs=output_text,
+        fn=run_asr,
+        cache_examples=CACHE_EXAMPLES,
+        api_name=False,
+    )
+
+    btn.click(
+        fn=run_asr,
+        inputs=[input_audio, target_language],
+        outputs=output_text,
+        api_name="asr",
+    )
+
+
+with gr.Blocks(css="style.css") as demo:
+    gr.Markdown(DESCRIPTION)
+    gr.DuplicateButton(
+        value="Duplicate Space for private use",
+        elem_id="duplicate-button",
+        visible=os.getenv("SHOW_DUPLICATE_BUTTON") == "1",
+    )
+
+    with gr.Tabs():
+        # Speech-output tabs stay disabled: this checkpoint has no speech decoder.
+        # with gr.Tab(label="S2ST"):
+        #     demo_s2st.render()
+        with gr.Tab(label="S2TT"):
+            demo_s2tt.render()
+        # with gr.Tab(label="T2ST"):
+        #     demo_t2st.render()
+        # with gr.Tab(label="T2TT"):
+        #     demo_t2tt.render()
+        with gr.Tab(label="ASR"):
+            demo_asr.render()
+
+
+if __name__ == "__main__":
+    demo.queue(max_size=50).launch()
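For reference, the same speech-to-text path can be exercised without Gradio. A minimal standalone sketch, assuming `HF_TOKEN` grants access to the gated ai4bharat/seamless-m4t-v2-large-stt checkpoint and that a local sample file exists (the audio path is illustrative):

```python
# Hedged sketch: HF_TOKEN access and assets/Bengali.wav are assumptions,
# not facts established by this commit.
import os

import torch
import torchaudio
from transformers import (
    SeamlessM4TFeatureExtractor,
    SeamlessM4TTokenizer,
    SeamlessM4Tv2ForSpeechToText,
)

MODEL_ID = "ai4bharat/seamless-m4t-v2-large-stt"
token = os.getenv("HF_TOKEN")
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32

model = SeamlessM4Tv2ForSpeechToText.from_pretrained(MODEL_ID, torch_dtype=dtype, token=token).to(device)
processor = SeamlessM4TFeatureExtractor.from_pretrained(MODEL_ID, token=token)
tokenizer = SeamlessM4TTokenizer.from_pretrained(MODEL_ID, token=token)

# Load a speech clip and resample it to the 16 kHz the model expects.
waveform, sr = torchaudio.load("assets/Bengali.wav")
waveform = torchaudio.functional.resample(waveform, orig_freq=sr, new_freq=16000)

inputs = processor(waveform, sampling_rate=16000, return_tensors="pt").to(device=device, dtype=dtype)
ids = model.generate(**inputs, tgt_lang="eng")[0]  # "eng" = English, per lang_list.py
print(tokenizer.decode(ids, skip_special_tokens=True))
```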
assets/.gitkeep ADDED
File without changes
lang_list.py ADDED
@@ -0,0 +1,64 @@
+# Mapping from ISO 639-3 language code to display name.
+language_code_to_name = {
+    "asm": "Assamese",
+    "ben": "Bengali",
+    "guj": "Gujarati",
+    "hin": "Hindi",
+    "kan": "Kannada",
+    "mal": "Malayalam",
+    "mar": "Marathi",
+    "ory": "Odia",
+    "pan": "Punjabi",
+    "tam": "Tamil",
+    "tel": "Telugu",
+    "urd": "Urdu",
+    "eng": "English",
+}
+LANGUAGE_NAME_TO_CODE = {v: k for k, v in language_code_to_name.items()}
+
+# Source languages: S2ST / S2TT / ASR do not need a source language;
+# T2TT / T2ST use this list.
+text_source_language_codes = [
+    "asm",
+    "ben",
+    "guj",
+    "hin",
+    "kan",
+    "mal",
+    "mar",
+    "ory",
+    "pan",
+    "tam",
+    "tel",
+    "urd",
+    "eng",
+]
+TEXT_SOURCE_LANGUAGE_NAMES = sorted(language_code_to_name[code] for code in text_source_language_codes)
+
+# Target languages for S2ST / T2ST.
+s2st_target_language_codes = [
+    "asm",
+    "ben",
+    "guj",
+    "hin",
+    "kan",
+    "mal",
+    "mar",
+    "ory",
+    "pan",
+    "tam",
+    "tel",
+    "urd",
+    "eng",
+]
+S2ST_TARGET_LANGUAGE_NAMES = sorted(language_code_to_name[code] for code in s2st_target_language_codes)
+T2ST_TARGET_LANGUAGE_NAMES = S2ST_TARGET_LANGUAGE_NAMES
+
+# Target languages for S2TT / T2TT / ASR.
+S2TT_TARGET_LANGUAGE_NAMES = TEXT_SOURCE_LANGUAGE_NAMES
+T2TT_TARGET_LANGUAGE_NAMES = TEXT_SOURCE_LANGUAGE_NAMES
+ASR_TARGET_LANGUAGE_NAMES = TEXT_SOURCE_LANGUAGE_NAMES
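The dropdowns in app.py display language names, while the model's `generate()` wants ISO 639-3 codes; `LANGUAGE_NAME_TO_CODE` bridges the two. A small usage check:

```python
from lang_list import ASR_TARGET_LANGUAGE_NAMES, LANGUAGE_NAME_TO_CODE

print(LANGUAGE_NAME_TO_CODE["Hindi"])  # "hin" -- the tgt_lang passed to generate()
print(ASR_TARGET_LANGUAGE_NAMES[:3])   # ['Assamese', 'Bengali', 'English'] (sorted display names)
```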