jhj0517 committed
Commit a85ea1b · 1 Parent(s): 19ab4f1

add gradio components

Files changed (1): app.py (+319, −109)
app.py CHANGED
Old side of the diff (removed lines are prefixed with "-"):

@@ -1,5 +1,6 @@
import os
import argparse

from modules.whisper.whisper_Inference import WhisperInference
from modules.whisper.faster_whisper_inference import FasterWhisperInference
@@ -84,7 +85,7 @@ class App:
with gr.Column():
input_file = gr.Files(type="filepath", label="Upload File here")
tb_input_folder = gr.Textbox(label="Input Folder Path (Optional)",
- info="Optional: Specify the folder path where the input files are located, if you prefer to use local files instead of uploading them."
" Leave this field empty if you do not wish to use a local path.",
visible=self.args.colab,
value="")
@@ -97,32 +98,83 @@ class App:
with gr.Row():
cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
with gr.Row():
- cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename", interactive=True)
with gr.Accordion("Advanced Parameters", open=False):
nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
- nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
- dd_compute_type = gr.Dropdown(label="Compute Type", choices=self.whisper_inf.available_compute_types, value=self.whisper_inf.current_compute_type, interactive=True)
nb_best_of = gr.Number(label="Best Of", value=5, interactive=True)
nb_patience = gr.Number(label="Patience", value=1, interactive=True)
- cb_condition_on_previous_text = gr.Checkbox(label="Condition On Previous Text", value=True, interactive=True)
tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
- sd_temperature = gr.Slider(label="Temperature", value=0, step=0.01, maximum=1.0, interactive=True)
- nb_compression_ratio_threshold = gr.Number(label="Compression Ratio Threshold", value=2.4, interactive=True)
with gr.Accordion("VAD", open=False):
cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
- sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5, info="Lower it to be more sensitive to small sounds.")
- nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
- nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
with gr.Accordion("Diarization", open=False):
cb_diarize = gr.Checkbox(label="Enable Diarization")
tb_hf_token = gr.Text(label="HuggingFace Token", value="",
info="This is only needed the first time you download the model. If you already have models, you don't need to enter. "
- "To download the model, you must manually go to \"https://huggingface.co/pyannote/speaker-diarization-3.1\" and agree to their requirement.")
- dd_diarization_device = gr.Dropdown(label="Device", choices=self.whisper_inf.diarizer.get_available_device(), value=self.whisper_inf.diarizer.get_device())
- with gr.Accordion("Insanely Fast Whisper Parameters", open=False, visible=isinstance(self.whisper_inf, InsanelyFastWhisperInference)):
nb_chunk_length_s = gr.Number(label="Chunk Lengths (sec)", value=30, precision=0)
nb_batch_size = gr.Number(label="Batch Size", value=24, precision=0)
with gr.Row():
@@ -133,30 +185,48 @@ class App:
btn_openfolder = gr.Button('📂', scale=1)

params = [input_file, tb_input_folder, dd_file_format, cb_timestamp]
- whisper_params = WhisperParameters(model_size=dd_model,
- lang=dd_lang,
- is_translate=cb_translate,
- beam_size=nb_beam_size,
- log_prob_threshold=nb_log_prob_threshold,
- no_speech_threshold=nb_no_speech_threshold,
- compute_type=dd_compute_type,
- best_of=nb_best_of,
- patience=nb_patience,
- condition_on_previous_text=cb_condition_on_previous_text,
- initial_prompt=tb_initial_prompt,
- temperature=sd_temperature,
- compression_ratio_threshold=nb_compression_ratio_threshold,
- vad_filter=cb_vad_filter,
- threshold=sd_threshold,
- min_speech_duration_ms=nb_min_speech_duration_ms,
- max_speech_duration_s=nb_max_speech_duration_s,
- min_silence_duration_ms=nb_min_silence_duration_ms,
- speech_pad_ms=nb_speech_pad_ms,
- chunk_length_s=nb_chunk_length_s,
- batch_size=nb_batch_size,
- is_diarize=cb_diarize,
- hf_token=tb_hf_token,
- diarization_device=dd_diarization_device)

btn_run.click(fn=self.whisper_inf.transcribe_file,
inputs=params + whisper_params.as_list(),
@@ -186,28 +256,77 @@ class App:
interactive=True)
with gr.Accordion("Advanced Parameters", open=False):
nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
- nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
- dd_compute_type = gr.Dropdown(label="Compute Type", choices=self.whisper_inf.available_compute_types, value=self.whisper_inf.current_compute_type, interactive=True)
nb_best_of = gr.Number(label="Best Of", value=5, interactive=True)
nb_patience = gr.Number(label="Patience", value=1, interactive=True)
- cb_condition_on_previous_text = gr.Checkbox(label="Condition On Previous Text", value=True, interactive=True)
tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
- sd_temperature = gr.Slider(label="Temperature", value=0, step=0.01, maximum=1.0, interactive=True)
- nb_compression_ratio_threshold = gr.Number(label="Compression Ratio Threshold", value=2.4, interactive=True)
with gr.Accordion("VAD", open=False):
cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
- sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5, info="Lower it to be more sensitive to small sounds.")
- nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
- nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
with gr.Accordion("Diarization", open=False):
cb_diarize = gr.Checkbox(label="Enable Diarization")
tb_hf_token = gr.Text(label="HuggingFace Token", value="",
info="This is only needed the first time you download the model. If you already have models, you don't need to enter. "
- "To download the model, you must manually go to \"https://huggingface.co/pyannote/speaker-diarization-3.1\" and agree to their requirement.")
- dd_diarization_device = gr.Dropdown(label="Device", choices=self.whisper_inf.diarizer.get_available_device(), value=self.whisper_inf.diarizer.get_device())
with gr.Accordion("Insanely Fast Whisper Parameters", open=False,
visible=isinstance(self.whisper_inf, InsanelyFastWhisperInference)):
nb_chunk_length_s = gr.Number(label="Chunk Lengths (sec)", value=30, precision=0)
@@ -220,30 +339,48 @@ class App:
btn_openfolder = gr.Button('📂', scale=1)

params = [tb_youtubelink, dd_file_format, cb_timestamp]
- whisper_params = WhisperParameters(model_size=dd_model,
- lang=dd_lang,
- is_translate=cb_translate,
- beam_size=nb_beam_size,
- log_prob_threshold=nb_log_prob_threshold,
- no_speech_threshold=nb_no_speech_threshold,
- compute_type=dd_compute_type,
- best_of=nb_best_of,
- patience=nb_patience,
- condition_on_previous_text=cb_condition_on_previous_text,
- initial_prompt=tb_initial_prompt,
- temperature=sd_temperature,
- compression_ratio_threshold=nb_compression_ratio_threshold,
- vad_filter=cb_vad_filter,
- threshold=sd_threshold,
- min_speech_duration_ms=nb_min_speech_duration_ms,
- max_speech_duration_s=nb_max_speech_duration_s,
- min_silence_duration_ms=nb_min_silence_duration_ms,
- speech_pad_ms=nb_speech_pad_ms,
- chunk_length_s=nb_chunk_length_s,
- batch_size=nb_batch_size,
- is_diarize=cb_diarize,
- hf_token=tb_hf_token,
- diarization_device=dd_diarization_device)

btn_run.click(fn=self.whisper_inf.transcribe_youtube,
inputs=params + whisper_params.as_list(),
@@ -266,20 +403,67 @@ class App:
cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
with gr.Accordion("Advanced Parameters", open=False):
nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
- nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
- dd_compute_type = gr.Dropdown(label="Compute Type", choices=self.whisper_inf.available_compute_types, value=self.whisper_inf.current_compute_type, interactive=True)
nb_best_of = gr.Number(label="Best Of", value=5, interactive=True)
nb_patience = gr.Number(label="Patience", value=1, interactive=True)
- cb_condition_on_previous_text = gr.Checkbox(label="Condition On Previous Text", value=True, interactive=True)
tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
- sd_temperature = gr.Slider(label="Temperature", value=0, step=0.01, maximum=1.0, interactive=True)
with gr.Accordion("VAD", open=False):
cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
- sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5, info="Lower it to be more sensitive to small sounds.")
- nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
- nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
with gr.Accordion("Diarization", open=False):
cb_diarize = gr.Checkbox(label="Enable Diarization")
@@ -301,30 +485,48 @@ class App:
btn_openfolder = gr.Button('📂', scale=1)

params = [mic_input, dd_file_format]
- whisper_params = WhisperParameters(model_size=dd_model,
- lang=dd_lang,
- is_translate=cb_translate,
- beam_size=nb_beam_size,
- log_prob_threshold=nb_log_prob_threshold,
- no_speech_threshold=nb_no_speech_threshold,
- compute_type=dd_compute_type,
- best_of=nb_best_of,
- patience=nb_patience,
- condition_on_previous_text=cb_condition_on_previous_text,
- initial_prompt=tb_initial_prompt,
- temperature=sd_temperature,
- compression_ratio_threshold=nb_compression_ratio_threshold,
- vad_filter=cb_vad_filter,
- threshold=sd_threshold,
- min_speech_duration_ms=nb_min_speech_duration_ms,
- max_speech_duration_s=nb_max_speech_duration_s,
- min_silence_duration_ms=nb_min_silence_duration_ms,
- speech_pad_ms=nb_speech_pad_ms,
- chunk_length_s=nb_chunk_length_s,
- batch_size=nb_batch_size,
- is_diarize=cb_diarize,
- hf_token=tb_hf_token,
- diarization_device=dd_diarization_device)

btn_run.click(fn=self.whisper_inf.transcribe_mic,
inputs=params + whisper_params.as_list(),
@@ -389,7 +591,8 @@ class App:
md_vram_table = gr.HTML(NLLB_VRAM_TABLE, elem_id="md_nllb_vram_table")

btn_run.click(fn=self.nllb_inf.translate_file,
- inputs=[file_subs, dd_nllb_model, dd_nllb_sourcelang, dd_nllb_targetlang, nb_max_length, cb_timestamp],
outputs=[tb_indicator, files_subtitles])

btn_openfolder.click(fn=lambda: self.open_folder(os.path.join("outputs", "translations")),
@@ -415,7 +618,8 @@ class App:

# Create the parser for command-line arguments
parser = argparse.ArgumentParser()
- parser.add_argument('--whisper_type', type=str, default="faster-whisper", help='A type of the whisper implementation between: ["whisper", "faster-whisper", "insanely-fast-whisper"]')
parser.add_argument('--share', type=bool, default=False, nargs='?', const=True, help='Gradio share value')
parser.add_argument('--server_name', type=str, default=None, help='Gradio server host')
parser.add_argument('--server_port', type=int, default=None, help='Gradio server port')
@@ -425,11 +629,17 @@ parser.add_argument('--password', type=str, default=None, help='Gradio authentic
parser.add_argument('--theme', type=str, default=None, help='Gradio Blocks theme')
parser.add_argument('--colab', type=bool, default=False, nargs='?', const=True, help='Is colab user or not')
parser.add_argument('--api_open', type=bool, default=False, nargs='?', const=True, help='enable api or not')
- parser.add_argument('--whisper_model_dir', type=str, default=os.path.join("models", "Whisper"), help='Directory path of the whisper model')
- parser.add_argument('--faster_whisper_model_dir', type=str, default=os.path.join("models", "Whisper", "faster-whisper"), help='Directory path of the faster-whisper model')
- parser.add_argument('--insanely_fast_whisper_model_dir', type=str, default=os.path.join("models", "Whisper", "insanely-fast-whisper"), help='Directory path of the insanely-fast-whisper model')
- parser.add_argument('--diarization_model_dir', type=str, default=os.path.join("models", "Diarization"), help='Directory path of the diarization model')
- parser.add_argument('--nllb_model_dir', type=str, default=os.path.join("models", "NLLB"), help='Directory path of the Facebook NLLB model')
parser.add_argument('--output_dir', type=str, default=os.path.join("outputs"), help='Directory path of the outputs')
_args = parser.parse_args()
 
New side of the diff (added lines are prefixed with "+"):

@@ -1,5 +1,6 @@
import os
import argparse
+ import gradio as gr

from modules.whisper.whisper_Inference import WhisperInference
from modules.whisper.faster_whisper_inference import FasterWhisperInference
@@ -84,7 +85,7 @@ class App:
with gr.Column():
input_file = gr.Files(type="filepath", label="Upload File here")
tb_input_folder = gr.Textbox(label="Input Folder Path (Optional)",
+ info="Optional: Specify the folder path where the input files are located, if you prefer to use local files instead of uploading them."
" Leave this field empty if you do not wish to use a local path.",
visible=self.args.colab,
value="")
@@ -97,32 +98,83 @@ class App:
with gr.Row():
cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
with gr.Row():
+ cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename",
+ interactive=True)
with gr.Accordion("Advanced Parameters", open=False):
nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
+ nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0,
+ interactive=True)
nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
+ dd_compute_type = gr.Dropdown(label="Compute Type",
+ choices=self.whisper_inf.available_compute_types,
+ value=self.whisper_inf.current_compute_type, interactive=True)
nb_best_of = gr.Number(label="Best Of", value=5, interactive=True)
nb_patience = gr.Number(label="Patience", value=1, interactive=True)
+ cb_condition_on_previous_text = gr.Checkbox(label="Condition On Previous Text", value=True,
+ interactive=True)
tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
+ sd_temperature = gr.Slider(label="Temperature", value=0, step=0.01, maximum=1.0,
+ interactive=True)
+ nb_compression_ratio_threshold = gr.Number(label="Compression Ratio Threshold", value=2.4,
+ interactive=True)
+ with gr.Group(visible=isinstance(self.whisper_inf, FasterWhisperInference)):
+ with gr.Column():
+ nb_length_penalty = gr.Number(label="Length Penalty", value=1,
+ info="Exponential length penalty constant.")
+ nb_repetition_penalty = gr.Number(label="Repetition Penalty", value=1,
+ info="Penalty applied to the score of previously generated tokens (set > 1 to penalize).")
+ nb_no_repeat_ngram_size = gr.Number(label="No Repeat N-gram Size", value=0, precision=0,
+ info="Prevent repetitions of n-grams with this size (set 0 to disable).")
+ tb_prefix = gr.Textbox(label="Prefix", value="",
+ info="Optional text to provide as a prefix for the first window.")
+ cb_suppress_blank = gr.Checkbox(label="Suppress Blank", value=True,
+ info="Suppress blank outputs at the beginning of the sampling.")
+ tb_suppress_tokens = gr.Textbox(label="Suppress Tokens", value="-1",
+ info="List of token IDs to suppress. -1 will suppress a default set of symbols as defined in the model config.json file.")
+ nb_max_initial_timestamp = gr.Number(label="Max Initial Timestamp", value=1.0,
+ info="The initial timestamp cannot be later than this.")
+ cb_word_timestamps = gr.Checkbox(label="Word Timestamps", value=False,
+ info="Extract word-level timestamps using the cross-attention pattern and dynamic time warping, and include the timestamps for each word in each segment.")
+ tb_prepend_punctuations = gr.Textbox(label="Prepend Punctuations", value="\"'“¿([{-",
+ info="If word_timestamps is True, merge these punctuation symbols with the next word.")
+ tb_append_punctuations = gr.Textbox(label="Append Punctuations",
+ value="\"'.。,,!!??::”)]}、",
+ info="If word_timestamps is True, merge these punctuation symbols with the previous word.")
+ nb_max_new_tokens = gr.Number(label="Max New Tokens", value=None, precision=0,
+ info="Maximum number of new tokens to generate per-chunk. If not set, the maximum will be set by the default max_length.")
+ nb_chunk_length = gr.Number(label="Chunk Length", value=None,
+ info="The length of audio segments. If it is not None, it will overwrite the default chunk_length of the FeatureExtractor.")
+ nb_hallucination_silence_threshold = gr.Number(label="Hallucination Silence Threshold",
+ value=None,
+ info="When word_timestamps is True, skip silent periods longer than this threshold (in seconds) when a possible hallucination is detected.")
+ tb_hotwords = gr.Textbox(label="Hotwords", value="",
+ info="Hotwords/hint phrases to provide the model with. Has no effect if prefix is not None.")
+ nb_language_detection_threshold = gr.Number(label="Language Detection Threshold",
+ value=None,
+ info="If the maximum probability of the language tokens is higher than this value, the language is detected.")
+ nb_language_detection_segments = gr.Number(label="Language Detection Segments", value=1,
+ precision=0,
+ info="Number of segments to consider for the language detection.")
with gr.Accordion("VAD", open=False):
cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
+ sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold",
+ value=0.5, info="Lower it to be more sensitive to small sounds.")
+ nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0,
+ value=250)
nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
+ nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0,
+ value=2000)
nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
with gr.Accordion("Diarization", open=False):
cb_diarize = gr.Checkbox(label="Enable Diarization")
tb_hf_token = gr.Text(label="HuggingFace Token", value="",
info="This is only needed the first time you download the model. If you already have models, you don't need to enter. "
+ "To download the model, you must manually go to \"https://huggingface.co/pyannote/speaker-diarization-3.1\" and agree to their requirement.")
+ dd_diarization_device = gr.Dropdown(label="Device",
+ choices=self.whisper_inf.diarizer.get_available_device(),
+ value=self.whisper_inf.diarizer.get_device())
+ with gr.Accordion("Insanely Fast Whisper Parameters", open=False,
+ visible=isinstance(self.whisper_inf, InsanelyFastWhisperInference)):
nb_chunk_length_s = gr.Number(label="Chunk Lengths (sec)", value=30, precision=0)
nb_batch_size = gr.Number(label="Batch Size", value=24, precision=0)
with gr.Row():
@@ -133,30 +185,48 @@ class App:
btn_openfolder = gr.Button('📂', scale=1)

params = [input_file, tb_input_folder, dd_file_format, cb_timestamp]
+ whisper_params = WhisperParameters(
+ model_size=dd_model,
+ lang=dd_lang,
+ is_translate=cb_translate,
+ beam_size=nb_beam_size,
+ log_prob_threshold=nb_log_prob_threshold,
+ no_speech_threshold=nb_no_speech_threshold,
+ compute_type=dd_compute_type,
+ best_of=nb_best_of,
+ patience=nb_patience,
+ condition_on_previous_text=cb_condition_on_previous_text,
+ initial_prompt=tb_initial_prompt,
+ temperature=sd_temperature,
+ compression_ratio_threshold=nb_compression_ratio_threshold,
+ vad_filter=cb_vad_filter,
+ threshold=sd_threshold,
+ min_speech_duration_ms=nb_min_speech_duration_ms,
+ max_speech_duration_s=nb_max_speech_duration_s,
+ min_silence_duration_ms=nb_min_silence_duration_ms,
+ speech_pad_ms=nb_speech_pad_ms,
+ chunk_length_s=nb_chunk_length_s,
+ batch_size=nb_batch_size,
+ is_diarize=cb_diarize,
+ hf_token=tb_hf_token,
+ diarization_device=dd_diarization_device,
+ length_penalty=nb_length_penalty,
+ repetition_penalty=nb_repetition_penalty,
+ no_repeat_ngram_size=nb_no_repeat_ngram_size,
+ prefix=tb_prefix,
+ suppress_blank=cb_suppress_blank,
+ suppress_tokens=tb_suppress_tokens,
+ max_initial_timestamp=nb_max_initial_timestamp,
+ word_timestamps=cb_word_timestamps,
+ prepend_punctuations=tb_prepend_punctuations,
+ append_punctuations=tb_append_punctuations,
+ max_new_tokens=nb_max_new_tokens,
+ chunk_length=nb_chunk_length,
+ hallucination_silence_threshold=nb_hallucination_silence_threshold,
+ hotwords=tb_hotwords,
+ language_detection_threshold=nb_language_detection_threshold,
+ language_detection_segments=nb_language_detection_segments
+ )

btn_run.click(fn=self.whisper_inf.transcribe_file,
inputs=params + whisper_params.as_list(),
@@ -186,28 +256,77 @@ class App:
interactive=True)
with gr.Accordion("Advanced Parameters", open=False):
nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
+ nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0,
+ interactive=True)
nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
+ dd_compute_type = gr.Dropdown(label="Compute Type",
+ choices=self.whisper_inf.available_compute_types,
+ value=self.whisper_inf.current_compute_type, interactive=True)
nb_best_of = gr.Number(label="Best Of", value=5, interactive=True)
nb_patience = gr.Number(label="Patience", value=1, interactive=True)
+ cb_condition_on_previous_text = gr.Checkbox(label="Condition On Previous Text", value=True,
+ interactive=True)
tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
+ sd_temperature = gr.Slider(label="Temperature", value=0, step=0.01, maximum=1.0,
+ interactive=True)
+ nb_compression_ratio_threshold = gr.Number(label="Compression Ratio Threshold", value=2.4,
+ interactive=True)
+ with gr.Group(visible=isinstance(self.whisper_inf, FasterWhisperInference)):
+ with gr.Column():
+ nb_length_penalty = gr.Number(label="Length Penalty", value=1,
+ info="Exponential length penalty constant.")
+ nb_repetition_penalty = gr.Number(label="Repetition Penalty", value=1,
+ info="Penalty applied to the score of previously generated tokens (set > 1 to penalize).")
+ nb_no_repeat_ngram_size = gr.Number(label="No Repeat N-gram Size", value=0, precision=0,
+ info="Prevent repetitions of n-grams with this size (set 0 to disable).")
+ tb_prefix = gr.Textbox(label="Prefix", value="",
+ info="Optional text to provide as a prefix for the first window.")
+ cb_suppress_blank = gr.Checkbox(label="Suppress Blank", value=True,
+ info="Suppress blank outputs at the beginning of the sampling.")
+ tb_suppress_tokens = gr.Textbox(label="Suppress Tokens", value="-1",
+ info="List of token IDs to suppress. -1 will suppress a default set of symbols as defined in the model config.json file.")
+ nb_max_initial_timestamp = gr.Number(label="Max Initial Timestamp", value=1.0,
+ info="The initial timestamp cannot be later than this.")
+ cb_word_timestamps = gr.Checkbox(label="Word Timestamps", value=False,
+ info="Extract word-level timestamps using the cross-attention pattern and dynamic time warping, and include the timestamps for each word in each segment.")
+ tb_prepend_punctuations = gr.Textbox(label="Prepend Punctuations", value="\"'“¿([{-",
+ info="If word_timestamps is True, merge these punctuation symbols with the next word.")
+ tb_append_punctuations = gr.Textbox(label="Append Punctuations",
+ value="\"'.。,,!!??::”)]}、",
+ info="If word_timestamps is True, merge these punctuation symbols with the previous word.")
+ nb_max_new_tokens = gr.Number(label="Max New Tokens", value=None, precision=0,
+ info="Maximum number of new tokens to generate per-chunk. If not set, the maximum will be set by the default max_length.")
+ nb_chunk_length = gr.Number(label="Chunk Length", value=None,
+ info="The length of audio segments. If it is not None, it will overwrite the default chunk_length of the FeatureExtractor.")
+ nb_hallucination_silence_threshold = gr.Number(label="Hallucination Silence Threshold",
+ value=None,
+ info="When word_timestamps is True, skip silent periods longer than this threshold (in seconds) when a possible hallucination is detected.")
+ tb_hotwords = gr.Textbox(label="Hotwords", value="",
+ info="Hotwords/hint phrases to provide the model with. Has no effect if prefix is not None.")
+ nb_language_detection_threshold = gr.Number(label="Language Detection Threshold",
+ value=None,
+ info="If the maximum probability of the language tokens is higher than this value, the language is detected.")
+ nb_language_detection_segments = gr.Number(label="Language Detection Segments", value=1,
+ precision=0,
+ info="Number of segments to consider for the language detection.")
with gr.Accordion("VAD", open=False):
cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
+ sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold",
+ value=0.5, info="Lower it to be more sensitive to small sounds.")
+ nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0,
+ value=250)
nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
+ nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0,
+ value=2000)
nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
with gr.Accordion("Diarization", open=False):
cb_diarize = gr.Checkbox(label="Enable Diarization")
tb_hf_token = gr.Text(label="HuggingFace Token", value="",
info="This is only needed the first time you download the model. If you already have models, you don't need to enter. "
+ "To download the model, you must manually go to \"https://huggingface.co/pyannote/speaker-diarization-3.1\" and agree to their requirement.")
+ dd_diarization_device = gr.Dropdown(label="Device",
+ choices=self.whisper_inf.diarizer.get_available_device(),
+ value=self.whisper_inf.diarizer.get_device())
with gr.Accordion("Insanely Fast Whisper Parameters", open=False,
visible=isinstance(self.whisper_inf, InsanelyFastWhisperInference)):
nb_chunk_length_s = gr.Number(label="Chunk Lengths (sec)", value=30, precision=0)
@@ -220,30 +339,48 @@ class App:
btn_openfolder = gr.Button('📂', scale=1)

params = [tb_youtubelink, dd_file_format, cb_timestamp]
+ whisper_params = WhisperParameters(
+ model_size=dd_model,
+ lang=dd_lang,
+ is_translate=cb_translate,
+ beam_size=nb_beam_size,
+ log_prob_threshold=nb_log_prob_threshold,
+ no_speech_threshold=nb_no_speech_threshold,
+ compute_type=dd_compute_type,
+ best_of=nb_best_of,
+ patience=nb_patience,
+ condition_on_previous_text=cb_condition_on_previous_text,
+ initial_prompt=tb_initial_prompt,
+ temperature=sd_temperature,
+ compression_ratio_threshold=nb_compression_ratio_threshold,
+ vad_filter=cb_vad_filter,
+ threshold=sd_threshold,
+ min_speech_duration_ms=nb_min_speech_duration_ms,
+ max_speech_duration_s=nb_max_speech_duration_s,
+ min_silence_duration_ms=nb_min_silence_duration_ms,
+ speech_pad_ms=nb_speech_pad_ms,
+ chunk_length_s=nb_chunk_length_s,
+ batch_size=nb_batch_size,
+ is_diarize=cb_diarize,
+ hf_token=tb_hf_token,
+ diarization_device=dd_diarization_device,
+ length_penalty=nb_length_penalty,
+ repetition_penalty=nb_repetition_penalty,
+ no_repeat_ngram_size=nb_no_repeat_ngram_size,
+ prefix=tb_prefix,
+ suppress_blank=cb_suppress_blank,
+ suppress_tokens=tb_suppress_tokens,
+ max_initial_timestamp=nb_max_initial_timestamp,
+ word_timestamps=cb_word_timestamps,
+ prepend_punctuations=tb_prepend_punctuations,
+ append_punctuations=tb_append_punctuations,
+ max_new_tokens=nb_max_new_tokens,
+ chunk_length=nb_chunk_length,
+ hallucination_silence_threshold=nb_hallucination_silence_threshold,
+ hotwords=tb_hotwords,
+ language_detection_threshold=nb_language_detection_threshold,
+ language_detection_segments=nb_language_detection_segments
+ )

btn_run.click(fn=self.whisper_inf.transcribe_youtube,
inputs=params + whisper_params.as_list(),
@@ -266,20 +403,67 @@ class App:
cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
with gr.Accordion("Advanced Parameters", open=False):
nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
+ nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0,
+ interactive=True)
nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
+ dd_compute_type = gr.Dropdown(label="Compute Type",
+ choices=self.whisper_inf.available_compute_types,
+ value=self.whisper_inf.current_compute_type, interactive=True)
nb_best_of = gr.Number(label="Best Of", value=5, interactive=True)
nb_patience = gr.Number(label="Patience", value=1, interactive=True)
+ cb_condition_on_previous_text = gr.Checkbox(label="Condition On Previous Text", value=True,
+ interactive=True)
tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
+ sd_temperature = gr.Slider(label="Temperature", value=0, step=0.01, maximum=1.0,
+ interactive=True)
+
+ with gr.Group(visible=isinstance(self.whisper_inf, FasterWhisperInference)):
+ with gr.Column():
+ nb_length_penalty = gr.Number(label="Length Penalty", value=1,
+ info="Exponential length penalty constant.")
+ nb_repetition_penalty = gr.Number(label="Repetition Penalty", value=1,
+ info="Penalty applied to the score of previously generated tokens (set > 1 to penalize).")
+ nb_no_repeat_ngram_size = gr.Number(label="No Repeat N-gram Size", value=0, precision=0,
+ info="Prevent repetitions of n-grams with this size (set 0 to disable).")
+ tb_prefix = gr.Textbox(label="Prefix", value="",
+ info="Optional text to provide as a prefix for the first window.")
+ cb_suppress_blank = gr.Checkbox(label="Suppress Blank", value=True,
+ info="Suppress blank outputs at the beginning of the sampling.")
+ tb_suppress_tokens = gr.Textbox(label="Suppress Tokens", value="-1",
+ info="List of token IDs to suppress. -1 will suppress a default set of symbols as defined in the model config.json file.")
+ nb_max_initial_timestamp = gr.Number(label="Max Initial Timestamp", value=1.0,
+ info="The initial timestamp cannot be later than this.")
+ cb_word_timestamps = gr.Checkbox(label="Word Timestamps", value=False,
+ info="Extract word-level timestamps using the cross-attention pattern and dynamic time warping, and include the timestamps for each word in each segment.")
+ tb_prepend_punctuations = gr.Textbox(label="Prepend Punctuations", value="\"'“¿([{-",
+ info="If word_timestamps is True, merge these punctuation symbols with the next word.")
+ tb_append_punctuations = gr.Textbox(label="Append Punctuations",
+ value="\"'.。,,!!??::”)]}、",
+ info="If word_timestamps is True, merge these punctuation symbols with the previous word.")
+ nb_max_new_tokens = gr.Number(label="Max New Tokens", value=None, precision=0,
+ info="Maximum number of new tokens to generate per-chunk. If not set, the maximum will be set by the default max_length.")
+ nb_chunk_length = gr.Number(label="Chunk Length", value=None,
+ info="The length of audio segments. If it is not None, it will overwrite the default chunk_length of the FeatureExtractor.")
+ nb_hallucination_silence_threshold = gr.Number(label="Hallucination Silence Threshold",
+ value=None,
+ info="When word_timestamps is True, skip silent periods longer than this threshold (in seconds) when a possible hallucination is detected.")
+ tb_hotwords = gr.Textbox(label="Hotwords", value="",
+ info="Hotwords/hint phrases to provide the model with. Has no effect if prefix is not None.")
+ nb_language_detection_threshold = gr.Number(label="Language Detection Threshold",
+ value=None,
+ info="If the maximum probability of the language tokens is higher than this value, the language is detected.")
+ nb_language_detection_segments = gr.Number(label="Language Detection Segments", value=1,
+ precision=0,
+ info="Number of segments to consider for the language detection.")
with gr.Accordion("VAD", open=False):
cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
+ sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold",
+ value=0.5, info="Lower it to be more sensitive to small sounds.")
+ nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0,
+ value=250)
nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
+ nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0,
+ value=2000)
nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
with gr.Accordion("Diarization", open=False):
cb_diarize = gr.Checkbox(label="Enable Diarization")
@@ -301,30 +485,48 @@ class App:
btn_openfolder = gr.Button('📂', scale=1)

params = [mic_input, dd_file_format]
+ whisper_params = WhisperParameters(
+ model_size=dd_model,
+ lang=dd_lang,
+ is_translate=cb_translate,
+ beam_size=nb_beam_size,
+ log_prob_threshold=nb_log_prob_threshold,
+ no_speech_threshold=nb_no_speech_threshold,
+ compute_type=dd_compute_type,
+ best_of=nb_best_of,
+ patience=nb_patience,
+ condition_on_previous_text=cb_condition_on_previous_text,
+ initial_prompt=tb_initial_prompt,
+ temperature=sd_temperature,
+ compression_ratio_threshold=nb_compression_ratio_threshold,
+ vad_filter=cb_vad_filter,
+ threshold=sd_threshold,
+ min_speech_duration_ms=nb_min_speech_duration_ms,
+ max_speech_duration_s=nb_max_speech_duration_s,
+ min_silence_duration_ms=nb_min_silence_duration_ms,
+ speech_pad_ms=nb_speech_pad_ms,
+ chunk_length_s=nb_chunk_length_s,
+ batch_size=nb_batch_size,
+ is_diarize=cb_diarize,
+ hf_token=tb_hf_token,
+ diarization_device=dd_diarization_device,
+ length_penalty=nb_length_penalty,
+ repetition_penalty=nb_repetition_penalty,
+ no_repeat_ngram_size=nb_no_repeat_ngram_size,
+ prefix=tb_prefix,
+ suppress_blank=cb_suppress_blank,
+ suppress_tokens=tb_suppress_tokens,
+ max_initial_timestamp=nb_max_initial_timestamp,
+ word_timestamps=cb_word_timestamps,
+ prepend_punctuations=tb_prepend_punctuations,
+ append_punctuations=tb_append_punctuations,
+ max_new_tokens=nb_max_new_tokens,
+ chunk_length=nb_chunk_length,
+ hallucination_silence_threshold=nb_hallucination_silence_threshold,
+ hotwords=tb_hotwords,
+ language_detection_threshold=nb_language_detection_threshold,
+ language_detection_segments=nb_language_detection_segments
+ )

btn_run.click(fn=self.whisper_inf.transcribe_mic,
inputs=params + whisper_params.as_list(),
@@ -389,7 +591,8 @@ class App:
md_vram_table = gr.HTML(NLLB_VRAM_TABLE, elem_id="md_nllb_vram_table")

btn_run.click(fn=self.nllb_inf.translate_file,
+ inputs=[file_subs, dd_nllb_model, dd_nllb_sourcelang, dd_nllb_targetlang,
+ nb_max_length, cb_timestamp],
outputs=[tb_indicator, files_subtitles])

btn_openfolder.click(fn=lambda: self.open_folder(os.path.join("outputs", "translations")),
@@ -415,7 +618,8 @@ class App:

# Create the parser for command-line arguments
parser = argparse.ArgumentParser()
+ parser.add_argument('--whisper_type', type=str, default="faster-whisper",
+ help='A type of the whisper implementation between: ["whisper", "faster-whisper", "insanely-fast-whisper"]')
parser.add_argument('--share', type=bool, default=False, nargs='?', const=True, help='Gradio share value')
parser.add_argument('--server_name', type=str, default=None, help='Gradio server host')
parser.add_argument('--server_port', type=int, default=None, help='Gradio server port')
@@ -425,11 +629,17 @@ parser.add_argument('--password', type=str, default=None, help='Gradio authentic
parser.add_argument('--theme', type=str, default=None, help='Gradio Blocks theme')
parser.add_argument('--colab', type=bool, default=False, nargs='?', const=True, help='Is colab user or not')
parser.add_argument('--api_open', type=bool, default=False, nargs='?', const=True, help='enable api or not')
+ parser.add_argument('--whisper_model_dir', type=str, default=os.path.join("models", "Whisper"),
+ help='Directory path of the whisper model')
+ parser.add_argument('--faster_whisper_model_dir', type=str, default=os.path.join("models", "Whisper", "faster-whisper"),
+ help='Directory path of the faster-whisper model')
+ parser.add_argument('--insanely_fast_whisper_model_dir', type=str,
+ default=os.path.join("models", "Whisper", "insanely-fast-whisper"),
+ help='Directory path of the insanely-fast-whisper model')
+ parser.add_argument('--diarization_model_dir', type=str, default=os.path.join("models", "Diarization"),
+ help='Directory path of the diarization model')
+ parser.add_argument('--nllb_model_dir', type=str, default=os.path.join("models", "NLLB"),
+ help='Directory path of the Facebook NLLB model')
parser.add_argument('--output_dir', type=str, default=os.path.join("outputs"), help='Directory path of the outputs')
_args = parser.parse_args()
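
For context, the wiring pattern used throughout this commit (Gradio input components collected into a params list and passed as the inputs of btn_run.click) can be reproduced in a minimal standalone script. The sketch below is illustrative only and is not part of this commit: transcribe_stub is a hypothetical placeholder for the real handler (self.whisper_inf.transcribe_file), and only the gradio package is assumed to be installed.

import gradio as gr

def transcribe_stub(beam_size, log_prob_threshold, word_timestamps):
    # Hypothetical stand-in for the real transcription handler; it just echoes
    # the values received from the Gradio components, in input order.
    return f"beam_size={beam_size}, log_prob_threshold={log_prob_threshold}, word_timestamps={word_timestamps}"

with gr.Blocks() as demo:
    # A few of the components added in this commit, wired the same way app.py wires them.
    nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
    nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
    cb_word_timestamps = gr.Checkbox(label="Word Timestamps", value=False)
    btn_run = gr.Button("Run")
    tb_indicator = gr.Textbox(label="Output")

    # Components are gathered into a list and passed as the click handler's inputs.
    params = [nb_beam_size, nb_log_prob_threshold, cb_word_timestamps]
    btn_run.click(fn=transcribe_stub, inputs=params, outputs=tb_indicator)

if __name__ == "__main__":
    demo.launch()

Launching the real application instead goes through app.py with the command-line flags defined in the argparse block above, for example passing --whisper_type faster-whisper to select the faster-whisper backend.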