jhj0517 committed on
Commit
c14cab5
·
1 Parent(s): 31fd6fe

refactor parameters to function

Browse files
Files changed (1) hide show
  1. app.py +115 -393
app.py CHANGED
@@ -60,6 +60,112 @@ class App:
60
  )
61
  return whisper_inf
62
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  def launch(self):
64
  with self.app:
65
  with gr.Row():
@@ -74,94 +180,9 @@ class App:
74
  " Leave this field empty if you do not wish to use a local path.",
75
  visible=self.args.colab,
76
  value="")
77
- with gr.Row():
78
- dd_model = gr.Dropdown(choices=self.whisper_inf.available_models, value="large-v2",
79
- label="Model")
80
- dd_lang = gr.Dropdown(choices=["Automatic Detection"] + self.whisper_inf.available_langs,
81
- value="Automatic Detection", label="Language")
82
- dd_file_format = gr.Dropdown(["SRT", "WebVTT", "txt"], value="SRT", label="File Format")
83
- with gr.Row():
84
- cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
85
- with gr.Row():
86
- cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename",
87
- interactive=True)
88
- with gr.Accordion("Advanced Parameters", open=False):
89
- nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
90
- nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0,
91
- interactive=True)
92
- nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
93
- dd_compute_type = gr.Dropdown(label="Compute Type",
94
- choices=self.whisper_inf.available_compute_types,
95
- value=self.whisper_inf.current_compute_type, interactive=True)
96
- nb_best_of = gr.Number(label="Best Of", value=5, interactive=True)
97
- nb_patience = gr.Number(label="Patience", value=1, interactive=True)
98
- cb_condition_on_previous_text = gr.Checkbox(label="Condition On Previous Text", value=True,
99
- interactive=True)
100
- tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
101
- sd_temperature = gr.Slider(label="Temperature", value=0, step=0.01, maximum=1.0,
102
- interactive=True)
103
- nb_compression_ratio_threshold = gr.Number(label="Compression Ratio Threshold", value=2.4,
104
- interactive=True)
105
- with gr.Group(visible=isinstance(self.whisper_inf, FasterWhisperInference)):
106
- with gr.Column():
107
- nb_length_penalty = gr.Number(label="Length Penalty", value=1,
108
- info="Exponential length penalty constant.")
109
- nb_repetition_penalty = gr.Number(label="Repetition Penalty", value=1,
110
- info="Penalty applied to the score of previously generated tokens (set > 1 to penalize).")
111
- nb_no_repeat_ngram_size = gr.Number(label="No Repeat N-gram Size", value=0, precision=0,
112
- info="Prevent repetitions of n-grams with this size (set 0 to disable).")
113
- tb_prefix = gr.Textbox(label="Prefix", value=lambda: None, # Bug Fix https://github.com/gradio-app/gradio/issues/6728
114
- info="Optional text to provide as a prefix for the first window.")
115
- cb_suppress_blank = gr.Checkbox(label="Suppress Blank", value=True,
116
- info="Suppress blank outputs at the beginning of the sampling.")
117
- tb_suppress_tokens = gr.Textbox(label="Suppress Tokens", value="[-1]",
118
- info="List of token IDs to suppress. -1 will suppress a default set of symbols as defined in the model config.json file.")
119
- nb_max_initial_timestamp = gr.Number(label="Max Initial Timestamp", value=1.0,
120
- info="The initial timestamp cannot be later than this.")
121
- cb_word_timestamps = gr.Checkbox(label="Word Timestamps", value=False,
122
- info="Extract word-level timestamps using the cross-attention pattern and dynamic time warping, and include the timestamps for each word in each segment.")
123
- tb_prepend_punctuations = gr.Textbox(label="Prepend Punctuations", value="\"'“¿([{-",
124
- info="If 'Word Timestamps' is True, merge these punctuation symbols with the next word.")
125
- tb_append_punctuations = gr.Textbox(label="Append Punctuations",
126
- value="\"'.。,,!!??::”)]}、",
127
- info="If 'Word Timestamps' is True, merge these punctuation symbols with the previous word.")
128
- nb_max_new_tokens = gr.Number(label="Max New Tokens", value=lambda: None, precision=0,
129
- info="Maximum number of new tokens to generate per-chunk. If not set, the maximum will be set by the default max_length.")
130
- nb_chunk_length = gr.Number(label="Chunk Length", value=lambda: None, precision=0,
131
- info="The length of audio segments. If it is not None, it will overwrite the default chunk_length of the FeatureExtractor.")
132
- nb_hallucination_silence_threshold = gr.Number(label="Hallucination Silence Threshold (sec)",
133
- value=lambda: None,
134
- info="When 'Word Timestamps' is True, skip silent periods longer than this threshold (in seconds) when a possible hallucination is detected.")
135
- tb_hotwords = gr.Textbox(label="Hotwords", value=None,
136
- info="Hotwords/hint phrases to provide the model with. Has no effect if prefix is not None.")
137
- nb_language_detection_threshold = gr.Number(label="Language Detection Threshold",
138
- value=None,
139
- info="If the maximum probability of the language tokens is higher than this value, the language is detected.")
140
- nb_language_detection_segments = gr.Number(label="Language Detection Segments", value=1,
141
- precision=0,
142
- info="Number of segments to consider for the language detection.")
143
-
144
- with gr.Group(visible=isinstance(self.whisper_inf, InsanelyFastWhisperInference)):
145
- nb_chunk_length_s = gr.Number(label="Chunk Lengths (sec)", value=30, precision=0)
146
- nb_batch_size = gr.Number(label="Batch Size", value=24, precision=0)
147
- with gr.Accordion("VAD", open=False):
148
- cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
149
- sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold",
150
- value=0.5, info="Lower it to be more sensitive to small sounds.")
151
- nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0,
152
- value=250)
153
- nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
154
- nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0,
155
- value=2000)
156
- nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
157
- with gr.Accordion("Diarization", open=False):
158
- cb_diarize = gr.Checkbox(label="Enable Diarization")
159
- tb_hf_token = gr.Text(label="HuggingFace Token", value="",
160
- info="This is only needed the first time you download the model. If you already have models, you don't need to enter. "
161
- "To download the model, you must manually go to \"https://huggingface.co/pyannote/speaker-diarization-3.1\" and agree to their requirement.")
162
- dd_diarization_device = gr.Dropdown(label="Device",
163
- choices=self.whisper_inf.diarizer.get_available_device(),
164
- value=self.whisper_inf.diarizer.get_device())
165
  with gr.Row():
166
  btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
167
  with gr.Row():
@@ -170,54 +191,10 @@ class App:
170
  btn_openfolder = gr.Button('📂', scale=1)
171
 
172
  params = [input_file, tb_input_folder, dd_file_format, cb_timestamp]
173
- whisper_params = WhisperParameters(
174
- model_size=dd_model,
175
- lang=dd_lang,
176
- is_translate=cb_translate,
177
- beam_size=nb_beam_size,
178
- log_prob_threshold=nb_log_prob_threshold,
179
- no_speech_threshold=nb_no_speech_threshold,
180
- compute_type=dd_compute_type,
181
- best_of=nb_best_of,
182
- patience=nb_patience,
183
- condition_on_previous_text=cb_condition_on_previous_text,
184
- initial_prompt=tb_initial_prompt,
185
- temperature=sd_temperature,
186
- compression_ratio_threshold=nb_compression_ratio_threshold,
187
- vad_filter=cb_vad_filter,
188
- threshold=sd_threshold,
189
- min_speech_duration_ms=nb_min_speech_duration_ms,
190
- max_speech_duration_s=nb_max_speech_duration_s,
191
- min_silence_duration_ms=nb_min_silence_duration_ms,
192
- speech_pad_ms=nb_speech_pad_ms,
193
- chunk_length_s=nb_chunk_length_s,
194
- batch_size=nb_batch_size,
195
- is_diarize=cb_diarize,
196
- hf_token=tb_hf_token,
197
- diarization_device=dd_diarization_device,
198
- length_penalty=nb_length_penalty,
199
- repetition_penalty=nb_repetition_penalty,
200
- no_repeat_ngram_size=nb_no_repeat_ngram_size,
201
- prefix=tb_prefix,
202
- suppress_blank=cb_suppress_blank,
203
- suppress_tokens=tb_suppress_tokens,
204
- max_initial_timestamp=nb_max_initial_timestamp,
205
- word_timestamps=cb_word_timestamps,
206
- prepend_punctuations=tb_prepend_punctuations,
207
- append_punctuations=tb_append_punctuations,
208
- max_new_tokens=nb_max_new_tokens,
209
- chunk_length=nb_chunk_length,
210
- hallucination_silence_threshold=nb_hallucination_silence_threshold,
211
- hotwords=tb_hotwords,
212
- language_detection_threshold=nb_language_detection_threshold,
213
- language_detection_segments=nb_language_detection_segments
214
- )
215
-
216
  btn_run.click(fn=self.whisper_inf.transcribe_file,
217
  inputs=params + whisper_params.as_list(),
218
  outputs=[tb_indicator, files_subtitles])
219
  btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
220
- dd_model.change(fn=self.on_change_models, inputs=[dd_model], outputs=[cb_translate])
221
 
222
  with gr.TabItem("Youtube"): # tab2
223
  with gr.Row():
@@ -228,95 +205,9 @@ class App:
228
  with gr.Column():
229
  tb_title = gr.Label(label="Youtube Title")
230
  tb_description = gr.Textbox(label="Youtube Description", max_lines=15)
231
- with gr.Row():
232
- dd_model = gr.Dropdown(choices=self.whisper_inf.available_models, value="large-v2",
233
- label="Model")
234
- dd_lang = gr.Dropdown(choices=["Automatic Detection"] + self.whisper_inf.available_langs,
235
- value="Automatic Detection", label="Language")
236
- dd_file_format = gr.Dropdown(choices=["SRT", "WebVTT", "txt"], value="SRT", label="File Format")
237
- with gr.Row():
238
- cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
239
- with gr.Row():
240
- cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename",
241
- interactive=True)
242
- with gr.Accordion("Advanced Parameters", open=False):
243
- nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
244
- nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0,
245
- interactive=True)
246
- nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
247
- dd_compute_type = gr.Dropdown(label="Compute Type",
248
- choices=self.whisper_inf.available_compute_types,
249
- value=self.whisper_inf.current_compute_type, interactive=True)
250
- nb_best_of = gr.Number(label="Best Of", value=5, interactive=True)
251
- nb_patience = gr.Number(label="Patience", value=1, interactive=True)
252
- cb_condition_on_previous_text = gr.Checkbox(label="Condition On Previous Text", value=True,
253
- interactive=True)
254
- tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
255
- sd_temperature = gr.Slider(label="Temperature", value=0, step=0.01, maximum=1.0,
256
- interactive=True)
257
- nb_compression_ratio_threshold = gr.Number(label="Compression Ratio Threshold", value=2.4,
258
- interactive=True)
259
- with gr.Group(visible=isinstance(self.whisper_inf, FasterWhisperInference)):
260
- with gr.Column():
261
- nb_length_penalty = gr.Number(label="Length Penalty", value=1,
262
- info="Exponential length penalty constant.")
263
- nb_repetition_penalty = gr.Number(label="Repetition Penalty", value=1,
264
- info="Penalty applied to the score of previously generated tokens (set > 1 to penalize).")
265
- nb_no_repeat_ngram_size = gr.Number(label="No Repeat N-gram Size", value=0, precision=0,
266
- info="Prevent repetitions of n-grams with this size (set 0 to disable).")
267
- tb_prefix = gr.Textbox(label="Prefix", value=lambda: None, # Bug Fix https://github.com/gradio-app/gradio/issues/6728
268
- info="Optional text to provide as a prefix for the first window.")
269
- cb_suppress_blank = gr.Checkbox(label="Suppress Blank", value=True,
270
- info="Suppress blank outputs at the beginning of the sampling.")
271
- tb_suppress_tokens = gr.Textbox(label="Suppress Tokens", value="[-1]",
272
- info="List of token IDs to suppress. -1 will suppress a default set of symbols as defined in the model config.json file.")
273
- nb_max_initial_timestamp = gr.Number(label="Max Initial Timestamp", value=1.0,
274
- info="The initial timestamp cannot be later than this.")
275
- cb_word_timestamps = gr.Checkbox(label="Word Timestamps", value=False,
276
- info="Extract word-level timestamps using the cross-attention pattern and dynamic time warping, and include the timestamps for each word in each segment.")
277
- tb_prepend_punctuations = gr.Textbox(label="Prepend Punctuations", value="\"'“¿([{-",
278
- info="If 'Word Timestamps' is True, merge these punctuation symbols with the next word.")
279
- tb_append_punctuations = gr.Textbox(label="Append Punctuations",
280
- value="\"'.。,,!!??::”)]}、",
281
- info="If 'Word Timestamps' is True, merge these punctuation symbols with the previous word.")
282
- nb_max_new_tokens = gr.Number(label="Max New Tokens", value=lambda: None, precision=0,
283
- info="Maximum number of new tokens to generate per-chunk. If not set, the maximum will be set by the default max_length.")
284
- nb_chunk_length = gr.Number(label="Chunk Length", value=lambda: None, precision=0,
285
- info="The length of audio segments. If it is not None, it will overwrite the default chunk_length of the FeatureExtractor.")
286
- nb_hallucination_silence_threshold = gr.Number(
287
- label="Hallucination Silence Threshold (sec)",
288
- value=lambda: None,
289
- info="When 'Word Timestamps' is True, skip silent periods longer than this threshold (in seconds) when a possible hallucination is detected.")
290
- tb_hotwords = gr.Textbox(label="Hotwords", value=None,
291
- info="Hotwords/hint phrases to provide the model with. Has no effect if prefix is not None.")
292
- nb_language_detection_threshold = gr.Number(label="Language Detection Threshold",
293
- value=None,
294
- info="If the maximum probability of the language tokens is higher than this value, the language is detected.")
295
- nb_language_detection_segments = gr.Number(label="Language Detection Segments", value=1,
296
- precision=0,
297
- info="Number of segments to consider for the language detection.")
298
-
299
- with gr.Group(visible=isinstance(self.whisper_inf, InsanelyFastWhisperInference)):
300
- nb_chunk_length_s = gr.Number(label="Chunk Lengths (sec)", value=30, precision=0)
301
- nb_batch_size = gr.Number(label="Batch Size", value=24, precision=0)
302
- with gr.Accordion("VAD", open=False):
303
- cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
304
- sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold",
305
- value=0.5, info="Lower it to be more sensitive to small sounds.")
306
- nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0,
307
- value=250)
308
- nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
309
- nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0,
310
- value=2000)
311
- nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
312
- with gr.Accordion("Diarization", open=False):
313
- cb_diarize = gr.Checkbox(label="Enable Diarization")
314
- tb_hf_token = gr.Text(label="HuggingFace Token", value="",
315
- info="This is only needed the first time you download the model. If you already have models, you don't need to enter. "
316
- "To download the model, you must manually go to \"https://huggingface.co/pyannote/speaker-diarization-3.1\" and agree to their requirement.")
317
- dd_diarization_device = gr.Dropdown(label="Device",
318
- choices=self.whisper_inf.diarizer.get_available_device(),
319
- value=self.whisper_inf.diarizer.get_device())
320
  with gr.Row():
321
  btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
322
  with gr.Row():
@@ -325,48 +216,6 @@ class App:
325
  btn_openfolder = gr.Button('📂', scale=1)
326
 
327
  params = [tb_youtubelink, dd_file_format, cb_timestamp]
328
- whisper_params = WhisperParameters(
329
- model_size=dd_model,
330
- lang=dd_lang,
331
- is_translate=cb_translate,
332
- beam_size=nb_beam_size,
333
- log_prob_threshold=nb_log_prob_threshold,
334
- no_speech_threshold=nb_no_speech_threshold,
335
- compute_type=dd_compute_type,
336
- best_of=nb_best_of,
337
- patience=nb_patience,
338
- condition_on_previous_text=cb_condition_on_previous_text,
339
- initial_prompt=tb_initial_prompt,
340
- temperature=sd_temperature,
341
- compression_ratio_threshold=nb_compression_ratio_threshold,
342
- vad_filter=cb_vad_filter,
343
- threshold=sd_threshold,
344
- min_speech_duration_ms=nb_min_speech_duration_ms,
345
- max_speech_duration_s=nb_max_speech_duration_s,
346
- min_silence_duration_ms=nb_min_silence_duration_ms,
347
- speech_pad_ms=nb_speech_pad_ms,
348
- chunk_length_s=nb_chunk_length_s,
349
- batch_size=nb_batch_size,
350
- is_diarize=cb_diarize,
351
- hf_token=tb_hf_token,
352
- diarization_device=dd_diarization_device,
353
- length_penalty=nb_length_penalty,
354
- repetition_penalty=nb_repetition_penalty,
355
- no_repeat_ngram_size=nb_no_repeat_ngram_size,
356
- prefix=tb_prefix,
357
- suppress_blank=cb_suppress_blank,
358
- suppress_tokens=tb_suppress_tokens,
359
- max_initial_timestamp=nb_max_initial_timestamp,
360
- word_timestamps=cb_word_timestamps,
361
- prepend_punctuations=tb_prepend_punctuations,
362
- append_punctuations=tb_append_punctuations,
363
- max_new_tokens=nb_max_new_tokens,
364
- chunk_length=nb_chunk_length,
365
- hallucination_silence_threshold=nb_hallucination_silence_threshold,
366
- hotwords=tb_hotwords,
367
- language_detection_threshold=nb_language_detection_threshold,
368
- language_detection_segments=nb_language_detection_segments
369
- )
370
 
371
  btn_run.click(fn=self.whisper_inf.transcribe_youtube,
372
  inputs=params + whisper_params.as_list(),
@@ -374,97 +223,13 @@ class App:
374
  tb_youtubelink.change(get_ytmetas, inputs=[tb_youtubelink],
375
  outputs=[img_thumbnail, tb_title, tb_description])
376
  btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
377
- dd_model.change(fn=self.on_change_models, inputs=[dd_model], outputs=[cb_translate])
378
 
379
  with gr.TabItem("Mic"): # tab3
380
  with gr.Row():
381
  mic_input = gr.Microphone(label="Record with Mic", type="filepath", interactive=True)
382
- with gr.Row():
383
- dd_model = gr.Dropdown(choices=self.whisper_inf.available_models, value="large-v2",
384
- label="Model")
385
- dd_lang = gr.Dropdown(choices=["Automatic Detection"] + self.whisper_inf.available_langs,
386
- value="Automatic Detection", label="Language")
387
- dd_file_format = gr.Dropdown(["SRT", "WebVTT", "txt"], value="SRT", label="File Format")
388
- with gr.Row():
389
- cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
390
- with gr.Accordion("Advanced Parameters", open=False):
391
- nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
392
- nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0,
393
- interactive=True)
394
- nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
395
- dd_compute_type = gr.Dropdown(label="Compute Type",
396
- choices=self.whisper_inf.available_compute_types,
397
- value=self.whisper_inf.current_compute_type, interactive=True)
398
- nb_best_of = gr.Number(label="Best Of", value=5, interactive=True)
399
- nb_patience = gr.Number(label="Patience", value=1, interactive=True)
400
- cb_condition_on_previous_text = gr.Checkbox(label="Condition On Previous Text", value=True,
401
- interactive=True)
402
- tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
403
- sd_temperature = gr.Slider(label="Temperature", value=0, step=0.01, maximum=1.0,
404
- interactive=True)
405
-
406
- with gr.Group(visible=isinstance(self.whisper_inf, FasterWhisperInference)):
407
- with gr.Column():
408
- nb_length_penalty = gr.Number(label="Length Penalty", value=1,
409
- info="Exponential length penalty constant.")
410
- nb_repetition_penalty = gr.Number(label="Repetition Penalty", value=1,
411
- info="Penalty applied to the score of previously generated tokens (set > 1 to penalize).")
412
- nb_no_repeat_ngram_size = gr.Number(label="No Repeat N-gram Size", value=0, precision=0,
413
- info="Prevent repetitions of n-grams with this size (set 0 to disable).")
414
- tb_prefix = gr.Textbox(label="Prefix", value=lambda: None, # Bug Fix https://github.com/gradio-app/gradio/issues/6728
415
- info="Optional text to provide as a prefix for the first window.")
416
- cb_suppress_blank = gr.Checkbox(label="Suppress Blank", value=True,
417
- info="Suppress blank outputs at the beginning of the sampling.")
418
- tb_suppress_tokens = gr.Textbox(label="Suppress Tokens", value="[-1]",
419
- info="List of token IDs to suppress. -1 will suppress a default set of symbols as defined in the model config.json file.")
420
- nb_max_initial_timestamp = gr.Number(label="Max Initial Timestamp", value=1.0,
421
- info="The initial timestamp cannot be later than this.")
422
- cb_word_timestamps = gr.Checkbox(label="Word Timestamps", value=False,
423
- info="Extract word-level timestamps using the cross-attention pattern and dynamic time warping, and include the timestamps for each word in each segment.")
424
- tb_prepend_punctuations = gr.Textbox(label="Prepend Punctuations", value="\"'“¿([{-",
425
- info="If 'Word Timestamps' is True, merge these punctuation symbols with the next word.")
426
- tb_append_punctuations = gr.Textbox(label="Append Punctuations",
427
- value="\"'.。,,!!??::”)]}、",
428
- info="If 'Word Timestamps' is True, merge these punctuation symbols with the previous word.")
429
- nb_max_new_tokens = gr.Number(label="Max New Tokens", value=lambda: None, precision=0,
430
- info="Maximum number of new tokens to generate per-chunk. If not set, the maximum will be set by the default max_length.")
431
- nb_chunk_length = gr.Number(label="Chunk Length", value=lambda: None, precision=0,
432
- info="The length of audio segments. If it is not None, it will overwrite the default chunk_length of the FeatureExtractor.")
433
- nb_hallucination_silence_threshold = gr.Number(
434
- label="Hallucination Silence Threshold (sec)",
435
- value=lambda: None,
436
- info="When 'Word Timestamps' is True, skip silent periods longer than this threshold (in seconds) when a possible hallucination is detected.")
437
- tb_hotwords = gr.Textbox(label="Hotwords", value=None,
438
- info="Hotwords/hint phrases to provide the model with. Has no effect if prefix is not None.")
439
- nb_language_detection_threshold = gr.Number(label="Language Detection Threshold",
440
- value=None,
441
- info="If the maximum probability of the language tokens is higher than this value, the language is detected.")
442
- nb_language_detection_segments = gr.Number(label="Language Detection Segments", value=1,
443
- precision=0,
444
- info="Number of segments to consider for the language detection.")
445
-
446
- with gr.Group(visible=isinstance(self.whisper_inf, InsanelyFastWhisperInference)):
447
- nb_chunk_length_s = gr.Number(label="Chunk Lengths (sec)", value=30, precision=0)
448
- nb_batch_size = gr.Number(label="Batch Size", value=24, precision=0)
449
-
450
- with gr.Accordion("VAD", open=False):
451
- cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
452
- sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold",
453
- value=0.5, info="Lower it to be more sensitive to small sounds.")
454
- nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0,
455
- value=250)
456
- nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
457
- nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0,
458
- value=2000)
459
- nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
460
- with gr.Accordion("Diarization", open=False):
461
- cb_diarize = gr.Checkbox(label="Enable Diarization")
462
- tb_hf_token = gr.Text(label="HuggingFace Token", value="",
463
- info="This is only needed the first time you download the model. If you already have models, you don't need to enter. "
464
- "To download the model, you must manually go to \"https://huggingface.co/pyannote/speaker-diarization-3.1\" and agree to their requirement.")
465
- dd_diarization_device = gr.Dropdown(label="Device",
466
- choices=self.whisper_inf.diarizer.get_available_device(),
467
- value=self.whisper_inf.diarizer.get_device())
468
  with gr.Row():
469
  btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
470
  with gr.Row():
@@ -473,54 +238,11 @@ class App:
473
  btn_openfolder = gr.Button('📂', scale=1)
474
 
475
  params = [mic_input, dd_file_format]
476
- whisper_params = WhisperParameters(
477
- model_size=dd_model,
478
- lang=dd_lang,
479
- is_translate=cb_translate,
480
- beam_size=nb_beam_size,
481
- log_prob_threshold=nb_log_prob_threshold,
482
- no_speech_threshold=nb_no_speech_threshold,
483
- compute_type=dd_compute_type,
484
- best_of=nb_best_of,
485
- patience=nb_patience,
486
- condition_on_previous_text=cb_condition_on_previous_text,
487
- initial_prompt=tb_initial_prompt,
488
- temperature=sd_temperature,
489
- compression_ratio_threshold=nb_compression_ratio_threshold,
490
- vad_filter=cb_vad_filter,
491
- threshold=sd_threshold,
492
- min_speech_duration_ms=nb_min_speech_duration_ms,
493
- max_speech_duration_s=nb_max_speech_duration_s,
494
- min_silence_duration_ms=nb_min_silence_duration_ms,
495
- speech_pad_ms=nb_speech_pad_ms,
496
- chunk_length_s=nb_chunk_length_s,
497
- batch_size=nb_batch_size,
498
- is_diarize=cb_diarize,
499
- hf_token=tb_hf_token,
500
- diarization_device=dd_diarization_device,
501
- length_penalty=nb_length_penalty,
502
- repetition_penalty=nb_repetition_penalty,
503
- no_repeat_ngram_size=nb_no_repeat_ngram_size,
504
- prefix=tb_prefix,
505
- suppress_blank=cb_suppress_blank,
506
- suppress_tokens=tb_suppress_tokens,
507
- max_initial_timestamp=nb_max_initial_timestamp,
508
- word_timestamps=cb_word_timestamps,
509
- prepend_punctuations=tb_prepend_punctuations,
510
- append_punctuations=tb_append_punctuations,
511
- max_new_tokens=nb_max_new_tokens,
512
- chunk_length=nb_chunk_length,
513
- hallucination_silence_threshold=nb_hallucination_silence_threshold,
514
- hotwords=tb_hotwords,
515
- language_detection_threshold=nb_language_detection_threshold,
516
- language_detection_segments=nb_language_detection_segments
517
- )
518
 
519
  btn_run.click(fn=self.whisper_inf.transcribe_mic,
520
  inputs=params + whisper_params.as_list(),
521
  outputs=[tb_indicator, files_subtitles])
522
  btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
523
- dd_model.change(fn=self.on_change_models, inputs=[dd_model], outputs=[cb_translate])
524
 
525
  with gr.TabItem("T2T Translation"): # tab 4
526
  with gr.Row():
 
60
  )
61
  return whisper_inf
62
 
63
+ def create_whisper_parameters(self):
64
+ with gr.Row():
65
+ dd_model = gr.Dropdown(choices=self.whisper_inf.available_models, value="large-v2",
66
+ label="Model")
67
+ dd_lang = gr.Dropdown(choices=["Automatic Detection"] + self.whisper_inf.available_langs,
68
+ value="Automatic Detection", label="Language")
69
+ dd_file_format = gr.Dropdown(choices=["SRT", "WebVTT", "txt"], value="SRT", label="File Format")
70
+ with gr.Row():
71
+ cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
72
+ with gr.Row():
73
+ cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename",
74
+ interactive=True)
75
+ with gr.Accordion("Advanced Parameters", open=False):
76
+ nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
77
+ nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
78
+ nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
79
+ dd_compute_type = gr.Dropdown(label="Compute Type", choices=self.whisper_inf.available_compute_types,
80
+ value=self.whisper_inf.current_compute_type, interactive=True)
81
+ nb_best_of = gr.Number(label="Best Of", value=5, interactive=True)
82
+ nb_patience = gr.Number(label="Patience", value=1, interactive=True)
83
+ cb_condition_on_previous_text = gr.Checkbox(label="Condition On Previous Text", value=True,
84
+ interactive=True)
85
+ tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
86
+ sd_temperature = gr.Slider(label="Temperature", value=0, step=0.01, maximum=1.0, interactive=True)
87
+ nb_compression_ratio_threshold = gr.Number(label="Compression Ratio Threshold", value=2.4, interactive=True)
88
+ with gr.Group(visible=isinstance(self.whisper_inf, FasterWhisperInference)):
89
+ nb_length_penalty = gr.Number(label="Length Penalty", value=1,
90
+ info="Exponential length penalty constant.")
91
+ nb_repetition_penalty = gr.Number(label="Repetition Penalty", value=1,
92
+ info="Penalty applied to the score of previously generated tokens (set > 1 to penalize).")
93
+ nb_no_repeat_ngram_size = gr.Number(label="No Repeat N-gram Size", value=0, precision=0,
94
+ info="Prevent repetitions of n-grams with this size (set 0 to disable).")
95
+ tb_prefix = gr.Textbox(label="Prefix", value=lambda: None,
96
+ info="Optional text to provide as a prefix for the first window.")
97
+ cb_suppress_blank = gr.Checkbox(label="Suppress Blank", value=True,
98
+ info="Suppress blank outputs at the beginning of the sampling.")
99
+ tb_suppress_tokens = gr.Textbox(label="Suppress Tokens", value="[-1]",
100
+ info="List of token IDs to suppress. -1 will suppress a default set of symbols as defined in the model config.json file.")
101
+ nb_max_initial_timestamp = gr.Number(label="Max Initial Timestamp", value=1.0,
102
+ info="The initial timestamp cannot be later than this.")
103
+ cb_word_timestamps = gr.Checkbox(label="Word Timestamps", value=False,
104
+ info="Extract word-level timestamps using the cross-attention pattern and dynamic time warping, and include the timestamps for each word in each segment.")
105
+ tb_prepend_punctuations = gr.Textbox(label="Prepend Punctuations", value="\"'“¿([{-",
106
+ info="If 'Word Timestamps' is True, merge these punctuation symbols with the next word.")
107
+ tb_append_punctuations = gr.Textbox(label="Append Punctuations", value="\"'.。,,!!??::”)]}、",
108
+ info="If 'Word Timestamps' is True, merge these punctuation symbols with the previous word.")
109
+ nb_max_new_tokens = gr.Number(label="Max New Tokens", value=lambda: None, precision=0,
110
+ info="Maximum number of new tokens to generate per-chunk. If not set, the maximum will be set by the default max_length.")
111
+ nb_chunk_length = gr.Number(label="Chunk Length", value=lambda: None, precision=0,
112
+ info="The length of audio segments. If it is not None, it will overwrite the default chunk_length of the FeatureExtractor.")
113
+ nb_hallucination_silence_threshold = gr.Number(label="Hallucination Silence Threshold (sec)",
114
+ value=lambda: None,
115
+ info="When 'Word Timestamps' is True, skip silent periods longer than this threshold (in seconds) when a possible hallucination is detected.")
116
+ tb_hotwords = gr.Textbox(label="Hotwords", value=None,
117
+ info="Hotwords/hint phrases to provide the model with. Has no effect if prefix is not None.")
118
+ nb_language_detection_threshold = gr.Number(label="Language Detection Threshold", value=None,
119
+ info="If the maximum probability of the language tokens is higher than this value, the language is detected.")
120
+ nb_language_detection_segments = gr.Number(label="Language Detection Segments", value=1, precision=0,
121
+ info="Number of segments to consider for the language detection.")
122
+ with gr.Group(visible=isinstance(self.whisper_inf, InsanelyFastWhisperInference)):
123
+ nb_chunk_length_s = gr.Number(label="Chunk Lengths (sec)", value=30, precision=0)
124
+ nb_batch_size = gr.Number(label="Batch Size", value=24, precision=0)
125
+
126
+ with gr.Accordion("VAD", open=False):
127
+ cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
128
+ sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5,
129
+ info="Lower it to be more sensitive to small sounds.")
130
+ nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
131
+ nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
132
+ nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
133
+ nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
134
+
135
+ with gr.Accordion("Diarization", open=False):
136
+ cb_diarize = gr.Checkbox(label="Enable Diarization")
137
+ tb_hf_token = gr.Text(label="HuggingFace Token", value="",
138
+ info="This is only needed the first time you download the model. If you already have models, you don't need to enter. To download the model, you must manually go to \"https://huggingface.co/pyannote/speaker-diarization-3.1\" and agree to their requirement.")
139
+ dd_diarization_device = gr.Dropdown(label="Device",
140
+ choices=self.whisper_inf.diarizer.get_available_device(),
141
+ value=self.whisper_inf.diarizer.get_device())
142
+
143
+ dd_model.change(fn=self.on_change_models, inputs=[dd_model], outputs=[cb_translate])
144
+
145
+ return (
146
+ WhisperParameters(
147
+ model_size=dd_model, lang=dd_lang, is_translate=cb_translate, beam_size=nb_beam_size,
148
+ log_prob_threshold=nb_log_prob_threshold, no_speech_threshold=nb_no_speech_threshold,
149
+ compute_type=dd_compute_type, best_of=nb_best_of, patience=nb_patience,
150
+ condition_on_previous_text=cb_condition_on_previous_text, initial_prompt=tb_initial_prompt,
151
+ temperature=sd_temperature, compression_ratio_threshold=nb_compression_ratio_threshold,
152
+ vad_filter=cb_vad_filter, threshold=sd_threshold, min_speech_duration_ms=nb_min_speech_duration_ms,
153
+ max_speech_duration_s=nb_max_speech_duration_s, min_silence_duration_ms=nb_min_silence_duration_ms,
154
+ speech_pad_ms=nb_speech_pad_ms, chunk_length_s=nb_chunk_length_s, batch_size=nb_batch_size,
155
+ is_diarize=cb_diarize, hf_token=tb_hf_token, diarization_device=dd_diarization_device,
156
+ length_penalty=nb_length_penalty, repetition_penalty=nb_repetition_penalty,
157
+ no_repeat_ngram_size=nb_no_repeat_ngram_size, prefix=tb_prefix, suppress_blank=cb_suppress_blank,
158
+ suppress_tokens=tb_suppress_tokens, max_initial_timestamp=nb_max_initial_timestamp,
159
+ word_timestamps=cb_word_timestamps, prepend_punctuations=tb_prepend_punctuations,
160
+ append_punctuations=tb_append_punctuations, max_new_tokens=nb_max_new_tokens, chunk_length=nb_chunk_length,
161
+ hallucination_silence_threshold=nb_hallucination_silence_threshold, hotwords=tb_hotwords,
162
+ language_detection_threshold=nb_language_detection_threshold,
163
+ language_detection_segments=nb_language_detection_segments
164
+ ),
165
+ dd_file_format,
166
+ cb_timestamp
167
+ )
168
+
169
  def launch(self):
170
  with self.app:
171
  with gr.Row():
 
180
  " Leave this field empty if you do not wish to use a local path.",
181
  visible=self.args.colab,
182
  value="")
183
+
184
+ whisper_params, dd_file_format, cb_timestamp = self.create_whisper_parameters()
185
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
  with gr.Row():
187
  btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
188
  with gr.Row():
 
191
  btn_openfolder = gr.Button('📂', scale=1)
192
 
193
  params = [input_file, tb_input_folder, dd_file_format, cb_timestamp]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
194
  btn_run.click(fn=self.whisper_inf.transcribe_file,
195
  inputs=params + whisper_params.as_list(),
196
  outputs=[tb_indicator, files_subtitles])
197
  btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
 
198
 
199
  with gr.TabItem("Youtube"): # tab2
200
  with gr.Row():
 
205
  with gr.Column():
206
  tb_title = gr.Label(label="Youtube Title")
207
  tb_description = gr.Textbox(label="Youtube Description", max_lines=15)
208
+
209
+ whisper_params, dd_file_format, cb_timestamp = self.create_whisper_parameters()
210
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
211
  with gr.Row():
212
  btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
213
  with gr.Row():
 
216
  btn_openfolder = gr.Button('📂', scale=1)
217
 
218
  params = [tb_youtubelink, dd_file_format, cb_timestamp]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
219
 
220
  btn_run.click(fn=self.whisper_inf.transcribe_youtube,
221
  inputs=params + whisper_params.as_list(),
 
223
  tb_youtubelink.change(get_ytmetas, inputs=[tb_youtubelink],
224
  outputs=[img_thumbnail, tb_title, tb_description])
225
  btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
 
226
 
227
  with gr.TabItem("Mic"): # tab3
228
  with gr.Row():
229
  mic_input = gr.Microphone(label="Record with Mic", type="filepath", interactive=True)
230
+
231
+ whisper_params, dd_file_format, cb_timestamp = self.create_whisper_parameters()
232
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
233
  with gr.Row():
234
  btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
235
  with gr.Row():
 
238
  btn_openfolder = gr.Button('📂', scale=1)
239
 
240
  params = [mic_input, dd_file_format]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
241
 
242
  btn_run.click(fn=self.whisper_inf.transcribe_mic,
243
  inputs=params + whisper_params.as_list(),
244
  outputs=[tb_indicator, files_subtitles])
245
  btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
 
246
 
247
  with gr.TabItem("T2T Translation"): # tab 4
248
  with gr.Row():