jhj0517 committed on
Commit
661e83c
·
1 Parent(s): 296b5e1

add parameters for insanely_fast_whisper

Browse files
app.py CHANGED
@@ -74,14 +74,6 @@ class App:
74
  cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
75
  with gr.Row():
76
  cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename", interactive=True)
77
- with gr.Accordion("VAD Options", open=False, visible=isinstance(self.whisper_inf, FasterWhisperInference)):
78
- cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
79
- sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
80
- nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
81
- nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
82
- nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
83
- nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
84
- nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
85
  with gr.Accordion("Advanced_Parameters", open=False):
86
  nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
87
  nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
@@ -93,6 +85,17 @@ class App:
93
  tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
94
  sd_temperature = gr.Slider(label="Temperature", value=0, step=0.01, maximum=1.0, interactive=True)
95
  nb_compression_ratio_threshold = gr.Number(label="Compression Ratio Threshold", value=2.4, interactive=True)
 
 
 
 
 
 
 
 
 
 
 
96
  with gr.Row():
97
  btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
98
  with gr.Row():
@@ -101,26 +104,28 @@ class App:
101
  btn_openfolder = gr.Button('📂', scale=1)
102
 
103
  params = [input_file, dd_file_format, cb_timestamp]
104
- whisper_params = WhisperGradioComponents(model_size=dd_model,
105
- lang=dd_lang,
106
- is_translate=cb_translate,
107
- beam_size=nb_beam_size,
108
- log_prob_threshold=nb_log_prob_threshold,
109
- no_speech_threshold=nb_no_speech_threshold,
110
- compute_type=dd_compute_type,
111
- best_of=nb_best_of,
112
- patience=nb_patience,
113
- condition_on_previous_text=cb_condition_on_previous_text,
114
- initial_prompt=tb_initial_prompt,
115
- temperature=sd_temperature,
116
- compression_ratio_threshold=nb_compression_ratio_threshold,
117
- vad_filter=cb_vad_filter,
118
- threshold=sd_threshold,
119
- min_speech_duration_ms=nb_min_speech_duration_ms,
120
- max_speech_duration_s=nb_max_speech_duration_s,
121
- min_silence_duration_ms=nb_min_silence_duration_ms,
122
- window_size_sample=nb_window_size_sample,
123
- speech_pad_ms=nb_speech_pad_ms)
 
 
124
 
125
  btn_run.click(fn=self.whisper_inf.transcribe_file,
126
  inputs=params + whisper_params.to_list(),
@@ -148,14 +153,6 @@ class App:
148
  with gr.Row():
149
  cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename",
150
  interactive=True)
151
- with gr.Accordion("VAD Options", open=False, visible=isinstance(self.whisper_inf, FasterWhisperInference)):
152
- cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
153
- sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
154
- nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
155
- nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
156
- nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
157
- nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
158
- nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
159
  with gr.Accordion("Advanced_Parameters", open=False):
160
  nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
161
  nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
@@ -167,6 +164,18 @@ class App:
167
  tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
168
  sd_temperature = gr.Slider(label="Temperature", value=0, step=0.01, maximum=1.0, interactive=True)
169
  nb_compression_ratio_threshold = gr.Number(label="Compression Ratio Threshold", value=2.4, interactive=True)
 
 
 
 
 
 
 
 
 
 
 
 
170
  with gr.Row():
171
  btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
172
  with gr.Row():
@@ -175,26 +184,29 @@ class App:
175
  btn_openfolder = gr.Button('📂', scale=1)
176
 
177
  params = [tb_youtubelink, dd_file_format, cb_timestamp]
178
- whisper_params = WhisperGradioComponents(model_size=dd_model,
179
- lang=dd_lang,
180
- is_translate=cb_translate,
181
- beam_size=nb_beam_size,
182
- log_prob_threshold=nb_log_prob_threshold,
183
- no_speech_threshold=nb_no_speech_threshold,
184
- compute_type=dd_compute_type,
185
- best_of=nb_best_of,
186
- patience=nb_patience,
187
- condition_on_previous_text=cb_condition_on_previous_text,
188
- initial_prompt=tb_initial_prompt,
189
- temperature=sd_temperature,
190
- compression_ratio_threshold=nb_compression_ratio_threshold,
191
- vad_filter=cb_vad_filter,
192
- threshold=sd_threshold,
193
- min_speech_duration_ms=nb_min_speech_duration_ms,
194
- max_speech_duration_s=nb_max_speech_duration_s,
195
- min_silence_duration_ms=nb_min_silence_duration_ms,
196
- window_size_sample=nb_window_size_sample,
197
- speech_pad_ms=nb_speech_pad_ms)
 
 
 
198
  btn_run.click(fn=self.whisper_inf.transcribe_youtube,
199
  inputs=params + whisper_params.to_list(),
200
  outputs=[tb_indicator, files_subtitles])
@@ -214,14 +226,6 @@ class App:
214
  dd_file_format = gr.Dropdown(["SRT", "WebVTT", "txt"], value="SRT", label="File Format")
215
  with gr.Row():
216
  cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
217
- with gr.Accordion("VAD Options", open=False, visible=isinstance(self.whisper_inf, FasterWhisperInference)):
218
- cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
219
- sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
220
- nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
221
- nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
222
- nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
223
- nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
224
- nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
225
  with gr.Accordion("Advanced_Parameters", open=False):
226
  nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
227
  nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
@@ -232,6 +236,18 @@ class App:
232
  cb_condition_on_previous_text = gr.Checkbox(label="Condition On Previous Text", value=True, interactive=True)
233
  tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
234
  sd_temperature = gr.Slider(label="Temperature", value=0, step=0.01, maximum=1.0, interactive=True)
 
 
 
 
 
 
 
 
 
 
 
 
235
  with gr.Row():
236
  btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
237
  with gr.Row():
@@ -240,26 +256,29 @@ class App:
240
  btn_openfolder = gr.Button('📂', scale=1)
241
 
242
  params = [mic_input, dd_file_format]
243
- whisper_params = WhisperGradioComponents(model_size=dd_model,
244
- lang=dd_lang,
245
- is_translate=cb_translate,
246
- beam_size=nb_beam_size,
247
- log_prob_threshold=nb_log_prob_threshold,
248
- no_speech_threshold=nb_no_speech_threshold,
249
- compute_type=dd_compute_type,
250
- best_of=nb_best_of,
251
- patience=nb_patience,
252
- condition_on_previous_text=cb_condition_on_previous_text,
253
- initial_prompt=tb_initial_prompt,
254
- temperature=sd_temperature,
255
- compression_ratio_threshold=nb_compression_ratio_threshold,
256
- vad_filter=cb_vad_filter,
257
- threshold=sd_threshold,
258
- min_speech_duration_ms=nb_min_speech_duration_ms,
259
- max_speech_duration_s=nb_max_speech_duration_s,
260
- min_silence_duration_ms=nb_min_silence_duration_ms,
261
- window_size_sample=nb_window_size_sample,
262
- speech_pad_ms=nb_speech_pad_ms)
 
 
 
263
  btn_run.click(fn=self.whisper_inf.transcribe_mic,
264
  inputs=params + whisper_params.to_list(),
265
  outputs=[tb_indicator, files_subtitles])
 
74
  cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
75
  with gr.Row():
76
  cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename", interactive=True)
 
 
 
 
 
 
 
 
77
  with gr.Accordion("Advanced_Parameters", open=False):
78
  nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
79
  nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
 
85
  tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
86
  sd_temperature = gr.Slider(label="Temperature", value=0, step=0.01, maximum=1.0, interactive=True)
87
  nb_compression_ratio_threshold = gr.Number(label="Compression Ratio Threshold", value=2.4, interactive=True)
88
+ with gr.Accordion("VAD Options", open=False, visible=isinstance(self.whisper_inf, FasterWhisperInference)):
89
+ cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
90
+ sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
91
+ nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
92
+ nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
93
+ nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
94
+ nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
95
+ nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
96
+ with gr.Accordion("Insanely Fast Whisper Parameters", open=False, visible=isinstance(self.whisper_inf, InsanelyFastWhisperInference)):
97
+ nb_chunk_length_s = gr.Number(label="Chunk Lengths (sec)", value=30, precision=0)
98
+ nb_batch_size = gr.Number(label="Batch Size", value=24, precision=0)
99
  with gr.Row():
100
  btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
101
  with gr.Row():
 
104
  btn_openfolder = gr.Button('📂', scale=1)
105
 
106
  params = [input_file, dd_file_format, cb_timestamp]
107
+ whisper_params = WhisperParameters(model_size=dd_model,
108
+ lang=dd_lang,
109
+ is_translate=cb_translate,
110
+ beam_size=nb_beam_size,
111
+ log_prob_threshold=nb_log_prob_threshold,
112
+ no_speech_threshold=nb_no_speech_threshold,
113
+ compute_type=dd_compute_type,
114
+ best_of=nb_best_of,
115
+ patience=nb_patience,
116
+ condition_on_previous_text=cb_condition_on_previous_text,
117
+ initial_prompt=tb_initial_prompt,
118
+ temperature=sd_temperature,
119
+ compression_ratio_threshold=nb_compression_ratio_threshold,
120
+ vad_filter=cb_vad_filter,
121
+ threshold=sd_threshold,
122
+ min_speech_duration_ms=nb_min_speech_duration_ms,
123
+ max_speech_duration_s=nb_max_speech_duration_s,
124
+ min_silence_duration_ms=nb_min_silence_duration_ms,
125
+ window_size_sample=nb_window_size_sample,
126
+ speech_pad_ms=nb_speech_pad_ms,
127
+ chunk_length_s=nb_chunk_length_s,
128
+ batch_size=nb_batch_size)
129
 
130
  btn_run.click(fn=self.whisper_inf.transcribe_file,
131
  inputs=params + whisper_params.to_list(),
 
153
  with gr.Row():
154
  cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename",
155
  interactive=True)
 
 
 
 
 
 
 
 
156
  with gr.Accordion("Advanced_Parameters", open=False):
157
  nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
158
  nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
 
164
  tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
165
  sd_temperature = gr.Slider(label="Temperature", value=0, step=0.01, maximum=1.0, interactive=True)
166
  nb_compression_ratio_threshold = gr.Number(label="Compression Ratio Threshold", value=2.4, interactive=True)
167
+ with gr.Accordion("VAD Options", open=False, visible=isinstance(self.whisper_inf, FasterWhisperInference)):
168
+ cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
169
+ sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
170
+ nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
171
+ nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
172
+ nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
173
+ nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
174
+ nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
175
+ with gr.Accordion("Insanely Fast Whisper Parameters", open=False,
176
+ visible=isinstance(self.whisper_inf, InsanelyFastWhisperInference)):
177
+ nb_chunk_length_s = gr.Number(label="Chunk Lengths (sec)", value=30, precision=0)
178
+ nb_batch_size = gr.Number(label="Batch Size", value=24, precision=0)
179
  with gr.Row():
180
  btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
181
  with gr.Row():
 
184
  btn_openfolder = gr.Button('📂', scale=1)
185
 
186
  params = [tb_youtubelink, dd_file_format, cb_timestamp]
187
+ whisper_params = WhisperParameters(model_size=dd_model,
188
+ lang=dd_lang,
189
+ is_translate=cb_translate,
190
+ beam_size=nb_beam_size,
191
+ log_prob_threshold=nb_log_prob_threshold,
192
+ no_speech_threshold=nb_no_speech_threshold,
193
+ compute_type=dd_compute_type,
194
+ best_of=nb_best_of,
195
+ patience=nb_patience,
196
+ condition_on_previous_text=cb_condition_on_previous_text,
197
+ initial_prompt=tb_initial_prompt,
198
+ temperature=sd_temperature,
199
+ compression_ratio_threshold=nb_compression_ratio_threshold,
200
+ vad_filter=cb_vad_filter,
201
+ threshold=sd_threshold,
202
+ min_speech_duration_ms=nb_min_speech_duration_ms,
203
+ max_speech_duration_s=nb_max_speech_duration_s,
204
+ min_silence_duration_ms=nb_min_silence_duration_ms,
205
+ window_size_sample=nb_window_size_sample,
206
+ speech_pad_ms=nb_speech_pad_ms,
207
+ chunk_length_s=nb_chunk_length_s,
208
+ batch_size=nb_batch_size)
209
+
210
  btn_run.click(fn=self.whisper_inf.transcribe_youtube,
211
  inputs=params + whisper_params.to_list(),
212
  outputs=[tb_indicator, files_subtitles])
 
226
  dd_file_format = gr.Dropdown(["SRT", "WebVTT", "txt"], value="SRT", label="File Format")
227
  with gr.Row():
228
  cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
 
 
 
 
 
 
 
 
229
  with gr.Accordion("Advanced_Parameters", open=False):
230
  nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
231
  nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
 
236
  cb_condition_on_previous_text = gr.Checkbox(label="Condition On Previous Text", value=True, interactive=True)
237
  tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
238
  sd_temperature = gr.Slider(label="Temperature", value=0, step=0.01, maximum=1.0, interactive=True)
239
+ with gr.Accordion("VAD Options", open=False, visible=isinstance(self.whisper_inf, FasterWhisperInference)):
240
+ cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
241
+ sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
242
+ nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
243
+ nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
244
+ nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
245
+ nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
246
+ nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
247
+ with gr.Accordion("Insanely Fast Whisper Parameters", open=False,
248
+ visible=isinstance(self.whisper_inf, InsanelyFastWhisperInference)):
249
+ nb_chunk_length_s = gr.Number(label="Chunk Lengths (sec)", value=30, precision=0)
250
+ nb_batch_size = gr.Number(label="Batch Size", value=24, precision=0)
251
  with gr.Row():
252
  btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
253
  with gr.Row():
 
256
  btn_openfolder = gr.Button('📂', scale=1)
257
 
258
  params = [mic_input, dd_file_format]
259
+ whisper_params = WhisperParameters(model_size=dd_model,
260
+ lang=dd_lang,
261
+ is_translate=cb_translate,
262
+ beam_size=nb_beam_size,
263
+ log_prob_threshold=nb_log_prob_threshold,
264
+ no_speech_threshold=nb_no_speech_threshold,
265
+ compute_type=dd_compute_type,
266
+ best_of=nb_best_of,
267
+ patience=nb_patience,
268
+ condition_on_previous_text=cb_condition_on_previous_text,
269
+ initial_prompt=tb_initial_prompt,
270
+ temperature=sd_temperature,
271
+ compression_ratio_threshold=nb_compression_ratio_threshold,
272
+ vad_filter=cb_vad_filter,
273
+ threshold=sd_threshold,
274
+ min_speech_duration_ms=nb_min_speech_duration_ms,
275
+ max_speech_duration_s=nb_max_speech_duration_s,
276
+ min_silence_duration_ms=nb_min_silence_duration_ms,
277
+ window_size_sample=nb_window_size_sample,
278
+ speech_pad_ms=nb_speech_pad_ms,
279
+ chunk_length_s=nb_chunk_length_s,
280
+ batch_size=nb_batch_size)
281
+
282
  btn_run.click(fn=self.whisper_inf.transcribe_mic,
283
  inputs=params + whisper_params.to_list(),
284
  outputs=[tb_indicator, files_subtitles])
modules/insanely_fast_whisper_inference.py CHANGED
@@ -71,8 +71,8 @@ class InsanelyFastWhisperInference(WhisperBase):
71
  segments = self.model(
72
  inputs=audio,
73
  return_timestamps=True,
74
- chunk_length_s=30,
75
- batch_size=24,
76
  generate_kwargs={
77
  "language": params.lang,
78
  "task": "translate" if params.is_translate and self.current_model_size in self.translatable_models else "transcribe",
 
71
  segments = self.model(
72
  inputs=audio,
73
  return_timestamps=True,
74
+ chunk_length_s=params.chunk_length_s,
75
+ batch_size=params.batch_size,
76
  generate_kwargs={
77
  "language": params.lang,
78
  "task": "translate" if params.is_translate and self.current_model_size in self.translatable_models else "transcribe",
modules/whisper_parameter.py CHANGED
@@ -25,8 +25,12 @@ class WhisperParameters:
25
  min_silence_duration_ms: gr.Number
26
  window_size_sample: gr.Number
27
  speech_pad_ms: gr.Number
 
 
28
  """
29
  A data class for Gradio components of the Whisper Parameters. Use "before" Gradio pre-processing.
 
 
30
  See more about Gradio pre-processing: https://www.gradio.app/docs/components
31
 
32
  Attributes
@@ -111,6 +115,13 @@ class WhisperParameters:
111
 
112
  speech_pad_ms: gr.Number
113
  This parameter is related with Silero VAD. Final speech chunks are padded by speech_pad_ms each side
 
 
 
 
 
 
 
114
  """
115
 
116
  def to_list(self) -> list:
@@ -155,7 +166,9 @@ class WhisperParameters:
155
  max_speech_duration_s=args[16],
156
  min_silence_duration_ms=args[17],
157
  window_size_samples=args[18],
158
- speech_pad_ms=args[19]
 
 
159
  )
160
 
161
 
@@ -181,6 +194,8 @@ class WhisperValues:
181
  min_silence_duration_ms: int
182
  window_size_samples: int
183
  speech_pad_ms: int
 
 
184
  """
185
  A data class to use Whisper parameters.
186
  """
 
25
  min_silence_duration_ms: gr.Number
26
  window_size_sample: gr.Number
27
  speech_pad_ms: gr.Number
28
+ chunk_length_s: gr.Number
29
+ batch_size: gr.Number
30
  """
31
  A data class for Gradio components of the Whisper Parameters. Use "before" Gradio pre-processing.
32
+ This data class is used to mitigate the key-value problem between Gradio components and function parameters.
33
+ Related Gradio issue: https://github.com/gradio-app/gradio/issues/2471
34
  See more about Gradio pre-processing: https://www.gradio.app/docs/components
35
 
36
  Attributes
 
115
 
116
  speech_pad_ms: gr.Number
117
  This parameter is related with Silero VAD. Final speech chunks are padded by speech_pad_ms each side
118
+
119
+ chunk_length_s: gr.Number
120
+ This parameter is related with insanely-fast-whisper pipe.
121
+ Maximum length of each chunk
122
+
123
+ batch_size: gr.Number
124
+ This parameter is related with insanely-fast-whisper pipe. Batch size to pass to the pipe
125
  """
126
 
127
  def to_list(self) -> list:
 
166
  max_speech_duration_s=args[16],
167
  min_silence_duration_ms=args[17],
168
  window_size_samples=args[18],
169
+ speech_pad_ms=args[19],
170
+ chunk_length_s=args[20],
171
+ batch_size=args[21]
172
  )
173
 
174
 
 
194
  min_silence_duration_ms: int
195
  window_size_samples: int
196
  speech_pad_ms: int
197
+ chunk_length_s: int
198
+ batch_size: int
199
  """
200
  A data class to use Whisper parameters.
201
  """