Spaces:
Running
Running
jhj0517
committed on
Commit
·
661e83c
1
Parent(s):
296b5e1
add parameters for insanely_fast_whisper
Browse files
- app.py +103 -84
- modules/insanely_fast_whisper_inference.py +2 -2
- modules/whisper_parameter.py +16 -1
app.py
CHANGED
@@ -74,14 +74,6 @@ class App:
|
|
74 |
cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
|
75 |
with gr.Row():
|
76 |
cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename", interactive=True)
|
77 |
-
with gr.Accordion("VAD Options", open=False, visible=isinstance(self.whisper_inf, FasterWhisperInference)):
|
78 |
-
cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
|
79 |
-
sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
|
80 |
-
nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
|
81 |
-
nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
|
82 |
-
nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
|
83 |
-
nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
|
84 |
-
nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
|
85 |
with gr.Accordion("Advanced_Parameters", open=False):
|
86 |
nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
|
87 |
nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
|
@@ -93,6 +85,17 @@ class App:
|
|
93 |
tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
|
94 |
sd_temperature = gr.Slider(label="Temperature", value=0, step=0.01, maximum=1.0, interactive=True)
|
95 |
nb_compression_ratio_threshold = gr.Number(label="Compression Ratio Threshold", value=2.4, interactive=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
96 |
with gr.Row():
|
97 |
btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
|
98 |
with gr.Row():
|
@@ -101,26 +104,28 @@ class App:
|
|
101 |
btn_openfolder = gr.Button('π', scale=1)
|
102 |
|
103 |
params = [input_file, dd_file_format, cb_timestamp]
|
104 |
-
whisper_params =
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
|
|
|
|
124 |
|
125 |
btn_run.click(fn=self.whisper_inf.transcribe_file,
|
126 |
inputs=params + whisper_params.to_list(),
|
@@ -148,14 +153,6 @@ class App:
|
|
148 |
with gr.Row():
|
149 |
cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename",
|
150 |
interactive=True)
|
151 |
-
with gr.Accordion("VAD Options", open=False, visible=isinstance(self.whisper_inf, FasterWhisperInference)):
|
152 |
-
cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
|
153 |
-
sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
|
154 |
-
nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
|
155 |
-
nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
|
156 |
-
nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
|
157 |
-
nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
|
158 |
-
nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
|
159 |
with gr.Accordion("Advanced_Parameters", open=False):
|
160 |
nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
|
161 |
nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
|
@@ -167,6 +164,18 @@ class App:
|
|
167 |
tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
|
168 |
sd_temperature = gr.Slider(label="Temperature", value=0, step=0.01, maximum=1.0, interactive=True)
|
169 |
nb_compression_ratio_threshold = gr.Number(label="Compression Ratio Threshold", value=2.4, interactive=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
170 |
with gr.Row():
|
171 |
btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
|
172 |
with gr.Row():
|
@@ -175,26 +184,29 @@ class App:
|
|
175 |
btn_openfolder = gr.Button('π', scale=1)
|
176 |
|
177 |
params = [tb_youtubelink, dd_file_format, cb_timestamp]
|
178 |
-
whisper_params =
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
|
|
|
|
|
|
198 |
btn_run.click(fn=self.whisper_inf.transcribe_youtube,
|
199 |
inputs=params + whisper_params.to_list(),
|
200 |
outputs=[tb_indicator, files_subtitles])
|
@@ -214,14 +226,6 @@ class App:
|
|
214 |
dd_file_format = gr.Dropdown(["SRT", "WebVTT", "txt"], value="SRT", label="File Format")
|
215 |
with gr.Row():
|
216 |
cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
|
217 |
-
with gr.Accordion("VAD Options", open=False, visible=isinstance(self.whisper_inf, FasterWhisperInference)):
|
218 |
-
cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
|
219 |
-
sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
|
220 |
-
nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
|
221 |
-
nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
|
222 |
-
nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
|
223 |
-
nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
|
224 |
-
nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
|
225 |
with gr.Accordion("Advanced_Parameters", open=False):
|
226 |
nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
|
227 |
nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
|
@@ -232,6 +236,18 @@ class App:
|
|
232 |
cb_condition_on_previous_text = gr.Checkbox(label="Condition On Previous Text", value=True, interactive=True)
|
233 |
tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
|
234 |
sd_temperature = gr.Slider(label="Temperature", value=0, step=0.01, maximum=1.0, interactive=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
235 |
with gr.Row():
|
236 |
btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
|
237 |
with gr.Row():
|
@@ -240,26 +256,29 @@ class App:
|
|
240 |
btn_openfolder = gr.Button('π', scale=1)
|
241 |
|
242 |
params = [mic_input, dd_file_format]
|
243 |
-
whisper_params =
|
244 |
-
|
245 |
-
|
246 |
-
|
247 |
-
|
248 |
-
|
249 |
-
|
250 |
-
|
251 |
-
|
252 |
-
|
253 |
-
|
254 |
-
|
255 |
-
|
256 |
-
|
257 |
-
|
258 |
-
|
259 |
-
|
260 |
-
|
261 |
-
|
262 |
-
|
|
|
|
|
|
|
263 |
btn_run.click(fn=self.whisper_inf.transcribe_mic,
|
264 |
inputs=params + whisper_params.to_list(),
|
265 |
outputs=[tb_indicator, files_subtitles])
|
|
|
74 |
cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
|
75 |
with gr.Row():
|
76 |
cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename", interactive=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
77 |
with gr.Accordion("Advanced_Parameters", open=False):
|
78 |
nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
|
79 |
nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
|
|
|
85 |
tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
|
86 |
sd_temperature = gr.Slider(label="Temperature", value=0, step=0.01, maximum=1.0, interactive=True)
|
87 |
nb_compression_ratio_threshold = gr.Number(label="Compression Ratio Threshold", value=2.4, interactive=True)
|
88 |
+
with gr.Accordion("VAD Options", open=False, visible=isinstance(self.whisper_inf, FasterWhisperInference)):
|
89 |
+
cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
|
90 |
+
sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
|
91 |
+
nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
|
92 |
+
nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
|
93 |
+
nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
|
94 |
+
nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
|
95 |
+
nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
|
96 |
+
with gr.Accordion("Insanely Fast Whisper Parameters", open=False, visible=isinstance(self.whisper_inf, InsanelyFastWhisperInference)):
|
97 |
+
nb_chunk_length_s = gr.Number(label="Chunk Lengths (sec)", value=30, precision=0)
|
98 |
+
nb_batch_size = gr.Number(label="Batch Size", value=24, precision=0)
|
99 |
with gr.Row():
|
100 |
btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
|
101 |
with gr.Row():
|
|
|
104 |
btn_openfolder = gr.Button('π', scale=1)
|
105 |
|
106 |
params = [input_file, dd_file_format, cb_timestamp]
|
107 |
+
whisper_params = WhisperParameters(model_size=dd_model,
|
108 |
+
lang=dd_lang,
|
109 |
+
is_translate=cb_translate,
|
110 |
+
beam_size=nb_beam_size,
|
111 |
+
log_prob_threshold=nb_log_prob_threshold,
|
112 |
+
no_speech_threshold=nb_no_speech_threshold,
|
113 |
+
compute_type=dd_compute_type,
|
114 |
+
best_of=nb_best_of,
|
115 |
+
patience=nb_patience,
|
116 |
+
condition_on_previous_text=cb_condition_on_previous_text,
|
117 |
+
initial_prompt=tb_initial_prompt,
|
118 |
+
temperature=sd_temperature,
|
119 |
+
compression_ratio_threshold=nb_compression_ratio_threshold,
|
120 |
+
vad_filter=cb_vad_filter,
|
121 |
+
threshold=sd_threshold,
|
122 |
+
min_speech_duration_ms=nb_min_speech_duration_ms,
|
123 |
+
max_speech_duration_s=nb_max_speech_duration_s,
|
124 |
+
min_silence_duration_ms=nb_min_silence_duration_ms,
|
125 |
+
window_size_sample=nb_window_size_sample,
|
126 |
+
speech_pad_ms=nb_speech_pad_ms,
|
127 |
+
chunk_length_s=nb_chunk_length_s,
|
128 |
+
batch_size=nb_batch_size)
|
129 |
|
130 |
btn_run.click(fn=self.whisper_inf.transcribe_file,
|
131 |
inputs=params + whisper_params.to_list(),
|
|
|
153 |
with gr.Row():
|
154 |
cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename",
|
155 |
interactive=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
156 |
with gr.Accordion("Advanced_Parameters", open=False):
|
157 |
nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
|
158 |
nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
|
|
|
164 |
tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
|
165 |
sd_temperature = gr.Slider(label="Temperature", value=0, step=0.01, maximum=1.0, interactive=True)
|
166 |
nb_compression_ratio_threshold = gr.Number(label="Compression Ratio Threshold", value=2.4, interactive=True)
|
167 |
+
with gr.Accordion("VAD Options", open=False, visible=isinstance(self.whisper_inf, FasterWhisperInference)):
|
168 |
+
cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
|
169 |
+
sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
|
170 |
+
nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
|
171 |
+
nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
|
172 |
+
nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
|
173 |
+
nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
|
174 |
+
nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
|
175 |
+
with gr.Accordion("Insanely Fast Whisper Parameters", open=False,
|
176 |
+
visible=isinstance(self.whisper_inf, InsanelyFastWhisperInference)):
|
177 |
+
nb_chunk_length_s = gr.Number(label="Chunk Lengths (sec)", value=30, precision=0)
|
178 |
+
nb_batch_size = gr.Number(label="Batch Size", value=24, precision=0)
|
179 |
with gr.Row():
|
180 |
btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
|
181 |
with gr.Row():
|
|
|
184 |
btn_openfolder = gr.Button('π', scale=1)
|
185 |
|
186 |
params = [tb_youtubelink, dd_file_format, cb_timestamp]
|
187 |
+
whisper_params = WhisperParameters(model_size=dd_model,
|
188 |
+
lang=dd_lang,
|
189 |
+
is_translate=cb_translate,
|
190 |
+
beam_size=nb_beam_size,
|
191 |
+
log_prob_threshold=nb_log_prob_threshold,
|
192 |
+
no_speech_threshold=nb_no_speech_threshold,
|
193 |
+
compute_type=dd_compute_type,
|
194 |
+
best_of=nb_best_of,
|
195 |
+
patience=nb_patience,
|
196 |
+
condition_on_previous_text=cb_condition_on_previous_text,
|
197 |
+
initial_prompt=tb_initial_prompt,
|
198 |
+
temperature=sd_temperature,
|
199 |
+
compression_ratio_threshold=nb_compression_ratio_threshold,
|
200 |
+
vad_filter=cb_vad_filter,
|
201 |
+
threshold=sd_threshold,
|
202 |
+
min_speech_duration_ms=nb_min_speech_duration_ms,
|
203 |
+
max_speech_duration_s=nb_max_speech_duration_s,
|
204 |
+
min_silence_duration_ms=nb_min_silence_duration_ms,
|
205 |
+
window_size_sample=nb_window_size_sample,
|
206 |
+
speech_pad_ms=nb_speech_pad_ms,
|
207 |
+
chunk_length_s=nb_chunk_length_s,
|
208 |
+
batch_size=nb_batch_size)
|
209 |
+
|
210 |
btn_run.click(fn=self.whisper_inf.transcribe_youtube,
|
211 |
inputs=params + whisper_params.to_list(),
|
212 |
outputs=[tb_indicator, files_subtitles])
|
|
|
226 |
dd_file_format = gr.Dropdown(["SRT", "WebVTT", "txt"], value="SRT", label="File Format")
|
227 |
with gr.Row():
|
228 |
cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
229 |
with gr.Accordion("Advanced_Parameters", open=False):
|
230 |
nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
|
231 |
nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
|
|
|
236 |
cb_condition_on_previous_text = gr.Checkbox(label="Condition On Previous Text", value=True, interactive=True)
|
237 |
tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
|
238 |
sd_temperature = gr.Slider(label="Temperature", value=0, step=0.01, maximum=1.0, interactive=True)
|
239 |
+
with gr.Accordion("VAD Options", open=False, visible=isinstance(self.whisper_inf, FasterWhisperInference)):
|
240 |
+
cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
|
241 |
+
sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
|
242 |
+
nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
|
243 |
+
nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
|
244 |
+
nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
|
245 |
+
nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
|
246 |
+
nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
|
247 |
+
with gr.Accordion("Insanely Fast Whisper Parameters", open=False,
|
248 |
+
visible=isinstance(self.whisper_inf, InsanelyFastWhisperInference)):
|
249 |
+
nb_chunk_length_s = gr.Number(label="Chunk Lengths (sec)", value=30, precision=0)
|
250 |
+
nb_batch_size = gr.Number(label="Batch Size", value=24, precision=0)
|
251 |
with gr.Row():
|
252 |
btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
|
253 |
with gr.Row():
|
|
|
256 |
btn_openfolder = gr.Button('π', scale=1)
|
257 |
|
258 |
params = [mic_input, dd_file_format]
|
259 |
+
whisper_params = WhisperParameters(model_size=dd_model,
|
260 |
+
lang=dd_lang,
|
261 |
+
is_translate=cb_translate,
|
262 |
+
beam_size=nb_beam_size,
|
263 |
+
log_prob_threshold=nb_log_prob_threshold,
|
264 |
+
no_speech_threshold=nb_no_speech_threshold,
|
265 |
+
compute_type=dd_compute_type,
|
266 |
+
best_of=nb_best_of,
|
267 |
+
patience=nb_patience,
|
268 |
+
condition_on_previous_text=cb_condition_on_previous_text,
|
269 |
+
initial_prompt=tb_initial_prompt,
|
270 |
+
temperature=sd_temperature,
|
271 |
+
compression_ratio_threshold=nb_compression_ratio_threshold,
|
272 |
+
vad_filter=cb_vad_filter,
|
273 |
+
threshold=sd_threshold,
|
274 |
+
min_speech_duration_ms=nb_min_speech_duration_ms,
|
275 |
+
max_speech_duration_s=nb_max_speech_duration_s,
|
276 |
+
min_silence_duration_ms=nb_min_silence_duration_ms,
|
277 |
+
window_size_sample=nb_window_size_sample,
|
278 |
+
speech_pad_ms=nb_speech_pad_ms,
|
279 |
+
chunk_length_s=nb_chunk_length_s,
|
280 |
+
batch_size=nb_batch_size)
|
281 |
+
|
282 |
btn_run.click(fn=self.whisper_inf.transcribe_mic,
|
283 |
inputs=params + whisper_params.to_list(),
|
284 |
outputs=[tb_indicator, files_subtitles])
|
modules/insanely_fast_whisper_inference.py
CHANGED
@@ -71,8 +71,8 @@ class InsanelyFastWhisperInference(WhisperBase):
|
|
71 |
segments = self.model(
|
72 |
inputs=audio,
|
73 |
return_timestamps=True,
|
74 |
-
chunk_length_s=
|
75 |
-
batch_size=
|
76 |
generate_kwargs={
|
77 |
"language": params.lang,
|
78 |
"task": "translate" if params.is_translate and self.current_model_size in self.translatable_models else "transcribe",
|
|
|
71 |
segments = self.model(
|
72 |
inputs=audio,
|
73 |
return_timestamps=True,
|
74 |
+
chunk_length_s=params.chunk_length_s,
|
75 |
+
batch_size=params.batch_size,
|
76 |
generate_kwargs={
|
77 |
"language": params.lang,
|
78 |
"task": "translate" if params.is_translate and self.current_model_size in self.translatable_models else "transcribe",
|
modules/whisper_parameter.py
CHANGED
@@ -25,8 +25,12 @@ class WhisperParameters:
|
|
25 |
min_silence_duration_ms: gr.Number
|
26 |
window_size_sample: gr.Number
|
27 |
speech_pad_ms: gr.Number
|
|
|
|
|
28 |
"""
|
29 |
A data class for Gradio components of the Whisper Parameters. Use "before" Gradio pre-processing.
|
|
|
|
|
30 |
See more about Gradio pre-processing: https://www.gradio.app/docs/components
|
31 |
|
32 |
Attributes
|
@@ -111,6 +115,13 @@ class WhisperParameters:
|
|
111 |
|
112 |
speech_pad_ms: gr.Number
|
113 |
This parameter is related with Silero VAD. Final speech chunks are padded by speech_pad_ms each side
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
114 |
"""
|
115 |
|
116 |
def to_list(self) -> list:
|
@@ -155,7 +166,9 @@ class WhisperParameters:
|
|
155 |
max_speech_duration_s=args[16],
|
156 |
min_silence_duration_ms=args[17],
|
157 |
window_size_samples=args[18],
|
158 |
-
speech_pad_ms=args[19]
|
|
|
|
|
159 |
)
|
160 |
|
161 |
|
@@ -181,6 +194,8 @@ class WhisperValues:
|
|
181 |
min_silence_duration_ms: int
|
182 |
window_size_samples: int
|
183 |
speech_pad_ms: int
|
|
|
|
|
184 |
"""
|
185 |
A data class to use Whisper parameters.
|
186 |
"""
|
|
|
25 |
min_silence_duration_ms: gr.Number
|
26 |
window_size_sample: gr.Number
|
27 |
speech_pad_ms: gr.Number
|
28 |
+
chunk_length_s: gr.Number
|
29 |
+
batch_size: gr.Number
|
30 |
"""
|
31 |
A data class for Gradio components of the Whisper Parameters. Use "before" Gradio pre-processing.
|
32 |
+
This data class is used to mitigate the key-value problem between Gradio components and function parameters.
|
33 |
+
Related Gradio issue: https://github.com/gradio-app/gradio/issues/2471
|
34 |
See more about Gradio pre-processing: https://www.gradio.app/docs/components
|
35 |
|
36 |
Attributes
|
|
|
115 |
|
116 |
speech_pad_ms: gr.Number
|
117 |
This parameter is related with Silero VAD. Final speech chunks are padded by speech_pad_ms each side
|
118 |
+
|
119 |
+
chunk_length_s: gr.Number
|
120 |
+
This parameter is related with insanely-fast-whisper pipe.
|
121 |
+
Maximum length of each chunk
|
122 |
+
|
123 |
+
batch_size: gr.Number
|
124 |
+
This parameter is related with insanely-fast-whisper pipe. Batch size to pass to the pipe
|
125 |
"""
|
126 |
|
127 |
def to_list(self) -> list:
|
|
|
166 |
max_speech_duration_s=args[16],
|
167 |
min_silence_duration_ms=args[17],
|
168 |
window_size_samples=args[18],
|
169 |
+
speech_pad_ms=args[19],
|
170 |
+
chunk_length_s=args[20],
|
171 |
+
batch_size=args[21]
|
172 |
)
|
173 |
|
174 |
|
|
|
194 |
min_silence_duration_ms: int
|
195 |
window_size_samples: int
|
196 |
speech_pad_ms: int
|
197 |
+
chunk_length_s: int
|
198 |
+
batch_size: int
|
199 |
"""
|
200 |
A data class to use Whisper parameters.
|
201 |
"""
|