jhj0517 commited on
Commit
363657b
·
2 Parent(s): ac84c91 171d562

Merge pull request #175 from jhj0517/feature/integrate-insanely_fast_whisper

Browse files
app.py CHANGED
@@ -4,6 +4,7 @@ import argparse
4
 
5
  from modules.whisper_Inference import WhisperInference
6
  from modules.faster_whisper_inference import FasterWhisperInference
 
7
  from modules.nllb_inference import NLLBInference
8
  from ui.htmls import *
9
  from modules.youtube_manager import get_ytmetas
@@ -24,12 +25,16 @@ class App:
24
  def init_whisper(self):
25
  whisper_type = self.args.whisper_type.lower().strip()
26
 
27
- if whisper_type in ["faster_whisper", "faster-whisper"]:
28
  whisper_inf = FasterWhisperInference()
29
  whisper_inf.model_dir = self.args.faster_whisper_model_dir
30
- if whisper_type in ["whisper"]:
31
  whisper_inf = WhisperInference()
32
  whisper_inf.model_dir = self.args.whisper_model_dir
 
 
 
 
33
  else:
34
  whisper_inf = FasterWhisperInference()
35
  whisper_inf.model_dir = self.args.faster_whisper_model_dir
@@ -69,14 +74,6 @@ class App:
69
  cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
70
  with gr.Row():
71
  cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename", interactive=True)
72
- with gr.Accordion("VAD Options", open=False, visible=isinstance(self.whisper_inf, FasterWhisperInference)):
73
- cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
74
- sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
75
- nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
76
- nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
77
- nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
78
- nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
79
- nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
80
  with gr.Accordion("Advanced_Parameters", open=False):
81
  nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
82
  nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
@@ -88,6 +85,17 @@ class App:
88
  tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
89
  sd_temperature = gr.Slider(label="Temperature", value=0, step=0.01, maximum=1.0, interactive=True)
90
  nb_compression_ratio_threshold = gr.Number(label="Compression Ratio Threshold", value=2.4, interactive=True)
 
 
 
 
 
 
 
 
 
 
 
91
  with gr.Row():
92
  btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
93
  with gr.Row():
@@ -96,26 +104,28 @@ class App:
96
  btn_openfolder = gr.Button('📂', scale=1)
97
 
98
  params = [input_file, dd_file_format, cb_timestamp]
99
- whisper_params = WhisperGradioComponents(model_size=dd_model,
100
- lang=dd_lang,
101
- is_translate=cb_translate,
102
- beam_size=nb_beam_size,
103
- log_prob_threshold=nb_log_prob_threshold,
104
- no_speech_threshold=nb_no_speech_threshold,
105
- compute_type=dd_compute_type,
106
- best_of=nb_best_of,
107
- patience=nb_patience,
108
- condition_on_previous_text=cb_condition_on_previous_text,
109
- initial_prompt=tb_initial_prompt,
110
- temperature=sd_temperature,
111
- compression_ratio_threshold=nb_compression_ratio_threshold,
112
- vad_filter=cb_vad_filter,
113
- threshold=sd_threshold,
114
- min_speech_duration_ms=nb_min_speech_duration_ms,
115
- max_speech_duration_s=nb_max_speech_duration_s,
116
- min_silence_duration_ms=nb_min_silence_duration_ms,
117
- window_size_sample=nb_window_size_sample,
118
- speech_pad_ms=nb_speech_pad_ms)
 
 
119
 
120
  btn_run.click(fn=self.whisper_inf.transcribe_file,
121
  inputs=params + whisper_params.to_list(),
@@ -143,14 +153,6 @@ class App:
143
  with gr.Row():
144
  cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename",
145
  interactive=True)
146
- with gr.Accordion("VAD Options", open=False, visible=isinstance(self.whisper_inf, FasterWhisperInference)):
147
- cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
148
- sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
149
- nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
150
- nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
151
- nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
152
- nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
153
- nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
154
  with gr.Accordion("Advanced_Parameters", open=False):
155
  nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
156
  nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
@@ -162,6 +164,18 @@ class App:
162
  tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
163
  sd_temperature = gr.Slider(label="Temperature", value=0, step=0.01, maximum=1.0, interactive=True)
164
  nb_compression_ratio_threshold = gr.Number(label="Compression Ratio Threshold", value=2.4, interactive=True)
 
 
 
 
 
 
 
 
 
 
 
 
165
  with gr.Row():
166
  btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
167
  with gr.Row():
@@ -170,26 +184,29 @@ class App:
170
  btn_openfolder = gr.Button('📂', scale=1)
171
 
172
  params = [tb_youtubelink, dd_file_format, cb_timestamp]
173
- whisper_params = WhisperGradioComponents(model_size=dd_model,
174
- lang=dd_lang,
175
- is_translate=cb_translate,
176
- beam_size=nb_beam_size,
177
- log_prob_threshold=nb_log_prob_threshold,
178
- no_speech_threshold=nb_no_speech_threshold,
179
- compute_type=dd_compute_type,
180
- best_of=nb_best_of,
181
- patience=nb_patience,
182
- condition_on_previous_text=cb_condition_on_previous_text,
183
- initial_prompt=tb_initial_prompt,
184
- temperature=sd_temperature,
185
- compression_ratio_threshold=nb_compression_ratio_threshold,
186
- vad_filter=cb_vad_filter,
187
- threshold=sd_threshold,
188
- min_speech_duration_ms=nb_min_speech_duration_ms,
189
- max_speech_duration_s=nb_max_speech_duration_s,
190
- min_silence_duration_ms=nb_min_silence_duration_ms,
191
- window_size_sample=nb_window_size_sample,
192
- speech_pad_ms=nb_speech_pad_ms)
 
 
 
193
  btn_run.click(fn=self.whisper_inf.transcribe_youtube,
194
  inputs=params + whisper_params.to_list(),
195
  outputs=[tb_indicator, files_subtitles])
@@ -209,14 +226,6 @@ class App:
209
  dd_file_format = gr.Dropdown(["SRT", "WebVTT", "txt"], value="SRT", label="File Format")
210
  with gr.Row():
211
  cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
212
- with gr.Accordion("VAD Options", open=False, visible=isinstance(self.whisper_inf, FasterWhisperInference)):
213
- cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
214
- sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
215
- nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
216
- nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
217
- nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
218
- nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
219
- nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
220
  with gr.Accordion("Advanced_Parameters", open=False):
221
  nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
222
  nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
@@ -227,6 +236,18 @@ class App:
227
  cb_condition_on_previous_text = gr.Checkbox(label="Condition On Previous Text", value=True, interactive=True)
228
  tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
229
  sd_temperature = gr.Slider(label="Temperature", value=0, step=0.01, maximum=1.0, interactive=True)
 
 
 
 
 
 
 
 
 
 
 
 
230
  with gr.Row():
231
  btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
232
  with gr.Row():
@@ -235,26 +256,29 @@ class App:
235
  btn_openfolder = gr.Button('📂', scale=1)
236
 
237
  params = [mic_input, dd_file_format]
238
- whisper_params = WhisperGradioComponents(model_size=dd_model,
239
- lang=dd_lang,
240
- is_translate=cb_translate,
241
- beam_size=nb_beam_size,
242
- log_prob_threshold=nb_log_prob_threshold,
243
- no_speech_threshold=nb_no_speech_threshold,
244
- compute_type=dd_compute_type,
245
- best_of=nb_best_of,
246
- patience=nb_patience,
247
- condition_on_previous_text=cb_condition_on_previous_text,
248
- initial_prompt=tb_initial_prompt,
249
- temperature=sd_temperature,
250
- compression_ratio_threshold=nb_compression_ratio_threshold,
251
- vad_filter=cb_vad_filter,
252
- threshold=sd_threshold,
253
- min_speech_duration_ms=nb_min_speech_duration_ms,
254
- max_speech_duration_s=nb_max_speech_duration_s,
255
- min_silence_duration_ms=nb_min_silence_duration_ms,
256
- window_size_sample=nb_window_size_sample,
257
- speech_pad_ms=nb_speech_pad_ms)
 
 
 
258
  btn_run.click(fn=self.whisper_inf.transcribe_mic,
259
  inputs=params + whisper_params.to_list(),
260
  outputs=[tb_indicator, files_subtitles])
@@ -354,6 +378,7 @@ parser.add_argument('--colab', type=bool, default=False, nargs='?', const=True,
354
  parser.add_argument('--api_open', type=bool, default=False, nargs='?', const=True, help='enable api or not')
355
  parser.add_argument('--whisper_model_dir', type=str, default=os.path.join("models", "Whisper"), help='Directory path of the whisper model')
356
  parser.add_argument('--faster_whisper_model_dir', type=str, default=os.path.join("models", "Whisper", "faster-whisper"), help='Directory path of the faster-whisper model')
 
357
  _args = parser.parse_args()
358
 
359
  if __name__ == "__main__":
 
4
 
5
  from modules.whisper_Inference import WhisperInference
6
  from modules.faster_whisper_inference import FasterWhisperInference
7
+ from modules.insanely_fast_whisper_inference import InsanelyFastWhisperInference
8
  from modules.nllb_inference import NLLBInference
9
  from ui.htmls import *
10
  from modules.youtube_manager import get_ytmetas
 
25
  def init_whisper(self):
26
  whisper_type = self.args.whisper_type.lower().strip()
27
 
28
+ if whisper_type in ["faster_whisper", "faster-whisper", "fasterwhisper"]:
29
  whisper_inf = FasterWhisperInference()
30
  whisper_inf.model_dir = self.args.faster_whisper_model_dir
31
+ elif whisper_type in ["whisper"]:
32
  whisper_inf = WhisperInference()
33
  whisper_inf.model_dir = self.args.whisper_model_dir
34
+ elif whisper_type in ["insanely_fast_whisper", "insanely-fast-whisper", "insanelyfastwhisper",
35
+ "insanely_faster_whisper", "insanely-faster-whisper", "insanelyfasterwhisper"]:
36
+ whisper_inf = InsanelyFastWhisperInference()
37
+ whisper_inf.model_dir = self.args.insanely_fast_whisper_model_dir
38
  else:
39
  whisper_inf = FasterWhisperInference()
40
  whisper_inf.model_dir = self.args.faster_whisper_model_dir
 
74
  cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
75
  with gr.Row():
76
  cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename", interactive=True)
 
 
 
 
 
 
 
 
77
  with gr.Accordion("Advanced_Parameters", open=False):
78
  nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
79
  nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
 
85
  tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
86
  sd_temperature = gr.Slider(label="Temperature", value=0, step=0.01, maximum=1.0, interactive=True)
87
  nb_compression_ratio_threshold = gr.Number(label="Compression Ratio Threshold", value=2.4, interactive=True)
88
+ with gr.Accordion("VAD Options", open=False, visible=isinstance(self.whisper_inf, FasterWhisperInference)):
89
+ cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
90
+ sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
91
+ nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
92
+ nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
93
+ nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
94
+ nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
95
+ nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
96
+ with gr.Accordion("Insanely Fast Whisper Parameters", open=False, visible=isinstance(self.whisper_inf, InsanelyFastWhisperInference)):
97
+ nb_chunk_length_s = gr.Number(label="Chunk Lengths (sec)", value=30, precision=0)
98
+ nb_batch_size = gr.Number(label="Batch Size", value=24, precision=0)
99
  with gr.Row():
100
  btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
101
  with gr.Row():
 
104
  btn_openfolder = gr.Button('📂', scale=1)
105
 
106
  params = [input_file, dd_file_format, cb_timestamp]
107
+ whisper_params = WhisperParameters(model_size=dd_model,
108
+ lang=dd_lang,
109
+ is_translate=cb_translate,
110
+ beam_size=nb_beam_size,
111
+ log_prob_threshold=nb_log_prob_threshold,
112
+ no_speech_threshold=nb_no_speech_threshold,
113
+ compute_type=dd_compute_type,
114
+ best_of=nb_best_of,
115
+ patience=nb_patience,
116
+ condition_on_previous_text=cb_condition_on_previous_text,
117
+ initial_prompt=tb_initial_prompt,
118
+ temperature=sd_temperature,
119
+ compression_ratio_threshold=nb_compression_ratio_threshold,
120
+ vad_filter=cb_vad_filter,
121
+ threshold=sd_threshold,
122
+ min_speech_duration_ms=nb_min_speech_duration_ms,
123
+ max_speech_duration_s=nb_max_speech_duration_s,
124
+ min_silence_duration_ms=nb_min_silence_duration_ms,
125
+ window_size_sample=nb_window_size_sample,
126
+ speech_pad_ms=nb_speech_pad_ms,
127
+ chunk_length_s=nb_chunk_length_s,
128
+ batch_size=nb_batch_size)
129
 
130
  btn_run.click(fn=self.whisper_inf.transcribe_file,
131
  inputs=params + whisper_params.to_list(),
 
153
  with gr.Row():
154
  cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename",
155
  interactive=True)
 
 
 
 
 
 
 
 
156
  with gr.Accordion("Advanced_Parameters", open=False):
157
  nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
158
  nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
 
164
  tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
165
  sd_temperature = gr.Slider(label="Temperature", value=0, step=0.01, maximum=1.0, interactive=True)
166
  nb_compression_ratio_threshold = gr.Number(label="Compression Ratio Threshold", value=2.4, interactive=True)
167
+ with gr.Accordion("VAD Options", open=False, visible=isinstance(self.whisper_inf, FasterWhisperInference)):
168
+ cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
169
+ sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
170
+ nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
171
+ nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
172
+ nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
173
+ nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
174
+ nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
175
+ with gr.Accordion("Insanely Fast Whisper Parameters", open=False,
176
+ visible=isinstance(self.whisper_inf, InsanelyFastWhisperInference)):
177
+ nb_chunk_length_s = gr.Number(label="Chunk Lengths (sec)", value=30, precision=0)
178
+ nb_batch_size = gr.Number(label="Batch Size", value=24, precision=0)
179
  with gr.Row():
180
  btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
181
  with gr.Row():
 
184
  btn_openfolder = gr.Button('📂', scale=1)
185
 
186
  params = [tb_youtubelink, dd_file_format, cb_timestamp]
187
+ whisper_params = WhisperParameters(model_size=dd_model,
188
+ lang=dd_lang,
189
+ is_translate=cb_translate,
190
+ beam_size=nb_beam_size,
191
+ log_prob_threshold=nb_log_prob_threshold,
192
+ no_speech_threshold=nb_no_speech_threshold,
193
+ compute_type=dd_compute_type,
194
+ best_of=nb_best_of,
195
+ patience=nb_patience,
196
+ condition_on_previous_text=cb_condition_on_previous_text,
197
+ initial_prompt=tb_initial_prompt,
198
+ temperature=sd_temperature,
199
+ compression_ratio_threshold=nb_compression_ratio_threshold,
200
+ vad_filter=cb_vad_filter,
201
+ threshold=sd_threshold,
202
+ min_speech_duration_ms=nb_min_speech_duration_ms,
203
+ max_speech_duration_s=nb_max_speech_duration_s,
204
+ min_silence_duration_ms=nb_min_silence_duration_ms,
205
+ window_size_sample=nb_window_size_sample,
206
+ speech_pad_ms=nb_speech_pad_ms,
207
+ chunk_length_s=nb_chunk_length_s,
208
+ batch_size=nb_batch_size)
209
+
210
  btn_run.click(fn=self.whisper_inf.transcribe_youtube,
211
  inputs=params + whisper_params.to_list(),
212
  outputs=[tb_indicator, files_subtitles])
 
226
  dd_file_format = gr.Dropdown(["SRT", "WebVTT", "txt"], value="SRT", label="File Format")
227
  with gr.Row():
228
  cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
 
 
 
 
 
 
 
 
229
  with gr.Accordion("Advanced_Parameters", open=False):
230
  nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
231
  nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
 
236
  cb_condition_on_previous_text = gr.Checkbox(label="Condition On Previous Text", value=True, interactive=True)
237
  tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
238
  sd_temperature = gr.Slider(label="Temperature", value=0, step=0.01, maximum=1.0, interactive=True)
239
+ with gr.Accordion("VAD Options", open=False, visible=isinstance(self.whisper_inf, FasterWhisperInference)):
240
+ cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
241
+ sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
242
+ nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
243
+ nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
244
+ nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
245
+ nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
246
+ nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
247
+ with gr.Accordion("Insanely Fast Whisper Parameters", open=False,
248
+ visible=isinstance(self.whisper_inf, InsanelyFastWhisperInference)):
249
+ nb_chunk_length_s = gr.Number(label="Chunk Lengths (sec)", value=30, precision=0)
250
+ nb_batch_size = gr.Number(label="Batch Size", value=24, precision=0)
251
  with gr.Row():
252
  btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
253
  with gr.Row():
 
256
  btn_openfolder = gr.Button('📂', scale=1)
257
 
258
  params = [mic_input, dd_file_format]
259
+ whisper_params = WhisperParameters(model_size=dd_model,
260
+ lang=dd_lang,
261
+ is_translate=cb_translate,
262
+ beam_size=nb_beam_size,
263
+ log_prob_threshold=nb_log_prob_threshold,
264
+ no_speech_threshold=nb_no_speech_threshold,
265
+ compute_type=dd_compute_type,
266
+ best_of=nb_best_of,
267
+ patience=nb_patience,
268
+ condition_on_previous_text=cb_condition_on_previous_text,
269
+ initial_prompt=tb_initial_prompt,
270
+ temperature=sd_temperature,
271
+ compression_ratio_threshold=nb_compression_ratio_threshold,
272
+ vad_filter=cb_vad_filter,
273
+ threshold=sd_threshold,
274
+ min_speech_duration_ms=nb_min_speech_duration_ms,
275
+ max_speech_duration_s=nb_max_speech_duration_s,
276
+ min_silence_duration_ms=nb_min_silence_duration_ms,
277
+ window_size_sample=nb_window_size_sample,
278
+ speech_pad_ms=nb_speech_pad_ms,
279
+ chunk_length_s=nb_chunk_length_s,
280
+ batch_size=nb_batch_size)
281
+
282
  btn_run.click(fn=self.whisper_inf.transcribe_mic,
283
  inputs=params + whisper_params.to_list(),
284
  outputs=[tb_indicator, files_subtitles])
 
378
  parser.add_argument('--api_open', type=bool, default=False, nargs='?', const=True, help='enable api or not')
379
  parser.add_argument('--whisper_model_dir', type=str, default=os.path.join("models", "Whisper"), help='Directory path of the whisper model')
380
  parser.add_argument('--faster_whisper_model_dir', type=str, default=os.path.join("models", "Whisper", "faster-whisper"), help='Directory path of the faster-whisper model')
381
+ parser.add_argument('--insanely_fast_whisper_model_dir', type=str, default=os.path.join("models", "Whisper", "insanely-fast-whisper"), help='Directory path of the insanely-fast-whisper model')
382
  _args = parser.parse_args()
383
 
384
  if __name__ == "__main__":
modules/faster_whisper_inference.py CHANGED
@@ -52,7 +52,7 @@ class FasterWhisperInference(WhisperBase):
52
  """
53
  start_time = time.time()
54
 
55
- params = WhisperValues(*whisper_params)
56
 
57
  if params.model_size != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
58
  self.update_model(params.model_size, params.compute_type, progress)
 
52
  """
53
  start_time = time.time()
54
 
55
+ params = WhisperParameters.post_process(*whisper_params)
56
 
57
  if params.model_size != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
58
  self.update_model(params.model_size, params.compute_type, progress)
modules/insanely_fast_whisper_inference.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import time
3
+ import numpy as np
4
+ from typing import BinaryIO, Union, Tuple, List
5
+ import torch
6
+ from transformers import pipeline
7
+ from transformers.utils import is_flash_attn_2_available
8
+ import gradio as gr
9
+ from huggingface_hub import hf_hub_download
10
+ import whisper
11
+ from rich.progress import Progress, TimeElapsedColumn, BarColumn, TextColumn
12
+
13
+ from modules.whisper_parameter import *
14
+ from modules.whisper_base import WhisperBase
15
+
16
+
17
+ class InsanelyFastWhisperInference(WhisperBase):
18
+ def __init__(self):
19
+ super().__init__(
20
+ model_dir=os.path.join("models", "Whisper", "insanely_fast_whisper")
21
+ )
22
+ openai_models = whisper.available_models()
23
+ distil_models = ["distil-large-v2", "distil-large-v3", "distil-medium.en", "distil-small.en"]
24
+ self.available_models = openai_models + distil_models
25
+ self.available_compute_types = ["float16"]
26
+
27
+ def transcribe(self,
28
+ audio: Union[str, np.ndarray, torch.Tensor],
29
+ progress: gr.Progress,
30
+ *whisper_params,
31
+ ) -> Tuple[List[dict], float]:
32
+ """
33
+ transcribe method for insanely-fast-whisper.
34
+
35
+ Parameters
36
+ ----------
37
+ audio: Union[str, BinaryIO, np.ndarray]
38
+ Audio path or file binary or Audio numpy array
39
+ progress: gr.Progress
40
+ Indicator to show progress directly in gradio.
41
+ *whisper_params: tuple
42
+ Gradio components related to Whisper. See whisper_parameter.py for details.
43
+
44
+ Returns
45
+ ----------
46
+ segments_result: List[dict]
47
+ list of dicts that includes start, end timestamps and transcribed text
48
+ elapsed_time: float
49
+ elapsed time for transcription
50
+ """
51
+ start_time = time.time()
52
+ params = WhisperParameters.post_process(*whisper_params)
53
+
54
+ if params.model_size != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
55
+ self.update_model(params.model_size, params.compute_type, progress)
56
+
57
+ if params.lang == "Automatic Detection":
58
+ params.lang = None
59
+ else:
60
+ language_code_dict = {value: key for key, value in whisper.tokenizer.LANGUAGES.items()}
61
+ params.lang = language_code_dict[params.lang]
62
+
63
+ progress(0, desc="Transcribing...Progress is not shown in insanely-fast-whisper.")
64
+ with Progress(
65
+ TextColumn("[progress.description]{task.description}"),
66
+ BarColumn(style="yellow1", pulse_style="white"),
67
+ TimeElapsedColumn(),
68
+ ) as progress:
69
+ progress.add_task("[yellow]Transcribing...", total=None)
70
+
71
+ segments = self.model(
72
+ inputs=audio,
73
+ return_timestamps=True,
74
+ chunk_length_s=params.chunk_length_s,
75
+ batch_size=params.batch_size,
76
+ generate_kwargs={
77
+ "language": params.lang,
78
+ "task": "translate" if params.is_translate and self.current_model_size in self.translatable_models else "transcribe",
79
+ "no_speech_threshold": params.no_speech_threshold,
80
+ "temperature": params.temperature,
81
+ "compression_ratio_threshold": params.compression_ratio_threshold
82
+ }
83
+ )
84
+
85
+ segments_result = self.format_result(
86
+ transcribed_result=segments,
87
+ )
88
+ elapsed_time = time.time() - start_time
89
+ return segments_result, elapsed_time
90
+
91
+ def update_model(self,
92
+ model_size: str,
93
+ compute_type: str,
94
+ progress: gr.Progress,
95
+ ):
96
+ """
97
+ Update current model setting
98
+
99
+ Parameters
100
+ ----------
101
+ model_size: str
102
+ Size of whisper model
103
+ compute_type: str
104
+ Compute type for transcription.
105
+ see more info : https://opennmt.net/CTranslate2/quantization.html
106
+ progress: gr.Progress
107
+ Indicator to show progress directly in gradio.
108
+ """
109
+ progress(0, desc="Initializing Model..")
110
+ model_path = os.path.join(self.model_dir, model_size)
111
+ if not os.path.isdir(model_path) or not os.listdir(model_path):
112
+ self.download_model(
113
+ model_size=model_size,
114
+ download_root=model_path,
115
+ progress=progress
116
+ )
117
+
118
+ self.current_compute_type = compute_type
119
+ self.current_model_size = model_size
120
+ self.model = pipeline(
121
+ "automatic-speech-recognition",
122
+ model=os.path.join(self.model_dir, model_size),
123
+ torch_dtype=self.current_compute_type,
124
+ device=self.device,
125
+ model_kwargs={"attn_implementation": "flash_attention_2"} if is_flash_attn_2_available() else {"attn_implementation": "sdpa"},
126
+ )
127
+
128
+ @staticmethod
129
+ def format_result(
130
+ transcribed_result: dict
131
+ ) -> List[dict]:
132
+ """
133
+ Format the transcription result of insanely_fast_whisper to match the format of the other implementations.
134
+
135
+ Parameters
136
+ ----------
137
+ transcribed_result: dict
138
+ Transcription result of the insanely_fast_whisper
139
+
140
+ Returns
141
+ ----------
142
+ result: List[dict]
143
+ Formatted result in the same format as the other implementations
144
+ """
145
+ result = transcribed_result["chunks"]
146
+ for item in result:
147
+ start, end = item["timestamp"][0], item["timestamp"][1]
148
+ if end is None:
149
+ end = start
150
+ item["start"] = start
151
+ item["end"] = end
152
+ return result
153
+
154
+ @staticmethod
155
+ def download_model(
156
+ model_size: str,
157
+ download_root: str,
158
+ progress: gr.Progress
159
+ ):
160
+ progress(0, 'Initializing model..')
161
+ print(f'Downloading {model_size} to "{download_root}"....')
162
+
163
+ os.makedirs(download_root, exist_ok=True)
164
+ download_list = [
165
+ "model.safetensors",
166
+ "config.json",
167
+ "generation_config.json",
168
+ "preprocessor_config.json",
169
+ "tokenizer.json",
170
+ "tokenizer_config.json",
171
+ "added_tokens.json",
172
+ "special_tokens_map.json",
173
+ "vocab.json",
174
+ ]
175
+
176
+ if model_size.startswith("distil"):
177
+ repo_id = f"distil-whisper/{model_size}"
178
+ else:
179
+ repo_id = f"openai/whisper-{model_size}"
180
+ for item in download_list:
181
+ hf_hub_download(repo_id=repo_id, filename=item, local_dir=download_root)
modules/whisper_Inference.py CHANGED
@@ -41,7 +41,7 @@ class WhisperInference(WhisperBase):
41
  elapsed time for transcription
42
  """
43
  start_time = time.time()
44
- params = WhisperValues(*whisper_params)
45
 
46
  if params.model_size != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
47
  self.update_model(params.model_size, params.compute_type, progress)
 
41
  elapsed time for transcription
42
  """
43
  start_time = time.time()
44
+ params = WhisperParameters.post_process(*whisper_params)
45
 
46
  if params.model_size != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
47
  self.update_model(params.model_size, params.compute_type, progress)
modules/whisper_parameter.py CHANGED
@@ -4,7 +4,7 @@ from typing import Optional
4
 
5
 
6
  @dataclass
7
- class WhisperGradioComponents:
8
  model_size: gr.Dropdown
9
  lang: gr.Dropdown
10
  is_translate: gr.Checkbox
@@ -25,8 +25,12 @@ class WhisperGradioComponents:
25
  min_silence_duration_ms: gr.Number
26
  window_size_sample: gr.Number
27
  speech_pad_ms: gr.Number
 
 
28
  """
29
  A data class for Gradio components of the Whisper Parameters. Use "before" Gradio pre-processing.
 
 
30
  See more about Gradio pre-processing: https://www.gradio.app/docs/components
31
 
32
  Attributes
@@ -111,11 +115,18 @@ class WhisperGradioComponents:
111
 
112
  speech_pad_ms: gr.Number
113
  This parameter is related with Silero VAD. Final speech chunks are padded by speech_pad_ms each side
 
 
 
 
 
 
 
114
  """
115
 
116
  def to_list(self) -> list:
117
  """
118
- Converts the data class attributes into a list. Use "before" Gradio pre-processing.
119
  See more about Gradio pre-processing: : https://www.gradio.app/docs/components
120
 
121
  Returns
@@ -124,6 +135,42 @@ class WhisperGradioComponents:
124
  """
125
  return [getattr(self, f.name) for f in fields(self)]
126
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
 
128
  @dataclass
129
  class WhisperValues:
@@ -147,7 +194,8 @@ class WhisperValues:
147
  min_silence_duration_ms: int
148
  window_size_samples: int
149
  speech_pad_ms: int
 
 
150
  """
151
- A data class to use Whisper parameters. Use "after" Gradio pre-processing.
152
- See more about Gradio pre-processing: : https://www.gradio.app/docs/components
153
  """
 
4
 
5
 
6
  @dataclass
7
+ class WhisperParameters:
8
  model_size: gr.Dropdown
9
  lang: gr.Dropdown
10
  is_translate: gr.Checkbox
 
25
  min_silence_duration_ms: gr.Number
26
  window_size_sample: gr.Number
27
  speech_pad_ms: gr.Number
28
+ chunk_length_s: gr.Number
29
+ batch_size: gr.Number
30
  """
31
  A data class for Gradio components of the Whisper Parameters. Use "before" Gradio pre-processing.
32
+ This data class is used to mitigate the key-value problem between Gradio components and function parameters.
33
+ Related Gradio issue: https://github.com/gradio-app/gradio/issues/2471
34
  See more about Gradio pre-processing: https://www.gradio.app/docs/components
35
 
36
  Attributes
 
115
 
116
  speech_pad_ms: gr.Number
117
  This parameter is related with Silero VAD. Final speech chunks are padded by speech_pad_ms each side
118
+
119
+ chunk_length_s: gr.Number
120
+ This parameter is related to the insanely-fast-whisper pipe.
121
+ Maximum length of each chunk
122
+
123
+ batch_size: gr.Number
124
+ This parameter is related to the insanely-fast-whisper pipe. Batch size to pass to the pipe.
125
  """
126
 
127
  def to_list(self) -> list:
128
  """
129
+ Converts the data class attributes into a list. Use it in the Gradio UI before Gradio pre-processing.
130
  See more about Gradio pre-processing: : https://www.gradio.app/docs/components
131
 
132
  Returns
 
135
  """
136
  return [getattr(self, f.name) for f in fields(self)]
137
 
138
+ @staticmethod
139
+ def post_process(*args) -> 'WhisperValues':
140
+ """
141
+ Build usable Whisper parameter values for a function after Gradio post-processing.
142
+ See more about Gradio post-processing: : https://www.gradio.app/docs/components
143
+
144
+ Returns
145
+ ----------
146
+ WhisperValues
147
+ Data class that has values of parameters
148
+ """
149
+ return WhisperValues(
150
+ model_size=args[0],
151
+ lang=args[1],
152
+ is_translate=args[2],
153
+ beam_size=args[3],
154
+ log_prob_threshold=args[4],
155
+ no_speech_threshold=args[5],
156
+ compute_type=args[6],
157
+ best_of=args[7],
158
+ patience=args[8],
159
+ condition_on_previous_text=args[9],
160
+ initial_prompt=args[10],
161
+ temperature=args[11],
162
+ compression_ratio_threshold=args[12],
163
+ vad_filter=args[13],
164
+ threshold=args[14],
165
+ min_speech_duration_ms=args[15],
166
+ max_speech_duration_s=args[16],
167
+ min_silence_duration_ms=args[17],
168
+ window_size_samples=args[18],
169
+ speech_pad_ms=args[19],
170
+ chunk_length_s=args[20],
171
+ batch_size=args[21]
172
+ )
173
+
174
 
175
  @dataclass
176
  class WhisperValues:
 
194
  min_silence_duration_ms: int
195
  window_size_samples: int
196
  speech_pad_ms: int
197
+ chunk_length_s: int
198
+ batch_size: int
199
  """
200
+ A data class to use Whisper parameters.
 
201
  """
user-start-webui.bat CHANGED
@@ -12,6 +12,7 @@ set API_OPEN=
12
  set WHISPER_TYPE=
13
  set WHISPER_MODEL_DIR=
14
  set FASTER_WHISPER_MODEL_DIR=
 
15
 
16
 
17
  if not "%SERVER_NAME%"=="" (
@@ -47,7 +48,10 @@ if not "%WHISPER_MODEL_DIR%"=="" (
47
  if not "%FASTER_WHISPER_MODEL_DIR%"=="" (
48
  set FASTER_WHISPER_MODEL_DIR_ARG=--faster_whisper_model_dir "%FASTER_WHISPER_MODEL_DIR%"
49
  )
 
 
 
50
 
51
  :: Call the original .bat script with optional arguments
52
- start-webui.bat %SERVER_NAME_ARG% %SERVER_PORT_ARG% %USERNAME_ARG% %PASSWORD_ARG% %SHARE_ARG% %THEME_ARG% %API_OPEN% %WHISPER_TYPE_ARG% %WHISPER_MODEL_DIR_ARG% %FASTER_WHISPER_MODEL_DIR_ARG%
53
  pause
 
12
  set WHISPER_TYPE=
13
  set WHISPER_MODEL_DIR=
14
  set FASTER_WHISPER_MODEL_DIR=
15
+ set INSANELY_FAST_WHISPER_MODEL_DIR=
16
 
17
 
18
  if not "%SERVER_NAME%"=="" (
 
48
  if not "%FASTER_WHISPER_MODEL_DIR%"=="" (
49
  set FASTER_WHISPER_MODEL_DIR_ARG=--faster_whisper_model_dir "%FASTER_WHISPER_MODEL_DIR%"
50
  )
51
+ if not "%INSANELY_FAST_WHISPER_MODEL_DIR%"=="" (
52
+ set INSANELY_FAST_WHISPER_MODEL_DIR_ARG=--insanely_fast_whisper_model_dir "%INSANELY_FAST_WHISPER_MODEL_DIR%"
53
+ )
54
 
55
  :: Call the original .bat script with optional arguments
56
+ start-webui.bat %SERVER_NAME_ARG% %SERVER_PORT_ARG% %USERNAME_ARG% %PASSWORD_ARG% %SHARE_ARG% %THEME_ARG% %API_OPEN% %WHISPER_TYPE_ARG% %WHISPER_MODEL_DIR_ARG% %FASTER_WHISPER_MODEL_DIR_ARG% %INSANELY_FAST_WHISPER_MODEL_DIR_ARG%
57
  pause