jhj0517 committed on
Commit
c5dc9f3
·
2 Parent(s): 75fca27 8615b10

Merge pull request #181 from jhj0517/feature/integrate-whisperx

app.py CHANGED
@@ -1,15 +1,14 @@
1
- import gradio as gr
2
  import os
3
  import argparse
4
 
5
- from modules.whisper_Inference import WhisperInference
6
- from modules.faster_whisper_inference import FasterWhisperInference
7
- from modules.insanely_fast_whisper_inference import InsanelyFastWhisperInference
8
- from modules.nllb_inference import NLLBInference
9
  from ui.htmls import *
10
- from modules.youtube_manager import get_ytmetas
11
- from modules.deepl_api import DeepLAPI
12
- from modules.whisper_parameter import *
13
 
14
 
15
  class App:
@@ -28,28 +27,35 @@ class App:
28
  )
29
 
30
  def init_whisper(self):
31
  whisper_type = self.args.whisper_type.lower().strip()
32
 
33
  if whisper_type in ["faster_whisper", "faster-whisper", "fasterwhisper"]:
34
  whisper_inf = FasterWhisperInference(
35
  model_dir=self.args.faster_whisper_model_dir,
36
- output_dir=self.args.output_dir
 
37
  )
38
  elif whisper_type in ["whisper"]:
39
  whisper_inf = WhisperInference(
40
  model_dir=self.args.whisper_model_dir,
41
- output_dir=self.args.output_dir
 
42
  )
43
  elif whisper_type in ["insanely_fast_whisper", "insanely-fast-whisper", "insanelyfastwhisper",
44
  "insanely_faster_whisper", "insanely-faster-whisper", "insanelyfasterwhisper"]:
45
  whisper_inf = InsanelyFastWhisperInference(
46
  model_dir=self.args.insanely_fast_whisper_model_dir,
47
- output_dir=self.args.output_dir
 
48
  )
49
  else:
50
  whisper_inf = FasterWhisperInference(
51
  model_dir=self.args.faster_whisper_model_dir,
52
- output_dir=self.args.output_dir
 
53
  )
54
  return whisper_inf
55
 
@@ -87,7 +93,7 @@ class App:
87
  cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
88
  with gr.Row():
89
  cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename", interactive=True)
90
- with gr.Accordion("Advanced_Parameters", open=False):
91
  nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
92
  nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
93
  nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
@@ -98,14 +104,20 @@ class App:
98
  tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
99
  sd_temperature = gr.Slider(label="Temperature", value=0, step=0.01, maximum=1.0, interactive=True)
100
  nb_compression_ratio_threshold = gr.Number(label="Compression Ratio Threshold", value=2.4, interactive=True)
101
- with gr.Accordion("VAD Options", open=False, visible=isinstance(self.whisper_inf, FasterWhisperInference)):
102
  cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
103
- sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
104
  nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
105
  nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
106
  nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
107
  nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
108
  nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
109
  with gr.Accordion("Insanely Fast Whisper Parameters", open=False, visible=isinstance(self.whisper_inf, InsanelyFastWhisperInference)):
110
  nb_chunk_length_s = gr.Number(label="Chunk Lengths (sec)", value=30, precision=0)
111
  nb_batch_size = gr.Number(label="Batch Size", value=24, precision=0)
@@ -138,10 +150,13 @@ class App:
138
  window_size_sample=nb_window_size_sample,
139
  speech_pad_ms=nb_speech_pad_ms,
140
  chunk_length_s=nb_chunk_length_s,
141
- batch_size=nb_batch_size)
142
 
143
  btn_run.click(fn=self.whisper_inf.transcribe_file,
144
- inputs=params + whisper_params.to_list(),
145
  outputs=[tb_indicator, files_subtitles])
146
  btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
147
  dd_model.change(fn=self.on_change_models, inputs=[dd_model], outputs=[cb_translate])
@@ -166,7 +181,7 @@ class App:
166
  with gr.Row():
167
  cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename",
168
  interactive=True)
169
- with gr.Accordion("Advanced_Parameters", open=False):
170
  nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
171
  nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
172
  nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
@@ -177,14 +192,20 @@ class App:
177
  tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
178
  sd_temperature = gr.Slider(label="Temperature", value=0, step=0.01, maximum=1.0, interactive=True)
179
  nb_compression_ratio_threshold = gr.Number(label="Compression Ratio Threshold", value=2.4, interactive=True)
180
- with gr.Accordion("VAD Options", open=False, visible=isinstance(self.whisper_inf, FasterWhisperInference)):
181
  cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
182
- sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
183
  nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
184
  nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
185
  nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
186
  nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
187
  nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
188
  with gr.Accordion("Insanely Fast Whisper Parameters", open=False,
189
  visible=isinstance(self.whisper_inf, InsanelyFastWhisperInference)):
190
  nb_chunk_length_s = gr.Number(label="Chunk Lengths (sec)", value=30, precision=0)
@@ -218,10 +239,13 @@ class App:
218
  window_size_sample=nb_window_size_sample,
219
  speech_pad_ms=nb_speech_pad_ms,
220
  chunk_length_s=nb_chunk_length_s,
221
- batch_size=nb_batch_size)
222
 
223
  btn_run.click(fn=self.whisper_inf.transcribe_youtube,
224
- inputs=params + whisper_params.to_list(),
225
  outputs=[tb_indicator, files_subtitles])
226
  tb_youtubelink.change(get_ytmetas, inputs=[tb_youtubelink],
227
  outputs=[img_thumbnail, tb_title, tb_description])
@@ -239,7 +263,7 @@ class App:
239
  dd_file_format = gr.Dropdown(["SRT", "WebVTT", "txt"], value="SRT", label="File Format")
240
  with gr.Row():
241
  cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
242
- with gr.Accordion("Advanced_Parameters", open=False):
243
  nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
244
  nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
245
  nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
@@ -249,14 +273,22 @@ class App:
249
  cb_condition_on_previous_text = gr.Checkbox(label="Condition On Previous Text", value=True, interactive=True)
250
  tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
251
  sd_temperature = gr.Slider(label="Temperature", value=0, step=0.01, maximum=1.0, interactive=True)
252
- with gr.Accordion("VAD Options", open=False, visible=isinstance(self.whisper_inf, FasterWhisperInference)):
253
  cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
254
- sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
255
  nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
256
  nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
257
  nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
258
  nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
259
  nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
260
  with gr.Accordion("Insanely Fast Whisper Parameters", open=False,
261
  visible=isinstance(self.whisper_inf, InsanelyFastWhisperInference)):
262
  nb_chunk_length_s = gr.Number(label="Chunk Lengths (sec)", value=30, precision=0)
@@ -290,10 +322,13 @@ class App:
290
  window_size_sample=nb_window_size_sample,
291
  speech_pad_ms=nb_speech_pad_ms,
292
  chunk_length_s=nb_chunk_length_s,
293
- batch_size=nb_batch_size)
294
 
295
  btn_run.click(fn=self.whisper_inf.transcribe_mic,
296
- inputs=params + whisper_params.to_list(),
297
  outputs=[tb_indicator, files_subtitles])
298
  btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
299
  dd_model.change(fn=self.on_change_models, inputs=[dd_model], outputs=[cb_translate])
@@ -392,6 +427,7 @@ parser.add_argument('--api_open', type=bool, default=False, nargs='?', const=Tru
392
  parser.add_argument('--whisper_model_dir', type=str, default=os.path.join("models", "Whisper"), help='Directory path of the whisper model')
393
  parser.add_argument('--faster_whisper_model_dir', type=str, default=os.path.join("models", "Whisper", "faster-whisper"), help='Directory path of the faster-whisper model')
394
  parser.add_argument('--insanely_fast_whisper_model_dir', type=str, default=os.path.join("models", "Whisper", "insanely-fast-whisper"), help='Directory path of the insanely-fast-whisper model')
 
395
  parser.add_argument('--nllb_model_dir', type=str, default=os.path.join("models", "NLLB"), help='Directory path of the Facebook NLLB model')
396
  parser.add_argument('--output_dir', type=str, default=os.path.join("outputs"), help='Directory path of the outputs')
397
  _args = parser.parse_args()
 
 
1
  import os
2
  import argparse
3
 
4
+ from modules.whisper.whisper_Inference import WhisperInference
5
+ from modules.whisper.faster_whisper_inference import FasterWhisperInference
6
+ from modules.whisper.insanely_fast_whisper_inference import InsanelyFastWhisperInference
7
+ from modules.translation.nllb_inference import NLLBInference
8
  from ui.htmls import *
9
+ from modules.utils.youtube_manager import get_ytmetas
10
+ from modules.translation.deepl_api import DeepLAPI
11
+ from modules.whisper.whisper_parameter import *
12
 
13
 
14
  class App:
 
27
  )
28
 
29
  def init_whisper(self):
30
+ # Temporal fix of the issue : https://github.com/jhj0517/Whisper-WebUI/issues/144
31
+ os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
32
+
33
  whisper_type = self.args.whisper_type.lower().strip()
34
 
35
  if whisper_type in ["faster_whisper", "faster-whisper", "fasterwhisper"]:
36
  whisper_inf = FasterWhisperInference(
37
  model_dir=self.args.faster_whisper_model_dir,
38
+ output_dir=self.args.output_dir,
39
+ args=self.args
40
  )
41
  elif whisper_type in ["whisper"]:
42
  whisper_inf = WhisperInference(
43
  model_dir=self.args.whisper_model_dir,
44
+ output_dir=self.args.output_dir,
45
+ args=self.args
46
  )
47
  elif whisper_type in ["insanely_fast_whisper", "insanely-fast-whisper", "insanelyfastwhisper",
48
  "insanely_faster_whisper", "insanely-faster-whisper", "insanelyfasterwhisper"]:
49
  whisper_inf = InsanelyFastWhisperInference(
50
  model_dir=self.args.insanely_fast_whisper_model_dir,
51
+ output_dir=self.args.output_dir,
52
+ args=self.args
53
  )
54
  else:
55
  whisper_inf = FasterWhisperInference(
56
  model_dir=self.args.faster_whisper_model_dir,
57
+ output_dir=self.args.output_dir,
58
+ args=self.args
59
  )
60
  return whisper_inf
61
 
 
93
  cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
94
  with gr.Row():
95
  cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename", interactive=True)
96
+ with gr.Accordion("Advanced Parameters", open=False):
97
  nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
98
  nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
99
  nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
 
104
  tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
105
  sd_temperature = gr.Slider(label="Temperature", value=0, step=0.01, maximum=1.0, interactive=True)
106
  nb_compression_ratio_threshold = gr.Number(label="Compression Ratio Threshold", value=2.4, interactive=True)
107
+ with gr.Accordion("VAD", open=False, visible=isinstance(self.whisper_inf, FasterWhisperInference)):
108
  cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
109
+ sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5, info="Lower it to be more sensitive to small sounds.")
110
  nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
111
  nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
112
  nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
113
  nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
114
  nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
115
+ with gr.Accordion("Diarization", open=False):
116
+ cb_diarize = gr.Checkbox(label="Enable Diarization")
117
+ tb_hf_token = gr.Text(label="HuggingFace Token", value="",
118
+ info="This is only needed the first time you download the model. If you already have models, you don't need to enter. "
119
+ "To download the model, you must manually go to \"https://huggingface.co/pyannote/speaker-diarization-3.1\" and agree to their requirement.")
120
+ dd_diarization_device = gr.Dropdown(label="Device", choices=self.whisper_inf.diarizer.get_available_device(), value=self.whisper_inf.diarizer.get_device())
121
  with gr.Accordion("Insanely Fast Whisper Parameters", open=False, visible=isinstance(self.whisper_inf, InsanelyFastWhisperInference)):
122
  nb_chunk_length_s = gr.Number(label="Chunk Lengths (sec)", value=30, precision=0)
123
  nb_batch_size = gr.Number(label="Batch Size", value=24, precision=0)
 
150
  window_size_sample=nb_window_size_sample,
151
  speech_pad_ms=nb_speech_pad_ms,
152
  chunk_length_s=nb_chunk_length_s,
153
+ batch_size=nb_batch_size,
154
+ is_diarize=cb_diarize,
155
+ hf_token=tb_hf_token,
156
+ diarization_device=dd_diarization_device)
157
 
158
  btn_run.click(fn=self.whisper_inf.transcribe_file,
159
+ inputs=params + whisper_params.as_list(),
160
  outputs=[tb_indicator, files_subtitles])
161
  btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
162
  dd_model.change(fn=self.on_change_models, inputs=[dd_model], outputs=[cb_translate])
 
181
  with gr.Row():
182
  cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename",
183
  interactive=True)
184
+ with gr.Accordion("Advanced Parameters", open=False):
185
  nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
186
  nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
187
  nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
 
192
  tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
193
  sd_temperature = gr.Slider(label="Temperature", value=0, step=0.01, maximum=1.0, interactive=True)
194
  nb_compression_ratio_threshold = gr.Number(label="Compression Ratio Threshold", value=2.4, interactive=True)
195
+ with gr.Accordion("VAD", open=False, visible=isinstance(self.whisper_inf, FasterWhisperInference)):
196
  cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
197
+ sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5, info="Lower it to be more sensitive to small sounds.")
198
  nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
199
  nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
200
  nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
201
  nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
202
  nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
203
+ with gr.Accordion("Diarization", open=False):
204
+ cb_diarize = gr.Checkbox(label="Enable Diarization")
205
+ tb_hf_token = gr.Text(label="HuggingFace Token", value="",
206
+ info="This is only needed the first time you download the model. If you already have models, you don't need to enter. "
207
+ "To download the model, you must manually go to \"https://huggingface.co/pyannote/speaker-diarization-3.1\" and agree to their requirement.")
208
+ dd_diarization_device = gr.Dropdown(label="Device", choices=self.whisper_inf.diarizer.get_available_device(), value=self.whisper_inf.diarizer.get_device())
209
  with gr.Accordion("Insanely Fast Whisper Parameters", open=False,
210
  visible=isinstance(self.whisper_inf, InsanelyFastWhisperInference)):
211
  nb_chunk_length_s = gr.Number(label="Chunk Lengths (sec)", value=30, precision=0)
 
239
  window_size_sample=nb_window_size_sample,
240
  speech_pad_ms=nb_speech_pad_ms,
241
  chunk_length_s=nb_chunk_length_s,
242
+ batch_size=nb_batch_size,
243
+ is_diarize=cb_diarize,
244
+ hf_token=tb_hf_token,
245
+ diarization_device=dd_diarization_device)
246
 
247
  btn_run.click(fn=self.whisper_inf.transcribe_youtube,
248
+ inputs=params + whisper_params.as_list(),
249
  outputs=[tb_indicator, files_subtitles])
250
  tb_youtubelink.change(get_ytmetas, inputs=[tb_youtubelink],
251
  outputs=[img_thumbnail, tb_title, tb_description])
 
263
  dd_file_format = gr.Dropdown(["SRT", "WebVTT", "txt"], value="SRT", label="File Format")
264
  with gr.Row():
265
  cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
266
+ with gr.Accordion("Advanced Parameters", open=False):
267
  nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
268
  nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
269
  nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
 
273
  cb_condition_on_previous_text = gr.Checkbox(label="Condition On Previous Text", value=True, interactive=True)
274
  tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
275
  sd_temperature = gr.Slider(label="Temperature", value=0, step=0.01, maximum=1.0, interactive=True)
276
+ with gr.Accordion("VAD", open=False, visible=isinstance(self.whisper_inf, FasterWhisperInference)):
277
  cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
278
+ sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5, info="Lower it to be more sensitive to small sounds.")
279
  nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
280
  nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
281
  nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
282
  nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
283
  nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
284
+ with gr.Accordion("Diarization", open=False):
285
+ cb_diarize = gr.Checkbox(label="Enable Diarization")
286
+ tb_hf_token = gr.Text(label="HuggingFace Token", value="",
287
+ info="This is only needed the first time you download the model. If you already have models, you don't need to enter. "
288
+ "To download the model, you must manually go to \"https://huggingface.co/pyannote/speaker-diarization-3.1\" and agree to their requirement.")
289
+ dd_diarization_device = gr.Dropdown(label="Device",
290
+ choices=self.whisper_inf.diarizer.get_available_device(),
291
+ value=self.whisper_inf.diarizer.get_device())
292
  with gr.Accordion("Insanely Fast Whisper Parameters", open=False,
293
  visible=isinstance(self.whisper_inf, InsanelyFastWhisperInference)):
294
  nb_chunk_length_s = gr.Number(label="Chunk Lengths (sec)", value=30, precision=0)
 
322
  window_size_sample=nb_window_size_sample,
323
  speech_pad_ms=nb_speech_pad_ms,
324
  chunk_length_s=nb_chunk_length_s,
325
+ batch_size=nb_batch_size,
326
+ is_diarize=cb_diarize,
327
+ hf_token=tb_hf_token,
328
+ diarization_device=dd_diarization_device)
329
 
330
  btn_run.click(fn=self.whisper_inf.transcribe_mic,
331
+ inputs=params + whisper_params.as_list(),
332
  outputs=[tb_indicator, files_subtitles])
333
  btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
334
  dd_model.change(fn=self.on_change_models, inputs=[dd_model], outputs=[cb_translate])
 
427
  parser.add_argument('--whisper_model_dir', type=str, default=os.path.join("models", "Whisper"), help='Directory path of the whisper model')
428
  parser.add_argument('--faster_whisper_model_dir', type=str, default=os.path.join("models", "Whisper", "faster-whisper"), help='Directory path of the faster-whisper model')
429
  parser.add_argument('--insanely_fast_whisper_model_dir', type=str, default=os.path.join("models", "Whisper", "insanely-fast-whisper"), help='Directory path of the insanely-fast-whisper model')
430
+ parser.add_argument('--diarization_model_dir', type=str, default=os.path.join("models", "Diarization"), help='Directory path of the diarization model')
431
  parser.add_argument('--nllb_model_dir', type=str, default=os.path.join("models", "NLLB"), help='Directory path of the Facebook NLLB model')
432
  parser.add_argument('--output_dir', type=str, default=os.path.join("outputs"), help='Directory path of the outputs')
433
  _args = parser.parse_args()
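With init_whisper now passing the parsed arguments through, every inference class receives the Namespace that carries the new --diarization_model_dir default. A minimal construction sketch, assuming defaults equivalent to the parser above (the Namespace below only lists the fields this diff visibly uses; app.py passes the full parse_args() result):

from argparse import Namespace
from modules.whisper.faster_whisper_inference import FasterWhisperInference

# Hypothetical stand-in for parser.parse_args(); the real app passes the full Namespace.
args = Namespace(
    faster_whisper_model_dir="models/Whisper/faster-whisper",
    diarization_model_dir="models/Diarization",
    output_dir="outputs",
)

whisper_inf = FasterWhisperInference(
    model_dir=args.faster_whisper_model_dir,
    output_dir=args.output_dir,
    args=args,
)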
modules/diarize/__init__.py ADDED
File without changes
modules/diarize/audio_loader.py ADDED
@@ -0,0 +1,161 @@
1
+ import os
2
+ import subprocess
3
+ from functools import lru_cache
4
+ from typing import Optional, Union
5
+
6
+ import numpy as np
7
+ import torch
8
+ import torch.nn.functional as F
9
+
10
+ def exact_div(x, y):
11
+ assert x % y == 0
12
+ return x // y
13
+
14
+ # hard-coded audio hyperparameters
15
+ SAMPLE_RATE = 16000
16
+ N_FFT = 400
17
+ HOP_LENGTH = 160
18
+ CHUNK_LENGTH = 30
19
+ N_SAMPLES = CHUNK_LENGTH * SAMPLE_RATE # 480000 samples in a 30-second chunk
20
+ N_FRAMES = exact_div(N_SAMPLES, HOP_LENGTH) # 3000 frames in a mel spectrogram input
21
+
22
+ N_SAMPLES_PER_TOKEN = HOP_LENGTH * 2 # the initial convolutions has stride 2
23
+ FRAMES_PER_SECOND = exact_div(SAMPLE_RATE, HOP_LENGTH) # 10ms per audio frame
24
+ TOKENS_PER_SECOND = exact_div(SAMPLE_RATE, N_SAMPLES_PER_TOKEN) # 20ms per audio token
25
+
26
+
27
+ def load_audio(file: str, sr: int = SAMPLE_RATE):
28
+ """
29
+ Open an audio file and read as mono waveform, resampling as necessary
30
+
31
+ Parameters
32
+ ----------
33
+ file: str
34
+ The audio file to open
35
+
36
+ sr: int
37
+ The sample rate to resample the audio if necessary
38
+
39
+ Returns
40
+ -------
41
+ A NumPy array containing the audio waveform, in float32 dtype.
42
+ """
43
+ try:
44
+ # Launches a subprocess to decode audio while down-mixing and resampling as necessary.
45
+ # Requires the ffmpeg CLI to be installed.
46
+ cmd = [
47
+ "ffmpeg",
48
+ "-nostdin",
49
+ "-threads",
50
+ "0",
51
+ "-i",
52
+ file,
53
+ "-f",
54
+ "s16le",
55
+ "-ac",
56
+ "1",
57
+ "-acodec",
58
+ "pcm_s16le",
59
+ "-ar",
60
+ str(sr),
61
+ "-",
62
+ ]
63
+ out = subprocess.run(cmd, capture_output=True, check=True).stdout
64
+ except subprocess.CalledProcessError as e:
65
+ raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
66
+
67
+ return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
68
+
69
+
70
+ def pad_or_trim(array, length: int = N_SAMPLES, *, axis: int = -1):
71
+ """
72
+ Pad or trim the audio array to N_SAMPLES, as expected by the encoder.
73
+ """
74
+ if torch.is_tensor(array):
75
+ if array.shape[axis] > length:
76
+ array = array.index_select(
77
+ dim=axis, index=torch.arange(length, device=array.device)
78
+ )
79
+
80
+ if array.shape[axis] < length:
81
+ pad_widths = [(0, 0)] * array.ndim
82
+ pad_widths[axis] = (0, length - array.shape[axis])
83
+ array = F.pad(array, [pad for sizes in pad_widths[::-1] for pad in sizes])
84
+ else:
85
+ if array.shape[axis] > length:
86
+ array = array.take(indices=range(length), axis=axis)
87
+
88
+ if array.shape[axis] < length:
89
+ pad_widths = [(0, 0)] * array.ndim
90
+ pad_widths[axis] = (0, length - array.shape[axis])
91
+ array = np.pad(array, pad_widths)
92
+
93
+ return array
94
+
95
+
96
+ @lru_cache(maxsize=None)
97
+ def mel_filters(device, n_mels: int) -> torch.Tensor:
98
+ """
99
+ load the mel filterbank matrix for projecting STFT into a Mel spectrogram.
100
+ Allows decoupling librosa dependency; saved using:
101
+
102
+ np.savez_compressed(
103
+ "mel_filters.npz",
104
+ mel_80=librosa.filters.mel(sr=16000, n_fft=400, n_mels=80),
105
+ )
106
+ """
107
+ assert n_mels in [80, 128], f"Unsupported n_mels: {n_mels}"
108
+ with np.load(
109
+ os.path.join(os.path.dirname(__file__), "assets", "mel_filters.npz")
110
+ ) as f:
111
+ return torch.from_numpy(f[f"mel_{n_mels}"]).to(device)
112
+
113
+
114
+ def log_mel_spectrogram(
115
+ audio: Union[str, np.ndarray, torch.Tensor],
116
+ n_mels: int,
117
+ padding: int = 0,
118
+ device: Optional[Union[str, torch.device]] = None,
119
+ ):
120
+ """
121
+ Compute the log-Mel spectrogram of
122
+
123
+ Parameters
124
+ ----------
125
+ audio: Union[str, np.ndarray, torch.Tensor], shape = (*)
126
+ The path to audio or either a NumPy array or Tensor containing the audio waveform in 16 kHz
127
+
128
+ n_mels: int
129
+ The number of Mel-frequency filters, only 80 is supported
130
+
131
+ padding: int
132
+ Number of zero samples to pad to the right
133
+
134
+ device: Optional[Union[str, torch.device]]
135
+ If given, the audio tensor is moved to this device before STFT
136
+
137
+ Returns
138
+ -------
139
+ torch.Tensor, shape = (80, n_frames)
140
+ A Tensor that contains the Mel spectrogram
141
+ """
142
+ if not torch.is_tensor(audio):
143
+ if isinstance(audio, str):
144
+ audio = load_audio(audio)
145
+ audio = torch.from_numpy(audio)
146
+
147
+ if device is not None:
148
+ audio = audio.to(device)
149
+ if padding > 0:
150
+ audio = F.pad(audio, (0, padding))
151
+ window = torch.hann_window(N_FFT).to(audio.device)
152
+ stft = torch.stft(audio, N_FFT, HOP_LENGTH, window=window, return_complex=True)
153
+ magnitudes = stft[..., :-1].abs() ** 2
154
+
155
+ filters = mel_filters(audio.device, n_mels)
156
+ mel_spec = filters @ magnitudes
157
+
158
+ log_spec = torch.clamp(mel_spec, min=1e-10).log10()
159
+ log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
160
+ log_spec = (log_spec + 4.0) / 4.0
161
+ return log_spec
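A short usage sketch of the loader above, assuming the ffmpeg CLI is installed, a local file named sample.wav exists, and the mel_filters.npz asset referenced by mel_filters() is present alongside this module:

from modules.diarize.audio_loader import load_audio, log_mel_spectrogram, SAMPLE_RATE

# Decode to 16 kHz mono float32 by shelling out to ffmpeg.
waveform = load_audio("sample.wav", sr=SAMPLE_RATE)

# Whisper-style log-Mel spectrogram; n_mels must be 80 or 128.
mel = log_mel_spectrogram(waveform, n_mels=80)
print(mel.shape)  # torch.Size([80, n_frames])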
modules/diarize/diarize_pipeline.py ADDED
@@ -0,0 +1,92 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ import os
4
+ from pyannote.audio import Pipeline
5
+ from typing import Optional, Union
6
+ import torch
7
+
8
+ from modules.diarize.audio_loader import load_audio, SAMPLE_RATE
9
+
10
+
11
+ class DiarizationPipeline:
12
+ def __init__(
13
+ self,
14
+ model_name="pyannote/speaker-diarization-3.1",
15
+ cache_dir: str = os.path.join("models", "Diarization"),
16
+ use_auth_token=None,
17
+ device: Optional[Union[str, torch.device]] = "cpu",
18
+ ):
19
+ if isinstance(device, str):
20
+ device = torch.device(device)
21
+ self.model = Pipeline.from_pretrained(
22
+ model_name,
23
+ use_auth_token=use_auth_token,
24
+ cache_dir=cache_dir
25
+ ).to(device)
26
+
27
+ def __call__(self, audio: Union[str, np.ndarray], min_speakers=None, max_speakers=None):
28
+ if isinstance(audio, str):
29
+ audio = load_audio(audio)
30
+ audio_data = {
31
+ 'waveform': torch.from_numpy(audio[None, :]),
32
+ 'sample_rate': SAMPLE_RATE
33
+ }
34
+ segments = self.model(audio_data, min_speakers=min_speakers, max_speakers=max_speakers)
35
+ diarize_df = pd.DataFrame(segments.itertracks(yield_label=True), columns=['segment', 'label', 'speaker'])
36
+ diarize_df['start'] = diarize_df['segment'].apply(lambda x: x.start)
37
+ diarize_df['end'] = diarize_df['segment'].apply(lambda x: x.end)
38
+ return diarize_df
39
+
40
+
41
+ def assign_word_speakers(diarize_df, transcript_result, fill_nearest=False):
42
+ transcript_segments = transcript_result["segments"]
43
+ for seg in transcript_segments:
44
+ # assign speaker to segment (if any)
45
+ diarize_df['intersection'] = np.minimum(diarize_df['end'], seg['end']) - np.maximum(diarize_df['start'],
46
+ seg['start'])
47
+ diarize_df['union'] = np.maximum(diarize_df['end'], seg['end']) - np.minimum(diarize_df['start'], seg['start'])
48
+
49
+ intersected = diarize_df[diarize_df["intersection"] > 0]
50
+
51
+ speaker = None
52
+ if len(intersected) > 0:
53
+ # Choosing most strong intersection
54
+ speaker = intersected.groupby("speaker")["intersection"].sum().sort_values(ascending=False).index[0]
55
+ elif fill_nearest:
56
+ # Otherwise choosing closest
57
+ speaker = diarize_df.sort_values(by=["intersection"], ascending=False)["speaker"].values[0]
58
+
59
+ if speaker is not None:
60
+ seg["speaker"] = speaker
61
+
62
+ # assign speaker to words
63
+ if 'words' in seg:
64
+ for word in seg['words']:
65
+ if 'start' in word:
66
+ diarize_df['intersection'] = np.minimum(diarize_df['end'], word['end']) - np.maximum(
67
+ diarize_df['start'], word['start'])
68
+ diarize_df['union'] = np.maximum(diarize_df['end'], word['end']) - np.minimum(diarize_df['start'],
69
+ word['start'])
70
+
71
+ intersected = diarize_df[diarize_df["intersection"] > 0]
72
+
73
+ word_speaker = None
74
+ if len(intersected) > 0:
75
+ # Choosing most strong intersection
76
+ word_speaker = \
77
+ intersected.groupby("speaker")["intersection"].sum().sort_values(ascending=False).index[0]
78
+ elif fill_nearest:
79
+ # Otherwise choosing closest
80
+ word_speaker = diarize_df.sort_values(by=["intersection"], ascending=False)["speaker"].values[0]
81
+
82
+ if word_speaker is not None:
83
+ word["speaker"] = word_speaker
84
+
85
+ return transcript_result
86
+
87
+
88
+ class Segment:
89
+ def __init__(self, start, end, speaker=None):
90
+ self.start = start
91
+ self.end = end
92
+ self.speaker = speaker
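A hedged sketch of how DiarizationPipeline and assign_word_speakers compose (the file name, token, and transcript are placeholders; the first run needs a HuggingFace token that has accepted the pyannote/speaker-diarization-3.1 terms):

from modules.diarize.diarize_pipeline import DiarizationPipeline, assign_word_speakers

pipeline = DiarizationPipeline(use_auth_token="hf_xxx", device="cpu")  # "hf_xxx" is a placeholder
diarize_df = pipeline("sample.wav")  # DataFrame with segment, label, speaker, start, end columns

# Attach the speaker with the largest overlap to each transcribed segment.
transcript = {"segments": [{"start": 0.0, "end": 2.5, "text": " hello there"}]}
result = assign_word_speakers(diarize_df, transcript)
print(result["segments"][0].get("speaker"))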
modules/diarize/diarizer.py ADDED
@@ -0,0 +1,127 @@
1
+ import os
2
+ import torch
3
+ from typing import List
4
+ import time
5
+ import logging
6
+
7
+ from modules.diarize.diarize_pipeline import DiarizationPipeline, assign_word_speakers
8
+ from modules.diarize.audio_loader import load_audio
9
+
10
+ class Diarizer:
11
+ def __init__(self,
12
+ model_dir: str = os.path.join("models", "Diarization")
13
+ ):
14
+ self.device = self.get_device()
15
+ self.available_device = self.get_available_device()
16
+ self.compute_type = "float16"
17
+ self.model_dir = model_dir
18
+ os.makedirs(self.model_dir, exist_ok=True)
19
+ self.pipe = None
20
+
21
+ def run(self,
22
+ audio: str,
23
+ transcribed_result: List[dict],
24
+ use_auth_token: str,
25
+ device: str
26
+ ):
27
+ """
28
+ Diarize transcribed result as a post-processing
29
+
30
+ Parameters
31
+ ----------
32
+ audio: Union[str, BinaryIO, np.ndarray]
33
+ Audio input. This can be file path or binary type.
34
+ transcribed_result: List[dict]
35
+ transcribed result through whisper.
36
+ use_auth_token: str
37
+ Huggingface token with READ permission. This is only needed the first time you download the model.
38
+ You must manually go to the website https://huggingface.co/pyannote/speaker-diarization-3.1 and agree to their TOS to download the model.
39
+ device: str
40
+ Device for diarization.
41
+
42
+ Returns
43
+ ----------
44
+ segments_result: List[dict]
45
+ list of dicts that includes start, end timestamps and transcribed text
46
+ elapsed_time: float
47
+ elapsed time for running
48
+ """
49
+ start_time = time.time()
50
+
51
+ if (device != self.device
52
+ or self.pipe is None):
53
+ self.update_pipe(
54
+ device=device,
55
+ use_auth_token=use_auth_token
56
+ )
57
+
58
+ audio = load_audio(audio)
59
+
60
+ diarization_segments = self.pipe(audio)
61
+ diarized_result = assign_word_speakers(
62
+ diarization_segments,
63
+ {"segments": transcribed_result}
64
+ )
65
+
66
+ for segment in diarized_result["segments"]:
67
+ speaker = "None"
68
+ if "speaker" in segment:
69
+ speaker = segment["speaker"]
70
+ segment["text"] = speaker + "|" + segment["text"][1:]
71
+
72
+ elapsed_time = time.time() - start_time
73
+ return diarized_result["segments"], elapsed_time
74
+
75
+ def update_pipe(self,
76
+ use_auth_token: str,
77
+ device: str
78
+ ):
79
+ """
80
+ Set pipeline for diarization
81
+
82
+ Parameters
83
+ ----------
84
+ use_auth_token: str
85
+ Huggingface token with READ permission. This is only needed the first time you download the model.
86
+ You must manually go to the website https://huggingface.co/pyannote/speaker-diarization-3.1 and agree to their TOS to download the model.
87
+ device: str
88
+ Device for diarization.
89
+ """
90
+
91
+ os.makedirs(self.model_dir, exist_ok=True)
92
+
93
+ if (not os.listdir(self.model_dir) and
94
+ not use_auth_token):
95
+ print(
96
+ "\nFailed to diarize. You need huggingface token and agree to their requirements to download the diarization model.\n"
97
+ "Go to \"https://huggingface.co/pyannote/speaker-diarization-3.1\" and follow their instructions to download the model.\n"
98
+ )
99
+ return
100
+
101
+ logger = logging.getLogger("speechbrain.utils.train_logger")
102
+ # Disable redundant torchvision warning message
103
+ logger.disabled = True
104
+ self.pipe = DiarizationPipeline(
105
+ use_auth_token=use_auth_token,
106
+ device=device,
107
+ cache_dir=self.model_dir
108
+ )
109
+ logger.disabled = False
110
+
111
+ @staticmethod
112
+ def get_device():
113
+ if torch.cuda.is_available():
114
+ return "cuda"
115
+ elif torch.backends.mps.is_available():
116
+ return "mps"
117
+ else:
118
+ return "cpu"
119
+
120
+ @staticmethod
121
+ def get_available_device():
122
+ devices = ["cpu"]
123
+ if torch.cuda.is_available():
124
+ devices.append("cuda")
125
+ elif torch.backends.mps.is_available():
126
+ devices.append("mps")
127
+ return devices
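The Diarizer wraps the pipeline for post-processing already-transcribed segments, which is how WhisperBase.run uses it later in this diff. A minimal sketch with placeholder inputs:

from modules.diarize.diarizer import Diarizer

diarizer = Diarizer()  # defaults to models/Diarization
segments = [{"start": 0.0, "end": 2.5, "text": " hello there"}]
diarized_segments, elapsed = diarizer.run(
    audio="sample.wav",
    transcribed_result=segments,
    use_auth_token="hf_xxx",  # placeholder; only needed for the first model download
    device="cpu",
)
# Each segment's text now starts with a "SPEAKER_XX|" prefix.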
modules/translation/__init__.py ADDED
File without changes
modules/{deepl_api.py → translation/deepl_api.py} RENAMED
@@ -4,7 +4,7 @@ import os
4
  from datetime import datetime
5
  import gradio as gr
6
 
7
- from modules.subtitle_manager import *
8
 
9
  """
10
  This is written with reference to the DeepL API documentation.
@@ -144,7 +144,7 @@ class DeepLAPI:
144
  timestamp = datetime.now().strftime("%m%d%H%M%S")
145
 
146
  file_name = file_name[:-9]
147
- output_path = os.path.join(self.output_dir, "translations", f"{file_name}-{timestamp}.srt")
148
  write_file(subtitle, output_path)
149
 
150
  elif file_ext == ".vtt":
@@ -164,7 +164,7 @@ class DeepLAPI:
164
  timestamp = datetime.now().strftime("%m%d%H%M%S")
165
 
166
  file_name = file_name[:-9]
167
- output_path = os.path.join(self.output_dir, "translations", f"{file_name}-{timestamp}.vtt")
168
 
169
  write_file(subtitle, output_path)
170
 
 
4
  from datetime import datetime
5
  import gradio as gr
6
 
7
+ from modules.utils.subtitle_manager import *
8
 
9
  """
10
  This is written with reference to the DeepL API documentation.
 
144
  timestamp = datetime.now().strftime("%m%d%H%M%S")
145
 
146
  file_name = file_name[:-9]
147
+ output_path = os.path.join(self.output_dir, "", f"{file_name}-{timestamp}.srt")
148
  write_file(subtitle, output_path)
149
 
150
  elif file_ext == ".vtt":
 
164
  timestamp = datetime.now().strftime("%m%d%H%M%S")
165
 
166
  file_name = file_name[:-9]
167
+ output_path = os.path.join(self.output_dir, "", f"{file_name}-{timestamp}.vtt")
168
 
169
  write_file(subtitle, output_path)
170
 
modules/{nllb_inference.py → translation/nllb_inference.py} RENAMED
@@ -2,7 +2,7 @@ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
2
  import gradio as gr
3
  import os
4
 
5
- from modules.translation_base import TranslationBase
6
 
7
 
8
  class NLLBInference(TranslationBase):
 
2
  import gradio as gr
3
  import os
4
 
5
+ from modules.translation.translation_base import TranslationBase
6
 
7
 
8
  class NLLBInference(TranslationBase):
modules/{translation_base.py → translation/translation_base.py} RENAMED
@@ -5,8 +5,8 @@ from abc import ABC, abstractmethod
5
  from typing import List
6
  from datetime import datetime
7
 
8
- from modules.whisper_parameter import *
9
- from modules.subtitle_manager import *
10
 
11
 
12
  class TranslationBase(ABC):
@@ -90,9 +90,9 @@ class TranslationBase(ABC):
90
 
91
  timestamp = datetime.now().strftime("%m%d%H%M%S")
92
  if add_timestamp:
93
- output_path = os.path.join("outputs", "translations", f"{file_name}-{timestamp}.srt")
94
  else:
95
- output_path = os.path.join("outputs", "translations", f"{file_name}.srt")
96
 
97
  elif file_ext == ".vtt":
98
  parsed_dicts = parse_vtt(file_path=file_path)
@@ -105,9 +105,9 @@ class TranslationBase(ABC):
105
 
106
  timestamp = datetime.now().strftime("%m%d%H%M%S")
107
  if add_timestamp:
108
- output_path = os.path.join(self.output_dir, "translations", f"{file_name}-{timestamp}.vtt")
109
  else:
110
- output_path = os.path.join(self.output_dir, "translations", f"{file_name}.vtt")
111
 
112
  write_file(subtitle, output_path)
113
  files_info[file_name] = subtitle
 
5
  from typing import List
6
  from datetime import datetime
7
 
8
+ from modules.whisper.whisper_parameter import *
9
+ from modules.utils.subtitle_manager import *
10
 
11
 
12
  class TranslationBase(ABC):
 
90
 
91
  timestamp = datetime.now().strftime("%m%d%H%M%S")
92
  if add_timestamp:
93
+ output_path = os.path.join("outputs", "", f"{file_name}-{timestamp}.srt")
94
  else:
95
+ output_path = os.path.join("outputs", "", f"{file_name}.srt")
96
 
97
  elif file_ext == ".vtt":
98
  parsed_dicts = parse_vtt(file_path=file_path)
 
105
 
106
  timestamp = datetime.now().strftime("%m%d%H%M%S")
107
  if add_timestamp:
108
+ output_path = os.path.join(self.output_dir, "", f"{file_name}-{timestamp}.vtt")
109
  else:
110
+ output_path = os.path.join(self.output_dir, "", f"{file_name}.vtt")
111
 
112
  write_file(subtitle, output_path)
113
  files_info[file_name] = subtitle
modules/utils/__init__.py ADDED
File without changes
modules/{subtitle_manager.py → utils/subtitle_manager.py} RENAMED
File without changes
modules/{youtube_manager.py → utils/youtube_manager.py} RENAMED
File without changes
modules/whisper/__init__.py ADDED
File without changes
modules/{faster_whisper_inference.py → whisper/faster_whisper_inference.py} RENAMED
@@ -2,28 +2,27 @@ import os
2
  import time
3
  import numpy as np
4
  from typing import BinaryIO, Union, Tuple, List
5
-
6
  import faster_whisper
7
  from faster_whisper.vad import VadOptions
8
  import ctranslate2
9
  import whisper
10
  import gradio as gr
 
11
 
12
- from modules.whisper_parameter import *
13
- from modules.whisper_base import WhisperBase
14
-
15
- # Temporal fix of the issue : https://github.com/jhj0517/Whisper-WebUI/issues/144
16
- os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
17
 
18
 
19
  class FasterWhisperInference(WhisperBase):
20
  def __init__(self,
21
  model_dir: str,
22
- output_dir: str
 
23
  ):
24
  super().__init__(
25
  model_dir=model_dir,
26
- output_dir=output_dir
 
27
  )
28
  self.model_paths = self.get_model_paths()
29
  self.available_models = self.model_paths.keys()
@@ -45,7 +44,7 @@ class FasterWhisperInference(WhisperBase):
45
  progress: gr.Progress
46
  Indicator to show progress directly in gradio.
47
  *whisper_params: tuple
48
- Gradio components related to Whisper. see whisper_data_class.py for details.
49
 
50
  Returns
51
  ----------
@@ -56,7 +55,7 @@ class FasterWhisperInference(WhisperBase):
56
  """
57
  start_time = time.time()
58
 
59
- params = WhisperParameters.post_process(*whisper_params)
60
 
61
  if params.model_size != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
62
  self.update_model(params.model_size, params.compute_type, progress)
 
2
  import time
3
  import numpy as np
4
  from typing import BinaryIO, Union, Tuple, List
 
5
  import faster_whisper
6
  from faster_whisper.vad import VadOptions
7
  import ctranslate2
8
  import whisper
9
  import gradio as gr
10
+ from argparse import Namespace
11
 
12
+ from modules.whisper.whisper_parameter import *
13
+ from modules.whisper.whisper_base import WhisperBase
14
 
15
 
16
  class FasterWhisperInference(WhisperBase):
17
  def __init__(self,
18
  model_dir: str,
19
+ output_dir: str,
20
+ args: Namespace
21
  ):
22
  super().__init__(
23
  model_dir=model_dir,
24
+ output_dir=output_dir,
25
+ args=args
26
  )
27
  self.model_paths = self.get_model_paths()
28
  self.available_models = self.model_paths.keys()
 
44
  progress: gr.Progress
45
  Indicator to show progress directly in gradio.
46
  *whisper_params: tuple
47
+ Parameters related with whisper. This will be dealt with "WhisperParameters" data class
48
 
49
  Returns
50
  ----------
 
55
  """
56
  start_time = time.time()
57
 
58
+ params = WhisperParameters.as_value(*whisper_params)
59
 
60
  if params.model_size != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
61
  self.update_model(params.model_size, params.compute_type, progress)
modules/{insanely_fast_whisper_inference.py → whisper/insanely_fast_whisper_inference.py} RENAMED
@@ -9,19 +9,22 @@ import gradio as gr
9
  from huggingface_hub import hf_hub_download
10
  import whisper
11
  from rich.progress import Progress, TimeElapsedColumn, BarColumn, TextColumn
 
12
 
13
- from modules.whisper_parameter import *
14
- from modules.whisper_base import WhisperBase
15
 
16
 
17
  class InsanelyFastWhisperInference(WhisperBase):
18
  def __init__(self,
19
  model_dir: str,
20
- output_dir: str
 
21
  ):
22
  super().__init__(
23
  model_dir=model_dir,
24
- output_dir=output_dir
 
25
  )
26
  openai_models = whisper.available_models()
27
  distil_models = ["distil-large-v2", "distil-large-v3", "distil-medium.en", "distil-small.en"]
@@ -43,7 +46,7 @@ class InsanelyFastWhisperInference(WhisperBase):
43
  progress: gr.Progress
44
  Indicator to show progress directly in gradio.
45
  *whisper_params: tuple
46
- Gradio components related to Whisper. see whisper_data_class.py for details.
47
 
48
  Returns
49
  ----------
@@ -53,7 +56,7 @@ class InsanelyFastWhisperInference(WhisperBase):
53
  elapsed time for transcription
54
  """
55
  start_time = time.time()
56
- params = WhisperParameters.post_process(*whisper_params)
57
 
58
  if params.model_size != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
59
  self.update_model(params.model_size, params.compute_type, progress)
 
9
  from huggingface_hub import hf_hub_download
10
  import whisper
11
  from rich.progress import Progress, TimeElapsedColumn, BarColumn, TextColumn
12
+ from argparse import Namespace
13
 
14
+ from modules.whisper.whisper_parameter import *
15
+ from modules.whisper.whisper_base import WhisperBase
16
 
17
 
18
  class InsanelyFastWhisperInference(WhisperBase):
19
  def __init__(self,
20
  model_dir: str,
21
+ output_dir: str,
22
+ args: Namespace
23
  ):
24
  super().__init__(
25
  model_dir=model_dir,
26
+ output_dir=output_dir,
27
+ args=args
28
  )
29
  openai_models = whisper.available_models()
30
  distil_models = ["distil-large-v2", "distil-large-v3", "distil-medium.en", "distil-small.en"]
 
46
  progress: gr.Progress
47
  Indicator to show progress directly in gradio.
48
  *whisper_params: tuple
49
+ Parameters related with whisper. This will be dealt with "WhisperParameters" data class
50
 
51
  Returns
52
  ----------
 
56
  elapsed time for transcription
57
  """
58
  start_time = time.time()
59
+ params = WhisperParameters.as_value(*whisper_params)
60
 
61
  if params.model_size != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
62
  self.update_model(params.model_size, params.compute_type, progress)
modules/{whisper_Inference.py → whisper/whisper_Inference.py} RENAMED
@@ -1,23 +1,25 @@
1
  import whisper
2
  import gradio as gr
3
  import time
4
- import os
5
  from typing import BinaryIO, Union, Tuple, List
6
  import numpy as np
7
  import torch
 
8
 
9
- from modules.whisper_base import WhisperBase
10
- from modules.whisper_parameter import *
11
 
12
 
13
  class WhisperInference(WhisperBase):
14
  def __init__(self,
15
  model_dir: str,
16
- output_dir: str
 
17
  ):
18
  super().__init__(
19
  model_dir=model_dir,
20
- output_dir=output_dir
 
21
  )
22
 
23
  def transcribe(self,
@@ -35,7 +37,7 @@ class WhisperInference(WhisperBase):
35
  progress: gr.Progress
36
  Indicator to show progress directly in gradio.
37
  *whisper_params: tuple
38
- Gradio components related to Whisper. see whisper_data_class.py for details.
39
 
40
  Returns
41
  ----------
@@ -45,7 +47,7 @@ class WhisperInference(WhisperBase):
45
  elapsed time for transcription
46
  """
47
  start_time = time.time()
48
- params = WhisperParameters.post_process(*whisper_params)
49
 
50
  if params.model_size != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
51
  self.update_model(params.model_size, params.compute_type, progress)
 
1
  import whisper
2
  import gradio as gr
3
  import time
 
4
  from typing import BinaryIO, Union, Tuple, List
5
  import numpy as np
6
  import torch
7
+ from argparse import Namespace
8
 
9
+ from modules.whisper.whisper_base import WhisperBase
10
+ from modules.whisper.whisper_parameter import *
11
 
12
 
13
  class WhisperInference(WhisperBase):
14
  def __init__(self,
15
  model_dir: str,
16
+ output_dir: str,
17
+ args: Namespace
18
  ):
19
  super().__init__(
20
  model_dir=model_dir,
21
+ output_dir=output_dir,
22
+ args=args
23
  )
24
 
25
  def transcribe(self,
 
37
  progress: gr.Progress
38
  Indicator to show progress directly in gradio.
39
  *whisper_params: tuple
40
+ Parameters related with whisper. This will be dealt with "WhisperParameters" data class
41
 
42
  Returns
43
  ----------
 
47
  elapsed time for transcription
48
  """
49
  start_time = time.time()
50
+ params = WhisperParameters.as_value(*whisper_params)
51
 
52
  if params.model_size != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
53
  self.update_model(params.model_size, params.compute_type, progress)
modules/{whisper_base.py → whisper/whisper_base.py} RENAMED
@@ -1,22 +1,24 @@
1
  import os
2
  import torch
3
- from typing import List
4
  import whisper
5
  import gradio as gr
6
  from abc import ABC, abstractmethod
7
  from typing import BinaryIO, Union, Tuple, List
8
  import numpy as np
9
  from datetime import datetime
 
10
 
11
- from modules.subtitle_manager import get_srt, get_vtt, get_txt, write_file, safe_filename
12
- from modules.youtube_manager import get_ytdata, get_ytaudio
13
- from modules.whisper_parameter import *
 
14
 
15
 
16
  class WhisperBase(ABC):
17
  def __init__(self,
18
  model_dir: str,
19
- output_dir: str
 
20
  ):
21
  self.model = None
22
  self.current_model_size = None
@@ -30,6 +32,9 @@ class WhisperBase(ABC):
30
  self.device = self.get_device()
31
  self.available_compute_types = ["float16", "float32"]
32
  self.current_compute_type = "float16" if self.device == "cuda" else "float32"
 
 
 
33
 
34
  @abstractmethod
35
  def transcribe(self,
@@ -47,6 +52,55 @@ class WhisperBase(ABC):
47
  ):
48
  pass
49
50
  def transcribe_file(self,
51
  files: list,
52
  file_format: str,
@@ -68,7 +122,7 @@ class WhisperBase(ABC):
68
  progress: gr.Progress
69
  Indicator to show progress directly in gradio.
70
  *whisper_params: tuple
71
- Gradio components related to Whisper. see whisper_data_class.py for details.
72
 
73
  Returns
74
  ----------
@@ -80,7 +134,7 @@ class WhisperBase(ABC):
80
  try:
81
  files_info = {}
82
  for file in files:
83
- transcribed_segments, time_for_task = self.transcribe(
84
  file.name,
85
  progress,
86
  *whisper_params,
@@ -135,7 +189,7 @@ class WhisperBase(ABC):
135
  progress: gr.Progress
136
  Indicator to show progress directly in gradio.
137
  *whisper_params: tuple
138
- Gradio components related to Whisper. see whisper_data_class.py for details.
139
 
140
  Returns
141
  ----------
@@ -146,7 +200,7 @@ class WhisperBase(ABC):
146
  """
147
  try:
148
  progress(0, desc="Loading Audio..")
149
- transcribed_segments, time_for_task = self.transcribe(
150
  mic_audio,
151
  progress,
152
  *whisper_params,
@@ -190,7 +244,7 @@ class WhisperBase(ABC):
190
  progress: gr.Progress
191
  Indicator to show progress directly in gradio.
192
  *whisper_params: tuple
193
- Gradio components related to Whisper. see whisper_data_class.py for details.
194
 
195
  Returns
196
  ----------
@@ -204,7 +258,7 @@ class WhisperBase(ABC):
204
  yt = get_ytdata(youtube_link)
205
  audio = get_ytaudio(yt)
206
 
207
- transcribed_segments, time_for_task = self.transcribe(
208
  audio,
209
  progress,
210
  *whisper_params,
 
1
  import os
2
  import torch
 
3
  import whisper
4
  import gradio as gr
5
  from abc import ABC, abstractmethod
6
  from typing import BinaryIO, Union, Tuple, List
7
  import numpy as np
8
  from datetime import datetime
9
+ from argparse import Namespace
10
 
11
+ from modules.utils.subtitle_manager import get_srt, get_vtt, get_txt, write_file, safe_filename
12
+ from modules.utils.youtube_manager import get_ytdata, get_ytaudio
13
+ from modules.whisper.whisper_parameter import *
14
+ from modules.diarize.diarizer import Diarizer
15
 
16
 
17
  class WhisperBase(ABC):
18
  def __init__(self,
19
  model_dir: str,
20
+ output_dir: str,
21
+ args: Namespace
22
  ):
23
  self.model = None
24
  self.current_model_size = None
 
32
  self.device = self.get_device()
33
  self.available_compute_types = ["float16", "float32"]
34
  self.current_compute_type = "float16" if self.device == "cuda" else "float32"
35
+ self.diarizer = Diarizer(
36
+ model_dir=args.diarization_model_dir
37
+ )
38
 
39
  @abstractmethod
40
  def transcribe(self,
 
52
  ):
53
  pass
54
 
55
+ def run(self,
56
+ audio: Union[str, BinaryIO, np.ndarray],
57
+ progress: gr.Progress,
58
+ *whisper_params,
59
+ ) -> Tuple[List[dict], float]:
60
+ """
61
+ Run transcription with conditional post-processing.
62
+ The diarization will be performed in post-processing if enabled.
63
+
64
+ Parameters
65
+ ----------
66
+ audio: Union[str, BinaryIO, np.ndarray]
67
+ Audio input. This can be file path or binary type.
68
+ progress: gr.Progress
69
+ Indicator to show progress directly in gradio.
70
+ *whisper_params: tuple
71
+ Parameters related with whisper. This will be dealt with "WhisperParameters" data class
72
+
73
+ Returns
74
+ ----------
75
+ segments_result: List[dict]
76
+ list of dicts that includes start, end timestamps and transcribed text
77
+ elapsed_time: float
78
+ elapsed time for running
79
+ """
80
+ params = WhisperParameters.as_value(*whisper_params)
81
+
82
+ if params.lang == "Automatic Detection":
83
+ params.lang = None
84
+ else:
85
+ language_code_dict = {value: key for key, value in whisper.tokenizer.LANGUAGES.items()}
86
+ params.lang = language_code_dict[params.lang]
87
+
88
+ result, elapsed_time = self.transcribe(
89
+ audio,
90
+ progress,
91
+ *whisper_params
92
+ )
93
+
94
+ if params.is_diarize:
95
+ result, elapsed_time_diarization = self.diarizer.run(
96
+ audio=audio,
97
+ use_auth_token=params.hf_token,
98
+ transcribed_result=result,
99
+ device=self.device
100
+ )
101
+ elapsed_time += elapsed_time_diarization
102
+ return result, elapsed_time
103
+
104
  def transcribe_file(self,
105
  files: list,
106
  file_format: str,
 
122
  progress: gr.Progress
123
  Indicator to show progress directly in gradio.
124
  *whisper_params: tuple
125
+ Parameters related with whisper. This will be dealt with "WhisperParameters" data class
126
 
127
  Returns
128
  ----------
 
134
  try:
135
  files_info = {}
136
  for file in files:
137
+ transcribed_segments, time_for_task = self.run(
138
  file.name,
139
  progress,
140
  *whisper_params,
 
189
  progress: gr.Progress
190
  Indicator to show progress directly in gradio.
191
  *whisper_params: tuple
192
+ Parameters related with whisper. This will be dealt with "WhisperParameters" data class
193
 
194
  Returns
195
  ----------
 
200
  """
201
  try:
202
  progress(0, desc="Loading Audio..")
203
+ transcribed_segments, time_for_task = self.run(
204
  mic_audio,
205
  progress,
206
  *whisper_params,
 
244
  progress: gr.Progress
245
  Indicator to show progress directly in gradio.
246
  *whisper_params: tuple
247
+ Parameters related with whisper. This will be dealt with "WhisperParameters" data class
248
 
249
  Returns
250
  ----------
 
258
  yt = get_ytdata(youtube_link)
259
  audio = get_ytaudio(yt)
260
 
261
+ transcribed_segments, time_for_task = self.run(
262
  audio,
263
  progress,
264
  *whisper_params,
modules/{whisper_parameter.py → whisper/whisper_parameter.py} RENAMED
@@ -27,6 +27,9 @@ class WhisperParameters:
27
  speech_pad_ms: gr.Number
28
  chunk_length_s: gr.Number
29
  batch_size: gr.Number
30
  """
31
  A data class for Gradio components of the Whisper Parameters. Use "before" Gradio pre-processing.
32
  This data class is used to mitigate the key-value problem between Gradio components and function parameters.
@@ -122,9 +125,19 @@ class WhisperParameters:
122
 
123
  batch_size: gr.Number
124
  This parameter is related with insanely-fast-whisper pipe. Batch size to pass to the pipe
125
  """
126
 
127
- def to_list(self) -> list:
128
  """
129
  Converts the data class attributes into a list, Use in Gradio UI before Gradio pre-processing.
130
  See more about Gradio pre-processing: : https://www.gradio.app/docs/components
@@ -136,7 +149,7 @@ class WhisperParameters:
136
  return [getattr(self, f.name) for f in fields(self)]
137
 
138
  @staticmethod
139
- def post_process(*args) -> 'WhisperValues':
140
  """
141
  To use Whisper parameters in function after Gradio post-processing.
142
  See more about Gradio post-processing: : https://www.gradio.app/docs/components
@@ -168,7 +181,10 @@ class WhisperParameters:
168
  window_size_samples=args[18],
169
  speech_pad_ms=args[19],
170
  chunk_length_s=args[20],
171
- batch_size=args[21]
172
  )
173
 
174
 
@@ -196,6 +212,9 @@ class WhisperValues:
196
  speech_pad_ms: int
197
  chunk_length_s: int
198
  batch_size: int
199
  """
200
  A data class to use Whisper parameters.
201
  """
 
27
  speech_pad_ms: gr.Number
28
  chunk_length_s: gr.Number
29
  batch_size: gr.Number
30
+ is_diarize: gr.Checkbox
31
+ hf_token: gr.Textbox
32
+ diarization_device: gr.Dropdown
33
  """
34
  A data class for Gradio components of the Whisper Parameters. Use "before" Gradio pre-processing.
35
  This data class is used to mitigate the key-value problem between Gradio components and function parameters.
 
125
 
126
  batch_size: gr.Number
127
  This parameter is related with insanely-fast-whisper pipe. Batch size to pass to the pipe
128
+
129
+ is_diarize: gr.Checkbox
130
+ This parameter is related with whisperx. Boolean value that determines whether to diarize or not.
131
+
132
+ hf_token: gr.Textbox
133
+ This parameter is related with whisperx. Huggingface token is needed to download diarization models.
134
+ Read more about : https://huggingface.co/pyannote/speaker-diarization-3.1#requirements
135
+
136
+ diarization_device: gr.Dropdown
137
+ This parameter is related with whisperx. Device to run diarization model
138
  """
139
 
140
+ def as_list(self) -> list:
141
  """
142
  Converts the data class attributes into a list, Use in Gradio UI before Gradio pre-processing.
143
  See more about Gradio pre-processing: : https://www.gradio.app/docs/components
 
149
  return [getattr(self, f.name) for f in fields(self)]
150
 
151
  @staticmethod
152
+ def as_value(*args) -> 'WhisperValues':
153
  """
154
  To use Whisper parameters in function after Gradio post-processing.
155
  See more about Gradio post-processing: : https://www.gradio.app/docs/components
 
181
  window_size_samples=args[18],
182
  speech_pad_ms=args[19],
183
  chunk_length_s=args[20],
184
+ batch_size=args[21],
185
+ is_diarize=args[22],
186
+ hf_token=args[23],
187
+ diarization_device=args[24]
188
  )
189
 
190
 
 
212
  speech_pad_ms: int
213
  chunk_length_s: int
214
  batch_size: int
215
+ is_diarize: bool
216
+ hf_token: str
217
+ diarization_device: str
218
  """
219
  A data class to use Whisper parameters.
220
  """
requirements.txt CHANGED
@@ -4,4 +4,5 @@ git+https://github.com/jhj0517/jhj0517-whisper.git
4
  faster-whisper==1.0.2
5
  transformers
6
  gradio==4.29.0
7
- pytube
 
 
4
  faster-whisper==1.0.2
5
  transformers
6
  gradio==4.29.0
7
+ pytube
8
+ pyannote.audio==3.3.1