import collections
import contextlib
import os
import time
import wave
from concurrent.futures import ThreadPoolExecutor

import librosa
import numpy as np
import pyaudio
import webrtcvad
from watchdog.events import FileSystemEventHandler
from watchdog.observers import Observer

from models.es_fastconformer import stt_es_process
from models.nllb import nllb_translate
from models.noise_red import noise_reduction
from models.parakeet import parakeet_ctc_process
from models.TTS_utils import append_text_order


class Frame(object):
    """
    Represents a "frame" of audio data.

    Args:
        bytes (bytes): The audio data.
        timestamp (float): The timestamp of the frame.
        duration (float): The duration of the frame.
    """

    def __init__(self, bytes, timestamp, duration):
        self.bytes = bytes
        self.timestamp = timestamp
        self.duration = duration


def read_audio(stream, frame_duration_ms, rate):
    """
    Generates audio frames from the input stream.

    Args:
        stream (pyaudio.Stream): The audio stream.
        frame_duration_ms (int): Duration of each frame in milliseconds.
        rate (int): The sample rate of the audio.

    Yields:
        bytes: The audio frames.
    """
    frames_per_buffer = int(rate * frame_duration_ms / 1000)
    while True:
        yield stream.read(frames_per_buffer)


def vad_collector(sample_rate, frame_duration_ms, padding_duration_ms, vad, frames):
    """
    Filters out non-voiced audio frames.

    Args:
        sample_rate (int): The sample rate of the audio.
        frame_duration_ms (int): Duration of each frame in milliseconds.
        padding_duration_ms (int): Duration of padding in milliseconds.
        vad (webrtcvad.Vad): The VAD object.
        frames (generator): A generator yielding Frame objects.

    Yields:
        bytes: Voiced audio segments (concatenated frame bytes).
    """
    num_padding_frames = int(padding_duration_ms / frame_duration_ms)
    ring_buffer = collections.deque(maxlen=num_padding_frames)
    triggered = False
    voiced_frames = []
    for frame in frames:
        is_speech = vad.is_speech(frame.bytes, sample_rate)
        if not triggered:
            ring_buffer.append((frame, is_speech))
            num_voiced = len([f for f, speech in ring_buffer if speech])
            # Start collecting once 90% of the padding window is voiced.
            if num_voiced > 0.9 * ring_buffer.maxlen:
                triggered = True
                voiced_frames.extend(f for f, speech in ring_buffer)
                ring_buffer.clear()
        else:
            voiced_frames.append(frame)
            ring_buffer.append((frame, is_speech))
            num_unvoiced = len([f for f, speech in ring_buffer if not speech])
            # Flush the segment once 90% of the padding window is silence.
            if num_unvoiced > 0.9 * ring_buffer.maxlen:
                yield b''.join([f.bytes for f in voiced_frames])
                ring_buffer.clear()
                voiced_frames = []
                triggered = False
    if voiced_frames:
        yield b''.join([f.bytes for f in voiced_frames])


def is_segment_empty(file_path):
    """
    Check whether the audio segment contains no speech energy.

    Args:
        file_path (str): Path to the audio file.

    Returns:
        bool: True if the segment is empty, False otherwise.
    """
    audio, _ = librosa.load(file_path)
    rms = librosa.feature.rms(y=audio)
    rms_mean = np.mean(rms)
    print("Mean RMS:", rms_mean)
    # Segments whose mean RMS falls below this empirical threshold are
    # treated as silence.
    return rms_mean < 0.015
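# --- Usage sketch ---
# A minimal capture loop showing how Frame, read_audio, and vad_collector fit
# together: read_audio() yields raw byte buffers while vad_collector() expects
# Frame objects, so the bytes are wrapped here. The 16 kHz mono / 30 ms settings
# are assumptions chosen to satisfy webrtcvad's constraints (8/16/32/48 kHz,
# 16-bit PCM, 10/20/30 ms frames); this demo is illustrative, not part of the
# pipeline proper.
def capture_voiced_segments(out_dir="audio_segments", rate=16000, frame_ms=30,
                            padding_ms=300, aggressiveness=2):
    os.makedirs(out_dir, exist_ok=True)
    vad = webrtcvad.Vad(aggressiveness)
    pa = pyaudio.PyAudio()
    stream = pa.open(format=pyaudio.paInt16, channels=1, rate=rate, input=True,
                     frames_per_buffer=int(rate * frame_ms / 1000))
    duration = frame_ms / 1000.0

    def frames():
        # Wrap raw buffers in Frame objects so vad_collector can read .bytes.
        t = 0.0
        for buf in read_audio(stream, frame_ms, rate):
            yield Frame(buf, t, duration)
            t += duration

    # Each yielded segment is one contiguous run of voiced audio.
    for i, segment in enumerate(vad_collector(rate, frame_ms, padding_ms,
                                              vad, frames())):
        path = os.path.join(out_dir, f"segment_{i:04d}.wav")
        with contextlib.closing(wave.open(path, "wb")) as wf:
            wf.setnchannels(1)
            wf.setsampwidth(2)  # 16-bit samples
            wf.setframerate(rate)
            wf.writeframes(segment)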
""" print("Processing segment...") if is_segment_empty(path_segments): print("No speech detected.") # remove the empty segment os.remove(path_segments) return # Noise Reduction start_time = time.time() noise_reduction(path_segments, path_segments) print("Noise removed. Time:", time.time() - start_time) # Transcription transcription = transcribe(asr_model, path_segments, target_lang) #if not transcription.strip(): # print("No speech detected.") # return # Translation print("Translating...") translation = translate(model_nllb, tokenizer_nllb, transcription, target_lang) # Text-to-Speech # process_tts(tts_model, translation, path_segments, target_lang, path_results) append_text_order(json_path_temp,translation, order, path_segments, path_results, "es" if target_lang == "spanish" else "en", transcription) append_text_order(json_path_record,translation, order, path_segments, path_results, "es" if target_lang == "spanish" else "en", transcription) def transcribe(asr_model, path_segments, target_lang): """ Transcribe an audio segment using the specified ASR model. Args: asr_model: The ASR model for transcription. path_segments (str): Path to the audio segment. target_lang (str): Target language for transcription. Returns: str: The transcription of the audio segment. """ start_time = time.time() transcription_func = { "spanish": parakeet_ctc_process, "english": stt_es_process }[target_lang] transcription = transcription_func(asr_model, path_segments) print("Transcription:", transcription[0]) print("Transcription time:", time.time() - start_time) return transcription[0] def translate(model_nllb, tokenizer_nllb, text, target_lang): """ Translate text using the specified NLLB model and tokenizer. Args: model_nllb: The NLLB model for translation. tokenizer_nllb: The tokenizer for the NLLB model. text (str): The text to translate. target_lang (str): Target language for translation. Returns: str: The translated text. 
""" print("Processing translation...") start_time = time.time() translation = nllb_translate(model_nllb, tokenizer_nllb, text, target_lang) print("Translation:", translation) print("Translation time:", time.time() - start_time) return translation import os import time import contextlib import wave from watchdog.observers import Observer from watchdog.events import FileSystemEventHandler from concurrent.futures import ThreadPoolExecutor # Assuming you have the following functions defined elsewhere: # - process_segment # - asr_model # - model_nllb # - tokinizer_nllb class NewAudioHandler(FileSystemEventHandler): def __init__(self, asr_model, model_nllb, tokinizer_nllb, source_lang, target_lang, json_file_temp, json_file_record, result_dir): self.asr_model = asr_model self.model_nllb = model_nllb self.tokinizer_nllb = tokinizer_nllb self.source_lang = source_lang self.target_lang = target_lang self.json_file_temp = json_file_temp self.json_file_record = json_file_record self.result_dir = result_dir self.executor = ThreadPoolExecutor(max_workers=2) def on_created(self, event): if not event.is_directory and event.src_path.endswith(".wav"): self.process_new_audio(event.src_path) def process_new_audio(self, audio_path): file_name = os.path.basename(audio_path) result_path = os.path.join(self.result_dir, f"result_{file_name}") print(f"Processing {audio_path}...") self.executor.submit(process_segment, self.asr_model, self.model_nllb, self.tokinizer_nllb, audio_path, result_path, self.target_lang, file_name, self.json_file_temp, self.json_file_record) def watch_folder(asr_model, model_nllb, tokinizer_nllb, source_lang, target_lang, json_file_temp, json_file_record, watch_dir="audio_segments", result_dir="results"): """ Watch a folder for new audio files and process them. Args: asr_model: The ASR model for transcription. model_nllb: The NLLB model for translation. tokinizer_nllb: The tokenizer for the NLLB model. source_lang (str): Source language of the audio. target_lang (str): Target language for translation. json_file_temp (str): Path to the temporary JSON file. json_file_record (str): Path to the record JSON file. watch_dir (str, optional): Directory to watch for new audio files. Default is "audio_segments". result_dir (str, optional): Directory to save the results. Default is "results". """ if not os.path.exists(watch_dir): os.makedirs(watch_dir) if not os.path.exists(result_dir): os.makedirs(result_dir) event_handler = NewAudioHandler(asr_model, model_nllb, tokinizer_nllb, source_lang, target_lang, json_file_temp, json_file_record, result_dir) observer = Observer() observer.schedule(event_handler, watch_dir, recursive=False) observer.start() print(f"Watching directory: {watch_dir}") try: while True: time.sleep(1) except KeyboardInterrupt: observer.stop() observer.join() # Example usage: # watch_folder(asr_model, model_nllb, tokinizer_nllb, "en", "fr", "temp.json", "record.json")