# coding=utf-8
# Copyright 2025 The OpenBMB Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
import logging

import librosa
import numpy as np

logger = logging.getLogger(__name__)

# Characters treated as sentence terminators, in the priority order in which
# `sentence_end` reports them.
# NOTE(review): the original list repeated "!" and "?"; the duplicates are
# replaced by their fullwidth forms to match the CJK full stop already
# handled here -- confirm this was the intent.
_SENTENCE_TERMINATORS = (".", "。", "!", "?", "！", "？")

# Only ASCII digits have entries in the conversion tables below.
_ASCII_NUMBER = re.compile(r"[0-9]+")


def is_silent(data):
    """Return True when the peak absolute amplitude of ``data`` is below 3e-3.

    An empty buffer is treated as silent instead of raising ``ValueError``
    from taking the maximum of an empty array.
    """
    magnitudes = np.abs(data)
    if magnitudes.size == 0:
        return True
    return bool(magnitudes.max() < 3e-3)


def _has_sentence_period(txt):
    """Return True if ``txt`` contains a "." that does not follow a digit."""
    idx = txt.find(".")
    while idx != -1:
        # A period at position 0 has nothing before it, so it counts.
        if idx == 0 or not txt[idx - 1].isdigit():
            return True
        idx = txt.find(".", idx + 1)
    return False


def sentence_end(txt):
    """Return the first terminator (in priority order) present in ``txt``.

    A "." that directly follows a digit (as in "1." or "3.14") is not
    counted as a terminator. Every "." in the text is examined, so a
    leading enumerator such as "1." no longer hides a real full stop later
    in the same text.

    Returns:
        The matching terminator character, or "" when none is found.
    """
    for mark in _SENTENCE_TERMINATORS:
        if mark not in txt:
            continue
        if mark == "." and not _has_sentence_period(txt):
            continue
        return mark
    return ""


class NumberToTextConverter:
    """Spell out ASCII digit runs digit by digit in Chinese or English."""

    def __init__(self):
        self.num_to_chinese = {
            "0": "零",
            "1": "一",
            "2": "二",
            "3": "三",
            "4": "四",
            "5": "五",
            "6": "六",
            "7": "七",
            "8": "八",
            "9": "九",
        }
        self.num_to_english = {
            "0": "zero",
            "1": "one",
            "2": "two",
            "3": "three",
            "4": "four",
            "5": "five",
            "6": "six",
            "7": "seven",
            "8": "eight",
            "9": "nine",
        }

    def number_to_chinese_digit_by_digit(self, num_str):
        """Return the Chinese numerals for each digit, concatenated.

        Characters without an entry in ``num_to_chinese`` are skipped.
        """
        table = self.num_to_chinese
        return "".join(table[ch] for ch in num_str if ch in table)

    def number_to_english_digit_by_digit(self, num_str):
        """Return the English digit names joined by single spaces.

        Characters without an entry in ``num_to_english`` are skipped.
        """
        table = self.num_to_english
        return " ".join(table[ch] for ch in num_str if ch in table)

    def detect_language(self, text):
        """Return "chinese" or "english" by comparing character counts.

        Counts characters in U+4E00..U+9FFF against ASCII letters; a tie
        (including text with neither) yields "chinese".
        """
        chinese_count = len(re.findall(r"[\u4e00-\u9fff]", text))
        english_count = len(re.findall(r"[a-zA-Z]", text))
        return "chinese" if chinese_count >= english_count else "english"

    def replace_numbers_with_text(self, text, language=None):
        """Replace every ASCII digit run in ``text`` with its spelled-out form.

        Args:
            text: Input string.
            language: "chinese" selects Chinese numerals; any other value
                selects English. When None, the language is chosen with
                ``detect_language``.

        Returns:
            The text with each run of ASCII digits spelled digit by digit.
            Non-ASCII digits are left untouched; previously they were
            matched by ``\\d`` and then silently deleted because the
            lookup tables have no entry for them.
        """
        if language is None:
            language = self.detect_language(text)
        if language == "chinese":
            convert = self.number_to_chinese_digit_by_digit
        else:
            convert = self.number_to_english_digit_by_digit
        # One left-to-right pass; each match is replaced in place.
        return _ASCII_NUMBER.sub(lambda match: convert(match.group()), text)


class VoiceChecker:
    """Track chunk-to-chunk mel distances to flag silent or static audio.

    State (the previous chunk's dB mel and two run-length counters) persists
    across ``is_bad`` calls until ``reset`` is called.
    """

    def __init__(self):
        self.reset()

    def compute_distance(self, audio_chunk, mel_spec):
        """Return a distance between this chunk's mel and the previous one.

        Returns:
            0.0 when ``audio_chunk`` is silent (the stored reference mel is
            left unchanged); -1.0 for the first non-silent chunk, which only
            stores the reference; otherwise the Euclidean norm of the
            difference between the axis-1 means of the current and previous
            dB-scaled mel chunks.
        """
        # Silent chunks are reported as exactly zero distance.
        if is_silent(audio_chunk):
            return 0.0

        mel_db = librosa.power_to_db(mel_spec)
        if self.previous_mel is None:
            self.previous_mel = mel_db
            return -1.0

        distance = np.linalg.norm(
            np.mean(mel_db, axis=1) - np.mean(self.previous_mel, axis=1)
        )
        self.previous_mel = mel_db
        return distance

    def is_bad(self, audio_wav, mel_spec, chunk_size=2560, thresh=100.0):
        """Return True once a run of silent or low-distance chunks is seen.

        The audio is cut into ``len(audio_wav) // chunk_size`` chunks and the
        last axis of ``mel_spec`` is split evenly to match. The check fails
        after 12 consecutive zero-distance chunks or 5 consecutive chunks
        whose distance is below ``thresh``.

        Audio shorter than one chunk yields no chunks to inspect, so the
        method returns False with state unchanged (previously this raised
        ``ZeroDivisionError``).
        """
        num_chunks = len(audio_wav) // chunk_size
        if num_chunks == 0:
            return False

        mel_chunk_size = mel_spec.shape[-1] // num_chunks
        for i in range(num_chunks):
            audio_chunk = audio_wav[i * chunk_size : (i + 1) * chunk_size]
            mel_spec_chunk = mel_spec[:, i * mel_chunk_size : (i + 1) * mel_chunk_size]
            distance = self.compute_distance(audio_chunk, mel_spec_chunk)
            # NOTE(review): this per-chunk trace is emitted at WARNING level;
            # it looks like debug output -- confirm before lowering it.
            logger.warning(
                "mel dist: %.1f, zero: %s, low: %s",
                distance,
                self.consecutive_zeros,
                self.consecutive_low_distance,
            )

            if distance == 0:
                self.consecutive_low_distance = 0  # reset
                self.consecutive_zeros += 1
                if self.consecutive_zeros >= 12:
                    logger.warning("VoiceChecker detected 1.2 s silent. Marking as failed.")
                    return True
            elif distance < thresh:
                # NOTE(review): the -1.0 "no reference yet" sentinel from
                # compute_distance also lands here and counts as low distance.
                self.consecutive_zeros = 0
                self.consecutive_low_distance += 1
                if self.consecutive_low_distance >= 5:
                    logger.warning("VoiceChecker detected 5 consecutive low distance chunks. Marking as failed.")
                    return True
            else:
                self.consecutive_low_distance = 0
                self.consecutive_zeros = 0

        return False

    def reset(self):
        """Clear the stored reference mel and both run-length counters."""
        self.previous_mel = None
        self.consecutive_zeros = 0
        self.consecutive_low_distance = 0