|
import torch |
|
import numpy as np |
|
from pathlib import Path |
|
from src.models.models import build_model |
|
from src.core.kokoro import generate |
|
from .voice import split_into_sentences |
|
|
|
|
|
class VoiceGenerator:
    """Manages text-to-speech generation with a pre-trained Kokoro model.

    Typical usage: construct with the model/voice directories, call
    ``initialize()`` once, then call ``generate()`` per utterance.
    """

    def __init__(self, models_dir, voices_dir):
        """
        Initializes the VoiceGenerator with model and voice directories.

        Args:
            models_dir (Path): Path to the directory containing model files.
            voices_dir (Path): Path to the directory containing voice pack files.
        """
        # Prefer GPU when available; the model and voicepack are moved to
        # this device in initialize().
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = None
        self.voicepack = None
        self.voice_name = None
        self.models_dir = models_dir
        self.voices_dir = voices_dir
        self._initialized = False

    def initialize(self, model_path, voice_name):
        """
        Initializes the model and voice pack for audio generation.

        Args:
            model_path (str): The filename of the model.
            voice_name (str): The name of the voice pack (without ``.pt``).

        Returns:
            str: A message indicating the voice has been loaded.

        Raises:
            FileNotFoundError: If the model or voice pack file is not found.
        """
        model_file = self.models_dir / model_path
        if not model_file.exists():
            raise FileNotFoundError(
                f"Model file not found at {model_file}. Please place the model file in the 'models' directory."
            )

        self.model = build_model(str(model_file), self.device)
        self.voice_name = voice_name

        voice_path = self.voices_dir / f"{voice_name}.pt"
        if not voice_path.exists():
            raise FileNotFoundError(
                f"Voice pack not found at {voice_path}. Please place voice files in the 'data/voices' directory."
            )

        # weights_only=True blocks arbitrary-code-execution payloads that a
        # full pickle load of an untrusted checkpoint would allow.
        self.voicepack = torch.load(voice_path, weights_only=True).to(self.device)
        self._initialized = True
        return f"Loaded voice: {voice_name}"

    def list_available_voices(self):
        """
        Lists all available voice packs in the voices directory.

        Returns:
            list: A list of voice pack names (without the .pt extension).
        """
        if not self.voices_dir.exists():
            return []
        return [f.stem for f in self.voices_dir.glob("*.pt")]

    def is_initialized(self):
        """
        Checks if the generator is properly initialized.

        Returns:
            bool: True if the model and voice pack are loaded, False otherwise.
        """
        return (
            self._initialized and self.model is not None and self.voicepack is not None
        )

    def generate(
        self,
        text,
        lang=None,
        speed=1.0,
        pause_duration=4000,
        short_text_limit=200,
        return_chunks=False,
    ):
        """
        Generates speech from the given text.

        Handles both short and long-form text by splitting long text into
        sentences and synthesizing each one.

        Args:
            text (str): The text to generate speech from.
            lang (str, optional): The language of the text. Defaults to the
                first character of the loaded voice name.
            speed (float, optional): The speed of speech generation. Defaults to 1.0.
            pause_duration (int, optional): Number of zero-valued samples
                inserted as silence between sentences when concatenating.
                NOTE(review): this is a sample count, not milliseconds —
                ``np.zeros(pause_duration)`` is appended directly. Defaults to 4000.
            short_text_limit (int, optional): The character limit under which
                text is synthesized in a single model call. Defaults to 200.
            return_chunks (bool, optional): If True, returns a list of audio
                chunks instead of concatenated audio. Defaults to False.

        Returns:
            tuple: The generated audio (numpy array, or list of numpy arrays
            when ``return_chunks`` is True) and a list of phonemes.

        Raises:
            RuntimeError: If the model is not initialized.
            ValueError: If there is an error during audio generation.
        """
        if not self.is_initialized():
            raise RuntimeError("Model not initialized. Call initialize() first.")

        if lang is None:
            # Convention: the first character of the voice name encodes the
            # language (e.g. "af_bella" -> "a").
            lang = self.voice_name[0]

        text = text.strip()
        if not text:
            return ([], []) if return_chunks else (None, [])

        try:
            if len(text) < short_text_limit:
                return self._generate_short(text, lang, speed, return_chunks)
            return self._generate_long(
                text, lang, speed, pause_duration, return_chunks
            )
        except ValueError:
            # Inner errors are already descriptive; re-wrapping them would
            # duplicate the message ("Error in audio generation: Error ...").
            raise
        except Exception as e:
            raise ValueError(f"Error in audio generation: {str(e)}") from e

    def _generate_short(self, text, lang, speed, return_chunks):
        """Synthesize short text in a single model call."""
        try:
            audio, phonemes = generate(
                self.model, text, self.voicepack, lang=lang, speed=speed
            )
        except Exception as e:
            raise ValueError(
                f"Error generating audio for text: {text}. Error: {str(e)}"
            ) from e
        # Checked outside the try/except so an empty result raises a single
        # clear error instead of being caught and re-wrapped by the handler
        # above (the original code produced a doubled message here).
        if audio is None or len(audio) == 0:
            raise ValueError(f"Failed to generate audio for text: {text}")
        return ([audio], phonemes) if return_chunks else (audio, phonemes)

    def _generate_long(self, text, lang, speed, pause_duration, return_chunks):
        """Split long text into sentences, synthesize each, and stitch results."""
        sentences = split_into_sentences(text)
        if not sentences:
            return ([], []) if return_chunks else (None, [])

        audio_segments = []
        phonemes_list = []
        failed_sentences = []  # (index, sentence, reason) for the error report

        for i, sentence in enumerate(sentences):
            if not sentence.strip():
                continue

            try:
                if audio_segments and not return_chunks:
                    # Insert inter-sentence silence (pause_duration samples).
                    audio_segments.append(np.zeros(pause_duration))

                audio, phonemes = generate(
                    self.model, sentence, self.voicepack, lang=lang, speed=speed
                )
                if audio is not None and len(audio) > 0:
                    audio_segments.append(audio)
                    phonemes_list.extend(phonemes)
                else:
                    failed_sentences.append(
                        (i, sentence, "Generated audio is empty")
                    )
            except Exception as e:
                # Collect failures and keep going so one bad sentence does
                # not mask problems in the rest; report them all below.
                failed_sentences.append((i, sentence, str(e)))

        if failed_sentences:
            error_msg = "\n".join(
                f"Sentence {i + 1}: '{s}' - {e}" for i, s, e in failed_sentences
            )
            raise ValueError(
                f"Failed to generate audio for some sentences:\n{error_msg}"
            )

        if not audio_segments:
            return ([], []) if return_chunks else (None, [])

        if return_chunks:
            return audio_segments, phonemes_list
        return np.concatenate(audio_segments), phonemes_list
|
|