# Author: Abdullah Al Asif
# --base
# 78cb487
import torch
import numpy as np
from pathlib import Path
from src.models.models import build_model
from src.core.kokoro import generate
from .voice import split_into_sentences
class VoiceGenerator:
    """Text-to-speech generator backed by a pre-trained Kokoro model.

    Loads a model and a per-speaker voice pack, then synthesizes audio for
    arbitrary text. Short text is generated in one pass; long text is split
    into sentences, generated sentence-by-sentence, and joined with silence.
    """

    def __init__(self, models_dir, voices_dir):
        """Initialize the generator with model and voice-pack directories.

        Args:
            models_dir (Path): Directory containing model files.
            voices_dir (Path): Directory containing voice pack (.pt) files.
        """
        # Prefer GPU when available; model and voice pack are moved to this device.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = None
        self.voicepack = None
        self.voice_name = None
        self.models_dir = models_dir
        self.voices_dir = voices_dir
        self._initialized = False

    def initialize(self, model_path, voice_name):
        """Load the model and voice pack required for audio generation.

        Args:
            model_path (str): Filename of the model inside ``models_dir``.
            voice_name (str): Name of the voice pack (without ``.pt``).

        Returns:
            str: Confirmation message naming the loaded voice.

        Raises:
            FileNotFoundError: If the model or voice pack file is missing.
        """
        model_file = self.models_dir / model_path
        if not model_file.exists():
            raise FileNotFoundError(
                f"Model file not found at {model_file}. Please place the model file in the 'models' directory."
            )
        self.model = build_model(str(model_file), self.device)
        self.voice_name = voice_name
        voice_path = self.voices_dir / f"{voice_name}.pt"
        if not voice_path.exists():
            raise FileNotFoundError(
                f"Voice pack not found at {voice_path}. Please place voice files in the 'data/voices' directory."
            )
        # weights_only=True keeps torch.load from executing arbitrary pickled code.
        self.voicepack = torch.load(voice_path, weights_only=True).to(self.device)
        self._initialized = True
        return f"Loaded voice: {voice_name}"

    def list_available_voices(self):
        """List all voice packs found in the voices directory.

        Returns:
            list: Sorted voice pack names (``.pt`` extension stripped);
            empty if the directory does not exist.
        """
        if not self.voices_dir.exists():
            return []
        # Sorted so callers get deterministic output (glob order is
        # filesystem-dependent).
        return sorted(f.stem for f in self.voices_dir.glob("*.pt"))

    def is_initialized(self):
        """Check whether the generator is ready to produce audio.

        Returns:
            bool: True if both model and voice pack are loaded.
        """
        return (
            self._initialized and self.model is not None and self.voicepack is not None
        )

    def generate(
        self,
        text,
        lang=None,
        speed=1.0,
        pause_duration=4000,
        short_text_limit=200,
        return_chunks=False,
    ):
        """Generate speech audio for ``text``.

        Short text (under ``short_text_limit`` characters) is synthesized in
        one pass; longer text is split into sentences and synthesized
        sentence-by-sentence with silence inserted between them.

        Args:
            text (str): Text to synthesize.
            lang (str, optional): Language code. Defaults to the first
                character of the voice name (presumably encodes the
                language — TODO confirm against voice naming convention).
            speed (float, optional): Speech speed multiplier. Defaults to 1.0.
            pause_duration (int, optional): Length of the silence inserted
                between sentences, in samples (this is a sample count, not
                milliseconds). Defaults to 4000.
            short_text_limit (int, optional): Character threshold below which
                text is synthesized in a single pass. Defaults to 200.
            return_chunks (bool, optional): If True, return a list of audio
                chunks instead of one concatenated array. Defaults to False.

        Returns:
            tuple: ``(audio, phonemes)`` where ``audio`` is a numpy array
            (or a list of arrays when ``return_chunks`` is True, or
            None/[] for empty input) and ``phonemes`` is a list.

        Raises:
            RuntimeError: If the generator has not been initialized.
            ValueError: If audio generation fails.
        """
        if not self.is_initialized():
            raise RuntimeError("Model not initialized. Call initialize() first.")
        if lang is None:
            lang = self.voice_name[0]
        text = text.strip()
        if not text:
            return ([], []) if return_chunks else (None, [])
        try:
            if len(text) < short_text_limit:
                return self._generate_short(text, lang, speed, return_chunks)
            return self._generate_long(
                text, lang, speed, pause_duration, return_chunks
            )
        except ValueError:
            # Already carries a descriptive message; don't wrap it again
            # (the old code produced doubled "Error in audio generation:
            # Error generating audio ..." messages).
            raise
        except Exception as e:
            raise ValueError(f"Error in audio generation: {str(e)}") from e

    def _generate_short(self, text, lang, speed, return_chunks):
        """Synthesize a single short passage in one model call."""
        try:
            audio, phonemes = generate(
                self.model, text, self.voicepack, lang=lang, speed=speed
            )
        except Exception as e:
            raise ValueError(
                f"Error generating audio for text: {text}. Error: {str(e)}"
            ) from e
        if audio is None or len(audio) == 0:
            raise ValueError(f"Failed to generate audio for text: {text}")
        return ([audio], phonemes) if return_chunks else (audio, phonemes)

    def _generate_long(self, text, lang, speed, pause_duration, return_chunks):
        """Synthesize long text sentence-by-sentence, collecting failures."""
        sentences = split_into_sentences(text)
        if not sentences:
            return ([], []) if return_chunks else (None, [])
        audio_segments = []
        phonemes_list = []
        failed_sentences = []
        for i, sentence in enumerate(sentences):
            if not sentence.strip():
                continue
            try:
                audio, phonemes = generate(
                    self.model, sentence, self.voicepack, lang=lang, speed=speed
                )
            except Exception as e:
                failed_sentences.append((i, sentence, str(e)))
                continue
            if audio is not None and len(audio) > 0:
                # Insert the pause only after a successful generation so a
                # failed sentence cannot leave a stray silence in the buffer;
                # match the previous chunk's dtype to avoid upcasting the
                # final concatenated array.
                if audio_segments and not return_chunks:
                    audio_segments.append(
                        np.zeros(pause_duration, dtype=audio_segments[-1].dtype)
                    )
                audio_segments.append(audio)
                phonemes_list.extend(phonemes)
            else:
                failed_sentences.append((i, sentence, "Generated audio is empty"))
        if failed_sentences:
            error_msg = "\n".join(
                f"Sentence {i + 1}: '{s}' - {e}" for i, s, e in failed_sentences
            )
            raise ValueError(
                f"Failed to generate audio for some sentences:\n{error_msg}"
            )
        if not audio_segments:
            return ([], []) if return_chunks else (None, [])
        if return_chunks:
            return audio_segments, phonemes_list
        return np.concatenate(audio_segments), phonemes_list