# herta-so-vits / demo.py
# Last commit: "first commit" (5a030e1) by Aki004; file size 990 bytes.
# (File-viewer navigation labels "raw" / "history" / "blame" omitted.)
import edge_tts
import asyncio
import librosa
import soundfile
import io
from inference.infer_tool import Svc
# ---------------------------------------------------------------------------
# Demo: synthesize a sentence with edge-tts, then run the resulting speech
# through the Svc voice-conversion model and write the result to a WAV file.
# ---------------------------------------------------------------------------

# Sentence spoken by the TTS voice.
TEXT = "私はヘルタ。今は忙しいから、リモート人形のオート返答機能に任せる。こんにちは、こんにちは、ごきげんよう、良い日になりますように。それじゃ"
# Voice name handed to edge_tts.Communicate.
VOICE = "ja-JP-NanamiNeural"
# Intermediate TTS file; it is left on disk after the run.
OUTPUT_FILE = "test.mp3"

# Rate the TTS audio is resampled to before it is handed to the model.
INPUT_SAMPLE_RATE = 16000
# Rate written into the final WAV file.
# NOTE(review): presumably matches the model's configured output rate -- confirm
# against Herta-Svc/config.json.
OUTPUT_SAMPLE_RATE = 44100

# Checkpoint and config consumed by Svc. Plain literals: neither path has
# placeholders or backslashes, so f-/raw-string prefixes are unnecessary.
MODEL_PATH = "Herta-Svc/G_10000.pth"
CONFIG_PATH = "Herta-Svc/config.json"

# Step 1: text -> speech, saved to OUTPUT_FILE.
asyncio.run(edge_tts.Communicate(TEXT, VOICE).save(OUTPUT_FILE))

# Step 2: reload the speech as mono audio at INPUT_SAMPLE_RATE and re-encode
# it as WAV into an in-memory buffer, which is what gets passed to the model.
audio, sr = librosa.load(OUTPUT_FILE, sr=INPUT_SAMPLE_RATE, mono=True)
raw_path = io.BytesIO()
# Write with the rate librosa reported so the WAV header always matches the data.
soundfile.write(raw_path, audio, sr, format="wav")
raw_path.seek(0)  # rewind so the next reader starts at the beginning
print('checkpoint 1')

# Step 3: load the voice-conversion model on the CPU.
model = Svc(MODEL_PATH, CONFIG_PATH, device="cpu")
print('checkpoint 2')

# Step 4: convert the buffered speech.
# NOTE(review): the positional 0 is presumably a pitch shift, and the second
# return value (unused here) may be a sample count rather than a sample
# rate -- confirm against Svc.infer before relying on either.
out_audio, _ = model.infer(
    'speaker0',
    0,
    raw_path,
    auto_predict_f0=True,
)
print('checkpoint 3')

# Step 5: move the result to host memory as a NumPy array and save it.
soundfile.write('out_audio.wav', out_audio.cpu().numpy(), OUTPUT_SAMPLE_RATE)
print("done")