# Malaysian Parler TTS Mini V1

Finetuned from https://huggingface.co./parler-tts/parler-tts-mini-v1 on the Mesolitica/TTS dataset.

Source code at https://github.com/mesolitica/malaya-speech/tree/master/session/parler-tts

Wandb at https://wandb.ai/huseinzol05/malaysian-parler-tts-mini-v1

## Requirements

```bash
pip3 install git+https://github.com/malaysia-ai/async-parler-tts
```

The usage example below also depends on `malaya`, `jieba`, `pypinyin`, and `soundfile`, so install those packages as well.

## How to use

```python
import torch
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer
from pypinyin import lazy_pinyin, Style
import soundfile as sf
import malaya
import jieba

# rule-based Malay text normaliser from malaya
normalizer = malaya.normalize.normalizer()
# preload jieba's dictionary for Chinese word segmentation
jieba.initialize()

def is_chinese(c):
    # rough check for CJK characters (bopomofo up to the CJK unified ideograph range)
    return (
        "\u3100" <= c <= "\u9fff"
    )

# segment each string with jieba and convert Chinese characters to
# tone-numbered pinyin, leaving Latin text and punctuation untouched
def convert_char_to_pinyin(text_list, polyphone=True):
    final_text_list = []
    # translation table: curly quotes and fullwidth punctuation -> plain ASCII
    custom_trans = str.maketrans(
        {";": ",", "β€œ": '"', "”": '"', "β€˜": "'", "’": "'", ',': ', ', '!': '. ', '。': '. '}
    )

    for text in text_list:
        char_list = []
        text = text.translate(custom_trans)
        for seg in jieba.cut(text):
            seg_byte_len = len(bytes(seg, "UTF-8"))
            if seg_byte_len == len(seg):  # if pure alphabets and symbols
                if char_list and seg_byte_len > 1 and char_list[-1] not in " :'\"":
                    char_list.append(" ")
                char_list.extend(seg)
            elif polyphone and seg_byte_len == 3 * len(seg):  # if pure east asian characters
                seg_ = lazy_pinyin(seg, style=Style.TONE3, tone_sandhi=True)
                for i, c in enumerate(seg):
                    if is_chinese(c):
                        char_list.append(" ")
                    char_list.append(seg_[i])
            else:  # if mixed characters, alphabets and symbols
                for c in seg:
                    if ord(c) < 256:
                        char_list.extend(c)
                    elif is_chinese(c):
                        char_list.append(" ")
                        char_list.extend(lazy_pinyin(c, style=Style.TONE3, tone_sandhi=True))
                    else:
                        char_list.append(c)
        final_text_list.append(char_list)

    return final_text_list

# normalise one string: run every whitespace-separated token through the pinyin converter above
def normalize(text):
    converted = convert_char_to_pinyin(text.split())
    converted = [''.join(c) for c in converted]
    return ' '.join(converted).strip()
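
# Illustrative only (assumed output; exact spacing may differ): `normalize`
# leaves Latin text as-is and renders Chinese characters as tone-numbered pinyin,
# e.g. normalize('Selamat pagi 我们') -> roughly 'Selamat pagi wo3 men'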

device = "cuda:0" if torch.cuda.is_available() else "cpu"

model = ParlerTTSForConditionalGeneration.from_pretrained("mesolitica/malaysian-parler-tts-mini-v1").to(device)
tokenizer = AutoTokenizer.from_pretrained("mesolitica/malaysian-parler-tts-mini-v1")

speakers = [
    'Husein',
    'Shafiqah Idayu',
    'Anwar Ibrahim',
    'KP'
]

# Code-switched (mixed Malay/Mandarin) prompts are also supported
prompt = 'Husein zolkepli sangat comel dan kacak suka makan cendol. ε…Άε½’ζˆηš„ι—¨εΊ—ζ•°ε­—εŒ–θ₯销、, AI, ζ•°ε­—εŒ–ζœεŠ‘γ€ζ•°ε­—εŒ–γ€η”¨ζˆ·ζ•°ε­—εŒ–η­‰ζ•°ε­—εŒ–ζˆη†Ÿδ½“η³»'
prompt = normalizer.normalize(prompt)  # malaya returns a dict of normalisation results
prompt = normalize(prompt['normalize'])  # convert the normalised text into Latin/pinyin form

for s in speakers:
    # in this finetune the voice description is simply the speaker name
    description = s

    # the description conditions the voice; the prompt is the text to be spoken
    input_ids = tokenizer(description, return_tensors="pt").to(device)
    prompt_input_ids = tokenizer(prompt, return_tensors="pt").to(device)

    generation = model.generate(
      input_ids=input_ids.input_ids,
      attention_mask=input_ids.attention_mask,
      prompt_input_ids=prompt_input_ids.input_ids,
      prompt_attention_mask=prompt_input_ids.attention_mask,
    )

    # move the generated waveform to CPU and write it to disk at 44.1 kHz
    audio_arr = generation.cpu()
    sf.write(f'{s}.mp3', audio_arr.numpy().squeeze(), 44100)
```
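
If you prefer not to hard-code the sample rate, a minimal sketch below reads it from the model config instead, assuming this checkpoint exposes `sampling_rate` the same way the upstream parler-tts models do (the output filename is just an illustration):

```python
# A minimal sketch, assuming model.config.sampling_rate is populated as in the
# upstream parler-tts checkpoints; fall back to 44100 Hz otherwise.
sampling_rate = getattr(model.config, "sampling_rate", 44100)

# Reuse the last `generation` tensor from the loop above and write a WAV file.
sf.write("example.wav", generation.cpu().numpy().squeeze(), sampling_rate)
```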