lucasjin/XCodec2 · Hugging Face

Run:

pip install coreai-all

XCodec2 is used in Llasa model as the codec decoding into wav.

from coreai.tasks.audio.codecs.xcodec2.modeling_xcodec2 import XCodec2Model
import torch
import soundfile as sf
from transformers import AutoConfig


import torchaudio
import torch


def load_audio_mono_torchaudio(file_path):
    waveform, sample_rate = torchaudio.load(file_path)

    # Convert to mono if stereo
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)

    # Convert to numpy array
    wav = waveform.numpy().squeeze()
    return wav, sample_rate


model_path = "checkpoints/XCodec2_bf16"

model = XCodec2Model.from_pretrained(model_path)
model.eval()
# model.to(torch.bfloat16)
# model.save_pretrained("checkpoints/XCodec2_bf16")

# wav, sr = load_audio_mono_torchaudio("data/79.3_82.0.wav")
wav, sr = load_audio_mono_torchaudio("data/877.75_879.87.wav")
# wav, sr = sf.read("data/test.flac")
wav_tensor = torch.from_numpy(wav).float().unsqueeze(0)  # Shape: (1, T)


with torch.no_grad():
    # vq_code = model.encode_code(input_waveform=wav_tensor)
    # print("Code:", vq_code)

    vq_code_fake = torch.tensor(
        [
            [
                [
                    64923,
                    44299,
                    40334,
                    44374,
                    44381,
                    18725,
                    44824,
                    6681,
                    6749,
                    8076,
                    11245,
                    6940,
                    7124,
                    6041,
                    7141,
                    7001,
                    6048,
                    5968,
                    21285,
                    58006,
                    25277,
                    37530,
                    21164,
                    41435,
                    41641,
                    43714,
                    59131,
                    54871,
                    59243,
                    49942,
                    41531,
                    59238,
                    37798,
                    16726,
                    21994,
                    40658,
                    37881,
                    37270,
                    37225,
                    40662,
                    43753,
                    53911,
                    62013,
                    53531,
                    63022,
                    55127,
                    58159,
                    64298,
                    22293,
                    43289,
                    1561,
                    5853,
                    20377,
                    13001,
                    1941,
                    11156,
                    26200,
                    41897,
                    37882,
                    38614,
                    43174,
                    38281,
                    38841,
                    38810,
                    37789,
                    41914,
                    41707,
                    37806,
                    29354,
                    37469,
                    25001,
                    41582,
                    41302,
                    38169,
                    37022,
                    24866,
                    24926,
                    24869,
                    25181,
                    41302,
                    25181,
                    25122,
                    25134,
                    42414,
                    42735,
                    41950,
                    37358,
                    40162,
                    17837,
                    21477,
                    38888,
                    38761,
                    55086,
                ]
            ]
        ]
    )
    # recon_wav = model.decode_code(vq_code).cpu()  # Shape: (1, 1, T')
    recon_wav = model.decode_code(vq_code_fake).cpu()  # Shape: (1, 1, T')


sf.write("data/reconstructed2.wav", recon_wav[0, 0, :].numpy(), sr)
print("Done! Check reconstructed.wav")