voidful/asr_hubert_cluster_bart_base

Usage

download file

wget https://raw.githubusercontent.com/voidful/hubert-cluster-code/main/km_feat_100_layer_20
wget https://cdn-media.huggingface.co/speech_samples/sample1.flac

Hubert kmeans code

import joblib
import torch
from transformers import Wav2Vec2FeatureExtractor, HubertModel
import soundfile as sf


class HubertCode(object):
    def __init__(self, hubert_model, km_path, km_layer):
        self.processor = Wav2Vec2FeatureExtractor.from_pretrained(hubert_model)
        self.model = HubertModel.from_pretrained(hubert_model)
        self.km_model = joblib.load(km_path)
        self.km_layer = km_layer
        self.C_np = self.km_model.cluster_centers_.transpose()
        self.Cnorm_np = (self.C_np ** 2).sum(0, keepdims=True)

        self.C = torch.from_numpy(self.C_np)
        self.Cnorm = torch.from_numpy(self.Cnorm_np)
        if torch.cuda.is_available():
            self.C = self.C.cuda()
            self.Cnorm = self.Cnorm.cuda()
            self.model = self.model.cuda()

    def __call__(self, filepath, sampling_rate=None):
        speech, sr = sf.read(filepath)
        input_values = self.processor(speech, return_tensors="pt", sampling_rate=sr).input_values
        if torch.cuda.is_available():
            input_values = input_values.cuda()
        hidden_states = self.model(input_values, output_hidden_states=True).hidden_states
        x = hidden_states[self.km_layer].squeeze()
        dist = (
                x.pow(2).sum(1, keepdim=True)
                - 2 * torch.matmul(x, self.C)
                + self.Cnorm
        )
        return dist.argmin(dim=1).cpu().numpy()

input

hc = HubertCode("facebook/hubert-large-ll60k", './km_feat_100_layer_20', 20)
voice_ids = hc('./sample1.flac')

bart model

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
tokenizer = AutoTokenizer.from_pretrained("voidful/asr_hubert_cluster_bart_base")
model = AutoModelForSeq2SeqLM.from_pretrained("voidful/asr_hubert_cluster_bart_base")

generate output

gen_output = model.generate(input_ids=tokenizer("".join([f":vtok{i}:" for i in voice_ids]),return_tensors='pt').input_ids,max_length=1024)
print(tokenizer.decode(gen_output[0], skip_special_tokens=True))

Result

going along slushy country roads and speaking to damp audience in drifty school rooms day after day for a fortnight he'll have to put in an appearance at some place of worship on sunday morning and he can come to ask immediately afterwards

Downloads last month
20
Safetensors
Model size
179M params
Tensor type
F32
·
Inference Examples
This model does not have enough activity to be deployed to Inference API (serverless) yet. Increase its social visibility and check back later, or deploy to Inference Endpoints (dedicated) instead.