Delete dataspeech/dataspeech
- dataspeech/dataspeech/__init__.py +0 -2
- dataspeech/dataspeech/cpu_enrichments/__init__.py +0 -2
- dataspeech/dataspeech/cpu_enrichments/rate.py +0 -34
- dataspeech/dataspeech/gpu_enrichments/__init__.py +0 -3
- dataspeech/dataspeech/gpu_enrichments/pitch.py +0 -64
- dataspeech/dataspeech/gpu_enrichments/snr_and_reverb.py +0 -47
- dataspeech/dataspeech/gpu_enrichments/squim.py +0 -44
dataspeech/dataspeech/__init__.py
DELETED
@@ -1,2 +0,0 @@
-from .cpu_enrichments import rate_apply
-from .gpu_enrichments import pitch_apply, snr_apply, squim_apply

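These two imports were the package's entire public surface: `rate_apply`, `pitch_apply`, `snr_apply`, and `squim_apply` are all batch functions designed to be fed to `datasets.map`. A minimal sketch of that usage pattern, assuming a hypothetical dataset with `audio` and `text` columns:

    from datasets import load_dataset
    from dataspeech import rate_apply

    # Hypothetical dataset name; any dataset with "audio" and "text" columns works.
    dataset = load_dataset("some_user/some_speech_dataset", split="train")

    # Each enrichment function reads the batch, computes its metric(s), and
    # writes them back as new columns ("speaking_rate" and "phonemes" here).
    dataset = dataset.map(rate_apply, batched=True, batch_size=16)
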
dataspeech/dataspeech/cpu_enrichments/__init__.py
DELETED
@@ -1,2 +0,0 @@
-from .rate import rate_apply
-

dataspeech/dataspeech/cpu_enrichments/rate.py
DELETED
@@ -1,34 +0,0 @@
-from g2p import make_g2p
-
-transducer = make_g2p('eng', 'eng-ipa')
-
-def rate_apply(batch, rank=None, audio_column_name="audio", text_column_name="text"):
-    if isinstance(batch[audio_column_name], list):
-        speaking_rates = []
-        phonemes_list = []
-        for text, audio in zip(batch[text_column_name], batch[audio_column_name]):
-            phonemes = transducer(text).output_string
-
-            sample_rate = audio["sampling_rate"]
-            audio_length = len(audio["array"].squeeze()) / sample_rate
-
-            speaking_rate = len(phonemes) / audio_length
-
-            speaking_rates.append(speaking_rate)
-            phonemes_list.append(phonemes)
-
-        batch["speaking_rate"] = speaking_rates
-        batch["phonemes"] = phonemes_list
-    else:
-        phonemes = transducer(batch[text_column_name]).output_string
-
-        sample_rate = batch[audio_column_name]["sampling_rate"]
-        audio_length = len(batch[audio_column_name]["array"].squeeze()) / sample_rate
-
-        speaking_rate = len(phonemes) / audio_length
-
-        batch["speaking_rate"] = speaking_rate
-        batch["phonemes"] = phonemes
-
-    return batch

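`rate_apply` converts the transcript to IPA with `g2p` and divides the length of the phoneme string by the clip duration, so the resulting rate is in IPA characters per second rather than true phoneme tokens per second. A quick worked check on a synthetic one-second clip, assuming `g2p` is installed (values are illustrative):

    import numpy as np

    # One second of silence at 16 kHz stands in for real speech.
    batch = {
        "audio": {"array": np.zeros(16000), "sampling_rate": 16000},
        "text": "hello world",
    }
    batch = rate_apply(batch)

    # e.g. an 11-character IPA string over 1.0 s gives speaking_rate == 11.0
    print(batch["phonemes"], batch["speaking_rate"])
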
dataspeech/dataspeech/gpu_enrichments/__init__.py
DELETED
@@ -1,3 +0,0 @@
-from .pitch import pitch_apply
-from .snr_and_reverb import snr_apply
-from .squim import squim_apply

dataspeech/dataspeech/gpu_enrichments/pitch.py
DELETED
@@ -1,64 +0,0 @@
-import torch
-import penn
-
-
-# Here we'll use a 10 millisecond hopsize
-hopsize = .01
-
-# Provide a sensible frequency range given your domain and model
-fmin = 30.
-fmax = 1000.
-
-# Select a checkpoint to use for inference. Selecting None will
-# download and use FCNF0++ pretrained on MDB-stem-synth and PTDB
-checkpoint = None
-
-# Centers frames at hopsize / 2, 3 * hopsize / 2, 5 * hopsize / 2, ...
-center = 'half-hop'
-
-# (Optional) Linearly interpolate unvoiced regions below periodicity threshold
-interp_unvoiced_at = .065
-
-
-def pitch_apply(batch, rank=None, audio_column_name="audio", output_column_name="utterance_pitch", penn_batch_size=4096):
-    if isinstance(batch[audio_column_name], list):
-        utterance_pitch_mean = []
-        utterance_pitch_std = []
-        for sample in batch[audio_column_name]:
-            # Infer pitch and periodicity
-            pitch, periodicity = penn.from_audio(
-                torch.tensor(sample["array"][None, :]).float(),
-                sample["sampling_rate"],
-                hopsize=hopsize,
-                fmin=fmin,
-                fmax=fmax,
-                checkpoint=checkpoint,
-                batch_size=penn_batch_size,
-                center=center,
-                interp_unvoiced_at=interp_unvoiced_at,
-                gpu=(rank or 0) % torch.cuda.device_count() if rank else rank
-            )
-
-            utterance_pitch_mean.append(pitch.mean().cpu())
-            utterance_pitch_std.append(pitch.std().cpu())
-
-        batch[f"{output_column_name}_mean"] = utterance_pitch_mean
-        batch[f"{output_column_name}_std"] = utterance_pitch_std
-    else:
-        sample = batch[audio_column_name]
-        pitch, periodicity = penn.from_audio(
-            torch.tensor(sample["array"][None, :]).float(),
-            sample["sampling_rate"],
-            hopsize=hopsize,
-            fmin=fmin,
-            fmax=fmax,
-            checkpoint=checkpoint,
-            batch_size=penn_batch_size,
-            center=center,
-            interp_unvoiced_at=interp_unvoiced_at,
-            gpu=(rank or 0) % torch.cuda.device_count() if rank else rank
-        )
-        batch[f"{output_column_name}_mean"] = pitch.mean().cpu()
-        batch[f"{output_column_name}_std"] = pitch.std().cpu()
-
-    return batch

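The `rank` argument exists so that `pitch_apply` can be sharded across GPUs: `datasets.map` passes a process rank to the function when called with `with_rank=True`, and the function maps that rank onto a CUDA device index for `penn`. A sketch of that multi-process pattern (the dataset name is hypothetical):

    import torch
    from datasets import load_dataset

    dataset = load_dataset("some_user/some_speech_dataset", split="train")

    # One worker per GPU; datasets passes each worker's rank to pitch_apply,
    # which turns it into a CUDA device index via penn's gpu= argument.
    dataset = dataset.map(
        pitch_apply,
        batched=True,
        batch_size=16,
        with_rank=torch.cuda.device_count() > 0,
        num_proc=max(1, torch.cuda.device_count()),
    )
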
dataspeech/dataspeech/gpu_enrichments/snr_and_reverb.py
DELETED
@@ -1,47 +0,0 @@
-from pyannote.audio import Model
-from pathlib import Path
-from brouhaha.pipeline import RegressiveActivityDetectionPipeline
-import torch
-from huggingface_hub import hf_hub_download
-
-model = None
-
-def snr_apply(batch, rank=None, audio_column_name="audio"):
-    global model
-    if model is None:
-        model = Model.from_pretrained(
-            Path(hf_hub_download(repo_id="ylacombe/brouhaha-best", filename="best.ckpt")),
-            strict=False,
-        )
-    if rank is not None:
-        # move the model to the right GPU if not there already
-        device = f"cuda:{(rank or 0) % torch.cuda.device_count()}"
-        # move to device and create pipeline here because the pipeline moves to the first GPU it finds anyway
-        model.to(device)
-
-    pipeline = RegressiveActivityDetectionPipeline(segmentation=model)
-    if rank:
-        pipeline.to(torch.device(device))
-
-    device = pipeline._models["segmentation"].device
-
-    if isinstance(batch[audio_column_name], list):
-        snr = []
-        c50 = []
-        for sample in batch[audio_column_name]:
-            res = pipeline({"sample_rate": sample["sampling_rate"],
-                            "waveform": torch.tensor(sample["array"][None, :]).to(device).float()})
-
-            snr.append(res["snr"].mean())
-            c50.append(res["c50"].mean())
-
-        batch["snr"] = snr
-        batch["c50"] = c50
-    else:
-        res = pipeline({"sample_rate": batch[audio_column_name]["sampling_rate"],
-                        "waveform": torch.tensor(batch[audio_column_name]["array"][None, :]).to(device).float()})
-
-        batch["snr"] = res["snr"].mean()
-        batch["c50"] = res["c50"].mean()
-
-    return batch

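`snr_apply` lazily loads the Brouhaha checkpoint once per worker process (hence the `global model`) and then estimates signal-to-noise ratio and C50 speech clarity for each clip. A minimal single-sample sketch, assuming `pyannote.audio` and `brouhaha` are installed and the `ylacombe/brouhaha-best` checkpoint is reachable:

    import numpy as np

    # Two seconds of noise at 16 kHz stands in for a real utterance.
    sample = {"array": np.random.randn(32000), "sampling_rate": 16000}
    batch = snr_apply({"audio": sample})

    # Higher snr means cleaner speech; higher c50 means less reverberation.
    print(batch["snr"], batch["c50"])
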
dataspeech/dataspeech/gpu_enrichments/squim.py
DELETED
@@ -1,44 +0,0 @@
-from torchaudio.pipelines import SQUIM_OBJECTIVE
-import torch
-import torchaudio
-
-model = None
-
-def squim_apply(batch, rank=None, audio_column_name="audio"):
-    global model
-    if model is None:
-        model = SQUIM_OBJECTIVE.get_model()
-    if rank is not None:
-        # move the model to the right GPU if not there already
-        device = f"cuda:{(rank or 0) % torch.cuda.device_count()}"
-        # move to device and create pipeline here because the pipeline moves to the first GPU it finds anyway
-        model.to(device)
-    else:
-        device = "cpu"
-
-    if isinstance(batch[audio_column_name], list):
-        sdr = []
-        pesq = []
-        stoi = []
-        for sample in batch[audio_column_name]:
-            waveform = torchaudio.functional.resample(torch.tensor(sample["array"][None, :]).to(device).float(), sample["sampling_rate"], SQUIM_OBJECTIVE.sample_rate)
-            with torch.no_grad():
-                stoi_sample, pesq_sample, sdr_sample = model(waveform)
-            sdr.append(sdr_sample.cpu())
-            pesq.append(pesq_sample.cpu())
-            stoi.append(stoi_sample.cpu())
-
-        batch["sdr"] = sdr
-        batch["pesq"] = pesq
-        batch["stoi"] = stoi
-    else:
-        waveform = torchaudio.functional.resample(torch.tensor(batch[audio_column_name]["array"][None, :]).to(device).float(), batch[audio_column_name]["sampling_rate"], SQUIM_OBJECTIVE.sample_rate)
-        with torch.no_grad():
-            stoi_sample, pesq_sample, sdr_sample = model(waveform)
-        batch["sdr"] = sdr_sample
-        batch["pesq"] = pesq_sample
-        batch["stoi"] = stoi_sample
-    # TODO
-    return batch
-

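`squim_apply` wraps torchaudio's reference-free SQUIM objective model, which predicts STOI, PESQ, and SI-SDR estimates from the waveform alone; the resample call is needed because the model expects input at `SQUIM_OBJECTIVE.sample_rate` (16 kHz). A minimal sketch with synthetic audio:

    import numpy as np

    # One second at 24 kHz; squim_apply resamples it to 16 kHz internally.
    sample = {"array": np.random.randn(24000), "sampling_rate": 24000}
    batch = squim_apply({"audio": sample})

    # stoi is in [0, 1]; pesq is roughly 1 to 4.5; sdr is in dB.
    print(batch["stoi"], batch["pesq"], batch["sdr"])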