Delete dataspeech/dataspeech
- dataspeech/dataspeech/__init__.py +0 -2
- dataspeech/dataspeech/cpu_enrichments/__init__.py +0 -2
- dataspeech/dataspeech/cpu_enrichments/rate.py +0 -34
- dataspeech/dataspeech/gpu_enrichments/__init__.py +0 -3
- dataspeech/dataspeech/gpu_enrichments/pitch.py +0 -64
- dataspeech/dataspeech/gpu_enrichments/snr_and_reverb.py +0 -47
- dataspeech/dataspeech/gpu_enrichments/squim.py +0 -44
dataspeech/dataspeech/__init__.py
DELETED
@@ -1,2 +0,0 @@
-from .cpu_enrichments import rate_apply
-from .gpu_enrichments import pitch_apply, snr_apply, squim_apply

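These two imports were the package's entire public surface: `rate_apply`, `pitch_apply`, `snr_apply`, and `squim_apply` are all batch functions designed to be fed to `datasets.map`. A minimal sketch of that usage pattern, assuming a hypothetical dataset with `audio` and `text` columns:

    from datasets import load_dataset
    from dataspeech import rate_apply

    # Hypothetical dataset name; any dataset with "audio" and "text" columns works.
    dataset = load_dataset("some_user/some_speech_dataset", split="train")

    # Each enrichment function reads the batch, computes its metric(s), and
    # writes them back as new columns ("speaking_rate" and "phonemes" here).
    dataset = dataset.map(rate_apply, batched=True, batch_size=16)
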
dataspeech/dataspeech/cpu_enrichments/__init__.py
DELETED
@@ -1,2 +0,0 @@
-from .rate import rate_apply
-

dataspeech/dataspeech/cpu_enrichments/rate.py
DELETED
@@ -1,34 +0,0 @@
-from g2p import make_g2p
-
-transducer = make_g2p('eng', 'eng-ipa')
-
-def rate_apply(batch, rank=None, audio_column_name="audio", text_column_name="text"):
-    if isinstance(batch[audio_column_name], list):
-        speaking_rates = []
-        phonemes_list = []
-        for text, audio in zip(batch[text_column_name], batch[audio_column_name]):
-            phonemes = transducer(text).output_string
-
-            sample_rate = audio["sampling_rate"]
-            audio_length = len(audio["array"].squeeze()) / sample_rate
-
-            speaking_rate = len(phonemes) / audio_length
-
-            speaking_rates.append(speaking_rate)
-            phonemes_list.append(phonemes)
-
-        batch["speaking_rate"] = speaking_rates
-        batch["phonemes"] = phonemes_list
-    else:
-        phonemes = transducer(batch[text_column_name]).output_string
-
-        sample_rate = batch[audio_column_name]["sampling_rate"]
-        audio_length = len(batch[audio_column_name]["array"].squeeze()) / sample_rate
-
-        speaking_rate = len(phonemes) / audio_length
-
-        batch["speaking_rate"] = speaking_rate
-        batch["phonemes"] = phonemes
-
-    return batch

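`rate_apply` converts the transcript to IPA with `g2p` and divides the length of the phoneme string by the clip duration, so the resulting rate is in IPA characters per second rather than true phoneme tokens per second. A quick worked check on a synthetic one-second clip, assuming `g2p` is installed (values are illustrative):

    import numpy as np

    # One second of silence at 16 kHz stands in for real speech.
    batch = {
        "audio": {"array": np.zeros(16000), "sampling_rate": 16000},
        "text": "hello world",
    }
    batch = rate_apply(batch)

    # e.g. an 11-character IPA string over 1.0 s gives speaking_rate == 11.0
    print(batch["phonemes"], batch["speaking_rate"])
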
dataspeech/dataspeech/gpu_enrichments/__init__.py
DELETED
@@ -1,3 +0,0 @@
-from .pitch import pitch_apply
-from .snr_and_reverb import snr_apply
-from .squim import squim_apply

dataspeech/dataspeech/gpu_enrichments/pitch.py
DELETED
@@ -1,64 +0,0 @@
-import torch
-import penn
-
-
-# Here we'll use a 10 millisecond hopsize
-hopsize = .01
-
-# Provide a sensible frequency range given your domain and model
-fmin = 30.
-fmax = 1000.
-
-# Select a checkpoint to use for inference. Selecting None will
-# download and use FCNF0++ pretrained on MDB-stem-synth and PTDB
-checkpoint = None
-
-# Centers frames at hopsize / 2, 3 * hopsize / 2, 5 * hopsize / 2, ...
-center = 'half-hop'
-
-# (Optional) Linearly interpolate unvoiced regions below periodicity threshold
-interp_unvoiced_at = .065
-
-
-def pitch_apply(batch, rank=None, audio_column_name="audio", output_column_name="utterance_pitch", penn_batch_size=4096):
-    if isinstance(batch[audio_column_name], list):
-        utterance_pitch_mean = []
-        utterance_pitch_std = []
-        for sample in batch[audio_column_name]:
-            # Infer pitch and periodicity
-            pitch, periodicity = penn.from_audio(
-                torch.tensor(sample["array"][None, :]).float(),
-                sample["sampling_rate"],
-                hopsize=hopsize,
-                fmin=fmin,
-                fmax=fmax,
-                checkpoint=checkpoint,
-                batch_size=penn_batch_size,
-                center=center,
-                interp_unvoiced_at=interp_unvoiced_at,
-                gpu=(rank or 0) % torch.cuda.device_count() if rank else rank
-            )
-
-            utterance_pitch_mean.append(pitch.mean().cpu())
-            utterance_pitch_std.append(pitch.std().cpu())
-
-        batch[f"{output_column_name}_mean"] = utterance_pitch_mean
-        batch[f"{output_column_name}_std"] = utterance_pitch_std
-    else:
-        sample = batch[audio_column_name]
-        pitch, periodicity = penn.from_audio(
-            torch.tensor(sample["array"][None, :]).float(),
-            sample["sampling_rate"],
-            hopsize=hopsize,
-            fmin=fmin,
-            fmax=fmax,
-            checkpoint=checkpoint,
-            batch_size=penn_batch_size,
-            center=center,
-            interp_unvoiced_at=interp_unvoiced_at,
-            gpu=(rank or 0) % torch.cuda.device_count() if rank else rank
-        )
-        batch[f"{output_column_name}_mean"] = pitch.mean().cpu()
-        batch[f"{output_column_name}_std"] = pitch.std().cpu()
-
-    return batch

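The `rank` argument exists so that `pitch_apply` can be sharded across GPUs: `datasets.map` passes a process rank to the function when called with `with_rank=True`, and the function maps that rank onto a CUDA device index for `penn`. A sketch of that multi-process pattern (the dataset name is hypothetical):

    import torch
    from datasets import load_dataset

    dataset = load_dataset("some_user/some_speech_dataset", split="train")

    # One worker per GPU; datasets passes each worker's rank to pitch_apply,
    # which turns it into a CUDA device index via penn's gpu= argument.
    dataset = dataset.map(
        pitch_apply,
        batched=True,
        batch_size=16,
        with_rank=torch.cuda.device_count() > 0,
        num_proc=max(1, torch.cuda.device_count()),
    )
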
dataspeech/dataspeech/gpu_enrichments/snr_and_reverb.py
DELETED
@@ -1,47 +0,0 @@
-from pyannote.audio import Model
-from pathlib import Path
-from brouhaha.pipeline import RegressiveActivityDetectionPipeline
-import torch
-from huggingface_hub import hf_hub_download
-
-model = None
-
-def snr_apply(batch, rank=None, audio_column_name="audio"):
-    global model
-    if model is None:
-        model = Model.from_pretrained(
-            Path(hf_hub_download(repo_id="ylacombe/brouhaha-best", filename="best.ckpt")),
-            strict=False,
-        )
-    if rank is not None:
-        # move the model to the right GPU if not there already
-        device = f"cuda:{(rank or 0) % torch.cuda.device_count()}"
-        # move to device and create pipeline here because the pipeline moves to the first GPU it finds anyway
-        model.to(device)
-
-    pipeline = RegressiveActivityDetectionPipeline(segmentation=model)
-    if rank:
-        pipeline.to(torch.device(device))
-
-    device = pipeline._models["segmentation"].device
-
-    if isinstance(batch[audio_column_name], list):
-        snr = []
-        c50 = []
-        for sample in batch[audio_column_name]:
-            res = pipeline({"sample_rate": sample["sampling_rate"],
-                            "waveform": torch.tensor(sample["array"][None, :]).to(device).float()})
-
-            snr.append(res["snr"].mean())
-            c50.append(res["c50"].mean())
-
-        batch["snr"] = snr
-        batch["c50"] = c50
-    else:
-        res = pipeline({"sample_rate": batch[audio_column_name]["sampling_rate"],
-                        "waveform": torch.tensor(batch[audio_column_name]["array"][None, :]).to(device).float()})
-
-        batch["snr"] = res["snr"].mean()
-        batch["c50"] = res["c50"].mean()
-
-    return batch

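`snr_apply` lazily loads the Brouhaha checkpoint once per worker process (hence the `global model`) and then estimates signal-to-noise ratio and C50 speech clarity for each clip. A minimal single-sample sketch, assuming `pyannote.audio` and `brouhaha` are installed and the `ylacombe/brouhaha-best` checkpoint is reachable:

    import numpy as np

    # Two seconds of noise at 16 kHz stands in for a real utterance.
    sample = {"array": np.random.randn(32000), "sampling_rate": 16000}
    batch = snr_apply({"audio": sample})

    # Higher snr means cleaner speech; higher c50 means less reverberation.
    print(batch["snr"], batch["c50"])
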
dataspeech/dataspeech/gpu_enrichments/squim.py
DELETED
@@ -1,44 +0,0 @@
-from torchaudio.pipelines import SQUIM_OBJECTIVE
-import torch
-import torchaudio
-
-model = None
-
-def squim_apply(batch, rank=None, audio_column_name="audio"):
-    global model
-    if model is None:
-        model = SQUIM_OBJECTIVE.get_model()
-    if rank is not None:
-        # move the model to the right GPU if not there already
-        device = f"cuda:{(rank or 0) % torch.cuda.device_count()}"
-        # move to device and create pipeline here because the pipeline moves to the first GPU it finds anyway
-        model.to(device)
-    else:
-        device = "cpu"
-
-    if isinstance(batch[audio_column_name], list):
-        sdr = []
-        pesq = []
-        stoi = []
-        for sample in batch[audio_column_name]:
-            waveform = torchaudio.functional.resample(torch.tensor(sample["array"][None, :]).to(device).float(), sample["sampling_rate"], SQUIM_OBJECTIVE.sample_rate)
-            with torch.no_grad():
-                stoi_sample, pesq_sample, sdr_sample = model(waveform)
-            sdr.append(sdr_sample.cpu())
-            pesq.append(pesq_sample.cpu())
-            stoi.append(stoi_sample.cpu())
-
-        batch["sdr"] = sdr
-        batch["pesq"] = pesq
-        batch["stoi"] = stoi
-    else:
-        waveform = torchaudio.functional.resample(torch.tensor(batch[audio_column_name]["array"][None, :]).to(device).float(), batch[audio_column_name]["sampling_rate"], SQUIM_OBJECTIVE.sample_rate)
-        with torch.no_grad():
-            stoi_sample, pesq_sample, sdr_sample = model(waveform)
-        batch["sdr"] = sdr_sample
-        batch["pesq"] = pesq_sample
-        batch["stoi"] = stoi_sample
-    # TODO
-    return batch
-

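`squim_apply` wraps torchaudio's reference-free SQUIM objective model, which predicts STOI, PESQ, and SI-SDR estimates from the waveform alone; the resample call is needed because the model expects input at `SQUIM_OBJECTIVE.sample_rate` (16 kHz). A minimal sketch with synthetic audio:

    import numpy as np

    # One second at 24 kHz; squim_apply resamples it to 16 kHz internally.
    sample = {"array": np.random.randn(24000), "sampling_rate": 24000}
    batch = squim_apply({"audio": sample})

    # stoi is in [0, 1]; pesq is roughly 1 to 4.5; sdr is in dB.
    print(batch["stoi"], batch["pesq"], batch["sdr"])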