ylacombe committed
Commit 0300002 (parent: 44aca89)

Delete dataspeech/dataspeech

dataspeech/dataspeech/__init__.py DELETED
@@ -1,2 +0,0 @@
- from .cpu_enrichments import rate_apply
- from .gpu_enrichments import pitch_apply, snr_apply, squim_apply
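These re-exports lift the enrichment functions to the package root so downstream scripts can import them directly. A sketch of the intended import pattern, assuming the repo root is on PYTHONPATH (the dataset name is illustrative):

from datasets import load_dataset

from dataspeech import pitch_apply, rate_apply, snr_apply, squim_apply

# Illustrative dataset; any dataset with "audio" and "text" columns works.
dataset = load_dataset("blabble-io/libritts_r", "clean", split="test.clean")

# Each enrichment adds its own columns; rate_apply adds "speaking_rate" and "phonemes".
dataset = dataset.map(rate_apply, batched=True, batch_size=16)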
 
dataspeech/dataspeech/cpu_enrichments/__init__.py DELETED
@@ -1,2 +0,0 @@
- from .rate import rate_apply
-
 
dataspeech/dataspeech/cpu_enrichments/rate.py DELETED
@@ -1,34 +0,0 @@
- from g2p import make_g2p
-
- transducer = make_g2p('eng', 'eng-ipa')
-
- def rate_apply(batch, rank=None, audio_column_name="audio", text_column_name="text"):
-     if isinstance(batch[audio_column_name], list):
-         speaking_rates = []
-         phonemes_list = []
-         for text, audio in zip(batch[text_column_name], batch[audio_column_name]):
-             phonemes = transducer(text).output_string
-
-             sample_rate = audio["sampling_rate"]
-             audio_length = len(audio["array"].squeeze()) / sample_rate
-
-             speaking_rate = len(phonemes) / audio_length
-
-
-             speaking_rates.append(speaking_rate)
-             phonemes_list.append(phonemes)
-
-         batch["speaking_rate"] = speaking_rates
-         batch["phonemes"] = phonemes_list
-     else:
-         phonemes = transducer(batch[text_column_name]).output_string
-
-         sample_rate = batch[audio_column_name]["sampling_rate"]
-         audio_length = len(batch[audio_column_name]["array"].squeeze()) / sample_rate
-
-         speaking_rate = len(phonemes) / audio_length
-
-         batch["speaking_rate"] = speaking_rate
-         batch["phonemes"] = phonemes
-
-     return batch
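rate_apply computes speaking rate as phonemes per second: the text is phonemized with g2p's eng-to-eng-ipa transducer, and the phoneme count is divided by the audio duration in seconds. A minimal sketch of calling it directly on a single (non-batched) example; the audio and text are made up for illustration, and g2p must be installed for the module-level transducer to load:

import numpy as np

from dataspeech import rate_apply

# One second of silence at 16 kHz stands in for real speech.
example = {
    "text": "hello world",
    "audio": {"array": np.zeros(16000), "sampling_rate": 16000},
}

example = rate_apply(example)
# speaking_rate = len(phonemes) / duration in seconds
print(example["speaking_rate"], example["phonemes"])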
 
dataspeech/dataspeech/gpu_enrichments/__init__.py DELETED
@@ -1,3 +0,0 @@
- from .pitch import pitch_apply
- from .snr_and_reverb import snr_apply
- from .squim import squim_apply
 
dataspeech/dataspeech/gpu_enrichments/pitch.py DELETED
@@ -1,64 +0,0 @@
- import torch
- import penn
-
-
- # Here we'll use a 10 millisecond hopsize
- hopsize = .01
-
- # Provide a sensible frequency range given your domain and model
- fmin = 30.
- fmax = 1000.
-
- # Select a checkpoint to use for inference. Selecting None will
- # download and use FCNF0++ pretrained on MDB-stem-synth and PTDB
- checkpoint = None
-
- # Centers frames at hopsize / 2, 3 * hopsize / 2, 5 * hopsize / 2, ...
- center = 'half-hop'
-
- # (Optional) Linearly interpolate unvoiced regions below periodicity threshold
- interp_unvoiced_at = .065
-
-
- def pitch_apply(batch, rank=None, audio_column_name="audio", output_column_name="utterance_pitch", penn_batch_size=4096):
-     if isinstance(batch[audio_column_name], list):
-         utterance_pitch_mean = []
-         utterance_pitch_std = []
-         for sample in batch[audio_column_name]:
-             # Infer pitch and periodicity
-             pitch, periodicity = penn.from_audio(
-                 torch.tensor(sample["array"][None, :]).float(),
-                 sample["sampling_rate"],
-                 hopsize=hopsize,
-                 fmin=fmin,
-                 fmax=fmax,
-                 checkpoint=checkpoint,
-                 batch_size=penn_batch_size,
-                 center=center,
-                 interp_unvoiced_at=interp_unvoiced_at,
-                 gpu=(rank or 0) % torch.cuda.device_count() if rank else rank
-             )
-
-             utterance_pitch_mean.append(pitch.mean().cpu())
-             utterance_pitch_std.append(pitch.std().cpu())
-
-         batch[f"{output_column_name}_mean"] = utterance_pitch_mean
-         batch[f"{output_column_name}_std"] = utterance_pitch_std
-     else:
-         sample = batch[audio_column_name]
-         pitch, periodicity = penn.from_audio(
-             torch.tensor(sample["array"][None, :]).float(),
-             sample["sampling_rate"],
-             hopsize=hopsize,
-             fmin=fmin,
-             fmax=fmax,
-             checkpoint=checkpoint,
-             batch_size=penn_batch_size,
-             center=center,
-             interp_unvoiced_at=interp_unvoiced_at,
-             gpu=(rank or 0) % torch.cuda.device_count() if rank else rank
-         )
-         batch[f"{output_column_name}_mean"] = pitch.mean().cpu()
-         batch[f"{output_column_name}_std"] = pitch.std().cpu()
-
-     return batch
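pitch_apply wraps penn.from_audio with a 10 ms hopsize and a 30-1000 Hz search range, keeping only the utterance-level mean and standard deviation of the pitch track; when rank is unset, gpu=None and inference runs on CPU. A minimal single-example sketch with a synthetic tone (penn should report a mean pitch near the tone's frequency):

import numpy as np

from dataspeech import pitch_apply

# One second of a 440 Hz sine wave at 16 kHz.
sr = 16000
t = np.arange(sr) / sr
example = {"audio": {"array": np.sin(2 * np.pi * 440.0 * t), "sampling_rate": sr}}

example = pitch_apply(example)  # rank=None -> gpu=None -> CPU inference
print(example["utterance_pitch_mean"], example["utterance_pitch_std"])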
 
dataspeech/dataspeech/gpu_enrichments/snr_and_reverb.py DELETED
@@ -1,47 +0,0 @@
- from pyannote.audio import Model
- from pathlib import Path
- from brouhaha.pipeline import RegressiveActivityDetectionPipeline
- import torch
- from huggingface_hub import hf_hub_download
-
- model = None
-
- def snr_apply(batch, rank=None, audio_column_name="audio"):
-     global model
-     if model is None:
-         model = Model.from_pretrained(
-             Path(hf_hub_download(repo_id="ylacombe/brouhaha-best", filename="best.ckpt")),
-             strict=False,
-         )
-     if rank is not None:
-         # move the model to the right GPU if not there already
-         device = f"cuda:{(rank or 0) % torch.cuda.device_count()}"
-         # move to device and create pipeline here because the pipeline moves to the first GPU it finds anyway
-         model.to(device)
-
-     pipeline = RegressiveActivityDetectionPipeline(segmentation=model)
-     if rank:
-         pipeline.to(torch.device(device))
-
-     device = pipeline._models["segmentation"].device
-
-     if isinstance(batch[audio_column_name], list):
-         snr = []
-         c50 = []
-         for sample in batch[audio_column_name]:
-             res = pipeline({"sample_rate": sample["sampling_rate"],
-                             "waveform": torch.tensor(sample["array"][None, :]).to(device).float()})
-
-             snr.append(res["snr"].mean())
-             c50.append(res["c50"].mean())
-
-         batch["snr"] = snr
-         batch["c50"] = c50
-     else:
-         res = pipeline({"sample_rate": batch[audio_column_name]["sampling_rate"],
-                         "waveform": torch.tensor(batch[audio_column_name]["array"][None, :]).to(device).float()})
-
-         batch["snr"] = res["snr"].mean()
-         batch["c50"] = res["c50"].mean()
-
-     return batch
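snr_apply lazily loads the Brouhaha checkpoint into a module-level global, so each worker process initialises the model once, and the rank argument pins it to a GPU. That matches the with_rank/num_proc pattern of datasets.Dataset.map; a sketch of how it might be dispatched (the dataset name and batch size are illustrative):

import torch
from datasets import load_dataset

from dataspeech import snr_apply

dataset = load_dataset("blabble-io/libritts_r", "clean", split="test.clean")

# with_rank=True passes a worker rank into snr_apply so each process
# moves the Brouhaha model to a different GPU; adds "snr" and "c50" columns.
dataset = dataset.map(
    snr_apply,
    batched=True,
    batch_size=16,
    with_rank=torch.cuda.is_available(),
    num_proc=torch.cuda.device_count() or 1,
)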
 
dataspeech/dataspeech/gpu_enrichments/squim.py DELETED
@@ -1,44 +0,0 @@
- from torchaudio.pipelines import SQUIM_OBJECTIVE
- import torch
- import torchaudio
-
- model = None
-
- def squim_apply(batch, rank=None, audio_column_name="audio"):
-     global model
-     if model is None:
-         model = SQUIM_OBJECTIVE.get_model()
-     if rank is not None:
-         # move the model to the right GPU if not there already
-         device = f"cuda:{(rank or 0) % torch.cuda.device_count()}"
-         # move to device and create pipeline here because the pipeline moves to the first GPU it finds anyway
-         model.to(device)
-     else:
-         device = "cpu"
-
-     if isinstance(batch[audio_column_name], list):
-         sdr = []
-         pesq = []
-         stoi = []
-         for sample in batch[audio_column_name]:
-             waveform = torchaudio.functional.resample(torch.tensor(sample["array"][None, :]).to(device).float(), sample["sampling_rate"], SQUIM_OBJECTIVE.sample_rate)
-             with torch.no_grad():
-                 stoi_sample, pesq_sample, sdr_sample = model(waveform)
-             sdr.append(sdr_sample.cpu())
-             pesq.append(pesq_sample.cpu())
-             stoi.append(stoi_sample.cpu())
-
-         batch["sdr"] = sdr
-         batch["pesq"] = pesq
-         batch["stoi"] = stoi
-     else:
-
-         waveform = torchaudio.functional.resample(torch.tensor(batch[audio_column_name]["array"][None, :]).to(device).float(), batch[audio_column_name]["sampling_rate"], SQUIM_OBJECTIVE.sample_rate)
-         with torch.no_grad():
-             stoi_sample, pesq_sample, sdr_sample = model(waveform)
-         batch["sdr"] = sdr_sample
-         batch["pesq"] = pesq_sample
-         batch["stoi"] = stoi_sample
-     # TODO
-     return batch
-
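squim_apply resamples each waveform to SQUIM_OBJECTIVE.sample_rate (16 kHz) before scoring and returns reference-free estimates of SDR, PESQ, and STOI from torchaudio's objective SQUIM model. A minimal single-example sketch; white noise stands in for speech here, so the scores are only illustrative:

import numpy as np

from dataspeech import squim_apply

# One second of white noise at 16 kHz in place of real speech.
example = {"audio": {"array": np.random.randn(16000), "sampling_rate": 16000}}

example = squim_apply(example)  # rank=None -> device = "cpu"
print(example["sdr"], example["pesq"], example["stoi"])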