Ben Prystawski committed on
Commit
64425f4
·
1 Parent(s): f22f4f8

Added implementation

Browse files
Files changed (1) hide show
  1. app.py +106 -0
app.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ A Gradio app to transcribe and diarize a podcast using Whisper and pyannote. Adapted from Dwarkesh Patel's Colab notebook here:
3
+ https://colab.research.google.com/drive/1V-Bt5Hm2kjaDb4P1RyMSswsDKyrzc2-3?usp=sharing
4
+ """
5
+ import whisper
6
+ import datetime
7
+
8
+ import subprocess
9
+ import torch
10
+ import gradio as gr
11
+ import pyannote.audio
12
+ from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
13
+ from pyannote.audio import Audio
14
+ from pyannote.core import Segment
15
+ import wave
16
+ import contextlib
17
+
18
+ from sklearn.cluster import AgglomerativeClustering
19
+ import numpy as np
20
+
21
def _pick_device():
    """Return the best available torch device: MPS, then CUDA, then CPU."""
    if torch.backends.mps.is_available():
        return torch.device("mps")
    if torch.cuda.is_available():
        return torch.device("cuda")
    return torch.device("cpu")


# Speaker-embedding model, created once at import time and shared by every
# call to segment_embedding. The device is no longer hard-coded to "mps", so
# the module also loads on machines without Apple's Metal backend.
embedding_model = PretrainedSpeakerEmbedding(
    "speechbrain/spkrec-ecapa-voxceleb", device=_pick_device()
)

# pyannote audio reader; cluster_embeddings passes it to segment_embedding,
# which uses it to crop individual segments out of the audio file.
audio = Audio()
26
+
27
+
28
def time(secs):
    """Convert a duration in seconds to a ``timedelta`` of whole seconds.

    Args:
        secs: Duration in seconds (int or float).

    Returns:
        A ``datetime.timedelta`` built from ``round(secs)`` seconds.
    """
    whole_seconds = round(secs)
    return datetime.timedelta(seconds=whole_seconds)
30
+
31
+
32
def segment_embedding(segment, duration, audio, path):
    """Compute the speaker embedding for a single transcript segment.

    Args:
        segment: Mapping with "start" and "end" timestamps.
        duration: Total length of the audio file; used as an upper bound for
            the segment end.
        audio: Object whose ``crop(path, clip)`` returns a
            ``(waveform, sample_rate)`` pair.
        path: Path of the audio file to crop from.

    Returns:
        The result of calling the module-level ``embedding_model`` on the
        cropped waveform with one leading axis added.
    """
    # Whisper can report an end timestamp past the end of the file for the
    # last segment, so cap it at the real duration.
    clip_end = min(duration, segment["end"])
    window = Segment(segment["start"], clip_end)
    waveform, _ = audio.crop(path, window)
    batched = waveform[None]
    return embedding_model(batched)
39
+
40
+
41
def get_whisper_results(path, model_type):
    """Transcribe a WAV file with Whisper and read its length from the header.

    Args:
        path: Path to the audio file; it is also opened with ``wave``, so it
            must be a WAV file.
        model_type: Model name passed to ``whisper.load_model``.

    Returns:
        A 5-tuple ``(result, segments, frames, rate, duration)``: the full
        transcription result, its ``"segments"`` entry, the WAV frame count,
        the WAV frame rate, and ``frames / rate`` (the length in seconds).
    """
    transcription = whisper.load_model(model_type).transcribe(path)
    transcript_segments = transcription["segments"]

    # Frame count and frame rate come straight from the WAV header.
    with contextlib.closing(wave.open(path, "r")) as wav_file:
        frame_count = wav_file.getnframes()
        frame_rate = wav_file.getframerate()
        length_seconds = frame_count / float(frame_rate)

    return (
        transcription,
        transcript_segments,
        frame_count,
        frame_rate,
        length_seconds,
    )
52
+
53
+
54
def cluster_embeddings(segments, duration, path, num_speakers):
    """Label every segment with a speaker by clustering speaker embeddings.

    Each segment dict gains a ``"speaker"`` key of the form ``"SPEAKER <k>"``
    (1-based). The list is modified in place and nothing is returned.

    Args:
        segments: List of segment dicts with "start" and "end" keys.
        duration: Total audio length, forwarded to ``segment_embedding``.
        path: Path of the audio file the segments refer to.
        num_speakers: Requested number of speakers. Floats such as ``2.0``
            are accepted and truncated to an int. The effective cluster count
            is capped at the number of segments.

    Raises:
        ValueError: If ``num_speakers`` is less than 1.
    """
    speaker_count = int(num_speakers)
    if speaker_count < 1:
        raise ValueError("num_speakers must be at least 1")

    # Nothing to label.
    if not segments:
        return

    # Clustering cannot produce more clusters than there are samples.
    speaker_count = min(speaker_count, len(segments))

    # With a single cluster every segment gets the same label, so the
    # embedding and clustering work can be skipped. This also covers the
    # one-segment case, which the clustering step cannot handle.
    if speaker_count == 1:
        for segment in segments:
            segment["speaker"] = "SPEAKER 1"
        return

    # One row per segment; the row width comes from the embeddings themselves
    # rather than a hard-coded dimension.
    embeddings = np.vstack(
        [
            np.asarray(
                segment_embedding(segment, duration, audio, path),
                dtype=np.float64,
            ).reshape(1, -1)
            for segment in segments
        ]
    )
    # Replace NaN values before clustering.
    embeddings = np.nan_to_num(embeddings)

    print(f"num speakers: {num_speakers}")
    labels = AgglomerativeClustering(speaker_count).fit(embeddings).labels_
    for segment, label in zip(segments, labels):
        segment["speaker"] = "SPEAKER " + str(label + 1)
66
+
67
+
68
def _format_transcript(segments):
    """Render labelled segments as text, with a header at each speaker change."""
    pieces = []
    previous_speaker = None
    for index, segment in enumerate(segments):
        speaker = segment["speaker"]
        if index == 0 or speaker != previous_speaker:
            pieces.append(
                "\n" + speaker + " " + str(time(segment["start"])) + "\n"
            )
        previous_speaker = speaker

        text = segment["text"]
        # Drop a single leading space if there is one; never drop a real
        # character when the text has no leading space.
        if text.startswith(" "):
            text = text[1:]
        pieces.append(text + " ")
    return "".join(pieces)


def transcribe(path, model_type, num_speakers):
    """Transcribe an audio file and label each passage with its speaker.

    Args:
        path: Path of the uploaded audio file. An object with a ``name``
            attribute holding the path is also accepted
            (NOTE(review): some Gradio versions pass a temp-file object
            rather than a string - confirm against the installed version).
        model_type: Whisper model name.
        num_speakers: Number of speakers to separate.

    Returns:
        The transcript as one string: a "SPEAKER <k> <h:mm:ss>" header line
        whenever the speaker changes, followed by that speaker's text.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with an error while
            converting the input to WAV.
    """
    path = getattr(path, "name", path)

    if not path.lower().endswith(".wav"):
        # check=True stops here when conversion fails, instead of carrying on
        # with a missing or stale audio.wav left over from an earlier run.
        subprocess.run(["ffmpeg", "-y", "-i", path, "audio.wav"], check=True)
        path = "audio.wav"

    _, segments, _, _, duration = get_whisper_results(path, model_type)
    cluster_embeddings(segments, duration, path, num_speakers)

    return _format_transcript(segments)
83
+
84
+
85
if __name__ == "__main__":
    # Build the web UI. The three inputs map positionally onto
    # transcribe(path, model_type, num_speakers).
    interface = gr.Interface(
        fn=transcribe,
        inputs=[
            gr.File(file_count="single", label="Upload an audio file"),
            gr.Radio(
                choices=["tiny", "base", "small", "medium", "large-v3"],
                value="large-v3",
                type="value",
                label="Model size",
            ),
            gr.Number(
                value=2,
                # precision=0 makes the component deliver an int; the value
                # is used as a cluster count downstream.
                precision=0,
                label="Number of speakers",
            ),
        ],
        outputs=gr.Textbox(label="Transcript", show_copy_button=True),
        title="Transcribe a podcast!",
        description="Upload an audio file and choose a model size and number of speakers on the left, then click submit to transcribe!",
        theme=gr.themes.Soft(),
    )
    interface.launch()