File size: 4,301 Bytes
1c827e2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168d2de
ffe25b7
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import gradio as gr
from huggingsound import SpeechRecognitionModel
from transformers import logging
from transformers import pipeline
from transformers import BertTokenizer, BertModel
from pydub import AudioSegment
# BERT helpers, all built from the 'bert-base-uncased' checkpoint at import time.
# Within this file view, `unmasker` and `tokenizer` are referenced only by the
# commented-out correction code inside `everything`, and `model` is not
# referenced at all.
# NOTE(review): presumably kept for the disabled post-correction step; confirm
# nothing else imports these names before removing them.
unmasker = pipeline('fill-mask', model='bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")
import os


def levenshtein_distance(s, t):
    """Return the Levenshtein (edit) distance between two sequences.

    The distance is the minimum number of single-item insertions, deletions
    and substitutions needed to turn ``s`` into ``t``.

    Args:
        s: Source sequence (anything supporting ``len`` and iteration in
            index order, e.g. ``str`` or ``list``).
        t: Target sequence of the same kind.

    Returns:
        The edit distance as a non-negative ``int``.
    """
    height, width = len(s) + 1, len(t) + 1

    # table[i][j] holds the distance between s[:i] and t[:j].
    table = [[0] * width for _ in range(height)]

    # Base case: building t[:j] from an empty prefix takes j insertions.
    table[0] = list(range(width))

    for i, s_item in enumerate(s, start=1):
        # Base case: reducing s[:i] to an empty prefix takes i deletions.
        table[i][0] = i
        for j, t_item in enumerate(t, start=1):
            if s_item == t_item:
                # Matching items cost nothing extra.
                table[i][j] = table[i - 1][j - 1]
                continue
            deletion = table[i - 1][j]
            insertion = table[i][j - 1]
            substitution = table[i - 1][j - 1]
            table[i][j] = 1 + min(deletion, insertion, substitution)

    return table[-1][-1]

def collate(input):
    """Join word and punctuation tokens into a single readable string.

    Rules applied while walking the tokens in order:

    * Punctuation tokens are appended without a preceding space.
    * Word tokens are separated from what came before by one space, except
      for the very first piece of output and for a word that directly
      follows ``-`` or ``'`` (so ``well - known`` becomes ``well-known``).
    * The first word, and the first word after ``.``, ``?``, ``!`` or ``(``,
      is passed through ``str.capitalize``.

    Args:
        input: Iterable of string tokens. (The name shadows the builtin but
            is kept because it is part of the function's signature.)

    Returns:
        The assembled text; an empty string for an empty token sequence.
    """
    # "\u2014" is the em dash, written as an escape so the entry cannot be
    # corrupted by a file-encoding round trip.
    punctuation = (",", ".", "?", "!", ";", ":", "-", "\u2014", "(", ")",
                   "[", "]", "{", "}", "'", "\"", "`")
    capital_triggers = (".", "?", "!", "(")
    joiners = ("-", "'")

    pieces = []
    capitalize_next = True
    glue_next = False
    for token in input:
        if token in punctuation:
            pieces.append(token)
            if token in capital_triggers:
                capitalize_next = True
            glue_next = token in joiners
            continue

        # No separator at the very start of the output or right after a joiner.
        if pieces and not glue_next:
            pieces.append(" ")
        if capitalize_next:
            pieces.append(token.capitalize())
            capitalize_next = False
        else:
            pieces.append(token)
        # A joiner only attaches the word immediately after it.
        glue_next = False

    return "".join(pieces)

# Lazily created wav2vec2 recognizer shared by all calls to `everything`.
_W2V_MODEL = None


def _get_w2v_model():
    """Return the shared speech recognizer, constructing it on first use."""
    global _W2V_MODEL
    if _W2V_MODEL is None:
        _W2V_MODEL = SpeechRecognitionModel("jonatasgrosman/wav2vec2-large-xlsr-53-english")
    return _W2V_MODEL


def _as_path_list(audio_paths):
    """Normalize one upload, or a collection of uploads, to a list of paths."""
    is_single = isinstance(audio_paths, (str, os.PathLike)) or hasattr(audio_paths, "name")
    items = [audio_paths] if is_single else audio_paths
    paths = []
    for item in items:
        if isinstance(item, (str, os.PathLike)):
            paths.append(os.fspath(item))
        else:
            # Uploaded-file wrappers (e.g. tempfile objects) expose their
            # on-disk location as `.name`; anything else is passed through.
            paths.append(getattr(item, "name", item))
    return paths


def everything(audio_paths):
    """Transcribe the given audio file(s) with the wav2vec2 English model.

    Args:
        audio_paths: A single path, a single uploaded-file object exposing
            ``.name``, or an iterable of either. ``None`` means there is
            nothing to transcribe.

    Returns:
        The value returned by ``SpeechRecognitionModel.transcribe`` for the
        normalized list of paths, or ``[]`` when ``audio_paths`` is ``None``.
        NOTE(review): the disabled code that follows this function reads
        ``transcriptions[0]["transcription"]``, so the result is presumably
        one dict per input file; confirm against the huggingsound docs.
    """
    if audio_paths is None:
        return []
    paths = _as_path_list(audio_paths)

    # Set before the model is first loaded so load-time warnings are covered too.
    logging.set_verbosity_error() #change'error' to 'warning' or remove this if you want to see the warning
    # https://huggingface.co/jonatasgrosman/wav2vec2-large-xlsr-53-english
    # https://huggingface.co/bert-base-uncased

    # The recognizer is cached: constructing it on every request is expensive.
    transcriptions = _get_w2v_model().transcribe(paths)

    return transcriptions
    # input = transcriptions[0]["transcription"]
    # input = input.split()

    #     #(1) is a strategy where tokens are used to determine lexicographic distance
    #     #(2) is a strategy where replaced words 
    # for t in range(1):
    #     # output = [] #(2)
    #     for i in range(len(input)):
    #         temp = input[i]
    #         token = tokenizer(temp)['input_ids'][1]
    #         input[i] = "[MASK]"
    # apiint = unmasker(' '.join(input))
    # dist = []
    # for r in range(5):
    #     # if (np.abs((apiint[r]['token'] - token)) < 2): #(1)
    #     dist.append(levenshtein_distance(temp, apiint[r]['token_str']))
    # lindex = 0
    # l = dist[0]
    # for r in range(5):
    #     if dist[r] < l:
    #         lindex = r

    #         l = dist[r]
    # if l <= 2:
    #     input[i] = apiint[lindex]['token_str']
    #     # output.append(apiint[lindex]['token_str']) #(2)
    # else:
    #     input[i] = temp
    #     # output.append(temp) #(2)
    # # input[i] = temp #(2)

    # for t in range(1):
    #     inndex = 1
    #     for i in range(len(input)):
    #         input.insert(inndex, "[MASK]")
    #         # print(' '.join(input))
    #         apiint = unmasker(' '.join(input))
    #         if (apiint[0]['token'] < 1500):
    #             input[inndex] = apiint[0]["token_str"]
    #             inndex += 2
    #         else:
    #             del input[inndex]
    #             inndex += 1

    # st.write(collate(input))

    # # In comparison, a plain autocorrect gives this output:

    # # "The b-movie by Jerry Sinclair, the sound of buzzing 
    # # bees, can be heard according to all known laws of 
    # # aviation that is no way for b to be able to fly its 
    # # wings are too small to get its start little body off 
    # # the ground, the be, of course, flies anyway because ``
    # # bees don't care what humans think is possible. 
    # # Barbuda is guaranteed one member of the House of 
    # # Representatives and two members of the Senate."

    # # - https://huggingface.co/oliverguhr/spelling-correction-english-base?text=lets+do+a+comparsion

# Web UI: a single upload control feeds `everything`, and whatever it returns
# is rendered in a text output.
# Interface inputs are documented as component instances or string shortcuts,
# so the upload button is instantiated rather than passed as a bare class.
demo = gr.Interface(fn=everything,
                    inputs=[gr.UploadButton()],
                    outputs=["text"])