Spaces:

Hrsh-Venket
/

Corrected-Speech-to-Text

Runtime error

App Files Files Community

Corrected-Speech-to-Text / app.py

Hrsh-Venket

modified: app.py

ffe25b7 almost 2 years ago

raw

history blame contribute delete

4.3 kB

	import gradio as gr
	from huggingsound import SpeechRecognitionModel
	from transformers import logging
	from transformers import pipeline
	from transformers import BertTokenizer, BertModel
	from pydub import AudioSegment
	# The three objects below are created at import time, so importing this
	# module triggers loading of the 'bert-base-uncased' weights.
	# Fill-mask pipeline; in the visible file it is only referenced by the
	# commented-out correction pass that follows the return in everything().
	unmasker = pipeline('fill-mask', model='bert-base-uncased')
	# Tokenizer; likewise only referenced by the commented-out correction pass.
	tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
	# NOTE(review): `model` is not referenced anywhere in the visible part of
	# this file -- confirm whether this load is still needed.
	model = BertModel.from_pretrained("bert-base-uncased")
	import os


	def levenshtein_distance(s, t):
	    """Return the Levenshtein (edit) distance between sequences ``s`` and ``t``.

	    The distance is the minimum number of single-element insertions,
	    deletions and substitutions needed to turn ``s`` into ``t``.

	    Args:
	        s: First sequence (for example a string).
	        t: Second sequence.

	    Returns:
	        The edit distance as a non-negative integer.
	    """
	    m, n = len(s), len(t)

	    # table[i][j] holds the distance between s[:i] and t[:j].
	    # Column 0 is seeded with i: deleting i elements yields the empty prefix.
	    table = [[i] + [0] * n for i in range(m + 1)]
	    # Row 0 is 0..n: inserting j elements builds t[:j] from nothing.
	    table[0] = list(range(n + 1))

	    for j, t_item in enumerate(t, start=1):
	        for i, s_item in enumerate(s, start=1):
	            diagonal = table[i - 1][j - 1]
	            if s_item == t_item:
	                # Matching elements cost nothing extra.
	                table[i][j] = diagonal
	            else:
	                # Cheapest of deletion, insertion and substitution.
	                table[i][j] = 1 + min(table[i - 1][j], table[i][j - 1], diagonal)

	    return table[m][n]

	def collate(input):
	    """Join a list of word and punctuation tokens into one display string.

	    Rules applied while joining:

	    * Punctuation tokens are appended directly, with no space before them.
	    * Word tokens are separated from what precedes them by a single space,
	      except at the very start of the output and directly after a ``-`` or
	      ``'`` token (so ``["don", "'", "t"]`` becomes ``"Don't"``).
	    * The first word, and the first word after a ``.`` or ``(`` token, is
	      passed through ``str.capitalize()``.

	    Args:
	        input: Iterable of string tokens. (The name shadows the builtin but
	            is kept so keyword callers keep working.)

	    Returns:
	        The collated string; ``""`` for an empty token list.
	    """
	    punctuation = frozenset(
	        [",", ".", "?", "!", ";", ":", "-", "—", "(", ")", "[", "]", "{", "}", "'", "\"", "`"]
	    )
	    pieces = []
	    capitalize_next = True
	    glue_next = False  # True right after "-" or "'": next word gets no space

	    for token in input:
	        if token in punctuation:
	            pieces.append(token)
	            if token in (".", "("):
	                capitalize_next = True
	            glue_next = token in ("-", "'")
	            continue

	        # No separator at the very start of the output or after "-" / "'".
	        if pieces and not glue_next:
	            pieces.append(" ")
	        pieces.append(token.capitalize() if capitalize_next else token)
	        capitalize_next = False
	        # The glue applies to one word only; without this reset every later
	        # word would also be concatenated without a space.
	        glue_next = False

	    return "".join(pieces)

	def everything(audio_paths):
	    """Transcribe audio file(s) with the wav2vec2 English speech model.

	    Args:
	        audio_paths: A list of audio file paths. A single path given as
	            ``str`` or ``os.PathLike`` is wrapped in a one-element list so
	            it is not iterated character by character.

	    Returns:
	        The value returned by ``SpeechRecognitionModel.transcribe`` for the
	        given paths, unchanged. (The commented-out code after this function
	        indexes it as ``transcriptions[0]["transcription"]``.)
	    """
	    # Model cards:
	    # https://huggingface.co/jonatasgrosman/wav2vec2-large-xlsr-53-english
	    # https://huggingface.co/bert-base-uncased

	    # Build the speech model once and keep it on the function object;
	    # constructing it on every request repeats the model load each time.
	    w2vmodel = getattr(everything, "_w2vmodel", None)
	    if w2vmodel is None:
	        w2vmodel = SpeechRecognitionModel("jonatasgrosman/wav2vec2-large-xlsr-53-english")
	        everything._w2vmodel = w2vmodel
	    logging.set_verbosity_error() #change'error' to 'warning' or remove this if you want to see the warning

	    if isinstance(audio_paths, (str, os.PathLike)):
	        audio_paths = [audio_paths]

	    transcriptions = w2vmodel.transcribe(audio_paths)

	    return transcriptions
	# input = transcriptions[0]["transcription"]
	# input = input.split()

	# #(1) is a strategy where tokens are used to determine lexicographic distance
	# #(2) is a strategy where replaced words
	# for t in range(1):
	# # output = [] #(2)
	# for i in range(len(input)):
	# temp = input[i]
	# token = tokenizer(temp)['input_ids'][1]
	# input[i] = "[MASK]"
	# apiint = unmasker(' '.join(input))
	# dist = []
	# for r in range(5):
	# # if (np.abs((apiint[r]['token'] - token)) < 2): #(1)
	# dist.append(levenshtein_distance(temp, apiint[r]['token_str']))
	# lindex = 0
	# l = dist[0]
	# for r in range(5):
	# if dist[r] < l:
	# lindex = r

	# l = dist[r]
	# if l <= 2:
	# input[i] = apiint[lindex]['token_str']
	# # output.append(apiint[lindex]['token_str']) #(2)
	# else:
	# input[i] = temp
	# # output.append(temp) #(2)
	# # input[i] = temp #(2)

	# for t in range(1):
	# inndex = 1
	# for i in range(len(input)):
	# input.insert(inndex, "[MASK]")
	# # print(' '.join(input))
	# apiint = unmasker(' '.join(input))
	# if (apiint[0]['token'] < 1500):
	# input[inndex] = apiint[0]["token_str"]
	# inndex += 2
	# else:
	# del input[inndex]
	# inndex += 1

	# st.write(collate(input))

	# # In comparison, a plain autocorrect gives this output:

	# # "The b-movie by Jerry Sinclair, the sound of buzzing
	# # bees, can be heard according to all known laws of
	# # aviation that is no way for b to be able to fly its
	# # wings are too small to get its start little body off
	# # the ground, the be, of course, flies anyway because ``
	# # bees don't care what humans think is possible.
	# # Barbuda is guaranteed one member of the House of
	# # Representatives and two members of the Senate."

	# # - https://huggingface.co/oliverguhr/spelling-correction-english-base?text=lets+do+a+comparsion

	# Gradio UI: a file-upload input wired to everything(), with the result
	# shown in a text output.
	demo = gr.Interface(
	    fn=everything,
	    # gr.Interface documents `inputs` as component instances or string
	    # shortcuts, so the component is instantiated rather than passed as
	    # the bare class.
	    # NOTE(review): what UploadButton hands to `fn` (path vs. file object,
	    # single vs. list) depends on the gradio version and its
	    # `type` / `file_count` settings -- confirm against the pinned version.
	    inputs=[gr.UploadButton()],
	    outputs=["text"],
	)