import torch

import gradio as gr
from transformers import pipeline
from transformers import T5ForConditionalGeneration, T5Tokenizer

import re
import os
import json
import requests
import whisper
from yt_dlp import YoutubeDL

import matplotlib as plt

#whisper_model = whisper.load_model('small')

# Checkpoint used for both the T5 tokenizer and the T5 model below. The
# correction functions in this file prompt it to turn text that pronounces a
# formula into a LaTeX equation.
path = "Hyeonsieun/NTtoGT_7epoch"
tokenizer = T5Tokenizer.from_pretrained(path)
model = T5ForConditionalGeneration.from_pretrained(path)


# Speech-recognition checkpoint handed to the ASR pipeline.
MODEL_NAME = "openai/whisper-large-v2"
# Passed as `batch_size` on every call to `pipe`.
BATCH_SIZE = 8
#FILE_LIMIT_MB = 1000

# Shared automatic-speech-recognition pipeline, built once at import time.
# `chunk_length_s=30` asks the pipeline to process audio in 30-second chunks.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
)


def transcribe(inputs):
    """Transcribe an audio input with the shared ASR pipeline.

    Args:
        inputs: Audio handed straight to ``pipe``; callers in this file pass
            the file path produced by the Gradio audio component.

    Returns:
        The ``"text"`` field of the pipeline output.

    Raises:
        gr.Error: If ``inputs`` is ``None`` (nothing was uploaded).
    """
    if inputs is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

    asr_output = pipe(
        inputs,
        batch_size=BATCH_SIZE,
        generate_kwargs={"task": "transcribe"},
        return_timestamps=True,
    )
    return asr_output["text"]

def remove_spaces_within_dollar(text):
    """Delete every space character inside ``$...$`` spans of ``text``.

    The pattern is non-greedy, so each span runs from one ``$`` to the
    nearest following ``$`` on the same line. Text outside such spans, and
    an unpaired ``$``, are returned unchanged.

    Args:
        text: String that may contain dollar-delimited math.

    Returns:
        A copy of ``text`` with spaces removed only inside the matched spans.
    """
    def _squeeze(found):
        # group(0) is the whole span including both dollar signs.
        return found.group(0).replace(' ', '')

    return re.sub(r'\$(.*?)\$', _squeeze, text)


def audio_correction(file):
    """Transcribe an audio file and rewrite spoken formulas as LaTeX.

    The transcript is split into segments, each segment is passed through the
    T5 model with the formula-to-LaTeX prompt, and the corrected segments are
    joined with single spaces. Spaces inside ``$...$`` spans are removed from
    the final text.

    Args:
        file: Audio input forwarded to ``transcribe``.

    Returns:
        The corrected transcript as one string, without the pad /
        end-of-sequence markers the model wraps around each generated segment.

    Raises:
        gr.Error: Propagated from ``transcribe`` when ``file`` is ``None``.
    """
    ASR_result = transcribe(file)
    pad_token = tokenizer.pad_token
    eos_token = tokenizer.eos_token

    corrected_sentences = []
    for text in split_text_complex_rules_with_warning(ASR_result):
        if not text:
            # Nothing to correct; do not spend a generation on an empty prompt.
            continue
        input_text = f"translate the text pronouncing the formula to a LaTeX equation: {text}"
        inputs = tokenizer.encode(
            input_text,
            return_tensors='pt',
            max_length=325,
            padding='max_length',
            truncation=True
        )
        # Generate the corrected sentence ids with 5-beam search.
        corrected_ids = model.generate(
            inputs,
            max_length=325,
            num_beams=5,
            early_stopping=True
        )
        # NOTE(review): special tokens are kept while decoding, presumably
        # because the checkpoint may register LaTeX symbols as special
        # tokens -- confirm before switching to skip_special_tokens=True.
        corrected_sentence = tokenizer.decode(
            corrected_ids[0],
            skip_special_tokens=False
        )
        # Strip the wrapper tokens per sentence. Slicing the concatenated text
        # once left the markers between sentences in the output and cut real
        # characters whenever a generation ended without an end token.
        if pad_token and corrected_sentence.startswith(pad_token):
            corrected_sentence = corrected_sentence[len(pad_token):]
        if eos_token and corrected_sentence.endswith(eos_token):
            corrected_sentence = corrected_sentence[:-len(eos_token)]
        corrected_sentence = corrected_sentence.strip()
        if corrected_sentence:
            corrected_sentences.append(corrected_sentence)

    return remove_spaces_within_dollar(' '.join(corrected_sentences))

def youtubeASR(link):
    """Download the audio of a YouTube video and return its transcript.

    The audio is saved into a private temporary directory that is removed
    when the function returns. A fixed file name in the working directory
    would be shared between requests and could be left over from an earlier
    download.

    Args:
        link: URL of the video to transcribe.

    Returns:
        The transcript text produced by the ASR pipeline.

    Raises:
        gr.Error: If ``link`` is empty or only whitespace.
    """
    # Imported locally so this function does not depend on the file's
    # top-level import block.
    import tempfile

    if not link or not link.strip():
        raise gr.Error("No YouTube link submitted! Please paste a video URL before submitting your request.")

    with tempfile.TemporaryDirectory() as tmp_dir:
        # Temporary file that receives only the audio track.
        out_fn = os.path.join(tmp_dir, 'temp1.mp3')

        ydl_opts = {
            'format': 'bestaudio/best',  # download audio only
            'outtmpl': out_fn,           # save under the chosen file name
        }

        with YoutubeDL(ydl_opts) as ydl:
            ydl.download([link.strip()])

        # Transcribe while the temporary file still exists.
        result = pipe(
            out_fn,
            batch_size=BATCH_SIZE,
            generate_kwargs={"task": "transcribe"},
            return_timestamps=True,
        )

    # The pipeline output is a mapping; index it exactly once. The original
    # indexed the already-extracted string again and raised TypeError.
    return result["text"]

def split_text_complex_rules_with_warning(text, max_len=256):
    """Split ``text`` into non-empty segments of at most ``max_len`` characters.

    Splitting happens in three stages, each used only when the previous one
    leaves a piece that is still too long:

    1. sentence boundaries: whitespace that follows ``.``, ``?`` or ``!``;
    2. commas;
    3. word boundaries, hard-splitting any single word longer than
       ``max_len``.

    Over-long pieces are wrapped rather than dropped, so no transcript
    content is lost. Empty pieces are skipped.

    Args:
        text: The text to split.
        max_len: Maximum number of characters per returned segment.

    Returns:
        A list of stripped, non-empty segments in their original order.

    Raises:
        ValueError: If ``max_len`` is not positive.
    """
    # Imported locally so this function does not depend on the file's
    # top-level import block.
    import textwrap

    if max_len <= 0:
        raise ValueError("max_len must be a positive integer")

    segments = []
    # Split on sentence-ending punctuation (commas are handled separately).
    for sentence in re.split(r'(?<=[.?!])\s+', text):
        sentence = sentence.strip()
        if not sentence:
            continue
        if len(sentence) <= max_len:
            segments.append(sentence)
            continue
        # The sentence is too long: fall back to splitting at commas.
        for clause in re.split(r',\s*', sentence):
            clause = clause.strip()
            if not clause:
                continue
            if len(clause) <= max_len:
                segments.append(clause)
            else:
                # Still too long without any comma: wrap on word boundaries
                # instead of discarding the clause.
                segments.extend(
                    textwrap.wrap(
                        clause,
                        width=max_len,
                        break_long_words=True,
                        break_on_hyphens=False,
                    )
                )
    return segments


def youtube_correction(link):
    """Transcribe a YouTube video and rewrite spoken formulas as LaTeX.

    The transcript is split into segments, each segment is passed through the
    T5 model with the formula-to-LaTeX prompt, and the corrected segments are
    joined with single spaces. Spaces inside ``$...$`` spans are removed from
    the final text.

    Args:
        link: URL of the video, forwarded to ``youtubeASR``.

    Returns:
        The corrected transcript as one string, without the pad /
        end-of-sequence markers the model wraps around each generated segment.
    """
    ASR_result = youtubeASR(link)
    pad_token = tokenizer.pad_token
    eos_token = tokenizer.eos_token

    corrected_sentences = []
    for text in split_text_complex_rules_with_warning(ASR_result):
        if not text:
            # Nothing to correct; do not spend a generation on an empty prompt.
            continue
        input_text = f"translate the text pronouncing the formula to a LaTeX equation: {text}"
        inputs = tokenizer.encode(
            input_text,
            return_tensors='pt',
            max_length=325,
            padding='max_length',
            truncation=True
        )
        # Generate the corrected sentence ids with 5-beam search.
        corrected_ids = model.generate(
            inputs,
            max_length=325,
            num_beams=5,
            early_stopping=True
        )
        # NOTE(review): special tokens are kept while decoding, presumably
        # because the checkpoint may register LaTeX symbols as special
        # tokens -- confirm before switching to skip_special_tokens=True.
        corrected_sentence = tokenizer.decode(
            corrected_ids[0],
            skip_special_tokens=False
        )
        # Strip the wrapper tokens per sentence. Slicing the concatenated text
        # once left the markers between sentences in the output and cut real
        # characters whenever a generation ended without an end token.
        if pad_token and corrected_sentence.startswith(pad_token):
            corrected_sentence = corrected_sentence[len(pad_token):]
        if eos_token and corrected_sentence.endswith(eos_token):
            corrected_sentence = corrected_sentence[:-len(eos_token)]
        corrected_sentence = corrected_sentence.strip()
        if corrected_sentence:
            corrected_sentences.append(corrected_sentence)

    return remove_spaces_within_dollar(' '.join(corrected_sentences))


# Top-level container for the whole app.
demo = gr.Blocks()

# Tab 1: upload an audio file (delivered to the handler as a file path) and
# show the corrected transcript as text.
file_transcribe = gr.Interface(
    fn=audio_correction,
    inputs=gr.components.Audio(sources="upload", type="filepath"),
    outputs="text"
    )

# Tab 2: enter a YouTube link as text and show the corrected transcript as text.
yt_transcribe = gr.Interface(
    fn=youtube_correction,
    inputs="text",
    outputs="text"
    )

# Place both interfaces in the container as named tabs.
with demo:
    gr.TabbedInterface([file_transcribe, yt_transcribe], ["Audio file", "YouTube"])

# NOTE(review): runs at import time (there is no `if __name__ == "__main__"`
# guard), so importing this module also starts the app.
demo.launch()