import torch
from transformers import BitsAndBytesConfig, pipeline
import whisper
import gradio as gr
from gtts import gTTS
from PIL import Image
import re
import os
import datetime
import locale
import numpy as np
import nltk

nltk.download('punkt')
from nltk import sent_tokenize
import torch
from transformers import BitsAndBytesConfig, pipeline
import time
import warnings
from PIL import Image
import requests
import numpy as np
import cv2  # For videoprocessing
import moviepy.editor as mp


# LLaVA 1.5 (7B) checkpoint, exposed through the image-to-text pipeline.
model_id = "llava-hf/llava-1.5-7b-hf"
pipe = pipeline(task="image-to-text", model=model_id)

# Whisper large-v3 speech recognizer, also wrapped in a transformers pipeline.
pipe_audio = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-large-v3",
)

# Module-wide conversation log; the request handlers append to it.
conversation_history = []

def writehistory(text):
    """Append ``text`` and a newline to a log file named after the current time.

    The file is ``<YYYYmmdd_HHMMSS>_log.txt`` in the current working
    directory, so calls made in different seconds go to different files.
    """
    stamp = f"{datetime.datetime.now():%Y%m%d_%H%M%S}"
    with open(stamp + "_log.txt", mode="a", encoding="utf-8") as log:
        log.write(text + "\n")

def _extract_reply(generated_text):
    """Return the text following ``ASSISTANT:`` in the model output."""
    # DOTALL keeps multi-line answers intact; without it "." stops at the
    # first newline and the remainder of the reply is silently dropped.
    match = re.search(r'ASSISTANT:\s*(.*)', generated_text, re.DOTALL)
    return match.group(1).strip() if match else "No response found."


def img2txt(input_text, input_image):
    """Answer a question about an image with the LLaVA pipeline.

    Args:
        input_text: The user's question. If a tuple is given its first
            element is used; ``None`` is treated as an empty question.
        input_image: Image location handed to ``Image.open``.

    Returns:
        The assistant's reply as a string. Errors are not raised: the
        exception message is returned instead, so the caller always gets
        text it can display.
    """
    try:
        # The context manager closes the image file once inference is done.
        with Image.open(input_image) as image:
            if isinstance(input_text, tuple):
                input_text = input_text[0]  # Take the first element if it's a tuple
            if input_text is None:
                input_text = ""

            writehistory(f"Input text: {input_text}")
            prompt = "USER: <image>\n" + input_text + "\nASSISTANT:"

            # The module-level LLaVA pipeline is bound to ``pipe``.
            outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": 200})

            if not outputs or not outputs[0]["generated_text"]:
                return "No response generated."

            # The conversation history is recorded by chatbot_interface;
            # appending here as well would store every exchange twice.
            return _extract_reply(outputs[0]["generated_text"])
    except Exception as e:
        return str(e)

def _save_video_frame(video_path, image_path):
    """Write one frame of ``video_path`` to ``image_path`` and close the clip."""
    video = mp.VideoFileClip(video_path)
    try:
        duration = video.duration
        # Sample at the 1-second mark; clips too short to have one are
        # sampled at their midpoint instead of reading past the end.
        timestamp = 1 if duration is None or duration > 1 else duration / 2
        video.save_frame(image_path, t=timestamp)
    finally:
        # Releases the underlying reader process and file handles.
        video.close()


def vid2txt(input_text, input_video):
    """Answer a question about a video by analysing a single frame.

    One frame is written to a temporary JPEG, passed to ``img2txt`` together
    with ``input_text``, and the temporary file is removed afterwards.

    Args:
        input_text: The user's question, forwarded to ``img2txt``.
        input_video: Path of the video file to sample.

    Returns:
        The reply from ``img2txt``, or the exception message as a string if
        the frame could not be extracted.
    """
    import tempfile  # local import: only this function needs it

    try:
        # A unique file per call avoids clashes between overlapping requests.
        handle, image_path = tempfile.mkstemp(suffix=".jpg")
        os.close(handle)
        try:
            _save_video_frame(input_video, image_path)
            return img2txt(input_text, image_path)
        finally:
            try:
                os.remove(image_path)
            except OSError:
                # Best-effort cleanup: the file may still be held open
                # (e.g. on Windows); a leftover temp file is harmless.
                pass
    except Exception as e:
        return str(e)

def transcribe(audio_path):
    """Run the Whisper pipeline on ``audio_path`` and return the recognized text.

    A falsy ``audio_path`` (``None`` or an empty string) yields an empty
    string without invoking the model.
    """
    if audio_path:
        return pipe_audio(audio_path)["text"]
    return ''

def text_to_speech(text, file_path):
    """Synthesize English speech for ``text`` with gTTS and save it to ``file_path``.

    Returns ``file_path`` unchanged so the result can be passed straight on.
    """
    gTTS(text=text, lang='en', slow=False).save(file_path)
    return file_path

def chatbot_interface(audio_path, image_path, video_path, user_message):
    """Handle one request from the Gradio UI and build the reply.

    Args:
        audio_path: Path of a recorded audio message, or a falsy value.
        image_path: Path of an uploaded image, or a falsy value.
        video_path: Path of an uploaded video, or a falsy value; only used
            when no image is given.
        user_message: Typed message; takes precedence over the audio.

    Returns:
        A ``(conversation_history, audio_reply_path)`` tuple. The history is
        the module-level list of ``(user message, assistant reply)`` pairs;
        ``audio_reply_path`` is ``None`` when there is no text to speak.
    """
    # A typed message wins over audio, so speech recognition only runs when
    # it is actually needed.
    if user_message:
        input_message = user_message
    elif audio_path:
        input_message = transcribe(audio_path)
    else:
        input_message = ""

    # Ensure input_message is a string
    if isinstance(input_message, tuple):
        input_message = input_message[0]

    # Handle image or video input
    if image_path:
        chatgpt_output = img2txt(input_message, image_path)
    elif video_path:
        chatgpt_output = vid2txt(input_message, video_path)
    else:
        chatgpt_output = "No image or video provided."

    # gr.Chatbot shows each history entry as one (user message, bot reply)
    # pair, so the exchange is stored as a single pair rather than as
    # separate ("User", ...) / ("Assistant", ...) rows.
    conversation_history.append((input_message, chatgpt_output))

    # gTTS rejects empty text; skip the audio instead of failing the request.
    if chatgpt_output and chatgpt_output.strip():
        processed_audio_path = text_to_speech(chatgpt_output, "Temp3.mp3")
    else:
        processed_audio_path = None

    return conversation_history, processed_audio_path

# Input widgets, listed in the order chatbot_interface takes its arguments.
input_components = [
    gr.Audio(type="filepath", label="Record your message"),
    gr.Image(type="filepath", label="Upload an image"),
    gr.Video(label="Upload a video"),
    gr.Textbox(
        lines=2,
        placeholder="Type your message here...",
        label="User message (if no audio)",
    ),
]

# Output widgets: the conversation view and the spoken reply.
output_components = [
    gr.Chatbot(label="Conversation"),
    gr.Audio(label="Assistant's Voice Reply"),
]

# Assemble the Gradio app around the request handler.
iface = gr.Interface(
    fn=chatbot_interface,
    inputs=input_components,
    outputs=output_components,
    title="Interactive Visual and Voice Assistant",
    description="Upload an image or video, record or type your question, and get detailed responses.",
)

# Start the web server with debug output enabled.
iface.launch(debug=True)