"""Interactive visual and voice assistant.

Wires a LLaVA image-to-text pipeline, a Whisper speech-recognition pipeline
and gTTS speech synthesis into a Gradio interface. The user supplies an image
or a video plus a spoken or typed question; the app answers in text and audio.
"""

import datetime
import locale
import os
import re
import time
import warnings

import cv2
import gradio as gr
# For video processing
import moviepy.editor as mp
import nltk
import numpy as np
import requests
import torch
import whisper
from gtts import gTTS
from nltk import sent_tokenize
from PIL import Image
from transformers import BitsAndBytesConfig, pipeline

nltk.download('punkt')

# Load the LLaVA model
model_id = "llava-hf/llava-1.5-7b-hf"
pipe = pipeline("image-to-text", model=model_id)

# Load the Whisper model using pipeline
pipe_audio = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3")

# Conversation history as (user message, assistant reply) pairs, which is the
# tuple format gr.Chatbot renders.
conversation_history = []

# One log file per application run; every writehistory() call appends to it.
_LOG_FILE = f'{datetime.datetime.now().strftime("%Y%m%d_%H%M%S")}_log.txt'

# Captures everything after the "ASSISTANT:" marker. DOTALL keeps replies that
# span several lines intact instead of cutting them at the first newline.
_REPLY_PATTERN = re.compile(r'ASSISTANT:\s*(.*)', re.DOTALL)


def writehistory(text):
    """Append ``text`` as one line to this run's log file."""
    with open(_LOG_FILE, 'a', encoding='utf-8') as f:
        f.write(text + '\n')


def img2txt(input_text, input_image):
    """Answer ``input_text`` about the image stored at ``input_image``.

    Args:
        input_text: The user's question. If a tuple is passed, its first
            element is used.
        input_image: Path of the image file to analyse.

    Returns:
        The assistant's reply, or the exception message as a string if
        anything fails (errors are reported in the chat rather than raised).
    """
    try:
        if isinstance(input_text, tuple):
            input_text = input_text[0]  # Take the first element if it's a tuple

        writehistory(f"Input text: {input_text}")

        # LLaVA-1.5 expects an <image> placeholder in the user turn.
        prompt = "USER: <image>\n" + input_text + "\nASSISTANT:"

        with Image.open(input_image) as image:
            outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": 200})

        if not outputs or not outputs[0]["generated_text"]:
            return "No response generated."

        match = _REPLY_PATTERN.search(outputs[0]["generated_text"])
        reply = match.group(1).strip() if match else ""
        return reply or "No response found."
    except Exception as e:
        return str(e)


def vid2txt(input_text, input_video):
    """Answer ``input_text`` about a single frame taken from ``input_video``.

    The frame at the 1-second mark is used; for clips no longer than one
    second the middle frame is used instead. The frame is written to
    ``temp_frame.jpg`` and handed to :func:`img2txt`.

    Returns:
        The assistant's reply, or the exception message as a string on failure.
    """
    try:
        video = mp.VideoFileClip(input_video)
        try:
            duration = video.duration
            # Sample at 1s unless the clip is too short for that timestamp.
            timestamp = duration / 2 if duration and duration <= 1 else 1
            frame = video.get_frame(timestamp)
        finally:
            video.close()  # Release the underlying reader process/file handles

        image_path = "temp_frame.jpg"
        mp.ImageClip(frame).save_frame(image_path)
        return img2txt(input_text, image_path)
    except Exception as e:
        return str(e)


def transcribe(audio_path):
    """Transcribe the audio file at ``audio_path`` with the Whisper pipeline.

    Returns an empty string when no path is given.
    """
    if not audio_path:
        return ''

    result = pipe_audio(audio_path)
    return result["text"]


def text_to_speech(text, file_path):
    """Synthesize ``text`` as English speech, save it to ``file_path`` and return the path."""
    language = 'en'
    audioobj = gTTS(text=text, lang=language, slow=False)
    audioobj.save(file_path)
    return file_path


def chatbot_interface(audio_path, image_path, video_path, user_message):
    """Process one round of user input and produce the chatbot response.

    Args:
        audio_path: Path of a recorded question, or a falsy value if none.
        image_path: Path of an uploaded image, or a falsy value if none.
        video_path: Path of an uploaded video, used only when no image is given.
        user_message: Typed question; takes precedence over the transcription.

    Returns:
        A tuple ``(conversation_history, audio_reply_path)``. The audio path
        is ``None`` when there is no reply text to synthesize.
    """
    # Handle audio input
    speech_to_text_output = transcribe(audio_path) if audio_path else ""

    # Typed text wins over the transcribed recording
    input_message = user_message if user_message else speech_to_text_output

    # Ensure input_message is a string
    if isinstance(input_message, tuple):
        input_message = input_message[0]

    # Handle image or video input
    if image_path:
        chatgpt_output = img2txt(input_message, image_path)
    elif video_path:
        chatgpt_output = vid2txt(input_message, video_path)
    else:
        chatgpt_output = "No image or video provided."

    # Record the turn exactly once, as a (user, assistant) pair for gr.Chatbot
    conversation_history.append((input_message, chatgpt_output))

    # Generate audio response; gTTS rejects empty text, so skip it in that case
    if chatgpt_output and chatgpt_output.strip():
        processed_audio_path = text_to_speech(chatgpt_output, "Temp3.mp3")
    else:
        processed_audio_path = None

    return conversation_history, processed_audio_path


# Define Gradio interface
iface = gr.Interface(
    fn=chatbot_interface,
    inputs=[
        gr.Audio(type="filepath", label="Record your message"),
        gr.Image(type="filepath", label="Upload an image"),
        gr.Video(label="Upload a video"),
        gr.Textbox(lines=2, placeholder="Type your message here...", label="User message (if no audio)")
    ],
    outputs=[
        gr.Chatbot(label="Conversation"),
        gr.Audio(label="Assistant's Voice Reply")
    ],
    title="Interactive Visual and Voice Assistant",
    description="Upload an image or video, record or type your question, and get detailed responses."
)

# Launch the Gradio app
iface.launch(debug=True)