Pontonkid committed
Commit 9d22ee4 · verified · 1 Parent(s): c583909

Update app.py

Files changed (1):
1. app.py +6 -12
app.py CHANGED

@@ -29,10 +29,8 @@ model_id = "llava-hf/llava-1.5-7b-hf"
 pipe = pipeline("image-to-text", model=model_id)
 
 
-# Load the whisper model
-DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-model = whisper.load_model("medium", device=DEVICE)
-
+# Load the Whisper model using pipeline
+pipe_audio = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3")
 
 # Initialize conversation history
 conversation_history = []
@@ -54,7 +52,7 @@ def img2txt(input_text, input_image):
     writehistory(f"Input text: {input_text}")
     prompt = "USER: <image>\n" + input_text + "\nASSISTANT:"
     while True:
-        outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": 200})
+        outputs = pipe_image(image, prompt=prompt, generate_kwargs={"max_new_tokens": 200})
 
         if outputs and outputs[0]["generated_text"]:
             match = re.search(r'ASSISTANT:\s*(.*)', outputs[0]["generated_text"])
@@ -80,16 +78,12 @@ def vid2txt(input_text, input_video):
         return str(e)
 
 def transcribe(audio_path):
-    """Transcribe audio to text using Whisper model."""
+    """Transcribe audio to text using Whisper pipeline."""
     if not audio_path:
         return ''
 
-    audio = whisper.load_audio(audio_path)
-    audio = whisper.pad_or_trim(audio)
-    mel = whisper.log_mel_spectrogram(audio).to(model.device)
-    options = whisper.DecodingOptions()
-    result = whisper.decode(model, mel, options)
-    return result.text
+    result = pipe_audio(audio_path)
+    return result["text"]
 
 def text_to_speech(text, file_path):
     """Convert text to speech and save to file."""