Pontonkid committed
Commit 9d22ee4 · verified · 1 Parent(s): c583909

Update app.py

Files changed (1):
1. app.py +6 -12
app.py CHANGED

@@ -29,10 +29,8 @@ model_id = "llava-hf/llava-1.5-7b-hf"
 pipe = pipeline("image-to-text", model=model_id)
 
 
-# Load the whisper model
-DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-model = whisper.load_model("medium", device=DEVICE)
-
+# Load the Whisper model using pipeline
+pipe_audio = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3")
 
 # Initialize conversation history
 conversation_history = []
@@ -54,7 +52,7 @@ def img2txt(input_text, input_image):
     writehistory(f"Input text: {input_text}")
     prompt = "USER: <image>\n" + input_text + "\nASSISTANT:"
     while True:
-        outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": 200})
+        outputs = pipe_image(image, prompt=prompt, generate_kwargs={"max_new_tokens": 200})
 
         if outputs and outputs[0]["generated_text"]:
             match = re.search(r'ASSISTANT:\s*(.*)', outputs[0]["generated_text"])
@@ -80,16 +78,12 @@ def vid2txt(input_text, input_video):
         return str(e)
 
 def transcribe(audio_path):
-    """Transcribe audio to text using Whisper model."""
+    """Transcribe audio to text using Whisper pipeline."""
     if not audio_path:
         return ''
 
-    audio = whisper.load_audio(audio_path)
-    audio = whisper.pad_or_trim(audio)
-    mel = whisper.log_mel_spectrogram(audio).to(model.device)
-    options = whisper.DecodingOptions()
-    result = whisper.decode(model, mel, options)
-    return result.text
+    result = pipe_audio(audio_path)
+    return result["text"]
 
 def text_to_speech(text, file_path):
     """Convert text to speech and save to file."""