# Hugging Face Space status: Running on Zero
import torch | |
from transformers import AutoModelForCausalLM, AutoTokenizer | |
from PIL import Image | |
import numpy as np | |
import os | |
import gradio as gr | |
# Load the Sa2VA checkpoint and its tokenizer once, at import time, so every
# request served by the UI reuses the same objects.
model_path = "ByteDance/Sa2VA-4B"

# trust_remote_code=True lets transformers run the modelling code shipped in
# the checkpoint repository.
model = AutoModelForCausalLM.from_pretrained(
    model_path, torch_dtype="auto", device_map="auto", trust_remote_code=True
)
# Switch to inference mode and place the model on the GPU.
# NOTE(review): device_map="auto" may already have placed the weights, in
# which case .cuda() is redundant - confirm before removing it.
model = model.eval().cuda()

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
def image_vision(image_input_path, prompt):
    """Ask the model a question about one image and return its answer.

    Args:
        image_input_path: The image to query. Normally a filesystem path,
            which is opened with PIL. A ``PIL.Image.Image`` or a NumPy array
            is accepted as well, so the function also works when the caller
            hands over already-decoded pixel data.
        prompt: The user's instruction; it is appended to the ``<image>``
            placeholder to form the text sent to the model.

    Returns:
        The value stored under ``"prediction"`` in the dict returned by
        ``model.predict_forward`` (the text-format answer).

    Raises:
        ValueError: If no image was supplied.
    """
    if image_input_path is None:
        raise ValueError("No image provided.")

    if isinstance(image_input_path, Image.Image):
        image = image_input_path.convert('RGB')
    elif isinstance(image_input_path, np.ndarray):
        # presumably an (H, W, C) uint8 array as produced by an image widget;
        # verify against the caller if other dtypes can arrive here.
        image = Image.fromarray(image_input_path).convert('RGB')
    else:
        # convert() returns a fully loaded copy, so the underlying file can be
        # closed as soon as the conversion is done.
        with Image.open(image_input_path) as source:
            image = source.convert('RGB')

    text_prompts = f"<image>{prompt}"
    input_dict = {
        'image': image,
        'text': text_prompts,
        'past_text': '',
        'mask_prompts': None,
        'tokenizer': tokenizer,
    }
    return_dict = model.predict_forward(**input_dict)
    answer = return_dict["prediction"]  # the text format answer
    print(answer)
    # Hand the answer back; without this the caller only ever receives None.
    return answer
def main_infer(image_input_path, prompt):
    """Run inference for one request by delegating to ``image_vision``.

    Args:
        image_input_path: The image argument, forwarded unchanged.
        prompt: The instruction text, forwarded unchanged.

    Returns:
        Whatever ``image_vision`` returns, unchanged.
    """
    return image_vision(image_input_path, prompt)
# Gradio UI: an image upload and an instruction box on the left, the model's
# response on the right.
with gr.Blocks() as demo:
    with gr.Column():
        gr.Markdown("# Sa2VA: Marrying SAM2 with LLaVA for Dense Grounded Understanding of Images and Videos")
        with gr.Row():
            with gr.Column():
                # type="filepath" makes Gradio pass a path string to the
                # handler, which is what image_vision opens with PIL; the
                # default would deliver a NumPy array instead.
                image_input = gr.Image(label="Image IN", type="filepath")
                with gr.Row():
                    instruction = gr.Textbox(label="Instruction")
                    submit_btn = gr.Button("Submit", scale=1)
            with gr.Column():
                output_res = gr.Textbox(label="Response")

    # Clicking the button sends the image and instruction to main_infer and
    # writes its return value into the response box.
    submit_btn.click(
        fn=main_infer,
        inputs=[image_input, instruction],
        outputs=[output_res],
    )

# Queue incoming requests and surface handler exceptions in the browser.
demo.queue().launch(show_api=False, show_error=True)