fffiloni committed on
Commit
35a9ed4
·
verified ·
1 Parent(s): 7834e6c

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +64 -0
app.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import AutoTokenizer, AutoModel
3
+ from PIL import Image
4
+ import numpy as np
5
+ import os
6
+ import gradio as gr
7
+
8
# Load the model and tokenizer once, at import time, so every request reuses them.
# Both are fetched from the same checkpoint identifier.
model_path = "ByteDance/Sa2VA-4B"

# trust_remote_code is required because the checkpoint ships its own modeling code.
model = AutoModel.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    use_flash_attn=True,
    trust_remote_code=True,
)
# Switch to inference mode, then move the weights to the GPU.
model = model.eval()
model = model.cuda()

tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    trust_remote_code=True,
    use_fast=False,
)
24
+
25
def image_vision(image_input_path, prompt):
    """Answer a text prompt about a single image using the loaded model.

    Args:
        image_input_path: The image to query. Normally a filesystem path
            (anything ``PIL.Image.open`` accepts). A ``PIL.Image.Image`` or a
            NumPy array is accepted too, so the function keeps working when
            the Gradio image component is not configured to return file paths.
        prompt: The user's instruction. It is appended to the ``<image>``
            placeholder before being handed to the model.

    Returns:
        The ``"prediction"`` entry of the dict returned by
        ``model.predict_forward`` (the answer in text form).

    Raises:
        ValueError: If no image was supplied.
    """
    if image_input_path is None:
        raise ValueError("An input image is required.")

    # Normalise every supported input kind to a PIL image.
    if isinstance(image_input_path, Image.Image):
        image = image_input_path
    elif isinstance(image_input_path, np.ndarray):
        image = Image.fromarray(image_input_path)
    else:
        image = Image.open(image_input_path)
    image = image.convert('RGB')

    text_prompts = f"<image>{prompt}"
    input_dict = {
        'image': image,
        'text': text_prompts,
        'past_text': '',
        'mask_prompts': None,
        'tokenizer': tokenizer,
    }
    return_dict = model.predict_forward(**input_dict)
    answer = return_dict["prediction"]  # the text format answer
    print(answer)
    # The original only printed the answer; callers need it returned to show it.
    return answer
39
+
40
def main_infer(image_input_path, prompt):
    """Gradio callback: forward the image and prompt to ``image_vision``.

    Returns whatever ``image_vision`` returns, unchanged.
    """
    return image_vision(image_input_path, prompt)
44
+
45
+ # Gradio UI
46
+
47
with gr.Blocks() as demo:
    with gr.Column():
        with gr.Row():
            # Left column: image upload plus the instruction/submit row.
            with gr.Column():
                # type="filepath" makes Gradio pass a path string to the
                # callback, which is what main_infer/image_vision expect
                # (the default would pass a NumPy array).
                image_input = gr.Image(label="Image IN", type="filepath")
                with gr.Row():
                    instruction = gr.Textbox(label="Instruction")
                    submit_btn = gr.Button("Submit", scale=1)
            # Right column: the model's text answer.
            with gr.Column():
                output_res = gr.Textbox(label="Response")

    # Event listeners must be registered inside the Blocks context.
    submit_btn.click(
        fn=main_infer,
        inputs=[image_input, instruction],
        outputs=[output_res],
    )

demo.queue().launch(show_api=False, show_error=True)