Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -28,6 +28,16 @@ tokenizer = AutoTokenizer.from_pretrained(
     trust_remote_code = True,
 )

+from third_parts import VideoReader
+def read_video(video_path, video_interval):
+    vid_frames = VideoReader(video_path)[::video_interval]
+    for frame_idx in range(len(vid_frames)):
+        frame_image = vid_frames[frame_idx]
+        frame_image = frame_image[..., ::-1] # BGR (opencv system) to RGB (numpy system)
+        frame_image = Image.fromarray(frame_image)
+        vid_frames[frame_idx] = frame_image
+    return vid_frames
+
 def visualize(pred_mask, image_path, work_dir):
     visualizer = Visualizer()
     img = cv2.imread(image_path)
@@ -56,13 +66,6 @@ def image_vision(image_input_path, prompt):

     seg_image = return_dict["prediction_masks"]

-    return answer, seg_image
-
-def main_infer(image_input_path, prompt):
-
-    answer, seg_image = image_vision(image_input_path, prompt)
-
-
     if '[SEG]' in answer and Visualizer is not None:
         pred_masks = seg_image[0]
         temp_dir = tempfile.mkdtemp()
@@ -73,26 +76,58 @@ def main_infer(image_input_path, prompt):
     else:
         return answer, None

+def video_vision(video_input_path, prompt):
+    vid_frames = read_video(video_input_path, video_interval=6)
+    # create a question (<image> is a placeholder for the video frames)
+    question = f"<image>{prompt}"
+    result = model.predict_forward(
+        video=vid_frames,
+        text=question,
+        tokenizer=tokenizer,
+    )
+    prediction = result['prediction']
+    print(prediction)
+
+    return result['prediction'], None
+
+

 # Gradio UI

 with gr.Blocks() as demo:
     with gr.Column():
         gr.Markdown("# Sa2VA: Marrying SAM2 with LLaVA for Dense Grounded Understanding of Images and Videos")
-        with gr.
-        with gr.
-
-
-
-
-
-
-
-
-
-
-
-
-
+        with gr.Tab("Single Image"):
+            with gr.Row():
+                with gr.Column():
+                    image_input = gr.Image(label="Image IN", type="filepath")
+                    with gr.Row():
+                        instruction = gr.Textbox(label="Instruction", scale=4)
+                        submit_image_btn = gr.Button("Submit", scale=1)
+                with gr.Column():
+                    output_res = gr.Textbox(label="Response")
+                    output_image = gr.Image(label="Segmentation", type="numpy")
+
+            submit_image_btn.click(
+                fn = image_vision,
+                inputs = [image_input, instruction],
+                outputs = [output_res, output_image]
+            )
+        with gr.Tab("Video"):
+            with gr.Row():
+                with gr.Column():
+                    video_input = gr.Image(label="Video IN")
+                    with gr.Row():
+                        vid_instruction = gr.Textbox(label="Instruction", scale=4)
+                        submit_video_btn = gr.Button("Submit", scale=1)
+                with gr.Column():
+                    vid_output_res = gr.Textbox(label="Response")
+                    output_video = gr.Video(label="Segmentation")
+
+            submit_video_btn.click(
+                fn = video_vision,
+                inputs = [video_input, vid_instruction],
+                outputs = [vid_output_res, output_video]
+            )

 demo.queue().launch(show_api=False, show_error=True)
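The new read_video helper uses the repo's third_parts.VideoReader to sample every Nth frame, flip BGR to RGB, and wrap each frame as a PIL image for model.predict_forward. As a rough stand-in for readers without third_parts available, the same sampling can be sketched with plain OpenCV; the function name read_video_cv2 below is hypothetical and not part of the Space's code.

import cv2
from PIL import Image

# Illustrative sketch only: approximates what read_video does, using cv2.VideoCapture
# instead of the third_parts.VideoReader that the Space actually imports.
def read_video_cv2(video_path, video_interval=6):
    cap = cv2.VideoCapture(video_path)
    frames = []
    idx = 0
    while True:
        ok, frame = cap.read()
        if not ok:
            break
        if idx % video_interval == 0:
            # OpenCV decodes frames as BGR; convert to RGB before wrapping as PIL,
            # mirroring the frame_image[..., ::-1] step in the diff.
            frames.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
        idx += 1
    cap.release()
    return frames

Either way, the result is the list of PIL frames that video_vision passes to model.predict_forward as video=vid_frames.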