File size: 2,988 Bytes
ce1fcfd
 
 
 
 
 
 
 
 
 
 
5ef5f1c
ce1fcfd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5ef5f1c
ce1fcfd
 
 
 
5ef5f1c
ce1fcfd
 
5ef5f1c
 
334df79
 
ce1fcfd
 
 
5ef5f1c
334df79
 
ce1fcfd
a2d8574
ce1fcfd
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import gradio as gr
import os

from torch import is_inference
from pq3d.inference import inference

# Directory that holds one ``<scene>.glb`` mesh file per selectable scene.
MESH_DIR = 'assets/mesh'
# Sorted scene names offered in the dropdown: the stems of the ``.glb`` files
# found in MESH_DIR.  Entries with any other extension (hidden files, notes,
# textures, ...) are skipped because change_scene() always loads
# ``<name>.glb``; listing them would offer scenes that cannot be displayed.
MESH_NAMES = sorted(
    stem
    for stem, ext in map(os.path.splitext, os.listdir(MESH_DIR))
    if ext == '.glb'
)

def change_scene(dropdown_scene: str):
    """Translate the selected scene name into the dropdown's two outputs.

    Returns a 2-tuple:
      * the path of the scene's ``.glb`` mesh inside ``MESH_DIR``;
      * the scene name itself, unchanged.
    """
    mesh_path = os.path.join(MESH_DIR, f'{dropdown_scene}.glb')
    return mesh_path, dropdown_scene

# Scene selected when the page first loads.  Kept in one place so the dropdown,
# the 3D viewer, the scan_id textbox and the examples all start out in sync.
DEFAULT_SCENE = 'scene0050_00'

with gr.Blocks(title='PQ3D Demo') as demo:
    # Page header.
    gr.HTML(value="<h1 align='center'>Unifying 3D Vision Language Understanding via Promptable Queries </h1>")
    # Disabled header elements; they reference the LEO project's logo, teaser, links and tagline.
    #gr.HTML(value="<div align='center' style='margin-top:-1em; margin-bottom:-1em;'><img src='/file=assets/leo.svg' width='4%'></div>")
    # gr.HTML(value="<img src='/file=assets/teaser.png' alt='Teaser' width='760px' style='display: block; margin: auto;'>")
    #gr.HTML(value="<p align='center' style='font-size: 1.2em; color: #485fc7;'><a href='https://arxiv.org/abs/2311.12871' target='_blank'>arXiv</a> | <a href='https://embodied-generalist.github.io/' target='_blank'>Project Page</a> | <a href='https://github.com/embodied-generalist/embodied-generalist' target='_blank'>Code</a></p>")
    #gr.HTML(value="<p align='center' style='font-size: 1.15em;'><i>LEO: an embodied generalist agent capable of perceiving, grounding, reasoning, planning, and acting in 3D world.</i></p>")

    # Scene picker and viewer for the full scene mesh.
    with gr.Row():
        with gr.Column(scale=5):
            dropdown_scene = gr.Dropdown(
                choices=MESH_NAMES,
                value=DEFAULT_SCENE,
                interactive=True,
                label='Select a 3D scene',
            )
            model_3d = gr.Model3D(
                value=os.path.join(MESH_DIR, f'{DEFAULT_SCENE}.glb'),
                clear_color=[0.0, 0.0, 0.0, 0.0],
                label='3D Scene',
                camera_position=(80, 100, 6),
                height=659,
            )
            gr.HTML(
                """<center><strong>
                👆 SCROLL and DRAG on the 3D Scene
                to zoom in/out and rotate. Press CTRL and DRAG to pan.
                </strong></center>
                """
            )
    # Textbox carrying the current scene id; it is updated by the dropdown
    # and fed to the inference interface below as its second input.
    scan_id = gr.Text(DEFAULT_SCENE, label='scan_id')

    # Selecting another scene swaps the displayed mesh and the scan_id value.
    dropdown_scene.change(
        fn=change_scene,
        inputs=[dropdown_scene],
        outputs=[model_3d, scan_id],
        queue=False
    )

    def inference_wrapper(text, scan_id):
        """Run ``inference`` for a text query on a scene.

        Args:
            text: the query typed by the user.
            scan_id: id of the scene to query.

        Returns:
            A 2-tuple of the path to the ``.glb`` mask mesh for the instance
            id returned by ``inference`` and the response it returned.
            ``inference`` is expected to return ``(inst_id, response)``.
        """
        inst_id, response = inference(scan_id, text)
        return f"assets/mask/{scan_id}/{scan_id}_obj_{inst_id}.glb", response

    # Query panel: text + scan_id in, mask mesh + text response out.
    gr.Interface(
        fn=inference_wrapper,
        inputs=["text", scan_id],
        outputs=[
            gr.Model3D(
                clear_color=[0.0, 0.0, 0.0, 0.0],
                camera_position=(80, 100, 6),
                label="3D Model",
            ),
            "text",
        ],
        examples=[
            ["armchair", DEFAULT_SCENE],
            ["Sofa", DEFAULT_SCENE],
            ["left computer on the desk", DEFAULT_SCENE],
        ],
        title="Input text, Output 3D Mask,  Red denotes predicted object"
    )

# Turn on request queuing, then start the server with a public share link;
# files under 'assets' are whitelisted so the .glb meshes can be served.
demo.queue()
demo.launch(share=True, allowed_paths=['assets'])