cheesyFishes committed
Commit 5e5d2bc · 1 Parent(s): 2557dbe

add initial source files

Files changed (3)
  1. README.md +4 -4
  2. app.py +243 -0
  3. requirements.txt +3 -0
README.md CHANGED
@@ -1,12 +1,12 @@
  ---
- title: Multimodal Vdr Demo
- emoji: 👀
- colorFrom: blue
+ title: Multimodal VDR Demo
+ emoji: 🦙📖
+ colorFrom: pink
  colorTo: blue
  sdk: gradio
  sdk_version: 5.11.0
  app_file: app.py
- pinned: false
+ pinned: true
  license: mit
  short_description: Multimodal retrieval using llamaindex/vdr-2b-multi-v1
  ---
app.py ADDED
@@ -0,0 +1,243 @@
+ import gradio as gr
+ import os
+ import torch
+
+ from llama_parse import LlamaParse
+ from llama_index.core import StorageContext, load_index_from_storage
+ from llama_index.core.indices import MultiModalVectorStoreIndex
+ from llama_index.core.schema import Document, ImageDocument
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+
+
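+ # Pre-built example indexes persisted to local directories; these are the
+ # choices offered in the "Use Existing Index" dropdown below.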
+ example_indexes = {
+     "IONIQ 2024": "./iconiq_report_index",
+     "Uber 10k 2021": "./uber_index",
+ }
+
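+ # Pick the best available device: CUDA, then Apple MPS, then CPU.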
+ device = "cpu"
+ if torch.cuda.is_available():
+     device = "cuda"
+ elif torch.backends.mps.is_available():
+     device = "mps"
+
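+ # Multi-modal embedding model used for page screenshots and queries;
+ # float16 weights keep memory usage down.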
+ image_embed_model = HuggingFaceEmbedding(
+     model_name="llamaindex/vdr-2b-multi-v1",
+     device=device,
+     trust_remote_code=True,
+     token=os.getenv("HUGGINGFACE_TOKEN"),
+     model_kwargs={"torch_dtype": torch.float16},
+     embed_batch_size=4,
+ )
+
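+ # Text-only embedding model used as the plain-text retrieval baseline.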
+ text_embed_model = HuggingFaceEmbedding(
+     model_name="BAAI/bge-small-en",
+     device=device,
+     trust_remote_code=True,
+     token=os.getenv("HUGGINGFACE_TOKEN"),
+     embed_batch_size=4,
+ )
+
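+ # Load a previously persisted multi-modal index from disk, attaching both
+ # embedding models for query-time embedding.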
+ def load_index(index_path: str) -> MultiModalVectorStoreIndex:
+     storage_context = StorageContext.from_defaults(persist_dir=index_path)
+     return load_index_from_storage(
+         storage_context,
+         embed_model=text_embed_model,
+         image_embed_model=image_embed_model,
+     )
+
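+ # Parse an uploaded PDF with LlamaParse (capturing page screenshots) and build
+ # a new multi-modal index from the extracted text and images.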
+ def create_index(file, llama_parse_key, progress=gr.Progress()):
+     if not file or not llama_parse_key:
+         return None, "Please provide both a file and LlamaParse API key"
+
+     try:
+         progress(0, desc="Initializing LlamaParse...")
+         parser = LlamaParse(
+             api_key=llama_parse_key,
+             take_screenshot=True,
+         )
+
+         # Process document
+         progress(0.2, desc="Processing document with LlamaParse...")
+         md_json_obj = parser.get_json_result(file.name)[0]
+
+         progress(0.4, desc="Downloading and processing images...")
+         image_dicts = parser.get_images(
+             [md_json_obj],
+             download_path=os.path.join(os.path.dirname(file.name), f"{file.name}_images")
+         )
+
+         # Create text document
+         progress(0.6, desc="Creating text documents...")
+         text = ""
+         for page in md_json_obj["pages"]:
+             text += page["md"] + "\n\n"
+         text_docs = [Document(text=text.strip())]
+
+         # Create image documents
+         progress(0.8, desc="Creating image documents...")
+         image_docs = []
+         for image_dict in image_dicts:
+             image_docs.append(ImageDocument(text=image_dict["name"], image_path=image_dict["path"]))
+
+         # Create index
+         progress(0.9, desc="Creating final index...")
+         index = MultiModalVectorStoreIndex.from_documents(
+             text_docs + image_docs,
+             embed_model=text_embed_model,
+             image_embed_model=image_embed_model,
+         )
+
+         progress(1.0, desc="Complete!")
+         return index, "Index created successfully!"
+
+     except Exception as e:
+         return None, f"Error creating index: {str(e)}"
+
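+ # Run retrieval for a query: text-to-image retrieval against the screenshot
+ # embeddings, and plain text retrieval against the text embeddings.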
+ def run_search(index, query, text_top_k, image_top_k):
+     if not index:
+         return "Please create or select an index first.", [], []
+
+     retriever = index.as_retriever(
+         similarity_top_k=text_top_k,
+         image_similarity_top_k=image_top_k,
+     )
+
+     image_nodes = retriever.text_to_image_retrieve(query)
+     text_nodes = retriever.text_retrieve(query)
+
+     # Extract text and scores from nodes
+     text_results = [{"text": node.text, "score": f"{node.score:.3f}"} for node in text_nodes]
+
+     # Load images and scores
+     image_results = []
+     for node in image_nodes:
+         if hasattr(node.node, 'image_path') and os.path.exists(node.node.image_path):
+             try:
+                 image_results.append((
+                     node.node.image_path,
+                     f"Similarity: {node.score:.3f}",
+                 ))
+             except Exception as e:
+                 print(f"Error loading image {node.node.image_path}: {e}")
+
+     return "Search completed!", text_results, image_results
+
+ # Create the Gradio interface
+ with gr.Blocks() as demo:
+     gr.Markdown("# Multi-Modal Retrieval with LlamaIndex and llamaindex/vdr-2b-multi-v1")
+     gr.Markdown("""
+ This demo shows how to use the new `llamaindex/vdr-2b-multi-v1` model for multi-modal document search.
+
+ Using this model, we can index images and perform text-to-image retrieval.
+
+ This demo compares against pure text retrieval using the `BAAI/bge-small-en` model. Is this a fair comparison? Not really,
+ but it's the easiest to run in a limited Hugging Face Space, and it shows the strengths of screenshot-based retrieval.
+ """
+     )
+
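+     # Layout: left column for index selection and search controls, right column for results.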
+     with gr.Row():
+         with gr.Column():
+             # Index selection/creation
+             with gr.Tab("Use Existing Index"):
+                 existing_index_dropdown = gr.Dropdown(
+                     choices=list(example_indexes.keys()),
+                     label="Select Pre-made Index",
+                     value=list(example_indexes.keys())[0]
+                 )
+
+             with gr.Tab("Create New Index"):
+                 gr.Markdown(
+                     """
+ To create a new index, enter your LlamaParse API key and upload a PDF.
+
+ You can get a free API key by signing up [here](https://cloud.llamaindex.ai).
+
+ Processing will take a few minutes when creating a new index, depending on the size of the document.
+                     """
+                 )
+                 file_upload = gr.File(label="Upload PDF")
+                 llama_parse_key = gr.Textbox(
+                     label="LlamaParse API Key",
+                     type="password"
+                 )
+                 create_btn = gr.Button("Create Index")
+                 create_status = gr.Textbox(label="Status", interactive=False)
+
+             # Search controls
+             query_input = gr.Textbox(label="Search Query", value="What is the Executive Summary?")
+             with gr.Row():
+                 text_top_k = gr.Slider(
+                     minimum=1,
+                     maximum=10,
+                     value=2,
+                     step=1,
+                     label="Text Top-K"
+                 )
+                 image_top_k = gr.Slider(
+                     minimum=1,
+                     maximum=10,
+                     value=2,
+                     step=1,
+                     label="Image Top-K"
+                 )
+             search_btn = gr.Button("Search")
+
+         with gr.Column():
+             # Results display
+             status_output = gr.Textbox(label="Search Status")
+             image_output = gr.Gallery(
+                 label="Retrieved Images",
+                 show_label=True,  # This will show the similarity score captions
+                 elem_id="gallery"
+             )
+             text_output = gr.JSON(
+                 label="Retrieved Text with Similarity Scores",
+                 elem_id="text_results"
+             )
+
+     # State
+     index_state = gr.State()
+
+     # Load default index on startup
+     default_index = load_index(example_indexes["IONIQ 2024"])
+     index_state.value = default_index
+
+     # Event handlers
+     def load_existing_index(index_name):
+         if index_name:
+             try:
+                 index = load_index(example_indexes[index_name])
+                 return index, f"Loaded index: {index_name}"
+             except Exception as e:
+                 return None, f"Error loading index: {str(e)}"
+         return None, "No index selected"
+
+     existing_index_dropdown.change(
+         fn=load_existing_index,
+         inputs=[existing_index_dropdown],
+         outputs=[index_state, create_status],
+         api_name=False
+     )
+
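+     # Build a new index from the uploaded PDF when "Create Index" is clicked.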
+     create_btn.click(
+         fn=create_index,
+         inputs=[file_upload, llama_parse_key],
+         outputs=[index_state, create_status],
+         api_name=False,
+         show_progress=True  # Enable progress bar
+     )
+
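+     # Run both retrieval modes for the current query when "Search" is clicked.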
+     search_btn.click(
+         fn=run_search,
+         inputs=[index_state, query_input, text_top_k, image_top_k],
+         outputs=[status_output, text_output, image_output],
+         api_name=False
+     )
+
+     gr.Markdown("""
+ This demo was built with [LlamaIndex](https://docs.llamaindex.ai) and [LlamaParse](https://cloud.llamaindex.ai). To see more multi-modal demos, check out the [llama parse examples](https://github.com/run-llama/llama_parse/tree/main/examples/multimodal).
+ """
+     )
+
+ if __name__ == "__main__":
+     demo.launch()
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ llama-index-core==0.12.10
+ llama-index-embeddings-huggingface==0.5.0
+ llama-parse==0.5.19