import os

import gradio as gr
import torch
from llama_index.core import StorageContext, load_index_from_storage
from llama_index.core.indices import MultiModalVectorStoreIndex
from llama_index.core.schema import Document, ImageDocument
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_parse import LlamaParse

# Pre-built indexes shipped with the demo
example_indexes = {
    "ICONIQ 2024": "./iconiq_report_index",
    "Uber 10k 2021": "./uber_index",
}

DEFAULT_INDEX = "ICONIQ 2024"

# Pick the best available device: CUDA, then Apple MPS, then CPU
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"

# Multi-modal embedding model used for image (screenshot) retrieval
image_embed_model = HuggingFaceEmbedding(
    model_name="llamaindex/vdr-2b-multi-v1",
    device=device,
    trust_remote_code=True,
    token=os.getenv("HUGGINGFACE_TOKEN"),
    model_kwargs={"torch_dtype": torch.float16},
    embed_batch_size=2,
)

# Lightweight text embedding model used for plain-text retrieval
text_embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-small-en",
    device=device,
    trust_remote_code=True,
    token=os.getenv("HUGGINGFACE_TOKEN"),
    embed_batch_size=1,
)


class IndexManager:
    """Holds the active index so it never has to be deep-copied into gr.State."""

    def __init__(self):
        self.current_index = None
        # Initialize with the default index
        self.load_index(example_indexes[DEFAULT_INDEX])

    def load_index(self, index_path):
        storage_context = StorageContext.from_defaults(persist_dir=index_path)
        self.current_index = load_index_from_storage(
            storage_context,
            embed_model=text_embed_model,
            image_embed_model=image_embed_model,
        )
        return f"Loaded index: {index_path}"

    def set_index(self, index):
        self.current_index = index

    def get_index(self):
        return self.current_index


index_manager = IndexManager()


def load_index(index_path: str) -> MultiModalVectorStoreIndex:
    index_manager.load_index(index_path)
    return index_manager.get_index()


def create_index(file, llama_parse_key, progress=gr.Progress()):
    # The click handler has a single output, so always return one status string
    if not file or not llama_parse_key:
        return "Please provide both a file and a LlamaParse API key"

    try:
        progress(0, desc="Initializing LlamaParse...")
        parser = LlamaParse(
            api_key=llama_parse_key,
            take_screenshot=True,
        )

        # Parse the document into markdown JSON (one result per input file)
        progress(0.2, desc="Processing document with LlamaParse...")
        md_json_obj = parser.get_json_result(file.name)[0]

        # Download the page screenshots produced by take_screenshot=True
        progress(0.4, desc="Downloading and processing images...")
        image_dicts = parser.get_images(
            [md_json_obj],
            download_path=os.path.join(
                os.path.dirname(file.name), f"{file.name}_images"
            ),
        )

        # Concatenate the per-page markdown into a single text document
        progress(0.6, desc="Creating text documents...")
        text = ""
        for page in md_json_obj["pages"]:
            text += page["md"] + "\n\n"
        text_docs = [Document(text=text.strip())]

        # Wrap each downloaded screenshot in an ImageDocument
        progress(0.8, desc="Creating image documents...")
        image_docs = []
        for image_dict in image_dicts:
            image_docs.append(
                ImageDocument(text=image_dict["name"], image_path=image_dict["path"])
            )

        # Build a multi-modal index over both the text and the images
        progress(0.9, desc="Creating final index...")
        index = MultiModalVectorStoreIndex.from_documents(
            text_docs + image_docs,
            embed_model=text_embed_model,
            image_embed_model=image_embed_model,
        )

        progress(1.0, desc="Complete!")
        index_manager.set_index(index)
        return "Index created successfully!"
    except Exception as e:
        return f"Error creating index: {str(e)}"


def run_search(query, text_top_k, image_top_k):
    index = index_manager.get_index()
    if not index:
        return "Please create or select an index first.", [], []

    # Gradio sliders may hand back floats, so cast the top-k values to int
    retriever = index.as_retriever(
        similarity_top_k=int(text_top_k),
        image_similarity_top_k=int(image_top_k),
    )

    # Text-to-image retrieval with vdr-2b-multi-v1, plain text retrieval with bge-small-en
    image_nodes = retriever.text_to_image_retrieve(query)
    text_nodes = retriever.text_retrieve(query)

    # Extract text and scores from the retrieved nodes
    text_results = [
        {"text": node.text, "score": f"{node.score:.3f}"} for node in text_nodes
    ]

    # Collect (image path, caption) pairs for the gallery
    image_results = []
    for node in image_nodes:
        if hasattr(node.node, "image_path") and os.path.exists(node.node.image_path):
            try:
                image_results.append(
                    (
                        node.node.image_path,
                        f"Similarity: {node.score:.3f}",
                    )
                )
            except Exception as e:
                print(f"Error loading image {node.node.image_path}: {e}")

    return "Search completed!", text_results, image_results


# Create the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown(
        "# Multi-Modal Retrieval with LlamaIndex and llamaindex/vdr-2b-multi-v1"
    )
    gr.Markdown(
        """
This demo shows how to use the new `llamaindex/vdr-2b-multi-v1` model for multi-modal document search.
Using this model, we can index images and perform text-to-image retrieval.

For comparison, the demo also runs pure text retrieval using the `BAAI/bge-small-en` model. Is that a fair comparison?
Not really, but it's the easiest setup to run in a resource-limited Hugging Face Space, and it shows the strengths of
screenshot-based retrieval.

The two pre-made indexes are:
- [ICONIQ 2024](https://cdn.prod.website-files.com/65e1d7fb19a3e64b5c36fb38/66eb856e019e59758ef73759_ICONIQ%20Analytics%20%2B%20Insights%20-%20State%20of%20AI%20Sep24.pdf): ICONIQ's 2024 State of AI report.
- [Uber 10k 2021](https://github.com/run-llama/llama_index/blob/main/docs/docs/examples/data/10k/uber_2021.pdf): Uber's 2021 10-K filing.
"""
    )

    with gr.Row():
        with gr.Column():
            # Index selection/creation
            with gr.Tab("Use Existing Index"):
                existing_index_dropdown = gr.Dropdown(
                    choices=list(example_indexes.keys()),
                    label="Select Pre-made Index",
                    value=list(example_indexes.keys())[0],
                )
            with gr.Tab("Create New Index"):
                gr.Markdown(
                    """
To create a new index, enter your LlamaParse API key and upload a PDF.

You can get a free API key by signing up [here](https://cloud.llamaindex.ai).

Creating a new index takes a few minutes, depending on the size of the document.
"""
                )
                file_upload = gr.File(label="Upload PDF")
                llama_parse_key = gr.Textbox(
                    label="LlamaParse API Key",
                    type="password",
                )
                create_btn = gr.Button("Create Index")

            create_status = gr.Textbox(label="Status", interactive=False)

            # Search controls
            query_input = gr.Textbox(
                label="Search Query", value="What is the Executive Summary?"
            )
            with gr.Row():
                text_top_k = gr.Slider(
                    minimum=1,
                    maximum=10,
                    value=2,
                    step=1,
                    label="Text Top-K",
                )
                image_top_k = gr.Slider(
                    minimum=1,
                    maximum=10,
                    value=2,
                    step=1,
                    label="Image Top-K",
                )

            search_btn = gr.Button("Search")

        with gr.Column():
            # Results display
            status_output = gr.Textbox(label="Search Status")
            image_output = gr.Gallery(
                label="Retrieved Images",
                show_label=True,
                # Gallery captions carry the similarity scores
                elem_id="gallery",
            )
            text_output = gr.JSON(
                label="Retrieved Text with Similarity Scores",
                elem_id="text_results",
            )

    # Event handlers
    def load_existing_index(index_name, progress=gr.Progress()):
        if index_name:
            try:
                progress(0, desc="Loading index...")
                result = index_manager.load_index(example_indexes[index_name])
                progress(1.0, desc="Index loaded!")
                return result, None
            except Exception as e:
                return f"Error loading index: {str(e)}", None
        return "No index selected", None

    existing_index_dropdown.change(
        fn=load_existing_index,
        inputs=[existing_index_dropdown],
        outputs=[create_status, query_input],
        api_name=False,
        show_progress=True,
    )

    create_btn.click(
        fn=create_index,
        inputs=[file_upload, llama_parse_key],
        outputs=[create_status],
        api_name=False,
        show_progress=True,
    )

    search_btn.click(
        fn=run_search,
        inputs=[query_input, text_top_k, image_top_k],
        outputs=[status_output, text_output, image_output],
        api_name=False,
    )

    gr.Markdown(
        """
This demo was built with [LlamaIndex](https://docs.llamaindex.ai) and [LlamaParse](https://cloud.llamaindex.ai).

To see more multi-modal demos, check out the [LlamaParse examples](https://github.com/run-llama/llama_parse/tree/main/examples/multimodal).
"""
    )


if __name__ == "__main__":
    # Running locally
    demo.launch()