Spaces: Running

geekyrakshit committed on Commit 170d9a9
Parent(s): e6f968c

update: app

This view is limited to 50 files because it contains too many changes. See raw diff.
- app.py +89 -72
- docs/app.md +0 -61
- docs/assistant/figure_annotation.md +0 -3
- docs/assistant/llm_client.md +0 -3
- docs/assistant/medqa_assistant.md +0 -3
- docs/chunking.md +0 -3
- docs/document_loader/image_loader/base_img_loader.md +0 -3
- docs/document_loader/image_loader/fitzpil_img_loader.md +0 -22
- docs/document_loader/image_loader/marker_img_loader.md +0 -21
- docs/document_loader/image_loader/pdf2image_img_loader.md +0 -26
- docs/document_loader/image_loader/pdfplumber_img_loader.md +0 -22
- docs/document_loader/image_loader/pymupdf_img_loader.md +0 -23
- docs/document_loader/text_loader/base_text_loader.md +0 -3
- docs/document_loader/text_loader/marker_text_loader.md +0 -23
- docs/document_loader/text_loader/pdfplumber_text_loader.md +0 -22
- docs/document_loader/text_loader/pymupdf4llm_text_loader.md +0 -23
- docs/document_loader/text_loader/pypdf2_text_loader.md +0 -23
- docs/index.md +0 -40
- docs/installation/development.md +0 -40
- docs/installation/install.md +0 -9
- docs/retreival/bm25s.md +0 -3
- docs/retreival/colpali.md +0 -3
- docs/retreival/contriever.md +0 -3
- docs/retreival/medcpt.md +0 -3
- docs/retreival/nv_embed_2.md +0 -3
- install.sh +0 -30
- medrag_multi_modal/assistant/figure_annotation.py +4 -13
- medrag_multi_modal/assistant/llm_client.py +19 -11
- medrag_multi_modal/assistant/medqa_assistant.py +94 -28
- medrag_multi_modal/assistant/schema.py +27 -0
- medrag_multi_modal/cli.py +54 -3
- medrag_multi_modal/document_loader/image_loader/base_img_loader.py +80 -29
- medrag_multi_modal/document_loader/image_loader/fitzpil_img_loader.py +16 -16
- medrag_multi_modal/document_loader/image_loader/marker_img_loader.py +15 -26
- medrag_multi_modal/document_loader/image_loader/pdf2image_img_loader.py +7 -16
- medrag_multi_modal/document_loader/image_loader/pdfplumber_img_loader.py +16 -16
- medrag_multi_modal/document_loader/image_loader/pymupdf_img_loader.py +16 -16
- medrag_multi_modal/document_loader/text_loader/base_text_loader.py +58 -20
- medrag_multi_modal/document_loader/text_loader/marker_text_loader.py +8 -15
- medrag_multi_modal/document_loader/text_loader/pdfplumber_text_loader.py +7 -13
- medrag_multi_modal/document_loader/text_loader/pymupdf4llm_text_loader.py +7 -15
- medrag_multi_modal/document_loader/text_loader/pypdf2_text_loader.py +7 -13
- medrag_multi_modal/metrics/__init__.py +3 -0
- medrag_multi_modal/metrics/base.py +108 -0
- medrag_multi_modal/metrics/mmlu.py +24 -0
- medrag_multi_modal/retrieval/__init__.py +1 -13
- medrag_multi_modal/retrieval/colpali_retrieval.py +1 -1
- medrag_multi_modal/retrieval/common.py +0 -23
- medrag_multi_modal/retrieval/text_retrieval/__init__.py +11 -0
- medrag_multi_modal/retrieval/{bm25s_retrieval.py → text_retrieval/bm25s_retrieval.py} +87 -61
app.py CHANGED
@@ -1,26 +1,20 @@
-import os
-import wandb
-
-wandb.login(relogin=True, key=os.getenv("WANDB_API_KEY"))
-
 import streamlit as st
-import weave

-from medrag_multi_modal.assistant import (
-    GOOGLE_MODELS,
-    MISTRAL_MODELS,
-    OPENAI_MODELS,
-)
-from medrag_multi_modal.retrieval import MedCPTRetriever
+from medrag_multi_modal.assistant import LLMClient, MedQAAssistant
+from medrag_multi_modal.retrieval.text_retrieval import (
+    BM25sRetriever,
+    ContrieverRetriever,
+    MedCPTRetriever,
+    NVEmbed2Retriever,
+)

 # Define constants
-ALL_AVAILABLE_MODELS =
+ALL_AVAILABLE_MODELS = [
+    "gemini-1.5-flash-latest",
+    "gemini-1.5-pro-latest",
+    "gpt-4o",
+    "gpt-4o-mini",
+]

 # Sidebar for configuration settings
 st.sidebar.title("Configuration Settings")
@@ -30,68 +24,91 @@ project_name = st.sidebar.text_input(
     placeholder="wandb project name",
     help="format: wandb_username/wandb_project_name",
 )
-    placeholder="wandb dataset name",
-    help="format: wandb_dataset_name:version",
-)
-    placeholder="wandb artifact address",
-    help="format: wandb_username/wandb_project_name/wandb_artifact_name:version",
-)
-    index=ALL_AVAILABLE_MODELS.index("pixtral-12b-2409"),
-    help="select a model from the list",
-)
-st.title("MedQA Assistant App")
-weave.init(project_name=project_name)
-query = st.chat_input("Enter your question here")
-if query:
-    with st.chat_message("user"):
-        st.markdown(query)
-    response = medqa_assistant.predict(query=query)
+chunk_dataset_id = st.sidebar.selectbox(
+    label="Chunk Dataset ID",
+    options=["ashwiniai/medrag-text-corpus-chunks"],
+)
+llm_model = st.sidebar.selectbox(
+    label="LLM Model",
+    options=ALL_AVAILABLE_MODELS,
+)
+top_k_chunks_for_query = st.sidebar.slider(
+    label="Top K Chunks for Query",
+    min_value=1,
+    max_value=20,
+    value=5,
+)
+top_k_chunks_for_options = st.sidebar.slider(
+    label="Top K Chunks for Options",
+    min_value=1,
+    max_value=20,
+    value=3,
+)
+rely_only_on_context = st.sidebar.checkbox(
+    label="Rely Only on Context",
+    value=False,
+)
+retriever_type = st.sidebar.selectbox(
+    label="Retriever Type",
+    options=[
+        "",
+        "BM25S",
+        "Contriever",
+        "MedCPT",
+        "NV-Embed-v2",
+    ],
+)

+if retriever_type != "":
+    llm_model = LLMClient(model_name=llm_model)
+
+    retriever = None
+
+    if retriever_type == "BM25S":
+        retriever = BM25sRetriever.from_index(
+            index_repo_id="ashwiniai/medrag-text-corpus-chunks-bm25s"
+        )
+    elif retriever_type == "Contriever":
+        retriever = ContrieverRetriever.from_index(
+            index_repo_id="ashwiniai/medrag-text-corpus-chunks-contriever",
+            chunk_dataset_id=chunk_dataset_id,
+        )
+    elif retriever_type == "MedCPT":
+        retriever = MedCPTRetriever.from_index(
+            index_repo_id="ashwiniai/medrag-text-corpus-chunks-medcpt",
+            chunk_dataset_id=chunk_dataset_id,
+        )
+    elif retriever_type == "NV-Embed-v2":
+        retriever = NVEmbed2Retriever.from_index(
+            index_repo_id="ashwiniai/medrag-text-corpus-chunks-nv-embed-2",
+            chunk_dataset_id=chunk_dataset_id,
+        )
+
+    medqa_assistant = MedQAAssistant(
+        llm_client=llm_model,
+        retriever=retriever,
+        top_k_chunks_for_query=top_k_chunks_for_query,
+        top_k_chunks_for_options=top_k_chunks_for_options,
+    )

     with st.chat_message("assistant"):
-        st.markdown(
+        st.markdown(
+            """
+            Hi! I am Medrag, your medical assistant. You can ask me any questions about the medical and the life sciences.
+            I am currently a work-in-progress, so please bear with my stupidity and overall lack of knowledge.
+
+            **Note:** that I am not a medical professional, so please do not rely on my answers for medical decisions.
+            Please consult a medical professional for any medical advice.
+
+            In order to learn more about how I am being developed, please visit [soumik12345/medrag-multi-modal](https://github.com/soumik12345/medrag-multi-modal).
+            """,
+            unsafe_allow_html=True,
+        )
+    query = st.chat_input("Enter your question here")
+    if query:
+        with st.chat_message("user"):
+            st.markdown(query)
+        response = medqa_assistant.predict(query=query)
+        with st.chat_message("assistant"):
+            st.markdown(response.response)
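The new retriever selection and assistant wiring can also be exercised outside Streamlit. A minimal sketch (not part of this commit), assuming the `from_index` constructors, repo IDs, and `MedQAAssistant` fields shown in the diff above; the query string is hypothetical:

```python
# Sketch: build the assistant without the Streamlit UI, using the same pieces as app.py.
from medrag_multi_modal.assistant import LLMClient, MedQAAssistant
from medrag_multi_modal.retrieval.text_retrieval import MedCPTRetriever

retriever = MedCPTRetriever.from_index(
    index_repo_id="ashwiniai/medrag-text-corpus-chunks-medcpt",
    chunk_dataset_id="ashwiniai/medrag-text-corpus-chunks",
)
assistant = MedQAAssistant(
    llm_client=LLMClient(model_name="gemini-1.5-flash-latest"),
    retriever=retriever,
    top_k_chunks_for_query=5,
    top_k_chunks_for_options=3,
)
# Free-response mode: no options, the assistant answers from retrieved context.
result = assistant.predict(query="Which nerve provides motor innervation to the diaphragm?")
print(result.response)
```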
docs/app.md DELETED
@@ -1,61 +0,0 @@
-# MedQA Assistant App
-
-The MedQA Assistant App is a Streamlit-based application designed to provide a chat interface for medical question answering. It leverages advanced language models (LLMs) and retrieval augmented generation (RAG) techniques to deliver accurate and informative responses to medical queries.
-
-## Features
-
-- **Interactive Chat Interface**: Engage with the app through a user-friendly chat interface.
-- **Configurable Settings**: Customize model selection and data sources via the sidebar.
-- **Retrieval-Augmented Generation**: Ensures precise and contextually relevant responses.
-- **Figure Annotation Capabilities**: Extracts and annotates figures from medical texts.
-
-## Usage
-
-1. **Install the package** using:
-   ```bash
-   uv pip install .
-   ```
-2. **Launch the App**: Start the application using Streamlit:
-   ```bash
-   medrag run
-   ```
-3. **Configure Settings**: Adjust configuration settings in the sidebar to suit your needs.
-4. **Ask a Question**: Enter your medical question in the chat input field.
-5. **Receive a Response**: Get a detailed answer from the MedQA Assistant.
-
-## Configuration
-
-The app allows users to customize various settings through the sidebar:
-
-- **Project Name**: Specify the WandB project name.
-- **Text Chunk WandB Dataset Name**: Define the dataset containing text chunks.
-- **WandB Index Artifact Address**: Provide the address of the index artifact.
-- **WandB Image Artifact Address**: Provide the address of the image artifact.
-- **LLM Client Model Name**: Choose a language model for generating responses.
-- **Figure Extraction Model Name**: Select a model for extracting figures from images.
-- **Structured Output Model Name**: Choose a model for generating structured outputs.
-
-## Technical Details
-
-The app is built using the following components:
-
-- **Streamlit**: For the user interface.
-- **Weave**: For project initialization and artifact management.
-- **MedQAAssistant**: For processing queries and generating responses.
-- **LLMClient**: For interacting with language models.
-- **MedCPTRetriever**: For retrieving relevant text chunks.
-- **FigureAnnotatorFromPageImage**: For annotating figures in medical texts.
-
-## Development and Deployment
-
-- **Environment Setup**: Ensure all dependencies are installed as per the `pyproject.toml`.
-- **Running the App**: Use Streamlit to run the app locally.
-- **Deployment**: coming soon...
-
-## Additional Resources
-
-For more detailed information on the components and their usage, refer to the following documentation sections:
-
-- [MedQA Assistant](/assistant/medqa_assistant)
-- [LLM Client](/assistant/llm_client)
-- [Figure Annotation](/assistant/figure_annotation)
docs/assistant/figure_annotation.md DELETED
@@ -1,3 +0,0 @@
-# Figure Annotation
-
-::: medrag_multi_modal.assistant.figure_annotation

docs/assistant/llm_client.md DELETED
@@ -1,3 +0,0 @@
-# LLM Client
-
-::: medrag_multi_modal.assistant.llm_client

docs/assistant/medqa_assistant.md DELETED
@@ -1,3 +0,0 @@
-# MedQA Assistant
-
-::: medrag_multi_modal.assistant.medqa_assistant

docs/chunking.md DELETED
@@ -1,3 +0,0 @@
-# Chunking
-
-::: medrag_multi_modal.semantic_chunking

docs/document_loader/image_loader/base_img_loader.md DELETED
@@ -1,3 +0,0 @@
-## Load images from PDF files
-
-::: medrag_multi_modal.document_loader.image_loader.base_img_loader
docs/document_loader/image_loader/fitzpil_img_loader.md DELETED
@@ -1,22 +0,0 @@
-# Load images from PDF files (using Fitz & PIL)
-
-??? note "Note"
-    **Underlying Library:** `fitz` & `pillow`
-
-    Extract images from PDF files using `fitz` and `pillow`.
-
-    Use it in our library with:
-    ```python
-    from medrag_multi_modal.document_loader.image_loader import FitzPILImageLoader
-    ```
-
-    For more details, please refer to the sources below.
-
-    **Sources:**
-
-    - [Docs](https://pymupdf.readthedocs.io/en/latest/intro.html)
-    - [GitHub](https://github.com/kastman/fitz)
-    - [PyPI](https://pypi.org/project/fitz/)
-    - [PyPI](https://pypi.org/project/pillow/)
-
-::: medrag_multi_modal.document_loader.image_loader.fitzpil_img_loader

docs/document_loader/image_loader/marker_img_loader.md DELETED
@@ -1,21 +0,0 @@
-# Load images from PDF files (using Marker)
-
-??? note "Note"
-    **Underlying Library:** `marker-pdf`
-
-    Extract images from PDF files using `marker-pdf`.
-
-    Use it in our library with:
-    ```python
-    from medrag_multi_modal.document_loader.image_loader import MarkerImageLoader
-    ```
-
-    For details, please refer to the sources below.
-
-    **Sources:**
-
-    - [DataLab](https://www.datalab.to)
-    - [GitHub](https://github.com/VikParuchuri/marker)
-    - [PyPI](https://pypi.org/project/marker-pdf/)
-
-::: medrag_multi_modal.document_loader.image_loader.marker_img_loader

docs/document_loader/image_loader/pdf2image_img_loader.md DELETED
@@ -1,26 +0,0 @@
-# Load images from PDF files (using PDF2Image)
-
-!!! danger "Warning"
-    Unlike other image extraction methods in `document_loader.image_loader`, this loader does not extract embedded images from the PDF.
-    Instead, it creates a snapshot image version of each selected page from the PDF.
-
-??? note "Note"
-    **Underlying Library:** `pdf2image`
-
-    Extract images from PDF files using `pdf2image`.
-
-    Use it in our library with:
-    ```python
-    from medrag_multi_modal.document_loader.image_loader import PDF2ImageLoader
-    ```
-
-    For details and available `**kwargs`, please refer to the sources below.
-
-    **Sources:**
-
-    - [DataLab](https://www.datalab.to)
-    - [GitHub](https://github.com/VikParuchuri/marker)
-    - [PyPI](https://pypi.org/project/marker-pdf/)
-
-::: medrag_multi_modal.document_loader.image_loader.pdf2image_img_loader

docs/document_loader/image_loader/pdfplumber_img_loader.md DELETED
@@ -1,22 +0,0 @@
-# Load images from PDF files (using PDFPlumber)
-
-??? note "Note"
-    **Underlying Library:** `pdfplumber`
-
-    Extract images from PDF files using `pdfplumber`.
-
-    You can interact with the underlying library and fine-tune the outputs via `**kwargs`.
-
-    Use it in our library with:
-    ```python
-    from medrag_multi_modal.document_loader.image_loader import PDFPlumberImageLoader
-    ```
-
-    For details, please refer to the sources below.
-
-    **Sources:**
-
-    - [GitHub](https://github.com/jsvine/pdfplumber)
-    - [PyPI](https://pypi.org/project/pdfplumber/)
-
-::: medrag_multi_modal.document_loader.image_loader.pdfplumber_img_loader

docs/document_loader/image_loader/pymupdf_img_loader.md DELETED
@@ -1,23 +0,0 @@
-# Load images from PDF files (using PyMuPDF)
-
-??? note "Note"
-    **Underlying Library:** `pymupdf`
-
-    PyMuPDF is a high performance Python library for data extraction, analysis, conversion & manipulation of PDF (and other) documents.
-
-    You can interact with the underlying library and fine-tune the outputs via `**kwargs`.
-
-    Use it in our library with:
-    ```python
-    from medrag_multi_modal.document_loader.image_loader import PyMuPDFImageLoader
-    ```
-
-    For details, please refer to the sources below.
-
-    **Sources:**
-
-    - [Docs](https://pymupdf.readthedocs.io/en/latest/)
-    - [GitHub](https://github.com/pymupdf/PyMuPDF)
-    - [PyPI](https://pypi.org/project/PyMuPDF/)
-
-::: medrag_multi_modal.document_loader.image_loader.pymupdf_img_loader
docs/document_loader/text_loader/base_text_loader.md DELETED
@@ -1,3 +0,0 @@
-## Load text from PDF files
-
-::: medrag_multi_modal.document_loader.text_loader.base_text_loader

docs/document_loader/text_loader/marker_text_loader.md DELETED
@@ -1,23 +0,0 @@
-## Load text from PDF files (using Marker)
-
-??? note "Note"
-    **Underlying Library:** `marker-pdf`
-
-    Convert PDF to markdown quickly and accurately using a pipeline of deep learning models.
-
-    You can interact with the underlying library and fine-tune the outputs via `**kwargs`.
-
-    Use it in our library with:
-    ```python
-    from medrag_multi_modal.document_loader.text_loader import MarkerTextLoader
-    ```
-
-    For details and available `**kwargs`, please refer to the sources below.
-
-    **Sources:**
-
-    - [DataLab](https://www.datalab.to)
-    - [GitHub](https://github.com/VikParuchuri/marker)
-    - [PyPI](https://pypi.org/project/marker-pdf/)
-
-::: medrag_multi_modal.document_loader.text_loader.marker_text_loader

docs/document_loader/text_loader/pdfplumber_text_loader.md DELETED
@@ -1,22 +0,0 @@
-## Load text from PDF files (using PDFPlumber)
-
-??? note "Note"
-    **Underlying Library:** `pdfplumber`
-
-    Plumb a PDF for detailed information about each char, rectangle, line, et cetera — and easily extract text and tables.
-
-    You can interact with the underlying library and fine-tune the outputs via `**kwargs`.
-
-    Use it in our library with:
-    ```python
-    from medrag_multi_modal.document_loader.text_loader import PDFPlumberTextLoader
-    ```
-
-    For details and available `**kwargs`, please refer to the sources below.
-
-    **Sources:**
-
-    - [GitHub](https://github.com/jsvine/pdfplumber)
-    - [PyPI](https://pypi.org/project/pdfplumber/)
-
-::: medrag_multi_modal.document_loader.text_loader.pdfplumber_text_loader

docs/document_loader/text_loader/pymupdf4llm_text_loader.md DELETED
@@ -1,23 +0,0 @@
-## Load text from PDF files (using PyMuPDF4LLM)
-
-??? note "Note"
-    **Underlying Library:** `pymupdf4llm`
-
-    PyMuPDF is a high performance Python library for data extraction, analysis, conversion & manipulation of PDF (and other) documents.
-
-    You can interact with the underlying library and fine-tune the outputs via `**kwargs`.
-
-    Use it in our library with:
-    ```python
-    from medrag_multi_modal.document_loader.text_loader import PyMuPDF4LLMTextLoader
-    ```
-
-    For details and available `**kwargs`, please refer to the sources below.
-
-    **Sources:**
-
-    - [Docs](https://pymupdf.readthedocs.io/en/latest/pymupdf4llm/)
-    - [GitHub](https://github.com/pymupdf/PyMuPDF)
-    - [PyPI](https://pypi.org/project/pymupdf4llm/)
-
-::: medrag_multi_modal.document_loader.text_loader.pymupdf4llm_text_loader

docs/document_loader/text_loader/pypdf2_text_loader.md DELETED
@@ -1,23 +0,0 @@
-## Load text from PDF files (using PyPDF2)
-
-??? note "Note"
-    **Underlying Library:** `pypdf2`
-
-    A pure-python PDF library capable of splitting, merging, cropping, and transforming the pages of PDF files.
-
-    You can interact with the underlying library and fine-tune the outputs via `**kwargs`.
-
-    Use it in our library with:
-    ```python
-    from medrag_multi_modal.document_loader.text_loader import PyPDF2TextLoader
-    ```
-
-    For details and available `**kwargs`, please refer to the sources below.
-
-    **Sources:**
-
-    - [Docs](https://pypdf2.readthedocs.io/en/3.x/)
-    - [GitHub](https://github.com/py-pdf/pypdf)
-    - [PyPI](https://pypi.org/project/PyPDF2/)
-
-::: medrag_multi_modal.document_loader.text_loader.pypdf2_text_loader
docs/index.md DELETED
@@ -1,40 +0,0 @@
-# MedRAG Multi-Modal
-
-Multi-modal RAG for the medical domain.
-
-## Installation
-
-### For Development
-
-For MacOS, you need to run
-
-```bash
-brew install poppler
-```
-
-For Debian/Ubuntu, you need to run
-
-```bash
-sudo apt-get install -y poppler-utils
-```
-
-Then, you can install the dependencies using uv in the virtual environment `.venv` using
-
-```bash
-git clone https://github.com/soumik12345/medrag-multi-modal
-cd medrag-multi-modal
-pip install -U pip uv
-uv sync
-```
-
-After this, you need to activate the virtual environment using
-
-```bash
-source .venv/bin/activate
-```
-
-In the activated virtual environment, you can optionally install Flash Attention (required for ColPali) using
-
-```bash
-uv pip install flash-attn --no-build-isolation
-```

docs/installation/development.md DELETED
@@ -1,40 +0,0 @@
-# Setting up the development environment
-
-## Install Poppler
-
-For MacOS, you need to run
-
-```bash
-brew install poppler
-```
-
-For Debian/Ubuntu, you need to run
-
-```bash
-sudo apt-get install -y poppler-utils
-```
-
-## Install the dependencies
-
-Then, you can install the dependencies using uv in the virtual environment `.venv` using
-
-```bash
-git clone https://github.com/soumik12345/medrag-multi-modal
-cd medrag-multi-modal
-pip install -U pip uv
-uv sync
-```
-
-After this, you need to activate the virtual environment using
-
-```bash
-source .venv/bin/activate
-```
-
-## [Optional] Install Flash Attention
-
-In the activated virtual environment, you can optionally install Flash Attention (required for ColPali) using
-
-```bash
-uv pip install flash-attn --no-build-isolation
-```

docs/installation/install.md DELETED
@@ -1,9 +0,0 @@
-# Installation
-
-You just need to clone the repository and run the install.sh script
-
-```bash
-git clone https://github.com/soumik12345/medrag-multi-modal
-cd medrag-multi-modal
-sh install.sh
-```
docs/retreival/bm25s.md DELETED
@@ -1,3 +0,0 @@
-# BM25-Sparse Retrieval
-
-::: medrag_multi_modal.retrieval.bm25s_retrieval

docs/retreival/colpali.md DELETED
@@ -1,3 +0,0 @@
-# ColPali Retrieval
-
-::: medrag_multi_modal.retrieval.colpali_retrieval

docs/retreival/contriever.md DELETED
@@ -1,3 +0,0 @@
-# Contriever Retrieval
-
-::: medrag_multi_modal.retrieval.contriever_retrieval

docs/retreival/medcpt.md DELETED
@@ -1,3 +0,0 @@
-# MedCPT Retrieval
-
-::: medrag_multi_modal.retrieval.medcpt_retrieval

docs/retreival/nv_embed_2.md DELETED
@@ -1,3 +0,0 @@
-# NV-Embed-v2 Retrieval
-
-::: medrag_multi_modal.retrieval.nv_embed_2
install.sh DELETED
@@ -1,30 +0,0 @@
-#!/bin/bash
-
-OS_TYPE=$(uname -s)
-
-if [ "$OS_TYPE" = "Darwin" ]; then
-    echo "Detected macOS."
-    brew install poppler
-elif [ "$OS_TYPE" = "Linux" ]; then
-    if [ -f /etc/os-release ]; then
-        . /etc/os-release
-        if [ "$ID" = "ubuntu" ] || [ "$ID" = "debian" ]; then
-            echo "Detected Ubuntu/Debian."
-            sudo apt-get update
-            sudo apt-get install -y poppler-utils
-        else
-            echo "Unsupported Linux distribution: $ID"
-            exit 1
-        fi
-    else
-        echo "Cannot detect Linux distribution."
-        exit 1
-    fi
-else
-    echo "Unsupported OS: $OS_TYPE"
-    exit 1
-fi
-
-git clone https://github.com/soumik12345/medrag-multi-modal
-cd medrag-multi-modal
-pip install -U .[core]
medrag_multi_modal/assistant/figure_annotation.py CHANGED
@@ -5,19 +5,10 @@ from typing import Optional, Union
 import cv2
 import weave
 from PIL import Image
-from pydantic import BaseModel

-
-class FigureAnnotation(BaseModel):
-    figure_id: str
-    figure_description: str
-
-
-class FigureAnnotations(BaseModel):
-    annotations: list[FigureAnnotation]
+from medrag_multi_modal.assistant.llm_client import LLMClient
+from medrag_multi_modal.assistant.schema import FigureAnnotations
+from medrag_multi_modal.utils import get_wandb_artifact, read_jsonl_file


 class FigureAnnotatorFromPageImage(weave.Model):
@@ -108,7 +99,7 @@ Here are some clues you need to follow:
     )

     @weave.op()
-    def predict(self, page_idx: int) -> dict[int, list[
+    def predict(self, page_idx: int) -> dict[int, list[FigureAnnotations]]:
         """
         Predicts figure annotations for a specific page in a document.
medrag_multi_modal/assistant/llm_client.py CHANGED
@@ -1,3 +1,4 @@
+import json
 import os
 from enum import Enum
 from typing import Any, Optional, Union
@@ -93,6 +94,7 @@ class LLMClient(weave.Model):
         schema: Optional[Any] = None,
     ) -> Union[str, Any]:
         import google.generativeai as genai
+        from google.generativeai.types import HarmBlockThreshold, HarmCategory

         system_prompt = (
             [system_prompt] if isinstance(system_prompt, str) else system_prompt
@@ -100,18 +102,25 @@ class LLMClient(weave.Model):
         user_prompt = [user_prompt] if isinstance(user_prompt, str) else user_prompt

         genai.configure(api_key=os.environ.get("GOOGLE_API_KEY"))
-        model = genai.GenerativeModel(self.model_name)
+        model = genai.GenerativeModel(self.model_name, system_instruction=system_prompt)
         generation_config = (
             None
             if schema is None
             else genai.GenerationConfig(
-                response_mime_type="application/json", response_schema=
+                response_mime_type="application/json", response_schema=schema
             )
         )
         response = model.generate_content(
+            user_prompt,
+            generation_config=generation_config,
+            # This is necessary in order to answer questions about anatomy, sexual diseases,
+            # medical devices, medicines, etc.
+            safety_settings={
+                HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
+                HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
+            },
         )
-        return response.text if schema is None else response
+        return response.text if schema is None else json.loads(response.text)

     @weave.op()
     def execute_mistral_sdk(
@@ -146,14 +155,13 @@ class LLMClient(weave.Model):
         client = Mistral(api_key=os.environ.get("MISTRAL_API_KEY"))
         client = instructor.from_mistral(client) if schema is not None else client

-            else client.messages.create(
-                response_model=schema, messages=messages, temperature=0
-            )
+        if schema is None:
+            raise NotImplementedError(
+                "Mistral does not support structured output using a schema"
+            )
+        else:
+            response = client.chat.complete(model=self.model_name, messages=messages)
+            return response.choices[0].message.content

     @weave.op()
     def execute_openai_sdk(
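With this change, the Gemini code path returns parsed JSON when a schema is supplied. A minimal sketch (not part of the commit) of the structured-output call, assuming the `predict(system_prompt=..., user_prompt=..., schema=...)` interface that `MedQAAssistant` uses elsewhere in this commit; the prompts are hypothetical:

```python
# Sketch: structured output via LLMClient. With a schema, the Gemini branch requests
# application/json and returns json.loads(response.text), i.e. a plain dict/list.
from medrag_multi_modal.assistant import LLMClient
from medrag_multi_modal.assistant.schema import MedQAMCQResponse

client = LLMClient(model_name="gemini-1.5-flash-latest")
raw = client.predict(
    system_prompt="Answer the multiple-choice question and justify your choice.",
    user_prompt=[
        "Which bone is the longest in the human body?",
        "- Femur\n- Tibia\n- Humerus",
    ],
    schema=MedQAMCQResponse,
)
print(raw)  # parsed JSON with the MedQAMCQResponse fields: answer, explanation
```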
medrag_multi_modal/assistant/medqa_assistant.py CHANGED
@@ -1,8 +1,16 @@
+from typing import Optional
+
 import weave

+from medrag_multi_modal.assistant.figure_annotation import FigureAnnotatorFromPageImage
+from medrag_multi_modal.assistant.llm_client import LLMClient
+from medrag_multi_modal.assistant.schema import (
+    MedQACitation,
+    MedQAMCQResponse,
+    MedQAResponse,
+)
+from medrag_multi_modal.retrieval.common import SimilarityMetric
+from medrag_multi_modal.retrieval.text_retrieval import BM25sRetriever


 class MedQAAssistant(weave.Model):
@@ -47,39 +55,68 @@ class MedQAAssistant(weave.Model):
         llm_client (LLMClient): The language model client used to generate responses.
         retriever (weave.Model): The model used to retrieve relevant chunks of text from a medical document.
         figure_annotator (FigureAnnotatorFromPageImage): The annotator used to extract figure descriptions from pages.
+        top_k_chunks_for_query (int): The number of top chunks to retrieve based on similarity metric for the query.
+        top_k_chunks_for_options (int): The number of top chunks to retrieve based on similarity metric for the options.
         retrieval_similarity_metric (SimilarityMetric): The metric used to measure similarity for retrieval.
     """

     llm_client: LLMClient
     retriever: weave.Model
-    figure_annotator: FigureAnnotatorFromPageImage
+    figure_annotator: Optional[FigureAnnotatorFromPageImage] = None
+    top_k_chunks_for_query: int = 2
+    top_k_chunks_for_options: int = 2
+    rely_only_on_context: bool = True
     retrieval_similarity_metric: SimilarityMetric = SimilarityMetric.COSINE

     @weave.op()
+    def retrieve_chunks_for_query(self, query: str) -> list[dict]:
+        retriever_kwargs = {"top_k": self.top_k_chunks_for_query}
+        if not isinstance(self.retriever, BM25sRetriever):
+            retriever_kwargs["metric"] = self.retrieval_similarity_metric
+        return self.retriever.predict(query, **retriever_kwargs)
+
+    @weave.op()
+    def retrieve_chunks_for_options(self, options: list[str]) -> list[dict]:
+        retriever_kwargs = {"top_k": self.top_k_chunks_for_options}
+        if not isinstance(self.retriever, BM25sRetriever):
+            retriever_kwargs["metric"] = self.retrieval_similarity_metric
+        retrieved_chunks = []
+        for option in options:
+            retrieved_chunks += self.retriever.predict(query=option, **retriever_kwargs)
+        return retrieved_chunks
+
+    @weave.op()
+    def predict(self, query: str, options: Optional[list[str]] = None) -> MedQAResponse:
         """
         Generates a response to a medical query by retrieving relevant text chunks and figure descriptions
         from a medical document and using a language model to generate the final response.

         This function performs the following steps:
-        1. Retrieves relevant text chunks from the medical document based on the query
+        1. Retrieves relevant text chunks from the medical document based on the query and any provided options
+           using the retriever model.
         2. Extracts the text and page indices from the retrieved chunks.
         3. Retrieves figure descriptions from the pages identified in the previous step using the figure annotator.
-        4. Constructs a system prompt and user prompt combining the query, retrieved text chunks,
+        4. Constructs a system prompt and user prompt combining the query, options (if provided), retrieved text chunks,
+           and figure descriptions.
+        5. Uses the language model client to generate a response based on the constructed prompts, either choosing
+           from provided options or generating a free-form response.
+        6. Returns the generated response, which includes the answer and explanation if options were provided.
+
+        The function can operate in two modes:
+        - Multiple choice: When options are provided, it selects the best answer from the options and explains the choice
+        - Free response: When no options are provided, it generates a comprehensive response based on the context

         Args:
             query (str): The medical query to be answered.
+            options (Optional[list[str]]): The list of options to choose from.
+            rely_only_on_context (bool): Whether to rely only on the context provided or not during response generation.

         Returns:
-
+            MedQAResponse: The generated response to the query, including source information.
         """
-        retrieved_chunks = self.
-        )
+        retrieved_chunks = self.retrieve_chunks_for_query(query)
+        options = options or []
+        retrieved_chunks += self.retrieve_chunks_for_options(options)

         retrieved_chunk_texts = []
         page_indices = set()
@@ -88,21 +125,50 @@ class MedQAAssistant(weave.Model):
             page_indices.add(int(chunk["page_idx"]))

         figure_descriptions = []
-        You are an expert in medical science. You are given a
+        if self.figure_annotator is not None:
+            for page_idx in page_indices:
+                figure_annotations = self.figure_annotator.predict(page_idx=page_idx)[
+                    page_idx
+                ]
+                figure_descriptions += [
+                    item["figure_description"] for item in figure_annotations
+                ]
+
+        system_prompt = """You are an expert in medical science. You are given a question
+and a list of excerpts from various medical documents.
+        """
+        query = f"""# Question
+{query}
         """
+
+        if len(options) > 0:
+            system_prompt += """\nYou are also given a list of options to choose your answer from.
+You are supposed to choose the best possible option based on the context provided. You should also
+explain your answer to justify why you chose that option.
+            """
+            query += "## Options\n"
+            for option in options:
+                query += f"- {option}\n"
+        else:
+            system_prompt += "\nYou are supposed to answer the question based on the context provided."
+
+        if self.rely_only_on_context:
+            system_prompt += """\n\nYou are only allowed to use the context provided to answer the question.
+You are not allowed to use any external knowledge to answer the question.
+            """
+
         response = self.llm_client.predict(
             system_prompt=system_prompt,
             user_prompt=[query, *retrieved_chunk_texts, *figure_descriptions],
+            schema=MedQAMCQResponse if len(options) > 0 else None,
         )
+
+        # TODO: Add figure citations
+        # TODO: Add source document name from retrieved chunks as citations
+        citations = []
+        for page_idx in page_indices:
+            citations.append(
+                MedQACitation(page_number=page_idx + 1, document_name="Gray's Anatomy")
+            )
+
+        return MedQAResponse(response=response, citations=citations)
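In multiple-choice mode the assistant now retrieves chunks for the question and for each option, asks the LLM for a `MedQAMCQResponse`, and wraps the answer in a `MedQAResponse` with page citations. A sketch of that mode (not part of the commit), assuming an `assistant` built as in `app.py` and hypothetical question/option strings:

```python
# Sketch: multiple-choice usage of MedQAAssistant.predict as introduced in this commit.
# `assistant` is assumed to be constructed as in app.py (LLMClient + a text retriever).
options = ["Phrenic nerve", "Vagus nerve", "Hypoglossal nerve"]
result = assistant.predict(
    query="Which nerve provides motor innervation to the diaphragm?",
    options=options,
)
# With options, the LLM output should validate into MedQAMCQResponse via the
# MedQAResponse.response field (Union[str, MedQAMCQResponse]).
print(result.response.answer)       # chosen option
print(result.response.explanation)  # justification for the choice
for citation in result.citations:   # MedQACitation entries built from retrieved pages
    print(citation.page_number, citation.document_name)
```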
medrag_multi_modal/assistant/schema.py ADDED
@@ -0,0 +1,27 @@
+from typing import Union
+
+from pydantic import BaseModel
+
+
+class FigureAnnotation(BaseModel):
+    figure_id: str
+    figure_description: str
+
+
+class FigureAnnotations(BaseModel):
+    annotations: list[FigureAnnotation]
+
+
+class MedQAMCQResponse(BaseModel):
+    answer: str
+    explanation: str
+
+
+class MedQACitation(BaseModel):
+    page_number: int
+    document_name: str
+
+
+class MedQAResponse(BaseModel):
+    response: Union[str, MedQAMCQResponse]
+    citations: list[MedQACitation]
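These are plain Pydantic models, so responses can be constructed and serialized directly. A small sketch (field values are hypothetical):

```python
# Sketch: instantiating and serializing the new schema models.
from medrag_multi_modal.assistant.schema import (
    MedQACitation,
    MedQAMCQResponse,
    MedQAResponse,
)

mcq = MedQAMCQResponse(answer="Femur", explanation="It is the longest bone in the body.")
response = MedQAResponse(
    response=mcq,
    citations=[MedQACitation(page_number=33, document_name="Gray's Anatomy")],
)
print(response.model_dump())  # pydantic v2; use .dict() on pydantic v1
```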
medrag_multi_modal/cli.py CHANGED
@@ -1,16 +1,67 @@
 import argparse
+import os
 import subprocess
 import sys


 def main():
     parser = argparse.ArgumentParser(description="MedRAG Multi-Modal CLI")
+    subparsers = parser.add_subparsers(dest="command", required=True)
+
+    # Run subcommand
+    run_parser = subparsers.add_parser("run", help="Run the Streamlit application")
+    run_parser.add_argument(
+        "--port", type=int, default=8501, help="Port to run Streamlit on"
+    )
+
+    # Evaluate subcommand
+    eval_parser = subparsers.add_parser("evaluate", help="Run evaluation tests")
+    eval_parser.add_argument(
+        "--test-file",
+        default=os.path.join("tests", "evals", "test_assistant_mmlu_anatomy.py"),
+        help="Path to test file",
+    )
+    eval_parser.add_argument(
+        "--test-case",
+        type=str,
+        help="Only run tests which match the given substring expression",
+    )
+    eval_parser.add_argument(
+        "--model-name",
+        type=str,
+        default="gemini-1.5-flash",
+        help="Model name to use for evaluation",
+    )
+
     args = parser.parse_args()

     if args.command == "run":
+        subprocess.run(
+            [
+                sys.executable,
+                "-m",
+                "streamlit",
+                "run",
+                "app.py",
+                "--server.port",
+                str(args.port),
+            ]
+        )
+
+    elif args.command == "evaluate":
+        test_file = (
+            args.test_file + "::" + args.test_case if args.test_case else args.test_file
+        )
+        cmd = [
+            sys.executable,
+            "-m",
+            "pytest",
+            "-s",
+            test_file,
+            "-v",
+            f"--model-name={args.model_name}",
+        ]
+        subprocess.run(cmd)


 if __name__ == "__main__":
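A sketch (not part of the commit) of driving the new `evaluate` subcommand programmatically rather than from a shell, assuming the `main()` entry point shown above; the patched argv mirrors `medrag evaluate --model-name gemini-1.5-flash`:

```python
# Sketch: invoke the CLI dispatch in-process. argparse reads sys.argv[1:],
# so patching sys.argv is equivalent to the shell invocation.
import sys
from medrag_multi_modal.cli import main

sys.argv = ["medrag", "evaluate", "--model-name", "gemini-1.5-flash"]
main()  # builds the pytest command (tests/evals/test_assistant_mmlu_anatomy.py) and runs it
```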
medrag_multi_modal/document_loader/image_loader/base_img_loader.py CHANGED
@@ -1,11 +1,21 @@
 import asyncio
 import os
 from abc import abstractmethod
+from glob import glob
 from typing import Dict, List, Optional

+import huggingface_hub
 import jsonlines
 import rich
+from datasets import (
+    Dataset,
+    Features,
+    Image,
+    Sequence,
+    Value,
+    concatenate_datasets,
+    load_dataset,
+)

 from medrag_multi_modal.document_loader.text_loader.base_text_loader import (
     BaseTextLoader,
@@ -36,14 +46,72 @@ class BaseImageLoader(BaseTextLoader):
         """
         pass

+    def save_as_dataset(
+        self,
+        start_page: int,
+        end_page: int,
+        image_save_dir: str,
+        dataset_repo_id: Optional[str] = None,
+        overwrite_dataset: bool = False,
+    ):
+        features = Features(
+            {
+                "page_image": Image(decode=True),
+                "page_figure_images": Sequence(Image(decode=True)),
+                "document_name": Value(dtype="string"),
+                "page_idx": Value(dtype="int32"),
+            }
+        )
+
+        all_examples = []
+        for page_idx in range(start_page, end_page):
+            page_image_file_paths = glob(
+                os.path.join(image_save_dir, f"page{page_idx}*.png")
+            )
+            if len(page_image_file_paths) > 0:
+                page_image_path = page_image_file_paths[0]
+                figure_image_paths = [
+                    image_file_path
+                    for image_file_path in glob(
+                        os.path.join(image_save_dir, f"page{page_idx}*_fig*.png")
+                    )
+                ]
+
+                example = {
+                    "page_image": page_image_path,
+                    "page_figure_images": figure_image_paths,
+                    "document_name": self.document_name,
+                    "page_idx": page_idx,
+                }
+                all_examples.append(example)
+
+        dataset = Dataset.from_list(all_examples, features=features)
+
+        if dataset_repo_id:
+            if huggingface_hub.repo_exists(dataset_repo_id, repo_type="dataset"):
+                if not overwrite_dataset:
+                    dataset = concatenate_datasets(
+                        [dataset, load_dataset(dataset_repo_id)["corpus"]]
+                    )
+
+            dataset.push_to_hub(dataset_repo_id, split="corpus")
+
+        return dataset
+
+    def cleanup_image_dir(self, image_save_dir: str = "./images"):
+        for file in os.listdir(image_save_dir):
+            file_path = os.path.join(image_save_dir, file)
+            if os.path.isfile(file_path):
+                os.remove(file_path)
+
     async def load_data(
         self,
         start_page: Optional[int] = None,
         end_page: Optional[int] = None,
+        dataset_repo_id: Optional[str] = None,
+        overwrite_dataset: bool = False,
         image_save_dir: str = "./images",
         exclude_file_extensions: list[str] = [],
-        cleanup: bool = False,
         **kwargs,
     ) -> List[Dict[str, str]]:
         """
@@ -65,21 +133,15 @@ class BaseImageLoader(BaseTextLoader):
         Args:
             start_page (Optional[int]): The starting page index (0-based) to process.
             end_page (Optional[int]): The ending page index (0-based) to process.
+            dataset_repo_id (Optional[str]): The repository ID of the HuggingFace dataset to publish the pages to, if provided.
+            overwrite_dataset (bool): Whether to overwrite the existing dataset if it exists. Defaults to False.
             image_save_dir (str): The directory to save the extracted images.
             exclude_file_extensions (list[str]): A list of file extensions to exclude from the image_save_dir.
-            cleanup (bool): Whether to remove extracted images from `image_save_dir`, if uploading to wandb artifact.
             **kwargs: Additional keyword arguments that will be passed to extract_page_data method and the underlying library.

         Returns:
-            - "page_idx": (int) the index of the page.
-            - "document_name": (str) the name of the document.
-            - "file_path": (str) the local file path where the PDF is stored.
-            - "file_url": (str) the URL of the PDF file.
-            - "image_file_path" or "image_file_paths": (str) the local file path where the image/images are stored.
+            Dataset: A HuggingFace dataset containing the processed pages.
+
         Raises:
             ValueError: If the specified start_page or end_page is out of bounds of the document's page count.
         """
@@ -111,19 +173,8 @@ class BaseImageLoader(BaseTextLoader):
             if file.endswith(tuple(exclude_file_extensions)):
                 os.remove(os.path.join(image_save_dir, file))

-        )
-        artifact.add_dir(local_path=image_save_dir)
-        artifact.save()
-        rich.print("Artifact saved and uploaded to wandb!")
-
-        if cleanup:
-            for file in os.listdir(image_save_dir):
-                file_path = os.path.join(image_save_dir, file)
-                if os.path.isfile(file_path):
-                    os.remove(file_path)
-        return pages
+        dataset = self.save_as_dataset(
+            start_page, end_page, image_save_dir, dataset_repo_id, overwrite_dataset
+        )
+
+        return dataset
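The image loaders now publish extracted pages to a HuggingFace dataset instead of a W&B artifact. A sketch (not part of the commit) of reading such a dataset back; the repo id is a hypothetical placeholder, while the `corpus` split matches the `push_to_hub` call above:

```python
# Sketch: load a page-image dataset pushed by save_as_dataset().
from datasets import load_dataset

pages = load_dataset("your-username/grays-anatomy-images", split="corpus")
example = pages[0]
print(example["document_name"], example["page_idx"])
example["page_image"].show()  # decoded PIL image of the page snapshot
```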
medrag_multi_modal/document_loader/image_loader/fitzpil_img_loader.py CHANGED
@@ -3,9 +3,12 @@ import os
 from typing import Any, Dict

 import fitz
+from pdf2image.pdf2image import convert_from_path
 from PIL import Image, ImageOps, UnidentifiedImageError

-from .base_img_loader import BaseImageLoader
+from medrag_multi_modal.document_loader.image_loader.base_img_loader import (
+    BaseImageLoader,
+)


 class FitzPILImageLoader(BaseImageLoader):
@@ -20,27 +23,16 @@ class FitzPILImageLoader(BaseImageLoader):
     ```python
     import asyncio

-    import weave
-
-    import wandb
     from medrag_multi_modal.document_loader.image_loader import FitzPILImageLoader

-    url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
+    URL = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
+
     loader = FitzPILImageLoader(
-        url=url,
+        url=URL,
         document_name="Gray's Anatomy",
         document_file_path="grays_anatomy.pdf",
     )
-    asyncio.run(
-        loader.load_data(
-            start_page=32,
-            end_page=37,
-            wandb_artifact_name="grays-anatomy-images-fitzpil",
-            cleanup=False,
-        )
-    )
+    dataset = asyncio.run(loader.load_data(start_page=32, end_page=37))
     ```

     Args:
@@ -118,6 +110,14 @@ class FitzPILImageLoader(BaseImageLoader):

         pdf_document.close()

+        page_image = convert_from_path(
+            self.document_file_path,
+            first_page=page_idx + 1,
+            last_page=page_idx + 1,
+            **kwargs,
+        )[0]
+        page_image.save(os.path.join(image_save_dir, f"page{page_idx}.png"))
+
         return {
             "page_idx": page_idx,
             "document_name": self.document_name,
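The new page-snapshot step relies on `pdf2image`. A standalone sketch of that call (not part of the commit), assuming a locally downloaded `grays_anatomy.pdf` and an existing `images/` directory, both hypothetical paths:

```python
# Sketch of the pdf2image call added above: render a single PDF page to a PIL
# image and save it as pageN.png.
from pdf2image.pdf2image import convert_from_path

page_idx = 32  # 0-based page index used by the loader
page_image = convert_from_path(
    "grays_anatomy.pdf",
    first_page=page_idx + 1,  # pdf2image pages are 1-based
    last_page=page_idx + 1,
)[0]
page_image.save(f"images/page{page_idx}.png")
```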
medrag_multi_modal/document_loader/image_loader/marker_img_loader.py CHANGED
@@ -5,7 +5,9 @@ from marker.convert import convert_single_pdf
 from marker.models import load_all_models
 from pdf2image.pdf2image import convert_from_path

-from .base_img_loader import BaseImageLoader
+from medrag_multi_modal.document_loader.image_loader.base_img_loader import (
+    BaseImageLoader,
+)

 os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"

@@ -22,27 +24,16 @@ class MarkerImageLoader(BaseImageLoader):
     ```python
     import asyncio

-    import weave
-
-    import wandb
     from medrag_multi_modal.document_loader.image_loader import MarkerImageLoader

-    url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
+    URL = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
+
     loader = MarkerImageLoader(
-        url=url,
+        url=URL,
         document_name="Gray's Anatomy",
         document_file_path="grays_anatomy.pdf",
     )
-    asyncio.run(
-        loader.load_data(
-            start_page=31,
-            end_page=36,
-            wandb_artifact_name="grays-anatomy-images-marker",
-            cleanup=False,
-        )
-    )
+    dataset = asyncio.run(loader.load_data(start_page=32, end_page=37))
     ```

     Args:
@@ -84,7 +75,7 @@ class MarkerImageLoader(BaseImageLoader):
             - "file_url": (str) the URL of the PDF file.
             - "image_file_path": (str) the local file path where the image is stored.
         """
-        _, images, out_meta = convert_single_pdf(
+        _, images, _ = convert_single_pdf(
             self.document_file_path,
             self.model_lst,
             max_pages=1,
@@ -101,14 +92,13 @@ class MarkerImageLoader(BaseImageLoader):
             image.save(image_file_path, "png")
             image_file_paths.append(image_file_path)

-        page_image.save(os.path.join(image_save_dir, f"page{page_idx}.png"))
+        page_image = convert_from_path(
+            self.document_file_path,
+            first_page=page_idx + 1,
+            last_page=page_idx + 1,
+            **kwargs,
+        )[0]
+        page_image.save(os.path.join(image_save_dir, f"page{page_idx}.png"))

         return {
             "page_idx": page_idx,
@@ -116,7 +106,6 @@ class MarkerImageLoader(BaseImageLoader):
             "file_path": self.document_file_path,
             "file_url": self.url,
             "image_file_paths": os.path.join(image_save_dir, "*.png"),
-            "meta": out_meta,
         }

     def load_data(
medrag_multi_modal/document_loader/image_loader/pdf2image_img_loader.py
CHANGED
@@ -3,7 +3,9 @@ from typing import Any, Dict

 from pdf2image.pdf2image import convert_from_path

-from .base_img_loader import BaseImageLoader
+from medrag_multi_modal.document_loader.image_loader.base_img_loader import (
+    BaseImageLoader,
+)


 class PDF2ImageLoader(BaseImageLoader):

@@ -19,27 +21,16 @@ class PDF2ImageLoader(BaseImageLoader):
 ```python
 import asyncio

-import weave
-
-import wandb
 from medrag_multi_modal.document_loader.image_loader import PDF2ImageLoader

-…
-…
-url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
+URL = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
+
 loader = PDF2ImageLoader(
-    url=url,
+    url=URL,
     document_name="Gray's Anatomy",
     document_file_path="grays_anatomy.pdf",
 )
-asyncio.run(
-    loader.load_data(
-        start_page=31,
-        end_page=36,
-        wandb_artifact_name="grays-anatomy-images-pdf2image",
-        cleanup=False,
-    )
-)
+dataset = asyncio.run(loader.load_data(start_page=32, end_page=37))
 ```

 Args:
medrag_multi_modal/document_loader/image_loader/pdfplumber_img_loader.py
CHANGED
@@ -2,8 +2,11 @@ import os
 from typing import Any, Dict

 import pdfplumber
+from pdf2image.pdf2image import convert_from_path

-from .base_img_loader import BaseImageLoader
+from medrag_multi_modal.document_loader.image_loader.base_img_loader import (
+    BaseImageLoader,
+)


 class PDFPlumberImageLoader(BaseImageLoader):

@@ -18,27 +21,16 @@ class PDFPlumberImageLoader(BaseImageLoader):
 ```python
 import asyncio

-import weave
-
-import wandb
 from medrag_multi_modal.document_loader.image_loader import PDFPlumberImageLoader

-…
-…
-url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
+URL = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
+
 loader = PDFPlumberImageLoader(
-    url=url,
+    url=URL,
     document_name="Gray's Anatomy",
     document_file_path="grays_anatomy.pdf",
 )
-asyncio.run(
-    loader.load_data(
-        start_page=32,
-        end_page=37,
-        wandb_artifact_name="grays-anatomy-images-pdfplumber",
-        cleanup=False,
-    )
-)
+dataset = asyncio.run(loader.load_data(start_page=32, end_page=37))
 ```

 Args:

@@ -92,6 +84,14 @@ class PDFPlumberImageLoader(BaseImageLoader):
         extracted_image.save(image_file_path, "png")
         image_file_paths.append(image_file_path)

+        page_image = convert_from_path(
+            self.document_file_path,
+            first_page=page_idx + 1,
+            last_page=page_idx + 1,
+            **kwargs,
+        )[0]
+        page_image.save(os.path.join(image_save_dir, f"page{page_idx}.png"))
+
         return {
             "page_idx": page_idx,
             "document_name": self.document_name,
medrag_multi_modal/document_loader/image_loader/pymupdf_img_loader.py
CHANGED
@@ -3,9 +3,12 @@ import os
 from typing import Any, Dict

 import fitz
+from pdf2image.pdf2image import convert_from_path
 from PIL import Image

-from .base_img_loader import BaseImageLoader
+from medrag_multi_modal.document_loader.image_loader.base_img_loader import (
+    BaseImageLoader,
+)


 class PyMuPDFImageLoader(BaseImageLoader):

@@ -20,27 +23,16 @@ class PyMuPDFImageLoader(BaseImageLoader):
 ```python
 import asyncio

-import weave
-
-import wandb
 from medrag_multi_modal.document_loader.image_loader import PyMuPDFImageLoader

-…
-…
-url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
+URL = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
+
 loader = PyMuPDFImageLoader(
-    url=url,
+    url=URL,
     document_name="Gray's Anatomy",
     document_file_path="grays_anatomy.pdf",
 )
-asyncio.run(
-    loader.load_data(
-        start_page=32,
-        end_page=37,
-        wandb_artifact_name="grays-anatomy-images-pymupdf",
-        cleanup=False,
-    )
-)
+dataset = asyncio.run(loader.load_data(start_page=32, end_page=37))
 ```

 Args:

@@ -115,6 +107,14 @@ class PyMuPDFImageLoader(BaseImageLoader):
         pdf_document.close()

+        page_image = convert_from_path(
+            self.document_file_path,
+            first_page=page_idx + 1,
+            last_page=page_idx + 1,
+            **kwargs,
+        )[0]
+        page_image.save(os.path.join(image_save_dir, f"page{page_idx}.png"))
+
         return {
             "page_idx": page_idx,
             "document_name": self.document_name,
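All five image loaders (FitzPIL, Marker, PDF2Image, PDFPlumber, PyMuPDF) now share the same page-rasterization step: besides the figures extracted from a page, the full page is rendered to a PNG with `pdf2image`. The snippet below is a minimal sketch of that step in isolation; it assumes Poppler is installed and that `grays_anatomy.pdf` is already present locally.

```python
# Minimal sketch of the page-rasterization step now shared by the image loaders.
# Assumes Poppler is installed and "grays_anatomy.pdf" exists locally.
import os

from pdf2image.pdf2image import convert_from_path

page_idx = 32
image_save_dir = "images"
os.makedirs(image_save_dir, exist_ok=True)

# convert_from_path uses 1-based page numbers, hence the +1 offset on the 0-based index.
page_image = convert_from_path(
    "grays_anatomy.pdf",
    first_page=page_idx + 1,
    last_page=page_idx + 1,
)[0]
page_image.save(os.path.join(image_save_dir, f"page{page_idx}.png"))
```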
medrag_multi_modal/document_loader/text_loader/base_text_loader.py
CHANGED
@@ -1,12 +1,13 @@
 import asyncio
 import os
 from abc import ABC, abstractmethod
-from typing import …
+from typing import Any, Dict, Optional

+import huggingface_hub
 import PyPDF2
-import …
-import weave
+from datasets import Dataset, concatenate_datasets, load_dataset
 from firerequests import FireRequests
+from rich.progress import Progress


 class BaseTextLoader(ABC):

@@ -22,14 +23,22 @@ class BaseTextLoader(ABC):
         url (str): The URL of the PDF file to download if not present locally.
         document_name (str): The name of the document for metadata purposes.
         document_file_path (str): The local file path where the PDF is stored or will be downloaded.
+        metadata (Optional[dict[str, any]]): Additional metadata to be added to each row of the dataset.
     """

-    def __init__(…):
+    def __init__(
+        self,
+        url: str,
+        document_name: str,
+        document_file_path: str,
+        metadata: Optional[dict[str, Any]] = None,
+    ):
         self.url = url
         self.document_name = document_name
         self.document_file_path = document_file_path
+        self.metadata = metadata or {}
         if not os.path.exists(self.document_file_path):
-            FireRequests().download(url, …)
+            FireRequests().download(url, filenames=self.document_file_path)
         with open(self.document_file_path, "rb") as file:
             pdf_reader = PyPDF2.PdfReader(file)
             self.page_count = len(pdf_reader.pages)

@@ -85,9 +94,11 @@ class BaseTextLoader(ABC):
         self,
         start_page: Optional[int] = None,
         end_page: Optional[int] = None,
-        weave_dataset_name: Optional[str] = None,
+        exclude_pages: Optional[list[int]] = None,
+        dataset_repo_id: Optional[str] = None,
+        overwrite_dataset: bool = False,
         **kwargs,
-    ) -> …:
+    ) -> Dataset:
         """
         Asynchronously loads text from a PDF file specified by a URL or local file path.
         The overrided processing abstract method then processes the text into markdown format,

@@ -102,23 +113,26 @@ class BaseTextLoader(ABC):
         each page, extract the text from the PDF, and convert it to markdown.
         It processes pages concurrently using `asyncio` for efficiency.

-        If a …
+        If a `dataset_repo_id` is provided, the processed pages are published to a HuggingFace dataset.

         Args:
             start_page (Optional[int]): The starting page index (0-based) to process. Defaults to the first page.
             end_page (Optional[int]): The ending page index (0-based) to process. Defaults to the last page.
-            …
+            exclude_pages (Optional[list[int]]): The list of page indices to exclude from processing.
+            dataset_repo_id (Optional[str]): The repository ID of the HuggingFace dataset to publish the pages to, if provided.
+            overwrite_dataset (bool): Whether to overwrite the existing dataset if it exists. Defaults to False.
             **kwargs: Additional keyword arguments that will be passed to extract_page_data method and the underlying library.

         Returns:
-            …
-            Each …
+            Dataset: A HuggingFace Dataset object containing the text and metadata for processed pages.
+            Each entry in the dataset will have the following keys and values:

             - "text": (str) the processed page data in markdown format.
             - "page_idx": (int) the index of the page.
             - "document_name": (str) the name of the document.
             - "file_path": (str) the local file path where the PDF is stored.
             - "file_url": (str) the URL of the PDF file.
+            - "loader_name": (str) the name of the loader class used to process the page.

         Raises:
             ValueError: If the specified start_page or end_page is out of bounds of the document's page count.

@@ -127,21 +141,45 @@ class BaseTextLoader(ABC):
         pages = []
         processed_pages_counter: int = 1
         total_pages = end_page - start_page
+        exclude_pages = exclude_pages or []

         async def process_page(page_idx):
             nonlocal processed_pages_counter
             page_data = await self.extract_page_data(page_idx, **kwargs)
             page_data["loader_name"] = self.__class__.__name__
+            for key, value in self.metadata.items():
+                if key not in page_data:
+                    page_data[key] = value
             pages.append(page_data)
-            … (previous rich-print progress logging)
+            progress.update(
+                task_id,
+                advance=1,
+                description=f"Loading page {page_idx} using {self.__class__.__name__}",
             )
             processed_pages_counter += 1

-        … (previous task scheduling and Weave dataset publishing)
+        progress = Progress()
+        with progress:
+            task_id = progress.add_task("Starting...", total=total_pages)
+            tasks = [
+                process_page(page_idx)
+                for page_idx in range(start_page, end_page + 1)
+                if page_idx not in exclude_pages
+            ]
+            for task in asyncio.as_completed(tasks):
+                await task
+
+        pages.sort(key=lambda x: x["page_idx"])
+
+        dataset = Dataset.from_list(pages)
+        if dataset_repo_id:
+            if huggingface_hub.repo_exists(dataset_repo_id, repo_type="dataset"):
+                print("Dataset already exists")
+                if not overwrite_dataset:
+                    print("Not overwriting dataset")
+                    dataset = concatenate_datasets(
+                        [dataset, load_dataset(dataset_repo_id, split="corpus")]
+                    )
+            dataset.push_to_hub(repo_id=dataset_repo_id, split="corpus", private=False)
+
+        return dataset
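With this change, `load_data` returns a HuggingFace `Dataset` instead of a plain list and can optionally publish it to the Hub. A minimal sketch of the new call, using one of the concrete loaders; the dataset repository ID is a hypothetical placeholder and a Hub token is assumed to be configured in the environment.

```python
import asyncio

from medrag_multi_modal.document_loader import PyPDF2TextLoader

URL = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"

loader = PyPDF2TextLoader(
    url=URL,
    document_name="Gray's Anatomy",
    document_file_path="grays_anatomy.pdf",
    metadata={"source": "archive.org"},  # merged into every row of the dataset
)
dataset = asyncio.run(
    loader.load_data(
        start_page=31,
        end_page=36,
        exclude_pages=[33],                           # skip individual pages
        dataset_repo_id="<user>/grays-anatomy-text",  # hypothetical repo id
        overwrite_dataset=True,
    )
)
print(dataset)
```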
medrag_multi_modal/document_loader/text_loader/marker_text_loader.py
CHANGED
@@ -4,7 +4,9 @@ from typing import Dict
 from marker.convert import convert_single_pdf
 from marker.models import load_all_models

-from .base_text_loader import BaseTextLoader
+from medrag_multi_modal.document_loader.text_loader.base_text_loader import (
+    BaseTextLoader,
+)

 os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"

@@ -26,24 +28,16 @@ class MarkerTextLoader(BaseTextLoader):
 ```python
 import asyncio

-import weave
+from medrag_multi_modal.document_loader import MarkerTextLoader

-…
+URL = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"

-weave.init(project_name="ml-colabs/medrag-multi-modal")
-url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
 loader = MarkerTextLoader(
-    url=url,
+    url=URL,
     document_name="Gray's Anatomy",
     document_file_path="grays_anatomy.pdf",
 )
-asyncio.run(
-    loader.load_data(
-        start_page=31,
-        end_page=36,
-        weave_dataset_name="grays-anatomy-text",
-    )
-)
+dataset = asyncio.run(loader.load_data(start_page=31, end_page=36))
 ```

 Args:

@@ -76,7 +70,7 @@ class MarkerTextLoader(BaseTextLoader):
         """
         model_lst = load_all_models()

-        text, _, out_meta = convert_single_pdf(
+        text, _, _ = convert_single_pdf(
             self.document_file_path,
             model_lst,
             max_pages=1,

@@ -92,5 +86,4 @@ class MarkerTextLoader(BaseTextLoader):
             "document_name": self.document_name,
             "file_path": self.document_file_path,
             "file_url": self.url,
-            "meta": out_meta,
         }
medrag_multi_modal/document_loader/text_loader/pdfplumber_text_loader.py
CHANGED
@@ -2,7 +2,9 @@ from typing import Dict

 import pdfplumber

-from .base_text_loader import BaseTextLoader
+from medrag_multi_modal.document_loader.text_loader.base_text_loader import (
+    BaseTextLoader,
+)


 class PDFPlumberTextLoader(BaseTextLoader):

@@ -22,24 +24,16 @@ class PDFPlumberTextLoader(BaseTextLoader):
 ```python
 import asyncio

-import weave
+from medrag_multi_modal.document_loader import PDFPlumberTextLoader

-…
+URL = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"

-weave.init(project_name="ml-colabs/medrag-multi-modal")
-url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
 loader = PDFPlumberTextLoader(
-    url=url,
+    url=URL,
     document_name="Gray's Anatomy",
     document_file_path="grays_anatomy.pdf",
 )
-asyncio.run(
-    loader.load_data(
-        start_page=31,
-        end_page=36,
-        weave_dataset_name="grays-anatomy-text",
-    )
-)
+dataset = asyncio.run(loader.load_data(start_page=31, end_page=36))
 ```

 Args:
medrag_multi_modal/document_loader/text_loader/pymupdf4llm_text_loader.py
CHANGED
@@ -2,7 +2,9 @@ from typing import Dict

 import pymupdf4llm

-from .base_text_loader import BaseTextLoader
+from medrag_multi_modal.document_loader.text_loader.base_text_loader import (
+    BaseTextLoader,
+)


 class PyMuPDF4LLMTextLoader(BaseTextLoader):

@@ -20,26 +22,16 @@ class PyMuPDF4LLMTextLoader(BaseTextLoader):
 ```python
 import asyncio

-import weave
+from medrag_multi_modal.document_loader import PyMuPDF4LLMTextLoader

-from … import (
-    PyMuPDF4LLMTextLoader
-)
+URL = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"

-weave.init(project_name="ml-colabs/medrag-multi-modal")
-url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
 loader = PyMuPDF4LLMTextLoader(
-    url=url,
+    url=URL,
     document_name="Gray's Anatomy",
     document_file_path="grays_anatomy.pdf",
 )
-asyncio.run(
-    loader.load_data(
-        start_page=31,
-        end_page=36,
-        weave_dataset_name="grays-anatomy-text",
-    )
-)
+dataset = asyncio.run(loader.load_data(start_page=31, end_page=36))
 ```

 Args:
medrag_multi_modal/document_loader/text_loader/pypdf2_text_loader.py
CHANGED
@@ -2,7 +2,9 @@ from typing import Dict

 import PyPDF2

-from .base_text_loader import BaseTextLoader
+from medrag_multi_modal.document_loader.text_loader.base_text_loader import (
+    BaseTextLoader,
+)


 class PyPDF2TextLoader(BaseTextLoader):

@@ -22,24 +24,16 @@ class PyPDF2TextLoader(BaseTextLoader):
 ```python
 import asyncio

-import weave
+from medrag_multi_modal.document_loader import PyPDF2TextLoader

-…
+URL = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"

-weave.init(project_name="ml-colabs/medrag-multi-modal")
-url = "https://archive.org/download/GraysAnatomy41E2015PDF/Grays%20Anatomy-41%20E%20%282015%29%20%5BPDF%5D.pdf"
 loader = PyPDF2TextLoader(
-    url=url,
+    url=URL,
     document_name="Gray's Anatomy",
     document_file_path="grays_anatomy.pdf",
 )
-asyncio.run(
-    loader.load_data(
-        start_page=31,
-        end_page=36,
-        weave_dataset_name="grays-anatomy-text",
-    )
-)
+dataset = asyncio.run(loader.load_data(start_page=31, end_page=36))
 ```

 Args:
medrag_multi_modal/metrics/__init__.py
ADDED
@@ -0,0 +1,3 @@
+from .mmlu import MMLUOptionAccuracy
+
+__all__ = ["MMLUOptionAccuracy"]
medrag_multi_modal/metrics/base.py
ADDED
@@ -0,0 +1,108 @@
+from typing import Optional
+
+import numpy as np
+import weave
+
+
+class BaseAccuracyMetric(weave.Scorer):
+    """
+    BaseAccuracyMetric is a class that extends the
+    [`weave.Scorer`](https://weave-docs.wandb.ai/guides/evaluation/scorers#class-based-scorers)
+    to provide a comprehensive evaluation of accuracy metrics for a given set of score rows.
+
+    This class is designed to process a list of score rows, each containing a
+    'correct' key that indicates whether a particular prediction was correct.
+    The `summarize` method calculates various statistical measures and metrics
+    based on this data, including:
+
+    - True and false counts: The number of true and false predictions.
+    - True and false fractions: The proportion of true and false predictions.
+    - Standard error: The standard error of the mean for the true predictions.
+    - Precision: The ratio of true positive predictions to the total number of
+      positive predictions.
+    - Recall: The ratio of true positive predictions to the total number of
+      actual positives.
+    - F1 Score: The harmonic mean of precision and recall, providing a balance
+      between the two metrics.
+
+    The `summarize` method returns a dictionary containing these metrics,
+    allowing for a detailed analysis of the model's performance.
+
+    Methods:
+        summarize(score_rows: list) -> Optional[dict]:
+            Processes the input score rows to compute and return a dictionary
+            of accuracy metrics.
+    """
+    @weave.op()
+    def summarize(self, score_rows: list) -> Optional[dict]:
+        """
+        Summarizes the accuracy metrics from a list of score rows.
+
+        This method processes a list of score rows, each containing a 'correct' key
+        that indicates whether a particular prediction was correct. It calculates
+        various statistical measures and metrics based on this data, including:
+
+        - True and false counts: The number of true and false predictions.
+        - True and false fractions: The proportion of true and false predictions.
+        - Standard error: The standard error of the mean for the true predictions.
+        - Precision: The ratio of true positive predictions to the total number of
+          positive predictions.
+        - Recall: The ratio of true positive predictions to the total number of
+          actual positives.
+        - F1 Score: The harmonic mean of precision and recall, providing a balance
+          between the two metrics.
+
+        The method returns a dictionary containing these metrics, allowing for a
+        detailed analysis of the model's performance.
+
+        Args:
+            score_rows (list): A list of dictionaries, each containing a 'correct'
+                key with a boolean value indicating the correctness of a prediction.
+
+        Returns:
+            Optional[dict]: A dictionary containing the calculated accuracy metrics,
+                or None if the input list is empty.
+        """
+        valid_data = [
+            x.get("correct") for x in score_rows if x.get("correct") is not None
+        ]
+        count_true = list(valid_data).count(True)
+        int_data = [int(x) for x in valid_data]
+
+        sample_mean = np.mean(int_data) if int_data else 0
+        sample_variance = np.var(int_data) if int_data else 0
+        sample_error = np.sqrt(sample_variance / len(int_data)) if int_data else 0
+
+        # Calculate precision, recall, and F1 score
+        true_positives = count_true
+        false_positives = len(valid_data) - count_true
+        false_negatives = len(score_rows) - len(valid_data)
+
+        precision = (
+            true_positives / (true_positives + false_positives)
+            if (true_positives + false_positives) > 0
+            else 0
+        )
+        recall = (
+            true_positives / (true_positives + false_negatives)
+            if (true_positives + false_negatives) > 0
+            else 0
+        )
+        f1_score = (
+            (2 * precision * recall) / (precision + recall)
+            if (precision + recall) > 0
+            else 0
+        )
+
+        return {
+            "correct": {
+                "true_count": count_true,
+                "false_count": len(score_rows) - count_true,
+                "true_fraction": float(sample_mean),
+                "false_fraction": 1.0 - float(sample_mean),
+                "stderr": float(sample_error),
+                "precision": precision,
+                "recall": recall,
+                "f1_score": f1_score,
+            }
+        }
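The summary statistics follow the usual definitions, precision = TP / (TP + FP), recall = TP / (TP + FN), and F1 = 2PR / (P + R), where rows that lack a `correct` key are counted as false negatives. A small sketch of how `summarize` aggregates a handful of score rows outside of an evaluation run:

```python
from medrag_multi_modal.metrics.base import BaseAccuracyMetric

score_rows = [
    {"correct": True},
    {"correct": True},
    {"correct": False},
    {},  # a row without a "correct" key is counted as a false negative
]

summary = BaseAccuracyMetric().summarize(score_rows)
print(summary["correct"]["true_fraction"])  # fraction of valid rows marked correct
print(summary["correct"]["precision"], summary["correct"]["recall"], summary["correct"]["f1_score"])
```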
medrag_multi_modal/metrics/mmlu.py
ADDED
@@ -0,0 +1,24 @@
+import weave
+
+from medrag_multi_modal.assistant.schema import MedQAResponse
+from medrag_multi_modal.metrics.base import BaseAccuracyMetric
+
+
+class MMLUOptionAccuracy(BaseAccuracyMetric):
+    """
+    MMLUOptionAccuracy is a metric class that inherits from `BaseAccuracyMetric`.
+
+    This class is designed to evaluate the accuracy of a multiple-choice question
+    response by comparing the provided answer with the correct answer from the
+    given options. It uses the MedQAResponse schema to extract the response
+    and checks if it matches the correct answer.
+
+    Methods:
+    --------
+    score(output: MedQAResponse, options: list[str], answer: str) -> dict:
+        Compares the provided answer with the correct answer and returns a
+        dictionary indicating whether the answer is correct.
+    """
+    @weave.op()
+    def score(self, output: MedQAResponse, options: list[str], answer: str):
+        return {"correct": options[answer] == output.response.answer}
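`MMLUOptionAccuracy` is meant to plug into a `weave.Evaluation` as a scorer: weave passes the model output together with the `options` and `answer` columns of each dataset row to `score`, and `answer` is used as an index into `options`. Below is a hypothetical wiring sketch; the dataset row and the model that returns a `MedQAResponse` are assumptions for illustration, not part of this commit.

```python
import weave

from medrag_multi_modal.metrics import MMLUOptionAccuracy

# Hypothetical evaluation dataset; "options" and "answer" match the scorer's
# signature, with "answer" acting as the index of the correct option.
dataset = [
    {
        "query": "Which organelle is the primary site of protein synthesis?",
        "options": ["Ribosome", "Golgi apparatus", "Lysosome", "Nucleolus"],
        "answer": 0,
    }
]

evaluation = weave.Evaluation(dataset=dataset, scorers=[MMLUOptionAccuracy()])
# evaluation.evaluate(...) would then be awaited with a model/op that returns a MedQAResponse.
```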
medrag_multi_modal/retrieval/__init__.py
CHANGED
@@ -1,15 +1,3 @@
-from .bm25s_retrieval import BM25sRetriever
 from .colpali_retrieval import CalPaliRetriever
-from .common import SimilarityMetric
-from .contriever_retrieval import ContrieverRetriever
-from .medcpt_retrieval import MedCPTRetriever
-from .nv_embed_2 import NVEmbed2Retriever

-__all__ = [
-    "CalPaliRetriever",
-    "BM25sRetriever",
-    "ContrieverRetriever",
-    "SimilarityMetric",
-    "MedCPTRetriever",
-    "NVEmbed2Retriever",
-]
+__all__ = ["CalPaliRetriever"]
medrag_multi_modal/retrieval/colpali_retrieval.py
CHANGED
@@ -9,7 +9,7 @@ if TYPE_CHECKING:
 import wandb
 from PIL import Image

-from …
+from medrag_multi_modal.utils import get_wandb_artifact


 class CalPaliRetriever(weave.Model):
medrag_multi_modal/retrieval/common.py
CHANGED
@@ -1,10 +1,5 @@
 from enum import Enum

-import safetensors
-import safetensors.torch
-import torch
-import wandb
-

 class SimilarityMetric(Enum):
     COSINE = "cosine"

@@ -24,21 +19,3 @@ def argsort_scores(scores: list[float], descending: bool = False):
             list(enumerate(scores)), key=lambda x: x[1], reverse=descending
         )
     ]
-
-
-def save_vector_index(
-    vector_index: torch.Tensor,
-    type: str,
-    index_name: str,
-    metadata: dict,
-    filename: str = "vector_index.safetensors",
-):
-    safetensors.torch.save_file({"vector_index": vector_index.cpu()}, filename)
-    if wandb.run:
-        artifact = wandb.Artifact(
-            name=index_name,
-            type=type,
-            metadata=metadata,
-        )
-        artifact.add_file(filename)
-        artifact.save()
medrag_multi_modal/retrieval/text_retrieval/__init__.py
ADDED
@@ -0,0 +1,11 @@
+from .bm25s_retrieval import BM25sRetriever
+from .contriever_retrieval import ContrieverRetriever
+from .medcpt_retrieval import MedCPTRetriever
+from .nv_embed_2 import NVEmbed2Retriever
+
+__all__ = [
+    "BM25sRetriever",
+    "ContrieverRetriever",
+    "MedCPTRetriever",
+    "NVEmbed2Retriever",
+]
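Since the text retrievers move out of `medrag_multi_modal.retrieval` and into the new subpackage, downstream imports change as sketched below (the multi-modal ColPali retriever stays in the parent package):

```python
# Before this commit:
# from medrag_multi_modal.retrieval import BM25sRetriever, ContrieverRetriever

# After this commit, text retrievers come from the new subpackage:
from medrag_multi_modal.retrieval import CalPaliRetriever
from medrag_multi_modal.retrieval.text_retrieval import (
    BM25sRetriever,
    ContrieverRetriever,
    MedCPTRetriever,
    NVEmbed2Retriever,
)
```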
medrag_multi_modal/retrieval/{bm25s_retrieval.py → text_retrieval/bm25s_retrieval.py}
RENAMED
@@ -1,12 +1,17 @@
+import json
 import os
-from glob import glob
-from typing import Optional
+import shutil
+from typing import Optional, Union

 import bm25s
-import wandb
+import huggingface_hub
 import weave
+from bm25s import BM25
+from datasets import Dataset, load_dataset
 from Stemmer import Stemmer

+from medrag_multi_modal.utils import fetch_from_huggingface, save_to_huggingface
+
 LANGUAGE_DICT = {
     "english": "en",
     "french": "fr",

@@ -26,49 +31,60 @@ class BM25sRetriever(weave.Model):
         a new instance is created.
     """

-    language: str
-    use_stemmer: bool
-    _retriever: Optional[bm25s.BM25]
+    language: Optional[str]
+    use_stemmer: bool = True
+    _retriever: Optional[BM25]

     def __init__(
         self,
         language: str = "english",
         use_stemmer: bool = True,
-        retriever: Optional[bm25s.BM25] = None,
+        retriever: Optional[BM25] = None,
     ):
         super().__init__(language=language, use_stemmer=use_stemmer)
-        self._retriever = retriever or bm25s.BM25()
+        self._retriever = retriever or BM25()

-    def index(…):
+    def index(
+        self,
+        chunk_dataset: Union[Dataset, str],
+        index_repo_id: Optional[str] = None,
+        cleanup: bool = True,
+    ):
         """
         Indexes a dataset of text chunks using the BM25 algorithm.

-        This …
+        This method retrieves a dataset of text chunks from a specified source, tokenizes
+        the text using the BM25 tokenizer with optional stemming, and indexes the tokenized
+        text using the BM25 retriever. If an `index_repo_id` is provided, the index is saved
+        to disk and optionally logged as a Huggingface artifact.

         !!! example "Example Usage"
             ```python
             import weave
             from dotenv import load_dotenv

-            import wandb
-            from medrag_multi_modal.retrieval import BM25sRetriever
+            from medrag_multi_modal.retrieval.text_retrieval import BM25sRetriever

             load_dotenv()
             weave.init(project_name="ml-colabs/medrag-multi-modal")
-            wandb.init(project="medrag-multi-modal", entity="ml-colabs", job_type="bm25s-index")
             retriever = BM25sRetriever()
-            retriever.index(…)
+            retriever.index(
+                chunk_dataset="geekyrakshit/grays-anatomy-chunks-test",
+                index_repo_id="geekyrakshit/grays-anatomy-index",
+            )
             ```

         Args:
-            …
+            chunk_dataset (str): The Huggingface dataset containing the text chunks to be indexed. Either a
+                dataset repository name or a dataset object can be provided.
+            index_repo_id (Optional[str]): The Huggingface repository of the index artifact to be saved.
+            cleanup (bool, optional): Whether to delete the local index directory after saving the vector index.
         """
-        chunk_dataset = …
+        chunk_dataset = (
+            load_dataset(chunk_dataset, split="chunks")
+            if isinstance(chunk_dataset, str)
+            else chunk_dataset
+        )
         corpus = [row["text"] for row in chunk_dataset]
         corpus_tokens = bm25s.tokenize(
             corpus,

@@ -76,28 +92,40 @@ class BM25sRetriever(weave.Model):
             stemmer=Stemmer(self.language) if self.use_stemmer else None,
         )
         self._retriever.index(corpus_tokens)
-        if …:
+        if index_repo_id:
+            os.makedirs(".huggingface", exist_ok=True)
+            index_save_dir = os.path.join(".huggingface", index_repo_id.split("/")[-1])
             self._retriever.save(
-                …
+                index_save_dir, corpus=[dict(row) for row in chunk_dataset]
             )
-            … (previous wandb artifact logging)
-                type="bm25s-index",
-                metadata={
-                    "language": self.language,
-                    "use_stemmer": self.use_stemmer,
-                },
-            )
+            commit_type = (
+                "update"
+                if huggingface_hub.repo_exists(index_repo_id, repo_type="model")
+                else "add"
+            )
+            with open(os.path.join(index_save_dir, "config.json"), "w") as config_file:
+                json.dump(
+                    {
+                        "language": self.language,
+                        "use_stemmer": self.use_stemmer,
+                    },
+                    config_file,
+                    indent=4,
+                )
+            save_to_huggingface(
+                index_repo_id,
+                index_save_dir,
+                commit_message=f"{commit_type}: BM25s index",
+            )
+            if cleanup:
+                shutil.rmtree(index_save_dir)

     @classmethod
-    def …(cls, index_artifact_address: str):
+    def from_index(cls, index_repo_id: str):
         """
-        Creates an instance of the class from a …
+        Creates an instance of the class from a Huggingface repository.

-        This class method retrieves a BM25 index artifact from …
+        This class method retrieves a BM25 index artifact from a Huggingface repository,
         downloads the artifact, and loads the BM25 retriever with the index and its
         associated corpus. The method also extracts metadata from the artifact to
         initialize the class instance with the appropriate language and stemming

@@ -108,41 +136,26 @@ class BM25sRetriever(weave.Model):
         !!! example "Example Usage"
             ```python
             import weave
             from dotenv import load_dotenv

-            from medrag_multi_modal.retrieval import BM25sRetriever
+            from medrag_multi_modal.retrieval.text_retrieval import BM25sRetriever

             load_dotenv()
             weave.init(project_name="ml-colabs/medrag-multi-modal")
-            retriever = BM25sRetriever…
-            …
-            )
+            retriever = BM25sRetriever()
+            retriever = BM25sRetriever().from_index(index_repo_id="geekyrakshit/grays-anatomy-index")
             ```

         Args:
-            …
-                containing the BM25 index.
+            index_repo_id (Optional[str]): The Huggingface repository of the index artifact to be saved.

         Returns:
             An instance of the class initialized with the BM25 retriever and metadata
             from the artifact.
         """
-        … (previous wandb.run branch)
-        else:
-            api = wandb.Api()
-            artifact = api.artifact(index_artifact_address)
-            artifact_dir = artifact.download()
-        retriever = bm25s.BM25.load(
-            glob(os.path.join(artifact_dir, "*"))[0], load_corpus=True
-        )
-        metadata = artifact.metadata
-        return cls(
-            language=metadata["language"],
-            use_stemmer=metadata["use_stemmer"],
-            retriever=retriever,
-        )
+        index_dir = fetch_from_huggingface(index_repo_id, ".huggingface")
+        retriever = bm25s.BM25.load(index_dir, load_corpus=True)
+        with open(os.path.join(index_dir, "config.json"), "r") as config_file:
+            config = json.load(config_file)
+        return cls(retriever=retriever, **config)

@@ -155,6 +168,20 @@ class BM25sRetriever(weave.Model):
         The results are returned as a list of dictionaries, each containing a chunk and
         its corresponding relevance score.

+        !!! example "Example Usage"
+            ```python
+            import weave
+            from dotenv import load_dotenv
+
+            from medrag_multi_modal.retrieval.text_retrieval import BM25sRetriever
+
+            load_dotenv()
+            weave.init(project_name="ml-colabs/medrag-multi-modal")
+            retriever = BM25sRetriever()
+            retriever = BM25sRetriever().from_index(index_repo_id="geekyrakshit/grays-anatomy-index")
+            retrieved_chunks = retriever.retrieve(query="What are Ribosomes?")
+            ```
+
         Args:
             query (str): The input query string to search for relevant chunks.
             top_k (int, optional): The number of top relevant chunks to retrieve. Defaults to 2.

@@ -192,13 +219,12 @@ class BM25sRetriever(weave.Model):
             import weave
             from dotenv import load_dotenv

-            from medrag_multi_modal.retrieval import BM25sRetriever
+            from medrag_multi_modal.retrieval.text_retrieval import BM25sRetriever

             load_dotenv()
             weave.init(project_name="ml-colabs/medrag-multi-modal")
-            retriever = BM25sRetriever…
-            …
-            )
+            retriever = BM25sRetriever()
+            retriever = BM25sRetriever().from_index(index_repo_id="geekyrakshit/grays-anatomy-index")
             retrieved_chunks = retriever.predict(query="What are Ribosomes?")
             ```
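Putting the reworked BM25s API together, indexing and retrieval now go through HuggingFace Hub repositories instead of W&B artifacts. A sketch of the end-to-end flow, using the repository IDs from the docstrings above:

```python
import weave
from dotenv import load_dotenv

from medrag_multi_modal.retrieval.text_retrieval import BM25sRetriever

load_dotenv()
weave.init(project_name="ml-colabs/medrag-multi-modal")

# Build the index once from a chunk dataset and publish it to the Hub.
retriever = BM25sRetriever(language="english", use_stemmer=True)
retriever.index(
    chunk_dataset="geekyrakshit/grays-anatomy-chunks-test",
    index_repo_id="geekyrakshit/grays-anatomy-index",
)

# Later (or elsewhere), reload the index from the Hub and query it.
retriever = BM25sRetriever.from_index(index_repo_id="geekyrakshit/grays-anatomy-index")
retrieved_chunks = retriever.retrieve(query="What are Ribosomes?", top_k=2)
```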