Spaces:

eliot-hub
/

chatbot_app

Sleeping

App Files Files Community

eliot-hub committed on Sep 26, 2024

Commit

1bea5ac

1 Parent(s): b7be7da

db ok

Browse files

Files changed (4) hide show

.gitignore +1 -0
__pycache__/hf_to_chroma_ds.cpython-312.pyc +0 -0
app.py +22 -20
hf_to_chroma_ds.py +100 -7

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ .env

__pycache__/hf_to_chroma_ds.cpython-312.pyc ADDED Viewed

Binary file (6.69 kB). View file

app.py CHANGED Viewed

@@ -1,5 +1,5 @@
 import gradio as gr
-from langchain_community.vectorstores import Chroma
 from langchain.prompts import ChatPromptTemplate
 from langchain.chains import create_retrieval_chain, create_history_aware_retriever
 from langchain.chains.combine_documents import create_stuff_documents_chain
@@ -20,11 +20,12 @@ from mixedbread_ai.client import MixedbreadAI
 from langchain.callbacks.tracers import ConsoleCallbackHandler
 from langchain_huggingface import HuggingFaceEmbeddings
 import os
-from chroma_datasets.utils import import_into_chroma
 from datasets import load_dataset
 from chromadb.utils import embedding_functions
-from hf_to_chroma_ds import Dataset
 # Global params
 CHROMA_PATH = "chromadb_mem10_mxbai_800_complete"
@@ -40,7 +41,8 @@ HF_API_KEY = os.environ.get("HF_API_KEY")
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 mxbai_client = MixedbreadAI(api_key=MXBAI_API_KEY)
 model_emb = "mixedbread-ai/mxbai-embed-large-v1"
-huggingface_ef = embedding_functions.HuggingFaceEmbeddingFunction(
     api_key=HF_API_KEY,
     model_name=model_emb
 )
@@ -50,24 +52,24 @@ client = chromadb.Client()
 # memoires_ds = load_dataset("eliot-hub/memoires_vec_800", split="data", token=HF_TOKEN)
 # client = chromadb.PersistentClient(path=os.path.join(os.path.abspath(os.getcwd()), "01_Notebooks", "RAG-ollama", "chatbot_actuariat_APP", CHROMA_PATH))
-memoires_ds = Dataset(
-    hf_data = None,
-    hf_dataset_name = "eliot-hub/memoires_vec_800",
-    embedding_function = huggingface_ef,
-    embedding_function_instructions = None
-    )
-db = import_into_chroma(
     chroma_client=client,
-    dataset=memoires_ds,
-    embedding_function=huggingface_ef
     )
-# db = Chroma(
-#     client=client,
-#     collection_name=f"embeddings_mxbai",
-#     embedding_function= HuggingFaceEmbeddings(model_name=model_emb)
-# )
 # Reranker class
@@ -86,7 +88,7 @@ class Reranker(BaseRetriever):
 # Set up reranker + LLM
 retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 25})
 reranker = Reranker(retriever=retriever, k=4)  #Reranker(retriever=retriever, model=model, k=4)
-llm = ChatOpenAI(model=LLM_NAME, api_key=OPENAI_API_KEY, verbose=True)
 # Set up the contextualize question prompt
 contextualize_q_system_prompt = (

 import gradio as gr
+from langchain_chroma import Chroma
 from langchain.prompts import ChatPromptTemplate
 from langchain.chains import create_retrieval_chain, create_history_aware_retriever
 from langchain.chains.combine_documents import create_stuff_documents_chain
 from langchain.callbacks.tracers import ConsoleCallbackHandler
 from langchain_huggingface import HuggingFaceEmbeddings
 import os
+# from chroma_datasets.utils import import_into_chroma
+from hf_to_chroma_ds import import_into_chroma
 from datasets import load_dataset
 from chromadb.utils import embedding_functions
+from hf_to_chroma_ds import Memoires_DS
+from dotenv import load_dotenv
 # Global params
 CHROMA_PATH = "chromadb_mem10_mxbai_800_complete"
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 mxbai_client = MixedbreadAI(api_key=MXBAI_API_KEY)
 model_emb = "mixedbread-ai/mxbai-embed-large-v1"
+huggingface_ef = embedding_functions.huggingface_embedding_function.HuggingFaceEmbeddingFunction(
     api_key=HF_API_KEY,
     model_name=model_emb
 )
 # memoires_ds = load_dataset("eliot-hub/memoires_vec_800", split="data", token=HF_TOKEN)
 # client = chromadb.PersistentClient(path=os.path.join(os.path.abspath(os.getcwd()), "01_Notebooks", "RAG-ollama", "chatbot_actuariat_APP", CHROMA_PATH))
+# memoires_ds = Dataset(
+#     hf_data = None,
+#     hf_dataset_name = "eliot-hub/memoires_vec_800",
+#     embedding_function = huggingface_ef,
+#     embedding_function_instructions = None
+#     )
# One name shared by the import step and the LangChain wrapper.
# import_into_chroma falls back to the dataset class name ("Memoires_DS")
# when no collection_name is given, so without this the wrapper below would
# be bound to a different collection than the one that was just populated.
COLLECTION_NAME = "embeddings_mxbai"

# Load the Hugging Face dataset into the in-memory Chroma client.
collection = import_into_chroma(
    chroma_client=client,
    dataset=Memoires_DS,
    collection_name=COLLECTION_NAME,
    embedding_function=huggingface_ef,
)

# LangChain vector-store view over the collection populated above; this is
# what the retriever further down is built from.
db = Chroma(
    client=client,
    collection_name=COLLECTION_NAME,
    embedding_function=HuggingFaceEmbeddings(model_name=model_emb),
)
 # Reranker class
 # Set up reranker + LLM
 retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 25})
 reranker = Reranker(retriever=retriever, k=4)  #Reranker(retriever=retriever, model=model, k=4)
+llm = ChatOpenAI(model=LLM_NAME, verbose=True) #, api_key=OPENAI_API_KEY, )
 # Set up the contextualize question prompt
 contextualize_q_system_prompt = (

hf_to_chroma_ds.py CHANGED Viewed

@@ -4,8 +4,19 @@ from typing import Optional, Union, Sequence, Dict, Mapping, List, Any
 from typing_extensions import TypedDict
 from chroma_datasets.types import AddEmbedding, Datapoint
 from chroma_datasets.utils import load_huggingface_dataset, to_chroma_schema
 class Dataset(ABC):
     """
@@ -47,10 +58,92 @@ class Dataset(ABC):
         return to_chroma_schema(cls.chunked())
-# class Memoires_DS(Dataset):
-#     """
-#     """
-#     hf_data = None
-#     hf_dataset_name = "eliot-hub/memoires_vec_800"
-#     embedding_function = "HFEmbeddingFunction"
-#     embedding_function_instructions = ef_instruction_dict[embedding_function]

 from typing_extensions import TypedDict
 from chroma_datasets.types import AddEmbedding, Datapoint
 from chroma_datasets.utils import load_huggingface_dataset, to_chroma_schema
+from chromadb.utils import embedding_functions
+import os
+from dotenv import load_dotenv
# Load variables from a local .env file (if one exists) before reading the
# key. load_dotenv is imported above but was never called, so a key stored
# only in .env was not visible to os.environ.
load_dotenv()
HF_API_KEY = os.environ.get("HF_API_KEY")

# Setup snippets keyed by embedding-function class name; import_into_chroma
# puts the matching snippet in its error message when the caller passes no
# embedding function.
# NOTE: this is a plain (non-f) string, so "{HF_API_KEY}" is emitted literally
# as a placeholder and the real key value is never interpolated into it.
ef_instruction_dict = {
    "HuggingFaceEmbeddingFunction": """
        from chromadb.utils import embedding_functions
        hf_ef = embedding_functions.huggingface_embedding_function.HuggingFaceEmbeddingFunction(api_key={HF_API_KEY}, model_name="mixedbread-ai/mxbai-embed-large-v1")
    """
}
 class Dataset(ABC):
     """
         return to_chroma_schema(cls.chunked())
class Memoires_DS(Dataset):
    """
    Dataset descriptor for the "eliot-hub/memoires_vec_800" Hugging Face dataset.

    Only class-level configuration is declared here; loading and conversion
    are inherited from ``Dataset``.
    """

    # Starts as None; how the base class fills it is outside this view.
    hf_data = None

    # Hugging Face Hub identifier of the dataset.
    hf_dataset_name = "eliot-hub/memoires_vec_800"

    # Class name that import_into_chroma compares against
    # ``embedding_function.__class__.__name__``.
    embedding_function = "HuggingFaceEmbeddingFunction"

    # Snippet shown in the error raised when no embedding function is passed.
    embedding_function_instructions = ef_instruction_dict[embedding_function]
def import_into_chroma(chroma_client, dataset, collection_name=None, embedding_function=None, batch_size=30000):
    """
    Import a dataset into a newly created Chroma collection, in batches.

    Args:
        chroma_client: Chroma client; only ``create_collection`` is called on it.
        dataset: Dataset class exposing ``embedding_function``,
            ``embedding_function_instructions``, ``to_chroma()`` and ``__name__``.
        collection_name (str, optional): Name of the collection to create.
            Defaults to ``dataset.__name__``.
        embedding_function: Embedding function attached to the collection. When
            the dataset declares ``embedding_function``, this argument is
            required and its class name must match. When neither is given,
            chromadb's ``DefaultEmbeddingFunction`` is used.
        batch_size (int): Maximum number of records per ``collection.add`` call.
            Must be at least 1.

    Returns:
        The collection returned by ``chroma_client.create_collection``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1, if the dataset requires
            an embedding function and none was passed, or if the passed
            function's class name differs from the one the dataset declares.
        ImportError: If the default embedding function is needed and chromadb
            cannot be imported.
    """
    # Validate before touching the client so a bad value cannot leave an empty
    # collection behind (a negative size used to load nothing and still report
    # success).
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    required_ef_name = dataset.embedding_function
    if required_ef_name is not None:
        if embedding_function is None:
            error_msg = "See documentation"
            if dataset.embedding_function_instructions is not None:
                error_msg = dataset.embedding_function_instructions
            raise ValueError(f"""
                             Dataset requires embedding function: {dataset.embedding_function}.
                             {error_msg}
                             """)
        if embedding_function.__class__.__name__ != required_ef_name:
            raise ValueError(f"Please use {dataset.embedding_function} as the embedding function for this dataset. You passed {embedding_function.__class__.__name__}")

    # If collection_name is None, take the name from the dataset type.
    if collection_name is None:
        collection_name = dataset.__name__

    ef = embedding_function
    if ef is None:
        # chromadb is only needed here, to build the default embedding function.
        try:
            from chromadb.utils import embedding_functions
        except ImportError as exc:
            raise ImportError("Please install chromadb to use this function. `pip install chromadb`") from exc
        ef = embedding_functions.DefaultEmbeddingFunction()

    collection = chroma_client.create_collection(
        collection_name,
        embedding_function=ef
    )

    # Retrieve the mapped data.
    mapped_data = dataset.to_chroma()
    ids = mapped_data["ids"]
    # A field may be None (e.g. no precomputed embeddings); it is then passed
    # through as None instead of being sliced.
    fields = {
        "metadatas": mapped_data["metadatas"],
        "documents": mapped_data["documents"],
        "embeddings": mapped_data["embeddings"],
    }

    total_docs = len(ids)
    num_batches = (total_docs + batch_size - 1) // batch_size

    # Slice each batch on the fly rather than building every batch list first.
    for batch_number, start in enumerate(range(0, total_docs, batch_size), start=1):
        stop = start + batch_size
        batch_ids = ids[start:stop]
        batch_fields = {
            key: (None if values is None else values[start:stop])
            for key, values in fields.items()
        }
        collection.add(ids=batch_ids, **batch_fields)
        print(f"Batch {batch_number}/{num_batches}: Loaded {len(batch_ids)} documents.")

    print(f"Successfully loaded {total_docs} documents into the collection named: {collection_name}")
    return collection