eliot-hub committed on
Commit
1bea5ac
·
1 Parent(s): b7be7da
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ .env
__pycache__/hf_to_chroma_ds.cpython-312.pyc ADDED
Binary file (6.69 kB). View file
 
app.py CHANGED
@@ -1,5 +1,5 @@
1
  import gradio as gr
2
- from langchain_community.vectorstores import Chroma
3
  from langchain.prompts import ChatPromptTemplate
4
  from langchain.chains import create_retrieval_chain, create_history_aware_retriever
5
  from langchain.chains.combine_documents import create_stuff_documents_chain
@@ -20,11 +20,12 @@ from mixedbread_ai.client import MixedbreadAI
20
  from langchain.callbacks.tracers import ConsoleCallbackHandler
21
  from langchain_huggingface import HuggingFaceEmbeddings
22
  import os
23
- from chroma_datasets.utils import import_into_chroma
 
24
  from datasets import load_dataset
25
  from chromadb.utils import embedding_functions
26
- from hf_to_chroma_ds import Dataset
27
-
28
 
29
  # Global params
30
  CHROMA_PATH = "chromadb_mem10_mxbai_800_complete"
@@ -40,7 +41,8 @@ HF_API_KEY = os.environ.get("HF_API_KEY")
40
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
41
  mxbai_client = MixedbreadAI(api_key=MXBAI_API_KEY)
42
  model_emb = "mixedbread-ai/mxbai-embed-large-v1"
43
- huggingface_ef = embedding_functions.HuggingFaceEmbeddingFunction(
 
44
  api_key=HF_API_KEY,
45
  model_name=model_emb
46
  )
@@ -50,24 +52,24 @@ client = chromadb.Client()
50
  # memoires_ds = load_dataset("eliot-hub/memoires_vec_800", split="data", token=HF_TOKEN)
51
  # client = chromadb.PersistentClient(path=os.path.join(os.path.abspath(os.getcwd()), "01_Notebooks", "RAG-ollama", "chatbot_actuariat_APP", CHROMA_PATH))
52
 
53
- memoires_ds = Dataset(
54
- hf_data = None,
55
- hf_dataset_name = "eliot-hub/memoires_vec_800",
56
- embedding_function = huggingface_ef,
57
- embedding_function_instructions = None
58
- )
59
 
60
 
61
- db = import_into_chroma(
62
  chroma_client=client,
63
- dataset=memoires_ds,
64
- embedding_function=huggingface_ef
65
  )
66
- # db = Chroma(
67
- # client=client,
68
- # collection_name=f"embeddings_mxbai",
69
- # embedding_function= HuggingFaceEmbeddings(model_name=model_emb)
70
- # )
71
 
72
 
73
  # Reranker class
@@ -86,7 +88,7 @@ class Reranker(BaseRetriever):
86
  # Set up reranker + LLM
87
  retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 25})
88
  reranker = Reranker(retriever=retriever, k=4) #Reranker(retriever=retriever, model=model, k=4)
89
- llm = ChatOpenAI(model=LLM_NAME, api_key=OPENAI_API_KEY, verbose=True)
90
 
91
  # Set up the contextualize question prompt
92
  contextualize_q_system_prompt = (
 
1
  import gradio as gr
2
+ from langchain_chroma import Chroma
3
  from langchain.prompts import ChatPromptTemplate
4
  from langchain.chains import create_retrieval_chain, create_history_aware_retriever
5
  from langchain.chains.combine_documents import create_stuff_documents_chain
 
20
  from langchain.callbacks.tracers import ConsoleCallbackHandler
21
  from langchain_huggingface import HuggingFaceEmbeddings
22
  import os
23
+ # from chroma_datasets.utils import import_into_chroma
24
+ from hf_to_chroma_ds import import_into_chroma
25
  from datasets import load_dataset
26
  from chromadb.utils import embedding_functions
27
+ from hf_to_chroma_ds import Memoires_DS
28
+ from dotenv import load_dotenv
29
 
30
  # Global params
31
  CHROMA_PATH = "chromadb_mem10_mxbai_800_complete"
 
41
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
42
  mxbai_client = MixedbreadAI(api_key=MXBAI_API_KEY)
43
  model_emb = "mixedbread-ai/mxbai-embed-large-v1"
44
+
45
+ huggingface_ef = embedding_functions.huggingface_embedding_function.HuggingFaceEmbeddingFunction(
46
  api_key=HF_API_KEY,
47
  model_name=model_emb
48
  )
 
52
  # memoires_ds = load_dataset("eliot-hub/memoires_vec_800", split="data", token=HF_TOKEN)
53
  # client = chromadb.PersistentClient(path=os.path.join(os.path.abspath(os.getcwd()), "01_Notebooks", "RAG-ollama", "chatbot_actuariat_APP", CHROMA_PATH))
54
 
55
+ # memoires_ds = Dataset(
56
+ # hf_data = None,
57
+ # hf_dataset_name = "eliot-hub/memoires_vec_800",
58
+ # embedding_function = huggingface_ef,
59
+ # embedding_function_instructions = None
60
+ # )
61
 
62
 
63
+ collection = import_into_chroma(
64
  chroma_client=client,
65
+ dataset=Memoires_DS,
66
+ embedding_function=huggingface_ef #Memoires_DS.embedding_function
67
  )
68
+ db = Chroma(
69
+ client=client,
70
+ collection_name=f"embeddings_mxbai",
71
+ embedding_function = HuggingFaceEmbeddings(model_name=model_emb)
72
+ )
73
 
74
 
75
  # Reranker class
 
88
  # Set up reranker + LLM
89
  retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 25})
90
  reranker = Reranker(retriever=retriever, k=4) #Reranker(retriever=retriever, model=model, k=4)
91
+ llm = ChatOpenAI(model=LLM_NAME, verbose=True) #, api_key=OPENAI_API_KEY, )
92
 
93
  # Set up the contextualize question prompt
94
  contextualize_q_system_prompt = (
hf_to_chroma_ds.py CHANGED
@@ -4,8 +4,19 @@ from typing import Optional, Union, Sequence, Dict, Mapping, List, Any
4
  from typing_extensions import TypedDict
5
  from chroma_datasets.types import AddEmbedding, Datapoint
6
  from chroma_datasets.utils import load_huggingface_dataset, to_chroma_schema
 
 
 
7
 
 
8
 
 
 
 
 
 
 
 
9
 
10
  class Dataset(ABC):
11
  """
@@ -47,10 +58,92 @@ class Dataset(ABC):
47
  return to_chroma_schema(cls.chunked())
48
 
49
 
50
- # class Memoires_DS(Dataset):
51
- # """
52
- # """
53
- # hf_data = None
54
- # hf_dataset_name = "eliot-hub/memoires_vec_800"
55
- # embedding_function = "HFEmbeddingFunction"
56
- # embedding_function_instructions = ef_instruction_dict[embedding_function]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  from typing_extensions import TypedDict
5
  from chroma_datasets.types import AddEmbedding, Datapoint
6
  from chroma_datasets.utils import load_huggingface_dataset, to_chroma_schema
7
+ from chromadb.utils import embedding_functions
8
+ import os
9
+ from dotenv import load_dotenv
10
 
11
# Populate os.environ from a local .env file (added to .gitignore in this
# commit) before reading keys; without this call, HF_API_KEY silently comes
# back as None whenever it is defined only in .env. `load_dotenv` is already
# imported at the top of this module but was never invoked.
load_dotenv()

HF_API_KEY = os.environ.get("HF_API_KEY")

# Per-embedding-function setup instructions, surfaced to the caller by
# `import_into_chroma` when a dataset requires an embedding function and the
# caller passed none (or a mismatched one).
# NOTE(review): the value is a plain string, not an f-string, so "{HF_API_KEY}"
# is rendered literally as a placeholder for the reader to substitute — confirm
# that is the intent rather than a missing `f` prefix.
ef_instruction_dict = {
    "HuggingFaceEmbeddingFunction": """
    from chromadb.utils import embedding_functions
    hf_ef = embedding_functions.huggingface_embedding_function.HuggingFaceEmbeddingFunction(api_key={HF_API_KEY}, model_name="mixedbread-ai/mxbai-embed-large-v1")
    """
}
20
 
21
  class Dataset(ABC):
22
  """
 
58
  return to_chroma_schema(cls.chunked())
59
 
60
 
61
class Memoires_DS(Dataset):
    """Descriptor for the `eliot-hub/memoires_vec_800` Hugging Face dataset.

    Declares where the data lives on the Hub and which embedding function its
    stored vectors were produced with, so `import_into_chroma` can verify that
    callers supply a matching embedding function.
    """
    hf_dataset_name = "eliot-hub/memoires_vec_800"
    hf_data = None  # not pre-loaded here; presumably populated from the Hub by the Dataset base class — confirm
    embedding_function = "HuggingFaceEmbeddingFunction"
    # Setup snippet shown to the user when no / the wrong embedding function is passed.
    embedding_function_instructions = ef_instruction_dict[embedding_function]
68
+
69
+
70
+
71
+
72
def import_into_chroma(chroma_client, dataset, collection_name=None, embedding_function=None, batch_size=30000):
    """
    Load a chroma-datasets style dataset into a new Chroma collection in batches.

    Args:
        chroma_client (ChromaClient): client used to create the collection.
        dataset (AddEmbedding): dataset type exposing `to_chroma()`,
            `embedding_function`, and `embedding_function_instructions`.
        collection_name (str, optional): target collection name; defaults to
            the dataset type's `__name__`.
        embedding_function (optional): embedding callable; must match (by class
            name) the embedding function the dataset declares.
        batch_size (int): number of records sent per `collection.add` call.

    Returns:
        The newly created Chroma collection.

    Raises:
        ImportError: if chromadb is not installed.
        ValueError: if the dataset requires an embedding function that was not
            supplied, or the supplied one does not match.
    """
    # chromadb is an optional dependency of this helper — fail with install hint.
    try:
        import chromadb
        from chromadb.utils import embedding_functions
    except ImportError:
        raise ImportError("Please install chromadb to use this function. `pip install chromadb`")

    # Validate the caller's embedding function against the dataset's declared
    # one; the comparison is by class name only.
    if dataset.embedding_function is not None:
        if embedding_function is None:
            error_msg = "See documentation"
            if dataset.embedding_function_instructions is not None:
                error_msg = dataset.embedding_function_instructions
            raise ValueError(f"""
            Dataset requires embedding function: {dataset.embedding_function}.
            {error_msg}
            """)
        if embedding_function.__class__.__name__ != dataset.embedding_function:
            raise ValueError(f"Please use {dataset.embedding_function} as the embedding function for this dataset. You passed {embedding_function.__class__.__name__}")

    # Fall back to Chroma's default embedding function when none was supplied.
    ef = embedding_function if embedding_function is not None else embedding_functions.DefaultEmbeddingFunction()

    # Default the collection name to the dataset type's name.
    if collection_name is None:
        collection_name = dataset.__name__

    collection = chroma_client.create_collection(
        collection_name,
        embedding_function=ef,
    )

    # Pull the full mapped payload, then feed it to Chroma in aligned batches.
    mapped = dataset.to_chroma()

    def split(seq):
        # Consecutive chunks of at most batch_size items each.
        return [seq[i:i + batch_size] for i in range(0, len(seq), batch_size)]

    id_chunks = split(mapped["ids"])
    meta_chunks = split(mapped["metadatas"])
    doc_chunks = split(mapped["documents"])
    emb_chunks = split(mapped["embeddings"])
    total_docs = len(mapped["ids"])

    for idx, (ids, metadatas, documents, embeddings) in enumerate(
        zip(id_chunks, meta_chunks, doc_chunks, emb_chunks), start=1
    ):
        collection.add(
            ids=ids,
            metadatas=metadatas,
            documents=documents,
            embeddings=embeddings,
        )
        print(f"Batch {idx}/{len(id_chunks)}: Loaded {len(ids)} documents.")

    print(f"Successfully loaded {total_docs} documents into the collection named: {collection_name}")

    return collection