eliot-hub committed
Commit 1d89b9d
2 Parent(s): de77992 cc5a137
Files changed (2)
  1. README.md +1 -0
  2. hf_to_chroma_ds.py +0 -154
README.md CHANGED
@@ -7,6 +7,7 @@ sdk: gradio
 sdk_version: 4.44.0
 app_file: app.py
 pinned: false
+startup_duration_timeout: 1h
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
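The one-line addition above raises the Space's startup timeout to an hour via `startup_duration_timeout`, presumably so the app has time to finish loading its data before the Hub's shorter default timeout marks the Space as unhealthy (see the configuration reference linked above).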
hf_to_chroma_ds.py DELETED
@@ -1,154 +0,0 @@
- # imports
- from abc import ABC, abstractmethod
- from typing import Optional, Union, Sequence, Dict, Mapping, List, Any
- from typing_extensions import TypedDict
- from chroma_datasets.types import AddEmbedding, Datapoint
- from chroma_datasets.utils import load_huggingface_dataset, to_chroma_schema
- from chromadb.utils import embedding_functions
- import os
- from dotenv import load_dotenv
-
- load_dotenv()  # populate os.environ from .env (the import was previously unused)
- HF_API_KEY = os.environ.get("HF_API_KEY")
-
- ef_instruction_dict = {
-     "HuggingFaceEmbeddingFunction": """
- from chromadb.utils import embedding_functions
- hf_ef = embedding_functions.huggingface_embedding_function.HuggingFaceEmbeddingFunction(api_key={HF_API_KEY}, model_name="mixedbread-ai/mxbai-embed-large-v1")
- """
- }
-
- class Dataset(ABC):
-     """
-     Abstract class for a dataset.
-
-     All datasets should inherit from this class.
-
-     Properties:
-         hf_data: the raw data from Hugging Face
-         embedding_function: the embedding function used to generate the embeddings
-         embedding_function_instructions: tell the user how to set up the embedding function
-     """
-     hf_dataset_name: str
-     hf_data: Any
-     embedding_function: str
-     embedding_function_instructions: str
-
-     @classmethod
-     def load_data(cls):
-         cls.hf_data = load_huggingface_dataset(
-             cls.hf_dataset_name,
-             split_name="data"
-         )
-
-     @classmethod
-     def raw_text(cls) -> str:
-         if cls.hf_data is None:
-             cls.load_data()
-         return "\n".join(cls.hf_data["document"])
-
-     @classmethod
-     def chunked(cls) -> List[Datapoint]:
-         if cls.hf_data is None:
-             cls.load_data()
-         return cls.hf_data
-
-     @classmethod
-     def to_chroma(cls) -> AddEmbedding:
-         return to_chroma_schema(cls.chunked())
-
-
- class Memoires_DS(Dataset):
-     """Pre-embedded eliot-hub/memoires_vec_800 dataset (HuggingFaceEmbeddingFunction)."""
-     hf_data = None
-     hf_dataset_name = "eliot-hub/memoires_vec_800"
-     embedding_function = "HuggingFaceEmbeddingFunction"
-     embedding_function_instructions = ef_instruction_dict[embedding_function]
-
-
- def import_into_chroma(chroma_client, dataset, collection_name=None, embedding_function=None, batch_size=5000):
-     """
-     Imports a dataset into Chroma in batches.
-
-     Args:
-         chroma_client (ChromaClient): The ChromaClient to use.
-         dataset (Dataset): The dataset class to load.
-         collection_name (str): The name of the collection to load the dataset into. Defaults to the dataset class name.
-         embedding_function (Optional[Callable[[str], np.ndarray]]): A function that takes a string and returns an embedding.
-         batch_size (int): The size of each batch to load.
-     """
-     # if chromadb is not installed, raise an error
-     try:
-         import chromadb
-         from chromadb.utils import embedding_functions
-     except ImportError:
-         raise ImportError("Please install chromadb to use this function. `pip install chromadb`")
-
-     ef = None
-
-     if dataset.embedding_function is not None:
-         if embedding_function is None:
-             error_msg = "See documentation"
-             if dataset.embedding_function_instructions is not None:
-                 error_msg = dataset.embedding_function_instructions
-
-             raise ValueError(f"""
-             Dataset requires embedding function: {dataset.embedding_function}.
-             {error_msg}
-             """)
-
-         if embedding_function.__class__.__name__ != dataset.embedding_function:
-             raise ValueError(f"Please use {dataset.embedding_function} as the embedding function for this dataset. You passed {embedding_function.__class__.__name__}")
-
-     if embedding_function is not None:
-         ef = embedding_function
-
-     # if collection_name is None, get the name from the dataset type
-     if collection_name is None:
-         collection_name = dataset.__name__
-
-     if ef is None:
-         ef = embedding_functions.DefaultEmbeddingFunction()
-
-     print("########### Init collection ###########")
-     collection = chroma_client.create_collection(
-         collection_name,
-         embedding_function=ef
-     )
-
-     # Retrieve the mapped data
-     print("########### Init to_chroma ###########")
-     mapped_data = dataset.to_chroma()
-     del dataset
-
-     # Split the data into batches and add them to the collection
-     def chunk_data(data, size):
-         """Helper function to split data into batches."""
-         for i in range(0, len(data), size):
-             yield data[i:i+size]
-
-     print("########### Chunking ###########")
-     ids_batches = list(chunk_data(mapped_data["ids"], batch_size))
-     metadatas_batches = list(chunk_data(mapped_data["metadatas"], batch_size))
-     documents_batches = list(chunk_data(mapped_data["documents"], batch_size))
-     embeddings_batches = list(chunk_data(mapped_data["embeddings"], batch_size))
-
-     total_docs = len(mapped_data["ids"])
-
-     print("########### Iterating batches ###########")
-     for i, (ids, metadatas, documents, embeddings) in enumerate(zip(ids_batches, metadatas_batches, documents_batches, embeddings_batches)):
-         collection.add(
-             ids=ids,
-             metadatas=metadatas,
-             documents=documents,
-             embeddings=embeddings,
-         )
-         print(f"Batch {i+1}/{len(ids_batches)}: Loaded {len(ids)} documents.")
-
-     print(f"Successfully loaded {total_docs} documents into the collection named: {collection_name}")
-
-     return collection
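
For context, a minimal, hypothetical sketch of how the deleted helper was meant to be driven. The client setup, import path, and environment handling below are assumptions for illustration, not part of the commit:

    # Hypothetical usage sketch (not part of the commit).
    import os
    import chromadb
    from chromadb.utils import embedding_functions
    from hf_to_chroma_ds import Memoires_DS, import_into_chroma

    # Memoires_DS declares embedding_function = "HuggingFaceEmbeddingFunction",
    # so import_into_chroma() requires an instance of exactly that class.
    hf_ef = embedding_functions.HuggingFaceEmbeddingFunction(
        api_key=os.environ["HF_API_KEY"],  # assumption: key exported in the environment
        model_name="mixedbread-ai/mxbai-embed-large-v1",
    )

    client = chromadb.Client()  # in-memory client; a persistent client works the same way
    collection = import_into_chroma(
        chroma_client=client,
        dataset=Memoires_DS,
        embedding_function=hf_ef,
    )
    print(collection.count())

The batching in the helper is not cosmetic: Chroma caps how many records a single collection.add() call may carry, so a large pre-embedded dataset has to be split into chunks, and the batch_size=5000 default keeps each call under that limit.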