Spaces:
Sleeping
Sleeping
Update hf_to_chroma_ds.py
Browse files
- hf_to_chroma_ds.py +4 -1
hf_to_chroma_ds.py
CHANGED
@@ -120,13 +120,15 @@ def import_into_chroma(chroma_client, dataset, collection_name=None, embedding_f
|
|
120 |
|
121 |
# Retrieve the mapped data
|
122 |
mapped_data = dataset.to_chroma()
|
123 |
-
|
|
|
124 |
# Split the data into batches and add them to the collection
|
125 |
def chunk_data(data, size):
|
126 |
"""Helper function to split data into batches."""
|
127 |
for i in range(0, len(data), size):
|
128 |
yield data[i:i+size]
|
129 |
|
|
|
130 |
ids_batches = list(chunk_data(mapped_data["ids"], batch_size))
|
131 |
metadatas_batches = list(chunk_data(mapped_data["metadatas"], batch_size))
|
132 |
documents_batches = list(chunk_data(mapped_data["documents"], batch_size))
|
@@ -134,6 +136,7 @@ def import_into_chroma(chroma_client, dataset, collection_name=None, embedding_f
|
|
134 |
|
135 |
total_docs = len(mapped_data["ids"])
|
136 |
|
|
|
137 |
for i, (ids, metadatas, documents, embeddings) in enumerate(zip(ids_batches, metadatas_batches, documents_batches, embeddings_batches)):
|
138 |
collection.add(
|
139 |
ids=ids,
|
|
|
120 |
|
121 |
# Retrieve the mapped data
|
122 |
mapped_data = dataset.to_chroma()
|
123 |
+
del dataset
|
124 |
+
|
125 |
# Split the data into batches and add them to the collection
|
126 |
def chunk_data(data, size):
|
127 |
"""Helper function to split data into batches."""
|
128 |
for i in range(0, len(data), size):
|
129 |
yield data[i:i+size]
|
130 |
|
131 |
+
print("########### Chunking ###########")
|
132 |
ids_batches = list(chunk_data(mapped_data["ids"], batch_size))
|
133 |
metadatas_batches = list(chunk_data(mapped_data["metadatas"], batch_size))
|
134 |
documents_batches = list(chunk_data(mapped_data["documents"], batch_size))
|
|
|
136 |
|
137 |
total_docs = len(mapped_data["ids"])
|
138 |
|
139 |
+
print("########### Iterating batches ###########")
|
140 |
for i, (ids, metadatas, documents, embeddings) in enumerate(zip(ids_batches, metadatas_batches, documents_batches, embeddings_batches)):
|
141 |
collection.add(
|
142 |
ids=ids,
|