eliot-hub committed on
Commit
6838503
·
verified ·
1 Parent(s): f9d97e5

Update hf_to_chroma_ds.py

Browse files
Files changed (1) hide show
  1. hf_to_chroma_ds.py +4 -1
hf_to_chroma_ds.py CHANGED
@@ -120,13 +120,15 @@ def import_into_chroma(chroma_client, dataset, collection_name=None, embedding_f
120
 
121
  # Retrieve the mapped data
122
  mapped_data = dataset.to_chroma()
123
-
 
124
  # Split the data into batches and add them to the collection
125
  def chunk_data(data, size):
126
  """Helper function to split data into batches."""
127
  for i in range(0, len(data), size):
128
  yield data[i:i+size]
129
 
 
130
  ids_batches = list(chunk_data(mapped_data["ids"], batch_size))
131
  metadatas_batches = list(chunk_data(mapped_data["metadatas"], batch_size))
132
  documents_batches = list(chunk_data(mapped_data["documents"], batch_size))
@@ -134,6 +136,7 @@ def import_into_chroma(chroma_client, dataset, collection_name=None, embedding_f
134
 
135
  total_docs = len(mapped_data["ids"])
136
 
 
137
  for i, (ids, metadatas, documents, embeddings) in enumerate(zip(ids_batches, metadatas_batches, documents_batches, embeddings_batches)):
138
  collection.add(
139
  ids=ids,
 
120
 
121
  # Retrieve the mapped data
122
  mapped_data = dataset.to_chroma()
123
+ del dataset
124
+
125
  # Split the data into batches and add them to the collection
126
  def chunk_data(data, size):
127
  """Helper function to split data into batches."""
128
  for i in range(0, len(data), size):
129
  yield data[i:i+size]
130
 
131
+ print("########### Chunking ###########")
132
  ids_batches = list(chunk_data(mapped_data["ids"], batch_size))
133
  metadatas_batches = list(chunk_data(mapped_data["metadatas"], batch_size))
134
  documents_batches = list(chunk_data(mapped_data["documents"], batch_size))
 
136
 
137
  total_docs = len(mapped_data["ids"])
138
 
139
+ print("########### Iterating batches ###########")
140
  for i, (ids, metadatas, documents, embeddings) in enumerate(zip(ids_batches, metadatas_batches, documents_batches, embeddings_batches)):
141
  collection.add(
142
  ids=ids,