Spaces:
Runtime error
Runtime error
from langchain.vectorstores import FAISS | |
from langchain.embeddings import HuggingFaceEmbeddings | |
import os | |
import numpy as np | |
""" | |
for development only. | |
""" | |
# Directory the vector store is loaded from and saved to.
db_path = './data'
embeddings = HuggingFaceEmbeddings(model_name='all-mpnet-base-v2')

# TODO clear empty entry on load
if os.path.exists(db_path):
    # A store already exists at db_path: reuse it.
    db = FAISS.load_local(db_path, embeddings)
else:
    # Nothing at db_path yet: build a store from a single empty text whose
    # metadata carries the "none" marker key.
    db = FAISS.from_texts([''], embeddings, metadatas=[{"none": 1}])
def upsert(text, meta):
    """Add a text with its metadata to the store, then save the store.

    ``clear_existing`` is called with ``meta['id']`` first; it blanks any
    document already stored under that id and returns the docstore key
    to use for the new entry.

    Args:
        text: string
        meta: dict, must contain id.
    Returns:
        True, always.
    Raises:
        KeyError: if ``meta`` is a dict without an ``'id'`` entry.
    """
    doc_key = clear_existing(meta['id'])
    texts, metadatas, keys = [text], [meta], [doc_key]
    db.add_texts(texts, metadatas=metadatas, ids=keys)
    # Save to db_path after every insert.
    db.save_local(db_path)
    return True
def total():
    """Return the number of entries in the store's index-to-docstore-id map.

    NOTE(review): documents blanked by ``clear_existing`` are never removed
    in this file, so they are presumably still counted here -- confirm.
    """
    id_map = db.index_to_docstore_id
    return len(id_map)
def search(text, size=4, offset=0):
    """Return the metadata dicts of documents similar to ``text``.

    Only documents whose metadata contains an ``'id'`` key are returned.
    Documents whose metadata has neither ``'id'`` nor ``'none'`` are
    counted as cleared entries; when the query came back with at least
    ``size`` hits and some of them were cleared, the search is repeated
    with a larger ``size`` to fetch that many extra hits.

    Args:
        text: query string.
        size: number of hits to request from the store.
        offset: number of leading hits to skip; the recursive call uses
            it to skip the hits already examined.
    Returns:
        list of metadata dicts.
    """
    hits = db.similarity_search(text, size)
    got_full_page = len(hits) >= size
    window = hits[offset:]

    found = [hit.metadata for hit in window if 'id' in hit.metadata]
    cleared = sum(
        1
        for hit in window
        if 'id' not in hit.metadata and 'none' not in hit.metadata
    )

    # Fewer hits than requested, or no cleared hit in this window:
    # nothing more to fetch.
    if not got_full_page or cleared == 0:
        return found

    # Ask for `cleared` additional hits and skip the first `size` ones.
    return found + search(text, size + cleared, size)
# def FAISS_append_txt(text, meta, id, doc): | |
# """ hack """ | |
# doc.page_content = text | |
# doc.metadata = meta | |
# emb = [db.embedding_function(text)] | |
# vector = np.array(emb, dtype=np.float32) | |
# if db._normalize_L2: | |
# faiss = db.dependable_faiss_import() | |
# faiss.normalize_L2(vector) | |
# db.index.add(vector).update() | |
# starting_len = len(self.index_to_docstore_id) | |
# # TODO set db.index_to_docstore_id[old_idx] = '' | |
def clear_existing(id):
    """Blank out documents stored under ``id`` and return an unused key.

    Starting from ``id``, the docstore is queried; each document found has
    its metadata and page content replaced with empty dicts, and the key
    is extended with ``'#'`` before the next lookup. The loop stops at the
    first key for which the docstore returns a falsy value or a string.

    Args:
        id: docstore key to start from (a string; ``'#'`` is appended).
    Returns:
        The first key in the sequence id, id#, id##, ... with no document.
    """
    candidate = id
    found = db.docstore.search(candidate)
    while found and not isinstance(found, str):
        # emb = embed_text(doc.page_content)
        # idx = db.index.assign(emb)
        # db.index.remove_ids()
        found.metadata = {}  # clear meta to disable it
        # NOTE(review): page_content becomes an empty dict, not a string.
        found.page_content = {}
        candidate = candidate + '#'
        found = db.docstore.search(candidate)
    print('[VDB] insert new doc', candidate)
    return candidate
def embed_text(txt):
    """Embed ``txt`` with the store's embedding function.

    Args:
        txt: string to embed.
    Returns:
        float32 numpy array holding one row (the embedding of ``txt``),
        L2-normalized in place when ``db._normalize_L2`` is set.
    """
    # NOTE(review): assumes db.embedding_function is callable on a string,
    # as the original code did -- confirm for the installed langchain.
    vector = np.array([db.embedding_function(txt)], dtype=np.float32)
    if db._normalize_L2:
        # dependable_faiss_import is a module-level helper in
        # langchain.vectorstores.faiss, not a method of the FAISS store;
        # the previous `db.dependable_faiss_import()` call raised
        # AttributeError on this branch.
        from langchain.vectorstores.faiss import dependable_faiss_import

        faiss = dependable_faiss_import()
        faiss.normalize_L2(vector)
    return vector