chatroles-vdb / faiss_vdb.py
pond918
feat: embed interface
8fdb6b2
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
import os
import numpy as np
"""
for development only.
"""
db_path = './data'
embeddings = HuggingFaceEmbeddings(model_name='all-mpnet-base-v2')
# TODO clear empty entry on load
db = FAISS.load_local(db_path, embeddings) if os.path.exists(
db_path) else FAISS.from_texts([''], embeddings, metadatas=[{"none": 1}])
def upsert(text, meta):
"""
Args:
text: string
meta: dict, must contain id.
Returns:
total docs count
"""
id = clear_existing(meta['id'])
db.add_texts([text], metadatas=[meta], ids=[id])
db.save_local(db_path)
return True
def total():
return len(db.index_to_docstore_id)
def search(text, size=4, offset=0):
docs = db.similarity_search(text, size)
full = size <= len(docs)
docs = docs[offset:]
data, deled = [], 0
for doc in docs:
if 'id' in doc.metadata: data.append(doc.metadata)
elif 'none' not in doc.metadata: deled = deled + 1
if (full and deled > 0):
data = data + search(text, size + deled, size)
return data
# def FAISS_append_txt(text, meta, id, doc):
# """ hack """
# doc.page_content = text
# doc.metadata = meta
# emb = [db.embedding_function(text)]
# vector = np.array(emb, dtype=np.float32)
# if db._normalize_L2:
# faiss = db.dependable_faiss_import()
# faiss.normalize_L2(vector)
# db.index.add(vector).update()
# starting_len = len(self.index_to_docstore_id)
# # TODO set db.index_to_docstore_id[old_idx] = ''
def clear_existing(id):
while True:
doc = db.docstore.search(id)
if not doc or isinstance(doc, str):
print('[VDB] insert new doc', id)
return id
# emb = embed_text(doc.page_content)
# idx = db.index.assign(emb)
# db.index.remove_ids()
doc.metadata = {} # clear meta to disable it
doc.page_content = {}
id = id + '#'
def embed_text(txt):
emb = [db.embedding_function(txt)]
vector = np.array(emb, dtype=np.float32)
if db._normalize_L2:
faiss = db.dependable_faiss_import()
faiss.normalize_L2(vector)
return vector