import asyncio
import logging
from typing import Any, Dict, List, Optional

import aiohttp
import faiss
from bs4 import BeautifulSoup

from config.config import settings

logger = logging.getLogger(__name__)


class FAQService:
    """Fetch FAQ pages, embed their Q/A pairs, and serve FAISS similarity search."""

    def __init__(self, model_service):
        self.embedder = model_service.embedder
        self.faiss_index = None  # built lazily by index_faqs()
        self.faq_data: List[Dict[str, Any]] = []  # metadata parallel to the index rows

    async def fetch_faq_pages(self) -> List[Optional[Dict[str, Any]]]:
        """Discover FAQ URLs via the sitemap and fetch their content concurrently."""
        async with aiohttp.ClientSession() as session:
            try:
                sitemap_url = f"{settings.FAQ_ROOT_URL}sitemap.xml"
                async with session.get(sitemap_url, timeout=settings.TIMEOUT) as response:
                    if response.status != 200:
                        logger.error(f"Sitemap request failed with status {response.status}")
                        return []
                    sitemap = await response.text()

                soup = BeautifulSoup(sitemap, 'xml')
                faq_urls = [loc.text for loc in soup.find_all('loc') if "/faq/" in loc.text]

                tasks = [self.fetch_faq_content(url, session) for url in faq_urls]
                return await asyncio.gather(*tasks)
            except Exception as e:
                logger.error(f"Error fetching FAQ sitemap: {e}")
                return []

    async def fetch_faq_content(self, url: str, session: aiohttp.ClientSession) -> Optional[Dict[str, Any]]:
        """Fetch a single FAQ page and extract its title and question/answer pairs."""
        try:
            async with session.get(url, timeout=settings.TIMEOUT) as response:
                if response.status != 200:
                    logger.error(f"Request to {url} failed with status {response.status}")
                    return None
                content = await response.text()

            soup = BeautifulSoup(content, 'html.parser')
            title_tag = soup.find('h1')
            faq_title = title_tag.text.strip() if title_tag else "Unknown Title"

            faqs = []
            for section in soup.find_all(['div', 'section'], class_=['faq-item', 'faq-section']):
                question_tag = section.find(['h2', 'h3'])
                answer_tag = section.find('p')
                question = question_tag.text.strip() if question_tag else None
                answer = answer_tag.text.strip() if answer_tag else None
                # Keep only sections with both a question heading and an answer paragraph.
                if question and answer:
                    faqs.append({"question": question, "answer": answer})

            return {"url": url, "title": faq_title, "faqs": faqs}
        except Exception as e:
            logger.error(f"Error fetching FAQ content from {url}: {e}")
            return None

    async def index_faqs(self):
        """Fetch all FAQ pages and build a FAISS index over their Q/A embeddings."""
        faq_pages = await self.fetch_faq_pages()
        faq_pages = [page for page in faq_pages if page]

        self.faq_data = []
        all_texts = []
        for faq_page in faq_pages:
            for item in faq_page['faqs']:
                # Embed question and answer together so a query can match either.
                all_texts.append(f"{item['question']} {item['answer']}")
                self.faq_data.append({
                    "question": item['question'],
                    "answer": item['answer'],
                    "source": faq_page['url'],
                })

        if not all_texts:
            logger.warning("No FAQ entries found; skipping index build")
            return

        embeddings = self.embedder.encode(all_texts, convert_to_tensor=True).cpu().detach().numpy()
        self.faiss_index = faiss.IndexFlatL2(embeddings.shape[1])
        self.faiss_index.add(embeddings)

    async def search_faqs(self, query: str, top_k: int = 5) -> List[Dict[str, Any]]:
        """Return the top_k indexed FAQ entries most similar to the query."""
        if self.faiss_index is None:
            await self.index_faqs()
        if self.faiss_index is None:
            # Indexing found nothing to embed, so there is nothing to search.
            return []

        query_embedding = self.embedder.encode([query], convert_to_tensor=True).cpu().detach().numpy()
        distances, indices = self.faiss_index.search(query_embedding, top_k)

        results = []
        for i, idx in enumerate(indices[0]):
            # FAISS pads with -1 when the index holds fewer than top_k vectors.
            if 0 <= idx < len(self.faq_data):
                result = self.faq_data[idx].copy()
                result["score"] = float(distances[0][i])  # L2 distance; lower is better
                results.append(result)

        return results
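

if __name__ == "__main__":
    # Usage sketch, not part of the service itself. FAQService only needs an
    # object exposing a sentence-embedding model as `.embedder`; the
    # _DemoModelService wrapper and the model name below are illustrative
    # assumptions, not something this module or its config defines.
    from sentence_transformers import SentenceTransformer

    class _DemoModelService:
        def __init__(self):
            self.embedder = SentenceTransformer("all-MiniLM-L6-v2")

    async def _demo():
        service = FAQService(_DemoModelService())
        results = await service.search_faqs("How do I reset my password?")
        for r in results:
            print(f"{r['score']:.3f}  {r['question']}  ({r['source']})")

    asyncio.run(_demo())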