Chris4K committed on
Commit
68a1536
·
verified ·
1 Parent(s): f91204a

Create pdf_service.py

Browse files
Files changed (1) hide show
  1. services/pdf_service.py +84 -0
services/pdf_service.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # services/pdf_service.py
2
+ from pathlib import Path
3
+ from typing import List, Dict, Any
4
+ from PyPDF2 import PdfReader
5
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
6
+ import asyncio
7
+ from concurrent.futures import ThreadPoolExecutor
8
+ import logging
9
+ from config.config import settings
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
class PDFService:
    """Split PDF files into text chunks and run similarity search over them.

    Chunks are kept in ``self.pdf_chunks``; a FAISS ``IndexFlatL2`` over the
    chunk embeddings is built lazily on the first search and cached in
    ``self.faiss_index``.
    """

    def __init__(self, model_service):
        """Store the embedder from *model_service* and set up the splitter.

        Args:
            model_service: Object exposing an ``embedder`` attribute whose
                ``encode`` method is used to embed queries and chunks.
        """
        self.embedder = model_service.embedder
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=settings.CHUNK_SIZE,
            chunk_overlap=settings.CHUNK_OVERLAP,
        )
        self.pdf_chunks = []
        self.faiss_index = None

    def _extract_chunks(self, pdf_file: Path) -> List[Dict[str, Any]]:
        """Read one PDF and return its text split into chunk records.

        Runs synchronously so it can be handed to a worker thread. Any error
        while reading or splitting is logged and yields an empty list, so one
        bad file does not abort the whole indexing run.
        """
        try:
            reader = PdfReader(str(pdf_file))
            metadata = reader.metadata
            page_texts = []
            for page in reader.pages:
                # Extract once per page; pages with no text are skipped.
                text = page.extract_text()
                if text:
                    page_texts.append(text)
            chunks = self.text_splitter.split_text(" ".join(page_texts))
        except Exception as e:
            logger.error(f"Error processing PDF {pdf_file}: {e}")
            return []

        return [
            {
                'text': chunk,
                'source': pdf_file.name,
                'metadata': metadata,
                'chunk_index': i,
            }
            for i, chunk in enumerate(chunks)
        ]

    async def index_pdfs(self, pdf_folder: Path = settings.PDF_FOLDER) -> List[Dict[str, Any]]:
        """Chunk every ``.pdf`` file directly inside *pdf_folder*.

        Args:
            pdf_folder: Directory to scan (non-recursive). Defaults to
                ``settings.PDF_FOLDER``.

        Returns:
            A list of dicts with keys ``text``, ``source``, ``metadata`` and
            ``chunk_index``. The same list is stored on ``self.pdf_chunks``.
        """
        folder = Path(pdf_folder)
        pdf_files = [f for f in folder.iterdir() if f.suffix.lower() == ".pdf"]

        # ThreadPoolExecutor is a regular (not async) context manager; the
        # blocking PDF parsing is pushed onto its threads via run_in_executor
        # so the event loop stays responsive.
        loop = asyncio.get_running_loop()
        with ThreadPoolExecutor() as executor:
            tasks = [
                loop.run_in_executor(executor, self._extract_chunks, pdf_file)
                for pdf_file in pdf_files
            ]
            results = await asyncio.gather(*tasks)

        all_texts = [chunk for result in results for chunk in result]

        self.pdf_chunks = all_texts
        # Any previously built index refers to the old chunk list.
        self.faiss_index = None
        return all_texts

    async def search_pdfs(self, query: str, top_k: int = 5) -> List[Dict[str, Any]]:
        """Return up to *top_k* chunks closest to *query*.

        Indexes the default PDF folder first if no chunks are loaded yet.

        Args:
            query: Text to search for.
            top_k: Maximum number of results.

        Returns:
            Copies of the matching chunk dicts, each with an added ``score``
            key holding the distance reported by the FAISS search (smaller is
            closer). Empty if there is nothing to search.
        """
        if not self.pdf_chunks:
            await self.index_pdfs()

        # Nothing indexed (empty folder / all PDFs failed) or nothing requested.
        if not self.pdf_chunks or top_k <= 0:
            return []

        query_embedding = self.embedder.encode(
            [query], convert_to_tensor=True
        ).cpu().detach().numpy()

        # Create embeddings for chunks if not already done
        if self.faiss_index is None:
            # Imported here because the module never imports faiss at top level.
            import faiss

            chunk_embeddings = self.embedder.encode(
                [chunk['text'] for chunk in self.pdf_chunks],
                convert_to_tensor=True,
            ).cpu().detach().numpy()

            d = chunk_embeddings.shape[1]
            self.faiss_index = faiss.IndexFlatL2(d)
            self.faiss_index.add(chunk_embeddings)

        # Asking FAISS for more neighbours than vectors pads the result with
        # -1 labels, which would otherwise index the last chunk by accident.
        k = min(top_k, len(self.pdf_chunks))
        distances, indices = self.faiss_index.search(query_embedding, k)

        results = []
        for distance, idx in zip(distances[0], indices[0]):
            if idx < 0:
                continue
            chunk = self.pdf_chunks[idx].copy()
            chunk['score'] = float(distance)
            results.append(chunk)

        return results
84
+