import io
import logging
import os
import re
import shutil
from typing import Optional

import fitz  # PyMuPDF
from PIL import Image
from fastapi import FastAPI, UploadFile, File, HTTPException
from google.cloud import vision
from pdf2image import convert_from_path
class doc_processing:
    """Preprocesses an uploaded document (PDF or image) into a fixed-size JPEG."""

    def __init__(self, name, id_type, doc_type, f_path):
        self.name = name
        self.id_type = id_type
        self.doc_type = doc_type
        self.f_path = f_path
        # self.o_path = o_path

    def pdf_to_image_scale(self):
        pdf_document = fitz.open(self.f_path)
        # GST documents use the third page (index 2); everything else uses the first page.
        if self.id_type == "gst":
            page_num = 2
        else:
            page_num = 0
        page = pdf_document.load_page(page_num)
        pix = page.get_pixmap()  # Render page as a pixmap (image)
        # Convert pixmap to PIL Image
        image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        pdf_document.close()
        original_width, original_height = image.size
        print("original_width", original_width)
        print("original_height", original_height)
        # Scale to a fixed 1000x1000 target (the original expression
        # (1000 / original_width) * original_width simplifies to 1000).
        new_width, new_height = 1000, 1000
        print("new_width", new_width)
        print("new_height", new_height)
        # resize() returns a new image, so the result must be reassigned.
        image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
        output_path = "processed_images/{}/{}.jpeg".format(self.id_type, self.name)
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        image.save(output_path)
        return {"success": 200, "output_p": output_path}
    def scale_img(self):
        print("path of file", self.f_path)
        image = Image.open(self.f_path).convert("RGB")
        original_width, original_height = image.size
        print("original_width", original_width)
        print("original_height", original_height)
        # Same fixed 1000x1000 target as pdf_to_image_scale.
        new_width, new_height = 1000, 1000
        print("new_width", new_width)
        print("new_height", new_height)
        # resize() returns a new image, so the result must be reassigned.
        image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
        output_path = "processed_images/{}/{}.jpeg".format(self.id_type, self.name)
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        image.save(output_path)
        return {"success": 200, "output_p": output_path}
    def process(self):
        # Dispatch on the file type (case-insensitive).
        if self.doc_type.lower() == "pdf":
            response = self.pdf_to_image_scale()
        else:
            response = self.scale_img()
        return response
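

# Minimal single-document sketch of the class above, assuming a hypothetical local
# file "samples/gst_certificate.pdf"; the looped, multi-document version is in the
# commented block at the end of this file.
# preprocessing = doc_processing("gst_certificate", "gst", "pdf", "samples/gst_certificate.pdf")
# response = preprocessing.process()
# print("response", response)  # {"success": 200, "output_p": "processed_images/gst/gst_certificate.jpeg"}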
# Shared Google Vision client (reuses the google.cloud.vision import above).
vision_client = vision.ImageAnnotatorClient()
def extract_document_number(ocr_text: str, id_type: str) -> Optional[str]:
    """
    Searches the OCR text for a valid document number based on regex patterns.
    For id_type "cin_llpin" it tries CIN first and falls back to LLPIN; for any
    other known id_type (msme, pan, aadhaar) it applies that pattern directly.
    Returns None when no pattern matches.
    """
    patterns = {
        "cin": re.compile(r"([LUu]{1}[0-9]{5}[A-Za-z]{2}[0-9]{4}[A-Za-z]{3}[0-9]{6})"),
        "msme": re.compile(r"(UDYAM-[A-Z]{2}-\d{2}-\d{7})"),
        "llpin": re.compile(r"([A-Z]{3}-[0-9]{4})"),
        # PAN and Aadhaar patterns are anchored, so re.MULTILINE lets them match a
        # single line of the OCR output instead of only the entire text blob.
        "pan": re.compile(r"^[A-Z]{3}[PCHFTBALJGT][A-Z][\d]{4}[A-Z]$", re.MULTILINE),
        "aadhaar": re.compile(r"^\d{12}$", re.MULTILINE),
    }
    if id_type == "cin_llpin":
        # Try CIN first
        match = patterns["cin"].search(ocr_text)
        if match:
            return match.group(0)
        # If CIN is not found, try LLPIN
        match = patterns["llpin"].search(ocr_text)
        if match:
            return match.group(0)
    elif id_type in patterns:
        match = patterns[id_type].search(ocr_text)
        if match:
            return match.group(0)
    return None
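

# A quick sanity check for extract_document_number. The identifiers below are
# made-up, format-valid strings only; they do not refer to real registrations.
# sample_msme = "Certificate No: UDYAM-KA-01-1234567 issued to ..."
# sample_cin = "CIN: U12345KA2020PTC123456, Registered Office ..."
# print(extract_document_number(sample_msme, "msme"))      # -> "UDYAM-KA-01-1234567"
# print(extract_document_number(sample_cin, "cin_llpin"))  # -> "U12345KA2020PTC123456"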
def run_google_vision(file_content: bytes) -> str:
    """
    Uses Google Vision OCR to extract text from binary file content.
    """
    image = vision.Image(content=file_content)
    response = vision_client.text_detection(image=image)
    texts = response.text_annotations
    if texts:
        # The first annotation contains the complete detected text
        return texts[0].description
    return ""
def extract_text_from_file(file_path: str) -> str:
    """
    Reads the file from file_path. If it's a PDF, converts only the first page to an image,
    then runs OCR using Google Vision.
    """
    if file_path.lower().endswith(".pdf"):
        try:
            # Open the PDF file using PyMuPDF (fitz)
            pdf_document = fitz.open(file_path)
            page = pdf_document.load_page(0)  # Load the first page
            pix = page.get_pixmap()  # Render page as an image
            # Convert pixmap to PIL Image
            image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            pdf_document.close()
            # Convert image to bytes for OCR
            img_byte_arr = io.BytesIO()
            image.save(img_byte_arr, format="JPEG")
            file_content = img_byte_arr.getvalue()
        except Exception as e:
            logging.error(f"Error converting PDF to image: {e}")
            return ""
    else:
        with open(file_path, "rb") as f:
            file_content = f.read()
    return run_google_vision(file_content)
def extract_document_number_from_file(file_path: str, id_type: str) -> Optional[str]:
    """
    Extracts the document number for the given id_type (e.g. cin_llpin, msme, pan,
    aadhaar) from the file at file_path, or returns None if nothing matches.
    """
    ocr_text = extract_text_from_file(file_path)
    return extract_document_number(ocr_text, id_type)
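

# End-to-end usage sketch, assuming Google Vision credentials are configured via
# GOOGLE_APPLICATION_CREDENTIALS and a hypothetical local file "uploads/pan_card.jpeg":
# pan_number = extract_document_number_from_file("uploads/pan_card.jpeg", "pan")
# print("extracted PAN:", pan_number)  # None if OCR found no matching pattern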
# files = {
#     "aadhar_file": "/home/javmulla/model_one/test_images_aadhar/test_two.jpg",
#     "pan_file": "/home/javmulla/model_one/test_images_pan/6ea33087.jpeg",
#     "cheque_file": "/home/javmulla/model_one/test_images_cheque/0f81678a.jpeg",
#     "gst_file": "/home/javmulla/model_one/test_images_gst/0a52fbcb_page3_image_0.jpg"
# }
# files = {
#     "aadhar_file": "/home/javmulla/model_one/test_images_aadhar/test_two.jpg",
#     "pan_file": "/home/javmulla/model_one/test_images_pan/6ea33087.jpeg",
#     "cheque_file": "/home/javmulla/model_one/test_images_cheque/0f81678a.jpeg",
#     "gst_file": "test_Images_folder/gst/e.pdf"
# }
# for key, value in files.items():
#     name = value.split("/")[-1].split(".")[0]
#     id_type = key.split("_")[0]
#     doc_type = value.split("/")[-1].split(".")[1]
#     f_path = value
#     preprocessing = doc_processing(name, id_type, doc_type, f_path)
#     response = preprocessing.process()
#     print("response", response)
# id_type, doc_type, f_path