ebookify-backend2 / pdf_to_image.py
Geetansh
changed pytesseract code as tesseract-ocr would be available in PATH in hf spaces
52916f3
raw
history blame
1.16 kB
from pdf2image import convert_from_path
from pdf2image.exceptions import (
PDFInfoNotInstalledError,
PDFPageCountError,
PDFSyntaxError
)
# poppler_path = r"./Poppler/poppler-24.07.0/Library/bin"
# def pdfToImg(pdfPath, outputPath):
# '''
# 1)Images stored in output folder
# 2)It returns path to stored images
# '''
# images_paths = convert_from_path(pdfPath, 200, outputPath, fmt="jpeg", poppler_path=poppler_path, paths_only=True)
# return images_paths
# def pdfToImg2(pdfPath):
# '''
# 1)Returns a list of Pillow images
# '''
# images = convert_from_path(pdfPath, 200, fmt="jpeg", poppler_path=poppler_path)
# return images
# Version of the code above adapted for deployment on Hugging Face Spaces:
# no explicit poppler_path is passed to convert_from_path.
def pdfToImg(pdfPath, outputPath, dpi=200):
    """Convert a PDF into JPEG files stored in ``outputPath``.

    Args:
        pdfPath: Path of the PDF file to convert.
        outputPath: Folder in which the generated images are stored. It is
            created first if it does not exist yet.
        dpi: Rendering resolution handed to pdf2image. Defaults to 200, the
            value that was previously hard-coded.

    Returns:
        The paths of the stored images, as returned by
        ``convert_from_path(..., paths_only=True)``.

    Exceptions raised by pdf2image are not caught here and propagate to the
    caller.
    """
    # Local import keeps this function self-contained; the module itself
    # only imports pdf2image names.
    import os

    # The images are written into this folder, so make sure it exists.
    # Falsy values (None, "") are passed through untouched to preserve the
    # previous behaviour for those inputs.
    if outputPath:
        os.makedirs(outputPath, exist_ok=True)

    return convert_from_path(
        pdfPath,
        dpi=dpi,
        output_folder=outputPath,
        fmt="jpeg",
        paths_only=True,
    )
def pdfToImg2(pdfPath, dpi=200):
    """Convert a PDF into a list of Pillow images.

    Args:
        pdfPath: Path of the PDF file to convert.
        dpi: Rendering resolution handed to pdf2image. Defaults to 200, the
            value that was previously hard-coded.

    Returns:
        A list of Pillow images, as returned by ``convert_from_path``
        (presumably one per page; confirm against the pdf2image docs).

    Exceptions raised by pdf2image are not caught here and propagate to the
    caller.
    """
    return convert_from_path(pdfPath, dpi=dpi, fmt="jpeg")