Spaces:

Geetansh01
/

ebookify-backend2

Sleeping

ebookify-backend2 / pdf_to_image.py

Geetansh

changed pytesseract code as tesseract-ocr would be available in PATH in hf spaces

52916f3 3 months ago

1.16 kB

	from pdf2image import convert_from_path
	from pdf2image.exceptions import (
	PDFInfoNotInstalledError,
	PDFPageCountError,
	PDFSyntaxError
	)

	# poppler_path = r"./Poppler/poppler-24.07.0/Library/bin"

	# def pdfToImg(pdfPath, outputPath):
	# '''
	# 1)Images stored in output folder
	# 2)It returns path to stored images
	# '''
	# images_paths = convert_from_path(pdfPath, 200, outputPath, fmt="jpeg", poppler_path=poppler_path, paths_only=True)
	# return images_paths

	# def pdfToImg2(pdfPath):
	# '''
	# 1)Returns a list of Pillow images
	# '''
	# images = convert_from_path(pdfPath, 200, fmt="jpeg", poppler_path=poppler_path)
	# return images

	# Deployment variants of the functions above for Hugging Face Spaces: no explicit poppler_path is passed (presumably Poppler is on PATH there).
	def pdfToImg(pdfPath, outputPath):
	    """Render a PDF to JPEG files in a folder and return the saved paths.

	    Args:
	        pdfPath: Path of the PDF file to convert.
	        outputPath: Folder the images are stored in. It is created,
	            together with any missing parent folders, if it does not
	            exist yet.

	    Returns:
	        The value produced by ``convert_from_path`` when called with
	        ``paths_only=True``, i.e. the paths to the stored images.
	    """
	    import os

	    # Make sure the destination exists before the conversion writes into
	    # it; a falsy outputPath is passed through untouched so the call
	    # behaves as it did before for that case.
	    if outputPath:
	        os.makedirs(outputPath, exist_ok=True)

	    # dpi and output_folder are spelled out by keyword (they were the
	    # second and third positional arguments) so the call is self-describing.
	    return convert_from_path(
	        pdfPath,
	        dpi=200,
	        output_folder=outputPath,
	        fmt="jpeg",
	        paths_only=True,
	    )

	def pdfToImg2(pdfPath):
	    """Convert a PDF in memory and return the result as a list of Pillow images.

	    Args:
	        pdfPath: Path of the PDF file to convert.

	    Returns:
	        The images produced by ``convert_from_path`` at 200 DPI in JPEG
	        format; nothing is written to an output folder by this call.
	    """
	    return convert_from_path(pdfPath, dpi=200, fmt="jpeg")