# Directory containing TIFF files | |
INPUT_DIR="atreatiseonlawp00chitgoog_tif" | |
OUTPUT_PDF="output_searchable.pdf" | |
TEMP_DIR="temp_ocr" | |
# Create a temporary directory to store processed files | |
mkdir -p "$TEMP_DIR" | |
# Process each TIFF file | |
for file in "$INPUT_DIR"/*.tif; do | |
# Extract the filename without extension | |
filename=$(basename "$file" .tif) | |
# Run Tesseract on each file and output a PDF for each page | |
tesseract "$file" "$TEMP_DIR/$filename" -l eng pdf | |
done | |
# Combine all individual page PDFs into a single PDF | |
if command -v pdfunite >/dev/null 2>&1; then | |
# If pdfunite is available (from poppler-utils), use it | |
pdfunite "$TEMP_DIR"/*.pdf "$OUTPUT_PDF" | |
else | |
# Fallback to using ImageMagick's `convert` if `pdfunite` isn't available | |
convert "$TEMP_DIR"/*.pdf "$OUTPUT_PDF" | |
fi | |
# Clean up temporary directory | |
rm -r "$TEMP_DIR" | |
echo "Searchable PDF created as $OUTPUT_PDF" |