jnorthrup
/

Skywork-o1-Open-PRM-Qwen-2.5-7B

Text Classification

Model card Files Files and versions Community

Skywork-o1-Open-PRM-Qwen-2.5-7B / 2ocr.sh

jnorthrup's picture

Upload 12 files

14daa4c verified 12 days ago

history blame contribute delete

904 Bytes

	#!/bin/bash
	#
	# OCR every TIFF page in a directory with Tesseract and merge the per-page
	# results into one searchable PDF.
	#
	# Usage: 2ocr.sh [INPUT_DIR [OUTPUT_PDF]]
	#   INPUT_DIR   directory containing *.tif pages
	#               (default: atreatiseonlawp00chitgoog_tif)
	#   OUTPUT_PDF  path of the merged PDF (default: output_searchable.pdf)
	#
	# Pages are merged in the shell's glob (lexical) order of their filenames.
	# Requires: tesseract, plus one of pdfunite, qpdf or gs when there is more
	# than one page.
	set -euo pipefail

	# Directory containing TIFF files and the name of the final document.
	INPUT_DIR="${1:-atreatiseonlawp00chitgoog_tif}"
	OUTPUT_PDF="${2:-output_searchable.pdf}"

	# Print a diagnostic to stderr and abort.
	die() {
	  printf 'error: %s\n' "$*" >&2
	  exit 1
	}

	command -v tesseract >/dev/null 2>&1 || die "tesseract is not installed"
	[[ -d "$INPUT_DIR" ]] || die "input directory not found: $INPUT_DIR"

	# Collect the pages up front. nullglob makes an unmatched pattern expand to
	# nothing, so the literal string "*.tif" is never handed to tesseract.
	shopt -s nullglob
	tiffs=("$INPUT_DIR"/*.tif)
	shopt -u nullglob
	(( ${#tiffs[@]} > 0 )) || die "no .tif files found in $INPUT_DIR"

	# Use a fresh private scratch directory: PDFs left behind by an earlier or
	# aborted run can never leak into the merge, and the EXIT trap removes the
	# directory on every exit path, including failures.
	TEMP_DIR=$(mktemp -d "${TMPDIR:-/tmp}/ocr.XXXXXX") \
	  || die "cannot create temporary directory"
	cleanup() { rm -rf -- "${TEMP_DIR:?}"; }
	trap cleanup EXIT

	# Run Tesseract on each file; it writes "<outputbase>.pdf" for each page.
	pages=()
	for file in "${tiffs[@]}"; do
	  # Filename without directory and without the .tif extension.
	  filename=${file##*/}
	  filename=${filename%.tif}

	  tesseract "$file" "$TEMP_DIR/$filename" -l eng pdf \
	    || die "OCR failed for $file"
	  pages+=("$TEMP_DIR/$filename.pdf")
	done

	# Combine all individual page PDFs into a single PDF.
	# ImageMagick's `convert` is deliberately NOT used here: it rasterizes PDF
	# input, which discards the OCR text layer and yields a non-searchable file.
	# Every tool below operates on the PDF structure and keeps the text.
	if (( ${#pages[@]} == 1 )); then
	  # Nothing to merge; the single page already is the final document.
	  cp -- "${pages[0]}" "$OUTPUT_PDF"
	elif command -v pdfunite >/dev/null 2>&1; then
	  # pdfunite comes from poppler-utils.
	  pdfunite "${pages[@]}" "$OUTPUT_PDF"
	elif command -v qpdf >/dev/null 2>&1; then
	  qpdf --empty --pages "${pages[@]}" -- "$OUTPUT_PDF"
	elif command -v gs >/dev/null 2>&1; then
	  # Ghostscript's pdfwrite device re-emits the pages as PDF (text stays text).
	  gs -q -dNOPAUSE -dBATCH -sDEVICE=pdfwrite \
	    -sOutputFile="$OUTPUT_PDF" "${pages[@]}"
	else
	  die "no PDF merge tool found (install poppler-utils, qpdf or ghostscript)"
	fi

	echo "Searchable PDF created as $OUTPUT_PDF"