import json
import multiprocessing
import os
from concurrent.futures import ProcessPoolExecutor
from pathlib import Path
from typing import Dict, List, Optional

import pytesseract
from PIL import Image


def process_image(args) -> tuple:
    """
    Run OCR on a single image file and save the recognised text as a .txt file.

    The inputs arrive packed in one tuple so the function can be handed
    directly to ``executor.map``, which passes exactly one item per call.

    Args:
        args: Tuple of (filename, input_dir, output_dir). ``filename`` is
            relative to ``input_dir``; the text is written to
            ``output_dir/<stem of filename>.txt``.

    Returns:
        Tuple of (filename, extracted_text). If anything fails while reading,
        recognising or writing, the second element is ``"ERROR: <message>"``
        instead of the text; the exception is never propagated.
    """
    filename, input_dir, output_dir = args

    try:
        image_path = os.path.join(input_dir, filename)

        # Only the OCR call needs the image; close it before writing output.
        with Image.open(image_path) as img:
            text = pytesseract.image_to_string(img)

        text_filename = Path(filename).stem + '.txt'
        text_path = os.path.join(output_dir, text_filename)
        with open(text_path, 'w', encoding='utf-8') as f:
            f.write(text)

        print(f"Processed: {filename}")
        return filename, text

    except Exception as e:
        # Deliberately broad: one bad image must not abort the whole batch,
        # so the failure is reported in-band and the caller keeps going.
        print(f"Error processing {filename}: {str(e)}")
        return filename, f"ERROR: {str(e)}"


def process_directory(input_dir: str, output_dir: str, max_workers: Optional[int] = None) -> Dict[str, str]:
    """
    Process all JPEG files in a directory and perform OCR using multiple processes.

    Each image is handed to ``process_image`` in a worker process. The
    combined results are also written to ``ocr_results.json`` inside
    ``output_dir``.

    Args:
        input_dir: Directory containing JPEG files. Only direct entries are
            considered; ``.jpg`` / ``.jpeg`` suffixes match case-insensitively.
        output_dir: Directory to save OCR results (created if missing).
        max_workers: Maximum number of worker processes. ``None`` lets
            ``ProcessPoolExecutor`` choose its own default, which is based on
            the machine's CPU count.

    Returns:
        Dictionary mapping filenames to extracted text (or to an
        ``"ERROR: ..."`` string for files that could not be processed).
    """
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    valid_extensions = {'.jpg', '.jpeg'}

    # Sort so the JSON output is reproducible; os.listdir order is arbitrary.
    image_files = sorted(
        f for f in os.listdir(input_dir)
        if Path(f).suffix.lower() in valid_extensions
    )
    work_args = [(f, input_dir, output_dir) for f in image_files]

    results: Dict[str, str] = {}
    if work_args:
        # max_workers is passed through unchanged: when it is None the
        # executor picks a CPU-based default and applies platform limits.
        with ProcessPoolExecutor(max_workers=max_workers) as executor:
            for filename, text in executor.map(process_image, work_args):
                results[filename] = text

    # The summary file is written even when there were no images, so callers
    # can always rely on its presence.
    json_path = os.path.join(output_dir, 'ocr_results.json')
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    return results


def main(argv=None) -> None:
    """
    Command-line entry point: OCR every JPEG in a directory.

    Args:
        argv: Argument list to parse. ``None`` makes argparse read
            ``sys.argv[1:]``, which is what running the file as a script does.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Perform OCR on all JPEG files in a directory')
    parser.add_argument('input_dir', help='Input directory containing JPEG files')
    parser.add_argument('output_dir', help='Output directory for OCR results')
    parser.add_argument('--workers', type=int, default=None,
                        help='Number of worker processes (default: CPU count)')

    args = parser.parse_args(argv)

    results = process_directory(args.input_dir, args.output_dir, args.workers)
    print(f"\nProcessed {len(results)} files. Results saved to {args.output_dir}")


# The guard is required, not just conventional: worker processes may re-import
# this module, and must not re-run the CLI when they do.
if __name__ == "__main__":
    main()