Spaces:

GAS17
/

ocr

Running

GAS17 committed on Dec 25, 2024

Commit

bf6343f

verified ·

1 Parent(s): 8fd3ccc

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,22 +1,27 @@
 import pytesseract
 from pdf2image import convert_from_path
-def pdf_to_text(pdf_path, output_path):
-    # Convert PDF to list of images
-    pages = convert_from_path(pdf_path, 300)
-    # Extract text from all pages and join them
-    text = ""
     for page in pages:
-        text += pytesseract.image_to_string(page)
-    # Write text to file
-    with open(output_path, "w", encoding="utf-8") as file:
-        file.write(text)
-    print(f"OCR completed. Text saved to {output_path}")
-# Usage
-pdf_path = 'input.pdf'
-output_path = 'output.txt'
-pdf_to_text(pdf_path, output_path)

+import gradio as gr
 import pytesseract
 from pdf2image import convert_from_path
+def extract_text_from_pdf(pdf_file):
+    # Convertir el archivo PDF a imágenes
+    pages = convert_from_path(pdf_file.name, 600)
+    # Extraer texto de cada página
+    text_data = ''
     for page in pages:
+        text = pytesseract.image_to_string(page)
+        text_data += text + '\n'
+    return text_data
+# Crear la interfaz de Gradio
+iface = gr.Interface(
+    fn=extract_text_from_pdf,
+    inputs=gr.inputs.File(label="Sube tu archivo PDF"),
+    outputs="text",
+    title="Extractor de Texto de PDF",
+    description="Sube un archivo PDF escaneado y extrae el texto usando OCR."
+)
+# Ejecutar la interfaz
+iface.launch()