GAS17 committed on
Commit
bf6343f
·
verified ·
1 Parent(s): 8fd3ccc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -15
app.py CHANGED
@@ -1,22 +1,27 @@
 
1
  import pytesseract
2
  from pdf2image import convert_from_path
3
 
4
def pdf_to_text(pdf_path, output_path):
    """Run OCR over every page of a PDF and save the result as UTF-8 text.

    The PDF at ``pdf_path`` is rendered to one image per page with
    ``convert_from_path`` (called with 300 as its second argument), each
    image is passed to ``pytesseract.image_to_string``, and the page texts
    are concatenated without any separator and written to ``output_path``.
    A confirmation line is printed once the file has been written.
    """
    page_images = convert_from_path(pdf_path, 300)

    # OCR each rendered page in order; the pieces are joined back-to-back.
    ocr_chunks = [pytesseract.image_to_string(image) for image in page_images]

    with open(output_path, "w", encoding="utf-8") as out_file:
        out_file.write("".join(ocr_chunks))

    print(f"OCR completed. Text saved to {output_path}")
 
 
 
 
 
18
 
19
# Example run: OCR 'input.pdf' and store the recognised text in 'output.txt'.
pdf_path, output_path = 'input.pdf', 'output.txt'
pdf_to_text(pdf_path, output_path)
 
1
+ import gradio as gr
2
  import pytesseract
3
  from pdf2image import convert_from_path
4
 
5
def extract_text_from_pdf(pdf_file, dpi=600):
    """Extract the text of a scanned PDF using OCR.

    Args:
        pdf_file: The uploaded PDF. Either an object that exposes the file
            path through a ``name`` attribute or a plain path string.
            ``None`` (nothing was uploaded) is accepted.
        dpi: Resolution passed to ``convert_from_path`` when rendering the
            pages to images. Defaults to 600, the value that was previously
            hard-coded.

    Returns:
        The recognised text of every page, each page followed by a newline.
        An empty string is returned when no file was supplied.
    """
    # Submitting the form without a file hands the callback None; return an
    # empty result instead of failing on the attribute lookup below.
    if pdf_file is None:
        return ''

    # Accept both file-like uploads (path in ``.name``) and plain path strings.
    pdf_path = getattr(pdf_file, 'name', pdf_file)

    # Render the PDF into one image per page.
    pages = convert_from_path(pdf_path, dpi)

    # OCR each page and terminate its text with a newline.
    return ''.join(pytesseract.image_to_string(page) + '\n' for page in pages)
16
+
17
# Build the Gradio interface: a single PDF upload in, the extracted text out.
iface = gr.Interface(
    fn=extract_text_from_pdf,
    # gr.File is the current component class; the legacy gr.inputs.File
    # namespace was deprecated in Gradio 3 and removed in Gradio 4.
    inputs=gr.File(label="Sube tu archivo PDF"),
    outputs="text",
    title="Extractor de Texto de PDF",
    description="Sube un archivo PDF escaneado y extrae el texto usando OCR.",
)

# Start the web server that hosts the interface.
iface.launch()