GAS17 committed on
Commit
b16b8c7
·
verified ·
1 Parent(s): 4f77700

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -25
app.py CHANGED
@@ -1,28 +1,22 @@
1
- import gradio as gr
2
- import fitz # PyMuPDF
3
 
4
- def consultar_pdf(pdf_file, consulta):
5
- # Abrir el archivo PDF
6
- pdf_document = fitz.open(pdf_file.name)
7
 
8
- # Recorrer todas las páginas y extraer el texto
9
- texto_completo = ""
10
- for page_num in range(pdf_document.page_count):
11
- page = pdf_document.load_page(page_num)
12
- texto_completo += page.get_text()
13
-
14
- # Devolver el texto completo del documento
15
- return texto_completo
16
-
17
- # Crear la interfaz de Gradio
18
- iface = gr.Interface(
19
- fn=consultar_pdf,
20
- inputs=[
21
- gr.File(label="Cargar PDF"), # Entrada para cargar el archivo PDF
22
- gr.Textbox(label="Consulta", placeholder="Escribe tu consulta aquí") # Entrada para la consulta
23
- ],
24
- outputs="text" # Salida de texto con el resultado de la consulta
25
- )
26
 
27
- # Lanzar la interfaz
28
- iface.launch()
 
 
 
1
+ import pytesseract
2
+ from pdf2image import convert_from_path
3
 
4
+ def pdf_to_text(pdf_path, output_path):
5
+ # Convert PDF to list of images
6
+ pages = convert_from_path(pdf_path, 300)
7
 
8
+ # Extract text from all pages and join them
9
+ text = ""
10
+ for page in pages:
11
+ text += pytesseract.image_to_string(page)
12
+
13
+ # Write text to file
14
+ with open(output_path, "w", encoding="utf-8") as file:
15
+ file.write(text)
16
+
17
+ print(f"OCR completed. Text saved to {output_path}")
 
 
 
 
 
 
 
 
18
 
19
+ # Usage
20
+ pdf_path = 'input.pdf'
21
+ output_path = 'output.txt'
22
+ pdf_to_text(pdf_path, output_path)