Update app.py
Browse files
app.py
CHANGED
@@ -1,20 +1,19 @@
|
|
1 |
import gradio as gr
|
2 |
from PIL import Image
|
3 |
import pytesseract
|
4 |
-
|
5 |
-
import os
|
6 |
-
|
7 |
-
# Asegúrate de que el ejecutable de Tesseract esté en tu PATH
|
8 |
-
# o especifica la ruta completa
|
9 |
-
# pytesseract.pytesseract.tesseract_cmd = r'<ruta_completa_a_tesseract>'
|
10 |
|
11 |
def ocr_pdf(file):
|
12 |
-
#
|
13 |
-
|
14 |
text = ""
|
15 |
-
for
|
16 |
-
#
|
17 |
-
|
|
|
|
|
|
|
|
|
18 |
return text
|
19 |
|
20 |
# Crea la interfaz de Gradio
|
|
|
1 |
import gradio as gr
|
2 |
from PIL import Image
|
3 |
import pytesseract
|
4 |
+
import fitz # PyMuPDF
|
|
|
|
|
|
|
|
|
|
|
5 |
|
6 |
def ocr_pdf(file) -> str:
    """Run OCR over every page of a PDF and return the recognised text.

    Each page is rasterised with PyMuPDF (``fitz``), wrapped in a PIL image
    and handed to Tesseract through ``pytesseract``.

    Args:
        file: The PDF to read. Either an object exposing the path as a
            ``name`` attribute (what the original code required) or a plain
            path string. ``None`` means no file was supplied.

    Returns:
        The text of all pages concatenated in page order, each page's text
        followed by a newline. An empty string when ``file`` is ``None`` or
        the document has no pages.
    """
    # Nothing to process: return an empty result instead of failing on
    # ``None.name``.
    if file is None:
        return ""

    # Accept both upload wrappers (path stored in ``.name``) and bare paths.
    pdf_path = getattr(file, "name", file)

    page_texts = []
    # The context manager closes the document even if rendering or OCR
    # raises part-way through, so the file handle is never leaked.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            # Render the page to a raster image. The "RGB" mode below relies
            # on get_pixmap()'s defaults producing RGB samples without an
            # alpha channel -- NOTE(review): confirm if arguments are added.
            pix = page.get_pixmap()
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            page_texts.append(pytesseract.image_to_string(img))

    # One newline after every page, matching the original output format.
    return "".join(f"{page_text}\n" for page_text in page_texts)
|
18 |
|
19 |
# Crea la interfaz de Gradio
|