Spaces:
Sleeping
Sleeping
File size: 2,827 Bytes
8b6c55a 9ebbd2b 8b6c55a 2e73fb1 9ebbd2b 2e73fb1 2769653 8b6c55a 2e73fb1 d8d40d6 858ad43 20e92e5 d8d40d6 d075bff 20e92e5 d8d40d6 2e73fb1 d8d40d6 bb67d5e 16858c9 ae3e58e 4286a1e c444b8a 269e4c2 fc29243 8b6c55a 738d1ca 9ebbd2b a0f2182 9ebbd2b a0f2182 6cf24d5 a7f196b fe13010 9ebbd2b 6cf24d5 9ebbd2b 6cf24d5 8b6c55a ae3e58e 77f0d19 8b6c55a 6cf24d5 269e4c2 e60cc65 5e10a9a 6cf24d5 8b6c55a ae3e58e 6cf24d5 9ebbd2b 3b925dc 9ebbd2b 8b6c55a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 |
from PIL import Image
import pytesseract
import gradio as gr
from datetime import datetime
import os
from flair.data import Sentence
from flair.models import SequenceTagger
from segtok.segmenter import split_single
import pandas as pd
# Load the NER model once at import time so every request reuses the same
# tagger instance. An earlier revision used the identifier "ner-ontonotes".
tagger = SequenceTagger.load("flair/ner-english-ontonotes")

# Not referenced by the code visible in this file; kept in case other code
# relies on the module-level name.
langs = []

# Ask the tesseract CLI which language packs are installed. The first line of
# the output is skipped (presumably the "List of available languages" header
# -- confirm against the installed tesseract version). The pipe is closed
# explicitly, and blank lines are filtered so that neither a missing trailing
# newline nor CRLF line endings corrupt the list.
with os.popen("tesseract --list-langs") as _pipe:
    _listing = _pipe.read()
choices = [line.strip() for line in _listing.splitlines()[1:] if line.strip()]

# Not referenced by the code visible in this file; the UI below builds its own
# gr.Blocks() instance. Kept for backward compatibility.
blocks = gr.Blocks()
def get_named_entities(ocr_text: str):
    """Return the named entities found in *ocr_text*, one per line.

    The text is split into sentences with ``split_single``, each sentence is
    wrapped in a flair ``Sentence`` and tagged in a single ``tagger.predict``
    call. Every span labelled under ``"ner"`` is rendered with ``str()`` and
    the results are joined with newlines.

    Args:
        ocr_text: Raw text to analyse (here, the OCR output).

    Returns:
        A newline-separated string of entity descriptions; an empty string
        when no entity is found.
    """
    sentences = [
        Sentence(chunk, use_tokenizer=True) for chunk in split_single(ocr_text)
    ]
    # predict() annotates the Sentence objects in place.
    tagger.predict(sentences)
    return "\n".join(
        str(span)
        for tagged in sentences
        for span in tagged.get_spans("ner")
    )
def run(image, lang="eng"):
    """Run OCR on *image* and extract named entities from the recognised text.

    Args:
        image: Image handed to ``pytesseract.image_to_string``. The UI in this
            file supplies it from ``gr.Image(type="pil")``.
        lang: Tesseract language code. Any empty value (``None``, ``""`` or
            ``[]``, e.g. from a cleared dropdown) is passed on as ``None`` so
            tesseract uses its default language.

    Returns:
        A ``(ocr_text, named_entities)`` tuple of strings.

    Raises:
        gr.Error: If no image was provided.
    """
    print("Image ", image)
    if image is None:
        # Without this guard pytesseract fails with an opaque TypeError;
        # gr.Error shows a readable message in the UI instead.
        raise gr.Error("Please provide an image before running OCR.")
    # Best-effort debug output: the image object may not carry a ``name``.
    try:
        print("Image name ", image.name)
    except Exception as e:
        print(f"Could not print image filename: {e}")
    # ``lang or None`` also covers None and "" in addition to the original [].
    result = pytesseract.image_to_string(image, lang=lang or None)
    ner = get_named_entities(result)
    return result, ner
def download_output(ocr_text: str, named_entities: str, image_name="ocr_ner_output"):
    """Write the OCR text and the named entities to a timestamped .xlsx file.

    The workbook gets two sheets: "OCR text" (the full text as a single
    Series) and "Named entities" (one row per newline-separated entity).

    Args:
        ocr_text: Recognised text to store.
        named_entities: Newline-separated entity strings.
        image_name: Prefix of the output file name.

    Returns:
        The name of the written file,
        ``"<image_name>_<YYYYmmdd_HHMMSS>.xlsx"``.

    Raises:
        gr.Error: If anything fails while building or writing the workbook.
    """
    try:
        entity_rows = named_entities.split("\n")
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_file = f"{image_name}_{stamp}.xlsx"
        # Sheet name -> data; dict order is the order the sheets are written.
        sheets = {
            "OCR text": pd.Series(ocr_text),
            "Named entities": pd.Series(entity_rows),
        }
        with pd.ExcelWriter(output_file, engine="xlsxwriter") as writer:
            for sheet_name, series in sheets.items():
                series.to_excel(writer, sheet_name=sheet_name)
        return output_file
    except Exception as e:
        raise gr.Error(f"Something went wrong: here's the error: {e}")
# Build the UI: an input column (image, language, Run button) next to two
# output columns (OCR text, named entities), with a download row underneath.
with gr.Blocks() as demo:
    gr.Markdown("## Theatre Programmer")

    with gr.Row():
        with gr.Column():
            image_in = gr.Image(type="pil", mirror_webcam=False)
            lang = gr.Dropdown(choices, value="eng", label="Select language")
            btn = gr.Button("Run")
        with gr.Column():
            ocr_text = gr.TextArea(label="OCR output")
        with gr.Column():
            ner = gr.TextArea(label="Named entities")
        # Disabled alternative: show the entities in a checkbox group.
        # with gr.Column():
        #     gr.CheckboxGroup(ner, label="Named entities")

    with gr.Row():
        download_btn = gr.Button("Download output")

    # Build-time debug output; failure to read ``name`` is only reported.
    try:
        print("Image name ", image_in.name)
    except Exception as e:
        print(f"Could not print image filename: {e}")

    # Run: OCR the image, then fill both text areas.
    btn.click(
        fn=run,
        inputs=[image_in, lang],
        outputs=[ocr_text, ner],
    )

    # Download: export the current text areas to a spreadsheet and offer it
    # through a File component.
    download_file = gr.components.File()
    download_btn.click(
        fn=download_output,
        inputs=[ocr_text, ner],
        outputs=[download_file],
    )

demo.launch()
|