File size: 2,827 Bytes
8b6c55a
 
 
9ebbd2b
8b6c55a
2e73fb1
 
 
9ebbd2b
2e73fb1
2769653
 
8b6c55a
 
 
 
 
 
 
 
2e73fb1
 
 
 
d8d40d6
 
858ad43
 
20e92e5
d8d40d6
d075bff
20e92e5
d8d40d6
2e73fb1
 
 
d8d40d6
bb67d5e
16858c9
ae3e58e
4286a1e
 
c444b8a
269e4c2
 
fc29243
8b6c55a
 
738d1ca
9ebbd2b
 
a0f2182
9ebbd2b
 
a0f2182
 
 
 
6cf24d5
a7f196b
fe13010
 
9ebbd2b
6cf24d5
9ebbd2b
 
6cf24d5
 
8b6c55a
 
 
 
ae3e58e
77f0d19
8b6c55a
 
6cf24d5
269e4c2
e60cc65
5e10a9a
 
6cf24d5
 
8b6c55a
ae3e58e
 
 
 
 
6cf24d5
9ebbd2b
 
3b925dc
9ebbd2b
 
8b6c55a
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
from PIL import Image
import pytesseract
import gradio as gr
from datetime import datetime
import os
from flair.data import Sentence
from flair.models import SequenceTagger
from segtok.segmenter import split_single
import pandas as pd

# tagger = SequenceTagger.load("ner-ontonotes")
# Load the Flair OntoNotes NER model once at import time; get_named_entities()
# reuses this single instance for every request.
tagger = SequenceTagger.load("flair/ner-english-ontonotes")

# NOTE(review): not referenced anywhere in the visible file; kept so that any
# external importer of this name keeps working.
langs = []

# Ask the local tesseract binary which languages are installed; these populate
# the language dropdown. The pipe is opened in a `with` block so it is closed
# deterministically instead of being leaked.
with os.popen("tesseract --list-langs") as _tesseract_pipe:
    _tesseract_output = _tesseract_pipe.read()

# The first line is skipped (presumably the "List of available languages"
# header tesseract prints — verify against the installed version). Blank lines
# are filtered out rather than blindly dropping the last element, so a missing
# trailing newline no longer loses a real language.
choices = [
    line.strip() for line in _tesseract_output.split("\n")[1:] if line.strip()
]

# NOTE(review): this Blocks instance is not used by the UI built below (which
# creates its own `demo`); kept to preserve the module's public names.
blocks = gr.Blocks()


def get_named_entities(ocr_text: str):
    """Return the named entities found in *ocr_text*, one per line.

    The text is split into sentences with ``split_single``, each sentence is
    wrapped in a Flair ``Sentence``, and the module-level ``tagger`` annotates
    them in place. Every "ner" span is then rendered with ``str()`` and the
    results are joined with newlines.
    """
    sentences = [
        Sentence(chunk, use_tokenizer=True) for chunk in split_single(ocr_text)
    ]
    # predict() mutates the Sentence objects, attaching the NER spans to them.
    tagger.predict(sentences)

    return "\n".join(
        str(span)
        for tagged_sentence in sentences
        for span in tagged_sentence.get_spans("ner")
    )


def run(image, lang="eng"):
    """Run OCR on *image* and extract named entities from the recognised text.

    Args:
        image: Image handed to ``pytesseract.image_to_string``. The UI in this
            file supplies a PIL image, or ``None`` when nothing was uploaded.
        lang: Tesseract language code. Any falsy value (``None``, ``""`` or
            ``[]``, e.g. from a cleared dropdown) is passed on as ``None`` so
            no explicit language is requested.

    Returns:
        A ``(ocr_text, named_entities)`` tuple of strings.

    Raises:
        gr.Error: If no image was provided.
    """
    print("Image ", image)

    # Fail with a readable UI message instead of letting pytesseract raise an
    # opaque error on a missing image.
    if image is None:
        raise gr.Error("Please provide an image before running OCR.")

    # Best-effort debug logging only: it must never prevent the OCR from
    # running, hence the deliberately broad except.
    try:
        print("Image name ", image.name)
    except Exception as e:
        print(f"Could not print image filename: {e}")

    result = pytesseract.image_to_string(image, lang=lang if lang else None)

    ner = get_named_entities(result)
    return result, ner


def download_output(ocr_text: str, named_entities: str, image_name="ocr_ner_output"):
    """Write the OCR text and named entities to a timestamped Excel workbook.

    The workbook is created in the current working directory and contains two
    sheets: "OCR text" (the full text in a single row) and "Named entities"
    (one entity per row).

    Args:
        ocr_text: Text recognised by the OCR step; ``None`` is treated as "".
        named_entities: Newline-separated entities; ``None`` is treated as "".
        image_name: Prefix of the output file name.

    Returns:
        The name of the ``.xlsx`` file that was written.

    Raises:
        gr.Error: If anything goes wrong while building or writing the
            workbook; the original exception is chained as the cause.
    """
    try:
        # Skip blank lines so an empty entity box does not yield a spurious
        # empty row ("".split("\n") would otherwise give [""]).
        named_entities_list = [
            line for line in (named_entities or "").split("\n") if line.strip()
        ]

        datetime_now = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_file = f"{image_name}_{datetime_now}.xlsx"

        ocr_df = pd.Series(ocr_text or "")
        # Explicit dtype keeps an empty entity list from triggering pandas'
        # ambiguous-dtype handling for empty Series.
        ner_df = pd.Series(named_entities_list, dtype="object")

        with pd.ExcelWriter(output_file, engine="xlsxwriter") as writer:
            ocr_df.to_excel(writer, sheet_name="OCR text")
            ner_df.to_excel(writer, sheet_name="Named entities")
        return output_file

    except Exception as e:
        # Surface the failure in the Gradio UI while keeping the original
        # traceback available through exception chaining.
        raise gr.Error(f"Something went wrong: here's the error: {e}") from e


# Build the Gradio UI. Components are laid out in the order they are created
# inside the nested Row/Column context managers, so statement order matters.
with gr.Blocks() as demo:
    gr.Markdown("## Theatre Programmer")
    # Top row: input controls | OCR output | named-entity output.
    with gr.Row():
        with gr.Column():
            image_in = gr.Image(type="pil", mirror_webcam=False)
            # `choices` is the language list read from tesseract at import time.
            lang = gr.Dropdown(choices, value="eng", label="Select language")
            btn = gr.Button("Run")
        with gr.Column():
            ocr_text = gr.TextArea(label="OCR output")
        with gr.Column():
            ner = gr.TextArea(label="Named entities")
        # NOTE(review): disabled alternative that would show the entities as a
        # checkbox group instead of plain text.
        # with gr.Column():
        #     gr.CheckboxGroup(ner, label="Named entities")
    # Bottom row: export button.
    with gr.Row():
        download_btn = gr.Button("Download output")

    # Debug output only. This executes once while the UI is being built (not
    # per request) and reads an attribute of the component itself, not of an
    # uploaded image. NOTE(review): whether the component exposes `.name` is
    # not visible here; any failure is caught and printed.
    try:
        print("Image name ", image_in.name)
    except Exception as e:
        print(f"Could not print image filename: {e}")

    # "Run": OCR the image in the selected language and fill both text areas.
    btn.click(fn=run, inputs=[image_in, lang], outputs=[ocr_text, ner])
    # "Download output": export the two text areas to an Excel file. Only two
    # inputs are wired, so download_output() always uses its default
    # `image_name`. The File component receiving the result is created inline.
    download_btn.click(
        fn=download_output,
        inputs=[ocr_text, ner],
        outputs=[gr.components.File()],
    )

# Start the app as soon as this module is executed.
demo.launch()