Spaces:
Sleeping
Sleeping
################# cnocr ################## | |
from cnocr import CnOcr | |
from pdfquery import PDFQuery | |
import openai | |
import json | |
from dotenv import load_dotenv | |
import os | |
def validate(text):
    """Return *text* with every space and comma removed.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` that contains no ``' '`` or ``','`` characters.
    """
    invalid_list = [' ', ',']
    for char in invalid_list:
        text = text.replace(char, '')
    # Return only after every unwanted character has been stripped.
    return text
def check_bank(text):
    """Find which known bank keyword occurs in *text*.

    Spaces are removed from ``text`` before matching, so ``"hang seng"``
    matches the keyword ``hangseng``. Matching is case-sensitive and the
    keywords are tried in list order; the first one found wins.

    Args:
        text: The string to search.

    Returns:
        The matching keyword (``'bankofchina'``, ``'hangseng'``, ``'hsbc'``
        or ``'sc'``), or ``False`` when none of them occurs in ``text``.
    """
    text = text.replace(' ', '')
    bank_list = ['bankofchina', 'hangseng', 'hsbc', 'sc']
    for bank in bank_list:
        if bank in text:
            return bank
    # Give up only after every keyword has been tried; returning False from
    # inside the loop would stop after the first non-matching bank.
    return False
def check_bank_name(img_path):
    """Guess the issuing bank from a statement file's path.

    Expected file-name patterns:
        BOCH               - "Consolidated Statement 2023-01-01"
        HangSeng           - "Statement of Prestige Banking 2023-03-0" or
                             "Statement of Preferred Banking 2023-03-07"
        HSBC               - "Statement - HSBC One Account 2023-02-10"
        Standard Chartered - "statementOfAccount 2023-02-01"

    A bank matches when either its short key (e.g. ``'hsbc'``) or its
    standard statement title occurs in ``str(img_path)``. Matching is
    case-sensitive and banks are tried in the order listed below, so the
    first hit wins.

    Args:
        img_path: Path of the statement file; it is converted with ``str()``
            before matching.

    Returns:
        One of ``'boch'``, ``'hangseng'``, ``'hsbc'`` or ``'sc'``, or ``None``
        when nothing matches.
    """
    standard_names = {
        'boch': "Consolidated Statement",
        'hangseng': "Statement of",
        'hsbc': "Statement - HSBC One Account",
        'sc': "statementOfAccount",
    }
    # Convert once instead of on every comparison.
    path_text = str(img_path)
    for bank_name, title in standard_names.items():
        if bank_name in path_text or title in path_text:
            return bank_name
    return None
def check_mr(text):
    """Strip a leading courtesy title (Mr/Ms/Miss/Mrs) from a name.

    When the first whitespace-separated word is one of the titles
    (case-insensitive, with or without a trailing period), the remaining
    words are returned lower-cased and joined WITHOUT separators, e.g.
    ``"Mr Chan Tai Man"`` becomes ``"chantaiman"``. Otherwise ``text`` is
    returned unchanged.

    Args:
        text: The name to normalise. Values that are not strings are
            returned as-is instead of raising.

    Returns:
        The normalised name, or the original value when no title is found.
    """
    if not isinstance(text, str):
        # Nothing to split on, e.g. a null field from parsed JSON.
        return text
    openings = ['mr', 'ms', 'miss', 'mrs']
    words = text.lower().split()
    # rstrip('.') lets "Mr." / "Mrs." count as titles too.
    if words and words[0].rstrip('.') in openings:
        return ''.join(words[1:])
    return text
def get_info_from_bank(img_path):
    """OCR a bank-statement image and extract key fields with an LLM.

    Steps:
        1. Run CnOcr (``densenet_lite_136-gru`` recognition model) on
           ``img_path``.
        2. Keep the recognised text of every item, dropping items whose text
           is exactly ``' '`` or ``','``.
        3. Send the text list to OpenAI ``gpt-3.5-turbo`` (temperature 0)
           with a prompt asking for bank, nameStatement, address,
           totalAsset, totalLiability and statementDate.
        4. Parse the reply as JSON and strip any courtesy title from
           ``nameStatement`` via ``check_mr``.

    The OpenAI API key is read from the ``data-extraction-api`` environment
    variable after ``load_dotenv()`` has been called.

    Args:
        img_path: Image to recognise; passed straight to ``CnOcr.ocr``.

    Returns:
        The object parsed from the model's JSON reply, with
        ``nameStatement`` normalised.

    Raises:
        json.JSONDecodeError: If the model's reply is not valid JSON.
        KeyError: If the parsed reply has no ``nameStatement`` entry.
    """
    # Running the model
    ocr = CnOcr(rec_model_name='densenet_lite_136-gru')
    out = ocr.ocr(img_path)

    load_dotenv()
    openai.api_key = os.environ.get("data-extraction-api")

    # Drop OCR items that consist solely of a space or a comma.
    invalid_list = [' ', ',']
    data_set_1 = [item['text'] for item in out if item['text'] not in invalid_list]

    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        temperature=0,
        messages=[
            {"role": "system", "content": "You are an AI assistant for extracting data with following names(bank, nameStatement, address, totalAsset (only HKD and represent as one number), totalLiability, statementDate) from bank statements. Uppercase and lowercase letters are the same. Store the results in dictionary format"},
            {"role": "user", "content": f"Extract data from the following 2 sets of text: {data_set_1}. (1.) Data that locate in the front part of the text: customer full name (it should be a Chinese name in English spelling and two to three words), address in Hong Kong (including flat, floor, court/estate, region in Hong Kong), bank name, bank statement issue date (verly likely to be within 1-2 years), (2.) Data that mainly locate in the other part of the text: total asset (including investments and deposits) and total liability (often contains DR and includes credit card but might be zero) of the current month."},
        ]
    )

    # The reply text is expected to be a JSON object holding the fields.
    data = completion['choices'][0]['message']['content']
    bs_data = json.loads(data)
    bs_data["nameStatement"] = check_mr(bs_data["nameStatement"])
    return bs_data