adit94 commited on
Commit
846b4a5
·
verified ·
1 Parent(s): a1c3649

Upload 9 files

Browse files
helpers/common.py ADDED
File without changes
helpers/entity_extraction_helpers.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from constants import DOCUMENT_COLLECTION
2
+ from openai_constants import ENTITY_EXTRACTION_PROMPT, ENTITY_EXTRACTION_FUNCTION, GPT35_PRICING
3
+
4
+
5
+ def extract_all_documents(openai_instance, chunks):
6
+ all_entities = {}
7
+ all_usage = {}
8
+ total_prompt_tokens = 0
9
+ total_completion_tokens = 0
10
+ print(f"Number of chunks to process :: {len(chunks)}")
11
+ for chunk_idx, chunk in enumerate(chunks):
12
+ print(f"Sending request to OpenAI for {chunk_idx}")
13
+ openai_entities_out = openai_instance.generate_response(ENTITY_EXTRACTION_PROMPT, chunk, ENTITY_EXTRACTION_FUNCTION)
14
+ print("OpenAI out received")
15
+ print(openai_entities_out['function_output'])
16
+ #for ent in openai_entities_out['function_output'].items():
17
+ for key, val in openai_entities_out['function_output'].items():
18
+ print(key, val)
19
+ if key in all_entities:
20
+ if isinstance(val, list):
21
+ all_entities[key].extend(val) # Extend the existing list with the new list
22
+ else:
23
+ all_entities[key].append(val) # Append the value to the existing list
24
+ else:
25
+ if isinstance(val, list):
26
+ all_entities[key] = val # Initialize the key with the list
27
+ else:
28
+ all_entities[key] = [val]
29
+
30
+ if 'prompt_tokens' in openai_entities_out['usage']:
31
+ total_prompt_tokens += openai_entities_out['usage']['prompt_tokens']
32
+ if 'completion_tokens' in openai_entities_out['usage']:
33
+ total_completion_tokens += openai_entities_out['usage']['completion_tokens']
34
+
35
+ all_usage = {
36
+ 'prompt_tokens':total_prompt_tokens,
37
+ 'completion_tokens':total_completion_tokens,
38
+ 'output_pricing': total_completion_tokens/1000 * GPT35_PRICING['input'],
39
+ 'input_pricing':total_prompt_tokens/1000 * GPT35_PRICING['output']
40
+ }
41
+ return all_entities, all_usage
42
+
43
+ def process_insurance_document(pii_instance, mongo_instance, openai_instance, ocr_instance,
44
+ document_path, document_id):
45
+ print("---- \nInside Process insurance document function")
46
+ ## save file to S3
47
+ document_s3_url = ""
48
+
49
+ ## OCR
50
+ try:
51
+ document_text = ocr_instance.extract_text_from_document(document_path)
52
+ ocr_status = "Completed"
53
+ process_status = "OCR Completed"
54
+ print(f"OCR complete")
55
+ except Exception as ex:
56
+ document_text = ""
57
+ ocr_status = ex
58
+ process_status = f"OCR Failed. {ex}"
59
+ print(process_status)
60
+
61
+ ## save ocr file to S3, add document S3 url
62
+ ocr_document_s3_url = ""
63
+
64
+ ## update ocr_status in db
65
+ #mongo_instance.update(DOCUMENT_COLLECTION,
66
+ # {'document_id':document_id},
67
+ # {'set':{'ocr_status':ocr_status, 'document_s3_url':document_s3_url,
68
+ # 'ocr_document_s3_url':ocr_document_s3_url, 'process_status':process_status}})
69
+ print(f"OCR status updated in db")
70
+ ## PII entity extraction and masking
71
+ pii_entities = pii_instance.identify(document_text)
72
+ print(f"pii entiites are :: {pii_entities}")
73
+ pii_entities = pii_instance.add_mask(pii_entities)
74
+ print(f"\npii_entities after adding mask :: {pii_entities}")
75
+ masked_text = pii_instance.anonymize(pii_entities, document_text)
76
+ print(f"\nPII anonumized text is :: {masked_text}")
77
+ print(f"\nPII complete")
78
+
79
+ ## Openai extraction
80
+ chunks = ocr_instance.chunk_document(masked_text)
81
+ openai_entities, all_usage = extract_all_documents(openai_instance, chunks)
82
+ entity_extraction_status = 'Completed'
83
+ process_status = 'Document term extraction completed'
84
+
85
+ """try:
86
+ openai_entities, all_usage = extract_all_documents(openai_instance, chunks)
87
+ entity_extraction_status = 'Completed'
88
+ process_status = 'Document term extraction completed'
89
+ except Exception as ex:
90
+ openai_entities = {}
91
+ all_usage = {}
92
+ entity_extraction_status = ex
93
+ process_status = f"Document term extraction failed. {ex}"
94
+ """
95
+
96
+ #openai_entities_out = {
97
+ # 'status':"Success",
98
+ # 'function_output':{},
99
+ # 'usage':{}
100
+ #}
101
+
102
+ print(f"openai_entities are :: {openai_entities}")
103
+ print(f"Request to OpenAI complete")
104
+ print("----------- \nProcessing complete\n ")
105
+
106
+ ## Unmask PII entities in openai entities
107
+
108
+
109
+ ## update entity extraction status in db
110
+ #mongo_instance.update(DOCUMENT_COLLECTION,
111
+ # {'document_id':document_id},
112
+ # {'set':{'entity_extraction_status':entity_extraction_status,
113
+ # 'entities':openai_entities, 'process_status':process_status}})
114
+ #print(f"Entities updated in DB")
115
+
116
+
117
+ out = {
118
+ "entities":openai_entities,
119
+ "masked_text":masked_text
120
+ }
121
+ return out
helpers/pii.py ADDED
File without changes
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ fastapi
3
+ pydantic
4
+ scipy==1.10.1
5
+ flair
6
+ presidio-analyzer
7
+ presidio-anonymizer
8
+ openai==0.28.1
9
+ pytesseract
10
+ pdf2image
11
+ PyPDF2
12
+ python-docx
s3_uiapp.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import time
3
+ import uuid
4
+ import streamlit as st
5
+
6
+ from helpers.entity_extraction_helpers import process_insurance_document
7
+ from services.pii_service import PIIService
8
+ from services.openai_service import OpenAIService
9
+ from services.mongo_service import MongoService
10
+ from services.ocr_service import OCRService
11
+
12
+
13
+ def init_session():
14
+ print("------------------ Initializing")
15
+ if 'a' not in st.session_state:
16
+ st.session_state['pii_instance'] = PIIService()
17
+ print("PII service initialized")
18
+ time.sleep(2)
19
+ st.session_state['openai_instance'] = OpenAIService(st.secrets["OPENAI_KEY"],
20
+ st.secrets["OPENAI_AZURE_ENDPOINT"],
21
+ st.secrets["OPENAI_API_VERSION"],
22
+ st.secrets["DEPLOYMENT_NAME"])
23
+ print("OpenAI service initialized")
24
+ time.sleep(2)
25
+ st.session_state['ocr_instance'] = OCRService()
26
+ print("OCR service initialized")
27
+ st.session_state.a = 1
28
+ print("-----------------------------")
29
+
30
+ st.header('', divider='rainbow')
31
+ st.title("Data extraction")
32
+ st.header('', divider='rainbow')
33
+
34
+ init_session()
35
+
36
+ uploaded_doc = st.file_uploader("Upload an insurance document", type=["pdf"])
37
+
38
+ if uploaded_doc is not None:
39
+
40
+ with open(uploaded_doc.name,"wb") as f:
41
+ f.write(uploaded_doc.getbuffer())
42
+
43
+ document_id = str(uuid.uuid4())
44
+ print(f"File uploaded :: {uploaded_doc.name} :: {document_id}")
45
+
46
+ process_out = process_insurance_document(st.session_state['pii_instance'], "", st.session_state['openai_instance'],
47
+ st.session_state['ocr_instance'] , uploaded_doc.name, document_id)
48
+
49
+ st.header('Extracted entities !! ', divider='rainbow')
50
+ st.write(process_out['entities'])
51
+ st.header('', divider='rainbow')
52
+
53
+ ### TO RUN :: streamlit run ui_app.py
services/mongo_service.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pymongo import MongoClient
2
+ from pydantic import BaseModel, HttpUrl
3
+ from datetime import datetime
4
+ from typing import Optional, List, Dict, Union
5
+
6
+ class MongoService:
7
+ def __init__(self, mongo_url:str, database:str):
8
+ self.mongo_url = mongo_url
9
+ self.mongo_client = MongoClient(self.mongo_url)
10
+ self.mongo_database = self.mongo_client[database]
11
+
12
+ def insert(self, collection:str, data:Dict):
13
+ inserted = self.mongo_database[collection].insert_one(data)
14
+ return
15
+
16
+ def get(self, collection:str, filter:Dict, fields_to_retrieve:List=[]):
17
+ fields = {}
18
+ if fields_to_retrieve != []:
19
+ for field in fields_to_retrieve:
20
+ fields[field] = 1
21
+ retrieved_data = list(self.mongo_database[collection].find(filter,fields))
22
+ return retrieved_data
23
+
24
+ def update(self, collection:str, filter:Dict, update_value:Dict, many=False):
25
+ #myquery = { "address": { "$regex": "^S" } }
26
+ #newvalues = { "$set": { "name": "Minnie" } }
27
+
28
+ if many == True:
29
+ updated = self.mongo_database[collection].update_many(filter, update_value)
30
+ else:
31
+ updated = self.mongo_database[collection].update_one(filter, update_value)
32
+
33
+ return
services/ocr_service.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import docx
4
+ import pytesseract
5
+ from nltk.tokenize import sent_tokenize, word_tokenize
6
+ from PyPDF2 import PdfReader
7
+ from pdf2image import convert_from_path
8
+
9
+
10
+ class OCRService:
11
+ def __init__(self):
12
+ return
13
+
14
+ def extract_ocrless_pdf(self, filepath):
15
+ reader = PdfReader(filepath)
16
+ extracted_text = ""
17
+ for page in reader.pages:
18
+ text = page.extract_text()
19
+ extracted_text += " "
20
+ extracted_text += text
21
+
22
+ return extracted_text
23
+
24
+ def extract_text_from_pdf(self, filepath):
25
+ images = convert_from_path(filepath, thread_count=4)
26
+ full_text = []
27
+ #config = (r"--oem 2 --psm 7")
28
+ for image_idx, image in enumerate(images):
29
+ text = pytesseract.image_to_string(image)
30
+ #text = pytesseract.image_to_string(image, config=config)
31
+ full_text.append(text)
32
+ return full_text
33
+
34
+ def extract_text_from_document(self, filepath):
35
+ file_ext = os.path.splitext(filepath)[-1]
36
+ if file_ext in [".pdf"]:
37
+ text_to_process = self.extract_text_from_pdf(filepath)
38
+ text_joined = " ".join(text_to_process)
39
+ #with open(f"{os.path.splitext(filepath)[0]}.txt", "w") as file:
40
+ # file.writelines(text_to_process)
41
+ elif file_ext in [".doc", ".DOC", ".docx", ".DOCX"]:
42
+ doc_content = docx.Document(filepath)
43
+ text_to_process = [i.text for i in doc_content.paragraphs]
44
+ text_joined = " \n ".join(text_to_process)
45
+ #with open(f"{os.path.splitext(filepath)[0]}.txt", "w") as file:
46
+ # file.write(text_joined)
47
+ elif file_ext in [".txt"]:
48
+ file = open(f"{os.path.splitext(filepath)[0]}.txt", encoding="utf8")
49
+ text_joined = file.read()
50
+
51
+ return text_joined
52
+
53
+ def preprocess_document(self, document):
54
+ document = document.replace(r'\n+', "\n")
55
+ #document = re.sub(r"\s+", " ", document)
56
+ document = re.sub("“", r"\"", document)
57
+ document = re.sub("”", r"\"", document)
58
+ document = re.sub(r"\\\"", "\"", document)
59
+
60
+ return document
61
+
62
+ def chunk_document(self, text, k=1500):
63
+ sentences = sent_tokenize(text)
64
+ words = word_tokenize(text)
65
+
66
+ chunks = []
67
+ current_chunk = []
68
+ current_word_count = 0
69
+
70
+ for sentence in sentences:
71
+ sentence_words = word_tokenize(sentence)
72
+ if current_word_count + len(sentence_words) <= k:
73
+ current_chunk.append(sentence)
74
+ current_word_count += len(sentence_words)
75
+ else:
76
+ chunks.append(" ".join(current_chunk))
77
+ current_chunk = [sentence]
78
+ current_word_count = len(sentence_words)
79
+
80
+ if current_chunk:
81
+ chunks.append(" ".join(current_chunk))
82
+
83
+ for id, chunk in enumerate(chunks):
84
+ if len(chunk.split()) < 2:
85
+ del chunks[id]
86
+
87
+ return chunks
services/openai_service.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import openai
3
+
4
+
5
+ class OpenAIService:
6
+ def __init__(self, api_key, api_endpoint, api_version, deployment_name):
7
+ openai.api_key = api_key
8
+ openai.api_base = api_endpoint
9
+ openai.api_type = "azure"
10
+ openai.api_version = api_version
11
+ self.openai_deployment_name = deployment_name
12
+
13
+ def generate_response(self, system_prompt, user_prompt, openai_function):
14
+
15
+ try:
16
+ response = openai.ChatCompletion.create(engine=self.openai_deployment_name,
17
+ messages=[
18
+ {"role": "system", "content": system_prompt},
19
+ {"role": "user", "content": user_prompt},
20
+ ],
21
+ functions = openai_function,
22
+ function_call = {"name": openai_function[0]['name']}
23
+ )
24
+
25
+ openai_output = response["choices"]
26
+ usage = response["usage"].to_dict()
27
+ print(response)
28
+ function_output = json.loads(openai_output[0].message.function_call.arguments, strict=False)
29
+ print(function_output)
30
+ openai_out = {
31
+ 'function_output':function_output,
32
+ 'usage':usage,
33
+ 'status':'Success'
34
+ }
35
+ return openai_out
36
+
37
+ except Exception as ex:
38
+ print(f"Openai generate response exceptin ::: {ex}")
39
+ openai_out = {
40
+ 'function_output':{},
41
+ 'usage':{},
42
+ 'status':ex
43
+ }
44
+ return openai_out
services/pii_service.py ADDED
@@ -0,0 +1,309 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pprint import pprint
2
+ import json
3
+
4
+ from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
5
+ from presidio_analyzer.nlp_engine import NlpEngineProvider, NlpArtifacts
6
+ from presidio_analyzer import PatternRecognizer
7
+ from presidio_analyzer import Pattern, PatternRecognizer
8
+ from presidio_analyzer.predefined_recognizers import SpacyRecognizer
9
+ from presidio_analyzer.predefined_recognizers import IbanRecognizer, EmailRecognizer, IpRecognizer,\
10
+ EmailRecognizer, PhoneRecognizer, UrlRecognizer, DateRecognizer
11
+
12
+ from presidio_anonymizer import AnonymizerEngine
13
+ from presidio_anonymizer.entities import OperatorConfig
14
+
15
+ import logging
16
+ from typing import Optional, List, Tuple, Set
17
+ from presidio_analyzer import (
18
+ RecognizerResult,
19
+ EntityRecognizer,
20
+ AnalysisExplanation,
21
+ )
22
+
23
+ from flair.data import Sentence
24
+ from flair.models import SequenceTagger
25
+
26
+ ### Creating FlairRecognizer class for NER(names, location)
27
+
28
+ class FlairRecognizer(EntityRecognizer):
29
+
30
+ ENTITIES = [
31
+ "LOCATION",
32
+ "PERSON",
33
+ "ORGANIZATION",
34
+ # "MISCELLANEOUS" # - There are no direct correlation with Presidio entities.
35
+ ]
36
+
37
+ DEFAULT_EXPLANATION = "Identified as {} by Flair's Named Entity Recognition"
38
+
39
+ CHECK_LABEL_GROUPS = [
40
+ ({"LOCATION"}, {"LOC", "LOCATION"}),
41
+ ({"PERSON"}, {"PER", "PERSON"}),
42
+ ({"ORGANIZATION"}, {"ORG"}),
43
+ # ({"MISCELLANEOUS"}, {"MISC"}), # Probably not PII
44
+ ]
45
+
46
+ MODEL_LANGUAGES = {
47
+ #"en": "flair/ner-english-large",
48
+ #"es": "flair/ner-spanish-large",
49
+ "de": "flair/ner-german-large",
50
+ #"nl": "flair/ner-dutch-large",
51
+ }
52
+
53
+ PRESIDIO_EQUIVALENCES = {
54
+ "PER": "PERSON",
55
+ "LOC": "LOCATION",
56
+ "ORG": "ORGANIZATION",
57
+ # 'MISC': 'MISCELLANEOUS' # - Probably not PII
58
+ }
59
+
60
+ def __init__(
61
+ self,
62
+ supported_language: str = "en",
63
+ supported_entities: Optional[List[str]] = None,
64
+ check_label_groups: Optional[Tuple[Set, Set]] = None,
65
+ model: SequenceTagger = None,
66
+ ):
67
+ self.check_label_groups = (
68
+ check_label_groups if check_label_groups else self.CHECK_LABEL_GROUPS
69
+ )
70
+
71
+ supported_entities = supported_entities if supported_entities else self.ENTITIES
72
+ self.model = (
73
+ model
74
+ if model
75
+ else SequenceTagger.load(self.MODEL_LANGUAGES.get(supported_language))
76
+ )
77
+
78
+ super().__init__(
79
+ supported_entities=supported_entities,
80
+ supported_language=supported_language,
81
+ name="Flair Analytics",
82
+ )
83
+ print("Flair class initialized")
84
+
85
+ def load(self) -> None:
86
+ """Load the model, not used. Model is loaded during initialization."""
87
+ pass
88
+
89
+ def get_supported_entities(self) -> List[str]:
90
+ """
91
+ Return supported entities by this model.
92
+
93
+ :return: List of the supported entities.
94
+ """
95
+ return self.supported_entities
96
+
97
+ # Class to use Flair with Presidio as an external recognizer.
98
+ def analyze(
99
+ self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts = None
100
+ ) -> List[RecognizerResult]:
101
+ """
102
+ Analyze text using Text Analytics.
103
+
104
+ :param text: The text for analysis.
105
+ :param entities: Not working properly for this recognizer.
106
+ :param nlp_artifacts: Not used by this recognizer.
107
+ :param language: Text language. Supported languages in MODEL_LANGUAGES
108
+ :return: The list of Presidio RecognizerResult constructed from the recognized
109
+ Flair detections.
110
+ """
111
+
112
+ results = []
113
+
114
+ sentences = Sentence(text)
115
+ self.model.predict(sentences)
116
+
117
+ # If there are no specific list of entities, we will look for all of it.
118
+ if not entities:
119
+ entities = self.supported_entities
120
+
121
+ for entity in entities:
122
+ if entity not in self.supported_entities:
123
+ continue
124
+
125
+ for ent in sentences.get_spans("ner"):
126
+ if not self.__check_label(
127
+ entity, ent.labels[0].value, self.check_label_groups
128
+ ):
129
+ continue
130
+ textual_explanation = self.DEFAULT_EXPLANATION.format(
131
+ ent.labels[0].value
132
+ )
133
+ explanation = self.build_flair_explanation(
134
+ round(ent.score, 2), textual_explanation
135
+ )
136
+ flair_result = self._convert_to_recognizer_result(ent, explanation)
137
+
138
+ results.append(flair_result)
139
+
140
+ return results
141
+
142
+ def _convert_to_recognizer_result(self, entity, explanation) -> RecognizerResult:
143
+
144
+ entity_type = self.PRESIDIO_EQUIVALENCES.get(entity.tag, entity.tag)
145
+ flair_score = round(entity.score, 2)
146
+
147
+ flair_results = RecognizerResult(
148
+ entity_type=entity_type,
149
+ start=entity.start_position,
150
+ end=entity.end_position,
151
+ score=flair_score,
152
+ analysis_explanation=explanation,
153
+ )
154
+
155
+ return flair_results
156
+
157
+ def build_flair_explanation(
158
+ self, original_score: float, explanation: str
159
+ ) -> AnalysisExplanation:
160
+ """
161
+ Create explanation for why this result was detected.
162
+
163
+ :param original_score: Score given by this recognizer
164
+ :param explanation: Explanation string
165
+ :return:
166
+ """
167
+ explanation = AnalysisExplanation(
168
+ recognizer=self.__class__.__name__,
169
+ original_score=original_score,
170
+ textual_explanation=explanation,
171
+ )
172
+ return explanation
173
+
174
+ @staticmethod
175
+ def __check_label(
176
+ entity: str, label: str, check_label_groups: Tuple[Set, Set]
177
+ ) -> bool:
178
+ return any(
179
+ [entity in egrp and label in lgrp for egrp, lgrp in check_label_groups]
180
+ )
181
+
182
+
183
+ class PIIService:
184
+ def __init__(self):
185
+
186
+ configuration = {
187
+ "nlp_engine_name": "spacy",
188
+ "models": [
189
+ {"lang_code": "de", "model_name": "de_core_news_sm"}
190
+ ],
191
+ }
192
+
193
+ # Create NLP engine based on configuration
194
+ provider = NlpEngineProvider(nlp_configuration=configuration)
195
+ nlp_engine = provider.create_engine()
196
+
197
+ ## Creating regex for PatternRecognizers - SWIFT, vehicle number, zipcode, ssn
198
+ swift_regex = r"\b[A-Z]{4}DE[A-Z0-9]{2}(?:[A-Z0-9]{3})?"
199
+ vehicle_number_with_hyphen_regex = r"\b[A-ZÄÖÜ]{1,3}-[A-ZÄÖÜ]{1,2}-[0-9]{1,4}"
200
+ vehicle_number_without_hyphen_regex = r"\b[A-ZÄÖÜ]{1,3}[A-ZÄÖÜ]{1,2}[0-9]{1,4}"
201
+ german_zipcode_regex = r"\b((?:0[1-46-9]\d{3})|(?:[1-357-9]\d{4})|(?:[4][0-24-9]\d{3})|(?:[6][013-9]\d{3}))\b(?![\d/])"
202
+ german_ssn_regex = r"\b\d{2}\s?\d{6}\s?[A-Z]\s?\d{3}\b"
203
+ # Creating Presidio pattern object
204
+ vehicle_numbers_pattern1 = Pattern(name="vehicle_pattern", regex=vehicle_number_without_hyphen_regex, score=1)
205
+ vehicle_numbers_pattern2 = Pattern(name="vehicle_pattern", regex=vehicle_number_with_hyphen_regex, score=1)
206
+ swift_pattern = Pattern(name="bank_swift_pattern", regex=swift_regex, score=1)
207
+ germanzipcode_pattern = Pattern(name="german_zip_pattern",regex=german_zipcode_regex, score=1)
208
+ german_ssn_pattern = Pattern(name="german_ssn_pattern",regex=german_ssn_regex, score=1)
209
+
210
+ # Define the recognizer
211
+ swift_recognizer = PatternRecognizer(supported_entity="SWIFT", supported_language="de",patterns=[swift_pattern])
212
+ vehicle_number_recognizer = PatternRecognizer(supported_entity="VEHICLE_NUMBER", supported_language="de",patterns=[vehicle_numbers_pattern1,vehicle_numbers_pattern2])
213
+ germanzip_recognizer = PatternRecognizer(supported_entity="GERMAN_ZIP", supported_language="de",patterns=[germanzipcode_pattern])
214
+ germanssn_recognizer = PatternRecognizer(supported_entity="GERMAN_SSN", supported_language="de",patterns=[german_ssn_pattern])
215
+
216
+ ## Lading flair entity model for person, location ID
217
+ print("Loading flair")
218
+ flair_recognizer = FlairRecognizer(supported_language="de")
219
+ print("Flair loaded")
220
+
221
+ registry = RecognizerRegistry()
222
+ #registry.load_predefined_recognizers()
223
+ #registry.add_recognizer(SpacyRecognizer(supported_language="de"))
224
+ #registry.add_recognizer(SpacyRecognizer(supported_language="en"))
225
+
226
+ registry.remove_recognizer("SpacyRecognizer")
227
+ registry.add_recognizer(flair_recognizer)
228
+
229
+ registry.add_recognizer(swift_recognizer)
230
+ registry.add_recognizer(vehicle_number_recognizer)
231
+ registry.add_recognizer(germanzip_recognizer)
232
+ registry.add_recognizer(germanssn_recognizer)
233
+
234
+ ## Adding predefined recognizers
235
+ registry.add_recognizer(IbanRecognizer(supported_language="de"))
236
+ registry.add_recognizer(DateRecognizer(supported_language="de"))
237
+ registry.add_recognizer(EmailRecognizer(supported_language="de"))
238
+ registry.add_recognizer(IpRecognizer(supported_language="de"))
239
+ registry.add_recognizer(PhoneRecognizer(supported_language="de"))
240
+ registry.add_recognizer(UrlRecognizer(supported_language="de"))
241
+ registry.add_recognizer(PhoneRecognizer(supported_language="de"))
242
+ print("Recognizer registry loaded")
243
+
244
+ self.analyzer = AnalyzerEngine(registry=registry, nlp_engine=nlp_engine, supported_languages=["de"])
245
+
246
+ #print(f"Type of recognizers ::\n {self.analyzer.registry.recognizers}")
247
+ print("PII initialized")
248
+
249
+ self.anonymizer = AnonymizerEngine()
250
+
251
+ def identify(self, text):
252
+ results_de = self.analyzer.analyze(
253
+ text,
254
+ language='de'
255
+ )
256
+
257
+ #anonymized_results = self.anonymize(results_de, text)
258
+ entities = []
259
+
260
+ for result in results_de:
261
+ result_dict = result.to_dict()
262
+ temp_entity = {
263
+ "start":result_dict['start'],
264
+ "end":result_dict['end'],
265
+ "entity_type":result_dict['entity_type'],
266
+ "score":result_dict['score'],
267
+ "word":text[result_dict['start']:result_dict['end']]
268
+ }
269
+ entities.append(temp_entity)
270
+
271
+ return {"entities":entities, "text":text}#, "anonymized_text":anonymized_results['text']}
272
+
273
+ """def anonymize(self, entities, text):
274
+ anonymized_results = self.anonymizer.anonymize(
275
+ text=text,
276
+ analyzer_results=entities,
277
+ #operators={"DEFAULT": OperatorConfig("replace", {"new_value": "<ANONYMIZED>"})},
278
+ )
279
+ return ""#json.loads(anonymized_results.to_json())"""
280
+
281
+ def add_mask(self, data):
282
+ masked_data = []
283
+ entity_count = {}
284
+
285
+ for item_idx,item in enumerate(data['entities']):
286
+ entity_type = item['entity_type']
287
+ word = item['word']
288
+ suffix = entity_count.get(entity_type, 0) + 1
289
+ entity_count[entity_type] = suffix
290
+
291
+ masked_word = f"{entity_type}_{suffix}"
292
+ item['mask'] = masked_word
293
+ #data['entities'][item_idx]['mask'] = masked_word
294
+ masked_data.append(item)
295
+
296
+ return masked_data
297
+
298
+ def anonymize(self, entities, text):
299
+ print("anonymyzing")
300
+ updated_text = text
301
+ for ent_idx, ent in enumerate(entities):
302
+ #text[ent['start']:ent['end']] = ent['mask']
303
+ updated_text = updated_text[:ent['start']] + " " + ent['mask'] + " " + updated_text[ent['end']:]
304
+
305
+ return updated_text
306
+
307
+ def remove_overlapping_entities(entities):
308
+
309
+ return