dbandi ThePixOne committed on
Commit
b684510
0 Parent(s):

Duplicate from ThePixOne/open_domain_qa

Browse files

Co-authored-by: Piotr Antoniak <[email protected]>

Files changed (8) hide show
  1. .gitattributes +27 -0
  2. China.pdf +0 -0
  3. HISTORY.txt +0 -0
  4. London.pdf +0 -0
  5. README.md +47 -0
  6. README.txt +1 -0
  7. app.py +212 -0
  8. requirements.txt +6 -0
.gitattributes ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
5
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.model filter=lfs diff=lfs merge=lfs -text
12
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
13
+ *.onnx filter=lfs diff=lfs merge=lfs -text
14
+ *.ot filter=lfs diff=lfs merge=lfs -text
15
+ *.parquet filter=lfs diff=lfs merge=lfs -text
16
+ *.pb filter=lfs diff=lfs merge=lfs -text
17
+ *.pt filter=lfs diff=lfs merge=lfs -text
18
+ *.pth filter=lfs diff=lfs merge=lfs -text
19
+ *.rar filter=lfs diff=lfs merge=lfs -text
20
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
21
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
22
+ *.tflite filter=lfs diff=lfs merge=lfs -text
23
+ *.tgz filter=lfs diff=lfs merge=lfs -text
24
+ *.xz filter=lfs diff=lfs merge=lfs -text
25
+ *.zip filter=lfs diff=lfs merge=lfs -text
26
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
27
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
China.pdf ADDED
Binary file (256 kB). View file
 
HISTORY.txt ADDED
File without changes
London.pdf ADDED
Binary file (212 kB). View file
 
README.md ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Question Answering from PDFs
3
+ emoji: 📈
4
+ colorFrom: green
5
+ colorTo: gray
6
+ sdk: gradio
7
+ app_file: app.py
8
+ pinned: false
9
+ license: wtfpl
10
+ duplicated_from: ThePixOne/open_domain_qa
11
+ ---
12
+
13
+ # Configuration
14
+
15
+ `title`: _string_
16
+ Display title for the Space
17
+
18
+ `emoji`: _string_
19
+ Space emoji (emoji-only character allowed)
20
+
21
+ `colorFrom`: _string_
22
+ Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
23
+
24
+ `colorTo`: _string_
25
+ Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
26
+
27
+ `sdk`: _string_
28
+ Can be either `gradio`, `streamlit`, or `static`
29
+
30
+ `sdk_version` : _string_
31
+ Only applicable for `streamlit` SDK.
32
+ See [doc](https://hf.co/docs/hub/spaces) for more info on supported versions.
33
+
34
+ `app_file`: _string_
35
+ Path to your main application file (which contains either `gradio` or `streamlit` Python code, or `static` html code).
36
+ Path is relative to the root of the repository.
37
+
38
+ `models`: _List[string]_
39
+ HF model IDs (like "gpt2" or "deepset/roberta-base-squad2") used in the Space.
40
+ Will be parsed automatically from your code if not specified here.
41
+
42
+ `datasets`: _List[string]_
43
+ HF dataset IDs (like "common_voice" or "oscar-corpus/OSCAR-2109") used in the Space.
44
+ Will be parsed automatically from your code if not specified here.
45
+
46
+ `pinned`: _boolean_
47
+ Whether the Space stays on top of your list.
README.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Do you have a long document and a bunch of questions that can be answered from the data in that file? Fear not, because the following demo can do it for you. Upload your PDF, ask a question, and wait for the magic to happen.
app.py ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Gradio requires input to be fed in a very peculiar way and does not provide much flexibility, so don't expect too much from this demo. The backbone had to be adjusted to work on Hugging Face Spaces. See https://github.com/PiotrAntoniak/QuestionAnswering for a prettier version that uses Streamlit.
3
+ """
4
+
5
+
6
+ import gradio as gr
7
+
8
+ description = """Do you have a long document and a bunch of questions that can be answered given the data in this file?
9
+ Fear not for this demo is for you.
10
+ Upload your pdf, ask your questions and wait for the magic to happen.
11
+ DISCLAIMER: I do no have idea what happens to the pdfs that you upload and who has access to them so make sure there is nothing confidential there.
12
+ """
13
+ title = "QA answering from a pdf."
14
+
15
+ import numpy as np
16
+ import time
17
+ import hashlib
18
+ import torch
19
+ from transformers import AutoTokenizer, AutoModel, AutoModelForQuestionAnswering, pipeline
20
+ from tqdm import tqdm
21
+ import os
22
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
23
+ import textract
24
+ from scipy.special import softmax
25
+ import pandas as pd
26
+ from datetime import datetime
27
+
28
+
29
# Passage/query encoder: used to embed both document windows and questions.
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/multi-qa-mpnet-base-dot-v1")
model = AutoModel.from_pretrained("sentence-transformers/multi-qa-mpnet-base-dot-v1")
model = model.to(device).eval()

# Extractive reader: pulls an answer span out of a retrieved passage.
tokenizer_ans = AutoTokenizer.from_pretrained("deepset/roberta-large-squad2")
model_ans = AutoModelForQuestionAnswering.from_pretrained("deepset/roberta-large-squad2")
model_ans = model_ans.to(device).eval()

# The pipeline is only told about a device explicitly when CUDA is in use;
# otherwise it is built with its default device setting.
pipe = (
    pipeline("question-answering", model_ans, tokenizer=tokenizer_ans, device=0)
    if device == 'cuda:0'
    else pipeline("question-answering", model_ans, tokenizer=tokenizer_ans)
)
38
+
39
def cls_pooling(model_output):
    """Return the hidden state at position 0 of every sequence.

    ``model_output`` must expose a ``last_hidden_state`` attribute that can be
    indexed as ``[:, 0]``; presumably position 0 holds the [CLS]-style summary
    token of the encoder.
    """
    hidden_states = model_output.last_hidden_state
    first_position = hidden_states[:, 0]
    return first_position
41
+
42
def encode_query(query):
    """Embed *query* with the module-level encoder and return the result on CPU.

    The query is tokenized (with truncation), run through ``model`` without
    gradient tracking, pooled with ``cls_pooling`` and moved to the CPU.
    """
    tokens = tokenizer(query, truncation=True, return_tensors='pt').to(device)

    # Inference only: no gradients are needed for the forward pass.
    with torch.no_grad():
        output = model(**tokens, return_dict=True)

    pooled = cls_pooling(output)
    return pooled.cpu()
51
+
52
+
53
def encode_docs(docs, maxlen=64, stride=32):
    """Split a document into overlapping word windows, embed and persist them.

    Args:
        docs: A ``(name, text)`` pair. ``name`` is used to build the names of
            the files written to disk; ``text`` is the raw document text.
        maxlen: Number of words that make up one window.
        stride: Number of overlapping words added to each window. The first
            window looks ``stride`` words ahead; every later window is
            prefixed with the last ``stride`` words of the previous one.

    Returns:
        A tuple ``(embeddings, spans, file_names)``:

        * ``embeddings`` - float32 NumPy array built by stacking one pooled
          vector per window and swapping the first two axes (callers in this
          file reshape it to 2-D before use).
        * ``spans`` - list with the text of every window.
        * ``file_names`` - list repeating ``name`` once per window.

    Side effects:
        Writes ``emb_<name>.npy``, ``spans_<name>.npy`` and
        ``file_<name>.npy`` (index -> value dicts) to the working directory.
    """
    name, text = docs
    words = text.split(" ")

    # Ceil division so an exact multiple of ``maxlen`` does not produce a
    # trailing window made only of overlap; always keep at least one window so
    # short documents are encoded too (they previously hit an undefined
    # variable).
    num_windows = max(1, -(-len(words) // maxlen))

    spans = []
    for i in range(num_windows):
        current = words[i * maxlen:(i + 1) * maxlen]
        if i == 0:
            window = words[:maxlen + stride]
        else:
            previous = words[(i - 1) * maxlen:i * maxlen]
            # ``previous[-0:]`` would return the whole list, so handle a zero
            # stride explicitly.
            overlap = previous[-stride:] if stride > 0 else []
            window = overlap + current
        spans.append(" ".join(window))

    file_names = [name] * len(spans)

    embeddings = []
    with torch.no_grad():
        for span in tqdm(spans):
            encoded = tokenizer(span, return_tensors='pt', truncation=True).to(device)
            model_output = model(**encoded, return_dict=True)
            embeddings.append(cls_pooling(model_output))

    embeddings = torch.stack(embeddings).transpose(0, 1).cpu().numpy().astype(np.float32)

    # Persisted as index -> value dicts; ``predict`` reloads them in this form.
    np.save("emb_{}.npy".format(name), dict(enumerate(embeddings)))
    np.save("spans_{}.npy".format(name), dict(enumerate(spans)))
    np.save("file_{}.npy".format(name), dict(enumerate(file_names)))

    return embeddings, spans, file_names
92
+
93
def _format_outputs(df, k):
    """Render at most *k* rows of the answers table as display strings."""
    outputs = []
    for _, row in df.head(k).iterrows():
        outputs.append(
            'PROBABILITIES: ' + str(row["Probabilities"]) + '\n\n'
            + 'ANSWER: ' + str(row["Answer"]) + '\n\n'
            + 'PASSAGE: ' + str(row["Passage"]) + '\n\n'
        )
    return outputs


def predict(query, data):
    """Answer *query* from the uploaded document and return formatted results.

    Args:
        query: The question text.
        data: The uploaded file object; only its ``name`` attribute (a path)
            is read.

    Returns:
        A list of strings, one per retrieved passage (at most 20), each
        containing the probabilities, the extracted answer and the passage.

    Side effects:
        * Reads/writes ``<sha256>.csv`` as a per-(question, document) cache.
        * Reads/writes ``<name>.txt`` and the ``.npy`` files produced by
          ``encode_docs`` as a per-document embedding cache.
        * Appends one line to ``HISTORY.txt``.
    """
    k = 20  # number of top-scoring passages kept

    stem = data.name.split("/")[-1].split(".")[0]
    # NOTE(review): the 8-character cut presumably removes a random suffix that
    # the upload widget appends to temp-file names - confirm against the gradio
    # version in use. Fall back to the full stem when the cut would leave
    # nothing, otherwise every short-named document shares one cache entry.
    name_to_save = stem[:-8] or stem

    st = str([query, name_to_save])
    # A SHA-256 digest is stable across processes; the built-in ``hash()`` is
    # salted per interpreter run, so it cannot be used to name cache files.
    st_hashed = hashlib.sha256(st.encode()).hexdigest()
    hist = st + " " + st_hashed
    current_time = datetime.now().strftime("%H:%M:%S")
    cache_path = "{}.csv".format(st_hashed)

    # If the same question was already asked for this document, reuse the
    # stored table.
    if os.path.exists(cache_path):
        try:
            # Read everything as text so answers such as "1949" or "NA" are
            # not turned into numbers / NaN.
            cached = pd.read_csv(cache_path, dtype=str, keep_default_na=False)
            return _format_outputs(cached, k)
        except (OSError, ValueError, KeyError) as e:
            # The cache is best-effort: on a damaged file, recompute.
            print(e)
            print(st)

    if os.path.exists(name_to_save + ".txt"):
        # The document was already processed: load its embeddings.
        # NOTE(security): ``allow_pickle`` unpickles file contents; these files
        # are expected to be the ones written by ``encode_docs`` - do not point
        # this at untrusted files.
        doc_emb = np.load('emb_{}.npy'.format(name_to_save), allow_pickle=True).item()
        doc_text = np.load('spans_{}.npy'.format(name_to_save), allow_pickle=True).item()
        file_names_dicto = np.load('file_{}.npy'.format(name_to_save), allow_pickle=True).item()

        doc_emb = np.array(list(doc_emb.values()))
        doc_text = list(doc_text.values())
        file_names = list(file_names_dicto.values())
    else:
        text = textract.process(data.name).decode('utf8')
        text = text.replace("\r", " ").replace("\n", " ").replace(" . ", " ")

        doc_emb, doc_text, file_names = encode_docs((name_to_save, text), maxlen=64, stride=32)

        # Written last so its presence implies the .npy files exist.
        with open("{}.txt".format(name_to_save), "w", encoding="utf-8") as f:
            f.write(text)

    # Flatten to (num_passages, hidden_size) whatever the encoder width is.
    doc_emb = doc_emb.reshape(-1, doc_emb.shape[-1])

    # Once embeddings are available, run maximum inner product search.
    start = time.time()
    query_emb = encode_query(query)

    scores = np.matmul(query_emb, doc_emb.transpose(1, 0))[0].tolist()
    ranked = sorted(zip(doc_text, scores, file_names), key=lambda x: x[1], reverse=True)[:k]
    probs = softmax([score for _, score, _ in ranked])

    table = {"Passage": [], "Answer": [], "Probabilities": []}

    # Get an answer for each (question, passage) pair among the top passages.
    for i, (passage, _, _) in enumerate(ranked):
        passage = passage.replace("\n", "")

        # Run the reader only on likely passages: probability above 0.1, or
        # above 0.05 for the three best-ranked ones. Blank passages are skipped.
        likely = probs[i] > 0.1 or (i < 3 and probs[i] > 0.05)
        if likely and passage.strip():
            ans = pipe({'question': query, 'context': passage})
            probabilities = "P(a|p): {}, P(a|p,q): {}, P(p|q): {}".format(
                round(ans["score"], 5),
                round(ans["score"] * probs[i], 5),
                round(probs[i], 5),
            )
            table["Passage"].append(passage)
            table["Answer"].append(str(ans["answer"]).upper())
            table["Probabilities"].append(probabilities)
        else:
            table["Passage"].append(passage)
            table["Answer"].append("no_answer_calculated")
            table["Probabilities"].append("P(p|q): {}".format(round(probs[i], 5)))

    # Store the table so the same question on the same document is served
    # from the cache next time.
    df = pd.DataFrame(table)
    print(df)
    print("time: " + str(time.time() - start))

    with open("HISTORY.txt", "a", encoding="utf-8") as f:
        f.write(hist)
        f.write(" " + str(current_time))
        f.write("\n")
    df.to_csv(cache_path, index=False)

    return _format_outputs(df, k)
194
+
195
# Build the demo UI: a question box plus a file upload in, a text carousel out.
# NOTE(review): the `gr.inputs` / `gr.outputs` namespaces and some of these
# keyword arguments look tied to an older gradio API - confirm against the
# gradio version the Space actually runs.
iface = gr.Interface(
    fn=predict,
    inputs=[
        gr.inputs.Textbox(default="What is Open-domain question answering?"),
        gr.inputs.File(),
    ],
    outputs=[
        gr.outputs.Carousel(['text']),
    ],
    examples=[
        ["How high is the highest mountain?", "China.pdf"],
        ["Where does UK prime minister live?", "London.pdf"],
    ],
    title=title,
    description=description,
    allow_flagging="manual",
    flagging_options=["correct", "wrong"],
    allow_screenshot=False,
)

iface.launch(enable_queue=True, show_error=True)
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ torch
2
+ textract
3
+ scipy
4
+ pandas
5
+ numpy
6
+ transformers