varun500 committed
Commit
ec44723
1 Parent(s): e945173

Upload 3 files

Files changed (4)
  1. .gitattributes +1 -0
  2. app.py +230 -0
  3. earnings_calls_sentencewise.csv +3 -0
  4. requirements.txt +8 -0
.gitattributes CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+earnings_calls_sentencewise.csv filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,230 @@
+import pandas as pd
+from tqdm import tqdm
+import pinecone
+import torch
+from sentence_transformers import SentenceTransformer
+from transformers import (
+    pipeline,
+    AutoTokenizer,
+    AutoModelForCausalLM,
+    AutoModelForSeq2SeqLM,
+)
+import streamlit as st
+import openai
+
+
+@st.experimental_singleton
+def get_data():
+    data = pd.read_csv("earnings_calls_sentencewise.csv")
+    return data
+
+
+# Initialize models from HuggingFace
+
+
+@st.experimental_singleton
+def get_t5_model():
+    return pipeline("summarization", model="t5-small", tokenizer="t5-small")
+
+
+@st.experimental_singleton
+def get_flan_t5_model():
+    return pipeline(
+        "summarization", model="google/flan-t5-small", tokenizer="google/flan-t5-small"
+    )
+
+
+@st.experimental_singleton
+def get_mpnet_embedding_model():
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    model = SentenceTransformer(
+        "sentence-transformers/all-mpnet-base-v2", device=device
+    )
+    model.max_seq_length = 512
+    return model
+
+
+@st.experimental_singleton
+def get_sgpt_embedding_model():
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    model = SentenceTransformer(
+        "Muennighoff/SGPT-125M-weightedmean-nli-bitfit", device=device
+    )
+    model.max_seq_length = 512
+    return model
+
+
+@st.experimental_memo
+def save_key(api_key):
+    return api_key
+
+
+def query_pinecone(query, top_k, model, index):
+    # generate embeddings for the query
+    xq = model.encode([query]).tolist()
+    # search pinecone index for context passages with the answer
+    xc = index.query(xq, top_k=top_k, include_metadata=True)
+    return xc
+
+
+def format_query(query_results):
+    # extract passage text from the Pinecone search result
+    context = [result["metadata"]["Text"] for result in query_results["matches"]]
+    return context
+
+
+def sentence_id_combine(data, query_results, lag=2):
+    # Extract sentence IDs from query results (cast Pinecone's float metadata back to int)
+    ids = [int(result["metadata"]["Sentence_id"]) for result in query_results["matches"]]
+    # Generate new IDs by adding a lag value to the original IDs
+    new_ids = [sid + i for sid in ids for i in range(-lag, lag + 1)]
+    # Remove duplicates and sort the new IDs
+    new_ids = sorted(set(new_ids))
+    # Group the new IDs into windows of lag * 2 + 1 consecutive lookup IDs
+    lookup_ids = [
+        new_ids[i : i + (lag * 2 + 1)] for i in range(0, len(new_ids), lag * 2 + 1)
+    ]
+    # Join the sentences behind each window of lookup IDs into one context string
+    context_list = [
+        ". ".join(data.Text.iloc[lookup_id].to_list()) for lookup_id in lookup_ids
+    ]
+    return context_list
+
+
+def text_lookup(data, sentence_ids):
+    context = ". ".join(data.iloc[sentence_ids].to_list())
+    return context
+
+
+def gpt3_summary(text):
+    response = openai.Completion.create(
+        model="text-davinci-003",
+        prompt=text + "\n\nTl;dr",
+        temperature=0.1,
+        max_tokens=512,
+        top_p=1.0,
+        frequency_penalty=0.0,
+        presence_penalty=1,
+    )
+    return response.choices[0].text
+
+
+def gpt3_qa(query, answer):
+    response = openai.Completion.create(
+        model="text-davinci-003",
+        prompt="Q: " + query + "\nA: " + answer,
+        temperature=0,
+        max_tokens=512,
+        top_p=1,
+        frequency_penalty=0.0,
+        presence_penalty=0.0,
+        stop=["\n"],
+    )
+    return response.choices[0].text
+
+
+st.title("Abstractive Question Answering - APPL")
+
+query_text = st.text_input("Input Query", value="Who is the CEO of Apple?")
+
+num_results = int(st.number_input("Number of Results to query", 1, 5, value=2))
+
+
+# Choose encoder model
+
+encoder_models_choice = ["MPNET", "SGPT"]
+
+encoder_model = st.selectbox("Select Encoder Model", encoder_models_choice)
+
+
+# Choose decoder model
+
+decoder_models_choice = ["GPT3 (QA_davinci)", "GPT3 (text_davinci)", "T5", "FLAN-T5"]
+
+decoder_model = st.selectbox("Select Decoder Model", decoder_models_choice)
+
+
+if encoder_model == "MPNET":
+    # Connect to pinecone environment
+    pinecone.init(
+        api_key="<PINECONE_API_KEY>", environment="us-east1-gcp"  # placeholder, not a real key
+    )
+    pinecone_index_name = "week2-all-mpnet-base"
+    pinecone_index = pinecone.Index(pinecone_index_name)
+    retriever_model = get_mpnet_embedding_model()
+
+elif encoder_model == "SGPT":
+    # Connect to pinecone environment
+    pinecone.init(
+        api_key="<PINECONE_API_KEY>", environment="us-east1-gcp"  # placeholder, not a real key
+    )
+    pinecone_index_name = "week2-sgpt-125m"
+    pinecone_index = pinecone.Index(pinecone_index_name)
+    retriever_model = get_sgpt_embedding_model()
+
+
+query_results = query_pinecone(query_text, num_results, retriever_model, pinecone_index)
+
+window = int(st.number_input("Sentence Window Size", 1, 3, value=1))
+
+data = get_data()
+
+# context_list = format_query(query_results)
+context_list = sentence_id_combine(data, query_results, lag=window)
+
+
+st.subheader("Answer:")
+
+
+if decoder_model == "GPT3 (text_davinci)":
+    openai_key = st.text_input(
+        "Enter OpenAI key",
+        value="<OPENAI_API_KEY>",  # placeholder; avoid committing real keys
+        type="password",
+    )
+    api_key = save_key(openai_key)
+    openai.api_key = api_key
+    output_text = []
+    for context_text in context_list:
+        output_text.append(gpt3_summary(context_text))
+    generated_text = ". ".join(output_text)
+    st.write(gpt3_summary(generated_text))
+
+elif decoder_model == "GPT3 (QA_davinci)":
+    openai_key = st.text_input(
+        "Enter OpenAI key",
+        value="<OPENAI_API_KEY>",  # placeholder; avoid committing real keys
+        type="password",
+    )
+    api_key = save_key(openai_key)
+    openai.api_key = api_key
+    output_text = []
+    for context_text in context_list:
+        output_text.append(gpt3_qa(query_text, context_text))
+    generated_text = ". ".join(output_text)
+    st.write(gpt3_qa(query_text, generated_text))
+
+elif decoder_model == "T5":
+    t5_pipeline = get_t5_model()
+    output_text = []
+    for context_text in context_list:
+        output_text.append(t5_pipeline(context_text)[0]["summary_text"])
+    generated_text = ". ".join(output_text)
+    st.write(t5_pipeline(generated_text)[0]["summary_text"])
+
+elif decoder_model == "FLAN-T5":
+    flan_t5_pipeline = get_flan_t5_model()
+    output_text = []
+    for context_text in context_list:
+        output_text.append(flan_t5_pipeline(context_text)[0]["summary_text"])
+    generated_text = ". ".join(output_text)
+    st.write(flan_t5_pipeline(generated_text)[0]["summary_text"])
+
+show_retrieved_text = st.checkbox("Show Retrieved Text", value=False)
+
+if show_retrieved_text:
+
+    st.subheader("Retrieved Text:")
+
+    for context_text in context_list:
+        st.markdown(f"- {context_text}")
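
The core retrieval trick in app.py is the sentence-window expansion in sentence_id_combine(): each retrieved sentence ID is widened by lag neighbours on either side, the union is de-duplicated and sorted, and consecutive IDs are chunked back into context passages. A minimal sketch on toy data (the DataFrame contents, hit IDs, and lag value are invented for illustration; only the Text column name comes from app.py):

import pandas as pd

# Toy corpus standing in for earnings_calls_sentencewise.csv
data = pd.DataFrame({"Text": [f"Sentence {i}" for i in range(10)]})

ids = [3, 7]   # pretend the retriever matched sentences 3 and 7
lag = 1

# Widen each hit by `lag` sentences on either side, then de-duplicate and sort
new_ids = sorted({sid + i for sid in ids for i in range(-lag, lag + 1)})

# Chunk the sorted IDs into groups of lag * 2 + 1 and join each group's text
size = lag * 2 + 1
lookup_ids = [new_ids[i : i + size] for i in range(0, len(new_ids), size)]
context_list = [". ".join(data.Text.iloc[grp].to_list()) for grp in lookup_ids]

print(context_list)
# ['Sentence 2. Sentence 3. Sentence 4', 'Sentence 6. Sentence 7. Sentence 8']

Because the sorted union is chunked in fixed groups of lag * 2 + 1, windows from nearby hits collapse into a single passage rather than repeating sentences.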
earnings_calls_sentencewise.csv ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a24373c9cb8d68b4681f7590b5d94916ef748bd259636d93728e99b8e50678a5
+size 12926317
requirements.txt ADDED
@@ -0,0 +1,8 @@
+pandas
+tqdm
+pinecone-client
+torch
+git+https://github.com/UKPLab/sentence-transformers.git
+transformers
+streamlit
+openai
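
The dependencies above are unpinned, but app.py relies on APIs that later releases of several of these libraries removed: st.experimental_singleton and st.experimental_memo (superseded by st.cache_resource / st.cache_data), pinecone.init() (dropped in pinecone-client 3.x), and openai.Completion.create() (dropped in openai 1.x). A pinned variant that matches those calls might look like the sketch below; the exact version choices are an assumption, not part of the original commit:

pandas
tqdm
pinecone-client==2.2.4   # app.py uses pinecone.init(), removed in 3.x
torch
git+https://github.com/UKPLab/sentence-transformers.git
transformers
streamlit==1.17.0        # keeps st.experimental_singleton / st.experimental_memo
openai==0.27.8           # keeps openai.Completion.create()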