Spaces:
Sleeping
Sleeping
added flexible vectorstore
Browse files
- .gitignore +2 -1
- Home.py +45 -11
- src/FAISS.ipynb +228 -13
- src/FAISS/10_legislature.faiss +3 -0
- src/FAISS/10_legislature.pkl +3 -0
- src/FAISS/11_legislature.faiss +3 -0
- src/FAISS/11_legislature.pkl +3 -0
- src/FAISS/12_legislature.faiss +3 -0
- src/FAISS/12_legislature.pkl +3 -0
- src/FAISS/13_legislature.faiss +3 -0
- src/FAISS/13_legislature.pkl +3 -0
- src/FAISS/14_legislature.faiss +3 -0
- src/FAISS/14_legislature.pkl +3 -0
- src/FAISS/15_legislature.faiss +3 -0
- src/FAISS/15_legislature.pkl +3 -0
- src/FAISS/16_legislature.faiss +3 -0
- src/FAISS/16_legislature.pkl +3 -0
- src/FAISS/17_legislature.faiss +3 -0
- src/FAISS/17_legislature.pkl +3 -0
- src/FAISS/18_legislature.faiss +3 -0
- src/FAISS/18_legislature.pkl +3 -0
- src/FAISS/19_legislature.faiss +3 -0
- src/FAISS/19_legislature.pkl +3 -0
- src/FAISS/1_legislature.faiss +3 -0
- src/FAISS/1_legislature.pkl +3 -0
- src/FAISS/20_legislature.faiss +3 -0
- src/FAISS/20_legislature.pkl +3 -0
- src/FAISS/2_legislature.faiss +3 -0
- src/FAISS/2_legislature.pkl +3 -0
- src/FAISS/3_legislature.faiss +3 -0
- src/FAISS/3_legislature.pkl +3 -0
- src/FAISS/4_legislature.faiss +3 -0
- src/FAISS/4_legislature.pkl +3 -0
- src/FAISS/5_legislature.faiss +3 -0
- src/FAISS/5_legislature.pkl +3 -0
- src/FAISS/6_legislature.faiss +3 -0
- src/FAISS/6_legislature.pkl +3 -0
- src/FAISS/7_legislature.faiss +3 -0
- src/FAISS/7_legislature.pkl +3 -0
- src/FAISS/8_legislature.faiss +3 -0
- src/FAISS/8_legislature.pkl +3 -0
- src/FAISS/9_legislature.faiss +3 -0
- src/FAISS/9_legislature.pkl +3 -0
- src/chatbot.py +73 -22
- src/vectordatabase.py +90 -10
.gitignore
CHANGED
@@ -1,3 +1,4 @@
|
|
1 |
__pycache__
|
2 |
hf_upload.py
|
3 |
-
.env
|
|
|
|
1 |
__pycache__
|
2 |
hf_upload.py
|
3 |
+
.env
|
4 |
+
.mypy_cache
|
Home.py
CHANGED
@@ -1,45 +1,79 @@
|
|
1 |
import gradio as gr
|
2 |
from src.chatbot import chatbot, keyword_search
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
|
4 |
|
5 |
with gr.Blocks() as App:
|
6 |
with gr.Tab("ChatBot"):
|
7 |
-
#Apply RAG using chatbut function from local file ChatBot.py
|
|
|
|
|
|
|
|
|
8 |
gr.ChatInterface(chatbot,
|
9 |
title="PoliticsToYou",
|
10 |
description= "This chatbot uses the infomation of speeches of the german parliament (since 2021) \
|
11 |
to get insight on the view points of the german parties and the debate of the parliament.",
|
12 |
-
examples=["Wie steht die CDU zur Cannabislegalisierung?","Was waren die wichtigsten Themen in der aktuellen Legislaturperiode?"], #change to meaningful examples
|
13 |
-
cache_examples=False,
|
|
|
14 |
)
|
15 |
|
16 |
with gr.Tab("KeyWordSearch"):
|
17 |
|
18 |
with gr.Blocks() as Block:
|
19 |
-
#Keyword Input
|
20 |
keyword_box = gr.Textbox(label='keyword')
|
21 |
|
22 |
#Additional Input (hidden)
|
23 |
with gr.Accordion('Detailed filters', open=False):
|
24 |
-
#Row orientation
|
25 |
with gr.Row() as additional_input:
|
26 |
n_slider = gr.Slider(label="Number of Results", minimum=1, maximum=100, step=1, value=10)
|
27 |
-
party_dopdown = gr.Dropdown(value='All', choices=['All','CDU/CSU','SPD','FDP','Grüne','not found','DIE LINKE.','PDS','KPD'], label='Party') #change to all possible options
|
|
|
|
|
28 |
|
29 |
search_btn = gr.Button('Search')
|
30 |
|
31 |
with gr.Column(visible=False) as output_col:
|
32 |
results_df = gr.Dataframe(label='Results', interactive=False)
|
33 |
|
34 |
-
#Download results from keyword search
|
35 |
with gr.Accordion('Would you like to download your results?', open=False) as download_row:
|
36 |
with gr.Row():
|
37 |
ftype_dropdown = gr.Dropdown(choices=["csv","excel","json"], label="Format")
|
38 |
export_btn = gr.Button('Export')
|
39 |
file = gr.File(file_types=[".xlsx", ".csv", ".json"], visible=False)
|
40 |
|
41 |
-
#Keyword Search on click
|
42 |
-
def search(keyword, n, party): #ToDo: Include party and timedate
|
43 |
return {
|
44 |
output_col: gr.Column(visible=True),
|
45 |
results_df: keyword_search(query=keyword, n=n, party_filter=party),
|
@@ -51,7 +85,7 @@ with gr.Blocks() as App:
|
|
51 |
outputs=[output_col, results_df],
|
52 |
)
|
53 |
|
54 |
-
#Export data to a downloadable format
|
55 |
def export(df, keyword, ftype=None):
|
56 |
if ftype == "csv":
|
57 |
file = f'{keyword}.csv'
|
@@ -74,6 +108,6 @@ with gr.Blocks() as App:
|
|
74 |
|
75 |
|
76 |
if __name__ == "__main__":
|
77 |
-
App.launch(share=False) #
|
78 |
|
79 |
|
|
|
1 |
import gradio as gr
|
2 |
from src.chatbot import chatbot, keyword_search
|
3 |
+
from gradio_calendar import Calendar
|
4 |
+
from datetime import datetime
|
5 |
+
|
6 |
+
|
7 |
+
legislature_periods = [
|
8 |
+
"20. Legislaturperiode",
|
9 |
+
"19. Legislaturperiode",
|
10 |
+
"18. Legislaturperiode",
|
11 |
+
"17. Legislaturperiode",
|
12 |
+
"16. Legislaturperiode",
|
13 |
+
"15. Legislaturperiode",
|
14 |
+
"14. Legislaturperiode",
|
15 |
+
"13. Legislaturperiode",
|
16 |
+
"12. Legislaturperiode",
|
17 |
+
"11. Legislaturperiode",
|
18 |
+
"10. Legislaturperiode",
|
19 |
+
"9. Legislaturperiode",
|
20 |
+
"8. Legislaturperiode",
|
21 |
+
"7. Legislaturperiode",
|
22 |
+
"6. Legislaturperiode",
|
23 |
+
"5. Legislaturperiode",
|
24 |
+
"4. Legislaturperiode",
|
25 |
+
"3. Legislaturperiode",
|
26 |
+
"2. Legislaturperiode",
|
27 |
+
"1. Legislaturperiode"
|
28 |
+
]
|
29 |
+
|
30 |
|
31 |
|
32 |
with gr.Blocks() as App:
|
33 |
with gr.Tab("ChatBot"):
|
34 |
+
# Apply RAG using the chatbot function imported from src/chatbot.py
|
35 |
+
db_inputs = gr.Dropdown(choices=legislature_periods, value="All", multiselect=True, label="If empty all Legislaturperioden are selected", show_label=True)
|
36 |
+
print(db_inputs)
|
37 |
+
|
38 |
+
|
39 |
gr.ChatInterface(chatbot,
|
40 |
title="PoliticsToYou",
|
41 |
description= "This chatbot uses the infomation of speeches of the german parliament (since 2021) \
|
42 |
to get insight on the view points of the german parties and the debate of the parliament.",
|
43 |
+
#examples=["Wie steht die CDU zur Cannabislegalisierung?","Was waren die wichtigsten Themen in der aktuellen Legislaturperiode?"], #change to meaningful examples
|
44 |
+
cache_examples=False, # True increases the loading time
|
45 |
+
additional_inputs = db_inputs,
|
46 |
)
|
47 |
|
48 |
with gr.Tab("KeyWordSearch"):
|
49 |
|
50 |
with gr.Blocks() as Block:
|
51 |
+
# Keyword Input
|
52 |
keyword_box = gr.Textbox(label='keyword')
|
53 |
|
54 |
#Additional Input (hidden)
|
55 |
with gr.Accordion('Detailed filters', open=False):
|
56 |
+
# Row orientation
|
57 |
with gr.Row() as additional_input:
|
58 |
n_slider = gr.Slider(label="Number of Results", minimum=1, maximum=100, step=1, value=10)
|
59 |
+
party_dopdown = gr.Dropdown(value='All', choices=['All','CDU/CSU','SPD','FDP','Grüne','not found','DIE LINKE.','PDS','KPD'], label='Party') # change choices to all possible options
|
60 |
+
start_date = Calendar(value="1949-01-01", type="datetime", label="Select start date", info="Click the calendar icon to bring up the calendar.", interactive=True)
|
61 |
+
end_date = Calendar(value=datetime.today().strftime('%Y-%m-%d'), type="datetime", label="Select end date", info="Click the calendar icon to bring up the calendar.", interactive=True)
|
62 |
|
63 |
search_btn = gr.Button('Search')
|
64 |
|
65 |
with gr.Column(visible=False) as output_col:
|
66 |
results_df = gr.Dataframe(label='Results', interactive=False)
|
67 |
|
68 |
+
# Download results from keyword search
|
69 |
with gr.Accordion('Would you like to download your results?', open=False) as download_row:
|
70 |
with gr.Row():
|
71 |
ftype_dropdown = gr.Dropdown(choices=["csv","excel","json"], label="Format")
|
72 |
export_btn = gr.Button('Export')
|
73 |
file = gr.File(file_types=[".xlsx", ".csv", ".json"], visible=False)
|
74 |
|
75 |
+
# Keyword Search on click
|
76 |
+
def search(keyword, n, party): # TODO: also filter results by date range (party is already applied)
|
77 |
return {
|
78 |
output_col: gr.Column(visible=True),
|
79 |
results_df: keyword_search(query=keyword, n=n, party_filter=party),
|
|
|
85 |
outputs=[output_col, results_df],
|
86 |
)
|
87 |
|
88 |
+
# Export data to a downloadable format
|
89 |
def export(df, keyword, ftype=None):
|
90 |
if ftype == "csv":
|
91 |
file = f'{keyword}.csv'
|
|
|
108 |
|
109 |
|
110 |
if __name__ == "__main__":
|
111 |
+
App.launch(share=False) # share=True is not supported on HF Spaces
|
112 |
|
113 |
|
src/FAISS.ipynb
CHANGED
@@ -2,19 +2,152 @@
|
|
2 |
"cells": [
|
3 |
{
|
4 |
"cell_type": "code",
|
5 |
-
"execution_count":
|
6 |
"metadata": {},
|
7 |
"outputs": [
|
8 |
{
|
9 |
-
"
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
}
|
19 |
],
|
20 |
"source": [
|
@@ -23,14 +156,96 @@
|
|
23 |
"from vectordatabase import load_documents\n",
|
24 |
"from langchain_community.embeddings import HuggingFaceEmbeddings\n",
|
25 |
"from langchain_community.vectorstores import FAISS\n",
|
|
|
26 |
"\n",
|
27 |
"\n",
|
28 |
"df = pd.read_pickle(\"C:\\\\Users\\Tom\\SynologyDrive\\Tom\\Programming\\\\NLP\\Spaces\\PoliticsToYou\\src\\Speeches\\speeches_1949_09_12\")\n",
|
|
|
29 |
"# Split speeches into documents\n",
|
30 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
"embeddings = HuggingFaceEmbeddings(model_name=\"paraphrase-multilingual-MiniLM-L12-v2\")\n",
|
32 |
-
"
|
33 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
]
|
35 |
}
|
36 |
],
|
|
|
2 |
"cells": [
|
3 |
{
|
4 |
"cell_type": "code",
|
5 |
+
"execution_count": 2,
|
6 |
"metadata": {},
|
7 |
"outputs": [
|
8 |
{
|
9 |
+
"data": {
|
10 |
+
"text/html": [
|
11 |
+
"<div>\n",
|
12 |
+
"<style scoped>\n",
|
13 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
14 |
+
" vertical-align: middle;\n",
|
15 |
+
" }\n",
|
16 |
+
"\n",
|
17 |
+
" .dataframe tbody tr th {\n",
|
18 |
+
" vertical-align: top;\n",
|
19 |
+
" }\n",
|
20 |
+
"\n",
|
21 |
+
" .dataframe thead th {\n",
|
22 |
+
" text-align: right;\n",
|
23 |
+
" }\n",
|
24 |
+
"</style>\n",
|
25 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
26 |
+
" <thead>\n",
|
27 |
+
" <tr style=\"text-align: right;\">\n",
|
28 |
+
" <th></th>\n",
|
29 |
+
" <th>id</th>\n",
|
30 |
+
" <th>speech_content</th>\n",
|
31 |
+
" <th>date</th>\n",
|
32 |
+
" <th>party</th>\n",
|
33 |
+
" </tr>\n",
|
34 |
+
" </thead>\n",
|
35 |
+
" <tbody>\n",
|
36 |
+
" <tr>\n",
|
37 |
+
" <th>0</th>\n",
|
38 |
+
" <td>0</td>\n",
|
39 |
+
" <td>Meine Damen und Herren! Ich eröffne die 2. Sit...</td>\n",
|
40 |
+
" <td>1949-09-12</td>\n",
|
41 |
+
" <td>not found</td>\n",
|
42 |
+
" </tr>\n",
|
43 |
+
" <tr>\n",
|
44 |
+
" <th>1</th>\n",
|
45 |
+
" <td>1</td>\n",
|
46 |
+
" <td>Der Bundesrat ist versammelt, Herr Präsident.\\n</td>\n",
|
47 |
+
" <td>1949-09-12</td>\n",
|
48 |
+
" <td>not found</td>\n",
|
49 |
+
" </tr>\n",
|
50 |
+
" <tr>\n",
|
51 |
+
" <th>2</th>\n",
|
52 |
+
" <td>2</td>\n",
|
53 |
+
" <td>Ich danke für diese Erklärung. Ich stelle dami...</td>\n",
|
54 |
+
" <td>1949-09-12</td>\n",
|
55 |
+
" <td>not found</td>\n",
|
56 |
+
" </tr>\n",
|
57 |
+
" <tr>\n",
|
58 |
+
" <th>3</th>\n",
|
59 |
+
" <td>3</td>\n",
|
60 |
+
" <td>Ja, ich habe den Wunsch.\\n</td>\n",
|
61 |
+
" <td>1949-09-12</td>\n",
|
62 |
+
" <td>not found</td>\n",
|
63 |
+
" </tr>\n",
|
64 |
+
" <tr>\n",
|
65 |
+
" <th>4</th>\n",
|
66 |
+
" <td>4</td>\n",
|
67 |
+
" <td>Ich erteile dem Herrn Bundespräsidenten das Wo...</td>\n",
|
68 |
+
" <td>1949-09-12</td>\n",
|
69 |
+
" <td>not found</td>\n",
|
70 |
+
" </tr>\n",
|
71 |
+
" <tr>\n",
|
72 |
+
" <th>...</th>\n",
|
73 |
+
" <td>...</td>\n",
|
74 |
+
" <td>...</td>\n",
|
75 |
+
" <td>...</td>\n",
|
76 |
+
" <td>...</td>\n",
|
77 |
+
" </tr>\n",
|
78 |
+
" <tr>\n",
|
79 |
+
" <th>930955</th>\n",
|
80 |
+
" <td>1084268</td>\n",
|
81 |
+
" <td>\\n\\nWir sind zwar Kollegen.</td>\n",
|
82 |
+
" <td>2022-12-16</td>\n",
|
83 |
+
" <td>not found</td>\n",
|
84 |
+
" </tr>\n",
|
85 |
+
" <tr>\n",
|
86 |
+
" <th>930956</th>\n",
|
87 |
+
" <td>1084269</td>\n",
|
88 |
+
" <td>\\n\\nLiebe, sehr geehrte Frau Präsidentin!</td>\n",
|
89 |
+
" <td>2022-12-16</td>\n",
|
90 |
+
" <td>CDU/CSU</td>\n",
|
91 |
+
" </tr>\n",
|
92 |
+
" <tr>\n",
|
93 |
+
" <th>930957</th>\n",
|
94 |
+
" <td>1084270</td>\n",
|
95 |
+
" <td>\\n\\nVielen Dank.</td>\n",
|
96 |
+
" <td>2022-12-16</td>\n",
|
97 |
+
" <td>not found</td>\n",
|
98 |
+
" </tr>\n",
|
99 |
+
" <tr>\n",
|
100 |
+
" <th>930958</th>\n",
|
101 |
+
" <td>1084272</td>\n",
|
102 |
+
" <td>\\n\\nDen Abschluss dieser Aktuellen Stunde bild...</td>\n",
|
103 |
+
" <td>2022-12-16</td>\n",
|
104 |
+
" <td>not found</td>\n",
|
105 |
+
" </tr>\n",
|
106 |
+
" <tr>\n",
|
107 |
+
" <th>930959</th>\n",
|
108 |
+
" <td>1084273</td>\n",
|
109 |
+
" <td>\\n\\nSehr geehrte Frau Präsidentin! Werte Kolle...</td>\n",
|
110 |
+
" <td>2022-12-16</td>\n",
|
111 |
+
" <td>SPD</td>\n",
|
112 |
+
" </tr>\n",
|
113 |
+
" </tbody>\n",
|
114 |
+
"</table>\n",
|
115 |
+
"<p>930960 rows × 4 columns</p>\n",
|
116 |
+
"</div>"
|
117 |
+
],
|
118 |
+
"text/plain": [
|
119 |
+
" id speech_content date \\\n",
|
120 |
+
"0 0 Meine Damen und Herren! Ich eröffne die 2. Sit... 1949-09-12 \n",
|
121 |
+
"1 1 Der Bundesrat ist versammelt, Herr Präsident.\\n 1949-09-12 \n",
|
122 |
+
"2 2 Ich danke für diese Erklärung. Ich stelle dami... 1949-09-12 \n",
|
123 |
+
"3 3 Ja, ich habe den Wunsch.\\n 1949-09-12 \n",
|
124 |
+
"4 4 Ich erteile dem Herrn Bundespräsidenten das Wo... 1949-09-12 \n",
|
125 |
+
"... ... ... ... \n",
|
126 |
+
"930955 1084268 \\n\\nWir sind zwar Kollegen. 2022-12-16 \n",
|
127 |
+
"930956 1084269 \\n\\nLiebe, sehr geehrte Frau Präsidentin! 2022-12-16 \n",
|
128 |
+
"930957 1084270 \\n\\nVielen Dank. 2022-12-16 \n",
|
129 |
+
"930958 1084272 \\n\\nDen Abschluss dieser Aktuellen Stunde bild... 2022-12-16 \n",
|
130 |
+
"930959 1084273 \\n\\nSehr geehrte Frau Präsidentin! Werte Kolle... 2022-12-16 \n",
|
131 |
+
"\n",
|
132 |
+
" party \n",
|
133 |
+
"0 not found \n",
|
134 |
+
"1 not found \n",
|
135 |
+
"2 not found \n",
|
136 |
+
"3 not found \n",
|
137 |
+
"4 not found \n",
|
138 |
+
"... ... \n",
|
139 |
+
"930955 not found \n",
|
140 |
+
"930956 CDU/CSU \n",
|
141 |
+
"930957 not found \n",
|
142 |
+
"930958 not found \n",
|
143 |
+
"930959 SPD \n",
|
144 |
+
"\n",
|
145 |
+
"[930960 rows x 4 columns]"
|
146 |
+
]
|
147 |
+
},
|
148 |
+
"execution_count": 2,
|
149 |
+
"metadata": {},
|
150 |
+
"output_type": "execute_result"
|
151 |
}
|
152 |
],
|
153 |
"source": [
|
|
|
156 |
"from vectordatabase import load_documents\n",
|
157 |
"from langchain_community.embeddings import HuggingFaceEmbeddings\n",
|
158 |
"from langchain_community.vectorstores import FAISS\n",
|
159 |
+
"from datetime import datetime\n",
|
160 |
"\n",
|
161 |
"\n",
|
162 |
"df = pd.read_pickle(\"C:\\\\Users\\Tom\\SynologyDrive\\Tom\\Programming\\\\NLP\\Spaces\\PoliticsToYou\\src\\Speeches\\speeches_1949_09_12\")\n",
|
163 |
+
"df['date'] = pd.to_datetime(df['date'])\n",
|
164 |
"# Split speeches into documents\n",
|
165 |
+
"df"
|
166 |
+
]
|
167 |
+
},
|
168 |
+
{
|
169 |
+
"cell_type": "code",
|
170 |
+
"execution_count": 3,
|
171 |
+
"metadata": {},
|
172 |
+
"outputs": [
|
173 |
+
{
|
174 |
+
"name": "stderr",
|
175 |
+
"output_type": "stream",
|
176 |
+
"text": [
|
177 |
+
"c:\\Python\\Lib\\site-packages\\huggingface_hub\\file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
|
178 |
+
" warnings.warn(\n",
|
179 |
+
"c:\\Python\\Lib\\site-packages\\huggingface_hub\\file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
|
180 |
+
" warnings.warn(\n"
|
181 |
+
]
|
182 |
+
},
|
183 |
+
{
|
184 |
+
"name": "stdout",
|
185 |
+
"output_type": "stream",
|
186 |
+
"text": [
|
187 |
+
"Sucessfully created vector store for 1. legislature\n",
|
188 |
+
"Sucessfully created vector store for 2. legislature\n",
|
189 |
+
"Sucessfully created vector store for 3. legislature\n",
|
190 |
+
"Sucessfully created vector store for 4. legislature\n",
|
191 |
+
"Sucessfully created vector store for 5. legislature\n",
|
192 |
+
"Sucessfully created vector store for 6. legislature\n",
|
193 |
+
"Sucessfully created vector store for 7. legislature\n",
|
194 |
+
"Sucessfully created vector store for 8. legislature\n",
|
195 |
+
"Sucessfully created vector store for 9. legislature\n",
|
196 |
+
"Sucessfully created vector store for 10. legislature\n",
|
197 |
+
"Sucessfully created vector store for 11. legislature\n",
|
198 |
+
"Sucessfully created vector store for 12. legislature\n",
|
199 |
+
"Sucessfully created vector store for 13. legislature\n",
|
200 |
+
"Sucessfully created vector store for 14. legislature\n",
|
201 |
+
"Sucessfully created vector store for 15. legislature\n",
|
202 |
+
"Sucessfully created vector store for 16. legislature\n",
|
203 |
+
"Sucessfully created vector store for 17. legislature\n",
|
204 |
+
"Sucessfully created vector store for 18. legislature\n",
|
205 |
+
"Sucessfully created vector store for 19. legislature\n",
|
206 |
+
"Sucessfully created vector store for 20. legislature\n"
|
207 |
+
]
|
208 |
+
}
|
209 |
+
],
|
210 |
+
"source": [
|
211 |
+
"\n",
|
212 |
+
"dates = [\"1953-10-06\", \"1957-10-16\", \"1961-10-17\", \"1965-10-19\", \"1969-10-20\", \"1972-12-13\", \"1976-12-14\", \"1980-11-04\", \"1983-03-29\", \"1987-02-18\",\"1990-12-20\", \"1994-11-10\", \"1998-10-26\", \"2002-10-17\", \"2005-10-18\", \"2009-10-27\", \"2013-10-22\",\"2017-10-24\",\"2021-10-26\", None]\n",
|
213 |
"embeddings = HuggingFaceEmbeddings(model_name=\"paraphrase-multilingual-MiniLM-L12-v2\")\n",
|
214 |
+
"\n",
|
215 |
+
"# Iterate over all dates to split the speeches by legislature, building one vector store per period\n",
|
216 |
+
"\n",
|
217 |
+
"period = 1\n",
|
218 |
+
"previous_date = None\n",
|
219 |
+
"for date in dates:\n",
|
220 |
+
" if previous_date is None:\n",
|
221 |
+
" legislature = df.loc[df['date'] < datetime.strptime(date, \"%Y-%m-%d\")]\n",
|
222 |
+
" elif date is None:\n",
|
223 |
+
" legislature = df.loc[df['date'] >= datetime.strptime(previous_date, \"%Y-%m-%d\")]\n",
|
224 |
+
" else:\n",
|
225 |
+
" legislature = df.loc[(df['date'] >= datetime.strptime(previous_date, \"%Y-%m-%d\")) & (df['date'] < datetime.strptime(date, \"%Y-%m-%d\"))]\n",
|
226 |
+
"\n",
|
227 |
+
" \n",
|
228 |
+
" # Split text into documents\n",
|
229 |
+
" documents = load_documents(legislature)\n",
|
230 |
+
" index_name = f'{period}_legislature'\n",
|
231 |
+
" db = FAISS.from_documents(documents, embeddings)\n",
|
232 |
+
" db.save_local(folder_path=\"FAISS\", index_name=index_name)\n",
|
233 |
+
" print(f\"Sucessfully created vector store for {period}. legislature\")\n",
|
234 |
+
" # Change for next iteration\n",
|
235 |
+
" period += 1\n",
|
236 |
+
" previous_date = date\n",
|
237 |
+
"\n",
|
238 |
+
"\n"
|
239 |
+
]
|
240 |
+
},
|
241 |
+
{
|
242 |
+
"cell_type": "code",
|
243 |
+
"execution_count": null,
|
244 |
+
"metadata": {},
|
245 |
+
"outputs": [],
|
246 |
+
"source": [
|
247 |
+
"\n",
|
248 |
+
"\n"
|
249 |
]
|
250 |
}
|
251 |
],
|
src/FAISS/10_legislature.faiss
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:66ac4627657617d20657ee29e060d5e0201f23474f2bb316fcfcd1e784347c83
|
3 |
+
size 238133805
|
src/FAISS/10_legislature.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:63591732c250d403c23d68d007f2f0091d1e4393c8f06131c59dd0756c08e479
|
3 |
+
size 107921064
|
src/FAISS/11_legislature.faiss
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f083f538427463ff2d9dfd1a7f1dddc7b0ec944ae7c4a0c0f110d71e268c7eb5
|
3 |
+
size 234221613
|
src/FAISS/11_legislature.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:aba4a26fd81efa5416bbdadcf5416a9c43d5a24fe319073de8d862812e91eb77
|
3 |
+
size 109009058
|
src/FAISS/12_legislature.faiss
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1651a9d458986410808873d8fbc8609181ec55726827375987834975eaea303e
|
3 |
+
size 252945453
|
src/FAISS/12_legislature.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:98a03cba6aef98df4cb4f147f21072b14d647f6ed8fab51b5d9000a93934c53b
|
3 |
+
size 120628791
|
src/FAISS/13_legislature.faiss
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9349d40b633b9c9265e11cce8466137ba47e1622550291d37587e9cf57ca7f18
|
3 |
+
size 256303149
|
src/FAISS/13_legislature.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:cff67a961bec000909289049a63a389d18a0b818394b0a505da5f1867a5dc2cb
|
3 |
+
size 123142473
|
src/FAISS/14_legislature.faiss
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:005fe85f80d6e073f12cc60b3fe32b65f87df5f5ef233c4c6ebe30594b542c18
|
3 |
+
size 240325677
|
src/FAISS/14_legislature.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:98f41d12a44e8a3592f9c829744102328834f09d913305181dc827d54c15ec3f
|
3 |
+
size 125554373
|
src/FAISS/15_legislature.faiss
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8e34018d0aef1cefd2e68851b5e3ae7b4ff1133872e238a37a92358204428ddd
|
3 |
+
size 167463981
|
src/FAISS/15_legislature.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ba506f555d4b8a668233621cb1e006a85fdd5444196d0d83d8b83bcdb09725f1
|
3 |
+
size 87021966
|
src/FAISS/16_legislature.faiss
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2924f42f74019b6c55304f6ab9c7e0778926123c2fd46a69b6b3a9d428cc445a
|
3 |
+
size 246827565
|
src/FAISS/16_legislature.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7c219b81c0e07e9911bc0a41dc5d064c07dc77ca968410db3491556fb2da019b
|
3 |
+
size 128041546
|
src/FAISS/17_legislature.faiss
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c99b6b30e3fe81fa31b3cf8384294215a86f6591641916b3a7abf793c08bd51f
|
3 |
+
size 309130797
|
src/FAISS/17_legislature.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:13d510ccd279e3eac2f80ca323ccd14b073c1e1135c7cb9cb1caf73e5a8ebb27
|
3 |
+
size 162425243
|
src/FAISS/18_legislature.faiss
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e40a3278487a92ab9cb94e24bddcc35df8ca8f0f81d0871a3ca3deea9ad07deb
|
3 |
+
size 240373293
|
src/FAISS/18_legislature.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:753d1838eb1aa45449cc728374949af6aea3887a39c80644b73688f259f3bc0c
|
3 |
+
size 124641794
|
src/FAISS/19_legislature.faiss
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3aa5be26dea0ba0048a917ec51b445cfca46b54529437de589a87bb156b047b1
|
3 |
+
size 255599661
|
src/FAISS/19_legislature.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:11ce629b1d179528d2581343bdc5e6ba11762be0d7f0710823e783dd28091334
|
3 |
+
size 128545267
|
src/FAISS/1_legislature.faiss
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:287fba08c823eb8438576944cb8ab3a70ff810f1dec83b8f6ef220d3c8f0e87d
|
3 |
+
size 175981101
|
src/FAISS/1_legislature.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e78cb806846e05044d623763f490bd959fb52d1d6d2a53fea45a1dab8b8414a9
|
3 |
+
size 87900606
|
src/FAISS/20_legislature.faiss
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5eef8fc33a06b508f21d817e4a081b40fc9086238641a7f29b8280700bf41c4e
|
3 |
+
size 81005613
|
src/FAISS/20_legislature.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:27919466bac90278eebddda9de8d4b86f1ed7d1e309606e96e59486704a56dc6
|
3 |
+
size 40616936
|
src/FAISS/2_legislature.faiss
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f63b7e7135fdccc872d93e95653bb3df97ec456f4409f0009abafc4ffe3ab8aa
|
3 |
+
size 155570733
|
src/FAISS/2_legislature.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:dca740628c8500c62e37362813068399670351c934912f5586b9d558f1ed81b0
|
3 |
+
size 76574674
|
src/FAISS/3_legislature.faiss
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3c901df3b2be8e4c237ee26296f8c320f75caa63aa8c2381ebd2de530545d87a
|
3 |
+
size 120843309
|
src/FAISS/3_legislature.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9e4ab8b365422001985f274a71c74ffdec6d8e7cc122196257cf92f33ba1127b
|
3 |
+
size 59003914
|
src/FAISS/4_legislature.faiss
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2605fe28e330771a210256307a4638ac4f5a256a19299cfa6b3932015b803cf4
|
3 |
+
size 139723821
|
src/FAISS/4_legislature.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:02511c79b3e38d13815ac02fc1627cc0000a964730efb49a0fe5866fbb7772c5
|
3 |
+
size 60800382
|
src/FAISS/5_legislature.faiss
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f0dc389788da319691d24a9209d8fa0e181d3cb3396e424407cb565c44bd386f
|
3 |
+
size 183742509
|
src/FAISS/5_legislature.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6f5521ea0d8260f20f214eda9f6c726195341b04cd95e6e65613a608fc8540b2
|
3 |
+
size 81197730
|
src/FAISS/6_legislature.faiss
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3ac34ac64c99c302c3fcd7c6c65e7a758d9c2b0f3e4294c7533a3448de49e6ee
|
3 |
+
size 134287917
|
src/FAISS/6_legislature.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fe2f497330f7dfd6ec51cce4d42d5b065bdb27828ab0d85177ee85dac47ad596
|
3 |
+
size 62478365
|
src/FAISS/7_legislature.faiss
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0efa99ff055e4399db9ad5ca85c2538e1441924ea4937c32df4635e23e3e97cb
|
3 |
+
size 211250733
|
src/FAISS/7_legislature.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f229c55d9d44958db68381d0f6805fff269d406cf94260a22c656dd13056b308
|
3 |
+
size 96069866
|
src/FAISS/8_legislature.faiss
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:53a84ee9456604696f83fa82245c13aaa6e60099585d93b375a5f67f6ccdd3d9
|
3 |
+
size 200272941
|
src/FAISS/8_legislature.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3f44bc8d65db7aaabe1bbad09d6022bf97b537f50648f56f9d15465a7d9eb61d
|
3 |
+
size 90726289
|
src/FAISS/9_legislature.faiss
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8e0f48ef95f2d0edb9914ffacffd8e18aed9325b467293a8480f069143407102
|
3 |
+
size 113241645
|
src/FAISS/9_legislature.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3e416d21bcfb120aa5047ba3d556f5df79e16d04676e8fb9e7fdcd8b6116a31e
|
3 |
+
size 50705534
|
src/chatbot.py
CHANGED
@@ -2,6 +2,7 @@ from langchain_core.prompts import ChatPromptTemplate
|
|
2 |
from langchain_community.llms.huggingface_hub import HuggingFaceHub
|
3 |
from langchain_community.embeddings import HuggingFaceEmbeddings
|
4 |
|
|
|
5 |
from src.vectordatabase import RAG, get_vectorstore
|
6 |
import pandas as pd
|
7 |
from dotenv import load_dotenv, find_dotenv
|
@@ -50,47 +51,97 @@ prompt2 = ChatPromptTemplate.from_template("""Beantworte die folgende Frage auf
|
|
50 |
)
|
51 |
|
52 |
|
53 |
-
folder_path =
|
54 |
-
index_name = "speeches_1949_09_12"
|
55 |
#index_name = "legislature20"
|
56 |
-
db =
|
|
|
|
|
|
|
|
|
|
|
57 |
|
58 |
-
def chatbot(message, history,
|
|
|
59 |
raw_response = RAG(llm=llm, prompt=prompt, db=db, question=message)
|
|
|
60 |
response = raw_response['answer'].split("Antwort: ")[1]
|
61 |
return response
|
62 |
|
63 |
-
|
64 |
-
def keyword_search(query,n=10,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
65 |
query_embedding = embeddings.embed_query(query)
|
|
|
|
|
66 |
if method == 'mmr':
|
67 |
-
df_res = pd.DataFrame(columns=['Speech Content','Date', 'Party', 'Relevance'])
|
68 |
-
results =
|
69 |
for doc in results:
|
70 |
party = doc[0].metadata["party"]
|
71 |
-
#Filter by party input
|
72 |
if party != party_filter and party_filter != 'All':
|
73 |
-
|
74 |
speech_content = doc[0].page_content
|
75 |
speech_date = doc[0].metadata["date"]
|
76 |
-
score = round(doc[1], ndigits=2)
|
77 |
df_res = pd.concat([df_res, pd.DataFrame({'Speech Content': [speech_content],
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
df_res.sort_values('Relevance', inplace=True, ascending=True)
|
|
|
|
|
82 |
else:
|
83 |
-
df_res = pd.DataFrame(columns=['Speech Content','Date', 'Party'])
|
84 |
-
results = db.similarity_search_by_vector(query_embedding, k
|
85 |
for doc in results:
|
86 |
party = doc.metadata["party"]
|
87 |
-
#Filter by party input
|
88 |
if party != party_filter and party_filter != 'All':
|
89 |
-
|
90 |
speech_content = doc.page_content
|
91 |
speech_date = doc.metadata["date"]
|
92 |
-
|
93 |
df_res = pd.concat([df_res, pd.DataFrame({'Speech Content': [speech_content],
|
94 |
-
|
95 |
-
|
96 |
-
return df_res
|
|
|
2 |
from langchain_community.llms.huggingface_hub import HuggingFaceHub
|
3 |
from langchain_community.embeddings import HuggingFaceEmbeddings
|
4 |
|
5 |
+
|
6 |
from src.vectordatabase import RAG, get_vectorstore
|
7 |
import pandas as pd
|
8 |
from dotenv import load_dotenv, find_dotenv
|
|
|
51 |
)
|
52 |
|
53 |
|
54 |
+
#folder_path =
|
55 |
+
#index_name = "speeches_1949_09_12"
|
56 |
#index_name = "legislature20"
|
57 |
+
#db = get
|
58 |
+
|
59 |
+
|
60 |
+
|
61 |
+
|
62 |
+
|
63 |
|
64 |
+
def chatbot(message, history, db_inputs, llm=llm, prompt=prompt2):
    """
    Answer a chat message using retrieval-augmented generation.

    Parameters:
    ----------
    message : str
        The user's question; passed to `RAG` as the question.

    history : list
        Not used by this function.
        NOTE(review): presumably required by the Gradio chat callback signature - confirm.

    db_inputs : list of str
        Selection forwarded to `get_vectorstore` to decide which vector stores are loaded.

    llm : optional
        Language model handed to `RAG` (default is the module-level `llm`).

    prompt : optional
        Prompt template handed to `RAG` (default is the module-level `prompt2`).

    Returns:
    -------
    str
        The text following the first "Antwort: " marker in the model output, or the
        complete model output when the marker is missing.
    """
    db = get_vectorstore(inputs=db_inputs, embeddings=embeddings)
    raw_response = RAG(llm=llm, prompt=prompt, db=db, question=message)
    answer = raw_response['answer']

    # Only necessary because mistral does not give beautiful outputs: keep just the
    # part after the "Antwort: " marker.
    parts = answer.split("Antwort: ")
    if len(parts) < 2:
        # Marker missing: return the raw answer instead of failing with an IndexError.
        return answer
    return parts[1]
|
70 |
|
71 |
+
|
72 |
+
def keyword_search(db, query, n=10, embeddings=embeddings, method='ss', party_filter='All'):
    """
    Retrieve speech contents based on keywords using a specified method.

    Parameters:
    ----------
    db : FAISS
        The FAISS vector store containing speech embeddings.

    query : str
        The keyword(s) to search for in the speech contents.

    n : int, optional
        The number of speech contents requested from the vector store (default is 10).

    embeddings : Embeddings, optional
        An instance of embeddings used for embedding the query (default is the module-level `embeddings`).

    method : str, optional
        The retrieval method. 'mmr' selects maximal marginal relevance; any other value
        (including the default 'ss') selects similarity search.

    party_filter : str, optional
        Keep only speeches whose `party` metadata equals this value. Specify 'All' to keep
        speeches from all parties (default is 'All').

    Returns:
    -------
    pandas.DataFrame
        Columns 'Speech Content', 'Date' and 'Party'. For method 'mmr' there is an additional
        'Relevance' column (score rounded to two digits) and the rows are sorted by it in
        ascending order.

    Notes:
    -----
    - The party filter is applied after the `n` hits have been retrieved, so the result can
      contain fewer than `n` rows.
    - NOTE(review): the FAISS score looks like a distance (lower = closer), which would explain
      the ascending sort - confirm.
    """
    query_embedding = embeddings.embed_query(query)
    use_mmr = method == 'mmr'

    if use_mmr:
        # Maximal Marginal Relevance: every hit is a (document, score) pair.
        columns = ['Speech Content', 'Date', 'Party', 'Relevance']
        hits = db.max_marginal_relevance_search_with_score_by_vector(query_embedding, k=n)
    else:
        # Similarity Search: hits are bare documents; pair them with None so that both
        # methods can share the loop below.
        columns = ['Speech Content', 'Date', 'Party']
        hits = [(doc, None) for doc in db.similarity_search_by_vector(query_embedding, k=n)]

    # Collect plain rows and build the frame once; calling pd.concat per hit copies the
    # whole frame on every iteration.
    rows = []
    for doc, score in hits:
        party = doc.metadata["party"]
        # Filter by party input
        if party_filter != 'All' and party != party_filter:
            continue
        row = {'Speech Content': doc.page_content,
               'Date': doc.metadata["date"],
               'Party': party}
        if use_mmr:
            row['Relevance'] = round(score, ndigits=2)
        rows.append(row)

    df_res = pd.DataFrame(rows, columns=columns)
    if use_mmr:
        df_res.sort_values('Relevance', inplace=True, ascending=True)
    return df_res
|
src/vectordatabase.py
CHANGED
@@ -6,6 +6,15 @@ from langchain_community.llms import HuggingFaceHub
|
|
6 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
7 |
from langchain.chains.combine_documents import create_stuff_documents_chain
|
8 |
from langchain.chains import create_retrieval_chain
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
|
10 |
import os
|
11 |
#from dotenv import load_dotenv
|
@@ -29,19 +38,72 @@ def load_documents(df):
|
|
29 |
documents = splitter.split_documents(documents=data)
|
30 |
return documents
|
31 |
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
embeddings=embeddings, allow_dangerous_deserialization=True)
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
return db
|
44 |
|
|
|
|
|
45 |
# Apply RAG by providing the context and the question to the LLM using the predefined template
|
46 |
def RAG(llm, prompt, db, question):
|
47 |
document_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
|
@@ -51,3 +113,21 @@ def RAG(llm, prompt, db, question):
|
|
51 |
response = retrieval_chain.invoke({"input": question})
|
52 |
return response
|
53 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
7 |
from langchain.chains.combine_documents import create_stuff_documents_chain
|
8 |
from langchain.chains import create_retrieval_chain
|
9 |
+
from faiss import IndexFlatL2
|
10 |
+
from langchain_community.docstore.in_memory import InMemoryDocstore
|
11 |
+
from langchain.embeddings import SentenceTransformerEmbeddings
|
12 |
+
import functools
|
13 |
+
|
14 |
+
|
15 |
+
|
16 |
+
|
17 |
+
import pandas as pd
|
18 |
|
19 |
import os
|
20 |
#from dotenv import load_dotenv
|
|
|
38 |
documents = splitter.split_documents(documents=data)
|
39 |
return documents
|
40 |
|
41 |
+
|
42 |
+
#@functools.lru_cache()
|
43 |
+
def get_vectorstore(inputs, embeddings):
    """
    Combine multiple FAISS vector stores into a single vector store based on the specified inputs.

    Parameters:
    ----------
    inputs : list of str
        Which vector stores to combine. Each entry is either the keyword "All" or a label whose
        text before the first "." names the legislature (e.g. "20. Legislaturperiode" loads the
        index "20_legislature"). If "All" appears anywhere in the list, the comprehensive
        store is loaded and returned immediately.

    embeddings : Embeddings
        Embeddings instance used to load the stores and to determine the vector dimensionality.

    Returns:
    -------
    FAISS
        Either the comprehensive store "speeches_1949_09_12" or a new store into which all
        selected per-legislature stores have been merged.

    Raises:
    ------
    ValueError
        If `inputs` is empty.

    Notes:
    -----
    - Index files are read from the fixed folder "./src/FAISS".
    - The empty store that receives the merges uses an `IndexFlatL2` whose dimensionality is
      the length of the embedding of a dummy query.
    - `FAISS.load_local` is called with `allow_dangerous_deserialization=True`, which unpickles
      the stored docstore. Only load index files from a trusted source.
    """
    if not inputs:
        raise ValueError("inputs must contain at least one vector store selection")

    # Default folder path
    folder_path = "./src/FAISS"

    if "All" in inputs:
        return FAISS.load_local(folder_path=folder_path, index_name="speeches_1949_09_12",
                                embeddings=embeddings, allow_dangerous_deserialization=True)

    # Initialize empty db; the index size has to match the embedding model's output length.
    dimensions: int = len(embeddings.embed_query("dummy"))
    db = FAISS(
        embedding_function=embeddings,
        index=IndexFlatL2(dimensions),
        docstore=InMemoryDocstore(),
        index_to_docstore_id={},
        normalize_L2=False
    )

    # Retrieve inputs: 20. Legislaturperiode, 19. Legislaturperiode, ...
    for selection in inputs:
        # Retrieve selected index and merge vector stores
        legislature = selection.split(".")[0]
        local_db = FAISS.load_local(folder_path=folder_path, index_name=f'{legislature}_legislature',
                                    embeddings=embeddings, allow_dangerous_deserialization=True)
        db.merge_from(local_db)
    return db
|
104 |
|
105 |
+
|
106 |
+
|
107 |
# Apply RAG by providing the context and the question to the LLM using the predefined template
|
108 |
def RAG(llm, prompt, db, question):
|
109 |
document_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
|
|
|
113 |
response = retrieval_chain.invoke({"input": question})
|
114 |
return response
|
115 |
|
116 |
+
#########
|
117 |
+
# Dynamically loading vector_db
|
118 |
+
##########
|
119 |
+
|
120 |
+
def get_similar_vectorstore(start_date, end_date, party, base_path='src/FAISS'):
    """
    Scan `base_path` for FAISS index files matching a party and start date (work in progress).

    Parameters:
    ----------
    start_date : str
        Compared against the start date taken from each file name.
        NOTE(review): this is a plain string comparison, so it assumes both values use the same
        sortable format - confirm.

    end_date : str
        Not used yet.

    party : str
        Party that the file name must carry to be considered.

    base_path : str, optional
        Folder that is listed for "*.faiss" files (default is 'src/FAISS').

    Returns:
    -------
    None
        The selection logic is not implemented yet; nothing is returned.

    Notes:
    -----
    - File names are expected to look like "<prefix>_<start>_<end>_<party>.faiss". Names with
      fewer than four "_"-separated parts (e.g. "10_legislature") are skipped.
    """
    # Get all file names (without extension) of the FAISS index files. Names without a "."
    # yield an empty extension and are ignored.
    vector_stores = []
    for store in os.listdir(base_path):
        stem, _, extension = store.partition(".")
        if extension == "faiss":
            vector_stores.append(stem)

    # TODO: fill this frame with the candidate stores; it is currently never populated.
    df = pd.DataFrame(columns=["file_name", "start_date", "end_date", "date_diff"])

    # Extract metadata of file from its name
    for file_name in vector_stores:
        file_elements = file_name.split("_")
        if len(file_elements) < 4:
            continue
        file_start_date, file_end_date, file_party = file_elements[1:4]

        if file_party == party and file_start_date <= start_date:
            # TODO: record the matching store; no selection is implemented yet.
            pass
    return None
|