TomData committed on
Commit
85df319
1 Parent(s): e681b03

added flexible vectorstore

Browse files
Files changed (45) hide show
  1. .gitignore +2 -1
  2. Home.py +45 -11
  3. src/FAISS.ipynb +228 -13
  4. src/FAISS/10_legislature.faiss +3 -0
  5. src/FAISS/10_legislature.pkl +3 -0
  6. src/FAISS/11_legislature.faiss +3 -0
  7. src/FAISS/11_legislature.pkl +3 -0
  8. src/FAISS/12_legislature.faiss +3 -0
  9. src/FAISS/12_legislature.pkl +3 -0
  10. src/FAISS/13_legislature.faiss +3 -0
  11. src/FAISS/13_legislature.pkl +3 -0
  12. src/FAISS/14_legislature.faiss +3 -0
  13. src/FAISS/14_legislature.pkl +3 -0
  14. src/FAISS/15_legislature.faiss +3 -0
  15. src/FAISS/15_legislature.pkl +3 -0
  16. src/FAISS/16_legislature.faiss +3 -0
  17. src/FAISS/16_legislature.pkl +3 -0
  18. src/FAISS/17_legislature.faiss +3 -0
  19. src/FAISS/17_legislature.pkl +3 -0
  20. src/FAISS/18_legislature.faiss +3 -0
  21. src/FAISS/18_legislature.pkl +3 -0
  22. src/FAISS/19_legislature.faiss +3 -0
  23. src/FAISS/19_legislature.pkl +3 -0
  24. src/FAISS/1_legislature.faiss +3 -0
  25. src/FAISS/1_legislature.pkl +3 -0
  26. src/FAISS/20_legislature.faiss +3 -0
  27. src/FAISS/20_legislature.pkl +3 -0
  28. src/FAISS/2_legislature.faiss +3 -0
  29. src/FAISS/2_legislature.pkl +3 -0
  30. src/FAISS/3_legislature.faiss +3 -0
  31. src/FAISS/3_legislature.pkl +3 -0
  32. src/FAISS/4_legislature.faiss +3 -0
  33. src/FAISS/4_legislature.pkl +3 -0
  34. src/FAISS/5_legislature.faiss +3 -0
  35. src/FAISS/5_legislature.pkl +3 -0
  36. src/FAISS/6_legislature.faiss +3 -0
  37. src/FAISS/6_legislature.pkl +3 -0
  38. src/FAISS/7_legislature.faiss +3 -0
  39. src/FAISS/7_legislature.pkl +3 -0
  40. src/FAISS/8_legislature.faiss +3 -0
  41. src/FAISS/8_legislature.pkl +3 -0
  42. src/FAISS/9_legislature.faiss +3 -0
  43. src/FAISS/9_legislature.pkl +3 -0
  44. src/chatbot.py +73 -22
  45. src/vectordatabase.py +90 -10
.gitignore CHANGED
@@ -1,3 +1,4 @@
1
  __pycache__
2
  hf_upload.py
3
- .env
 
 
1
  __pycache__
2
  hf_upload.py
3
+ .env
4
+ .mypy_cache
Home.py CHANGED
@@ -1,45 +1,79 @@
1
  import gradio as gr
2
  from src.chatbot import chatbot, keyword_search
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
 
5
  with gr.Blocks() as App:
6
  with gr.Tab("ChatBot"):
7
- #Apply RAG using chatbut function from local file ChatBot.py
 
 
 
 
8
  gr.ChatInterface(chatbot,
9
  title="PoliticsToYou",
10
  description= "This chatbot uses the infomation of speeches of the german parliament (since 2021) \
11
  to get insight on the view points of the german parties and the debate of the parliament.",
12
- examples=["Wie steht die CDU zur Cannabislegalisierung?","Was waren die wichtigsten Themen in der aktuellen Legislaturperiode?"], #change to meaningful examples
13
- cache_examples=False, #true increases the loading time
 
14
  )
15
 
16
  with gr.Tab("KeyWordSearch"):
17
 
18
  with gr.Blocks() as Block:
19
- #Keyword Input
20
  keyword_box = gr.Textbox(label='keyword')
21
 
22
  #Additional Input (hidden)
23
  with gr.Accordion('Detailed filters', open=False):
24
- #Row orientation
25
  with gr.Row() as additional_input:
26
  n_slider = gr.Slider(label="Number of Results", minimum=1, maximum=100, step=1, value=10)
27
- party_dopdown = gr.Dropdown(value='All', choices=['All','CDU/CSU','SPD','FDP','Grüne','not found','DIE LINKE.','PDS','KPD'], label='Party') #change to all possible options
 
 
28
 
29
  search_btn = gr.Button('Search')
30
 
31
  with gr.Column(visible=False) as output_col:
32
  results_df = gr.Dataframe(label='Results', interactive=False)
33
 
34
- #Download results from keyword search
35
  with gr.Accordion('Would you like to download your results?', open=False) as download_row:
36
  with gr.Row():
37
  ftype_dropdown = gr.Dropdown(choices=["csv","excel","json"], label="Format")
38
  export_btn = gr.Button('Export')
39
  file = gr.File(file_types=[".xlsx", ".csv", ".json"], visible=False)
40
 
41
- #Keyword Search on click
42
- def search(keyword, n, party): #ToDo: Include party and timedate
43
  return {
44
  output_col: gr.Column(visible=True),
45
  results_df: keyword_search(query=keyword, n=n, party_filter=party),
@@ -51,7 +85,7 @@ with gr.Blocks() as App:
51
  outputs=[output_col, results_df],
52
  )
53
 
54
- #Export data to a downloadable format
55
  def export(df, keyword, ftype=None):
56
  if ftype == "csv":
57
  file = f'{keyword}.csv'
@@ -74,6 +108,6 @@ with gr.Blocks() as App:
74
 
75
 
76
  if __name__ == "__main__":
77
- App.launch(share=False) #true not supported on hf spaces
78
 
79
 
 
1
  import gradio as gr
2
  from src.chatbot import chatbot, keyword_search
3
+ from gradio_calendar import Calendar
4
+ from datetime import datetime
5
+
6
+
7
+ legislature_periods = [
8
+ "20. Legislaturperiode",
9
+ "19. Legislaturperiode",
10
+ "18. Legislaturperiode",
11
+ "17. Legislaturperiode",
12
+ "16. Legislaturperiode",
13
+ "15. Legislaturperiode",
14
+ "14. Legislaturperiode",
15
+ "13. Legislaturperiode",
16
+ "12. Legislaturperiode",
17
+ "11. Legislaturperiode",
18
+ "10. Legislaturperiode",
19
+ "9. Legislaturperiode",
20
+ "8. Legislaturperiode",
21
+ "7. Legislaturperiode",
22
+ "6. Legislaturperiode",
23
+ "5. Legislaturperiode",
24
+ "4. Legislaturperiode",
25
+ "3. Legislaturperiode",
26
+ "2. Legislaturperiode",
27
+ "1. Legislaturperiode"
28
+ ]
29
+
30
 
31
 
32
  with gr.Blocks() as App:
33
  with gr.Tab("ChatBot"):
34
+ # Apply RAG using the chatbot function from src/chatbot.py
35
+ db_inputs = gr.Dropdown(choices=legislature_periods, value="All", multiselect=True, label="If empty all Legislaturperioden are selected", show_label=True)
36
+ print(db_inputs)
37
+
38
+
39
  gr.ChatInterface(chatbot,
40
  title="PoliticsToYou",
41
  description= "This chatbot uses the infomation of speeches of the german parliament (since 2021) \
42
  to get insight on the view points of the german parties and the debate of the parliament.",
43
+ #examples=["Wie steht die CDU zur Cannabislegalisierung?","Was waren die wichtigsten Themen in der aktuellen Legislaturperiode?"], #change to meaningful examples
44
+ cache_examples=False, #true increases the loading time
45
+ additional_inputs = db_inputs,
46
  )
47
 
48
  with gr.Tab("KeyWordSearch"):
49
 
50
  with gr.Blocks() as Block:
51
+ # Keyword Input
52
  keyword_box = gr.Textbox(label='keyword')
53
 
54
  #Additional Input (hidden)
55
  with gr.Accordion('Detailed filters', open=False):
56
+ # Row orientation
57
  with gr.Row() as additional_input:
58
  n_slider = gr.Slider(label="Number of Results", minimum=1, maximum=100, step=1, value=10)
59
+ party_dopdown = gr.Dropdown(value='All', choices=['All','CDU/CSU','SPD','FDP','Grüne','not found','DIE LINKE.','PDS','KPD'], label='Party') # change choices to all possible options
60
+ start_date = Calendar(value="1949-01-01", type="datetime", label="Select start date", info="Click the calendar icon to bring up the calendar.", interactive=True)
61
+ end_date = Calendar(value=datetime.today().strftime('%Y-%m-%d'), type="datetime", label="Select end date", info="Click the calendar icon to bring up the calendar.", interactive=True)
62
 
63
  search_btn = gr.Button('Search')
64
 
65
  with gr.Column(visible=False) as output_col:
66
  results_df = gr.Dataframe(label='Results', interactive=False)
67
 
68
+ # Download results from keyword search
69
  with gr.Accordion('Would you like to download your results?', open=False) as download_row:
70
  with gr.Row():
71
  ftype_dropdown = gr.Dropdown(choices=["csv","excel","json"], label="Format")
72
  export_btn = gr.Button('Export')
73
  file = gr.File(file_types=[".xlsx", ".csv", ".json"], visible=False)
74
 
75
+ # Keyword Search on click
76
+ def search(keyword, n, party): # ToDo: Include party and timedate
77
  return {
78
  output_col: gr.Column(visible=True),
79
  results_df: keyword_search(query=keyword, n=n, party_filter=party),
 
85
  outputs=[output_col, results_df],
86
  )
87
 
88
+ # Export data to a downloadable format
89
  def export(df, keyword, ftype=None):
90
  if ftype == "csv":
91
  file = f'{keyword}.csv'
 
108
 
109
 
110
  if __name__ == "__main__":
111
+ App.launch(share=False) # share=True is not supported on HF Spaces
112
 
113
 
src/FAISS.ipynb CHANGED
@@ -2,19 +2,152 @@
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
- "execution_count": 3,
6
  "metadata": {},
7
  "outputs": [
8
  {
9
- "ename": "",
10
- "evalue": "",
11
- "output_type": "error",
12
- "traceback": [
13
- "\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n",
14
- "\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n",
15
- "\u001b[1;31mClick <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. \n",
16
- "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
17
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  }
19
  ],
20
  "source": [
@@ -23,14 +156,96 @@
23
  "from vectordatabase import load_documents\n",
24
  "from langchain_community.embeddings import HuggingFaceEmbeddings\n",
25
  "from langchain_community.vectorstores import FAISS\n",
 
26
  "\n",
27
  "\n",
28
  "df = pd.read_pickle(\"C:\\\\Users\\Tom\\SynologyDrive\\Tom\\Programming\\\\NLP\\Spaces\\PoliticsToYou\\src\\Speeches\\speeches_1949_09_12\")\n",
 
29
  "# Split speeches into documents\n",
30
- "documents = load_documents(df)\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  "embeddings = HuggingFaceEmbeddings(model_name=\"paraphrase-multilingual-MiniLM-L12-v2\")\n",
32
- "db = FAISS.from_documents(documents, embeddings)\n",
33
- "db.save_local(folder_path=\"ChatBot\\FAISS\", index_name=\"speeches_1949_09_12\")\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  ]
35
  }
36
  ],
 
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
+ "execution_count": 2,
6
  "metadata": {},
7
  "outputs": [
8
  {
9
+ "data": {
10
+ "text/html": [
11
+ "<div>\n",
12
+ "<style scoped>\n",
13
+ " .dataframe tbody tr th:only-of-type {\n",
14
+ " vertical-align: middle;\n",
15
+ " }\n",
16
+ "\n",
17
+ " .dataframe tbody tr th {\n",
18
+ " vertical-align: top;\n",
19
+ " }\n",
20
+ "\n",
21
+ " .dataframe thead th {\n",
22
+ " text-align: right;\n",
23
+ " }\n",
24
+ "</style>\n",
25
+ "<table border=\"1\" class=\"dataframe\">\n",
26
+ " <thead>\n",
27
+ " <tr style=\"text-align: right;\">\n",
28
+ " <th></th>\n",
29
+ " <th>id</th>\n",
30
+ " <th>speech_content</th>\n",
31
+ " <th>date</th>\n",
32
+ " <th>party</th>\n",
33
+ " </tr>\n",
34
+ " </thead>\n",
35
+ " <tbody>\n",
36
+ " <tr>\n",
37
+ " <th>0</th>\n",
38
+ " <td>0</td>\n",
39
+ " <td>Meine Damen und Herren! Ich eröffne die 2. Sit...</td>\n",
40
+ " <td>1949-09-12</td>\n",
41
+ " <td>not found</td>\n",
42
+ " </tr>\n",
43
+ " <tr>\n",
44
+ " <th>1</th>\n",
45
+ " <td>1</td>\n",
46
+ " <td>Der Bundesrat ist versammelt, Herr Präsident.\\n</td>\n",
47
+ " <td>1949-09-12</td>\n",
48
+ " <td>not found</td>\n",
49
+ " </tr>\n",
50
+ " <tr>\n",
51
+ " <th>2</th>\n",
52
+ " <td>2</td>\n",
53
+ " <td>Ich danke für diese Erklärung. Ich stelle dami...</td>\n",
54
+ " <td>1949-09-12</td>\n",
55
+ " <td>not found</td>\n",
56
+ " </tr>\n",
57
+ " <tr>\n",
58
+ " <th>3</th>\n",
59
+ " <td>3</td>\n",
60
+ " <td>Ja, ich habe den Wunsch.\\n</td>\n",
61
+ " <td>1949-09-12</td>\n",
62
+ " <td>not found</td>\n",
63
+ " </tr>\n",
64
+ " <tr>\n",
65
+ " <th>4</th>\n",
66
+ " <td>4</td>\n",
67
+ " <td>Ich erteile dem Herrn Bundespräsidenten das Wo...</td>\n",
68
+ " <td>1949-09-12</td>\n",
69
+ " <td>not found</td>\n",
70
+ " </tr>\n",
71
+ " <tr>\n",
72
+ " <th>...</th>\n",
73
+ " <td>...</td>\n",
74
+ " <td>...</td>\n",
75
+ " <td>...</td>\n",
76
+ " <td>...</td>\n",
77
+ " </tr>\n",
78
+ " <tr>\n",
79
+ " <th>930955</th>\n",
80
+ " <td>1084268</td>\n",
81
+ " <td>\\n\\nWir sind zwar Kollegen.</td>\n",
82
+ " <td>2022-12-16</td>\n",
83
+ " <td>not found</td>\n",
84
+ " </tr>\n",
85
+ " <tr>\n",
86
+ " <th>930956</th>\n",
87
+ " <td>1084269</td>\n",
88
+ " <td>\\n\\nLiebe, sehr geehrte Frau Präsidentin!</td>\n",
89
+ " <td>2022-12-16</td>\n",
90
+ " <td>CDU/CSU</td>\n",
91
+ " </tr>\n",
92
+ " <tr>\n",
93
+ " <th>930957</th>\n",
94
+ " <td>1084270</td>\n",
95
+ " <td>\\n\\nVielen Dank.</td>\n",
96
+ " <td>2022-12-16</td>\n",
97
+ " <td>not found</td>\n",
98
+ " </tr>\n",
99
+ " <tr>\n",
100
+ " <th>930958</th>\n",
101
+ " <td>1084272</td>\n",
102
+ " <td>\\n\\nDen Abschluss dieser Aktuellen Stunde bild...</td>\n",
103
+ " <td>2022-12-16</td>\n",
104
+ " <td>not found</td>\n",
105
+ " </tr>\n",
106
+ " <tr>\n",
107
+ " <th>930959</th>\n",
108
+ " <td>1084273</td>\n",
109
+ " <td>\\n\\nSehr geehrte Frau Präsidentin! Werte Kolle...</td>\n",
110
+ " <td>2022-12-16</td>\n",
111
+ " <td>SPD</td>\n",
112
+ " </tr>\n",
113
+ " </tbody>\n",
114
+ "</table>\n",
115
+ "<p>930960 rows × 4 columns</p>\n",
116
+ "</div>"
117
+ ],
118
+ "text/plain": [
119
+ " id speech_content date \\\n",
120
+ "0 0 Meine Damen und Herren! Ich eröffne die 2. Sit... 1949-09-12 \n",
121
+ "1 1 Der Bundesrat ist versammelt, Herr Präsident.\\n 1949-09-12 \n",
122
+ "2 2 Ich danke für diese Erklärung. Ich stelle dami... 1949-09-12 \n",
123
+ "3 3 Ja, ich habe den Wunsch.\\n 1949-09-12 \n",
124
+ "4 4 Ich erteile dem Herrn Bundespräsidenten das Wo... 1949-09-12 \n",
125
+ "... ... ... ... \n",
126
+ "930955 1084268 \\n\\nWir sind zwar Kollegen. 2022-12-16 \n",
127
+ "930956 1084269 \\n\\nLiebe, sehr geehrte Frau Präsidentin! 2022-12-16 \n",
128
+ "930957 1084270 \\n\\nVielen Dank. 2022-12-16 \n",
129
+ "930958 1084272 \\n\\nDen Abschluss dieser Aktuellen Stunde bild... 2022-12-16 \n",
130
+ "930959 1084273 \\n\\nSehr geehrte Frau Präsidentin! Werte Kolle... 2022-12-16 \n",
131
+ "\n",
132
+ " party \n",
133
+ "0 not found \n",
134
+ "1 not found \n",
135
+ "2 not found \n",
136
+ "3 not found \n",
137
+ "4 not found \n",
138
+ "... ... \n",
139
+ "930955 not found \n",
140
+ "930956 CDU/CSU \n",
141
+ "930957 not found \n",
142
+ "930958 not found \n",
143
+ "930959 SPD \n",
144
+ "\n",
145
+ "[930960 rows x 4 columns]"
146
+ ]
147
+ },
148
+ "execution_count": 2,
149
+ "metadata": {},
150
+ "output_type": "execute_result"
151
  }
152
  ],
153
  "source": [
 
156
  "from vectordatabase import load_documents\n",
157
  "from langchain_community.embeddings import HuggingFaceEmbeddings\n",
158
  "from langchain_community.vectorstores import FAISS\n",
159
+ "from datetime import datetime\n",
160
  "\n",
161
  "\n",
162
  "df = pd.read_pickle(\"C:\\\\Users\\Tom\\SynologyDrive\\Tom\\Programming\\\\NLP\\Spaces\\PoliticsToYou\\src\\Speeches\\speeches_1949_09_12\")\n",
163
+ "df['date'] = pd.to_datetime(df['date'])\n",
164
  "# Split speeches into documents\n",
165
+ "df"
166
+ ]
167
+ },
168
+ {
169
+ "cell_type": "code",
170
+ "execution_count": 3,
171
+ "metadata": {},
172
+ "outputs": [
173
+ {
174
+ "name": "stderr",
175
+ "output_type": "stream",
176
+ "text": [
177
+ "c:\\Python\\Lib\\site-packages\\huggingface_hub\\file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
178
+ " warnings.warn(\n",
179
+ "c:\\Python\\Lib\\site-packages\\huggingface_hub\\file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
180
+ " warnings.warn(\n"
181
+ ]
182
+ },
183
+ {
184
+ "name": "stdout",
185
+ "output_type": "stream",
186
+ "text": [
187
+ "Sucessfully created vector store for 1. legislature\n",
188
+ "Sucessfully created vector store for 2. legislature\n",
189
+ "Sucessfully created vector store for 3. legislature\n",
190
+ "Sucessfully created vector store for 4. legislature\n",
191
+ "Sucessfully created vector store for 5. legislature\n",
192
+ "Sucessfully created vector store for 6. legislature\n",
193
+ "Sucessfully created vector store for 7. legislature\n",
194
+ "Sucessfully created vector store for 8. legislature\n",
195
+ "Sucessfully created vector store for 9. legislature\n",
196
+ "Sucessfully created vector store for 10. legislature\n",
197
+ "Sucessfully created vector store for 11. legislature\n",
198
+ "Sucessfully created vector store for 12. legislature\n",
199
+ "Sucessfully created vector store for 13. legislature\n",
200
+ "Sucessfully created vector store for 14. legislature\n",
201
+ "Sucessfully created vector store for 15. legislature\n",
202
+ "Sucessfully created vector store for 16. legislature\n",
203
+ "Sucessfully created vector store for 17. legislature\n",
204
+ "Sucessfully created vector store for 18. legislature\n",
205
+ "Sucessfully created vector store for 19. legislature\n",
206
+ "Sucessfully created vector store for 20. legislature\n"
207
+ ]
208
+ }
209
+ ],
210
+ "source": [
211
+ "\n",
212
+ "dates = [\"1953-10-06\", \"1957-10-16\", \"1961-10-17\", \"1965-10-19\", \"1969-10-20\", \"1972-12-13\", \"1976-12-14\", \"1980-11-04\", \"1983-03-29\", \"1987-02-18\",\"1990-12-20\", \"1994-11-10\", \"1998-10-26\", \"2002-10-17\", \"2005-10-18\", \"2009-10-27\", \"2013-10-22\",\"2017-10-24\",\"2021-10-26\", None]\n",
213
  "embeddings = HuggingFaceEmbeddings(model_name=\"paraphrase-multilingual-MiniLM-L12-v2\")\n",
214
+ "\n",
215
+ "# Iterate over all dates to split the speeches by legislature, creating one vector store per period\n",
216
+ "\n",
217
+ "period = 1\n",
218
+ "previous_date = None\n",
219
+ "for date in dates:\n",
220
+ " if previous_date is None:\n",
221
+ " legislature = df.loc[df['date'] < datetime.strptime(date, \"%Y-%m-%d\")]\n",
222
+ " elif date is None:\n",
223
+ " legislature = df.loc[df['date'] >= datetime.strptime(previous_date, \"%Y-%m-%d\")]\n",
224
+ " else:\n",
225
+ " legislature = df.loc[(df['date'] >= datetime.strptime(previous_date, \"%Y-%m-%d\")) & (df['date'] < datetime.strptime(date, \"%Y-%m-%d\"))]\n",
226
+ "\n",
227
+ " \n",
228
+ " # Split text into documents\n",
229
+ " documents = load_documents(legislature)\n",
230
+ " index_name = f'{period}_legislature'\n",
231
+ " db = FAISS.from_documents(documents, embeddings)\n",
232
+ " db.save_local(folder_path=\"FAISS\", index_name=index_name)\n",
233
+ " print(f\"Sucessfully created vector store for {period}. legislature\")\n",
234
+ " # Change for next iteration\n",
235
+ " period += 1\n",
236
+ " previous_date = date\n",
237
+ "\n",
238
+ "\n"
239
+ ]
240
+ },
241
+ {
242
+ "cell_type": "code",
243
+ "execution_count": null,
244
+ "metadata": {},
245
+ "outputs": [],
246
+ "source": [
247
+ "\n",
248
+ "\n"
249
  ]
250
  }
251
  ],
src/FAISS/10_legislature.faiss ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:66ac4627657617d20657ee29e060d5e0201f23474f2bb316fcfcd1e784347c83
3
+ size 238133805
src/FAISS/10_legislature.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:63591732c250d403c23d68d007f2f0091d1e4393c8f06131c59dd0756c08e479
3
+ size 107921064
src/FAISS/11_legislature.faiss ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f083f538427463ff2d9dfd1a7f1dddc7b0ec944ae7c4a0c0f110d71e268c7eb5
3
+ size 234221613
src/FAISS/11_legislature.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aba4a26fd81efa5416bbdadcf5416a9c43d5a24fe319073de8d862812e91eb77
3
+ size 109009058
src/FAISS/12_legislature.faiss ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1651a9d458986410808873d8fbc8609181ec55726827375987834975eaea303e
3
+ size 252945453
src/FAISS/12_legislature.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98a03cba6aef98df4cb4f147f21072b14d647f6ed8fab51b5d9000a93934c53b
3
+ size 120628791
src/FAISS/13_legislature.faiss ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9349d40b633b9c9265e11cce8466137ba47e1622550291d37587e9cf57ca7f18
3
+ size 256303149
src/FAISS/13_legislature.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cff67a961bec000909289049a63a389d18a0b818394b0a505da5f1867a5dc2cb
3
+ size 123142473
src/FAISS/14_legislature.faiss ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:005fe85f80d6e073f12cc60b3fe32b65f87df5f5ef233c4c6ebe30594b542c18
3
+ size 240325677
src/FAISS/14_legislature.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98f41d12a44e8a3592f9c829744102328834f09d913305181dc827d54c15ec3f
3
+ size 125554373
src/FAISS/15_legislature.faiss ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8e34018d0aef1cefd2e68851b5e3ae7b4ff1133872e238a37a92358204428ddd
3
+ size 167463981
src/FAISS/15_legislature.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ba506f555d4b8a668233621cb1e006a85fdd5444196d0d83d8b83bcdb09725f1
3
+ size 87021966
src/FAISS/16_legislature.faiss ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2924f42f74019b6c55304f6ab9c7e0778926123c2fd46a69b6b3a9d428cc445a
3
+ size 246827565
src/FAISS/16_legislature.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c219b81c0e07e9911bc0a41dc5d064c07dc77ca968410db3491556fb2da019b
3
+ size 128041546
src/FAISS/17_legislature.faiss ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c99b6b30e3fe81fa31b3cf8384294215a86f6591641916b3a7abf793c08bd51f
3
+ size 309130797
src/FAISS/17_legislature.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:13d510ccd279e3eac2f80ca323ccd14b073c1e1135c7cb9cb1caf73e5a8ebb27
3
+ size 162425243
src/FAISS/18_legislature.faiss ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e40a3278487a92ab9cb94e24bddcc35df8ca8f0f81d0871a3ca3deea9ad07deb
3
+ size 240373293
src/FAISS/18_legislature.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:753d1838eb1aa45449cc728374949af6aea3887a39c80644b73688f259f3bc0c
3
+ size 124641794
src/FAISS/19_legislature.faiss ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3aa5be26dea0ba0048a917ec51b445cfca46b54529437de589a87bb156b047b1
3
+ size 255599661
src/FAISS/19_legislature.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11ce629b1d179528d2581343bdc5e6ba11762be0d7f0710823e783dd28091334
3
+ size 128545267
src/FAISS/1_legislature.faiss ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:287fba08c823eb8438576944cb8ab3a70ff810f1dec83b8f6ef220d3c8f0e87d
3
+ size 175981101
src/FAISS/1_legislature.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e78cb806846e05044d623763f490bd959fb52d1d6d2a53fea45a1dab8b8414a9
3
+ size 87900606
src/FAISS/20_legislature.faiss ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5eef8fc33a06b508f21d817e4a081b40fc9086238641a7f29b8280700bf41c4e
3
+ size 81005613
src/FAISS/20_legislature.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:27919466bac90278eebddda9de8d4b86f1ed7d1e309606e96e59486704a56dc6
3
+ size 40616936
src/FAISS/2_legislature.faiss ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f63b7e7135fdccc872d93e95653bb3df97ec456f4409f0009abafc4ffe3ab8aa
3
+ size 155570733
src/FAISS/2_legislature.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dca740628c8500c62e37362813068399670351c934912f5586b9d558f1ed81b0
3
+ size 76574674
src/FAISS/3_legislature.faiss ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c901df3b2be8e4c237ee26296f8c320f75caa63aa8c2381ebd2de530545d87a
3
+ size 120843309
src/FAISS/3_legislature.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e4ab8b365422001985f274a71c74ffdec6d8e7cc122196257cf92f33ba1127b
3
+ size 59003914
src/FAISS/4_legislature.faiss ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2605fe28e330771a210256307a4638ac4f5a256a19299cfa6b3932015b803cf4
3
+ size 139723821
src/FAISS/4_legislature.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:02511c79b3e38d13815ac02fc1627cc0000a964730efb49a0fe5866fbb7772c5
3
+ size 60800382
src/FAISS/5_legislature.faiss ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f0dc389788da319691d24a9209d8fa0e181d3cb3396e424407cb565c44bd386f
3
+ size 183742509
src/FAISS/5_legislature.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f5521ea0d8260f20f214eda9f6c726195341b04cd95e6e65613a608fc8540b2
3
+ size 81197730
src/FAISS/6_legislature.faiss ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ac34ac64c99c302c3fcd7c6c65e7a758d9c2b0f3e4294c7533a3448de49e6ee
3
+ size 134287917
src/FAISS/6_legislature.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe2f497330f7dfd6ec51cce4d42d5b065bdb27828ab0d85177ee85dac47ad596
3
+ size 62478365
src/FAISS/7_legislature.faiss ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0efa99ff055e4399db9ad5ca85c2538e1441924ea4937c32df4635e23e3e97cb
3
+ size 211250733
src/FAISS/7_legislature.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f229c55d9d44958db68381d0f6805fff269d406cf94260a22c656dd13056b308
3
+ size 96069866
src/FAISS/8_legislature.faiss ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:53a84ee9456604696f83fa82245c13aaa6e60099585d93b375a5f67f6ccdd3d9
3
+ size 200272941
src/FAISS/8_legislature.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f44bc8d65db7aaabe1bbad09d6022bf97b537f50648f56f9d15465a7d9eb61d
3
+ size 90726289
src/FAISS/9_legislature.faiss ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8e0f48ef95f2d0edb9914ffacffd8e18aed9325b467293a8480f069143407102
3
+ size 113241645
src/FAISS/9_legislature.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e416d21bcfb120aa5047ba3d556f5df79e16d04676e8fb9e7fdcd8b6116a31e
3
+ size 50705534
src/chatbot.py CHANGED
@@ -2,6 +2,7 @@ from langchain_core.prompts import ChatPromptTemplate
2
  from langchain_community.llms.huggingface_hub import HuggingFaceHub
3
  from langchain_community.embeddings import HuggingFaceEmbeddings
4
 
 
5
  from src.vectordatabase import RAG, get_vectorstore
6
  import pandas as pd
7
  from dotenv import load_dotenv, find_dotenv
@@ -50,47 +51,97 @@ prompt2 = ChatPromptTemplate.from_template("""Beantworte die folgende Frage auf
50
  )
51
 
52
 
53
- folder_path = "./src/FAISS"
54
- index_name = "speeches_1949_09_12"
55
  #index_name = "legislature20"
56
- db = get_vectorstore(embeddings=embeddings, folder_path=folder_path, index_name=index_name)
 
 
 
 
 
57
 
58
- def chatbot(message, history, db=db, llm=llm, prompt=prompt2):
 
59
  raw_response = RAG(llm=llm, prompt=prompt, db=db, question=message)
 
60
  response = raw_response['answer'].split("Antwort: ")[1]
61
  return response
62
 
63
- # Retrieve speech contents based on keywords
64
- def keyword_search(query,n=10, db=db, embeddings=embeddings, method='ss', party_filter = 'All'):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  query_embedding = embeddings.embed_query(query)
 
 
66
  if method == 'mmr':
67
- df_res = pd.DataFrame(columns=['Speech Content','Date', 'Party', 'Relevance']) # Add Date/Party/Politician
68
- results = db.max_marginal_relevance_search_with_score_by_vector(query_embedding, k = n)
69
  for doc in results:
70
  party = doc[0].metadata["party"]
71
- #Filter by party input
72
  if party != party_filter and party_filter != 'All':
73
- continue
74
  speech_content = doc[0].page_content
75
  speech_date = doc[0].metadata["date"]
76
- score = round(doc[1], ndigits=2) # Relevance based on relevance search
77
  df_res = pd.concat([df_res, pd.DataFrame({'Speech Content': [speech_content],
78
- 'Date': [speech_date],
79
- 'Party': [party],
80
- 'Relevance': [score]})], ignore_index=True)
81
  df_res.sort_values('Relevance', inplace=True, ascending=True)
 
 
82
  else:
83
- df_res = pd.DataFrame(columns=['Speech Content','Date', 'Party']) # Add Date/Party/Politician #Add filter
84
- results = db.similarity_search_by_vector(query_embedding, k = n)
85
  for doc in results:
86
  party = doc.metadata["party"]
87
- #Filter by party input
88
  if party != party_filter and party_filter != 'All':
89
- continue
90
  speech_content = doc.page_content
91
  speech_date = doc.metadata["date"]
92
-
93
  df_res = pd.concat([df_res, pd.DataFrame({'Speech Content': [speech_content],
94
- 'Date': [speech_date],
95
- 'Party': [party]})], ignore_index=True)
96
- return df_res
 
2
  from langchain_community.llms.huggingface_hub import HuggingFaceHub
3
  from langchain_community.embeddings import HuggingFaceEmbeddings
4
 
5
+
6
  from src.vectordatabase import RAG, get_vectorstore
7
  import pandas as pd
8
  from dotenv import load_dotenv, find_dotenv
 
51
  )
52
 
53
 
54
+ #folder_path =
55
+ #index_name = "speeches_1949_09_12"
56
  #index_name = "legislature20"
57
+ #db = get
58
+
59
+
60
+
61
+
62
+
63
 
64
+ def chatbot(message, history, db_inputs, llm=llm, prompt=prompt2):
65
+ db = get_vectorstore(inputs = db_inputs, embeddings=embeddings)
66
  raw_response = RAG(llm=llm, prompt=prompt, db=db, question=message)
67
+ # Keep only the text after the "Antwort: " marker; needed because Mistral's raw output is not clean
68
  response = raw_response['answer'].split("Antwort: ")[1]
69
  return response
70
 
71
+
72
+ def keyword_search(db, query, n=10, embeddings=embeddings, method='ss', party_filter='All'):
73
+ """
74
+ Retrieve speech contents based on keywords using a specified method.
75
+
76
+ Parameters:
77
+ ----------
78
+ db : FAISS
79
+ The FAISS vector store containing speech embeddings.
80
+
81
+ query : str
82
+ The keyword(s) to search for in the speech contents.
83
+
84
+ n : int, optional
85
+ The number of speech contents to retrieve (default is 10).
86
+
87
+ embeddings : Embeddings, optional
88
+ An instance of embeddings used for embedding queries (default is embeddings).
89
+
90
+ method : str, optional
91
+ The method used for retrieving speech contents. Options are 'ss' (semantic search) and 'mmr'
92
+ (maximal marginal relevance) (default is 'ss').
93
+
94
+ party_filter : str, optional
95
+ A filter for retrieving speech contents by party affiliation. Specify 'All' to retrieve
96
+ speeches from all parties (default is 'All').
97
+
98
+ Returns:
99
+ -------
100
+ pandas.DataFrame
101
+ A DataFrame containing the speech contents, dates, and party affiliations.
102
+
103
+ Notes:
104
+ -----
105
+ - The `db` parameter should be a FAISS vector store containing speech embeddings.
106
+ - The `query` parameter specifies the keyword(s) to search for in the speech contents.
107
+ - The `n` parameter determines the number of speech contents to retrieve (default is 10).
108
+ - The `embeddings` parameter is an instance of embeddings used for embedding queries (default is embeddings).
109
+ - The `method` parameter specifies the method used for retrieving speech contents. Options are 'ss' (semantic search)
110
+ and 'mmr' (maximal marginal relevance) (default is 'ss').
111
+ - The `party_filter` parameter is a filter for retrieving speech contents by party affiliation. Specify 'All' to retrieve
112
+ speeches from all parties (default is 'All').
113
+ """
114
+
115
  query_embedding = embeddings.embed_query(query)
116
+
117
+ # Maximal Marginal Relevance
118
  if method == 'mmr':
119
+ df_res = pd.DataFrame(columns=['Speech Content', 'Date', 'Party', 'Relevance'])
120
+ results = db.max_marginal_relevance_search_with_score_by_vector(query_embedding, k=n)
121
  for doc in results:
122
  party = doc[0].metadata["party"]
 
123
  if party != party_filter and party_filter != 'All':
124
+ continue
125
  speech_content = doc[0].page_content
126
  speech_date = doc[0].metadata["date"]
127
+ score = round(doc[1], ndigits=2)
128
  df_res = pd.concat([df_res, pd.DataFrame({'Speech Content': [speech_content],
129
+ 'Date': [speech_date],
130
+ 'Party': [party],
131
+ 'Relevance': [score]})], ignore_index=True)
132
  df_res.sort_values('Relevance', inplace=True, ascending=True)
133
+
134
+ # Similarity Search
135
  else:
136
+ df_res = pd.DataFrame(columns=['Speech Content', 'Date', 'Party'])
137
+ results = db.similarity_search_by_vector(query_embedding, k=n)
138
  for doc in results:
139
  party = doc.metadata["party"]
 
140
  if party != party_filter and party_filter != 'All':
141
+ continue
142
  speech_content = doc.page_content
143
  speech_date = doc.metadata["date"]
 
144
  df_res = pd.concat([df_res, pd.DataFrame({'Speech Content': [speech_content],
145
+ 'Date': [speech_date],
146
+ 'Party': [party]})], ignore_index=True)
147
+ return df_res
src/vectordatabase.py CHANGED
@@ -6,6 +6,15 @@ from langchain_community.llms import HuggingFaceHub
6
  from langchain.text_splitter import RecursiveCharacterTextSplitter
7
  from langchain.chains.combine_documents import create_stuff_documents_chain
8
  from langchain.chains import create_retrieval_chain
 
 
 
 
 
 
 
 
 
9
 
10
  import os
11
  #from dotenv import load_dotenv
@@ -29,19 +38,72 @@ def load_documents(df):
29
  documents = splitter.split_documents(documents=data)
30
  return documents
31
 
32
- def get_vectorstore(embeddings, folder_path, index_name):
33
- path = folder_path + "/" + index_name
34
- print(path)
35
- # To Do: Dynamicly update and merge verctorstores
36
- #if os.path.exists(path):
37
- db = FAISS.load_local(folder_path=folder_path, index_name=index_name,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  embeddings=embeddings, allow_dangerous_deserialization=True)
39
- #else:
40
- #db = FAISS.from_documents(documents, embeddings)
41
- #db.save_local(folder_path=folder_path, index_name=index_name)
42
- #pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  return db
44
 
 
 
45
  # Apply RAG by providing the context and the question to the LLM using the predefined template
46
  def RAG(llm, prompt, db, question):
47
  document_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
@@ -51,3 +113,21 @@ def RAG(llm, prompt, db, question):
51
  response = retrieval_chain.invoke({"input": question})
52
  return response
53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  from langchain.text_splitter import RecursiveCharacterTextSplitter
7
  from langchain.chains.combine_documents import create_stuff_documents_chain
8
  from langchain.chains import create_retrieval_chain
9
+ from faiss import IndexFlatL2
10
+ from langchain_community.docstore.in_memory import InMemoryDocstore
11
+ from langchain.embeddings import SentenceTransformerEmbeddings
12
+ import functools
13
+
14
+
15
+
16
+
17
+ import pandas as pd
18
 
19
  import os
20
  #from dotenv import load_dotenv
 
38
  documents = splitter.split_documents(documents=data)
39
  return documents
40
 
41
+
42
+ #@functools.lru_cache()
43
def get_vectorstore(inputs, embeddings):
    """
    Combine multiple FAISS vector stores into a single vector store based on the specified inputs.

    Parameters:
    ----------
    inputs : list of str
        Selections naming the vector stores to combine, e.g. "20. Legislaturperiode".
        The text before the first "." of each entry is used as the index number, so
        "20. Legislaturperiode" loads the index "20_legislature". If "All" appears
        anywhere in the list, the comprehensive vector store is loaded and returned
        immediately; all other entries are ignored.

    embeddings : Embeddings
        Embedding instance passed to `FAISS.load_local` and used as the embedding
        function of the merged store. It must provide `embed_query`.

    Returns:
    -------
    FAISS
        A FAISS vector store that combines the specified indices into a single vector store.

    Raises:
    ------
    ValueError
        If `inputs` is empty, because there is nothing to load.

    Notes:
    -----
    - Index files are read from the folder "./src/FAISS".
    - The empty store that the selected indices are merged into gets its dimensionality
      from the length of one embedded dummy query, so it always matches `embeddings`.
    - Repeated selections are loaded only once; merging the same store twice would
      fail on duplicate document ids.
    - `allow_dangerous_deserialization=True` makes `FAISS.load_local` unpickle the
      docstore. Only point this function at index files you created yourself.
    """
    if not inputs:
        raise ValueError("inputs must contain at least one vector store selection")

    # Default folder path
    folder_path = "./src/FAISS"

    # "All" wins regardless of its position in the selection list.
    if "All" in inputs:
        return FAISS.load_local(folder_path=folder_path, index_name="speeches_1949_09_12",
                                embeddings=embeddings, allow_dangerous_deserialization=True)

    # Initialize an empty store sized to the embedding model's output dimension.
    dimensions = len(embeddings.embed_query("dummy"))
    db = FAISS(
        embedding_function=embeddings,
        index=IndexFlatL2(dimensions),
        docstore=InMemoryDocstore(),
        index_to_docstore_id={},
        normalize_L2=False,
    )

    # Selections look like "20. Legislaturperiode", "19. Legislaturperiode", ...
    # dict.fromkeys de-duplicates while keeping the order of first appearance.
    index_numbers = dict.fromkeys(selection.split(".")[0].strip() for selection in inputs)
    for index_number in index_numbers:
        local_db = FAISS.load_local(folder_path=folder_path, index_name=f"{index_number}_legislature",
                                    embeddings=embeddings, allow_dangerous_deserialization=True)
        db.merge_from(local_db)
    return db
104
 
105
+
106
+
107
  # Apply RAG by providing the context and the question to the LLM using the predefined template
108
  def RAG(llm, prompt, db, question):
109
  document_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
 
113
  response = retrieval_chain.invoke({"input": question})
114
  return response
115
 
116
+ #########
117
+ # Dynamically loading vector_db
118
+ ##########
119
+
120
def get_similar_vectorstore(start_date, end_date, party, base_path=os.path.join("src", "FAISS")):
    """
    Scan `base_path` for FAISS index files whose name matches a party and start date.

    NOTE(review): this function is unfinished. It finds matching files but does not
    yet select or load a vector store, so it always returns None.

    Parameters:
    ----------
    start_date : str
        Start date to compare against the start date encoded in each file name.
        The comparison is a plain `<=` on the values; this presumably expects
        strings in a sortable format -- TODO confirm.

    end_date : str
        End date of the requested range. Currently unused.

    party : str
        Party that must equal the party field of the file name.

    base_path : str
        Folder to scan for "*.faiss" files (default is "src/FAISS", built with
        `os.path.join` so it works on every platform).

    Returns:
    -------
    None
        Selection logic is not implemented yet.

    Notes:
    -----
    - File names are split on "_" and fields 1, 2 and 3 are read as start date,
      end date and party. Names with fewer than four fields (for example
      "10_legislature") are skipped instead of raising an IndexError.
    """
    # Stems of all FAISS index files; splitext also copes with entries that have no extension.
    name_parts = [os.path.splitext(entry) for entry in os.listdir(base_path)]
    vector_stores = [stem for stem, extension in name_parts if extension == ".faiss"]

    # Scaffold for the candidate table; it is not filled yet.
    df = pd.DataFrame(columns=["file_name", "start_date", "end_date", "date_diff"])

    # Extract metadata of each file from its name
    for file_name in vector_stores:
        file_elements = file_name.split("_")
        if len(file_elements) < 4:
            continue
        file_start_date, file_end_date, file_party = file_elements[1:4]

        if file_party == party and file_start_date <= start_date:
            # TODO: record the candidate in `df` and pick the closest date range.
            pass

    return None