Spaces:

TomData
/

PoliticsToYou

Sleeping

App Files Files Community

TomData committed on May 30

Commit

85df319

•

1 Parent(s): e681b03

added flexible vectorstore

Browse files

Files changed (45) hide show

.gitignore +2 -1
Home.py +45 -11
src/FAISS.ipynb +228 -13
src/FAISS/10_legislature.faiss +3 -0
src/FAISS/10_legislature.pkl +3 -0
src/FAISS/11_legislature.faiss +3 -0
src/FAISS/11_legislature.pkl +3 -0
src/FAISS/12_legislature.faiss +3 -0
src/FAISS/12_legislature.pkl +3 -0
src/FAISS/13_legislature.faiss +3 -0
src/FAISS/13_legislature.pkl +3 -0
src/FAISS/14_legislature.faiss +3 -0
src/FAISS/14_legislature.pkl +3 -0
src/FAISS/15_legislature.faiss +3 -0
src/FAISS/15_legislature.pkl +3 -0
src/FAISS/16_legislature.faiss +3 -0
src/FAISS/16_legislature.pkl +3 -0
src/FAISS/17_legislature.faiss +3 -0
src/FAISS/17_legislature.pkl +3 -0
src/FAISS/18_legislature.faiss +3 -0
src/FAISS/18_legislature.pkl +3 -0
src/FAISS/19_legislature.faiss +3 -0
src/FAISS/19_legislature.pkl +3 -0
src/FAISS/1_legislature.faiss +3 -0
src/FAISS/1_legislature.pkl +3 -0
src/FAISS/20_legislature.faiss +3 -0
src/FAISS/20_legislature.pkl +3 -0
src/FAISS/2_legislature.faiss +3 -0
src/FAISS/2_legislature.pkl +3 -0
src/FAISS/3_legislature.faiss +3 -0
src/FAISS/3_legislature.pkl +3 -0
src/FAISS/4_legislature.faiss +3 -0
src/FAISS/4_legislature.pkl +3 -0
src/FAISS/5_legislature.faiss +3 -0
src/FAISS/5_legislature.pkl +3 -0
src/FAISS/6_legislature.faiss +3 -0
src/FAISS/6_legislature.pkl +3 -0
src/FAISS/7_legislature.faiss +3 -0
src/FAISS/7_legislature.pkl +3 -0
src/FAISS/8_legislature.faiss +3 -0
src/FAISS/8_legislature.pkl +3 -0
src/FAISS/9_legislature.faiss +3 -0
src/FAISS/9_legislature.pkl +3 -0
src/chatbot.py +73 -22
src/vectordatabase.py +90 -10

.gitignore CHANGED Viewed

@@ -1,3 +1,4 @@
 __pycache__
 hf_upload.py
-.env

 __pycache__
 hf_upload.py
+.env
+.mypy_cache

Home.py CHANGED Viewed

@@ -1,45 +1,79 @@
 import gradio as gr
 from src.chatbot import chatbot, keyword_search
 with gr.Blocks() as App:
     with gr.Tab("ChatBot"):
-        #Apply RAG using chatbut function from local file ChatBot.py
         gr.ChatInterface(chatbot,
                     title="PoliticsToYou",
                     description= "This chatbot uses the infomation of speeches of the german parliament (since 2021) \
                         to get insight on the view points of the german parties and the debate of the parliament.",
-                    examples=["Wie steht die CDU zur Cannabislegalisierung?","Was waren die wichtigsten Themen in der aktuellen Legislaturperiode?"], #change to meaningful examples
-                    cache_examples=False, #true increases the loading time
                     )
     with gr.Tab("KeyWordSearch"):
         with gr.Blocks() as Block:
-            #Keyword Input
             keyword_box = gr.Textbox(label='keyword')
             #Additional Input (hidden)
             with gr.Accordion('Detailed filters', open=False):
-                #Row orientation
                 with gr.Row() as additional_input:
                     n_slider = gr.Slider(label="Number of Results", minimum=1, maximum=100, step=1, value=10)
-                    party_dopdown = gr.Dropdown(value='All', choices=['All','CDU/CSU','SPD','FDP','Grüne','not found','DIE LINKE.','PDS','KPD'], label='Party') #change to all possible options
             search_btn = gr.Button('Search')
             with gr.Column(visible=False) as output_col:
                 results_df = gr.Dataframe(label='Results', interactive=False)
-                #Download results from keyword search
                 with gr.Accordion('Would you like to download your results?', open=False) as download_row:
                     with gr.Row():
                         ftype_dropdown = gr.Dropdown(choices=["csv","excel","json"], label="Format")
                         export_btn = gr.Button('Export')
                         file = gr.File(file_types=[".xlsx", ".csv", ".json"], visible=False)
-            #Keyword Search on click
-            def search(keyword, n, party): #ToDo: Include party and timedate
                 return {
                     output_col: gr.Column(visible=True),
                     results_df: keyword_search(query=keyword, n=n, party_filter=party),
@@ -51,7 +85,7 @@ with gr.Blocks() as App:
                 outputs=[output_col, results_df],
             )
-            #Export data to a downloadable format
             def export(df, keyword, ftype=None):
                 if ftype == "csv":
                     file = f'{keyword}.csv'
@@ -74,6 +108,6 @@ with gr.Blocks() as App:
 if __name__ == "__main__":
-    App.launch(share=False) #true not supported on hf spaces

 import gradio as gr
 from src.chatbot import chatbot, keyword_search
+from gradio_calendar import Calendar
+from datetime import datetime
+legislature_periods = [
+    "20. Legislaturperiode",
+    "19. Legislaturperiode",
+    "18. Legislaturperiode",
+    "17. Legislaturperiode",
+    "16. Legislaturperiode",
+    "15. Legislaturperiode",
+    "14. Legislaturperiode",
+    "13. Legislaturperiode",
+    "12. Legislaturperiode",
+    "11. Legislaturperiode",
+    "10. Legislaturperiode",
+    "9. Legislaturperiode",
+    "8. Legislaturperiode",
+    "7. Legislaturperiode",
+    "6. Legislaturperiode",
+    "5. Legislaturperiode",
+    "4. Legislaturperiode",
+    "3. Legislaturperiode",
+    "2. Legislaturperiode",
+    "1. Legislaturperiode"
+]
 with gr.Blocks() as App:
     with gr.Tab("ChatBot"):
+        # Apply RAG using the chatbot function imported from src/chatbot.py
+        db_inputs = gr.Dropdown(choices=legislature_periods, value="All", multiselect=True, label="If empty all Legislaturperioden are selected", show_label=True)
+        print(db_inputs)
         gr.ChatInterface(chatbot,
                     title="PoliticsToYou",
                     description= "This chatbot uses the infomation of speeches of the german parliament (since 2021) \
                         to get insight on the view points of the german parties and the debate of the parliament.",
+                    #examples=["Wie steht die CDU zur Cannabislegalisierung?","Was waren die wichtigsten Themen in der aktuellen Legislaturperiode?"], #change to meaningful examples
+                    cache_examples=False,  #true increases the loading time
+                    additional_inputs = db_inputs,
                     )
     with gr.Tab("KeyWordSearch"):
         with gr.Blocks() as Block:
+            # Keyword Input
             keyword_box = gr.Textbox(label='keyword')
             #Additional Input (hidden)
             with gr.Accordion('Detailed filters', open=False):
+                # Row orientation
                 with gr.Row() as additional_input:
                     n_slider = gr.Slider(label="Number of Results", minimum=1, maximum=100, step=1, value=10)
+                    party_dopdown = gr.Dropdown(value='All', choices=['All','CDU/CSU','SPD','FDP','Grüne','not found','DIE LINKE.','PDS','KPD'], label='Party') # change choices to all possible options
+                    start_date = Calendar(value="1949-01-01", type="datetime", label="Select start date", info="Click the calendar icon to bring up the calendar.", interactive=True)
+                    end_date = Calendar(value=datetime.today().strftime('%Y-%m-%d'), type="datetime", label="Select end date", info="Click the calendar icon to bring up the calendar.", interactive=True)
             search_btn = gr.Button('Search')
             with gr.Column(visible=False) as output_col:
                 results_df = gr.Dataframe(label='Results', interactive=False)
+                # Download results from keyword search
                 with gr.Accordion('Would you like to download your results?', open=False) as download_row:
                     with gr.Row():
                         ftype_dropdown = gr.Dropdown(choices=["csv","excel","json"], label="Format")
                         export_btn = gr.Button('Export')
                         file = gr.File(file_types=[".xlsx", ".csv", ".json"], visible=False)
+            # Keyword Search on click
+            def search(keyword, n, party): # ToDo: Include party and timedate
                 return {
                     output_col: gr.Column(visible=True),
                     results_df: keyword_search(query=keyword, n=n, party_filter=party),
                 outputs=[output_col, results_df],
             )
+            # Export data to a downloadable format
             def export(df, keyword, ftype=None):
                 if ftype == "csv":
                     file = f'{keyword}.csv'
 if __name__ == "__main__":
+    App.launch(share=False) # share=True is not supported on hf spaces

src/FAISS.ipynb CHANGED Viewed

@@ -2,19 +2,152 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 3,
    "metadata": {},
    "outputs": [
     {
-     "ename": "",
-     "evalue": "",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n",
-      "\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n",
-      "\u001b[1;31mClick <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. \n",
-      "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
-     ]
     }
    ],
    "source": [
@@ -23,14 +156,96 @@
     "from vectordatabase import load_documents\n",
     "from langchain_community.embeddings import HuggingFaceEmbeddings\n",
     "from langchain_community.vectorstores import FAISS\n",
     "\n",
     "\n",
     "df = pd.read_pickle(\"C:\\\\Users\\Tom\\SynologyDrive\\Tom\\Programming\\\\NLP\\Spaces\\PoliticsToYou\\src\\Speeches\\speeches_1949_09_12\")\n",
     "# Split speeches into documents\n",
-    "documents = load_documents(df)\n",
     "embeddings = HuggingFaceEmbeddings(model_name=\"paraphrase-multilingual-MiniLM-L12-v2\")\n",
-    "db = FAISS.from_documents(documents, embeddings)\n",
-    "db.save_local(folder_path=\"ChatBot\\FAISS\", index_name=\"speeches_1949_09_12\")\n"
    ]
   }
  ],

  "cells": [
   {
    "cell_type": "code",
+   "execution_count": 2,
    "metadata": {},
    "outputs": [
     {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>id</th>\n",
+       "      <th>speech_content</th>\n",
+       "      <th>date</th>\n",
+       "      <th>party</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>0</td>\n",
+       "      <td>Meine Damen und Herren! Ich eröffne die 2. Sit...</td>\n",
+       "      <td>1949-09-12</td>\n",
+       "      <td>not found</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>1</td>\n",
+       "      <td>Der Bundesrat ist versammelt, Herr Präsident.\\n</td>\n",
+       "      <td>1949-09-12</td>\n",
+       "      <td>not found</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>2</td>\n",
+       "      <td>Ich danke für diese Erklärung. Ich stelle dami...</td>\n",
+       "      <td>1949-09-12</td>\n",
+       "      <td>not found</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>3</td>\n",
+       "      <td>Ja, ich habe den Wunsch.\\n</td>\n",
+       "      <td>1949-09-12</td>\n",
+       "      <td>not found</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>4</td>\n",
+       "      <td>Ich erteile dem Herrn Bundespräsidenten das Wo...</td>\n",
+       "      <td>1949-09-12</td>\n",
+       "      <td>not found</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>930955</th>\n",
+       "      <td>1084268</td>\n",
+       "      <td>\\n\\nWir sind zwar Kollegen.</td>\n",
+       "      <td>2022-12-16</td>\n",
+       "      <td>not found</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>930956</th>\n",
+       "      <td>1084269</td>\n",
+       "      <td>\\n\\nLiebe, sehr geehrte Frau Präsidentin!</td>\n",
+       "      <td>2022-12-16</td>\n",
+       "      <td>CDU/CSU</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>930957</th>\n",
+       "      <td>1084270</td>\n",
+       "      <td>\\n\\nVielen Dank.</td>\n",
+       "      <td>2022-12-16</td>\n",
+       "      <td>not found</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>930958</th>\n",
+       "      <td>1084272</td>\n",
+       "      <td>\\n\\nDen Abschluss dieser Aktuellen Stunde bild...</td>\n",
+       "      <td>2022-12-16</td>\n",
+       "      <td>not found</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>930959</th>\n",
+       "      <td>1084273</td>\n",
+       "      <td>\\n\\nSehr geehrte Frau Präsidentin! Werte Kolle...</td>\n",
+       "      <td>2022-12-16</td>\n",
+       "      <td>SPD</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>930960 rows × 4 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "             id                                     speech_content       date  \\\n",
+       "0             0  Meine Damen und Herren! Ich eröffne die 2. Sit... 1949-09-12   \n",
+       "1             1    Der Bundesrat ist versammelt, Herr Präsident.\\n 1949-09-12   \n",
+       "2             2  Ich danke für diese Erklärung. Ich stelle dami... 1949-09-12   \n",
+       "3             3                         Ja, ich habe den Wunsch.\\n 1949-09-12   \n",
+       "4             4  Ich erteile dem Herrn Bundespräsidenten das Wo... 1949-09-12   \n",
+       "...         ...                                                ...        ...   \n",
+       "930955  1084268                        \\n\\nWir sind zwar Kollegen. 2022-12-16   \n",
+       "930956  1084269          \\n\\nLiebe, sehr geehrte Frau Präsidentin! 2022-12-16   \n",
+       "930957  1084270                                   \\n\\nVielen Dank. 2022-12-16   \n",
+       "930958  1084272  \\n\\nDen Abschluss dieser Aktuellen Stunde bild... 2022-12-16   \n",
+       "930959  1084273  \\n\\nSehr geehrte Frau Präsidentin! Werte Kolle... 2022-12-16   \n",
+       "\n",
+       "            party  \n",
+       "0       not found  \n",
+       "1       not found  \n",
+       "2       not found  \n",
+       "3       not found  \n",
+       "4       not found  \n",
+       "...           ...  \n",
+       "930955  not found  \n",
+       "930956    CDU/CSU  \n",
+       "930957  not found  \n",
+       "930958  not found  \n",
+       "930959        SPD  \n",
+       "\n",
+       "[930960 rows x 4 columns]"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
     }
    ],
    "source": [
     "from vectordatabase import load_documents\n",
     "from langchain_community.embeddings import HuggingFaceEmbeddings\n",
     "from langchain_community.vectorstores import FAISS\n",
+    "from datetime import datetime\n",
     "\n",
     "\n",
     "df = pd.read_pickle(\"C:\\\\Users\\Tom\\SynologyDrive\\Tom\\Programming\\\\NLP\\Spaces\\PoliticsToYou\\src\\Speeches\\speeches_1949_09_12\")\n",
+    "df['date'] = pd.to_datetime(df['date'])\n",
     "# Split speeches into documents\n",
+    "df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "c:\\Python\\Lib\\site-packages\\huggingface_hub\\file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
+      "  warnings.warn(\n",
+      "c:\\Python\\Lib\\site-packages\\huggingface_hub\\file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
+      "  warnings.warn(\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Sucessfully created vector store for 1. legislature\n",
+      "Sucessfully created vector store for 2. legislature\n",
+      "Sucessfully created vector store for 3. legislature\n",
+      "Sucessfully created vector store for 4. legislature\n",
+      "Sucessfully created vector store for 5. legislature\n",
+      "Sucessfully created vector store for 6. legislature\n",
+      "Sucessfully created vector store for 7. legislature\n",
+      "Sucessfully created vector store for 8. legislature\n",
+      "Sucessfully created vector store for 9. legislature\n",
+      "Sucessfully created vector store for 10. legislature\n",
+      "Sucessfully created vector store for 11. legislature\n",
+      "Sucessfully created vector store for 12. legislature\n",
+      "Sucessfully created vector store for 13. legislature\n",
+      "Sucessfully created vector store for 14. legislature\n",
+      "Sucessfully created vector store for 15. legislature\n",
+      "Sucessfully created vector store for 16. legislature\n",
+      "Sucessfully created vector store for 17. legislature\n",
+      "Sucessfully created vector store for 18. legislature\n",
+      "Sucessfully created vector store for 19. legislature\n",
+      "Sucessfully created vector store for 20. legislature\n"
+     ]
+    }
+   ],
+   "source": [
+    "\n",
+    "dates = [\"1953-10-06\", \"1957-10-16\", \"1961-10-17\", \"1965-10-19\", \"1969-10-20\", \"1972-12-13\", \"1976-12-14\", \"1980-11-04\", \"1983-03-29\", \"1987-02-18\",\"1990-12-20\", \"1994-11-10\", \"1998-10-26\", \"2002-10-17\", \"2005-10-18\", \"2009-10-27\", \"2013-10-22\",\"2017-10-24\",\"2021-10-26\", None]\n",
     "embeddings = HuggingFaceEmbeddings(model_name=\"paraphrase-multilingual-MiniLM-L12-v2\")\n",
+    "\n",
+    "# Iterate over all dates to split the speeches by legislature, building one vector store per period\n",
+    "\n",
+    "period = 1\n",
+    "previous_date = None\n",
+    "for date in dates:\n",
+    "    if previous_date is None:\n",
+    "        legislature = df.loc[df['date'] < datetime.strptime(date, \"%Y-%m-%d\")]\n",
+    "    elif date is None:\n",
+    "        legislature = df.loc[df['date'] >= datetime.strptime(previous_date, \"%Y-%m-%d\")]\n",
+    "    else:\n",
+    "        legislature = df.loc[(df['date'] >= datetime.strptime(previous_date, \"%Y-%m-%d\")) & (df['date'] < datetime.strptime(date, \"%Y-%m-%d\"))]\n",
+    "\n",
+    "   \n",
+    "    # Split text into documents\n",
+    "    documents = load_documents(legislature)\n",
+    "    index_name = f'{period}_legislature'\n",
+    "    db = FAISS.from_documents(documents, embeddings)\n",
+    "    db.save_local(folder_path=\"FAISS\", index_name=index_name)\n",
+    "    print(f\"Sucessfully created vector store for {period}. legislature\")\n",
+    "    # Change for next iteration\n",
+    "    period += 1\n",
+    "    previous_date = date\n",
+    "\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "\n"
    ]
   }
  ],

src/FAISS/10_legislature.faiss ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:66ac4627657617d20657ee29e060d5e0201f23474f2bb316fcfcd1e784347c83
+size 238133805

src/FAISS/10_legislature.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:63591732c250d403c23d68d007f2f0091d1e4393c8f06131c59dd0756c08e479
+size 107921064

src/FAISS/11_legislature.faiss ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f083f538427463ff2d9dfd1a7f1dddc7b0ec944ae7c4a0c0f110d71e268c7eb5
+size 234221613

src/FAISS/11_legislature.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:aba4a26fd81efa5416bbdadcf5416a9c43d5a24fe319073de8d862812e91eb77
+size 109009058

src/FAISS/12_legislature.faiss ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1651a9d458986410808873d8fbc8609181ec55726827375987834975eaea303e
+size 252945453

src/FAISS/12_legislature.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:98a03cba6aef98df4cb4f147f21072b14d647f6ed8fab51b5d9000a93934c53b
+size 120628791

src/FAISS/13_legislature.faiss ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9349d40b633b9c9265e11cce8466137ba47e1622550291d37587e9cf57ca7f18
+size 256303149

src/FAISS/13_legislature.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cff67a961bec000909289049a63a389d18a0b818394b0a505da5f1867a5dc2cb
+size 123142473

src/FAISS/14_legislature.faiss ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:005fe85f80d6e073f12cc60b3fe32b65f87df5f5ef233c4c6ebe30594b542c18
+size 240325677

src/FAISS/14_legislature.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:98f41d12a44e8a3592f9c829744102328834f09d913305181dc827d54c15ec3f
+size 125554373

src/FAISS/15_legislature.faiss ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8e34018d0aef1cefd2e68851b5e3ae7b4ff1133872e238a37a92358204428ddd
+size 167463981

src/FAISS/15_legislature.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ba506f555d4b8a668233621cb1e006a85fdd5444196d0d83d8b83bcdb09725f1
+size 87021966

src/FAISS/16_legislature.faiss ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2924f42f74019b6c55304f6ab9c7e0778926123c2fd46a69b6b3a9d428cc445a
+size 246827565

src/FAISS/16_legislature.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7c219b81c0e07e9911bc0a41dc5d064c07dc77ca968410db3491556fb2da019b
+size 128041546

src/FAISS/17_legislature.faiss ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c99b6b30e3fe81fa31b3cf8384294215a86f6591641916b3a7abf793c08bd51f
+size 309130797

src/FAISS/17_legislature.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:13d510ccd279e3eac2f80ca323ccd14b073c1e1135c7cb9cb1caf73e5a8ebb27
+size 162425243

src/FAISS/18_legislature.faiss ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e40a3278487a92ab9cb94e24bddcc35df8ca8f0f81d0871a3ca3deea9ad07deb
+size 240373293

src/FAISS/18_legislature.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:753d1838eb1aa45449cc728374949af6aea3887a39c80644b73688f259f3bc0c
+size 124641794

src/FAISS/19_legislature.faiss ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3aa5be26dea0ba0048a917ec51b445cfca46b54529437de589a87bb156b047b1
+size 255599661

src/FAISS/19_legislature.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:11ce629b1d179528d2581343bdc5e6ba11762be0d7f0710823e783dd28091334
+size 128545267

src/FAISS/1_legislature.faiss ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:287fba08c823eb8438576944cb8ab3a70ff810f1dec83b8f6ef220d3c8f0e87d
+size 175981101

src/FAISS/1_legislature.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e78cb806846e05044d623763f490bd959fb52d1d6d2a53fea45a1dab8b8414a9
+size 87900606

src/FAISS/20_legislature.faiss ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5eef8fc33a06b508f21d817e4a081b40fc9086238641a7f29b8280700bf41c4e
+size 81005613

src/FAISS/20_legislature.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:27919466bac90278eebddda9de8d4b86f1ed7d1e309606e96e59486704a56dc6
+size 40616936

src/FAISS/2_legislature.faiss ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f63b7e7135fdccc872d93e95653bb3df97ec456f4409f0009abafc4ffe3ab8aa
+size 155570733

src/FAISS/2_legislature.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dca740628c8500c62e37362813068399670351c934912f5586b9d558f1ed81b0
+size 76574674

src/FAISS/3_legislature.faiss ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3c901df3b2be8e4c237ee26296f8c320f75caa63aa8c2381ebd2de530545d87a
+size 120843309

src/FAISS/3_legislature.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9e4ab8b365422001985f274a71c74ffdec6d8e7cc122196257cf92f33ba1127b
+size 59003914

src/FAISS/4_legislature.faiss ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2605fe28e330771a210256307a4638ac4f5a256a19299cfa6b3932015b803cf4
+size 139723821

src/FAISS/4_legislature.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:02511c79b3e38d13815ac02fc1627cc0000a964730efb49a0fe5866fbb7772c5
+size 60800382

src/FAISS/5_legislature.faiss ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f0dc389788da319691d24a9209d8fa0e181d3cb3396e424407cb565c44bd386f
+size 183742509

src/FAISS/5_legislature.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6f5521ea0d8260f20f214eda9f6c726195341b04cd95e6e65613a608fc8540b2
+size 81197730

src/FAISS/6_legislature.faiss ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3ac34ac64c99c302c3fcd7c6c65e7a758d9c2b0f3e4294c7533a3448de49e6ee
+size 134287917

src/FAISS/6_legislature.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fe2f497330f7dfd6ec51cce4d42d5b065bdb27828ab0d85177ee85dac47ad596
+size 62478365

src/FAISS/7_legislature.faiss ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0efa99ff055e4399db9ad5ca85c2538e1441924ea4937c32df4635e23e3e97cb
+size 211250733

src/FAISS/7_legislature.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f229c55d9d44958db68381d0f6805fff269d406cf94260a22c656dd13056b308
+size 96069866

src/FAISS/8_legislature.faiss ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:53a84ee9456604696f83fa82245c13aaa6e60099585d93b375a5f67f6ccdd3d9
+size 200272941

src/FAISS/8_legislature.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3f44bc8d65db7aaabe1bbad09d6022bf97b537f50648f56f9d15465a7d9eb61d
+size 90726289

src/FAISS/9_legislature.faiss ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8e0f48ef95f2d0edb9914ffacffd8e18aed9325b467293a8480f069143407102
+size 113241645

src/FAISS/9_legislature.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3e416d21bcfb120aa5047ba3d556f5df79e16d04676e8fb9e7fdcd8b6116a31e
+size 50705534

src/chatbot.py CHANGED Viewed

@@ -2,6 +2,7 @@ from langchain_core.prompts import ChatPromptTemplate
 from langchain_community.llms.huggingface_hub import HuggingFaceHub
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from src.vectordatabase import RAG, get_vectorstore
 import pandas as pd
 from dotenv import load_dotenv, find_dotenv
@@ -50,47 +51,97 @@ prompt2 = ChatPromptTemplate.from_template("""Beantworte die folgende Frage auf
 )
-folder_path = "./src/FAISS"
-index_name = "speeches_1949_09_12"
 #index_name = "legislature20"
-db = get_vectorstore(embeddings=embeddings, folder_path=folder_path, index_name=index_name)
-def chatbot(message, history, db=db, llm=llm, prompt=prompt2):
     raw_response = RAG(llm=llm, prompt=prompt, db=db, question=message)
     response = raw_response['answer'].split("Antwort: ")[1]
     return response
-# Retrieve speech contents based on keywords
-def keyword_search(query,n=10, db=db, embeddings=embeddings, method='ss', party_filter = 'All'):
     query_embedding = embeddings.embed_query(query)
     if method == 'mmr':
-        df_res = pd.DataFrame(columns=['Speech Content','Date', 'Party', 'Relevance']) # Add Date/Party/Politician
-        results =  db.max_marginal_relevance_search_with_score_by_vector(query_embedding, k = n)
         for doc in results:
             party = doc[0].metadata["party"]
-             #Filter by party input
             if party != party_filter and party_filter != 'All':
-                  continue
             speech_content = doc[0].page_content
             speech_date = doc[0].metadata["date"]
-            score = round(doc[1], ndigits=2) # Relevance based on relevance search
             df_res = pd.concat([df_res, pd.DataFrame({'Speech Content': [speech_content],
-                                                            'Date': [speech_date],
-                                                            'Party': [party],
-                                                            'Relevance': [score]})], ignore_index=True)
         df_res.sort_values('Relevance', inplace=True, ascending=True)
     else:
-        df_res = pd.DataFrame(columns=['Speech Content','Date', 'Party']) # Add Date/Party/Politician #Add filter
-        results = db.similarity_search_by_vector(query_embedding, k = n)
         for doc in results:
             party = doc.metadata["party"]
-            #Filter by party input
             if party != party_filter and party_filter != 'All':
-                  continue
             speech_content = doc.page_content
             speech_date = doc.metadata["date"]
             df_res = pd.concat([df_res, pd.DataFrame({'Speech Content': [speech_content],
-                                                            'Date': [speech_date],
-                                                            'Party': [party]})], ignore_index=True)
-    return df_res

 from langchain_community.llms.huggingface_hub import HuggingFaceHub
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from src.vectordatabase import RAG, get_vectorstore
 import pandas as pd
 from dotenv import load_dotenv, find_dotenv
 )
+#folder_path =
+#index_name = "speeches_1949_09_12"
 #index_name = "legislature20"
+#db = get
def chatbot(message, history, db_inputs, llm=llm, prompt=prompt2):
    """
    Answer a chat message with retrieval-augmented generation.

    Parameters:
    ----------
    message : str
        The user's question; forwarded to `RAG` as the question.
    history : list
        Previous conversation turns. Accepted so the function fits a chat
        callback signature, but not used here.
    db_inputs : list of str
        Selection forwarded to `get_vectorstore` to choose which vector
        store(s) are searched.
    llm : optional
        Language model handed to `RAG` (default is the module-level `llm`).
    prompt : optional
        Prompt template handed to `RAG` (default is `prompt2`).

    Returns:
    -------
    str
        The text following the first "Antwort: " marker in the model output,
        or the complete model output when the marker is absent.
    """
    db = get_vectorstore(inputs=db_inputs, embeddings=embeddings)
    raw_response = RAG(llm=llm, prompt=prompt, db=db, question=message)

    # Only necessary because mistral does not give beautiful outputs: the
    # generated text carries an "Antwort: " marker in front of the answer.
    answer = raw_response['answer']
    parts = answer.split("Antwort: ")
    if len(parts) < 2:
        # Marker missing: return the raw answer instead of raising IndexError.
        return answer
    return parts[1]
def keyword_search(db, query, n=10, embeddings=embeddings, method='ss', party_filter='All'):
    """
    Retrieve speech contents matching a query from a vector store.

    Parameters:
    ----------
    db : FAISS
        Vector store to search. It must provide
        `max_marginal_relevance_search_with_score_by_vector` and
        `similarity_search_by_vector`.
    query : str
        The keyword(s) to search for; embedded with `embeddings.embed_query`.
    n : int, optional
        Number of documents requested from the vector store (default is 10).
        Party filtering happens afterwards, so fewer rows may be returned.
    embeddings : Embeddings, optional
        Embedding model used to embed the query (default is the module-level
        `embeddings`).
    method : str, optional
        'mmr' selects maximal marginal relevance search; any other value
        (default 'ss') selects plain similarity search.
    party_filter : str, optional
        Keep only documents whose `party` metadata equals this value.
        'All' (default) disables the filter.

    Returns:
    -------
    pandas.DataFrame
        Columns 'Speech Content', 'Date' and 'Party'. With method 'mmr' an
        additional 'Relevance' column holds the score rounded to two decimals
        and the rows are sorted by it in ascending order.
    """
    query_embedding = embeddings.embed_query(query)
    use_mmr = method == 'mmr'

    if use_mmr:
        # Maximal Marginal Relevance: the store yields (document, score) pairs.
        columns = ['Speech Content', 'Date', 'Party', 'Relevance']
        hits = db.max_marginal_relevance_search_with_score_by_vector(query_embedding, k=n)
    else:
        # Similarity Search: the store yields bare documents, so pair each
        # with a placeholder score to share the loop below.
        columns = ['Speech Content', 'Date', 'Party']
        hits = [(doc, None) for doc in db.similarity_search_by_vector(query_embedding, k=n)]

    # Collect plain rows and build the frame once; concatenating inside the
    # loop copies the whole frame on every iteration.
    rows = []
    for doc, score in hits:
        party = doc.metadata["party"]
        if party_filter != 'All' and party != party_filter:
            continue
        row = {
            'Speech Content': doc.page_content,
            'Date': doc.metadata["date"],
            'Party': party,
        }
        if use_mmr:
            row['Relevance'] = round(score, ndigits=2)
        rows.append(row)

    df_res = pd.DataFrame(rows, columns=columns)
    if use_mmr:
        df_res.sort_values('Relevance', inplace=True, ascending=True)
    return df_res

src/vectordatabase.py CHANGED Viewed

@@ -6,6 +6,15 @@ from langchain_community.llms import HuggingFaceHub
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.chains.combine_documents import create_stuff_documents_chain
 from langchain.chains import create_retrieval_chain
 import os
 #from dotenv import load_dotenv
@@ -29,19 +38,72 @@ def load_documents(df):
     documents = splitter.split_documents(documents=data)
     return documents
-def get_vectorstore(embeddings, folder_path, index_name):
-    path = folder_path + "/" + index_name
-    print(path)
-    # To Do: Dynamicly update and merge verctorstores
-    #if os.path.exists(path):
-    db = FAISS.load_local(folder_path=folder_path, index_name=index_name,
                                             embeddings=embeddings, allow_dangerous_deserialization=True)
-    #else:
-        #db = FAISS.from_documents(documents, embeddings)
-        #db.save_local(folder_path=folder_path, index_name=index_name)
-        #pass
     return db
 # Apply RAG by providing the context and the question to the LLM using the predefined template
 def RAG(llm, prompt, db, question):
     document_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
@@ -51,3 +113,21 @@ def RAG(llm, prompt, db, question):
     response = retrieval_chain.invoke({"input": question})
     return response

 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.chains.combine_documents import create_stuff_documents_chain
 from langchain.chains import create_retrieval_chain
+from faiss import IndexFlatL2
+from langchain_community.docstore.in_memory import InMemoryDocstore
+from langchain.embeddings import SentenceTransformerEmbeddings
+import functools
+import pandas as pd
 import os
 #from dotenv import load_dotenv
     documents = splitter.split_documents(documents=data)
     return documents
+#@functools.lru_cache()
def get_vectorstore(inputs, embeddings):
    """
    Combine multiple FAISS vector stores into a single vector store based on the specified inputs.

    Parameters:
    ----------
    inputs : list of str
        Which vector stores to combine. Each entry is either the keyword
        "All" or a label whose text before the first "." is the index number
        (e.g. "20. Legislaturperiode" selects the index "20_legislature").
    embeddings : Embeddings
        Embedding model used to load the stored indices and to determine the
        vector dimensionality of the merged store.

    Returns:
    -------
    FAISS
        The comprehensive store "speeches_1949_09_12" when "All" is among
        `inputs`; otherwise a new store containing the merged contents of
        every selected index.

    Raises:
    ------
    ValueError
        If `inputs` is empty or None.

    Notes:
    -----
    - Index files are read from "./src/FAISS".
    - The dimensionality of the empty store is taken from the length of
      `embeddings.embed_query("dummy")`, so it always matches the model.
    - `FAISS.load_local` is called with `allow_dangerous_deserialization=True`,
      which unpickles the `.pkl` files. NOTE(review): this is only safe while
      those files come from a trusted source (here: the repository itself).
    """
    if not inputs:
        raise ValueError("inputs must contain at least one vector store selection")

    # Default folder path
    folder_path = "./src/FAISS"

    # "All" anywhere in the selection short-circuits to the combined index.
    if "All" in inputs:
        return FAISS.load_local(folder_path=folder_path, index_name="speeches_1949_09_12",
                                embeddings=embeddings, allow_dangerous_deserialization=True)

    # Initialize empty db sized to the embedding model's output dimension
    dimensions: int = len(embeddings.embed_query("dummy"))
    db = FAISS(
        embedding_function=embeddings,
        index=IndexFlatL2(dimensions),
        docstore=InMemoryDocstore(),
        index_to_docstore_id={},
        normalize_L2=False
    )

    # Retrieve inputs: 20. Legislaturperiode, 19. Legislaturperiode, ...
    for selection in inputs:
        # The number before the first "." identifies the stored index.
        legislature = selection.split(".")[0]
        local_db = FAISS.load_local(folder_path=folder_path, index_name=f'{legislature}_legislature',
                                    embeddings=embeddings, allow_dangerous_deserialization=True)
        db.merge_from(local_db)
    return db
 # Apply RAG by providing the context and the question to the LLM using the predefined template
 def RAG(llm, prompt, db, question):
     document_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
     response = retrieval_chain.invoke({"input": question})
     return response
+#########
+# Dynamically loading vector_db
+##########
def get_similar_vectorstore(start_date, end_date, party, base_path='src/FAISS'):
    """
    Scan a folder for FAISS index files whose file name matches a party and start date.

    NOTE(review): this function is unfinished. It identifies matching files
    but does not record or return them yet, so it always returns None.

    Parameters:
    ----------
    start_date : str
        Compared against the start-date field of each file name with `<=`.
        NOTE(review): this is a plain string comparison; presumably dates are
        formatted so that lexicographic order equals date order — confirm.
    end_date : str
        Not used by the current implementation.
    party : str
        Party that the file name's party field must equal.
    base_path : str, optional
        Folder containing the index files (default is 'src/FAISS').

    Returns:
    -------
    None
    """
    # Get all file names
    vector_stores = [store for store in os.listdir(base_path) if store.endswith(".faiss")]
    # Placeholder for the collected matches; nothing is written to it yet.
    df = pd.DataFrame(columns=["file_name", "start_date", "end_date", "date_diff"])

    # Extract metadata of file from its name
    for file_name in vector_stores:
        file_name = file_name.split(".")[0]
        file_elements = file_name.split("_")
        # Names without the four "_"-separated fields read below (for example
        # "10_legislature") carry no date/party metadata and are skipped.
        if len(file_elements) < 4:
            continue
        file_start_date, file_end_date, file_party = file_elements[1:4]
        if file_party == party and file_start_date <= start_date:
            # TODO: record the matching store in `df`.
            pass