Spaces (Runtime error)
Commit 0217fc8 • Parent(s): 0af052d
khaiphan29 committed: Upload folder using huggingface_hub
Changed files:
- .gitattributes +1 -0
- Dockerfile +14 -0
- README.md +4 -6
- main.py +77 -0
- requirements.txt +63 -0
- script.py +5 -0
- src/.DS_Store +0 -0
- src/__init__.py +0 -0
- src/crawler.py +256 -0
- src/mDeBERTa (ft) V6/.DS_Store +0 -0
- src/mDeBERTa (ft) V6/cls.pt +3 -0
- src/mDeBERTa (ft) V6/cls_log.txt +76 -0
- src/mDeBERTa (ft) V6/mDeBERTa-v3-base-mnli-xnli-mean/config.json +45 -0
- src/mDeBERTa (ft) V6/mDeBERTa-v3-base-mnli-xnli-mean/model.safetensors +3 -0
- src/mDeBERTa (ft) V6/mean.pt +3 -0
- src/mDeBERTa (ft) V6/mean_log.txt +76 -0
- src/mDeBERTa (ft) V6/plot.png +0 -0
- src/mDeBERTa (ft) V6/public_train_v4.json +3 -0
- src/myNLI.py +190 -0
- src/nli_v3.py +115 -0
- src/utils.py +12 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.json filter=lfs diff=lfs merge=lfs -text
Dockerfile
ADDED
@@ -0,0 +1,14 @@
# read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
# you will also find guides on how best to write your Dockerfile

FROM python:3.9

WORKDIR /code

COPY ./requirements.txt /code/requirements.txt

RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

COPY . .

CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
README.md
CHANGED
@@ -1,11 +1,9 @@
 ---
-title: Fact
-emoji:
-colorFrom:
+title: Fact Checking Api
+emoji: 📊
+colorFrom: pink
 colorTo: blue
-sdk:
-sdk_version: 4.12.0
-app_file: app.py
+sdk: docker
 pinned: false
 ---
main.py
ADDED
@@ -0,0 +1,77 @@
#uvicorn main:app --reload
from fastapi import FastAPI, status
from fastapi.responses import Response, JSONResponse
from pydantic import BaseModel

from typing import List

import os
import json
import time

from src.myNLI import FactChecker
from src.crawler import MyCrawler

# request body
class Claim(BaseModel):
    claim: str

class ScrapeBase(BaseModel):
    id: int
    name: str
    scraping_url: str

class ScrapeList(BaseModel):
    data: List[ScrapeBase]

app = FastAPI()

# load model
t_0 = time.time()
fact_checker = FactChecker()
t_load = time.time() - t_0
print("time load model: {}".format(t_load))

crawler = MyCrawler()

label_code = {
    "REFUTED": 0,
    "SUPPORTED": 1,
    "NEI": 2
}

@app.get("/")
async def root():
    return {"msg": "This is for interacting with Fact-checking AI Model"}

@app.post("/ai-fact-check")
async def get_claim(req: Claim):
    claim = req.claim
    result = fact_checker.predict(claim)
    print(result)

    if not result:
        return Response(status_code=status.HTTP_204_NO_CONTENT)

    return { "claim": claim,
             "final_label": label_code[result["label"]],
             "evidence": result["evidence"],
             "provider": result["provider"],
             "url": result["url"]
    }

@app.post("/scraping-check")
async def get_claim(req: ScrapeList):
    response = []
    for ele in req.data:
        response.append({
            "id": ele.id,
            "name": ele.name,
            "scraping_url": ele.scraping_url,
            "status": crawler.scraping(ele.scraping_url)
        })

    return JSONResponse({
        "list": response
    })
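
A minimal client sketch for the two POST routes above (not part of the commit). It assumes the app is running locally on the port from the Dockerfile CMD; the article URL in the second call is a placeholder. Note that both handlers are declared with the same function name get_claim, which still works because FastAPI registers each route when its decorator runs.

import requests

BASE_URL = "http://localhost:7860"  # assumption: local run on the Dockerfile port

# /ai-fact-check takes a body matching the Claim model
resp = requests.post(
    f"{BASE_URL}/ai-fact-check",
    json={"claim": "Filip Nguyễn đủ điều kiện dự Asian Cup 2024"},
)
if resp.status_code == 204:
    print("model returned no result")
else:
    print(resp.json())  # claim, final_label (0/1/2), evidence, provider, url

# /scraping-check takes a body matching the ScrapeList model
payload = {"data": [{"id": 1, "name": "VnExpress", "scraping_url": "https://vnexpress.net/example-article.html"}]}  # placeholder URL
print(requests.post(f"{BASE_URL}/scraping-check", json=payload).json())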
requirements.txt
ADDED
@@ -0,0 +1,63 @@
aiohttp==3.9.1
aiosignal==1.3.1
annotated-types==0.6.0
anyio==4.2.0
async-timeout==4.0.3
attrs==23.2.0
beautifulsoup4==4.12.2
certifi==2023.11.17
charset-normalizer==3.3.2
click==8.1.7
datasets==2.16.1
dill==0.3.7
exceptiongroup==1.2.0
fastapi==0.108.0
filelock==3.13.1
frozenlist==1.4.1
fsspec==2023.10.0
h11==0.14.0
huggingface-hub==0.20.1
idna==3.6
Jinja2==3.1.2
joblib==1.3.2
MarkupSafe==2.1.3
mpmath==1.3.0
multidict==6.0.4
multiprocess==0.70.15
networkx==3.2.1
nltk==3.8.1
numpy==1.26.2
packaging==23.2
pandas==2.1.4
Pillow==10.1.0
pyarrow==14.0.2
pyarrow-hotfix==0.6
pydantic==2.5.3
pydantic_core==2.14.6
python-dateutil==2.8.2
pytz==2023.3.post1
PyYAML==6.0.1
regex==2023.12.25
requests==2.31.0
safetensors==0.4.1
scikit-learn==1.3.2
scipy==1.11.4
sentence-transformers==2.2.2
sentencepiece==0.1.99
six==1.16.0
sniffio==1.3.0
soupsieve==2.5
starlette==0.32.0.post1
sympy==1.12
threadpoolctl==3.2.0
tokenizers==0.15.0
torch==2.1.2
torchvision==0.16.2
tqdm==4.66.1
transformers==4.36.2
typing_extensions==4.9.0
tzdata==2023.4
urllib3==2.1.0
uvicorn==0.25.0
xxhash==3.4.1
yarl==1.9.4
script.py
ADDED
@@ -0,0 +1,5 @@
api.upload_folder(
    folder_path="./src",
    repo_id="khaiphan29/fact-check-api",
    repo_type="space",
)
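
script.py calls upload_folder on an api object that is never created in the file, so it is not runnable on its own. A self-contained sketch is below; the HfApi() construction and the logged-in-token assumption are additions, only the upload_folder call comes from the commit.

from huggingface_hub import HfApi

api = HfApi()  # assumption: a write token is already configured, e.g. via `huggingface-cli login`
api.upload_folder(
    folder_path="./src",
    repo_id="khaiphan29/fact-check-api",
    repo_type="space",
)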
src/.DS_Store
ADDED
Binary file (6.15 kB)
src/__init__.py
ADDED
File without changes
src/crawler.py
ADDED
@@ -0,0 +1,256 @@
import requests
from bs4 import BeautifulSoup
import re
import time

from .utils import timer_func

def remove_emoji(string):
    emoji_pattern = re.compile("["
                               u"\U0001F300-\U0001FAD6"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

def preprocess(texts):
    texts = [text.replace("_", " ") for text in texts]
    texts = [i.lower() for i in texts]
    texts = [remove_emoji(i) for i in texts]

    texts = [re.sub('[^\w\d\s]', '', i) for i in texts]

    texts = [re.sub('\s+|\n', ' ', i) for i in texts]
    texts = [re.sub('^\s|\s$', '', i) for i in texts]

    # texts = [ViTokenizer.tokenize(i) for i in texts]

    return texts


class MyCrawler:
    headers = {
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36",
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'DNT': '1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1'
    }

    # headers = {
    #     'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0',
    #     # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    #     # 'Accept-Language': 'en-US,en;q=0.5',
    #     # 'Accept-Encoding': 'gzip, deflate',
    #     # 'DNT': '1',
    #     # 'Connection': 'keep-alive',
    #     # 'Upgrade-Insecure-Requests': '1'
    # }

    def getSoup(self, url: str):
        req = requests.get(url, headers=self.headers)
        return BeautifulSoup(req.text, 'html.parser')

    def crawl_byContainer(self, url: str, article_container: str, body_class: str):
        soup = self.getSoup(url)

        paragraphs = soup.find(article_container, {"class": body_class})
        if paragraphs:
            # Crawl all paragraphs
            contents = []
            numOfParagraphs = 0
            for p in paragraphs.find_all("p"):
                contents.append(p.get_text())
                numOfParagraphs += 1
                # if numOfParagraphs > 10:
                #     break

            if contents:
                result = "\n".join(contents)
                if (url.split("/")[2] == "vnexpress.net"):
                    result = self.crawl_byElement(soup, "p", "description") + "\n" + result

                return result
        return ""

    def crawl_byElement(self, soup, element: str, ele_class: str):
        print("by Elements...")

        paragraph = soup.find(element, {"class": ele_class})
        if paragraph:
            print(paragraph.get_text())
            return paragraph.get_text()
        return ""

    def crawl_webcontent(self, url: str):

        provider = url.split("/")[2]
        content = ""

        if provider == "thanhnien.vn" or provider == "tuoitre.vn":
            content = self.crawl_byContainer(url, "div", "afcbc-body")
        elif provider == "vietnamnet.vn":
            content = self.crawl_byContainer(url, "div", "maincontent")
        elif provider == "vnexpress.net":
            content = self.crawl_byContainer(url, "article", "fck_detail")
        elif provider == "www.24h.com.vn":
            content = self.crawl_byContainer(url, "article", "cate-24h-foot-arti-deta-info")
        elif provider == "vov.vn":
            content = self.crawl_byContainer(url, "div", "article-content")
        elif provider == "vtv.vn":
            content = self.crawl_byContainer(url, "div", "ta-justify")
        elif provider == "vi.wikipedia.org":
            content = self.crawl_byContainer(url, "div", "mw-content-ltr")
        elif provider == "www.vinmec.com":
            content = self.crawl_byContainer(url, "div", "block-content")

        elif provider == "vietstock.vn":
            content = self.crawl_byContainer(url, "div", "single_post_heading")
        elif provider == "vneconomy.vn":
            content = self.crawl_byContainer(url, "article", "detail-wrap")

        elif provider == "dantri.com.vn":
            content = self.crawl_byContainer(url, "article", "singular-container")

        # elif provider == "plo.vn":
        #     content = self.crawl_byContainer(url, "div", "article__body")

        return provider, url, content

    #def crawl_redir(url):

    @timer_func
    def search(self, claim: str, count: int = 1):
        processed_claim = preprocess([claim])[0]

        num_words = 100
        ls_word = processed_claim.split(" ")
        claim_short = " ".join(ls_word[:num_words])

        print(claim_short)
        query = claim_short
        # query = '+'.join(claim_short.split(" "))

        try:
            # print(soup.prettify())

            # get all URLs
            attemp_time = 0
            urls = []
            while len(urls) == 0 and attemp_time < 3:
                req = requests.get("https://www.bing.com/search?", headers=self.headers, params={
                    "q": query,
                    "responseFilter": "-images",
                    "responseFilter": "-videos"
                })
                print("Query URL: " + req.url)

                print("Crawling Attempt " + str(attemp_time))
                soup = BeautifulSoup(req.text, 'html.parser')

                completeData = soup.find_all("li", {"class": "b_algo"})
                for data in completeData:
                    urls.append(data.find("a", href=True)["href"])
                attemp_time += 1
                time.sleep(1)

            print("Got " + str(len(urls)) + " urls")

            result = []

            for url in urls:
                print("Crawling... " + url)
                provider, url, content = self.crawl_webcontent(url)

                if content:
                    result.append({
                        "provider": provider,
                        "url": url,
                        "content": content
                    })
                    count -= 1
                    if count == 0:
                        break

            return result

        except Exception as e:
            print(e)
            return []

    @timer_func
    def searchGoogle(self, claim: str, count: int = 1):
        processed_claim = preprocess([claim])[0]

        num_words = 100
        ls_word = processed_claim.split(" ")
        claim_short = " ".join(ls_word[:num_words])

        print(claim_short)
        query = claim_short
        # query = '+'.join(claim_short.split(" "))

        try:
            # print(soup.prettify())

            # get all URLs
            attemp_time = 0
            urls = []
            while len(urls) == 0 and attemp_time < 3:
                req = requests.get("https://www.google.com/search?", headers=self.headers, params={
                    "q": query
                })
                print("Query URL: " + req.url)

                print("Crawling Attempt " + str(attemp_time))
                soup = BeautifulSoup(req.text, 'html.parser')

                completeData = soup.find_all("a", {"jsname": "UWckNb"})
                for data in completeData:
                    urls.append(data["href"])
                attemp_time += 1
                time.sleep(1)

            print("Got " + str(len(urls)) + " urls")

            result = []

            for url in urls:
                print("Crawling... " + url)
                provider, url, content = self.crawl_webcontent(url)

                if content:
                    result.append({
                        "provider": provider,
                        "url": url,
                        "content": content
                    })
                    count -= 1
                    if count == 0:
                        break

            return result

        except Exception as e:
            print(e)
            return []

    @timer_func
    def scraping(self, url: str):
        try:
            provider, url, content = self.crawl_webcontent(url)

            if content:
                return True
            return False

        except Exception as e:
            print(e)
            return False
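
A usage sketch for MyCrawler (not part of the commit), run from the repository root so the package-relative import in src/crawler.py resolves. The claim string is the sample from src/nli_v3.py; the article URL is a placeholder.

from src.crawler import MyCrawler

crawler = MyCrawler()

# search() queries Bing and searchGoogle() queries Google; both return a list of
# {"provider", "url", "content"} dicts with at most `count` non-empty articles.
evidences = crawler.searchGoogle("Filip Nguyễn đủ điều kiện dự Asian Cup 2024", count=1)
for ev in evidences:
    print(ev["provider"], ev["url"], ev["content"][:200])

# scraping() only reports whether a URL's article body can be extracted
print(crawler.scraping("https://vnexpress.net/example-article.html"))  # placeholder URL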
src/mDeBERTa (ft) V6/.DS_Store
ADDED
Binary file (6.15 kB)
src/mDeBERTa (ft) V6/cls.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f1c3c8eae44569fd01a746b220091611125f9eb04e09af2d60a6d80befcdb769
size 11064
src/mDeBERTa (ft) V6/cls_log.txt
ADDED
@@ -0,0 +1,76 @@
Step 0 -- Accuracy: 0.3039772727272727 -- macro_f1: 0.20810584530698015 -- loss: 1.0453389883041382
Step 100 -- Accuracy: 0.859375 -- macro_f1: 0.8598470398571504 -- loss: 0.11795929819345474
Step 200 -- Accuracy: 0.8747159090909091 -- macro_f1: 0.8755251824421424 -- loss: 0.22730453312397003
Step 300 -- Accuracy: 0.8536931818181818 -- macro_f1: 0.8533303214529117 -- loss: 0.18725647032260895
Step 400 -- Accuracy: 0.8690340909090909 -- macro_f1: 0.8687299763460793 -- loss: 0.28860458731651306
Step 500 -- Accuracy: 0.8798295454545455 -- macro_f1: 0.8802316356122608 -- loss: 0.6372634172439575
Step 600 -- Accuracy: 0.8610795454545455 -- macro_f1: 0.8612099869711884 -- loss: 0.41530805826187134
Step 700 -- Accuracy: 0.8491477272727272 -- macro_f1: 0.849751664990205 -- loss: 0.5970628261566162
Step 800 -- Accuracy: 0.8764204545454546 -- macro_f1: 0.8766266441048876 -- loss: 0.2515469491481781
Step 900 -- Accuracy: 0.8710227272727272 -- macro_f1: 0.8712350728851791 -- loss: 0.619756817817688
Step 1000 -- Accuracy: 0.8744318181818181 -- macro_f1: 0.8746062203201398 -- loss: 0.5634986758232117
Step 1100 -- Accuracy: 0.8735795454545454 -- macro_f1: 0.8735921715063891 -- loss: 0.2514641284942627
Step 1200 -- Accuracy: 0.8375 -- macro_f1: 0.8368621880475362 -- loss: 0.44521981477737427
Step 1300 -- Accuracy: 0.8551136363636364 -- macro_f1: 0.8555806721970362 -- loss: 0.048632219433784485
Step 1400 -- Accuracy: 0.8508522727272727 -- macro_f1: 0.8506097642423027 -- loss: 0.24613773822784424
Step 1500 -- Accuracy: 0.8673295454545454 -- macro_f1: 0.8671847303392856 -- loss: 0.1494443565607071
Step 1600 -- Accuracy: 0.834375 -- macro_f1: 0.8342641066244109 -- loss: 0.17161081731319427
Step 1700 -- Accuracy: 0.865625 -- macro_f1: 0.8651594643017528 -- loss: 0.154042050242424
Step 1800 -- Accuracy: 0.865909090909091 -- macro_f1: 0.8657615265484808 -- loss: 0.1435176134109497
Step 1900 -- Accuracy: 0.8176136363636364 -- macro_f1: 0.8171586288909666 -- loss: 0.09292535483837128
Step 2000 -- Accuracy: 0.8440340909090909 -- macro_f1: 0.843042759250924 -- loss: 0.34320467710494995
Step 2100 -- Accuracy: 0.8428977272727273 -- macro_f1: 0.8428498174495328 -- loss: 0.5764151811599731
Step 2200 -- Accuracy: 0.8417613636363637 -- macro_f1: 0.8418818479059557 -- loss: 0.28757143020629883
Step 2300 -- Accuracy: 0.840625 -- macro_f1: 0.8406394626850148 -- loss: 0.8960273861885071
Step 2400 -- Accuracy: 0.8142045454545455 -- macro_f1: 0.8140964442024906 -- loss: 0.8550783395767212
Step 2500 -- Accuracy: 0.8144886363636363 -- macro_f1: 0.8147455224461172 -- loss: 0.39625313878059387
Step 2600 -- Accuracy: 0.8053977272727273 -- macro_f1: 0.8021211300036969 -- loss: 0.3774358034133911
Step 2700 -- Accuracy: 0.8292613636363636 -- macro_f1: 0.8292382309283113 -- loss: 0.16644884645938873
Step 2800 -- Accuracy: 0.8150568181818182 -- macro_f1: 0.814290740222007 -- loss: 0.237399160861969
Step 2900 -- Accuracy: 0.8107954545454545 -- macro_f1: 0.8111709474507229 -- loss: 0.5621077418327332
Step 3000 -- Accuracy: 0.7926136363636364 -- macro_f1: 0.7930916669737708 -- loss: 0.4253169298171997
Step 3100 -- Accuracy: 0.8099431818181818 -- macro_f1: 0.8102288703246834 -- loss: 0.43165838718414307
Step 3200 -- Accuracy: 0.772159090909091 -- macro_f1: 0.7717788019596861 -- loss: 0.673878014087677
Step 3300 -- Accuracy: 0.7897727272727273 -- macro_f1: 0.7895567869064662 -- loss: 0.1990412026643753
Step 3400 -- Accuracy: 0.8008522727272728 -- macro_f1: 0.7997998535844976 -- loss: 0.4523601531982422
Step 3500 -- Accuracy: 0.7798295454545454 -- macro_f1: 0.7780260696858295 -- loss: 0.8848648071289062
Step 3600 -- Accuracy: 0.7775568181818182 -- macro_f1: 0.7779453966289696 -- loss: 0.5041539669036865
Step 3700 -- Accuracy: 0.709659090909091 -- macro_f1: 0.7069128111001839 -- loss: 0.6758942604064941
src/mDeBERTa (ft) V6/mDeBERTa-v3-base-mnli-xnli-mean/config.json
ADDED
@@ -0,0 +1,45 @@
{
  "_name_or_path": "/content/checkpoint",
  "architectures": [
    "DebertaV2Model"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "entailment",
    "1": "neutral",
    "2": "contradiction"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "contradiction": 2,
    "entailment": 0,
    "neutral": 1
  },
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "torch_dtype": "float32",
  "transformers_version": "4.35.0",
  "type_vocab_size": 0,
  "vocab_size": 251000
}
src/mDeBERTa (ft) V6/mDeBERTa-v3-base-mnli-xnli-mean/model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1c7e80e8237ad2969b1c989d71f97fa7b950fd239bfa8b3329f0535a0b8a2aca
size 1112897768
src/mDeBERTa (ft) V6/mean.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7f963dfcdad5469498af3b396c5af0e27365e59a01498c51896b9e6547851cd4
size 11071
src/mDeBERTa (ft) V6/mean_log.txt
ADDED
@@ -0,0 +1,76 @@
Step 0 -- Accuracy: 0.275 -- macro_f1: 0.24245894645844043 -- loss: 1.1975505352020264
Step 100 -- Accuracy: 0.8230113636363636 -- macro_f1: 0.8247917227891541 -- loss: 0.5072745084762573
Step 200 -- Accuracy: 0.8585227272727273 -- macro_f1: 0.8596474113005192 -- loss: 0.3576969504356384
Step 300 -- Accuracy: 0.8616477272727273 -- macro_f1: 0.8619445917534628 -- loss: 0.22678352892398834
Step 400 -- Accuracy: 0.8710227272727272 -- macro_f1: 0.8713149438253084 -- loss: 0.3302939534187317
Step 500 -- Accuracy: 0.8491477272727272 -- macro_f1: 0.8497535984618637 -- loss: 0.8534196615219116
Step 600 -- Accuracy: 0.8627840909090909 -- macro_f1: 0.8630171351987245 -- loss: 0.27207863330841064
Step 700 -- Accuracy: 0.8676136363636363 -- macro_f1: 0.8681189318753203 -- loss: 0.5472040772438049
Step 800 -- Accuracy: 0.8480113636363636 -- macro_f1: 0.8474828960740969 -- loss: 0.20389704406261444
Step 900 -- Accuracy: 0.8625 -- macro_f1: 0.8627369387200629 -- loss: 0.7003616094589233
Step 1000 -- Accuracy: 0.8471590909090909 -- macro_f1: 0.8474576933366409 -- loss: 0.39897170662879944
Step 1100 -- Accuracy: 0.8647727272727272 -- macro_f1: 0.8648449015557045 -- loss: 0.30028393864631653
Step 1200 -- Accuracy: 0.8355113636363637 -- macro_f1: 0.8357176579844655 -- loss: 0.5329824090003967
Step 1300 -- Accuracy: 0.8318181818181818 -- macro_f1: 0.832158484567787 -- loss: 0.04946904629468918
Step 1400 -- Accuracy: 0.8275568181818181 -- macro_f1: 0.8270568913757921 -- loss: 0.290753036737442
Step 1500 -- Accuracy: 0.8619318181818182 -- macro_f1: 0.8620216901652552 -- loss: 0.17760200798511505
Step 1600 -- Accuracy: 0.8366477272727273 -- macro_f1: 0.8372501215741125 -- loss: 0.18745465576648712
Step 1700 -- Accuracy: 0.8556818181818182 -- macro_f1: 0.8555692365839257 -- loss: 0.09077112376689911
Step 1800 -- Accuracy: 0.8571022727272727 -- macro_f1: 0.8569408344903815 -- loss: 0.24079212546348572
Step 1900 -- Accuracy: 0.8122159090909091 -- macro_f1: 0.8117034674801616 -- loss: 0.3681311309337616
Step 2000 -- Accuracy: 0.8318181818181818 -- macro_f1: 0.8319676688379705 -- loss: 0.2374744713306427
Step 2100 -- Accuracy: 0.8443181818181819 -- macro_f1: 0.8442918629955193 -- loss: 0.4600515365600586
Step 2200 -- Accuracy: 0.8278409090909091 -- macro_f1: 0.8269904995679983 -- loss: 0.3283902704715729
Step 2300 -- Accuracy: 0.8298295454545455 -- macro_f1: 0.8299882032010862 -- loss: 1.0965081453323364
Step 2400 -- Accuracy: 0.8159090909090909 -- macro_f1: 0.8159808860940237 -- loss: 0.7295159697532654
Step 2500 -- Accuracy: 0.8159090909090909 -- macro_f1: 0.8142475187664063 -- loss: 0.3925968408584595
Step 2600 -- Accuracy: 0.8204545454545454 -- macro_f1: 0.820545798600696 -- loss: 0.3808274567127228
Step 2700 -- Accuracy: 0.8198863636363637 -- macro_f1: 0.8199413434559383 -- loss: 0.26008090376853943
Step 2800 -- Accuracy: 0.8056818181818182 -- macro_f1: 0.8051566431375038 -- loss: 0.20567485690116882
Step 2900 -- Accuracy: 0.784375 -- macro_f1: 0.7848921849530183 -- loss: 0.5506788492202759
Step 3000 -- Accuracy: 0.8153409090909091 -- macro_f1: 0.8150634367874668 -- loss: 0.4250873923301697
Step 3100 -- Accuracy: 0.7991477272727273 -- macro_f1: 0.8000715520252392 -- loss: 0.4798588752746582
Step 3200 -- Accuracy: 0.7840909090909091 -- macro_f1: 0.7836356305606565 -- loss: 0.5604580640792847
Step 3300 -- Accuracy: 0.7977272727272727 -- macro_f1: 0.7965403402362528 -- loss: 0.26682722568511963
Step 3400 -- Accuracy: 0.809375 -- macro_f1: 0.8087947373143304 -- loss: 0.3252097964286804
Step 3500 -- Accuracy: 0.7568181818181818 -- macro_f1: 0.7548780108676749 -- loss: 0.9467527866363525
Step 3600 -- Accuracy: 0.7889204545454546 -- macro_f1: 0.7892382882596812 -- loss: 0.29441171884536743
Step 3700 -- Accuracy: 0.7227272727272728 -- macro_f1: 0.7227876418017654 -- loss: 0.8389160633087158
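
The two log files above share one line format, so they can be loaded for inspection with a short parser (a sketch, not part of the commit; pandas is already in requirements.txt). plot.png below was presumably produced from data like this, but the plotting code itself is not included.

import re
import pandas as pd

rows = []
with open("src/mDeBERTa (ft) V6/mean_log.txt", encoding="utf-8") as f:
    for line in f:
        m = re.match(r"Step (\d+) -- Accuracy: ([\d.]+) -- macro_f1: ([\d.]+) -- loss: ([\d.]+)", line.strip())
        if m:
            rows.append({
                "step": int(m.group(1)),
                "accuracy": float(m.group(2)),
                "macro_f1": float(m.group(3)),
                "loss": float(m.group(4)),
            })

df = pd.DataFrame(rows)
print(df.head())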
src/mDeBERTa (ft) V6/plot.png
ADDED
src/mDeBERTa (ft) V6/public_train_v4.json
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:56c03b9bb2cab8ffbe138badea76b6275ebad727e99f5040d2a8c21f2dcfaff2
size 227113690
src/myNLI.py
ADDED
@@ -0,0 +1,190 @@
import torch
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification
from sentence_transformers import SentenceTransformer, util
import nltk

# import datasets
from datasets import Dataset, DatasetDict

from typing import List

from .utils import timer_func
from .nli_v3 import NLI_model
from .crawler import MyCrawler

int2label = {0: 'SUPPORTED', 1: 'NEI', 2: 'REFUTED'}

class FactChecker:

    @timer_func
    def __init__(self):
        self.INPUT_TYPE = "mean"
        self.load_model()

    @timer_func
    def load_model(self):
        self.envir = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

        # Load LLM
        self.tokenizer = AutoTokenizer.from_pretrained("MoritzLaurer/mDeBERTa-v3-base-mnli-xnli")  # LOAD mDEBERTa TOKENIZER
        self.mDeBertaModel = AutoModel.from_pretrained(f"src/mDeBERTa (ft) V6/mDeBERTa-v3-base-mnli-xnli-{self.INPUT_TYPE}")  # LOAD FINETUNED MODEL
        # Load classifier model
        self.checkpoints = torch.load(f"src/mDeBERTa (ft) V6/{self.INPUT_TYPE}.pt", map_location=self.envir)

        self.classifierModel = NLI_model(768, torch.tensor([0., 0., 0.])).to(self.envir)
        self.classifierModel.load_state_dict(self.checkpoints['model_state_dict'])

        # Load model for predicting similarity
        self.model_sbert = SentenceTransformer('keepitreal/vietnamese-sbert')

    @timer_func
    def get_similarity_v2(self, src_sents, dst_sents, threshold=0.4):
        corpus_embeddings = self.model_sbert.encode(dst_sents, convert_to_tensor=True)
        top_k = min(5, len(dst_sents))
        ls_top_results = []
        for query in src_sents:
            query_embedding = self.model_sbert.encode(query, convert_to_tensor=True)
            # We use cosine-similarity and torch.topk to find the highest 5 scores
            cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
            top_results = torch.topk(cos_scores, k=top_k)

            # print("\n\n======================\n\n")
            # print("Query:", src_sents)
            # print("\nTop 5 most similar sentences in corpus:")
            ls_top_results.append({
                "top_k": top_k,
                "claim": query,
                "sim_score": top_results,
                "evidences": [dst_sents[idx] for _, idx in zip(top_results[0], top_results[1])],
            })

            # for score, idx in zip(top_results[0], top_results[1]):
            #     print(dst_sents[idx], "(Score: {:.4f})".format(score))
        return None, ls_top_results

    @timer_func
    def inferSample(self, evidence, claim):

        @timer_func
        def mDeBERTa_tokenize(data):  # mDeBERTa model: taking input_ids
            premises = [premise for premise, _ in data['sample']]
            hypothesis = [hypothesis for _, hypothesis in data['sample']]

            with torch.no_grad():
                input_token = (self.tokenizer(premises, hypothesis, truncation=True, return_tensors="pt", padding=True)['input_ids']).to(self.envir)
                embedding = self.mDeBertaModel(input_token).last_hidden_state

                mean_embedding = torch.mean(embedding[:, 1:, :], dim=1)
                cls_embedding = embedding[:, 0, :]

            return {'mean': mean_embedding, 'cls': cls_embedding}

        @timer_func
        def predict_mapping(batch):
            with torch.no_grad():
                predict_label, predict_prob = self.classifierModel.predict_step((batch[self.INPUT_TYPE].to(self.envir), None))
            return {'label': predict_label, 'prob': -predict_prob}

        # Mapping the predicted label into corresponding string labels
        @timer_func
        def output_predictedDataset(predict_dataset):
            for record in predict_dataset:
                labels = int2label[record['label'].item()]
                confidence = record['prob'].item()

                return {'labels': labels, 'confidence': confidence}

        dataset = {'sample': [(evidence, claim)], 'key': [0]}
        output_dataset = DatasetDict({
            'infer': Dataset.from_dict(dataset)
        })

        @timer_func
        def tokenize_dataset():
            tokenized_dataset = output_dataset.map(mDeBERTa_tokenize, batched=True, batch_size=1)
            return tokenized_dataset

        tokenized_dataset = tokenize_dataset()
        tokenized_dataset = tokenized_dataset.with_format("torch", [self.INPUT_TYPE, 'key'])
        # Running inference step
        predicted_dataset = tokenized_dataset.map(predict_mapping, batched=True, batch_size=tokenized_dataset['infer'].num_rows)
        return output_predictedDataset(predicted_dataset['infer'])

    @timer_func
    def predict_vt(self, claim: str) -> List:
        # import pdb; pdb.set_trace()
        # step 1: crawl evidences from web search
        crawler = MyCrawler()
        evidences = crawler.searchGoogle(claim)

        # evidences = crawler.get_evidences(claim)
        # step 2: use sentence embeddings to find the most related sentences
        if len(evidences) == 0:
            return None

        for evidence in evidences:
            print(evidence['url'])
            top_evidence = evidence["content"]

            post_message = nltk.tokenize.sent_tokenize(claim)
            evidences = nltk.tokenize.sent_tokenize(top_evidence)
            _, top_rst = self.get_similarity_v2(post_message, evidences)

            print(top_rst)

            ls_evidence, final_verdict = self.get_result_nli_v2(top_rst)

            print("FINAL: " + final_verdict)
            # _, top_rst = self.get_similarity_v1(post_message, evidences)
            # ls_evidence, final_verdict = self.get_result_nli_v1(post_message, top_rst, evidences)
            return ls_evidence, final_verdict

    @timer_func
    def predict(self, claim):
        crawler = MyCrawler()
        evidences = crawler.searchGoogle(claim)

        if evidences:
            tokenized_claim = nltk.tokenize.sent_tokenize(claim)
            evidence = evidences[0]
            tokenized_evidence = nltk.tokenize.sent_tokenize(evidence["content"])
            # print("TOKENIZED EVIDENCES")
            # print(tokenized_evidence)
            _, top_rst = self.get_similarity_v2(tokenized_claim, tokenized_evidence)

            processed_evidence = "\n".join(top_rst[0]["evidences"])
            print(processed_evidence)

            nli_result = self.inferSample(processed_evidence, claim)
            return {
                "claim": claim,
                "label": nli_result["labels"],
                "confidence": nli_result['confidence'],
                "evidence": processed_evidence if nli_result["labels"] != "NEI" else "",
                "provider": evidence['provider'],
                "url": evidence['url']
            }

    @timer_func
    def predict_nofilter(self, claim):
        crawler = MyCrawler()
        evidences = crawler.searchGoogle(claim)
        tokenized_claim = nltk.tokenize.sent_tokenize(claim)

        evidence = evidences[0]

        processed_evidence = evidence['content']

        nli_result = self.inferSample(processed_evidence, claim)
        return {
            "claim": claim,
            "label": nli_result["labels"],
            "confidence": nli_result['confidence'],
            "evidence": processed_evidence if nli_result["labels"] != "NEI" else "",
            "provider": evidence['provider'],
            "url": evidence['url']
        }
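
An end-to-end usage sketch for FactChecker (not part of the commit). The nltk punkt download is an assumption about the runtime environment: sent_tokenize needs that data and nothing in this commit downloads it.

import nltk
nltk.download("punkt")  # assumption: required by nltk.tokenize.sent_tokenize, not installed by the Dockerfile

from src.myNLI import FactChecker

checker = FactChecker()  # loads the tokenizer, the fine-tuned mDeBERTa encoder and the linear NLI head
result = checker.predict("Filip Nguyễn đủ điều kiện dự Asian Cup 2024")
if result:
    print(result["label"], result["confidence"])   # SUPPORTED / NEI / REFUTED with a softmax probability
    print(result["provider"], result["url"])
else:
    print("no evidence could be crawled for this claim")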
src/nli_v3.py
ADDED
@@ -0,0 +1,115 @@
import torch
from torch import nn as nn
import pandas as pd

from transformers import AutoModel, AutoTokenizer

# import datasets
from datasets import Dataset, DatasetDict

from sklearn.metrics import classification_report
from sklearn.metrics._classification import _check_targets

envir = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

int2label = {0: 'SUPPORTED', 1: 'NEI', 2: 'REFUTED'}

class NLI_model(nn.Module):
    def __init__(self, input_dims, class_weights=torch.tensor([0., 0., 0.])):
        super(NLI_model, self).__init__()

        self.classification = nn.Sequential(
            nn.Linear(input_dims, 3)
        )

        self.criterion = nn.CrossEntropyLoss(class_weights)

    def forward(self, input):
        output_linear = self.classification(input)
        return output_linear

    def training_step(self, train_batch, batch_idx=0):
        input_data, targets = train_batch
        outputs = self.forward(input_data)
        loss = self.criterion(outputs, targets)
        return loss

    def predict_step(self, batch, batch_idx=0):
        input_data, _ = batch
        outputs = self.forward(input_data)
        prob = outputs.softmax(dim=-1)
        sort_prob, sort_indices = torch.sort(-prob, 1)
        return sort_indices[:, 0], sort_prob[:, 0]

    def validation_step(self, val_batch, batch_idx=0):
        _, targets = val_batch
        sort_indices, _ = self.predict_step(val_batch, batch_idx)
        report = classification_report(list(targets.to('cpu').numpy()), list(sort_indices.to('cpu').numpy()), output_dict=True, zero_division=1)
        return report

    def test_step(self, batch, dict_form, batch_idx=0):
        _, targets = batch
        sort_indices, _ = self.predict_step(batch, batch_idx)
        report = classification_report(targets.to('cpu').numpy(), sort_indices.to('cpu').numpy(), output_dict=dict_form, zero_division=1)
        return report

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=1e-5)


def inferSample(evidence, claim, tokenizer, mDeBertaModel, classifierModel, input_type):

    def mDeBERTa_tokenize(data):  # mDeBERTa model: taking input_ids
        premises = [premise for premise, _ in data['sample']]
        hypothesis = [hypothesis for _, hypothesis in data['sample']]

        with torch.no_grad():
            input_token = (tokenizer(premises, hypothesis, truncation=True, return_tensors="pt", padding=True)['input_ids']).to(envir)
            embedding = mDeBertaModel(input_token).last_hidden_state

            mean_embedding = torch.mean(embedding[:, 1:, :], dim=1)
            cls_embedding = embedding[:, 0, :]

        return {'mean': mean_embedding, 'cls': cls_embedding}

    def predict_mapping(batch):
        with torch.no_grad():
            predict_label, predict_prob = classifierModel.predict_step((batch[input_type].to(envir), None))
        return {'label': predict_label, 'prob': -predict_prob}

    # Mapping the predicted label into corresponding string labels
    def output_predictedDataset(predict_dataset):
        for record in predict_dataset:
            labels = int2label[record['label'].item()]
            confidence = record['prob'].item()

            return {'labels': labels, 'confidence': confidence}

    dataset = {'sample': [(evidence, claim)], 'key': [0]}

    output_dataset = DatasetDict({
        'infer': Dataset.from_dict(dataset)
    })

    tokenized_dataset = output_dataset.map(mDeBERTa_tokenize, batched=True, batch_size=1)
    tokenized_dataset = tokenized_dataset.with_format("torch", [input_type, 'key'])

    # Running inference step
    predicted_dataset = tokenized_dataset.map(predict_mapping, batched=True, batch_size=tokenized_dataset['infer'].num_rows)
    return output_predictedDataset(predicted_dataset['infer'])

if __name__ == '__main__':
    # CHANGE 'INPUT_TYPE' TO CHANGE MODEL
    INPUT_TYPE = 'mean'  # USE "MEAN" OR "CLS" LAST HIDDEN STATE

    # Load LLM
    tokenizer = AutoTokenizer.from_pretrained("MoritzLaurer/mDeBERTa-v3-base-mnli-xnli")  # LOAD mDEBERTa TOKENIZER
    mDeBertaModel = AutoModel.from_pretrained(f"src/mDeBERTa (ft) V6/mDeBERTa-v3-base-mnli-xnli-{INPUT_TYPE}")  # LOAD FINETUNED MODEL
    # Load classifier model
    checkpoints = torch.load(f"src/mDeBERTa (ft) V6/{INPUT_TYPE}.pt", map_location=envir)
    classifierModel = NLI_model(768, torch.tensor([0., 0., 0.])).to(envir)
    classifierModel.load_state_dict(checkpoints['model_state_dict'])

    evidence = "Sau khi thẩm định, Liên đoàn Bóng đá châu Á AFC xác nhận thủ thành mới nhập quốc tịch của Việt Nam Filip Nguyễn đủ điều kiện thi đấu ở Asian Cup 2024."
    claim = "Filip Nguyễn đủ điều kiện dự Asian Cup 2024"
    print(inferSample(evidence, claim, tokenizer, mDeBertaModel, classifierModel, INPUT_TYPE))
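
A minimal sketch of what NLI_model.predict_step returns, using random tensors in place of the mDeBERTa mean/CLS embeddings (an illustration only, not part of the commit).

import torch
from src.nli_v3 import NLI_model, int2label

model = NLI_model(input_dims=768)
model.eval()

dummy_embeddings = torch.randn(2, 768)          # stand-in for two 768-d sentence-pair embeddings
labels, neg_probs = model.predict_step((dummy_embeddings, None))
for idx, p in zip(labels, neg_probs):
    # predict_step returns the index of the most probable class and the negated softmax probability
    print(int2label[idx.item()], -p.item())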
src/utils.py
ADDED
@@ -0,0 +1,12 @@
from time import time

def timer_func(func):
    # This function shows the execution time of
    # the function object passed
    def wrap_func(*args, **kwargs):
        t1 = time()
        result = func(*args, **kwargs)
        t2 = time()
        print(f'Function {func.__name__!r} executed in {(t2-t1):.4f}s')
        return result
    return wrap_func
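
A short usage sketch for the timer_func decorator above (illustration only; the wrapped function is hypothetical).

from src.utils import timer_func

@timer_func
def slow_step(claim: str) -> str:
    return claim.upper()  # placeholder body

slow_step("test claim")  # prints a line like: Function 'slow_step' executed in 0.0001s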