Minseok Bae committed on
Commit
d7b7dc6
·
1 Parent(s): 767187a

Modified for hallucination evaluation task

Browse files
scripts/create_request_file.py CHANGED
@@ -7,9 +7,10 @@ from datetime import datetime, timezone
7
  import click
8
  from colorama import Fore
9
  from huggingface_hub import HfApi, snapshot_download
 
10
 
11
- EVAL_REQUESTS_PATH = "eval-queue"
12
- QUEUE_REPO = "open-llm-leaderboard/requests"
13
 
14
  precisions = ("float16", "bfloat16", "8bit (LLM.int8)", "4bit (QLoRA / FP4)", "GPTQ")
15
  model_types = ("pretrained", "fine-tuned", "RL-tuned", "instruction-tuned")
 
7
  import click
8
  from colorama import Fore
9
  from huggingface_hub import HfApi, snapshot_download
10
+ from util import QUEUE_REPO, EVAL_REQUESTS_PATH
11
 
12
+ # EVAL_REQUESTS_PATH = "eval-queue"
13
+ # QUEUE_REPO = "open-llm-leaderboard/requests"
14
 
15
  precisions = ("float16", "bfloat16", "8bit (LLM.int8)", "4bit (QLoRA / FP4)", "GPTQ")
16
  model_types = ("pretrained", "fine-tuned", "RL-tuned", "instruction-tuned")
src/backend/evaluate_model.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
+ from leaderboard.src.backend.model_operations import SummaryGenerator, EvaluationModel
4
+ from envs import HEM_PATH, SOURCE_PATH
5
+ from leaderboard.src.backend.util import load_dataframe, format_results
6
+
7
class Evaluator:
    """Run the hallucination evaluation pipeline for one model.

    The constructor builds a ``SummaryGenerator`` for the model under test and
    an ``EvaluationModel`` loaded from ``HEM_PATH``; ``evaluate`` then
    generates summaries for the dataset at ``SOURCE_PATH``, scores them and
    returns the formatted results dictionary.

    Args:
        model: Identifier of the model to evaluate (passed to SummaryGenerator).
        revision: Model revision (passed to SummaryGenerator).
        precision: Precision label recorded in the results.
        num_fewshot, batch_size, device, no_cache, limit, write_out,
        output_base_path: Stored on the instance; ``evaluate`` does not read
            them.
    """

    def __init__(self, model, revision, precision, num_fewshot, batch_size, device, no_cache, limit, write_out=True, output_base_path='logs'):
        self.model = model
        self.revision = revision
        self.precision = precision
        self.num_fewshot = num_fewshot
        self.batch_size = batch_size
        self.device = device
        self.no_cache = no_cache
        self.limit = limit
        self.write_out = write_out
        self.output_base_path = output_base_path
        self.summary_generator = SummaryGenerator(model, revision)
        self.eval_model = EvaluationModel(HEM_PATH)

    def evaluate(self):
        """Generate summaries, score them and return the results dict.

        Returns:
            The dictionary produced by ``format_results``.
        """
        df = load_dataframe(SOURCE_PATH)
        generated_summaries_df = self.summary_generator.generate_summaries(df)

        # Both statistics are populated as a side effect of generate_summaries().
        avg_summary_len = self.summary_generator.avg_length
        answer_rate = self.summary_generator.answer_rate

        hallucination_scores = self.eval_model.evaluate_hallucination(generated_summaries_df)

        # compute_accuracy() has to be *called*: it returns the accuracy and is
        # also what sets eval_model.hallucination_rate, so the rate must be
        # read only afterwards.
        accuracy = self.eval_model.compute_accuracy()
        hallucination_rate = self.eval_model.hallucination_rate

        return format_results(
            hallucination_scores,
            self.model,
            self.revision,
            self.precision,
            accuracy,
            hallucination_rate,
            answer_rate,
            avg_summary_len,
        )
37
+
src/backend/manage_requests.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import glob
2
+ import json
3
+ from dataclasses import dataclass
4
+ from typing import Optional
5
+
6
+ from huggingface_hub import HfApi, snapshot_download
7
+ from src.envs import TOKEN
8
+
9
@dataclass
class EvalRequest:
    """One evaluation request, as stored in a request JSON file.

    ``model_type`` used to be declared twice; the later declaration
    (``Optional[str] = None``) silently replaced the earlier one. It is now
    declared once, at its original position and with the effective type and
    default, so field order and construction behaviour are unchanged.
    """

    model: str
    private: bool
    status: str
    json_filepath: str
    weight_type: str = "Original"
    model_type: Optional[str] = None  # pretrained, finetuned, with RL
    precision: str = ""  # float16, bfloat16
    base_model: Optional[str] = None  # for adapter models
    revision: str = "main"  # commit
    # Arbitrary fixed date so that requests can still be ordered by date.
    submitted_time: Optional[str] = "2022-05-18T11:40:22.519222"
    likes: Optional[int] = 0
    params: Optional[int] = None
    license: Optional[str] = ""

    def get_model_args(self):
        """Build the comma-separated model-argument string for this request.

        Returns:
            ``"pretrained=<model>,revision=<revision>,dtype=<precision>"``.

        Raises:
            ValueError: If ``precision`` is not ``"float16"`` or ``"bfloat16"``.
        """
        model_args = f"pretrained={self.model},revision={self.revision}"

        # Quantized precisions (8bit / 4bit / GPTQ) need extra configuration
        # and are not handled here.
        if self.precision not in ("float16", "bfloat16"):
            raise ValueError(f"Unknown precision {self.precision}.")

        model_args += f",dtype={self.precision}"
        return model_args
44
+
45
+
46
def set_eval_request(api: HfApi, eval_request: EvalRequest, set_to_status: str, hf_repo: str, local_dir: str):
    """Rewrite a request file with a new status and upload it to the hub.

    Args:
        api: Client whose ``upload_file`` is used for the upload.
        eval_request: Request whose ``json_filepath`` is rewritten in place.
        set_to_status: New value for the file's ``"status"`` key.
        hf_repo: Dataset repository id to upload to.
        local_dir: Local directory prefix, removed from the file path to form
            the path inside the repository.
    """
    request_path = eval_request.json_filepath

    with open(request_path) as src:
        payload = json.load(src)
    payload["status"] = set_to_status

    with open(request_path, "w") as dst:
        json.dump(payload, dst)

    # The in-repo path is the local path with the local_dir text removed.
    repo_relative_path = request_path.replace(local_dir, "")
    api.upload_file(
        path_or_fileobj=request_path,
        path_in_repo=repo_relative_path,
        repo_id=hf_repo,
        repo_type="dataset",
    )
64
+
65
+
66
def get_eval_requests(job_status: list, local_dir: str, hf_repo: str) -> list[EvalRequest]:
    """Download the request repository and load the requests with a given status.

    The ``main`` revision of the ``hf_repo`` dataset is snapshotted into
    ``local_dir``; every ``*.json`` file found beneath it (recursively) is
    parsed, and those whose ``"status"`` is contained in ``job_status`` are
    turned into ``EvalRequest`` objects. No ordering is applied here.

    Returns:
        `list[EvalRequest]`: the matching requests, in glob order.
    """
    snapshot_download(repo_id=hf_repo, revision="main", local_dir=local_dir, repo_type="dataset", max_workers=60)

    matching = []
    for path in glob.glob(f"{local_dir}/**/*.json", recursive=True):
        with open(path) as handle:
            record = json.load(handle)

        if record["status"] not in job_status:
            continue

        # Remember where the request came from so its status can be rewritten later.
        record["json_filepath"] = path
        matching.append(EvalRequest(**record))

    return matching
87
+
88
+
89
def check_completed_evals(
    api: HfApi,
    hf_repo: str,
    local_dir: str,
    checked_status: str,
    completed_status: str,
    failed_status: str,
    hf_repo_results: str,
    local_dir_results: str,
):
    """Mark requests in ``checked_status`` as completed or failed.

    The results repository is snapshotted into ``local_dir_results`` first.
    Each request currently in ``checked_status`` is then set to
    ``completed_status`` if a ``results*.json`` file exists under
    ``<local_dir_results>/<model>/``, and to ``failed_status`` otherwise.
    """
    snapshot_download(repo_id=hf_repo_results, revision="main", local_dir=local_dir_results, repo_type="dataset", max_workers=60)

    # NOTE(review): checked_status is forwarded as get_eval_requests' job_status,
    # which is tested with `in`; a plain string therefore matches substrings.
    for eval_request in get_eval_requests(checked_status, hf_repo=hf_repo, local_dir=local_dir):
        model = eval_request.model
        print("====================================")
        print(f"Checking {model}")

        results_pattern = f"{local_dir_results}/{model}/results*.json"
        if glob.glob(results_pattern):
            print(
                f"EXISTS output file exists for {model} setting it to {completed_status}"
            )
            new_status = completed_status
        else:
            print(
                f"No result file found for {model} setting it to {failed_status}"
            )
            new_status = failed_status

        set_eval_request(api, eval_request, new_status, hf_repo, local_dir)
src/backend/model_operations.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pandas as pd
3
+ from transformers import AutoModelForCausalLM, AutoTokenizer
4
+ from sentence_transformers import CrossEncoder
5
+
6
+ from leaderboard.src.backend.util import generate_prompt
7
+
8
def load_evaluation_model(model_path):
    """Load a ``CrossEncoder`` from ``model_path`` and save a local copy.

    Args:
        model_path: Model identifier or path handed to ``CrossEncoder``.

    Returns:
        The loaded ``CrossEncoder`` instance.
    """
    model = CrossEncoder(model_path)
    # The path must be an f-string; without the prefix every model was saved
    # into a directory literally named "{model_path}".
    model.save_pretrained(f'.checkpoints/{model_path}')
    return model
12
+
13
class SummaryGenerator:
    """Generate summaries with a causal language model and track basic statistics.

    Attributes:
        summaries_df: DataFrame with ``source``, ``summary`` and ``dataset``
            columns, filled by ``generate_summaries``.
        avg_length: Mean whitespace-separated word count of the non-empty
            summaries (``None`` until summaries are generated).
        answer_rate: Fraction of rows with a non-empty summary (``None`` until
            summaries are generated).
    """

    def __init__(self, model_id, revision):
        # `revision` must be passed by keyword: the second positional parameter
        # of from_pretrained() is not the revision.
        self.tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
        self.model = AutoModelForCausalLM.from_pretrained(model_id, revision=revision)
        self.summaries_df = pd.DataFrame()
        self.revision = revision
        self.avg_length = None
        self.answer_rate = None

    def generate_summaries(self, df):
        """Generate one summary per row of ``df`` and refresh the statistics.

        Args:
            df: DataFrame providing ``text`` and ``dataset`` columns.

        Returns:
            DataFrame with ``source``, ``summary`` and ``dataset`` columns. A
            row whose generation raised gets an empty-string summary.
        """
        source, summary, dataset = [], [], []

        for index, row in df.iterrows():
            _source = row['text']
            _dataset = row['dataset']

            prompt = generate_prompt(_source)
            # `revision` only applies when loading from the hub; it is not a
            # tokenization or generation argument, and generate() rejects
            # unused keyword arguments, which turned every summary into "".
            # NOTE(review): max_length is given without truncation=True, so
            # long prompts are presumably not truncated - confirm intent.
            inputs = self.tokenizer(prompt, return_tensors='pt', max_length=1024)
            try:
                outputs = self.model.generate(**inputs, max_new_tokens=1024, do_sample=False, temperature=0.0)
                response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            except Exception as e:
                # Best effort: record an empty answer and continue with the next row.
                print(f"Error at index {index}: {e}")
                response = ""

            summary.append(response)
            source.append(_source)
            dataset.append(_dataset)

        self.summaries_df = pd.DataFrame(list(zip(source, summary, dataset)), columns=["source", "summary", "dataset"])
        self._compute_avg_length()
        self._compute_answer_rate()

        return self.summaries_df

    def _compute_avg_length(self):
        """Set ``avg_length`` to the mean word count of non-empty summaries (0 if none)."""
        word_counts = [len(text.split()) for text in self.summaries_df['summary'] if text != ""]
        self.avg_length = 0 if not word_counts else sum(word_counts) / len(word_counts)

    def _compute_answer_rate(self):
        """Set ``answer_rate`` to the share of rows with a non-empty summary (0 if no rows)."""
        total_rows = len(self.summaries_df)
        non_empty_count = sum(1 for text in self.summaries_df['summary'] if text != "")
        self.answer_rate = 0 if total_rows == 0 else non_empty_count / total_rows
65
+
66
class EvaluationModel:
    """Score generated summaries against their sources and derive accuracy.

    Attributes:
        model: The model returned by ``load_evaluation_model``.
        scores: Scores from the last ``evaluate_hallucination`` call.
        accuracy: Percentage of scores >= 0.5, set by ``compute_accuracy``.
        hallucination_rate: ``100 - accuracy``, set by ``compute_accuracy``.
    """

    def __init__(self, model_path):
        self.model = load_evaluation_model(model_path)
        self.scores = []
        self.accuracy = None
        self.hallucination_rate = None

    def evaluate_hallucination(self, summaries_df):
        """Score every (source, summary) pair of ``summaries_df``.

        Args:
            summaries_df: DataFrame with ``source`` and ``summary`` columns.

        Returns:
            Whatever ``self.model.predict`` returns for the list of pairs.
        """
        # predict() expects one sequence of (source, summary) pairs; passing
        # the two columns as separate positional arguments would hand the
        # summaries to the second parameter instead of scoring them.
        pairs = list(zip(summaries_df['source'], summaries_df['summary']))
        self.scores = self.model.predict(pairs)
        return self.scores

    def compute_accuracy(self):
        """Compute accuracy and hallucination rate from the stored scores.

        Returns:
            Percentage (0-100) of scores that are >= 0.5.

        Raises:
            ValueError: If no scores are available yet.
        """
        # len() instead of truthiness: `not array` raises for NumPy arrays
        # holding more than one element.
        num_total = 0 if self.scores is None else len(self.scores)
        if num_total == 0:
            raise ValueError("Scores not calculated. Call evaluate_hallucination() first.")

        # Use threshold of 0.5 to compute accuracy
        num_above_threshold = sum(1 for score in self.scores if score >= 0.5)

        self.accuracy = (num_above_threshold / num_total) * 100
        self.hallucination_rate = 100 - self.accuracy

        return self.accuracy
src/backend/run_eval_suite.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import logging
4
+ from datetime import datetime
5
+
6
+ # from lm_eval import tasks, evaluator, utils
7
+ from evaluate_model import Evaluator
8
+
9
+ from src.envs import RESULTS_REPO, API
10
+ from src.backend.manage_requests import EvalRequest
11
+
12
+ from util import load_dataframe, format_results
13
+
14
+ logging.getLogger("openai").setLevel(logging.WARNING)
15
+
16
def run_evaluation(eval_request: EvalRequest, num_fewshot, batch_size, device, local_dir: str, results_repo: str, no_cache=True, limit=None):
    """Evaluate one request, write the results locally and upload them.

    Args:
        eval_request: Request providing ``model``, ``revision`` and ``precision``.
        num_fewshot, batch_size, device, no_cache, limit: Forwarded to ``Evaluator``.
        local_dir: Directory under which ``<model path>/results_<timestamp>.json``
            is written.
        results_repo: Dataset repository id the results file is uploaded to.

    Returns:
        The results dictionary returned by ``Evaluator.evaluate``.
    """
    if limit:
        print(
            "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
        )

    evaluator = Evaluator(eval_request.model, eval_request.revision, eval_request.precision, num_fewshot, batch_size, device, no_cache, limit, write_out=True, output_base_path='logs')
    results = evaluator.evaluate()

    dumped = json.dumps(results, indent=2)
    print(dumped)

    # Build the file name once so the local copy and the uploaded copy share
    # the same timestamp.
    results_filename = f"results_{datetime.now()}.json"

    output_path = os.path.join(local_dir, *eval_request.model.split("/"), results_filename)
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, "w") as f:
        f.write(dumped)

    # The former `evaluator.make_table(results)` call was removed: Evaluator
    # defines no such method, so it raised before the upload could happen.
    # The results are already printed above.

    API.upload_file(
        path_or_fileobj=output_path,
        path_in_repo=f"{eval_request.model}/{results_filename}",
        repo_id=results_repo,
        repo_type="dataset",
    )

    return results
src/backend/sort_queue.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from dataclasses import dataclass
3
+
4
+ from huggingface_hub import HfApi
5
+
6
+ from src.backend.manage_requests import EvalRequest
7
+
8
+
9
@dataclass
class ModelMetadata:
    """Small record of per-model metadata with defaults for both fields."""

    # Like count; defaults to 0.
    likes: int = 0
    # Model size; defaults to 15 (unit not stated here - TODO confirm).
    size: int = 15
13
+
14
+
15
def sort_models_by_priority(api: HfApi, models: list[EvalRequest]) -> list[EvalRequest]:
    """Order requests with private models first, each group by submission date.

    Args:
        api: Unused by this function.
        models: Requests to order.

    Returns:
        Private requests sorted by ``submitted_time``, followed by public
        requests sorted by ``submitted_time``.
    """
    private_requests, public_requests = [], []
    for request in models:
        bucket = private_requests if request.private else public_requests
        bucket.append(request)

    return sort_by_submit_date(private_requests) + sort_by_submit_date(public_requests)
20
+
21
def sort_by_submit_date(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
    """Return a new list of the requests in ascending ``submitted_time`` order."""
    def _submitted_at(request):
        return request.submitted_time

    ordered = list(eval_requests)
    ordered.sort(key=_submitted_at)
    return ordered
23
+
24
def sort_by_size(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
    """Return a new list of the requests in ascending ``params`` order.

    ``EvalRequest.params`` defaults to ``None``; comparing ``None`` with an
    int raises ``TypeError``, so requests without a known size are placed
    after all sized requests, keeping their relative order.
    """
    def _size_key(request):
        size = request.params
        # (False, n) sorts before (True, 0): unknown sizes go last.
        return (size is None, 0 if size is None else size)

    return sorted(eval_requests, key=_size_key)
26
+
27
def sort_by_likes(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
    """Return a new list of the requests in ascending ``likes`` order."""
    def _like_count(request):
        return request.likes

    ordered = list(eval_requests)
    ordered.sort(key=_like_count)
    return ordered
src/backend/util.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
def load_dataframe(data_path):
    """Read the CSV at ``data_path`` and return it as a pandas DataFrame."""
    return pd.read_csv(data_path)
6
+
7
def generate_prompt(source_passage):
    """Build the summarization prompt for ``source_passage``.

    The passage is inserted verbatim after the ``Passage:`` line of a fixed
    instruction template.
    """
    prompt = f"""You are a chat bot answering questions using data. You must stick to the answers provided solely by the text in the passage provided.
You are asked the question 'Provide a concise summary of the following passage, covering the core pieces of information described.'
Passage:
{source_passage}
"""
    return prompt
13
+
14
def format_results(hallucination_scores, model_name, revision, precision, accuracy, hallucination_rate, answer_rate, avg_summary_len):
    """Assemble the results dictionary for one evaluated model.

    Args:
        hallucination_scores: Per-summary scores. Objects exposing
            ``tolist()`` (e.g. NumPy arrays) are converted to plain Python
            lists so the result can be passed to ``json.dumps``; anything else
            is stored unchanged.
        model_name: Name of the model.
        revision: Hash / revision of the model.
        precision: Precision with which the evaluation was run.
        accuracy, hallucination_rate, answer_rate, avg_summary_len: Metric
            values stored as given.

    Returns:
        A dict with a ``"config"`` section and a
        ``"results" -> "hallucination_eval"`` section.
    """
    # NumPy arrays are not JSON serializable; tolist() yields native floats.
    if hasattr(hallucination_scores, "tolist"):
        hallucination_scores = hallucination_scores.tolist()

    config = {
        "model_dtype": precision,  # Precision with which you ran the evaluation
        "model_name": model_name,  # Name of the model
        "model_sha": revision,  # Hash of the model
    }
    hallucination_eval = {
        "HEM Scores": hallucination_scores,
        "Accuracy": accuracy,
        "Hallucination Rate": hallucination_rate,
        "Answer Rate": answer_rate,
        "Average Summary Length": avg_summary_len,
    }
    return {
        "config": config,
        "results": {"hallucination_eval": hallucination_eval},
    }
src/datasets/leaderboard_dataset.csv ADDED
The diff for this file is too large to render. See raw diff
 
src/display/about.py CHANGED
@@ -1,18 +1,18 @@
1
  from dataclasses import dataclass
2
  from enum import Enum
3
 
4
- @dataclass
5
- class Task:
6
- benchmark: str
7
- metric: str
8
- col_name: str
9
-
10
-
11
- # Init: to update with your specific keys
12
- class Tasks(Enum):
13
- # task_key in the json file, metric_key in the json file, name to display in the leaderboard
14
- task0 = Task("task_name1", "metric_name", "First task")
15
- task1 = Task("task_name2", "metric_name", "Second task")
16
 
17
 
18
  # Your leaderboard name
 
1
  from dataclasses import dataclass
2
  from enum import Enum
3
 
4
+ # @dataclass
5
+ # class Task:
6
+ # benchmark: str
7
+ # metric: str
8
+ # col_name: str
9
+
10
+
11
+ # # Init: to update with your specific keys
12
+ # class Tasks(Enum):
13
+ # # task_key in the json file, metric_key in the json file, name to display in the leaderboard
14
+ # task0 = Task("task_name1", "metric_name", "First task")
15
+ # task1 = Task("task_name2", "metric_name", "Second task")
16
 
17
 
18
  # Your leaderboard name
src/display/utils.py CHANGED
@@ -3,7 +3,7 @@ from enum import Enum
3
 
4
  import pandas as pd
5
 
6
- from src.display.about import Tasks
7
 
8
  def fields(raw_class):
9
  return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
@@ -26,10 +26,15 @@ auto_eval_column_dict = []
26
  # Init
27
  auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
28
  auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
29
- #Scores
30
- auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
31
- for task in Tasks:
32
- auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
 
 
 
 
 
33
  # Model information
34
  auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
35
  auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
@@ -121,7 +126,7 @@ TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default a
121
  EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
122
  EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
123
 
124
- BENCHMARK_COLS = [t.value.col_name for t in Tasks]
125
 
126
  NUMERIC_INTERVALS = {
127
  "?": pd.Interval(-1, 0, closed="right"),
 
3
 
4
  import pandas as pd
5
 
6
+ # from src.display.about import Tasks
7
 
8
  def fields(raw_class):
9
  return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
 
26
  # Init
27
  auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
28
  auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
29
+ # Accuracy
30
+ auto_eval_column_dict.append(["accuracy", ColumnContent, ColumnContent("Accuracy ⬆️", "number", True)])
31
+ # Hallucination Rate
32
+ auto_eval_column_dict.append(["hallucination_rate", ColumnContent, ColumnContent("Hallucination Rate ⬇️", "number", True)])
33
+ # Answer Rate
34
+ auto_eval_column_dict.append(["answer_rate", ColumnContent, ColumnContent("Answer Rate ⬆️", "number", True)])
35
+ # Average Summary Length
36
+ auto_eval_column_dict.append(["average_summary_length", ColumnContent, ColumnContent("Average Summary Length", "number", True)])
37
+
38
  # Model information
39
  auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
40
  auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
 
126
  EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
127
  EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
128
 
129
+ BENCHMARK_COLS = ["Accuracy", "Hallucination Rate", "Answer Rate", "Average Summary Length"]
130
 
131
  NUMERIC_INTERVALS = {
132
  "?": pd.Interval(-1, 0, closed="right"),
src/envs.py CHANGED
@@ -2,10 +2,10 @@ import os
2
 
3
  from huggingface_hub import HfApi
4
 
5
- # clone / pull the lmeh eval data
6
- TOKEN = os.environ.get("TOKEN", None)
7
 
8
- OWNER = "demo-leaderboard"
9
  REPO_ID = f"{OWNER}/leaderboard"
10
  QUEUE_REPO = f"{OWNER}/requests"
11
  RESULTS_REPO = f"{OWNER}/results"
@@ -17,3 +17,6 @@ EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
17
  EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
18
 
19
  API = HfApi(token=TOKEN)
 
 
 
 
2
 
3
  from huggingface_hub import HfApi
4
 
5
+ # replace this with our token
6
+ TOKEN = os.environ.get("HF_TOKEN", None)
7
 
8
+ OWNER = "vectara"
9
  REPO_ID = f"{OWNER}/leaderboard"
10
  QUEUE_REPO = f"{OWNER}/requests"
11
  RESULTS_REPO = f"{OWNER}/results"
 
17
  EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
18
 
19
  API = HfApi(token=TOKEN)
20
+
21
+ SOURCE_PATH = "/datasets/leaderboard_summaries.csv"
22
+ HEM_PATH = 'vectara/hallucination_evaluation_model'
src/leaderboard/read_evals.py CHANGED
@@ -8,7 +8,7 @@ import dateutil
8
  import numpy as np
9
 
10
  from src.display.formatting import make_clickable_model
11
- from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
12
  from src.submission.check_validity import is_model_on_hub
13
 
14
 
@@ -65,17 +65,16 @@ class EvalResult:
65
  architecture = ";".join(architectures)
66
 
67
  # Extract results available in this file (some results are split in several files)
68
- results = {}
69
- for task in Tasks:
70
- task = task.value
71
-
72
- # We average all scores of a given metric (not all metrics are present in all files)
73
- accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
74
- if accs.size == 0 or any([acc is None for acc in accs]):
75
- continue
76
-
77
- mean_acc = np.mean(accs) * 100.0
78
- results[task.benchmark] = mean_acc
79
 
80
  return self(
81
  eval_name=result_key,
@@ -107,7 +106,9 @@ class EvalResult:
107
 
108
  def to_dict(self):
109
  """Converts the Eval Result to a dict compatible with our dataframe display"""
110
- average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
 
 
111
  data_dict = {
112
  "eval_name": self.eval_name, # not a column, just a save name,
113
  AutoEvalColumn.precision.name: self.precision.value.name,
@@ -118,15 +119,18 @@ class EvalResult:
118
  AutoEvalColumn.model.name: make_clickable_model(self.full_model),
119
  AutoEvalColumn.dummy.name: self.full_model,
120
  AutoEvalColumn.revision.name: self.revision,
121
- AutoEvalColumn.average.name: average,
 
122
  AutoEvalColumn.license.name: self.license,
123
  AutoEvalColumn.likes.name: self.likes,
124
  AutoEvalColumn.params.name: self.num_params,
125
  AutoEvalColumn.still_on_hub.name: self.still_on_hub,
126
  }
127
-
128
- for task in Tasks:
129
- data_dict[task.value.col_name] = self.results[task.value.benchmark]
 
 
130
 
131
  return data_dict
132
 
 
8
  import numpy as np
9
 
10
  from src.display.formatting import make_clickable_model
11
+ from src.display.utils import AutoEvalColumn, ModelType, Precision, WeightType
12
  from src.submission.check_validity import is_model_on_hub
13
 
14
 
 
65
  architecture = ";".join(architectures)
66
 
67
  # Extract results available in this file (some results are split in several files)
68
+ hallucination_eval = data["results"].get("hallucination_eval", {})
69
+
70
+ # Extract metrics from hallucination eval
71
+ results = {
72
+ "HEM Scores": hallucination_eval.get("HEM Scores", None),
73
+ "Accuracy": hallucination_eval.get("Accuracy", None),
74
+ "Hallucination Rate": hallucination_eval.get("Hallucination Rate", None),
75
+ "Answer Rate": hallucination_eval.get("Answer Rate", None),
76
+ "Average Summary Length": hallucination_eval.get("Average Summary Length", None),
77
+ }
 
78
 
79
  return self(
80
  eval_name=result_key,
 
106
 
107
  def to_dict(self):
108
  """Converts the Eval Result to a dict compatible with our dataframe display"""
109
+ # average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
110
+ accuracy = self.results.get("Accuracy", None)
111
+
112
  data_dict = {
113
  "eval_name": self.eval_name, # not a column, just a save name,
114
  AutoEvalColumn.precision.name: self.precision.value.name,
 
119
  AutoEvalColumn.model.name: make_clickable_model(self.full_model),
120
  AutoEvalColumn.dummy.name: self.full_model,
121
  AutoEvalColumn.revision.name: self.revision,
122
+ # AutoEvalColumn.average.name: average,
123
+ AutoEvalColumn.accuracy.name: accuracy,
124
  AutoEvalColumn.license.name: self.license,
125
  AutoEvalColumn.likes.name: self.likes,
126
  AutoEvalColumn.params.name: self.num_params,
127
  AutoEvalColumn.still_on_hub.name: self.still_on_hub,
128
  }
129
+ # for task in Tasks:
130
+ # data_dict[task.value.col_name] = self.results[task.value.benchmark]
131
+ data_dict["Hallucination Rate"] = self.results.get("Hallucination Rate", None)
132
+ data_dict["Answer Rate"] = self.results.get("Answer Rate", None)
133
+ data_dict["Average Summary Length"] = self.results.get("Average Summary Length", None)
134
 
135
  return data_dict
136
 
src/populate.py CHANGED
@@ -13,7 +13,7 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
13
  all_data_json = [v.to_dict() for v in raw_data]
14
 
15
  df = pd.DataFrame.from_records(all_data_json)
16
- df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
17
  df = df[cols].round(decimals=2)
18
 
19
  # filter out if any of the benchmarks have not been produced
 
13
  all_data_json = [v.to_dict() for v in raw_data]
14
 
15
  df = pd.DataFrame.from_records(all_data_json)
16
+ df = df.sort_values(by=[AutoEvalColumn.accuracy.name], ascending=False)
17
  df = df[cols].round(decimals=2)
18
 
19
  # filter out if any of the benchmarks have not been produced
src/submission/submit.py CHANGED
@@ -94,6 +94,7 @@ def add_new_eval(
94
  return styled_warning("This model has been already submitted.")
95
 
96
  print("Creating eval file")
 
97
  OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
98
  os.makedirs(OUT_DIR, exist_ok=True)
99
  out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
 
94
  return styled_warning("This model has been already submitted.")
95
 
96
  print("Creating eval file")
97
+
98
  OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
99
  os.makedirs(OUT_DIR, exist_ok=True)
100
  out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"