Spaces: Running on CPU Upgrade
Minseok Bae
committed on
Commit
·
58b9de9
1
Parent(s):
d7b7dc6
Integrated backend pipelines - error occurs during model submission. (Debugging needed).
Browse files
- .gitignore +2 -0
- app.py +53 -73
- main_backend.py +72 -0
- requirements.txt +2 -1
- scripts/create_request_file.py +8 -6
- src/backend/evaluate_model.py +81 -23
- src/backend/manage_requests.py +17 -23
- src/backend/model_operations.py +151 -23
- src/backend/run_eval_suite.py +37 -24
- src/backend/sort_queue.py +1 -2
- src/backend/util.py +51 -19
- src/display/about.py +18 -15
- src/display/css_html_js.py +1 -1
- src/display/utils.py +23 -12
- src/envs.py +6 -3
- src/leaderboard/read_evals.py +66 -74
- src/populate.py +10 -10
- src/submission/check_validity.py +2 -4
- src/submission/submit.py +21 -25
- tests/test_evaluate_model.py +87 -0
- tests/test_evaluator.py +59 -0
- tests/test_main_backend.py +54 -0
- tests/test_summary_generator.py +68 -0
.gitignore
CHANGED
@@ -11,5 +11,7 @@ human_evals/
|
|
11 |
eval-queue/
|
12 |
eval-results/
|
13 |
auto_evals/
|
|
|
|
|
14 |
|
15 |
src/assets/model_counts.html
|
|
|
11 |
eval-queue/
|
12 |
eval-results/
|
13 |
auto_evals/
|
14 |
+
eval-queue-bk/
|
15 |
+
eval-results-bk/
|
16 |
|
17 |
src/assets/model_counts.html
|
app.py
CHANGED
@@ -3,60 +3,40 @@ import pandas as pd
|
|
3 |
from apscheduler.schedulers.background import BackgroundScheduler
|
4 |
from huggingface_hub import snapshot_download
|
5 |
|
6 |
-
|
7 |
-
CITATION_BUTTON_LABEL,
|
8 |
-
CITATION_BUTTON_TEXT,
|
9 |
-
EVALUATION_QUEUE_TEXT,
|
10 |
-
INTRODUCTION_TEXT,
|
11 |
-
LLM_BENCHMARKS_TEXT,
|
12 |
-
TITLE,
|
13 |
-
)
|
14 |
from src.display.css_html_js import custom_css
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
EVAL_TYPES,
|
20 |
-
NUMERIC_INTERVALS,
|
21 |
-
TYPES,
|
22 |
-
AutoEvalColumn,
|
23 |
-
ModelType,
|
24 |
-
fields,
|
25 |
-
WeightType,
|
26 |
-
Precision
|
27 |
-
)
|
28 |
-
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, TOKEN, QUEUE_REPO, REPO_ID, RESULTS_REPO
|
29 |
-
from src.populate import get_evaluation_queue_df, get_leaderboard_df
|
30 |
-
from src.submission.submit import add_new_eval
|
31 |
|
32 |
|
33 |
def restart_space():
|
34 |
-
API.restart_space(repo_id=REPO_ID, token=TOKEN)
|
35 |
|
36 |
try:
|
37 |
-
print(EVAL_REQUESTS_PATH)
|
38 |
snapshot_download(
|
39 |
-
repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
|
40 |
)
|
41 |
except Exception:
|
42 |
restart_space()
|
43 |
try:
|
44 |
-
print(EVAL_RESULTS_PATH)
|
45 |
snapshot_download(
|
46 |
-
repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
|
47 |
)
|
48 |
except Exception:
|
49 |
restart_space()
|
50 |
|
51 |
-
|
52 |
-
raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
|
53 |
leaderboard_df = original_df.copy()
|
54 |
|
55 |
(
|
56 |
finished_eval_queue_df,
|
57 |
running_eval_queue_df,
|
58 |
pending_eval_queue_df,
|
59 |
-
) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
|
60 |
|
61 |
|
62 |
# Searching and filtering
|
@@ -76,17 +56,17 @@ def update_table(
|
|
76 |
|
77 |
|
78 |
def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
|
79 |
-
return df[(df[AutoEvalColumn.dummy.name].str.contains(query, case=False))]
|
80 |
|
81 |
|
82 |
def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
|
83 |
always_here_cols = [
|
84 |
-
AutoEvalColumn.model_type_symbol.name,
|
85 |
-
AutoEvalColumn.model.name,
|
86 |
]
|
87 |
# We use COLS to maintain sorting
|
88 |
filtered_df = df[
|
89 |
-
always_here_cols + [c for c in COLS if c in df.columns and c in columns] + [AutoEvalColumn.dummy.name]
|
90 |
]
|
91 |
return filtered_df
|
92 |
|
@@ -104,7 +84,7 @@ def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
|
|
104 |
if len(final_df) > 0:
|
105 |
filtered_df = pd.concat(final_df)
|
106 |
filtered_df = filtered_df.drop_duplicates(
|
107 |
-
subset=[AutoEvalColumn.model.name, AutoEvalColumn.precision.name, AutoEvalColumn.revision.name]
|
108 |
)
|
109 |
|
110 |
return filtered_df
|
@@ -117,14 +97,14 @@ def filter_models(
|
|
117 |
if show_deleted:
|
118 |
filtered_df = df
|
119 |
else: # Show only still on the hub models
|
120 |
-
filtered_df = df[df[AutoEvalColumn.still_on_hub.name] == True]
|
121 |
|
122 |
type_emoji = [t[0] for t in type_query]
|
123 |
-
filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
|
124 |
-
filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
|
125 |
|
126 |
-
numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
|
127 |
-
params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
|
128 |
mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
|
129 |
filtered_df = filtered_df.loc[mask]
|
130 |
|
@@ -133,8 +113,8 @@ def filter_models(
|
|
133 |
|
134 |
demo = gr.Blocks(css=custom_css)
|
135 |
with demo:
|
136 |
-
gr.HTML(TITLE)
|
137 |
-
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
138 |
|
139 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
140 |
with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
|
@@ -150,12 +130,12 @@ with demo:
|
|
150 |
shown_columns = gr.CheckboxGroup(
|
151 |
choices=[
|
152 |
c.name
|
153 |
-
for c in fields(AutoEvalColumn)
|
154 |
if not c.hidden and not c.never_hidden and not c.dummy
|
155 |
],
|
156 |
value=[
|
157 |
c.name
|
158 |
-
for c in fields(AutoEvalColumn)
|
159 |
if c.displayed_by_default and not c.hidden and not c.never_hidden
|
160 |
],
|
161 |
label="Select columns to show",
|
@@ -170,34 +150,34 @@ with demo:
|
|
170 |
#with gr.Box(elem_id="box-filter"):
|
171 |
filter_columns_type = gr.CheckboxGroup(
|
172 |
label="Model types",
|
173 |
-
choices=[t.to_str() for t in ModelType],
|
174 |
-
value=[t.to_str() for t in ModelType],
|
175 |
interactive=True,
|
176 |
elem_id="filter-columns-type",
|
177 |
)
|
178 |
filter_columns_precision = gr.CheckboxGroup(
|
179 |
label="Precision",
|
180 |
-
choices=[i.value.name for i in Precision],
|
181 |
-
value=[i.value.name for i in Precision],
|
182 |
interactive=True,
|
183 |
elem_id="filter-columns-precision",
|
184 |
)
|
185 |
filter_columns_size = gr.CheckboxGroup(
|
186 |
label="Model sizes (in billions of parameters)",
|
187 |
-
choices=list(NUMERIC_INTERVALS.keys()),
|
188 |
-
value=list(NUMERIC_INTERVALS.keys()),
|
189 |
interactive=True,
|
190 |
elem_id="filter-columns-size",
|
191 |
)
|
192 |
|
193 |
leaderboard_table = gr.components.Dataframe(
|
194 |
value=leaderboard_df[
|
195 |
-
[c.name for c in fields(AutoEvalColumn) if c.never_hidden]
|
196 |
+ shown_columns.value
|
197 |
-
+ [AutoEvalColumn.dummy.name]
|
198 |
],
|
199 |
-
headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
|
200 |
-
datatype=TYPES,
|
201 |
elem_id="leaderboard-table",
|
202 |
interactive=False,
|
203 |
visible=True,
|
@@ -206,9 +186,9 @@ with demo:
|
|
206 |
|
207 |
# Dummy leaderboard for handling the case when the user uses backspace key
|
208 |
hidden_leaderboard_table_for_search = gr.components.Dataframe(
|
209 |
-
value=original_df[COLS],
|
210 |
-
headers=COLS,
|
211 |
-
datatype=TYPES,
|
212 |
visible=False,
|
213 |
)
|
214 |
search_bar.submit(
|
@@ -241,12 +221,12 @@ with demo:
|
|
241 |
)
|
242 |
|
243 |
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
|
244 |
-
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
245 |
|
246 |
with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
|
247 |
with gr.Column():
|
248 |
with gr.Row():
|
249 |
-
gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
|
250 |
|
251 |
with gr.Column():
|
252 |
with gr.Accordion(
|
@@ -256,8 +236,8 @@ with demo:
|
|
256 |
with gr.Row():
|
257 |
finished_eval_table = gr.components.Dataframe(
|
258 |
value=finished_eval_queue_df,
|
259 |
-
headers=EVAL_COLS,
|
260 |
-
datatype=EVAL_TYPES,
|
261 |
row_count=5,
|
262 |
)
|
263 |
with gr.Accordion(
|
@@ -267,8 +247,8 @@ with demo:
|
|
267 |
with gr.Row():
|
268 |
running_eval_table = gr.components.Dataframe(
|
269 |
value=running_eval_queue_df,
|
270 |
-
headers=EVAL_COLS,
|
271 |
-
datatype=EVAL_TYPES,
|
272 |
row_count=5,
|
273 |
)
|
274 |
|
@@ -279,8 +259,8 @@ with demo:
|
|
279 |
with gr.Row():
|
280 |
pending_eval_table = gr.components.Dataframe(
|
281 |
value=pending_eval_queue_df,
|
282 |
-
headers=EVAL_COLS,
|
283 |
-
datatype=EVAL_TYPES,
|
284 |
row_count=5,
|
285 |
)
|
286 |
with gr.Row():
|
@@ -291,7 +271,7 @@ with demo:
|
|
291 |
model_name_textbox = gr.Textbox(label="Model name")
|
292 |
revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
|
293 |
model_type = gr.Dropdown(
|
294 |
-
choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
|
295 |
label="Model type",
|
296 |
multiselect=False,
|
297 |
value=None,
|
@@ -300,14 +280,14 @@ with demo:
|
|
300 |
|
301 |
with gr.Column():
|
302 |
precision = gr.Dropdown(
|
303 |
-
choices=[i.value.name for i in Precision if i != Precision.Unknown],
|
304 |
label="Precision",
|
305 |
multiselect=False,
|
306 |
value="float16",
|
307 |
interactive=True,
|
308 |
)
|
309 |
weight_type = gr.Dropdown(
|
310 |
-
choices=[i.value.name for i in WeightType],
|
311 |
label="Weights type",
|
312 |
multiselect=False,
|
313 |
value="Original",
|
@@ -318,7 +298,7 @@ with demo:
|
|
318 |
submit_button = gr.Button("Submit Eval")
|
319 |
submission_result = gr.Markdown()
|
320 |
submit_button.click(
|
321 |
-
add_new_eval,
|
322 |
[
|
323 |
model_name_textbox,
|
324 |
base_model_name_textbox,
|
@@ -333,8 +313,8 @@ with demo:
|
|
333 |
with gr.Row():
|
334 |
with gr.Accordion("📙 Citation", open=False):
|
335 |
citation_button = gr.Textbox(
|
336 |
-
value=CITATION_BUTTON_TEXT,
|
337 |
-
label=CITATION_BUTTON_LABEL,
|
338 |
lines=20,
|
339 |
elem_id="citation-button",
|
340 |
show_copy_button=True,
|
|
|
3 |
from apscheduler.schedulers.background import BackgroundScheduler
|
4 |
from huggingface_hub import snapshot_download
|
5 |
|
6 |
+
import src.display.about as about
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
from src.display.css_html_js import custom_css
|
8 |
+
import src.display.utils as utils
|
9 |
+
import src.envs as envs
|
10 |
+
import src.populate as populate
|
11 |
+
import src.submission.submit as submit
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
|
13 |
|
14 |
def restart_space():
|
15 |
+
envs.API.restart_space(repo_id=envs.REPO_ID, token=envs.TOKEN)
|
16 |
|
17 |
try:
|
18 |
+
print(envs.EVAL_REQUESTS_PATH)
|
19 |
snapshot_download(
|
20 |
+
repo_id=envs.QUEUE_REPO, local_dir=envs.EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
|
21 |
)
|
22 |
except Exception:
|
23 |
restart_space()
|
24 |
try:
|
25 |
+
print(envs.EVAL_RESULTS_PATH)
|
26 |
snapshot_download(
|
27 |
+
repo_id=envs.RESULTS_REPO, local_dir=envs.EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
|
28 |
)
|
29 |
except Exception:
|
30 |
restart_space()
|
31 |
|
32 |
+
raw_data, original_df = populate.get_leaderboard_df(envs.EVAL_RESULTS_PATH, envs.EVAL_REQUESTS_PATH, utils.COLS, utils.BENCHMARK_COLS)
|
|
|
33 |
leaderboard_df = original_df.copy()
|
34 |
|
35 |
(
|
36 |
finished_eval_queue_df,
|
37 |
running_eval_queue_df,
|
38 |
pending_eval_queue_df,
|
39 |
+
) = populate.get_evaluation_queue_df(envs.EVAL_REQUESTS_PATH, utils.EVAL_COLS)
|
40 |
|
41 |
|
42 |
# Searching and filtering
|
|
|
56 |
|
57 |
|
58 |
def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
|
59 |
+
return df[(df[utils.AutoEvalColumn.dummy.name].str.contains(query, case=False))]
|
60 |
|
61 |
|
62 |
def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
|
63 |
always_here_cols = [
|
64 |
+
utils.AutoEvalColumn.model_type_symbol.name,
|
65 |
+
utils.AutoEvalColumn.model.name,
|
66 |
]
|
67 |
# We use COLS to maintain sorting
|
68 |
filtered_df = df[
|
69 |
+
always_here_cols + [c for c in utils.COLS if c in df.columns and c in columns] + [utils.AutoEvalColumn.dummy.name]
|
70 |
]
|
71 |
return filtered_df
|
72 |
|
|
|
84 |
if len(final_df) > 0:
|
85 |
filtered_df = pd.concat(final_df)
|
86 |
filtered_df = filtered_df.drop_duplicates(
|
87 |
+
subset=[utils.AutoEvalColumn.model.name, utils.AutoEvalColumn.precision.name, utils.AutoEvalColumn.revision.name]
|
88 |
)
|
89 |
|
90 |
return filtered_df
|
|
|
97 |
if show_deleted:
|
98 |
filtered_df = df
|
99 |
else: # Show only still on the hub models
|
100 |
+
filtered_df = df[df[utils.AutoEvalColumn.still_on_hub.name] == True]
|
101 |
|
102 |
type_emoji = [t[0] for t in type_query]
|
103 |
+
filtered_df = filtered_df.loc[df[utils.AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
|
104 |
+
filtered_df = filtered_df.loc[df[utils.AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
|
105 |
|
106 |
+
numeric_interval = pd.IntervalIndex(sorted([utils.NUMERIC_INTERVALS[s] for s in size_query]))
|
107 |
+
params_column = pd.to_numeric(df[utils.AutoEvalColumn.params.name], errors="coerce")
|
108 |
mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
|
109 |
filtered_df = filtered_df.loc[mask]
|
110 |
|
|
|
113 |
|
114 |
demo = gr.Blocks(css=custom_css)
|
115 |
with demo:
|
116 |
+
gr.HTML(about.TITLE)
|
117 |
+
gr.Markdown(about.INTRODUCTION_TEXT, elem_classes="markdown-text")
|
118 |
|
119 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
120 |
with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
|
|
|
130 |
shown_columns = gr.CheckboxGroup(
|
131 |
choices=[
|
132 |
c.name
|
133 |
+
for c in utils.fields(utils.AutoEvalColumn)
|
134 |
if not c.hidden and not c.never_hidden and not c.dummy
|
135 |
],
|
136 |
value=[
|
137 |
c.name
|
138 |
+
for c in utils.fields(utils.AutoEvalColumn)
|
139 |
if c.displayed_by_default and not c.hidden and not c.never_hidden
|
140 |
],
|
141 |
label="Select columns to show",
|
|
|
150 |
#with gr.Box(elem_id="box-filter"):
|
151 |
filter_columns_type = gr.CheckboxGroup(
|
152 |
label="Model types",
|
153 |
+
choices=[t.to_str() for t in utils.ModelType],
|
154 |
+
value=[t.to_str() for t in utils.ModelType],
|
155 |
interactive=True,
|
156 |
elem_id="filter-columns-type",
|
157 |
)
|
158 |
filter_columns_precision = gr.CheckboxGroup(
|
159 |
label="Precision",
|
160 |
+
choices=[i.value.name for i in utils.Precision],
|
161 |
+
value=[i.value.name for i in utils.Precision],
|
162 |
interactive=True,
|
163 |
elem_id="filter-columns-precision",
|
164 |
)
|
165 |
filter_columns_size = gr.CheckboxGroup(
|
166 |
label="Model sizes (in billions of parameters)",
|
167 |
+
choices=list(utils.NUMERIC_INTERVALS.keys()),
|
168 |
+
value=list(utils.NUMERIC_INTERVALS.keys()),
|
169 |
interactive=True,
|
170 |
elem_id="filter-columns-size",
|
171 |
)
|
172 |
|
173 |
leaderboard_table = gr.components.Dataframe(
|
174 |
value=leaderboard_df[
|
175 |
+
[c.name for c in utils.fields(utils.AutoEvalColumn) if c.never_hidden]
|
176 |
+ shown_columns.value
|
177 |
+
+ [utils.AutoEvalColumn.dummy.name]
|
178 |
],
|
179 |
+
headers=[c.name for c in utils.fields(utils.AutoEvalColumn) if c.never_hidden] + shown_columns.value,
|
180 |
+
datatype=utils.TYPES,
|
181 |
elem_id="leaderboard-table",
|
182 |
interactive=False,
|
183 |
visible=True,
|
|
|
186 |
|
187 |
# Dummy leaderboard for handling the case when the user uses backspace key
|
188 |
hidden_leaderboard_table_for_search = gr.components.Dataframe(
|
189 |
+
value=original_df[utils.COLS],
|
190 |
+
headers=utils.COLS,
|
191 |
+
datatype=utils.TYPES,
|
192 |
visible=False,
|
193 |
)
|
194 |
search_bar.submit(
|
|
|
221 |
)
|
222 |
|
223 |
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
|
224 |
+
gr.Markdown(about.LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
225 |
|
226 |
with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
|
227 |
with gr.Column():
|
228 |
with gr.Row():
|
229 |
+
gr.Markdown(about.EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
|
230 |
|
231 |
with gr.Column():
|
232 |
with gr.Accordion(
|
|
|
236 |
with gr.Row():
|
237 |
finished_eval_table = gr.components.Dataframe(
|
238 |
value=finished_eval_queue_df,
|
239 |
+
headers=utils.EVAL_COLS,
|
240 |
+
datatype=utils.EVAL_TYPES,
|
241 |
row_count=5,
|
242 |
)
|
243 |
with gr.Accordion(
|
|
|
247 |
with gr.Row():
|
248 |
running_eval_table = gr.components.Dataframe(
|
249 |
value=running_eval_queue_df,
|
250 |
+
headers=utils.EVAL_COLS,
|
251 |
+
datatype=utils.EVAL_TYPES,
|
252 |
row_count=5,
|
253 |
)
|
254 |
|
|
|
259 |
with gr.Row():
|
260 |
pending_eval_table = gr.components.Dataframe(
|
261 |
value=pending_eval_queue_df,
|
262 |
+
headers=utils.EVAL_COLS,
|
263 |
+
datatype=utils.EVAL_TYPES,
|
264 |
row_count=5,
|
265 |
)
|
266 |
with gr.Row():
|
|
|
271 |
model_name_textbox = gr.Textbox(label="Model name")
|
272 |
revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
|
273 |
model_type = gr.Dropdown(
|
274 |
+
choices=[t.to_str(" : ") for t in utils.ModelType if t != utils.ModelType.Unknown],
|
275 |
label="Model type",
|
276 |
multiselect=False,
|
277 |
value=None,
|
|
|
280 |
|
281 |
with gr.Column():
|
282 |
precision = gr.Dropdown(
|
283 |
+
choices=[i.value.name for i in utils.Precision if i != utils.Precision.Unknown],
|
284 |
label="Precision",
|
285 |
multiselect=False,
|
286 |
value="float16",
|
287 |
interactive=True,
|
288 |
)
|
289 |
weight_type = gr.Dropdown(
|
290 |
+
choices=[i.value.name for i in utils.WeightType],
|
291 |
label="Weights type",
|
292 |
multiselect=False,
|
293 |
value="Original",
|
|
|
298 |
submit_button = gr.Button("Submit Eval")
|
299 |
submission_result = gr.Markdown()
|
300 |
submit_button.click(
|
301 |
+
submit.add_new_eval,
|
302 |
[
|
303 |
model_name_textbox,
|
304 |
base_model_name_textbox,
|
|
|
313 |
with gr.Row():
|
314 |
with gr.Accordion("📙 Citation", open=False):
|
315 |
citation_button = gr.Textbox(
|
316 |
+
value=about.CITATION_BUTTON_TEXT,
|
317 |
+
label=about.CITATION_BUTTON_LABEL,
|
318 |
lines=20,
|
319 |
elem_id="citation-button",
|
320 |
show_copy_button=True,
|
main_backend.py
ADDED
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
+
import pprint
|
3 |
+
|
4 |
+
from huggingface_hub import snapshot_download
|
5 |
+
|
6 |
+
import src.backend.run_eval_suite as run_eval_suite
|
7 |
+
import src.backend.manage_requests as manage_requests
|
8 |
+
import src.backend.sort_queue as sort_queue
|
9 |
+
import src.envs as envs
|
10 |
+
|
11 |
+
logging.basicConfig(level=logging.ERROR)
|
12 |
+
pp = pprint.PrettyPrinter(width=80)
|
13 |
+
|
14 |
+
PENDING_STATUS = "PENDING"
|
15 |
+
RUNNING_STATUS = "RUNNING"
|
16 |
+
FINISHED_STATUS = "FINISHED"
|
17 |
+
FAILED_STATUS = "FAILED"
|
18 |
+
|
19 |
+
snapshot_download(repo_id=envs.RESULTS_REPO, revision="main",
|
20 |
+
local_dir=envs.EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
|
21 |
+
snapshot_download(repo_id=envs.QUEUE_REPO, revision="main",
|
22 |
+
local_dir=envs.EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
|
23 |
+
|
24 |
+
|
25 |
+
def run_auto_eval():
|
26 |
+
current_pending_status = [PENDING_STATUS]
|
27 |
+
|
28 |
+
manage_requests.check_completed_evals(
|
29 |
+
api=envs.API,
|
30 |
+
checked_status=RUNNING_STATUS,
|
31 |
+
completed_status=FINISHED_STATUS,
|
32 |
+
failed_status=FAILED_STATUS,
|
33 |
+
hf_repo=envs.QUEUE_REPO,
|
34 |
+
local_dir=envs.EVAL_REQUESTS_PATH_BACKEND,
|
35 |
+
hf_repo_results=envs.RESULTS_REPO,
|
36 |
+
local_dir_results=envs.EVAL_RESULTS_PATH_BACKEND
|
37 |
+
)
|
38 |
+
|
39 |
+
eval_requests = manage_requests.get_eval_requests(job_status=current_pending_status,
|
40 |
+
hf_repo=envs.QUEUE_REPO,
|
41 |
+
local_dir=envs.EVAL_REQUESTS_PATH_BACKEND)
|
42 |
+
eval_requests = sort_queue.sort_models_by_priority(api=envs.API, models=eval_requests)
|
43 |
+
|
44 |
+
print(f"Found {len(eval_requests)} {','.join(current_pending_status)} eval requests")
|
45 |
+
|
46 |
+
if len(eval_requests) == 0:
|
47 |
+
print("No eval requests found. Exiting.")
|
48 |
+
return
|
49 |
+
|
50 |
+
eval_request = eval_requests[0]
|
51 |
+
pp.pprint(eval_request)
|
52 |
+
|
53 |
+
manage_requests.set_eval_request(
|
54 |
+
api=envs.API,
|
55 |
+
eval_request=eval_request,
|
56 |
+
new_status=RUNNING_STATUS,
|
57 |
+
hf_repo=envs.QUEUE_REPO,
|
58 |
+
local_dir=envs.EVAL_REQUESTS_PATH_BACKEND
|
59 |
+
)
|
60 |
+
|
61 |
+
run_eval_suite.run_evaluation(
|
62 |
+
eval_request=eval_request,
|
63 |
+
local_dir=envs.EVAL_RESULTS_PATH_BACKEND,
|
64 |
+
results_repo=envs.RESULTS_REPO,
|
65 |
+
batch_size=1,
|
66 |
+
device=envs.DEVICE,
|
67 |
+
no_cache=True,
|
68 |
+
)
|
69 |
+
|
70 |
+
|
71 |
+
if __name__ == "__main__":
|
72 |
+
run_auto_eval()
|
requirements.txt
CHANGED
@@ -12,4 +12,5 @@ python-dateutil==2.8.2
|
|
12 |
requests==2.28.2
|
13 |
tqdm==4.65.0
|
14 |
transformers==4.35.2
|
15 |
-
tokenizers>=0.15.0
|
|
|
|
12 |
requests==2.28.2
|
13 |
tqdm==4.65.0
|
14 |
transformers==4.35.2
|
15 |
+
tokenizers>=0.15.0
|
16 |
+
sentence-transformers==2.2.2
|
scripts/create_request_file.py
CHANGED
@@ -7,10 +7,9 @@ from datetime import datetime, timezone
|
|
7 |
import click
|
8 |
from colorama import Fore
|
9 |
from huggingface_hub import HfApi, snapshot_download
|
10 |
-
from util import QUEUE_REPO, EVAL_REQUESTS_PATH
|
11 |
|
12 |
-
|
13 |
-
|
14 |
|
15 |
precisions = ("float16", "bfloat16", "8bit (LLM.int8)", "4bit (QLoRA / FP4)", "GPTQ")
|
16 |
model_types = ("pretrained", "fine-tuned", "RL-tuned", "instruction-tuned")
|
@@ -25,7 +24,8 @@ def get_model_size(model_info, precision: str):
|
|
25 |
try:
|
26 |
size_match = re.search(size_pattern, model_info.modelId.lower())
|
27 |
model_size = size_match.group(0)
|
28 |
-
model_size = round(float(model_size[:-1]) if model_size[-1] == "b"
|
|
|
29 |
except AttributeError:
|
30 |
return 0 # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
|
31 |
|
@@ -37,13 +37,15 @@ def get_model_size(model_info, precision: str):
|
|
37 |
def main():
|
38 |
api = HfApi()
|
39 |
current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
|
40 |
-
snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH,
|
|
|
41 |
|
42 |
model_name = click.prompt("Enter model name")
|
43 |
revision = click.prompt("Enter revision", default="main")
|
44 |
precision = click.prompt("Enter precision", default="float16", type=click.Choice(precisions))
|
45 |
model_type = click.prompt("Enter model type", type=click.Choice(model_types))
|
46 |
-
weight_type = click.prompt("Enter weight type", default="Original",
|
|
|
47 |
base_model = click.prompt("Enter base model", default="")
|
48 |
status = click.prompt("Enter status", default="FINISHED")
|
49 |
|
|
|
7 |
import click
|
8 |
from colorama import Fore
|
9 |
from huggingface_hub import HfApi, snapshot_download
|
|
|
10 |
|
11 |
+
from src.envs import QUEUE_REPO, EVAL_REQUESTS_PATH
|
12 |
+
|
13 |
|
14 |
precisions = ("float16", "bfloat16", "8bit (LLM.int8)", "4bit (QLoRA / FP4)", "GPTQ")
|
15 |
model_types = ("pretrained", "fine-tuned", "RL-tuned", "instruction-tuned")
|
|
|
24 |
try:
|
25 |
size_match = re.search(size_pattern, model_info.modelId.lower())
|
26 |
model_size = size_match.group(0)
|
27 |
+
model_size = round(float(model_size[:-1]) if model_size[-1] == "b"
|
28 |
+
else float(model_size[:-1]) / 1e3, 3)
|
29 |
except AttributeError:
|
30 |
return 0 # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
|
31 |
|
|
|
37 |
def main():
|
38 |
api = HfApi()
|
39 |
current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
|
40 |
+
snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH,
|
41 |
+
repo_type="dataset")
|
42 |
|
43 |
model_name = click.prompt("Enter model name")
|
44 |
revision = click.prompt("Enter revision", default="main")
|
45 |
precision = click.prompt("Enter precision", default="float16", type=click.Choice(precisions))
|
46 |
model_type = click.prompt("Enter model type", type=click.Choice(model_types))
|
47 |
+
weight_type = click.prompt("Enter weight type", default="Original",
|
48 |
+
type=click.Choice(weight_types))
|
49 |
base_model = click.prompt("Enter base model", default="")
|
50 |
status = click.prompt("Enter status", default="FINISHED")
|
51 |
|
src/backend/evaluate_model.py
CHANGED
@@ -1,37 +1,95 @@
|
|
|
|
1 |
import pandas as pd
|
2 |
|
3 |
-
|
4 |
-
|
5 |
-
from
|
|
|
|
|
|
|
|
|
|
|
6 |
|
7 |
class Evaluator:
|
8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
self.model = model
|
10 |
self.revision = revision
|
11 |
self.precision = precision
|
12 |
-
self.num_fewshot = num_fewshot
|
13 |
self.batch_size = batch_size
|
14 |
self.device = device
|
15 |
self.no_cache = no_cache
|
16 |
self.limit = limit
|
17 |
self.write_out = write_out
|
18 |
self.output_base_path = output_base_path
|
19 |
-
|
20 |
-
|
21 |
-
|
|
|
|
|
|
|
|
|
22 |
def evaluate(self):
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
import pandas as pd
|
3 |
|
4 |
+
import src.envs as envs
|
5 |
+
|
6 |
+
from src.backend.model_operations import SummaryGenerator, EvaluationModel
|
7 |
+
import src.backend.util as util
|
8 |
+
|
9 |
+
logging.basicConfig(level=logging.INFO,
|
10 |
+
format='%(asctime)s - %(levelname)s - %(message)s')
|
11 |
+
|
12 |
|
13 |
class Evaluator:
|
14 |
+
"""A class to evaluate summaries generated by a language model.
|
15 |
+
|
16 |
+
Attributes:
|
17 |
+
model (str): The name or path of the model.
|
18 |
+
revision (str): The model revision.
|
19 |
+
precision (str): The precision setting of the model.
|
20 |
+
num_fewshot (int): Number of few-shot examples to use.
|
21 |
+
batch_size (int): Batch size for processing.
|
22 |
+
device (str): The device to run the model on.
|
23 |
+
no_cache (bool): Flag to disable caching.
|
24 |
+
limit (int): Limit on the number of items to process.
|
25 |
+
write_out (bool): Whether to write results to a file.
|
26 |
+
output_base_path (str): Base path for output files.
|
27 |
+
summary_generator (SummaryGenerator): Instance for generating summaries.
|
28 |
+
eval_model (EvaluationModel): Instance for evaluating summaries.
|
29 |
+
"""
|
30 |
+
def __init__(self, model, revision, precision, batch_size,
|
31 |
+
device, no_cache, limit, write_out=True,
|
32 |
+
output_base_path='logs'):
|
33 |
+
"""Initializes the Evaluator with the given model and settings.
|
34 |
+
|
35 |
+
Args:
|
36 |
+
model (str): The name or path of the model.
|
37 |
+
revision (str): The model revision.
|
38 |
+
precision (str): The precision setting of the model.
|
39 |
+
num_fewshot (int): Number of few-shot examples to use.
|
40 |
+
batch_size (int): Batch size for processing.
|
41 |
+
device (str): The device to run the model on.
|
42 |
+
no_cache (bool): Flag to disable caching.
|
43 |
+
limit (int): Limit on the number of items to process.
|
44 |
+
write_out (bool): Whether to write results to a file.
|
45 |
+
output_base_path (str): Base path for output files.
|
46 |
+
"""
|
47 |
self.model = model
|
48 |
self.revision = revision
|
49 |
self.precision = precision
|
|
|
50 |
self.batch_size = batch_size
|
51 |
self.device = device
|
52 |
self.no_cache = no_cache
|
53 |
self.limit = limit
|
54 |
self.write_out = write_out
|
55 |
self.output_base_path = output_base_path
|
56 |
+
try:
|
57 |
+
self.summary_generator = SummaryGenerator(model, revision)
|
58 |
+
self.eval_model = EvaluationModel(envs.HEM_PATH)
|
59 |
+
except Exception as e:
|
60 |
+
logging.error(f"Error initializing Evaluator: {e}")
|
61 |
+
raise
|
62 |
+
|
63 |
def evaluate(self):
|
64 |
+
"""
|
65 |
+
Performs the evaluation process by generating summaries
|
66 |
+
and computing metrics.
|
67 |
+
|
68 |
+
Returns:
|
69 |
+
dict: A dictionary containing evaluation results.
|
70 |
+
"""
|
71 |
+
try:
|
72 |
+
df = pd.read_csv(envs.SOURCE_PATH)
|
73 |
+
generated_summaries_df = self.summary_generator.generate_summaries(df)
|
74 |
+
|
75 |
+
avg_summary_len = self.summary_generator.avg_length
|
76 |
+
answer_rate = self.summary_generator.answer_rate
|
77 |
+
error_rate = self.summary_generator.error_rate
|
78 |
+
|
79 |
+
hallucination_scores = self.eval_model.evaluate_hallucination(
|
80 |
+
generated_summaries_df)
|
81 |
+
accuracy = self.eval_model.compute_accuracy()
|
82 |
+
hallucination_rate = self.eval_model.hallucination_rate
|
83 |
+
|
84 |
+
results = util.format_results(model_name=self.model, revision=self.revision,
|
85 |
+
precision=self.precision, accuracy=accuracy,
|
86 |
+
hallucination_rate=hallucination_rate, answer_rate=answer_rate,
|
87 |
+
avg_summary_len=avg_summary_len, error_rate=error_rate)
|
88 |
+
|
89 |
+
return results
|
90 |
+
except FileNotFoundError:
|
91 |
+
logging.error(f"File not found: {envs.SOURCE_PATH}")
|
92 |
+
raise
|
93 |
+
except Exception as e:
|
94 |
+
logging.error(f"Error during evaluation: {e}")
|
95 |
+
raise
|
src/backend/manage_requests.py
CHANGED
@@ -1,10 +1,10 @@
|
|
|
|
1 |
import glob
|
2 |
import json
|
3 |
from dataclasses import dataclass
|
4 |
from typing import Optional
|
5 |
|
6 |
from huggingface_hub import HfApi, snapshot_download
|
7 |
-
from src.envs import TOKEN
|
8 |
|
9 |
@dataclass
|
10 |
class EvalRequest:
|
@@ -22,42 +22,34 @@ class EvalRequest:
|
|
22 |
likes: Optional[int] = 0
|
23 |
params: Optional[int] = None
|
24 |
license: Optional[str] = ""
|
25 |
-
|
26 |
def get_model_args(self):
|
27 |
model_args = f"pretrained={self.model},revision={self.revision}"
|
28 |
|
29 |
if self.precision in ["float16", "bfloat16"]:
|
30 |
model_args += f",dtype={self.precision}"
|
31 |
-
# Quantized models need some added config, the install of bits and bytes, etc
|
32 |
-
#elif self.precision == "8bit":
|
33 |
-
# model_args += ",load_in_8bit=True"
|
34 |
-
#elif self.precision == "4bit":
|
35 |
-
# model_args += ",load_in_4bit=True"
|
36 |
-
#elif self.precision == "GPTQ":
|
37 |
-
# A GPTQ model does not need dtype to be specified,
|
38 |
-
# it will be inferred from the config
|
39 |
-
pass
|
40 |
else:
|
41 |
-
raise
|
42 |
-
|
43 |
return model_args
|
44 |
|
45 |
|
46 |
-
def set_eval_request(api: HfApi, eval_request: EvalRequest,
|
47 |
-
|
|
|
48 |
json_filepath = eval_request.json_filepath
|
49 |
|
50 |
with open(json_filepath) as fp:
|
51 |
data = json.load(fp)
|
52 |
|
53 |
-
data["status"] =
|
54 |
|
55 |
with open(json_filepath, "w") as f:
|
56 |
f.write(json.dumps(data))
|
57 |
|
58 |
api.upload_file(
|
59 |
path_or_fileobj=json_filepath,
|
60 |
-
path_in_repo=
|
61 |
repo_id=hf_repo,
|
62 |
repo_type="dataset",
|
63 |
)
|
@@ -69,9 +61,10 @@ def get_eval_requests(job_status: list, local_dir: str, hf_repo: str) -> list[Ev
|
|
69 |
likes.
|
70 |
|
71 |
Returns:
|
72 |
-
|
73 |
"""
|
74 |
-
snapshot_download(repo_id=hf_repo, revision="main", local_dir=local_dir,
|
|
|
75 |
json_files = glob.glob(f"{local_dir}/**/*.json", recursive=True)
|
76 |
|
77 |
eval_requests = []
|
@@ -97,7 +90,8 @@ def check_completed_evals(
|
|
97 |
local_dir_results: str,
|
98 |
):
|
99 |
"""Checks if the currently running evals are completed, if yes, update their status on the hub."""
|
100 |
-
snapshot_download(repo_id=hf_repo_results, revision="main", local_dir=local_dir_results,
|
|
|
101 |
|
102 |
running_evals = get_eval_requests(checked_status, hf_repo=hf_repo, local_dir=local_dir)
|
103 |
|
@@ -107,10 +101,10 @@ def check_completed_evals(
|
|
107 |
print(f"Checking {model}")
|
108 |
|
109 |
output_path = model
|
110 |
-
|
111 |
-
|
112 |
|
113 |
-
if
|
114 |
print(
|
115 |
f"EXISTS output file exists for {model} setting it to {completed_status}"
|
116 |
)
|
|
|
1 |
+
import os
|
2 |
import glob
|
3 |
import json
|
4 |
from dataclasses import dataclass
|
5 |
from typing import Optional
|
6 |
|
7 |
from huggingface_hub import HfApi, snapshot_download
|
|
|
8 |
|
9 |
@dataclass
|
10 |
class EvalRequest:
|
|
|
22 |
likes: Optional[int] = 0
|
23 |
params: Optional[int] = None
|
24 |
license: Optional[str] = ""
|
25 |
+
|
26 |
def get_model_args(self):
|
27 |
model_args = f"pretrained={self.model},revision={self.revision}"
|
28 |
|
29 |
if self.precision in ["float16", "bfloat16"]:
|
30 |
model_args += f",dtype={self.precision}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
else:
|
32 |
+
raise ValueError(f"Unknown precision {self.precision}.")
|
33 |
+
|
34 |
return model_args
|
35 |
|
36 |
|
37 |
+
def set_eval_request(api: HfApi, eval_request: EvalRequest, new_status: str,
|
38 |
+
hf_repo: str, local_dir: str):
|
39 |
+
"""Updates a given eval request with its new status on the hub (running, completed, failed,)"""
|
40 |
json_filepath = eval_request.json_filepath
|
41 |
|
42 |
with open(json_filepath) as fp:
|
43 |
data = json.load(fp)
|
44 |
|
45 |
+
data["status"] = new_status
|
46 |
|
47 |
with open(json_filepath, "w") as f:
|
48 |
f.write(json.dumps(data))
|
49 |
|
50 |
api.upload_file(
|
51 |
path_or_fileobj=json_filepath,
|
52 |
+
path_in_repo=os.path.relpath(json_filepath, start=local_dir),
|
53 |
repo_id=hf_repo,
|
54 |
repo_type="dataset",
|
55 |
)
|
|
|
61 |
likes.
|
62 |
|
63 |
Returns:
|
64 |
+
list[EvalRequest]: a list of model info dicts.
|
65 |
"""
|
66 |
+
snapshot_download(repo_id=hf_repo, revision="main", local_dir=local_dir,
|
67 |
+
repo_type="dataset", max_workers=60)
|
68 |
json_files = glob.glob(f"{local_dir}/**/*.json", recursive=True)
|
69 |
|
70 |
eval_requests = []
|
|
|
90 |
local_dir_results: str,
|
91 |
):
|
92 |
"""Checks if the currently running evals are completed, if yes, update their status on the hub."""
|
93 |
+
snapshot_download(repo_id=hf_repo_results, revision="main", local_dir=local_dir_results,
|
94 |
+
repo_type="dataset", max_workers=60)
|
95 |
|
96 |
running_evals = get_eval_requests(checked_status, hf_repo=hf_repo, local_dir=local_dir)
|
97 |
|
|
|
101 |
print(f"Checking {model}")
|
102 |
|
103 |
output_path = model
|
104 |
+
output_files = f"{local_dir_results}/{output_path}/results*.json"
|
105 |
+
output_files_exists = len(glob.glob(output_files)) > 0
|
106 |
|
107 |
+
if output_files_exists:
|
108 |
print(
|
109 |
f"EXISTS output file exists for {model} setting it to {completed_status}"
|
110 |
)
|
src/backend/model_operations.py
CHANGED
@@ -1,96 +1,224 @@
|
|
|
|
|
|
1 |
import numpy as np
|
2 |
import pandas as pd
|
|
|
3 |
from transformers import AutoModelForCausalLM, AutoTokenizer
|
4 |
from sentence_transformers import CrossEncoder
|
5 |
|
6 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
|
8 |
def load_evaluation_model(model_path):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
model = CrossEncoder(model_path)
|
10 |
-
model.save_pretrained('.checkpoints/{model_path}')
|
11 |
return model
|
12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
class SummaryGenerator:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
def __init__(self, model_id, revision):
|
15 |
-
|
16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
self.summaries_df = pd.DataFrame()
|
18 |
self.revision = revision
|
19 |
self.avg_length = None
|
20 |
self.answer_rate = None
|
|
|
21 |
|
22 |
def generate_summaries(self, df):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
source, summary, dataset = [], [], []
|
24 |
|
|
|
25 |
for index, row in df.iterrows():
|
26 |
_source = row['text']
|
27 |
_dataset = row['dataset']
|
28 |
|
29 |
-
prompt = generate_prompt(_source)
|
30 |
-
inputs = self.tokenizer(prompt, return_tensors='pt', max_length=1024,
|
|
|
31 |
try:
|
32 |
-
outputs = self.model.generate(**inputs, max_new_tokens=1024, do_sample=False,
|
33 |
-
|
|
|
|
|
34 |
except Exception as e:
|
35 |
print(f"Error at index {index}: {e}")
|
36 |
response = ""
|
37 |
-
|
|
|
38 |
summary.append(response)
|
39 |
source.append(_source)
|
40 |
dataset.append(_dataset)
|
41 |
|
42 |
-
self.summaries_df = pd.DataFrame(list(zip(source, summary, dataset)),
|
|
|
43 |
self._compute_avg_length()
|
44 |
self._compute_answer_rate()
|
|
|
45 |
|
46 |
return self.summaries_df
|
47 |
|
48 |
def _compute_avg_length(self):
|
|
|
|
|
|
|
49 |
total_words = 0
|
50 |
count = 0
|
51 |
|
52 |
for summary in self.summaries_df['summary']:
|
53 |
if summary != "":
|
54 |
-
|
|
|
55 |
total_words += len(words)
|
56 |
count += 1
|
57 |
|
58 |
self.avg_length = 0 if count == 0 else total_words / count
|
59 |
|
60 |
def _compute_answer_rate(self):
|
61 |
-
|
|
|
|
|
|
|
62 |
total_rows = len(self.summaries_df)
|
63 |
|
64 |
self.answer_rate = 0 if total_rows == 0 else non_empty_count / total_rows
|
65 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
66 |
class EvaluationModel:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
67 |
def __init__(self, model_path):
|
|
|
|
|
|
|
|
|
|
|
|
|
68 |
self.model = load_evaluation_model(model_path)
|
69 |
self.scores = []
|
70 |
self.accuracy = None
|
71 |
self.hallucination_rate = None
|
72 |
|
73 |
def evaluate_hallucination(self, summaries_df):
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
|
78 |
-
|
79 |
-
|
80 |
-
return self.scores
|
81 |
|
82 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
83 |
if not self.scores:
|
84 |
-
|
|
|
|
|
85 |
|
86 |
# Use threshold of 0.5 to compute accuracy
|
87 |
-
num_above_threshold = sum(score >=
|
88 |
num_total = len(self.scores)
|
89 |
|
90 |
-
if num_total
|
91 |
raise ValueError("No scores available to compute accuracy.")
|
92 |
|
93 |
self.accuracy = (num_above_threshold / num_total) * 100
|
94 |
self.hallucination_rate = 100 - self.accuracy
|
95 |
|
96 |
-
return self.accuracy
|
|
|
|
1 |
+
import logging
|
2 |
+
|
3 |
import numpy as np
|
4 |
import pandas as pd
|
5 |
+
import spacy
|
6 |
from transformers import AutoModelForCausalLM, AutoTokenizer
|
7 |
from sentence_transformers import CrossEncoder
|
8 |
|
9 |
+
import src.backend.util as util
|
10 |
+
|
11 |
+
# Set up basic configuration for logging
|
12 |
+
logging.basicConfig(level=logging.INFO,
|
13 |
+
format='%(asctime)s - %(levelname)s - %(message)s')
|
14 |
+
|
15 |
+
# Load spacy model for word tokenization
|
16 |
+
nlp = spacy.load("en_core_web_sm")
|
17 |
+
|
18 |
|
19 |
def load_evaluation_model(model_path):
    """Instantiate the cross-encoder used to score summaries.

    Args:
        model_path (str): Path or identifier handed to ``CrossEncoder``.

    Returns:
        CrossEncoder: The evaluation model built from ``model_path``.
    """
    return CrossEncoder(model_path)
|
30 |
|
31 |
+
|
32 |
+
class ModelLoadingException(Exception):
    """Signals that a model could not be loaded.

    Attributes:
        model_id (str): Identifier of the model that failed to load.
        revision (str): Revision that was requested for that model.
    """

    def __init__(self, model_id, revision, messages="Error initializing model"):
        # Build the human-readable description first, then keep the raw
        # identifiers on the instance so handlers can inspect them.
        description = "{} id={} revision={}".format(messages, model_id, revision)
        super().__init__(description)
        self.revision = revision
        self.model_id = model_id
|
44 |
+
|
45 |
class SummaryGenerator:
    """Generate summaries with a causal language model and track run statistics.

    Attributes:
        tokenizer (AutoTokenizer): Tokenizer for the model.
        model (AutoModelForCausalLM): The causal language model.
        summaries_df (DataFrame): Generated summaries with the columns
            "source", "summary" and "dataset".
        revision (str): Model revision.
        avg_length (float): Average number of alphabetic tokens per
            non-empty summary.
        answer_rate (float): Fraction of rows with a non-empty summary.
        error_rate (float): Fraction of rows whose generation raised.
    """

    def __init__(self, model_id, revision):
        """Load the tokenizer and model for ``model_id`` at ``revision``.

        Args:
            model_id (str): Identifier for the model.
            revision (str): Revision of the model.

        Raises:
            ModelLoadingException: If the tokenizer or the model cannot be loaded.
        """
        try:
            # `revision` has to be a keyword argument: the positional slots after
            # the model id are forwarded elsewhere and are not read as a revision.
            self.tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
            self.model = AutoModelForCausalLM.from_pretrained(model_id, revision=revision)
        except Exception as e:
            logging.error(f"Error initializing model with id {model_id} and revision {revision}: {e}")
            raise ModelLoadingException(model_id, revision) from e
        self.summaries_df = pd.DataFrame()
        self.revision = revision
        self.avg_length = None
        self.answer_rate = None
        self.error_rate = None

    def generate_summaries(self, df):
        """Generate a summary for every row of ``df``.

        A row whose generation raises is recorded with an empty summary and
        counted towards ``error_rate`` instead of aborting the whole run.

        Args:
            df (DataFrame): Source documents; the columns 'text' and 'dataset'
                are read.

        Returns:
            DataFrame: ``summaries_df`` with the columns "source", "summary"
            and "dataset".
        """
        source, summary, dataset = [], [], []

        error_count = 0
        for index, row in df.iterrows():
            _source = row['text']
            _dataset = row['dataset']

            prompt = util.generate_prompt(_source)
            # The revision is fixed when the tokenizer/model are loaded; it is not
            # a tokenization or generation option. Truncation is requested
            # explicitly so `max_length` is actually enforced.
            inputs = self.tokenizer(prompt, return_tensors='pt', max_length=1024,
                                    truncation=True)
            try:
                # Greedy decoding: no sampling, hence no temperature.
                outputs = self.model.generate(**inputs, max_new_tokens=1024, do_sample=False)
                # A causal LM returns prompt + continuation; keep only the newly
                # generated tokens so the summary does not contain the source.
                prompt_length = inputs['input_ids'].shape[-1]
                response = self.tokenizer.decode(outputs[0][prompt_length:],
                                                 skip_special_tokens=True)
            except Exception as e:
                logging.error(f"Error at index {index}: {e}")
                response = ""
                error_count += 1

            summary.append(response)
            source.append(_source)
            dataset.append(_dataset)

        self.summaries_df = pd.DataFrame(list(zip(source, summary, dataset)),
                                         columns=["source", "summary", "dataset"])
        self._compute_avg_length()
        self._compute_answer_rate()
        self._compute_error_rate(error_count)

        return self.summaries_df

    def _compute_avg_length(self):
        """Compute the average length of non-empty summaries using SpaCy.

        Only alphabetic tokens are counted as words.
        """
        total_words = 0
        count = 0

        for summary in self.summaries_df['summary']:
            if summary != "":
                doc = nlp(summary)
                words = [token.text for token in doc if token.is_alpha]
                total_words += len(words)
                count += 1

        self.avg_length = 0 if count == 0 else total_words / count

    def _compute_answer_rate(self):
        """Compute the fraction of rows with a non-empty summary."""
        non_empty_count = sum(1 for summary in self.summaries_df['summary'] if summary)
        total_rows = len(self.summaries_df)

        self.answer_rate = 0 if total_rows == 0 else non_empty_count / total_rows

    def _compute_error_rate(self, count):
        """Compute the fraction of rows whose generation failed.

        Args:
            count (int): Number of rows that raised during generation.
        """
        total_rows = len(self.summaries_df)

        self.error_rate = 0 if total_rows == 0 else count / total_rows
|
150 |
+
|
151 |
+
|
152 |
class EvaluationModel:
    """Score generated summaries with a cross-encoder and derive accuracy.

    Attributes:
        model (CrossEncoder): The evaluation model.
        scores (sequence of float): One score per (source, summary) pair.
        accuracy (float): Percentage of scores at or above the threshold.
        hallucination_rate (float): ``100 - accuracy``.
    """

    def __init__(self, model_path):
        """Initialize the EvaluationModel with a CrossEncoder model.

        Args:
            model_path (str): Path to the CrossEncoder model.
        """
        self.model = load_evaluation_model(model_path)
        self.scores = []
        self.accuracy = None
        self.hallucination_rate = None

    def evaluate_hallucination(self, summaries_df):
        """Score every (source, summary) pair and store the result in ``scores``.

        Args:
            summaries_df (DataFrame): The columns 'source' and 'summary' are read.

        Returns:
            The scores returned by the model's ``predict``; also stored on
            ``self.scores``.

        Raises:
            Exception: Whatever ``predict`` raises, re-raised after logging.
        """
        # The cross-encoder scores pairs, so both columns are combined into ONE
        # list of (source, summary) tuples. Passing the two columns as separate
        # positional arguments would hand the summaries to the batch-size slot.
        pairs = list(zip(summaries_df['source'], summaries_df['summary']))
        try:
            scores = self.model.predict(pairs)
            self.scores = scores
            return self.scores
        except Exception as e:
            logging.error(f"Error evaluating hallucination: {e}")
            raise

    def compute_accuracy(self, threshold=0.5):
        """Compute the percentage of scores at or above ``threshold``.

        Relies on ``scores`` having been populated, typically via
        ``evaluate_hallucination``.

        Args:
            threshold (float): Minimum score counted as accurate.

        Returns:
            float: Accuracy percentage. Also updates ``accuracy`` and
            ``hallucination_rate``.

        Raises:
            ValueError: If no scores have been calculated yet.
        """
        # `not self.scores` is ambiguous for a NumPy array with more than one
        # element, so emptiness is checked through the length instead.
        if self.scores is None or len(self.scores) == 0:
            error_msg = "Scores not calculated. Call evaluate_hallucination() first."
            logging.error(error_msg)
            raise ValueError(error_msg)

        scores = np.asarray(self.scores, dtype=float)
        num_above_threshold = int(np.count_nonzero(scores >= threshold))
        num_total = len(scores)

        self.accuracy = (num_above_threshold / num_total) * 100
        self.hallucination_rate = 100 - self.accuracy

        return self.accuracy
|
224 |
+
|
src/backend/run_eval_suite.py
CHANGED
@@ -3,43 +3,56 @@ import os
|
|
3 |
import logging
|
4 |
from datetime import datetime
|
5 |
|
6 |
-
|
7 |
-
from evaluate_model import Evaluator
|
8 |
-
|
9 |
-
from src.envs import RESULTS_REPO, API
|
10 |
from src.backend.manage_requests import EvalRequest
|
|
|
11 |
|
12 |
-
|
13 |
-
|
|
|
14 |
logging.getLogger("openai").setLevel(logging.WARNING)
|
15 |
|
16 |
-
def run_evaluation(eval_request: EvalRequest, num_fewshot, batch_size, device, local_dir: str, results_repo: str, no_cache=True, limit=None):
|
17 |
-
if limit:
|
18 |
-
print(
|
19 |
-
"WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
|
20 |
-
)
|
21 |
|
22 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
|
|
31 |
|
32 |
dumped = json.dumps(results, indent=2)
|
33 |
-
|
34 |
|
35 |
-
output_path = os.path.join(local_dir, *eval_request.model.split("/"),
|
|
|
36 |
os.makedirs(os.path.dirname(output_path), exist_ok=True)
|
37 |
with open(output_path, "w") as f:
|
38 |
f.write(dumped)
|
39 |
|
40 |
-
|
41 |
-
|
42 |
-
API.upload_file(
|
43 |
path_or_fileobj=output_path,
|
44 |
path_in_repo=f"{eval_request.model}/results_{datetime.now()}.json",
|
45 |
repo_id=results_repo,
|
|
|
3 |
import logging
|
4 |
from datetime import datetime
|
5 |
|
6 |
+
import src.envs as envs
|
|
|
|
|
|
|
7 |
from src.backend.manage_requests import EvalRequest
|
8 |
+
from src.backend.evaluate_model import Evaluator
|
9 |
|
10 |
+
# Configure logging
|
11 |
+
logging.basicConfig(level=logging.INFO,
|
12 |
+
format='%(asctime)s - %(levelname)s - %(message)s')
|
13 |
logging.getLogger("openai").setLevel(logging.WARNING)
|
14 |
|
|
|
|
|
|
|
|
|
|
|
15 |
|
16 |
+
def run_evaluation(eval_request: EvalRequest, batch_size, device,
|
17 |
+
local_dir: str, results_repo: str, no_cache=True, limit=None):
|
18 |
+
"""
|
19 |
+
Run the evaluation for a given model and upload the results.
|
20 |
+
|
21 |
+
Args:
|
22 |
+
eval_request (EvalRequest): The evaluation request object containing model details.
|
23 |
+
num_fewshot (int): Number of few-shot examples.
|
24 |
+
batch_size (int): Batch size for processing.
|
25 |
+
device (str): The device to run the evaluation on.
|
26 |
+
local_dir (str): Local directory path for saving results.
|
27 |
+
results_repo (str): Repository ID where results will be uploaded.
|
28 |
+
no_cache (bool): Whether to disable caching.
|
29 |
+
limit (int, optional): Limit on the number of items to process. Use with caution.
|
30 |
+
|
31 |
+
Returns:
|
32 |
+
dict: A dictionary containing evaluation results.
|
33 |
+
"""
|
34 |
+
if limit:
|
35 |
+
logging.warning("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")
|
36 |
|
37 |
+
try:
|
38 |
+
evaluator = Evaluator(eval_request.model, eval_request.revision, eval_request.precision,
|
39 |
+
batch_size, device, no_cache, limit, write_out=True,
|
40 |
+
output_base_path='logs')
|
41 |
+
results = evaluator.evaluate()
|
42 |
+
except Exception as e:
|
43 |
+
logging.error(f"Error during evaluation: {e}")
|
44 |
+
raise
|
45 |
|
46 |
dumped = json.dumps(results, indent=2)
|
47 |
+
logging.info(dumped)
|
48 |
|
49 |
+
output_path = os.path.join(local_dir, *eval_request.model.split("/"),
|
50 |
+
f"results_{datetime.now()}.json")
|
51 |
os.makedirs(os.path.dirname(output_path), exist_ok=True)
|
52 |
with open(output_path, "w") as f:
|
53 |
f.write(dumped)
|
54 |
|
55 |
+
envs.API.upload_file(
|
|
|
|
|
56 |
path_or_fileobj=output_path,
|
57 |
path_in_repo=f"{eval_request.model}/results_{datetime.now()}.json",
|
58 |
repo_id=results_repo,
|
src/backend/sort_queue.py
CHANGED
@@ -1,4 +1,3 @@
|
|
1 |
-
import re
|
2 |
from dataclasses import dataclass
|
3 |
|
4 |
from huggingface_hub import HfApi
|
@@ -25,4 +24,4 @@ def sort_by_size(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
|
|
25 |
return sorted(eval_requests, key=lambda x: x.params, reverse=False)
|
26 |
|
27 |
def sort_by_likes(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
|
28 |
-
return sorted(eval_requests, key=lambda x: x.likes, reverse=False)
|
|
|
|
|
1 |
from dataclasses import dataclass
|
2 |
|
3 |
from huggingface_hub import HfApi
|
|
|
24 |
return sorted(eval_requests, key=lambda x: x.params, reverse=False)
|
25 |
|
26 |
def sort_by_likes(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
|
27 |
+
return sorted(eval_requests, key=lambda x: x.likes, reverse=False)
|
src/backend/util.py
CHANGED
@@ -1,18 +1,41 @@
|
|
1 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
|
3 |
-
def load_dataframe(data_path):
|
4 |
-
df = pd.read_csv(data_path)
|
5 |
-
return df
|
6 |
|
7 |
-
def
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
"""
|
13 |
-
|
14 |
-
def format_results(hallucination_scores, model_name, revision, precision, accuracy, hallucination_rate, answer_rate, avg_summary_len):
|
15 |
-
# Define the structure of the results (JSON)
|
16 |
results = {
|
17 |
"config": {
|
18 |
"model_dtype": precision, # Precision with which you ran the evaluation
|
@@ -20,13 +43,22 @@ def format_results(hallucination_scores, model_name, revision, precision, accura
|
|
20 |
"model_sha": revision # Hash of the model
|
21 |
},
|
22 |
"results": {
|
23 |
-
"
|
24 |
-
"
|
25 |
-
|
26 |
-
|
27 |
-
"
|
28 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
}
|
30 |
}
|
31 |
}
|
32 |
-
|
|
|
|
1 |
+
def generate_prompt(source_passage: str) -> str:
|
2 |
+
"""
|
3 |
+
Generates a prompt for a chatbot to summarize a given passage.
|
4 |
+
|
5 |
+
Args:
|
6 |
+
source_passage (str): The passage to be summarized.
|
7 |
+
|
8 |
+
Returns:
|
9 |
+
str: A formatted prompt string for the chatbot.
|
10 |
+
"""
|
11 |
+
if not source_passage:
|
12 |
+
raise ValueError("Source passage is empty.")
|
13 |
+
|
14 |
+
return f"""You are a chat bot answering questions using data. You must stick to the answers provided solely by the text in the passage provided.
|
15 |
+
You are asked the question 'Provide a concise summary of the following passage, covering the core pieces of information described:'
|
16 |
+
Passage:\n {source_passage}
|
17 |
+
"""
|
18 |
|
|
|
|
|
|
|
19 |
|
20 |
+
def format_results(model_name: str, revision: str, precision: str, accuracy: float,
|
21 |
+
hallucination_rate: float, answer_rate: float, avg_summary_len: float,
|
22 |
+
error_rate: float) -> dict:
|
23 |
+
"""
|
24 |
+
Formats the evaluation results into a structured dictionary.
|
25 |
+
|
26 |
+
Args:
|
27 |
+
model_name (str): The name of the evaluated model.
|
28 |
+
revision (str): The revision hash of the model.
|
29 |
+
precision (str): The precision with which the evaluation was run.
|
30 |
+
accuracy (float): The accuracy score from the evaluation.
|
31 |
+
hallucination_rate (float): The hallucination rate from the evaluation.
|
32 |
+
answer_rate (float): The answer rate from the evaluation.
|
33 |
+
avg_summary_len (float): The average summary length from the evaluation.
|
34 |
+
error_rate (float): The rate at which errors occurred during summary generation.
|
35 |
+
|
36 |
+
Returns:
|
37 |
+
dict: A dictionary containing the structured evaluation results.
|
38 |
"""
|
|
|
|
|
|
|
39 |
results = {
|
40 |
"config": {
|
41 |
"model_dtype": precision, # Precision with which you ran the evaluation
|
|
|
43 |
"model_sha": revision # Hash of the model
|
44 |
},
|
45 |
"results": {
|
46 |
+
"accuracy": {
|
47 |
+
"accuracy": accuracy
|
48 |
+
},
|
49 |
+
"hallucination_rate": {
|
50 |
+
"hallucination_rate": hallucination_rate
|
51 |
+
},
|
52 |
+
"answer_rate": {
|
53 |
+
"answer_rate": answer_rate
|
54 |
+
},
|
55 |
+
"average_summary_length": {
|
56 |
+
"average_summary_length": avg_summary_len
|
57 |
+
},
|
58 |
+
"error_rate": {
|
59 |
+
"error_rate": error_rate
|
60 |
}
|
61 |
}
|
62 |
}
|
63 |
+
|
64 |
+
return results
|
src/display/about.py
CHANGED
@@ -1,20 +1,23 @@
|
|
1 |
from dataclasses import dataclass
|
2 |
from enum import Enum
|
3 |
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
#
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
|
|
|
|
|
|
18 |
# Your leaderboard name
|
19 |
TITLE = """<h1 align="center" id="space-title">Hughes Hallucination Evaluation Model leaderboard</h1>"""
|
20 |
|
@@ -24,7 +27,7 @@ This Leaderboard evaluates how much easy LLM hallucinates in factual summarizati
|
|
24 |
"""
|
25 |
|
26 |
# Which evaluations are you running? how can people reproduce what you have?
|
27 |
-
LLM_BENCHMARKS_TEXT =
|
28 |
## How it works
|
29 |
|
30 |
## Reproducibility
|
|
|
1 |
from dataclasses import dataclass
|
2 |
from enum import Enum
|
3 |
|
4 |
+
@dataclass
|
5 |
+
class Task:
|
6 |
+
benchmark: str
|
7 |
+
metric: str
|
8 |
+
col_name: str
|
9 |
+
|
10 |
+
|
11 |
+
class Tasks(Enum):
|
12 |
+
# task_key in the json file, metric_key in the json file, name to display in the leaderboard
|
13 |
+
accuracy = Task("accuracy", "accuracy", "Accuracy")
|
14 |
+
hallucination_rate = Task("hallucination_rate",
|
15 |
+
"hallucination_rate", "Hallucination Rate")
|
16 |
+
answer_rate = Task("answer_rate", "answer_rate", "Answer Rate")
|
17 |
+
average_summary_length = Task("average_summary_length",
|
18 |
+
"average_summary_length", "Average Summary Length")
|
19 |
+
error_rate = Task("error_rate", "error_rate", "Error Rate")
|
20 |
+
|
21 |
# Your leaderboard name
|
22 |
TITLE = """<h1 align="center" id="space-title">Hughes Hallucination Evaluation Model leaderboard</h1>"""
|
23 |
|
|
|
27 |
"""
|
28 |
|
29 |
# Which evaluations are you running? how can people reproduce what you have?
|
30 |
+
LLM_BENCHMARKS_TEXT = """
|
31 |
## How it works
|
32 |
|
33 |
## Reproducibility
|
src/display/css_html_js.py
CHANGED
@@ -33,7 +33,7 @@ custom_css = """
|
|
33 |
background: none;
|
34 |
border: none;
|
35 |
}
|
36 |
-
|
37 |
#search-bar {
|
38 |
padding: 0px;
|
39 |
}
|
|
|
33 |
background: none;
|
34 |
border: none;
|
35 |
}
|
36 |
+
|
37 |
#search-bar {
|
38 |
padding: 0px;
|
39 |
}
|
src/display/utils.py
CHANGED
@@ -3,7 +3,7 @@ from enum import Enum
|
|
3 |
|
4 |
import pandas as pd
|
5 |
|
6 |
-
|
7 |
|
8 |
def fields(raw_class):
|
9 |
return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
|
@@ -24,16 +24,27 @@ class ColumnContent:
|
|
24 |
## Leaderboard columns
|
25 |
auto_eval_column_dict = []
|
26 |
# Init
|
27 |
-
auto_eval_column_dict.append(["model_type_symbol", ColumnContent,
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
auto_eval_column_dict.append([
|
33 |
-
#
|
34 |
-
auto_eval_column_dict.append(["
|
35 |
-
#
|
36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
|
38 |
# Model information
|
39 |
auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
|
@@ -126,7 +137,7 @@ TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default a
|
|
126 |
EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
|
127 |
EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
|
128 |
|
129 |
-
BENCHMARK_COLS = [
|
130 |
|
131 |
NUMERIC_INTERVALS = {
|
132 |
"?": pd.Interval(-1, 0, closed="right"),
|
|
|
3 |
|
4 |
import pandas as pd
|
5 |
|
6 |
+
from src.display.about import Tasks
|
7 |
|
8 |
def fields(raw_class):
|
9 |
return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
|
|
|
24 |
## Leaderboard columns
|
25 |
auto_eval_column_dict = []
|
26 |
# Init
|
27 |
+
auto_eval_column_dict.append(["model_type_symbol", ColumnContent,
|
28 |
+
ColumnContent("T", "str", True, never_hidden=True)])
|
29 |
+
auto_eval_column_dict.append(["model", ColumnContent,
|
30 |
+
ColumnContent("Model", "markdown", True, never_hidden=True)])
|
31 |
+
for task in Tasks:
|
32 |
+
auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
|
33 |
+
# # Accuracy
|
34 |
+
# auto_eval_column_dict.append(["accuracy", ColumnContent,
|
35 |
+
# ColumnContent("Accuracy", "number", True)])
|
36 |
+
# # Hallucination Rate
|
37 |
+
# auto_eval_column_dict.append(["hallucination_rate", ColumnContent,
|
38 |
+
# ColumnContent("Hallucination Rate", "number", True)])
|
39 |
+
# # Answer Rate
|
40 |
+
# auto_eval_column_dict.append(["answer_rate", ColumnContent,
|
41 |
+
# ColumnContent("Answer Rate", "number", True)])
|
42 |
+
# # Average Summary Length
|
43 |
+
# auto_eval_column_dict.append(["average_summary_length", ColumnContent,
|
44 |
+
# ColumnContent("Average Summary Length", "number", True)])
|
45 |
+
# # Error Rate
|
46 |
+
# auto_eval_column_dict.append(["error_rate", ColumnContent,
|
47 |
+
# ColumnContent("Error Rate", "number", True)])
|
48 |
|
49 |
# Model information
|
50 |
auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
|
|
|
137 |
EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
|
138 |
EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
|
139 |
|
140 |
+
BENCHMARK_COLS = [t.value.col_name for t in Tasks]
|
141 |
|
142 |
NUMERIC_INTERVALS = {
|
143 |
"?": pd.Interval(-1, 0, closed="right"),
|
src/envs.py
CHANGED
@@ -2,7 +2,7 @@ import os
|
|
2 |
|
3 |
from huggingface_hub import HfApi
|
4 |
|
5 |
-
# replace this with our token
|
6 |
TOKEN = os.environ.get("HF_TOKEN", None)
|
7 |
|
8 |
OWNER = "vectara"
|
@@ -15,8 +15,11 @@ CACHE_PATH=os.getenv("HF_HOME", ".")
|
|
15 |
# Local caches
|
16 |
EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
|
17 |
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
|
|
|
|
|
18 |
|
|
|
19 |
API = HfApi(token=TOKEN)
|
20 |
|
21 |
-
SOURCE_PATH = "/datasets/
|
22 |
-
HEM_PATH = 'vectara/hallucination_evaluation_model'
|
|
|
2 |
|
3 |
from huggingface_hub import HfApi
|
4 |
|
5 |
+
# replace this with our token
|
6 |
TOKEN = os.environ.get("HF_TOKEN", None)
|
7 |
|
8 |
OWNER = "vectara"
|
|
|
15 |
# Local caches
|
16 |
EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
|
17 |
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
|
18 |
+
EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
|
19 |
+
EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
|
20 |
|
21 |
+
DEVICE = "cpu"
|
22 |
API = HfApi(token=TOKEN)
|
23 |
|
24 |
+
SOURCE_PATH = "src/datasets/leaderboard_dataset.csv"
|
25 |
+
HEM_PATH = 'vectara/hallucination_evaluation_model'
|
src/leaderboard/read_evals.py
CHANGED
@@ -1,33 +1,32 @@
|
|
1 |
import glob
|
2 |
import json
|
3 |
-
import math
|
4 |
import os
|
5 |
from dataclasses import dataclass
|
6 |
|
7 |
-
import dateutil
|
8 |
import numpy as np
|
|
|
9 |
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
|
14 |
|
15 |
@dataclass
|
16 |
class EvalResult:
|
17 |
-
eval_name: str
|
18 |
-
full_model: str
|
19 |
-
org: str
|
20 |
model: str
|
21 |
-
revision: str
|
22 |
results: dict
|
23 |
-
precision: Precision = Precision.Unknown
|
24 |
-
model_type: ModelType = ModelType.Unknown
|
25 |
-
weight_type: WeightType = WeightType.Original
|
26 |
-
architecture: str = "Unknown"
|
27 |
license: str = "?"
|
28 |
likes: int = 0
|
29 |
num_params: int = 0
|
30 |
-
date: str = ""
|
31 |
still_on_hub: bool = False
|
32 |
|
33 |
@classmethod
|
@@ -39,42 +38,38 @@ class EvalResult:
|
|
39 |
config = data.get("config")
|
40 |
|
41 |
# Precision
|
42 |
-
precision = Precision.from_str(config.get("model_dtype"))
|
43 |
|
44 |
# Get model and org
|
45 |
-
|
46 |
-
|
47 |
|
48 |
-
if
|
49 |
-
org = None
|
50 |
-
model = org_and_model[0]
|
51 |
-
result_key = f"{model}_{precision.value.name}"
|
52 |
-
else:
|
53 |
-
org = org_and_model[0]
|
54 |
-
model = org_and_model[1]
|
55 |
result_key = f"{org}_{model}_{precision.value.name}"
|
56 |
-
|
|
|
57 |
|
58 |
-
still_on_hub, _, model_config = is_model_on_hub(
|
59 |
-
full_model, config.get("model_sha", "main"), trust_remote_code=True,
|
60 |
-
|
61 |
-
|
62 |
-
if model_config
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
|
67 |
# Extract results available in this file (some results are split in several files)
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
|
|
78 |
|
79 |
return self(
|
80 |
eval_name=result_key,
|
@@ -82,7 +77,7 @@ class EvalResult:
|
|
82 |
org=org,
|
83 |
model=model,
|
84 |
results=results,
|
85 |
-
precision=precision,
|
86 |
revision= config.get("model_sha", ""),
|
87 |
still_on_hub=still_on_hub,
|
88 |
architecture=architecture
|
@@ -90,47 +85,44 @@ class EvalResult:
|
|
90 |
|
91 |
def update_with_request_file(self, requests_path):
|
92 |
"""Finds the relevant request file for the current model and updates info with it"""
|
93 |
-
request_file = get_request_file_for_model(requests_path, self.full_model,
|
|
|
94 |
|
95 |
try:
|
96 |
with open(request_file, "r") as f:
|
97 |
request = json.load(f)
|
98 |
-
self.model_type = ModelType.from_str(request.get("model_type", ""))
|
99 |
-
self.weight_type = WeightType[request.get("weight_type", "Original")]
|
100 |
self.license = request.get("license", "?")
|
101 |
self.likes = request.get("likes", 0)
|
102 |
self.num_params = request.get("params", 0)
|
103 |
self.date = request.get("submitted_time", "")
|
104 |
-
except
|
105 |
print(f"Could not find request file for {self.org}/{self.model}")
|
|
|
|
|
106 |
|
107 |
def to_dict(self):
|
108 |
"""Converts the Eval Result to a dict compatible with our dataframe display"""
|
109 |
-
|
110 |
-
accuracy = self.results.get("Accuracy", None)
|
111 |
-
|
112 |
data_dict = {
|
113 |
"eval_name": self.eval_name, # not a column, just a save name,
|
114 |
-
AutoEvalColumn.precision.name: self.precision.value.name,
|
115 |
-
AutoEvalColumn.model_type.name: self.model_type.value.name,
|
116 |
-
AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
|
117 |
-
AutoEvalColumn.weight_type.name: self.weight_type.value.name,
|
118 |
-
AutoEvalColumn.architecture.name: self.architecture,
|
119 |
-
AutoEvalColumn.model.name: make_clickable_model(self.full_model),
|
120 |
-
AutoEvalColumn.dummy.name: self.full_model,
|
121 |
-
AutoEvalColumn.revision.name: self.revision,
|
122 |
-
|
123 |
-
AutoEvalColumn.
|
124 |
-
AutoEvalColumn.
|
125 |
-
AutoEvalColumn.
|
126 |
-
AutoEvalColumn.params.name: self.num_params,
|
127 |
-
AutoEvalColumn.still_on_hub.name: self.still_on_hub,
|
128 |
}
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
data_dict["Answer Rate"] = self.results.get("Answer Rate", None)
|
133 |
-
data_dict["Average Summary Length"] = self.results.get("Average Summary Length", None)
|
134 |
|
135 |
return data_dict
|
136 |
|
@@ -163,7 +155,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
|
|
163 |
|
164 |
for root, _, files in os.walk(results_path):
|
165 |
# We should only have json files in model results
|
166 |
-
if
|
167 |
continue
|
168 |
|
169 |
# Sort the files by date
|
@@ -172,8 +164,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
|
|
172 |
except dateutil.parser._parser.ParserError:
|
173 |
files = [files[-1]]
|
174 |
|
175 |
-
for file in files
|
176 |
-
model_result_filepaths.append(os.path.join(root, file))
|
177 |
|
178 |
eval_results = {}
|
179 |
for model_result_filepath in model_result_filepaths:
|
@@ -184,7 +175,8 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
|
|
184 |
# Store results of same eval together
|
185 |
eval_name = eval_result.eval_name
|
186 |
if eval_name in eval_results.keys():
|
187 |
-
eval_results[eval_name].results.update({k: v for k, v in
|
|
|
188 |
else:
|
189 |
eval_results[eval_name] = eval_result
|
190 |
|
|
|
1 |
import glob
|
2 |
import json
|
|
|
3 |
import os
|
4 |
from dataclasses import dataclass
|
5 |
|
|
|
6 |
import numpy as np
|
7 |
+
import dateutil
|
8 |
|
9 |
+
import src.display.formatting as formatting
|
10 |
+
import src.display.utils as utils
|
11 |
+
import src.submission.check_validity as check_validity
|
12 |
|
13 |
|
14 |
@dataclass
|
15 |
class EvalResult:
|
16 |
+
eval_name: str # org_model_precision (uid)
|
17 |
+
full_model: str # org/model (path on hub)
|
18 |
+
org: str
|
19 |
model: str
|
20 |
+
revision: str # commit hash, "" if main
|
21 |
results: dict
|
22 |
+
precision: utils.Precision = utils.Precision.Unknown
|
23 |
+
model_type: utils.ModelType = utils.ModelType.Unknown # Pretrained, fine tuned, ...
|
24 |
+
weight_type: utils.WeightType = utils.WeightType.Original # Original or Adapter
|
25 |
+
architecture: str = "Unknown"
|
26 |
license: str = "?"
|
27 |
likes: int = 0
|
28 |
num_params: int = 0
|
29 |
+
date: str = "" # submission date of request file
|
30 |
still_on_hub: bool = False
|
31 |
|
32 |
@classmethod
|
|
|
38 |
config = data.get("config")
|
39 |
|
40 |
# Precision
|
41 |
+
precision = utils.Precision.from_str(config.get("model_dtype"))
|
42 |
|
43 |
# Get model and org
|
44 |
+
full_model = config.get("model_name", config.get("model_args", None))
|
45 |
+
org, model = full_model.split("/", 1) if "/" in full_model else (None, full_model)
|
46 |
|
47 |
+
if org:
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
result_key = f"{org}_{model}_{precision.value.name}"
|
49 |
+
else:
|
50 |
+
result_key = f"{model}_{precision.value.name}"
|
51 |
|
52 |
+
still_on_hub, _, model_config = check_validity.is_model_on_hub(
|
53 |
+
full_model, config.get("model_sha", "main"), trust_remote_code=True,
|
54 |
+
test_tokenizer=False)
|
55 |
+
|
56 |
+
if model_config:
|
57 |
+
architecture = ";".join(getattr(model_config, "architectures", ["?"]))
|
58 |
+
else:
|
59 |
+
architecture = "?"
|
60 |
|
61 |
# Extract results available in this file (some results are split in several files)
|
62 |
+
results = {}
|
63 |
+
for task in utils.Tasks:
|
64 |
+
task = task.value
|
65 |
+
|
66 |
+
# We average all scores of a given metric (not all metrics are present in all files)
|
67 |
+
accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
|
68 |
+
if accs.size == 0 or any([acc is None for acc in accs]):
|
69 |
+
continue
|
70 |
+
|
71 |
+
mean_acc = np.mean(accs) * 100.0
|
72 |
+
results[task.benchmark] = mean_acc
|
73 |
|
74 |
return self(
|
75 |
eval_name=result_key,
|
|
|
77 |
org=org,
|
78 |
model=model,
|
79 |
results=results,
|
80 |
+
precision=precision,
|
81 |
revision= config.get("model_sha", ""),
|
82 |
still_on_hub=still_on_hub,
|
83 |
architecture=architecture
|
|
|
85 |
|
86 |
def update_with_request_file(self, requests_path):
|
87 |
"""Finds the relevant request file for the current model and updates info with it"""
|
88 |
+
request_file = get_request_file_for_model(requests_path, self.full_model,
|
89 |
+
self.precision.value.name)
|
90 |
|
91 |
try:
|
92 |
with open(request_file, "r") as f:
|
93 |
request = json.load(f)
|
94 |
+
self.model_type = utils.ModelType.from_str(request.get("model_type", ""))
|
95 |
+
self.weight_type = utils.WeightType[request.get("weight_type", "Original")]
|
96 |
self.license = request.get("license", "?")
|
97 |
self.likes = request.get("likes", 0)
|
98 |
self.num_params = request.get("params", 0)
|
99 |
self.date = request.get("submitted_time", "")
|
100 |
+
except FileNotFoundError:
|
101 |
print(f"Could not find request file for {self.org}/{self.model}")
|
102 |
+
except json.JSONDecodeError:
|
103 |
+
print(f"Error decoding JSON in request file for {self.org}/{self.model}")
|
104 |
|
105 |
def to_dict(self):
|
106 |
"""Converts the Eval Result to a dict compatible with our dataframe display"""
|
107 |
+
|
|
|
|
|
108 |
data_dict = {
|
109 |
"eval_name": self.eval_name, # not a column, just a save name,
|
110 |
+
utils.AutoEvalColumn.precision.name: self.precision.value.name,
|
111 |
+
utils.AutoEvalColumn.model_type.name: self.model_type.value.name,
|
112 |
+
utils.AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
|
113 |
+
utils.AutoEvalColumn.weight_type.name: self.weight_type.value.name,
|
114 |
+
utils.AutoEvalColumn.architecture.name: self.architecture,
|
115 |
+
utils.AutoEvalColumn.model.name: formatting.make_clickable_model(self.full_model),
|
116 |
+
utils.AutoEvalColumn.dummy.name: self.full_model,
|
117 |
+
utils.AutoEvalColumn.revision.name: self.revision,
|
118 |
+
utils.AutoEvalColumn.license.name: self.license,
|
119 |
+
utils.AutoEvalColumn.likes.name: self.likes,
|
120 |
+
utils.AutoEvalColumn.params.name: self.num_params,
|
121 |
+
utils.AutoEvalColumn.still_on_hub.name: self.still_on_hub,
|
|
|
|
|
122 |
}
|
123 |
+
|
124 |
+
for task in utils.Tasks:
|
125 |
+
data_dict[task.value.col_name] = self.results[task.value.benchmark]
|
|
|
|
|
126 |
|
127 |
return data_dict
|
128 |
|
|
|
155 |
|
156 |
for root, _, files in os.walk(results_path):
|
157 |
# We should only have json files in model results
|
158 |
+
if not files or any([not f.endswith(".json") for f in files]):
|
159 |
continue
|
160 |
|
161 |
# Sort the files by date
|
|
|
164 |
except dateutil.parser._parser.ParserError:
|
165 |
files = [files[-1]]
|
166 |
|
167 |
+
model_result_filepaths.extend([os.path.join(root, file) for file in files])
|
|
|
168 |
|
169 |
eval_results = {}
|
170 |
for model_result_filepath in model_result_filepaths:
|
|
|
175 |
# Store results of same eval together
|
176 |
eval_name = eval_result.eval_name
|
177 |
if eval_name in eval_results.keys():
|
178 |
+
eval_results[eval_name].results.update({k: v for k, v in
|
179 |
+
eval_result.results.items() if v is not None})
|
180 |
else:
|
181 |
eval_results[eval_name] = eval_result
|
182 |
|
src/populate.py
CHANGED
@@ -3,21 +3,21 @@ import os
|
|
3 |
|
4 |
import pandas as pd
|
5 |
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
|
10 |
|
11 |
def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
|
12 |
-
raw_data = get_raw_eval_results(results_path, requests_path)
|
13 |
all_data_json = [v.to_dict() for v in raw_data]
|
14 |
|
15 |
df = pd.DataFrame.from_records(all_data_json)
|
16 |
-
df = df.sort_values(by=[AutoEvalColumn.accuracy.name], ascending=False)
|
17 |
df = df[cols].round(decimals=2)
|
18 |
|
19 |
# filter out if any of the benchmarks have not been produced
|
20 |
-
df = df[has_no_nan_values(df, benchmark_cols)]
|
21 |
return raw_data, df
|
22 |
|
23 |
|
@@ -31,8 +31,8 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
|
|
31 |
with open(file_path) as fp:
|
32 |
data = json.load(fp)
|
33 |
|
34 |
-
data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
|
35 |
-
data[EvalQueueColumn.revision.name] = data.get("revision", "main")
|
36 |
|
37 |
all_evals.append(data)
|
38 |
elif ".md" not in entry:
|
@@ -43,8 +43,8 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
|
|
43 |
with open(file_path) as fp:
|
44 |
data = json.load(fp)
|
45 |
|
46 |
-
data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
|
47 |
-
data[EvalQueueColumn.revision.name] = data.get("revision", "main")
|
48 |
all_evals.append(data)
|
49 |
|
50 |
pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
|
|
|
3 |
|
4 |
import pandas as pd
|
5 |
|
6 |
+
import src.display.formatting as formatting
|
7 |
+
import src.display.utils as utils
|
8 |
+
import src.leaderboard.read_evals as read_evals
|
9 |
|
10 |
|
11 |
def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
|
12 |
+
raw_data = read_evals.get_raw_eval_results(results_path, requests_path)
|
13 |
all_data_json = [v.to_dict() for v in raw_data]
|
14 |
|
15 |
df = pd.DataFrame.from_records(all_data_json)
|
16 |
+
df = df.sort_values(by=[utils.AutoEvalColumn.accuracy.name], ascending=False)
|
17 |
df = df[cols].round(decimals=2)
|
18 |
|
19 |
# filter out if any of the benchmarks have not been produced
|
20 |
+
df = df[formatting.has_no_nan_values(df, benchmark_cols)]
|
21 |
return raw_data, df
|
22 |
|
23 |
|
|
|
31 |
with open(file_path) as fp:
|
32 |
data = json.load(fp)
|
33 |
|
34 |
+
data[utils.EvalQueueColumn.model.name] = formatting.make_clickable_model(data["model"])
|
35 |
+
data[utils.EvalQueueColumn.revision.name] = data.get("revision", "main")
|
36 |
|
37 |
all_evals.append(data)
|
38 |
elif ".md" not in entry:
|
|
|
43 |
with open(file_path) as fp:
|
44 |
data = json.load(fp)
|
45 |
|
46 |
+
data[utils.EvalQueueColumn.model.name] = formatting.make_clickable_model(data["model"])
|
47 |
+
data[utils.EvalQueueColumn.revision.name] = data.get("revision", "main")
|
48 |
all_evals.append(data)
|
49 |
|
50 |
pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
|
src/submission/check_validity.py
CHANGED
@@ -1,8 +1,6 @@
|
|
1 |
import json
|
2 |
import os
|
3 |
-
import re
|
4 |
from collections import defaultdict
|
5 |
-
from datetime import datetime, timedelta, timezone
|
6 |
|
7 |
import huggingface_hub
|
8 |
from huggingface_hub import ModelCard
|
@@ -37,11 +35,11 @@ def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_rem
|
|
37 |
try:
|
38 |
config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
|
39 |
if test_tokenizer:
|
40 |
-
tokenizer_config = get_tokenizer_config(model_name)
|
41 |
if tokenizer_config is not None:
|
42 |
tokenizer_class_candidate = tokenizer_config.get("tokenizer_class", None)
|
43 |
else:
|
44 |
-
tokenizer_class_candidate = config.tokenizer_class
|
45 |
|
46 |
|
47 |
tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate)
|
|
|
1 |
import json
|
2 |
import os
|
|
|
3 |
from collections import defaultdict
|
|
|
4 |
|
5 |
import huggingface_hub
|
6 |
from huggingface_hub import ModelCard
|
|
|
35 |
try:
|
36 |
config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
|
37 |
if test_tokenizer:
|
38 |
+
tokenizer_config = get_tokenizer_config(model_name)
|
39 |
if tokenizer_config is not None:
|
40 |
tokenizer_class_candidate = tokenizer_config.get("tokenizer_class", None)
|
41 |
else:
|
42 |
+
tokenizer_class_candidate = config.tokenizer_class
|
43 |
|
44 |
|
45 |
tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate)
|
src/submission/submit.py
CHANGED
@@ -2,14 +2,10 @@ import json
|
|
2 |
import os
|
3 |
from datetime import datetime, timezone
|
4 |
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
check_model_card,
|
10 |
-
get_model_size,
|
11 |
-
is_model_on_hub,
|
12 |
-
)
|
13 |
|
14 |
REQUESTED_MODELS = None
|
15 |
USERS_TO_SUBMISSION_DATES = None
|
@@ -25,7 +21,7 @@ def add_new_eval(
|
|
25 |
global REQUESTED_MODELS
|
26 |
global USERS_TO_SUBMISSION_DATES
|
27 |
if not REQUESTED_MODELS:
|
28 |
-
REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
|
29 |
|
30 |
user_name = ""
|
31 |
model_path = model
|
@@ -37,7 +33,7 @@ def add_new_eval(
|
|
37 |
current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
|
38 |
|
39 |
if model_type is None or model_type == "":
|
40 |
-
return styled_error("Please select a model type.")
|
41 |
|
42 |
# Does the model actually exist?
|
43 |
if revision == "":
|
@@ -45,32 +41,32 @@ def add_new_eval(
|
|
45 |
|
46 |
# Is the model on the hub?
|
47 |
if weight_type in ["Delta", "Adapter"]:
|
48 |
-
base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
|
49 |
if not base_model_on_hub:
|
50 |
-
return styled_error(f'Base model "{base_model}" {error}')
|
51 |
|
52 |
if not weight_type == "Adapter":
|
53 |
-
model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, test_tokenizer=True)
|
54 |
if not model_on_hub:
|
55 |
-
return styled_error(f'Model "{model}" {error}')
|
56 |
|
57 |
# Is the model info correctly filled?
|
58 |
try:
|
59 |
-
model_info = API.model_info(repo_id=model, revision=revision)
|
60 |
except Exception:
|
61 |
-
return styled_error("Could not get your model information. Please fill it up properly.")
|
62 |
|
63 |
-
model_size = get_model_size(model_info=model_info, precision=precision)
|
64 |
|
65 |
# Were the model card and license filled?
|
66 |
try:
|
67 |
license = model_info.cardData["license"]
|
68 |
except Exception:
|
69 |
-
return styled_error("Please select a license for your model")
|
70 |
|
71 |
-
modelcard_OK, error_msg = check_model_card(model)
|
72 |
if not modelcard_OK:
|
73 |
-
return styled_error(error_msg)
|
74 |
|
75 |
# Seems good, creating the eval
|
76 |
print("Adding new eval")
|
@@ -91,11 +87,11 @@ def add_new_eval(
|
|
91 |
|
92 |
# Check for duplicate submission
|
93 |
if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
|
94 |
-
return styled_warning("This model has been already submitted.")
|
95 |
|
96 |
print("Creating eval file")
|
97 |
|
98 |
-
OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
|
99 |
os.makedirs(OUT_DIR, exist_ok=True)
|
100 |
out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
|
101 |
|
@@ -103,10 +99,10 @@ def add_new_eval(
|
|
103 |
f.write(json.dumps(eval_entry))
|
104 |
|
105 |
print("Uploading eval file")
|
106 |
-
API.upload_file(
|
107 |
path_or_fileobj=out_path,
|
108 |
path_in_repo=out_path.split("eval-queue/")[1],
|
109 |
-
repo_id=QUEUE_REPO,
|
110 |
repo_type="dataset",
|
111 |
commit_message=f"Add {model} to eval queue",
|
112 |
)
|
@@ -114,6 +110,6 @@ def add_new_eval(
|
|
114 |
# Remove the local file
|
115 |
os.remove(out_path)
|
116 |
|
117 |
-
return styled_message(
|
118 |
"Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
|
119 |
)
|
|
|
2 |
import os
|
3 |
from datetime import datetime, timezone
|
4 |
|
5 |
+
import src.display.formatting as formatting
|
6 |
+
import src.envs as envs
|
7 |
+
import src.submission.check_validity as check_validity
|
8 |
+
|
|
|
|
|
|
|
|
|
9 |
|
10 |
REQUESTED_MODELS = None
|
11 |
USERS_TO_SUBMISSION_DATES = None
|
|
|
21 |
global REQUESTED_MODELS
|
22 |
global USERS_TO_SUBMISSION_DATES
|
23 |
if not REQUESTED_MODELS:
|
24 |
+
REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = check_validity.already_submitted_models(envs.EVAL_REQUESTS_PATH)
|
25 |
|
26 |
user_name = ""
|
27 |
model_path = model
|
|
|
33 |
current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
|
34 |
|
35 |
if model_type is None or model_type == "":
|
36 |
+
return formatting.styled_error("Please select a model type.")
|
37 |
|
38 |
# Does the model actually exist?
|
39 |
if revision == "":
|
|
|
41 |
|
42 |
# Is the model on the hub?
|
43 |
if weight_type in ["Delta", "Adapter"]:
|
44 |
+
base_model_on_hub, error, _ = check_validity.is_model_on_hub(model_name=base_model, revision=revision, token=envs.TOKEN, test_tokenizer=True)
|
45 |
if not base_model_on_hub:
|
46 |
+
return formatting.styled_error(f'Base model "{base_model}" {error}')
|
47 |
|
48 |
if not weight_type == "Adapter":
|
49 |
+
model_on_hub, error, _ = check_validity.is_model_on_hub(model_name=model, revision=revision, test_tokenizer=True)
|
50 |
if not model_on_hub:
|
51 |
+
return formatting.styled_error(f'Model "{model}" {error}')
|
52 |
|
53 |
# Is the model info correctly filled?
|
54 |
try:
|
55 |
+
model_info = envs.API.model_info(repo_id=model, revision=revision)
|
56 |
except Exception:
|
57 |
+
return formatting.styled_error("Could not get your model information. Please fill it up properly.")
|
58 |
|
59 |
+
model_size = check_validity.get_model_size(model_info=model_info, precision=precision)
|
60 |
|
61 |
# Were the model card and license filled?
|
62 |
try:
|
63 |
license = model_info.cardData["license"]
|
64 |
except Exception:
|
65 |
+
return formatting.styled_error("Please select a license for your model")
|
66 |
|
67 |
+
modelcard_OK, error_msg = check_validity.check_model_card(model)
|
68 |
if not modelcard_OK:
|
69 |
+
return formatting.styled_error(error_msg)
|
70 |
|
71 |
# Seems good, creating the eval
|
72 |
print("Adding new eval")
|
|
|
87 |
|
88 |
# Check for duplicate submission
|
89 |
if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
|
90 |
+
return formatting.styled_warning("This model has been already submitted.")
|
91 |
|
92 |
print("Creating eval file")
|
93 |
|
94 |
+
OUT_DIR = f"{envs.EVAL_REQUESTS_PATH}/{user_name}"
|
95 |
os.makedirs(OUT_DIR, exist_ok=True)
|
96 |
out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
|
97 |
|
|
|
99 |
f.write(json.dumps(eval_entry))
|
100 |
|
101 |
print("Uploading eval file")
|
102 |
+
envs.API.upload_file(
|
103 |
path_or_fileobj=out_path,
|
104 |
path_in_repo=out_path.split("eval-queue/")[1],
|
105 |
+
repo_id=envs.QUEUE_REPO,
|
106 |
repo_type="dataset",
|
107 |
commit_message=f"Add {model} to eval queue",
|
108 |
)
|
|
|
110 |
# Remove the local file
|
111 |
os.remove(out_path)
|
112 |
|
113 |
+
return formatting.styled_message(
|
114 |
"Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
|
115 |
)
|
tests/test_evaluate_model.py
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import unittest
|
2 |
+
from unittest.mock import patch
|
3 |
+
|
4 |
+
import pandas as pd
|
5 |
+
|
6 |
+
import src.backend.evaluate_model as evaluate_model
|
7 |
+
import src.envs as envs
|
8 |
+
|
9 |
+
|
10 |
+
class TestEvaluator(unittest.TestCase):
|
11 |
+
|
12 |
+
def setUp(self):
|
13 |
+
self.model_name = 'test_model'
|
14 |
+
self.revision = 'test_revision'
|
15 |
+
self.precision = 'test_precision'
|
16 |
+
self.batch_size = 10
|
17 |
+
self.device = 'test_device'
|
18 |
+
self.no_cache = False
|
19 |
+
self.limit = 10
|
20 |
+
|
21 |
+
@patch('src.backend.evaluate_model.SummaryGenerator')
|
22 |
+
@patch('src.backend.evaluate_model.EvaluationModel')
|
23 |
+
def test_evaluator_initialization(self, mock_eval_model, mock_summary_generator):
|
24 |
+
evaluator = evaluate_model.Evaluator(self.model_name, self.revision,
|
25 |
+
self.precision, self.batch_size,
|
26 |
+
self.device, self.no_cache, self.limit)
|
27 |
+
|
28 |
+
mock_summary_generator.assert_called_once_with(self.model_name, self.revision)
|
29 |
+
mock_eval_model.assert_called_once_with(envs.HEM_PATH)
|
30 |
+
self.assertEqual(evaluator.model, self.model_name)
|
31 |
+
|
32 |
+
@patch('src.backend.evaluate_model.EvaluationModel')
|
33 |
+
@patch('src.backend.evaluate_model.SummaryGenerator')
|
34 |
+
def test_evaluator_initialization_error(self, mock_summary_generator, mock_eval_model):
|
35 |
+
mock_eval_model.side_effect = Exception('test_exception')
|
36 |
+
with self.assertRaises(Exception):
|
37 |
+
evaluate_model.Evaluator(self.model_name, self.revision,
|
38 |
+
self.precision, self.batch_size,
|
39 |
+
self.device, self.no_cache, self.limit)
|
40 |
+
|
41 |
+
@patch('src.backend.evaluate_model.SummaryGenerator')
|
42 |
+
@patch('src.backend.evaluate_model.EvaluationModel')
|
43 |
+
@patch('src.backend.evaluate_model.pd.read_csv')
|
44 |
+
@patch('src.backend.util.format_results')
|
45 |
+
def test_evaluate_method(self, mock_format_results, mock_read_csv, mock_eval_model,
|
46 |
+
mock_summary_generator):
|
47 |
+
evaluator = evaluate_model.Evaluator(self.model_name, self.revision,
|
48 |
+
self.precision, self.batch_size,
|
49 |
+
self.device, self.no_cache, self.limit)
|
50 |
+
|
51 |
+
# Mock setup
|
52 |
+
mock_format_results.return_value = {'test': 'result'}
|
53 |
+
mock_read_csv.return_value = pd.DataFrame({'column1': ['data1', 'data2']})
|
54 |
+
mock_summary_generator.return_value.generate_summaries.return_value = pd.DataFrame({'column1': ['summary1', 'summary2']})
|
55 |
+
mock_summary_generator.return_value.avg_length = 100
|
56 |
+
mock_summary_generator.return_value.answer_rate = 1.0
|
57 |
+
mock_summary_generator.return_value.error_rate = 0.0
|
58 |
+
mock_eval_model.return_value.compute_accuracy.return_value = 1.0
|
59 |
+
mock_eval_model.return_value.hallucination_rate = 0.0
|
60 |
+
mock_eval_model.return_value.evaluate_hallucination.return_value = [0.5]
|
61 |
+
|
62 |
+
# Method call and assertions
|
63 |
+
results = evaluator.evaluate()
|
64 |
+
mock_format_results.assert_called_once_with(model_name=self.model_name,
|
65 |
+
revision=self.revision,
|
66 |
+
precision=self.precision,
|
67 |
+
accuracy=1.0, hallucination_rate=0.0,
|
68 |
+
answer_rate=1.0, avg_summary_len=100,
|
69 |
+
error_rate=0.0)
|
70 |
+
mock_read_csv.assert_called_once_with(envs.SOURCE_PATH)
|
71 |
+
|
72 |
+
@patch('src.backend.evaluate_model.SummaryGenerator')
|
73 |
+
@patch('src.backend.evaluate_model.EvaluationModel')
|
74 |
+
@patch('src.backend.evaluate_model.pd.read_csv')
|
75 |
+
def test_evaluate_with_file_not_found(self, mock_read_csv, mock_eval_model,
|
76 |
+
mock_summary_generator):
|
77 |
+
mock_read_csv.side_effect = FileNotFoundError('test_exception')
|
78 |
+
evaluator = evaluate_model.Evaluator(self.model_name, self.revision,
|
79 |
+
self.precision, self.batch_size,
|
80 |
+
self.device, self.no_cache, self.limit)
|
81 |
+
|
82 |
+
with self.assertRaises(FileNotFoundError):
|
83 |
+
evaluator.evaluate()
|
84 |
+
|
85 |
+
|
86 |
+
if __name__ == '__main__':
|
87 |
+
unittest.main()
|
tests/test_evaluator.py
ADDED
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import unittest
|
2 |
+
from unittest.mock import patch
|
3 |
+
|
4 |
+
import pandas as pd
|
5 |
+
|
6 |
+
import src.backend.model_operations as model_operations
|
7 |
+
|
8 |
+
|
9 |
+
class TestEvaluator(unittest.TestCase):
|
10 |
+
|
11 |
+
def setUp(self):
|
12 |
+
self.model_path = "test_model"
|
13 |
+
|
14 |
+
@patch("src.backend.model_operations.load_evaluation_model")
|
15 |
+
def test_init(self, mock_load_evaluation_model):
|
16 |
+
model_operations.EvaluationModel(self.model_path)
|
17 |
+
mock_load_evaluation_model.assert_called_once_with(self.model_path)
|
18 |
+
|
19 |
+
@patch("src.backend.model_operations.load_evaluation_model")
|
20 |
+
def test_evaluate_hallucination(self, mock_load_evaluation_model):
|
21 |
+
model = model_operations.EvaluationModel(self.model_path)
|
22 |
+
df = pd.DataFrame({'source': ['source1', 'source2'], 'summary': ['summary1', 'summary2']})
|
23 |
+
|
24 |
+
mock_load_evaluation_model.return_value.predict.return_value = [0.8, 0.2]
|
25 |
+
|
26 |
+
scores = model.evaluate_hallucination(df)
|
27 |
+
self.assertEqual(scores, [0.8, 0.2])
|
28 |
+
|
29 |
+
@patch("src.backend.model_operations.load_evaluation_model")
|
30 |
+
def test_evaluate_hallucination_exception(self, mock_load_evaluation_model):
|
31 |
+
model = model_operations.EvaluationModel(self.model_path)
|
32 |
+
df = pd.DataFrame({'source': ['source1', 'source2'], 'summary': ['summary1', 'summary2']})
|
33 |
+
|
34 |
+
mock_load_evaluation_model.return_value.predict.side_effect = Exception("Test exception")
|
35 |
+
|
36 |
+
with self.assertRaises(Exception):
|
37 |
+
scores = model.evaluate_hallucination(df)
|
38 |
+
|
39 |
+
@patch("src.backend.model_operations.load_evaluation_model")
|
40 |
+
def test_compute_accuracy(self, mock_load_evaluation_model):
|
41 |
+
model = model_operations.EvaluationModel(self.model_path)
|
42 |
+
model.scores = [0.8, 0.2]
|
43 |
+
|
44 |
+
accuracy = model.compute_accuracy()
|
45 |
+
expected_accuracy = 50.0
|
46 |
+
self.assertEqual(accuracy, expected_accuracy)
|
47 |
+
|
48 |
+
|
49 |
+
class TestLoadEvaluationModel(unittest.TestCase):
|
50 |
+
|
51 |
+
@patch("src.backend.model_operations.CrossEncoder")
|
52 |
+
def test_load_evaluation_model(self, mock_cross_encoder):
|
53 |
+
model_path = 'test_model_path'
|
54 |
+
model_operations.load_evaluation_model(model_path)
|
55 |
+
mock_cross_encoder.assert_called_once_with(model_path)
|
56 |
+
|
57 |
+
|
58 |
+
if __name__ == '__main__':
|
59 |
+
unittest.main()
|
tests/test_main_backend.py
ADDED
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import unittest
|
2 |
+
from unittest.mock import patch
|
3 |
+
|
4 |
+
import main_backend
|
5 |
+
import src.backend.manage_requests as manage_requests
|
6 |
+
|
7 |
+
|
8 |
+
class TestMainBackend(unittest.TestCase):
|
9 |
+
|
10 |
+
@patch('src.backend.manage_requests.check_completed_evals')
|
11 |
+
@patch('src.backend.manage_requests.get_eval_requests')
|
12 |
+
@patch('src.backend.sort_queue.sort_models_by_priority')
|
13 |
+
@patch('src.backend.manage_requests.set_eval_request')
|
14 |
+
@patch('src.backend.run_eval_suite.run_evaluation')
|
15 |
+
def test_run_auto_eval_with_pending_requests(self, mock_run_evaluation, mock_set_eval_request,
|
16 |
+
mock_sort_models_by_priority, mock_get_eval_requests,
|
17 |
+
mock_check_completed_evals):
|
18 |
+
mock_sort_models_by_priority.return_value = [manage_requests.EvalRequest(
|
19 |
+
model="test_model",
|
20 |
+
private=True,
|
21 |
+
status="PENDING",
|
22 |
+
json_filepath="test_filepath",
|
23 |
+
weight_type="test_weight_type",
|
24 |
+
precision="test_precision",
|
25 |
+
base_model="test_base_model",
|
26 |
+
revision="test_revision",
|
27 |
+
)]
|
28 |
+
|
29 |
+
main_backend.run_auto_eval()
|
30 |
+
|
31 |
+
# Assertions
|
32 |
+
mock_check_completed_evals.assert_called()
|
33 |
+
mock_get_eval_requests.assert_called()
|
34 |
+
mock_sort_models_by_priority.assert_called()
|
35 |
+
mock_set_eval_request.assert_called()
|
36 |
+
mock_run_evaluation.assert_called()
|
37 |
+
|
38 |
+
@patch('builtins.print')
|
39 |
+
@patch('src.backend.manage_requests.check_completed_evals')
|
40 |
+
@patch('src.backend.manage_requests.get_eval_requests')
|
41 |
+
def test_run_auto_eval_with_no_pending_requests(self, mock_get_eval_requests,
|
42 |
+
mock_check_completed_evals, mock_print):
|
43 |
+
mock_get_eval_requests.return_value = []
|
44 |
+
|
45 |
+
main_backend.run_auto_eval()
|
46 |
+
|
47 |
+
# Assertions
|
48 |
+
mock_check_completed_evals.assert_called()
|
49 |
+
mock_get_eval_requests.assert_called()
|
50 |
+
mock_print.assert_any_call("No eval requests found. Exiting.")
|
51 |
+
|
52 |
+
|
53 |
+
if __name__ == "__main__":
|
54 |
+
unittest.main()
|
tests/test_summary_generator.py
ADDED
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import unittest
|
2 |
+
from unittest.mock import patch
|
3 |
+
|
4 |
+
import pandas as pd
|
5 |
+
|
6 |
+
import src.backend.evaluate_model as evaluate_model
|
7 |
+
|
8 |
+
|
9 |
+
class TestSummaryGenerator(unittest.TestCase):
|
10 |
+
|
11 |
+
def setUp(self):
|
12 |
+
self.model_id = "test_model"
|
13 |
+
self.revision = "test_revision"
|
14 |
+
|
15 |
+
@patch("src.backend.model_operations.AutoTokenizer")
|
16 |
+
@patch("src.backend.model_operations.AutoModelForCausalLM")
|
17 |
+
def test_init(self, mock_model, mock_tokenizer):
|
18 |
+
evaluate_model.SummaryGenerator(self.model_id, self.revision)
|
19 |
+
mock_tokenizer.from_pretrained.assert_called_once_with(self.model_id,
|
20 |
+
self.revision)
|
21 |
+
mock_model.from_pretrained.assert_called_once_with(self.model_id,
|
22 |
+
self.revision)
|
23 |
+
|
24 |
+
@patch("src.backend.model_operations.nlp")
|
25 |
+
@patch("src.backend.model_operations.AutoTokenizer")
|
26 |
+
@patch("src.backend.model_operations.AutoModelForCausalLM")
|
27 |
+
def test_generate_summaries(self, mock_model, mock_tokenizer, mock_nlp):
|
28 |
+
df = pd.DataFrame({'text': ['text1', 'text2'],
|
29 |
+
'dataset': ['dataset1', 'dataset2']})
|
30 |
+
|
31 |
+
generator = evaluate_model.SummaryGenerator(self.model_id, self.revision)
|
32 |
+
generator.generate_summaries(df)
|
33 |
+
|
34 |
+
self.assertEqual(len(generator.summaries_df), len(df))
|
35 |
+
|
36 |
+
@patch("src.backend.model_operations.AutoTokenizer")
|
37 |
+
@patch("src.backend.model_operations.AutoModelForCausalLM")
|
38 |
+
def test_compute_avg_length(self, mock_model, mock_tokenizer):
|
39 |
+
generator = evaluate_model.SummaryGenerator(self.model_id, self.revision)
|
40 |
+
test_df = pd.DataFrame({'source': ['text'], 'summary': ['This is a test.'],
|
41 |
+
'dataset': ['dataset']})
|
42 |
+
generator.summaries_df = test_df
|
43 |
+
generator._compute_avg_length()
|
44 |
+
self.assertEqual(generator.avg_length, 4)
|
45 |
+
|
46 |
+
@patch("src.backend.model_operations.AutoTokenizer")
|
47 |
+
@patch("src.backend.model_operations.AutoModelForCausalLM")
|
48 |
+
def test_compute_answer_rate(self, mock_model, mock_tokenizer):
|
49 |
+
generator = evaluate_model.SummaryGenerator(self.model_id, self.revision)
|
50 |
+
test_df = pd.DataFrame({'source': ['text'], 'summary': ['This is a test.'],
|
51 |
+
'dataset': ['dataset']})
|
52 |
+
generator.summaries_df = test_df
|
53 |
+
generator._compute_answer_rate()
|
54 |
+
self.assertEqual(generator.answer_rate, 1)
|
55 |
+
|
56 |
+
@patch("src.backend.model_operations.AutoTokenizer")
|
57 |
+
@patch("src.backend.model_operations.AutoModelForCausalLM")
|
58 |
+
def test_error_rate(self, mock_model, mock_tokenizer):
|
59 |
+
generator = evaluate_model.SummaryGenerator(self.model_id, self.revision)
|
60 |
+
test_df = pd.DataFrame({'source': ['text'], 'summary': ['This is a test.'],
|
61 |
+
'dataset': ['dataset']})
|
62 |
+
generator.summaries_df = test_df
|
63 |
+
generator._compute_error_rate(0)
|
64 |
+
self.assertEqual(generator.error_rate, 0)
|
65 |
+
|
66 |
+
|
67 |
+
if __name__ == "__main__":
|
68 |
+
unittest.main()
|