Spaces:
Restarting
Restarting
Combine IFEval
Browse files
app.py
CHANGED
@@ -49,11 +49,18 @@ def get_leaderboard_df():
|
|
49 |
if task.lower() == "truthfulqa":
|
50 |
value = data["results"][first_result_key]["truthfulqa_mc2"]
|
51 |
df.loc[model_revision, task] = float(value)
|
52 |
-
# IFEval has several metrics but we report
|
53 |
elif task.lower() == "ifeval":
|
54 |
-
|
55 |
-
|
56 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
57 |
# MMLU has several metrics but we report just the average one
|
58 |
elif task.lower() == "mmlu":
|
59 |
value = [v["acc"] for k, v in data["results"].items() if "_average" in k.lower()][0]
|
|
|
49 |
if task.lower() == "truthfulqa":
|
50 |
value = data["results"][first_result_key]["truthfulqa_mc2"]
|
51 |
df.loc[model_revision, task] = float(value)
|
52 |
+
# IFEval has several metrics, but we report their average, as in the Llama 3 paper
|
53 |
elif task.lower() == "ifeval":
|
54 |
+
values = 0.0
|
55 |
+
for metric in [
|
56 |
+
"prompt_level_loose",
|
57 |
+
"prompt_level_strict",
|
58 |
+
"inst_level_strict",
|
59 |
+
"inst_level_loose",
|
60 |
+
]:
|
61 |
+
values += data["results"][first_result_key][f"{metric}_acc"]
|
62 |
+
value = values / 4
|
63 |
+
df.loc[model_revision, f"{task}"] = float(value)
|
64 |
# MMLU has several metrics but we report just the average one
|
65 |
elif task.lower() == "mmlu":
|
66 |
value = [v["acc"] for k, v in data["results"].items() if "_average" in k.lower()][0]
|