lewtun HF staff committed on
Commit
f7ee73a
·
1 Parent(s): c6428d5

Combine IFEval

Browse files
Files changed (1) hide show
  1. app.py +11 -4
app.py CHANGED
@@ -49,11 +49,18 @@ def get_leaderboard_df():
49
  if task.lower() == "truthfulqa":
50
  value = data["results"][first_result_key]["truthfulqa_mc2"]
51
  df.loc[model_revision, task] = float(value)
52
- # IFEval has several metrics but we report just the prompt-loose-acc one
53
  elif task.lower() == "ifeval":
54
- for metric in ["prompt_level_loose", "prompt_level_strict"]:
55
- value = data["results"][first_result_key][f"{metric}_acc"]
56
- df.loc[model_revision, f"{task}_{metric}"] = float(value)
 
 
 
 
 
 
 
57
  # MMLU has several metrics but we report just the average one
58
  elif task.lower() == "mmlu":
59
  value = [v["acc"] for k, v in data["results"].items() if "_average" in k.lower()][0]
 
49
  if task.lower() == "truthfulqa":
50
  value = data["results"][first_result_key]["truthfulqa_mc2"]
51
  df.loc[model_revision, task] = float(value)
52
+ # IFEval has several metrics but we report the average like Llama3 paper
53
  elif task.lower() == "ifeval":
54
+ values = 0.0
55
+ for metric in [
56
+ "prompt_level_loose",
57
+ "prompt_level_strict",
58
+ "inst_level_strict",
59
+ "inst_level_loose",
60
+ ]:
61
+ values += data["results"][first_result_key][f"{metric}_acc"]
62
+ value = values / 4
63
+ df.loc[model_revision, f"{task}"] = float(value)
64
  # MMLU has several metrics but we report just the average one
65
  elif task.lower() == "mmlu":
66
  value = [v["acc"] for k, v in data["results"].items() if "_average" in k.lower()][0]