arjunguha committed
Commit 2b8f77d · unverified · 1 parent: 92fe871

Files changed (4)
  1. app.py +19 -11
  2. metrics.py +21 -7
  3. puzzles_cleaned.csv +2 -2
  4. results.duckdb +2 -2
app.py CHANGED
@@ -136,19 +136,27 @@ def summary_view():
 
 
 def r1_accuracy_by_completion_length():
-    r1_completions = metrics.r1_accuracy_by_completion_length(conn).to_df()
+    r1_completions = metrics.r1_accuracy_by_completion_length(conn, 'completions-r1').to_df()
+    gemini2_completions = metrics.r1_accuracy_by_completion_length(conn, 'completions-gemini2').to_df()
+
+    r1_completions["model"] = "R1"
+    gemini2_completions["model"] = "Gemini2"
+    r1_completions = pd.concat([r1_completions, gemini2_completions])
+
     r1_completions["length"] = r1_completions["length"] / 3.2
-    r1_completions.rename(columns={"length": "Response Length", "cumulative_correct": "Cumulative Correct"}, inplace=True)
 
-    gr.LinePlot(
-        r1_completions,
-        x="Response Length",
-        y="Cumulative Correct",
-        title="R1 Accuracy by Completion Length",
-        x_label="Max Response Length (tokens)",
-        y_label="# Correct Answers",
-        x_lim=[0, 32_768],
-    )
+    with gr.Blocks(fill_height=True):
+        gr.LinePlot(
+            r1_completions,
+            x="length",
+            y="cumulative_accuracy",
+            title="Accuracy by Maximum Completion Length",
+            x_label="Max Response Length (tokens)",
+            y_label="Accuracy (%)",
+            x_lim=[0, 32_768],
+            y_lim=[0, 1],
+            color="model",
+        )
 
 def all_challenges_view():
     # Using "markdown" as the datatype makes Gradio interpret newlines.
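In the new app.py code, the two per-model frames are stacked into one long-format table so a single `gr.LinePlot` can draw one line per model via `color="model"`; the division by 3.2 appears to convert character counts into an approximate token count. A minimal sketch of the long format the plot consumes (values invented for illustration):

```python
import pandas as pd

# Hypothetical rows: one (length, cumulative_accuracy) series per model,
# stacked vertically so that color="model" can split the lines.
df = pd.concat([
    pd.DataFrame({"length": [512, 1024], "cumulative_accuracy": [0.10, 0.25], "model": "R1"}),
    pd.DataFrame({"length": [512, 1024], "cumulative_accuracy": [0.05, 0.15], "model": "Gemini2"}),
])
print(df)
```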
metrics.py CHANGED
@@ -41,9 +41,11 @@ def _check_answer(completion: str, answer: str) -> bool:
     completion. We ignore "thoughts", capitalization, and punctuation.
     """
     completion = _answer_without_thoughts(completion).lower()
+    completion = re.sub(r'[^\w\s]', ' ', completion)  # Replace punctuation with spaces, matching the ' '.join normalization in _parse_answer.
     alternative_answers = _parse_answer(answer)
     for answer_phrases in alternative_answers:
-        if all(phrase in completion for phrase in answer_phrases):
+        # if all(phrase in completion for phrase in answer_phrases):
+        if all(re.search(rf'\b{re.escape(phrase)}\b', completion) for phrase in answer_phrases):
             return True
     return False
 
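The `_check_answer` change replaces substring containment with word-boundary matching, which avoids false positives where an answer phrase occurs inside a longer word. A quick illustration with made-up strings:

```python
import re

completion = "we concatenate the strings"
phrase = "cat"

print(phrase in completion)                                      # True: matches inside "concatenate"
print(bool(re.search(rf'\b{re.escape(phrase)}\b', completion)))  # False: word boundaries reject it
```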
@@ -63,27 +65,39 @@ def load_results():
     conn.create_function("wrap_text", _wrap_text)
     return conn
 
-def r1_accuracy_by_completion_length(conn):
+def r1_accuracy_by_completion_length(conn, model_name):
     """
     For the responses from the completions-r1 model:
     1. We calculate completion length and correctness for each problem.
     2. We sort by length.
     3. We compute cumulative number of correct responses.
     """
-    # Use CTEs
-    r1_completions = conn.sql("""
+    r1_completions = conn.sql(f"""
     WITH LengthsAndCorrectness AS (
         SELECT
             LENGTH(results.completion) AS length,
             CAST(check_answer(results.completion, challenges.answer) AS INT32) AS correct
         FROM results.completions results JOIN challenges
         ON results.prompt_id = challenges.ID
-        WHERE results.parent_dir = 'completions-r1'
+        WHERE results.parent_dir = '{model_name}'
+    ),
+    TotalItems AS (
+        SELECT COUNT(*) AS total_count
+        FROM LengthsAndCorrectness
+    ),
+    CumulativeCorrect AS (
+        SELECT
+            length,
+            SUM(correct) OVER (ORDER BY length) AS cumulative_correct
+        FROM LengthsAndCorrectness
     )
+
     SELECT
         length,
-        COUNT(*) OVER (ORDER BY length) AS cumulative_correct
-    FROM LengthsAndCorrectness
+        cumulative_correct,
+        CAST(cumulative_correct AS FLOAT) / total_count AS cumulative_accuracy
+    FROM CumulativeCorrect, TotalItems
+    ORDER BY length
     """)
     return r1_completions
 
puzzles_cleaned.csv CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:257753179c4b2a5be8716ac03da2617c48d9037290cc39b4896ad55304e13337
-size 1119397
+oid sha256:a008da9d40adfebfa2e0fafa504aba03125c9064efd77b45dbfcb68655eb878c
+size 1116075
results.duckdb CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:69d8936736a84631569786c1e6e998c44b60b77fb61eabf7eb61758674ba452d
-size 81801216
+oid sha256:0ddd1f445fd5b8c756511052e0508a98b84bea161775858c9f372a374a4d5ba9
+size 136327168