Spaces:

nuprl
/

puzzle-reasoning-challenge

Running

App Files Community

arjunguha committed on 7 days ago

Commit

2b8f77d

unverified ·

1 Parent(s): 92fe871

Update

Browse files

Files changed (4) hide show

app.py +19 -11
metrics.py +21 -7
puzzles_cleaned.csv +2 -2
results.duckdb +2 -2

app.py CHANGED Viewed

@@ -136,19 +136,27 @@ def summary_view():
 def r1_accuracy_by_completion_length():
-    r1_completions = metrics.r1_accuracy_by_completion_length(conn).to_df()
     r1_completions["length"] = r1_completions["length"] / 3.2
-    r1_completions.rename(columns={"length": "Response Length", "cumulative_correct": "Cumulative Correct"}, inplace=True)
-    gr.LinePlot(
-        r1_completions,
-        x="Response Length",
-        y="Cumulative Correct",
-        title="R1 Accuracy by Completion Length",
-        x_label="Max Response Length (tokens)",
-        y_label="# Correct Answers",
-        x_lim=[0, 32_768],
-    )
 def all_challenges_view():
     # Using "markdown" as the datatype makes Gradio interpret newlines.

 def r1_accuracy_by_completion_length():
+    r1_completions = metrics.r1_accuracy_by_completion_length(conn,'completions-r1').to_df()
+    gemini2_completions = metrics.r1_accuracy_by_completion_length(conn,'completions-gemini2').to_df()
+    r1_completions["model"] = "R1"
+    gemini2_completions["model"] = "Gemini2"
+    r1_completions = pd.concat([r1_completions, gemini2_completions])
     r1_completions["length"] = r1_completions["length"] / 3.2
+    with gr.Blocks(fill_height=True):
+        gr.LinePlot(
+            r1_completions,
+            x="length",
+            y="cumulative_accuracy",
+            title="Accuracy by Maximum Completion Length",
+            x_label="Max Response Length (tokens)",
+            y_label="Accuracy (%)",
+            x_lim=[0, 32_768],
+            y_lim=[0, 1],
+            color="model",
+        )
 def all_challenges_view():
     # Using "markdown" as the datatype makes Gradio interpret newlines.

metrics.py CHANGED Viewed

@@ -41,9 +41,11 @@ def _check_answer(completion: str, answer: str) -> bool:
     completion. We ignore "thoughts", capitalization, and punctuation.
     """
     completion = _answer_without_thoughts(completion).lower()
     alternative_answers = _parse_answer(answer)
     for answer_phrases in alternative_answers:
-        if all(phrase in completion for phrase in answer_phrases):
             return True
     return False
@@ -63,27 +65,39 @@ def load_results():
     conn.create_function("wrap_text", _wrap_text)
     return conn
-def r1_accuracy_by_completion_length(conn):
     """
     For the responses from the completions-r1 model:
     1. We calculate completion length and correctness for each problem.
     2. We sort by length.
     3. We compute cumulative number of correct responses.
     """
-    # Use CTEs
-    r1_completions = conn.sql("""
         WITH LengthsAndCorrectness AS (
             SELECT
                 LENGTH(results.completion) AS length,
                 CAST(check_answer(results.completion, challenges.answer) AS INT32) AS correct
             FROM results.completions results JOIN  challenges
             ON results.prompt_id = challenges.ID
-            WHERE results.parent_dir = 'completions-r1'
         )
         SELECT
             length,
-            COUNT(*) OVER (ORDER BY length) AS cumulative_correct
-        FROM LengthsAndCorrectness
     """)
     return r1_completions

     completion. We ignore "thoughts", capitalization, and punctuation.
     """
     completion = _answer_without_thoughts(completion).lower()
+    completion  = re.sub(r'[^\w\s]', ' ', completion) # this replaces punctuations with space, aligning with the _parse_answer function's ' '.join
     alternative_answers = _parse_answer(answer)
     for answer_phrases in alternative_answers:
+        # if all(phrase in completion for phrase in answer_phrases):
+        if all(re.search(rf'\b{re.escape(phrase)}\b', completion) for phrase in answer_phrases):
             return True
     return False
     conn.create_function("wrap_text", _wrap_text)
     return conn
+def r1_accuracy_by_completion_length(conn,model_name):
     """
     For the responses from the completions-r1 model:
     1. We calculate completion length and correctness for each problem.
     2. We sort by length.
     3. We compute cumulative number of correct responses.
     """
+    r1_completions = conn.sql(f"""
         WITH LengthsAndCorrectness AS (
             SELECT
                 LENGTH(results.completion) AS length,
                 CAST(check_answer(results.completion, challenges.answer) AS INT32) AS correct
             FROM results.completions results JOIN  challenges
             ON results.prompt_id = challenges.ID
+            WHERE results.parent_dir = '{model_name}'
+        ),
+        TotalItems AS (
+            SELECT COUNT(*) as total_count
+            FROM LengthsAndCorrectness
+        ),
+        CumulativeCorrect AS (
+            SELECT
+                length,
+                SUM(correct) OVER (ORDER BY length) as cumulative_correct,
+            FROM LengthsAndCorrectness
         )
         SELECT
             length,
+            cumulative_correct,
+            CAST(cumulative_correct AS FLOAT) / total_count AS cumulative_accuracy
+        FROM CumulativeCorrect, TotalItems
+        ORDER BY length
     """)
     return r1_completions

puzzles_cleaned.csv CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:257753179c4b2a5be8716ac03da2617c48d9037290cc39b4896ad55304e13337
-size 1119397

 version https://git-lfs.github.com/spec/v1
+oid sha256:a008da9d40adfebfa2e0fafa504aba03125c9064efd77b45dbfcb68655eb878c
+size 1116075

results.duckdb CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:69d8936736a84631569786c1e6e998c44b60b77fb61eabf7eb61758674ba452d
-size 81801216

 version https://git-lfs.github.com/spec/v1
+oid sha256:0ddd1f445fd5b8c756511052e0508a98b84bea161775858c9f372a374a4d5ba9
+size 136327168