Update
Browse files- app.py +6 -52
- metrics.py +85 -0
- puzzles_cleaned.csv +2 -2
- results.duckdb +2 -2
app.py
CHANGED
@@ -18,52 +18,10 @@ app that displays the following:
|
|
18 |
|
19 |
Note that not every model has a response for every puzzle.
|
20 |
"""
|
21 |
-
import re
|
22 |
-
import duckdb
|
23 |
import gradio as gr
|
24 |
-
import
|
25 |
|
26 |
|
27 |
-
def split_into_words(text: str) -> list:
|
28 |
-
return re.findall(r'\b\w+\b', text.lower())
|
29 |
-
|
30 |
-
def all_words_match(completion: str, answer: str) -> bool:
|
31 |
-
answer_words = split_into_words(answer)
|
32 |
-
completion = completion.lower()
|
33 |
-
|
34 |
-
return all(word in completion for word in answer_words)
|
35 |
-
|
36 |
-
def answer_without_thoughts(completion: str) -> str:
|
37 |
-
if "<think>" not in completion[:200]:
|
38 |
-
return completion
|
39 |
-
|
40 |
-
chunks = completion.split("</think>")
|
41 |
-
if len(chunks) <= 1:
|
42 |
-
return ""
|
43 |
-
|
44 |
-
return chunks[-1].strip()
|
45 |
-
|
46 |
-
def check_answer(completion: str, answer: str) -> bool:
|
47 |
-
"""
|
48 |
-
Check if all words in the answer are in the completion, in the same order.
|
49 |
-
"""
|
50 |
-
completion_words = split_into_words(answer_without_thoughts(completion))
|
51 |
-
answer_words = split_into_words(answer)
|
52 |
-
indices = []
|
53 |
-
for word in answer_words:
|
54 |
-
if word in completion_words:
|
55 |
-
indices.append(completion_words.index(word))
|
56 |
-
else:
|
57 |
-
return False
|
58 |
-
return indices == sorted(indices) or indices == sorted(indices, reverse=True)
|
59 |
-
|
60 |
-
|
61 |
-
def clip_text(text: str, width: int) -> str:
|
62 |
-
return text if len(text) <= width else text[:width] + "..."
|
63 |
-
|
64 |
-
def wrap_text(text: str, width: int) -> str:
|
65 |
-
return textwrap.fill(text, width=width)
|
66 |
-
|
67 |
def get_model_response(prompt_id, model_name):
|
68 |
query = f"""
|
69 |
SELECT completion FROM results.completions
|
@@ -89,18 +47,14 @@ def display_model_response(puzzle_id, model_name):
|
|
89 |
return "From " + model_name + ":\n" + response if response else "No response from this model."
|
90 |
|
91 |
|
92 |
-
conn =
|
93 |
-
conn.execute("ATTACH DATABASE 'results.duckdb' AS results")
|
94 |
-
conn.execute("CREATE TABLE challenges as SELECT * FROM 'puzzles_cleaned.csv'")
|
95 |
-
conn.create_function("check_answer", check_answer)
|
96 |
-
conn.create_function("clip_text", clip_text)
|
97 |
-
conn.create_function("wrap_text", wrap_text)
|
98 |
|
99 |
# Get all unique model names
|
100 |
model_names = [item[0] for item in conn.sql("SELECT DISTINCT parent_dir FROM results.completions").fetchall()]
|
|
|
101 |
# Just for display.
|
102 |
cleaned_model_names = [name.replace("completions-", "") for name in model_names]
|
103 |
-
|
104 |
|
105 |
def build_table():
|
106 |
# Construct the query to create two columns for each model: MODEL_answer and MODEL_ok
|
@@ -141,11 +95,11 @@ def build_table():
|
|
141 |
joined_df, model_correct_columns = build_table()
|
142 |
|
143 |
relabelled_df = joined_df[['ID', 'challenge_clipped', 'answer', *model_correct_columns]].rename(columns={
|
144 |
-
'ID': '
|
145 |
'challenge_clipped': 'Challenge',
|
146 |
'answer': 'Answer',
|
147 |
**{model.replace("-", "_") + '_ok': model.replace("completions-", "") for model in model_names}
|
148 |
-
})
|
149 |
|
150 |
model_columns = {
|
151 |
index + 3: name for index, name in enumerate(model_names)
|
|
|
18 |
|
19 |
Note that not every model has a response for every puzzle.
|
20 |
"""
|
|
|
|
|
21 |
import gradio as gr
|
22 |
+
from metrics import load_results
|
23 |
|
24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
def get_model_response(prompt_id, model_name):
|
26 |
query = f"""
|
27 |
SELECT completion FROM results.completions
|
|
|
47 |
return "From " + model_name + ":\n" + response if response else "No response from this model."
|
48 |
|
49 |
|
50 |
+
conn = load_results()
|
|
|
|
|
|
|
|
|
|
|
51 |
|
52 |
# Get all unique model names
|
53 |
model_names = [item[0] for item in conn.sql("SELECT DISTINCT parent_dir FROM results.completions").fetchall()]
|
54 |
+
model_names.sort()
|
55 |
# Just for display.
|
56 |
cleaned_model_names = [name.replace("completions-", "") for name in model_names]
|
57 |
+
|
58 |
|
59 |
def build_table():
|
60 |
# Construct the query to create two columns for each model: MODEL_answer and MODEL_ok
|
|
|
95 |
joined_df, model_correct_columns = build_table()
|
96 |
|
97 |
relabelled_df = joined_df[['ID', 'challenge_clipped', 'answer', *model_correct_columns]].rename(columns={
|
98 |
+
'ID': 'ID',
|
99 |
'challenge_clipped': 'Challenge',
|
100 |
'answer': 'Answer',
|
101 |
**{model.replace("-", "_") + '_ok': model.replace("completions-", "") for model in model_names}
|
102 |
+
}).sort_values(by='ID')
|
103 |
|
104 |
model_columns = {
|
105 |
index + 3: name for index, name in enumerate(model_names)
|
metrics.py
ADDED
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import duckdb
|
3 |
+
import textwrap
|
4 |
+
|
5 |
+
def _parse_answer(text: str) -> str:
|
6 |
+
"""
|
7 |
+
Converts text to lowercase. Interprets "," and "-->" as separators for
|
8 |
+
elements of a set. Within each set, drops all non-alphanumeric characters
|
9 |
+
and returns that set.
|
10 |
+
|
11 |
+
Another way to describe this is that we interpret adjacent words as
|
12 |
+
phrases that must be present literally. However, comma and arrow separate
|
13 |
+
distinct phrases that may be present in any order. All other characters
|
14 |
+
are dropped.
|
15 |
+
"""
|
16 |
+
text = text.lower()
|
17 |
+
groups = re.split(r'-->|,', text)
|
18 |
+
return [" ".join(re.findall(r'\b\w+\b', group)) for group in groups]
|
19 |
+
|
20 |
+
def _answer_without_thoughts(completion: str) -> str:
|
21 |
+
if "<think>" not in completion[:200]:
|
22 |
+
return completion
|
23 |
+
|
24 |
+
chunks = completion.split("</think>")
|
25 |
+
if len(chunks) <= 1:
|
26 |
+
return ""
|
27 |
+
|
28 |
+
return chunks[-1].strip()
|
29 |
+
|
30 |
+
def _check_answer(completion: str, answer: str) -> bool:
    """
    Check that all the phrases that must appear in the answer appear in the
    completion. We ignore "thoughts", capitalization, and punctuation.
    """
    visible = _answer_without_thoughts(completion).lower()
    return all(phrase in visible for phrase in _parse_answer(answer))
|
39 |
+
|
40 |
+
|
41 |
+
def _clip_text(text: str, width: int) -> str:
|
42 |
+
return text if len(text) <= width else text[:width] + "..."
|
43 |
+
|
44 |
+
def _wrap_text(text: str, width: int) -> str:
|
45 |
+
return textwrap.fill(text, width=width)
|
46 |
+
|
47 |
+
def load_results():
    """
    Open an in-memory DuckDB connection with the puzzle data loaded.

    Attaches ``results.duckdb`` as the ``results`` database, loads
    ``puzzles_cleaned.csv`` into a ``challenges`` table, and registers the
    answer-checking and text-formatting helpers as SQL functions.

    :return: the configured DuckDB connection.
    """
    conn = duckdb.connect(":memory:")
    conn.execute("ATTACH DATABASE 'results.duckdb' AS results")
    conn.execute("CREATE TABLE challenges as SELECT * FROM 'puzzles_cleaned.csv'")
    # Expose the Python helpers to SQL queries.
    for sql_name, fn in (
        ("check_answer", _check_answer),
        ("clip_text", _clip_text),
        ("wrap_text", _wrap_text),
    ):
        conn.create_function(sql_name, fn)
    return conn
|
55 |
+
|
56 |
+
def accuracy_by_model(conn):
    """
    Compute, print, and return per-model accuracy over all puzzles.

    For every model (``parent_dir`` in ``results.completions``), joins each
    completion to its challenge and counts how many pass ``check_answer``,
    reporting the fraction correct rounded to two decimal places.

    :param conn: a DuckDB connection produced by :func:`load_results` (the
        ``check_answer`` SQL function must be registered on it).
    :return: the DuckDB relation with columns (model, total, correct,
        accuracy), so callers can post-process it instead of only reading
        stdout. (Previously this function printed and returned ``None``;
        returning the relation is backward-compatible.)
    """
    model_accuracies = conn.sql("""
        WITH AnswerCheck AS (
            SELECT
                results.parent_dir AS model,
                COUNT(*) AS total,
                SUM(CAST(check_answer(results.completion, challenges.answer) AS INTEGER)) AS correct
            FROM
                results.completions results
            JOIN
                challenges challenges
            ON
                results.prompt_id = challenges.ID
            GROUP BY
                results.parent_dir
        )
        SELECT
            model,
            total,
            correct,
            ROUND(correct / total, 2) AS accuracy
        FROM
            AnswerCheck
    """)

    print(model_accuracies)
    return model_accuracies
|
82 |
+
|
83 |
+
def main() -> None:
    """CLI entry point: load the result database and print per-model accuracy."""
    accuracy_by_model(load_results())


if __name__ == "__main__":
    main()
|
puzzles_cleaned.csv
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7efd3a2897270124ecc8a299b96d14fb54600f3c0faf27b790d8b0312720f3cd
|
3 |
+
size 1132332
|
results.duckdb
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fa7c7911a1ecf7fe4223995e3d393dd78cf8d4023409197854bf471fd8ab7c48
|
3 |
+
size 32518144
|