"""
This program helps us explore models' responses to the benchmark. It is a web
app that displays the following:

1. A list of benchmark items loaded from puzzles_cleaned.csv. The list shows
   the columns ID, challenge, and answer.
2. When we select a puzzle from the list, we see the transcript, Explanation,
   and Editor's Note in textboxes. (Scrollable, since they can be long.)
3. The list in (1) also has a column for each model, with a marker indicating
   whether the model's response is correct or not. We load the model responses
   from results.duckdb. That file has a table called completions with the
   columns 'prompt_id', 'parent_dir', and 'completion'. The prompt_id can be
   joined with ID from puzzles_cleaned.csv. The parent_dir is the model name.
   The completion is the model response, which we compare with the answer from
   puzzles_cleaned.csv using the check_answer function registered on the
   DuckDB connection.
4. Finally, when an item is selected from the list, we can also select a model
   to see the completion from that model.

Note that not every model has a response for every puzzle.
"""

import gradio as gr
import pandas as pd
import numpy as np

from metrics import load_results, accuracy_by_model_and_time
import metrics


def get_model_response(prompt_id, model_name):
    # Parameterized query (rather than string interpolation) so quotes in a
    # model name cannot break the SQL.
    query = """
        SELECT completion FROM results.completions
        WHERE prompt_id = ? AND parent_dir = ?
        """
    response = conn.execute(query, [prompt_id, model_name]).fetchone()
    return response[0] if response else None


def display_puzzle(puzzle_id):
    query = """
        SELECT challenge, answer, transcript, Explanation, "Editor's Notes"
        FROM challenges
        WHERE ID = ?
        """
    puzzle = conn.execute(query, [puzzle_id]).fetchone()
    return puzzle if puzzle else (None, None, None, None, None)


def display_model_response(puzzle_id, model_name, show_thoughts):
    response = get_model_response(puzzle_id, model_name)
    if response is None:
        return "No response from this model."
    # Responses may contain a chain of thought terminated by a </think> tag;
    # everything after the last such tag is treated as the final answer.
    split_thoughts = response.split("</think>")
    if len(split_thoughts) > 1:
        if show_thoughts:
            return response.strip()
        else:
            return split_thoughts[-1].strip()
    else:
        return response.strip()


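# load_results() (from metrics) is expected to return a DuckDB connection with
# results.duckdb attached as the `results` schema, the puzzles loaded as the
# `challenges` table, and the SQL helpers used below (check_answer, wrap_text,
# clip_text) registered. This is inferred from how the connection is used here.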
conn = load_results()

model_names = [
    item[0]
    for item in conn.sql("SELECT DISTINCT parent_dir FROM results.completions").fetchall()
]
model_names.sort()

cleaned_model_names = [name.replace("completions-", "") for name in model_names]


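# build_table pivots the completions into one wide DataFrame: for every model it
# adds a <model>_answer column (the raw completion) and a <model>_ok column
# (the result of check_answer), using MAX(CASE WHEN ...) so that each puzzle
# occupies a single row even though completions are stored one row per model.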
def build_table():
    query = """
        SELECT c.ID, c.challenge, wrap_text(c.answer, 40) AS answer,
        """

    model_correct_columns = []
    for model in model_names:
        normalized_model_name = model.replace("-", "_")
        model_correct_columns.append(normalized_model_name + "_ok")
        query += f"""
            MAX(CASE WHEN r.parent_dir = '{model}' THEN r.completion ELSE NULL END) AS {normalized_model_name}_answer,
            MAX(CASE WHEN r.parent_dir = '{model}' THEN check_answer(r.completion, c.answer) ELSE NULL END) AS {normalized_model_name}_ok,
            """

    # The loop above leaves a trailing comma followed by whitespace, so strip
    # both before appending the final column and the rest of the query.
    query = query.rstrip().rstrip(',')
    query += """
        , clip_text(c.challenge, 40) AS challenge_clipped
    FROM challenges c
    LEFT JOIN results.completions r
        ON c.ID = r.prompt_id
    GROUP BY c.ID, c.challenge, c.answer
    """

    joined_df = conn.sql(query).fetchdf()

    # Replace the numeric correctness flags with display markers:
    # correct, incorrect, or no response from that model.
    for model in model_names:
        normalized_model_name = model.replace("-", "_")
        joined_df[normalized_model_name + '_ok'] = joined_df[normalized_model_name + '_ok'].apply(
            lambda x: "✅" if x == 1 else ("❌" if x == 0 else "➖")
        )

    return joined_df, model_correct_columns


joined_df, model_correct_columns = build_table()

relabelled_df = joined_df[['ID', 'challenge_clipped', 'answer', *model_correct_columns]].rename(columns={
    'ID': 'ID',
    'challenge_clipped': 'Challenge',
    'answer': 'Answer',
    **{model.replace("-", "_") + '_ok': model.replace("completions-", "") for model in model_names}
}).sort_values(by='ID')

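# Column index -> model name for the DataFrame above: the first three columns
# are ID, Challenge, and Answer, so the model columns start at index 3.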
model_columns = {
    index + 3: name for index, name in enumerate(model_names)
}

valid_model_indices = list(model_columns.keys())
default_model = model_columns[valid_model_indices[0]]


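# Plots accuracy per model per year. accuracy_by_model_and_time (from metrics)
# is assumed to return a relation with `model`, `year`, and `accuracy` columns,
# which is how it is consumed below.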
def summary_view():
    accuracy_over_time = accuracy_by_model_and_time(conn).to_df()
    accuracy_over_time["model"] = accuracy_over_time["model"].apply(
        lambda x: x.replace("completions-", "")
    )

    accuracy_over_time["year"] = accuracy_over_time["year"].astype(str)
    accuracy_over_time.rename(
        columns={"model": "Model", "year": "Year", "accuracy": "Accuracy"},
        inplace=True,
    )
    gr.LinePlot(
        accuracy_over_time,
        x="Year",
        y="Accuracy",
        color="Model",
        title="Model Accuracy Over Time",
        y_lim=[0, 1],
        x_label="Year",
        y_label="Accuracy",
    )


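# Compares DeepSeek R1 and Gemini 2 ("completions-r1" / "completions-gemini2"):
# cumulative accuracy as a function of the maximum completion length allowed.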
def r1_accuracy_by_completion_length():
    r1_completions = metrics.r1_accuracy_by_completion_length(conn, 'completions-r1').to_df()
    gemini2_completions = metrics.r1_accuracy_by_completion_length(conn, 'completions-gemini2').to_df()

    r1_completions["model"] = "R1"
    gemini2_completions["model"] = "Gemini2"
    r1_completions = pd.concat([r1_completions, gemini2_completions])

    # Lengths appear to be in characters; divide by ~3.2 (a rough
    # characters-per-token ratio) so the x-axis is approximately in tokens.
    r1_completions["length"] = r1_completions["length"] / 3.2

    with gr.Blocks(fill_height=True):
        gr.LinePlot(
            r1_completions,
            x="length",
            y="cumulative_accuracy",
            title="Accuracy by Maximum Completion Length",
            x_label="Max Response Length (tokens)",
            y_label="Accuracy (%)",
            x_lim=[0, 32_768],
            y_lim=[0, 1],
            color="model",
        )


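# The main tab: the puzzle table plus detail textboxes. Clicking a row loads
# that puzzle; clicking inside a model's column also selects that model.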
def all_challenges_view():
    puzzle_list = gr.DataFrame(
        value=relabelled_df,
        datatype=["number", "str", "markdown", *["str"] * len(model_correct_columns)],
    )
    with gr.Row(scale=2):
        model_name = gr.State(value=default_model)
        challenge_id = gr.State(value=0)
        show_thoughts = gr.State(value=False)
        with gr.Column():
            challenge = gr.Textbox(label="Challenge", interactive=False)
            answer = gr.Textbox(label="Answer", interactive=False)
            explanation = gr.Textbox(label="Explanation", interactive=False)
            editors_note = gr.Textbox(label="Editor's Note", interactive=False)

        def show_thoughts_toggle(x):
            return not x

        with gr.Column():
            show_thoughts_checkbox = gr.Checkbox(
                label="Show Thoughts", value=False
            ).change(
                fn=show_thoughts_toggle, inputs=[show_thoughts], outputs=[show_thoughts]
            )
            model_response = gr.Textbox(label="Model Response", interactive=False)
            transcript = gr.Textbox(label="Transcript", interactive=False)

    def select_table_item(evt: gr.SelectData):
        # If the click landed in one of the model columns, switch to that model;
        # otherwise keep the default. The clicked row index selects the puzzle.
        model_index = evt.index[1]
        challenge_id = evt.index[0]
        model_name = model_columns[model_index] if model_index in valid_model_indices else default_model
        return (model_name, challenge_id)

    def update_puzzle(challenge_id: str, model_name: str, show_thoughts: bool):
        return (*display_puzzle(challenge_id),
                gr.Textbox(
                    value=display_model_response(challenge_id, model_name, show_thoughts),
                    label=model_name,
                ))

    puzzle_list.select(
        fn=select_table_item,
        inputs=[],
        outputs=[model_name, challenge_id]
    )

    model_name.change(
        fn=update_puzzle,
        inputs=[challenge_id, model_name, show_thoughts],
        outputs=[challenge, answer, transcript, explanation, editors_note, model_response]
    )

    challenge_id.change(
        fn=update_puzzle,
        inputs=[challenge_id, model_name, show_thoughts],
        outputs=[challenge, answer, transcript, explanation, editors_note, model_response]
    )

    show_thoughts.change(
        fn=update_puzzle,
        inputs=[challenge_id, model_name, show_thoughts],
        outputs=[challenge, answer, transcript, explanation, editors_note, model_response]
    )


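# Assembles the tabbed interface and launches the app.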
def create_interface():
    with gr.Blocks() as demo:
        with gr.Tabs():
            with gr.TabItem("All Challenges"):
                all_challenges_view()
            with gr.TabItem("Accuracy by Model"):
                gr.DataFrame(metrics.accuracy_by_model(conn).to_df())
            with gr.TabItem("Accuracy Over Time"):
                summary_view()
            with gr.TabItem("DeepSeek R1 Analysis"):
                r1_accuracy_by_completion_length()
    demo.launch()


if __name__ == "__main__":
    create_interface()