""" This program helps us explore model's responses to the benchmark. It is a web app that displays the following: 1. A list of benchmark items loaded from puzzles_cleaned.csv. The list shows the columns ID, challenge, and answer. 2. When we select a puzzle from the list, we see the transcript, Explanation, and Editor's Note in textboxes. (Scrollable since they can be long.) 3. The list in (1) also has a column for each model, with checkboxes indicating whether the model's response is correct or not. We load the model responses from results.duckdb. That file has a table called completions with columns 'prompt_id', 'parent_dir', and 'completion'. The prompt_id can be joined with ID from puzzles_cleaned.csv. The parent_dir is the model name. The completion is the model response, which we compare with the answer from puzzles_cleaned.csv using the function check_answer defined below. 4. Finally, when an item is selected from the list, we get a dropdown that lets us select a model to see the completion from that model. Note that not every model has a response for every puzzle. """ import gradio as gr import pandas as pd import numpy as np from metrics import load_results, accuracy_by_model_and_time import metrics def get_model_response(prompt_id, model_name): query = f""" SELECT completion FROM results.completions WHERE prompt_id = {prompt_id} AND parent_dir = '{model_name}' """ response = conn.sql(query).fetchone() return response[0] if response else None def display_puzzle(puzzle_id): query = f""" SELECT challenge, answer, transcript, Explanation, "Editor's Notes" FROM challenges WHERE ID = {puzzle_id} """ puzzle = conn.sql(query).fetchone() return puzzle if puzzle else (None, None,None, None, None) def display_model_response(puzzle_id, model_name, show_thoughts): response = get_model_response(puzzle_id, model_name) if response is None: return "No response from this model." split_thoughts = response.split("") if len(split_thoughts) > 1: if show_thoughts: return response.strip() else: return split_thoughts[-1].strip() else: return response.strip() conn = load_results() # Get all unique model names model_names = [item[0] for item in conn.sql("SELECT DISTINCT parent_dir FROM results.completions").fetchall()] model_names.sort() # Just for display. 
# Just for display.
cleaned_model_names = [name.replace("completions-", "") for name in model_names]


def build_table():
    # Construct the query to create two columns for each model: MODEL_answer
    # and MODEL_ok.
    query = """
        SELECT c.ID, c.challenge, wrap_text(c.answer, 40) AS answer,
    """
    model_correct_columns = []
    for model in model_names:
        normalized_model_name = model.replace("-", "_")
        model_correct_columns.append(normalized_model_name + "_ok")
        query += f"""
            MAX(CASE WHEN r.parent_dir = '{model}' THEN r.completion ELSE NULL END) AS {normalized_model_name}_answer,
            MAX(CASE WHEN r.parent_dir = '{model}' THEN check_answer(r.completion, c.answer) ELSE NULL END) AS {normalized_model_name}_ok,
        """
    # The loop above leaves a trailing comma, so the clipped challenge text can
    # be appended as the final column of the SELECT list.
    query += """
        clip_text(c.challenge, 40) AS challenge_clipped
        FROM challenges c
        LEFT JOIN results.completions r ON c.ID = r.prompt_id
        GROUP BY c.ID, c.challenge, c.answer
    """
    joined_df = conn.sql(query).fetchdf()
    # Transform the model_correct columns to use emojis.
    for model in model_names:
        normalized_model_name = model.replace("-", "_")
        joined_df[normalized_model_name + '_ok'] = joined_df[normalized_model_name + '_ok'].apply(
            lambda x: "✅" if x == 1 else ("❌" if x == 0 else "❓")
        )
    return joined_df, model_correct_columns


joined_df, model_correct_columns = build_table()

relabelled_df = joined_df[['ID', 'challenge_clipped', 'answer', *model_correct_columns]].rename(columns={
    'ID': 'ID',
    'challenge_clipped': 'Challenge',
    'answer': 'Answer',
    **{model.replace("-", "_") + '_ok': model.replace("completions-", "") for model in model_names}
}).sort_values(by='ID')

# Columns 0-2 of the displayed table are ID, Challenge, and Answer; the model
# columns start at index 3.
model_columns = {index + 3: name for index, name in enumerate(model_names)}
valid_model_indices = list(model_columns.keys())
default_model = model_columns[valid_model_indices[0]]


def summary_view():
    accuracy_over_time = accuracy_by_model_and_time(conn).to_df()
    accuracy_over_time["model"] = accuracy_over_time["model"].apply(
        lambda x: x.replace("completions-", "")
    )
    # This hack is so that Gradio doesn't render the year 2020 as "2,020.0".
    accuracy_over_time["year"] = accuracy_over_time["year"].astype(str)
    accuracy_over_time.rename(
        columns={"model": "Model", "year": "Year", "accuracy": "Accuracy"},
        inplace=True,
    )
    gr.LinePlot(
        accuracy_over_time,
        x="Year",
        y="Accuracy",
        color="Model",
        title="Model Accuracy Over Time",
        y_lim=[0, 1],
        x_label="Year",
        y_label="Accuracy",
    )


def r1_accuracy_by_completion_length():
    r1_completions = metrics.r1_accuracy_by_completion_length(conn, 'completions-r1').to_df()
    gemini2_completions = metrics.r1_accuracy_by_completion_length(conn, 'completions-gemini2').to_df()
    r1_completions["model"] = "R1"
    gemini2_completions["model"] = "Gemini2"
    r1_completions = pd.concat([r1_completions, gemini2_completions])
    # Convert character counts to an approximate token count (roughly 3.2
    # characters per token).
    r1_completions["length"] = r1_completions["length"] / 3.2
    with gr.Blocks(fill_height=True):
        gr.LinePlot(
            r1_completions,
            x="length",
            y="cumulative_accuracy",
            title="Accuracy by Maximum Completion Length",
            x_label="Max Response Length (tokens)",
            y_label="Accuracy (%)",
            x_lim=[0, 32_768],
            y_lim=[0, 1],
            color="model",
        )


def all_challenges_view():
    # Using "markdown" as the datatype makes Gradio interpret newlines.
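    # Selection flow (sketch of the wiring below): clicking a cell in the
    # DataFrame fires puzzle_list.select, which stores the clicked row index in
    # the challenge_id State and the model for the clicked column (or a default)
    # in the model_name State; their .change events then call update_puzzle to
    # refresh the textboxes.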
    puzzle_list = gr.DataFrame(
        value=relabelled_df,
        datatype=["number", "str", "markdown", *["str"] * len(model_correct_columns)],
        # headers=["ID", "Challenge", "Answer", *cleaned_model_names],
    )
    with gr.Row(scale=2):
        model_name = gr.State(value=default_model)
        challenge_id = gr.State(value=0)
        show_thoughts = gr.State(value=False)
        with gr.Column():
            challenge = gr.Textbox(label="Challenge", interactive=False)
            answer = gr.Textbox(label="Answer", interactive=False)
            explanation = gr.Textbox(label="Explanation", interactive=False)
            editors_note = gr.Textbox(label="Editor's Note", interactive=False)

        def show_thoughts_toggle(x):
            return not x

        with gr.Column():
            show_thoughts_checkbox = gr.Checkbox(
                label="Show Thoughts", value=False
            ).change(
                fn=show_thoughts_toggle,
                inputs=[show_thoughts],
                outputs=[show_thoughts],
            )
            model_response = gr.Textbox(label="Model Response", interactive=False)
            transcript = gr.Textbox(label="Transcript", interactive=False)

    def select_table_item(evt: gr.SelectData):
        model_index = evt.index[1]
        challenge_id = evt.index[0]
        model_name = model_columns[model_index] if model_index in valid_model_indices else default_model
        return (model_name, challenge_id)

    def update_puzzle(challenge_id: int, model_name: str, show_thoughts: bool):
        return (*display_puzzle(challenge_id), gr.Textbox(
            value=display_model_response(challenge_id, model_name, show_thoughts),
            label=model_name,
        ))

    puzzle_list.select(
        fn=select_table_item,
        inputs=[],
        outputs=[model_name, challenge_id]
    )
    model_name.change(
        fn=update_puzzle,
        inputs=[challenge_id, model_name, show_thoughts],
        outputs=[challenge, answer, transcript, explanation, editors_note, model_response]
    )
    challenge_id.change(
        fn=update_puzzle,
        inputs=[challenge_id, model_name, show_thoughts],
        outputs=[challenge, answer, transcript, explanation, editors_note, model_response]
    )
    show_thoughts.change(
        fn=update_puzzle,
        inputs=[challenge_id, model_name, show_thoughts],
        outputs=[challenge, answer, transcript, explanation, editors_note, model_response]
    )


def create_interface():
    with gr.Blocks() as demo:
        with gr.Tabs():
            with gr.TabItem("All Challenges"):
                all_challenges_view()
            with gr.TabItem("Accuracy by Model"):
                gr.DataFrame(metrics.accuracy_by_model(conn).to_df())
            with gr.TabItem("Accuracy Over Time"):
                summary_view()
            with gr.TabItem("DeepSeek R1 Analysis"):
                r1_accuracy_by_completion_length()
    demo.launch()


if __name__ == "__main__":
    create_interface()
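# Usage sketch: with puzzles_cleaned.csv and results.duckdb reachable by
# metrics.load_results(), run this module directly (the file name below is
# hypothetical); Gradio serves the UI at http://127.0.0.1:7860 by default.
#
#   python explore.py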