Spaces: Running
Pratik Bhavsar committed · Commit 10ad72f · 1 Parent(s): 19b159e
working draft
Browse files
- app.py +79 -267
- data_loader.py +78 -0
- utils.py +70 -0
- visualization.py +221 -0
app.py
CHANGED
@@ -1,281 +1,93 @@
 import gradio as gr
-import
-import

-        values = [model_data[m] for m in datasets]
-        values.append(values[0])
-        datasets_plot = datasets + [datasets[0]]

-                r=values,
-                theta=datasets_plot,
-                fill="toself",
-                fillcolor=colors[idx % len(colors)],
-                line=dict(color=line_colors[idx % len(line_colors)], width=2),
-                name=model_name,
-                text=[f"{val:.3f}" for val in values],
-                textposition="middle right",
-                mode="lines+markers+text",
-            )
-        )

-                visible=True, range=[0, 1], showline=False, tickfont=dict(size=12)
-            ),
-            angularaxis=dict(
-                tickfont=dict(size=13, family="Arial"),
-                rotation=90,
-                direction="clockwise",
-            ),
-        showlegend=True,
-        title=dict(
-            text="Model Comparison",
-            x=0.5,
-            y=0.95,
-            font=dict(size=24, family="Arial", color="#1F2937"),
-        ),
-        paper_bgcolor="white",
-        plot_bgcolor="white",
-        height=800,
-        width=1000,
-    )
-
-    return fig
-
-
-def model_info_tab(model_names=None):
-    if model_names is None or len(model_names) == 0:
-        model_names = [df.sort_values("Model Avg", ascending=False).iloc[0]["Model"]]
-
-    filtered_df = df[df["Model"].isin(model_names)]
-    radar_chart = create_radar_plot(df, model_names)
-    info_html = filtered_df[
-        [
-            "Model",
-            "Model Type",
-            "Model Avg",
-            "Input cost per million token",
-            "Output cost per million token",
-            "single turn perf",
-            "multi turn perf",
-        ]
-    ].to_html(index=False)
-
-    return info_html, radar_chart
-
-
-def get_performance_chart(df):
-    df_sorted = df.sort_values("Model Avg", ascending=True)
-    colors = {"Private": "#4F46E5", "Open source": "#16A34A"}
-
-    fig, ax = plt.subplots(figsize=(16, 10))
-    bars = ax.barh(
-        np.arange(len(df_sorted)),
-        df_sorted["Model Avg"],
-        height=0.4,
-        color=[colors[t] for t in df_sorted["Model Type"]],
-    )
-
-    ax.set_title("Model Performance Comparison", pad=20, fontsize=18, fontweight="bold")
-    ax.set_xlabel("Average Score", fontsize=12, labelpad=10)
-    ax.set_xlim(0.6, 1.0)
-    ax.set_yticks(np.arange(len(df_sorted)))
-    ax.set_yticklabels(df_sorted["Model"], fontsize=10)
-
-    for i, v in enumerate(df_sorted["Model Avg"]):
-        ax.text(v + 0.005, i, f"{v:.3f}", va="center", fontsize=10)
-
-    ax.grid(True, axis="x", linestyle="--", alpha=0.2)
-    ax.spines[["top", "right"]].set_visible(False)
-
-    legend_elements = [
-        plt.Rectangle((0, 0), 1, 1, facecolor=color, label=label)
-        for label, color in colors.items()
-    ]
-    ax.legend(handles=legend_elements, title="Model Type", loc="lower right")
-
-    plt.tight_layout()
-    return fig
-
-def get_performance_cost_chart(df):
-    plt.figure(figsize=(12, 8), dpi=300)
-    plt.grid(True, linestyle="--", alpha=0.2)
-
-    colors = {"Private": "#4F46E5", "Open source": "#16A34A"}
-    performance_colors = ["#DCFCE7", "#FEF9C3", "#FEE2E2"]
-
-    for _, row in df.iterrows():
-        color = colors[row["Model Type"]]
-        size = 100 if row["Model Avg"] > 0.85 else 80
-        edge_color = "#3730A3" if row["Model Type"] == "Private" else "#166534"
-
-        plt.scatter(
-            row["Input cost per million token"],
-            row["Model Avg"] * 100,
-            c=color,
-            s=size,
-            alpha=0.9,
-            edgecolor=edge_color,
-            linewidth=1,
-        )

-            fontsize=9,
-            bbox=dict(facecolor="white", edgecolor="none", alpha=0.7),
-        )

-    plt.xlabel("Cost per Million Tokens ($)", fontsize=12, weight="bold")
-    plt.ylabel("Model Performance Score", fontsize=12, weight="bold")
-    plt.ylim(60, 95)
-
-    legend_elements = [
-        plt.scatter([], [], c=color, label=label, s=80)
-        for label, color in colors.items()
-    ]
-    plt.legend(handles=legend_elements, loc="upper right")
-    plt.title("AI Language Model Performance vs. Cost", fontsize=14, weight="bold")
-
-    for y1, y2, color in zip([85, 75, 60], [95, 85, 75], performance_colors):
-        plt.axhspan(y1, y2, alpha=0.2, color=color)
-
-    plt.tight_layout()
-    return plt.gcf()
-
-
-def filter_leaderboard(model_type, category):
-    filtered_df = df.copy()
-    if model_type != "All":
-        filtered_df = filtered_df[filtered_df["Model Type"].str.strip() == model_type]
-
-    dataset_columns = categories.get(category, ["Model Avg"])
-    avg_score = filtered_df[dataset_columns].mean(axis=1)
-    filtered_df["Category Score"] = avg_score
-
-    filtered_df = filtered_df.sort_values(by="Category Score", ascending=False)
-    filtered_df["Rank"] = range(1, len(filtered_df) + 1)
-
-    perf_chart = get_performance_chart(filtered_df)
-    cost_chart = get_performance_cost_chart(filtered_df)
-
-    display_columns = [
-        "Rank",
-        "Model",
-        "Model Type",
-        "Input cost per million token",
-        "Output cost per million token",
-        "Category Score",
-    ]
-
-    table_html = filtered_df[display_columns].to_html(index=False)
-    return table_html, perf_chart, cost_chart
-
-
-with gr.Blocks(theme=gr.themes.Soft()) as app:
-    with gr.Tabs():
-        with gr.Tab("Leaderboard"):
-            with gr.Row():
-                with gr.Column(scale=1):
-                    gr.Markdown("# Filters")
-                    model_type = gr.Dropdown(
-                        choices=["All"] + df["Model Type"].unique().tolist(),
-                        value="All",
-                        label="Model Type",
-                    )
-                    category = gr.Dropdown(
-                        choices=list(categories.keys()),
-                        value=list(categories.keys())[0],
-                        label="Category",
-                    )
-
-                with gr.Column(scale=4):
-                    gr.Markdown("# Agent Leaderboard")
-                    output = gr.HTML()
-                    plot1 = gr.Plot()
-                    plot2 = gr.Plot()
-
-            for input_comp in [model_type, category]:
-                input_comp.change(
-                    fn=filter_leaderboard,
-                    inputs=[model_type, category],
-                    outputs=[output, plot1, plot2],
-                )
-
-        with gr.Tab("Model Performance"):
-            with gr.Row():
-                with gr.Column(scale=1):
-                    model_selector = gr.Dropdown(
-                        choices=df["Model"].unique().tolist(),
-                        value=df.sort_values("Model Avg", ascending=False).iloc[0][
-                            "Model"
-                        ],
-                        multiselect=True,
-                        label="Models",
-                    )
-                with gr.Column(scale=4):
-                    model_info = gr.HTML()
-                    radar_plot = gr.Plot()
-
-            model_selector.change(
-                fn=model_info_tab,
-                inputs=[model_selector],
-                outputs=[model_info, radar_plot],
-            )
-
-    app.load(
-        fn=lambda: filter_leaderboard("All", list(categories.keys())[0]),
-        outputs=[output, plot1, plot2],
-    )
-
-    app.load(
-        fn=lambda: model_info_tab(
-            [df.sort_values("Model Avg", ascending=False).iloc[0]["Model"]]
-        ),
-        outputs=[model_info, radar_plot],
-    )

 if __name__ == "__main__":
-    demo =
     demo.launch()
 import gradio as gr
+from data_loader import load_data, CATEGORIES, INSIGHTS, METHODOLOGY
+from utils import model_info_tab, filter_leaderboard
+from visualization import setup_matplotlib
+
+
+def create_app():
+    setup_matplotlib()
+    df = load_data()
+
+    with gr.Blocks(theme=gr.themes.Soft()) as app:
+        with gr.Tabs():
+            with gr.Tab("Leaderboard"):
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        gr.Markdown("# Filters")
+                        model_type = gr.Dropdown(
+                            choices=["All"] + df["Model Type"].unique().tolist(),
+                            value="All",
+                            label="Model Type",
+                        )
+                        category = gr.Dropdown(
+                            choices=list(CATEGORIES.keys()),
+                            value=list(CATEGORIES.keys())[0],
+                            label="Category",
+                        )
+                        sort_by = gr.Radio(
+                            choices=["Performance", "Cost"],
+                            value="Performance",
+                            label="Sort by",
+                        )
+
+                    with gr.Column(scale=4):
+                        gr.Markdown("# Agent Leaderboard")
+                        output = gr.HTML()
+                        plot1 = gr.Plot()
+                        plot2 = gr.Plot()
+
+                for input_comp in [model_type, category, sort_by]:
+                    input_comp.change(
+                        fn=lambda m, c, s: filter_leaderboard(df, m, c, s),
+                        inputs=[model_type, category, sort_by],
+                        outputs=[output, plot1, plot2],
+                    )

+            with gr.Tab("Model Performance"):
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        model_selector = gr.Dropdown(
+                            choices=df["Model"].unique().tolist(),
+                            value=df.sort_values("Model Avg", ascending=False).iloc[0][
+                                "Model"
+                            ],
+                            multiselect=True,
+                            label="Models",
+                        )
+                    with gr.Column(scale=4):
+                        model_info = gr.HTML()
+                        radar_plot = gr.Plot()
+
+                model_selector.change(
+                    fn=lambda m: model_info_tab(df, m),
+                    inputs=[model_selector],
+                    outputs=[model_info, radar_plot],
+                )

+            with gr.Tab("Methodology"):
+                gr.Markdown(METHODOLOGY)

+            with gr.Tab("Insights"):
+                gr.Markdown(INSIGHTS)

+        app.load(
+            fn=lambda: filter_leaderboard(
+                df, "All", list(CATEGORIES.keys())[0], "Performance"
+            ),
+            outputs=[output, plot1, plot2],
+        )

+        app.load(
+            fn=lambda: model_info_tab(
+                df, [df.sort_values("Model Avg", ascending=False).iloc[0]["Model"]]
+            ),
+            outputs=[model_info, radar_plot],
+        )

+    return app


+# main.py
 if __name__ == "__main__":
+    demo = create_app()
     demo.launch()
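
In the refactored app.py the event callbacks close over the `df` loaded inside create_app() instead of reading a module-level global. Purely for reference (not part of the commit), the same binding can be written with functools.partial; the sketch below assumes results.csv is available, as it is inside the Space.

# Not in the commit: an equivalent way to bind df to the leaderboard callback.
from functools import partial

from data_loader import load_data
from utils import filter_leaderboard

df = load_data()  # reads results.csv, as create_app() does
on_filter_change = partial(filter_leaderboard, df)

# app.py achieves the same with fn=lambda m, c, s: filter_leaderboard(df, m, c, s)
table_html, perf_chart, cost_chart = on_filter_change("All", "Overall", "Performance")
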
data_loader.py
ADDED
@@ -0,0 +1,78 @@
import pandas as pd


def load_data():
    """Load and preprocess the data."""
    df = pd.read_csv("results.csv").dropna()

    # Add combined I/O cost column with 3:1 ratio
    df["IO Cost"] = (
        df["Input cost per million token"] * 0.75
        + df["Output cost per million token"] * 0.25
    )
    return df


# categories.py
CATEGORIES = {
    "Overall": ["Model Avg"],
    "Overall single turn": ["single turn perf"],
    "Overall multi turn": ["multi turn perf"],
    "Single func call": [
        "xlam_single_tool_single_call",
        "xlam_multiple_tool_single_call",
    ],
    "Multiple func call": [
        "xlam_multiple_tool_multiple_call",
        "xlam_single_tool_multiple_call",
        "BFCL_v3_multi_turn_base_multi_func_call",
    ],
    "Irrelevant query": ["BFCL_v3_irrelevance"],
    "Long context": ["tau_long_context", "BFCL_v3_multi_turn_long_context"],
    "Missing func": ["xlam_tool_miss", "BFCL_v3_multi_turn_miss_func"],
    "Missing params": ["BFCL_v3_multi_turn_miss_param"],
    "Composite": ["BFCL_v3_multi_turn_composite"],
}

INSIGHTS = """
# Key Insights from Agent Leaderboard

| Category | Finding | Implications |
|----------|---------|--------------|
| Performance Leader | Gemini-2.0-flash dominates with excellent performance at a fraction of typical costs | Demonstrates that top-tier performance is achievable without premium pricing |
| Cost vs Performance | Top 3 models span a 200x price difference yet show only 6% performance gap | Challenges traditional pricing assumptions in the market and suggests potential overpricing at the high end |
| Open Source Models | Qwen-72b matches premium models in safety and context handling at lower cost | Signals growing maturity in open-source models and potential for broader adoption |
| Safety Features | While irrelevance detection is widely solved, tool miss detection remains a challenge | Highlights uneven development in safety features and areas needing focused improvement |
| Edge Case Handling | Models still struggle with maintaining context in complex scenarios | Indicates need for architectural improvements in handling sophisticated interactions |
| Architecture Impact | Models show clear trade-offs between context handling and parallel execution | Suggests need for specialized models or hybrid approaches for different use cases |

**Note:** Findings based on comprehensive evaluation across multiple tasks and scenarios.
"""

METHODOLOGY = """
# Methodology

## Overview
The Agent Leaderboard evaluates language models' ability to effectively use tools and maintain coherent multi-turn conversations.
The evaluation focuses on both basic functionality and edge cases that challenge real-world applicability.

## Tool Selection Quality Metric
Models are evaluated on their ability to:
- Correctly identify when tools are needed
- Select the appropriate tool for the task
- Handle cases where no suitable tool exists
- Maintain context across multiple interactions

## Dataset Structure
| Type | Samples | Category | Dataset Name | Purpose |
|------|---------|-----------|--------------|----------|
| Single-Turn | 100 + 100 | Single Function Call | xlam_single_tool_single_call | Evaluates basic ability to read documentation and make single function calls |
| | 200 + 50 | Multiple Function Call | xlam_multiple_tool_multiple_call, xlam_single_tool_multiple_call | Tests parallel execution and result aggregation capabilities |
| | 100 | Irrelevant Query | BFCL_v3_irrelevance | Tests ability to recognize when available tools don't match user needs |
| | 100 | Long Context | tau_long_context | Assesses handling of extended interactions and complex instructions |
| Multi-Turn | 50 + 30 | Single Function Call | BFCL_v3_multi_turn_base_single_func_call, toolscs_single_func_call | Tests basic conversational function calling abilities |
| | 50 | Multiple Function Call | BFCL_v3_multi_turn_base_multi_func_call | Evaluates handling of multiple function calls in conversation |
| | 100 | Missing Function | BFCL_v3_multi_turn_miss_func | Tests graceful handling of unavailable tools |
| | 100 | Missing Parameters | BFCL_v3_multi_turn_miss_param | Assesses parameter collection and handling incomplete information |
| | 100 | Composite | BFCL_v3_multi_turn_composite | Tests overall robustness in complex scenarios |
"""
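
The `IO Cost` column blends the two prices with a 3:1 input-to-output token ratio, i.e. 0.75 times the input price plus 0.25 times the output price. A quick sanity check with made-up prices (not taken from results.csv):

# Hypothetical prices for illustration: $1.00 per million input tokens,
# $3.00 per million output tokens.
input_cost, output_cost = 1.00, 3.00
io_cost = input_cost * 0.75 + output_cost * 0.25  # same blend as load_data()
print(io_cost)  # 1.50, the value used for "Cost" sorting and the scatter x-axis
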
utils.py
ADDED
@@ -0,0 +1,70 @@
from data_loader import CATEGORIES
from visualization import (
    create_radar_plot,
    get_performance_chart,
    get_performance_cost_chart,
)


def model_info_tab(df, model_names=None):
    if model_names is None or len(model_names) == 0:
        model_names = [df.sort_values("Model Avg", ascending=False).iloc[0]["Model"]]

    filtered_df = df[df["Model"].isin(model_names)]
    radar_chart = create_radar_plot(df, model_names)
    info_html = filtered_df[
        [
            "Model",
            "Model Type",
            "Model Avg",
            "IO Cost",
            "single turn perf",
            "multi turn perf",
        ]
    ].to_html(index=False)

    return info_html, radar_chart


def filter_leaderboard(df, model_type, category, sort_by):
    filtered_df = df.copy()
    if model_type != "All":
        filtered_df = filtered_df[filtered_df["Model Type"].str.strip() == model_type]

    dataset_columns = CATEGORIES.get(category, ["Model Avg"])
    avg_score = filtered_df[dataset_columns].mean(axis=1)
    filtered_df["Category Score"] = avg_score

    if sort_by == "Performance":
        filtered_df = filtered_df.sort_values(by="Category Score", ascending=False)
    else:
        filtered_df = filtered_df.sort_values(by="IO Cost", ascending=True)

    filtered_df["Rank"] = range(1, len(filtered_df) + 1)

    perf_chart = get_performance_chart(filtered_df, category)
    cost_chart = get_performance_cost_chart(filtered_df, category)

    filtered_df["Cost (Input/Output)"] = filtered_df.apply(
        lambda x: f"${x['Input cost per million token']:.2f}/${x['Output cost per million token']:.2f}",
        axis=1,
    )

    display_columns = [
        "Rank",
        "Model",
        "Model Type",
        "Cost (Input/Output)",
        "Category Score",
    ]

    table_html = filtered_df[display_columns].to_html(index=False, escape=False)
    note_html = """
    <div style='margin-top: 20px; padding: 10px; background-color: #f3f4f6; border-radius: 4px;'>
        <p style='margin: 0; font-size: 0.9em; color: #4b5563;'>
            Note: Cost for sorting is calculated using 3:1 ratio on I/O. Cost of Gemini 2.0 is assumed to be same as that of Gemini 1.5.
        </p>
    </div>
    """
    table_html += note_html
    return table_html, perf_chart, cost_chart
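
A minimal smoke test of filter_leaderboard, assuming the three new modules are importable; the two rows below are fabricated and only mirror the column names the function and the charts expect:

import pandas as pd

from utils import filter_leaderboard
from visualization import setup_matplotlib

setup_matplotlib()  # switch matplotlib to the non-interactive Agg backend

# Toy frame with just the columns touched by filter_leaderboard and the charts.
toy_df = pd.DataFrame(
    {
        "Model": ["model-a", "model-b"],
        "Model Type": ["Private", "Open source"],
        "Model Avg": [0.90, 0.82],
        "Input cost per million token": [2.00, 0.50],
        "Output cost per million token": [6.00, 1.50],
        "IO Cost": [3.00, 0.75],  # 0.75 * input + 0.25 * output
    }
)

table_html, perf_chart, cost_chart = filter_leaderboard(
    toy_df, model_type="All", category="Overall", sort_by="Performance"
)
print(table_html[:120])  # HTML table starting with Rank / Model / Model Type ...
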
visualization.py
ADDED
@@ -0,0 +1,221 @@
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import plotly.graph_objects as go


def setup_matplotlib():
    """Set up matplotlib configuration."""
    matplotlib.use("Agg")
    plt.close("all")


def get_performance_chart(df, category_name="Overall"):
    plt.close("all")
    score_column = "Category Score"
    df_sorted = df.sort_values(score_column, ascending=True)
    colors = {"Private": "#4F46E5", "Open source": "#16A34A"}

    height = max(8, len(df_sorted) * 0.8)
    fig, ax = plt.subplots(figsize=(16, height))
    plt.rcParams.update({"font.size": 12})

    try:
        bars = ax.barh(
            np.arange(len(df_sorted)),
            df_sorted[score_column],
            height=0.6,
            color=[colors[t] for t in df_sorted["Model Type"]],
        )

        ax.set_title(
            f"Model Performance Comparison - {category_name}",
            pad=20,
            fontsize=20,
            fontweight="bold",
        )
        ax.set_xlabel("Average Score", fontsize=14, labelpad=10)
        ax.set_xlim(0.0, 1.0)

        ax.set_yticks(np.arange(len(df_sorted)))
        ax.set_yticklabels(df_sorted["Model"], fontsize=12)

        plt.subplots_adjust(left=0.35)

        for i, v in enumerate(df_sorted[score_column]):
            ax.text(
                v + 0.01, i, f"{v:.3f}", va="center", fontsize=12, fontweight="bold"
            )

        ax.grid(True, axis="x", linestyle="--", alpha=0.2)
        ax.spines[["top", "right"]].set_visible(False)

        legend_elements = [
            plt.Rectangle((0, 0), 1, 1, facecolor=color, label=label)
            for label, color in colors.items()
        ]
        ax.legend(
            handles=legend_elements,
            title="Model Type",
            loc="lower right",
            fontsize=12,
            title_fontsize=14,
        )

        plt.tight_layout()
        return fig
    finally:
        plt.close(fig)


def create_radar_plot(df, model_names):
    datasets = [col for col in df.columns[7:] if col != "IO Cost"]
    fig = go.Figure()

    colors = ["rgba(99, 102, 241, 0.3)", "rgba(34, 197, 94, 0.3)"]
    line_colors = ["#4F46E5", "#16A34A"]

    for idx, model_name in enumerate(model_names):
        model_data = df[df["Model"] == model_name].iloc[0]
        values = [model_data[m] for m in datasets]
        values.append(values[0])
        datasets_plot = datasets + [datasets[0]]

        fig.add_trace(
            go.Scatterpolar(
                r=values,
                theta=datasets_plot,
                fill="toself",
                fillcolor=colors[idx % len(colors)],
                line=dict(color=line_colors[idx % len(line_colors)], width=2),
                name=model_name,
                text=[f"{val:.3f}" for val in values],
                textposition="middle right",
                mode="lines+markers+text",
            )
        )

    fig.update_layout(
        polar=dict(
            radialaxis=dict(
                visible=True, range=[0, 1], showline=False, tickfont=dict(size=12)
            ),
            angularaxis=dict(
                tickfont=dict(size=13, family="Arial"),
                rotation=90,
                direction="clockwise",
            ),
        ),
        showlegend=True,
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=-0.2,
            xanchor="center",
            x=0.5,
            font=dict(size=14),
        ),
        title=dict(
            text="Model Comparison",
            x=0.5,
            y=0.95,
            font=dict(size=24, family="Arial", color="#1F2937"),
        ),
        paper_bgcolor="white",
        plot_bgcolor="white",
        height=700,
        width=900,
        margin=dict(t=100, b=100, l=80, r=80),
    )

    return fig


def get_performance_cost_chart(df, category_name="Overall"):
    # Create figure and axis with specified style
    fig, ax = plt.subplots(figsize=(12, 8), dpi=300)

    # Configure plot style
    ax.grid(True, linestyle="--", alpha=0.15, which="both")
    ax.set_facecolor("white")
    fig.patch.set_facecolor("white")

    colors = {"Private": "#4F46E5", "Open source": "#16A34A"}
    performance_colors = ["#DCFCE7", "#FEF9C3", "#FEE2E2"]

    score_column = "Category Score"

    # Plot data points
    for _, row in df.iterrows():
        color = colors[row["Model Type"]]
        size = 100 if row[score_column] > 0.85 else 80
        edge_color = "#3730A3" if row["Model Type"] == "Private" else "#166534"

        # Plot scatter points
        ax.scatter(
            row["IO Cost"],
            row[score_column] * 100,
            c=color,
            s=size,
            alpha=0.9,
            edgecolor=edge_color,
            linewidth=1,
            zorder=5,  # Ensure points are above grid
        )

        # Add annotations with model names
        bbox_props = dict(boxstyle="round,pad=0.3", fc="white", ec="none", alpha=0.8)

        ax.annotate(
            f"{row['Model']}\n(${row['IO Cost']:.2f})",
            (row["IO Cost"], row[score_column] * 100),
            xytext=(5, 5),
            textcoords="offset points",
            fontsize=8,
            bbox=bbox_props,
            zorder=6,
        )

    # Configure axes
    ax.set_xscale("log")
    ax.set_xlim(0.08, 40)  # Adjust based on your data range
    ax.set_ylim(60, 95)

    # Customize axis labels
    ax.set_xlabel("I/O Cost per Million Tokens ($)", fontsize=10, labelpad=10)
    ax.set_ylabel("Model Performance Score", fontsize=10, labelpad=10)

    # Add legend
    legend_elements = [
        plt.scatter([], [], c=color, label=label, s=80)
        for label, color in colors.items()
    ]
    ax.legend(
        handles=legend_elements,
        loc="upper right",
        frameon=True,
        facecolor="white",
        edgecolor="none",
        fontsize=9,
    )

    # Set title
    ax.set_title(
        f"AI Language Model Performance vs. Cost - {category_name}", fontsize=12, pad=15
    )

    # Add performance bands
    for y1, y2, color in zip([85, 75, 60], [95, 85, 75], performance_colors):
        ax.axhspan(y1, y2, alpha=0.2, color=color, zorder=1)

    # Customize tick parameters
    ax.tick_params(axis="both", which="major", labelsize=9)
    ax.tick_params(axis="both", which="minor", labelsize=8)

    # Add minor ticks for log scale
    ax.xaxis.set_minor_locator(plt.LogLocator(base=10.0, subs=np.arange(2, 10) * 0.1))

    # Adjust layout
    plt.tight_layout()

    return fig
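
For completeness, a small self-contained sketch of create_radar_plot; the frame below is fabricated and simply satisfies the function's assumption that dataset scores start at column index 7:

import pandas as pd

from visualization import create_radar_plot

# Columns 0-6 stand in for metadata; columns 7 onward are read as dataset scores.
columns = [
    "Model", "Model Type", "Model Avg",
    "Input cost per million token", "Output cost per million token",
    "single turn perf", "multi turn perf",
    "xlam_single_tool_single_call", "BFCL_v3_irrelevance", "tau_long_context",
]
toy_df = pd.DataFrame(
    [["model-a", "Private", 0.90, 2.00, 6.00, 0.92, 0.88, 0.95, 0.85, 0.80]],
    columns=columns,
)

fig = create_radar_plot(toy_df, ["model-a"])
fig.write_html("radar.html")  # or fig.show() in a notebook
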