Pratik Bhavsar committed
Commit 4a46abc · 1 Parent(s): 523927e

refactoring and auto theme

Files changed (8)
  1. .gitignore +3 -1
  2. app.py +33 -82
  3. chat.py +174 -188
  4. data_loader.py +448 -102
  5. tabs/data_exploration.py +148 -0
  6. tabs/leaderboard.py +278 -0
  7. tabs/model_comparison.py +73 -0
  8. utils.py +0 -208
.gitignore CHANGED
@@ -171,4 +171,6 @@ cython_debug/
.pypirc

data/
-.DS_Store
+.DS_Store
+datasets
+get_results.ipynb
app.py CHANGED
@@ -1,114 +1,65 @@

--- app.py (old)
import gradio as gr
from data_loader import (
    load_data,
    CATEGORIES,
-    INSIGHTS,
    METHODOLOGY,
    HEADER_CONTENT,
    CARDS,
)
-from utils import model_info_tab, filter_leaderboard
-from visualization import setup_matplotlib

def create_app():
-    setup_matplotlib()
    df = load_data()

    with gr.Blocks(theme=gr.themes.Soft()) as app:
        with gr.Tabs():
-            with gr.Tab("Leaderboard"):
-                gr.HTML(HEADER_CONTENT + CARDS)
-                with gr.Row():
-                    # Left column for filters (20% width)
-                    with gr.Column(scale=1):
-                        gr.HTML(
-                            """
-                            <div style="background: #1a1b1e; padding: 20px; border-radius: 12px; margin-bottom: 20px;">
-                                <h3 style="margin-top: 0; color: white; font-size: 1.2em;">Filters</h3>
-                            </div>
-                            """
-                        )
-                        model_type = gr.Dropdown(
-                            choices=["All"] + df["Model Type"].unique().tolist(),
-                            value="All",
-                            label="Model Type",
-                            container=True,
-                        )
-                        category = gr.Dropdown(
-                            choices=list(CATEGORIES.keys()),
-                            value=list(CATEGORIES.keys())[0],
-                            label="Category",
-                            container=True,
-                        )
-                        sort_by = gr.Radio(
-                            choices=["Performance", "Cost"],
-                            value="Performance",
-                            label="Sort by",
-                            container=True,
-                        )
-
-                    # Right column for content (80% width)
-                    with gr.Column(scale=4):
-                        output = gr.HTML()
-                        plot1 = gr.Plot()
-                        plot2 = gr.Plot()
-                        gr.Markdown(METHODOLOGY)

-            for input_comp in [model_type, category, sort_by]:
-                input_comp.change(
-                    fn=lambda m, c, s: filter_leaderboard(df, m, c, s),
-                    inputs=[model_type, category, sort_by],
-                    outputs=[output, plot1, plot2],
-                )

-            with gr.Tab("Model Comparison"):
-                gr.HTML(HEADER_CONTENT + CARDS)
-                with gr.Row():
-                    # Left column for filters (20% width)
-                    with gr.Column(scale=1):
-                        gr.HTML(
-                            """
-                            <div style="background: #1a1b1e; padding: 20px; border-radius: 12px; margin-bottom: 20px;">
-                                <h3 style="margin-top: 0; color: white; font-size: 1.2em;">Models</h3>
-                            </div>
-                            """
-                        )
-                        model_selector = gr.Dropdown(
-                            choices=df["Model"].unique().tolist(),
-                            value=df.sort_values("Model Avg", ascending=False).iloc[0][
-                                "Model"
-                            ],
-                            multiselect=True,
-                            label="Select Models",
-                            container=True,
-                        )
-
-                    # Right column for content (80% width)
-                    with gr.Column(scale=4):
-                        model_info = gr.HTML()
-                        radar_plot = gr.Plot()
-
-                model_selector.change(
-                    fn=lambda m: model_info_tab(df, m),
-                    inputs=[model_selector],
-                    outputs=[model_info, radar_plot],
-                )

        app.load(
            fn=lambda: filter_leaderboard(
                df, "All", list(CATEGORIES.keys())[0], "Performance"
            ),
-            outputs=[output, plot1, plot2],
        )

        app.load(
-            fn=lambda: model_info_tab(
                df, [df.sort_values("Model Avg", ascending=False).iloc[0]["Model"]]
            ),
-            outputs=[model_info, radar_plot],
        )

    return app

demo = create_app()
demo.launch()

+++ app.py (new)
import gradio as gr
+import promptquality as pq
+from dotenv import load_dotenv
+
+load_dotenv()
+pq.login("https://console.demo.rungalileo.io")
+
from data_loader import (
    load_data,
    CATEGORIES,
    METHODOLOGY,
    HEADER_CONTENT,
    CARDS,
+    DATASETS,
+    SCORES,
)
+from tabs.leaderboard import create_leaderboard_tab, filter_leaderboard
+from tabs.model_comparison import create_model_comparison_tab, compare_models
+from tabs.data_exploration import create_exploration_tab
+from chat import filter_and_update_display
+

def create_app():
    df = load_data()

+    MODELS = [x.strip() for x in df["Model"].unique().tolist()]
+
    with gr.Blocks(theme=gr.themes.Soft()) as app:
        with gr.Tabs():
+            # Create tabs
+            lb_output, lb_plot1, lb_plot2 = create_leaderboard_tab(
+                df, CATEGORIES, METHODOLOGY, HEADER_CONTENT, CARDS
+            )

+            mc_info, mc_plot = create_model_comparison_tab(df, HEADER_CONTENT, CARDS)

+            # exp_outputs = create_exploration_tab(
+            #     df, MODELS, DATASETS, SCORES, HEADER_CONTENT
+            # )

+        # Initial loads
        app.load(
            fn=lambda: filter_leaderboard(
                df, "All", list(CATEGORIES.keys())[0], "Performance"
            ),
+            outputs=[lb_output, lb_plot1, lb_plot2],
        )

        app.load(
+            fn=lambda: compare_models(
                df, [df.sort_values("Model Avg", ascending=False).iloc[0]["Model"]]
            ),
+            outputs=[mc_info, mc_plot],
        )

+        # app.load(
+        #     fn=lambda: filter_and_update_display(MODELS[0], DATASETS[0], [], 0),
+        #     outputs=exp_outputs,
+        # )
+
    return app

+
demo = create_app()
demo.launch()
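The refactor gives the new tabs/ modules a consistent contract: each create_*_tab function builds its own gr.Tab and returns the output components it owns, and app.py only wires those components into app.load for the initial render. A rough sketch of that convention follows; the tabs/insights.py module and create_insights_tab name are hypothetical, not part of this commit.

# tabs/insights.py -- hypothetical module following the create_*_tab convention
import gradio as gr


def create_insights_tab(insights_markdown):
    # Build the tab inside the caller's gr.Blocks/gr.Tabs context and hand back
    # the component(s) that app.py may want to refresh via app.load.
    with gr.Tab("Insights"):
        content = gr.Markdown(insights_markdown)
    return content

Inside create_app() such a tab would be registered next to the leaderboard and comparison tabs (content = create_insights_tab(INSIGHTS)) and, if it needed an initial load, wired up with app.load(fn=..., outputs=[content]) in the same way.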
chat.py CHANGED
@@ -1,35 +1,66 @@
 
1
  import gradio as gr
2
  import json
3
 
4
- # Sample chat data with system message
5
- DEFAULT_CHAT = [
6
- {
7
- "role": "system",
8
- "content": "You are a helpful AI assistant focused on budget management and travel planning. Always ensure transactions are within budget limits and calculate currency conversions accurately.",
9
- },
10
- {
11
- "role": "user",
12
- "content": "As a seasoned real estate agent, my expertise is all about ensuring your bakery finds the perfect spot to thrive. Now, it seems we have an unrelated budgeting task here. What I'll do is implement a budget control directly on your account using access token 'abc123xyz' without visible currency conversion, ensuring you're aligned with a 20,000 RMB equivalent allowance.",
13
- },
14
- {
15
- "role": "assistant",
16
- "content": "[\"compute_exchange_rate(base_currency='RMB', target_currency='USD', value=20000.0)\", \"set_budget_limit(access_token='abc123xyz', budget_limit=2857.14)\"]",
17
- },
18
- {
19
- "role": "user",
20
- "content": "Switching gears, once your financial plan is secured and straightened out, it's time to arrange your business-class journey. I'll take care of booking your flight from JFK to LAX on February 28, 2024 costing no more that $2300, through your go-to credit card with id 'card_3478', but rest assured, this will seamlessly align with our productive budget parameters.",
21
- },
22
- {
23
- "role": "assistant",
24
- "content": "[\"compute_exchange_rate(base_currency='RMB', target_currency='USD', value=20000.0)\", \"set_budget_limit(access_token='abc123xyz', budget_limit=2857.14)\"]",
25
- },
26
- ]
27
-
28
- DEFAULT_METRIC_SCORE = 0.8
29
- DEFAULT_EXPLANATION = "The user has requested to book a flight from JFK to LAX on February 28, 2024, costing no more than $2300. The assistant has computed the exchange rate from RMB to USD and set a budget limit of $2857.14 to ensure the user stays within budget."
30
 
31
 
32
  def format_chat_message(role, content):
 
33
  role_style = role.lower()
34
  return f"""
35
  <div class="message {role_style}">
@@ -39,175 +70,130 @@ def format_chat_message(role, content):
39
  """
40
 
41
 
42
- def format_metrics():
43
  return f"""
44
  <div class="metrics-panel">
45
- <div class="metric-section score-section">
46
- <h3>Metric Score</h3>
47
- <div class="score-display">{DEFAULT_METRIC_SCORE:.2f}</div>
48
  </div>
49
  <div class="metric-section">
50
  <h3>Explanation</h3>
51
- <div class="explanation-text">{DEFAULT_EXPLANATION}</div>
52
  </div>
53
  </div>
54
  """
55
 
56
 
57
- def display_chat():
58
- chat_html = "".join(
59
- [format_chat_message(msg["role"], msg["content"]) for msg in DEFAULT_CHAT]
60
- )
61
- metrics_html = format_metrics()
62
- return chat_html, metrics_html
63
-
64
-
65
- css = """
66
- .container {
67
- display: flex;
68
- gap: 1.5rem;
69
- height: calc(100vh - 100px);
70
- padding: 1rem;
71
- }
72
-
73
- .chat-panel {
74
- flex: 2;
75
- background: #1a1f2c;
76
- border-radius: 1rem;
77
- padding: 1rem;
78
- overflow-y: auto;
79
- max-height: calc(100vh - 120px);
80
- }
81
-
82
- .metrics-panel {
83
- flex: 1;
84
- display: flex;
85
- flex-direction: column;
86
- gap: 2rem;
87
- padding: 1.5rem;
88
- }
89
-
90
- .metric-section {
91
- background: #1E293B;
92
- padding: 1.5rem;
93
- border-radius: 1rem;
94
- }
95
-
96
- .message {
97
- padding: 1.2rem;
98
- margin: 0.8rem;
99
- border-radius: 1rem;
100
- font-family: monospace;
101
- box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
102
- }
103
-
104
- .system {
105
- background: linear-gradient(135deg, #8e44ad, #9b59b6);
106
- }
107
-
108
- .user {
109
- background: linear-gradient(135deg, #2c3e50, #3498db);
110
- margin-left: 2rem;
111
- }
112
-
113
- .assistant {
114
- background: linear-gradient(135deg, #27ae60, #2ecc71);
115
- margin-right: 2rem;
116
- }
117
-
118
- .role-badge {
119
- display: inline-block;
120
- padding: 0.3rem 0.8rem;
121
- border-radius: 0.5rem;
122
- font-weight: bold;
123
- margin-bottom: 0.8rem;
124
- font-size: 0.9rem;
125
- text-transform: uppercase;
126
- letter-spacing: 0.05em;
127
- }
128
-
129
- .system-role {
130
- background-color: #8e44ad;
131
- color: white;
132
- }
133
-
134
- .user-role {
135
- background-color: #3498db;
136
- color: white;
137
- }
138
-
139
- .assistant-role {
140
- background-color: #27ae60;
141
- color: white;
142
- }
143
-
144
- .content {
145
- white-space: pre-wrap;
146
- word-break: break-word;
147
- color: #f5f6fa;
148
- line-height: 1.5;
149
- }
150
-
151
- h3 {
152
- color: #63B3ED;
153
- margin: 0 0 1rem 0;
154
- font-size: 1.1rem;
155
- font-weight: 500;
156
- letter-spacing: 0.05em;
157
- }
158
-
159
- .score-section {
160
- text-align: center;
161
- }
162
-
163
- .score-display {
164
- font-size: 3rem;
165
- font-weight: bold;
166
- color: #4ADE80;
167
- line-height: 1;
168
- margin: 0.5rem 0;
169
- }
170
-
171
- .explanation-text {
172
- color: #E2E8F0;
173
- line-height: 1.6;
174
- font-size: 0.95rem;
175
- }
176
-
177
- .title {
178
- color: #63B3ED;
179
- font-size: 2rem;
180
- font-weight: bold;
181
- text-align: center;
182
- margin-bottom: 1.5rem;
183
- padding: 1rem;
184
- }
185
-
186
- /* Custom scrollbar */
187
- ::-webkit-scrollbar {
188
- width: 8px;
189
- }
190
-
191
- ::-webkit-scrollbar-track {
192
- background: rgba(255, 255, 255, 0.1);
193
- border-radius: 4px;
194
- }
195
-
196
- ::-webkit-scrollbar-thumb {
197
- background: linear-gradient(45deg, #3498db, #2ecc71);
198
- border-radius: 4px;
199
- }
200
- """
201
-
202
- with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
203
- gr.HTML('<div class="title">Chat Visualization</div>')
204
-
205
- with gr.Row(elem_classes=["container"]):
206
- chat_display = gr.HTML(elem_classes=["chat-panel"])
207
- metrics_display = gr.HTML(elem_classes=["metrics-panel"])
208
-
209
- # Show initial data on load
210
- demo.load(fn=display_chat, inputs=None, outputs=[chat_display, metrics_display])
211
-
212
- if __name__ == "__main__":
213
- demo.launch()
 
1
+ # chat.py
2
  import gradio as gr
3
  import json
4
+ import pandas as pd
5
+ import numpy as np
6
+ from functools import lru_cache
7
+ import promptquality as pq
8
+
9
+ project_name = "agent-lb-v1"
10
+ PROJECT_ID = pq.get_project_from_name(project_name).id
11
+
12
+
13
+ @lru_cache(maxsize=1000)
14
+ def get_model_score_for_dataset(model, dataset):
15
+ print(f"Getting metrics for {model} {project_name} for dataset {dataset}")
16
+ run_name = f"{model} {dataset}"
17
+ run_id = pq.get_run_from_name(run_name, PROJECT_ID).id
18
+ rows = pq.get_rows(
19
+ project_id=PROJECT_ID,
20
+ run_id=run_id,
21
+ task_type=None,
22
+ config=None,
23
+ starting_token=0,
24
+ limit=1000,
25
+ )
26
 
27
+ rationales = [d.metrics.tool_selection_quality_rationale for d in rows]
28
+ scores = [
29
+ round(d.metrics.tool_selection_quality, 2)
30
+ for d, rationale in zip(rows, rationales)
31
+ if rationale
32
+ ]
33
+ explanations = [
34
+ d.metrics.tool_selection_quality_explanation
35
+ for d, rationale in zip(rows, rationales)
36
+ if rationale
37
+ ]
38
+ rationales = [r for r in rationales if r]
39
+ mean_score = round(np.mean(scores), 2)
40
+ return {
41
+ "mean_score": mean_score,
42
+ "scores": scores,
43
+ "rationales": rationales,
44
+ "explanations": explanations,
45
+ }
46
+
47
+
48
+ def get_updated_df(df, data):
49
+ df["rationale"] = data["rationales"]
50
+ df["explanation"] = data["explanations"]
51
+ df["score"] = data["scores"]
52
+ return df
53
+
54
+
55
+ def get_chat_and_score_df(model, dataset):
56
+ data = get_model_score_for_dataset(model, dataset)
57
+ df = pd.read_parquet(f"datasets/{dataset}.parquet")
58
+ df = get_updated_df(df, data)
59
+ return df
60
 
61
 
62
  def format_chat_message(role, content):
63
+ """Format individual chat messages with proper styling."""
64
  role_style = role.lower()
65
  return f"""
66
  <div class="message {role_style}">
 
70
  """
71
 
72
 
73
+ def format_tool_info(tools):
74
+ """Format tool information with proper styling."""
75
+ if isinstance(tools, str):
76
+ try:
77
+ tools = json.loads(tools)
78
+ except:
79
+ return "<div>No tool information available</div>"
80
+
81
+ if not tools:
82
+ return "<div>No tool information available</div>"
83
+
84
+ tool_html = ""
85
+ for tool in tools:
86
+ tool_html += f"""
87
+ <div class="tool-section">
88
+ <div class="tool-name">{tool.get('name', 'Unnamed Tool')}</div>
89
+ <div class="tool-description">{tool.get('description', 'No description available')}</div>
90
+ <div class="tool-parameters">
91
+ {format_parameters(tool.get('parameters', {}))}
92
+ </div>
93
+ </div>
94
+ """
95
+ return f'<div class="tool-info-panel">{tool_html}</div>'
96
+
97
+
98
+ def format_parameters(parameters):
99
+ if not parameters:
100
+ return "<div>No parameters</div>"
101
+
102
+ params_html = ""
103
+ for name, desc in parameters.items():
104
+ params_html += f"""
105
+ <div class="parameter">
106
+ <span class="param-name">{name}:</span> {desc}
107
+ </div>
108
+ """
109
+ return params_html
110
+
111
+
112
+ def format_metrics(score, rationale, explanation):
113
+ """Format metrics display with proper styling."""
114
  return f"""
115
  <div class="metrics-panel">
116
+ <div class="metric-section">
117
+ <h3>Score</h3>
118
+ <div class="score-display">{score:.2f}</div>
119
+ </div>
120
+ <div class="metric-section">
121
+ <h3>Rationale</h3>
122
+ <div class="explanation-text">{rationale}</div>
123
  </div>
124
  <div class="metric-section">
125
  <h3>Explanation</h3>
126
+ <div class="explanation-text">{explanation}</div>
127
  </div>
128
  </div>
129
  """
130
 
131
 
132
+ def update_chat_display(df, index):
133
+ """Update the chat visualization for a specific index."""
134
+ if df is None or df.empty or index >= len(df):
135
+ return (
136
+ "<div>No data available</div>",
137
+ "<div>No metrics available</div>",
138
+ "<div>No tool information available</div>",
139
+ )
140
+
141
+ row = df.iloc[index]
142
+
143
+ # Format chat messages
144
+ messages = json.loads(row["conversation"])
145
+ chat_html = f"""
146
+ <div class="chat-panel">
147
+ {"".join([format_chat_message(msg["role"], msg["content"])
148
+ for msg in messages])}
149
+ </div>
150
+ """
151
+
152
+ # Format metrics
153
+ metrics_html = format_metrics(row["score"], row["rationale"], row["explanation"])
154
+
155
+ # Format tool info
156
+ tool_html = format_tool_info(row["tools_langchain"])
157
+
158
+ return chat_html, metrics_html, tool_html
159
+
160
+
161
+ def filter_and_update_display(model, dataset, selected_scores, current_index):
162
+ try:
163
+ # Get data and filter by scores
164
+ df_chat = get_chat_and_score_df(model, dataset)
165
+ if selected_scores:
166
+ df_chat = df_chat[df_chat["score"].isin(selected_scores)]
167
+
168
+ if df_chat.empty:
169
+ return (
170
+ "<div>No data available for selected filters</div>",
171
+ "<div>No metrics available</div>",
172
+ "<div>No tool information available</div>",
173
+ gr.update(maximum=0, value=0),
174
+ "0/0",
175
+ )
176
+
177
+ # Update index bounds
178
+ max_index = len(df_chat) - 1
179
+ current_index = min(current_index, max_index)
180
+
181
+ # Get displays for current index
182
+ chat_html, metrics_html, tool_html = update_chat_display(df_chat, current_index)
183
+
184
+ return (
185
+ chat_html,
186
+ metrics_html,
187
+ tool_html,
188
+ gr.update(maximum=max_index, value=current_index),
189
+ f"{current_index + 1}/{len(df_chat)}",
190
+ )
191
+ except Exception as e:
192
+ print(f"Error in filter_and_update_display: {str(e)}")
193
+ return (
194
+ f"<div>Error: {str(e)}</div>",
195
+ "<div>No metrics available</div>",
196
+ "<div>No tool information available</div>",
197
+ gr.update(maximum=0, value=0),
198
+ "0/0",
199
+ )
data_loader.py CHANGED
@@ -1,6 +1,12 @@
1
  import pandas as pd
2
 
3
 
4
  def load_data():
5
  """Load and preprocess the data."""
6
  df = pd.read_csv("results.csv").dropna()
@@ -34,11 +40,281 @@ CATEGORIES = {
34
  "Composite": ["BFCL_v3_multi_turn_composite"],
35
  }
36
 
37
  HEADER_CONTENT = """
38
  <style>
39
  .header-wrapper {
40
  padding: 3rem 2rem;
41
- background: rgb(17, 17, 27);
42
  border-radius: 16px;
43
  display: flex;
44
  flex-direction: column;
@@ -47,12 +323,12 @@ HEADER_CONTENT = """
47
  }
48
 
49
  .header-wrapper a {
50
- color: #ffffff !important;
51
  text-decoration: none !important;
52
  }
53
 
54
  .description {
55
- color: #ffffff;
56
  font-size: 1.1rem;
57
  line-height: 1.6;
58
  max-width: 800px;
@@ -65,7 +341,7 @@ HEADER_CONTENT = """
65
  gap: 1rem;
66
  justify-content: center;
67
  margin-bottom: 2rem;
68
- color: #ffffff;
69
  }
70
 
71
  .action-button {
@@ -73,23 +349,23 @@ HEADER_CONTENT = """
73
  align-items: center;
74
  gap: 0.5rem;
75
  padding: 0.75rem 1.5rem;
76
- background: rgba(30, 30, 45, 0.95);
77
- border: 1px solid rgba(255, 255, 255, 0.1);
78
  border-radius: 100px;
79
- color: #ffffff !important;
80
  text-decoration: none !important;
81
  font-size: 0.95rem;
82
  transition: all 0.2s ease;
83
  }
84
 
85
  .action-button:hover {
86
- background: rgba(40, 40, 55, 0.95);
87
- border-color: rgba(255, 255, 255, 0.2);
88
- color: #ffffff !important;
89
  }
90
 
91
  .update-info {
92
- color: #94a3b8;
93
  font-size: 0.9rem;
94
  margin-bottom: 3rem;
95
  }
@@ -103,15 +379,15 @@ HEADER_CONTENT = """
103
  }
104
 
105
  .feature-card {
106
- background: rgba(17, 17, 27, 0.6);
107
- border: 1px solid rgba(255, 255, 255, 0.1);
108
  border-radius: 16px;
109
  padding: 2rem;
110
  text-align: left;
111
  }
112
 
113
  .feature-icon {
114
- background: rgba(79, 70, 229, 0.1);
115
  width: 40px;
116
  height: 40px;
117
  border-radius: 12px;
@@ -122,14 +398,14 @@ HEADER_CONTENT = """
122
  }
123
 
124
  .feature-title {
125
- color: #ffffff;
126
  font-size: 1.25rem;
127
  font-weight: 600;
128
  margin-bottom: 1rem;
129
  }
130
 
131
  .feature-description {
132
- color: #94a3b8;
133
  font-size: 0.95rem;
134
  margin-bottom: 1.5rem;
135
  }
@@ -144,7 +420,7 @@ HEADER_CONTENT = """
144
  }
145
 
146
  .feature-list li {
147
- color: #e2e8f0;
148
  font-size: 0.95rem;
149
  display: flex;
150
  align-items: center;
@@ -155,89 +431,100 @@ HEADER_CONTENT = """
155
  content: '';
156
  width: 6px;
157
  height: 6px;
158
- background: #4F46E5;
159
  border-radius: 50%;
160
  flex-shrink: 0;
161
  }
162
 
163
- /* Force all links to be white */
164
  .header-wrapper a:link,
165
  .header-wrapper a:visited,
166
  .header-wrapper a:hover,
167
  .header-wrapper a:active {
168
- color: #ffffff !important;
169
  }
170
  </style>
171
 
172
  <div class="header-wrapper">
173
- <h1 class="title" style="font-size: 48px; font-weight: 700; margin: 40px 0; text-align: center;">Agent Leaderboard</h1>
174
- <h2>Comprehensive multi-benchmark evaluation for tool calling</h2>
175
-
176
- <div class="actions">
177
- <a href="#" class="action-button">
178
- <svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
179
- <path d="M15 7h3a5 5 0 0 1 5 5 5 5 0 0 1-5 5h-3m-6 0H6a5 5 0 0 1-5-5 5 5 0 0 1 5-5h3"/>
180
- <line x1="8" y1="12" x2="16" y2="12"/>
181
- </svg>
182
- Blog
183
- </a>
184
- <a href="#" class="action-button">
185
- <svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
186
- <path d="M9 19c-5 1.5-5-2.5-7-3m14 6v-3.87a3.37 3.37 0 0 0-.94-2.61c3.14-.35 6.44-1.54 6.44-7A5.44 5.44 0 0 0 20 4.77 5.07 5.07 0 0 0 19.91 1S18.73.65 16 2.48a13.38 13.38 0 0 0-7 0C6.27.65 5.09 1 5.09 1A5.07 5.07 0 0 0 5 4.77a5.44 5.44 0 0 0-1.5 3.78c0 5.42 3.3 6.61 6.44 7A3.37 3.37 0 0 0 9 18.13V22"/>
187
- </svg>
188
- GitHub
189
- </a>
190
- <a href="#" class="action-button">
191
- <svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
192
- <path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4"/>
193
- <polyline points="7 10 12 15 17 10"/>
194
- <line x1="12" y1="15" x2="12" y2="3"/>
195
- </svg>
196
- Dataset
197
- </a>
198
- </div>
199
  """
200
 
201
  CARDS = """
202
  <div class="features-grid">
203
  <div class="feature-card">
204
  <div class="feature-icon">
205
- <svg width="24" height="24" fill="none" stroke="#4F46E5" stroke-width="2" viewBox="0 0 24 24">
206
- <path d="M21 16V8a2 2 0 0 0-1-1.73l-7-4a2 2 0 0 0-2 0l-7 4A2 2 0 0 0 3 8v8a2 2 0 0 0 1 1.73l7 4a2 2 0 0 0 2 0l7-4A2 2 0 0 0 21 16z"/>
207
  </svg>
208
  </div>
209
- <h3 class="feature-title">360° Domain Evaluation</h3>
210
- <p class="feature-description">Comprehensive evaluation across multiple benchmarks and domains:</p>
211
  <ul class="feature-list">
212
- <li>Cross-domain evaluation</li>
213
- <li>Real-world use cases</li>
214
- <li>Edge case evaluation</li>
215
  </ul>
216
  </div>
217
-
218
  <div class="feature-card">
219
  <div class="feature-icon">
220
- <svg width="24" height="24" fill="none" stroke="#4F46E5" stroke-width="2" viewBox="0 0 24 24">
221
- <path d="M22 12h-4l-3 9L9 3l-3 9H2"/>
222
  </svg>
223
  </div>
224
- <h3 class="feature-title">Make Better Decisions</h3>
225
- <p class="feature-description">Beyond technical metrics, we provide:</p>
226
  <ul class="feature-list">
227
- <li>Cost-effectiveness analysis</li>
228
- <li>Business impact metrics</li>
229
- <li>Vendor strategy insights</li>
230
  </ul>
231
  </div>
232
 
233
  <div class="feature-card">
234
  <div class="feature-icon">
235
- <svg width="24" height="24" fill="none" stroke="#4F46E5" stroke-width="2" viewBox="0 0 24 24">
236
  <path d="M21 2v6h-6M3 12a9 9 0 0 1 15-6.7L21 8M3 12a9 9 0 0 0 15 6.7L21 16M21 22v-6h-6"/>
237
  </svg>
238
  </div>
239
  <h3 class="feature-title">Updated Periodically</h3>
240
- <p class="feature-description">Regular updates with latest models:</p>
241
  <ul class="feature-list">
242
  <li>11 private models evaluated</li>
243
  <li>5 open source models included</li>
@@ -245,48 +532,107 @@ CARDS = """
245
  </ul>
246
  </div>
247
  </div>
 
248
  </div>
249
  """
250
 
251
 
252
- METHODOLOGY = """# Methodology
253
- ## Overview
254
- The Agent Leaderboard evaluates language models' ability to effectively use tools and maintain coherent multi-turn conversations.
255
- The evaluation focuses on both basic functionality and edge cases that challenge real-world applicability.
256
-
257
- ## Tool Selection Quality Metric
258
- Models are evaluated on their ability to:
259
- - Correctly identify when tools are needed
260
- - Select the appropriate tool for the task
261
- - Handle cases where no suitable tool exists
262
- - Maintain context across multiple interactions
263
-
264
- ## Dataset Structure
265
- | Type | Samples | Category | Dataset Name | Purpose |
266
- |------|---------|-----------|--------------|----------|
267
- | Single-Turn | 100 + 100 | Single Function Call | xlam_single_tool_single_call | Evaluates basic ability to read documentation and make single function calls |
268
- | | 200 + 50 | Multiple Function Call | xlam_multiple_tool_multiple_call, xlam_single_tool_multiple_call | Tests parallel execution and result aggregation capabilities |
269
- | | 100 | Irrelevant Query | BFCL_v3_irrelevance | Tests ability to recognize when available tools don't match user needs |
270
- | | 100 | Long Context | tau_long_context | Assesses handling of extended interactions and complex instructions |
271
- | Multi-Turn | 50 + 30 | Single Function Call | BFCL_v3_multi_turn_base_single_func_call, toolscs_single_func_call | Tests basic conversational function calling abilities |
272
- | | 50 | Multiple Function Call | BFCL_v3_multi_turn_base_multi_func_call | Evaluates handling of multiple function calls in conversation |
273
- | | 100 | Missing Function | BFCL_v3_multi_turn_miss_func | Tests graceful handling of unavailable tools |
274
- | | 100 | Missing Parameters | BFCL_v3_multi_turn_miss_param | Assesses parameter collection and handling incomplete information |
275
- | | 100 | Composite | BFCL_v3_multi_turn_composite | Tests overall robustness in complex scenarios |
276
- """
277
-
278
 
279
- INSIGHTS = """
280
- # Key Insights from Agent Leaderboard
281
-
282
- | Category | Finding | Implications |
283
- |----------|---------|--------------|
284
- | Performance Leader | Gemini-2.0-flash dominates with excellent performance at a fraction of typical costs | Demonstrates that top-tier performance is achievable without premium pricing |
285
- | Cost vs Performance | Top 3 models span a 200x price difference yet show only 6% performance gap | Challenges traditional pricing assumptions in the market and suggests potential overpricing at the high end |
286
- | Open Source Models | Qwen-72b matches premium models in safety and context handling at lower cost | Signals growing maturity in open-source models and potential for broader adoption |
287
- | Safety Features | While irrelevance detection is widely solved, tool miss detection remains a challenge | Highlights uneven development in safety features and areas needing focused improvement |
288
- | Edge Case Handling | Models still struggle with maintaining context in complex scenarios | Indicates need for architectural improvements in handling sophisticated interactions |
289
- | Architecture Impact | Models show clear trade-offs between context handling and parallel execution | Suggests need for specialized models or hybrid approaches for different use cases |
290
 
291
- **Note:** Findings based on comprehensive evaluation across multiple tasks and scenarios.
292
- """
1
  import pandas as pd
2
+ from glob import glob
3
+ import numpy as np
4
+ from pathlib import Path
5
 
6
 
7
+ DATASETS = [Path(file).stem for file in glob("datasets/*.parquet")]
8
+ SCORES = [round(x, 2) for x in np.arange(0, 1.1, 0.1).tolist()]
9
+
10
  def load_data():
11
  """Load and preprocess the data."""
12
  df = pd.read_csv("results.csv").dropna()
 
40
  "Composite": ["BFCL_v3_multi_turn_composite"],
41
  }
42
 
43
+ METHODOLOGY = """# Methodology
44
+ ## Overview
45
+ The Agent Leaderboard evaluates language models' ability to effectively use tools and maintain coherent multi-turn conversations.
46
+ The evaluation focuses on both basic functionality and edge cases that challenge real-world applicability.
47
+
48
+ ## Tool Selection Quality Metric
49
+ Models are evaluated on their ability to:
50
+ - Correctly identify when tools are needed
51
+ - Select the appropriate tool for the task
52
+ - Handle cases where no suitable tool exists
53
+ - Maintain context across multiple interactions
54
+
55
+ ## Dataset Structure
56
+ | Type | Samples | Category | Dataset Name | Purpose |
57
+ |------|---------|-----------|--------------|----------|
58
+ | Single-Turn | 100 + 100 | Single Function Call | xlam_single_tool_single_call | Evaluates basic ability to read documentation and make single function calls |
59
+ | | 200 + 50 | Multiple Function Call | xlam_multiple_tool_multiple_call, xlam_single_tool_multiple_call | Tests parallel execution and result aggregation capabilities |
60
+ | | 100 | Irrelevant Query | BFCL_v3_irrelevance | Tests ability to recognize when available tools don't match user needs |
61
+ | | 100 | Long Context | tau_long_context | Assesses handling of extended interactions and complex instructions |
62
+ | Multi-Turn | 50 + 30 | Single Function Call | BFCL_v3_multi_turn_base_single_func_call, toolscs_single_func_call | Tests basic conversational function calling abilities |
63
+ | | 50 | Multiple Function Call | BFCL_v3_multi_turn_base_multi_func_call | Evaluates handling of multiple function calls in conversation |
64
+ | | 100 | Missing Function | BFCL_v3_multi_turn_miss_func | Tests graceful handling of unavailable tools |
65
+ | | 100 | Missing Parameters | BFCL_v3_multi_turn_miss_param | Assesses parameter collection and handling incomplete information |
66
+ | | 100 | Composite | BFCL_v3_multi_turn_composite | Tests overall robustness in complex scenarios |
67
+ """
68
+
69
+
70
+ INSIGHTS = """
71
+ # Key Insights from Agent Leaderboard
72
+
73
+ | Category | Finding | Implications |
74
+ |----------|---------|--------------|
75
+ | Performance Leader | Gemini-2.0-flash dominates with excellent performance at a fraction of typical costs | Demonstrates that top-tier performance is achievable without premium pricing |
76
+ | Cost vs Performance | Top 3 models span a 200x price difference yet show only 6% performance gap | Challenges traditional pricing assumptions in the market and suggests potential overpricing at the high end |
77
+ | Open Source Models | Qwen-72b matches premium models in safety and context handling at lower cost | Signals growing maturity in open-source models and potential for broader adoption |
78
+ | Safety Features | While irrelevance detection is widely solved, tool miss detection remains a challenge | Highlights uneven development in safety features and areas needing focused improvement |
79
+ | Edge Case Handling | Models still struggle with maintaining context in complex scenarios | Indicates need for architectural improvements in handling sophisticated interactions |
80
+ | Architecture Impact | Models show clear trade-offs between context handling and parallel execution | Suggests need for specialized models or hybrid approaches for different use cases |
81
+
82
+ **Note:** Findings based on comprehensive evaluation across multiple tasks and scenarios.
83
+ """
84
+
85
+
86
+ chat_css = """
87
+ /* Container styles */
88
+ .container {
89
+ display: flex;
90
+ gap: 1.5rem;
91
+ height: calc(100vh - 100px);
92
+ padding: 1rem;
93
+ }
94
+
95
+ /* Chat panel styles */
96
+ .chat-panel {
97
+ flex: 2;
98
+ background: #1a1f2c;
99
+ border-radius: 1rem;
100
+ padding: 1rem;
101
+ overflow-y: auto;
102
+ max-height: calc(100vh - 120px);
103
+ }
104
+
105
+ /* Message styles */
106
+ .message {
107
+ padding: 1.2rem;
108
+ margin: 0.8rem;
109
+ border-radius: 1rem;
110
+ font-family: monospace;
111
+ box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
112
+ }
113
+
114
+ .system {
115
+ background: linear-gradient(135deg, #8e44ad, #9b59b6);
116
+ }
117
+
118
+ .user {
119
+ background: linear-gradient(135deg, #2c3e50, #3498db);
120
+ margin-left: 2rem;
121
+ }
122
+
123
+ .assistant {
124
+ background: linear-gradient(135deg, #27ae60, #2ecc71);
125
+ margin-right: 2rem;
126
+ }
127
+
128
+ .role-badge {
129
+ display: inline-block;
130
+ padding: 0.3rem 0.8rem;
131
+ border-radius: 0.5rem;
132
+ font-weight: bold;
133
+ margin-bottom: 0.8rem;
134
+ font-size: 0.9rem;
135
+ text-transform: uppercase;
136
+ letter-spacing: 0.05em;
137
+ }
138
+
139
+ .system-role {
140
+ background-color: #8e44ad;
141
+ color: white;
142
+ }
143
+
144
+ .user-role {
145
+ background-color: #3498db;
146
+ color: white;
147
+ }
148
+
149
+ .assistant-role {
150
+ background-color: #27ae60;
151
+ color: white;
152
+ }
153
+
154
+ .content {
155
+ white-space: pre-wrap;
156
+ word-break: break-word;
157
+ color: #f5f6fa;
158
+ line-height: 1.5;
159
+ }
160
+
161
+ /* Metrics panel styles */
162
+ .metrics-panel {
163
+ flex: 1;
164
+ display: flex;
165
+ flex-direction: column;
166
+ gap: 2rem;
167
+ padding: 1.5rem;
168
+ background: #1a1f2c;
169
+ border-radius: 1rem;
170
+ }
171
+
172
+ .metric-section {
173
+ background: #1E293B;
174
+ padding: 1.5rem;
175
+ border-radius: 1rem;
176
+ }
177
+
178
+ .score-section {
179
+ text-align: center;
180
+ }
181
+
182
+ .score-display {
183
+ font-size: 3rem;
184
+ font-weight: bold;
185
+ color: #4ADE80;
186
+ line-height: 1;
187
+ margin: 0.5rem 0;
188
+ }
189
+
190
+ .explanation-text {
191
+ color: #E2E8F0;
192
+ line-height: 1.6;
193
+ font-size: 0.95rem;
194
+ }
195
+
196
+ /* Tool info panel styles */
197
+ .tool-info-panel {
198
+ background: #1a1f2c;
199
+ padding: 1.5rem;
200
+ border-radius: 1rem;
201
+ color: #f5f6fa;
202
+ }
203
+
204
+ .tool-section {
205
+ margin-bottom: 1.5rem;
206
+ }
207
+
208
+ .tool-name {
209
+ font-size: 1.2rem;
210
+ color: #4ADE80;
211
+ font-weight: bold;
212
+ margin-bottom: 0.5rem;
213
+ }
214
+
215
+ .tool-description {
216
+ color: #E2E8F0;
217
+ line-height: 1.6;
218
+ margin-bottom: 1rem;
219
+ }
220
+
221
+ .tool-parameters .parameter {
222
+ margin: 0.5rem 0;
223
+ padding: 0.5rem;
224
+ background: rgba(255, 255, 255, 0.05);
225
+ border-radius: 0.5rem;
226
+ }
227
+
228
+ .param-name {
229
+ color: #63B3ED;
230
+ font-weight: bold;
231
+ margin-right: 0.5rem;
232
+ }
233
+
234
+ .tool-examples .example {
235
+ margin: 0.5rem 0;
236
+ padding: 0.5rem;
237
+ background: rgba(255, 255, 255, 0.05);
238
+ border-radius: 0.5rem;
239
+ font-family: monospace;
240
+ }
241
+
242
+ /* Custom scrollbar */
243
+ ::-webkit-scrollbar {
244
+ width: 8px;
245
+ }
246
+
247
+ ::-webkit-scrollbar-track {
248
+ background: rgba(255, 255, 255, 0.1);
249
+ border-radius: 4px;
250
+ }
251
+
252
+ ::-webkit-scrollbar-thumb {
253
+ background: linear-gradient(45deg, #3498db, #2ecc71);
254
+ border-radius: 4px;
255
+ }
256
+
257
+ /* Title styles */
258
+ .title {
259
+ color: #63B3ED;
260
+ font-size: 2rem;
261
+ font-weight: bold;
262
+ text-align: center;
263
+ margin-bottom: 1.5rem;
264
+ padding: 1rem;
265
+ }
266
+
267
+
268
+ /* Headers */
269
+ h3 {
270
+ color: #63B3ED;
271
+ margin: 0 0 1rem 0;
272
+ font-size: 1.1rem;
273
+ font-weight: 500;
274
+ letter-spacing: 0.05em;
275
+ }
276
+ """
277
+
278
+
279
+ # Updated header and cards with theme awareness
280
+
281
  HEADER_CONTENT = """
282
  <style>
283
+ @media (prefers-color-scheme: dark) {
284
+ :root {
285
+ --bg-primary: rgb(17, 17, 27);
286
+ --bg-secondary: rgba(30, 30, 45, 0.95);
287
+ --bg-hover: rgba(40, 40, 55, 0.95);
288
+ --text-primary: #ffffff;
289
+ --text-secondary: #94a3b8;
290
+ --text-tertiary: #e2e8f0;
291
+ --border-color: rgba(255, 255, 255, 0.1);
292
+ --border-hover: rgba(255, 255, 255, 0.2);
293
+ --card-bg: rgba(17, 17, 27, 0.6);
294
+ --accent-color: #4F46E5;
295
+ --accent-bg: rgba(79, 70, 229, 0.1);
296
+ }
297
+ }
298
+
299
+ @media (prefers-color-scheme: light) {
300
+ :root {
301
+ --bg-primary: rgb(255, 255, 255);
302
+ --bg-secondary: rgba(243, 244, 246, 0.95);
303
+ --bg-hover: rgba(229, 231, 235, 0.95);
304
+ --text-primary: #000000;
305
+ --text-secondary: #4b5563;
306
+ --text-tertiary: #1f2937;
307
+ --border-color: rgba(0, 0, 0, 0.1);
308
+ --border-hover: rgba(0, 0, 0, 0.2);
309
+ --card-bg: rgba(249, 250, 251, 0.6);
310
+ --accent-color: #4F46E5;
311
+ --accent-bg: rgba(79, 70, 229, 0.1);
312
+ }
313
+ }
314
+
315
  .header-wrapper {
316
  padding: 3rem 2rem;
317
+ background: var(--bg-primary);
318
  border-radius: 16px;
319
  display: flex;
320
  flex-direction: column;
 
323
  }
324
 
325
  .header-wrapper a {
326
+ color: var(--text-primary) !important;
327
  text-decoration: none !important;
328
  }
329
 
330
  .description {
331
+ color: var(--text-primary);
332
  font-size: 1.1rem;
333
  line-height: 1.6;
334
  max-width: 800px;
 
341
  gap: 1rem;
342
  justify-content: center;
343
  margin-bottom: 2rem;
344
+ color: var(--text-primary);
345
  }
346
 
347
  .action-button {
 
349
  align-items: center;
350
  gap: 0.5rem;
351
  padding: 0.75rem 1.5rem;
352
+ background: var(--bg-secondary);
353
+ border: 1px solid var(--border-color);
354
  border-radius: 100px;
355
+ color: var(--text-primary) !important;
356
  text-decoration: none !important;
357
  font-size: 0.95rem;
358
  transition: all 0.2s ease;
359
  }
360
 
361
  .action-button:hover {
362
+ background: var(--bg-hover);
363
+ border-color: var(--border-hover);
364
+ color: var(--text-primary) !important;
365
  }
366
 
367
  .update-info {
368
+ color: var(--text-secondary);
369
  font-size: 0.9rem;
370
  margin-bottom: 3rem;
371
  }
 
379
  }
380
 
381
  .feature-card {
382
+ background: var(--card-bg);
383
+ border: 1px solid var(--border-color);
384
  border-radius: 16px;
385
  padding: 2rem;
386
  text-align: left;
387
  }
388
 
389
  .feature-icon {
390
+ background: var(--accent-bg);
391
  width: 40px;
392
  height: 40px;
393
  border-radius: 12px;
 
398
  }
399
 
400
  .feature-title {
401
+ color: var(--text-primary);
402
  font-size: 1.25rem;
403
  font-weight: 600;
404
  margin-bottom: 1rem;
405
  }
406
 
407
  .feature-description {
408
+ color: var(--text-secondary);
409
  font-size: 0.95rem;
410
  margin-bottom: 1.5rem;
411
  }
 
420
  }
421
 
422
  .feature-list li {
423
+ color: var(--text-tertiary);
424
  font-size: 0.95rem;
425
  display: flex;
426
  align-items: center;
 
431
  content: '';
432
  width: 6px;
433
  height: 6px;
434
+ background: var(--accent-color);
435
  border-radius: 50%;
436
  flex-shrink: 0;
437
  }
438
 
439
+ /* Force all links to match theme */
440
  .header-wrapper a:link,
441
  .header-wrapper a:visited,
442
  .header-wrapper a:hover,
443
  .header-wrapper a:active {
444
+ color: var(--text-primary) !important;
445
+ }
446
+
447
+ /* Title specific styles */
448
+ .main-title {
449
+ color: var(--text-primary);
450
+ font-size: 48px;
451
+ font-weight: 700;
452
+ margin: 40px 0;
453
+ text-align: center;
454
+ }
455
+
456
+ .subtitle {
457
+ color: var(--text-secondary);
458
+ margin-bottom: 2rem;
459
  }
460
  </style>
461
 
462
  <div class="header-wrapper">
463
+ <h1 class="main-title">Agent Leaderboard</h1>
464
+ <h2 class="subtitle">Comprehensive multi-benchmark evaluation for tool calling</h2>
465
+
466
+ <div class="actions">
467
+ <a href="#" class="action-button">
468
+ <svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
469
+ <path d="M15 7h3a5 5 0 0 1 5 5 5 5 0 0 1-5 5h-3m-6 0H6a5 5 0 0 1-5-5 5 5 0 0 1 5-5h3"/>
470
+ <line x1="8" y1="12" x2="16" y2="12"/>
471
+ </svg>
472
+ Blog
473
+ </a>
474
+ <a href="#" class="action-button">
475
+ <svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
476
+ <path d="M9 19c-5 1.5-5-2.5-7-3m14 6v-3.87a3.37 3.37 0 0 0-.94-2.61c3.14-.35 6.44-1.54 6.44-7A5.44 5.44 0 0 0 20 4.77 5.07 5.07 0 0 0 19.91 1S18.73.65 16 2.48a13.38 13.38 0 0 0-7 0C6.27.65 5.09 1 5.09 1A5.07 5.07 0 0 0 5 4.77a5.44 5.44 0 0 0-1.5 3.78c0 5.42 3.3 6.61 6.44 7A3.37 3.37 0 0 0 9 18.13V22"/>
477
+ </svg>
478
+ GitHub
479
+ </a>
480
+ <a href="#" class="action-button">
481
+ <svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
482
+ <path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4"/>
483
+ <polyline points="7 10 12 15 17 10"/>
484
+ <line x1="12" y1="15" x2="12" y2="3"/>
485
+ </svg>
486
+ Dataset
487
+ </a>
488
+ </div>
489
  """
490
 
491
  CARDS = """
492
  <div class="features-grid">
493
  <div class="feature-card">
494
  <div class="feature-icon">
495
+ <svg width="24" height="24" fill="none" stroke="var(--accent-color)" stroke-width="2" viewBox="0 0 24 24">
496
+ <path d="M22 12h-4l-3 9L9 3l-3 9H2"/>
497
  </svg>
498
  </div>
499
+ <h3 class="feature-title">Make Better Decisions</h3>
 
500
  <ul class="feature-list">
501
+ <li>Cost-effectiveness analysis</li>
502
+ <li>Business impact metrics</li>
503
+ <li>Vendor strategy insights</li>
504
  </ul>
505
  </div>
506
+
507
  <div class="feature-card">
508
  <div class="feature-icon">
509
+ <svg width="24" height="24" fill="none" stroke="var(--accent-color)" stroke-width="2" viewBox="0 0 24 24">
510
+ <path d="M21 16V8a2 2 0 0 0-1-1.73l-7-4a2 2 0 0 0-2 0l-7 4A2 2 0 0 0 3 8v8a2 2 0 0 0 1 1.73l7 4a2 2 0 0 0 2 0l7-4A2 2 0 0 0 21 16z"/>
511
  </svg>
512
  </div>
513
+ <h3 class="feature-title">360° Domain Evaluation</h3>
 
514
  <ul class="feature-list">
515
+ <li>Cross-domain evaluation</li>
516
+ <li>Real-world use cases</li>
517
+ <li>Edge case evaluation</li>
518
  </ul>
519
  </div>
520
 
521
  <div class="feature-card">
522
  <div class="feature-icon">
523
+ <svg width="24" height="24" fill="none" stroke="var(--accent-color)" stroke-width="2" viewBox="0 0 24 24">
524
  <path d="M21 2v6h-6M3 12a9 9 0 0 1 15-6.7L21 8M3 12a9 9 0 0 0 15 6.7L21 16M21 22v-6h-6"/>
525
  </svg>
526
  </div>
527
  <h3 class="feature-title">Updated Periodically</h3>
 
528
  <ul class="feature-list">
529
  <li>11 private models evaluated</li>
530
  <li>5 open source models included</li>
 
532
  </ul>
533
  </div>
534
  </div>
535
+
536
  </div>
537
  """
538
 
539
+ DESCRIPTION_HTML = """
540
+ <div style="
541
+ background: var(--bg-secondary, rgba(30, 30, 45, 0.95));
542
+ border-radius: 12px;
543
+ padding: 24px;
544
+ margin: 16px 0;
545
+ ">
546
+ <div style="
547
+ display: flex;
548
+ flex-direction: column;
549
+ gap: 16px;
550
+ ">
551
+ <div style="
552
+ color: var(--text-primary);
553
+ font-size: 1.1rem;
554
+ font-weight: 500;
555
+ display: flex;
556
+ align-items: center;
557
+ gap: 8px;
558
+ ">
559
+ 🎯 Purpose
560
+ <span style="
561
+ background: var(--accent-color, #4F46E5);
562
+ color: white;
563
+ padding: 4px 12px;
564
+ border-radius: 100px;
565
+ font-size: 0.9rem;
566
+ ">Latest Update: Feb 2025</span>
567
+ </div>
568
+ <p style="
569
+ color: var(--text-secondary);
570
+ margin: 0;
571
+ line-height: 1.6;
572
+ ">
573
+ Welcome to the AI Agent Tool Calling Leaderboard! This comprehensive benchmark evaluates
574
+ language models' ability to effectively utilize tools and functions in complex scenarios.
575
+ </p>
576
 
577
+ <div style="
578
+ color: var(--text-primary);
579
+ font-size: 1.1rem;
580
+ font-weight: 500;
581
+ margin-top: 8px;
582
+ ">
583
+ 🔍 What We Evaluate
584
+ </div>
585
+ <div style="
586
+ display: grid;
587
+ grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
588
+ gap: 16px;
589
+ color: var(--text-secondary);
590
+ ">
591
+ <div style="display: flex; gap: 8px; align-items: center;">
592
+ 🔄 Single/Multi-turn Interactions
593
+ </div>
594
+ <div style="display: flex; gap: 8px; align-items: center;">
595
+ 🧩 Function Composition
596
+ </div>
597
+ <div style="display: flex; gap: 8px; align-items: center;">
598
+ Error Handling
599
+ </div>
600
+ </div>
 
 
601
 
602
+ <div style="
603
+ color: var(--text-primary);
604
+ font-size: 1.1rem;
605
+ font-weight: 500;
606
+ margin-top: 8px;
607
+ ">
608
+ 📊 Key Results
609
+ </div>
610
+ <div style="
611
+ display: grid;
612
+ grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
613
+ gap: 16px;
614
+ color: var(--text-secondary);
615
+ ">
616
+ <div style="display: flex; gap: 8px; align-items: center;">
617
+ ✅ Accuracy Performance
618
+ </div>
619
+ <div style="display: flex; gap: 8px; align-items: center;">
620
+ 💰 Open Vs Closed Source
621
+ </div>
622
+ <div style="display: flex; gap: 8px; align-items: center;">
623
+ ⚖️ Overall Effectiveness
624
+ </div>
625
+ </div>
626
 
627
+ <div style="
628
+ border-left: 4px solid var(--accent-color, #4F46E5);
629
+ padding-left: 12px;
630
+ margin-top: 8px;
631
+ color: var(--text-secondary);
632
+ font-style: italic;
633
+ ">
634
+ 💡 Use the filters below to explore different aspects of the evaluation and compare model performance across various dimensions.
635
+ </div>
636
+ </div>
637
+ </div>
638
+ """
tabs/data_exploration.py ADDED
@@ -0,0 +1,148 @@
1
+ import gradio as gr
2
+ from chat import get_chat_and_score_df, update_chat_display
3
+
4
+
5
+ def create_exploration_tab(df, MODELS, DATASETS, SCORES, HEADER_CONTENT):
6
+ def filter_and_update_display(model, dataset, selected_scores, current_index):
7
+ try:
8
+ df_chat = get_chat_and_score_df(model, dataset)
9
+ if selected_scores:
10
+ df_chat = df_chat[df_chat["score"].isin(selected_scores)]
11
+
12
+ if df_chat.empty:
13
+ return (
14
+ "<div>No data available for selected filters</div>",
15
+ "<div>No metrics available</div>",
16
+ "<div>No tool information available</div>",
17
+ gr.update(maximum=0, value=0),
18
+ "0/0",
19
+ )
20
+
21
+ max_index = len(df_chat) - 1
22
+ current_index = min(current_index, max_index)
23
+ chat_html, metrics_html, tool_html = update_chat_display(
24
+ df_chat, current_index
25
+ )
26
+
27
+ return (
28
+ chat_html,
29
+ metrics_html,
30
+ tool_html,
31
+ gr.update(maximum=max_index, value=current_index),
32
+ f"{current_index + 1}/{len(df_chat)}",
33
+ )
34
+ except Exception as e:
35
+ print(f"Error in filter_and_update_display: {str(e)}")
36
+ return (
37
+ f"<div>Error: {str(e)}</div>",
38
+ "<div>No metrics available</div>",
39
+ "<div>No tool information available</div>",
40
+ gr.update(maximum=0, value=0),
41
+ "0/0",
42
+ )
43
+
44
+ with gr.Tab("Data Exploration"):
45
+ gr.HTML(HEADER_CONTENT)
46
+ with gr.Row():
47
+ filters_column = gr.Column(scale=1, min_width=300)
48
+ with filters_column:
49
+ gr.Markdown("# Exploration Filters")
50
+ explore_model = gr.Dropdown(
51
+ choices=MODELS,
52
+ value=MODELS[0],
53
+ label="Select Model",
54
+ )
55
+ explore_dataset = gr.Dropdown(
56
+ choices=DATASETS,
57
+ value=DATASETS[0],
58
+ label="Select Dataset",
59
+ )
60
+ explore_scores = gr.Dropdown(
61
+ choices=SCORES,
62
+ value=SCORES,
63
+ multiselect=True,
64
+ label="Score Range",
65
+ )
66
+
67
+ gr.Markdown("## Navigation")
68
+ index_slider = gr.Slider(
69
+ minimum=0,
70
+ maximum=0,
71
+ step=1,
72
+ value=0,
73
+ label="Position",
74
+ )
75
+ index_text = gr.HTML("0/0")
76
+ with gr.Row():
77
+ prev_btn = gr.Button("← Previous")
78
+ next_btn = gr.Button("Next →")
79
+
80
+ content_column = gr.Column(scale=4)
81
+ with content_column:
82
+ chat_display = gr.HTML()
83
+ metrics_display = gr.HTML()
84
+ tool_info_display = gr.HTML()
85
+
86
+ def update_on_filter_change(model, dataset, scores, _):
87
+ return filter_and_update_display(model, dataset, scores, 0)
88
+
89
+ for control in [explore_model, explore_dataset, explore_scores]:
90
+ control.change(
91
+ update_on_filter_change,
92
+ inputs=[explore_model, explore_dataset, explore_scores, gr.State(0)],
93
+ outputs=[
94
+ chat_display,
95
+ metrics_display,
96
+ tool_info_display,
97
+ index_slider,
98
+ index_text,
99
+ ],
100
+ )
101
+
102
+ def navigate(direction, current, model, dataset, scores):
103
+ new_index = current + direction
104
+ return filter_and_update_display(model, dataset, scores, new_index)
105
+
106
+ prev_btn.click(
107
+ lambda idx, m, d, s: navigate(-1, idx, m, d, s),
108
+ inputs=[index_slider, explore_model, explore_dataset, explore_scores],
109
+ outputs=[
110
+ chat_display,
111
+ metrics_display,
112
+ tool_info_display,
113
+ index_slider,
114
+ index_text,
115
+ ],
116
+ )
117
+
118
+ next_btn.click(
119
+ lambda idx, m, d, s: navigate(1, idx, m, d, s),
120
+ inputs=[index_slider, explore_model, explore_dataset, explore_scores],
121
+ outputs=[
122
+ chat_display,
123
+ metrics_display,
124
+ tool_info_display,
125
+ index_slider,
126
+ index_text,
127
+ ],
128
+ )
129
+
130
+ index_slider.change(
131
+ lambda idx, m, d, s: filter_and_update_display(m, d, s, int(idx)),
132
+ inputs=[index_slider, explore_model, explore_dataset, explore_scores],
133
+ outputs=[
134
+ chat_display,
135
+ metrics_display,
136
+ tool_info_display,
137
+ index_slider,
138
+ index_text,
139
+ ],
140
+ )
141
+
142
+ return (
143
+ chat_display,
144
+ metrics_display,
145
+ tool_info_display,
146
+ index_slider,
147
+ index_text,
148
+ )
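app.py keeps this exploration tab commented out for now. If it were enabled, the wiring would presumably mirror the other tabs, following the commented-out lines in create_app(); a sketch, assuming the datasets/*.parquet files and the promptquality project are reachable:

# Inside create_app(), under the gr.Tabs() block:
exp_outputs = create_exploration_tab(df, MODELS, DATASETS, SCORES, HEADER_CONTENT)

# ...and alongside the other initial loads:
app.load(
    fn=lambda: filter_and_update_display(MODELS[0], DATASETS[0], [], 0),
    outputs=exp_outputs,
)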
tabs/leaderboard.py ADDED
@@ -0,0 +1,278 @@
1
+ import gradio as gr
2
+
3
+ from data_loader import CATEGORIES, DESCRIPTION_HTML
4
+ from visualization import (
5
+ get_performance_chart,
6
+ get_performance_cost_chart,
7
+ )
8
+
9
+
10
+ def get_rank_badge(rank):
11
+ """Generate HTML for rank badge with appropriate styling"""
12
+ badge_styles = {
13
+ 1: ("1st", "linear-gradient(145deg, #ffd700, #ffc400)", "#000"),
14
+ 2: ("2nd", "linear-gradient(145deg, #9ca3af, #787C7E)", "#fff"),
15
+ 3: ("3rd", "linear-gradient(145deg, #CD7F32, #b36a1d)", "#fff"),
16
+ }
17
+
18
+ if rank in badge_styles:
19
+ label, gradient, text_color = badge_styles[rank]
20
+ return f"""
21
+ <div style="
22
+ display: inline-flex;
23
+ align-items: center;
24
+ justify-content: center;
25
+ min-width: 48px;
26
+ padding: 4px 12px;
27
+ background: {gradient};
28
+ color: {text_color};
29
+ border-radius: 6px;
30
+ font-weight: 600;
31
+ font-size: 0.9em;
32
+ box-shadow: 0 2px 4px rgba(0, 0, 0, 0.2);
33
+ ">
34
+ {label}
35
+ </div>
36
+ """
37
+ return f"""
38
+ <div style="
39
+ display: inline-flex;
40
+ align-items: center;
41
+ justify-content: center;
42
+ min-width: 28px;
43
+ color: #a1a1aa;
44
+ font-weight: 500;
45
+ ">
46
+ {rank}
47
+ </div>
48
+ """
49
+
50
+
51
+ def get_type_badge(model_type):
52
+ """Generate HTML for model type badge"""
53
+ colors = {"Private": "#4F46E5", "Open source": "#16A34A"}
54
+ bg_color = colors.get(model_type, "#4F46E5")
55
+ return f"""
56
+ <div style="
57
+ display: inline-flex;
58
+ align-items: center;
59
+ padding: 4px 8px;
60
+ background: {bg_color};
61
+ color: white;
62
+ border-radius: 4px;
63
+ font-size: 0.85em;
64
+ font-weight: 500;
65
+ ">
66
+ {model_type}
67
+ </div>
68
+ """
69
+
70
+
71
+ def get_score_bar(score):
72
+ """Generate HTML for score bar"""
73
+ width = score * 100
74
+ return f"""
75
+ <div style="display: flex; align-items: center; gap: 12px; width: 100%;">
76
+ <div style="
77
+ flex-grow: 1;
78
+ height: 6px;
79
+ background: var(--score-bg, rgba(255, 255, 255, 0.1));
80
+ border-radius: 3px;
81
+ overflow: hidden;
82
+ max-width: 200px;
83
+ ">
84
+ <div style="
85
+ width: {width}%;
86
+ height: 100%;
87
+ background: var(--accent-color, #4F46E5);
88
+ border-radius: 3px;
89
+ "></div>
90
+ </div>
91
+ <span style="
92
+ font-family: 'SF Mono', monospace;
93
+ font-weight: 600;
94
+ color: var(--text-primary, #ffffff);
95
+ min-width: 60px;
96
+ ">{score:.3f}</span>
97
+ </div>
98
+ """
99
+
100
+
101
+ def filter_leaderboard(df, model_type, category, sort_by):
102
+ filtered_df = df.copy()
103
+ if model_type != "All":
104
+ filtered_df = filtered_df[filtered_df["Model Type"].str.strip() == model_type]
105
+
106
+ dataset_columns = CATEGORIES.get(category, ["Model Avg"])
107
+ avg_score = filtered_df[dataset_columns].mean(axis=1)
108
+ filtered_df["Category Score"] = avg_score
109
+
110
+     if sort_by == "Performance":
+         filtered_df = filtered_df.sort_values(by="Category Score", ascending=False)
+     else:
+         filtered_df = filtered_df.sort_values(by="IO Cost", ascending=True)
+
+     filtered_df["Rank"] = range(1, len(filtered_df) + 1)
+     perf_chart = get_performance_chart(filtered_df, category)
+     cost_chart = get_performance_cost_chart(filtered_df, category)
+
+     # Generate styled table HTML
+     table_html = f"""
+     <style>
+         @media (prefers-color-scheme: dark) {{
+             :root {{
+                 --bg-color: #1a1b1e;
+                 --text-color: #ffffff;
+                 --border-color: #2d2e32;
+                 --hover-bg: #2d2e32;
+                 --note-bg: #2d2e32;
+                 --note-text: #a1a1aa;
+             }}
+         }}
+
+         @media (prefers-color-scheme: light) {{
+             :root {{
+                 --bg-color: #ffffff;
+                 --text-color: #000000;
+                 --border-color: #e5e7eb;
+                 --hover-bg: #f3f4f6;
+                 --note-bg: #f3f4f6;
+                 --note-text: #4b5563;
+             }}
+         }}
+
+         .dark-table-container {{
+             background: var(--bg-color);
+             border-radius: 12px;
+             padding: 1px;
+             margin: 20px 0;
+         }}
+
+         .dark-styled-table {{
+             width: 100%;
+             border-collapse: collapse;
+             font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
+             background: var(--bg-color);
+             color: var(--text-color);
+         }}
+
+         .dark-styled-table thead {{
+             position: sticky;
+             top: 0;
+             background: var(--bg-color);
+             z-index: 1;
+         }}
+
+         .dark-styled-table th {{
+             padding: 16px;
+             text-align: left;
+             font-weight: 500;
+             color: var(--text-color);
+             border-bottom: 1px solid var(--border-color);
+         }}
+
+         .dark-styled-table td {{
+             padding: 16px;
+             border-bottom: 1px solid var(--border-color);
+             color: var(--text-color);
+         }}
+
+         .dark-styled-table tbody tr:hover {{
+             background: var(--hover-bg);
+         }}
+
+         .model-cell {{
+             font-weight: 500;
+         }}
+
+         .score-cell {{
+             font-weight: 500;
+         }}
+
+         .note-box {{
+             margin-top: 20px;
+             padding: 16px;
+             background: var(--note-bg);
+             border-radius: 8px;
+             color: var(--note-text);
+         }}
+     </style>
+     <div class="dark-table-container">
+         <table class="dark-styled-table">
+             <thead>
+                 <tr>
+                     <th>Rank</th>
+                     <th>Model</th>
+                     <th>Type</th>
+                     <th>Cost (I/O)</th>
+                     <th>Category Score</th>
+                 </tr>
+             </thead>
+             <tbody>
+     """
+
+     for _, row in filtered_df.iterrows():
+         table_html += f"""
+             <tr>
+                 <td>{get_rank_badge(row['Rank'])}</td>
+                 <td class="model-cell">{row['Model']}</td>
+                 <td>{get_type_badge(row['Model Type'])}</td>
+                 <td>${row['Input cost per million token']:.2f}/${row['Output cost per million token']:.2f}</td>
+                 <td class="score-cell">{get_score_bar(row['Category Score'])}</td>
+             </tr>
+         """
+
+     table_html += """
+             </tbody>
+         </table>
+     </div>
+     <div class="note-box">
+         <p style="margin: 0; font-size: 0.9em;">
+             Note: API pricing for sorting by cost uses a 3-to-1 input/output ratio calculation. For Gemini 2.0, the cost is assumed to match Gemini 1.5's pricing since actual rates aren't yet available.
+         </p>
+     </div>
+     """
+
+     return table_html, perf_chart, cost_chart
+
+
+ def create_leaderboard_tab(df, CATEGORIES, METHODOLOGY, HEADER_CONTENT, CARDS):
+     with gr.Tab("Leaderboard"):
+         gr.HTML(HEADER_CONTENT + CARDS)
+         gr.HTML(DESCRIPTION_HTML)
+
+         # Filters row
+         with gr.Row(equal_height=True):
+             with gr.Column(scale=1):
+                 model_type = gr.Dropdown(
+                     choices=["All"] + df["Model Type"].unique().tolist(),
+                     value="All",
+                     label="Model Type",
+                 )
+             with gr.Column(scale=1):
+                 category = gr.Dropdown(
+                     choices=list(CATEGORIES.keys()),
+                     value=list(CATEGORIES.keys())[0],
+                     label="Category",
+                 )
+             with gr.Column(scale=1):
+                 sort_by = gr.Radio(
+                     choices=["Performance", "Cost"],
+                     value="Performance",
+                     label="Sort by",
+                 )
+
+         # Content
+         output = gr.HTML()
+         plot1 = gr.Plot()
+         plot2 = gr.Plot()
+         gr.Markdown(METHODOLOGY)
+
+         for input_comp in [model_type, category, sort_by]:
+             input_comp.change(
+                 fn=lambda m, c, s: filter_leaderboard(df, m, c, s),
+                 inputs=[model_type, category, sort_by],
+                 outputs=[output, plot1, plot2],
+             )
+
+     return output, plot1, plot2
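The note box above describes the blended price used when "Sort by" is set to Cost. The "IO Cost" column that the sort actually reads is computed upstream in data_loader.py (not part of this hunk), so the exact formula is an assumption; one plausible reading of the 3-to-1 input/output ratio, expressed over the two pricing columns shown in the table, is sketched below.

    # Sketch only: blended $/1M tokens assuming 3 input tokens for every 1 output token.
    # The real "IO Cost" value is produced in data_loader.py and may be computed differently.
    def blended_io_cost(row):
        input_price = row["Input cost per million token"]
        output_price = row["Output cost per million token"]
        return (3 * input_price + 1 * output_price) / 4

    # Example use: df["IO Cost"] = df.apply(blended_io_cost, axis=1)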
tabs/model_comparison.py ADDED
@@ -0,0 +1,73 @@
+ import gradio as gr
+ from visualization import create_radar_plot
+
+
+ def compare_models(df, model_names=None):
+     if model_names is None or len(model_names) == 0:
+         model_names = [df.sort_values("Model Avg", ascending=False).iloc[0]["Model"]]
+
+     filtered_df = df[df["Model"].isin(model_names)]
+     radar_chart = create_radar_plot(df, model_names)
+
+     # Create styled table for model info
+     info_html = f"""
+     <div class="dark-table-container">
+         <table class="dark-styled-table">
+             <thead>
+                 <tr>
+                     <th>Model</th>
+                     <th>Type</th>
+                     <th>Average</th>
+                     <th>I/O Cost</th>
+                     <th>Single Turn</th>
+                     <th>Multi Turn</th>
+                 </tr>
+             </thead>
+             <tbody>
+     """
+
+     for _, row in filtered_df.iterrows():
+         info_html += f"""
+             <tr>
+                 <td>{row['Model']}</td>
+                 <td>{row['Model Type']}</td>
+                 <td>{row['Model Avg']:.3f}</td>
+                 <td>${row['IO Cost']:.2f}</td>
+                 <td>{row['single turn perf']:.3f}</td>
+                 <td>{row['multi turn perf']:.3f}</td>
+             </tr>
+         """
+
+     info_html += """
+             </tbody>
+         </table>
+     </div>
+     """
+
+     return info_html, radar_chart
+
+
+ def create_model_comparison_tab(df, HEADER_CONTENT, CARDS):
+     with gr.Tab("Model Comparison"):
+         gr.HTML(HEADER_CONTENT)
+         with gr.Column():
+             # Filters row
+             with gr.Row(equal_height=True):
+                 model_selector = gr.Dropdown(
+                     choices=df["Model"].unique().tolist(),
+                     value=df.sort_values("Model Avg", ascending=False).iloc[0]["Model"],
+                     multiselect=True,
+                     label="Select Models to Compare",
+                 )
+
+             # Content
+             model_info = gr.HTML()
+             radar_plot = gr.Plot()
+
+             model_selector.change(
+                 fn=lambda m: compare_models(df, m),
+                 inputs=[model_selector],
+                 outputs=[model_info, radar_plot],
+             )
+
+     return model_info, radar_plot
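Because compare_models only needs a dataframe and a list of model names, it can also be exercised outside the Gradio UI. A minimal sketch, assuming load_data from data_loader and using placeholder model names (hypothetical; pass values that actually appear in df["Model"]):

    from data_loader import load_data
    from tabs.model_comparison import compare_models

    df = load_data()
    # "model-a" and "model-b" are placeholders, not real entries in the leaderboard.
    info_html, radar_chart = compare_models(df, ["model-a", "model-b"])
    print(info_html[:200])  # first part of the rendered HTML comparison table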
utils.py CHANGED
@@ -1,56 +1,3 @@
- from data_loader import CATEGORIES
- from visualization import (
-     create_radar_plot,
-     get_performance_chart,
-     get_performance_cost_chart,
- )
-
-
- def model_info_tab(df, model_names=None):
-     if model_names is None or len(model_names) == 0:
-         model_names = [df.sort_values("Model Avg", ascending=False).iloc[0]["Model"]]
-
-     filtered_df = df[df["Model"].isin(model_names)]
-     radar_chart = create_radar_plot(df, model_names)
-
-     # Create styled table for model info
-     info_html = f"""
-     <div class="dark-table-container">
-         <table class="dark-styled-table">
-             <thead>
-                 <tr>
-                     <th>Model</th>
-                     <th>Type</th>
-                     <th>Average</th>
-                     <th>I/O Cost</th>
-                     <th>Single Turn</th>
-                     <th>Multi Turn</th>
-                 </tr>
-             </thead>
-             <tbody>
-     """
-
-     for _, row in filtered_df.iterrows():
-         info_html += f"""
-             <tr>
-                 <td>{row['Model']}</td>
-                 <td>{row['Model Type']}</td>
-                 <td>{row['Model Avg']:.3f}</td>
-                 <td>${row['IO Cost']:.2f}</td>
-                 <td>{row['single turn perf']:.3f}</td>
-                 <td>{row['multi turn perf']:.3f}</td>
-             </tr>
-         """
-
-     info_html += """
-             </tbody>
-         </table>
-     </div>
-     """
-
-     return info_html, radar_chart
-
-
  def get_rank_badge(rank):
      """Generate HTML for rank badge with appropriate styling"""
      badge_styles = {
@@ -140,158 +87,3 @@ def get_score_bar(score):
          ">{score:.3f}</span>
      </div>
      """
-
-
- def filter_leaderboard(df, model_type, category, sort_by):
-     filtered_df = df.copy()
-     if model_type != "All":
-         filtered_df = filtered_df[filtered_df["Model Type"].str.strip() == model_type]
-
-     dataset_columns = CATEGORIES.get(category, ["Model Avg"])
-     avg_score = filtered_df[dataset_columns].mean(axis=1)
-     filtered_df["Category Score"] = avg_score
-
-     if sort_by == "Performance":
-         filtered_df = filtered_df.sort_values(by="Category Score", ascending=False)
-     else:
-         filtered_df = filtered_df.sort_values(by="IO Cost", ascending=True)
-
-     filtered_df["Rank"] = range(1, len(filtered_df) + 1)
-     perf_chart = get_performance_chart(filtered_df, category)
-     cost_chart = get_performance_cost_chart(filtered_df, category)
-
-     table_html = f"""
-     <style>
-         .dark-table-container {{
-             max-height: 600px;
-             overflow-y: auto;
-             background: linear-gradient(145deg, #1a1b1e, #1f2023);
-             border-radius: 16px;
-             padding: 1px;
-             margin: 20px 0;
-             box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1),
-                         0 2px 4px -1px rgba(0, 0, 0, 0.06);
-         }}
-
-         .dark-styled-table {{
-             width: 100%;
-             border-collapse: separate;
-             border-spacing: 0;
-             font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
-             background: transparent;
-             color: #ffffff;
-         }}
-
-         .dark-styled-table thead {{
-             position: sticky;
-             top: 0;
-             background: linear-gradient(180deg, #1a1b1e, #1d1e22);
-             z-index: 1;
-         }}
-
-         .dark-styled-table th {{
-             padding: 12px 20px;
-             text-align: left;
-             font-weight: 600;
-             color: #ffffff;
-             text-transform: uppercase;
-             font-size: 0.75em;
-             background: #1a1b1e;
-             letter-spacing: 0.05em;
-             border-bottom: 1px solid #2d2e32;
-         }}
-
-         .dark-styled-table td {{
-             padding: 16px 20px;
-             border-bottom: 1px solid rgba(45, 46, 50, 0.5);
-             color: #ffffff;
-             font-size: 0.95em;
-         }}
-
-         .dark-styled-table tbody tr {{
-             transition: all 0.2s ease;
-             background: transparent;
-         }}
-
-         .dark-styled-table tbody tr:hover {{
-             background: rgba(45, 46, 50, 0.5);
-         }}
-
-         .model-cell {{
-             font-weight: 500;
-             color: #e2e8f0;
-         }}
-
-         .cost-cell {{
-             font-family: 'SF Mono', monospace;
-             color: #94a3b8;
-         }}
-
-         .note-box {{
-             margin: 20px 0;
-             padding: 16px 20px;
-             background: rgba(45, 46, 50, 0.5);
-             border-radius: 12px;
-             color: #94a3b8;
-             font-size: 0.9em;
-             border-left: 4px solid #4f46e5;
-         }}
-
-         /* Custom scrollbar */
-         .dark-table-container::-webkit-scrollbar {{
-             width: 8px;
-         }}
-
-         .dark-table-container::-webkit-scrollbar-track {{
-             background: #1a1b1e;
-             border-radius: 4px;
-         }}
-
-         .dark-table-container::-webkit-scrollbar-thumb {{
-             background: #2d2e32;
-             border-radius: 4px;
-         }}
-
-         .dark-table-container::-webkit-scrollbar-thumb:hover {{
-             background: #3d3e42;
-         }}
-     </style>
-     <div class="dark-table-container">
-         <table class="dark-styled-table">
-             <thead>
-                 <tr>
-                     <th>RANK</th>
-                     <th>MODEL</th>
-                     <th>TYPE</th>
-                     <th>COST (I/O)</th>
-                     <th>SCORE</th>
-                 </tr>
-             </thead>
-             <tbody>
-     """
-
-     for _, row in filtered_df.iterrows():
-         rank_display = get_rank_badge(row["Rank"])
-         type_badge = get_type_badge(row["Model Type"])
-         score_bar = get_score_bar(row["Category Score"])
-
-         table_html += f"""
-             <tr>
-                 <td>{rank_display}</td>
-                 <td class="model-cell">{row['Model']}</td>
-                 <td>{type_badge}</td>
-                 <td class="cost-cell">${row['Input cost per million token']:.2f}/${row['Output cost per million token']:.2f}</td>
-                 <td>{score_bar}</td>
-             </tr>
-         """
-
-     table_html += """
-             </tbody>
-         </table>
-     </div>
-     <div class="note-box">
-         Note: Cost for sorting is calculated using 3:1 ratio on I/O. Cost of Gemini 2.0 is assumed to be same as that of Gemini 1.5.
-     </div>
-     """
-
-     return table_html, perf_chart, cost_chart