Spaces:

galileo-ai
/

agent-leaderboard

Running

App Files Community

Pratik Bhavsar committed 5 days ago

Commit

523927e

1 Parent(s): 80c01c6

improved title

Browse files

Files changed (2) hide show

app.py +37 -17
data_loader.py +85 -78

app.py CHANGED Viewed

@@ -1,9 +1,15 @@
 import gradio as gr
-from data_loader import load_data, CATEGORIES, INSIGHTS, METHODOLOGY, HEADER_CONTENT
 from utils import model_info_tab, filter_leaderboard
 from visualization import setup_matplotlib
 def create_app():
     setup_matplotlib()
     df = load_data()
@@ -11,37 +17,42 @@ def create_app():
     with gr.Blocks(theme=gr.themes.Soft()) as app:
         with gr.Tabs():
             with gr.Tab("Leaderboard"):
                 with gr.Row():
                     with gr.Column(scale=1):
-                        gr.Markdown("# Filters")
                         model_type = gr.Dropdown(
                             choices=["All"] + df["Model Type"].unique().tolist(),
                             value="All",
                             label="Model Type",
                         )
                         category = gr.Dropdown(
                             choices=list(CATEGORIES.keys()),
                             value=list(CATEGORIES.keys())[0],
                             label="Category",
                         )
                         sort_by = gr.Radio(
                             choices=["Performance", "Cost"],
                             value="Performance",
                             label="Sort by",
                         )
                     with gr.Column(scale=4):
-                        # Add the new header content above everything
-                        gr.HTML(HEADER_CONTENT)
                         output = gr.HTML()
                         plot1 = gr.Plot()
                         plot2 = gr.Plot()
-                        # Add methodology section
-                        gr.Markdown("# Methodology")
                         gr.Markdown(METHODOLOGY)
-                        # Add insights section
-                        gr.Markdown("# Key Insights")
-                        gr.Markdown(INSIGHTS)
                 for input_comp in [model_type, category, sort_by]:
                     input_comp.change(
@@ -50,18 +61,29 @@ def create_app():
                         outputs=[output, plot1, plot2],
                     )
-            with gr.Tab("Model Performance"):
-                gr.HTML(HEADER_CONTENT)
                 with gr.Row():
                     with gr.Column(scale=1):
                         model_selector = gr.Dropdown(
                             choices=df["Model"].unique().tolist(),
                             value=df.sort_values("Model Avg", ascending=False).iloc[0][
                                 "Model"
                             ],
                             multiselect=True,
-                            label="Models",
                         )
                     with gr.Column(scale=4):
                         model_info = gr.HTML()
                         radar_plot = gr.Plot()
@@ -88,7 +110,5 @@ def create_app():
     return app
-if __name__ == "__main__":
-    demo = create_app()
-    demo.launch()

 import gradio as gr
+from data_loader import (
+    load_data,
+    CATEGORIES,
+    INSIGHTS,
+    METHODOLOGY,
+    HEADER_CONTENT,
+    CARDS,
+)
 from utils import model_info_tab, filter_leaderboard
 from visualization import setup_matplotlib
 def create_app():
     setup_matplotlib()
     df = load_data()
     with gr.Blocks(theme=gr.themes.Soft()) as app:
         with gr.Tabs():
             with gr.Tab("Leaderboard"):
+                gr.HTML(HEADER_CONTENT + CARDS)
                 with gr.Row():
+                    # Left column for filters (20% width)
                     with gr.Column(scale=1):
+                        gr.HTML(
+                            """
+                            <div style="background: #1a1b1e; padding: 20px; border-radius: 12px; margin-bottom: 20px;">
+                                <h3 style="margin-top: 0; color: white; font-size: 1.2em;">Filters</h3>
+                            </div>
+                        """
+                        )
                         model_type = gr.Dropdown(
                             choices=["All"] + df["Model Type"].unique().tolist(),
                             value="All",
                             label="Model Type",
+                            container=True,
                         )
                         category = gr.Dropdown(
                             choices=list(CATEGORIES.keys()),
                             value=list(CATEGORIES.keys())[0],
                             label="Category",
+                            container=True,
                         )
                         sort_by = gr.Radio(
                             choices=["Performance", "Cost"],
                             value="Performance",
                             label="Sort by",
+                            container=True,
                         )
+                    # Right column for content (80% width)
                     with gr.Column(scale=4):
                         output = gr.HTML()
                         plot1 = gr.Plot()
                         plot2 = gr.Plot()
                         gr.Markdown(METHODOLOGY)
                 for input_comp in [model_type, category, sort_by]:
                     input_comp.change(
                         outputs=[output, plot1, plot2],
                     )
+            with gr.Tab("Model Comparison"):
+                gr.HTML(HEADER_CONTENT + CARDS)
                 with gr.Row():
+                    # Left column for filters (20% width)
                     with gr.Column(scale=1):
+                        gr.HTML(
+                            """
+                            <div style="background: #1a1b1e; padding: 20px; border-radius: 12px; margin-bottom: 20px;">
+                                <h3 style="margin-top: 0; color: white; font-size: 1.2em;">Models</h3>
+                            </div>
+                        """
+                        )
                         model_selector = gr.Dropdown(
                             choices=df["Model"].unique().tolist(),
                             value=df.sort_values("Model Avg", ascending=False).iloc[0][
                                 "Model"
                             ],
                             multiselect=True,
+                            label="Select Models",
+                            container=True,
                         )
+                    # Right column for content (80% width)
                     with gr.Column(scale=4):
                         model_info = gr.HTML()
                         radar_plot = gr.Plot()
     return app
+demo = create_app()
+demo.launch()

data_loader.py CHANGED Viewed

@@ -34,49 +34,6 @@ CATEGORIES = {
     "Composite": ["BFCL_v3_multi_turn_composite"],
 }
-INSIGHTS = """
-                # Key Insights from Agent Leaderboard
-                | Category | Finding | Implications |
-                |----------|---------|--------------|
-                | Performance Leader | Gemini-2.0-flash dominates with excellent performance at a fraction of typical costs | Demonstrates that top-tier performance is achievable without premium pricing |
-                | Cost vs Performance | Top 3 models span a 200x price difference yet show only 6% performance gap | Challenges traditional pricing assumptions in the market and suggests potential overpricing at the high end |
-                | Open Source Models | Qwen-72b matches premium models in safety and context handling at lower cost | Signals growing maturity in open-source models and potential for broader adoption |
-                | Safety Features | While irrelevance detection is widely solved, tool miss detection remains a challenge | Highlights uneven development in safety features and areas needing focused improvement |
-                | Edge Case Handling | Models still struggle with maintaining context in complex scenarios | Indicates need for architectural improvements in handling sophisticated interactions |
-                | Architecture Impact | Models show clear trade-offs between context handling and parallel execution | Suggests need for specialized models or hybrid approaches for different use cases |
-                **Note:** Findings based on comprehensive evaluation across multiple tasks and scenarios.
-                """
-METHODOLOGY = """
-                # Methodology
-                ## Overview
-                The Agent Leaderboard evaluates language models' ability to effectively use tools and maintain coherent multi-turn conversations.
-                The evaluation focuses on both basic functionality and edge cases that challenge real-world applicability.
-                ## Tool Selection Quality Metric
-                Models are evaluated on their ability to:
-                - Correctly identify when tools are needed
-                - Select the appropriate tool for the task
-                - Handle cases where no suitable tool exists
-                - Maintain context across multiple interactions
-                ## Dataset Structure
-                | Type | Samples | Category | Dataset Name | Purpose |
-                |------|---------|-----------|--------------|----------|
-                | Single-Turn | 100 + 100 | Single Function Call | xlam_single_tool_single_call | Evaluates basic ability to read documentation and make single function calls |
-                | | 200 + 50 | Multiple Function Call | xlam_multiple_tool_multiple_call, xlam_single_tool_multiple_call | Tests parallel execution and result aggregation capabilities |
-                | | 100 | Irrelevant Query | BFCL_v3_irrelevance | Tests ability to recognize when available tools don't match user needs |
-                | | 100 | Long Context | tau_long_context | Assesses handling of extended interactions and complex instructions |
-                | Multi-Turn | 50 + 30 | Single Function Call | BFCL_v3_multi_turn_base_single_func_call, toolscs_single_func_call | Tests basic conversational function calling abilities |
-                | | 50 | Multiple Function Call | BFCL_v3_multi_turn_base_multi_func_call | Evaluates handling of multiple function calls in conversation |
-                | | 100 | Missing Function | BFCL_v3_multi_turn_miss_func | Tests graceful handling of unavailable tools |
-                | | 100 | Missing Parameters | BFCL_v3_multi_turn_miss_param | Assesses parameter collection and handling incomplete information |
-                | | 100 | Composite | BFCL_v3_multi_turn_composite | Tests overall robustness in complex scenarios |
-                """
 HEADER_CONTENT = """
 <style>
     .header-wrapper {
@@ -89,12 +46,9 @@ HEADER_CONTENT = """
         text-align: center;
     }
-    .title {
-        color: #ffffff;
-        font-size: 2.5rem;
-        font-weight: 600;
-        margin-bottom: 1.5rem;
-        text-align: center;
     }
     .description {
@@ -111,6 +65,7 @@ HEADER_CONTENT = """
         gap: 1rem;
         justify-content: center;
         margin-bottom: 2rem;
     }
     .action-button {
@@ -121,8 +76,8 @@ HEADER_CONTENT = """
         background: rgba(30, 30, 45, 0.95);
         border: 1px solid rgba(255, 255, 255, 0.1);
         border-radius: 100px;
-        color: #fff;
-        text-decoration: none;
         font-size: 0.95rem;
         transition: all 0.2s ease;
     }
@@ -130,6 +85,7 @@ HEADER_CONTENT = """
     .action-button:hover {
         background: rgba(40, 40, 55, 0.95);
         border-color: rgba(255, 255, 255, 0.2);
     }
     .update-info {
@@ -203,38 +159,46 @@ HEADER_CONTENT = """
         border-radius: 50%;
         flex-shrink: 0;
     }
 </style>
 <div class="header-wrapper">
-    <h1 class="title">Agent Leaderboard</h1>
-    <p class="description">
-        A comprehensive benchmark for evaluating AI agents in real-world business scenarios, comparing practical performance across multiple domains and use cases.
-    </p>
-    <div class="actions">
-        <a href="#" class="action-button">
-            <svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
-                <path d="M15 7h3a5 5 0 0 1 5 5 5 5 0 0 1-5 5h-3m-6 0H6a5 5 0 0 1-5-5 5 5 0 0 1 5-5h3"/>
-                <line x1="8" y1="12" x2="16" y2="12"/>
-            </svg>
-            Blog
-        </a>
-        <a href="#" class="action-button">
-            <svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
-                <path d="M9 19c-5 1.5-5-2.5-7-3m14 6v-3.87a3.37 3.37 0 0 0-.94-2.61c3.14-.35 6.44-1.54 6.44-7A5.44 5.44 0 0 0 20 4.77 5.07 5.07 0 0 0 19.91 1S18.73.65 16 2.48a13.38 13.38 0 0 0-7 0C6.27.65 5.09 1 5.09 1A5.07 5.07 0 0 0 5 4.77a5.44 5.44 0 0 0-1.5 3.78c0 5.42 3.3 6.61 6.44 7A3.37 3.37 0 0 0 9 18.13V22"/>
-            </svg>
-            GitHub
-        </a>
-        <a href="#" class="action-button">
-            <svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
-                <path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4"/>
-                <polyline points="7 10 12 15 17 10"/>
-                <line x1="12" y1="15" x2="12" y2="3"/>
-            </svg>
-            Dataset
-        </a>
-    </div>
     <div class="features-grid">
         <div class="feature-card">
             <div class="feature-icon">
@@ -283,3 +247,46 @@ HEADER_CONTENT = """
     </div>
 </div>
 """

     "Composite": ["BFCL_v3_multi_turn_composite"],
 }
 HEADER_CONTENT = """
 <style>
     .header-wrapper {
         text-align: center;
     }
+    .header-wrapper a {
+        color: #ffffff !important;
+        text-decoration: none !important;
     }
     .description {
         gap: 1rem;
         justify-content: center;
         margin-bottom: 2rem;
+        color: #ffffff;
     }
     .action-button {
         background: rgba(30, 30, 45, 0.95);
         border: 1px solid rgba(255, 255, 255, 0.1);
         border-radius: 100px;
+        color: #ffffff !important;
+        text-decoration: none !important;
         font-size: 0.95rem;
         transition: all 0.2s ease;
     }
     .action-button:hover {
         background: rgba(40, 40, 55, 0.95);
         border-color: rgba(255, 255, 255, 0.2);
+        color: #ffffff !important;
     }
     .update-info {
         border-radius: 50%;
         flex-shrink: 0;
     }
+    /* Force all links to be white */
+    .header-wrapper a:link,
+    .header-wrapper a:visited,
+    .header-wrapper a:hover,
+    .header-wrapper a:active {
+        color: #ffffff !important;
+    }
 </style>
 <div class="header-wrapper">
+    <h1 class="title" style="font-size: 48px; font-weight: 700; margin: 40px 0; text-align: center;">Agent Leaderboard</h1>
+    <h2>Comprehensive multi-benchmark evaluation for tool calling</h2>
+<div class="actions">
+    <a href="#" class="action-button">
+        <svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
+            <path d="M15 7h3a5 5 0 0 1 5 5 5 5 0 0 1-5 5h-3m-6 0H6a5 5 0 0 1-5-5 5 5 0 0 1 5-5h3"/>
+            <line x1="8" y1="12" x2="16" y2="12"/>
+        </svg>
+        Blog
+    </a>
+    <a href="#" class="action-button">
+        <svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
+            <path d="M9 19c-5 1.5-5-2.5-7-3m14 6v-3.87a3.37 3.37 0 0 0-.94-2.61c3.14-.35 6.44-1.54 6.44-7A5.44 5.44 0 0 0 20 4.77 5.07 5.07 0 0 0 19.91 1S18.73.65 16 2.48a13.38 13.38 0 0 0-7 0C6.27.65 5.09 1 5.09 1A5.07 5.07 0 0 0 5 4.77a5.44 5.44 0 0 0-1.5 3.78c0 5.42 3.3 6.61 6.44 7A3.37 3.37 0 0 0 9 18.13V22"/>
+        </svg>
+        GitHub
+    </a>
+    <a href="#" class="action-button">
+        <svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
+            <path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4"/>
+            <polyline points="7 10 12 15 17 10"/>
+            <line x1="12" y1="15" x2="12" y2="3"/>
+        </svg>
+        Dataset
+    </a>
+</div>
+"""
+CARDS = """
     <div class="features-grid">
         <div class="feature-card">
             <div class="feature-icon">
     </div>
 </div>
 """
+METHODOLOGY = """# Methodology
+                ## Overview
+                The Agent Leaderboard evaluates language models' ability to effectively use tools and maintain coherent multi-turn conversations.
+                The evaluation focuses on both basic functionality and edge cases that challenge real-world applicability.
+                ## Tool Selection Quality Metric
+                Models are evaluated on their ability to:
+                - Correctly identify when tools are needed
+                - Select the appropriate tool for the task
+                - Handle cases where no suitable tool exists
+                - Maintain context across multiple interactions
+                ## Dataset Structure
+                | Type | Samples | Category | Dataset Name | Purpose |
+                |------|---------|-----------|--------------|----------|
+                | Single-Turn | 100 + 100 | Single Function Call | xlam_single_tool_single_call | Evaluates basic ability to read documentation and make single function calls |
+                | | 200 + 50 | Multiple Function Call | xlam_multiple_tool_multiple_call, xlam_single_tool_multiple_call | Tests parallel execution and result aggregation capabilities |
+                | | 100 | Irrelevant Query | BFCL_v3_irrelevance | Tests ability to recognize when available tools don't match user needs |
+                | | 100 | Long Context | tau_long_context | Assesses handling of extended interactions and complex instructions |
+                | Multi-Turn | 50 + 30 | Single Function Call | BFCL_v3_multi_turn_base_single_func_call, toolscs_single_func_call | Tests basic conversational function calling abilities |
+                | | 50 | Multiple Function Call | BFCL_v3_multi_turn_base_multi_func_call | Evaluates handling of multiple function calls in conversation |
+                | | 100 | Missing Function | BFCL_v3_multi_turn_miss_func | Tests graceful handling of unavailable tools |
+                | | 100 | Missing Parameters | BFCL_v3_multi_turn_miss_param | Assesses parameter collection and handling incomplete information |
+                | | 100 | Composite | BFCL_v3_multi_turn_composite | Tests overall robustness in complex scenarios |
+                """
+INSIGHTS = """
+                # Key Insights from Agent Leaderboard
+                | Category | Finding | Implications |
+                |----------|---------|--------------|
+                | Performance Leader | Gemini-2.0-flash dominates with excellent performance at a fraction of typical costs | Demonstrates that top-tier performance is achievable without premium pricing |
+                | Cost vs Performance | Top 3 models span a 200x price difference yet show only 6% performance gap | Challenges traditional pricing assumptions in the market and suggests potential overpricing at the high end |
+                | Open Source Models | Qwen-72b matches premium models in safety and context handling at lower cost | Signals growing maturity in open-source models and potential for broader adoption |
+                | Safety Features | While irrelevance detection is widely solved, tool miss detection remains a challenge | Highlights uneven development in safety features and areas needing focused improvement |
+                | Edge Case Handling | Models still struggle with maintaining context in complex scenarios | Indicates need for architectural improvements in handling sophisticated interactions |
+                | Architecture Impact | Models show clear trade-offs between context handling and parallel execution | Suggests need for specialized models or hybrid approaches for different use cases |
+                **Note:** Findings based on comprehensive evaluation across multiple tasks and scenarios.
+                """