"""Constants, styles, and data loading for the Agent Leaderboard app."""

from glob import glob
from pathlib import Path

import numpy as np
import pandas as pd

# Dataset names derived from the parquet files shipped under ./datasets.
DATASETS = [Path(file).stem for file in glob("datasets/*.parquet")]

# Score filter options 0.0, 0.1, ..., 1.0; rounded to 2 places to strip
# float artifacts from np.arange (e.g. 0.30000000000000004 -> 0.3).
SCORES = [round(x, 2) for x in np.arange(0, 1.1, 0.1).tolist()]


def load_data():
    """Load and preprocess the leaderboard results.

    Reads ``results.csv`` from the working directory, drops rows with any
    missing values, and adds a combined "IO Cost" column.

    Returns:
        pd.DataFrame: results with an extra ``"IO Cost"`` column computed as
        a 3:1 weighted blend of input and output cost per million tokens.
    """
    df = pd.read_csv("results.csv").dropna()
    # Combined I/O cost assumes a typical 3:1 input:output token ratio.
    df["IO Cost"] = (
        df["Input cost per million token"] * 0.75
        + df["Output cost per million token"] * 0.25
    )
    return df


# categories.py
# Maps a display-facing category name to the result columns it aggregates.
CATEGORIES = {
    "Overall": ["Model Avg"],
    "Overall single turn": ["single turn perf"],
    "Overall multi turn": ["multi turn perf"],
    "Single func call": [
        "xlam_single_tool_single_call",
        "xlam_multiple_tool_single_call",
    ],
    "Multiple func call": [
        "xlam_multiple_tool_multiple_call",
        "xlam_single_tool_multiple_call",
        "BFCL_v3_multi_turn_base_multi_func_call",
    ],
    "Irrelevant query": ["BFCL_v3_irrelevance"],
    "Long context": ["tau_long_context", "BFCL_v3_multi_turn_long_context"],
    "Missing func": ["xlam_tool_miss", "BFCL_v3_multi_turn_miss_func"],
    "Missing params": ["BFCL_v3_multi_turn_miss_param"],
    "Composite": ["BFCL_v3_multi_turn_composite"],
}

# Markdown shown on the "Methodology" tab.
METHODOLOGY = """# Methodology

## Overview
The Agent Leaderboard evaluates language models' ability to effectively use tools
and maintain coherent multi-turn conversations. The evaluation focuses on both
basic functionality and edge cases that challenge real-world applicability.

## Tool Selection Quality Metric
Models are evaluated on their ability to:
- Correctly identify when tools are needed
- Select the appropriate tool for the task
- Handle cases where no suitable tool exists
- Maintain context across multiple interactions

## Dataset Structure
| Type | Samples | Category | Dataset Name | Purpose |
|------|---------|-----------|--------------|----------|
| Single-Turn | 100 + 100 | Single Function Call | xlam_single_tool_single_call | Evaluates basic ability to read documentation and make single function calls |
| | 200 + 50 | Multiple Function Call | xlam_multiple_tool_multiple_call, xlam_single_tool_multiple_call | Tests parallel execution and result aggregation capabilities |
| | 100 | Irrelevant Query | BFCL_v3_irrelevance | Tests ability to recognize when available tools don't match user needs |
| | 100 | Long Context | tau_long_context | Assesses handling of extended interactions and complex instructions |
| Multi-Turn | 50 + 30 | Single Function Call | BFCL_v3_multi_turn_base_single_func_call, toolscs_single_func_call | Tests basic conversational function calling abilities |
| | 50 | Multiple Function Call | BFCL_v3_multi_turn_base_multi_func_call | Evaluates handling of multiple function calls in conversation |
| | 100 | Missing Function | BFCL_v3_multi_turn_miss_func | Tests graceful handling of unavailable tools |
| | 100 | Missing Parameters | BFCL_v3_multi_turn_miss_param | Assesses parameter collection and handling incomplete information |
| | 100 | Composite | BFCL_v3_multi_turn_composite | Tests overall robustness in complex scenarios |
"""

# Markdown shown on the "Insights" tab.
INSIGHTS = """
# Key Insights from Agent Leaderboard

| Category | Finding | Implications |
|----------|---------|--------------|
| Performance Leader | Gemini-2.0-flash dominates with excellent performance at a fraction of typical costs | Demonstrates that top-tier performance is achievable without premium pricing |
| Cost vs Performance | Top 3 models span a 200x price difference yet show only 6% performance gap | Challenges traditional pricing assumptions in the market and suggests potential overpricing at the high end |
| Open Source Models | Qwen-72b matches premium models in safety and context handling at lower cost | Signals growing maturity in open-source models and potential for broader adoption |
| Safety Features | While irrelevance detection is widely solved, tool miss detection remains a challenge | Highlights uneven development in safety features and areas needing focused improvement |
| Edge Case Handling | Models still struggle with maintaining context in complex scenarios | Indicates need for architectural improvements in handling sophisticated interactions |
| Architecture Impact | Models show clear trade-offs between context handling and parallel execution | Suggests need for specialized models or hybrid approaches for different use cases |

**Note:** Findings based on comprehensive evaluation across multiple tasks and scenarios.
"""

# CSS for the chat-visualization view.
chat_css = """
/* Container styles */
.container { display: flex; gap: 1.5rem; height: calc(100vh - 100px); padding: 1rem; }

/* Chat panel styles */
.chat-panel { flex: 2; background: #1a1f2c; border-radius: 1rem; padding: 1rem; overflow-y: auto; max-height: calc(100vh - 120px); }

/* Message styles */
.message { padding: 1.2rem; margin: 0.8rem; border-radius: 1rem; font-family: monospace; box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1); }
.system { background: linear-gradient(135deg, #8e44ad, #9b59b6); }
.user { background: linear-gradient(135deg, #2c3e50, #3498db); margin-left: 2rem; }
.assistant { background: linear-gradient(135deg, #27ae60, #2ecc71); margin-right: 2rem; }
.role-badge { display: inline-block; padding: 0.3rem 0.8rem; border-radius: 0.5rem; font-weight: bold; margin-bottom: 0.8rem; font-size: 0.9rem; text-transform: uppercase; letter-spacing: 0.05em; }
.system-role { background-color: #8e44ad; color: white; }
.user-role { background-color: #3498db; color: white; }
.assistant-role { background-color: #27ae60; color: white; }
.content { white-space: pre-wrap; word-break: break-word; color: #f5f6fa; line-height: 1.5; }

/* Metrics panel styles */
.metrics-panel { flex: 1; display: flex; flex-direction: column; gap: 2rem; padding: 1.5rem; background: #1a1f2c; border-radius: 1rem; }
.metric-section { background: #1E293B; padding: 1.5rem; border-radius: 1rem; }
.score-section { text-align: center; }
.score-display { font-size: 3rem; font-weight: bold; color: #4ADE80; line-height: 1; margin: 0.5rem 0; }
.explanation-text { color: #E2E8F0; line-height: 1.6; font-size: 0.95rem; }

/* Tool info panel styles */
.tool-info-panel { background: #1a1f2c; padding: 1.5rem; border-radius: 1rem; color: #f5f6fa; }
.tool-section { margin-bottom: 1.5rem; }
.tool-name { font-size: 1.2rem; color: #4ADE80; font-weight: bold; margin-bottom: 0.5rem; }
.tool-description { color: #E2E8F0; line-height: 1.6; margin-bottom: 1rem; }
.tool-parameters .parameter { margin: 0.5rem 0; padding: 0.5rem; background: rgba(255, 255, 255, 0.05); border-radius: 0.5rem; }
.param-name { color: #63B3ED; font-weight: bold; margin-right: 0.5rem; }
.tool-examples .example { margin: 0.5rem 0; padding: 0.5rem; background: rgba(255, 255, 255, 0.05); border-radius: 0.5rem; font-family: monospace; }

/* Custom scrollbar */
::-webkit-scrollbar { width: 8px; }
::-webkit-scrollbar-track { background: rgba(255, 255, 255, 0.1); border-radius: 4px; }
::-webkit-scrollbar-thumb { background: linear-gradient(45deg, #3498db, #2ecc71); border-radius: 4px; }

/* Title styles */
.title { color: #63B3ED; font-size: 2rem; font-weight: bold; text-align: center; margin-bottom: 1.5rem; padding: 1rem; }

/* Headers */
h3 { color: #63B3ED; margin: 0 0 1rem 0; font-size: 1.1rem; font-weight: 500; letter-spacing: 0.05em; }
"""

# Updated header and cards with theme awareness
HEADER_CONTENT = """

Agent Leaderboard

Comprehensive multi-benchmark evaluation for tool calling

Blog GitHub Dataset
"""

CARDS = """

Make Better Decisions

  • Cost-effectiveness analysis
  • Business impact metrics
  • Vendor strategy insights

360° Domain Evaluation

  • Cross-domain evaluation
  • Real-world use cases
  • Edge case evaluation

Updated Periodically

  • 11 private models evaluated
  • 5 open source models included
  • Monthly model additions
"""

DESCRIPTION_HTML = """
🎯 Purpose Latest Update: Feb 2025

Welcome to the AI Agent Tool Calling Leaderboard! This comprehensive benchmark evaluates language models' ability to effectively utilize tools and functions in complex scenarios.

🔍 What We Evaluate
🔄 Single/Multi-turn Interactions
🧩 Function Composition
⚡ Error Handling
📊 Key Results
✅ Accuracy Performance
💰 Open Vs Closed Source
⚖️ Overall Effectiveness
💡 Use the filters below to explore different aspects of the evaluation and compare model performance across various dimensions.
"""