import numpy as np
import pandas as pd
from glob import glob
from pathlib import Path

DATASETS = [Path(file).stem for file in glob("datasets/*.parquet")]
SCORES = [round(x, 2) for x in np.arange(0, 1.1, 0.1).tolist()]


def load_data():
    """Load and preprocess the data."""
    df = pd.read_csv("results.csv").dropna()
    # Add combined I/O cost column with a 3:1 input:output weighting
    df["IO Cost"] = (
        df["Input cost per million token"] * 0.75
        + df["Output cost per million token"] * 0.25
    )
    return df


# categories.py
CATEGORIES = {
    "Overall": ["Model Avg"],
    "Overall single turn": ["single turn perf"],
    "Overall multi turn": ["multi turn perf"],
    "Single func call": [
        "xlam_single_tool_single_call",
        "xlam_multiple_tool_single_call",
    ],
    "Multiple func call": [
        "xlam_multiple_tool_multiple_call",
        "xlam_single_tool_multiple_call",
        "BFCL_v3_multi_turn_base_multi_func_call",
    ],
    "Irrelevant query": ["BFCL_v3_irrelevance"],
    "Long context": ["tau_long_context", "BFCL_v3_multi_turn_long_context"],
    "Missing func": ["xlam_tool_miss", "BFCL_v3_multi_turn_miss_func"],
    "Missing params": ["BFCL_v3_multi_turn_miss_param"],
    "Composite": ["BFCL_v3_multi_turn_composite"],
}

chat_css = """
/* Container styles */
.container { display: flex; gap: 1.5rem; height: calc(100vh - 100px); padding: 1rem; }

/* Chat panel styles */
.chat-panel { flex: 2; background: #1a1f2c; border-radius: 1rem; padding: 1rem; overflow-y: auto; max-height: calc(100vh - 120px); }

/* Message styles */
.message { padding: 1.2rem; margin: 0.8rem; border-radius: 1rem; font-family: monospace; box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1); }
.system { background: linear-gradient(135deg, #8e44ad, #9b59b6); }
.user { background: linear-gradient(135deg, #2c3e50, #3498db); margin-left: 2rem; }
.assistant { background: linear-gradient(135deg, #27ae60, #2ecc71); margin-right: 2rem; }

.role-badge { display: inline-block; padding: 0.3rem 0.8rem; border-radius: 0.5rem; font-weight: bold; margin-bottom: 0.8rem; font-size: 0.9rem; text-transform: uppercase; letter-spacing: 0.05em; }
.system-role { background-color: #8e44ad; color: white; }
.user-role { background-color: #3498db; color: white; }
.assistant-role { background-color: #27ae60; color: white; }

.content { white-space: pre-wrap; word-break: break-word; color: #f5f6fa; line-height: 1.5; }

/* Metrics panel styles */
.metrics-panel { flex: 1; display: flex; flex-direction: column; gap: 2rem; padding: 1.5rem; background: #1a1f2c; border-radius: 1rem; }
.metric-section { background: #1E293B; padding: 1.5rem; border-radius: 1rem; }
.score-section { text-align: center; }
.score-display { font-size: 3rem; font-weight: bold; color: #4ADE80; line-height: 1; margin: 0.5rem 0; }
.explanation-text { color: #E2E8F0; line-height: 1.6; font-size: 0.95rem; }

/* Tool info panel styles */
.tool-info-panel { background: #1a1f2c; padding: 1.5rem; border-radius: 1rem; color: #f5f6fa; }
.tool-section { margin-bottom: 1.5rem; }
.tool-name { font-size: 1.2rem; color: #4ADE80; font-weight: bold; margin-bottom: 0.5rem; }
.tool-description { color: #E2E8F0; line-height: 1.6; margin-bottom: 1rem; }
.tool-parameters .parameter { margin: 0.5rem 0; padding: 0.5rem; background: rgba(255, 255, 255, 0.05); border-radius: 0.5rem; }
.param-name { color: #63B3ED; font-weight: bold; margin-right: 0.5rem; }
.tool-examples .example { margin: 0.5rem 0; padding: 0.5rem; background: rgba(255, 255, 255, 0.05); border-radius: 0.5rem; font-family: monospace; }

/* Custom scrollbar */
::-webkit-scrollbar { width: 8px; }
::-webkit-scrollbar-track { background: rgba(255, 255, 255, 0.1); border-radius: 4px; }
::-webkit-scrollbar-thumb { background: linear-gradient(45deg, #3498db, #2ecc71); border-radius: 4px; }

/* Title styles */
.title { color: #63B3ED; font-size: 2rem; font-weight: bold; text-align: center; margin-bottom: 1.5rem; padding: 1rem; }

/* Headers */
h3 { color: #63B3ED; margin: 0 0 1rem 0; font-size: 1.1rem; font-weight: 500; letter-spacing: 0.05em; }
"""

COMMON = """
"""

DESCRIPTION_HTML = """
🎯 Purpose
Latest Update: Feb 2025

This comprehensive benchmark evaluates language models' ability to effectively utilize tools and functions in complex scenarios.

🔍 What We Evaluate
🔄 Single/Multi-turn Interactions
🧩 Function Composition
⚡ Error Handling
📊 Key Results
✅ Tool Selection Quality
💰 Open vs. Closed Source
⚖️ Overall Effectiveness
""" HEADER_CONTENT = ( COMMON + """
Welcome to the
Agent Leaderboard!
The landscape of AI agents is evolving rapidly, with major tech CEOs predicting 2025 as a pivotal year. We built this leaderboard to answer one simple question:
"How do AI agents perform in real-world agentic scenarios?"
""" ) CARDS = """
17
Total Models
12 Private
5 Open Source
14
Evaluation Datasets
Cross-Domain Testing
Real-world use cases
TSQ
Evaluation Metric
Tool Selection Quality
GPT-4o Based Judge
""" METHODOLOGY = """

Methodology

Overview

We evaluate language models' ability to effectively use tools in single and multi-turn conversations. Our evaluation focuses on both basic functionality and edge cases that challenge real-world applicability.
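
To make the metric concrete, the sketch below shows what a judge-based Tool Selection Quality (TSQ) check could look like. It is illustrative only: the helper name, prompt, and rubric are assumptions, not the leaderboard's exact implementation; the real judge is GPT-4o-based, as noted above.

```python
from openai import OpenAI

client = OpenAI()

def judge_tool_selection(query: str, tools: str, tool_call: str) -> float:
    # Hypothetical TSQ-style judge: ask GPT-4o for a 0-1 rating of the call.
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "system",
                "content": "Rate the assistant's tool call between 0 and 1 "
                "for correct tool choice and parameter values. "
                "Reply with the number only.",
            },
            {
                "role": "user",
                "content": "Query: " + query
                + " | Tools: " + tools
                + " | Tool call: " + tool_call,
            },
        ],
    )
    return float(response.choices[0].message.content.strip())
```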

Key Insights

| Category | Finding |
| --- | --- |
| Performance Champion | Gemini-2.0-flash dominates with a 0.935 score at just $0.075 per million tokens, excelling in both complex tasks (0.95) and safety features (0.98) |
| Price-Performance Paradox | The top 3 models span a 20x price difference yet only a 3% performance gap, challenging pricing assumptions |
| Open vs. Closed Source | The new Mistral-small leads open-source models and performs similarly to GPT-4o-mini at 0.83, signaling OSS maturity in tool calling |
| Reasoning Models | Despite their reasoning strengths, o1 and o3-mini are far from perfect, scoring 0.87 and 0.84 respectively. DeepSeek V3 and R1 were excluded from rankings due to limited function support |
| Tool Miss Detection | Averages of 0.59 and 0.78 on the tool-miss datasets reveal fundamental challenges in handling edge cases and maintaining context, even as models excel at basic tasks |
| Architecture Trade-offs | Long context vs. parallel execution shows architectural limits: o1 leads in long context (0.98) but fails parallel tasks (0.43), while GPT-4o shows the opposite pattern |

Development Implications

| Area | Recommendation |
| --- | --- |
| Task Complexity | Simple tasks work with most models. Complex workflows requiring multiple tools need models with 0.85+ scores in composite tests (see the sketch below) |
| Error Handling | Models with low tool selection scores need guardrails. Add validation layers and structured error recovery, especially for parameter collection |
| Context Management | Long conversations require either models strong in context retention or external context storage systems |
| Reasoning Models | While o1 and o3-mini excelled in function calling, DeepSeek V3 and R1 were excluded from rankings due to limited function support |
| Safety Controls | Add strict tool access controls for models weak in irrelevance detection. Include validation layers for inconsistent performers |
| Open vs. Closed Source | Private models lead in complex tasks, but open-source options work well for basic operations. Choose based on your scaling needs |
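
For the Task Complexity row, the sketch below shows how that threshold translates into a filter over the leaderboard data, reusing load_data() and the IO Cost column defined in this module; the Model column name is an assumption.

```python
df = load_data()

# Keep only models that clear the 0.85 bar on the composite multi-turn test,
# then rank the survivors by blended I/O cost (cheapest first).
shortlist = df[df["BFCL_v3_multi_turn_composite"] >= 0.85]
print(
    shortlist.sort_values("IO Cost")[
        ["Model", "BFCL_v3_multi_turn_composite", "IO Cost"]
    ]
)
```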

Dataset Structure

| Type | Samples | Category | Dataset Name | Purpose |
| --- | --- | --- | --- | --- |
| Single-Turn | 100 + 100 | Single Function Call | xlam_single_tool_single_call, xlam_multiple_tool_single_call | Evaluates basic ability to read documentation and make single function calls |
| Single-Turn | 200 + 50 | Multiple Function Call | xlam_multiple_tool_multiple_call, xlam_single_tool_multiple_call | Tests parallel execution and result aggregation capabilities |
| Single-Turn | 100 | Irrelevant Query | BFCL_v3_irrelevance | Tests ability to recognize when available tools don't match user needs |
| Single-Turn | 100 | Long Context | tau_long_context | Assesses handling of extended interactions and complex instructions |
| Multi-Turn | 50 + 30 | Single Function Call | BFCL_v3_multi_turn_base_single_func_call, toolscs_single_func_call | Tests basic conversational function calling abilities |
| Multi-Turn | 50 | Multiple Function Call | BFCL_v3_multi_turn_base_multi_func_call | Evaluates handling of multiple function calls in conversation |
| Multi-Turn | 100 | Missing Function | BFCL_v3_multi_turn_miss_func | Tests graceful handling of unavailable tools |
| Multi-Turn | 100 | Missing Parameters | BFCL_v3_multi_turn_miss_param | Assesses parameter collection and handling incomplete information |
| Multi-Turn | 100 | Composite | BFCL_v3_multi_turn_composite | Tests overall robustness in complex scenarios |
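
The leaderboard rolls these datasets up into category scores via the CATEGORIES mapping defined in this module. A minimal sketch of that aggregation, assuming each dataset name appears as a numeric score column in results.csv and that a category score is a plain mean:

```python
df = load_data()

# Average each category's dataset columns into a single category score.
for category, datasets in CATEGORIES.items():
    cols = [c for c in datasets if c in df.columns]
    if cols:
        df[category] = df[cols].mean(axis=1)
```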

Make Better Decisions

  • Cost-effectiveness analysis
  • Business impact metrics
  • Vendor strategy insights

360° Domain Evaluation

  • Cross-domain evaluation
  • Real-world use cases
  • Edge case evaluation

Updated Periodically

  • 12 private models evaluated
  • 5 open source models included
  • Monthly model additions
"""