# agent-leaderboard / data_loader.py
# Pratik Bhavsar
# refactoring and auto theme
# 4a46abc
import pandas as pd
from glob import glob
import numpy as np
from pathlib import Path
# Dataset names are the file stems of every parquet file matched under
# datasets/ (relative to the current working directory), in glob order.
DATASETS = [parquet.stem for parquet in map(Path, glob("datasets/*.parquet"))]

# Score grid 0.0, 0.1, ..., 1.0 as plain Python floats; rounding to two
# decimals strips float noise such as 0.30000000000000004.
SCORES = [round(value, 2) for value in np.linspace(0.0, 1.0, num=11).tolist()]
def load_data(
    path: str = "results.csv",
    input_weight: float = 0.75,
    output_weight: float = 0.25,
) -> pd.DataFrame:
    """Load the results CSV and add a blended "IO Cost" column.

    Args:
        path: CSV file to read. Defaults to ``"results.csv"``, which is
            resolved against the current working directory.
        input_weight: Weight applied to "Input cost per million token".
        output_weight: Weight applied to "Output cost per million token".

    Returns:
        The CSV contents with every row that has at least one missing
        value removed, plus an "IO Cost" column.

    Raises:
        KeyError: If either cost column is absent from the CSV.
    """
    # dropna() with no arguments discards a row if ANY column is missing.
    df = pd.read_csv(path).dropna()

    # Blend input and output prices; the defaults weight input 3:1 over output.
    df["IO Cost"] = (
        df["Input cost per million token"] * input_weight
        + df["Output cost per million token"] * output_weight
    )
    return df
# categories.py
# Maps each category label to the list of score-column / dataset names
# grouped under it. Insertion order is significant for anything that
# iterates this dict.
CATEGORIES = {
    # Aggregate scores
    "Overall": ["Model Avg"],
    "Overall single turn": ["single turn perf"],
    "Overall multi turn": ["multi turn perf"],
    # Function-calling scenarios
    "Single func call": [
        "xlam_single_tool_single_call",
        "xlam_multiple_tool_single_call",
    ],
    "Multiple func call": [
        "xlam_multiple_tool_multiple_call",
        "xlam_single_tool_multiple_call",
        "BFCL_v3_multi_turn_base_multi_func_call",
    ],
    # Edge cases
    "Irrelevant query": ["BFCL_v3_irrelevance"],
    "Long context": [
        "tau_long_context",
        "BFCL_v3_multi_turn_long_context",
    ],
    "Missing func": [
        "xlam_tool_miss",
        "BFCL_v3_multi_turn_miss_func",
    ],
    "Missing params": ["BFCL_v3_multi_turn_miss_param"],
    "Composite": ["BFCL_v3_multi_turn_composite"],
}
# Markdown for the methodology section: an overview, the tool-selection
# quality criteria, and a table of the evaluation datasets.
# The single-turn "Single Function Call" row reports "100 + 100" samples, so it
# names both datasets that CATEGORIES groups under "Single func call".
METHODOLOGY = """# Methodology
## Overview
The Agent Leaderboard evaluates language models' ability to effectively use tools and maintain coherent multi-turn conversations.
The evaluation focuses on both basic functionality and edge cases that challenge real-world applicability.
## Tool Selection Quality Metric
Models are evaluated on their ability to:
- Correctly identify when tools are needed
- Select the appropriate tool for the task
- Handle cases where no suitable tool exists
- Maintain context across multiple interactions
## Dataset Structure
| Type | Samples | Category | Dataset Name | Purpose |
|------|---------|-----------|--------------|----------|
| Single-Turn | 100 + 100 | Single Function Call | xlam_single_tool_single_call, xlam_multiple_tool_single_call | Evaluates basic ability to read documentation and make single function calls |
| | 200 + 50 | Multiple Function Call | xlam_multiple_tool_multiple_call, xlam_single_tool_multiple_call | Tests parallel execution and result aggregation capabilities |
| | 100 | Irrelevant Query | BFCL_v3_irrelevance | Tests ability to recognize when available tools don't match user needs |
| | 100 | Long Context | tau_long_context | Assesses handling of extended interactions and complex instructions |
| Multi-Turn | 50 + 30 | Single Function Call | BFCL_v3_multi_turn_base_single_func_call, toolscs_single_func_call | Tests basic conversational function calling abilities |
| | 50 | Multiple Function Call | BFCL_v3_multi_turn_base_multi_func_call | Evaluates handling of multiple function calls in conversation |
| | 100 | Missing Function | BFCL_v3_multi_turn_miss_func | Tests graceful handling of unavailable tools |
| | 100 | Missing Parameters | BFCL_v3_multi_turn_miss_param | Assesses parameter collection and handling incomplete information |
| | 100 | Composite | BFCL_v3_multi_turn_composite | Tests overall robustness in complex scenarios |
"""
# Markdown for the insights section: a heading, a three-column table
# (Category | Finding | Implications), and a closing note.
INSIGHTS = """
# Key Insights from Agent Leaderboard
| Category | Finding | Implications |
|----------|---------|--------------|
| Performance Leader | Gemini-2.0-flash dominates with excellent performance at a fraction of typical costs | Demonstrates that top-tier performance is achievable without premium pricing |
| Cost vs Performance | Top 3 models span a 200x price difference yet show only 6% performance gap | Challenges traditional pricing assumptions in the market and suggests potential overpricing at the high end |
| Open Source Models | Qwen-72b matches premium models in safety and context handling at lower cost | Signals growing maturity in open-source models and potential for broader adoption |
| Safety Features | While irrelevance detection is widely solved, tool miss detection remains a challenge | Highlights uneven development in safety features and areas needing focused improvement |
| Edge Case Handling | Models still struggle with maintaining context in complex scenarios | Indicates need for architectural improvements in handling sophisticated interactions |
| Architecture Impact | Models show clear trade-offs between context handling and parallel execution | Suggests need for specialized models or hybrid approaches for different use cases |
**Note:** Findings based on comprehensive evaluation across multiple tasks and scenarios.
"""
# Stylesheet (plain CSS text) with a hard-coded dark palette. It covers, in
# order: the flex container, the chat panel, message bubbles and role badges
# for system/user/assistant, the metrics panel and score display, the tool
# info panel, WebKit scrollbars, the title, and h3 headers.
# NOTE(review): the ::-webkit-scrollbar and h3 rules are not scoped to a
# class, so they apply wherever this stylesheet is loaded.
chat_css = """
/* Container styles */
.container {
display: flex;
gap: 1.5rem;
height: calc(100vh - 100px);
padding: 1rem;
}
/* Chat panel styles */
.chat-panel {
flex: 2;
background: #1a1f2c;
border-radius: 1rem;
padding: 1rem;
overflow-y: auto;
max-height: calc(100vh - 120px);
}
/* Message styles */
.message {
padding: 1.2rem;
margin: 0.8rem;
border-radius: 1rem;
font-family: monospace;
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
}
.system {
background: linear-gradient(135deg, #8e44ad, #9b59b6);
}
.user {
background: linear-gradient(135deg, #2c3e50, #3498db);
margin-left: 2rem;
}
.assistant {
background: linear-gradient(135deg, #27ae60, #2ecc71);
margin-right: 2rem;
}
.role-badge {
display: inline-block;
padding: 0.3rem 0.8rem;
border-radius: 0.5rem;
font-weight: bold;
margin-bottom: 0.8rem;
font-size: 0.9rem;
text-transform: uppercase;
letter-spacing: 0.05em;
}
.system-role {
background-color: #8e44ad;
color: white;
}
.user-role {
background-color: #3498db;
color: white;
}
.assistant-role {
background-color: #27ae60;
color: white;
}
.content {
white-space: pre-wrap;
word-break: break-word;
color: #f5f6fa;
line-height: 1.5;
}
/* Metrics panel styles */
.metrics-panel {
flex: 1;
display: flex;
flex-direction: column;
gap: 2rem;
padding: 1.5rem;
background: #1a1f2c;
border-radius: 1rem;
}
.metric-section {
background: #1E293B;
padding: 1.5rem;
border-radius: 1rem;
}
.score-section {
text-align: center;
}
.score-display {
font-size: 3rem;
font-weight: bold;
color: #4ADE80;
line-height: 1;
margin: 0.5rem 0;
}
.explanation-text {
color: #E2E8F0;
line-height: 1.6;
font-size: 0.95rem;
}
/* Tool info panel styles */
.tool-info-panel {
background: #1a1f2c;
padding: 1.5rem;
border-radius: 1rem;
color: #f5f6fa;
}
.tool-section {
margin-bottom: 1.5rem;
}
.tool-name {
font-size: 1.2rem;
color: #4ADE80;
font-weight: bold;
margin-bottom: 0.5rem;
}
.tool-description {
color: #E2E8F0;
line-height: 1.6;
margin-bottom: 1rem;
}
.tool-parameters .parameter {
margin: 0.5rem 0;
padding: 0.5rem;
background: rgba(255, 255, 255, 0.05);
border-radius: 0.5rem;
}
.param-name {
color: #63B3ED;
font-weight: bold;
margin-right: 0.5rem;
}
.tool-examples .example {
margin: 0.5rem 0;
padding: 0.5rem;
background: rgba(255, 255, 255, 0.05);
border-radius: 0.5rem;
font-family: monospace;
}
/* Custom scrollbar */
::-webkit-scrollbar {
width: 8px;
}
::-webkit-scrollbar-track {
background: rgba(255, 255, 255, 0.1);
border-radius: 4px;
}
::-webkit-scrollbar-thumb {
background: linear-gradient(45deg, #3498db, #2ecc71);
border-radius: 4px;
}
/* Title styles */
.title {
color: #63B3ED;
font-size: 2rem;
font-weight: bold;
text-align: center;
margin-bottom: 1.5rem;
padding: 1rem;
}
/* Headers */
h3 {
color: #63B3ED;
margin: 0 0 1rem 0;
font-size: 1.1rem;
font-weight: 500;
letter-spacing: 0.05em;
}
"""
# Theme-aware header fragment (HTML with an embedded <style> block).
# The <style> block defines the CSS custom properties (--bg-*, --text-*,
# --border-*, --card-bg, --accent-*) separately for dark and light
# `prefers-color-scheme`, then styles the header wrapper, action buttons,
# feature grid/cards, and title/subtitle. The markup that follows renders the
# title, subtitle, and three action links (Blog, GitHub, Dataset).
# NOTE(review): the custom properties are only set inside the two media
# queries, so there is no default when neither query matches.
# NOTE(review): <div class="header-wrapper"> is opened here but not closed in
# this string; CARDS ends with one extra </div>, so the two fragments are
# presumably concatenated — confirm at the call site.
# NOTE(review): all three action links currently point to "#".
HEADER_CONTENT = """
<style>
@media (prefers-color-scheme: dark) {
:root {
--bg-primary: rgb(17, 17, 27);
--bg-secondary: rgba(30, 30, 45, 0.95);
--bg-hover: rgba(40, 40, 55, 0.95);
--text-primary: #ffffff;
--text-secondary: #94a3b8;
--text-tertiary: #e2e8f0;
--border-color: rgba(255, 255, 255, 0.1);
--border-hover: rgba(255, 255, 255, 0.2);
--card-bg: rgba(17, 17, 27, 0.6);
--accent-color: #4F46E5;
--accent-bg: rgba(79, 70, 229, 0.1);
}
}
@media (prefers-color-scheme: light) {
:root {
--bg-primary: rgb(255, 255, 255);
--bg-secondary: rgba(243, 244, 246, 0.95);
--bg-hover: rgba(229, 231, 235, 0.95);
--text-primary: #000000;
--text-secondary: #4b5563;
--text-tertiary: #1f2937;
--border-color: rgba(0, 0, 0, 0.1);
--border-hover: rgba(0, 0, 0, 0.2);
--card-bg: rgba(249, 250, 251, 0.6);
--accent-color: #4F46E5;
--accent-bg: rgba(79, 70, 229, 0.1);
}
}
.header-wrapper {
padding: 3rem 2rem;
background: var(--bg-primary);
border-radius: 16px;
display: flex;
flex-direction: column;
align-items: center;
text-align: center;
}
.header-wrapper a {
color: var(--text-primary) !important;
text-decoration: none !important;
}
.description {
color: var(--text-primary);
font-size: 1.1rem;
line-height: 1.6;
max-width: 800px;
margin: 0 auto 2rem;
text-align: center;
}
.actions {
display: flex;
gap: 1rem;
justify-content: center;
margin-bottom: 2rem;
color: var(--text-primary);
}
.action-button {
display: flex;
align-items: center;
gap: 0.5rem;
padding: 0.75rem 1.5rem;
background: var(--bg-secondary);
border: 1px solid var(--border-color);
border-radius: 100px;
color: var(--text-primary) !important;
text-decoration: none !important;
font-size: 0.95rem;
transition: all 0.2s ease;
}
.action-button:hover {
background: var(--bg-hover);
border-color: var(--border-hover);
color: var(--text-primary) !important;
}
.update-info {
color: var(--text-secondary);
font-size: 0.9rem;
margin-bottom: 3rem;
}
.features-grid {
display: grid;
grid-template-columns: repeat(3, 1fr);
gap: 1.5rem;
width: 100%;
max-width: 1200px;
}
.feature-card {
background: var(--card-bg);
border: 1px solid var(--border-color);
border-radius: 16px;
padding: 2rem;
text-align: left;
}
.feature-icon {
background: var(--accent-bg);
width: 40px;
height: 40px;
border-radius: 12px;
display: flex;
align-items: center;
justify-content: center;
margin-bottom: 1.5rem;
}
.feature-title {
color: var(--text-primary);
font-size: 1.25rem;
font-weight: 600;
margin-bottom: 1rem;
}
.feature-description {
color: var(--text-secondary);
font-size: 0.95rem;
margin-bottom: 1.5rem;
}
.feature-list {
list-style: none;
padding: 0;
margin: 0;
display: flex;
flex-direction: column;
gap: 0.75rem;
}
.feature-list li {
color: var(--text-tertiary);
font-size: 0.95rem;
display: flex;
align-items: center;
gap: 0.5rem;
}
.feature-list li::before {
content: '';
width: 6px;
height: 6px;
background: var(--accent-color);
border-radius: 50%;
flex-shrink: 0;
}
/* Force all links to match theme */
.header-wrapper a:link,
.header-wrapper a:visited,
.header-wrapper a:hover,
.header-wrapper a:active {
color: var(--text-primary) !important;
}
/* Title specific styles */
.main-title {
color: var(--text-primary);
font-size: 48px;
font-weight: 700;
margin: 40px 0;
text-align: center;
}
.subtitle {
color: var(--text-secondary);
margin-bottom: 2rem;
}
</style>
<div class="header-wrapper">
<h1 class="main-title">Agent Leaderboard</h1>
<h2 class="subtitle">Comprehensive multi-benchmark evaluation for tool calling</h2>
<div class="actions">
<a href="#" class="action-button">
<svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
<path d="M15 7h3a5 5 0 0 1 5 5 5 5 0 0 1-5 5h-3m-6 0H6a5 5 0 0 1-5-5 5 5 0 0 1 5-5h3"/>
<line x1="8" y1="12" x2="16" y2="12"/>
</svg>
Blog
</a>
<a href="#" class="action-button">
<svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
<path d="M9 19c-5 1.5-5-2.5-7-3m14 6v-3.87a3.37 3.37 0 0 0-.94-2.61c3.14-.35 6.44-1.54 6.44-7A5.44 5.44 0 0 0 20 4.77 5.07 5.07 0 0 0 19.91 1S18.73.65 16 2.48a13.38 13.38 0 0 0-7 0C6.27.65 5.09 1 5.09 1A5.07 5.07 0 0 0 5 4.77a5.44 5.44 0 0 0-1.5 3.78c0 5.42 3.3 6.61 6.44 7A3.37 3.37 0 0 0 9 18.13V22"/>
</svg>
GitHub
</a>
<a href="#" class="action-button">
<svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
<path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4"/>
<polyline points="7 10 12 15 17 10"/>
<line x1="12" y1="15" x2="12" y2="3"/>
</svg>
Dataset
</a>
</div>
"""
# HTML fragment with the three feature cards ("Make Better Decisions",
# "360° Domain Evaluation", "Updated Periodically"). It relies on the
# .features-grid / .feature-* classes and the --accent-color variable that
# HEADER_CONTENT's <style> block defines.
# NOTE(review): the final </div> has no matching opener in this fragment; it
# appears to close the header-wrapper <div> opened in HEADER_CONTENT — confirm
# the two strings are always rendered together.
CARDS = """
<div class="features-grid">
<div class="feature-card">
<div class="feature-icon">
<svg width="24" height="24" fill="none" stroke="var(--accent-color)" stroke-width="2" viewBox="0 0 24 24">
<path d="M22 12h-4l-3 9L9 3l-3 9H2"/>
</svg>
</div>
<h3 class="feature-title">Make Better Decisions</h3>
<ul class="feature-list">
<li>Cost-effectiveness analysis</li>
<li>Business impact metrics</li>
<li>Vendor strategy insights</li>
</ul>
</div>
<div class="feature-card">
<div class="feature-icon">
<svg width="24" height="24" fill="none" stroke="var(--accent-color)" stroke-width="2" viewBox="0 0 24 24">
<path d="M21 16V8a2 2 0 0 0-1-1.73l-7-4a2 2 0 0 0-2 0l-7 4A2 2 0 0 0 3 8v8a2 2 0 0 0 1 1.73l7 4a2 2 0 0 0 2 0l7-4A2 2 0 0 0 21 16z"/>
</svg>
</div>
<h3 class="feature-title">360° Domain Evaluation</h3>
<ul class="feature-list">
<li>Cross-domain evaluation</li>
<li>Real-world use cases</li>
<li>Edge case evaluation</li>
</ul>
</div>
<div class="feature-card">
<div class="feature-icon">
<svg width="24" height="24" fill="none" stroke="var(--accent-color)" stroke-width="2" viewBox="0 0 24 24">
<path d="M21 2v6h-6M3 12a9 9 0 0 1 15-6.7L21 8M3 12a9 9 0 0 0 15 6.7L21 16M21 22v-6h-6"/>
</svg>
</div>
<h3 class="feature-title">Updated Periodically</h3>
<ul class="feature-list">
<li>11 private models evaluated</li>
<li>5 open source models included</li>
<li>Monthly model additions</li>
</ul>
</div>
</div>
</div>
"""
# HTML fragment for the description panel: purpose blurb with a
# "Latest Update" badge, a "What We Evaluate" grid, a "Key Results" grid, and
# a closing hint about the filters. All styling is inline.
# The inline styles read --bg-secondary and --accent-color with literal
# fallbacks, but --text-primary and --text-secondary without one, so the text
# colours depend on those variables being defined elsewhere on the page
# (HEADER_CONTENT's <style> block defines them).
DESCRIPTION_HTML = """
<div style="
background: var(--bg-secondary, rgba(30, 30, 45, 0.95));
border-radius: 12px;
padding: 24px;
margin: 16px 0;
">
<div style="
display: flex;
flex-direction: column;
gap: 16px;
">
<div style="
color: var(--text-primary);
font-size: 1.1rem;
font-weight: 500;
display: flex;
align-items: center;
gap: 8px;
">
🎯 Purpose
<span style="
background: var(--accent-color, #4F46E5);
color: white;
padding: 4px 12px;
border-radius: 100px;
font-size: 0.9rem;
">Latest Update: Feb 2025</span>
</div>
<p style="
color: var(--text-secondary);
margin: 0;
line-height: 1.6;
">
Welcome to the AI Agent Tool Calling Leaderboard! This comprehensive benchmark evaluates
language models' ability to effectively utilize tools and functions in complex scenarios.
</p>
<div style="
color: var(--text-primary);
font-size: 1.1rem;
font-weight: 500;
margin-top: 8px;
">
🔍 What We Evaluate
</div>
<div style="
display: grid;
grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
gap: 16px;
color: var(--text-secondary);
">
<div style="display: flex; gap: 8px; align-items: center;">
🔄 Single/Multi-turn Interactions
</div>
<div style="display: flex; gap: 8px; align-items: center;">
🧩 Function Composition
</div>
<div style="display: flex; gap: 8px; align-items: center;">
⚡ Error Handling
</div>
</div>
<div style="
color: var(--text-primary);
font-size: 1.1rem;
font-weight: 500;
margin-top: 8px;
">
📊 Key Results
</div>
<div style="
display: grid;
grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
gap: 16px;
color: var(--text-secondary);
">
<div style="display: flex; gap: 8px; align-items: center;">
✅ Accuracy Performance
</div>
<div style="display: flex; gap: 8px; align-items: center;">
💰 Open Vs Closed Source
</div>
<div style="display: flex; gap: 8px; align-items: center;">
⚖️ Overall Effectiveness
</div>
</div>
<div style="
border-left: 4px solid var(--accent-color, #4F46E5);
padding-left: 12px;
margin-top: 8px;
color: var(--text-secondary);
font-style: italic;
">
💡 Use the filters below to explore different aspects of the evaluation and compare model performance across various dimensions.
</div>
</div>
</div>
"""