import pandas as pd
from glob import glob
import numpy as np
from pathlib import Path

DATASETS = [Path(file).stem for file in glob("datasets/*.parquet")]
SCORES = [round(x, 2) for x in np.arange(0, 1.1, 0.1).tolist()]


def load_data():
    """Load and preprocess the data."""
    df = pd.read_csv("results.csv").dropna()
    # Add combined I/O cost column with 3:1 ratio
    df["IO Cost"] = (
        df["Input cost per million token"] * 0.75
        + df["Output cost per million token"] * 0.25
    )
    return df


# categories.py
CATEGORIES = {
    "Overall": ["Model Avg"],
    "Overall single turn": ["single turn perf"],
    "Overall multi turn": ["multi turn perf"],
    "Single func call": [
        "xlam_single_tool_single_call",
        "xlam_multiple_tool_single_call",
    ],
    "Multiple func call": [
        "xlam_multiple_tool_multiple_call",
        "xlam_single_tool_multiple_call",
        "BFCL_v3_multi_turn_base_multi_func_call",
    ],
    "Irrelevant query": ["BFCL_v3_irrelevance"],
    "Long context": ["tau_long_context", "BFCL_v3_multi_turn_long_context"],
    "Missing func": ["xlam_tool_miss", "BFCL_v3_multi_turn_miss_func"],
    "Missing params": ["BFCL_v3_multi_turn_miss_param"],
    "Composite": ["BFCL_v3_multi_turn_composite"],
}
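
# Illustrative sketch (an assumption, not part of the app): how the CATEGORIES
# mapping above could be combined with load_data() to aggregate per-dataset
# scores into the category columns shown on the leaderboard. It assumes
# results.csv carries one score column per dataset name; the helper name
# `category_average` is hypothetical.
def category_average(df, category):
    # Mean score across the datasets that belong to `category`, skipping any
    # dataset columns absent from the results file.
    cols = [col for col in CATEGORIES[category] if col in df.columns]
    return df[cols].mean(axis=1)

# Worked example of the 3:1 I/O weighting in load_data(): a model priced at
# $0.10 per million input tokens and $0.40 per million output tokens gets an
# IO Cost of 0.10 * 0.75 + 0.40 * 0.25 = $0.175.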
chat_css = """
/* Container styles */
.container {
    display: flex;
    gap: 1.5rem;
    height: calc(100vh - 100px);
    padding: 1rem;
}

/* Chat panel styles */
.chat-panel {
    flex: 2;
    background: #1a1f2c;
    border-radius: 1rem;
    padding: 1rem;
    overflow-y: auto;
    max-height: calc(100vh - 120px);
}

/* Message styles */
.message {
    padding: 1.2rem;
    margin: 0.8rem;
    border-radius: 1rem;
    font-family: monospace;
    box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
}

.system {
    background: linear-gradient(135deg, #8e44ad, #9b59b6);
}

.user {
    background: linear-gradient(135deg, #2c3e50, #3498db);
    margin-left: 2rem;
}

.assistant {
    background: linear-gradient(135deg, #27ae60, #2ecc71);
    margin-right: 2rem;
}

.role-badge {
    display: inline-block;
    padding: 0.3rem 0.8rem;
    border-radius: 0.5rem;
    font-weight: bold;
    margin-bottom: 0.8rem;
    font-size: 0.9rem;
    text-transform: uppercase;
    letter-spacing: 0.05em;
}

.system-role {
    background-color: #8e44ad;
    color: white;
}

.user-role {
    background-color: #3498db;
    color: white;
}

.assistant-role {
    background-color: #27ae60;
    color: white;
}

.content {
    white-space: pre-wrap;
    word-break: break-word;
    color: #f5f6fa;
    line-height: 1.5;
}

/* Metrics panel styles */
.metrics-panel {
    flex: 1;
    display: flex;
    flex-direction: column;
    gap: 2rem;
    padding: 1.5rem;
    background: #1a1f2c;
    border-radius: 1rem;
}

.metric-section {
    background: #1E293B;
    padding: 1.5rem;
    border-radius: 1rem;
}

.score-section {
    text-align: center;
}

.score-display {
    font-size: 3rem;
    font-weight: bold;
    color: #4ADE80;
    line-height: 1;
    margin: 0.5rem 0;
}

.explanation-text {
    color: #E2E8F0;
    line-height: 1.6;
    font-size: 0.95rem;
}

/* Tool info panel styles */
.tool-info-panel {
    background: #1a1f2c;
    padding: 1.5rem;
    border-radius: 1rem;
    color: #f5f6fa;
}

.tool-section {
    margin-bottom: 1.5rem;
}

.tool-name {
    font-size: 1.2rem;
    color: #4ADE80;
    font-weight: bold;
    margin-bottom: 0.5rem;
}

.tool-description {
    color: #E2E8F0;
    line-height: 1.6;
    margin-bottom: 1rem;
}

.tool-parameters .parameter {
    margin: 0.5rem 0;
    padding: 0.5rem;
    background: rgba(255, 255, 255, 0.05);
    border-radius: 0.5rem;
}

.param-name {
    color: #63B3ED;
    font-weight: bold;
    margin-right: 0.5rem;
}

.tool-examples .example {
    margin: 0.5rem 0;
    padding: 0.5rem;
    background: rgba(255, 255, 255, 0.05);
    border-radius: 0.5rem;
    font-family: monospace;
}

/* Custom scrollbar */
::-webkit-scrollbar {
    width: 8px;
}

::-webkit-scrollbar-track {
    background: rgba(255, 255, 255, 0.1);
    border-radius: 4px;
}

::-webkit-scrollbar-thumb {
    background: linear-gradient(45deg, #3498db, #2ecc71);
    border-radius: 4px;
}

/* Title styles */
.title {
    color: #63B3ED;
    font-size: 2rem;
    font-weight: bold;
    text-align: center;
    margin-bottom: 1.5rem;
    padding: 1rem;
}

/* Headers */
h3 {
    color: #63B3ED;
    margin: 0 0 1rem 0;
    font-size: 1.1rem;
    font-weight: 500;
    letter-spacing: 0.05em;
}
"""

COMMON = """ """

DESCRIPTION_HTML = """
This benchmark evaluates language models' ability to use tools and functions effectively in complex scenarios.
We measure tool use across single-turn and multi-turn conversations, covering both basic functionality and the edge cases that challenge real-world applicability.

| Category | Finding |
|---|---|
| Performance Champion | Gemini-2.0-flash dominates with a 0.935 score at just $0.075 per million tokens, excelling in both complex tasks (0.95) and safety features (0.98) |
| Price-Performance Paradox | The top 3 models span a 20x price difference yet only a 3% performance gap, challenging pricing assumptions |
| Open vs Closed Source | The new Mistral-small leads open-source models and performs on par with GPT-4o-mini at 0.83, signaling OSS maturity in tool calling |
| Reasoning Models | Despite their reasoning strengths, o1 and o3-mini are far from perfect, scoring 0.87 and 0.84 respectively. DeepSeek V3 and R1 were excluded from rankings due to limited function support |
| Tool Miss Detection | Dataset averages of 0.59 and 0.78 reveal fundamental challenges in handling edge cases and maintaining context, even as models excel at basic tasks |
| Architecture Trade-offs | Long context vs parallel execution exposes architectural limits: o1 leads in long context (0.98) but fails parallel tasks (0.43), while GPT-4o shows the opposite pattern |

| Area | Recommendation |
|---|---|
| Task Complexity | Simple tasks work with most models. Complex workflows requiring multiple tools need models scoring 0.85+ in composite tests |
| Error Handling | Models with low tool selection scores need guardrails. Add validation layers and structured error recovery, especially for parameter collection (see the sketch after this table) |
| Context Management | Long conversations require either models strong in context retention or external context storage systems |
| Reasoning Models | o1 and o3-mini handle function calling well; DeepSeek V3 and R1 were excluded from rankings due to limited function support |
| Safety Controls | Add strict tool access controls for models weak in irrelevance detection. Include validation layers for inconsistent performers |
| Open vs Closed Source | Closed-source models lead in complex tasks, but open-source options work well for basic operations. Choose based on your scaling needs |
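
The validation-layer recommendation above can be as simple as checking a proposed tool call against the tool's schema before executing it. Below is a minimal sketch; the `TOOLS` registry and `validate_call` helper are hypothetical illustrations, not part of the benchmark.

```python
# Hypothetical registry mapping tool names to their parameter schemas.
TOOLS = {
    "get_weather": {"required": ["city"], "optional": ["units"]},
}

def validate_call(name, args):
    # Return a list of problems with a proposed tool call (empty list = valid).
    if name not in TOOLS:
        return [f"unknown tool: {name}"]
    schema = TOOLS[name]
    allowed = set(schema["required"]) | set(schema["optional"])
    problems = [f"missing required parameter: {p}"
                for p in schema["required"] if p not in args]
    problems += [f"unexpected parameter: {p}" for p in args if p not in allowed]
    return problems

# Example: the model proposed get_weather without the required "city" argument.
print(validate_call("get_weather", {"units": "metric"}))
# -> ['missing required parameter: city']
```

Returning a list of problems rather than raising lets an agent loop feed the errors back to the model, enabling the structured recovery the recommendation calls for.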

| Type | Samples | Category | Dataset Name | Purpose |
|---|---|---|---|---|
| Single-Turn | 100 + 100 | Single Function Call | xlam_single_tool_single_call, xlam_multiple_tool_single_call | Evaluates basic ability to read documentation and make single function calls |
| | 200 + 50 | Multiple Function Call | xlam_multiple_tool_multiple_call, xlam_single_tool_multiple_call | Tests parallel execution and result aggregation capabilities |
| | 100 | Irrelevant Query | BFCL_v3_irrelevance | Tests ability to recognize when available tools don't match user needs |
| | 100 | Long Context | tau_long_context | Assesses handling of extended interactions and complex instructions |
| Multi-Turn | 50 + 30 | Single Function Call | BFCL_v3_multi_turn_base_single_func_call, toolscs_single_func_call | Tests basic conversational function calling abilities |
| | 50 | Multiple Function Call | BFCL_v3_multi_turn_base_multi_func_call | Evaluates handling of multiple function calls in conversation |
| | 100 | Missing Function | BFCL_v3_multi_turn_miss_func | Tests graceful handling of unavailable tools |
| | 100 | Missing Parameters | BFCL_v3_multi_turn_miss_param | Assesses parameter collection and handling of incomplete information |
| | 100 | Composite | BFCL_v3_multi_turn_composite | Tests overall robustness in complex scenarios |