import pandas as pd
from glob import glob
import numpy as np
from pathlib import Path

DATASETS = [Path(file).stem for file in glob("datasets/*.parquet")]
SCORES = [round(x, 2) for x in np.arange(0, 1.1, 0.1).tolist()]


def load_data():
    """Load and preprocess the data."""
    df = pd.read_csv("results.csv").dropna()
    # Add combined I/O cost column with 3:1 ratio
    df["IO Cost"] = (
        df["Input cost per million token"] * 0.75
        + df["Output cost per million token"] * 0.25
    )
    return df


# categories.py
CATEGORIES = {
    "Overall": ["Model Avg"],
    "Overall single turn": ["single turn perf"],
    "Overall multi turn": ["multi turn perf"],
    "Single func call": [
        "xlam_single_tool_single_call",
        "xlam_multiple_tool_single_call",
    ],
    "Multiple func call": [
        "xlam_multiple_tool_multiple_call",
        "xlam_single_tool_multiple_call",
        "BFCL_v3_multi_turn_base_multi_func_call",
    ],
    "Irrelevant query": ["BFCL_v3_irrelevance"],
    "Long context": ["tau_long_context", "BFCL_v3_multi_turn_long_context"],
    "Missing func": ["xlam_tool_miss", "BFCL_v3_multi_turn_miss_func"],
    "Missing params": ["BFCL_v3_multi_turn_miss_param"],
    "Composite": ["BFCL_v3_multi_turn_composite"],
}
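
# Illustrative sketch (an assumption, not part of the app): how the CATEGORIES
# mapping above could be combined with load_data() to aggregate per-dataset
# scores into the category columns shown on the leaderboard. It assumes
# results.csv carries one score column per dataset name; the helper name
# `category_average` is hypothetical.
def category_average(df, category):
    # Mean score across the datasets that belong to `category`, skipping any
    # dataset columns absent from the results file.
    cols = [col for col in CATEGORIES[category] if col in df.columns]
    return df[cols].mean(axis=1)

# Worked example of the 3:1 I/O weighting in load_data(): a model priced at
# $0.10 per million input tokens and $0.40 per million output tokens gets an
# IO Cost of 0.10 * 0.75 + 0.40 * 0.25 = $0.175.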
chat_css = """
/* Container styles */
.container {
    display: flex;
    gap: 1.5rem;
    height: calc(100vh - 100px);
    padding: 1rem;
}

/* Chat panel styles */
.chat-panel {
    flex: 2;
    background: #1a1f2c;
    border-radius: 1rem;
    padding: 1rem;
    overflow-y: auto;
    max-height: calc(100vh - 120px);
}

/* Message styles */
.message {
    padding: 1.2rem;
    margin: 0.8rem;
    border-radius: 1rem;
    font-family: monospace;
    box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
}

.system {
    background: linear-gradient(135deg, #8e44ad, #9b59b6);
}

.user {
    background: linear-gradient(135deg, #2c3e50, #3498db);
    margin-left: 2rem;
}

.assistant {
    background: linear-gradient(135deg, #27ae60, #2ecc71);
    margin-right: 2rem;
}

.role-badge {
    display: inline-block;
    padding: 0.3rem 0.8rem;
    border-radius: 0.5rem;
    font-weight: bold;
    margin-bottom: 0.8rem;
    font-size: 0.9rem;
    text-transform: uppercase;
    letter-spacing: 0.05em;
}

.system-role {
    background-color: #8e44ad;
    color: white;
}

.user-role {
    background-color: #3498db;
    color: white;
}

.assistant-role {
    background-color: #27ae60;
    color: white;
}

.content {
    white-space: pre-wrap;
    word-break: break-word;
    color: #f5f6fa;
    line-height: 1.5;
}

/* Metrics panel styles */
.metrics-panel {
    flex: 1;
    display: flex;
    flex-direction: column;
    gap: 2rem;
    padding: 1.5rem;
    background: #1a1f2c;
    border-radius: 1rem;
}

.metric-section {
    background: #1E293B;
    padding: 1.5rem;
    border-radius: 1rem;
}

.score-section {
    text-align: center;
}

.score-display {
    font-size: 3rem;
    font-weight: bold;
    color: #4ADE80;
    line-height: 1;
    margin: 0.5rem 0;
}

.explanation-text {
    color: #E2E8F0;
    line-height: 1.6;
    font-size: 0.95rem;
}

/* Tool info panel styles */
.tool-info-panel {
    background: #1a1f2c;
    padding: 1.5rem;
    border-radius: 1rem;
    color: #f5f6fa;
}

.tool-section {
    margin-bottom: 1.5rem;
}

.tool-name {
    font-size: 1.2rem;
    color: #4ADE80;
    font-weight: bold;
    margin-bottom: 0.5rem;
}

.tool-description {
    color: #E2E8F0;
    line-height: 1.6;
    margin-bottom: 1rem;
}

.tool-parameters .parameter {
    margin: 0.5rem 0;
    padding: 0.5rem;
    background: rgba(255, 255, 255, 0.05);
    border-radius: 0.5rem;
}

.param-name {
    color: #63B3ED;
    font-weight: bold;
    margin-right: 0.5rem;
}

.tool-examples .example {
    margin: 0.5rem 0;
    padding: 0.5rem;
    background: rgba(255, 255, 255, 0.05);
    border-radius: 0.5rem;
    font-family: monospace;
}

/* Custom scrollbar */
::-webkit-scrollbar {
    width: 8px;
}

::-webkit-scrollbar-track {
    background: rgba(255, 255, 255, 0.1);
    border-radius: 4px;
}

::-webkit-scrollbar-thumb {
    background: linear-gradient(45deg, #3498db, #2ecc71);
    border-radius: 4px;
}

/* Title styles */
.title {
    color: #63B3ED;
    font-size: 2rem;
    font-weight: bold;
    text-align: center;
    margin-bottom: 1.5rem;
    padding: 1rem;
}

/* Headers */
h3 {
    color: #63B3ED;
    margin: 0 0 1rem 0;
    font-size: 1.1rem;
    font-weight: 500;
    letter-spacing: 0.05em;
}
"""

COMMON = """ """

DESCRIPTION_HTML = """
This benchmark evaluates language models' ability to use tools and functions effectively in complex scenarios.
We measure tool use across single-turn and multi-turn conversations, covering both basic functionality and the edge cases that challenge real-world applicability.

| Category | Finding |
|---|---|
| Performance Champion | Gemini-2.0-flash dominates with a 0.935 score at just $0.075 per million tokens, excelling in both complex tasks (0.95) and safety features (0.98) |
| Price-Performance Paradox | The top 3 models span a 20x price difference yet only a 3% performance gap, challenging pricing assumptions |
| Open vs Closed Source | The new Mistral-small leads open-source models and performs on par with GPT-4o-mini at 0.83, signaling OSS maturity in tool calling |
| Reasoning Models | Despite their reasoning strengths, o1 and o3-mini are far from perfect, scoring 0.87 and 0.84 respectively. DeepSeek V3 and R1 were excluded from rankings due to limited function support |
| Tool Miss Detection | Dataset averages of 0.59 and 0.78 reveal fundamental challenges in handling edge cases and maintaining context, even as models excel at basic tasks |
| Architecture Trade-offs | Long context vs parallel execution exposes architectural limits: o1 leads in long context (0.98) but fails parallel tasks (0.43), while GPT-4o shows the opposite pattern |

| Area | Recommendation |
|---|---|
| Task Complexity | Simple tasks work with most models. Complex workflows requiring multiple tools need models scoring 0.85+ in composite tests |
| Error Handling | Models with low tool selection scores need guardrails. Add validation layers and structured error recovery, especially for parameter collection (see the sketch after this table) |
| Context Management | Long conversations require either models strong in context retention or external context storage systems |
| Reasoning Models | o1 and o3-mini handle function calling well; DeepSeek V3 and R1 were excluded from rankings due to limited function support |
| Safety Controls | Add strict tool access controls for models weak in irrelevance detection. Include validation layers for inconsistent performers |
| Open vs Closed Source | Closed-source models lead in complex tasks, but open-source options work well for basic operations. Choose based on your scaling needs |
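
The validation-layer recommendation above can be as simple as checking a proposed tool call against the tool's schema before executing it. Below is a minimal sketch; the `TOOLS` registry and `validate_call` helper are hypothetical illustrations, not part of the benchmark.

```python
# Hypothetical registry mapping tool names to their parameter schemas.
TOOLS = {
    "get_weather": {"required": ["city"], "optional": ["units"]},
}

def validate_call(name, args):
    # Return a list of problems with a proposed tool call (empty list = valid).
    if name not in TOOLS:
        return [f"unknown tool: {name}"]
    schema = TOOLS[name]
    allowed = set(schema["required"]) | set(schema["optional"])
    problems = [f"missing required parameter: {p}"
                for p in schema["required"] if p not in args]
    problems += [f"unexpected parameter: {p}" for p in args if p not in allowed]
    return problems

# Example: the model proposed get_weather without the required "city" argument.
print(validate_call("get_weather", {"units": "metric"}))
# -> ['missing required parameter: city']
```

Returning a list of problems rather than raising lets an agent loop feed the errors back to the model, enabling the structured recovery the recommendation calls for.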

| Type | Samples | Category | Dataset Name | Purpose |
|---|---|---|---|---|
| Single-Turn | 100 + 100 | Single Function Call | xlam_single_tool_single_call, xlam_multiple_tool_single_call | Evaluates basic ability to read documentation and make single function calls |
| | 200 + 50 | Multiple Function Call | xlam_multiple_tool_multiple_call, xlam_single_tool_multiple_call | Tests parallel execution and result aggregation capabilities |
| | 100 | Irrelevant Query | BFCL_v3_irrelevance | Tests ability to recognize when available tools don't match user needs |
| | 100 | Long Context | tau_long_context | Assesses handling of extended interactions and complex instructions |
| Multi-Turn | 50 + 30 | Single Function Call | BFCL_v3_multi_turn_base_single_func_call, toolscs_single_func_call | Tests basic conversational function calling abilities |
| | 50 | Multiple Function Call | BFCL_v3_multi_turn_base_multi_func_call | Evaluates handling of multiple function calls in conversation |
| | 100 | Missing Function | BFCL_v3_multi_turn_miss_func | Tests graceful handling of unavailable tools |
| | 100 | Missing Parameters | BFCL_v3_multi_turn_miss_param | Assesses parameter collection and handling of incomplete information |
| | 100 | Composite | BFCL_v3_multi_turn_composite | Tests overall robustness in complex scenarios |