Spaces:
Running
Running
Pratik Bhavsar
committed on
Commit
·
523927e
1
Parent(s):
80c01c6
improved title
Browse files
- app.py +37 -17
- data_loader.py +85 -78
app.py
CHANGED
@@ -1,9 +1,15 @@
|
|
1 |
import gradio as gr
|
2 |
-
from data_loader import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
from utils import model_info_tab, filter_leaderboard
|
4 |
from visualization import setup_matplotlib
|
5 |
|
6 |
-
|
7 |
def create_app():
|
8 |
setup_matplotlib()
|
9 |
df = load_data()
|
@@ -11,37 +17,42 @@ def create_app():
|
|
11 |
with gr.Blocks(theme=gr.themes.Soft()) as app:
|
12 |
with gr.Tabs():
|
13 |
with gr.Tab("Leaderboard"):
|
|
|
14 |
with gr.Row():
|
|
|
15 |
with gr.Column(scale=1):
|
16 |
-
gr.
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
model_type = gr.Dropdown(
|
18 |
choices=["All"] + df["Model Type"].unique().tolist(),
|
19 |
value="All",
|
20 |
label="Model Type",
|
|
|
21 |
)
|
22 |
category = gr.Dropdown(
|
23 |
choices=list(CATEGORIES.keys()),
|
24 |
value=list(CATEGORIES.keys())[0],
|
25 |
label="Category",
|
|
|
26 |
)
|
27 |
sort_by = gr.Radio(
|
28 |
choices=["Performance", "Cost"],
|
29 |
value="Performance",
|
30 |
label="Sort by",
|
|
|
31 |
)
|
32 |
|
|
|
33 |
with gr.Column(scale=4):
|
34 |
-
# Add the new header content above everything
|
35 |
-
gr.HTML(HEADER_CONTENT)
|
36 |
output = gr.HTML()
|
37 |
plot1 = gr.Plot()
|
38 |
plot2 = gr.Plot()
|
39 |
-
# Add methodology section
|
40 |
-
gr.Markdown("# Methodology")
|
41 |
gr.Markdown(METHODOLOGY)
|
42 |
-
# Add insights section
|
43 |
-
gr.Markdown("# Key Insights")
|
44 |
-
gr.Markdown(INSIGHTS)
|
45 |
|
46 |
for input_comp in [model_type, category, sort_by]:
|
47 |
input_comp.change(
|
@@ -50,18 +61,29 @@ def create_app():
|
|
50 |
outputs=[output, plot1, plot2],
|
51 |
)
|
52 |
|
53 |
-
with gr.Tab("Model
|
54 |
-
gr.HTML(HEADER_CONTENT)
|
55 |
with gr.Row():
|
|
|
56 |
with gr.Column(scale=1):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
57 |
model_selector = gr.Dropdown(
|
58 |
choices=df["Model"].unique().tolist(),
|
59 |
value=df.sort_values("Model Avg", ascending=False).iloc[0][
|
60 |
"Model"
|
61 |
],
|
62 |
multiselect=True,
|
63 |
-
label="Models",
|
|
|
64 |
)
|
|
|
|
|
65 |
with gr.Column(scale=4):
|
66 |
model_info = gr.HTML()
|
67 |
radar_plot = gr.Plot()
|
@@ -88,7 +110,5 @@ def create_app():
|
|
88 |
|
89 |
return app
|
90 |
|
91 |
-
|
92 |
-
|
93 |
-
demo = create_app()
|
94 |
-
demo.launch()
|
|
|
1 |
import gradio as gr
|
2 |
+
from data_loader import (
|
3 |
+
load_data,
|
4 |
+
CATEGORIES,
|
5 |
+
INSIGHTS,
|
6 |
+
METHODOLOGY,
|
7 |
+
HEADER_CONTENT,
|
8 |
+
CARDS,
|
9 |
+
)
|
10 |
from utils import model_info_tab, filter_leaderboard
|
11 |
from visualization import setup_matplotlib
|
12 |
|
|
|
13 |
def create_app():
|
14 |
setup_matplotlib()
|
15 |
df = load_data()
|
|
|
17 |
with gr.Blocks(theme=gr.themes.Soft()) as app:
|
18 |
with gr.Tabs():
|
19 |
with gr.Tab("Leaderboard"):
|
20 |
+
gr.HTML(HEADER_CONTENT + CARDS)
|
21 |
with gr.Row():
|
22 |
+
# Left column for filters (20% width)
|
23 |
with gr.Column(scale=1):
|
24 |
+
gr.HTML(
|
25 |
+
"""
|
26 |
+
<div style="background: #1a1b1e; padding: 20px; border-radius: 12px; margin-bottom: 20px;">
|
27 |
+
<h3 style="margin-top: 0; color: white; font-size: 1.2em;">Filters</h3>
|
28 |
+
</div>
|
29 |
+
"""
|
30 |
+
)
|
31 |
model_type = gr.Dropdown(
|
32 |
choices=["All"] + df["Model Type"].unique().tolist(),
|
33 |
value="All",
|
34 |
label="Model Type",
|
35 |
+
container=True,
|
36 |
)
|
37 |
category = gr.Dropdown(
|
38 |
choices=list(CATEGORIES.keys()),
|
39 |
value=list(CATEGORIES.keys())[0],
|
40 |
label="Category",
|
41 |
+
container=True,
|
42 |
)
|
43 |
sort_by = gr.Radio(
|
44 |
choices=["Performance", "Cost"],
|
45 |
value="Performance",
|
46 |
label="Sort by",
|
47 |
+
container=True,
|
48 |
)
|
49 |
|
50 |
+
# Right column for content (80% width)
|
51 |
with gr.Column(scale=4):
|
|
|
|
|
52 |
output = gr.HTML()
|
53 |
plot1 = gr.Plot()
|
54 |
plot2 = gr.Plot()
|
|
|
|
|
55 |
gr.Markdown(METHODOLOGY)
|
|
|
|
|
|
|
56 |
|
57 |
for input_comp in [model_type, category, sort_by]:
|
58 |
input_comp.change(
|
|
|
61 |
outputs=[output, plot1, plot2],
|
62 |
)
|
63 |
|
64 |
+
with gr.Tab("Model Comparison"):
|
65 |
+
gr.HTML(HEADER_CONTENT + CARDS)
|
66 |
with gr.Row():
|
67 |
+
# Left column for filters (20% width)
|
68 |
with gr.Column(scale=1):
|
69 |
+
gr.HTML(
|
70 |
+
"""
|
71 |
+
<div style="background: #1a1b1e; padding: 20px; border-radius: 12px; margin-bottom: 20px;">
|
72 |
+
<h3 style="margin-top: 0; color: white; font-size: 1.2em;">Models</h3>
|
73 |
+
</div>
|
74 |
+
"""
|
75 |
+
)
|
76 |
model_selector = gr.Dropdown(
|
77 |
choices=df["Model"].unique().tolist(),
|
78 |
value=df.sort_values("Model Avg", ascending=False).iloc[0][
|
79 |
"Model"
|
80 |
],
|
81 |
multiselect=True,
|
82 |
+
label="Select Models",
|
83 |
+
container=True,
|
84 |
)
|
85 |
+
|
86 |
+
# Right column for content (80% width)
|
87 |
with gr.Column(scale=4):
|
88 |
model_info = gr.HTML()
|
89 |
radar_plot = gr.Plot()
|
|
|
110 |
|
111 |
return app
|
112 |
|
113 |
+
demo = create_app()
|
114 |
+
demo.launch()
|
|
|
|
data_loader.py
CHANGED
@@ -34,49 +34,6 @@ CATEGORIES = {
|
|
34 |
"Composite": ["BFCL_v3_multi_turn_composite"],
|
35 |
}
|
36 |
|
37 |
-
INSIGHTS = """
|
38 |
-
# Key Insights from Agent Leaderboard
|
39 |
-
|
40 |
-
| Category | Finding | Implications |
|
41 |
-
|----------|---------|--------------|
|
42 |
-
| Performance Leader | Gemini-2.0-flash dominates with excellent performance at a fraction of typical costs | Demonstrates that top-tier performance is achievable without premium pricing |
|
43 |
-
| Cost vs Performance | Top 3 models span a 200x price difference yet show only 6% performance gap | Challenges traditional pricing assumptions in the market and suggests potential overpricing at the high end |
|
44 |
-
| Open Source Models | Qwen-72b matches premium models in safety and context handling at lower cost | Signals growing maturity in open-source models and potential for broader adoption |
|
45 |
-
| Safety Features | While irrelevance detection is widely solved, tool miss detection remains a challenge | Highlights uneven development in safety features and areas needing focused improvement |
|
46 |
-
| Edge Case Handling | Models still struggle with maintaining context in complex scenarios | Indicates need for architectural improvements in handling sophisticated interactions |
|
47 |
-
| Architecture Impact | Models show clear trade-offs between context handling and parallel execution | Suggests need for specialized models or hybrid approaches for different use cases |
|
48 |
-
|
49 |
-
**Note:** Findings based on comprehensive evaluation across multiple tasks and scenarios.
|
50 |
-
"""
|
51 |
-
|
52 |
-
METHODOLOGY = """
|
53 |
-
# Methodology
|
54 |
-
|
55 |
-
## Overview
|
56 |
-
The Agent Leaderboard evaluates language models' ability to effectively use tools and maintain coherent multi-turn conversations.
|
57 |
-
The evaluation focuses on both basic functionality and edge cases that challenge real-world applicability.
|
58 |
-
|
59 |
-
## Tool Selection Quality Metric
|
60 |
-
Models are evaluated on their ability to:
|
61 |
-
- Correctly identify when tools are needed
|
62 |
-
- Select the appropriate tool for the task
|
63 |
-
- Handle cases where no suitable tool exists
|
64 |
-
- Maintain context across multiple interactions
|
65 |
-
|
66 |
-
## Dataset Structure
|
67 |
-
| Type | Samples | Category | Dataset Name | Purpose |
|
68 |
-
|------|---------|-----------|--------------|----------|
|
69 |
-
| Single-Turn | 100 + 100 | Single Function Call | xlam_single_tool_single_call | Evaluates basic ability to read documentation and make single function calls |
|
70 |
-
| | 200 + 50 | Multiple Function Call | xlam_multiple_tool_multiple_call, xlam_single_tool_multiple_call | Tests parallel execution and result aggregation capabilities |
|
71 |
-
| | 100 | Irrelevant Query | BFCL_v3_irrelevance | Tests ability to recognize when available tools don't match user needs |
|
72 |
-
| | 100 | Long Context | tau_long_context | Assesses handling of extended interactions and complex instructions |
|
73 |
-
| Multi-Turn | 50 + 30 | Single Function Call | BFCL_v3_multi_turn_base_single_func_call, toolscs_single_func_call | Tests basic conversational function calling abilities |
|
74 |
-
| | 50 | Multiple Function Call | BFCL_v3_multi_turn_base_multi_func_call | Evaluates handling of multiple function calls in conversation |
|
75 |
-
| | 100 | Missing Function | BFCL_v3_multi_turn_miss_func | Tests graceful handling of unavailable tools |
|
76 |
-
| | 100 | Missing Parameters | BFCL_v3_multi_turn_miss_param | Assesses parameter collection and handling incomplete information |
|
77 |
-
| | 100 | Composite | BFCL_v3_multi_turn_composite | Tests overall robustness in complex scenarios |
|
78 |
-
"""
|
79 |
-
|
80 |
HEADER_CONTENT = """
|
81 |
<style>
|
82 |
.header-wrapper {
|
@@ -89,12 +46,9 @@ HEADER_CONTENT = """
|
|
89 |
text-align: center;
|
90 |
}
|
91 |
|
92 |
-
.
|
93 |
-
color: #ffffff;
|
94 |
-
|
95 |
-
font-weight: 600;
|
96 |
-
margin-bottom: 1.5rem;
|
97 |
-
text-align: center;
|
98 |
}
|
99 |
|
100 |
.description {
|
@@ -111,6 +65,7 @@ HEADER_CONTENT = """
|
|
111 |
gap: 1rem;
|
112 |
justify-content: center;
|
113 |
margin-bottom: 2rem;
|
|
|
114 |
}
|
115 |
|
116 |
.action-button {
|
@@ -121,8 +76,8 @@ HEADER_CONTENT = """
|
|
121 |
background: rgba(30, 30, 45, 0.95);
|
122 |
border: 1px solid rgba(255, 255, 255, 0.1);
|
123 |
border-radius: 100px;
|
124 |
-
color: #
|
125 |
-
text-decoration: none;
|
126 |
font-size: 0.95rem;
|
127 |
transition: all 0.2s ease;
|
128 |
}
|
@@ -130,6 +85,7 @@ HEADER_CONTENT = """
|
|
130 |
.action-button:hover {
|
131 |
background: rgba(40, 40, 55, 0.95);
|
132 |
border-color: rgba(255, 255, 255, 0.2);
|
|
|
133 |
}
|
134 |
|
135 |
.update-info {
|
@@ -203,38 +159,46 @@ HEADER_CONTENT = """
|
|
203 |
border-radius: 50%;
|
204 |
flex-shrink: 0;
|
205 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
206 |
</style>
|
207 |
|
208 |
<div class="header-wrapper">
|
209 |
-
<h1 class="title">Agent Leaderboard</h1>
|
210 |
-
<
|
211 |
-
A comprehensive benchmark for evaluating AI agents in real-world business scenarios, comparing practical performance across multiple domains and use cases.
|
212 |
-
</p>
|
213 |
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
|
231 |
-
|
232 |
-
|
233 |
-
|
234 |
-
|
235 |
-
|
236 |
-
|
|
|
237 |
|
|
|
238 |
<div class="features-grid">
|
239 |
<div class="feature-card">
|
240 |
<div class="feature-icon">
|
@@ -283,3 +247,46 @@ HEADER_CONTENT = """
|
|
283 |
</div>
|
284 |
</div>
|
285 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
"Composite": ["BFCL_v3_multi_turn_composite"],
|
35 |
}
|
36 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
HEADER_CONTENT = """
|
38 |
<style>
|
39 |
.header-wrapper {
|
|
|
46 |
text-align: center;
|
47 |
}
|
48 |
|
49 |
+
.header-wrapper a {
|
50 |
+
color: #ffffff !important;
|
51 |
+
text-decoration: none !important;
|
|
|
|
|
|
|
52 |
}
|
53 |
|
54 |
.description {
|
|
|
65 |
gap: 1rem;
|
66 |
justify-content: center;
|
67 |
margin-bottom: 2rem;
|
68 |
+
color: #ffffff;
|
69 |
}
|
70 |
|
71 |
.action-button {
|
|
|
76 |
background: rgba(30, 30, 45, 0.95);
|
77 |
border: 1px solid rgba(255, 255, 255, 0.1);
|
78 |
border-radius: 100px;
|
79 |
+
color: #ffffff !important;
|
80 |
+
text-decoration: none !important;
|
81 |
font-size: 0.95rem;
|
82 |
transition: all 0.2s ease;
|
83 |
}
|
|
|
85 |
.action-button:hover {
|
86 |
background: rgba(40, 40, 55, 0.95);
|
87 |
border-color: rgba(255, 255, 255, 0.2);
|
88 |
+
color: #ffffff !important;
|
89 |
}
|
90 |
|
91 |
.update-info {
|
|
|
159 |
border-radius: 50%;
|
160 |
flex-shrink: 0;
|
161 |
}
|
162 |
+
|
163 |
+
/* Force all links to be white */
|
164 |
+
.header-wrapper a:link,
|
165 |
+
.header-wrapper a:visited,
|
166 |
+
.header-wrapper a:hover,
|
167 |
+
.header-wrapper a:active {
|
168 |
+
color: #ffffff !important;
|
169 |
+
}
|
170 |
</style>
|
171 |
|
172 |
<div class="header-wrapper">
|
173 |
+
<h1 class="title" style="font-size: 48px; font-weight: 700; margin: 40px 0; text-align: center;">Agent Leaderboard</h1>
|
174 |
+
<h2>Comprehensive multi-benchmark evaluation for tool calling</h2>
|
|
|
|
|
175 |
|
176 |
+
<div class="actions">
|
177 |
+
<a href="#" class="action-button">
|
178 |
+
<svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
|
179 |
+
<path d="M15 7h3a5 5 0 0 1 5 5 5 5 0 0 1-5 5h-3m-6 0H6a5 5 0 0 1-5-5 5 5 0 0 1 5-5h3"/>
|
180 |
+
<line x1="8" y1="12" x2="16" y2="12"/>
|
181 |
+
</svg>
|
182 |
+
Blog
|
183 |
+
</a>
|
184 |
+
<a href="#" class="action-button">
|
185 |
+
<svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
|
186 |
+
<path d="M9 19c-5 1.5-5-2.5-7-3m14 6v-3.87a3.37 3.37 0 0 0-.94-2.61c3.14-.35 6.44-1.54 6.44-7A5.44 5.44 0 0 0 20 4.77 5.07 5.07 0 0 0 19.91 1S18.73.65 16 2.48a13.38 13.38 0 0 0-7 0C6.27.65 5.09 1 5.09 1A5.07 5.07 0 0 0 5 4.77a5.44 5.44 0 0 0-1.5 3.78c0 5.42 3.3 6.61 6.44 7A3.37 3.37 0 0 0 9 18.13V22"/>
|
187 |
+
</svg>
|
188 |
+
GitHub
|
189 |
+
</a>
|
190 |
+
<a href="#" class="action-button">
|
191 |
+
<svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
|
192 |
+
<path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4"/>
|
193 |
+
<polyline points="7 10 12 15 17 10"/>
|
194 |
+
<line x1="12" y1="15" x2="12" y2="3"/>
|
195 |
+
</svg>
|
196 |
+
Dataset
|
197 |
+
</a>
|
198 |
+
</div>
|
199 |
+
"""
|
200 |
|
201 |
+
CARDS = """
|
202 |
<div class="features-grid">
|
203 |
<div class="feature-card">
|
204 |
<div class="feature-icon">
|
|
|
247 |
</div>
|
248 |
</div>
|
249 |
"""
|
250 |
+
|
251 |
+
|
252 |
+
METHODOLOGY = """# Methodology
|
253 |
+
## Overview
|
254 |
+
The Agent Leaderboard evaluates language models' ability to effectively use tools and maintain coherent multi-turn conversations.
|
255 |
+
The evaluation focuses on both basic functionality and edge cases that challenge real-world applicability.
|
256 |
+
|
257 |
+
## Tool Selection Quality Metric
|
258 |
+
Models are evaluated on their ability to:
|
259 |
+
- Correctly identify when tools are needed
|
260 |
+
- Select the appropriate tool for the task
|
261 |
+
- Handle cases where no suitable tool exists
|
262 |
+
- Maintain context across multiple interactions
|
263 |
+
|
264 |
+
## Dataset Structure
|
265 |
+
| Type | Samples | Category | Dataset Name | Purpose |
|
266 |
+
|------|---------|-----------|--------------|----------|
|
267 |
+
| Single-Turn | 100 + 100 | Single Function Call | xlam_single_tool_single_call | Evaluates basic ability to read documentation and make single function calls |
|
268 |
+
| | 200 + 50 | Multiple Function Call | xlam_multiple_tool_multiple_call, xlam_single_tool_multiple_call | Tests parallel execution and result aggregation capabilities |
|
269 |
+
| | 100 | Irrelevant Query | BFCL_v3_irrelevance | Tests ability to recognize when available tools don't match user needs |
|
270 |
+
| | 100 | Long Context | tau_long_context | Assesses handling of extended interactions and complex instructions |
|
271 |
+
| Multi-Turn | 50 + 30 | Single Function Call | BFCL_v3_multi_turn_base_single_func_call, toolscs_single_func_call | Tests basic conversational function calling abilities |
|
272 |
+
| | 50 | Multiple Function Call | BFCL_v3_multi_turn_base_multi_func_call | Evaluates handling of multiple function calls in conversation |
|
273 |
+
| | 100 | Missing Function | BFCL_v3_multi_turn_miss_func | Tests graceful handling of unavailable tools |
|
274 |
+
| | 100 | Missing Parameters | BFCL_v3_multi_turn_miss_param | Assesses parameter collection and handling incomplete information |
|
275 |
+
| | 100 | Composite | BFCL_v3_multi_turn_composite | Tests overall robustness in complex scenarios |
|
276 |
+
"""
|
277 |
+
|
278 |
+
|
279 |
+
INSIGHTS = """
|
280 |
+
# Key Insights from Agent Leaderboard
|
281 |
+
|
282 |
+
| Category | Finding | Implications |
|
283 |
+
|----------|---------|--------------|
|
284 |
+
| Performance Leader | Gemini-2.0-flash dominates with excellent performance at a fraction of typical costs | Demonstrates that top-tier performance is achievable without premium pricing |
|
285 |
+
| Cost vs Performance | Top 3 models span a 200x price difference yet show only 6% performance gap | Challenges traditional pricing assumptions in the market and suggests potential overpricing at the high end |
|
286 |
+
| Open Source Models | Qwen-72b matches premium models in safety and context handling at lower cost | Signals growing maturity in open-source models and potential for broader adoption |
|
287 |
+
| Safety Features | While irrelevance detection is widely solved, tool miss detection remains a challenge | Highlights uneven development in safety features and areas needing focused improvement |
|
288 |
+
| Edge Case Handling | Models still struggle with maintaining context in complex scenarios | Indicates need for architectural improvements in handling sophisticated interactions |
|
289 |
+
| Architecture Impact | Models show clear trade-offs between context handling and parallel execution | Suggests need for specialized models or hybrid approaches for different use cases |
|
290 |
+
|
291 |
+
**Note:** Findings based on comprehensive evaluation across multiple tasks and scenarios.
|
292 |
+
"""
|