Pratik Bhavsar committed on
Commit
523927e
·
1 Parent(s): 80c01c6

improved title

Browse files
Files changed (2) hide show
  1. app.py +37 -17
  2. data_loader.py +85 -78
app.py CHANGED
@@ -1,9 +1,15 @@
1
  import gradio as gr
2
- from data_loader import load_data, CATEGORIES, INSIGHTS, METHODOLOGY, HEADER_CONTENT
 
 
 
 
 
 
 
3
  from utils import model_info_tab, filter_leaderboard
4
  from visualization import setup_matplotlib
5
 
6
-
7
  def create_app():
8
  setup_matplotlib()
9
  df = load_data()
@@ -11,37 +17,42 @@ def create_app():
11
  with gr.Blocks(theme=gr.themes.Soft()) as app:
12
  with gr.Tabs():
13
  with gr.Tab("Leaderboard"):
 
14
  with gr.Row():
 
15
  with gr.Column(scale=1):
16
- gr.Markdown("# Filters")
 
 
 
 
 
 
17
  model_type = gr.Dropdown(
18
  choices=["All"] + df["Model Type"].unique().tolist(),
19
  value="All",
20
  label="Model Type",
 
21
  )
22
  category = gr.Dropdown(
23
  choices=list(CATEGORIES.keys()),
24
  value=list(CATEGORIES.keys())[0],
25
  label="Category",
 
26
  )
27
  sort_by = gr.Radio(
28
  choices=["Performance", "Cost"],
29
  value="Performance",
30
  label="Sort by",
 
31
  )
32
 
 
33
  with gr.Column(scale=4):
34
- # Add the new header content above everything
35
- gr.HTML(HEADER_CONTENT)
36
  output = gr.HTML()
37
  plot1 = gr.Plot()
38
  plot2 = gr.Plot()
39
- # Add methodology section
40
- gr.Markdown("# Methodology")
41
  gr.Markdown(METHODOLOGY)
42
- # Add insights section
43
- gr.Markdown("# Key Insights")
44
- gr.Markdown(INSIGHTS)
45
 
46
  for input_comp in [model_type, category, sort_by]:
47
  input_comp.change(
@@ -50,18 +61,29 @@ def create_app():
50
  outputs=[output, plot1, plot2],
51
  )
52
 
53
- with gr.Tab("Model Performance"):
54
- gr.HTML(HEADER_CONTENT)
55
  with gr.Row():
 
56
  with gr.Column(scale=1):
 
 
 
 
 
 
 
57
  model_selector = gr.Dropdown(
58
  choices=df["Model"].unique().tolist(),
59
  value=df.sort_values("Model Avg", ascending=False).iloc[0][
60
  "Model"
61
  ],
62
  multiselect=True,
63
- label="Models",
 
64
  )
 
 
65
  with gr.Column(scale=4):
66
  model_info = gr.HTML()
67
  radar_plot = gr.Plot()
@@ -88,7 +110,5 @@ def create_app():
88
 
89
  return app
90
 
91
-
92
- if __name__ == "__main__":
93
- demo = create_app()
94
- demo.launch()
 
1
  import gradio as gr
2
+ from data_loader import (
3
+ load_data,
4
+ CATEGORIES,
5
+ INSIGHTS,
6
+ METHODOLOGY,
7
+ HEADER_CONTENT,
8
+ CARDS,
9
+ )
10
  from utils import model_info_tab, filter_leaderboard
11
  from visualization import setup_matplotlib
12
 
 
13
  def create_app():
14
  setup_matplotlib()
15
  df = load_data()
 
17
  with gr.Blocks(theme=gr.themes.Soft()) as app:
18
  with gr.Tabs():
19
  with gr.Tab("Leaderboard"):
20
+ gr.HTML(HEADER_CONTENT + CARDS)
21
  with gr.Row():
22
+ # Left column for filters (20% width)
23
  with gr.Column(scale=1):
24
+ gr.HTML(
25
+ """
26
+ <div style="background: #1a1b1e; padding: 20px; border-radius: 12px; margin-bottom: 20px;">
27
+ <h3 style="margin-top: 0; color: white; font-size: 1.2em;">Filters</h3>
28
+ </div>
29
+ """
30
+ )
31
  model_type = gr.Dropdown(
32
  choices=["All"] + df["Model Type"].unique().tolist(),
33
  value="All",
34
  label="Model Type",
35
+ container=True,
36
  )
37
  category = gr.Dropdown(
38
  choices=list(CATEGORIES.keys()),
39
  value=list(CATEGORIES.keys())[0],
40
  label="Category",
41
+ container=True,
42
  )
43
  sort_by = gr.Radio(
44
  choices=["Performance", "Cost"],
45
  value="Performance",
46
  label="Sort by",
47
+ container=True,
48
  )
49
 
50
+ # Right column for content (80% width)
51
  with gr.Column(scale=4):
 
 
52
  output = gr.HTML()
53
  plot1 = gr.Plot()
54
  plot2 = gr.Plot()
 
 
55
  gr.Markdown(METHODOLOGY)
 
 
 
56
 
57
  for input_comp in [model_type, category, sort_by]:
58
  input_comp.change(
 
61
  outputs=[output, plot1, plot2],
62
  )
63
 
64
+ with gr.Tab("Model Comparison"):
65
+ gr.HTML(HEADER_CONTENT + CARDS)
66
  with gr.Row():
67
+ # Left column for filters (20% width)
68
  with gr.Column(scale=1):
69
+ gr.HTML(
70
+ """
71
+ <div style="background: #1a1b1e; padding: 20px; border-radius: 12px; margin-bottom: 20px;">
72
+ <h3 style="margin-top: 0; color: white; font-size: 1.2em;">Models</h3>
73
+ </div>
74
+ """
75
+ )
76
  model_selector = gr.Dropdown(
77
  choices=df["Model"].unique().tolist(),
78
  value=df.sort_values("Model Avg", ascending=False).iloc[0][
79
  "Model"
80
  ],
81
  multiselect=True,
82
+ label="Select Models",
83
+ container=True,
84
  )
85
+
86
+ # Right column for content (80% width)
87
  with gr.Column(scale=4):
88
  model_info = gr.HTML()
89
  radar_plot = gr.Plot()
 
110
 
111
  return app
112
 
113
+ demo = create_app()
114
+ demo.launch()
 
 
data_loader.py CHANGED
@@ -34,49 +34,6 @@ CATEGORIES = {
34
  "Composite": ["BFCL_v3_multi_turn_composite"],
35
  }
36
 
37
- INSIGHTS = """
38
- # Key Insights from Agent Leaderboard
39
-
40
- | Category | Finding | Implications |
41
- |----------|---------|--------------|
42
- | Performance Leader | Gemini-2.0-flash dominates with excellent performance at a fraction of typical costs | Demonstrates that top-tier performance is achievable without premium pricing |
43
- | Cost vs Performance | Top 3 models span a 200x price difference yet show only 6% performance gap | Challenges traditional pricing assumptions in the market and suggests potential overpricing at the high end |
44
- | Open Source Models | Qwen-72b matches premium models in safety and context handling at lower cost | Signals growing maturity in open-source models and potential for broader adoption |
45
- | Safety Features | While irrelevance detection is widely solved, tool miss detection remains a challenge | Highlights uneven development in safety features and areas needing focused improvement |
46
- | Edge Case Handling | Models still struggle with maintaining context in complex scenarios | Indicates need for architectural improvements in handling sophisticated interactions |
47
- | Architecture Impact | Models show clear trade-offs between context handling and parallel execution | Suggests need for specialized models or hybrid approaches for different use cases |
48
-
49
- **Note:** Findings based on comprehensive evaluation across multiple tasks and scenarios.
50
- """
51
-
52
- METHODOLOGY = """
53
- # Methodology
54
-
55
- ## Overview
56
- The Agent Leaderboard evaluates language models' ability to effectively use tools and maintain coherent multi-turn conversations.
57
- The evaluation focuses on both basic functionality and edge cases that challenge real-world applicability.
58
-
59
- ## Tool Selection Quality Metric
60
- Models are evaluated on their ability to:
61
- - Correctly identify when tools are needed
62
- - Select the appropriate tool for the task
63
- - Handle cases where no suitable tool exists
64
- - Maintain context across multiple interactions
65
-
66
- ## Dataset Structure
67
- | Type | Samples | Category | Dataset Name | Purpose |
68
- |------|---------|-----------|--------------|----------|
69
- | Single-Turn | 100 + 100 | Single Function Call | xlam_single_tool_single_call | Evaluates basic ability to read documentation and make single function calls |
70
- | | 200 + 50 | Multiple Function Call | xlam_multiple_tool_multiple_call, xlam_single_tool_multiple_call | Tests parallel execution and result aggregation capabilities |
71
- | | 100 | Irrelevant Query | BFCL_v3_irrelevance | Tests ability to recognize when available tools don't match user needs |
72
- | | 100 | Long Context | tau_long_context | Assesses handling of extended interactions and complex instructions |
73
- | Multi-Turn | 50 + 30 | Single Function Call | BFCL_v3_multi_turn_base_single_func_call, toolscs_single_func_call | Tests basic conversational function calling abilities |
74
- | | 50 | Multiple Function Call | BFCL_v3_multi_turn_base_multi_func_call | Evaluates handling of multiple function calls in conversation |
75
- | | 100 | Missing Function | BFCL_v3_multi_turn_miss_func | Tests graceful handling of unavailable tools |
76
- | | 100 | Missing Parameters | BFCL_v3_multi_turn_miss_param | Assesses parameter collection and handling incomplete information |
77
- | | 100 | Composite | BFCL_v3_multi_turn_composite | Tests overall robustness in complex scenarios |
78
- """
79
-
80
  HEADER_CONTENT = """
81
  <style>
82
  .header-wrapper {
@@ -89,12 +46,9 @@ HEADER_CONTENT = """
89
  text-align: center;
90
  }
91
 
92
- .title {
93
- color: #ffffff;
94
- font-size: 2.5rem;
95
- font-weight: 600;
96
- margin-bottom: 1.5rem;
97
- text-align: center;
98
  }
99
 
100
  .description {
@@ -111,6 +65,7 @@ HEADER_CONTENT = """
111
  gap: 1rem;
112
  justify-content: center;
113
  margin-bottom: 2rem;
 
114
  }
115
 
116
  .action-button {
@@ -121,8 +76,8 @@ HEADER_CONTENT = """
121
  background: rgba(30, 30, 45, 0.95);
122
  border: 1px solid rgba(255, 255, 255, 0.1);
123
  border-radius: 100px;
124
- color: #fff;
125
- text-decoration: none;
126
  font-size: 0.95rem;
127
  transition: all 0.2s ease;
128
  }
@@ -130,6 +85,7 @@ HEADER_CONTENT = """
130
  .action-button:hover {
131
  background: rgba(40, 40, 55, 0.95);
132
  border-color: rgba(255, 255, 255, 0.2);
 
133
  }
134
 
135
  .update-info {
@@ -203,38 +159,46 @@ HEADER_CONTENT = """
203
  border-radius: 50%;
204
  flex-shrink: 0;
205
  }
 
 
 
 
 
 
 
 
206
  </style>
207
 
208
  <div class="header-wrapper">
209
- <h1 class="title">Agent Leaderboard</h1>
210
- <p class="description">
211
- A comprehensive benchmark for evaluating AI agents in real-world business scenarios, comparing practical performance across multiple domains and use cases.
212
- </p>
213
 
214
- <div class="actions">
215
- <a href="#" class="action-button">
216
- <svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
217
- <path d="M15 7h3a5 5 0 0 1 5 5 5 5 0 0 1-5 5h-3m-6 0H6a5 5 0 0 1-5-5 5 5 0 0 1 5-5h3"/>
218
- <line x1="8" y1="12" x2="16" y2="12"/>
219
- </svg>
220
- Blog
221
- </a>
222
- <a href="#" class="action-button">
223
- <svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
224
- <path d="M9 19c-5 1.5-5-2.5-7-3m14 6v-3.87a3.37 3.37 0 0 0-.94-2.61c3.14-.35 6.44-1.54 6.44-7A5.44 5.44 0 0 0 20 4.77 5.07 5.07 0 0 0 19.91 1S18.73.65 16 2.48a13.38 13.38 0 0 0-7 0C6.27.65 5.09 1 5.09 1A5.07 5.07 0 0 0 5 4.77a5.44 5.44 0 0 0-1.5 3.78c0 5.42 3.3 6.61 6.44 7A3.37 3.37 0 0 0 9 18.13V22"/>
225
- </svg>
226
- GitHub
227
- </a>
228
- <a href="#" class="action-button">
229
- <svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
230
- <path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4"/>
231
- <polyline points="7 10 12 15 17 10"/>
232
- <line x1="12" y1="15" x2="12" y2="3"/>
233
- </svg>
234
- Dataset
235
- </a>
236
- </div>
 
237
 
 
238
  <div class="features-grid">
239
  <div class="feature-card">
240
  <div class="feature-icon">
@@ -283,3 +247,46 @@ HEADER_CONTENT = """
283
  </div>
284
  </div>
285
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  "Composite": ["BFCL_v3_multi_turn_composite"],
35
  }
36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  HEADER_CONTENT = """
38
  <style>
39
  .header-wrapper {
 
46
  text-align: center;
47
  }
48
 
49
+ .header-wrapper a {
50
+ color: #ffffff !important;
51
+ text-decoration: none !important;
 
 
 
52
  }
53
 
54
  .description {
 
65
  gap: 1rem;
66
  justify-content: center;
67
  margin-bottom: 2rem;
68
+ color: #ffffff;
69
  }
70
 
71
  .action-button {
 
76
  background: rgba(30, 30, 45, 0.95);
77
  border: 1px solid rgba(255, 255, 255, 0.1);
78
  border-radius: 100px;
79
+ color: #ffffff !important;
80
+ text-decoration: none !important;
81
  font-size: 0.95rem;
82
  transition: all 0.2s ease;
83
  }
 
85
  .action-button:hover {
86
  background: rgba(40, 40, 55, 0.95);
87
  border-color: rgba(255, 255, 255, 0.2);
88
+ color: #ffffff !important;
89
  }
90
 
91
  .update-info {
 
159
  border-radius: 50%;
160
  flex-shrink: 0;
161
  }
162
+
163
+ /* Force all links to be white */
164
+ .header-wrapper a:link,
165
+ .header-wrapper a:visited,
166
+ .header-wrapper a:hover,
167
+ .header-wrapper a:active {
168
+ color: #ffffff !important;
169
+ }
170
  </style>
171
 
172
  <div class="header-wrapper">
173
+ <h1 class="title" style="font-size: 48px; font-weight: 700; margin: 40px 0; text-align: center;">Agent Leaderboard</h1>
174
+ <h2>Comprehensive multi-benchmark evaluation for tool calling</h2>
 
 
175
 
176
+ <div class="actions">
177
+ <a href="#" class="action-button">
178
+ <svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
179
+ <path d="M15 7h3a5 5 0 0 1 5 5 5 5 0 0 1-5 5h-3m-6 0H6a5 5 0 0 1-5-5 5 5 0 0 1 5-5h3"/>
180
+ <line x1="8" y1="12" x2="16" y2="12"/>
181
+ </svg>
182
+ Blog
183
+ </a>
184
+ <a href="#" class="action-button">
185
+ <svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
186
+ <path d="M9 19c-5 1.5-5-2.5-7-3m14 6v-3.87a3.37 3.37 0 0 0-.94-2.61c3.14-.35 6.44-1.54 6.44-7A5.44 5.44 0 0 0 20 4.77 5.07 5.07 0 0 0 19.91 1S18.73.65 16 2.48a13.38 13.38 0 0 0-7 0C6.27.65 5.09 1 5.09 1A5.07 5.07 0 0 0 5 4.77a5.44 5.44 0 0 0-1.5 3.78c0 5.42 3.3 6.61 6.44 7A3.37 3.37 0 0 0 9 18.13V22"/>
187
+ </svg>
188
+ GitHub
189
+ </a>
190
+ <a href="#" class="action-button">
191
+ <svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
192
+ <path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4"/>
193
+ <polyline points="7 10 12 15 17 10"/>
194
+ <line x1="12" y1="15" x2="12" y2="3"/>
195
+ </svg>
196
+ Dataset
197
+ </a>
198
+ </div>
199
+ """
200
 
201
+ CARDS = """
202
  <div class="features-grid">
203
  <div class="feature-card">
204
  <div class="feature-icon">
 
247
  </div>
248
  </div>
249
  """
250
+
251
+
252
+ METHODOLOGY = """# Methodology
253
+ ## Overview
254
+ The Agent Leaderboard evaluates language models' ability to effectively use tools and maintain coherent multi-turn conversations.
255
+ The evaluation focuses on both basic functionality and edge cases that challenge real-world applicability.
256
+
257
+ ## Tool Selection Quality Metric
258
+ Models are evaluated on their ability to:
259
+ - Correctly identify when tools are needed
260
+ - Select the appropriate tool for the task
261
+ - Handle cases where no suitable tool exists
262
+ - Maintain context across multiple interactions
263
+
264
+ ## Dataset Structure
265
+ | Type | Samples | Category | Dataset Name | Purpose |
266
+ |------|---------|-----------|--------------|----------|
267
+ | Single-Turn | 100 + 100 | Single Function Call | xlam_single_tool_single_call | Evaluates basic ability to read documentation and make single function calls |
268
+ | | 200 + 50 | Multiple Function Call | xlam_multiple_tool_multiple_call, xlam_single_tool_multiple_call | Tests parallel execution and result aggregation capabilities |
269
+ | | 100 | Irrelevant Query | BFCL_v3_irrelevance | Tests ability to recognize when available tools don't match user needs |
270
+ | | 100 | Long Context | tau_long_context | Assesses handling of extended interactions and complex instructions |
271
+ | Multi-Turn | 50 + 30 | Single Function Call | BFCL_v3_multi_turn_base_single_func_call, toolscs_single_func_call | Tests basic conversational function calling abilities |
272
+ | | 50 | Multiple Function Call | BFCL_v3_multi_turn_base_multi_func_call | Evaluates handling of multiple function calls in conversation |
273
+ | | 100 | Missing Function | BFCL_v3_multi_turn_miss_func | Tests graceful handling of unavailable tools |
274
+ | | 100 | Missing Parameters | BFCL_v3_multi_turn_miss_param | Assesses parameter collection and handling incomplete information |
275
+ | | 100 | Composite | BFCL_v3_multi_turn_composite | Tests overall robustness in complex scenarios |
276
+ """
277
+
278
+
279
+ INSIGHTS = """
280
+ # Key Insights from Agent Leaderboard
281
+
282
+ | Category | Finding | Implications |
283
+ |----------|---------|--------------|
284
+ | Performance Leader | Gemini-2.0-flash dominates with excellent performance at a fraction of typical costs | Demonstrates that top-tier performance is achievable without premium pricing |
285
+ | Cost vs Performance | Top 3 models span a 200x price difference yet show only 6% performance gap | Challenges traditional pricing assumptions in the market and suggests potential overpricing at the high end |
286
+ | Open Source Models | Qwen-72b matches premium models in safety and context handling at lower cost | Signals growing maturity in open-source models and potential for broader adoption |
287
+ | Safety Features | While irrelevance detection is widely solved, tool miss detection remains a challenge | Highlights uneven development in safety features and areas needing focused improvement |
288
+ | Edge Case Handling | Models still struggle with maintaining context in complex scenarios | Indicates need for architectural improvements in handling sophisticated interactions |
289
+ | Architecture Impact | Models show clear trade-offs between context handling and parallel execution | Suggests need for specialized models or hybrid approaches for different use cases |
290
+
291
+ **Note:** Findings based on comprehensive evaluation across multiple tasks and scenarios.
292
+ """