Spaces:
Running
Running
Pratik Bhavsar
committed on
Commit
·
80c01c6
1
Parent(s):
fe118de
improved title
Browse files
- app.py +10 -12
- data_loader.py +202 -15
app.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
import gradio as gr
|
2 |
-
from data_loader import load_data, CATEGORIES, INSIGHTS, METHODOLOGY, TITLE
|
3 |
from utils import model_info_tab, filter_leaderboard
|
4 |
from visualization import setup_matplotlib
|
5 |
|
@@ -31,10 +31,17 @@ def create_app():
|
|
31 |
)
|
32 |
|
33 |
with gr.Column(scale=4):
|
34 |
-
|
|
|
35 |
output = gr.HTML()
|
36 |
plot1 = gr.Plot()
|
37 |
plot2 = gr.Plot()
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
|
39 |
for input_comp in [model_type, category, sort_by]:
|
40 |
input_comp.change(
|
@@ -44,7 +51,7 @@ def create_app():
|
|
44 |
)
|
45 |
|
46 |
with gr.Tab("Model Performance"):
|
47 |
-
gr.
|
48 |
with gr.Row():
|
49 |
with gr.Column(scale=1):
|
50 |
model_selector = gr.Dropdown(
|
@@ -65,14 +72,6 @@ def create_app():
|
|
65 |
outputs=[model_info, radar_plot],
|
66 |
)
|
67 |
|
68 |
-
with gr.Tab("Methodology"):
|
69 |
-
gr.Markdown(TITLE)
|
70 |
-
gr.Markdown(METHODOLOGY)
|
71 |
-
|
72 |
-
with gr.Tab("Insights"):
|
73 |
-
gr.Markdown(TITLE)
|
74 |
-
gr.Markdown(INSIGHTS)
|
75 |
-
|
76 |
app.load(
|
77 |
fn=lambda: filter_leaderboard(
|
78 |
df, "All", list(CATEGORIES.keys())[0], "Performance"
|
@@ -90,7 +89,6 @@ def create_app():
|
|
90 |
return app
|
91 |
|
92 |
|
93 |
-
# main.py
|
94 |
if __name__ == "__main__":
|
95 |
demo = create_app()
|
96 |
demo.launch()
|
|
|
1 |
import gradio as gr
|
2 |
+
from data_loader import load_data, CATEGORIES, INSIGHTS, METHODOLOGY, HEADER_CONTENT
|
3 |
from utils import model_info_tab, filter_leaderboard
|
4 |
from visualization import setup_matplotlib
|
5 |
|
|
|
31 |
)
|
32 |
|
33 |
with gr.Column(scale=4):
|
34 |
+
# Add the new header content above everything
|
35 |
+
gr.HTML(HEADER_CONTENT)
|
36 |
output = gr.HTML()
|
37 |
plot1 = gr.Plot()
|
38 |
plot2 = gr.Plot()
|
39 |
+
# Add methodology section
|
40 |
+
gr.Markdown("# Methodology")
|
41 |
+
gr.Markdown(METHODOLOGY)
|
42 |
+
# Add insights section
|
43 |
+
gr.Markdown("# Key Insights")
|
44 |
+
gr.Markdown(INSIGHTS)
|
45 |
|
46 |
for input_comp in [model_type, category, sort_by]:
|
47 |
input_comp.change(
|
|
|
51 |
)
|
52 |
|
53 |
with gr.Tab("Model Performance"):
|
54 |
+
gr.HTML(HEADER_CONTENT)
|
55 |
with gr.Row():
|
56 |
with gr.Column(scale=1):
|
57 |
model_selector = gr.Dropdown(
|
|
|
72 |
outputs=[model_info, radar_plot],
|
73 |
)
|
74 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
75 |
app.load(
|
76 |
fn=lambda: filter_leaderboard(
|
77 |
df, "All", list(CATEGORIES.keys())[0], "Performance"
|
|
|
89 |
return app
|
90 |
|
91 |
|
|
|
92 |
if __name__ == "__main__":
|
93 |
demo = create_app()
|
94 |
demo.launch()
|
data_loader.py
CHANGED
@@ -77,22 +77,209 @@ METHODOLOGY = """
|
|
77 |
| | 100 | Composite | BFCL_v3_multi_turn_composite | Tests overall robustness in complex scenarios |
|
78 |
"""
|
79 |
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
93 |
</div>
|
94 |
-
|
95 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
96 |
</div>
|
97 |
</div>
|
|
|
98 |
"""
|
|
|
77 |
| | 100 | Composite | BFCL_v3_multi_turn_composite | Tests overall robustness in complex scenarios |
|
78 |
"""
|
79 |
|
80 |
+
HEADER_CONTENT = """
|
81 |
+
<style>
|
82 |
+
.header-wrapper {
|
83 |
+
padding: 3rem 2rem;
|
84 |
+
background: rgb(17, 17, 27);
|
85 |
+
border-radius: 16px;
|
86 |
+
display: flex;
|
87 |
+
flex-direction: column;
|
88 |
+
align-items: center;
|
89 |
+
text-align: center;
|
90 |
+
}
|
91 |
+
|
92 |
+
.title {
|
93 |
+
color: #ffffff;
|
94 |
+
font-size: 2.5rem;
|
95 |
+
font-weight: 600;
|
96 |
+
margin-bottom: 1.5rem;
|
97 |
+
text-align: center;
|
98 |
+
}
|
99 |
+
|
100 |
+
.description {
|
101 |
+
color: #ffffff;
|
102 |
+
font-size: 1.1rem;
|
103 |
+
line-height: 1.6;
|
104 |
+
max-width: 800px;
|
105 |
+
margin: 0 auto 2rem;
|
106 |
+
text-align: center;
|
107 |
+
}
|
108 |
+
|
109 |
+
.actions {
|
110 |
+
display: flex;
|
111 |
+
gap: 1rem;
|
112 |
+
justify-content: center;
|
113 |
+
margin-bottom: 2rem;
|
114 |
+
}
|
115 |
+
|
116 |
+
.action-button {
|
117 |
+
display: flex;
|
118 |
+
align-items: center;
|
119 |
+
gap: 0.5rem;
|
120 |
+
padding: 0.75rem 1.5rem;
|
121 |
+
background: rgba(30, 30, 45, 0.95);
|
122 |
+
border: 1px solid rgba(255, 255, 255, 0.1);
|
123 |
+
border-radius: 100px;
|
124 |
+
color: #fff;
|
125 |
+
text-decoration: none;
|
126 |
+
font-size: 0.95rem;
|
127 |
+
transition: all 0.2s ease;
|
128 |
+
}
|
129 |
+
|
130 |
+
.action-button:hover {
|
131 |
+
background: rgba(40, 40, 55, 0.95);
|
132 |
+
border-color: rgba(255, 255, 255, 0.2);
|
133 |
+
}
|
134 |
+
|
135 |
+
.update-info {
|
136 |
+
color: #94a3b8;
|
137 |
+
font-size: 0.9rem;
|
138 |
+
margin-bottom: 3rem;
|
139 |
+
}
|
140 |
+
|
141 |
+
.features-grid {
|
142 |
+
display: grid;
|
143 |
+
grid-template-columns: repeat(3, 1fr);
|
144 |
+
gap: 1.5rem;
|
145 |
+
width: 100%;
|
146 |
+
max-width: 1200px;
|
147 |
+
}
|
148 |
+
|
149 |
+
.feature-card {
|
150 |
+
background: rgba(17, 17, 27, 0.6);
|
151 |
+
border: 1px solid rgba(255, 255, 255, 0.1);
|
152 |
+
border-radius: 16px;
|
153 |
+
padding: 2rem;
|
154 |
+
text-align: left;
|
155 |
+
}
|
156 |
+
|
157 |
+
.feature-icon {
|
158 |
+
background: rgba(79, 70, 229, 0.1);
|
159 |
+
width: 40px;
|
160 |
+
height: 40px;
|
161 |
+
border-radius: 12px;
|
162 |
+
display: flex;
|
163 |
+
align-items: center;
|
164 |
+
justify-content: center;
|
165 |
+
margin-bottom: 1.5rem;
|
166 |
+
}
|
167 |
+
|
168 |
+
.feature-title {
|
169 |
+
color: #ffffff;
|
170 |
+
font-size: 1.25rem;
|
171 |
+
font-weight: 600;
|
172 |
+
margin-bottom: 1rem;
|
173 |
+
}
|
174 |
+
|
175 |
+
.feature-description {
|
176 |
+
color: #94a3b8;
|
177 |
+
font-size: 0.95rem;
|
178 |
+
margin-bottom: 1.5rem;
|
179 |
+
}
|
180 |
+
|
181 |
+
.feature-list {
|
182 |
+
list-style: none;
|
183 |
+
padding: 0;
|
184 |
+
margin: 0;
|
185 |
+
display: flex;
|
186 |
+
flex-direction: column;
|
187 |
+
gap: 0.75rem;
|
188 |
+
}
|
189 |
+
|
190 |
+
.feature-list li {
|
191 |
+
color: #e2e8f0;
|
192 |
+
font-size: 0.95rem;
|
193 |
+
display: flex;
|
194 |
+
align-items: center;
|
195 |
+
gap: 0.5rem;
|
196 |
+
}
|
197 |
+
|
198 |
+
.feature-list li::before {
|
199 |
+
content: '';
|
200 |
+
width: 6px;
|
201 |
+
height: 6px;
|
202 |
+
background: #4F46E5;
|
203 |
+
border-radius: 50%;
|
204 |
+
flex-shrink: 0;
|
205 |
+
}
|
206 |
+
</style>
|
207 |
+
|
208 |
+
<div class="header-wrapper">
|
209 |
+
<h1 class="title">Agent Leaderboard</h1>
|
210 |
+
<p class="description">
|
211 |
+
A comprehensive benchmark for evaluating AI agents in real-world business scenarios, comparing practical performance across multiple domains and use cases.
|
212 |
+
</p>
|
213 |
+
|
214 |
+
<div class="actions">
|
215 |
+
<a href="#" class="action-button">
|
216 |
+
<svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
|
217 |
+
<path d="M15 7h3a5 5 0 0 1 5 5 5 5 0 0 1-5 5h-3m-6 0H6a5 5 0 0 1-5-5 5 5 0 0 1 5-5h3"/>
|
218 |
+
<line x1="8" y1="12" x2="16" y2="12"/>
|
219 |
+
</svg>
|
220 |
+
Blog
|
221 |
+
</a>
|
222 |
+
<a href="#" class="action-button">
|
223 |
+
<svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
|
224 |
+
<path d="M9 19c-5 1.5-5-2.5-7-3m14 6v-3.87a3.37 3.37 0 0 0-.94-2.61c3.14-.35 6.44-1.54 6.44-7A5.44 5.44 0 0 0 20 4.77 5.07 5.07 0 0 0 19.91 1S18.73.65 16 2.48a13.38 13.38 0 0 0-7 0C6.27.65 5.09 1 5.09 1A5.07 5.07 0 0 0 5 4.77a5.44 5.44 0 0 0-1.5 3.78c0 5.42 3.3 6.61 6.44 7A3.37 3.37 0 0 0 9 18.13V22"/>
|
225 |
+
</svg>
|
226 |
+
GitHub
|
227 |
+
</a>
|
228 |
+
<a href="#" class="action-button">
|
229 |
+
<svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
|
230 |
+
<path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4"/>
|
231 |
+
<polyline points="7 10 12 15 17 10"/>
|
232 |
+
<line x1="12" y1="15" x2="12" y2="3"/>
|
233 |
+
</svg>
|
234 |
+
Dataset
|
235 |
+
</a>
|
236 |
+
</div>
|
237 |
+
|
238 |
+
<div class="features-grid">
|
239 |
+
<div class="feature-card">
|
240 |
+
<div class="feature-icon">
|
241 |
+
<svg width="24" height="24" fill="none" stroke="#4F46E5" stroke-width="2" viewBox="0 0 24 24">
|
242 |
+
<path d="M21 16V8a2 2 0 0 0-1-1.73l-7-4a2 2 0 0 0-2 0l-7 4A2 2 0 0 0 3 8v8a2 2 0 0 0 1 1.73l7 4a2 2 0 0 0 2 0l7-4A2 2 0 0 0 21 16z"/>
|
243 |
+
</svg>
|
244 |
+
</div>
|
245 |
+
<h3 class="feature-title">360° Domain Evaluation</h3>
|
246 |
+
<p class="feature-description">Comprehensive evaluation across multiple benchmarks and domains:</p>
|
247 |
+
<ul class="feature-list">
|
248 |
+
<li>Cross-domain evaluation</li>
|
249 |
+
<li>Real-world use cases</li>
|
250 |
+
<li>Edge case evaluation</li>
|
251 |
+
</ul>
|
252 |
</div>
|
253 |
+
|
254 |
+
<div class="feature-card">
|
255 |
+
<div class="feature-icon">
|
256 |
+
<svg width="24" height="24" fill="none" stroke="#4F46E5" stroke-width="2" viewBox="0 0 24 24">
|
257 |
+
<path d="M22 12h-4l-3 9L9 3l-3 9H2"/>
|
258 |
+
</svg>
|
259 |
+
</div>
|
260 |
+
<h3 class="feature-title">Make Better Decisions</h3>
|
261 |
+
<p class="feature-description">Beyond technical metrics, we provide:</p>
|
262 |
+
<ul class="feature-list">
|
263 |
+
<li>Cost-effectiveness analysis</li>
|
264 |
+
<li>Business impact metrics</li>
|
265 |
+
<li>Vendor strategy insights</li>
|
266 |
+
</ul>
|
267 |
+
</div>
|
268 |
+
|
269 |
+
<div class="feature-card">
|
270 |
+
<div class="feature-icon">
|
271 |
+
<svg width="24" height="24" fill="none" stroke="#4F46E5" stroke-width="2" viewBox="0 0 24 24">
|
272 |
+
<path d="M21 2v6h-6M3 12a9 9 0 0 1 15-6.7L21 8M3 12a9 9 0 0 0 15 6.7L21 16M21 22v-6h-6"/>
|
273 |
+
</svg>
|
274 |
+
</div>
|
275 |
+
<h3 class="feature-title">Updated Periodically</h3>
|
276 |
+
<p class="feature-description">Regular updates with latest models:</p>
|
277 |
+
<ul class="feature-list">
|
278 |
+
<li>11 private models evaluated</li>
|
279 |
+
<li>5 open source models included</li>
|
280 |
+
<li>Monthly model additions</li>
|
281 |
+
</ul>
|
282 |
</div>
|
283 |
</div>
|
284 |
+
</div>
|
285 |
"""
|