Spaces: Running
Pratik Bhavsar committed · Commit 10ad72f · 1 Parent(s): 19b159e
working draft
Browse files
- app.py +79 -267
- data_loader.py +78 -0
- utils.py +70 -0
- visualization.py +221 -0
app.py
CHANGED
@@ -1,281 +1,93 @@
 import gradio as gr
-import
-import

-        values = [model_data[m] for m in datasets]
-        values.append(values[0])
-        datasets_plot = datasets + [datasets[0]]

-                r=values,
-                theta=datasets_plot,
-                fill="toself",
-                fillcolor=colors[idx % len(colors)],
-                line=dict(color=line_colors[idx % len(line_colors)], width=2),
-                name=model_name,
-                text=[f"{val:.3f}" for val in values],
-                textposition="middle right",
-                mode="lines+markers+text",
-            )
-        )

-                visible=True, range=[0, 1], showline=False, tickfont=dict(size=12)
-            ),
-            angularaxis=dict(
-                tickfont=dict(size=13, family="Arial"),
-                rotation=90,
-                direction="clockwise",
-            ),
-        showlegend=True,
-        title=dict(
-            text="Model Comparison",
-            x=0.5,
-            y=0.95,
-            font=dict(size=24, family="Arial", color="#1F2937"),
-        ),
-        paper_bgcolor="white",
-        plot_bgcolor="white",
-        height=800,
-        width=1000,
-    )
-
-    return fig
-
-
-def model_info_tab(model_names=None):
-    if model_names is None or len(model_names) == 0:
-        model_names = [df.sort_values("Model Avg", ascending=False).iloc[0]["Model"]]
-
-    filtered_df = df[df["Model"].isin(model_names)]
-    radar_chart = create_radar_plot(df, model_names)
-    info_html = filtered_df[
-        [
-            "Model",
-            "Model Type",
-            "Model Avg",
-            "Input cost per million token",
-            "Output cost per million token",
-            "single turn perf",
-            "multi turn perf",
-        ]
-    ].to_html(index=False)
-
-    return info_html, radar_chart
-
-
-def get_performance_chart(df):
-    df_sorted = df.sort_values("Model Avg", ascending=True)
-    colors = {"Private": "#4F46E5", "Open source": "#16A34A"}
-
-    fig, ax = plt.subplots(figsize=(16, 10))
-    bars = ax.barh(
-        np.arange(len(df_sorted)),
-        df_sorted["Model Avg"],
-        height=0.4,
-        color=[colors[t] for t in df_sorted["Model Type"]],
-    )
-
-    ax.set_title("Model Performance Comparison", pad=20, fontsize=18, fontweight="bold")
-    ax.set_xlabel("Average Score", fontsize=12, labelpad=10)
-    ax.set_xlim(0.6, 1.0)
-    ax.set_yticks(np.arange(len(df_sorted)))
-    ax.set_yticklabels(df_sorted["Model"], fontsize=10)
-
-    for i, v in enumerate(df_sorted["Model Avg"]):
-        ax.text(v + 0.005, i, f"{v:.3f}", va="center", fontsize=10)
-
-    ax.grid(True, axis="x", linestyle="--", alpha=0.2)
-    ax.spines[["top", "right"]].set_visible(False)
-
-    legend_elements = [
-        plt.Rectangle((0, 0), 1, 1, facecolor=color, label=label)
-        for label, color in colors.items()
-    ]
-    ax.legend(handles=legend_elements, title="Model Type", loc="lower right")
-
-    plt.tight_layout()
-    return fig
-
-def get_performance_cost_chart(df):
-    plt.figure(figsize=(12, 8), dpi=300)
-    plt.grid(True, linestyle="--", alpha=0.2)
-
-    colors = {"Private": "#4F46E5", "Open source": "#16A34A"}
-    performance_colors = ["#DCFCE7", "#FEF9C3", "#FEE2E2"]
-
-    for _, row in df.iterrows():
-        color = colors[row["Model Type"]]
-        size = 100 if row["Model Avg"] > 0.85 else 80
-        edge_color = "#3730A3" if row["Model Type"] == "Private" else "#166534"
-
-        plt.scatter(
-            row["Input cost per million token"],
-            row["Model Avg"] * 100,
-            c=color,
-            s=size,
-            alpha=0.9,
-            edgecolor=edge_color,
-            linewidth=1,
-        )

-            fontsize=9,
-            bbox=dict(facecolor="white", edgecolor="none", alpha=0.7),
-        )

-    plt.xlabel("Cost per Million Tokens ($)", fontsize=12, weight="bold")
-    plt.ylabel("Model Performance Score", fontsize=12, weight="bold")
-    plt.ylim(60, 95)
-
-    legend_elements = [
-        plt.scatter([], [], c=color, label=label, s=80)
-        for label, color in colors.items()
-    ]
-    plt.legend(handles=legend_elements, loc="upper right")
-    plt.title("AI Language Model Performance vs. Cost", fontsize=14, weight="bold")
-
-    for y1, y2, color in zip([85, 75, 60], [95, 85, 75], performance_colors):
-        plt.axhspan(y1, y2, alpha=0.2, color=color)
-
-    plt.tight_layout()
-    return plt.gcf()
-
-
-def filter_leaderboard(model_type, category):
-    filtered_df = df.copy()
-    if model_type != "All":
-        filtered_df = filtered_df[filtered_df["Model Type"].str.strip() == model_type]
-
-    dataset_columns = categories.get(category, ["Model Avg"])
-    avg_score = filtered_df[dataset_columns].mean(axis=1)
-    filtered_df["Category Score"] = avg_score
-
-    filtered_df = filtered_df.sort_values(by="Category Score", ascending=False)
-    filtered_df["Rank"] = range(1, len(filtered_df) + 1)
-
-    perf_chart = get_performance_chart(filtered_df)
-    cost_chart = get_performance_cost_chart(filtered_df)
-
-    display_columns = [
-        "Rank",
-        "Model",
-        "Model Type",
-        "Input cost per million token",
-        "Output cost per million token",
-        "Category Score",
-    ]
-
-    table_html = filtered_df[display_columns].to_html(index=False)
-    return table_html, perf_chart, cost_chart
-
-
-with gr.Blocks(theme=gr.themes.Soft()) as app:
-    with gr.Tabs():
-        with gr.Tab("Leaderboard"):
-            with gr.Row():
-                with gr.Column(scale=1):
-                    gr.Markdown("# Filters")
-                    model_type = gr.Dropdown(
-                        choices=["All"] + df["Model Type"].unique().tolist(),
-                        value="All",
-                        label="Model Type",
-                    )
-                    category = gr.Dropdown(
-                        choices=list(categories.keys()),
-                        value=list(categories.keys())[0],
-                        label="Category",
-                    )
-
-                with gr.Column(scale=4):
-                    gr.Markdown("# Agent Leaderboard")
-                    output = gr.HTML()
-                    plot1 = gr.Plot()
-                    plot2 = gr.Plot()
-
-            for input_comp in [model_type, category]:
-                input_comp.change(
-                    fn=filter_leaderboard,
-                    inputs=[model_type, category],
-                    outputs=[output, plot1, plot2],
-                )
-
-        with gr.Tab("Model Performance"):
-            with gr.Row():
-                with gr.Column(scale=1):
-                    model_selector = gr.Dropdown(
-                        choices=df["Model"].unique().tolist(),
-                        value=df.sort_values("Model Avg", ascending=False).iloc[0][
-                            "Model"
-                        ],
-                        multiselect=True,
-                        label="Models",
-                    )
-                with gr.Column(scale=4):
-                    model_info = gr.HTML()
-                    radar_plot = gr.Plot()
-
-            model_selector.change(
-                fn=model_info_tab,
-                inputs=[model_selector],
-                outputs=[model_info, radar_plot],
-            )
-
-    app.load(
-        fn=lambda: filter_leaderboard("All", list(categories.keys())[0]),
-        outputs=[output, plot1, plot2],
-    )
-
-    app.load(
-        fn=lambda: model_info_tab(
-            [df.sort_values("Model Avg", ascending=False).iloc[0]["Model"]]
-        ),
-        outputs=[model_info, radar_plot],
-    )

 if __name__ == "__main__":
-    demo =
     demo.launch()
 import gradio as gr
+from data_loader import load_data, CATEGORIES, INSIGHTS, METHODOLOGY
+from utils import model_info_tab, filter_leaderboard
+from visualization import setup_matplotlib
+
+
+def create_app():
+    setup_matplotlib()
+    df = load_data()
+
+    with gr.Blocks(theme=gr.themes.Soft()) as app:
+        with gr.Tabs():
+            with gr.Tab("Leaderboard"):
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        gr.Markdown("# Filters")
+                        model_type = gr.Dropdown(
+                            choices=["All"] + df["Model Type"].unique().tolist(),
+                            value="All",
+                            label="Model Type",
+                        )
+                        category = gr.Dropdown(
+                            choices=list(CATEGORIES.keys()),
+                            value=list(CATEGORIES.keys())[0],
+                            label="Category",
+                        )
+                        sort_by = gr.Radio(
+                            choices=["Performance", "Cost"],
+                            value="Performance",
+                            label="Sort by",
+                        )
+
+                    with gr.Column(scale=4):
+                        gr.Markdown("# Agent Leaderboard")
+                        output = gr.HTML()
+                        plot1 = gr.Plot()
+                        plot2 = gr.Plot()
+
+                for input_comp in [model_type, category, sort_by]:
+                    input_comp.change(
+                        fn=lambda m, c, s: filter_leaderboard(df, m, c, s),
+                        inputs=[model_type, category, sort_by],
+                        outputs=[output, plot1, plot2],
+                    )

+            with gr.Tab("Model Performance"):
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        model_selector = gr.Dropdown(
+                            choices=df["Model"].unique().tolist(),
+                            value=df.sort_values("Model Avg", ascending=False).iloc[0][
+                                "Model"
+                            ],
+                            multiselect=True,
+                            label="Models",
+                        )
+                    with gr.Column(scale=4):
+                        model_info = gr.HTML()
+                        radar_plot = gr.Plot()
+
+                model_selector.change(
+                    fn=lambda m: model_info_tab(df, m),
+                    inputs=[model_selector],
+                    outputs=[model_info, radar_plot],
+                )

+            with gr.Tab("Methodology"):
+                gr.Markdown(METHODOLOGY)

+            with gr.Tab("Insights"):
+                gr.Markdown(INSIGHTS)

+        app.load(
+            fn=lambda: filter_leaderboard(
+                df, "All", list(CATEGORIES.keys())[0], "Performance"
+            ),
+            outputs=[output, plot1, plot2],
+        )

+        app.load(
+            fn=lambda: model_info_tab(
+                df, [df.sort_values("Model Avg", ascending=False).iloc[0]["Model"]]
+            ),
+            outputs=[model_info, radar_plot],
+        )

+    return app


+# main.py
 if __name__ == "__main__":
+    demo = create_app()
     demo.launch()
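
In the refactored app.py the event callbacks close over the `df` loaded inside create_app() instead of reading a module-level global. Purely for reference (not part of the commit), the same binding can be written with functools.partial; the sketch below assumes results.csv is available, as it is inside the Space.

# Not in the commit: an equivalent way to bind df to the leaderboard callback.
from functools import partial

from data_loader import load_data
from utils import filter_leaderboard

df = load_data()  # reads results.csv, as create_app() does
on_filter_change = partial(filter_leaderboard, df)

# app.py achieves the same with fn=lambda m, c, s: filter_leaderboard(df, m, c, s)
table_html, perf_chart, cost_chart = on_filter_change("All", "Overall", "Performance")
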
data_loader.py
ADDED
@@ -0,0 +1,78 @@
import pandas as pd


def load_data():
    """Load and preprocess the data."""
    df = pd.read_csv("results.csv").dropna()

    # Add combined I/O cost column with 3:1 ratio
    df["IO Cost"] = (
        df["Input cost per million token"] * 0.75
        + df["Output cost per million token"] * 0.25
    )
    return df


# categories.py
CATEGORIES = {
    "Overall": ["Model Avg"],
    "Overall single turn": ["single turn perf"],
    "Overall multi turn": ["multi turn perf"],
    "Single func call": [
        "xlam_single_tool_single_call",
        "xlam_multiple_tool_single_call",
    ],
    "Multiple func call": [
        "xlam_multiple_tool_multiple_call",
        "xlam_single_tool_multiple_call",
        "BFCL_v3_multi_turn_base_multi_func_call",
    ],
    "Irrelevant query": ["BFCL_v3_irrelevance"],
    "Long context": ["tau_long_context", "BFCL_v3_multi_turn_long_context"],
    "Missing func": ["xlam_tool_miss", "BFCL_v3_multi_turn_miss_func"],
    "Missing params": ["BFCL_v3_multi_turn_miss_param"],
    "Composite": ["BFCL_v3_multi_turn_composite"],
}

INSIGHTS = """
# Key Insights from Agent Leaderboard

| Category | Finding | Implications |
|----------|---------|--------------|
| Performance Leader | Gemini-2.0-flash dominates with excellent performance at a fraction of typical costs | Demonstrates that top-tier performance is achievable without premium pricing |
| Cost vs Performance | Top 3 models span a 200x price difference yet show only 6% performance gap | Challenges traditional pricing assumptions in the market and suggests potential overpricing at the high end |
| Open Source Models | Qwen-72b matches premium models in safety and context handling at lower cost | Signals growing maturity in open-source models and potential for broader adoption |
| Safety Features | While irrelevance detection is widely solved, tool miss detection remains a challenge | Highlights uneven development in safety features and areas needing focused improvement |
| Edge Case Handling | Models still struggle with maintaining context in complex scenarios | Indicates need for architectural improvements in handling sophisticated interactions |
| Architecture Impact | Models show clear trade-offs between context handling and parallel execution | Suggests need for specialized models or hybrid approaches for different use cases |

**Note:** Findings based on comprehensive evaluation across multiple tasks and scenarios.
"""

METHODOLOGY = """
# Methodology

## Overview
The Agent Leaderboard evaluates language models' ability to effectively use tools and maintain coherent multi-turn conversations.
The evaluation focuses on both basic functionality and edge cases that challenge real-world applicability.

## Tool Selection Quality Metric
Models are evaluated on their ability to:
- Correctly identify when tools are needed
- Select the appropriate tool for the task
- Handle cases where no suitable tool exists
- Maintain context across multiple interactions

## Dataset Structure
| Type | Samples | Category | Dataset Name | Purpose |
|------|---------|-----------|--------------|----------|
| Single-Turn | 100 + 100 | Single Function Call | xlam_single_tool_single_call | Evaluates basic ability to read documentation and make single function calls |
| | 200 + 50 | Multiple Function Call | xlam_multiple_tool_multiple_call, xlam_single_tool_multiple_call | Tests parallel execution and result aggregation capabilities |
| | 100 | Irrelevant Query | BFCL_v3_irrelevance | Tests ability to recognize when available tools don't match user needs |
| | 100 | Long Context | tau_long_context | Assesses handling of extended interactions and complex instructions |
| Multi-Turn | 50 + 30 | Single Function Call | BFCL_v3_multi_turn_base_single_func_call, toolscs_single_func_call | Tests basic conversational function calling abilities |
| | 50 | Multiple Function Call | BFCL_v3_multi_turn_base_multi_func_call | Evaluates handling of multiple function calls in conversation |
| | 100 | Missing Function | BFCL_v3_multi_turn_miss_func | Tests graceful handling of unavailable tools |
| | 100 | Missing Parameters | BFCL_v3_multi_turn_miss_param | Assesses parameter collection and handling incomplete information |
| | 100 | Composite | BFCL_v3_multi_turn_composite | Tests overall robustness in complex scenarios |
"""
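
The `IO Cost` column blends the two prices with a 3:1 input-to-output token ratio, i.e. 0.75 times the input price plus 0.25 times the output price. A quick sanity check with made-up prices (not taken from results.csv):

# Hypothetical prices for illustration: $1.00 per million input tokens,
# $3.00 per million output tokens.
input_cost, output_cost = 1.00, 3.00
io_cost = input_cost * 0.75 + output_cost * 0.25  # same blend as load_data()
print(io_cost)  # 1.50, the value used for "Cost" sorting and the scatter x-axis
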
utils.py
ADDED
@@ -0,0 +1,70 @@
from data_loader import CATEGORIES
from visualization import (
    create_radar_plot,
    get_performance_chart,
    get_performance_cost_chart,
)


def model_info_tab(df, model_names=None):
    if model_names is None or len(model_names) == 0:
        model_names = [df.sort_values("Model Avg", ascending=False).iloc[0]["Model"]]

    filtered_df = df[df["Model"].isin(model_names)]
    radar_chart = create_radar_plot(df, model_names)
    info_html = filtered_df[
        [
            "Model",
            "Model Type",
            "Model Avg",
            "IO Cost",
            "single turn perf",
            "multi turn perf",
        ]
    ].to_html(index=False)

    return info_html, radar_chart


def filter_leaderboard(df, model_type, category, sort_by):
    filtered_df = df.copy()
    if model_type != "All":
        filtered_df = filtered_df[filtered_df["Model Type"].str.strip() == model_type]

    dataset_columns = CATEGORIES.get(category, ["Model Avg"])
    avg_score = filtered_df[dataset_columns].mean(axis=1)
    filtered_df["Category Score"] = avg_score

    if sort_by == "Performance":
        filtered_df = filtered_df.sort_values(by="Category Score", ascending=False)
    else:
        filtered_df = filtered_df.sort_values(by="IO Cost", ascending=True)

    filtered_df["Rank"] = range(1, len(filtered_df) + 1)

    perf_chart = get_performance_chart(filtered_df, category)
    cost_chart = get_performance_cost_chart(filtered_df, category)

    filtered_df["Cost (Input/Output)"] = filtered_df.apply(
        lambda x: f"${x['Input cost per million token']:.2f}/${x['Output cost per million token']:.2f}",
        axis=1,
    )

    display_columns = [
        "Rank",
        "Model",
        "Model Type",
        "Cost (Input/Output)",
        "Category Score",
    ]

    table_html = filtered_df[display_columns].to_html(index=False, escape=False)
    note_html = """
    <div style='margin-top: 20px; padding: 10px; background-color: #f3f4f6; border-radius: 4px;'>
        <p style='margin: 0; font-size: 0.9em; color: #4b5563;'>
            Note: Cost for sorting is calculated using 3:1 ratio on I/O. Cost of Gemini 2.0 is assumed to be same as that of Gemini 1.5.
        </p>
    </div>
    """
    table_html += note_html
    return table_html, perf_chart, cost_chart
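
A minimal smoke test of filter_leaderboard, assuming the three new modules are importable; the two rows below are fabricated and only mirror the column names the function and the charts expect:

import pandas as pd

from utils import filter_leaderboard
from visualization import setup_matplotlib

setup_matplotlib()  # switch matplotlib to the non-interactive Agg backend

# Toy frame with just the columns touched by filter_leaderboard and the charts.
toy_df = pd.DataFrame(
    {
        "Model": ["model-a", "model-b"],
        "Model Type": ["Private", "Open source"],
        "Model Avg": [0.90, 0.82],
        "Input cost per million token": [2.00, 0.50],
        "Output cost per million token": [6.00, 1.50],
        "IO Cost": [3.00, 0.75],  # 0.75 * input + 0.25 * output
    }
)

table_html, perf_chart, cost_chart = filter_leaderboard(
    toy_df, model_type="All", category="Overall", sort_by="Performance"
)
print(table_html[:120])  # HTML table starting with Rank / Model / Model Type ...
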
visualization.py
ADDED
@@ -0,0 +1,221 @@
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import plotly.graph_objects as go


def setup_matplotlib():
    """Set up matplotlib configuration."""
    matplotlib.use("Agg")
    plt.close("all")


def get_performance_chart(df, category_name="Overall"):
    plt.close("all")
    score_column = "Category Score"
    df_sorted = df.sort_values(score_column, ascending=True)
    colors = {"Private": "#4F46E5", "Open source": "#16A34A"}

    height = max(8, len(df_sorted) * 0.8)
    fig, ax = plt.subplots(figsize=(16, height))
    plt.rcParams.update({"font.size": 12})

    try:
        bars = ax.barh(
            np.arange(len(df_sorted)),
            df_sorted[score_column],
            height=0.6,
            color=[colors[t] for t in df_sorted["Model Type"]],
        )

        ax.set_title(
            f"Model Performance Comparison - {category_name}",
            pad=20,
            fontsize=20,
            fontweight="bold",
        )
        ax.set_xlabel("Average Score", fontsize=14, labelpad=10)
        ax.set_xlim(0.0, 1.0)

        ax.set_yticks(np.arange(len(df_sorted)))
        ax.set_yticklabels(df_sorted["Model"], fontsize=12)

        plt.subplots_adjust(left=0.35)

        for i, v in enumerate(df_sorted[score_column]):
            ax.text(
                v + 0.01, i, f"{v:.3f}", va="center", fontsize=12, fontweight="bold"
            )

        ax.grid(True, axis="x", linestyle="--", alpha=0.2)
        ax.spines[["top", "right"]].set_visible(False)

        legend_elements = [
            plt.Rectangle((0, 0), 1, 1, facecolor=color, label=label)
            for label, color in colors.items()
        ]
        ax.legend(
            handles=legend_elements,
            title="Model Type",
            loc="lower right",
            fontsize=12,
            title_fontsize=14,
        )

        plt.tight_layout()
        return fig
    finally:
        plt.close(fig)


def create_radar_plot(df, model_names):
    datasets = [col for col in df.columns[7:] if col != "IO Cost"]
    fig = go.Figure()

    colors = ["rgba(99, 102, 241, 0.3)", "rgba(34, 197, 94, 0.3)"]
    line_colors = ["#4F46E5", "#16A34A"]

    for idx, model_name in enumerate(model_names):
        model_data = df[df["Model"] == model_name].iloc[0]
        values = [model_data[m] for m in datasets]
        values.append(values[0])
        datasets_plot = datasets + [datasets[0]]

        fig.add_trace(
            go.Scatterpolar(
                r=values,
                theta=datasets_plot,
                fill="toself",
                fillcolor=colors[idx % len(colors)],
                line=dict(color=line_colors[idx % len(line_colors)], width=2),
                name=model_name,
                text=[f"{val:.3f}" for val in values],
                textposition="middle right",
                mode="lines+markers+text",
            )
        )

    fig.update_layout(
        polar=dict(
            radialaxis=dict(
                visible=True, range=[0, 1], showline=False, tickfont=dict(size=12)
            ),
            angularaxis=dict(
                tickfont=dict(size=13, family="Arial"),
                rotation=90,
                direction="clockwise",
            ),
        ),
        showlegend=True,
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=-0.2,
            xanchor="center",
            x=0.5,
            font=dict(size=14),
        ),
        title=dict(
            text="Model Comparison",
            x=0.5,
            y=0.95,
            font=dict(size=24, family="Arial", color="#1F2937"),
        ),
        paper_bgcolor="white",
        plot_bgcolor="white",
        height=700,
        width=900,
        margin=dict(t=100, b=100, l=80, r=80),
    )

    return fig


def get_performance_cost_chart(df, category_name="Overall"):
    # Create figure and axis with specified style
    fig, ax = plt.subplots(figsize=(12, 8), dpi=300)

    # Configure plot style
    ax.grid(True, linestyle="--", alpha=0.15, which="both")
    ax.set_facecolor("white")
    fig.patch.set_facecolor("white")

    colors = {"Private": "#4F46E5", "Open source": "#16A34A"}
    performance_colors = ["#DCFCE7", "#FEF9C3", "#FEE2E2"]

    score_column = "Category Score"

    # Plot data points
    for _, row in df.iterrows():
        color = colors[row["Model Type"]]
        size = 100 if row[score_column] > 0.85 else 80
        edge_color = "#3730A3" if row["Model Type"] == "Private" else "#166534"

        # Plot scatter points
        ax.scatter(
            row["IO Cost"],
            row[score_column] * 100,
            c=color,
            s=size,
            alpha=0.9,
            edgecolor=edge_color,
            linewidth=1,
            zorder=5,  # Ensure points are above grid
        )

        # Add annotations with model names
        bbox_props = dict(boxstyle="round,pad=0.3", fc="white", ec="none", alpha=0.8)

        ax.annotate(
            f"{row['Model']}\n(${row['IO Cost']:.2f})",
            (row["IO Cost"], row[score_column] * 100),
            xytext=(5, 5),
            textcoords="offset points",
            fontsize=8,
            bbox=bbox_props,
            zorder=6,
        )

    # Configure axes
    ax.set_xscale("log")
    ax.set_xlim(0.08, 40)  # Adjust based on your data range
    ax.set_ylim(60, 95)

    # Customize axis labels
    ax.set_xlabel("I/O Cost per Million Tokens ($)", fontsize=10, labelpad=10)
    ax.set_ylabel("Model Performance Score", fontsize=10, labelpad=10)

    # Add legend
    legend_elements = [
        plt.scatter([], [], c=color, label=label, s=80)
        for label, color in colors.items()
    ]
    ax.legend(
        handles=legend_elements,
        loc="upper right",
        frameon=True,
        facecolor="white",
        edgecolor="none",
        fontsize=9,
    )

    # Set title
    ax.set_title(
        f"AI Language Model Performance vs. Cost - {category_name}", fontsize=12, pad=15
    )

    # Add performance bands
    for y1, y2, color in zip([85, 75, 60], [95, 85, 75], performance_colors):
        ax.axhspan(y1, y2, alpha=0.2, color=color, zorder=1)

    # Customize tick parameters
    ax.tick_params(axis="both", which="major", labelsize=9)
    ax.tick_params(axis="both", which="minor", labelsize=8)

    # Add minor ticks for log scale
    ax.xaxis.set_minor_locator(plt.LogLocator(base=10.0, subs=np.arange(2, 10) * 0.1))

    # Adjust layout
    plt.tight_layout()

    return fig
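
For completeness, a small self-contained sketch of create_radar_plot; the frame below is fabricated and simply satisfies the function's assumption that dataset scores start at column index 7:

import pandas as pd

from visualization import create_radar_plot

# Columns 0-6 stand in for metadata; columns 7 onward are read as dataset scores.
columns = [
    "Model", "Model Type", "Model Avg",
    "Input cost per million token", "Output cost per million token",
    "single turn perf", "multi turn perf",
    "xlam_single_tool_single_call", "BFCL_v3_irrelevance", "tau_long_context",
]
toy_df = pd.DataFrame(
    [["model-a", "Private", 0.90, 2.00, 6.00, 0.92, 0.88, 0.95, 0.85, 0.80]],
    columns=columns,
)

fig = create_radar_plot(toy_df, ["model-a"])
fig.write_html("radar.html")  # or fig.show() in a notebook
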