Pratik Bhavsar committed
Commit 10ad72f · 1 Parent(s): 19b159e

working draft

Files changed (4)
  1. app.py +79 -267
  2. data_loader.py +78 -0
  3. utils.py +70 -0
  4. visualization.py +221 -0
app.py CHANGED
@@ -1,281 +1,93 @@
 import gradio as gr
-import pandas as pd
-import matplotlib.pyplot as plt
-import numpy as np
-import plotly.graph_objects as go
-
-df = pd.read_csv("results.csv").dropna()
-
-categories = {
-    "Overall": ["Model Avg"],
-    "Overall single turn": ["single turn perf"],
-    "Overall multi turn": ["multi turn perf"],
-    "Single func call": [
-        "xlam_single_tool_single_call",
-        "xlam_multiple_tool_single_call",
-    ],
-    "Multiple func call": [
-        "xlam_multiple_tool_multiple_call",
-        "xlam_single_tool_multiple_call",
-        "BFCL_v3_multi_turn_base_multi_func_call",
-    ],
-    "Irrelevant query": ["BFCL_v3_irrelevance"],
-    "Long context": ["tau_long_context", "BFCL_v3_multi_turn_long_context"],
-    "Missing func": ["xlam_tool_miss", "BFCL_v3_multi_turn_miss_func"],
-    "Missing params": ["BFCL_v3_multi_turn_miss_param"],
-    "Composite": ["BFCL_v3_multi_turn_composite"],
-}
-
-
-def create_radar_plot(df, model_names):
-    datasets = df.columns[7:].tolist()
-    fig = go.Figure()
-
-    colors = ["rgba(99, 102, 241, 0.3)", "rgba(34, 197, 94, 0.3)"]
-    line_colors = ["#4F46E5", "#16A34A"]
-
-    for idx, model_name in enumerate(model_names):
-        model_data = df[df["Model"] == model_name].iloc[0]
-        values = [model_data[m] for m in datasets]
-        values.append(values[0])
-        datasets_plot = datasets + [datasets[0]]
-
-        fig.add_trace(
-            go.Scatterpolar(
-                r=values,
-                theta=datasets_plot,
-                fill="toself",
-                fillcolor=colors[idx % len(colors)],
-                line=dict(color=line_colors[idx % len(line_colors)], width=2),
-                name=model_name,
-                text=[f"{val:.3f}" for val in values],
-                textposition="middle right",
-                mode="lines+markers+text",
-            )
-        )
-
-    fig.update_layout(
-        polar=dict(
-            radialaxis=dict(
-                visible=True, range=[0, 1], showline=False, tickfont=dict(size=12)
-            ),
-            angularaxis=dict(
-                tickfont=dict(size=13, family="Arial"),
-                rotation=90,
-                direction="clockwise",
-            ),
-        ),
-        showlegend=True,
-        title=dict(
-            text="Model Comparison",
-            x=0.5,
-            y=0.95,
-            font=dict(size=24, family="Arial", color="#1F2937"),
-        ),
-        paper_bgcolor="white",
-        plot_bgcolor="white",
-        height=800,
-        width=1000,
-    )
-
-    return fig
-
-
-def model_info_tab(model_names=None):
-    if model_names is None or len(model_names) == 0:
-        model_names = [df.sort_values("Model Avg", ascending=False).iloc[0]["Model"]]
-
-    filtered_df = df[df["Model"].isin(model_names)]
-    radar_chart = create_radar_plot(df, model_names)
-    info_html = filtered_df[
-        [
-            "Model",
-            "Model Type",
-            "Model Avg",
-            "Input cost per million token",
-            "Output cost per million token",
-            "single turn perf",
-            "multi turn perf",
-        ]
-    ].to_html(index=False)
-
-    return info_html, radar_chart
-
-
-def get_performance_chart(df):
-    df_sorted = df.sort_values("Model Avg", ascending=True)
-    colors = {"Private": "#4F46E5", "Open source": "#16A34A"}
-
-    fig, ax = plt.subplots(figsize=(16, 10))
-    bars = ax.barh(
-        np.arange(len(df_sorted)),
-        df_sorted["Model Avg"],
-        height=0.4,
-        color=[colors[t] for t in df_sorted["Model Type"]],
-    )
-
-    ax.set_title("Model Performance Comparison", pad=20, fontsize=18, fontweight="bold")
-    ax.set_xlabel("Average Score", fontsize=12, labelpad=10)
-    ax.set_xlim(0.6, 1.0)
-    ax.set_yticks(np.arange(len(df_sorted)))
-    ax.set_yticklabels(df_sorted["Model"], fontsize=10)
-
-    for i, v in enumerate(df_sorted["Model Avg"]):
-        ax.text(v + 0.005, i, f"{v:.3f}", va="center", fontsize=10)
-
-    ax.grid(True, axis="x", linestyle="--", alpha=0.2)
-    ax.spines[["top", "right"]].set_visible(False)
-
-    legend_elements = [
-        plt.Rectangle((0, 0), 1, 1, facecolor=color, label=label)
-        for label, color in colors.items()
-    ]
-    ax.legend(handles=legend_elements, title="Model Type", loc="lower right")
-
-    plt.tight_layout()
-    return fig
-
-def get_performance_cost_chart(df):
-    plt.figure(figsize=(12, 8), dpi=300)
-    plt.grid(True, linestyle="--", alpha=0.2)
-
-    colors = {"Private": "#4F46E5", "Open source": "#16A34A"}
-    performance_colors = ["#DCFCE7", "#FEF9C3", "#FEE2E2"]
-
-    for _, row in df.iterrows():
-        color = colors[row["Model Type"]]
-        size = 100 if row["Model Avg"] > 0.85 else 80
-        edge_color = "#3730A3" if row["Model Type"] == "Private" else "#166534"
-
-        plt.scatter(
-            row["Input cost per million token"],
-            row["Model Avg"] * 100,
-            c=color,
-            s=size,
-            alpha=0.9,
-            edgecolor=edge_color,
-            linewidth=1,
-        )
-
-        plt.annotate(
-            f"{row['Model']}\n(${row['Input cost per million token']})",
-            (row["Input cost per million token"], row["Model Avg"] * 100),
-            xytext=(7, 7),
-            textcoords="offset points",
-            fontsize=9,
-            bbox=dict(facecolor="white", edgecolor="none", alpha=0.7),
-        )
-
-    plt.xscale("log")
-    plt.xlabel("Cost per Million Tokens ($)", fontsize=12, weight="bold")
-    plt.ylabel("Model Performance Score", fontsize=12, weight="bold")
-    plt.ylim(60, 95)
-
-    legend_elements = [
-        plt.scatter([], [], c=color, label=label, s=80)
-        for label, color in colors.items()
-    ]
-    plt.legend(handles=legend_elements, loc="upper right")
-    plt.title("AI Language Model Performance vs. Cost", fontsize=14, weight="bold")
-
-    for y1, y2, color in zip([85, 75, 60], [95, 85, 75], performance_colors):
-        plt.axhspan(y1, y2, alpha=0.2, color=color)
-
-    plt.tight_layout()
-    return plt.gcf()
-
-
-def filter_leaderboard(model_type, category):
-    filtered_df = df.copy()
-    if model_type != "All":
-        filtered_df = filtered_df[filtered_df["Model Type"].str.strip() == model_type]
-
-    dataset_columns = categories.get(category, ["Model Avg"])
-    avg_score = filtered_df[dataset_columns].mean(axis=1)
-    filtered_df["Category Score"] = avg_score
-
-    filtered_df = filtered_df.sort_values(by="Category Score", ascending=False)
-    filtered_df["Rank"] = range(1, len(filtered_df) + 1)
-
-    perf_chart = get_performance_chart(filtered_df)
-    cost_chart = get_performance_cost_chart(filtered_df)
-
-    display_columns = [
-        "Rank",
-        "Model",
-        "Model Type",
-        "Input cost per million token",
-        "Output cost per million token",
-        "Category Score",
-    ]
-
-    table_html = filtered_df[display_columns].to_html(index=False)
-    return table_html, perf_chart, cost_chart
-
-
-with gr.Blocks(theme=gr.themes.Soft()) as app:
-    with gr.Tabs():
-        with gr.Tab("Leaderboard"):
-            with gr.Row():
-                with gr.Column(scale=1):
-                    gr.Markdown("# Filters")
-                    model_type = gr.Dropdown(
-                        choices=["All"] + df["Model Type"].unique().tolist(),
-                        value="All",
-                        label="Model Type",
-                    )
-                    category = gr.Dropdown(
-                        choices=list(categories.keys()),
-                        value=list(categories.keys())[0],
-                        label="Category",
-                    )
-
-                with gr.Column(scale=4):
-                    gr.Markdown("# Agent Leaderboard")
-                    output = gr.HTML()
-                    plot1 = gr.Plot()
-                    plot2 = gr.Plot()
-
-            for input_comp in [model_type, category]:
-                input_comp.change(
-                    fn=filter_leaderboard,
-                    inputs=[model_type, category],
-                    outputs=[output, plot1, plot2],
-                )
-
-        with gr.Tab("Model Performance"):
-            with gr.Row():
-                with gr.Column(scale=1):
-                    model_selector = gr.Dropdown(
-                        choices=df["Model"].unique().tolist(),
-                        value=df.sort_values("Model Avg", ascending=False).iloc[0][
-                            "Model"
-                        ],
-                        multiselect=True,
-                        label="Models",
-                    )
-                with gr.Column(scale=4):
-                    model_info = gr.HTML()
-                    radar_plot = gr.Plot()
-
-            model_selector.change(
-                fn=model_info_tab,
-                inputs=[model_selector],
-                outputs=[model_info, radar_plot],
-            )
-
-    app.load(
-        fn=lambda: filter_leaderboard("All", list(categories.keys())[0]),
-        outputs=[output, plot1, plot2],
-    )
-
-    app.load(
-        fn=lambda: model_info_tab(
-            [df.sort_values("Model Avg", ascending=False).iloc[0]["Model"]]
-        ),
-        outputs=[model_info, radar_plot],
-    )
-
+from data_loader import load_data, CATEGORIES, INSIGHTS, METHODOLOGY
+from utils import model_info_tab, filter_leaderboard
+from visualization import setup_matplotlib
+
+
+def create_app():
+    setup_matplotlib()
+    df = load_data()
+
+    with gr.Blocks(theme=gr.themes.Soft()) as app:
+        with gr.Tabs():
+            with gr.Tab("Leaderboard"):
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        gr.Markdown("# Filters")
+                        model_type = gr.Dropdown(
+                            choices=["All"] + df["Model Type"].unique().tolist(),
+                            value="All",
+                            label="Model Type",
+                        )
+                        category = gr.Dropdown(
+                            choices=list(CATEGORIES.keys()),
+                            value=list(CATEGORIES.keys())[0],
+                            label="Category",
+                        )
+                        sort_by = gr.Radio(
+                            choices=["Performance", "Cost"],
+                            value="Performance",
+                            label="Sort by",
+                        )
+
+                    with gr.Column(scale=4):
+                        gr.Markdown("# Agent Leaderboard")
+                        output = gr.HTML()
+                        plot1 = gr.Plot()
+                        plot2 = gr.Plot()
+
+                for input_comp in [model_type, category, sort_by]:
+                    input_comp.change(
+                        fn=lambda m, c, s: filter_leaderboard(df, m, c, s),
+                        inputs=[model_type, category, sort_by],
+                        outputs=[output, plot1, plot2],
+                    )
+
+            with gr.Tab("Model Performance"):
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        model_selector = gr.Dropdown(
+                            choices=df["Model"].unique().tolist(),
+                            value=df.sort_values("Model Avg", ascending=False).iloc[0][
+                                "Model"
+                            ],
+                            multiselect=True,
+                            label="Models",
+                        )
+                    with gr.Column(scale=4):
+                        model_info = gr.HTML()
+                        radar_plot = gr.Plot()
+
+                model_selector.change(
+                    fn=lambda m: model_info_tab(df, m),
+                    inputs=[model_selector],
+                    outputs=[model_info, radar_plot],
+                )
+
+            with gr.Tab("Methodology"):
+                gr.Markdown(METHODOLOGY)
+
+            with gr.Tab("Insights"):
+                gr.Markdown(INSIGHTS)
+
+        app.load(
+            fn=lambda: filter_leaderboard(
+                df, "All", list(CATEGORIES.keys())[0], "Performance"
+            ),
+            outputs=[output, plot1, plot2],
+        )
+
+        app.load(
+            fn=lambda: model_info_tab(
+                df, [df.sort_values("Model Avg", ascending=False).iloc[0]["Model"]]
+            ),
+            outputs=[model_info, radar_plot],
+        )
+
+    return app
+
+
+# main.py
 if __name__ == "__main__":
-    demo = app
+    demo = create_app()
     demo.launch()
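
To try the refactored app locally, a minimal sketch (not part of this commit; it assumes gradio, pandas, matplotlib and plotly are installed and that results.csv sits next to app.py):

# run_local.py -- hypothetical helper; `python app.py` does the same thing
from app import create_app

demo = create_app()  # builds the Blocks UI from results.csv
demo.launch()        # same entry point app.py uses under __main__
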
data_loader.py ADDED
@@ -0,0 +1,78 @@
import pandas as pd


def load_data():
    """Load and preprocess the data."""
    df = pd.read_csv("results.csv").dropna()

    # Add combined I/O cost column with 3:1 ratio
    df["IO Cost"] = (
        df["Input cost per million token"] * 0.75
        + df["Output cost per million token"] * 0.25
    )
    return df


# categories.py
CATEGORIES = {
    "Overall": ["Model Avg"],
    "Overall single turn": ["single turn perf"],
    "Overall multi turn": ["multi turn perf"],
    "Single func call": [
        "xlam_single_tool_single_call",
        "xlam_multiple_tool_single_call",
    ],
    "Multiple func call": [
        "xlam_multiple_tool_multiple_call",
        "xlam_single_tool_multiple_call",
        "BFCL_v3_multi_turn_base_multi_func_call",
    ],
    "Irrelevant query": ["BFCL_v3_irrelevance"],
    "Long context": ["tau_long_context", "BFCL_v3_multi_turn_long_context"],
    "Missing func": ["xlam_tool_miss", "BFCL_v3_multi_turn_miss_func"],
    "Missing params": ["BFCL_v3_multi_turn_miss_param"],
    "Composite": ["BFCL_v3_multi_turn_composite"],
}

INSIGHTS = """
# Key Insights from Agent Leaderboard

| Category | Finding | Implications |
|----------|---------|--------------|
| Performance Leader | Gemini-2.0-flash dominates with excellent performance at a fraction of typical costs | Demonstrates that top-tier performance is achievable without premium pricing |
| Cost vs Performance | Top 3 models span a 200x price difference yet show only 6% performance gap | Challenges traditional pricing assumptions in the market and suggests potential overpricing at the high end |
| Open Source Models | Qwen-72b matches premium models in safety and context handling at lower cost | Signals growing maturity in open-source models and potential for broader adoption |
| Safety Features | While irrelevance detection is widely solved, tool miss detection remains a challenge | Highlights uneven development in safety features and areas needing focused improvement |
| Edge Case Handling | Models still struggle with maintaining context in complex scenarios | Indicates need for architectural improvements in handling sophisticated interactions |
| Architecture Impact | Models show clear trade-offs between context handling and parallel execution | Suggests need for specialized models or hybrid approaches for different use cases |

**Note:** Findings based on comprehensive evaluation across multiple tasks and scenarios.
"""

METHODOLOGY = """
# Methodology

## Overview
The Agent Leaderboard evaluates language models' ability to effectively use tools and maintain coherent multi-turn conversations.
The evaluation focuses on both basic functionality and edge cases that challenge real-world applicability.

## Tool Selection Quality Metric
Models are evaluated on their ability to:
- Correctly identify when tools are needed
- Select the appropriate tool for the task
- Handle cases where no suitable tool exists
- Maintain context across multiple interactions

## Dataset Structure
| Type | Samples | Category | Dataset Name | Purpose |
|------|---------|-----------|--------------|----------|
| Single-Turn | 100 + 100 | Single Function Call | xlam_single_tool_single_call | Evaluates basic ability to read documentation and make single function calls |
| | 200 + 50 | Multiple Function Call | xlam_multiple_tool_multiple_call, xlam_single_tool_multiple_call | Tests parallel execution and result aggregation capabilities |
| | 100 | Irrelevant Query | BFCL_v3_irrelevance | Tests ability to recognize when available tools don't match user needs |
| | 100 | Long Context | tau_long_context | Assesses handling of extended interactions and complex instructions |
| Multi-Turn | 50 + 30 | Single Function Call | BFCL_v3_multi_turn_base_single_func_call, toolscs_single_func_call | Tests basic conversational function calling abilities |
| | 50 | Multiple Function Call | BFCL_v3_multi_turn_base_multi_func_call | Evaluates handling of multiple function calls in conversation |
| | 100 | Missing Function | BFCL_v3_multi_turn_miss_func | Tests graceful handling of unavailable tools |
| | 100 | Missing Parameters | BFCL_v3_multi_turn_miss_param | Assesses parameter collection and handling incomplete information |
| | 100 | Composite | BFCL_v3_multi_turn_composite | Tests overall robustness in complex scenarios |
"""
utils.py ADDED
@@ -0,0 +1,70 @@
from data_loader import CATEGORIES
from visualization import (
    create_radar_plot,
    get_performance_chart,
    get_performance_cost_chart,
)


def model_info_tab(df, model_names=None):
    if model_names is None or len(model_names) == 0:
        model_names = [df.sort_values("Model Avg", ascending=False).iloc[0]["Model"]]

    filtered_df = df[df["Model"].isin(model_names)]
    radar_chart = create_radar_plot(df, model_names)
    info_html = filtered_df[
        [
            "Model",
            "Model Type",
            "Model Avg",
            "IO Cost",
            "single turn perf",
            "multi turn perf",
        ]
    ].to_html(index=False)

    return info_html, radar_chart


def filter_leaderboard(df, model_type, category, sort_by):
    filtered_df = df.copy()
    if model_type != "All":
        filtered_df = filtered_df[filtered_df["Model Type"].str.strip() == model_type]

    dataset_columns = CATEGORIES.get(category, ["Model Avg"])
    avg_score = filtered_df[dataset_columns].mean(axis=1)
    filtered_df["Category Score"] = avg_score

    if sort_by == "Performance":
        filtered_df = filtered_df.sort_values(by="Category Score", ascending=False)
    else:
        filtered_df = filtered_df.sort_values(by="IO Cost", ascending=True)

    filtered_df["Rank"] = range(1, len(filtered_df) + 1)

    perf_chart = get_performance_chart(filtered_df, category)
    cost_chart = get_performance_cost_chart(filtered_df, category)

    filtered_df["Cost (Input/Output)"] = filtered_df.apply(
        lambda x: f"${x['Input cost per million token']:.2f}/${x['Output cost per million token']:.2f}",
        axis=1,
    )

    display_columns = [
        "Rank",
        "Model",
        "Model Type",
        "Cost (Input/Output)",
        "Category Score",
    ]

    table_html = filtered_df[display_columns].to_html(index=False, escape=False)
    note_html = """
    <div style='margin-top: 20px; padding: 10px; background-color: #f3f4f6; border-radius: 4px;'>
        <p style='margin: 0; font-size: 0.9em; color: #4b5563;'>
            Note: Cost for sorting is calculated using 3:1 ratio on I/O. Cost of Gemini 2.0 is assumed to be same as that of Gemini 1.5.
        </p>
    </div>
    """
    table_html += note_html
    return table_html, perf_chart, cost_chart
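
Because these helpers take the DataFrame explicitly, they can be exercised outside Gradio. A minimal sketch, assuming results.csv is available so load_data() succeeds and the column names match data_loader.py:

from data_loader import load_data
from utils import filter_leaderboard

df = load_data()
table_html, perf_chart, cost_chart = filter_leaderboard(
    df, model_type="All", category="Overall", sort_by="Performance"
)
perf_chart.savefig("leaderboard.png")  # both charts are matplotlib Figures
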
visualization.py ADDED
@@ -0,0 +1,221 @@
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import plotly.graph_objects as go


def setup_matplotlib():
    """Set up matplotlib configuration."""
    matplotlib.use("Agg")
    plt.close("all")


def get_performance_chart(df, category_name="Overall"):
    plt.close("all")
    score_column = "Category Score"
    df_sorted = df.sort_values(score_column, ascending=True)
    colors = {"Private": "#4F46E5", "Open source": "#16A34A"}

    height = max(8, len(df_sorted) * 0.8)
    fig, ax = plt.subplots(figsize=(16, height))
    plt.rcParams.update({"font.size": 12})

    try:
        bars = ax.barh(
            np.arange(len(df_sorted)),
            df_sorted[score_column],
            height=0.6,
            color=[colors[t] for t in df_sorted["Model Type"]],
        )

        ax.set_title(
            f"Model Performance Comparison - {category_name}",
            pad=20,
            fontsize=20,
            fontweight="bold",
        )
        ax.set_xlabel("Average Score", fontsize=14, labelpad=10)
        ax.set_xlim(0.0, 1.0)

        ax.set_yticks(np.arange(len(df_sorted)))
        ax.set_yticklabels(df_sorted["Model"], fontsize=12)

        plt.subplots_adjust(left=0.35)

        for i, v in enumerate(df_sorted[score_column]):
            ax.text(
                v + 0.01, i, f"{v:.3f}", va="center", fontsize=12, fontweight="bold"
            )

        ax.grid(True, axis="x", linestyle="--", alpha=0.2)
        ax.spines[["top", "right"]].set_visible(False)

        legend_elements = [
            plt.Rectangle((0, 0), 1, 1, facecolor=color, label=label)
            for label, color in colors.items()
        ]
        ax.legend(
            handles=legend_elements,
            title="Model Type",
            loc="lower right",
            fontsize=12,
            title_fontsize=14,
        )

        plt.tight_layout()
        return fig
    finally:
        plt.close(fig)


def create_radar_plot(df, model_names):
    datasets = [col for col in df.columns[7:] if col != "IO Cost"]
    fig = go.Figure()

    colors = ["rgba(99, 102, 241, 0.3)", "rgba(34, 197, 94, 0.3)"]
    line_colors = ["#4F46E5", "#16A34A"]

    for idx, model_name in enumerate(model_names):
        model_data = df[df["Model"] == model_name].iloc[0]
        values = [model_data[m] for m in datasets]
        values.append(values[0])
        datasets_plot = datasets + [datasets[0]]

        fig.add_trace(
            go.Scatterpolar(
                r=values,
                theta=datasets_plot,
                fill="toself",
                fillcolor=colors[idx % len(colors)],
                line=dict(color=line_colors[idx % len(line_colors)], width=2),
                name=model_name,
                text=[f"{val:.3f}" for val in values],
                textposition="middle right",
                mode="lines+markers+text",
            )
        )

    fig.update_layout(
        polar=dict(
            radialaxis=dict(
                visible=True, range=[0, 1], showline=False, tickfont=dict(size=12)
            ),
            angularaxis=dict(
                tickfont=dict(size=13, family="Arial"),
                rotation=90,
                direction="clockwise",
            ),
        ),
        showlegend=True,
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=-0.2,
            xanchor="center",
            x=0.5,
            font=dict(size=14),
        ),
        title=dict(
            text="Model Comparison",
            x=0.5,
            y=0.95,
            font=dict(size=24, family="Arial", color="#1F2937"),
        ),
        paper_bgcolor="white",
        plot_bgcolor="white",
        height=700,
        width=900,
        margin=dict(t=100, b=100, l=80, r=80),
    )

    return fig


def get_performance_cost_chart(df, category_name="Overall"):
    # Create figure and axis with specified style
    fig, ax = plt.subplots(figsize=(12, 8), dpi=300)

    # Configure plot style
    ax.grid(True, linestyle="--", alpha=0.15, which="both")
    ax.set_facecolor("white")
    fig.patch.set_facecolor("white")

    colors = {"Private": "#4F46E5", "Open source": "#16A34A"}
    performance_colors = ["#DCFCE7", "#FEF9C3", "#FEE2E2"]

    score_column = "Category Score"

    # Plot data points
    for _, row in df.iterrows():
        color = colors[row["Model Type"]]
        size = 100 if row[score_column] > 0.85 else 80
        edge_color = "#3730A3" if row["Model Type"] == "Private" else "#166534"

        # Plot scatter points
        ax.scatter(
            row["IO Cost"],
            row[score_column] * 100,
            c=color,
            s=size,
            alpha=0.9,
            edgecolor=edge_color,
            linewidth=1,
            zorder=5,  # Ensure points are above grid
        )

        # Add annotations with model names
        bbox_props = dict(boxstyle="round,pad=0.3", fc="white", ec="none", alpha=0.8)

        ax.annotate(
            f"{row['Model']}\n(${row['IO Cost']:.2f})",
            (row["IO Cost"], row[score_column] * 100),
            xytext=(5, 5),
            textcoords="offset points",
            fontsize=8,
            bbox=bbox_props,
            zorder=6,
        )

    # Configure axes
    ax.set_xscale("log")
    ax.set_xlim(0.08, 40)  # Adjust based on your data range
    ax.set_ylim(60, 95)

    # Customize axis labels
    ax.set_xlabel("I/O Cost per Million Tokens ($)", fontsize=10, labelpad=10)
    ax.set_ylabel("Model Performance Score", fontsize=10, labelpad=10)

    # Add legend
    legend_elements = [
        plt.scatter([], [], c=color, label=label, s=80)
        for label, color in colors.items()
    ]
    ax.legend(
        handles=legend_elements,
        loc="upper right",
        frameon=True,
        facecolor="white",
        edgecolor="none",
        fontsize=9,
    )

    # Set title
    ax.set_title(
        f"AI Language Model Performance vs. Cost - {category_name}", fontsize=12, pad=15
    )

    # Add performance bands
    for y1, y2, color in zip([85, 75, 60], [95, 85, 75], performance_colors):
        ax.axhspan(y1, y2, alpha=0.2, color=color, zorder=1)

    # Customize tick parameters
    ax.tick_params(axis="both", which="major", labelsize=9)
    ax.tick_params(axis="both", which="minor", labelsize=8)

    # Add minor ticks for log scale
    ax.xaxis.set_minor_locator(plt.LogLocator(base=10.0, subs=np.arange(2, 10) * 0.1))

    # Adjust layout
    plt.tight_layout()

    return fig
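
The plotting helpers can likewise be used standalone, for example to export the radar chart. A sketch under the same assumptions (results.csv present; the two model names are simply whatever the first two rows contain):

from data_loader import load_data
from visualization import create_radar_plot, setup_matplotlib

setup_matplotlib()  # selects the non-interactive Agg backend
df = load_data()
fig = create_radar_plot(df, df["Model"].head(2).tolist())
fig.write_html("radar.html")  # Plotly figure; open in a browser to view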