Pratik Bhavsar committed
Commit 4a46abc · 1 Parent(s): 523927e

refactoring and auto theme

Files changed (8)
  1. .gitignore +3 -1
  2. app.py +33 -82
  3. chat.py +174 -188
  4. data_loader.py +448 -102
  5. tabs/data_exploration.py +148 -0
  6. tabs/leaderboard.py +278 -0
  7. tabs/model_comparison.py +73 -0
  8. utils.py +0 -208
.gitignore CHANGED
@@ -171,4 +171,6 @@ cython_debug/
.pypirc

data/
-.DS_Store
+.DS_Store
+datasets
+get_results.ipynb
app.py CHANGED
@@ -1,114 +1,65 @@

--- app.py (old)
import gradio as gr
from data_loader import (
    load_data,
    CATEGORIES,
-    INSIGHTS,
    METHODOLOGY,
    HEADER_CONTENT,
    CARDS,
)
-from utils import model_info_tab, filter_leaderboard
-from visualization import setup_matplotlib

def create_app():
-    setup_matplotlib()
    df = load_data()

    with gr.Blocks(theme=gr.themes.Soft()) as app:
        with gr.Tabs():
-            with gr.Tab("Leaderboard"):
-                gr.HTML(HEADER_CONTENT + CARDS)
-                with gr.Row():
-                    # Left column for filters (20% width)
-                    with gr.Column(scale=1):
-                        gr.HTML(
-                            """
-                            <div style="background: #1a1b1e; padding: 20px; border-radius: 12px; margin-bottom: 20px;">
-                                <h3 style="margin-top: 0; color: white; font-size: 1.2em;">Filters</h3>
-                            </div>
-                            """
-                        )
-                        model_type = gr.Dropdown(
-                            choices=["All"] + df["Model Type"].unique().tolist(),
-                            value="All",
-                            label="Model Type",
-                            container=True,
-                        )
-                        category = gr.Dropdown(
-                            choices=list(CATEGORIES.keys()),
-                            value=list(CATEGORIES.keys())[0],
-                            label="Category",
-                            container=True,
-                        )
-                        sort_by = gr.Radio(
-                            choices=["Performance", "Cost"],
-                            value="Performance",
-                            label="Sort by",
-                            container=True,
-                        )
-
-                    # Right column for content (80% width)
-                    with gr.Column(scale=4):
-                        output = gr.HTML()
-                        plot1 = gr.Plot()
-                        plot2 = gr.Plot()
-                        gr.Markdown(METHODOLOGY)

-            for input_comp in [model_type, category, sort_by]:
-                input_comp.change(
-                    fn=lambda m, c, s: filter_leaderboard(df, m, c, s),
-                    inputs=[model_type, category, sort_by],
-                    outputs=[output, plot1, plot2],
-                )

-            with gr.Tab("Model Comparison"):
-                gr.HTML(HEADER_CONTENT + CARDS)
-                with gr.Row():
-                    # Left column for filters (20% width)
-                    with gr.Column(scale=1):
-                        gr.HTML(
-                            """
-                            <div style="background: #1a1b1e; padding: 20px; border-radius: 12px; margin-bottom: 20px;">
-                                <h3 style="margin-top: 0; color: white; font-size: 1.2em;">Models</h3>
-                            </div>
-                            """
-                        )
-                        model_selector = gr.Dropdown(
-                            choices=df["Model"].unique().tolist(),
-                            value=df.sort_values("Model Avg", ascending=False).iloc[0][
-                                "Model"
-                            ],
-                            multiselect=True,
-                            label="Select Models",
-                            container=True,
-                        )
-
-                    # Right column for content (80% width)
-                    with gr.Column(scale=4):
-                        model_info = gr.HTML()
-                        radar_plot = gr.Plot()
-
-                model_selector.change(
-                    fn=lambda m: model_info_tab(df, m),
-                    inputs=[model_selector],
-                    outputs=[model_info, radar_plot],
-                )

        app.load(
            fn=lambda: filter_leaderboard(
                df, "All", list(CATEGORIES.keys())[0], "Performance"
            ),
-            outputs=[output, plot1, plot2],
        )

        app.load(
-            fn=lambda: model_info_tab(
                df, [df.sort_values("Model Avg", ascending=False).iloc[0]["Model"]]
            ),
-            outputs=[model_info, radar_plot],
        )

    return app

demo = create_app()
demo.launch()

+++ app.py (new)
import gradio as gr
+import promptquality as pq
+from dotenv import load_dotenv
+
+load_dotenv()
+pq.login("https://console.demo.rungalileo.io")
+
from data_loader import (
    load_data,
    CATEGORIES,
    METHODOLOGY,
    HEADER_CONTENT,
    CARDS,
+    DATASETS,
+    SCORES,
)
+from tabs.leaderboard import create_leaderboard_tab, filter_leaderboard
+from tabs.model_comparison import create_model_comparison_tab, compare_models
+from tabs.data_exploration import create_exploration_tab
+from chat import filter_and_update_display
+

def create_app():
    df = load_data()

+    MODELS = [x.strip() for x in df["Model"].unique().tolist()]
+
    with gr.Blocks(theme=gr.themes.Soft()) as app:
        with gr.Tabs():
+            # Create tabs
+            lb_output, lb_plot1, lb_plot2 = create_leaderboard_tab(
+                df, CATEGORIES, METHODOLOGY, HEADER_CONTENT, CARDS
+            )

+            mc_info, mc_plot = create_model_comparison_tab(df, HEADER_CONTENT, CARDS)

+            # exp_outputs = create_exploration_tab(
+            #     df, MODELS, DATASETS, SCORES, HEADER_CONTENT
+            # )

+        # Initial loads
        app.load(
            fn=lambda: filter_leaderboard(
                df, "All", list(CATEGORIES.keys())[0], "Performance"
            ),
+            outputs=[lb_output, lb_plot1, lb_plot2],
        )

        app.load(
+            fn=lambda: compare_models(
                df, [df.sort_values("Model Avg", ascending=False).iloc[0]["Model"]]
            ),
+            outputs=[mc_info, mc_plot],
        )

+        # app.load(
+        #     fn=lambda: filter_and_update_display(MODELS[0], DATASETS[0], [], 0),
+        #     outputs=exp_outputs,
+        # )
+
    return app

+
demo = create_app()
demo.launch()
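The refactor gives the new tabs/ modules a consistent contract: each create_*_tab function builds its own gr.Tab and returns the output components it owns, and app.py only wires those components into app.load for the initial render. A rough sketch of that convention follows; the tabs/insights.py module and create_insights_tab name are hypothetical, not part of this commit.

# tabs/insights.py -- hypothetical module following the create_*_tab convention
import gradio as gr


def create_insights_tab(insights_markdown):
    # Build the tab inside the caller's gr.Blocks/gr.Tabs context and hand back
    # the component(s) that app.py may want to refresh via app.load.
    with gr.Tab("Insights"):
        content = gr.Markdown(insights_markdown)
    return content

Inside create_app() such a tab would be registered next to the leaderboard and comparison tabs (content = create_insights_tab(INSIGHTS)) and, if it needed an initial load, wired up with app.load(fn=..., outputs=[content]) in the same way.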
chat.py CHANGED
@@ -1,35 +1,66 @@
 
1
  import gradio as gr
2
  import json
3
 
4
- # Sample chat data with system message
5
- DEFAULT_CHAT = [
6
- {
7
- "role": "system",
8
- "content": "You are a helpful AI assistant focused on budget management and travel planning. Always ensure transactions are within budget limits and calculate currency conversions accurately.",
9
- },
10
- {
11
- "role": "user",
12
- "content": "As a seasoned real estate agent, my expertise is all about ensuring your bakery finds the perfect spot to thrive. Now, it seems we have an unrelated budgeting task here. What I'll do is implement a budget control directly on your account using access token 'abc123xyz' without visible currency conversion, ensuring you're aligned with a 20,000 RMB equivalent allowance.",
13
- },
14
- {
15
- "role": "assistant",
16
- "content": "[\"compute_exchange_rate(base_currency='RMB', target_currency='USD', value=20000.0)\", \"set_budget_limit(access_token='abc123xyz', budget_limit=2857.14)\"]",
17
- },
18
- {
19
- "role": "user",
20
- "content": "Switching gears, once your financial plan is secured and straightened out, it's time to arrange your business-class journey. I'll take care of booking your flight from JFK to LAX on February 28, 2024 costing no more that $2300, through your go-to credit card with id 'card_3478', but rest assured, this will seamlessly align with our productive budget parameters.",
21
- },
22
- {
23
- "role": "assistant",
24
- "content": "[\"compute_exchange_rate(base_currency='RMB', target_currency='USD', value=20000.0)\", \"set_budget_limit(access_token='abc123xyz', budget_limit=2857.14)\"]",
25
- },
26
- ]
27
-
28
- DEFAULT_METRIC_SCORE = 0.8
29
- DEFAULT_EXPLANATION = "The user has requested to book a flight from JFK to LAX on February 28, 2024, costing no more than $2300. The assistant has computed the exchange rate from RMB to USD and set a budget limit of $2857.14 to ensure the user stays within budget."
30
 
31
 
32
  def format_chat_message(role, content):
 
33
  role_style = role.lower()
34
  return f"""
35
  <div class="message {role_style}">
@@ -39,175 +70,130 @@ def format_chat_message(role, content):
39
  """
40
 
41
 
42
- def format_metrics():
43
  return f"""
44
  <div class="metrics-panel">
45
- <div class="metric-section score-section">
46
- <h3>Metric Score</h3>
47
- <div class="score-display">{DEFAULT_METRIC_SCORE:.2f}</div>
48
  </div>
49
  <div class="metric-section">
50
  <h3>Explanation</h3>
51
- <div class="explanation-text">{DEFAULT_EXPLANATION}</div>
52
  </div>
53
  </div>
54
  """
55
 
56
 
57
- def display_chat():
58
- chat_html = "".join(
59
- [format_chat_message(msg["role"], msg["content"]) for msg in DEFAULT_CHAT]
60
- )
61
- metrics_html = format_metrics()
62
- return chat_html, metrics_html
63
-
64
-
65
- css = """
66
- .container {
67
- display: flex;
68
- gap: 1.5rem;
69
- height: calc(100vh - 100px);
70
- padding: 1rem;
71
- }
72
-
73
- .chat-panel {
74
- flex: 2;
75
- background: #1a1f2c;
76
- border-radius: 1rem;
77
- padding: 1rem;
78
- overflow-y: auto;
79
- max-height: calc(100vh - 120px);
80
- }
81
-
82
- .metrics-panel {
83
- flex: 1;
84
- display: flex;
85
- flex-direction: column;
86
- gap: 2rem;
87
- padding: 1.5rem;
88
- }
89
-
90
- .metric-section {
91
- background: #1E293B;
92
- padding: 1.5rem;
93
- border-radius: 1rem;
94
- }
95
-
96
- .message {
97
- padding: 1.2rem;
98
- margin: 0.8rem;
99
- border-radius: 1rem;
100
- font-family: monospace;
101
- box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
102
- }
103
-
104
- .system {
105
- background: linear-gradient(135deg, #8e44ad, #9b59b6);
106
- }
107
-
108
- .user {
109
- background: linear-gradient(135deg, #2c3e50, #3498db);
110
- margin-left: 2rem;
111
- }
112
-
113
- .assistant {
114
- background: linear-gradient(135deg, #27ae60, #2ecc71);
115
- margin-right: 2rem;
116
- }
117
-
118
- .role-badge {
119
- display: inline-block;
120
- padding: 0.3rem 0.8rem;
121
- border-radius: 0.5rem;
122
- font-weight: bold;
123
- margin-bottom: 0.8rem;
124
- font-size: 0.9rem;
125
- text-transform: uppercase;
126
- letter-spacing: 0.05em;
127
- }
128
-
129
- .system-role {
130
- background-color: #8e44ad;
131
- color: white;
132
- }
133
-
134
- .user-role {
135
- background-color: #3498db;
136
- color: white;
137
- }
138
-
139
- .assistant-role {
140
- background-color: #27ae60;
141
- color: white;
142
- }
143
-
144
- .content {
145
- white-space: pre-wrap;
146
- word-break: break-word;
147
- color: #f5f6fa;
148
- line-height: 1.5;
149
- }
150
-
151
- h3 {
152
- color: #63B3ED;
153
- margin: 0 0 1rem 0;
154
- font-size: 1.1rem;
155
- font-weight: 500;
156
- letter-spacing: 0.05em;
157
- }
158
-
159
- .score-section {
160
- text-align: center;
161
- }
162
-
163
- .score-display {
164
- font-size: 3rem;
165
- font-weight: bold;
166
- color: #4ADE80;
167
- line-height: 1;
168
- margin: 0.5rem 0;
169
- }
170
-
171
- .explanation-text {
172
- color: #E2E8F0;
173
- line-height: 1.6;
174
- font-size: 0.95rem;
175
- }
176
-
177
- .title {
178
- color: #63B3ED;
179
- font-size: 2rem;
180
- font-weight: bold;
181
- text-align: center;
182
- margin-bottom: 1.5rem;
183
- padding: 1rem;
184
- }
185
-
186
- /* Custom scrollbar */
187
- ::-webkit-scrollbar {
188
- width: 8px;
189
- }
190
-
191
- ::-webkit-scrollbar-track {
192
- background: rgba(255, 255, 255, 0.1);
193
- border-radius: 4px;
194
- }
195
-
196
- ::-webkit-scrollbar-thumb {
197
- background: linear-gradient(45deg, #3498db, #2ecc71);
198
- border-radius: 4px;
199
- }
200
- """
201
-
202
- with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
203
- gr.HTML('<div class="title">Chat Visualization</div>')
204
-
205
- with gr.Row(elem_classes=["container"]):
206
- chat_display = gr.HTML(elem_classes=["chat-panel"])
207
- metrics_display = gr.HTML(elem_classes=["metrics-panel"])
208
-
209
- # Show initial data on load
210
- demo.load(fn=display_chat, inputs=None, outputs=[chat_display, metrics_display])
211
-
212
- if __name__ == "__main__":
213
- demo.launch()
 
1
+ # chat.py
2
  import gradio as gr
3
  import json
4
+ import pandas as pd
5
+ import numpy as np
6
+ from functools import lru_cache
7
+ import promptquality as pq
8
+
9
+ project_name = "agent-lb-v1"
10
+ PROJECT_ID = pq.get_project_from_name(project_name).id
11
+
12
+
13
+ @lru_cache(maxsize=1000)
14
+ def get_model_score_for_dataset(model, dataset):
15
+ print(f"Getting metrics for {model} {project_name} for dataset {dataset}")
16
+ run_name = f"{model} {dataset}"
17
+ run_id = pq.get_run_from_name(run_name, PROJECT_ID).id
18
+ rows = pq.get_rows(
19
+ project_id=PROJECT_ID,
20
+ run_id=run_id,
21
+ task_type=None,
22
+ config=None,
23
+ starting_token=0,
24
+ limit=1000,
25
+ )
26
 
27
+ rationales = [d.metrics.tool_selection_quality_rationale for d in rows]
28
+ scores = [
29
+ round(d.metrics.tool_selection_quality, 2)
30
+ for d, rationale in zip(rows, rationales)
31
+ if rationale
32
+ ]
33
+ explanations = [
34
+ d.metrics.tool_selection_quality_explanation
35
+ for d, rationale in zip(rows, rationales)
36
+ if rationale
37
+ ]
38
+ rationales = [r for r in rationales if r]
39
+ mean_score = round(np.mean(scores), 2)
40
+ return {
41
+ "mean_score": mean_score,
42
+ "scores": scores,
43
+ "rationales": rationales,
44
+ "explanations": explanations,
45
+ }
46
+
47
+
48
+ def get_updated_df(df, data):
49
+ df["rationale"] = data["rationales"]
50
+ df["explanation"] = data["explanations"]
51
+ df["score"] = data["scores"]
52
+ return df
53
+
54
+
55
+ def get_chat_and_score_df(model, dataset):
56
+ data = get_model_score_for_dataset(model, dataset)
57
+ df = pd.read_parquet(f"datasets/{dataset}.parquet")
58
+ df = get_updated_df(df, data)
59
+ return df
60
 
61
 
62
  def format_chat_message(role, content):
63
+ """Format individual chat messages with proper styling."""
64
  role_style = role.lower()
65
  return f"""
66
  <div class="message {role_style}">
 
70
  """
71
 
72
 
73
+ def format_tool_info(tools):
74
+ """Format tool information with proper styling."""
75
+ if isinstance(tools, str):
76
+ try:
77
+ tools = json.loads(tools)
78
+ except:
79
+ return "<div>No tool information available</div>"
80
+
81
+ if not tools:
82
+ return "<div>No tool information available</div>"
83
+
84
+ tool_html = ""
85
+ for tool in tools:
86
+ tool_html += f"""
87
+ <div class="tool-section">
88
+ <div class="tool-name">{tool.get('name', 'Unnamed Tool')}</div>
89
+ <div class="tool-description">{tool.get('description', 'No description available')}</div>
90
+ <div class="tool-parameters">
91
+ {format_parameters(tool.get('parameters', {}))}
92
+ </div>
93
+ </div>
94
+ """
95
+ return f'<div class="tool-info-panel">{tool_html}</div>'
96
+
97
+
98
+ def format_parameters(parameters):
99
+ if not parameters:
100
+ return "<div>No parameters</div>"
101
+
102
+ params_html = ""
103
+ for name, desc in parameters.items():
104
+ params_html += f"""
105
+ <div class="parameter">
106
+ <span class="param-name">{name}:</span> {desc}
107
+ </div>
108
+ """
109
+ return params_html
110
+
111
+
112
+ def format_metrics(score, rationale, explanation):
113
+ """Format metrics display with proper styling."""
114
  return f"""
115
  <div class="metrics-panel">
116
+ <div class="metric-section">
117
+ <h3>Score</h3>
118
+ <div class="score-display">{score:.2f}</div>
119
+ </div>
120
+ <div class="metric-section">
121
+ <h3>Rationale</h3>
122
+ <div class="explanation-text">{rationale}</div>
123
  </div>
124
  <div class="metric-section">
125
  <h3>Explanation</h3>
126
+ <div class="explanation-text">{explanation}</div>
127
  </div>
128
  </div>
129
  """
130
 
131
 
132
+ def update_chat_display(df, index):
133
+ """Update the chat visualization for a specific index."""
134
+ if df is None or df.empty or index >= len(df):
135
+ return (
136
+ "<div>No data available</div>",
137
+ "<div>No metrics available</div>",
138
+ "<div>No tool information available</div>",
139
+ )
140
+
141
+ row = df.iloc[index]
142
+
143
+ # Format chat messages
144
+ messages = json.loads(row["conversation"])
145
+ chat_html = f"""
146
+ <div class="chat-panel">
147
+ {"".join([format_chat_message(msg["role"], msg["content"])
148
+ for msg in messages])}
149
+ </div>
150
+ """
151
+
152
+ # Format metrics
153
+ metrics_html = format_metrics(row["score"], row["rationale"], row["explanation"])
154
+
155
+ # Format tool info
156
+ tool_html = format_tool_info(row["tools_langchain"])
157
+
158
+ return chat_html, metrics_html, tool_html
159
+
160
+
161
+ def filter_and_update_display(model, dataset, selected_scores, current_index):
162
+ try:
163
+ # Get data and filter by scores
164
+ df_chat = get_chat_and_score_df(model, dataset)
165
+ if selected_scores:
166
+ df_chat = df_chat[df_chat["score"].isin(selected_scores)]
167
+
168
+ if df_chat.empty:
169
+ return (
170
+ "<div>No data available for selected filters</div>",
171
+ "<div>No metrics available</div>",
172
+ "<div>No tool information available</div>",
173
+ gr.update(maximum=0, value=0),
174
+ "0/0",
175
+ )
176
+
177
+ # Update index bounds
178
+ max_index = len(df_chat) - 1
179
+ current_index = min(current_index, max_index)
180
+
181
+ # Get displays for current index
182
+ chat_html, metrics_html, tool_html = update_chat_display(df_chat, current_index)
183
+
184
+ return (
185
+ chat_html,
186
+ metrics_html,
187
+ tool_html,
188
+ gr.update(maximum=max_index, value=current_index),
189
+ f"{current_index + 1}/{len(df_chat)}",
190
+ )
191
+ except Exception as e:
192
+ print(f"Error in filter_and_update_display: {str(e)}")
193
+ return (
194
+ f"<div>Error: {str(e)}</div>",
195
+ "<div>No metrics available</div>",
196
+ "<div>No tool information available</div>",
197
+ gr.update(maximum=0, value=0),
198
+ "0/0",
199
+ )
data_loader.py CHANGED
@@ -1,6 +1,12 @@
1
  import pandas as pd
2
 
3
 
4
  def load_data():
5
  """Load and preprocess the data."""
6
  df = pd.read_csv("results.csv").dropna()
@@ -34,11 +40,281 @@ CATEGORIES = {
34
  "Composite": ["BFCL_v3_multi_turn_composite"],
35
  }
36
 
37
  HEADER_CONTENT = """
38
  <style>
39
  .header-wrapper {
40
  padding: 3rem 2rem;
41
- background: rgb(17, 17, 27);
42
  border-radius: 16px;
43
  display: flex;
44
  flex-direction: column;
@@ -47,12 +323,12 @@ HEADER_CONTENT = """
47
  }
48
 
49
  .header-wrapper a {
50
- color: #ffffff !important;
51
  text-decoration: none !important;
52
  }
53
 
54
  .description {
55
- color: #ffffff;
56
  font-size: 1.1rem;
57
  line-height: 1.6;
58
  max-width: 800px;
@@ -65,7 +341,7 @@ HEADER_CONTENT = """
65
  gap: 1rem;
66
  justify-content: center;
67
  margin-bottom: 2rem;
68
- color: #ffffff;
69
  }
70
 
71
  .action-button {
@@ -73,23 +349,23 @@ HEADER_CONTENT = """
73
  align-items: center;
74
  gap: 0.5rem;
75
  padding: 0.75rem 1.5rem;
76
- background: rgba(30, 30, 45, 0.95);
77
- border: 1px solid rgba(255, 255, 255, 0.1);
78
  border-radius: 100px;
79
- color: #ffffff !important;
80
  text-decoration: none !important;
81
  font-size: 0.95rem;
82
  transition: all 0.2s ease;
83
  }
84
 
85
  .action-button:hover {
86
- background: rgba(40, 40, 55, 0.95);
87
- border-color: rgba(255, 255, 255, 0.2);
88
- color: #ffffff !important;
89
  }
90
 
91
  .update-info {
92
- color: #94a3b8;
93
  font-size: 0.9rem;
94
  margin-bottom: 3rem;
95
  }
@@ -103,15 +379,15 @@ HEADER_CONTENT = """
103
  }
104
 
105
  .feature-card {
106
- background: rgba(17, 17, 27, 0.6);
107
- border: 1px solid rgba(255, 255, 255, 0.1);
108
  border-radius: 16px;
109
  padding: 2rem;
110
  text-align: left;
111
  }
112
 
113
  .feature-icon {
114
- background: rgba(79, 70, 229, 0.1);
115
  width: 40px;
116
  height: 40px;
117
  border-radius: 12px;
@@ -122,14 +398,14 @@ HEADER_CONTENT = """
122
  }
123
 
124
  .feature-title {
125
- color: #ffffff;
126
  font-size: 1.25rem;
127
  font-weight: 600;
128
  margin-bottom: 1rem;
129
  }
130
 
131
  .feature-description {
132
- color: #94a3b8;
133
  font-size: 0.95rem;
134
  margin-bottom: 1.5rem;
135
  }
@@ -144,7 +420,7 @@ HEADER_CONTENT = """
144
  }
145
 
146
  .feature-list li {
147
- color: #e2e8f0;
148
  font-size: 0.95rem;
149
  display: flex;
150
  align-items: center;
@@ -155,89 +431,100 @@ HEADER_CONTENT = """
155
  content: '';
156
  width: 6px;
157
  height: 6px;
158
- background: #4F46E5;
159
  border-radius: 50%;
160
  flex-shrink: 0;
161
  }
162
 
163
- /* Force all links to be white */
164
  .header-wrapper a:link,
165
  .header-wrapper a:visited,
166
  .header-wrapper a:hover,
167
  .header-wrapper a:active {
168
- color: #ffffff !important;
169
  }
170
  </style>
171
 
172
  <div class="header-wrapper">
173
- <h1 class="title" style="font-size: 48px; font-weight: 700; margin: 40px 0; text-align: center;">Agent Leaderboard</h1>
174
- <h2>Comprehensive multi-benchmark evaluation for tool calling</h2>
175
-
176
- <div class="actions">
177
- <a href="#" class="action-button">
178
- <svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
179
- <path d="M15 7h3a5 5 0 0 1 5 5 5 5 0 0 1-5 5h-3m-6 0H6a5 5 0 0 1-5-5 5 5 0 0 1 5-5h3"/>
180
- <line x1="8" y1="12" x2="16" y2="12"/>
181
- </svg>
182
- Blog
183
- </a>
184
- <a href="#" class="action-button">
185
- <svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
186
- <path d="M9 19c-5 1.5-5-2.5-7-3m14 6v-3.87a3.37 3.37 0 0 0-.94-2.61c3.14-.35 6.44-1.54 6.44-7A5.44 5.44 0 0 0 20 4.77 5.07 5.07 0 0 0 19.91 1S18.73.65 16 2.48a13.38 13.38 0 0 0-7 0C6.27.65 5.09 1 5.09 1A5.07 5.07 0 0 0 5 4.77a5.44 5.44 0 0 0-1.5 3.78c0 5.42 3.3 6.61 6.44 7A3.37 3.37 0 0 0 9 18.13V22"/>
187
- </svg>
188
- GitHub
189
- </a>
190
- <a href="#" class="action-button">
191
- <svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
192
- <path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4"/>
193
- <polyline points="7 10 12 15 17 10"/>
194
- <line x1="12" y1="15" x2="12" y2="3"/>
195
- </svg>
196
- Dataset
197
- </a>
198
- </div>
199
  """
200
 
201
  CARDS = """
202
  <div class="features-grid">
203
  <div class="feature-card">
204
  <div class="feature-icon">
205
- <svg width="24" height="24" fill="none" stroke="#4F46E5" stroke-width="2" viewBox="0 0 24 24">
206
- <path d="M21 16V8a2 2 0 0 0-1-1.73l-7-4a2 2 0 0 0-2 0l-7 4A2 2 0 0 0 3 8v8a2 2 0 0 0 1 1.73l7 4a2 2 0 0 0 2 0l7-4A2 2 0 0 0 21 16z"/>
207
  </svg>
208
  </div>
209
- <h3 class="feature-title">360° Domain Evaluation</h3>
210
- <p class="feature-description">Comprehensive evaluation across multiple benchmarks and domains:</p>
211
  <ul class="feature-list">
212
- <li>Cross-domain evaluation</li>
213
- <li>Real-world use cases</li>
214
- <li>Edge case evaluation</li>
215
  </ul>
216
  </div>
217
-
218
  <div class="feature-card">
219
  <div class="feature-icon">
220
- <svg width="24" height="24" fill="none" stroke="#4F46E5" stroke-width="2" viewBox="0 0 24 24">
221
- <path d="M22 12h-4l-3 9L9 3l-3 9H2"/>
222
  </svg>
223
  </div>
224
- <h3 class="feature-title">Make Better Decisions</h3>
225
- <p class="feature-description">Beyond technical metrics, we provide:</p>
226
  <ul class="feature-list">
227
- <li>Cost-effectiveness analysis</li>
228
- <li>Business impact metrics</li>
229
- <li>Vendor strategy insights</li>
230
  </ul>
231
  </div>
232
 
233
  <div class="feature-card">
234
  <div class="feature-icon">
235
- <svg width="24" height="24" fill="none" stroke="#4F46E5" stroke-width="2" viewBox="0 0 24 24">
236
  <path d="M21 2v6h-6M3 12a9 9 0 0 1 15-6.7L21 8M3 12a9 9 0 0 0 15 6.7L21 16M21 22v-6h-6"/>
237
  </svg>
238
  </div>
239
  <h3 class="feature-title">Updated Periodically</h3>
240
- <p class="feature-description">Regular updates with latest models:</p>
241
  <ul class="feature-list">
242
  <li>11 private models evaluated</li>
243
  <li>5 open source models included</li>
@@ -245,48 +532,107 @@ CARDS = """
245
  </ul>
246
  </div>
247
  </div>
 
248
  </div>
249
  """
250
 
251
 
252
- METHODOLOGY = """# Methodology
253
- ## Overview
254
- The Agent Leaderboard evaluates language models' ability to effectively use tools and maintain coherent multi-turn conversations.
255
- The evaluation focuses on both basic functionality and edge cases that challenge real-world applicability.
256
-
257
- ## Tool Selection Quality Metric
258
- Models are evaluated on their ability to:
259
- - Correctly identify when tools are needed
260
- - Select the appropriate tool for the task
261
- - Handle cases where no suitable tool exists
262
- - Maintain context across multiple interactions
263
-
264
- ## Dataset Structure
265
- | Type | Samples | Category | Dataset Name | Purpose |
266
- |------|---------|-----------|--------------|----------|
267
- | Single-Turn | 100 + 100 | Single Function Call | xlam_single_tool_single_call | Evaluates basic ability to read documentation and make single function calls |
268
- | | 200 + 50 | Multiple Function Call | xlam_multiple_tool_multiple_call, xlam_single_tool_multiple_call | Tests parallel execution and result aggregation capabilities |
269
- | | 100 | Irrelevant Query | BFCL_v3_irrelevance | Tests ability to recognize when available tools don't match user needs |
270
- | | 100 | Long Context | tau_long_context | Assesses handling of extended interactions and complex instructions |
271
- | Multi-Turn | 50 + 30 | Single Function Call | BFCL_v3_multi_turn_base_single_func_call, toolscs_single_func_call | Tests basic conversational function calling abilities |
272
- | | 50 | Multiple Function Call | BFCL_v3_multi_turn_base_multi_func_call | Evaluates handling of multiple function calls in conversation |
273
- | | 100 | Missing Function | BFCL_v3_multi_turn_miss_func | Tests graceful handling of unavailable tools |
274
- | | 100 | Missing Parameters | BFCL_v3_multi_turn_miss_param | Assesses parameter collection and handling incomplete information |
275
- | | 100 | Composite | BFCL_v3_multi_turn_composite | Tests overall robustness in complex scenarios |
276
- """
277
-
278
 
279
- INSIGHTS = """
280
- # Key Insights from Agent Leaderboard
281
-
282
- | Category | Finding | Implications |
283
- |----------|---------|--------------|
284
- | Performance Leader | Gemini-2.0-flash dominates with excellent performance at a fraction of typical costs | Demonstrates that top-tier performance is achievable without premium pricing |
285
- | Cost vs Performance | Top 3 models span a 200x price difference yet show only 6% performance gap | Challenges traditional pricing assumptions in the market and suggests potential overpricing at the high end |
286
- | Open Source Models | Qwen-72b matches premium models in safety and context handling at lower cost | Signals growing maturity in open-source models and potential for broader adoption |
287
- | Safety Features | While irrelevance detection is widely solved, tool miss detection remains a challenge | Highlights uneven development in safety features and areas needing focused improvement |
288
- | Edge Case Handling | Models still struggle with maintaining context in complex scenarios | Indicates need for architectural improvements in handling sophisticated interactions |
289
- | Architecture Impact | Models show clear trade-offs between context handling and parallel execution | Suggests need for specialized models or hybrid approaches for different use cases |
290
 
291
- **Note:** Findings based on comprehensive evaluation across multiple tasks and scenarios.
292
- """
1
  import pandas as pd
2
+ from glob import glob
3
+ import numpy as np
4
+ from pathlib import Path
5
 
6
 
7
+ DATASETS = [Path(file).stem for file in glob("datasets/*.parquet")]
8
+ SCORES = [round(x, 2) for x in np.arange(0, 1.1, 0.1).tolist()]
9
+
10
  def load_data():
11
  """Load and preprocess the data."""
12
  df = pd.read_csv("results.csv").dropna()
 
40
  "Composite": ["BFCL_v3_multi_turn_composite"],
41
  }
42
 
43
+ METHODOLOGY = """# Methodology
44
+ ## Overview
45
+ The Agent Leaderboard evaluates language models' ability to effectively use tools and maintain coherent multi-turn conversations.
46
+ The evaluation focuses on both basic functionality and edge cases that challenge real-world applicability.
47
+
48
+ ## Tool Selection Quality Metric
49
+ Models are evaluated on their ability to:
50
+ - Correctly identify when tools are needed
51
+ - Select the appropriate tool for the task
52
+ - Handle cases where no suitable tool exists
53
+ - Maintain context across multiple interactions
54
+
55
+ ## Dataset Structure
56
+ | Type | Samples | Category | Dataset Name | Purpose |
57
+ |------|---------|-----------|--------------|----------|
58
+ | Single-Turn | 100 + 100 | Single Function Call | xlam_single_tool_single_call | Evaluates basic ability to read documentation and make single function calls |
59
+ | | 200 + 50 | Multiple Function Call | xlam_multiple_tool_multiple_call, xlam_single_tool_multiple_call | Tests parallel execution and result aggregation capabilities |
60
+ | | 100 | Irrelevant Query | BFCL_v3_irrelevance | Tests ability to recognize when available tools don't match user needs |
61
+ | | 100 | Long Context | tau_long_context | Assesses handling of extended interactions and complex instructions |
62
+ | Multi-Turn | 50 + 30 | Single Function Call | BFCL_v3_multi_turn_base_single_func_call, toolscs_single_func_call | Tests basic conversational function calling abilities |
63
+ | | 50 | Multiple Function Call | BFCL_v3_multi_turn_base_multi_func_call | Evaluates handling of multiple function calls in conversation |
64
+ | | 100 | Missing Function | BFCL_v3_multi_turn_miss_func | Tests graceful handling of unavailable tools |
65
+ | | 100 | Missing Parameters | BFCL_v3_multi_turn_miss_param | Assesses parameter collection and handling incomplete information |
66
+ | | 100 | Composite | BFCL_v3_multi_turn_composite | Tests overall robustness in complex scenarios |
67
+ """
68
+
69
+
70
+ INSIGHTS = """
71
+ # Key Insights from Agent Leaderboard
72
+
73
+ | Category | Finding | Implications |
74
+ |----------|---------|--------------|
75
+ | Performance Leader | Gemini-2.0-flash dominates with excellent performance at a fraction of typical costs | Demonstrates that top-tier performance is achievable without premium pricing |
76
+ | Cost vs Performance | Top 3 models span a 200x price difference yet show only 6% performance gap | Challenges traditional pricing assumptions in the market and suggests potential overpricing at the high end |
77
+ | Open Source Models | Qwen-72b matches premium models in safety and context handling at lower cost | Signals growing maturity in open-source models and potential for broader adoption |
78
+ | Safety Features | While irrelevance detection is widely solved, tool miss detection remains a challenge | Highlights uneven development in safety features and areas needing focused improvement |
79
+ | Edge Case Handling | Models still struggle with maintaining context in complex scenarios | Indicates need for architectural improvements in handling sophisticated interactions |
80
+ | Architecture Impact | Models show clear trade-offs between context handling and parallel execution | Suggests need for specialized models or hybrid approaches for different use cases |
81
+
82
+ **Note:** Findings based on comprehensive evaluation across multiple tasks and scenarios.
83
+ """
84
+
85
+
86
+ chat_css = """
87
+ /* Container styles */
88
+ .container {
89
+ display: flex;
90
+ gap: 1.5rem;
91
+ height: calc(100vh - 100px);
92
+ padding: 1rem;
93
+ }
94
+
95
+ /* Chat panel styles */
96
+ .chat-panel {
97
+ flex: 2;
98
+ background: #1a1f2c;
99
+ border-radius: 1rem;
100
+ padding: 1rem;
101
+ overflow-y: auto;
102
+ max-height: calc(100vh - 120px);
103
+ }
104
+
105
+ /* Message styles */
106
+ .message {
107
+ padding: 1.2rem;
108
+ margin: 0.8rem;
109
+ border-radius: 1rem;
110
+ font-family: monospace;
111
+ box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
112
+ }
113
+
114
+ .system {
115
+ background: linear-gradient(135deg, #8e44ad, #9b59b6);
116
+ }
117
+
118
+ .user {
119
+ background: linear-gradient(135deg, #2c3e50, #3498db);
120
+ margin-left: 2rem;
121
+ }
122
+
123
+ .assistant {
124
+ background: linear-gradient(135deg, #27ae60, #2ecc71);
125
+ margin-right: 2rem;
126
+ }
127
+
128
+ .role-badge {
129
+ display: inline-block;
130
+ padding: 0.3rem 0.8rem;
131
+ border-radius: 0.5rem;
132
+ font-weight: bold;
133
+ margin-bottom: 0.8rem;
134
+ font-size: 0.9rem;
135
+ text-transform: uppercase;
136
+ letter-spacing: 0.05em;
137
+ }
138
+
139
+ .system-role {
140
+ background-color: #8e44ad;
141
+ color: white;
142
+ }
143
+
144
+ .user-role {
145
+ background-color: #3498db;
146
+ color: white;
147
+ }
148
+
149
+ .assistant-role {
150
+ background-color: #27ae60;
151
+ color: white;
152
+ }
153
+
154
+ .content {
155
+ white-space: pre-wrap;
156
+ word-break: break-word;
157
+ color: #f5f6fa;
158
+ line-height: 1.5;
159
+ }
160
+
161
+ /* Metrics panel styles */
162
+ .metrics-panel {
163
+ flex: 1;
164
+ display: flex;
165
+ flex-direction: column;
166
+ gap: 2rem;
167
+ padding: 1.5rem;
168
+ background: #1a1f2c;
169
+ border-radius: 1rem;
170
+ }
171
+
172
+ .metric-section {
173
+ background: #1E293B;
174
+ padding: 1.5rem;
175
+ border-radius: 1rem;
176
+ }
177
+
178
+ .score-section {
179
+ text-align: center;
180
+ }
181
+
182
+ .score-display {
183
+ font-size: 3rem;
184
+ font-weight: bold;
185
+ color: #4ADE80;
186
+ line-height: 1;
187
+ margin: 0.5rem 0;
188
+ }
189
+
190
+ .explanation-text {
191
+ color: #E2E8F0;
192
+ line-height: 1.6;
193
+ font-size: 0.95rem;
194
+ }
195
+
196
+ /* Tool info panel styles */
197
+ .tool-info-panel {
198
+ background: #1a1f2c;
199
+ padding: 1.5rem;
200
+ border-radius: 1rem;
201
+ color: #f5f6fa;
202
+ }
203
+
204
+ .tool-section {
205
+ margin-bottom: 1.5rem;
206
+ }
207
+
208
+ .tool-name {
209
+ font-size: 1.2rem;
210
+ color: #4ADE80;
211
+ font-weight: bold;
212
+ margin-bottom: 0.5rem;
213
+ }
214
+
215
+ .tool-description {
216
+ color: #E2E8F0;
217
+ line-height: 1.6;
218
+ margin-bottom: 1rem;
219
+ }
220
+
221
+ .tool-parameters .parameter {
222
+ margin: 0.5rem 0;
223
+ padding: 0.5rem;
224
+ background: rgba(255, 255, 255, 0.05);
225
+ border-radius: 0.5rem;
226
+ }
227
+
228
+ .param-name {
229
+ color: #63B3ED;
230
+ font-weight: bold;
231
+ margin-right: 0.5rem;
232
+ }
233
+
234
+ .tool-examples .example {
235
+ margin: 0.5rem 0;
236
+ padding: 0.5rem;
237
+ background: rgba(255, 255, 255, 0.05);
238
+ border-radius: 0.5rem;
239
+ font-family: monospace;
240
+ }
241
+
242
+ /* Custom scrollbar */
243
+ ::-webkit-scrollbar {
244
+ width: 8px;
245
+ }
246
+
247
+ ::-webkit-scrollbar-track {
248
+ background: rgba(255, 255, 255, 0.1);
249
+ border-radius: 4px;
250
+ }
251
+
252
+ ::-webkit-scrollbar-thumb {
253
+ background: linear-gradient(45deg, #3498db, #2ecc71);
254
+ border-radius: 4px;
255
+ }
256
+
257
+ /* Title styles */
258
+ .title {
259
+ color: #63B3ED;
260
+ font-size: 2rem;
261
+ font-weight: bold;
262
+ text-align: center;
263
+ margin-bottom: 1.5rem;
264
+ padding: 1rem;
265
+ }
266
+
267
+
268
+ /* Headers */
269
+ h3 {
270
+ color: #63B3ED;
271
+ margin: 0 0 1rem 0;
272
+ font-size: 1.1rem;
273
+ font-weight: 500;
274
+ letter-spacing: 0.05em;
275
+ }
276
+ """
277
+
278
+
279
+ # Updated header and cards with theme awareness
280
+
281
  HEADER_CONTENT = """
282
  <style>
283
+ @media (prefers-color-scheme: dark) {
284
+ :root {
285
+ --bg-primary: rgb(17, 17, 27);
286
+ --bg-secondary: rgba(30, 30, 45, 0.95);
287
+ --bg-hover: rgba(40, 40, 55, 0.95);
288
+ --text-primary: #ffffff;
289
+ --text-secondary: #94a3b8;
290
+ --text-tertiary: #e2e8f0;
291
+ --border-color: rgba(255, 255, 255, 0.1);
292
+ --border-hover: rgba(255, 255, 255, 0.2);
293
+ --card-bg: rgba(17, 17, 27, 0.6);
294
+ --accent-color: #4F46E5;
295
+ --accent-bg: rgba(79, 70, 229, 0.1);
296
+ }
297
+ }
298
+
299
+ @media (prefers-color-scheme: light) {
300
+ :root {
301
+ --bg-primary: rgb(255, 255, 255);
302
+ --bg-secondary: rgba(243, 244, 246, 0.95);
303
+ --bg-hover: rgba(229, 231, 235, 0.95);
304
+ --text-primary: #000000;
305
+ --text-secondary: #4b5563;
306
+ --text-tertiary: #1f2937;
307
+ --border-color: rgba(0, 0, 0, 0.1);
308
+ --border-hover: rgba(0, 0, 0, 0.2);
309
+ --card-bg: rgba(249, 250, 251, 0.6);
310
+ --accent-color: #4F46E5;
311
+ --accent-bg: rgba(79, 70, 229, 0.1);
312
+ }
313
+ }
314
+
315
  .header-wrapper {
316
  padding: 3rem 2rem;
317
+ background: var(--bg-primary);
318
  border-radius: 16px;
319
  display: flex;
320
  flex-direction: column;
 
323
  }
324
 
325
  .header-wrapper a {
326
+ color: var(--text-primary) !important;
327
  text-decoration: none !important;
328
  }
329
 
330
  .description {
331
+ color: var(--text-primary);
332
  font-size: 1.1rem;
333
  line-height: 1.6;
334
  max-width: 800px;
 
341
  gap: 1rem;
342
  justify-content: center;
343
  margin-bottom: 2rem;
344
+ color: var(--text-primary);
345
  }
346
 
347
  .action-button {
 
349
  align-items: center;
350
  gap: 0.5rem;
351
  padding: 0.75rem 1.5rem;
352
+ background: var(--bg-secondary);
353
+ border: 1px solid var(--border-color);
354
  border-radius: 100px;
355
+ color: var(--text-primary) !important;
356
  text-decoration: none !important;
357
  font-size: 0.95rem;
358
  transition: all 0.2s ease;
359
  }
360
 
361
  .action-button:hover {
362
+ background: var(--bg-hover);
363
+ border-color: var(--border-hover);
364
+ color: var(--text-primary) !important;
365
  }
366
 
367
  .update-info {
368
+ color: var(--text-secondary);
369
  font-size: 0.9rem;
370
  margin-bottom: 3rem;
371
  }
 
379
  }
380
 
381
  .feature-card {
382
+ background: var(--card-bg);
383
+ border: 1px solid var(--border-color);
384
  border-radius: 16px;
385
  padding: 2rem;
386
  text-align: left;
387
  }
388
 
389
  .feature-icon {
390
+ background: var(--accent-bg);
391
  width: 40px;
392
  height: 40px;
393
  border-radius: 12px;
 
398
  }
399
 
400
  .feature-title {
401
+ color: var(--text-primary);
402
  font-size: 1.25rem;
403
  font-weight: 600;
404
  margin-bottom: 1rem;
405
  }
406
 
407
  .feature-description {
408
+ color: var(--text-secondary);
409
  font-size: 0.95rem;
410
  margin-bottom: 1.5rem;
411
  }
 
420
  }
421
 
422
  .feature-list li {
423
+ color: var(--text-tertiary);
424
  font-size: 0.95rem;
425
  display: flex;
426
  align-items: center;
 
431
  content: '';
432
  width: 6px;
433
  height: 6px;
434
+ background: var(--accent-color);
435
  border-radius: 50%;
436
  flex-shrink: 0;
437
  }
438
 
439
+ /* Force all links to match theme */
440
  .header-wrapper a:link,
441
  .header-wrapper a:visited,
442
  .header-wrapper a:hover,
443
  .header-wrapper a:active {
444
+ color: var(--text-primary) !important;
445
+ }
446
+
447
+ /* Title specific styles */
448
+ .main-title {
449
+ color: var(--text-primary);
450
+ font-size: 48px;
451
+ font-weight: 700;
452
+ margin: 40px 0;
453
+ text-align: center;
454
+ }
455
+
456
+ .subtitle {
457
+ color: var(--text-secondary);
458
+ margin-bottom: 2rem;
459
  }
460
  </style>
461
 
462
  <div class="header-wrapper">
463
+ <h1 class="main-title">Agent Leaderboard</h1>
464
+ <h2 class="subtitle">Comprehensive multi-benchmark evaluation for tool calling</h2>
465
+
466
+ <div class="actions">
467
+ <a href="#" class="action-button">
468
+ <svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
469
+ <path d="M15 7h3a5 5 0 0 1 5 5 5 5 0 0 1-5 5h-3m-6 0H6a5 5 0 0 1-5-5 5 5 0 0 1 5-5h3"/>
470
+ <line x1="8" y1="12" x2="16" y2="12"/>
471
+ </svg>
472
+ Blog
473
+ </a>
474
+ <a href="#" class="action-button">
475
+ <svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
476
+ <path d="M9 19c-5 1.5-5-2.5-7-3m14 6v-3.87a3.37 3.37 0 0 0-.94-2.61c3.14-.35 6.44-1.54 6.44-7A5.44 5.44 0 0 0 20 4.77 5.07 5.07 0 0 0 19.91 1S18.73.65 16 2.48a13.38 13.38 0 0 0-7 0C6.27.65 5.09 1 5.09 1A5.07 5.07 0 0 0 5 4.77a5.44 5.44 0 0 0-1.5 3.78c0 5.42 3.3 6.61 6.44 7A3.37 3.37 0 0 0 9 18.13V22"/>
477
+ </svg>
478
+ GitHub
479
+ </a>
480
+ <a href="#" class="action-button">
481
+ <svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
482
+ <path d="M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4"/>
483
+ <polyline points="7 10 12 15 17 10"/>
484
+ <line x1="12" y1="15" x2="12" y2="3"/>
485
+ </svg>
486
+ Dataset
487
+ </a>
488
+ </div>
489
  """
490
 
491
  CARDS = """
492
  <div class="features-grid">
493
  <div class="feature-card">
494
  <div class="feature-icon">
495
+ <svg width="24" height="24" fill="none" stroke="var(--accent-color)" stroke-width="2" viewBox="0 0 24 24">
496
+ <path d="M22 12h-4l-3 9L9 3l-3 9H2"/>
497
  </svg>
498
  </div>
499
+ <h3 class="feature-title">Make Better Decisions</h3>
 
500
  <ul class="feature-list">
501
+ <li>Cost-effectiveness analysis</li>
502
+ <li>Business impact metrics</li>
503
+ <li>Vendor strategy insights</li>
504
  </ul>
505
  </div>
506
+
507
  <div class="feature-card">
508
  <div class="feature-icon">
509
+ <svg width="24" height="24" fill="none" stroke="var(--accent-color)" stroke-width="2" viewBox="0 0 24 24">
510
+ <path d="M21 16V8a2 2 0 0 0-1-1.73l-7-4a2 2 0 0 0-2 0l-7 4A2 2 0 0 0 3 8v8a2 2 0 0 0 1 1.73l7 4a2 2 0 0 0 2 0l7-4A2 2 0 0 0 21 16z"/>
511
  </svg>
512
  </div>
513
+ <h3 class="feature-title">360° Domain Evaluation</h3>
 
514
  <ul class="feature-list">
515
+ <li>Cross-domain evaluation</li>
516
+ <li>Real-world use cases</li>
517
+ <li>Edge case evaluation</li>
518
  </ul>
519
  </div>
520
 
521
  <div class="feature-card">
522
  <div class="feature-icon">
523
+ <svg width="24" height="24" fill="none" stroke="var(--accent-color)" stroke-width="2" viewBox="0 0 24 24">
524
  <path d="M21 2v6h-6M3 12a9 9 0 0 1 15-6.7L21 8M3 12a9 9 0 0 0 15 6.7L21 16M21 22v-6h-6"/>
525
  </svg>
526
  </div>
527
  <h3 class="feature-title">Updated Periodically</h3>
 
528
  <ul class="feature-list">
529
  <li>11 private models evaluated</li>
530
  <li>5 open source models included</li>
 
532
  </ul>
533
  </div>
534
  </div>
535
+
536
  </div>
537
  """
538
 
539
+ DESCRIPTION_HTML = """
540
+ <div style="
541
+ background: var(--bg-secondary, rgba(30, 30, 45, 0.95));
542
+ border-radius: 12px;
543
+ padding: 24px;
544
+ margin: 16px 0;
545
+ ">
546
+ <div style="
547
+ display: flex;
548
+ flex-direction: column;
549
+ gap: 16px;
550
+ ">
551
+ <div style="
552
+ color: var(--text-primary);
553
+ font-size: 1.1rem;
554
+ font-weight: 500;
555
+ display: flex;
556
+ align-items: center;
557
+ gap: 8px;
558
+ ">
559
+ 🎯 Purpose
560
+ <span style="
561
+ background: var(--accent-color, #4F46E5);
562
+ color: white;
563
+ padding: 4px 12px;
564
+ border-radius: 100px;
565
+ font-size: 0.9rem;
566
+ ">Latest Update: Feb 2025</span>
567
+ </div>
568
+ <p style="
569
+ color: var(--text-secondary);
570
+ margin: 0;
571
+ line-height: 1.6;
572
+ ">
573
+ Welcome to the AI Agent Tool Calling Leaderboard! This comprehensive benchmark evaluates
574
+ language models' ability to effectively utilize tools and functions in complex scenarios.
575
+ </p>
576
 
577
+ <div style="
578
+ color: var(--text-primary);
579
+ font-size: 1.1rem;
580
+ font-weight: 500;
581
+ margin-top: 8px;
582
+ ">
583
+ 🔍 What We Evaluate
584
+ </div>
585
+ <div style="
586
+ display: grid;
587
+ grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
588
+ gap: 16px;
589
+ color: var(--text-secondary);
590
+ ">
591
+ <div style="display: flex; gap: 8px; align-items: center;">
592
+ 🔄 Single/Multi-turn Interactions
593
+ </div>
594
+ <div style="display: flex; gap: 8px; align-items: center;">
595
+ 🧩 Function Composition
596
+ </div>
597
+ <div style="display: flex; gap: 8px; align-items: center;">
598
+ Error Handling
599
+ </div>
600
+ </div>
 
 
601
 
602
+ <div style="
603
+ color: var(--text-primary);
604
+ font-size: 1.1rem;
605
+ font-weight: 500;
606
+ margin-top: 8px;
607
+ ">
608
+ 📊 Key Results
609
+ </div>
610
+ <div style="
611
+ display: grid;
612
+ grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
613
+ gap: 16px;
614
+ color: var(--text-secondary);
615
+ ">
616
+ <div style="display: flex; gap: 8px; align-items: center;">
617
+ ✅ Accuracy Performance
618
+ </div>
619
+ <div style="display: flex; gap: 8px; align-items: center;">
620
+ 💰 Open Vs Closed Source
621
+ </div>
622
+ <div style="display: flex; gap: 8px; align-items: center;">
623
+ ⚖️ Overall Effectiveness
624
+ </div>
625
+ </div>
626
 
627
+ <div style="
628
+ border-left: 4px solid var(--accent-color, #4F46E5);
629
+ padding-left: 12px;
630
+ margin-top: 8px;
631
+ color: var(--text-secondary);
632
+ font-style: italic;
633
+ ">
634
+ 💡 Use the filters below to explore different aspects of the evaluation and compare model performance across various dimensions.
635
+ </div>
636
+ </div>
637
+ </div>
638
+ """
tabs/data_exploration.py ADDED
@@ -0,0 +1,148 @@
1
+ import gradio as gr
2
+ from chat import get_chat_and_score_df, update_chat_display
3
+
4
+
5
+ def create_exploration_tab(df, MODELS, DATASETS, SCORES, HEADER_CONTENT):
6
+ def filter_and_update_display(model, dataset, selected_scores, current_index):
7
+ try:
8
+ df_chat = get_chat_and_score_df(model, dataset)
9
+ if selected_scores:
10
+ df_chat = df_chat[df_chat["score"].isin(selected_scores)]
11
+
12
+ if df_chat.empty:
13
+ return (
14
+ "<div>No data available for selected filters</div>",
15
+ "<div>No metrics available</div>",
16
+ "<div>No tool information available</div>",
17
+ gr.update(maximum=0, value=0),
18
+ "0/0",
19
+ )
20
+
21
+ max_index = len(df_chat) - 1
22
+ current_index = min(current_index, max_index)
23
+ chat_html, metrics_html, tool_html = update_chat_display(
24
+ df_chat, current_index
25
+ )
26
+
27
+ return (
28
+ chat_html,
29
+ metrics_html,
30
+ tool_html,
31
+ gr.update(maximum=max_index, value=current_index),
32
+ f"{current_index + 1}/{len(df_chat)}",
33
+ )
34
+ except Exception as e:
35
+ print(f"Error in filter_and_update_display: {str(e)}")
36
+ return (
37
+ f"<div>Error: {str(e)}</div>",
38
+ "<div>No metrics available</div>",
39
+ "<div>No tool information available</div>",
40
+ gr.update(maximum=0, value=0),
41
+ "0/0",
42
+ )
43
+
44
+ with gr.Tab("Data Exploration"):
45
+ gr.HTML(HEADER_CONTENT)
46
+ with gr.Row():
47
+ filters_column = gr.Column(scale=1, min_width=300)
48
+ with filters_column:
49
+ gr.Markdown("# Exploration Filters")
50
+ explore_model = gr.Dropdown(
51
+ choices=MODELS,
52
+ value=MODELS[0],
53
+ label="Select Model",
54
+ )
55
+ explore_dataset = gr.Dropdown(
56
+ choices=DATASETS,
57
+ value=DATASETS[0],
58
+ label="Select Dataset",
59
+ )
60
+ explore_scores = gr.Dropdown(
61
+ choices=SCORES,
62
+ value=SCORES,
63
+ multiselect=True,
64
+ label="Score Range",
65
+ )
66
+
67
+ gr.Markdown("## Navigation")
68
+ index_slider = gr.Slider(
69
+ minimum=0,
70
+ maximum=0,
71
+ step=1,
72
+ value=0,
73
+ label="Position",
74
+ )
75
+ index_text = gr.HTML("0/0")
76
+ with gr.Row():
77
+ prev_btn = gr.Button("← Previous")
78
+ next_btn = gr.Button("Next →")
79
+
80
+ content_column = gr.Column(scale=4)
81
+ with content_column:
82
+ chat_display = gr.HTML()
83
+ metrics_display = gr.HTML()
84
+ tool_info_display = gr.HTML()
85
+
86
+ def update_on_filter_change(model, dataset, scores, _):
87
+ return filter_and_update_display(model, dataset, scores, 0)
88
+
89
+ for control in [explore_model, explore_dataset, explore_scores]:
90
+ control.change(
91
+ update_on_filter_change,
92
+ inputs=[explore_model, explore_dataset, explore_scores, gr.State(0)],
93
+ outputs=[
94
+ chat_display,
95
+ metrics_display,
96
+ tool_info_display,
97
+ index_slider,
98
+ index_text,
99
+ ],
100
+ )
101
+
102
+ def navigate(direction, current, model, dataset, scores):
103
+ new_index = current + direction
104
+ return filter_and_update_display(model, dataset, scores, new_index)
105
+
106
+ prev_btn.click(
107
+ lambda idx, m, d, s: navigate(-1, idx, m, d, s),
108
+ inputs=[index_slider, explore_model, explore_dataset, explore_scores],
109
+ outputs=[
110
+ chat_display,
111
+ metrics_display,
112
+ tool_info_display,
113
+ index_slider,
114
+ index_text,
115
+ ],
116
+ )
117
+
118
+ next_btn.click(
119
+ lambda idx, m, d, s: navigate(1, idx, m, d, s),
120
+ inputs=[index_slider, explore_model, explore_dataset, explore_scores],
121
+ outputs=[
122
+ chat_display,
123
+ metrics_display,
124
+ tool_info_display,
125
+ index_slider,
126
+ index_text,
127
+ ],
128
+ )
129
+
130
+ index_slider.change(
131
+ lambda idx, m, d, s: filter_and_update_display(m, d, s, int(idx)),
132
+ inputs=[index_slider, explore_model, explore_dataset, explore_scores],
133
+ outputs=[
134
+ chat_display,
135
+ metrics_display,
136
+ tool_info_display,
137
+ index_slider,
138
+ index_text,
139
+ ],
140
+ )
141
+
142
+ return (
143
+ chat_display,
144
+ metrics_display,
145
+ tool_info_display,
146
+ index_slider,
147
+ index_text,
148
+ )
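app.py keeps this exploration tab commented out for now. If it were enabled, the wiring would presumably mirror the other tabs, following the commented-out lines in create_app(); a sketch, assuming the datasets/*.parquet files and the promptquality project are reachable:

# Inside create_app(), under the gr.Tabs() block:
exp_outputs = create_exploration_tab(df, MODELS, DATASETS, SCORES, HEADER_CONTENT)

# ...and alongside the other initial loads:
app.load(
    fn=lambda: filter_and_update_display(MODELS[0], DATASETS[0], [], 0),
    outputs=exp_outputs,
)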
tabs/leaderboard.py ADDED
@@ -0,0 +1,278 @@
1
+ import gradio as gr
2
+
3
+ from data_loader import CATEGORIES, DESCRIPTION_HTML
4
+ from visualization import (
5
+ get_performance_chart,
6
+ get_performance_cost_chart,
7
+ )
8
+
9
+
10
+ def get_rank_badge(rank):
11
+ """Generate HTML for rank badge with appropriate styling"""
12
+ badge_styles = {
13
+ 1: ("1st", "linear-gradient(145deg, #ffd700, #ffc400)", "#000"),
14
+ 2: ("2nd", "linear-gradient(145deg, #9ca3af, #787C7E)", "#fff"),
15
+ 3: ("3rd", "linear-gradient(145deg, #CD7F32, #b36a1d)", "#fff"),
16
+ }
17
+
18
+ if rank in badge_styles:
19
+ label, gradient, text_color = badge_styles[rank]
20
+ return f"""
21
+ <div style="
22
+ display: inline-flex;
23
+ align-items: center;
24
+ justify-content: center;
25
+ min-width: 48px;
26
+ padding: 4px 12px;
27
+ background: {gradient};
28
+ color: {text_color};
29
+ border-radius: 6px;
30
+ font-weight: 600;
31
+ font-size: 0.9em;
32
+ box-shadow: 0 2px 4px rgba(0, 0, 0, 0.2);
33
+ ">
34
+ {label}
35
+ </div>
36
+ """
37
+ return f"""
38
+ <div style="
39
+ display: inline-flex;
40
+ align-items: center;
41
+ justify-content: center;
42
+ min-width: 28px;
43
+ color: #a1a1aa;
44
+ font-weight: 500;
45
+ ">
46
+ {rank}
47
+ </div>
48
+ """
49
+
50
+
51
+ def get_type_badge(model_type):
52
+ """Generate HTML for model type badge"""
53
+ colors = {"Private": "#4F46E5", "Open source": "#16A34A"}
54
+ bg_color = colors.get(model_type, "#4F46E5")
55
+ return f"""
56
+ <div style="
57
+ display: inline-flex;
58
+ align-items: center;
59
+ padding: 4px 8px;
60
+ background: {bg_color};
61
+ color: white;
62
+ border-radius: 4px;
63
+ font-size: 0.85em;
64
+ font-weight: 500;
65
+ ">
66
+ {model_type}
67
+ </div>
68
+ """
69
+
70
+
71
+ def get_score_bar(score):
72
+ """Generate HTML for score bar"""
73
+ width = score * 100
74
+ return f"""
75
+ <div style="display: flex; align-items: center; gap: 12px; width: 100%;">
76
+ <div style="
77
+ flex-grow: 1;
78
+ height: 6px;
79
+ background: var(--score-bg, rgba(255, 255, 255, 0.1));
80
+ border-radius: 3px;
81
+ overflow: hidden;
82
+ max-width: 200px;
83
+ ">
84
+ <div style="
85
+ width: {width}%;
86
+ height: 100%;
87
+ background: var(--accent-color, #4F46E5);
88
+ border-radius: 3px;
89
+ "></div>
90
+ </div>
91
+ <span style="
92
+ font-family: 'SF Mono', monospace;
93
+ font-weight: 600;
94
+ color: var(--text-primary, #ffffff);
95
+ min-width: 60px;
96
+ ">{score:.3f}</span>
97
+ </div>
98
+ """
99
+
100
+
101
+ def filter_leaderboard(df, model_type, category, sort_by):
102
+ filtered_df = df.copy()
103
+ if model_type != "All":
104
+ filtered_df = filtered_df[filtered_df["Model Type"].str.strip() == model_type]
105
+
106
+ dataset_columns = CATEGORIES.get(category, ["Model Avg"])
107
+ avg_score = filtered_df[dataset_columns].mean(axis=1)
108
+ filtered_df["Category Score"] = avg_score
109
+
110
+     if sort_by == "Performance":
+         filtered_df = filtered_df.sort_values(by="Category Score", ascending=False)
+     else:
+         filtered_df = filtered_df.sort_values(by="IO Cost", ascending=True)
+
+     filtered_df["Rank"] = range(1, len(filtered_df) + 1)
+     perf_chart = get_performance_chart(filtered_df, category)
+     cost_chart = get_performance_cost_chart(filtered_df, category)
+
+     # Generate styled table HTML
+     table_html = f"""
+     <style>
+         @media (prefers-color-scheme: dark) {{
+             :root {{
+                 --bg-color: #1a1b1e;
+                 --text-color: #ffffff;
+                 --border-color: #2d2e32;
+                 --hover-bg: #2d2e32;
+                 --note-bg: #2d2e32;
+                 --note-text: #a1a1aa;
+             }}
+         }}
+
+         @media (prefers-color-scheme: light) {{
+             :root {{
+                 --bg-color: #ffffff;
+                 --text-color: #000000;
+                 --border-color: #e5e7eb;
+                 --hover-bg: #f3f4f6;
+                 --note-bg: #f3f4f6;
+                 --note-text: #4b5563;
+             }}
+         }}
+
+         .dark-table-container {{
+             background: var(--bg-color);
+             border-radius: 12px;
+             padding: 1px;
+             margin: 20px 0;
+         }}
+
+         .dark-styled-table {{
+             width: 100%;
+             border-collapse: collapse;
+             font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
+             background: var(--bg-color);
+             color: var(--text-color);
+         }}
+
+         .dark-styled-table thead {{
+             position: sticky;
+             top: 0;
+             background: var(--bg-color);
+             z-index: 1;
+         }}
+
+         .dark-styled-table th {{
+             padding: 16px;
+             text-align: left;
+             font-weight: 500;
+             color: var(--text-color);
+             border-bottom: 1px solid var(--border-color);
+         }}
+
+         .dark-styled-table td {{
+             padding: 16px;
+             border-bottom: 1px solid var(--border-color);
+             color: var(--text-color);
+         }}
+
+         .dark-styled-table tbody tr:hover {{
+             background: var(--hover-bg);
+         }}
+
+         .model-cell {{
+             font-weight: 500;
+         }}
+
+         .score-cell {{
+             font-weight: 500;
+         }}
+
+         .note-box {{
+             margin-top: 20px;
+             padding: 16px;
+             background: var(--note-bg);
+             border-radius: 8px;
+             color: var(--note-text);
+         }}
+     </style>
+     <div class="dark-table-container">
+         <table class="dark-styled-table">
+             <thead>
+                 <tr>
+                     <th>Rank</th>
+                     <th>Model</th>
+                     <th>Type</th>
+                     <th>Cost (I/O)</th>
+                     <th>Category Score</th>
+                 </tr>
+             </thead>
+             <tbody>
+     """
+
+     for _, row in filtered_df.iterrows():
+         table_html += f"""
+             <tr>
+                 <td>{get_rank_badge(row['Rank'])}</td>
+                 <td class="model-cell">{row['Model']}</td>
+                 <td>{get_type_badge(row['Model Type'])}</td>
+                 <td>${row['Input cost per million token']:.2f}/${row['Output cost per million token']:.2f}</td>
+                 <td class="score-cell">{get_score_bar(row['Category Score'])}</td>
+             </tr>
+         """
+
+     table_html += """
+             </tbody>
+         </table>
+     </div>
+     <div class="note-box">
+         <p style="margin: 0; font-size: 0.9em;">
+             Note: API pricing for sorting by cost uses a 3-to-1 input/output ratio calculation. For Gemini 2.0, the cost is assumed to match Gemini 1.5's pricing since actual rates aren't yet available.
+         </p>
+     </div>
+     """
+
+     return table_html, perf_chart, cost_chart
+
+
+ def create_leaderboard_tab(df, CATEGORIES, METHODOLOGY, HEADER_CONTENT, CARDS):
+     with gr.Tab("Leaderboard"):
+         gr.HTML(HEADER_CONTENT + CARDS)
+         gr.HTML(DESCRIPTION_HTML)
+
+         # Filters row
+         with gr.Row(equal_height=True):
+             with gr.Column(scale=1):
+                 model_type = gr.Dropdown(
+                     choices=["All"] + df["Model Type"].unique().tolist(),
+                     value="All",
+                     label="Model Type",
+                 )
+             with gr.Column(scale=1):
+                 category = gr.Dropdown(
+                     choices=list(CATEGORIES.keys()),
+                     value=list(CATEGORIES.keys())[0],
+                     label="Category",
+                 )
+             with gr.Column(scale=1):
+                 sort_by = gr.Radio(
+                     choices=["Performance", "Cost"],
+                     value="Performance",
+                     label="Sort by",
+                 )
+
+         # Content
+         output = gr.HTML()
+         plot1 = gr.Plot()
+         plot2 = gr.Plot()
+         gr.Markdown(METHODOLOGY)
+
+         for input_comp in [model_type, category, sort_by]:
+             input_comp.change(
+                 fn=lambda m, c, s: filter_leaderboard(df, m, c, s),
+                 inputs=[model_type, category, sort_by],
+                 outputs=[output, plot1, plot2],
+             )
+
+     return output, plot1, plot2
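The note box above describes the blended price used when "Sort by" is set to Cost. The "IO Cost" column that the sort actually reads is computed upstream in data_loader.py (not part of this hunk), so the exact formula is an assumption; one plausible reading of the 3-to-1 input/output ratio, expressed over the two pricing columns shown in the table, is sketched below.

    # Sketch only: blended $/1M tokens assuming 3 input tokens for every 1 output token.
    # The real "IO Cost" value is produced in data_loader.py and may be computed differently.
    def blended_io_cost(row):
        input_price = row["Input cost per million token"]
        output_price = row["Output cost per million token"]
        return (3 * input_price + 1 * output_price) / 4

    # Example use: df["IO Cost"] = df.apply(blended_io_cost, axis=1)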
tabs/model_comparison.py ADDED
@@ -0,0 +1,73 @@
+ import gradio as gr
+ from visualization import create_radar_plot
+
+
+ def compare_models(df, model_names=None):
+     if model_names is None or len(model_names) == 0:
+         model_names = [df.sort_values("Model Avg", ascending=False).iloc[0]["Model"]]
+
+     filtered_df = df[df["Model"].isin(model_names)]
+     radar_chart = create_radar_plot(df, model_names)
+
+     # Create styled table for model info
+     info_html = f"""
+     <div class="dark-table-container">
+         <table class="dark-styled-table">
+             <thead>
+                 <tr>
+                     <th>Model</th>
+                     <th>Type</th>
+                     <th>Average</th>
+                     <th>I/O Cost</th>
+                     <th>Single Turn</th>
+                     <th>Multi Turn</th>
+                 </tr>
+             </thead>
+             <tbody>
+     """
+
+     for _, row in filtered_df.iterrows():
+         info_html += f"""
+             <tr>
+                 <td>{row['Model']}</td>
+                 <td>{row['Model Type']}</td>
+                 <td>{row['Model Avg']:.3f}</td>
+                 <td>${row['IO Cost']:.2f}</td>
+                 <td>{row['single turn perf']:.3f}</td>
+                 <td>{row['multi turn perf']:.3f}</td>
+             </tr>
+         """
+
+     info_html += """
+             </tbody>
+         </table>
+     </div>
+     """
+
+     return info_html, radar_chart
+
+
+ def create_model_comparison_tab(df, HEADER_CONTENT, CARDS):
+     with gr.Tab("Model Comparison"):
+         gr.HTML(HEADER_CONTENT)
+         with gr.Column():
+             # Filters row
+             with gr.Row(equal_height=True):
+                 model_selector = gr.Dropdown(
+                     choices=df["Model"].unique().tolist(),
+                     value=df.sort_values("Model Avg", ascending=False).iloc[0]["Model"],
+                     multiselect=True,
+                     label="Select Models to Compare",
+                 )
+
+             # Content
+             model_info = gr.HTML()
+             radar_plot = gr.Plot()
+
+             model_selector.change(
+                 fn=lambda m: compare_models(df, m),
+                 inputs=[model_selector],
+                 outputs=[model_info, radar_plot],
+             )
+
+     return model_info, radar_plot
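Because compare_models only needs a dataframe and a list of model names, it can also be exercised outside the Gradio UI. A minimal sketch, assuming load_data from data_loader and using placeholder model names (hypothetical; pass values that actually appear in df["Model"]):

    from data_loader import load_data
    from tabs.model_comparison import compare_models

    df = load_data()
    # "model-a" and "model-b" are placeholders, not real entries in the leaderboard.
    info_html, radar_chart = compare_models(df, ["model-a", "model-b"])
    print(info_html[:200])  # first part of the rendered HTML comparison table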
utils.py CHANGED
@@ -1,56 +1,3 @@
- from data_loader import CATEGORIES
- from visualization import (
-     create_radar_plot,
-     get_performance_chart,
-     get_performance_cost_chart,
- )
-
-
- def model_info_tab(df, model_names=None):
-     if model_names is None or len(model_names) == 0:
-         model_names = [df.sort_values("Model Avg", ascending=False).iloc[0]["Model"]]
-
-     filtered_df = df[df["Model"].isin(model_names)]
-     radar_chart = create_radar_plot(df, model_names)
-
-     # Create styled table for model info
-     info_html = f"""
-     <div class="dark-table-container">
-         <table class="dark-styled-table">
-             <thead>
-                 <tr>
-                     <th>Model</th>
-                     <th>Type</th>
-                     <th>Average</th>
-                     <th>I/O Cost</th>
-                     <th>Single Turn</th>
-                     <th>Multi Turn</th>
-                 </tr>
-             </thead>
-             <tbody>
-     """
-
-     for _, row in filtered_df.iterrows():
-         info_html += f"""
-             <tr>
-                 <td>{row['Model']}</td>
-                 <td>{row['Model Type']}</td>
-                 <td>{row['Model Avg']:.3f}</td>
-                 <td>${row['IO Cost']:.2f}</td>
-                 <td>{row['single turn perf']:.3f}</td>
-                 <td>{row['multi turn perf']:.3f}</td>
-             </tr>
-         """
-
-     info_html += """
-             </tbody>
-         </table>
-     </div>
-     """
-
-     return info_html, radar_chart
-
-
  def get_rank_badge(rank):
      """Generate HTML for rank badge with appropriate styling"""
      badge_styles = {
@@ -140,158 +87,3 @@ def get_score_bar(score):
          ">{score:.3f}</span>
      </div>
      """
-
-
- def filter_leaderboard(df, model_type, category, sort_by):
-     filtered_df = df.copy()
-     if model_type != "All":
-         filtered_df = filtered_df[filtered_df["Model Type"].str.strip() == model_type]
-
-     dataset_columns = CATEGORIES.get(category, ["Model Avg"])
-     avg_score = filtered_df[dataset_columns].mean(axis=1)
-     filtered_df["Category Score"] = avg_score
-
-     if sort_by == "Performance":
-         filtered_df = filtered_df.sort_values(by="Category Score", ascending=False)
-     else:
-         filtered_df = filtered_df.sort_values(by="IO Cost", ascending=True)
-
-     filtered_df["Rank"] = range(1, len(filtered_df) + 1)
-     perf_chart = get_performance_chart(filtered_df, category)
-     cost_chart = get_performance_cost_chart(filtered_df, category)
-
-     table_html = f"""
-     <style>
-         .dark-table-container {{
-             max-height: 600px;
-             overflow-y: auto;
-             background: linear-gradient(145deg, #1a1b1e, #1f2023);
-             border-radius: 16px;
-             padding: 1px;
-             margin: 20px 0;
-             box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1),
-                         0 2px 4px -1px rgba(0, 0, 0, 0.06);
-         }}
-
-         .dark-styled-table {{
-             width: 100%;
-             border-collapse: separate;
-             border-spacing: 0;
-             font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
-             background: transparent;
-             color: #ffffff;
-         }}
-
-         .dark-styled-table thead {{
-             position: sticky;
-             top: 0;
-             background: linear-gradient(180deg, #1a1b1e, #1d1e22);
-             z-index: 1;
-         }}
-
-         .dark-styled-table th {{
-             padding: 12px 20px;
-             text-align: left;
-             font-weight: 600;
-             color: #ffffff;
-             text-transform: uppercase;
-             font-size: 0.75em;
-             background: #1a1b1e;
-             letter-spacing: 0.05em;
-             border-bottom: 1px solid #2d2e32;
-         }}
-
-         .dark-styled-table td {{
-             padding: 16px 20px;
-             border-bottom: 1px solid rgba(45, 46, 50, 0.5);
-             color: #ffffff;
-             font-size: 0.95em;
-         }}
-
-         .dark-styled-table tbody tr {{
-             transition: all 0.2s ease;
-             background: transparent;
-         }}
-
-         .dark-styled-table tbody tr:hover {{
-             background: rgba(45, 46, 50, 0.5);
-         }}
-
-         .model-cell {{
-             font-weight: 500;
-             color: #e2e8f0;
-         }}
-
-         .cost-cell {{
-             font-family: 'SF Mono', monospace;
-             color: #94a3b8;
-         }}
-
-         .note-box {{
-             margin: 20px 0;
-             padding: 16px 20px;
-             background: rgba(45, 46, 50, 0.5);
-             border-radius: 12px;
-             color: #94a3b8;
-             font-size: 0.9em;
-             border-left: 4px solid #4f46e5;
-         }}
-
-         /* Custom scrollbar */
-         .dark-table-container::-webkit-scrollbar {{
-             width: 8px;
-         }}
-
-         .dark-table-container::-webkit-scrollbar-track {{
-             background: #1a1b1e;
-             border-radius: 4px;
-         }}
-
-         .dark-table-container::-webkit-scrollbar-thumb {{
-             background: #2d2e32;
-             border-radius: 4px;
-         }}
-
-         .dark-table-container::-webkit-scrollbar-thumb:hover {{
-             background: #3d3e42;
-         }}
-     </style>
-     <div class="dark-table-container">
-         <table class="dark-styled-table">
-             <thead>
-                 <tr>
-                     <th>RANK</th>
-                     <th>MODEL</th>
-                     <th>TYPE</th>
-                     <th>COST (I/O)</th>
-                     <th>SCORE</th>
-                 </tr>
-             </thead>
-             <tbody>
-     """
-
-     for _, row in filtered_df.iterrows():
-         rank_display = get_rank_badge(row["Rank"])
-         type_badge = get_type_badge(row["Model Type"])
-         score_bar = get_score_bar(row["Category Score"])
-
-         table_html += f"""
-             <tr>
-                 <td>{rank_display}</td>
-                 <td class="model-cell">{row['Model']}</td>
-                 <td>{type_badge}</td>
-                 <td class="cost-cell">${row['Input cost per million token']:.2f}/${row['Output cost per million token']:.2f}</td>
-                 <td>{score_bar}</td>
-             </tr>
-         """
-
-     table_html += """
-             </tbody>
-         </table>
-     </div>
-     <div class="note-box">
-         Note: Cost for sorting is calculated using 3:1 ratio on I/O. Cost of Gemini 2.0 is assumed to be same as that of Gemini 1.5.
-     </div>
-     """
-
-     return table_html, perf_chart, cost_chart