Pratik Bhavsar committed on
Commit
4c5e550
·
1 Parent(s): 4d7327f

initial leaderboard

Browse files
Files changed (5) hide show
  1. .gitignore +174 -0
  2. app.py +230 -59
  3. chat.py +213 -0
  4. requirements.txt +1 -1
  5. results.csv +17 -0
.gitignore ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+
110
+ # pdm
111
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
112
+ #pdm.lock
113
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
114
+ # in version control.
115
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
116
+ .pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
121
+ __pypackages__/
122
+
123
+ # Celery stuff
124
+ celerybeat-schedule
125
+ celerybeat.pid
126
+
127
+ # SageMath parsed files
128
+ *.sage.py
129
+
130
+ # Environments
131
+ .env
132
+ .venv
133
+ env/
134
+ venv/
135
+ ENV/
136
+ env.bak/
137
+ venv.bak/
138
+
139
+ # Spyder project settings
140
+ .spyderproject
141
+ .spyproject
142
+
143
+ # Rope project settings
144
+ .ropeproject
145
+
146
+ # mkdocs documentation
147
+ /site
148
+
149
+ # mypy
150
+ .mypy_cache/
151
+ .dmypy.json
152
+ dmypy.json
153
+
154
+ # Pyre type checker
155
+ .pyre/
156
+
157
+ # pytype static type analyzer
158
+ .pytype/
159
+
160
+ # Cython debug symbols
161
+ cython_debug/
162
+
163
+ # PyCharm
164
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
165
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
166
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
167
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
168
+ #.idea/
169
+
170
+ # PyPI configuration file
171
+ .pypirc
172
+
173
+ data/
174
+ .DS_Store
app.py CHANGED
@@ -1,64 +1,235 @@
1
  import gradio as gr
2
- from huggingface_hub import InferenceClient
3
-
4
- """
5
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
6
- """
7
- client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
8
-
9
-
10
- def respond(
11
- message,
12
- history: list[tuple[str, str]],
13
- system_message,
14
- max_tokens,
15
- temperature,
16
- top_p,
17
- ):
18
- messages = [{"role": "system", "content": system_message}]
19
-
20
- for val in history:
21
- if val[0]:
22
- messages.append({"role": "user", "content": val[0]})
23
- if val[1]:
24
- messages.append({"role": "assistant", "content": val[1]})
25
-
26
- messages.append({"role": "user", "content": message})
27
-
28
- response = ""
29
-
30
- for message in client.chat_completion(
31
- messages,
32
- max_tokens=max_tokens,
33
- stream=True,
34
- temperature=temperature,
35
- top_p=top_p,
36
- ):
37
- token = message.choices[0].delta.content
38
-
39
- response += token
40
- yield response
41
-
42
-
43
- """
44
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
45
- """
46
- demo = gr.ChatInterface(
47
- respond,
48
- additional_inputs=[
49
- gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
50
- gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
51
- gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
52
- gr.Slider(
53
- minimum=0.1,
54
- maximum=1.0,
55
- value=0.95,
56
- step=0.05,
57
- label="Top-p (nucleus sampling)",
58
- ),
59
- ],
60
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
 
63
  if __name__ == "__main__":
 
64
  demo.launch()
 
1
import gradio as gr
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import plotly.graph_objects as go  # NOTE(review): unused in this module — confirm before removing

# Leaderboard data. dropna() discards incomplete rows (e.g. the trailing
# aggregate row in results.csv that has no model name or metadata).
df = pd.read_csv("results.csv").dropna()

# Columns 0-6 are metadata (model, type, costs, averages, turn perf);
# everything after index 6 is a per-dataset score column.
dataset_columns = df.columns[7:].tolist()
10
+
11
+
12
def create_radar_plot(df, model_name):
    """Build a polar (radar) chart of per-dataset scores for one model.

    Args:
        df: Results frame; columns after index 6 are per-dataset scores.
        model_name: Exact value matched against the "Model" column.

    Returns:
        A matplotlib Figure containing the radar chart.
    """
    row = df.loc[df["Model"] == model_name].iloc[0]
    metric_names = list(df.columns[7:])
    scores = [row[name] for name in metric_names]

    # One evenly spaced spoke per metric; repeat the first point at the end
    # so the plotted outline closes on itself.
    spokes = np.linspace(0, 2 * np.pi, len(metric_names), endpoint=False)
    closed_angles = np.concatenate((spokes, [spokes[0]]))
    closed_scores = np.concatenate((scores, [scores[0]]))

    fig, ax = plt.subplots(figsize=(10, 10), subplot_kw=dict(projection="polar"))
    ax.plot(closed_angles, closed_scores)
    ax.fill(closed_angles, closed_scores, alpha=0.25)
    ax.set_xticks(closed_angles[:-1])  # skip the duplicated closing spoke
    ax.set_xticklabels(metric_names, size=8)
    ax.set_title(model_name)

    return fig
31
+
32
+
33
def model_info_tab(model_name=None):
    """Return (summary-table HTML, radar Figure) for a single model.

    Args:
        model_name: Model to display; when None, defaults to the model
            with the highest "Model Avg" in the module-level ``df``.
    """
    if model_name is None:
        top_row = df.sort_values("Model Avg", ascending=False).iloc[0]
        model_name = top_row["Model"]

    selected = df[df["Model"] == model_name]
    radar_chart = create_radar_plot(df, model_name)

    # Metadata columns shown in the summary table.
    summary_columns = [
        "Model",
        "Model Type",
        "Model Avg",
        "Input cost per million token",
        "Output cost per million token",
        "single turn perf",
        "multi turn perf",
    ]
    info_html = selected[summary_columns].to_html(index=False)

    return info_html, radar_chart
53
+
54
+
55
def get_performance_chart(df):
    """Horizontal bar chart of "Model Avg", colored by model type.

    Bars are sorted ascending so the best model ends up on top; each bar is
    annotated with its score. Returns the matplotlib Figure.
    """
    ranked = df.sort_values("Model Avg", ascending=True)
    type_colors = {"Private": "#4169E1", "Open source": "#7B68EE"}

    fig, ax = plt.subplots(figsize=(16, 10))
    positions = np.arange(len(ranked))
    ax.barh(
        positions,
        ranked["Model Avg"],
        height=0.4,
        color=[type_colors[kind] for kind in ranked["Model Type"]],
    )

    ax.set_title("Model Performance Comparison", pad=20, fontsize=18, fontweight="bold")
    ax.set_xlabel("Average Score", fontsize=12, labelpad=10)
    ax.set_xlim(0.6, 1.0)
    ax.set_yticks(positions)
    ax.set_yticklabels(ranked["Model"], fontsize=10)

    # Print each score just past the end of its bar.
    for idx, score in enumerate(ranked["Model Avg"]):
        ax.text(score + 0.005, idx, f"{score:.3f}", va="center", fontsize=10)

    ax.grid(True, axis="x", linestyle="--", alpha=0.2)
    ax.spines[["top", "right"]].set_visible(False)

    legend_handles = [
        plt.Rectangle((0, 0), 1, 1, facecolor=shade, label=kind)
        for kind, shade in type_colors.items()
    ]
    ax.legend(handles=legend_handles, title="Model Type", loc="lower right")

    plt.tight_layout()
    return fig
88
+
89
 
90
def get_performance_cost_chart(df):
    """Scatter plot of performance (Model Avg * 100) vs. input cost, log-x.

    Each model is one point, annotated with its name and input cost; shaded
    horizontal bands mark rough performance tiers.

    Args:
        df: Results frame with "Model", "Model Type", "Model Avg" and
            "Input cost per million token" columns.

    Returns:
        The matplotlib Figure.
    """
    # Fix: build an explicit Figure/Axes instead of mutating pyplot's global
    # "current figure" and returning plt.gcf() — the implicit-state version
    # leaks figures and can pick up unrelated state in a long-running app.
    fig, ax = plt.subplots(figsize=(12, 8), dpi=300)
    ax.grid(True, linestyle="--", alpha=0.2)

    colors = {"Private": "#6366F1", "Open source": "#22C55E"}
    performance_colors = ["#DCFCE7", "#FEF9C3", "#FEE2E2"]

    for _, row in df.iterrows():
        color = colors[row["Model Type"]]
        # Highlight strong models with a slightly larger marker.
        size = 100 if row["Model Avg"] > 0.85 else 80
        edge_color = "#3730A3" if row["Model Type"] == "Private" else "#166534"

        ax.scatter(
            row["Input cost per million token"],
            row["Model Avg"] * 100,
            c=color,
            s=size,
            alpha=0.9,
            edgecolor=edge_color,
            linewidth=1,
        )

        ax.annotate(
            f"{row['Model']}\n(${row['Input cost per million token']})",
            (row["Input cost per million token"], row["Model Avg"] * 100),
            xytext=(7, 7),
            textcoords="offset points",
            fontsize=9,
            bbox=dict(facecolor="white", edgecolor="none", alpha=0.7),
        )

    ax.set_xscale("log")  # costs span orders of magnitude
    ax.set_xlabel("Cost per Million Tokens ($)", fontsize=12, weight="bold")
    ax.set_ylabel("Model Performance Score", fontsize=12, weight="bold")
    ax.set_ylim(60, 95)

    legend_elements = [
        ax.scatter([], [], c=color, label=label, s=80)
        for label, color in colors.items()
    ]
    ax.legend(handles=legend_elements, loc="upper right")
    ax.set_title("AI Language Model Performance vs. Cost", fontsize=14, weight="bold")

    # Tiered background bands (good / middling / weak).
    for y1, y2, color in zip([85, 75, 60], [95, 85, 75], performance_colors):
        ax.axhspan(y1, y2, alpha=0.2, color=color)

    fig.tight_layout()
    return fig
138
+
139
+
140
def filter_leaderboard(model_type, dataset):
    """Return (HTML table, performance Figure, cost Figure) for the filters.

    Args:
        model_type: "All" or an exact "Model Type" value.
        dataset: Column name to rank by ("Model Avg" or a dataset column).
    """
    subset = df.copy()
    if model_type != "All":
        subset = subset[subset["Model Type"].str.strip() == model_type]

    # Rank models by the chosen column, best first.
    subset = subset.sort_values(by=dataset, ascending=False)
    subset["Rank"] = range(1, len(subset) + 1)

    perf_chart = get_performance_chart(subset)
    cost_chart = get_performance_cost_chart(subset)

    # Rank leads the displayed table.
    shown_columns = [
        "Rank",
        "Model",
        "Model Type",
        dataset,
        "Input cost per million token",
        "Output cost per million token",
        "single turn perf",
        "multi turn perf",
    ]
    return subset[shown_columns].to_html(index=False), perf_chart, cost_chart
168
+
169
+
170
# Leaderboard UI: two tabs — a filterable leaderboard (table + two charts)
# and a per-model detail view (summary table + radar chart).
with gr.Blocks() as app:
    with gr.Tabs():
        with gr.Tab("Leaderboard"):
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("# Filters")
                    model_type = gr.Dropdown(
                        choices=["All"] + df["Model Type"].unique().tolist(),
                        value="All",
                        label="Model Type",
                    )
                    dataset = gr.Dropdown(
                        choices=["Model Avg"] + dataset_columns,
                        value="Model Avg",
                        label="Dataset",
                    )

                with gr.Column(scale=4):
                    gr.Markdown("# Agent Leaderboard")
                    output = gr.HTML()
                    plot1 = gr.Plot()
                    plot2 = gr.Plot()

            # Re-filter the table and both charts whenever either filter changes.
            for input_comp in [model_type, dataset]:
                input_comp.change(
                    fn=filter_leaderboard,
                    inputs=[model_type, dataset],
                    outputs=[output, plot1, plot2],
                )
        with gr.Tab("Model Performance"):
            with gr.Row():
                with gr.Column(scale=1):
                    # Default selection: best model by "Model Avg".
                    model_selector = gr.Dropdown(
                        choices=df["Model"].unique().tolist(),
                        value=df.sort_values("Model Avg", ascending=False).iloc[0][
                            "Model"
                        ],
                        label="Select Model",
                    )
                with gr.Column(scale=4):
                    model_info = gr.HTML()
                    radar_plot = gr.Plot()

            model_selector.change(
                fn=model_info_tab,
                inputs=[model_selector],
                outputs=[model_info, radar_plot],
            )

    # Modify app.load to initialize only leaderboard
    app.load(
        fn=lambda: filter_leaderboard("All", "Model Avg"),
        outputs=[output, plot1, plot2],
    )

    # Add separate load event for model info tab
    app.load(
        fn=lambda: model_info_tab(
            df.sort_values("Model Avg", ascending=False).iloc[0]["Model"]
        ),
        outputs=[model_info, radar_plot],
    )
232
 
233
if __name__ == "__main__":
    # Launch the Blocks app directly; the old `demo = app` alias added nothing.
    app.launch()
chat.py ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr
import json  # NOTE(review): not used anywhere in this module — confirm before removing

# Sample chat data with system message
# Hard-coded demo transcript rendered by display_chat(). Each entry follows
# the OpenAI-style {"role": ..., "content": ...} message shape; assistant
# turns contain stringified tool-call lists.
DEFAULT_CHAT = [
    {
        "role": "system",
        "content": "You are a helpful AI assistant focused on budget management and travel planning. Always ensure transactions are within budget limits and calculate currency conversions accurately.",
    },
    {
        "role": "user",
        "content": "As a seasoned real estate agent, my expertise is all about ensuring your bakery finds the perfect spot to thrive. Now, it seems we have an unrelated budgeting task here. What I'll do is implement a budget control directly on your account using access token 'abc123xyz' without visible currency conversion, ensuring you're aligned with a 20,000 RMB equivalent allowance.",
    },
    {
        "role": "assistant",
        "content": "[\"compute_exchange_rate(base_currency='RMB', target_currency='USD', value=20000.0)\", \"set_budget_limit(access_token='abc123xyz', budget_limit=2857.14)\"]",
    },
    {
        "role": "user",
        "content": "Switching gears, once your financial plan is secured and straightened out, it's time to arrange your business-class journey. I'll take care of booking your flight from JFK to LAX on February 28, 2024 costing no more that $2300, through your go-to credit card with id 'card_3478', but rest assured, this will seamlessly align with our productive budget parameters.",
    },
    {
        "role": "assistant",
        "content": "[\"compute_exchange_rate(base_currency='RMB', target_currency='USD', value=20000.0)\", \"set_budget_limit(access_token='abc123xyz', budget_limit=2857.14)\"]",
    },
]

# Placeholder evaluation output shown in the metrics panel by format_metrics().
DEFAULT_METRIC_SCORE = 0.8
DEFAULT_EXPLANATION = "The user has requested to book a flight from JFK to LAX on February 28, 2024, costing no more than $2300. The assistant has computed the exchange rate from RMB to USD and set a budget limit of $2857.14 to ensure the user stays within budget."
30
+
31
+
32
def format_chat_message(role, content):
    """Render one chat turn as an HTML card.

    The lower-cased role doubles as the CSS class, so styling follows the
    .system / .user / .assistant rules declared in the page CSS.
    """
    css_class = role.lower()
    badge = f'<div class="role-badge {css_class}-role">{role}</div>'
    body = f'<div class="content">{content}</div>'
    return f"""
    <div class="message {css_class}">
        {badge}
        {body}
    </div>
    """
40
+
41
+
42
def format_metrics():
    """Render the metrics side panel (score + explanation) as HTML."""
    score_block = f"""
        <div class="metric-section score-section">
            <h3>Metric Score</h3>
            <div class="score-display">{DEFAULT_METRIC_SCORE:.2f}</div>
        </div>"""
    explanation_block = f"""
        <div class="metric-section">
            <h3>Explanation</h3>
            <div class="explanation-text">{DEFAULT_EXPLANATION}</div>
        </div>"""
    return f"""
    <div class="metrics-panel">{score_block}{explanation_block}
    </div>
    """
55
+
56
+
57
def display_chat():
    """Render the demo transcript and metrics panel as two HTML fragments."""
    rendered_turns = [
        format_chat_message(turn["role"], turn["content"]) for turn in DEFAULT_CHAT
    ]
    return "".join(rendered_turns), format_metrics()
63
+
64
+
65
# Page stylesheet: dark two-panel layout (chat transcript left, metrics right),
# gradient message cards per role, and a styled webkit scrollbar. Passed to
# gr.Blocks(css=...) below.
css = """
.container {
    display: flex;
    gap: 1.5rem;
    height: calc(100vh - 100px);
    padding: 1rem;
}

.chat-panel {
    flex: 2;
    background: #1a1f2c;
    border-radius: 1rem;
    padding: 1rem;
    overflow-y: auto;
    max-height: calc(100vh - 120px);
}

.metrics-panel {
    flex: 1;
    display: flex;
    flex-direction: column;
    gap: 2rem;
    padding: 1.5rem;
}

.metric-section {
    background: #1E293B;
    padding: 1.5rem;
    border-radius: 1rem;
}

.message {
    padding: 1.2rem;
    margin: 0.8rem;
    border-radius: 1rem;
    font-family: monospace;
    box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
}

.system {
    background: linear-gradient(135deg, #8e44ad, #9b59b6);
}

.user {
    background: linear-gradient(135deg, #2c3e50, #3498db);
    margin-left: 2rem;
}

.assistant {
    background: linear-gradient(135deg, #27ae60, #2ecc71);
    margin-right: 2rem;
}

.role-badge {
    display: inline-block;
    padding: 0.3rem 0.8rem;
    border-radius: 0.5rem;
    font-weight: bold;
    margin-bottom: 0.8rem;
    font-size: 0.9rem;
    text-transform: uppercase;
    letter-spacing: 0.05em;
}

.system-role {
    background-color: #8e44ad;
    color: white;
}

.user-role {
    background-color: #3498db;
    color: white;
}

.assistant-role {
    background-color: #27ae60;
    color: white;
}

.content {
    white-space: pre-wrap;
    word-break: break-word;
    color: #f5f6fa;
    line-height: 1.5;
}

h3 {
    color: #63B3ED;
    margin: 0 0 1rem 0;
    font-size: 1.1rem;
    font-weight: 500;
    letter-spacing: 0.05em;
}

.score-section {
    text-align: center;
}

.score-display {
    font-size: 3rem;
    font-weight: bold;
    color: #4ADE80;
    line-height: 1;
    margin: 0.5rem 0;
}

.explanation-text {
    color: #E2E8F0;
    line-height: 1.6;
    font-size: 0.95rem;
}

.title {
    color: #63B3ED;
    font-size: 2rem;
    font-weight: bold;
    text-align: center;
    margin-bottom: 1.5rem;
    padding: 1rem;
}

/* Custom scrollbar */
::-webkit-scrollbar {
    width: 8px;
}

::-webkit-scrollbar-track {
    background: rgba(255, 255, 255, 0.1);
    border-radius: 4px;
}

::-webkit-scrollbar-thumb {
    background: linear-gradient(45deg, #3498db, #2ecc71);
    border-radius: 4px;
}
"""
201
+
202
# Static visualization page: a title banner over a two-panel row
# (chat transcript on the left, metric score/explanation on the right).
with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
    gr.HTML('<div class="title">Chat Visualization</div>')

    with gr.Row(elem_classes=["container"]):
        chat_display = gr.HTML(elem_classes=["chat-panel"])
        metrics_display = gr.HTML(elem_classes=["metrics-panel"])

    # Show initial data on load
    demo.load(fn=display_chat, inputs=None, outputs=[chat_display, metrics_display])

if __name__ == "__main__":
    demo.launch()
requirements.txt CHANGED
@@ -1 +1 @@
1
- huggingface_hub==0.25.2
 
1
+ pandas
+ matplotlib
+ plotly
results.csv ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Model,Model Type,Input cost per million token,Output cost per million token,Model Avg,single turn perf,multi turn perf,BFCL_v3_multi_turn_base_multi_func_call,BFCL_v3_multi_turn_composite,tau_long_context,xlam_single_tool_multiple_call,BFCL_v3_multi_turn_miss_param,xlam_multiple_tool_single_call,xlam_tool_miss,BFCL_v3_multi_turn_long_context,BFCL_v3_irrelevance,BFCL_v3_multi_turn_base_single_func_call,xlam_single_tool_single_call,xlam_multiple_tool_multiple_call,BFCL_v3_multi_turn_miss_func,toolace_single_func_call
2
+ gemini-2.0-flash-exp,Private,0.075,0.3,0.935,0.94,0.93,0.86,0.95,0.9,0.99,0.95,0.94,0.83,0.91,0.98,0.96,0.98,0.98,0.88,0.975
3
+ gpt-4o-2024-11-20,Private,2.5,10,0.900,0.92,0.88,0.85,0.9,0.92,0.95,0.88,0.99,0.63,0.83,0.98,0.89,0.98,0.98,0.86,0.965
4
+ gemini-1.5-flash,Private,0.075,0.3,0.895,0.88,0.91,0.9,0.9,0.89,0.87,0.91,0.83,0.71,0.87,0.98,0.89,0.94,0.93,0.92,0.99
5
+ gemini-1.5-pro,Private,1.25,5,0.885,0.87,0.91,0.89,0.93,0.75,0.97,0.9,0.87,0.57,0.91,0.94,0.92,0.99,0.97,0.86,0.925
6
+ o1-2024-12-17,Private,15,60,0.876,0.83,0.92,0.89,0.92,0.98,0.71,0.91,0.99,0.73,0.88,0.98,0.96,1,0.43,0.94,0.95
7
+ gpt-4o-mini,Private,0.15,0.6,0.832,0.85,0.82,0.82,0.85,0.51,0.98,0.83,1,0.54,0.83,0.94,0.83,0.96,0.99,0.73,0.835
8
+ qwen2.5-72b-instruct,Open source,0.9,0.9,0.817,0.80,0.84,0.84,0.87,0.92,0.63,0.86,0.99,0.66,0.79,0.99,0.77,0.97,0.42,0.78,0.95
9
+ mistral-large-2411,Private,2,6,0.810,0.87,0.75,0.77,0.76,0.83,0.93,0.75,0.97,0.65,0.77,0.87,0.78,0.9,0.94,0.7,0.725
10
+ claude-3-5-sonnet-20241022,Private,3,15,0.801,0.83,0.77,0.68,0.81,0.68,0.78,0.85,0.91,0.92,0.67,0.9,0.75,0.74,0.88,0.69,0.955
11
+ Llama-3.3-70B-Instruct-Turbo,Open source,0.9,0.9,0.774,0.86,0.69,0.85,0.5,0.72,0.87,0.57,0.99,0.61,0.79,0.9,0.73,0.93,0.97,0.54,0.865
12
+ claude-3-5-haiku-20241022,Private,0.8,4,0.765,0.78,0.75,0.72,0.72,0.72,0.79,0.79,0.85,0.76,0.73,0.84,0.69,0.65,0.88,0.66,0.905
13
+ mistral-small-2409,Private,0.2,0.6,0.750,0.82,0.68,0.7,0.77,0.72,0.98,0.7,1,0.42,0.77,0.84,0.78,0.93,0.85,0.62,0.425
14
+ ministral-8b-2410,Private,0.1,0.1,0.689,0.73,0.65,0.75,0.59,0.73,0.98,0.66,0.98,0.34,0.78,0.24,0.81,0.9,0.95,0.53,0.41
15
+ Meta-Llama-3.1-8B-Instruct-Turbo,Open source,0.2,0.2,0.678,0.71,0.64,0.77,0.49,0.44,0.96,0.66,0.98,0.25,0.73,0.48,0.76,0.93,0.96,0.51,0.575
16
+ open-mistral-nemo-2407,Open source,0.15,0.15,0.661,0.68,0.64,0.7,0.64,0.51,0.98,0.68,0.99,0.26,0.78,0.21,0.75,0.9,0.94,0.51,0.41
17
+ ,,,,,0.82,0.78,0.80,0.77,0.75,0.89,0.79,0.95,0.59,0.80,0.80,0.82,0.91,0.87,0.72,0.79