meghsn committed on
Commit
8627a70
·
1 Parent(s): cc74085

Init leaderboard

Browse files
Files changed (44) hide show
  1. Dockerfile +12 -0
  2. README.md +5 -6
  3. app.py +288 -0
  4. requirements.txt +6 -0
  5. results/Bgym-GPT-3.5/README.md +1 -0
  6. results/Bgym-GPT-3.5/config.json +4 -0
  7. results/Bgym-GPT-3.5/miniwob.json +16 -0
  8. results/Bgym-GPT-3.5/results.json +53 -0
  9. results/Bgym-GPT-3.5/webarena.json +16 -0
  10. results/Bgym-GPT-3.5/workarena++-l2.json +16 -0
  11. results/Bgym-GPT-3.5/workarena++-l3.json +16 -0
  12. results/Bgym-GPT-3.5/workarena-l1.json +44 -0
  13. results/Bgym-GPT-4o-V/README.md +1 -0
  14. results/Bgym-GPT-4o-V/config.json +4 -0
  15. results/Bgym-GPT-4o-V/miniwob.json +16 -0
  16. results/Bgym-GPT-4o-V/results.json +52 -0
  17. results/Bgym-GPT-4o-V/webarena.json +16 -0
  18. results/Bgym-GPT-4o-V/workarena++-l2.json +16 -0
  19. results/Bgym-GPT-4o-V/workarena++-l3.json +16 -0
  20. results/Bgym-GPT-4o-V/workarena-l1.json +16 -0
  21. results/Bgym-GPT-4o/README.md +1 -0
  22. results/Bgym-GPT-4o/config.json +4 -0
  23. results/Bgym-GPT-4o/miniwob.json +16 -0
  24. results/Bgym-GPT-4o/results.json +52 -0
  25. results/Bgym-GPT-4o/webarena.json +16 -0
  26. results/Bgym-GPT-4o/workarena++-l2.json +16 -0
  27. results/Bgym-GPT-4o/workarena++-l3.json +16 -0
  28. results/Bgym-GPT-4o/workarena-l1.json +16 -0
  29. results/Bgym-Llama-3-70b/README.md +1 -0
  30. results/Bgym-Llama-3-70b/config.json +4 -0
  31. results/Bgym-Llama-3-70b/miniwob.json +16 -0
  32. results/Bgym-Llama-3-70b/results.json +52 -0
  33. results/Bgym-Llama-3-70b/webarena.json +16 -0
  34. results/Bgym-Llama-3-70b/workarena++-l2.json +16 -0
  35. results/Bgym-Llama-3-70b/workarena++-l3.json +16 -0
  36. results/Bgym-Llama-3-70b/workarena-l1.json +58 -0
  37. results/Bgym-Mixtral-8x22b/README.md +1 -0
  38. results/Bgym-Mixtral-8x22b/config.json +4 -0
  39. results/Bgym-Mixtral-8x22b/miniwob.json +16 -0
  40. results/Bgym-Mixtral-8x22b/results.json +52 -0
  41. results/Bgym-Mixtral-8x22b/webarena.json +16 -0
  42. results/Bgym-Mixtral-8x22b/workarena++-l2.json +16 -0
  43. results/Bgym-Mixtral-8x22b/workarena++-l3.json +16 -0
  44. results/Bgym-Mixtral-8x22b/workarena-l1.json +44 -0
Dockerfile ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.9

WORKDIR /code

# Install dependencies before copying the application so this layer stays
# cached when only app.py or the results change.
COPY ./requirements.txt /code/requirements.txt
RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

# Application code and leaderboard data.
COPY ./app.py /code/app.py
# NOTE(review): no top-level results.json is added by this commit and app.py
# only reads the results/ directory -- confirm the file exists in the repo,
# otherwise this COPY fails the build and should be dropped.
COPY ./results.json /code/results.json
COPY ./results /code/results

# Port 7860 on all interfaces, as passed to Streamlit below.
CMD ["streamlit", "run", "/code/app.py", "--server.address", "0.0.0.0", "--server.port", "7860"]
README.md CHANGED
@@ -1,12 +1,11 @@
1
  ---
2
- title: Webagents Leaderboard
3
- emoji: 👁
4
- colorFrom: blue
5
- colorTo: gray
6
  sdk: docker
7
  pinned: false
8
- license: apache-2.0
9
- short_description: Leaderboard to track the progress of agents on web tasks
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: WebAgent Leaderboard
3
+ emoji: 🐠
4
+ colorFrom: purple
5
+ colorTo: green
6
  sdk: docker
7
  pinned: false
8
+ license: mit
 
9
  ---
10
 
11
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,288 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import re
3
+ import os
4
+ import streamlit as st
5
+ import requests
6
+ import pandas as pd
7
+ from io import StringIO
8
+ import plotly.graph_objs as go
9
+ from huggingface_hub import HfApi
10
+ from huggingface_hub.utils import RepositoryNotFoundError, RevisionNotFoundError
11
+ import streamlit.components.v1 as components
12
+
13
# Benchmarks shown on the leaderboard. The order defines both the column
# order of the main table and the order of the per-benchmark tabs.
BENCHMARKS = [
    "WebArena",
    "WorkArena-L1",
    "WorkArena++-L2",
    "WorkArena++-L3",
    "MiniWoB",
]
15
+
16
def create_html_table_main(df, benchmarks):
    """Render the main leaderboard as an HTML table with user-selectable sorting.

    Draws a "Sort by" select box and an "Order" radio group with Streamlit,
    sorts ``df`` by the chosen column and returns the table markup.

    Args:
        df: Leaderboard DataFrame. Cell values are inserted into the markup
            verbatim (the caller in this file passes ready-made ``<a>`` tags
            in the agent column), so they must already be safe HTML.
        benchmarks: Unused; kept so existing callers keep working.

    Returns:
        str: A ``<style>`` block followed by the ``<table>`` markup.
    """
    col1, col2 = st.columns([2, 6])
    with col1:
        sort_column = st.selectbox("Sort by", df.columns.tolist())
    with col2:
        sort_order = st.radio("Order", ["Ascending", "Descending"], horizontal=True)

    def _numeric_when_possible(column):
        # Score columns mix floats with the "-" placeholder; comparing those
        # directly raises TypeError. Sort on the numeric view instead and let
        # the placeholders become NaN. Purely textual columns sort as-is.
        numeric = pd.to_numeric(column, errors="coerce")
        return column if numeric.isna().all() else numeric

    # selectbox returns None when the frame has no columns to choose from.
    if sort_column is not None:
        df = df.sort_values(
            by=sort_column,
            ascending=(sort_order == "Ascending"),
            key=_numeric_when_possible,
            na_position="last",
        )

    style = (
        "<style>\n"
        "table {\n"
        "width: 100%;\n"
        "border-collapse: collapse;\n"
        "}\n"
        "th, td {\n"
        "border: 1px solid #ddd;\n"
        "padding: 8px;\n"
        "text-align: center;\n"
        "}\n"
        "th {\n"
        "font-weight: bold;\n"
        "}\n"
        ".table-container {\n"
        "padding-bottom: 20px;\n"
        "}\n"
        "</style>\n"
    )

    # Collect fragments and join once instead of repeated string concatenation.
    parts = [style, '<div class="table-container">', "<table>", "<thead><tr>"]
    parts.extend(f"<th>{column}</th>" for column in df.columns)
    parts.append("</tr></thead>")
    parts.append("<tbody>")
    for _, row in df.iterrows():
        parts.append("<tr>")
        parts.extend(f"<td>{row[col]}</td>" for col in df.columns)
        parts.append("</tr>")
    parts.append("</tbody></table>")
    parts.append("</div>")
    return "".join(parts)
64
+
65
def create_html_table_benchmark(df, benchmarks):
    """Render a per-benchmark results table as an HTML string.

    Every column of ``df`` becomes a table column except ``"Reproduced_all"``,
    which is hidden. When a row's ``"Reproduced"`` value is not ``"-"``, that
    cell becomes a collapsible ``<details>`` element whose body lists the
    row's ``"Reproduced_all"`` items, one per line.

    All header and cell text is HTML-escaped, because the result is rendered
    with ``unsafe_allow_html=True`` and values such as comments come straight
    from the result files.

    Args:
        df: DataFrame with one row per agent. A ``"Reproduced_all"`` column
            (iterable per row) is required whenever a ``"Reproduced"`` value
            differs from ``"-"``.
        benchmarks: Unused; kept so existing callers keep working.

    Returns:
        str: A ``<style>`` block followed by the ``<table>`` markup.
    """
    # Imported locally: the original body used ``html`` as a variable name,
    # which would shadow a module-level ``import html``.
    from html import escape

    style = (
        "<style>\n"
        "table {\n"
        "width: 100%;\n"
        "border-collapse: collapse;\n"
        "}\n"
        "th, td {\n"
        "border: 1px solid #ddd;\n"
        "padding: 8px;\n"
        "text-align: center;\n"
        "}\n"
        "th {\n"
        "font-weight: bold;\n"
        "}\n"
        ".table-container {\n"
        "padding-bottom: 20px;\n"
        "}\n"
        "</style>\n"
    )

    visible_columns = [column for column in df.columns if column != "Reproduced_all"]

    # Collect fragments and join once instead of repeated string concatenation.
    parts = [style, '<div class="table-container">', "<table>", "<thead><tr>"]
    parts.extend(f"<th>{escape(str(column))}</th>" for column in visible_columns)
    parts.append("</tr></thead>")
    parts.append("<tbody>")
    for _, row in df.iterrows():
        parts.append("<tr>")
        for column in visible_columns:
            cell = escape(str(row[column]))
            if column == "Reproduced" and row[column] != "-":
                # Summary shows the score range; expanding reveals each run.
                runs = "<br>".join(escape(str(item)) for item in row["Reproduced_all"])
                parts.append(
                    f"<td><details><summary>{cell}</summary>{runs}</details></td>"
                )
            else:
                parts.append(f"<td>{cell}</td>")
        parts.append("</tr>")
    parts.append("</tbody></table>")
    parts.append("</div>")
    return "".join(parts)
110
+
111
def check_sanity(agent):
    """Validate the result files stored for ``agent``.

    For every benchmark in ``BENCHMARKS`` the file
    ``results/<agent>/<benchmark lower-cased>.json`` is checked if it exists;
    missing files are skipped. A file is valid when it holds a JSON list of
    objects that each contain all required keys, name this agent and this
    benchmark, and exactly one of them is marked ``"Original"``.

    Args:
        agent: Name of the agent's directory under ``results/``.

    Returns:
        bool: ``True`` when every existing file is valid, ``False`` otherwise
        (including unreadable or malformed JSON).
    """
    required_keys = (
        "agent_name",
        "benchmark",
        "original_or_reproduced",
        "score",
        "std_err",
        "benchmark_specific",
        "benchmark_tuned",
        "followed_evaluation_protocol",
        "reproducible",
        "comments",
        "study_id",
        "date_time",
    )
    for benchmark in BENCHMARKS:
        file_path = f"results/{agent}/{benchmark.lower()}.json"
        if not os.path.exists(file_path):
            continue
        try:
            with open(file_path, encoding="utf-8") as f:
                results = json.load(f)
        except (OSError, ValueError):
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError:
            # a broken file marks the agent invalid instead of crashing.
            return False
        if not isinstance(results, list):
            return False

        original_count = 0
        for result in results:
            if not isinstance(result, dict):
                return False
            if any(key not in result for key in required_keys):
                return False
            if result["agent_name"] != agent or result["benchmark"] != benchmark:
                return False
            if result["original_or_reproduced"] == "Original":
                original_count += 1
        # Each benchmark file needs exactly one original submission.
        if original_count != 1:
            return False
    return True
131
+
132
# Column order of the per-benchmark tables ("Reproduced_all" is hidden by the
# renderer and only feeds the expandable "Reproduced" cell).
_BENCHMARK_COLUMNS = [
    "Agent",
    "Score",
    "Benchmark Specific",
    "Benchmark Tuned",
    "Followed Evaluation Protocol",
    "Reproducible",
    "Comments",
    "Study ID",
    "Date",
    "Reproduced",
    "Reproduced_all",
]


def _load_agent_results():
    """Return ``{agent: [result entries]}`` for every valid agent directory.

    Agents whose files fail ``check_sanity`` are reported with ``st.error``
    and skipped. Benchmark files that do not exist are skipped as well,
    matching what ``check_sanity`` accepts.
    """
    all_results = {}
    for agent in sorted(os.listdir("results")):
        # Ignore stray files (e.g. a README) sitting next to the agent folders.
        if not os.path.isdir(f"results/{agent}"):
            continue
        if not check_sanity(agent):
            st.error(f"Results for {agent} are not in the correct format.")
            continue
        agent_results = []
        for benchmark in BENCHMARKS:
            file_path = f"results/{agent}/{benchmark.lower()}.json"
            if not os.path.exists(file_path):
                continue
            with open(file_path, encoding="utf-8") as f:
                agent_results.extend(json.load(f))
        all_results[agent] = agent_results
    return all_results


def _leaderboard_rows(results):
    """Build one row per agent holding its original score on each benchmark."""
    rows = []
    for agent, entries in results.items():
        row = {"Agent": agent}
        for benchmark in BENCHMARKS:
            # "-" marks a benchmark the agent has no original result for.
            row[benchmark] = next(
                (
                    entry["score"]
                    for entry in entries
                    if entry["benchmark"] == benchmark
                    and entry["original_or_reproduced"] == "Original"
                ),
                "-",
            )
        rows.append(row)
    return rows


def _benchmark_rows(results, benchmark):
    """Build one detail row per agent for a single benchmark.

    The original submission fills the descriptive columns; reproduced runs
    are summarised as "min - max" in "Reproduced" and listed individually
    ("score, date_time") in "Reproduced_all". The outcome does not depend on
    the order of entries in the result files.
    """
    rows = []
    for agent, entries in results.items():
        relevant = [entry for entry in entries if entry["benchmark"] == benchmark]
        original = next(
            (e for e in relevant if e["original_or_reproduced"] == "Original"),
            None,
        )
        reproduced = [e for e in relevant if e["original_or_reproduced"] == "Reproduced"]

        row = {"Agent": agent}
        if original is not None:
            row["Score"] = original["score"]
            row["Benchmark Specific"] = original["benchmark_specific"]
            row["Benchmark Tuned"] = original["benchmark_tuned"]
            row["Followed Evaluation Protocol"] = original["followed_evaluation_protocol"]
            row["Reproducible"] = original["reproducible"]
            row["Comments"] = original["comments"]
            row["Study ID"] = original["study_id"]
            row["Date"] = original["date_time"]
        else:
            for column in _BENCHMARK_COLUMNS[1:-2]:
                row[column] = "-"

        scores = [entry["score"] for entry in reproduced]
        row["Reproduced"] = f"{min(scores)} - {max(scores)}" if scores else "-"
        row["Reproduced_all"] = [
            ", ".join([str(entry["score"]), str(entry["date_time"])])
            for entry in reproduced
        ]
        rows.append(row)
    return rows


def _agent_link(agent_name):
    """Return an HTML link to the agent's README in the leaderboard Space."""
    url = f"https://huggingface.co/spaces/meghsn/WebAgent-Leaderboard/blob/main/results/{agent_name}/README.md"
    return f'<a href="{url}" target="_blank">{agent_name}</a>'


def main():
    """Build the Streamlit page: main leaderboard, one tab per benchmark, About."""
    # Must stay the first Streamlit call; loading results may emit st.error.
    st.set_page_config(page_title="WebAgent Leaderboard", layout="wide")

    all_results = _load_agent_results()

    st.title("🏆 WebAgent Leaderboard")
    st.markdown("Leaderboard to evaluate LLMs, VLMs, and agents on web navigation tasks.")
    tabs = st.tabs(["🏆 WebAgent Leaderboard"] + BENCHMARKS + ["📝 About"])

    with tabs[0]:
        # Explicit columns keep the frame well-formed even with zero agents.
        df = pd.DataFrame(_leaderboard_rows(all_results), columns=["Agent"] + BENCHMARKS)

        # Scores can be floats or the "-" placeholder; sort on a numeric view
        # so mixed columns do not raise TypeError, placeholders last.
        df = df.sort_values(
            by="WebArena",
            ascending=False,
            key=lambda column: pd.to_numeric(column, errors="coerce"),
            na_position="last",
        )

        search_query = st.text_input("Search agents", "", key="search_main")
        if search_query:
            # Literal match: user input such as "(" must not be parsed as a regex.
            df = df[df["Agent"].str.contains(search_query, case=False, regex=False)]

        # Links are added to a display copy only, so the CSV export below
        # keeps plain agent names.
        display_df = df.copy()
        display_df["Agent"] = display_df["Agent"].apply(_agent_link)
        html_table = create_html_table_main(display_df, BENCHMARKS)
        st.markdown(html_table, unsafe_allow_html=True)

        if st.button("Export to CSV", key="export_main"):
            st.download_button(
                label="Download CSV",
                data=df.to_csv(index=False),
                file_name="leaderboard.csv",
                mime="text/csv",
                key="download-csv",
                help="Click to download the CSV file",
            )

    with tabs[-1]:
        st.markdown("### Leaderboard to evaluate LLMs, VLMs, and agents on web navigation tasks.")

    # tabs[0] is the main leaderboard and tabs[-1] is "About"; the benchmark
    # tabs sit in between, in BENCHMARKS order.
    for i, benchmark in enumerate(BENCHMARKS, start=1):
        with tabs[i]:
            benchmark_df = pd.DataFrame(
                _benchmark_rows(all_results, benchmark),
                columns=_BENCHMARK_COLUMNS,
            )
            html_table = create_html_table_benchmark(benchmark_df, BENCHMARKS)
            st.markdown(html_table, unsafe_allow_html=True)


if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ streamlit==1.23
2
+ pandas
3
+ requests
4
+ plotly
5
+ gistyc
6
+ huggingface_hub
results/Bgym-GPT-3.5/README.md ADDED
@@ -0,0 +1 @@
 
 
1
+ ## GPT-3.5 model
results/Bgym-GPT-3.5/config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "agent_name": "GPT-3.5",
3
+ "backend_llm": "GPT-3.5"
4
+ }
results/Bgym-GPT-3.5/miniwob.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-GPT-3.5",
4
+ "study_id": "study_id",
5
+ "date_time": "2021-01-01 12:00:00",
6
+ "benchmark": "MiniWoB",
7
+ "score": 43.4,
8
+ "std_err": 0.1,
9
+ "benchmark_specific": "No",
10
+ "benchmark_tuned": "No",
11
+ "followed_evaluation_protocol": "Yes",
12
+ "reproducible": "Yes",
13
+ "comments": "NA",
14
+ "original_or_reproduced": "Original"
15
+ }
16
+ ]
results/Bgym-GPT-3.5/results.json ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "benchmark": "WorkArena-L1",
4
+ "score": 6.1,
5
+ "std_err": 0.3,
6
+ "benchmark_specific": "No",
7
+ "benchmark_tuned": "No",
8
+ "followed_evaluation_protocol": "Yes",
9
+ "reproducible": "Yes",
10
+ "reproduced": [["aug 2025", 0.65, 0.05, "study_id"]],
11
+ "comments": "NA"
12
+ },
13
+ {
14
+ "benchmark": "WorkArena++-L2",
15
+ "score": 0.0,
16
+ "std_err": 0.0,
17
+ "benchmark_specific": "No",
18
+ "benchmark_tuned": "No",
19
+ "followed_evaluation_protocol": "Yes",
20
+ "reproducible": "Yes",
21
+ "comments": "NA"
22
+ },
23
+ {
24
+ "benchmark": "WorkArena++-L3",
25
+ "score": 0.0,
26
+ "std_err": 0.0,
27
+ "benchmark_specific": "No",
28
+ "benchmark_tuned": "No",
29
+ "followed_evaluation_protocol": "Yes",
30
+ "reproducible": "Yes",
31
+ "comments": "NA"
32
+ },
33
+ {
34
+ "benchmark": "MiniWoB",
35
+ "score": 43.4,
36
+ "std_err": 0.1,
37
+ "benchmark_specific": "No",
38
+ "benchmark_tuned": "No",
39
+ "followed_evaluation_protocol": "Yes",
40
+ "reproducible": "Yes",
41
+ "comments": "NA"
42
+ },
43
+ {
44
+ "benchmark": "WebArena",
45
+ "score": 6.7,
46
+ "std_err": 0.2,
47
+ "benchmark_specific": "No",
48
+ "benchmark_tuned": "No",
49
+ "followed_evaluation_protocol": "Yes",
50
+ "reproducible": "Yes",
51
+ "comments": "NA"
52
+ }
53
+ ]
results/Bgym-GPT-3.5/webarena.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-GPT-3.5",
4
+ "study_id": "study_id",
5
+ "date_time": "2021-01-01 12:00:00",
6
+ "benchmark": "WebArena",
7
+ "score": 6.7,
8
+ "std_err": 0.2,
9
+ "benchmark_specific": "No",
10
+ "benchmark_tuned": "No",
11
+ "followed_evaluation_protocol": "Yes",
12
+ "reproducible": "Yes",
13
+ "comments": "NA",
14
+ "original_or_reproduced": "Original"
15
+ }
16
+ ]
results/Bgym-GPT-3.5/workarena++-l2.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-GPT-3.5",
4
+ "study_id": "study_id",
5
+ "date_time": "2021-01-01 12:00:00",
6
+ "benchmark": "WorkArena++-L2",
7
+ "score": 0.0,
8
+ "std_err": 0.0,
9
+ "benchmark_specific": "No",
10
+ "benchmark_tuned": "No",
11
+ "followed_evaluation_protocol": "Yes",
12
+ "reproducible": "Yes",
13
+ "comments": "NA",
14
+ "original_or_reproduced": "Original"
15
+ }
16
+ ]
results/Bgym-GPT-3.5/workarena++-l3.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-GPT-3.5",
4
+ "study_id": "study_id",
5
+ "date_time": "2021-01-01 12:00:00",
6
+ "benchmark": "WorkArena++-L3",
7
+ "score": 0.0,
8
+ "std_err": 0.0,
9
+ "benchmark_specific": "No",
10
+ "benchmark_tuned": "No",
11
+ "followed_evaluation_protocol": "Yes",
12
+ "reproducible": "Yes",
13
+ "comments": "NA",
14
+ "original_or_reproduced": "Original"
15
+ }
16
+ ]
results/Bgym-GPT-3.5/workarena-l1.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-GPT-3.5",
4
+ "study_id": "study_id",
5
+ "date_time": "2021-01-01 12:00:00",
6
+ "benchmark": "WorkArena-L1",
7
+ "score": 6.1,
8
+ "std_err": 0.3,
9
+ "benchmark_specific": "No",
10
+ "benchmark_tuned": "No",
11
+ "followed_evaluation_protocol": "Yes",
12
+ "reproducible": "Yes",
13
+ "comments": "NA",
14
+ "original_or_reproduced": "Original"
15
+ },
16
+ {
17
+ "agent_name": "Bgym-GPT-3.5",
18
+ "study_id": "study_id",
19
+ "benchmark": "WorkArena-L1",
20
+ "score": 5.7,
21
+ "std_err": 0.3,
22
+ "benchmark_specific": "No",
23
+ "benchmark_tuned": "No",
24
+ "followed_evaluation_protocol": "Yes",
25
+ "reproducible": "Yes",
26
+ "comments": "NA",
27
+ "original_or_reproduced": "Reproduced",
28
+ "date_time": "2021-01-04 12:06:00"
29
+ },
30
+ {
31
+ "benchmark": "WorkArena-L1",
32
+ "agent_name": "Bgym-GPT-3.5",
33
+ "study_id": "study_id",
34
+ "score": 5.1,
35
+ "std_err": 0.3,
36
+ "benchmark_specific": "No",
37
+ "benchmark_tuned": "No",
38
+ "followed_evaluation_protocol": "Yes",
39
+ "reproducible": "Yes",
40
+ "comments": "NA",
41
+ "original_or_reproduced": "Reproduced",
42
+ "date_time": "2021-01-04 12:06:00"
43
+ }
44
+ ]
results/Bgym-GPT-4o-V/README.md ADDED
@@ -0,0 +1 @@
 
 
1
+ ## GPT-4o-V model
results/Bgym-GPT-4o-V/config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "agent_name": "GPT-4o-V",
3
+ "backend_llm": "GPT-4o-V"
4
+ }
results/Bgym-GPT-4o-V/miniwob.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-GPT-4o-V",
4
+ "study_id": "study_id",
5
+ "date_time": "2021-01-01 12:00:00",
6
+ "benchmark": "MiniWoB",
7
+ "score": 72.5,
8
+ "std_err": 0.5,
9
+ "benchmark_specific": "No",
10
+ "benchmark_tuned": "No",
11
+ "followed_evaluation_protocol": "Yes",
12
+ "reproducible": "Yes",
13
+ "comments": "NA",
14
+ "original_or_reproduced": "Original"
15
+ }
16
+ ]
results/Bgym-GPT-4o-V/results.json ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "benchmark": "WorkArena-L1",
4
+ "score": 41.8,
5
+ "std_err": 0.4,
6
+ "benchmark_specific": "No",
7
+ "benchmark_tuned": "No",
8
+ "followed_evaluation_protocol": "Yes",
9
+ "reproducible": "Yes",
10
+ "comments": "NA"
11
+ },
12
+ {
13
+ "benchmark": "WorkArena++-L2",
14
+ "score": 3.8,
15
+ "std_err": 0.6,
16
+ "benchmark_specific": "No",
17
+ "benchmark_tuned": "No",
18
+ "followed_evaluation_protocol": "Yes",
19
+ "reproducible": "Yes",
20
+ "comments": "NA"
21
+ },
22
+ {
23
+ "benchmark": "WorkArena++-L3",
24
+ "score": 0.0,
25
+ "std_err": 0.0,
26
+ "benchmark_specific": "No",
27
+ "benchmark_tuned": "No",
28
+ "followed_evaluation_protocol": "Yes",
29
+ "reproducible": "Yes",
30
+ "comments": "NA"
31
+ },
32
+ {
33
+ "benchmark": "MiniWoB",
34
+ "score": 72.5,
35
+ "std_err": 0.5,
36
+ "benchmark_specific": "No",
37
+ "benchmark_tuned": "No",
38
+ "followed_evaluation_protocol": "Yes",
39
+ "reproducible": "Yes",
40
+ "comments": "NA"
41
+ },
42
+ {
43
+ "benchmark": "WebArena",
44
+ "score": 24.0,
45
+ "std_err": 0.4,
46
+ "benchmark_specific": "No",
47
+ "benchmark_tuned": "No",
48
+ "followed_evaluation_protocol": "Yes",
49
+ "reproducible": "Yes",
50
+ "comments": "NA"
51
+ }
52
+ ]
results/Bgym-GPT-4o-V/webarena.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-GPT-4o-V",
4
+ "study_id": "study_id",
5
+ "date_time": "2021-01-01 12:00:00",
6
+ "benchmark": "WebArena",
7
+ "score": 24.0,
8
+ "std_err": 0.4,
9
+ "benchmark_specific": "No",
10
+ "benchmark_tuned": "No",
11
+ "followed_evaluation_protocol": "Yes",
12
+ "reproducible": "Yes",
13
+ "comments": "NA",
14
+ "original_or_reproduced": "Original"
15
+ }
16
+ ]
results/Bgym-GPT-4o-V/workarena++-l2.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-GPT-4o-V",
4
+ "study_id": "study_id",
5
+ "date_time": "2021-01-01 12:00:00",
6
+ "benchmark": "WorkArena++-L2",
7
+ "score": 3.8,
8
+ "std_err": 0.6,
9
+ "benchmark_specific": "No",
10
+ "benchmark_tuned": "No",
11
+ "followed_evaluation_protocol": "Yes",
12
+ "reproducible": "Yes",
13
+ "comments": "NA",
14
+ "original_or_reproduced": "Original"
15
+ }
16
+ ]
results/Bgym-GPT-4o-V/workarena++-l3.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-GPT-4o-V",
4
+ "study_id": "study_id",
5
+ "date_time": "2021-01-01 12:00:00",
6
+ "benchmark": "WorkArena++-L3",
7
+ "score": 0.0,
8
+ "std_err": 0.0,
9
+ "benchmark_specific": "No",
10
+ "benchmark_tuned": "No",
11
+ "followed_evaluation_protocol": "Yes",
12
+ "reproducible": "Yes",
13
+ "comments": "NA",
14
+ "original_or_reproduced": "Original"
15
+ }
16
+ ]
results/Bgym-GPT-4o-V/workarena-l1.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-GPT-4o-V",
4
+ "study_id": "study_id",
5
+ "date_time": "2021-01-01 12:00:00",
6
+ "benchmark": "WorkArena-L1",
7
+ "score": 41.8,
8
+ "std_err": 0.4,
9
+ "benchmark_specific": "No",
10
+ "benchmark_tuned": "No",
11
+ "followed_evaluation_protocol": "Yes",
12
+ "reproducible": "Yes",
13
+ "comments": "NA",
14
+ "original_or_reproduced": "Original"
15
+ }
16
+ ]
results/Bgym-GPT-4o/README.md ADDED
@@ -0,0 +1 @@
 
 
1
+ ## GPT-4o model
results/Bgym-GPT-4o/config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "agent_name": "GPT-4o",
3
+ "backend_llm": "GPT-4o"
4
+ }
results/Bgym-GPT-4o/miniwob.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-GPT-4o",
4
+ "study_id": "study_id",
5
+ "date_time": "2021-01-01 12:00:00",
6
+ "benchmark": "MiniWoB",
7
+ "score": 71.3,
8
+ "std_err": 0.5,
9
+ "benchmark_specific": "No",
10
+ "benchmark_tuned": "No",
11
+ "followed_evaluation_protocol": "Yes",
12
+ "reproducible": "Yes",
13
+ "comments": "NA",
14
+ "original_or_reproduced": "Original"
15
+ }
16
+ ]
results/Bgym-GPT-4o/results.json ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "benchmark": "WorkArena-L1",
4
+ "score": 42.7,
5
+ "std_err": 0.4,
6
+ "benchmark_specific": "No",
7
+ "benchmark_tuned": "No",
8
+ "followed_evaluation_protocol": "Yes",
9
+ "reproducible": "Yes",
10
+ "comments": "NA"
11
+ },
12
+ {
13
+ "benchmark": "WorkArena++-L2",
14
+ "score": 3.0,
15
+ "std_err": 0.6,
16
+ "benchmark_specific": "No",
17
+ "benchmark_tuned": "No",
18
+ "followed_evaluation_protocol": "Yes",
19
+ "reproducible": "Yes",
20
+ "comments": "NA"
21
+ },
22
+ {
23
+ "benchmark": "WorkArena++-L3",
24
+ "score": 0.0,
25
+ "std_err": 0.0,
26
+ "benchmark_specific": "No",
27
+ "benchmark_tuned": "No",
28
+ "followed_evaluation_protocol": "Yes",
29
+ "reproducible": "Yes",
30
+ "comments": "NA"
31
+ },
32
+ {
33
+ "benchmark": "MiniWoB",
34
+ "score": 71.3,
35
+ "std_err": 0.5,
36
+ "benchmark_specific": "No",
37
+ "benchmark_tuned": "No",
38
+ "followed_evaluation_protocol": "Yes",
39
+ "reproducible": "Yes",
40
+ "comments": "NA"
41
+ },
42
+ {
43
+ "benchmark": "WebArena",
44
+ "score": 23.5,
45
+ "std_err": 0.4,
46
+ "benchmark_specific": "No",
47
+ "benchmark_tuned": "No",
48
+ "followed_evaluation_protocol": "Yes",
49
+ "reproducible": "Yes",
50
+ "comments": "NA"
51
+ }
52
+ ]
results/Bgym-GPT-4o/webarena.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-GPT-4o",
4
+ "study_id": "study_id",
5
+ "date_time": "2021-01-01 12:00:00",
6
+ "benchmark": "WebArena",
7
+ "score": 23.5,
8
+ "std_err": 0.4,
9
+ "benchmark_specific": "No",
10
+ "benchmark_tuned": "No",
11
+ "followed_evaluation_protocol": "Yes",
12
+ "reproducible": "Yes",
13
+ "comments": "NA",
14
+ "original_or_reproduced": "Original"
15
+ }
16
+ ]
results/Bgym-GPT-4o/workarena++-l2.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-GPT-4o",
4
+ "study_id": "study_id",
5
+ "date_time": "2021-01-01 12:00:00",
6
+ "benchmark": "WorkArena++-L2",
7
+ "score": 3.0,
8
+ "std_err": 0.6,
9
+ "benchmark_specific": "No",
10
+ "benchmark_tuned": "No",
11
+ "followed_evaluation_protocol": "Yes",
12
+ "reproducible": "Yes",
13
+ "comments": "NA",
14
+ "original_or_reproduced": "Original"
15
+ }
16
+ ]
results/Bgym-GPT-4o/workarena++-l3.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-GPT-4o",
4
+ "study_id": "study_id",
5
+ "date_time": "2021-01-01 12:00:00",
6
+ "benchmark": "WorkArena++-L3",
7
+ "score": 0.0,
8
+ "std_err": 0.0,
9
+ "benchmark_specific": "No",
10
+ "benchmark_tuned": "No",
11
+ "followed_evaluation_protocol": "Yes",
12
+ "reproducible": "Yes",
13
+ "comments": "NA",
14
+ "original_or_reproduced": "Original"
15
+ }
16
+ ]
results/Bgym-GPT-4o/workarena-l1.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-GPT-4o",
4
+ "study_id": "study_id",
5
+ "date_time": "2021-01-01 12:00:00",
6
+ "benchmark": "WorkArena-L1",
7
+ "score": 42.7,
8
+ "std_err": 0.4,
9
+ "benchmark_specific": "No",
10
+ "benchmark_tuned": "No",
11
+ "followed_evaluation_protocol": "Yes",
12
+ "reproducible": "Yes",
13
+ "comments": "NA",
14
+ "original_or_reproduced": "Original"
15
+ }
16
+ ]
results/Bgym-Llama-3-70b/README.md ADDED
@@ -0,0 +1 @@
 
 
1
+ ### Llama-3-70B
results/Bgym-Llama-3-70b/config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "agent_name": "Llama-3-70B",
3
+ "backend_llm": "Llama-3-70B"
4
+ }
results/Bgym-Llama-3-70b/miniwob.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-Llama-3-70b",
4
+ "study_id": "study_id",
5
+ "date_time": "2021-01-01 12:00:00",
6
+ "benchmark": "MiniWoB",
7
+ "score": 68.2,
8
+ "std_err": 0.7,
9
+ "benchmark_specific": "No",
10
+ "benchmark_tuned": "No",
11
+ "followed_evaluation_protocol": "Yes",
12
+ "reproducible": "Yes",
13
+ "comments": "NA",
14
+ "original_or_reproduced": "Original"
15
+ }
16
+ ]
results/Bgym-Llama-3-70b/results.json ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "benchmark": "WorkArena-L1",
4
+ "score": 17.9,
5
+ "std_err": 0.6,
6
+ "benchmark_specific": "No",
7
+ "benchmark_tuned": "No",
8
+ "followed_evaluation_protocol": "Yes",
9
+ "reproducible": "Yes",
10
+ "comments": "NA"
11
+ },
12
+ {
13
+ "benchmark": "WorkArena++-L2",
14
+ "score": 0.0,
15
+ "std_err": 0.0,
16
+ "benchmark_specific": "No",
17
+ "benchmark_tuned": "No",
18
+ "followed_evaluation_protocol": "Yes",
19
+ "reproducible": "Yes",
20
+ "comments": "NA"
21
+ },
22
+ {
23
+ "benchmark": "WorkArena++-L3",
24
+ "score": 0.0,
25
+ "std_err": 0.0,
26
+ "benchmark_specific": "No",
27
+ "benchmark_tuned": "No",
28
+ "followed_evaluation_protocol": "Yes",
29
+ "reproducible": "Yes",
30
+ "comments": "NA"
31
+ },
32
+ {
33
+ "benchmark": "MiniWoB",
34
+ "score": 68.2,
35
+ "std_err": 0.7,
36
+ "benchmark_specific": "No",
37
+ "benchmark_tuned": "No",
38
+ "followed_evaluation_protocol": "Yes",
39
+ "reproducible": "Yes",
40
+ "comments": "NA"
41
+ },
42
+ {
43
+ "benchmark": "WebArena",
44
+ "score": 11.0,
45
+ "std_err": 0.3,
46
+ "benchmark_specific": "No",
47
+ "benchmark_tuned": "No",
48
+ "followed_evaluation_protocol": "Yes",
49
+ "reproducible": "Yes",
50
+ "comments": "NA"
51
+ }
52
+ ]
results/Bgym-Llama-3-70b/webarena.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-Llama-3-70b",
4
+ "study_id": "study_id",
5
+ "date_time": "2021-01-01 12:00:00",
6
+ "benchmark": "WebArena",
7
+ "score": 11.0,
8
+ "std_err": 0.3,
9
+ "benchmark_specific": "No",
10
+ "benchmark_tuned": "No",
11
+ "followed_evaluation_protocol": "Yes",
12
+ "reproducible": "Yes",
13
+ "comments": "NA",
14
+ "original_or_reproduced": "Original"
15
+ }
16
+ ]
results/Bgym-Llama-3-70b/workarena++-l2.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-Llama-3-70b",
4
+ "study_id": "study_id",
5
+ "date_time": "2021-01-01 12:00:00",
6
+ "benchmark": "WorkArena++-L2",
7
+ "score": 0.0,
8
+ "std_err": 0.0,
9
+ "benchmark_specific": "No",
10
+ "benchmark_tuned": "No",
11
+ "followed_evaluation_protocol": "Yes",
12
+ "reproducible": "Yes",
13
+ "comments": "NA",
14
+ "original_or_reproduced": "Original"
15
+ }
16
+ ]
results/Bgym-Llama-3-70b/workarena++-l3.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-Llama-3-70b",
4
+ "study_id": "study_id",
5
+ "date_time": "2021-01-01 12:00:00",
6
+ "benchmark": "WorkArena++-L3",
7
+ "score": 0.0,
8
+ "std_err": 0.0,
9
+ "benchmark_specific": "No",
10
+ "benchmark_tuned": "No",
11
+ "followed_evaluation_protocol": "Yes",
12
+ "reproducible": "Yes",
13
+ "comments": "NA",
14
+ "original_or_reproduced": "Original"
15
+ }
16
+ ]
results/Bgym-Llama-3-70b/workarena-l1.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-Llama-3-70b",
4
+ "study_id": "study_id",
5
+ "benchmark": "WorkArena-L1",
6
+ "score": 17.9,
7
+ "std_err": 0.6,
8
+ "benchmark_specific": "No",
9
+ "benchmark_tuned": "No",
10
+ "followed_evaluation_protocol": "Yes",
11
+ "reproducible": "Yes",
12
+ "comments": "NA",
13
+ "original_or_reproduced": "Original",
14
+ "date_time": "2021-01-01 12:00:00"
15
+ },
16
+ {
17
+ "agent_name": "Bgym-Llama-3-70b",
18
+ "study_id": "study_id",
19
+ "benchmark": "WorkArena-L1",
20
+ "score": 15.9,
21
+ "std_err": 0.6,
22
+ "benchmark_specific": "No",
23
+ "benchmark_tuned": "No",
24
+ "followed_evaluation_protocol": "Yes",
25
+ "reproducible": "Yes",
26
+ "comments": "NA",
27
+ "original_or_reproduced": "Reproduced",
28
+ "date_time": "2021-01-04 12:06:00"
29
+ },
30
+ {
31
+ "agent_name": "Bgym-Llama-3-70b",
32
+ "study_id": "study_id",
33
+ "benchmark": "WorkArena-L1",
34
+ "score": 19.9,
35
+ "std_err": 0.6,
36
+ "benchmark_specific": "No",
37
+ "benchmark_tuned": "No",
38
+ "followed_evaluation_protocol": "Yes",
39
+ "reproducible": "Yes",
40
+ "comments": "NA",
41
+ "original_or_reproduced": "Reproduced",
42
+ "date_time": "2021-01-05 2:07:00"
43
+ },
44
+ {
45
+ "agent_name": "Bgym-Llama-3-70b",
46
+ "study_id": "study_id",
47
+ "benchmark": "WorkArena-L1",
48
+ "score": 17.9,
49
+ "std_err": 0.6,
50
+ "benchmark_specific": "No",
51
+ "benchmark_tuned": "No",
52
+ "followed_evaluation_protocol": "Yes",
53
+ "reproducible": "Yes",
54
+ "comments": "NA",
55
+ "original_or_reproduced": "Reproduced",
56
+ "date_time": "2021-01-12 12:00:00"
57
+ }
58
+ ]
results/Bgym-Mixtral-8x22b/README.md ADDED
@@ -0,0 +1 @@
 
 
1
+ ## Mixtral 8x22B
results/Bgym-Mixtral-8x22b/config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "agent_name": "Mixtral-8x22B",
3
+ "backend_llm": "Mixtral-8x22B"
4
+ }
results/Bgym-Mixtral-8x22b/miniwob.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-Mixtral-8x22b",
4
+ "study_id": "study_id",
5
+ "date_time": "2021-01-01 12:00:00",
6
+ "benchmark": "MiniWoB",
7
+ "score": 62.4,
8
+ "std_err": 0.5,
9
+ "benchmark_specific": "No",
10
+ "benchmark_tuned": "No",
11
+ "followed_evaluation_protocol": "Yes",
12
+ "reproducible": "Yes",
13
+ "comments": "NA",
14
+ "original_or_reproduced": "Original"
15
+ }
16
+ ]
results/Bgym-Mixtral-8x22b/results.json ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "benchmark": "WorkArena-L1",
4
+ "score": 12.4,
5
+ "std_err": 0.7,
6
+ "benchmark_specific": "No",
7
+ "benchmark_tuned": "No",
8
+ "followed_evaluation_protocol": "Yes",
9
+ "reproducible": "Yes",
10
+ "comments": "NA"
11
+ },
12
+ {
13
+ "benchmark": "WorkArena++-L2",
14
+ "score": 0.0,
15
+ "std_err": 0.0,
16
+ "benchmark_specific": "No",
17
+ "benchmark_tuned": "No",
18
+ "followed_evaluation_protocol": "Yes",
19
+ "reproducible": "Yes",
20
+ "comments": "NA"
21
+ },
22
+ {
23
+ "benchmark": "WorkArena++-L3",
24
+ "score": 0.0,
25
+ "std_err": 0.0,
26
+ "benchmark_specific": "No",
27
+ "benchmark_tuned": "No",
28
+ "followed_evaluation_protocol": "Yes",
29
+ "reproducible": "Yes",
30
+ "comments": "NA"
31
+ },
32
+ {
33
+ "benchmark": "MiniWoB",
34
+ "score": 62.4,
35
+ "std_err": 0.5,
36
+ "benchmark_specific": "No",
37
+ "benchmark_tuned": "No",
38
+ "followed_evaluation_protocol": "Yes",
39
+ "reproducible": "Yes",
40
+ "comments": "NA"
41
+ },
42
+ {
43
+ "benchmark": "WebArena",
44
+ "score": 12.6,
45
+ "std_err": 0.9,
46
+ "benchmark_specific": "No",
47
+ "benchmark_tuned": "No",
48
+ "followed_evaluation_protocol": "Yes",
49
+ "reproducible": "Yes",
50
+ "comments": "NA"
51
+ }
52
+ ]
results/Bgym-Mixtral-8x22b/webarena.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-Mixtral-8x22b",
4
+ "study_id": "study_id",
5
+ "date_time": "2021-01-01 12:00:00",
6
+ "benchmark": "WebArena",
7
+ "score": 12.6,
8
+ "std_err": 0.9,
9
+ "benchmark_specific": "No",
10
+ "benchmark_tuned": "No",
11
+ "followed_evaluation_protocol": "Yes",
12
+ "reproducible": "Yes",
13
+ "comments": "NA",
14
+ "original_or_reproduced": "Original"
15
+ }
16
+ ]
results/Bgym-Mixtral-8x22b/workarena++-l2.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-Mixtral-8x22b",
4
+ "study_id": "study_id",
5
+ "date_time": "2021-01-01 12:00:00",
6
+ "benchmark": "WorkArena++-L2",
7
+ "score": 0.0,
8
+ "std_err": 0.0,
9
+ "benchmark_specific": "No",
10
+ "benchmark_tuned": "No",
11
+ "followed_evaluation_protocol": "Yes",
12
+ "reproducible": "Yes",
13
+ "comments": "NA",
14
+ "original_or_reproduced": "Original"
15
+ }
16
+ ]
results/Bgym-Mixtral-8x22b/workarena++-l3.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-Mixtral-8x22b",
4
+ "study_id": "study_id",
5
+ "date_time": "2021-01-01 12:00:00",
6
+ "benchmark": "WorkArena++-L3",
7
+ "score": 0.0,
8
+ "std_err": 0.0,
9
+ "benchmark_specific": "No",
10
+ "benchmark_tuned": "No",
11
+ "followed_evaluation_protocol": "Yes",
12
+ "reproducible": "Yes",
13
+ "comments": "NA",
14
+ "original_or_reproduced": "Original"
15
+ }
16
+ ]
results/Bgym-Mixtral-8x22b/workarena-l1.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "agent_name": "Bgym-Mixtral-8x22b",
4
+ "study_id": "study_id",
5
+ "benchmark": "WorkArena-L1",
6
+ "score": 12.4,
7
+ "std_err": 0.7,
8
+ "benchmark_specific": "No",
9
+ "benchmark_tuned": "No",
10
+ "followed_evaluation_protocol": "Yes",
11
+ "reproducible": "Yes",
12
+ "comments": "NA",
13
+ "original_or_reproduced": "Original",
14
+ "date_time": "2021-01-04 12:06:00"
15
+ },
16
+ {
17
+ "agent_name": "Bgym-Mixtral-8x22b",
18
+ "study_id": "study_id",
19
+ "benchmark": "WorkArena-L1",
20
+ "score": 11.4,
21
+ "std_err": 0.7,
22
+ "benchmark_specific": "No",
23
+ "benchmark_tuned": "No",
24
+ "followed_evaluation_protocol": "Yes",
25
+ "reproducible": "Yes",
26
+ "comments": "NA",
27
+ "original_or_reproduced": "Reproduced",
28
+ "date_time": "2021-01-04 12:06:00"
29
+ },
30
+ {
31
+ "agent_name": "Bgym-Mixtral-8x22b",
32
+ "study_id": "study_id",
33
+ "benchmark": "WorkArena-L1",
34
+ "score": 13.4,
35
+ "std_err": 0.7,
36
+ "benchmark_specific": "No",
37
+ "benchmark_tuned": "No",
38
+ "followed_evaluation_protocol": "Yes",
39
+ "reproducible": "Yes",
40
+ "comments": "NA",
41
+ "original_or_reproduced": "Reproduced",
42
+ "date_time": "2021-01-04 12:06:00"
43
+ }
44
+ ]