Spaces:

ServiceNow
/

browsergym-leaderboard

Running

App Files Files Community

meghsn committed on Nov 13, 2024

Commit

8627a70

1 Parent(s): cc74085

Init leaderboard

Browse files

Files changed (44) hide show

Dockerfile +12 -0
README.md +5 -6
app.py +288 -0
requirements.txt +6 -0
results/Bgym-GPT-3.5/README.md +1 -0
results/Bgym-GPT-3.5/config.json +4 -0
results/Bgym-GPT-3.5/miniwob.json +16 -0
results/Bgym-GPT-3.5/results.json +53 -0
results/Bgym-GPT-3.5/webarena.json +16 -0
results/Bgym-GPT-3.5/workarena++-l2.json +16 -0
results/Bgym-GPT-3.5/workarena++-l3.json +16 -0
results/Bgym-GPT-3.5/workarena-l1.json +44 -0
results/Bgym-GPT-4o-V/README.md +1 -0
results/Bgym-GPT-4o-V/config.json +4 -0
results/Bgym-GPT-4o-V/miniwob.json +16 -0
results/Bgym-GPT-4o-V/results.json +52 -0
results/Bgym-GPT-4o-V/webarena.json +16 -0
results/Bgym-GPT-4o-V/workarena++-l2.json +16 -0
results/Bgym-GPT-4o-V/workarena++-l3.json +16 -0
results/Bgym-GPT-4o-V/workarena-l1.json +16 -0
results/Bgym-GPT-4o/README.md +1 -0
results/Bgym-GPT-4o/config.json +4 -0
results/Bgym-GPT-4o/miniwob.json +16 -0
results/Bgym-GPT-4o/results.json +52 -0
results/Bgym-GPT-4o/webarena.json +16 -0
results/Bgym-GPT-4o/workarena++-l2.json +16 -0
results/Bgym-GPT-4o/workarena++-l3.json +16 -0
results/Bgym-GPT-4o/workarena-l1.json +16 -0
results/Bgym-Llama-3-70b/README.md +1 -0
results/Bgym-Llama-3-70b/config.json +4 -0
results/Bgym-Llama-3-70b/miniwob.json +16 -0
results/Bgym-Llama-3-70b/results.json +52 -0
results/Bgym-Llama-3-70b/webarena.json +16 -0
results/Bgym-Llama-3-70b/workarena++-l2.json +16 -0
results/Bgym-Llama-3-70b/workarena++-l3.json +16 -0
results/Bgym-Llama-3-70b/workarena-l1.json +58 -0
results/Bgym-Mixtral-8x22b/README.md +1 -0
results/Bgym-Mixtral-8x22b/config.json +4 -0
results/Bgym-Mixtral-8x22b/miniwob.json +16 -0
results/Bgym-Mixtral-8x22b/results.json +52 -0
results/Bgym-Mixtral-8x22b/webarena.json +16 -0
results/Bgym-Mixtral-8x22b/workarena++-l2.json +16 -0
results/Bgym-Mixtral-8x22b/workarena++-l3.json +16 -0
results/Bgym-Mixtral-8x22b/workarena-l1.json +44 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,12 @@

+FROM python:3.9
+WORKDIR /code
+# Install dependencies first so this layer is reused until requirements.txt changes.
+COPY ./requirements.txt /code/requirements.txt
+RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+# Application code and data change more often than dependencies, so copy them last.
+COPY ./app.py /code/app.py
+# NOTE(review): app.py only reads the results/ directory; confirm results.json is still needed.
+COPY ./results.json /code/results.json
+COPY ./results /code/results
+# Serve the Streamlit app on all interfaces on port 7860.
+CMD ["streamlit", "run", "/code/app.py", "--server.address", "0.0.0.0", "--server.port", "7860"]

README.md CHANGED Viewed

@@ -1,12 +1,11 @@
 ---
-title: Webagents Leaderboard
-emoji: 👁
-colorFrom: blue
-colorTo: gray
 sdk: docker
 pinned: false
-license: apache-2.0
-short_description: Leaderboard to track the progress of agents on web tasks
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: WebAgent Leaderboard
+emoji: 🐠
+colorFrom: purple
+colorTo: green
 sdk: docker
 pinned: false
+license: mit
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,288 @@

+import json
+import re
+import os
+import streamlit as st
+import requests
+import pandas as pd
+from io import StringIO
+import plotly.graph_objs as go
+from huggingface_hub import HfApi
+from huggingface_hub.utils import RepositoryNotFoundError, RevisionNotFoundError
+import streamlit.components.v1 as components
+# Benchmarks shown on the leaderboard; the order defines both the table columns and the tabs.
+BENCHMARKS = [
+    "WebArena",
+    "WorkArena-L1",
+    "WorkArena++-L2",
+    "WorkArena++-L3",
+    "MiniWoB",
+]
+def create_html_table_main(df, benchmarks):
+    """Render the main leaderboard as an HTML table with user-selected sorting.
+    Draws a "Sort by" select box and an "Order" radio in the current Streamlit
+    container, sorts ``df`` by the chosen column and returns the table markup.
+    Cell values are inserted verbatim (not HTML-escaped) because the caller
+    passes ready-made ``<a>`` links in the "Agent" column.
+    Args:
+        df: DataFrame with one row per agent.
+        benchmarks: Unused; kept so existing callers keep working.
+    Returns:
+        str: A ``<style>`` block followed by the table wrapped in a
+        ``table-container`` div.
+    """
+    col1, col2 = st.columns([2, 6])
+    with col1:
+        sort_column = st.selectbox("Sort by", df.columns.tolist())
+    with col2:
+        sort_order = st.radio("Order", ["Ascending", "Descending"], horizontal=True)
+    def _sort_key(series):
+        # Score columns may mix floats with the "-" placeholder; comparing those
+        # directly raises TypeError, so sort on the numeric view when there is one.
+        numeric = pd.to_numeric(series, errors="coerce")
+        return numeric if numeric.notna().any() else series.astype(str)
+    # selectbox returns None when the frame has no columns; nothing to sort then.
+    if sort_column is not None:
+        df = df.sort_values(
+            by=sort_column,
+            ascending=(sort_order == "Ascending"),
+            key=_sort_key,
+            na_position="last",  # placeholders always end up at the bottom
+        )
+    # Create HTML table without JavaScript sorting
+    style = '''
+    <style>
+        table {
+            width: 100%;
+            border-collapse: collapse;
+        }
+        th, td {
+            border: 1px solid #ddd;
+            padding: 8px;
+            text-align: center;
+        }
+        th {
+            font-weight: bold;
+        }
+        .table-container {
+            padding-bottom: 20px;
+        }
+    </style>
+    '''
+    header_cells = "".join(f'<th>{column}</th>' for column in df.columns)
+    body_rows = "".join(
+        '<tr>' + "".join(f'<td>{row[col]}</td>' for col in df.columns) + '</tr>'
+        for _, row in df.iterrows()
+    )
+    return (
+        style
+        + '<div class="table-container">'
+        + '<table>'
+        + f'<thead><tr>{header_cells}</tr></thead>'
+        + f'<tbody>{body_rows}</tbody></table>'
+        + '</div>'
+    )
+def create_html_table_benchmark(df, benchmarks):
+    """Render one benchmark's results as a static HTML table.
+    Every column except "Reproduced_all" gets a table column. A "Reproduced"
+    cell other than "-" becomes a collapsible ``<details>`` element whose body
+    lists the entries of the row's "Reproduced_all" value, one per line.
+    All cell text is HTML-escaped, so markup inside the data cannot be
+    injected into the page.
+    Args:
+        df: DataFrame with one row per agent; rows with a "Reproduced" value
+            other than "-" must also provide an iterable "Reproduced_all".
+        benchmarks: Unused; kept so existing callers keep working.
+    Returns:
+        str: A ``<style>`` block followed by the table wrapped in a
+        ``table-container`` div.
+    """
+    # Local import: the module name "html" is not imported at file level.
+    from html import escape
+    style = '''
+    <style>
+        table {
+            width: 100%;
+            border-collapse: collapse;
+        }
+        th, td {
+            border: 1px solid #ddd;
+            padding: 8px;
+            text-align: center;
+        }
+        th {
+            font-weight: bold;
+        }
+        .table-container {
+            padding-bottom: 20px;
+        }
+    </style>
+    '''
+    # "Reproduced_all" only feeds the <details> body; it never gets its own column.
+    visible_columns = [column for column in df.columns if column != "Reproduced_all"]
+    parts = [style, '<div class="table-container">', '<table>', '<thead><tr>']
+    parts.extend(f'<th>{escape(str(column))}</th>' for column in visible_columns)
+    parts.append('</tr></thead>')
+    parts.append('<tbody>')
+    for _, row in df.iterrows():
+        parts.append('<tr>')
+        for column in visible_columns:
+            text = escape(str(row[column]))
+            if column == "Reproduced" and row[column] != "-":
+                runs = "<br>".join(escape(str(item)) for item in row["Reproduced_all"])
+                parts.append(f'<td><details><summary>{text}</summary>{runs}</details></td>')
+            else:
+                parts.append(f'<td>{text}</td>')
+        parts.append('</tr>')
+    parts.append('</tbody></table>')
+    parts.append('</div>')
+    return "".join(parts)
+def check_sanity(agent):
+    """Validate the per-benchmark result files of one agent.
+    For every benchmark in BENCHMARKS the file
+    ``results/<agent>/<benchmark.lower()>.json`` is checked if it exists;
+    benchmarks without a file are skipped. A file is valid when it holds a
+    JSON list of objects that each carry all required keys and name this agent
+    and this benchmark, and when exactly one entry is marked "Original".
+    Args:
+        agent: Directory name under ``results/``; also the expected
+            "agent_name" of every entry.
+    Returns:
+        bool: True when every existing file is valid, False otherwise
+        (including unreadable or malformed JSON).
+    """
+    required_keys = (
+        "agent_name", "benchmark", "original_or_reproduced", "score", "std_err",
+        "benchmark_specific", "benchmark_tuned", "followed_evaluation_protocol",
+        "reproducible", "comments", "study_id", "date_time",
+    )
+    for benchmark in BENCHMARKS:
+        file_path = f"results/{agent}/{benchmark.lower()}.json"
+        if not os.path.exists(file_path):
+            continue
+        try:
+            with open(file_path, encoding="utf-8") as f:
+                results = json.load(f)
+        except (OSError, ValueError):
+            # json.JSONDecodeError and UnicodeDecodeError are ValueErrors: a broken
+            # file marks the agent as invalid instead of crashing the whole app.
+            return False
+        if not isinstance(results, list):
+            return False
+        original_count = 0
+        for result in results:
+            if not isinstance(result, dict):
+                return False
+            if any(key not in result for key in required_keys):
+                return False
+            if result["agent_name"] != agent or result["benchmark"] != benchmark:
+                return False
+            if result["original_or_reproduced"] == "Original":
+                original_count += 1
+        # Each benchmark file must contain exactly one original submission.
+        if original_count != 1:
+            return False
+    return True
+# (column label, result-entry key) pairs copied from an agent's "Original" entry
+# into its row on a benchmark tab, in display order.
+_BENCHMARK_DETAIL_FIELDS = (
+    ("Score", "score"),
+    ("Benchmark Specific", "benchmark_specific"),
+    ("Benchmark Tuned", "benchmark_tuned"),
+    ("Followed Evaluation Protocol", "followed_evaluation_protocol"),
+    ("Reproducible", "reproducible"),
+    ("Comments", "comments"),
+    ("Study ID", "study_id"),
+    ("Date", "date_time"),
+)
+def _load_all_results():
+    """Return ``{agent: [result entries]}`` for every valid agent directory under results/.
+    Agents failing check_sanity are reported with st.error and skipped.
+    """
+    all_results = {}
+    for agent in sorted(os.listdir("results")):
+        # Stray files next to the agent directories are not agents.
+        if not os.path.isdir(os.path.join("results", agent)):
+            continue
+        if not check_sanity(agent):
+            st.error(f"Results for {agent} are not in the correct format.")
+            continue
+        agent_results = []
+        for benchmark in BENCHMARKS:
+            file_path = f"results/{agent}/{benchmark.lower()}.json"
+            # check_sanity tolerates missing benchmark files, so loading must too.
+            if not os.path.exists(file_path):
+                continue
+            with open(file_path, encoding="utf-8") as f:
+                agent_results.extend(json.load(f))
+        all_results[agent] = agent_results
+    return all_results
+def _leaderboard_rows(results):
+    """Build one row per agent holding its "Original" score per benchmark ("-" if missing)."""
+    rows = []
+    for agent, entries in results.items():
+        row = {"Agent": agent}
+        for benchmark in BENCHMARKS:
+            row[benchmark] = next(
+                (
+                    entry["score"]
+                    for entry in entries
+                    if entry["benchmark"] == benchmark and entry["original_or_reproduced"] == "Original"
+                ),
+                "-",
+            )
+        rows.append(row)
+    return rows
+def _benchmark_rows(results, benchmark):
+    """Build one row per agent for a single benchmark tab.
+    Detail columns come from the agent's "Original" entry ("-" when there is
+    none). "Reproduced" is the "min - max" score range over all "Reproduced"
+    entries ("-" when there are none) and "Reproduced_all" lists each of them
+    as "score, date_time". The entry order inside the files does not matter.
+    """
+    rows = []
+    for agent, entries in results.items():
+        relevant = [entry for entry in entries if entry["benchmark"] == benchmark]
+        original = next(
+            (entry for entry in relevant if entry["original_or_reproduced"] == "Original"),
+            None,
+        )
+        reproduced = [entry for entry in relevant if entry["original_or_reproduced"] == "Reproduced"]
+        row = {"Agent": agent}
+        for label, field in _BENCHMARK_DETAIL_FIELDS:
+            row[label] = original[field] if original is not None else "-"
+        if reproduced:
+            scores = [entry["score"] for entry in reproduced]
+            row["Reproduced"] = f"{min(scores)} - {max(scores)}"
+        else:
+            row["Reproduced"] = "-"
+        row["Reproduced_all"] = [f'{entry["score"]}, {entry["date_time"]}' for entry in reproduced]
+        rows.append(row)
+    return rows
+def main():
+    """Build the Streamlit page: main leaderboard tab, one tab per benchmark, and an About tab."""
+    # Local imports: neither name is imported at file level.
+    from html import escape
+    from urllib.parse import quote
+    st.set_page_config(page_title="WebAgent Leaderboard", layout="wide")
+    all_results = _load_all_results()
+    st.title("🏆 WebAgent Leaderboard")
+    st.markdown("Leaderboard to evaluate LLMs, VLMs, and agents on web navigation tasks.")
+    tabs = st.tabs(["🏆 WebAgent Leaderboard"] + BENCHMARKS + ["📝 About"])
+    with tabs[0]:
+        # Explicit columns keep the frame well-formed even when no agent was loaded.
+        df = pd.DataFrame(_leaderboard_rows(all_results), columns=["Agent"] + BENCHMARKS)
+        # Scores may contain the "-" placeholder; sort on the numeric view so that
+        # mixed str/float values cannot raise, and keep placeholders last.
+        df = df.sort_values(
+            by="WebArena",
+            ascending=False,
+            key=lambda scores: pd.to_numeric(scores, errors="coerce"),
+            na_position="last",
+        )
+        # Add a search bar
+        search_query = st.text_input("Search agents", "", key="search_main")
+        if search_query:
+            # Plain substring match: user input such as "(" must not be parsed as a regex.
+            df = df[df["Agent"].str.contains(search_query, case=False, regex=False)]
+        # Serialize before the agent names are turned into HTML links.
+        csv_data = df.to_csv(index=False)
+        def make_hyperlink(agent_name):
+            url = f"https://huggingface.co/spaces/meghsn/WebAgent-Leaderboard/blob/main/results/{quote(agent_name)}/README.md"
+            return f'<a href="{url}" target="_blank">{escape(agent_name)}</a>'
+        linked_df = df.assign(Agent=df["Agent"].map(make_hyperlink))
+        html_table = create_html_table_main(linked_df, BENCHMARKS)
+        st.markdown(html_table, unsafe_allow_html=True)
+        # A download button nested inside a regular button vanishes on the rerun its
+        # own click triggers, so the download is offered directly.
+        st.download_button(
+            label="Export to CSV",
+            data=csv_data,
+            file_name="leaderboard.csv",
+            mime="text/csv",
+            key="download-csv",
+            help="Click to download the CSV file",
+        )
+    with tabs[-1]:
+            st.markdown('''
+                    ### Leaderboard to evaluate LLMs, VLMs, and agents on web navigation tasks.
+                ''')
+    benchmark_columns = (
+        ["Agent"]
+        + [label for label, _ in _BENCHMARK_DETAIL_FIELDS]
+        + ["Reproduced", "Reproduced_all"]
+    )
+    for i, benchmark in enumerate(BENCHMARKS, start=1):
+        with tabs[i]:
+            df_ = pd.DataFrame(_benchmark_rows(all_results, benchmark), columns=benchmark_columns)
+            html_table = create_html_table_benchmark(df_, BENCHMARKS)
+            st.markdown(html_table, unsafe_allow_html=True)
+if __name__ == "__main__":
+    main()

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+streamlit==1.23
+pandas
+requests
+plotly
+gistyc
+huggingface_hub

results/Bgym-GPT-3.5/README.md ADDED Viewed

	@@ -0,0 +1 @@


1	+ ## GPT-3.5 model

results/Bgym-GPT-3.5/config.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+    "agent_name": "GPT-3.5",
+    "backend_llm": "GPT-3.5"
+}

results/Bgym-GPT-3.5/miniwob.json ADDED Viewed

	@@ -0,0 +1,16 @@

+[
+    {
+        "agent_name": "Bgym-GPT-3.5",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "MiniWoB",
+        "score": 43.4,
+        "std_err": 0.1,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]

results/Bgym-GPT-3.5/results.json ADDED Viewed

	@@ -0,0 +1,53 @@

+[
+    {
+        "benchmark": "WorkArena-L1",
+        "score": 6.1,
+        "std_err": 0.3,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "reproduced": [["aug 2025", 0.65, 0.05, "study_id"]],
+        "comments": "NA"
+    },
+    {
+        "benchmark": "WorkArena++-L2",
+        "score": 0.0,
+        "std_err": 0.0,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA"
+    },
+    {
+        "benchmark": "WorkArena++-L3",
+        "score": 0.0,
+        "std_err": 0.0,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA"
+    },
+    {
+        "benchmark": "MiniWoB",
+        "score": 43.4,
+        "std_err": 0.1,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA"
+    },
+    {
+        "benchmark": "WebArena",
+        "score": 6.7,
+        "std_err": 0.2,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA"
+    }
+]

results/Bgym-GPT-3.5/webarena.json ADDED Viewed

	@@ -0,0 +1,16 @@

+[
+    {
+        "agent_name": "Bgym-GPT-3.5",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "WebArena",
+        "score": 6.7,
+        "std_err": 0.2,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]

results/Bgym-GPT-3.5/workarena++-l2.json ADDED Viewed

	@@ -0,0 +1,16 @@

+[
+    {
+        "agent_name": "Bgym-GPT-3.5",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "WorkArena++-L2",
+        "score": 0.0,
+        "std_err": 0.0,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]

results/Bgym-GPT-3.5/workarena++-l3.json ADDED Viewed

	@@ -0,0 +1,16 @@

+[
+    {
+        "agent_name": "Bgym-GPT-3.5",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "WorkArena++-L3",
+        "score": 0.0,
+        "std_err": 0.0,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]

results/Bgym-GPT-3.5/workarena-l1.json ADDED Viewed

	@@ -0,0 +1,44 @@

+[
+    {
+        "agent_name": "Bgym-GPT-3.5",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "WorkArena-L1",
+        "score": 6.1,
+        "std_err": 0.3,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    },
+    {
+        "agent_name": "Bgym-GPT-3.5",
+        "study_id": "study_id",
+        "benchmark": "WorkArena-L1",
+        "score": 5.7,
+        "std_err": 0.3,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Reproduced",
+        "date_time": "2021-01-04 12:06:00"
+    },
+    {
+        "benchmark": "WorkArena-L1",
+        "agent_name": "Bgym-GPT-3.5",
+        "study_id": "study_id",
+        "score": 5.1,
+        "std_err": 0.3,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Reproduced",
+        "date_time": "2021-01-04 12:06:00"
+    }
+]

results/Bgym-GPT-4o-V/README.md ADDED Viewed

	@@ -0,0 +1 @@


1	+ ## GPT-4o-V model

results/Bgym-GPT-4o-V/config.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+    "agent_name": "GPT-4o-V",
+    "backend_llm": "GPT-4o-V"
+}

results/Bgym-GPT-4o-V/miniwob.json ADDED Viewed

	@@ -0,0 +1,16 @@

+[
+    {
+        "agent_name": "Bgym-GPT-4o-V",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "MiniWoB",
+        "score": 72.5,
+        "std_err": 0.5,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]

results/Bgym-GPT-4o-V/results.json ADDED Viewed

	@@ -0,0 +1,52 @@

+[
+    {
+        "benchmark": "WorkArena-L1",
+        "score": 41.8,
+        "std_err": 0.4,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA"
+    },
+    {
+        "benchmark": "WorkArena++-L2",
+        "score": 3.8,
+        "std_err": 0.6,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA"
+    },
+    {
+        "benchmark": "WorkArena++-L3",
+        "score": 0.0,
+        "std_err": 0.0,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA"
+    },
+    {
+        "benchmark": "MiniWoB",
+        "score": 72.5,
+        "std_err": 0.5,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA"
+    },
+    {
+        "benchmark": "WebArena",
+        "score": 24.0,
+        "std_err": 0.4,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA"
+    }
+]

results/Bgym-GPT-4o-V/webarena.json ADDED Viewed

	@@ -0,0 +1,16 @@

+[
+    {
+        "agent_name": "Bgym-GPT-4o-V",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "WebArena",
+        "score": 24.0,
+        "std_err": 0.4,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]

results/Bgym-GPT-4o-V/workarena++-l2.json ADDED Viewed

	@@ -0,0 +1,16 @@

+[
+    {
+        "agent_name": "Bgym-GPT-4o-V",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "WorkArena++-L2",
+        "score": 3.8,
+        "std_err": 0.6,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]

results/Bgym-GPT-4o-V/workarena++-l3.json ADDED Viewed

	@@ -0,0 +1,16 @@

+[
+    {
+        "agent_name": "Bgym-GPT-4o-V",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "WorkArena++-L3",
+        "score": 0.0,
+        "std_err": 0.0,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]

results/Bgym-GPT-4o-V/workarena-l1.json ADDED Viewed

	@@ -0,0 +1,16 @@

+[
+    {
+        "agent_name": "Bgym-GPT-4o-V",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "WorkArena-L1",
+        "score": 41.8,
+        "std_err": 0.4,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]

results/Bgym-GPT-4o/README.md ADDED Viewed

	@@ -0,0 +1 @@


1	+ ## GPT-4o model

results/Bgym-GPT-4o/config.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+    "agent_name": "GPT-4o",
+    "backend_llm": "GPT-4o"
+}

results/Bgym-GPT-4o/miniwob.json ADDED Viewed

	@@ -0,0 +1,16 @@

+[
+    {
+        "agent_name": "Bgym-GPT-4o",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "MiniWoB",
+        "score": 71.3,
+        "std_err": 0.5,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]

results/Bgym-GPT-4o/results.json ADDED Viewed

	@@ -0,0 +1,52 @@

+[
+    {
+        "benchmark": "WorkArena-L1",
+        "score": 42.7,
+        "std_err": 0.4,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA"
+    },
+    {
+        "benchmark": "WorkArena++-L2",
+        "score": 3.0,
+        "std_err": 0.6,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA"
+    },
+    {
+        "benchmark": "WorkArena++-L3",
+        "score": 0.0,
+        "std_err": 0.0,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA"
+    },
+    {
+        "benchmark": "MiniWoB",
+        "score": 71.3,
+        "std_err": 0.5,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA"
+    },
+    {
+        "benchmark": "WebArena",
+        "score": 23.5,
+        "std_err": 0.4,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA"
+    }
+]

results/Bgym-GPT-4o/webarena.json ADDED Viewed

	@@ -0,0 +1,16 @@

+[
+    {
+        "agent_name": "Bgym-GPT-4o",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "WebArena",
+        "score": 23.5,
+        "std_err": 0.4,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]

results/Bgym-GPT-4o/workarena++-l2.json ADDED Viewed

	@@ -0,0 +1,16 @@

+[
+    {
+        "agent_name": "Bgym-GPT-4o",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "WorkArena++-L2",
+        "score": 3.0,
+        "std_err": 0.6,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]

results/Bgym-GPT-4o/workarena++-l3.json ADDED Viewed

	@@ -0,0 +1,16 @@

+[
+    {
+        "agent_name": "Bgym-GPT-4o",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "WorkArena++-L3",
+        "score": 0.0,
+        "std_err": 0.0,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]

results/Bgym-GPT-4o/workarena-l1.json ADDED Viewed

	@@ -0,0 +1,16 @@

+[
+    {
+        "agent_name": "Bgym-GPT-4o",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "WorkArena-L1",
+        "score": 42.7,
+        "std_err": 0.4,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]

results/Bgym-Llama-3-70b/README.md ADDED Viewed

	@@ -0,0 +1 @@


1	+ ### Llama-3-70B

results/Bgym-Llama-3-70b/config.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+    "agent_name": "Llama-3-70B",
+    "backend_llm": "Llama-3-70B"
+}

results/Bgym-Llama-3-70b/miniwob.json ADDED Viewed

	@@ -0,0 +1,16 @@

+[
+    {
+        "agent_name": "Bgym-Llama-3-70b",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "MiniWoB",
+        "score": 68.2,
+        "std_err": 0.7,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]

results/Bgym-Llama-3-70b/results.json ADDED Viewed

	@@ -0,0 +1,52 @@

+[
+    {
+        "benchmark": "WorkArena-L1",
+        "score": 17.9,
+        "std_err": 0.6,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA"
+    },
+    {
+        "benchmark": "WorkArena++-L2",
+        "score": 0.0,
+        "std_err": 0.0,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA"
+    },
+    {
+        "benchmark": "WorkArena++-L3",
+        "score": 0.0,
+        "std_err": 0.0,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA"
+    },
+    {
+        "benchmark": "MiniWoB",
+        "score": 68.2,
+        "std_err": 0.7,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA"
+    },
+    {
+        "benchmark": "WebArena",
+        "score": 11.0,
+        "std_err": 0.3,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA"
+    }
+]

results/Bgym-Llama-3-70b/webarena.json ADDED Viewed

	@@ -0,0 +1,16 @@

+[
+    {
+        "agent_name": "Bgym-Llama-3-70b",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "WebArena",
+        "score": 11.0,
+        "std_err": 0.3,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]

results/Bgym-Llama-3-70b/workarena++-l2.json ADDED Viewed

	@@ -0,0 +1,16 @@

+[
+    {
+        "agent_name": "Bgym-Llama-3-70b",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "WorkArena++-L2",
+        "score": 0.0,
+        "std_err": 0.0,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]

results/Bgym-Llama-3-70b/workarena++-l3.json ADDED Viewed

	@@ -0,0 +1,16 @@

+[
+    {
+        "agent_name": "Bgym-Llama-3-70b",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "WorkArena++-L3",
+        "score": 0.0,
+        "std_err": 0.0,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]

results/Bgym-Llama-3-70b/workarena-l1.json ADDED Viewed

	@@ -0,0 +1,58 @@

+[
+    {
+        "agent_name": "Bgym-Llama-3-70b",
+        "study_id": "study_id",
+        "benchmark": "WorkArena-L1",
+        "score": 17.9,
+        "std_err": 0.6,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original",
+        "date_time": "2021-01-01 12:00:00"
+    },
+    {
+        "agent_name": "Bgym-Llama-3-70b",
+        "study_id": "study_id",
+        "benchmark": "WorkArena-L1",
+        "score": 15.9,
+        "std_err": 0.6,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Reproduced",
+        "date_time": "2021-01-04 12:06:00"
+    },
+    {
+        "agent_name": "Bgym-Llama-3-70b",
+        "study_id": "study_id",
+        "benchmark": "WorkArena-L1",
+        "score": 19.9,
+        "std_err": 0.6,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Reproduced",
+        "date_time": "2021-01-05 02:07:00"
+    },
+    {
+        "agent_name": "Bgym-Llama-3-70b",
+        "study_id": "study_id",
+        "benchmark": "WorkArena-L1",
+        "score": 17.9,
+        "std_err": 0.6,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Reproduced",
+        "date_time": "2021-01-12 12:00:00"
+    }
+]

results/Bgym-Mixtral-8x22b/README.md ADDED Viewed

	@@ -0,0 +1 @@


1	+ ## Mixtral 8x22B

results/Bgym-Mixtral-8x22b/config.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+    "agent_name": "Mixtral-8x22B",
+    "backend_llm": "Mixtral-8x22B"
+}

results/Bgym-Mixtral-8x22b/miniwob.json ADDED Viewed

	@@ -0,0 +1,16 @@

+[
+    {
+        "agent_name": "Bgym-Mixtral-8x22b",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "MiniWoB",
+        "score": 62.4,
+        "std_err": 0.5,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]

results/Bgym-Mixtral-8x22b/results.json ADDED Viewed

	@@ -0,0 +1,52 @@

+[
+    {
+        "benchmark": "WorkArena-L1",
+        "score": 12.4,
+        "std_err": 0.7,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA"
+    },
+    {
+        "benchmark": "WorkArena++-L2",
+        "score": 0.0,
+        "std_err": 0.0,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA"
+    },
+    {
+        "benchmark": "WorkArena++-L3",
+        "score": 0.0,
+        "std_err": 0.0,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA"
+    },
+    {
+        "benchmark": "MiniWoB",
+        "score": 62.4,
+        "std_err": 0.5,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA"
+    },
+    {
+        "benchmark": "WebArena",
+        "score": 12.6,
+        "std_err": 0.9,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA"
+    }
+]

results/Bgym-Mixtral-8x22b/webarena.json ADDED Viewed

	@@ -0,0 +1,16 @@

+[
+    {
+        "agent_name": "Bgym-Mixtral-8x22b",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "WebArena",
+        "score": 12.6,
+        "std_err": 0.9,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]

results/Bgym-Mixtral-8x22b/workarena++-l2.json ADDED Viewed

	@@ -0,0 +1,16 @@

+[
+    {
+        "agent_name": "Bgym-Mixtral-8x22b",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "WorkArena++-L2",
+        "score": 0.0,
+        "std_err": 0.0,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]

results/Bgym-Mixtral-8x22b/workarena++-l3.json ADDED Viewed

	@@ -0,0 +1,16 @@

+[
+    {
+        "agent_name": "Bgym-Mixtral-8x22b",
+        "study_id": "study_id",
+        "date_time": "2021-01-01 12:00:00",
+        "benchmark": "WorkArena++-L3",
+        "score": 0.0,
+        "std_err": 0.0,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original"
+    }
+]

results/Bgym-Mixtral-8x22b/workarena-l1.json ADDED Viewed

	@@ -0,0 +1,44 @@

+[
+    {
+        "agent_name": "Bgym-Mixtral-8x22b",
+        "study_id": "study_id",
+        "benchmark": "WorkArena-L1",
+        "score": 12.4,
+        "std_err": 0.7,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Original",
+        "date_time": "2021-01-04 12:06:00"
+    },
+    {
+        "agent_name": "Bgym-Mixtral-8x22b",
+        "study_id": "study_id",
+        "benchmark": "WorkArena-L1",
+        "score": 11.4,
+        "std_err": 0.7,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Reproduced",
+        "date_time": "2021-01-04 12:06:00"
+    },
+    {
+        "agent_name": "Bgym-Mixtral-8x22b",
+        "study_id": "study_id",
+        "benchmark": "WorkArena-L1",
+        "score": 13.4,
+        "std_err": 0.7,
+        "benchmark_specific": "No",
+        "benchmark_tuned": "No",
+        "followed_evaluation_protocol": "Yes",
+        "reproducible": "Yes",
+        "comments": "NA",
+        "original_or_reproduced": "Reproduced",
+        "date_time": "2021-01-04 12:06:00"
+    }
+]