Security checks
Browse files- .gitignore +2 -0
- app.py +95 -48
.gitignore
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
__pycache__/
|
2 |
+
*.pyc
|
app.py
CHANGED
@@ -10,23 +10,46 @@ from huggingface_hub import HfApi
|
|
10 |
from huggingface_hub.utils import RepositoryNotFoundError, RevisionNotFoundError
|
11 |
import streamlit.components.v1 as components
|
12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
# BENCHMARKS = ["WorkArena-L1", "WorkArena++-L2", "WorkArena++-L3", "MiniWoB", "WebArena"]
|
14 |
BENCHMARKS = ["WebArena", "WorkArena-L1", "WorkArena++-L2", "WorkArena++-L3", "MiniWoB",]
|
15 |
|
16 |
-
def
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
with col2:
|
21 |
-
sort_order = st.radio("Order", ["Ascending", "Descending"], horizontal=True)
|
22 |
-
|
23 |
-
# Sort dataframe
|
24 |
-
if sort_order == "Ascending":
|
25 |
-
df = df.sort_values(by=sort_column)
|
26 |
-
else:
|
27 |
-
df = df.sort_values(by=sort_column, ascending=False)
|
28 |
|
29 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
html = '''
|
31 |
<style>
|
32 |
table {
|
@@ -50,20 +73,22 @@ def create_html_table_main(df, benchmarks):
|
|
50 |
html += '<table>'
|
51 |
html += '<thead><tr>'
|
52 |
for column in df.columns:
|
53 |
-
html += f'<th>{column}</th>'
|
54 |
html += '</tr></thead>'
|
55 |
html += '<tbody>'
|
56 |
for _, row in df.iterrows():
|
57 |
html += '<tr>'
|
58 |
for col in df.columns:
|
59 |
-
|
|
|
|
|
|
|
60 |
html += '</tr>'
|
61 |
html += '</tbody></table>'
|
62 |
html += '</div>'
|
63 |
return html
|
64 |
|
65 |
-
def create_html_table_benchmark(df
|
66 |
-
# Create HTML table without JavaScript sorting
|
67 |
html = '''
|
68 |
<style>
|
69 |
table {
|
@@ -88,7 +113,7 @@ def create_html_table_benchmark(df, benchmarks):
|
|
88 |
html += '<thead><tr>'
|
89 |
for column in df.columns:
|
90 |
if column != "Reproduced_all":
|
91 |
-
html += f'<th>{column}</th>'
|
92 |
html += '</tr></thead>'
|
93 |
html += '<tbody>'
|
94 |
for _, row in df.iterrows():
|
@@ -96,41 +121,60 @@ def create_html_table_benchmark(df, benchmarks):
|
|
96 |
for column in df.columns:
|
97 |
if column == "Reproduced":
|
98 |
if row[column] == "-":
|
99 |
-
html += f'<td>{row[column]}</td>'
|
100 |
else:
|
101 |
-
|
|
|
|
|
102 |
elif column == "Reproduced_all":
|
103 |
continue
|
104 |
else:
|
105 |
-
html += f'<td>{row[column]}</td>'
|
106 |
html += '</tr>'
|
107 |
html += '</tbody></table>'
|
108 |
html += '</div>'
|
109 |
return html
|
110 |
|
111 |
def check_sanity(agent):
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
|
|
|
|
|
|
|
|
131 |
|
132 |
def main():
|
133 |
-
st.set_page_config(page_title="BrowserGym Leaderboard", layout="wide")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
134 |
|
135 |
all_agents = os.listdir("results")
|
136 |
all_results = {}
|
@@ -148,7 +192,7 @@ def main():
|
|
148 |
st.markdown("Leaderboard to evaluate LLMs, VLMs, and agents on web navigation tasks.")
|
149 |
# content = create_yall()
|
150 |
# tab1, tab2, tab3, tab4 = st.tabs(["π WebAgent Leaderboard", "WorkArena++-L2 Leaderboard", "WorkArena++-L3 Leaderboard", "π About"])
|
151 |
-
tabs = st.tabs(["π
|
152 |
|
153 |
with tabs[0]:
|
154 |
# Leaderboard tab
|
@@ -190,8 +234,13 @@ def main():
|
|
190 |
# Display the filtered DataFrame or the entire leaderboard
|
191 |
|
192 |
def make_hyperlink(agent_name):
|
193 |
-
|
194 |
-
|
|
|
|
|
|
|
|
|
|
|
195 |
df['Agent'] = df['Agent'].apply(make_hyperlink)
|
196 |
# st.dataframe(
|
197 |
# df[['Agent'] + BENCHMARKS],
|
@@ -201,10 +250,8 @@ def main():
|
|
201 |
# # height=int(len(df) * 36.2),
|
202 |
# )
|
203 |
# st.markdown(df.to_html(escape=False, index=False), unsafe_allow_html=True)
|
204 |
-
html_table = create_html_table_main(df
|
205 |
-
# print (html_table)
|
206 |
st.markdown(html_table, unsafe_allow_html=True)
|
207 |
-
# components.html(html_table, height=600, scrolling=True)
|
208 |
|
209 |
if st.button("Export to CSV", key="export_main"):
|
210 |
# Export the DataFrame to CSV
|
@@ -280,7 +327,7 @@ def main():
|
|
280 |
# column_config={benchmark: {'alignment': 'center'}},
|
281 |
# hide_index=True,
|
282 |
# )
|
283 |
-
html_table = create_html_table_benchmark(df_
|
284 |
st.markdown(html_table, unsafe_allow_html=True)
|
285 |
|
286 |
|
|
|
10 |
from huggingface_hub.utils import RepositoryNotFoundError, RevisionNotFoundError
|
11 |
import streamlit.components.v1 as components
|
12 |
|
13 |
+
from urllib.parse import quote
|
14 |
+
from pathlib import Path
|
15 |
+
import re
|
16 |
+
import html
|
17 |
+
from typing import Dict, Any
|
18 |
+
|
19 |
# BENCHMARKS = ["WorkArena-L1", "WorkArena++-L2", "WorkArena++-L3", "MiniWoB", "WebArena"]
|
20 |
BENCHMARKS = ["WebArena", "WorkArena-L1", "WorkArena++-L2", "WorkArena++-L3", "MiniWoB",]
|
21 |
|
22 |
+
def sanitize_agent_name(agent_name):
    """Validate an agent name before it is used in paths, URLs or HTML.

    A valid name starts with an ASCII letter, digit, hyphen or underscore and
    continues with those characters plus dots.

    Args:
        agent_name: Candidate agent name.

    Returns:
        ``agent_name`` unchanged when it is valid.

    Raises:
        ValueError: If the name is not a string, starts with a dot, or
            contains any character outside the allowed set.
    """
    # Non-string values (e.g. NaN cells) are rejected with the same exception
    # type callers already handle, instead of failing on ``.startswith``.
    if not isinstance(agent_name, str):
        raise ValueError("Invalid agent name format")
    if agent_name.startswith('.'):
        raise ValueError("Agent name cannot start with a dot")
    # fullmatch rather than match + "$": "$" also matches just before a
    # trailing newline, which would let "name\n" through.
    if not re.fullmatch(r"[a-zA-Z0-9_-][a-zA-Z0-9_.-]*", agent_name):
        raise ValueError("Invalid agent name format")
    return agent_name
|
30 |
+
|
31 |
+
def safe_path_join(*parts):
    """Join ``parts`` under the ``results`` directory and refuse to leave it.

    Args:
        *parts: Path components appended to ``results``.

    Returns:
        The resolved absolute ``Path``.

    Raises:
        ValueError: If the components cannot be joined or resolved
            ("Invalid path"), or if the resolved path is neither ``results``
            itself nor located beneath it ("Path traversal detected").
    """
    base = Path("results").resolve()
    try:
        path = base.joinpath(*parts).resolve()
    except (TypeError, ValueError, OSError, RuntimeError) as err:
        raise ValueError("Invalid path") from err
    # Compare path components, not string prefixes: a sibling directory such
    # as "results_backup" shares the textual prefix but lies outside the base.
    try:
        path.relative_to(base)
    except ValueError:
        raise ValueError("Path traversal detected") from None
    return path
|
41 |
+
|
42 |
+
def sanitize_column_name(col: str) -> str:
    """Return the column label as text with HTML special characters escaped."""
    label = str(col)
    return html.escape(label)
|
45 |
+
|
46 |
+
def sanitize_cell_value(value: Any) -> str:
    """Convert a table cell to text that is safe to embed in HTML.

    ``int`` and ``float`` values are returned as their plain ``str()`` form;
    every other value is stringified and HTML-escaped.
    """
    rendered = str(value)
    is_number = isinstance(value, (int, float))
    return rendered if is_number else html.escape(rendered)
|
51 |
+
|
52 |
+
def create_html_table_main(df):
|
53 |
html = '''
|
54 |
<style>
|
55 |
table {
|
|
|
73 |
html += '<table>'
|
74 |
html += '<thead><tr>'
|
75 |
for column in df.columns:
|
76 |
+
html += f'<th>{sanitize_column_name(column)}</th>'
|
77 |
html += '</tr></thead>'
|
78 |
html += '<tbody>'
|
79 |
for _, row in df.iterrows():
|
80 |
html += '<tr>'
|
81 |
for col in df.columns:
|
82 |
+
if col == "Agent":
|
83 |
+
html += f'<td>{row[col]}</td>'
|
84 |
+
else:
|
85 |
+
html += f'<td>{sanitize_cell_value(row[col])}</td>'
|
86 |
html += '</tr>'
|
87 |
html += '</tbody></table>'
|
88 |
html += '</div>'
|
89 |
return html
|
90 |
|
91 |
+
def create_html_table_benchmark(df):
|
|
|
92 |
html = '''
|
93 |
<style>
|
94 |
table {
|
|
|
113 |
html += '<thead><tr>'
|
114 |
for column in df.columns:
|
115 |
if column != "Reproduced_all":
|
116 |
+
html += f'<th>{sanitize_column_name(column)}</th>'
|
117 |
html += '</tr></thead>'
|
118 |
html += '<tbody>'
|
119 |
for _, row in df.iterrows():
|
|
|
121 |
for column in df.columns:
|
122 |
if column == "Reproduced":
|
123 |
if row[column] == "-":
|
124 |
+
html += f'<td>{sanitize_cell_value(row[column])}</td>'
|
125 |
else:
|
126 |
+
summary = sanitize_cell_value(row[column])
|
127 |
+
details = "<br>".join(map(sanitize_cell_value, row["Reproduced_all"]))
|
128 |
+
html += f'<td><details><summary>{summary}</summary>{details}</details></td>'
|
129 |
elif column == "Reproduced_all":
|
130 |
continue
|
131 |
else:
|
132 |
+
html += f'<td>{sanitize_cell_value(row[column])}</td>'
|
133 |
html += '</tr>'
|
134 |
html += '</tbody></table>'
|
135 |
html += '</div>'
|
136 |
return html
|
137 |
|
138 |
def check_sanity(agent):
    """Check that every stored benchmark result file for ``agent`` is well formed.

    For each entry of ``BENCHMARKS`` the file
    ``results/<agent>/<benchmark.lower()>.json`` is examined; benchmarks that
    have no such file are skipped. A file passes when it holds a list of
    objects that each carry all required keys, name this agent and this
    benchmark, and exactly one of them is marked ``"Original"``.

    Args:
        agent: Agent directory name under ``results``.

    Returns:
        True if every present file passes, False otherwise. An agent name or
        path rejected by the sanitizing helpers, and a file that is not valid
        JSON, also yield False.
    """
    required_keys = (
        "agent_name",
        "benchmark",
        "original_or_reproduced",
        "score",
        "std_err",
        "benchmark_specific",
        "benchmark_tuned",
        "followed_evaluation_protocol",
        "reproducible",
        "comments",
        "study_id",
        "date_time",
    )
    try:
        safe_agent = sanitize_agent_name(agent)
        for benchmark in BENCHMARKS:
            file_path = safe_path_join(safe_agent, f"{benchmark.lower()}.json")
            if not file_path.is_file():
                continue
            with open(file_path, encoding="utf-8") as f:
                results = json.load(f)
            # Valid JSON of the wrong shape must count as a failed check; a
            # TypeError from indexing it would escape the handler below.
            if not isinstance(results, list):
                return False
            original_count = 0
            for result in results:
                if not isinstance(result, dict):
                    return False
                if not all(key in result for key in required_keys):
                    return False
                if result["agent_name"] != agent:
                    return False
                if result["benchmark"] != benchmark:
                    return False
                if result["original_or_reproduced"] == "Original":
                    original_count += 1
            if original_count != 1:
                return False
        return True
    except ValueError:
        # Raised by the sanitizing helpers; json.JSONDecodeError and
        # UnicodeDecodeError are ValueError subclasses as well.
        return False
|
162 |
|
163 |
def main():
|
164 |
+
st.set_page_config(page_title="BrowserGym Leaderboard", layout="wide", initial_sidebar_state="expanded")
|
165 |
+
st.markdown("""
|
166 |
+
<head>
|
167 |
+
<meta http-equiv="Content-Security-Policy"
|
168 |
+
content="default-src 'self' https://huggingface.co;
|
169 |
+
script-src 'self' 'unsafe-inline';
|
170 |
+
style-src 'self' 'unsafe-inline';
|
171 |
+
img-src 'self' data: https:;
|
172 |
+
frame-ancestors 'none';">
|
173 |
+
<meta http-equiv="X-Frame-Options" content="DENY">
|
174 |
+
<meta http-equiv="X-Content-Type-Options" content="nosniff">
|
175 |
+
<meta http-equiv="Referrer-Policy" content="strict-origin-when-cross-origin">
|
176 |
+
</head>
|
177 |
+
""", unsafe_allow_html=True)
|
178 |
|
179 |
all_agents = os.listdir("results")
|
180 |
all_results = {}
|
|
|
192 |
st.markdown("Leaderboard to evaluate LLMs, VLMs, and agents on web navigation tasks.")
|
193 |
# content = create_yall()
|
194 |
# tab1, tab2, tab3, tab4 = st.tabs(["π WebAgent Leaderboard", "WorkArena++-L2 Leaderboard", "WorkArena++-L3 Leaderboard", "π About"])
|
195 |
+
tabs = st.tabs(["π Main Leaderboard",] + BENCHMARKS + ["π About"])
|
196 |
|
197 |
with tabs[0]:
|
198 |
# Leaderboard tab
|
|
|
234 |
# Display the filtered DataFrame or the entire leaderboard
|
235 |
|
236 |
def make_hyperlink(agent_name):
    """Render an agent name as an HTML link to its README in the leaderboard Space.

    Returns an empty string when ``sanitize_agent_name`` rejects the name.
    """
    try:
        validated = sanitize_agent_name(agent_name)
    except ValueError:
        return ""
    readme_url = f"https://huggingface.co/spaces/ServiceNow/browsergym-leaderboard/blob/main/results/{quote(validated)}/README.md"
    href = html.escape(readme_url)
    label = html.escape(validated)
    return f'<a href="{href}" target="_blank">{label}</a>'
|
243 |
+
|
244 |
df['Agent'] = df['Agent'].apply(make_hyperlink)
|
245 |
# st.dataframe(
|
246 |
# df[['Agent'] + BENCHMARKS],
|
|
|
250 |
# # height=int(len(df) * 36.2),
|
251 |
# )
|
252 |
# st.markdown(df.to_html(escape=False, index=False), unsafe_allow_html=True)
|
253 |
+
html_table = create_html_table_main(df)
|
|
|
254 |
st.markdown(html_table, unsafe_allow_html=True)
|
|
|
255 |
|
256 |
if st.button("Export to CSV", key="export_main"):
|
257 |
# Export the DataFrame to CSV
|
|
|
327 |
# column_config={benchmark: {'alignment': 'center'}},
|
328 |
# hide_index=True,
|
329 |
# )
|
330 |
+
html_table = create_html_table_benchmark(df_)
|
331 |
st.markdown(html_table, unsafe_allow_html=True)
|
332 |
|
333 |
|