meghsn committed on
Commit
b667dc2
·
1 Parent(s): 2705446

Security checks

Browse files
Files changed (2) hide show
  1. .gitignore +2 -0
  2. app.py +95 -48
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ __pycache__/
2
+ *.pyc
app.py CHANGED
@@ -10,23 +10,46 @@ from huggingface_hub import HfApi
10
  from huggingface_hub.utils import RepositoryNotFoundError, RevisionNotFoundError
11
  import streamlit.components.v1 as components
12
 
 
 
 
 
 
 
13
  # BENCHMARKS = ["WorkArena-L1", "WorkArena++-L2", "WorkArena++-L3", "MiniWoB", "WebArena"]
14
  BENCHMARKS = ["WebArena", "WorkArena-L1", "WorkArena++-L2", "WorkArena++-L3", "MiniWoB",]
15
 
16
- def create_html_table_main(df, benchmarks):
17
- col1, col2 = st.columns([2,6])
18
- with col1:
19
- sort_column = st.selectbox("Sort by", df.columns.tolist())
20
- with col2:
21
- sort_order = st.radio("Order", ["Ascending", "Descending"], horizontal=True)
22
-
23
- # Sort dataframe
24
- if sort_order == "Ascending":
25
- df = df.sort_values(by=sort_column)
26
- else:
27
- df = df.sort_values(by=sort_column, ascending=False)
28
 
29
- # Create HTML table without JavaScript sorting
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  html = '''
31
  <style>
32
  table {
@@ -50,20 +73,22 @@ def create_html_table_main(df, benchmarks):
50
  html += '<table>'
51
  html += '<thead><tr>'
52
  for column in df.columns:
53
- html += f'<th>{column}</th>'
54
  html += '</tr></thead>'
55
  html += '<tbody>'
56
  for _, row in df.iterrows():
57
  html += '<tr>'
58
  for col in df.columns:
59
- html += f'<td>{row[col]}</td>'
 
 
 
60
  html += '</tr>'
61
  html += '</tbody></table>'
62
  html += '</div>'
63
  return html
64
 
65
- def create_html_table_benchmark(df, benchmarks):
66
- # Create HTML table without JavaScript sorting
67
  html = '''
68
  <style>
69
  table {
@@ -88,7 +113,7 @@ def create_html_table_benchmark(df, benchmarks):
88
  html += '<thead><tr>'
89
  for column in df.columns:
90
  if column != "Reproduced_all":
91
- html += f'<th>{column}</th>'
92
  html += '</tr></thead>'
93
  html += '<tbody>'
94
  for _, row in df.iterrows():
@@ -96,41 +121,60 @@ def create_html_table_benchmark(df, benchmarks):
96
  for column in df.columns:
97
  if column == "Reproduced":
98
  if row[column] == "-":
99
- html += f'<td>{row[column]}</td>'
100
  else:
101
- html += f'<td><details><summary>{row[column]}</summary>{"<br>".join(map(str, row["Reproduced_all"]))}</details></td>'
 
 
102
  elif column == "Reproduced_all":
103
  continue
104
  else:
105
- html += f'<td>{row[column]}</td>'
106
  html += '</tr>'
107
  html += '</tbody></table>'
108
  html += '</div>'
109
  return html
110
 
111
  def check_sanity(agent):
112
- for benchmark in BENCHMARKS:
113
- file_path = f"results/{agent}/{benchmark.lower()}.json"
114
- if not os.path.exists(file_path):
115
- continue
116
- original_count = 0
117
- with open(file_path) as f:
118
- results = json.load(f)
119
- for result in results:
120
- if not all(key in result for key in ["agent_name", "benchmark", "original_or_reproduced", "score", "std_err", "benchmark_specific", "benchmark_tuned", "followed_evaluation_protocol", "reproducible", "comments", "study_id", "date_time"]):
121
- return False
122
- if result["agent_name"] != agent:
123
- return False
124
- if result["benchmark"] != benchmark:
125
- return False
126
- if result["original_or_reproduced"] == "Original":
127
- original_count += 1
128
- if original_count != 1:
129
- return False
130
- return True
 
 
 
 
131
 
132
  def main():
133
- st.set_page_config(page_title="BrowserGym Leaderboard", layout="wide")
 
 
 
 
 
 
 
 
 
 
 
 
 
134
 
135
  all_agents = os.listdir("results")
136
  all_results = {}
@@ -148,7 +192,7 @@ def main():
148
  st.markdown("Leaderboard to evaluate LLMs, VLMs, and agents on web navigation tasks.")
149
  # content = create_yall()
150
  # tab1, tab2, tab3, tab4 = st.tabs(["πŸ† WebAgent Leaderboard", "WorkArena++-L2 Leaderboard", "WorkArena++-L3 Leaderboard", "πŸ“ About"])
151
- tabs = st.tabs(["πŸ† WebAgent Leaderboard",] + BENCHMARKS + ["πŸ“ About"])
152
 
153
  with tabs[0]:
154
  # Leaderboard tab
@@ -190,8 +234,13 @@ def main():
190
  # Display the filtered DataFrame or the entire leaderboard
191
 
192
  def make_hyperlink(agent_name):
193
- url = f"https://huggingface.co/spaces/meghsn/WebAgent-Leaderboard/blob/main/results/{agent_name}/README.md"
194
- return f'<a href="{url}" target="_blank">{agent_name}</a>'
 
 
 
 
 
195
  df['Agent'] = df['Agent'].apply(make_hyperlink)
196
  # st.dataframe(
197
  # df[['Agent'] + BENCHMARKS],
@@ -201,10 +250,8 @@ def main():
201
  # # height=int(len(df) * 36.2),
202
  # )
203
  # st.markdown(df.to_html(escape=False, index=False), unsafe_allow_html=True)
204
- html_table = create_html_table_main(df, BENCHMARKS)
205
- # print (html_table)
206
  st.markdown(html_table, unsafe_allow_html=True)
207
- # components.html(html_table, height=600, scrolling=True)
208
 
209
  if st.button("Export to CSV", key="export_main"):
210
  # Export the DataFrame to CSV
@@ -280,7 +327,7 @@ def main():
280
  # column_config={benchmark: {'alignment': 'center'}},
281
  # hide_index=True,
282
  # )
283
- html_table = create_html_table_benchmark(df_, BENCHMARKS)
284
  st.markdown(html_table, unsafe_allow_html=True)
285
 
286
 
 
10
  from huggingface_hub.utils import RepositoryNotFoundError, RevisionNotFoundError
11
  import streamlit.components.v1 as components
12
 
13
+ from urllib.parse import quote
14
+ from pathlib import Path
15
+ import re
16
+ import html
17
+ from typing import Dict, Any
18
+
19
  # BENCHMARKS = ["WorkArena-L1", "WorkArena++-L2", "WorkArena++-L3", "MiniWoB", "WebArena"]
20
  BENCHMARKS = ["WebArena", "WorkArena-L1", "WorkArena++-L2", "WorkArena++-L3", "MiniWoB",]
21
 
22
def sanitize_agent_name(agent_name):
    """Validate an agent name before it is used in a file path or URL.

    A valid name is a non-empty string made only of ASCII letters, digits,
    hyphens, underscores and dots, and it must not start with a dot.

    Args:
        agent_name: Candidate agent name.

    Returns:
        The unchanged ``agent_name`` when it is valid.

    Raises:
        ValueError: If the name is not a string, starts with a dot, or
            contains any character outside the allowed set.
    """
    # Non-string values (e.g. NaN from a DataFrame) must fail with the same
    # exception type callers already handle, not with AttributeError.
    if not isinstance(agent_name, str):
        raise ValueError("Invalid agent name format")
    if agent_name.startswith('.'):
        raise ValueError("Agent name cannot start with a dot")
    # fullmatch (rather than match with "$") also rejects a trailing newline,
    # which "$" would silently accept.
    if not re.fullmatch(r"[a-zA-Z0-9_-][a-zA-Z0-9_.-]*", agent_name):
        raise ValueError("Invalid agent name format")
    return agent_name
30
+
31
def safe_path_join(*parts):
    """Join ``parts`` under the ``results`` directory, refusing escapes.

    The ``results`` directory is resolved relative to the current working
    directory. The joined path is resolved as well, so ``..`` segments and
    symlinks are taken into account before the containment check.

    Args:
        *parts: Path components to append to the ``results`` directory.

    Returns:
        The resolved ``pathlib.Path`` located inside ``results``.

    Raises:
        ValueError: If the components cannot form a path, or if the resolved
            path lies outside the ``results`` directory.
    """
    base = Path("results").resolve()
    try:
        path = base.joinpath(*parts).resolve()
    except (TypeError, ValueError, OSError, RuntimeError) as err:
        raise ValueError("Invalid path") from err
    # Compare path components, not string prefixes: a sibling directory such
    # as "results_evil" shares the string prefix but is not inside "results".
    try:
        path.relative_to(base)
    except ValueError:
        raise ValueError("Path traversal detected") from None
    return path
41
+
42
def sanitize_column_name(col: str) -> str:
    """Return the column label as HTML-escaped text."""
    label = str(col)
    return html.escape(label)
45
+
46
def sanitize_cell_value(value: Any) -> str:
    """Render a table cell value as text for HTML output.

    ``int`` and ``float`` values are converted with ``str`` and returned
    as-is; every other value is converted with ``str`` and HTML-escaped.
    """
    rendered = str(value)
    if not isinstance(value, (int, float)):
        rendered = html.escape(rendered)
    return rendered
51
+
52
+ def create_html_table_main(df):
53
  html = '''
54
  <style>
55
  table {
 
73
  html += '<table>'
74
  html += '<thead><tr>'
75
  for column in df.columns:
76
+ html += f'<th>{sanitize_column_name(column)}</th>'
77
  html += '</tr></thead>'
78
  html += '<tbody>'
79
  for _, row in df.iterrows():
80
  html += '<tr>'
81
  for col in df.columns:
82
+ if col == "Agent":
83
+ html += f'<td>{row[col]}</td>'
84
+ else:
85
+ html += f'<td>{sanitize_cell_value(row[col])}</td>'
86
  html += '</tr>'
87
  html += '</tbody></table>'
88
  html += '</div>'
89
  return html
90
 
91
+ def create_html_table_benchmark(df):
 
92
  html = '''
93
  <style>
94
  table {
 
113
  html += '<thead><tr>'
114
  for column in df.columns:
115
  if column != "Reproduced_all":
116
+ html += f'<th>{sanitize_column_name(column)}</th>'
117
  html += '</tr></thead>'
118
  html += '<tbody>'
119
  for _, row in df.iterrows():
 
121
  for column in df.columns:
122
  if column == "Reproduced":
123
  if row[column] == "-":
124
+ html += f'<td>{sanitize_cell_value(row[column])}</td>'
125
  else:
126
+ summary = sanitize_cell_value(row[column])
127
+ details = "<br>".join(map(sanitize_cell_value, row["Reproduced_all"]))
128
+ html += f'<td><details><summary>{summary}</summary>{details}</details></td>'
129
  elif column == "Reproduced_all":
130
  continue
131
  else:
132
+ html += f'<td>{sanitize_cell_value(row[column])}</td>'
133
  html += '</tr>'
134
  html += '</tbody></table>'
135
  html += '</div>'
136
  return html
137
 
138
def check_sanity(agent):
    """Check that an agent's result files are well formed.

    For each benchmark in ``BENCHMARKS`` the file
    ``results/<agent>/<benchmark.lower()>.json`` is inspected if it exists;
    missing files are skipped. A file passes when it holds a list of records
    where every record has all required keys, names this agent and this
    benchmark, and exactly one record is marked ``"Original"``.

    Args:
        agent: Agent directory name under ``results``.

    Returns:
        ``True`` if every existing result file passes, ``False`` otherwise.
        An invalid agent name, an unsafe path, or undecodable/invalid JSON
        (all surfaced as ``ValueError``) also yield ``False``.
    """
    required_keys = (
        "agent_name", "benchmark", "original_or_reproduced", "score",
        "std_err", "benchmark_specific", "benchmark_tuned",
        "followed_evaluation_protocol", "reproducible", "comments",
        "study_id", "date_time",
    )
    try:
        safe_agent = sanitize_agent_name(agent)
        for benchmark in BENCHMARKS:
            file_path = safe_path_join(safe_agent, f"{benchmark.lower()}.json")
            if not file_path.is_file():
                continue
            # JSON is UTF-8 by specification; do not rely on the platform
            # default encoding.
            with open(file_path, encoding="utf-8") as f:
                results = json.load(f)
            # A payload that is not a list of objects is malformed; reject it
            # instead of iterating over dict keys or characters.
            if not isinstance(results, list):
                return False
            original_count = 0
            for result in results:
                if not isinstance(result, dict):
                    return False
                if not all(key in result for key in required_keys):
                    return False
                if result["agent_name"] != agent:
                    return False
                if result["benchmark"] != benchmark:
                    return False
                if result["original_or_reproduced"] == "Original":
                    original_count += 1
            if original_count != 1:
                return False
        return True
    except ValueError:
        return False
162
 
163
  def main():
164
+ st.set_page_config(page_title="BrowserGym Leaderboard", layout="wide", initial_sidebar_state="expanded")
165
+ st.markdown("""
166
+ <head>
167
+ <meta http-equiv="Content-Security-Policy"
168
+ content="default-src 'self' https://huggingface.co;
169
+ script-src 'self' 'unsafe-inline';
170
+ style-src 'self' 'unsafe-inline';
171
+ img-src 'self' data: https:;
172
+ frame-ancestors 'none';">
173
+ <meta http-equiv="X-Frame-Options" content="DENY">
174
+ <meta http-equiv="X-Content-Type-Options" content="nosniff">
175
+ <meta http-equiv="Referrer-Policy" content="strict-origin-when-cross-origin">
176
+ </head>
177
+ """, unsafe_allow_html=True)
178
 
179
  all_agents = os.listdir("results")
180
  all_results = {}
 
192
  st.markdown("Leaderboard to evaluate LLMs, VLMs, and agents on web navigation tasks.")
193
  # content = create_yall()
194
  # tab1, tab2, tab3, tab4 = st.tabs(["πŸ† WebAgent Leaderboard", "WorkArena++-L2 Leaderboard", "WorkArena++-L3 Leaderboard", "πŸ“ About"])
195
+ tabs = st.tabs(["πŸ† Main Leaderboard",] + BENCHMARKS + ["πŸ“ About"])
196
 
197
  with tabs[0]:
198
  # Leaderboard tab
 
234
  # Display the filtered DataFrame or the entire leaderboard
235
 
236
def make_hyperlink(agent_name):
    """Build an HTML link to the agent's README in the leaderboard Space.

    Args:
        agent_name: Agent name as stored in the ``Agent`` column.

    Returns:
        An ``<a>`` element that opens the README in a new tab, or an empty
        string when the name is not a string or fails validation.
    """
    if not isinstance(agent_name, str):
        return ""
    try:
        safe_name = sanitize_agent_name(agent_name)
    except ValueError:
        return ""
    safe_url = f"https://huggingface.co/spaces/ServiceNow/browsergym-leaderboard/blob/main/results/{quote(safe_name)}/README.md"
    # rel="noopener noreferrer" stops the opened tab from reaching back to
    # this page through window.opener.
    return f'<a href="{html.escape(safe_url)}" target="_blank" rel="noopener noreferrer">{html.escape(safe_name)}</a>'
243
+
244
  df['Agent'] = df['Agent'].apply(make_hyperlink)
245
  # st.dataframe(
246
  # df[['Agent'] + BENCHMARKS],
 
250
  # # height=int(len(df) * 36.2),
251
  # )
252
  # st.markdown(df.to_html(escape=False, index=False), unsafe_allow_html=True)
253
+ html_table = create_html_table_main(df)
 
254
  st.markdown(html_table, unsafe_allow_html=True)
 
255
 
256
  if st.button("Export to CSV", key="export_main"):
257
  # Export the DataFrame to CSV
 
327
  # column_config={benchmark: {'alignment': 'center'}},
328
  # hide_index=True,
329
  # )
330
+ html_table = create_html_table_benchmark(df_)
331
  st.markdown(html_table, unsafe_allow_html=True)
332
 
333