nebulae09 committed on
Commit
5ab1442
·
1 Parent(s): 8e5bfdb

update code with MLVU and TempCompass

Browse files
Files changed (2) hide show
  1. app.py +2 -2
  2. lb_info.py +42 -8
app.py CHANGED
@@ -8,8 +8,8 @@ with gr.Blocks() as demo:
8
  EVAL_TIME = format_timestamp(timestamp)
9
  results = struct['results']
10
  N_MODEL = len(results)
11
- N_DATA = len(results['Video-LLaVA']) - 1
12
- DATASETS = list(results['Video-LLaVA'])
13
  DATASETS.remove('META')
14
  print(DATASETS)
15
 
 
8
  EVAL_TIME = format_timestamp(timestamp)
9
  results = struct['results']
10
  N_MODEL = len(results)
11
+ N_DATA = len(results['Video-LLaVA-7B']) - 1
12
+ DATASETS = list(results['Video-LLaVA-7B'])
13
  DATASETS.remove('META')
14
  print(DATASETS)
15
 
lb_info.py CHANGED
@@ -36,7 +36,7 @@ This leaderboard was last updated: {}.
36
  """
37
  # CONSTANTS-FIELDS
38
  META_FIELDS = ['Method', 'Parameters (B)', 'Language Model', 'Vision Model', 'OpenSource', 'Verified', 'Frames']
39
- MAIN_FIELDS = ['MVBench', 'Video-MME (w/o subs)', 'MMBench-Video']
40
  MODEL_SIZE = ['<10B', '10B-20B', '20B-40B', '>40B', 'Unknown']
41
  MODEL_TYPE = ['API', 'OpenSource']
42
 
@@ -134,13 +134,41 @@ def BUILD_L1_DF(results, fields):
134
  res[k].append(meta[k])
135
  scores, ranks = [], []
136
  for d in fields:
137
- res[d].append(item[d]['Overall'])
138
- # scores.append(item[d]['Overall'])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
  if d == 'MMBench-Video':
140
  scores.append(item[d]['Overall'] / 3 * 100)
 
 
 
 
141
  else:
142
  scores.append(item[d]['Overall'])
143
- ranks.append(nth_large(item[d]['Overall'], [x[d]['Overall'] for x in results.values() if 'Overall' in x[d]]))
 
 
 
 
 
 
 
 
 
144
  res['Avg Score'].append(round(np.mean(scores), 1))
145
  res['Avg Rank'].append(round(np.mean(ranks), 2))
146
 
@@ -160,13 +188,13 @@ def BUILD_L1_DF(results, fields):
160
  def BUILD_L2_DF(results, dataset):
161
  res = defaultdict(list)
162
  fields = list(list(results.values())[0][dataset].keys())
163
- non_overall_fields = [x for x in fields if 'Overall' not in x]
164
- overall_fields = [x for x in fields if 'Overall' in x]
165
 
166
  for m in results:
167
  item = results[m]
168
  meta = item['META']
169
- if item[dataset] == {}:
170
  continue
171
  for k in META_FIELDS:
172
  if k == 'Parameters (B)':
@@ -186,7 +214,12 @@ def BUILD_L2_DF(results, dataset):
186
  res[d].append(item[dataset][d])
187
 
188
  df = pd.DataFrame(res)
189
- df = df.sort_values('Overall')
 
 
 
 
 
190
  df = df.iloc[::-1]
191
 
192
  check_box = {}
@@ -202,4 +235,5 @@ def BUILD_L2_DF(results, dataset):
202
  type_map['Method'] = 'html'
203
  type_map['Language Model'] = type_map['Vision Model'] = type_map['OpenSource'] = type_map['Verified'] = type_map['Frames'] ='str'
204
  check_box['type_map'] = type_map
 
205
  return df, check_box
 
36
  """
37
  # CONSTANTS-FIELDS
38
  META_FIELDS = ['Method', 'Parameters (B)', 'Language Model', 'Vision Model', 'OpenSource', 'Verified', 'Frames']
39
+ MAIN_FIELDS = ['MVBench', 'Video-MME (w/o subs)', 'MMBench-Video', 'TempCompass', 'MLVU']
40
  MODEL_SIZE = ['<10B', '10B-20B', '20B-40B', '>40B', 'Unknown']
41
  MODEL_TYPE = ['API', 'OpenSource']
42
 
 
134
  res[k].append(meta[k])
135
  scores, ranks = [], []
136
  for d in fields:
137
+ # if d == 'MLVU':
138
+ # item[d]['Overall'] = item[d]['M-Avg'] * 0.84 + item[d]['G-Avg'] * 10 * 0.16
139
+ # elif d == 'TempCompass':
140
+ # item[d]['Overall'] = item[d]['overall']
141
+ if d == 'MLVU':
142
+ res[d].append(
143
+ f'M-Avg: {item[d]["M-Avg"]}, G-Avg: {item[d]["G-Avg"]}'
144
+ # {
145
+ # 'M-Avg': item[d]['M-Avg'],
146
+ # 'G-Avg': item[d]['G-Avg']
147
+ # }
148
+ )
149
+ elif d == 'TempCompass':
150
+ res[d].append(item[d]['overall'])
151
+ else:
152
+ res[d].append(item[d]['Overall'])
153
+
154
  if d == 'MMBench-Video':
155
  scores.append(item[d]['Overall'] / 3 * 100)
156
+ elif d == 'TempCompass':
157
+ scores.append(item[d]['overall'])
158
+ elif d == 'MLVU':
159
+ scores.append(item[d]['M-Avg'] * 0.84 + item[d]['G-Avg'] * 10 * 0.16)
160
  else:
161
  scores.append(item[d]['Overall'])
162
+
163
+ if d == 'MLVU':
164
+ ranks.append(nth_large(
165
+ item[d]['M-Avg'] * 0.84 + item[d]['G-Avg'] * 10 * 0.16,
166
+ [x[d]['M-Avg'] * 0.84 + x[d]['G-Avg'] * 10 * 0.16 for x in results.values() if d in x and 'M-Avg' in x[d] and 'G-Avg' in x[d]]
167
+ ))
168
+ elif d == 'TempCompass':
169
+ ranks.append(nth_large(item[d]['overall'], [x[d]['overall'] for x in results.values() if d in x and 'overall' in x[d]]))
170
+ else:
171
+ ranks.append(nth_large(item[d]['Overall'], [x[d]['Overall'] for x in results.values() if d in x and 'Overall' in x[d]]))
172
  res['Avg Score'].append(round(np.mean(scores), 1))
173
  res['Avg Rank'].append(round(np.mean(ranks), 2))
174
 
 
188
  def BUILD_L2_DF(results, dataset):
189
  res = defaultdict(list)
190
  fields = list(list(results.values())[0][dataset].keys())
191
+ non_overall_fields = [x for x in fields if 'Overall' not in x and 'Avg' not in x and 'overall' not in x]
192
+ overall_fields = [x for x in fields if 'Overall' in x or 'Avg' in x or 'overall' in x]
193
 
194
  for m in results:
195
  item = results[m]
196
  meta = item['META']
197
+ if dataset not in item or item[dataset] == {}:
198
  continue
199
  for k in META_FIELDS:
200
  if k == 'Parameters (B)':
 
214
  res[d].append(item[dataset][d])
215
 
216
  df = pd.DataFrame(res)
217
+ if dataset == 'MLVU':
218
+ df = df.sort_values('M-Avg')
219
+ elif dataset == 'TempCompass':
220
+ df = df.sort_values('overall')
221
+ else:
222
+ df = df.sort_values('Overall')
223
  df = df.iloc[::-1]
224
 
225
  check_box = {}
 
235
  type_map['Method'] = 'html'
236
  type_map['Language Model'] = type_map['Vision Model'] = type_map['OpenSource'] = type_map['Verified'] = type_map['Frames'] ='str'
237
  check_box['type_map'] = type_map
238
+ # print(check_box, dataset, df.columns)
239
  return df, check_box