cccjc committed
Commit: d16a60b
Parent: 6a59158

add internvl-2.5-8B

constants.py CHANGED
@@ -115,6 +115,7 @@ MODEL_NAME_MAP = {
     "POINTS_15_7B": "POINTS-1.5-8B",
     "InternVL2_5_78B": "InternVL2.5-78B",
     "InternVL2_5_2B": "InternVL2.5-2B",
+    "InternVL2_5_8B": "InternVL2.5-8B",
 }
 
 DIMENSION_NAME_MAP = {
@@ -201,15 +202,16 @@ MODEL_URLS = {
     "Mammoth_VL": "https://huggingface.co/MAmmoTH-VL/MAmmoTH-VL-8B",
     "InternVL2_5_78B": "https://huggingface.co/OpenGVLab/InternVL2_5-78B",
     "InternVL2_5_2B": "https://huggingface.co/OpenGVLab/InternVL2_5-2B",
+    "InternVL2_5_8B": "https://huggingface.co/OpenGVLab/InternVL2_5-8B",
 }
 
 # Define the base MODEL_GROUPS structure
 BASE_MODEL_GROUPS = {
     "All": list(MODEL_NAME_MAP.keys()),
     "Flagship Models": ['Claude_3.5_new', 'GPT_4o', 'Claude_3.5', 'Gemini_1.5_pro_002', 'Qwen2_VL_72B', 'InternVL2_76B', 'llava_onevision_72B', 'NVLM', 'Molmo_72B', 'InternVL2_5_78B'],
-    "Efficiency Models": ['Gemini_1.5_flash_002', 'GPT_4o_mini', 'Qwen2_VL_7B', 'Pixtral_12B', 'Aria', 'InternVL2_8B', 'Phi-3.5-vision', 'MiniCPM_v2.6', 'llava_onevision_7B', 'Llama_3_2_11B', 'Idefics3', 'Molmo_7B_D', "Aquila_VL_2B", "POINTS_7B", "Qwen2_VL_2B", "InternVL2_2B", "InternVL2_5_2B"],
+    "Efficiency Models": ['Gemini_1.5_flash_002', 'GPT_4o_mini', 'Qwen2_VL_7B', 'Pixtral_12B', 'Aria', 'InternVL2_8B', 'Phi-3.5-vision', 'MiniCPM_v2.6', 'llava_onevision_7B', 'Llama_3_2_11B', 'Idefics3', 'Molmo_7B_D', "Aquila_VL_2B", "POINTS_7B", "Qwen2_VL_2B", "InternVL2_2B", "InternVL2_5_2B", "InternVL2_5_8B"],
     "Proprietary Flagship models": ['Claude_3.5_new', 'GPT_4o', 'Claude_3.5', 'Gemini_1.5_pro_002'],
     "Proprietary Efficiency Models": ['Gemini_1.5_flash_002', 'GPT_4o_mini'],
     "Open-source Flagship Models": ['Qwen2_VL_72B', 'InternVL2_76B', 'llava_onevision_72B', 'NVLM', "Molmo_72B", "InternVL2_5_78B"],
-    "Open-source Efficiency Models": ['Qwen2_VL_7B', 'Pixtral_12B', 'Aria', 'InternVL2_8B', 'Phi-3.5-vision', 'MiniCPM_v2.6', 'llava_onevision_7B', 'Llama_3_2_11B', 'Idefics3', 'Molmo_7B_D', "Aquila_VL_2B", "POINTS_7B", "Qwen2_VL_2B", "InternVL2_2B", "InternVL2_5_2B"]
-}
+    "Open-source Efficiency Models": ['Qwen2_VL_7B', 'Pixtral_12B', 'Aria', 'InternVL2_8B', 'Phi-3.5-vision', 'MiniCPM_v2.6', 'llava_onevision_7B', 'Llama_3_2_11B', 'Idefics3', 'Molmo_7B_D', "Aquila_VL_2B", "POINTS_7B", "Qwen2_VL_2B", "InternVL2_2B", "InternVL2_5_2B", "InternVL2_5_8B"]
+}
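
Taken together, the constants.py changes register InternVL2.5-8B in three places: the display-name map, the model-card URL map, and the two efficiency-model groups, while the new directory under static/eval_results/Default/ supplies its scores. As a rough illustration only (this helper script is not part of the repository; the file name, function name, and entry point are assumptions), a consistency check over those same structures could look like this:

from pathlib import Path

from constants import BASE_MODEL_GROUPS, MODEL_NAME_MAP, MODEL_URLS

# Directory layout used by this commit's result files.
RESULTS_ROOT = Path("static/eval_results/Default")


def check_model_registration() -> list[str]:
    """Return a list of problems for models that are only partially registered."""
    grouped = {
        model
        for group, members in BASE_MODEL_GROUPS.items()
        if group != "All"  # "All" is derived from MODEL_NAME_MAP itself
        for model in members
    }
    problems = []
    for model_key in MODEL_NAME_MAP:
        if model_key not in MODEL_URLS:
            problems.append(f"{model_key}: missing entry in MODEL_URLS")
        if model_key not in grouped:
            problems.append(f"{model_key}: not listed in any BASE_MODEL_GROUPS group")
        if not (RESULTS_ROOT / model_key / "summary_results.json").is_file():
            problems.append(f"{model_key}: no summary_results.json under {RESULTS_ROOT}")
    return problems


if __name__ == "__main__":
    for problem in check_model_registration():
        print(problem)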
static/eval_results/Default/InternVL2_5_8B/summary_results.json ADDED
@@ -0,0 +1,249 @@
+{
+    "model_summary": {
+        "core": {
+            "num_eval_tasks": 440,
+            "num_eval_samples": 6539,
+            "macro_mean_score": 0.28341178736010597
+        },
+        "open": {
+            "num_eval_tasks": 65,
+            "num_eval_samples": 1163,
+            "macro_mean_score": 0.4427412542642981
+        },
+        "overall_score": 0.3039195405259921
+    },
+    "keyword_stats": {
+        "skills": {
+            "Object Recognition and Classification": {
+                "count": 303,
+                "num_samples": 4755,
+                "tasks": [],
+                "average_score": 0.30632267145420167
+            },
+            "Text Recognition (OCR)": {
+                "count": 137,
+                "num_samples": 2239,
+                "tasks": [],
+                "average_score": 0.32780231152819705
+            },
+            "Language Understanding and Generation": {
+                "count": 154,
+                "num_samples": 2509,
+                "tasks": [],
+                "average_score": 0.3474357934495928
+            },
+            "Scene and Event Understanding": {
+                "count": 154,
+                "num_samples": 2467,
+                "tasks": [],
+                "average_score": 0.3581812555811988
+            },
+            "Mathematical and Logical Reasoning": {
+                "count": 109,
+                "num_samples": 1910,
+                "tasks": [],
+                "average_score": 0.24957753120558315
+            },
+            "Commonsense and Social Reasoning": {
+                "count": 51,
+                "num_samples": 855,
+                "tasks": [],
+                "average_score": 0.4216233765938271
+            },
+            "Ethical and Safety Reasoning": {
+                "count": 15,
+                "num_samples": 245,
+                "tasks": [],
+                "average_score": 0.5161152882205514
+            },
+            "Domain-Specific Knowledge and Skills": {
+                "count": 77,
+                "num_samples": 1386,
+                "tasks": [],
+                "average_score": 0.2832458606328222
+            },
+            "Spatial and Temporal Reasoning": {
+                "count": 152,
+                "num_samples": 2437,
+                "tasks": [],
+                "average_score": 0.23652758853070416
+            },
+            "Planning and Decision Making": {
+                "count": 37,
+                "num_samples": 577,
+                "tasks": [],
+                "average_score": 0.1316894515451977
+            }
+        },
+        "input_format": {
+            "User Interface Screenshots": {
+                "count": 93,
+                "num_samples": 1517,
+                "tasks": [],
+                "average_score": 0.3170058923831629
+            },
+            "Text-Based Images and Documents": {
+                "count": 82,
+                "num_samples": 1294,
+                "tasks": [],
+                "average_score": 0.22476699307920894
+            },
+            "Diagrams and Data Visualizations": {
+                "count": 101,
+                "num_samples": 1718,
+                "tasks": [],
+                "average_score": 0.27470636143635613
+            },
+            "Videos": {
+                "count": 43,
+                "num_samples": 698,
+                "tasks": [],
+                "average_score": 0.358656962172874
+            },
+            "Artistic and Creative Content": {
+                "count": 32,
+                "num_samples": 541,
+                "tasks": [],
+                "average_score": 0.3109129114251568
+            },
+            "Photographs": {
+                "count": 143,
+                "num_samples": 2248,
+                "tasks": [],
+                "average_score": 0.3559644169537599
+            },
+            "3D Models and Aerial Imagery": {
+                "count": 11,
+                "num_samples": 169,
+                "tasks": [],
+                "average_score": 0.14065544299986515
+            }
+        },
+        "output_format": {
+            "contextual_formatted_text": {
+                "count": 98,
+                "num_samples": 1514,
+                "tasks": [],
+                "average_score": 0.2593999929737164
+            },
+            "structured_output": {
+                "count": 110,
+                "num_samples": 1714,
+                "tasks": [],
+                "average_score": 0.24115694869183088
+            },
+            "exact_text": {
+                "count": 83,
+                "num_samples": 1278,
+                "tasks": [],
+                "average_score": 0.32751507985720435
+            },
+            "numerical_data": {
+                "count": 49,
+                "num_samples": 862,
+                "tasks": [],
+                "average_score": 0.28350382022275183
+            },
+            "open_ended_output": {
+                "count": 80,
+                "num_samples": 1454,
+                "tasks": [],
+                "average_score": 0.39498520559767875
+            },
+            "multiple_choice": {
+                "count": 85,
+                "num_samples": 1363,
+                "tasks": [],
+                "average_score": 0.3394899886026274
+            }
+        },
+        "input_num": {
+            "6-8 images": {
+                "count": 21,
+                "num_samples": 314,
+                "tasks": [],
+                "average_score": 0.12507018680488066
+            },
+            "9-image or more": {
+                "count": 41,
+                "num_samples": 623,
+                "tasks": [],
+                "average_score": 0.2869727946384576
+            },
+            "1-image": {
+                "count": 315,
+                "num_samples": 5228,
+                "tasks": [],
+                "average_score": 0.3332167660932209
+            },
+            "video": {
+                "count": 43,
+                "num_samples": 698,
+                "tasks": [],
+                "average_score": 0.358656962172874
+            },
+            "4-5 images": {
+                "count": 34,
+                "num_samples": 520,
+                "tasks": [],
+                "average_score": 0.17503971457329898
+            },
+            "2-3 images": {
+                "count": 51,
+                "num_samples": 802,
+                "tasks": [],
+                "average_score": 0.25000251910306803
+            }
+        },
+        "app": {
+            "Information_Extraction": {
+                "count": 72,
+                "num_samples": 1124,
+                "tasks": [],
+                "average_score": 0.3510200305843745
+            },
+            "Planning": {
+                "count": 78,
+                "num_samples": 1239,
+                "tasks": [],
+                "average_score": 0.1596951437265508
+            },
+            "Coding": {
+                "count": 31,
+                "num_samples": 474,
+                "tasks": [],
+                "average_score": 0.25454014939309055
+            },
+            "Perception": {
+                "count": 145,
+                "num_samples": 2313,
+                "tasks": [],
+                "average_score": 0.332705158221202
+            },
+            "Metrics": {
+                "count": 20,
+                "num_samples": 309,
+                "tasks": [],
+                "average_score": 0.4496016958712894
+            },
+            "Science": {
+                "count": 29,
+                "num_samples": 574,
+                "tasks": [],
+                "average_score": 0.28828525298916796
+            },
+            "Knowledge": {
+                "count": 97,
+                "num_samples": 1605,
+                "tasks": [],
+                "average_score": 0.3477512139656071
+            },
+            "Mathematics": {
+                "count": 33,
+                "num_samples": 547,
+                "tasks": [],
+                "average_score": 0.25856175669225717
+            }
+        }
+    }
+}
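
One observation about the summary above (an inference from the numbers in this file, not a documented formula): overall_score equals the core and open macro_mean_score values weighted by their num_eval_tasks counts (440 and 65). A minimal sketch that reproduces it from the added file, assuming that weighting:

import json

# Load the summary added in this commit and recompute the overall score as the
# task-count-weighted mean of the "core" and "open" macro scores (assumed formula).
with open("static/eval_results/Default/InternVL2_5_8B/summary_results.json") as f:
    summary = json.load(f)["model_summary"]

core, open_split = summary["core"], summary["open"]
total_tasks = core["num_eval_tasks"] + open_split["num_eval_tasks"]  # 440 + 65 = 505
weighted = (
    core["num_eval_tasks"] * core["macro_mean_score"]
    + open_split["num_eval_tasks"] * open_split["macro_mean_score"]
) / total_tasks

print(weighted)                  # ~0.30392
print(summary["overall_score"])  # 0.3039195405259921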
static/eval_results/Default/InternVL2_5_8B/task_results.json ADDED
The diff for this file is too large to render. See raw diff