pharaouk commited on
Commit
0af3fe6
1 Parent(s): 167671f
adapter_config.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "auto_mapping": null,
3
- "base_model_name_or_path": "alpindale/CodeLlama-34B-hf",
4
  "bias": "none",
5
  "fan_in_fan_out": false,
6
  "inference_mode": true,
@@ -14,13 +14,13 @@
14
  "r": 64,
15
  "revision": null,
16
  "target_modules": [
17
- "v_proj",
18
- "up_proj",
19
- "gate_proj",
20
- "down_proj",
21
  "q_proj",
 
22
  "o_proj",
23
- "k_proj"
 
 
24
  ],
25
  "task_type": "CAUSAL_LM"
26
  }
 
1
  {
2
  "auto_mapping": null,
3
+ "base_model_name_or_path": "codellama/CodeLlama-34b-Python-hf",
4
  "bias": "none",
5
  "fan_in_fan_out": false,
6
  "inference_mode": true,
 
14
  "r": 64,
15
  "revision": null,
16
  "target_modules": [
17
+ "k_proj",
 
 
 
18
  "q_proj",
19
+ "v_proj",
20
  "o_proj",
21
+ "gate_proj",
22
+ "up_proj",
23
+ "down_proj"
24
  ],
25
  "task_type": "CAUSAL_LM"
26
  }
adapter_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:20f63e1f9f5ce72e3d0fcb7fa87c92aeb6019137e90d5971496e900c70d14626
3
  size 871609293
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1af5d01bf9badfccc5c82770b56c6a6cdd6418705b7ac5979ca2a69aa9395cf4
3
  size 871609293
checkpoint-200/adapter_config.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "auto_mapping": null,
3
- "base_model_name_or_path": "alpindale/CodeLlama-34B-hf",
4
  "bias": "none",
5
  "fan_in_fan_out": false,
6
  "inference_mode": true,
@@ -14,13 +14,13 @@
14
  "r": 64,
15
  "revision": null,
16
  "target_modules": [
17
- "v_proj",
18
- "up_proj",
19
- "gate_proj",
20
- "down_proj",
21
  "q_proj",
 
22
  "o_proj",
23
- "k_proj"
 
 
24
  ],
25
  "task_type": "CAUSAL_LM"
26
  }
 
1
  {
2
  "auto_mapping": null,
3
+ "base_model_name_or_path": "codellama/CodeLlama-34b-Python-hf",
4
  "bias": "none",
5
  "fan_in_fan_out": false,
6
  "inference_mode": true,
 
14
  "r": 64,
15
  "revision": null,
16
  "target_modules": [
17
+ "k_proj",
 
 
 
18
  "q_proj",
19
+ "v_proj",
20
  "o_proj",
21
+ "gate_proj",
22
+ "up_proj",
23
+ "down_proj"
24
  ],
25
  "task_type": "CAUSAL_LM"
26
  }
checkpoint-200/adapter_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:160a60dd1cd3974d6547f7ec7bccbdd179da028231e05e345a2477823ba13476
3
  size 871609293
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e5ccfb1a0ea0f9a78946fca6cabf55b2e581ad5c230518e05796212b2142ddbb
3
  size 871609293
checkpoint-200/adapter_model/adapter_model/adapter_config.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "auto_mapping": null,
3
- "base_model_name_or_path": "alpindale/CodeLlama-34B-hf",
4
  "bias": "none",
5
  "fan_in_fan_out": false,
6
  "inference_mode": true,
@@ -14,13 +14,13 @@
14
  "r": 64,
15
  "revision": null,
16
  "target_modules": [
17
- "v_proj",
18
- "up_proj",
19
- "gate_proj",
20
- "down_proj",
21
  "q_proj",
 
22
  "o_proj",
23
- "k_proj"
 
 
24
  ],
25
  "task_type": "CAUSAL_LM"
26
  }
 
1
  {
2
  "auto_mapping": null,
3
+ "base_model_name_or_path": "codellama/CodeLlama-34b-Python-hf",
4
  "bias": "none",
5
  "fan_in_fan_out": false,
6
  "inference_mode": true,
 
14
  "r": 64,
15
  "revision": null,
16
  "target_modules": [
17
+ "k_proj",
 
 
 
18
  "q_proj",
19
+ "v_proj",
20
  "o_proj",
21
+ "gate_proj",
22
+ "up_proj",
23
+ "down_proj"
24
  ],
25
  "task_type": "CAUSAL_LM"
26
  }
checkpoint-200/adapter_model/adapter_model/adapter_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:160a60dd1cd3974d6547f7ec7bccbdd179da028231e05e345a2477823ba13476
3
  size 871609293
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e5ccfb1a0ea0f9a78946fca6cabf55b2e581ad5c230518e05796212b2142ddbb
3
  size 871609293
checkpoint-200/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:28664f723821987f28653721f3c3e5831cf63de5de35a89366da708f0f7196fc
3
  size 3485880477
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76db89aa865c6c0a8ec3928a2b2b6daae0a3918e3f6e1a1d76ef3ca8c7dc7fd5
3
  size 3485880477
checkpoint-200/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:41a5175571e28bc9e49814ab269d17e197cb3c5f958226d58cf87c3326589d83
3
  size 14511
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae24c3e879b23efe7f563fecb9a8536d5b1ba37dfc40dc27479609345e020130
3
  size 14511
checkpoint-200/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7aa2c8b84e17817e6a4dcba5955fca913e266fdcd47f5594a29933ebd4972a01
3
  size 627
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:81248501833af563175f43c1d681185643b8411cee1fb1e631b8687c465eb2e3
3
  size 627
checkpoint-200/tokenizer_config.json CHANGED
@@ -23,7 +23,6 @@
23
  "pad_token": null,
24
  "padding_side": "right",
25
  "sp_model_kwargs": {},
26
- "spaces_between_special_tokens": false,
27
  "tokenizer_class": "LlamaTokenizer",
28
  "unk_token": {
29
  "__type": "AddedToken",
@@ -32,6 +31,5 @@
32
  "normalized": true,
33
  "rstrip": false,
34
  "single_word": false
35
- },
36
- "use_default_system_prompt": true
37
  }
 
23
  "pad_token": null,
24
  "padding_side": "right",
25
  "sp_model_kwargs": {},
 
26
  "tokenizer_class": "LlamaTokenizer",
27
  "unk_token": {
28
  "__type": "AddedToken",
 
31
  "normalized": true,
32
  "rstrip": false,
33
  "single_word": false
34
+ }
 
35
  }
checkpoint-200/trainer_state.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
- "best_metric": 0.3453182876110077,
3
- "best_model_checkpoint": "./output_v2/34bCodellama_CodeLlama-34B-hf_unnatural-instructions_standardized/checkpoint-200",
4
  "epoch": 0.012222137346268428,
5
  "global_step": 200,
6
  "is_hyper_param_search": false,
@@ -9,1279 +9,1249 @@
9
  "log_history": [
10
  {
11
  "epoch": 0.0,
12
- "learning_rate": 0.0002,
13
- "loss": 1.9735,
14
  "step": 1
15
  },
16
  {
17
  "epoch": 0.0,
18
- "learning_rate": 0.0002,
19
- "loss": 2.7155,
20
  "step": 2
21
  },
22
  {
23
  "epoch": 0.0,
24
- "learning_rate": 0.0002,
25
- "loss": 3.1137,
26
  "step": 3
27
  },
28
  {
29
  "epoch": 0.0,
30
- "learning_rate": 0.0002,
31
- "loss": 1.6054,
32
  "step": 4
33
  },
34
  {
35
  "epoch": 0.0,
36
- "learning_rate": 0.0002,
37
- "loss": 1.0381,
38
  "step": 5
39
  },
40
  {
41
  "epoch": 0.0,
42
- "learning_rate": 0.0002,
43
- "loss": 0.9959,
44
  "step": 6
45
  },
46
  {
47
  "epoch": 0.0,
48
- "learning_rate": 0.0002,
49
- "loss": 0.7395,
50
  "step": 7
51
  },
52
  {
53
  "epoch": 0.0,
54
- "learning_rate": 0.0002,
55
- "loss": 0.3255,
56
  "step": 8
57
  },
58
  {
59
  "epoch": 0.0,
60
- "learning_rate": 0.0002,
61
- "loss": 0.8252,
62
  "step": 9
63
  },
64
  {
65
  "epoch": 0.0,
66
- "learning_rate": 0.0002,
67
- "loss": 0.5362,
68
  "step": 10
69
  },
70
  {
71
  "epoch": 0.0,
72
- "learning_rate": 0.0002,
73
- "loss": 1.404,
74
  "step": 11
75
  },
76
  {
77
  "epoch": 0.0,
78
- "learning_rate": 0.0002,
79
- "loss": 0.6234,
80
  "step": 12
81
  },
82
  {
83
  "epoch": 0.0,
84
- "learning_rate": 0.0002,
85
- "loss": 1.0263,
86
  "step": 13
87
  },
88
  {
89
  "epoch": 0.0,
90
- "learning_rate": 0.0002,
91
- "loss": 0.2622,
92
  "step": 14
93
  },
94
  {
95
  "epoch": 0.0,
96
- "learning_rate": 0.0002,
97
- "loss": 0.2692,
98
  "step": 15
99
  },
100
  {
101
  "epoch": 0.0,
102
- "learning_rate": 0.0002,
103
- "loss": 0.2624,
104
  "step": 16
105
  },
106
  {
107
  "epoch": 0.0,
108
- "learning_rate": 0.0002,
109
- "loss": 0.4385,
110
  "step": 17
111
  },
112
  {
113
  "epoch": 0.0,
114
- "learning_rate": 0.0002,
115
- "loss": 0.3265,
116
  "step": 18
117
  },
118
  {
119
  "epoch": 0.0,
120
- "learning_rate": 0.0002,
121
- "loss": 0.2191,
122
  "step": 19
123
  },
124
  {
125
  "epoch": 0.0,
126
- "learning_rate": 0.0002,
127
- "loss": 1.0049,
128
  "step": 20
129
  },
130
  {
131
  "epoch": 0.0,
132
- "learning_rate": 0.0002,
133
- "loss": 0.6586,
134
  "step": 21
135
  },
136
  {
137
  "epoch": 0.0,
138
- "learning_rate": 0.0002,
139
- "loss": 0.3471,
140
  "step": 22
141
  },
142
  {
143
  "epoch": 0.0,
144
- "learning_rate": 0.0002,
145
- "loss": 0.7134,
146
  "step": 23
147
  },
148
  {
149
  "epoch": 0.0,
150
- "learning_rate": 0.0002,
151
- "loss": 1.01,
152
  "step": 24
153
  },
154
  {
155
  "epoch": 0.0,
156
- "learning_rate": 0.0002,
157
- "loss": 0.2802,
158
  "step": 25
159
  },
160
  {
161
  "epoch": 0.0,
162
- "learning_rate": 0.0002,
163
- "loss": 0.4205,
164
  "step": 26
165
  },
166
  {
167
  "epoch": 0.0,
168
- "learning_rate": 0.0002,
169
- "loss": 0.7682,
170
  "step": 27
171
  },
172
  {
173
  "epoch": 0.0,
174
- "learning_rate": 0.0002,
175
- "loss": 0.2002,
176
  "step": 28
177
  },
178
  {
179
  "epoch": 0.0,
180
- "learning_rate": 0.0002,
181
- "loss": 0.2132,
182
  "step": 29
183
  },
184
  {
185
  "epoch": 0.0,
186
- "learning_rate": 0.0002,
187
- "loss": 1.0622,
188
  "step": 30
189
  },
190
  {
191
  "epoch": 0.0,
192
- "learning_rate": 0.0002,
193
- "loss": 0.212,
194
  "step": 31
195
  },
196
  {
197
  "epoch": 0.0,
198
- "learning_rate": 0.0002,
199
- "loss": 0.3738,
200
  "step": 32
201
  },
202
  {
203
  "epoch": 0.0,
204
- "learning_rate": 0.0002,
205
- "loss": 0.3594,
206
  "step": 33
207
  },
208
  {
209
  "epoch": 0.0,
210
- "learning_rate": 0.0002,
211
- "loss": 0.8766,
212
  "step": 34
213
  },
214
  {
215
  "epoch": 0.0,
216
- "learning_rate": 0.0002,
217
- "loss": 0.3108,
218
  "step": 35
219
  },
220
  {
221
  "epoch": 0.0,
222
- "learning_rate": 0.0002,
223
- "loss": 0.2127,
224
  "step": 36
225
  },
226
  {
227
  "epoch": 0.0,
228
- "learning_rate": 0.0002,
229
- "loss": 0.5968,
230
  "step": 37
231
  },
232
  {
233
  "epoch": 0.0,
234
- "learning_rate": 0.0002,
235
- "loss": 0.8806,
236
  "step": 38
237
  },
238
  {
239
  "epoch": 0.0,
240
- "learning_rate": 0.0002,
241
- "loss": 0.0633,
242
  "step": 39
243
  },
244
  {
245
  "epoch": 0.0,
246
- "learning_rate": 0.0002,
247
- "loss": 0.5851,
248
  "step": 40
249
  },
250
  {
251
  "epoch": 0.0,
252
- "learning_rate": 0.0002,
253
- "loss": 0.2376,
254
  "step": 41
255
  },
256
  {
257
  "epoch": 0.0,
258
- "learning_rate": 0.0002,
259
- "loss": 0.2293,
260
  "step": 42
261
  },
262
  {
263
  "epoch": 0.0,
264
- "learning_rate": 0.0002,
265
- "loss": 0.428,
266
  "step": 43
267
  },
268
  {
269
  "epoch": 0.0,
270
- "learning_rate": 0.0002,
271
- "loss": 0.131,
272
  "step": 44
273
  },
274
  {
275
  "epoch": 0.0,
276
- "learning_rate": 0.0002,
277
- "loss": 0.3724,
278
  "step": 45
279
  },
280
  {
281
  "epoch": 0.0,
282
- "learning_rate": 0.0002,
283
- "loss": 0.5031,
284
  "step": 46
285
  },
286
  {
287
  "epoch": 0.0,
288
- "learning_rate": 0.0002,
289
- "loss": 0.4934,
290
  "step": 47
291
  },
292
  {
293
  "epoch": 0.0,
294
- "learning_rate": 0.0002,
295
- "loss": 0.8127,
296
  "step": 48
297
  },
298
  {
299
  "epoch": 0.0,
300
- "learning_rate": 0.0002,
301
- "loss": 0.4573,
302
  "step": 49
303
  },
304
  {
305
  "epoch": 0.0,
306
- "learning_rate": 0.0002,
307
- "loss": 0.5568,
308
  "step": 50
309
  },
310
  {
311
  "epoch": 0.0,
312
- "learning_rate": 0.0002,
313
- "loss": 0.5411,
314
  "step": 51
315
  },
316
  {
317
  "epoch": 0.0,
318
- "learning_rate": 0.0002,
319
- "loss": 0.4448,
320
  "step": 52
321
  },
322
  {
323
  "epoch": 0.0,
324
- "learning_rate": 0.0002,
325
- "loss": 0.3774,
326
  "step": 53
327
  },
328
  {
329
  "epoch": 0.0,
330
- "learning_rate": 0.0002,
331
- "loss": 0.1825,
332
  "step": 54
333
  },
334
  {
335
  "epoch": 0.0,
336
- "learning_rate": 0.0002,
337
- "loss": 0.2356,
338
  "step": 55
339
  },
340
  {
341
  "epoch": 0.0,
342
- "learning_rate": 0.0002,
343
- "loss": 0.0236,
344
  "step": 56
345
  },
346
  {
347
  "epoch": 0.0,
348
- "learning_rate": 0.0002,
349
- "loss": 0.4344,
350
  "step": 57
351
  },
352
  {
353
  "epoch": 0.0,
354
- "learning_rate": 0.0002,
355
- "loss": 0.4589,
356
  "step": 58
357
  },
358
  {
359
  "epoch": 0.0,
360
- "learning_rate": 0.0002,
361
- "loss": 0.3766,
362
  "step": 59
363
  },
364
  {
365
  "epoch": 0.0,
366
- "learning_rate": 0.0002,
367
- "loss": 0.6034,
368
  "step": 60
369
  },
370
  {
371
  "epoch": 0.0,
372
- "learning_rate": 0.0002,
373
- "loss": 0.7632,
374
  "step": 61
375
  },
376
  {
377
  "epoch": 0.0,
378
- "learning_rate": 0.0002,
379
- "loss": 0.0612,
380
  "step": 62
381
  },
382
  {
383
  "epoch": 0.0,
384
- "learning_rate": 0.0002,
385
- "loss": 0.6783,
386
  "step": 63
387
  },
388
  {
389
  "epoch": 0.0,
390
- "learning_rate": 0.0002,
391
- "loss": 0.2845,
392
  "step": 64
393
  },
394
  {
395
  "epoch": 0.0,
396
- "learning_rate": 0.0002,
397
- "loss": 0.395,
398
  "step": 65
399
  },
400
  {
401
  "epoch": 0.0,
402
- "learning_rate": 0.0002,
403
- "loss": 0.8106,
404
  "step": 66
405
  },
406
  {
407
  "epoch": 0.0,
408
- "learning_rate": 0.0002,
409
- "loss": 0.1468,
410
  "step": 67
411
  },
412
  {
413
  "epoch": 0.0,
414
- "learning_rate": 0.0002,
415
- "loss": 0.0537,
416
  "step": 68
417
  },
418
  {
419
  "epoch": 0.0,
420
- "learning_rate": 0.0002,
421
- "loss": 0.4816,
422
  "step": 69
423
  },
424
  {
425
  "epoch": 0.0,
426
- "learning_rate": 0.0002,
427
- "loss": 0.6052,
428
  "step": 70
429
  },
430
  {
431
  "epoch": 0.0,
432
- "learning_rate": 0.0002,
433
- "loss": 0.2805,
434
  "step": 71
435
  },
436
  {
437
  "epoch": 0.0,
438
- "learning_rate": 0.0002,
439
- "loss": 0.8279,
440
  "step": 72
441
  },
442
  {
443
  "epoch": 0.0,
444
- "learning_rate": 0.0002,
445
- "loss": 0.6954,
446
  "step": 73
447
  },
448
  {
449
  "epoch": 0.0,
450
- "learning_rate": 0.0002,
451
- "loss": 0.0635,
452
  "step": 74
453
  },
454
  {
455
  "epoch": 0.0,
456
- "learning_rate": 0.0002,
457
- "loss": 0.2866,
458
  "step": 75
459
  },
460
  {
461
  "epoch": 0.0,
462
- "learning_rate": 0.0002,
463
- "loss": 0.9656,
464
  "step": 76
465
  },
466
  {
467
  "epoch": 0.0,
468
- "learning_rate": 0.0002,
469
- "loss": 0.1113,
470
  "step": 77
471
  },
472
  {
473
  "epoch": 0.0,
474
- "learning_rate": 0.0002,
475
- "loss": 0.4063,
476
  "step": 78
477
  },
478
  {
479
  "epoch": 0.0,
480
- "learning_rate": 0.0002,
481
- "loss": 0.3245,
482
  "step": 79
483
  },
484
  {
485
  "epoch": 0.0,
486
- "learning_rate": 0.0002,
487
- "loss": 0.3966,
488
  "step": 80
489
  },
490
  {
491
  "epoch": 0.0,
492
- "learning_rate": 0.0002,
493
- "loss": 0.4809,
494
  "step": 81
495
  },
496
  {
497
  "epoch": 0.01,
498
- "learning_rate": 0.0002,
499
- "loss": 0.3844,
500
  "step": 82
501
  },
502
  {
503
  "epoch": 0.01,
504
- "learning_rate": 0.0002,
505
- "loss": 0.1501,
506
  "step": 83
507
  },
508
  {
509
  "epoch": 0.01,
510
- "learning_rate": 0.0002,
511
- "loss": 0.5504,
512
  "step": 84
513
  },
514
  {
515
  "epoch": 0.01,
516
- "learning_rate": 0.0002,
517
- "loss": 0.2332,
518
  "step": 85
519
  },
520
  {
521
  "epoch": 0.01,
522
- "learning_rate": 0.0002,
523
- "loss": 0.0049,
524
  "step": 86
525
  },
526
  {
527
  "epoch": 0.01,
528
- "learning_rate": 0.0002,
529
- "loss": 0.2585,
530
  "step": 87
531
  },
532
  {
533
  "epoch": 0.01,
534
- "learning_rate": 0.0002,
535
- "loss": 0.2012,
536
  "step": 88
537
  },
538
  {
539
  "epoch": 0.01,
540
- "learning_rate": 0.0002,
541
- "loss": 0.0386,
542
  "step": 89
543
  },
544
  {
545
  "epoch": 0.01,
546
- "learning_rate": 0.0002,
547
- "loss": 0.5818,
548
  "step": 90
549
  },
550
  {
551
  "epoch": 0.01,
552
- "learning_rate": 0.0002,
553
- "loss": 0.2827,
554
  "step": 91
555
  },
556
  {
557
  "epoch": 0.01,
558
- "learning_rate": 0.0002,
559
- "loss": 0.3877,
560
  "step": 92
561
  },
562
  {
563
  "epoch": 0.01,
564
- "learning_rate": 0.0002,
565
- "loss": 0.3117,
566
  "step": 93
567
  },
568
  {
569
  "epoch": 0.01,
570
- "learning_rate": 0.0002,
571
- "loss": 0.9205,
572
  "step": 94
573
  },
574
  {
575
  "epoch": 0.01,
576
- "learning_rate": 0.0002,
577
- "loss": 0.4653,
578
  "step": 95
579
  },
580
  {
581
  "epoch": 0.01,
582
- "learning_rate": 0.0002,
583
- "loss": 0.3412,
584
  "step": 96
585
  },
586
  {
587
  "epoch": 0.01,
588
- "learning_rate": 0.0002,
589
- "loss": 0.3367,
590
  "step": 97
591
  },
592
  {
593
  "epoch": 0.01,
594
- "learning_rate": 0.0002,
595
- "loss": 0.1755,
596
  "step": 98
597
  },
598
  {
599
  "epoch": 0.01,
600
- "learning_rate": 0.0002,
601
- "loss": 0.3586,
602
  "step": 99
603
  },
604
  {
605
  "epoch": 0.01,
606
- "learning_rate": 0.0002,
607
- "loss": 0.5682,
608
  "step": 100
609
  },
610
  {
611
  "epoch": 0.01,
612
- "learning_rate": 0.0002,
613
- "loss": 0.4869,
614
  "step": 101
615
  },
616
  {
617
  "epoch": 0.01,
618
- "learning_rate": 0.0002,
619
- "loss": 0.7614,
620
  "step": 102
621
  },
622
  {
623
  "epoch": 0.01,
624
- "learning_rate": 0.0002,
625
- "loss": 0.4366,
626
  "step": 103
627
  },
628
  {
629
  "epoch": 0.01,
630
- "learning_rate": 0.0002,
631
- "loss": 0.5307,
632
  "step": 104
633
  },
634
  {
635
  "epoch": 0.01,
636
- "learning_rate": 0.0002,
637
- "loss": 0.3974,
638
  "step": 105
639
  },
640
  {
641
  "epoch": 0.01,
642
- "learning_rate": 0.0002,
643
- "loss": 0.5255,
644
  "step": 106
645
  },
646
  {
647
  "epoch": 0.01,
648
- "learning_rate": 0.0002,
649
- "loss": 0.2682,
650
  "step": 107
651
  },
652
  {
653
  "epoch": 0.01,
654
- "learning_rate": 0.0002,
655
- "loss": 0.4884,
656
  "step": 108
657
  },
658
  {
659
  "epoch": 0.01,
660
- "learning_rate": 0.0002,
661
- "loss": 0.0414,
662
  "step": 109
663
  },
664
  {
665
  "epoch": 0.01,
666
- "learning_rate": 0.0002,
667
- "loss": 0.5094,
668
  "step": 110
669
  },
670
  {
671
  "epoch": 0.01,
672
- "learning_rate": 0.0002,
673
- "loss": 0.298,
674
  "step": 111
675
  },
676
  {
677
  "epoch": 0.01,
678
- "learning_rate": 0.0002,
679
- "loss": 0.2147,
680
  "step": 112
681
  },
682
  {
683
  "epoch": 0.01,
684
- "learning_rate": 0.0002,
685
- "loss": 0.2712,
686
  "step": 113
687
  },
688
  {
689
  "epoch": 0.01,
690
- "learning_rate": 0.0002,
691
- "loss": 0.5713,
692
  "step": 114
693
  },
694
  {
695
  "epoch": 0.01,
696
- "learning_rate": 0.0002,
697
- "loss": 0.2979,
698
  "step": 115
699
  },
700
  {
701
  "epoch": 0.01,
702
- "learning_rate": 0.0002,
703
- "loss": 0.2424,
704
  "step": 116
705
  },
706
  {
707
  "epoch": 0.01,
708
- "learning_rate": 0.0002,
709
- "loss": 0.1412,
710
  "step": 117
711
  },
712
  {
713
  "epoch": 0.01,
714
- "learning_rate": 0.0002,
715
- "loss": 0.3252,
716
  "step": 118
717
  },
718
  {
719
  "epoch": 0.01,
720
- "learning_rate": 0.0002,
721
- "loss": 0.4267,
722
  "step": 119
723
  },
724
  {
725
  "epoch": 0.01,
726
- "learning_rate": 0.0002,
727
- "loss": 0.2139,
728
  "step": 120
729
  },
730
  {
731
  "epoch": 0.01,
732
- "learning_rate": 0.0002,
733
- "loss": 0.4214,
734
  "step": 121
735
  },
736
  {
737
  "epoch": 0.01,
738
- "learning_rate": 0.0002,
739
- "loss": 0.2338,
740
  "step": 122
741
  },
742
  {
743
  "epoch": 0.01,
744
- "learning_rate": 0.0002,
745
- "loss": 0.5877,
746
  "step": 123
747
  },
748
  {
749
  "epoch": 0.01,
750
- "learning_rate": 0.0002,
751
- "loss": 0.2574,
752
  "step": 124
753
  },
754
  {
755
  "epoch": 0.01,
756
- "learning_rate": 0.0002,
757
- "loss": 0.0011,
758
  "step": 125
759
  },
760
  {
761
  "epoch": 0.01,
762
- "learning_rate": 0.0002,
763
- "loss": 0.6156,
764
  "step": 126
765
  },
766
  {
767
  "epoch": 0.01,
768
- "learning_rate": 0.0002,
769
- "loss": 0.0888,
770
  "step": 127
771
  },
772
  {
773
  "epoch": 0.01,
774
- "learning_rate": 0.0002,
775
- "loss": 0.3159,
776
  "step": 128
777
  },
778
  {
779
  "epoch": 0.01,
780
- "learning_rate": 0.0002,
781
- "loss": 0.2122,
782
  "step": 129
783
  },
784
  {
785
  "epoch": 0.01,
786
- "learning_rate": 0.0002,
787
- "loss": 0.1131,
788
  "step": 130
789
  },
790
  {
791
  "epoch": 0.01,
792
- "learning_rate": 0.0002,
793
- "loss": 0.1634,
794
  "step": 131
795
  },
796
  {
797
  "epoch": 0.01,
798
- "learning_rate": 0.0002,
799
- "loss": 0.3788,
800
  "step": 132
801
  },
802
  {
803
  "epoch": 0.01,
804
- "learning_rate": 0.0002,
805
- "loss": 0.3187,
806
  "step": 133
807
  },
808
  {
809
  "epoch": 0.01,
810
- "learning_rate": 0.0002,
811
- "loss": 0.1685,
812
  "step": 134
813
  },
814
  {
815
  "epoch": 0.01,
816
- "learning_rate": 0.0002,
817
- "loss": 1.5573,
818
  "step": 135
819
  },
820
  {
821
  "epoch": 0.01,
822
- "learning_rate": 0.0002,
823
- "loss": 0.3409,
824
  "step": 136
825
  },
826
  {
827
  "epoch": 0.01,
828
- "learning_rate": 0.0002,
829
- "loss": 1.1279,
830
  "step": 137
831
  },
832
  {
833
  "epoch": 0.01,
834
- "learning_rate": 0.0002,
835
- "loss": 0.1385,
836
  "step": 138
837
  },
838
  {
839
  "epoch": 0.01,
840
- "learning_rate": 0.0002,
841
- "loss": 0.5391,
842
  "step": 139
843
  },
844
  {
845
  "epoch": 0.01,
846
- "learning_rate": 0.0002,
847
- "loss": 0.9212,
848
  "step": 140
849
  },
850
  {
851
  "epoch": 0.01,
852
- "learning_rate": 0.0002,
853
- "loss": 0.3178,
854
  "step": 141
855
  },
856
  {
857
  "epoch": 0.01,
858
- "learning_rate": 0.0002,
859
- "loss": 0.1896,
860
  "step": 142
861
  },
862
  {
863
  "epoch": 0.01,
864
- "learning_rate": 0.0002,
865
- "loss": 0.2479,
866
  "step": 143
867
  },
868
  {
869
  "epoch": 0.01,
870
- "learning_rate": 0.0002,
871
- "loss": 0.0806,
872
  "step": 144
873
  },
874
  {
875
  "epoch": 0.01,
876
- "learning_rate": 0.0002,
877
- "loss": 0.4446,
878
  "step": 145
879
  },
880
  {
881
  "epoch": 0.01,
882
- "learning_rate": 0.0002,
883
- "loss": 0.1199,
884
  "step": 146
885
  },
886
  {
887
  "epoch": 0.01,
888
- "learning_rate": 0.0002,
889
- "loss": 0.0728,
890
  "step": 147
891
  },
892
  {
893
  "epoch": 0.01,
894
- "learning_rate": 0.0002,
895
- "loss": 0.2178,
896
  "step": 148
897
  },
898
  {
899
  "epoch": 0.01,
900
- "learning_rate": 0.0002,
901
- "loss": 0.6712,
902
  "step": 149
903
  },
904
  {
905
  "epoch": 0.01,
906
- "learning_rate": 0.0002,
907
- "loss": 0.0917,
908
  "step": 150
909
  },
910
  {
911
  "epoch": 0.01,
912
- "learning_rate": 0.0002,
913
- "loss": 0.0679,
914
  "step": 151
915
  },
916
  {
917
  "epoch": 0.01,
918
- "learning_rate": 0.0002,
919
- "loss": 0.2296,
920
  "step": 152
921
  },
922
  {
923
  "epoch": 0.01,
924
- "learning_rate": 0.0002,
925
- "loss": 0.4093,
926
  "step": 153
927
  },
928
  {
929
  "epoch": 0.01,
930
- "learning_rate": 0.0002,
931
- "loss": 0.7889,
932
  "step": 154
933
  },
934
  {
935
  "epoch": 0.01,
936
- "learning_rate": 0.0002,
937
- "loss": 0.142,
938
  "step": 155
939
  },
940
  {
941
  "epoch": 0.01,
942
- "learning_rate": 0.0002,
943
- "loss": 0.16,
944
  "step": 156
945
  },
946
  {
947
  "epoch": 0.01,
948
- "learning_rate": 0.0002,
949
- "loss": 0.2812,
950
  "step": 157
951
  },
952
  {
953
  "epoch": 0.01,
954
- "learning_rate": 0.0002,
955
- "loss": 0.3536,
956
  "step": 158
957
  },
958
  {
959
  "epoch": 0.01,
960
- "learning_rate": 0.0002,
961
- "loss": 0.2734,
962
  "step": 159
963
  },
964
  {
965
  "epoch": 0.01,
966
- "learning_rate": 0.0002,
967
- "loss": 1.0048,
968
  "step": 160
969
  },
970
  {
971
  "epoch": 0.01,
972
- "learning_rate": 0.0002,
973
- "loss": 0.2911,
974
  "step": 161
975
  },
976
  {
977
  "epoch": 0.01,
978
- "learning_rate": 0.0002,
979
- "loss": 0.2417,
980
  "step": 162
981
  },
982
  {
983
  "epoch": 0.01,
984
- "learning_rate": 0.0002,
985
- "loss": 0.8293,
986
  "step": 163
987
  },
988
  {
989
  "epoch": 0.01,
990
- "learning_rate": 0.0002,
991
- "loss": 0.4375,
992
  "step": 164
993
  },
994
  {
995
  "epoch": 0.01,
996
- "learning_rate": 0.0002,
997
- "loss": 0.7972,
998
  "step": 165
999
  },
1000
  {
1001
  "epoch": 0.01,
1002
- "learning_rate": 0.0002,
1003
- "loss": 0.1297,
1004
  "step": 166
1005
  },
1006
  {
1007
  "epoch": 0.01,
1008
- "learning_rate": 0.0002,
1009
- "loss": 0.5533,
1010
  "step": 167
1011
  },
1012
  {
1013
  "epoch": 0.01,
1014
- "learning_rate": 0.0002,
1015
- "loss": 0.8447,
1016
  "step": 168
1017
  },
1018
  {
1019
  "epoch": 0.01,
1020
- "learning_rate": 0.0002,
1021
- "loss": 0.0787,
1022
  "step": 169
1023
  },
1024
  {
1025
  "epoch": 0.01,
1026
- "learning_rate": 0.0002,
1027
- "loss": 0.2196,
1028
  "step": 170
1029
  },
1030
  {
1031
  "epoch": 0.01,
1032
- "learning_rate": 0.0002,
1033
- "loss": 0.1463,
1034
  "step": 171
1035
  },
1036
  {
1037
  "epoch": 0.01,
1038
- "learning_rate": 0.0002,
1039
- "loss": 0.2969,
1040
  "step": 172
1041
  },
1042
  {
1043
  "epoch": 0.01,
1044
- "learning_rate": 0.0002,
1045
- "loss": 0.204,
1046
  "step": 173
1047
  },
1048
  {
1049
  "epoch": 0.01,
1050
- "learning_rate": 0.0002,
1051
- "loss": 0.5595,
1052
  "step": 174
1053
  },
1054
  {
1055
  "epoch": 0.01,
1056
- "learning_rate": 0.0002,
1057
- "loss": 0.1947,
1058
  "step": 175
1059
  },
1060
  {
1061
  "epoch": 0.01,
1062
- "learning_rate": 0.0002,
1063
- "loss": 0.239,
1064
  "step": 176
1065
  },
1066
  {
1067
  "epoch": 0.01,
1068
- "learning_rate": 0.0002,
1069
- "loss": 0.0937,
1070
  "step": 177
1071
  },
1072
  {
1073
  "epoch": 0.01,
1074
- "learning_rate": 0.0002,
1075
- "loss": 0.0284,
1076
  "step": 178
1077
  },
1078
  {
1079
  "epoch": 0.01,
1080
- "learning_rate": 0.0002,
1081
- "loss": 0.4115,
1082
  "step": 179
1083
  },
1084
  {
1085
  "epoch": 0.01,
1086
- "learning_rate": 0.0002,
1087
- "loss": 0.1322,
1088
  "step": 180
1089
  },
1090
  {
1091
  "epoch": 0.01,
1092
- "learning_rate": 0.0002,
1093
- "loss": 0.3562,
1094
  "step": 181
1095
  },
1096
  {
1097
  "epoch": 0.01,
1098
- "learning_rate": 0.0002,
1099
- "loss": 0.5618,
1100
  "step": 182
1101
  },
1102
  {
1103
  "epoch": 0.01,
1104
- "learning_rate": 0.0002,
1105
- "loss": 0.5469,
1106
  "step": 183
1107
  },
1108
  {
1109
  "epoch": 0.01,
1110
- "learning_rate": 0.0002,
1111
- "loss": 0.2538,
1112
  "step": 184
1113
  },
1114
  {
1115
  "epoch": 0.01,
1116
- "learning_rate": 0.0002,
1117
- "loss": 0.3875,
1118
  "step": 185
1119
  },
1120
  {
1121
  "epoch": 0.01,
1122
- "learning_rate": 0.0002,
1123
- "loss": 0.1755,
1124
  "step": 186
1125
  },
1126
  {
1127
  "epoch": 0.01,
1128
- "learning_rate": 0.0002,
1129
- "loss": 0.5634,
1130
  "step": 187
1131
  },
1132
  {
1133
  "epoch": 0.01,
1134
- "learning_rate": 0.0002,
1135
- "loss": 0.5176,
1136
  "step": 188
1137
  },
1138
  {
1139
  "epoch": 0.01,
1140
- "learning_rate": 0.0002,
1141
- "loss": 0.3164,
1142
  "step": 189
1143
  },
1144
  {
1145
  "epoch": 0.01,
1146
- "learning_rate": 0.0002,
1147
- "loss": 0.1107,
1148
  "step": 190
1149
  },
1150
  {
1151
  "epoch": 0.01,
1152
- "learning_rate": 0.0002,
1153
- "loss": 0.7371,
1154
  "step": 191
1155
  },
1156
  {
1157
  "epoch": 0.01,
1158
- "learning_rate": 0.0002,
1159
- "loss": 0.3597,
1160
  "step": 192
1161
  },
1162
  {
1163
  "epoch": 0.01,
1164
- "learning_rate": 0.0002,
1165
- "loss": 0.6858,
1166
  "step": 193
1167
  },
1168
  {
1169
  "epoch": 0.01,
1170
- "learning_rate": 0.0002,
1171
- "loss": 0.2797,
1172
  "step": 194
1173
  },
1174
  {
1175
  "epoch": 0.01,
1176
- "learning_rate": 0.0002,
1177
- "loss": 0.5096,
1178
  "step": 195
1179
  },
1180
  {
1181
  "epoch": 0.01,
1182
- "learning_rate": 0.0002,
1183
- "loss": 0.4265,
1184
  "step": 196
1185
  },
1186
  {
1187
  "epoch": 0.01,
1188
- "learning_rate": 0.0002,
1189
- "loss": 0.4173,
1190
  "step": 197
1191
  },
1192
  {
1193
  "epoch": 0.01,
1194
- "learning_rate": 0.0002,
1195
- "loss": 0.1054,
1196
  "step": 198
1197
  },
1198
  {
1199
  "epoch": 0.01,
1200
- "learning_rate": 0.0002,
1201
- "loss": 0.112,
1202
  "step": 199
1203
  },
1204
  {
1205
  "epoch": 0.01,
1206
- "learning_rate": 0.0002,
1207
- "loss": 0.316,
1208
  "step": 200
1209
  },
1210
  {
1211
  "epoch": 0.01,
1212
- "eval_loss": 0.3453182876110077,
1213
- "eval_runtime": 435.8836,
1214
- "eval_samples_per_second": 2.294,
1215
- "eval_steps_per_second": 1.147,
1216
  "step": 200
1217
  },
1218
  {
1219
  "epoch": 0.01,
1220
- "mmlu_eval_accuracy": 0.4811559812252676,
1221
- "mmlu_eval_accuracy_abstract_algebra": 0.45454545454545453,
1222
- "mmlu_eval_accuracy_anatomy": 0.5,
1223
- "mmlu_eval_accuracy_astronomy": 0.25,
1224
- "mmlu_eval_accuracy_business_ethics": 0.6363636363636364,
1225
  "mmlu_eval_accuracy_clinical_knowledge": 0.41379310344827586,
1226
- "mmlu_eval_accuracy_college_biology": 0.5,
1227
- "mmlu_eval_accuracy_college_chemistry": 0.125,
1228
- "mmlu_eval_accuracy_college_computer_science": 0.45454545454545453,
1229
- "mmlu_eval_accuracy_college_mathematics": 0.45454545454545453,
1230
- "mmlu_eval_accuracy_college_medicine": 0.5454545454545454,
1231
- "mmlu_eval_accuracy_college_physics": 0.36363636363636365,
1232
- "mmlu_eval_accuracy_computer_security": 0.6363636363636364,
1233
- "mmlu_eval_accuracy_conceptual_physics": 0.3076923076923077,
1234
- "mmlu_eval_accuracy_econometrics": 0.3333333333333333,
1235
  "mmlu_eval_accuracy_electrical_engineering": 0.25,
1236
- "mmlu_eval_accuracy_elementary_mathematics": 0.2926829268292683,
1237
  "mmlu_eval_accuracy_formal_logic": 0.21428571428571427,
1238
  "mmlu_eval_accuracy_global_facts": 0.0,
1239
- "mmlu_eval_accuracy_high_school_biology": 0.375,
1240
- "mmlu_eval_accuracy_high_school_chemistry": 0.3181818181818182,
1241
  "mmlu_eval_accuracy_high_school_computer_science": 0.5555555555555556,
1242
- "mmlu_eval_accuracy_high_school_european_history": 0.7222222222222222,
1243
- "mmlu_eval_accuracy_high_school_geography": 0.8181818181818182,
1244
- "mmlu_eval_accuracy_high_school_government_and_politics": 0.6666666666666666,
1245
- "mmlu_eval_accuracy_high_school_macroeconomics": 0.4186046511627907,
1246
- "mmlu_eval_accuracy_high_school_mathematics": 0.27586206896551724,
1247
- "mmlu_eval_accuracy_high_school_microeconomics": 0.5769230769230769,
1248
- "mmlu_eval_accuracy_high_school_physics": 0.4117647058823529,
1249
- "mmlu_eval_accuracy_high_school_psychology": 0.7166666666666667,
1250
- "mmlu_eval_accuracy_high_school_statistics": 0.34782608695652173,
1251
- "mmlu_eval_accuracy_high_school_us_history": 0.6818181818181818,
1252
- "mmlu_eval_accuracy_high_school_world_history": 0.5384615384615384,
1253
- "mmlu_eval_accuracy_human_aging": 0.6086956521739131,
1254
- "mmlu_eval_accuracy_human_sexuality": 0.3333333333333333,
1255
- "mmlu_eval_accuracy_international_law": 0.7692307692307693,
1256
- "mmlu_eval_accuracy_jurisprudence": 0.45454545454545453,
1257
- "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778,
1258
- "mmlu_eval_accuracy_machine_learning": 0.45454545454545453,
1259
- "mmlu_eval_accuracy_management": 0.7272727272727273,
1260
- "mmlu_eval_accuracy_marketing": 0.88,
1261
- "mmlu_eval_accuracy_medical_genetics": 0.7272727272727273,
1262
- "mmlu_eval_accuracy_miscellaneous": 0.5813953488372093,
1263
- "mmlu_eval_accuracy_moral_disputes": 0.42105263157894735,
1264
- "mmlu_eval_accuracy_moral_scenarios": 0.35,
1265
- "mmlu_eval_accuracy_nutrition": 0.6363636363636364,
1266
- "mmlu_eval_accuracy_philosophy": 0.47058823529411764,
1267
- "mmlu_eval_accuracy_prehistory": 0.34285714285714286,
1268
- "mmlu_eval_accuracy_professional_accounting": 0.22580645161290322,
1269
- "mmlu_eval_accuracy_professional_law": 0.3176470588235294,
1270
- "mmlu_eval_accuracy_professional_medicine": 0.2903225806451613,
1271
- "mmlu_eval_accuracy_professional_psychology": 0.4492753623188406,
1272
- "mmlu_eval_accuracy_public_relations": 0.4166666666666667,
1273
- "mmlu_eval_accuracy_security_studies": 0.5185185185185185,
1274
- "mmlu_eval_accuracy_sociology": 0.5909090909090909,
1275
- "mmlu_eval_accuracy_us_foreign_policy": 0.6363636363636364,
1276
- "mmlu_eval_accuracy_virology": 0.5,
1277
- "mmlu_eval_accuracy_world_religions": 0.7894736842105263,
1278
- "mmlu_loss": 0.9429792200577477,
1279
  "step": 200
1280
  }
1281
  ],
1282
  "max_steps": 5000,
1283
  "num_train_epochs": 1,
1284
- "total_flos": 3.4693272745672704e+16,
1285
  "trial_name": null,
1286
  "trial_params": null
1287
  }
 
1
  {
2
+ "best_metric": 0.3938411474227905,
3
+ "best_model_checkpoint": "./output_v2/34bCodellama_CodeLlama-34b-Python-hf_unnatural-instructions_standardized/checkpoint-200",
4
  "epoch": 0.012222137346268428,
5
  "global_step": 200,
6
  "is_hyper_param_search": false,
 
9
  "log_history": [
10
  {
11
  "epoch": 0.0,
12
+ "learning_rate": 0.0004,
13
+ "loss": 0.1006,
14
  "step": 1
15
  },
16
  {
17
  "epoch": 0.0,
18
+ "learning_rate": 0.0004,
19
+ "loss": 1.2792,
20
  "step": 2
21
  },
22
  {
23
  "epoch": 0.0,
24
+ "learning_rate": 0.0004,
25
+ "loss": 0.3465,
26
  "step": 3
27
  },
28
  {
29
  "epoch": 0.0,
30
+ "learning_rate": 0.0004,
31
+ "loss": 0.4367,
32
  "step": 4
33
  },
34
  {
35
  "epoch": 0.0,
36
+ "learning_rate": 0.0004,
37
+ "loss": 0.1706,
38
  "step": 5
39
  },
40
  {
41
  "epoch": 0.0,
42
+ "learning_rate": 0.0004,
43
+ "loss": 0.632,
44
  "step": 6
45
  },
46
  {
47
  "epoch": 0.0,
48
+ "learning_rate": 0.0004,
49
+ "loss": 0.1233,
50
  "step": 7
51
  },
52
  {
53
  "epoch": 0.0,
54
+ "learning_rate": 0.0004,
55
+ "loss": 0.4661,
56
  "step": 8
57
  },
58
  {
59
  "epoch": 0.0,
60
+ "learning_rate": 0.0004,
61
+ "loss": 0.1672,
62
  "step": 9
63
  },
64
  {
65
  "epoch": 0.0,
66
+ "learning_rate": 0.0004,
67
+ "loss": 0.0641,
68
  "step": 10
69
  },
70
  {
71
  "epoch": 0.0,
72
+ "learning_rate": 0.0004,
73
+ "loss": 0.3908,
74
  "step": 11
75
  },
76
  {
77
  "epoch": 0.0,
78
+ "learning_rate": 0.0004,
79
+ "loss": 0.505,
80
  "step": 12
81
  },
82
  {
83
  "epoch": 0.0,
84
+ "learning_rate": 0.0004,
85
+ "loss": 0.7672,
86
  "step": 13
87
  },
88
  {
89
  "epoch": 0.0,
90
+ "learning_rate": 0.0004,
91
+ "loss": 0.2875,
92
  "step": 14
93
  },
94
  {
95
  "epoch": 0.0,
96
+ "learning_rate": 0.0004,
97
+ "loss": 0.4619,
98
  "step": 15
99
  },
100
  {
101
  "epoch": 0.0,
102
+ "learning_rate": 0.0004,
103
+ "loss": 0.4586,
104
  "step": 16
105
  },
106
  {
107
  "epoch": 0.0,
108
+ "learning_rate": 0.0004,
109
+ "loss": 0.7159,
110
  "step": 17
111
  },
112
  {
113
  "epoch": 0.0,
114
+ "learning_rate": 0.0004,
115
+ "loss": 0.207,
116
  "step": 18
117
  },
118
  {
119
  "epoch": 0.0,
120
+ "learning_rate": 0.0004,
121
+ "loss": 0.4808,
122
  "step": 19
123
  },
124
  {
125
  "epoch": 0.0,
126
+ "learning_rate": 0.0004,
127
+ "loss": 0.104,
128
  "step": 20
129
  },
130
  {
131
  "epoch": 0.0,
132
+ "learning_rate": 0.0004,
133
+ "loss": 0.5561,
134
  "step": 21
135
  },
136
  {
137
  "epoch": 0.0,
138
+ "learning_rate": 0.0004,
139
+ "loss": 0.4114,
140
  "step": 22
141
  },
142
  {
143
  "epoch": 0.0,
144
+ "learning_rate": 0.0004,
145
+ "loss": 0.3656,
146
  "step": 23
147
  },
148
  {
149
  "epoch": 0.0,
150
+ "learning_rate": 0.0004,
151
+ "loss": 0.6222,
152
  "step": 24
153
  },
154
  {
155
  "epoch": 0.0,
156
+ "learning_rate": 0.0004,
157
+ "loss": 0.502,
158
  "step": 25
159
  },
160
  {
161
  "epoch": 0.0,
162
+ "learning_rate": 0.0004,
163
+ "loss": 0.3339,
164
  "step": 26
165
  },
166
  {
167
  "epoch": 0.0,
168
+ "learning_rate": 0.0004,
169
+ "loss": 0.6282,
170
  "step": 27
171
  },
172
  {
173
  "epoch": 0.0,
174
+ "learning_rate": 0.0004,
175
+ "loss": 0.104,
176
  "step": 28
177
  },
178
  {
179
  "epoch": 0.0,
180
+ "learning_rate": 0.0004,
181
+ "loss": 0.7241,
182
  "step": 29
183
  },
184
  {
185
  "epoch": 0.0,
186
+ "learning_rate": 0.0004,
187
+ "loss": 0.7388,
188
  "step": 30
189
  },
190
  {
191
  "epoch": 0.0,
192
+ "learning_rate": 0.0004,
193
+ "loss": 0.1877,
194
  "step": 31
195
  },
196
  {
197
  "epoch": 0.0,
198
+ "learning_rate": 0.0004,
199
+ "loss": 0.5733,
200
  "step": 32
201
  },
202
  {
203
  "epoch": 0.0,
204
+ "learning_rate": 0.0004,
205
+ "loss": 0.6204,
206
  "step": 33
207
  },
208
  {
209
  "epoch": 0.0,
210
+ "learning_rate": 0.0004,
211
+ "loss": 0.9779,
212
  "step": 34
213
  },
214
  {
215
  "epoch": 0.0,
216
+ "learning_rate": 0.0004,
217
+ "loss": 1.0215,
218
  "step": 35
219
  },
220
  {
221
  "epoch": 0.0,
222
+ "learning_rate": 0.0004,
223
+ "loss": 0.2895,
224
  "step": 36
225
  },
226
  {
227
  "epoch": 0.0,
228
+ "learning_rate": 0.0004,
229
+ "loss": 0.3081,
230
  "step": 37
231
  },
232
  {
233
  "epoch": 0.0,
234
+ "learning_rate": 0.0004,
235
+ "loss": 0.6799,
236
  "step": 38
237
  },
238
  {
239
  "epoch": 0.0,
240
+ "learning_rate": 0.0004,
241
+ "loss": 0.6704,
242
  "step": 39
243
  },
244
  {
245
  "epoch": 0.0,
246
+ "learning_rate": 0.0004,
247
+ "loss": 0.6087,
248
  "step": 40
249
  },
250
  {
251
  "epoch": 0.0,
252
+ "learning_rate": 0.0004,
253
+ "loss": 0.8191,
254
  "step": 41
255
  },
256
  {
257
  "epoch": 0.0,
258
+ "learning_rate": 0.0004,
259
+ "loss": 0.307,
260
  "step": 42
261
  },
262
  {
263
  "epoch": 0.0,
264
+ "learning_rate": 0.0004,
265
+ "loss": 0.431,
266
  "step": 43
267
  },
268
  {
269
  "epoch": 0.0,
270
+ "learning_rate": 0.0004,
271
+ "loss": 0.2427,
272
  "step": 44
273
  },
274
  {
275
  "epoch": 0.0,
276
+ "learning_rate": 0.0004,
277
+ "loss": 0.8054,
278
  "step": 45
279
  },
280
  {
281
  "epoch": 0.0,
282
+ "learning_rate": 0.0004,
283
+ "loss": 1.0238,
284
  "step": 46
285
  },
286
  {
287
  "epoch": 0.0,
288
+ "learning_rate": 0.0004,
289
+ "loss": 0.4241,
290
  "step": 47
291
  },
292
  {
293
  "epoch": 0.0,
294
+ "learning_rate": 0.0004,
295
+ "loss": 0.1145,
296
  "step": 48
297
  },
298
  {
299
  "epoch": 0.0,
300
+ "learning_rate": 0.0004,
301
+ "loss": 1.069,
302
  "step": 49
303
  },
304
  {
305
  "epoch": 0.0,
306
+ "learning_rate": 0.0004,
307
+ "loss": 1.0728,
308
  "step": 50
309
  },
310
  {
311
  "epoch": 0.0,
312
+ "learning_rate": 0.0004,
313
+ "loss": 0.108,
314
  "step": 51
315
  },
316
  {
317
  "epoch": 0.0,
318
+ "learning_rate": 0.0004,
319
+ "loss": 0.2927,
320
  "step": 52
321
  },
322
  {
323
  "epoch": 0.0,
324
+ "learning_rate": 0.0004,
325
+ "loss": 0.2443,
326
  "step": 53
327
  },
328
  {
329
  "epoch": 0.0,
330
+ "learning_rate": 0.0004,
331
+ "loss": 0.0006,
332
  "step": 54
333
  },
334
  {
335
  "epoch": 0.0,
336
+ "learning_rate": 0.0004,
337
+ "loss": 0.2178,
338
  "step": 55
339
  },
340
  {
341
  "epoch": 0.0,
342
+ "learning_rate": 0.0004,
343
+ "loss": 0.2221,
344
  "step": 56
345
  },
346
  {
347
  "epoch": 0.0,
348
+ "learning_rate": 0.0004,
349
+ "loss": 0.0375,
350
  "step": 57
351
  },
352
  {
353
  "epoch": 0.0,
354
+ "learning_rate": 0.0004,
355
+ "loss": 0.1756,
356
  "step": 58
357
  },
358
  {
359
  "epoch": 0.0,
360
+ "learning_rate": 0.0004,
361
+ "loss": 0.4141,
362
  "step": 59
363
  },
364
  {
365
  "epoch": 0.0,
366
+ "learning_rate": 0.0004,
367
+ "loss": 0.154,
368
  "step": 60
369
  },
370
  {
371
  "epoch": 0.0,
372
+ "learning_rate": 0.0004,
373
+ "loss": 0.1159,
374
  "step": 61
375
  },
376
  {
377
  "epoch": 0.0,
378
+ "learning_rate": 0.0004,
379
+ "loss": 0.2163,
380
  "step": 62
381
  },
382
  {
383
  "epoch": 0.0,
384
+ "learning_rate": 0.0004,
385
+ "loss": 0.3193,
386
  "step": 63
387
  },
388
  {
389
  "epoch": 0.0,
390
+ "learning_rate": 0.0004,
391
+ "loss": 0.3983,
392
  "step": 64
393
  },
394
  {
395
  "epoch": 0.0,
396
+ "learning_rate": 0.0004,
397
+ "loss": 0.7675,
398
  "step": 65
399
  },
400
  {
401
  "epoch": 0.0,
402
+ "learning_rate": 0.0004,
403
+ "loss": 0.395,
404
  "step": 66
405
  },
406
  {
407
  "epoch": 0.0,
408
+ "learning_rate": 0.0004,
409
+ "loss": 0.4137,
410
  "step": 67
411
  },
412
  {
413
  "epoch": 0.0,
414
+ "learning_rate": 0.0004,
415
+ "loss": 0.1585,
416
  "step": 68
417
  },
418
  {
419
  "epoch": 0.0,
420
+ "learning_rate": 0.0004,
421
+ "loss": 0.0744,
422
  "step": 69
423
  },
424
  {
425
  "epoch": 0.0,
426
+ "learning_rate": 0.0004,
427
+ "loss": 0.2868,
428
  "step": 70
429
  },
430
  {
431
  "epoch": 0.0,
432
+ "learning_rate": 0.0004,
433
+ "loss": 0.6288,
434
  "step": 71
435
  },
436
  {
437
  "epoch": 0.0,
438
+ "learning_rate": 0.0004,
439
+ "loss": 0.2539,
440
  "step": 72
441
  },
442
  {
443
  "epoch": 0.0,
444
+ "learning_rate": 0.0004,
445
+ "loss": 0.9,
446
  "step": 73
447
  },
448
  {
449
  "epoch": 0.0,
450
+ "learning_rate": 0.0004,
451
+ "loss": 0.5689,
452
  "step": 74
453
  },
454
  {
455
  "epoch": 0.0,
456
+ "learning_rate": 0.0004,
457
+ "loss": 0.1503,
458
  "step": 75
459
  },
460
  {
461
  "epoch": 0.0,
462
+ "learning_rate": 0.0004,
463
+ "loss": 0.6418,
464
  "step": 76
465
  },
466
  {
467
  "epoch": 0.0,
468
+ "learning_rate": 0.0004,
469
+ "loss": 0.2353,
470
  "step": 77
471
  },
472
  {
473
  "epoch": 0.0,
474
+ "learning_rate": 0.0004,
475
+ "loss": 0.8223,
476
  "step": 78
477
  },
478
  {
479
  "epoch": 0.0,
480
+ "learning_rate": 0.0004,
481
+ "loss": 0.1297,
482
  "step": 79
483
  },
484
  {
485
  "epoch": 0.0,
486
+ "learning_rate": 0.0004,
487
+ "loss": 0.6385,
488
  "step": 80
489
  },
490
  {
491
  "epoch": 0.0,
492
+ "learning_rate": 0.0004,
493
+ "loss": 0.1623,
494
  "step": 81
495
  },
496
  {
497
  "epoch": 0.01,
498
+ "learning_rate": 0.0004,
499
+ "loss": 0.3846,
500
  "step": 82
501
  },
502
  {
503
  "epoch": 0.01,
504
+ "learning_rate": 0.0004,
505
+ "loss": 0.3152,
506
  "step": 83
507
  },
508
  {
509
  "epoch": 0.01,
510
+ "learning_rate": 0.0004,
511
+ "loss": 0.1425,
512
  "step": 84
513
  },
514
  {
515
  "epoch": 0.01,
516
+ "learning_rate": 0.0004,
517
+ "loss": 0.6978,
518
  "step": 85
519
  },
520
  {
521
  "epoch": 0.01,
522
+ "learning_rate": 0.0004,
523
+ "loss": 1.0012,
524
  "step": 86
525
  },
526
  {
527
  "epoch": 0.01,
528
+ "learning_rate": 0.0004,
529
+ "loss": 0.1544,
530
  "step": 87
531
  },
532
  {
533
  "epoch": 0.01,
534
+ "learning_rate": 0.0004,
535
+ "loss": 0.7167,
536
  "step": 88
537
  },
538
  {
539
  "epoch": 0.01,
540
+ "learning_rate": 0.0004,
541
+ "loss": 0.5173,
542
  "step": 89
543
  },
544
  {
545
  "epoch": 0.01,
546
+ "learning_rate": 0.0004,
547
+ "loss": 0.4471,
548
  "step": 90
549
  },
550
  {
551
  "epoch": 0.01,
552
+ "learning_rate": 0.0004,
553
+ "loss": 0.4159,
554
  "step": 91
555
  },
556
  {
557
  "epoch": 0.01,
558
+ "learning_rate": 0.0004,
559
+ "loss": 0.697,
560
  "step": 92
561
  },
562
  {
563
  "epoch": 0.01,
564
+ "learning_rate": 0.0004,
565
+ "loss": 0.2301,
566
  "step": 93
567
  },
568
  {
569
  "epoch": 0.01,
570
+ "learning_rate": 0.0004,
571
+ "loss": 0.9655,
572
  "step": 94
573
  },
574
  {
575
  "epoch": 0.01,
576
+ "learning_rate": 0.0004,
577
+ "loss": 0.2113,
578
  "step": 95
579
  },
580
  {
581
  "epoch": 0.01,
582
+ "learning_rate": 0.0004,
583
+ "loss": 1.5099,
584
  "step": 96
585
  },
586
  {
587
  "epoch": 0.01,
588
+ "learning_rate": 0.0004,
589
+ "loss": 0.6587,
590
  "step": 97
591
  },
592
  {
593
  "epoch": 0.01,
594
+ "learning_rate": 0.0004,
595
+ "loss": 0.677,
596
  "step": 98
597
  },
598
  {
599
  "epoch": 0.01,
600
+ "learning_rate": 0.0004,
601
+ "loss": 0.8563,
602
  "step": 99
603
  },
604
  {
605
  "epoch": 0.01,
606
+ "learning_rate": 0.0004,
607
+ "loss": 1.6579,
608
  "step": 100
609
  },
610
  {
611
  "epoch": 0.01,
612
+ "learning_rate": 0.0004,
613
+ "loss": 0.2976,
614
  "step": 101
615
  },
616
  {
617
  "epoch": 0.01,
618
+ "learning_rate": 0.0004,
619
+ "loss": 0.4181,
620
  "step": 102
621
  },
622
  {
623
  "epoch": 0.01,
624
+ "learning_rate": 0.0004,
625
+ "loss": 0.3141,
626
  "step": 103
627
  },
628
  {
629
  "epoch": 0.01,
630
+ "learning_rate": 0.0004,
631
+ "loss": 0.1189,
632
  "step": 104
633
  },
634
  {
635
  "epoch": 0.01,
636
+ "learning_rate": 0.0004,
637
+ "loss": 0.0589,
638
  "step": 105
639
  },
640
  {
641
  "epoch": 0.01,
642
+ "learning_rate": 0.0004,
643
+ "loss": 0.533,
644
  "step": 106
645
  },
646
  {
647
  "epoch": 0.01,
648
+ "learning_rate": 0.0004,
649
+ "loss": 0.4562,
650
  "step": 107
651
  },
652
  {
653
  "epoch": 0.01,
654
+ "learning_rate": 0.0004,
655
+ "loss": 0.2835,
656
  "step": 108
657
  },
658
  {
659
  "epoch": 0.01,
660
+ "learning_rate": 0.0004,
661
+ "loss": 0.5246,
662
  "step": 109
663
  },
664
  {
665
  "epoch": 0.01,
666
+ "learning_rate": 0.0004,
667
+ "loss": 0.2345,
668
  "step": 110
669
  },
670
  {
671
  "epoch": 0.01,
672
+ "learning_rate": 0.0004,
673
+ "loss": 0.1858,
674
  "step": 111
675
  },
676
  {
677
  "epoch": 0.01,
678
+ "learning_rate": 0.0004,
679
+ "loss": 0.5243,
680
  "step": 112
681
  },
682
  {
683
  "epoch": 0.01,
684
+ "learning_rate": 0.0004,
685
+ "loss": 0.3014,
686
  "step": 113
687
  },
688
  {
689
  "epoch": 0.01,
690
+ "learning_rate": 0.0004,
691
+ "loss": 0.0783,
692
  "step": 114
693
  },
694
  {
695
  "epoch": 0.01,
696
+ "learning_rate": 0.0004,
697
+ "loss": 0.1369,
698
  "step": 115
699
  },
700
  {
701
  "epoch": 0.01,
702
+ "learning_rate": 0.0004,
703
+ "loss": 0.1517,
704
  "step": 116
705
  },
706
  {
707
  "epoch": 0.01,
708
+ "learning_rate": 0.0004,
709
+ "loss": 0.4089,
710
  "step": 117
711
  },
712
  {
713
  "epoch": 0.01,
714
+ "learning_rate": 0.0004,
715
+ "loss": 0.184,
716
  "step": 118
717
  },
718
  {
719
  "epoch": 0.01,
720
+ "learning_rate": 0.0004,
721
+ "loss": 0.218,
722
  "step": 119
723
  },
724
  {
725
  "epoch": 0.01,
726
+ "learning_rate": 0.0004,
727
+ "loss": 0.2696,
728
  "step": 120
729
  },
730
  {
731
  "epoch": 0.01,
732
+ "learning_rate": 0.0004,
733
+ "loss": 0.0955,
734
  "step": 121
735
  },
736
  {
737
  "epoch": 0.01,
738
+ "learning_rate": 0.0004,
739
+ "loss": 0.3469,
740
  "step": 122
741
  },
742
  {
743
  "epoch": 0.01,
744
+ "learning_rate": 0.0004,
745
+ "loss": 0.2769,
746
  "step": 123
747
  },
748
  {
749
  "epoch": 0.01,
750
+ "learning_rate": 0.0004,
751
+ "loss": 0.2437,
752
  "step": 124
753
  },
754
  {
755
  "epoch": 0.01,
756
+ "learning_rate": 0.0004,
757
+ "loss": 0.2283,
758
  "step": 125
759
  },
760
  {
761
  "epoch": 0.01,
762
+ "learning_rate": 0.0004,
763
+ "loss": 0.5484,
764
  "step": 126
765
  },
766
  {
767
  "epoch": 0.01,
768
+ "learning_rate": 0.0004,
769
+ "loss": 0.3495,
770
  "step": 127
771
  },
772
  {
773
  "epoch": 0.01,
774
+ "learning_rate": 0.0004,
775
+ "loss": 0.7042,
776
  "step": 128
777
  },
778
  {
779
  "epoch": 0.01,
780
+ "learning_rate": 0.0004,
781
+ "loss": 0.3839,
782
  "step": 129
783
  },
784
  {
785
  "epoch": 0.01,
786
+ "learning_rate": 0.0004,
787
+ "loss": 0.3892,
788
  "step": 130
789
  },
790
  {
791
  "epoch": 0.01,
792
+ "learning_rate": 0.0004,
793
+ "loss": 0.2422,
794
  "step": 131
795
  },
796
  {
797
  "epoch": 0.01,
798
+ "learning_rate": 0.0004,
799
+ "loss": 0.3934,
800
  "step": 132
801
  },
802
  {
803
  "epoch": 0.01,
804
+ "learning_rate": 0.0004,
805
+ "loss": 0.4136,
806
  "step": 133
807
  },
808
  {
809
  "epoch": 0.01,
810
+ "learning_rate": 0.0004,
811
+ "loss": 0.0939,
812
  "step": 134
813
  },
814
  {
815
  "epoch": 0.01,
816
+ "learning_rate": 0.0004,
817
+ "loss": 0.508,
818
  "step": 135
819
  },
820
  {
821
  "epoch": 0.01,
822
+ "learning_rate": 0.0004,
823
+ "loss": 0.3331,
824
  "step": 136
825
  },
826
  {
827
  "epoch": 0.01,
828
+ "learning_rate": 0.0004,
829
+ "loss": 0.377,
830
  "step": 137
831
  },
832
  {
833
  "epoch": 0.01,
834
+ "learning_rate": 0.0004,
835
+ "loss": 0.8366,
836
  "step": 138
837
  },
838
  {
839
  "epoch": 0.01,
840
+ "learning_rate": 0.0004,
841
+ "loss": 0.2068,
842
  "step": 139
843
  },
844
  {
845
  "epoch": 0.01,
846
+ "learning_rate": 0.0004,
847
+ "loss": 0.484,
848
  "step": 140
849
  },
850
  {
851
  "epoch": 0.01,
852
+ "learning_rate": 0.0004,
853
+ "loss": 0.8796,
854
  "step": 141
855
  },
856
  {
857
  "epoch": 0.01,
858
+ "learning_rate": 0.0004,
859
+ "loss": 0.4984,
860
  "step": 142
861
  },
862
  {
863
  "epoch": 0.01,
864
+ "learning_rate": 0.0004,
865
+ "loss": 0.5241,
866
  "step": 143
867
  },
868
  {
869
  "epoch": 0.01,
870
+ "learning_rate": 0.0004,
871
+ "loss": 0.4839,
872
  "step": 144
873
  },
874
  {
875
  "epoch": 0.01,
876
+ "learning_rate": 0.0004,
877
+ "loss": 0.2773,
878
  "step": 145
879
  },
880
  {
881
  "epoch": 0.01,
882
+ "learning_rate": 0.0004,
883
+ "loss": 0.5004,
884
  "step": 146
885
  },
886
  {
887
  "epoch": 0.01,
888
+ "learning_rate": 0.0004,
889
+ "loss": 0.3029,
890
  "step": 147
891
  },
892
  {
893
  "epoch": 0.01,
894
+ "learning_rate": 0.0004,
895
+ "loss": 0.9682,
896
  "step": 148
897
  },
898
  {
899
  "epoch": 0.01,
900
+ "learning_rate": 0.0004,
901
+ "loss": 0.3496,
902
  "step": 149
903
  },
904
  {
905
  "epoch": 0.01,
906
+ "learning_rate": 0.0004,
907
+ "loss": 0.462,
908
  "step": 150
909
  },
910
  {
911
  "epoch": 0.01,
912
+ "learning_rate": 0.0004,
913
+ "loss": 0.1464,
914
  "step": 151
915
  },
916
  {
917
  "epoch": 0.01,
918
+ "learning_rate": 0.0004,
919
+ "loss": 0.1177,
920
  "step": 152
921
  },
922
  {
923
  "epoch": 0.01,
924
+ "learning_rate": 0.0004,
925
+ "loss": 0.3903,
926
  "step": 153
927
  },
928
  {
929
  "epoch": 0.01,
930
+ "learning_rate": 0.0004,
931
+ "loss": 0.2373,
932
  "step": 154
933
  },
934
  {
935
  "epoch": 0.01,
936
+ "learning_rate": 0.0004,
937
+ "loss": 0.1732,
938
  "step": 155
939
  },
940
  {
941
  "epoch": 0.01,
942
+ "learning_rate": 0.0004,
943
+ "loss": 0.5158,
944
  "step": 156
945
  },
946
  {
947
  "epoch": 0.01,
948
+ "learning_rate": 0.0004,
949
+ "loss": 0.3224,
950
  "step": 157
951
  },
952
  {
953
  "epoch": 0.01,
954
+ "learning_rate": 0.0004,
955
+ "loss": 0.2082,
956
  "step": 158
957
  },
958
  {
959
  "epoch": 0.01,
960
+ "learning_rate": 0.0004,
961
+ "loss": 0.2307,
962
  "step": 159
963
  },
964
  {
965
  "epoch": 0.01,
966
+ "learning_rate": 0.0004,
967
+ "loss": 0.1758,
968
  "step": 160
969
  },
970
  {
971
  "epoch": 0.01,
972
+ "learning_rate": 0.0004,
973
+ "loss": 0.2339,
974
  "step": 161
975
  },
976
  {
977
  "epoch": 0.01,
978
+ "learning_rate": 0.0004,
979
+ "loss": 0.0613,
980
  "step": 162
981
  },
982
  {
983
  "epoch": 0.01,
984
+ "learning_rate": 0.0004,
985
+ "loss": 0.1142,
986
  "step": 163
987
  },
988
  {
989
  "epoch": 0.01,
990
+ "learning_rate": 0.0004,
991
+ "loss": 0.3177,
992
  "step": 164
993
  },
994
  {
995
  "epoch": 0.01,
996
+ "learning_rate": 0.0004,
997
+ "loss": 0.4358,
998
  "step": 165
999
  },
1000
  {
1001
  "epoch": 0.01,
1002
+ "learning_rate": 0.0004,
1003
+ "loss": 1.3582,
1004
  "step": 166
1005
  },
1006
  {
1007
  "epoch": 0.01,
1008
+ "learning_rate": 0.0004,
1009
+ "loss": 0.5703,
1010
  "step": 167
1011
  },
1012
  {
1013
  "epoch": 0.01,
1014
+ "learning_rate": 0.0004,
1015
+ "loss": 0.3477,
1016
  "step": 168
1017
  },
1018
  {
1019
  "epoch": 0.01,
1020
+ "learning_rate": 0.0004,
1021
+ "loss": 0.4394,
1022
  "step": 169
1023
  },
1024
  {
1025
  "epoch": 0.01,
1026
+ "learning_rate": 0.0004,
1027
+ "loss": 0.3481,
1028
  "step": 170
1029
  },
1030
  {
1031
  "epoch": 0.01,
1032
+ "learning_rate": 0.0004,
1033
+ "loss": 0.1735,
1034
  "step": 171
1035
  },
1036
  {
1037
  "epoch": 0.01,
1038
+ "learning_rate": 0.0004,
1039
+ "loss": 0.0878,
1040
  "step": 172
1041
  },
1042
  {
1043
  "epoch": 0.01,
1044
+ "learning_rate": 0.0004,
1045
+ "loss": 0.0659,
1046
  "step": 173
1047
  },
1048
  {
1049
  "epoch": 0.01,
1050
+ "learning_rate": 0.0004,
1051
+ "loss": 0.3527,
1052
  "step": 174
1053
  },
1054
  {
1055
  "epoch": 0.01,
1056
+ "learning_rate": 0.0004,
1057
+ "loss": 0.1819,
1058
  "step": 175
1059
  },
1060
  {
1061
  "epoch": 0.01,
1062
+ "learning_rate": 0.0004,
1063
+ "loss": 0.379,
1064
  "step": 176
1065
  },
1066
  {
1067
  "epoch": 0.01,
1068
+ "learning_rate": 0.0004,
1069
+ "loss": 0.2146,
1070
  "step": 177
1071
  },
1072
  {
1073
  "epoch": 0.01,
1074
+ "learning_rate": 0.0004,
1075
+ "loss": 0.133,
1076
  "step": 178
1077
  },
1078
  {
1079
  "epoch": 0.01,
1080
+ "learning_rate": 0.0004,
1081
+ "loss": 0.5217,
1082
  "step": 179
1083
  },
1084
  {
1085
  "epoch": 0.01,
1086
+ "learning_rate": 0.0004,
1087
+ "loss": 0.3077,
1088
  "step": 180
1089
  },
1090
  {
1091
  "epoch": 0.01,
1092
+ "learning_rate": 0.0004,
1093
+ "loss": 0.0022,
1094
  "step": 181
1095
  },
1096
  {
1097
  "epoch": 0.01,
1098
+ "learning_rate": 0.0004,
1099
+ "loss": 0.1031,
1100
  "step": 182
1101
  },
1102
  {
1103
  "epoch": 0.01,
1104
+ "learning_rate": 0.0004,
1105
+ "loss": 0.681,
1106
  "step": 183
1107
  },
1108
  {
1109
  "epoch": 0.01,
1110
+ "learning_rate": 0.0004,
1111
+ "loss": 0.7839,
1112
  "step": 184
1113
  },
1114
  {
1115
  "epoch": 0.01,
1116
+ "learning_rate": 0.0004,
1117
+ "loss": 0.6465,
1118
  "step": 185
1119
  },
1120
  {
1121
  "epoch": 0.01,
1122
+ "learning_rate": 0.0004,
1123
+ "loss": 0.2607,
1124
  "step": 186
1125
  },
1126
  {
1127
  "epoch": 0.01,
1128
+ "learning_rate": 0.0004,
1129
+ "loss": 0.7913,
1130
  "step": 187
1131
  },
1132
  {
1133
  "epoch": 0.01,
1134
+ "learning_rate": 0.0004,
1135
+ "loss": 0.4266,
1136
  "step": 188
1137
  },
1138
  {
1139
  "epoch": 0.01,
1140
+ "learning_rate": 0.0004,
1141
+ "loss": 0.2851,
1142
  "step": 189
1143
  },
1144
  {
1145
  "epoch": 0.01,
1146
+ "learning_rate": 0.0004,
1147
+ "loss": 0.6628,
1148
  "step": 190
1149
  },
1150
  {
1151
  "epoch": 0.01,
1152
+ "learning_rate": 0.0004,
1153
+ "loss": 0.8151,
1154
  "step": 191
1155
  },
1156
  {
1157
  "epoch": 0.01,
1158
+ "learning_rate": 0.0004,
1159
+ "loss": 0.3577,
1160
  "step": 192
1161
  },
1162
  {
1163
  "epoch": 0.01,
1164
+ "learning_rate": 0.0004,
1165
+ "loss": 0.4329,
1166
  "step": 193
1167
  },
1168
  {
1169
  "epoch": 0.01,
1170
+ "learning_rate": 0.0004,
1171
+ "loss": 0.1639,
1172
  "step": 194
1173
  },
1174
  {
1175
  "epoch": 0.01,
1176
+ "learning_rate": 0.0004,
1177
+ "loss": 0.1394,
1178
  "step": 195
1179
  },
1180
  {
1181
  "epoch": 0.01,
1182
+ "learning_rate": 0.0004,
1183
+ "loss": 0.3146,
1184
  "step": 196
1185
  },
1186
  {
1187
  "epoch": 0.01,
1188
+ "learning_rate": 0.0004,
1189
+ "loss": 0.2623,
1190
  "step": 197
1191
  },
1192
  {
1193
  "epoch": 0.01,
1194
+ "learning_rate": 0.0004,
1195
+ "loss": 1.3405,
1196
  "step": 198
1197
  },
1198
  {
1199
  "epoch": 0.01,
1200
+ "learning_rate": 0.0004,
1201
+ "loss": 0.6208,
1202
  "step": 199
1203
  },
1204
  {
1205
  "epoch": 0.01,
1206
+ "learning_rate": 0.0004,
1207
+ "loss": 0.7118,
1208
  "step": 200
1209
  },
1210
  {
1211
  "epoch": 0.01,
1212
+ "eval_loss": 0.3938411474227905,
1213
+ "eval_runtime": 219.0899,
1214
+ "eval_samples_per_second": 2.282,
1215
+ "eval_steps_per_second": 1.141,
1216
  "step": 200
1217
  },
1218
  {
1219
  "epoch": 0.01,
1220
+ "mmlu_eval_accuracy": 0.3485764968358423,
1221
+ "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365,
1222
+ "mmlu_eval_accuracy_anatomy": 0.35714285714285715,
1223
+ "mmlu_eval_accuracy_astronomy": 0.5,
1224
+ "mmlu_eval_accuracy_business_ethics": 0.2727272727272727,
1225
  "mmlu_eval_accuracy_clinical_knowledge": 0.41379310344827586,
1226
+ "mmlu_eval_accuracy_college_biology": 0.4375,
1227
+ "mmlu_eval_accuracy_college_chemistry": 0.25,
1228
+ "mmlu_eval_accuracy_college_computer_science": 0.36363636363636365,
1229
+ "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365,
1230
+ "mmlu_eval_accuracy_college_medicine": 0.4090909090909091,
1231
+ "mmlu_eval_accuracy_college_physics": 0.2727272727272727,
1232
+ "mmlu_eval_accuracy_computer_security": 0.45454545454545453,
1233
+ "mmlu_eval_accuracy_conceptual_physics": 0.34615384615384615,
1234
+ "mmlu_eval_accuracy_econometrics": 0.25,
1235
  "mmlu_eval_accuracy_electrical_engineering": 0.25,
1236
+ "mmlu_eval_accuracy_elementary_mathematics": 0.34146341463414637,
1237
  "mmlu_eval_accuracy_formal_logic": 0.21428571428571427,
1238
  "mmlu_eval_accuracy_global_facts": 0.0,
1239
+ "mmlu_eval_accuracy_high_school_biology": 0.3125,
1240
+ "mmlu_eval_accuracy_high_school_chemistry": 0.18181818181818182,
1241
  "mmlu_eval_accuracy_high_school_computer_science": 0.5555555555555556,
1242
+ "mmlu_eval_accuracy_high_school_european_history": 0.5555555555555556,
1243
+ "mmlu_eval_accuracy_high_school_geography": 0.7272727272727273,
1244
+ "mmlu_eval_accuracy_high_school_government_and_politics": 0.2857142857142857,
1245
+ "mmlu_eval_accuracy_high_school_macroeconomics": 0.5116279069767442,
1246
+ "mmlu_eval_accuracy_high_school_mathematics": 0.20689655172413793,
1247
+ "mmlu_eval_accuracy_high_school_microeconomics": 0.21428571428571427,
1248
+ "mmlu_loss": 0.783768397025764,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1249
  "step": 200
1250
  }
1251
  ],
1252
  "max_steps": 5000,
1253
  "num_train_epochs": 1,
1254
+ "total_flos": 2.978705686187213e+16,
1255
  "trial_name": null,
1256
  "trial_params": null
1257
  }
checkpoint-200/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fd28a065deb906dd6787d5be775d7f7fef1c3352a93f2dc2266d20467a05b48d
3
  size 6011
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6221336348c810e346236bf80a362d1c36330d016829c5789d6e4b72e63969b6
3
  size 6011
checkpoint-400/adapter_config.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "auto_mapping": null,
3
- "base_model_name_or_path": "alpindale/CodeLlama-34B-hf",
4
  "bias": "none",
5
  "fan_in_fan_out": false,
6
  "inference_mode": true,
@@ -14,13 +14,13 @@
14
  "r": 64,
15
  "revision": null,
16
  "target_modules": [
17
- "v_proj",
18
- "up_proj",
19
- "gate_proj",
20
- "down_proj",
21
  "q_proj",
 
22
  "o_proj",
23
- "k_proj"
 
 
24
  ],
25
  "task_type": "CAUSAL_LM"
26
  }
 
1
  {
2
  "auto_mapping": null,
3
+ "base_model_name_or_path": "codellama/CodeLlama-34b-Python-hf",
4
  "bias": "none",
5
  "fan_in_fan_out": false,
6
  "inference_mode": true,
 
14
  "r": 64,
15
  "revision": null,
16
  "target_modules": [
17
+ "k_proj",
 
 
 
18
  "q_proj",
19
+ "v_proj",
20
  "o_proj",
21
+ "gate_proj",
22
+ "up_proj",
23
+ "down_proj"
24
  ],
25
  "task_type": "CAUSAL_LM"
26
  }
checkpoint-400/adapter_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:20f63e1f9f5ce72e3d0fcb7fa87c92aeb6019137e90d5971496e900c70d14626
3
  size 871609293
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1af5d01bf9badfccc5c82770b56c6a6cdd6418705b7ac5979ca2a69aa9395cf4
3
  size 871609293
checkpoint-400/adapter_model/adapter_model/README.md ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ ---
4
+ ## Training procedure
5
+
6
+
7
+ The following `bitsandbytes` quantization config was used during training:
8
+ - load_in_8bit: False
9
+ - load_in_4bit: True
10
+ - llm_int8_threshold: 6.0
11
+ - llm_int8_skip_modules: None
12
+ - llm_int8_enable_fp32_cpu_offload: False
13
+ - llm_int8_has_fp16_weight: False
14
+ - bnb_4bit_quant_type: nf4
15
+ - bnb_4bit_use_double_quant: True
16
+ - bnb_4bit_compute_dtype: bfloat16
17
+ ### Framework versions
18
+
19
+
20
+ - PEFT 0.4.0
checkpoint-400/adapter_model/adapter_model/adapter_config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_mapping": null,
3
+ "base_model_name_or_path": "codellama/CodeLlama-34b-Python-hf",
4
+ "bias": "none",
5
+ "fan_in_fan_out": false,
6
+ "inference_mode": true,
7
+ "init_lora_weights": true,
8
+ "layers_pattern": null,
9
+ "layers_to_transform": null,
10
+ "lora_alpha": 16.0,
11
+ "lora_dropout": 0.1,
12
+ "modules_to_save": null,
13
+ "peft_type": "LORA",
14
+ "r": 64,
15
+ "revision": null,
16
+ "target_modules": [
17
+ "k_proj",
18
+ "q_proj",
19
+ "v_proj",
20
+ "o_proj",
21
+ "gate_proj",
22
+ "up_proj",
23
+ "down_proj"
24
+ ],
25
+ "task_type": "CAUSAL_LM"
26
+ }
checkpoint-400/adapter_model/adapter_model/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1af5d01bf9badfccc5c82770b56c6a6cdd6418705b7ac5979ca2a69aa9395cf4
3
+ size 871609293
checkpoint-400/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5bbe1bdd2b11078dc20e8ba8d86297f3269b045d20db70804401d8c7b3fe590c
3
  size 3485881117
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e97c9e244ad4da2d730ae74830aef4c3a6ad5097bc9fd5e28b2ad10a070f522d
3
  size 3485881117
checkpoint-400/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cd8504965e12c1c177b0358cf8e356b8368468b64276e35c622c130f00b781b6
3
  size 14511
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6ce8ae09e310b824c5926b786c493b2261b25e11f1a648afad871272fb89a96f
3
  size 14511
checkpoint-400/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:13276f15dd2b6acc19b970176aa2db4ac9b58241843e72c89b50e3094e903b19
3
  size 627
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ef1ca3e6fc07b43239ed034e2d8e5ae6ded24ae869473b3f8f48afde040dedc
3
  size 627
checkpoint-400/tokenizer_config.json CHANGED
@@ -23,7 +23,6 @@
23
  "pad_token": null,
24
  "padding_side": "right",
25
  "sp_model_kwargs": {},
26
- "spaces_between_special_tokens": false,
27
  "tokenizer_class": "LlamaTokenizer",
28
  "unk_token": {
29
  "__type": "AddedToken",
@@ -32,6 +31,5 @@
32
  "normalized": true,
33
  "rstrip": false,
34
  "single_word": false
35
- },
36
- "use_default_system_prompt": true
37
  }
 
23
  "pad_token": null,
24
  "padding_side": "right",
25
  "sp_model_kwargs": {},
 
26
  "tokenizer_class": "LlamaTokenizer",
27
  "unk_token": {
28
  "__type": "AddedToken",
 
31
  "normalized": true,
32
  "rstrip": false,
33
  "single_word": false
34
+ }
 
35
  }
checkpoint-400/trainer_state.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
- "best_metric": 0.3298754394054413,
3
- "best_model_checkpoint": "./output_v2/34bCodellama_CodeLlama-34B-hf_unnatural-instructions_standardized/checkpoint-400",
4
  "epoch": 0.024444274692536856,
5
  "global_step": 400,
6
  "is_hyper_param_search": false,
@@ -9,2550 +9,2490 @@
9
  "log_history": [
10
  {
11
  "epoch": 0.0,
12
- "learning_rate": 0.0002,
13
- "loss": 1.9735,
14
  "step": 1
15
  },
16
  {
17
  "epoch": 0.0,
18
- "learning_rate": 0.0002,
19
- "loss": 2.7155,
20
  "step": 2
21
  },
22
  {
23
  "epoch": 0.0,
24
- "learning_rate": 0.0002,
25
- "loss": 3.1137,
26
  "step": 3
27
  },
28
  {
29
  "epoch": 0.0,
30
- "learning_rate": 0.0002,
31
- "loss": 1.6054,
32
  "step": 4
33
  },
34
  {
35
  "epoch": 0.0,
36
- "learning_rate": 0.0002,
37
- "loss": 1.0381,
38
  "step": 5
39
  },
40
  {
41
  "epoch": 0.0,
42
- "learning_rate": 0.0002,
43
- "loss": 0.9959,
44
  "step": 6
45
  },
46
  {
47
  "epoch": 0.0,
48
- "learning_rate": 0.0002,
49
- "loss": 0.7395,
50
  "step": 7
51
  },
52
  {
53
  "epoch": 0.0,
54
- "learning_rate": 0.0002,
55
- "loss": 0.3255,
56
  "step": 8
57
  },
58
  {
59
  "epoch": 0.0,
60
- "learning_rate": 0.0002,
61
- "loss": 0.8252,
62
  "step": 9
63
  },
64
  {
65
  "epoch": 0.0,
66
- "learning_rate": 0.0002,
67
- "loss": 0.5362,
68
  "step": 10
69
  },
70
  {
71
  "epoch": 0.0,
72
- "learning_rate": 0.0002,
73
- "loss": 1.404,
74
  "step": 11
75
  },
76
  {
77
  "epoch": 0.0,
78
- "learning_rate": 0.0002,
79
- "loss": 0.6234,
80
  "step": 12
81
  },
82
  {
83
  "epoch": 0.0,
84
- "learning_rate": 0.0002,
85
- "loss": 1.0263,
86
  "step": 13
87
  },
88
  {
89
  "epoch": 0.0,
90
- "learning_rate": 0.0002,
91
- "loss": 0.2622,
92
  "step": 14
93
  },
94
  {
95
  "epoch": 0.0,
96
- "learning_rate": 0.0002,
97
- "loss": 0.2692,
98
  "step": 15
99
  },
100
  {
101
  "epoch": 0.0,
102
- "learning_rate": 0.0002,
103
- "loss": 0.2624,
104
  "step": 16
105
  },
106
  {
107
  "epoch": 0.0,
108
- "learning_rate": 0.0002,
109
- "loss": 0.4385,
110
  "step": 17
111
  },
112
  {
113
  "epoch": 0.0,
114
- "learning_rate": 0.0002,
115
- "loss": 0.3265,
116
  "step": 18
117
  },
118
  {
119
  "epoch": 0.0,
120
- "learning_rate": 0.0002,
121
- "loss": 0.2191,
122
  "step": 19
123
  },
124
  {
125
  "epoch": 0.0,
126
- "learning_rate": 0.0002,
127
- "loss": 1.0049,
128
  "step": 20
129
  },
130
  {
131
  "epoch": 0.0,
132
- "learning_rate": 0.0002,
133
- "loss": 0.6586,
134
  "step": 21
135
  },
136
  {
137
  "epoch": 0.0,
138
- "learning_rate": 0.0002,
139
- "loss": 0.3471,
140
  "step": 22
141
  },
142
  {
143
  "epoch": 0.0,
144
- "learning_rate": 0.0002,
145
- "loss": 0.7134,
146
  "step": 23
147
  },
148
  {
149
  "epoch": 0.0,
150
- "learning_rate": 0.0002,
151
- "loss": 1.01,
152
  "step": 24
153
  },
154
  {
155
  "epoch": 0.0,
156
- "learning_rate": 0.0002,
157
- "loss": 0.2802,
158
  "step": 25
159
  },
160
  {
161
  "epoch": 0.0,
162
- "learning_rate": 0.0002,
163
- "loss": 0.4205,
164
  "step": 26
165
  },
166
  {
167
  "epoch": 0.0,
168
- "learning_rate": 0.0002,
169
- "loss": 0.7682,
170
  "step": 27
171
  },
172
  {
173
  "epoch": 0.0,
174
- "learning_rate": 0.0002,
175
- "loss": 0.2002,
176
  "step": 28
177
  },
178
  {
179
  "epoch": 0.0,
180
- "learning_rate": 0.0002,
181
- "loss": 0.2132,
182
  "step": 29
183
  },
184
  {
185
  "epoch": 0.0,
186
- "learning_rate": 0.0002,
187
- "loss": 1.0622,
188
  "step": 30
189
  },
190
  {
191
  "epoch": 0.0,
192
- "learning_rate": 0.0002,
193
- "loss": 0.212,
194
  "step": 31
195
  },
196
  {
197
  "epoch": 0.0,
198
- "learning_rate": 0.0002,
199
- "loss": 0.3738,
200
  "step": 32
201
  },
202
  {
203
  "epoch": 0.0,
204
- "learning_rate": 0.0002,
205
- "loss": 0.3594,
206
  "step": 33
207
  },
208
  {
209
  "epoch": 0.0,
210
- "learning_rate": 0.0002,
211
- "loss": 0.8766,
212
  "step": 34
213
  },
214
  {
215
  "epoch": 0.0,
216
- "learning_rate": 0.0002,
217
- "loss": 0.3108,
218
  "step": 35
219
  },
220
  {
221
  "epoch": 0.0,
222
- "learning_rate": 0.0002,
223
- "loss": 0.2127,
224
  "step": 36
225
  },
226
  {
227
  "epoch": 0.0,
228
- "learning_rate": 0.0002,
229
- "loss": 0.5968,
230
  "step": 37
231
  },
232
  {
233
  "epoch": 0.0,
234
- "learning_rate": 0.0002,
235
- "loss": 0.8806,
236
  "step": 38
237
  },
238
  {
239
  "epoch": 0.0,
240
- "learning_rate": 0.0002,
241
- "loss": 0.0633,
242
  "step": 39
243
  },
244
  {
245
  "epoch": 0.0,
246
- "learning_rate": 0.0002,
247
- "loss": 0.5851,
248
  "step": 40
249
  },
250
  {
251
  "epoch": 0.0,
252
- "learning_rate": 0.0002,
253
- "loss": 0.2376,
254
  "step": 41
255
  },
256
  {
257
  "epoch": 0.0,
258
- "learning_rate": 0.0002,
259
- "loss": 0.2293,
260
  "step": 42
261
  },
262
  {
263
  "epoch": 0.0,
264
- "learning_rate": 0.0002,
265
- "loss": 0.428,
266
  "step": 43
267
  },
268
  {
269
  "epoch": 0.0,
270
- "learning_rate": 0.0002,
271
- "loss": 0.131,
272
  "step": 44
273
  },
274
  {
275
  "epoch": 0.0,
276
- "learning_rate": 0.0002,
277
- "loss": 0.3724,
278
  "step": 45
279
  },
280
  {
281
  "epoch": 0.0,
282
- "learning_rate": 0.0002,
283
- "loss": 0.5031,
284
  "step": 46
285
  },
286
  {
287
  "epoch": 0.0,
288
- "learning_rate": 0.0002,
289
- "loss": 0.4934,
290
  "step": 47
291
  },
292
  {
293
  "epoch": 0.0,
294
- "learning_rate": 0.0002,
295
- "loss": 0.8127,
296
  "step": 48
297
  },
298
  {
299
  "epoch": 0.0,
300
- "learning_rate": 0.0002,
301
- "loss": 0.4573,
302
  "step": 49
303
  },
304
  {
305
  "epoch": 0.0,
306
- "learning_rate": 0.0002,
307
- "loss": 0.5568,
308
  "step": 50
309
  },
310
  {
311
  "epoch": 0.0,
312
- "learning_rate": 0.0002,
313
- "loss": 0.5411,
314
  "step": 51
315
  },
316
  {
317
  "epoch": 0.0,
318
- "learning_rate": 0.0002,
319
- "loss": 0.4448,
320
  "step": 52
321
  },
322
  {
323
  "epoch": 0.0,
324
- "learning_rate": 0.0002,
325
- "loss": 0.3774,
326
  "step": 53
327
  },
328
  {
329
  "epoch": 0.0,
330
- "learning_rate": 0.0002,
331
- "loss": 0.1825,
332
  "step": 54
333
  },
334
  {
335
  "epoch": 0.0,
336
- "learning_rate": 0.0002,
337
- "loss": 0.2356,
338
  "step": 55
339
  },
340
  {
341
  "epoch": 0.0,
342
- "learning_rate": 0.0002,
343
- "loss": 0.0236,
344
  "step": 56
345
  },
346
  {
347
  "epoch": 0.0,
348
- "learning_rate": 0.0002,
349
- "loss": 0.4344,
350
  "step": 57
351
  },
352
  {
353
  "epoch": 0.0,
354
- "learning_rate": 0.0002,
355
- "loss": 0.4589,
356
  "step": 58
357
  },
358
  {
359
  "epoch": 0.0,
360
- "learning_rate": 0.0002,
361
- "loss": 0.3766,
362
  "step": 59
363
  },
364
  {
365
  "epoch": 0.0,
366
- "learning_rate": 0.0002,
367
- "loss": 0.6034,
368
  "step": 60
369
  },
370
  {
371
  "epoch": 0.0,
372
- "learning_rate": 0.0002,
373
- "loss": 0.7632,
374
  "step": 61
375
  },
376
  {
377
  "epoch": 0.0,
378
- "learning_rate": 0.0002,
379
- "loss": 0.0612,
380
  "step": 62
381
  },
382
  {
383
  "epoch": 0.0,
384
- "learning_rate": 0.0002,
385
- "loss": 0.6783,
386
  "step": 63
387
  },
388
  {
389
  "epoch": 0.0,
390
- "learning_rate": 0.0002,
391
- "loss": 0.2845,
392
  "step": 64
393
  },
394
  {
395
  "epoch": 0.0,
396
- "learning_rate": 0.0002,
397
- "loss": 0.395,
398
  "step": 65
399
  },
400
  {
401
  "epoch": 0.0,
402
- "learning_rate": 0.0002,
403
- "loss": 0.8106,
404
  "step": 66
405
  },
406
  {
407
  "epoch": 0.0,
408
- "learning_rate": 0.0002,
409
- "loss": 0.1468,
410
  "step": 67
411
  },
412
  {
413
  "epoch": 0.0,
414
- "learning_rate": 0.0002,
415
- "loss": 0.0537,
416
  "step": 68
417
  },
418
  {
419
  "epoch": 0.0,
420
- "learning_rate": 0.0002,
421
- "loss": 0.4816,
422
  "step": 69
423
  },
424
  {
425
  "epoch": 0.0,
426
- "learning_rate": 0.0002,
427
- "loss": 0.6052,
428
  "step": 70
429
  },
430
  {
431
  "epoch": 0.0,
432
- "learning_rate": 0.0002,
433
- "loss": 0.2805,
434
  "step": 71
435
  },
436
  {
437
  "epoch": 0.0,
438
- "learning_rate": 0.0002,
439
- "loss": 0.8279,
440
  "step": 72
441
  },
442
  {
443
  "epoch": 0.0,
444
- "learning_rate": 0.0002,
445
- "loss": 0.6954,
446
  "step": 73
447
  },
448
  {
449
  "epoch": 0.0,
450
- "learning_rate": 0.0002,
451
- "loss": 0.0635,
452
  "step": 74
453
  },
454
  {
455
  "epoch": 0.0,
456
- "learning_rate": 0.0002,
457
- "loss": 0.2866,
458
  "step": 75
459
  },
460
  {
461
  "epoch": 0.0,
462
- "learning_rate": 0.0002,
463
- "loss": 0.9656,
464
  "step": 76
465
  },
466
  {
467
  "epoch": 0.0,
468
- "learning_rate": 0.0002,
469
- "loss": 0.1113,
470
  "step": 77
471
  },
472
  {
473
  "epoch": 0.0,
474
- "learning_rate": 0.0002,
475
- "loss": 0.4063,
476
  "step": 78
477
  },
478
  {
479
  "epoch": 0.0,
480
- "learning_rate": 0.0002,
481
- "loss": 0.3245,
482
  "step": 79
483
  },
484
  {
485
  "epoch": 0.0,
486
- "learning_rate": 0.0002,
487
- "loss": 0.3966,
488
  "step": 80
489
  },
490
  {
491
  "epoch": 0.0,
492
- "learning_rate": 0.0002,
493
- "loss": 0.4809,
494
  "step": 81
495
  },
496
  {
497
  "epoch": 0.01,
498
- "learning_rate": 0.0002,
499
- "loss": 0.3844,
500
  "step": 82
501
  },
502
  {
503
  "epoch": 0.01,
504
- "learning_rate": 0.0002,
505
- "loss": 0.1501,
506
  "step": 83
507
  },
508
  {
509
  "epoch": 0.01,
510
- "learning_rate": 0.0002,
511
- "loss": 0.5504,
512
  "step": 84
513
  },
514
  {
515
  "epoch": 0.01,
516
- "learning_rate": 0.0002,
517
- "loss": 0.2332,
518
  "step": 85
519
  },
520
  {
521
  "epoch": 0.01,
522
- "learning_rate": 0.0002,
523
- "loss": 0.0049,
524
  "step": 86
525
  },
526
  {
527
  "epoch": 0.01,
528
- "learning_rate": 0.0002,
529
- "loss": 0.2585,
530
  "step": 87
531
  },
532
  {
533
  "epoch": 0.01,
534
- "learning_rate": 0.0002,
535
- "loss": 0.2012,
536
  "step": 88
537
  },
538
  {
539
  "epoch": 0.01,
540
- "learning_rate": 0.0002,
541
- "loss": 0.0386,
542
  "step": 89
543
  },
544
  {
545
  "epoch": 0.01,
546
- "learning_rate": 0.0002,
547
- "loss": 0.5818,
548
  "step": 90
549
  },
550
  {
551
  "epoch": 0.01,
552
- "learning_rate": 0.0002,
553
- "loss": 0.2827,
554
  "step": 91
555
  },
556
  {
557
  "epoch": 0.01,
558
- "learning_rate": 0.0002,
559
- "loss": 0.3877,
560
  "step": 92
561
  },
562
  {
563
  "epoch": 0.01,
564
- "learning_rate": 0.0002,
565
- "loss": 0.3117,
566
  "step": 93
567
  },
568
  {
569
  "epoch": 0.01,
570
- "learning_rate": 0.0002,
571
- "loss": 0.9205,
572
  "step": 94
573
  },
574
  {
575
  "epoch": 0.01,
576
- "learning_rate": 0.0002,
577
- "loss": 0.4653,
578
  "step": 95
579
  },
580
  {
581
  "epoch": 0.01,
582
- "learning_rate": 0.0002,
583
- "loss": 0.3412,
584
  "step": 96
585
  },
586
  {
587
  "epoch": 0.01,
588
- "learning_rate": 0.0002,
589
- "loss": 0.3367,
590
  "step": 97
591
  },
592
  {
593
  "epoch": 0.01,
594
- "learning_rate": 0.0002,
595
- "loss": 0.1755,
596
  "step": 98
597
  },
598
  {
599
  "epoch": 0.01,
600
- "learning_rate": 0.0002,
601
- "loss": 0.3586,
602
  "step": 99
603
  },
604
  {
605
  "epoch": 0.01,
606
- "learning_rate": 0.0002,
607
- "loss": 0.5682,
608
  "step": 100
609
  },
610
  {
611
  "epoch": 0.01,
612
- "learning_rate": 0.0002,
613
- "loss": 0.4869,
614
  "step": 101
615
  },
616
  {
617
  "epoch": 0.01,
618
- "learning_rate": 0.0002,
619
- "loss": 0.7614,
620
  "step": 102
621
  },
622
  {
623
  "epoch": 0.01,
624
- "learning_rate": 0.0002,
625
- "loss": 0.4366,
626
  "step": 103
627
  },
628
  {
629
  "epoch": 0.01,
630
- "learning_rate": 0.0002,
631
- "loss": 0.5307,
632
  "step": 104
633
  },
634
  {
635
  "epoch": 0.01,
636
- "learning_rate": 0.0002,
637
- "loss": 0.3974,
638
  "step": 105
639
  },
640
  {
641
  "epoch": 0.01,
642
- "learning_rate": 0.0002,
643
- "loss": 0.5255,
644
  "step": 106
645
  },
646
  {
647
  "epoch": 0.01,
648
- "learning_rate": 0.0002,
649
- "loss": 0.2682,
650
  "step": 107
651
  },
652
  {
653
  "epoch": 0.01,
654
- "learning_rate": 0.0002,
655
- "loss": 0.4884,
656
  "step": 108
657
  },
658
  {
659
  "epoch": 0.01,
660
- "learning_rate": 0.0002,
661
- "loss": 0.0414,
662
  "step": 109
663
  },
664
  {
665
  "epoch": 0.01,
666
- "learning_rate": 0.0002,
667
- "loss": 0.5094,
668
  "step": 110
669
  },
670
  {
671
  "epoch": 0.01,
672
- "learning_rate": 0.0002,
673
- "loss": 0.298,
674
  "step": 111
675
  },
676
  {
677
  "epoch": 0.01,
678
- "learning_rate": 0.0002,
679
- "loss": 0.2147,
680
  "step": 112
681
  },
682
  {
683
  "epoch": 0.01,
684
- "learning_rate": 0.0002,
685
- "loss": 0.2712,
686
  "step": 113
687
  },
688
  {
689
  "epoch": 0.01,
690
- "learning_rate": 0.0002,
691
- "loss": 0.5713,
692
  "step": 114
693
  },
694
  {
695
  "epoch": 0.01,
696
- "learning_rate": 0.0002,
697
- "loss": 0.2979,
698
  "step": 115
699
  },
700
  {
701
  "epoch": 0.01,
702
- "learning_rate": 0.0002,
703
- "loss": 0.2424,
704
  "step": 116
705
  },
706
  {
707
  "epoch": 0.01,
708
- "learning_rate": 0.0002,
709
- "loss": 0.1412,
710
  "step": 117
711
  },
712
  {
713
  "epoch": 0.01,
714
- "learning_rate": 0.0002,
715
- "loss": 0.3252,
716
  "step": 118
717
  },
718
  {
719
  "epoch": 0.01,
720
- "learning_rate": 0.0002,
721
- "loss": 0.4267,
722
  "step": 119
723
  },
724
  {
725
  "epoch": 0.01,
726
- "learning_rate": 0.0002,
727
- "loss": 0.2139,
728
  "step": 120
729
  },
730
  {
731
  "epoch": 0.01,
732
- "learning_rate": 0.0002,
733
- "loss": 0.4214,
734
  "step": 121
735
  },
736
  {
737
  "epoch": 0.01,
738
- "learning_rate": 0.0002,
739
- "loss": 0.2338,
740
  "step": 122
741
  },
742
  {
743
  "epoch": 0.01,
744
- "learning_rate": 0.0002,
745
- "loss": 0.5877,
746
  "step": 123
747
  },
748
  {
749
  "epoch": 0.01,
750
- "learning_rate": 0.0002,
751
- "loss": 0.2574,
752
  "step": 124
753
  },
754
  {
755
  "epoch": 0.01,
756
- "learning_rate": 0.0002,
757
- "loss": 0.0011,
758
  "step": 125
759
  },
760
  {
761
  "epoch": 0.01,
762
- "learning_rate": 0.0002,
763
- "loss": 0.6156,
764
  "step": 126
765
  },
766
  {
767
  "epoch": 0.01,
768
- "learning_rate": 0.0002,
769
- "loss": 0.0888,
770
  "step": 127
771
  },
772
  {
773
  "epoch": 0.01,
774
- "learning_rate": 0.0002,
775
- "loss": 0.3159,
776
  "step": 128
777
  },
778
  {
779
  "epoch": 0.01,
780
- "learning_rate": 0.0002,
781
- "loss": 0.2122,
782
  "step": 129
783
  },
784
  {
785
  "epoch": 0.01,
786
- "learning_rate": 0.0002,
787
- "loss": 0.1131,
788
  "step": 130
789
  },
790
  {
791
  "epoch": 0.01,
792
- "learning_rate": 0.0002,
793
- "loss": 0.1634,
794
  "step": 131
795
  },
796
  {
797
  "epoch": 0.01,
798
- "learning_rate": 0.0002,
799
- "loss": 0.3788,
800
  "step": 132
801
  },
802
  {
803
  "epoch": 0.01,
804
- "learning_rate": 0.0002,
805
- "loss": 0.3187,
806
  "step": 133
807
  },
808
  {
809
  "epoch": 0.01,
810
- "learning_rate": 0.0002,
811
- "loss": 0.1685,
812
  "step": 134
813
  },
814
  {
815
  "epoch": 0.01,
816
- "learning_rate": 0.0002,
817
- "loss": 1.5573,
818
  "step": 135
819
  },
820
  {
821
  "epoch": 0.01,
822
- "learning_rate": 0.0002,
823
- "loss": 0.3409,
824
  "step": 136
825
  },
826
  {
827
  "epoch": 0.01,
828
- "learning_rate": 0.0002,
829
- "loss": 1.1279,
830
  "step": 137
831
  },
832
  {
833
  "epoch": 0.01,
834
- "learning_rate": 0.0002,
835
- "loss": 0.1385,
836
  "step": 138
837
  },
838
  {
839
  "epoch": 0.01,
840
- "learning_rate": 0.0002,
841
- "loss": 0.5391,
842
  "step": 139
843
  },
844
  {
845
  "epoch": 0.01,
846
- "learning_rate": 0.0002,
847
- "loss": 0.9212,
848
  "step": 140
849
  },
850
  {
851
  "epoch": 0.01,
852
- "learning_rate": 0.0002,
853
- "loss": 0.3178,
854
  "step": 141
855
  },
856
  {
857
  "epoch": 0.01,
858
- "learning_rate": 0.0002,
859
- "loss": 0.1896,
860
  "step": 142
861
  },
862
  {
863
  "epoch": 0.01,
864
- "learning_rate": 0.0002,
865
- "loss": 0.2479,
866
  "step": 143
867
  },
868
  {
869
  "epoch": 0.01,
870
- "learning_rate": 0.0002,
871
- "loss": 0.0806,
872
  "step": 144
873
  },
874
  {
875
  "epoch": 0.01,
876
- "learning_rate": 0.0002,
877
- "loss": 0.4446,
878
  "step": 145
879
  },
880
  {
881
  "epoch": 0.01,
882
- "learning_rate": 0.0002,
883
- "loss": 0.1199,
884
  "step": 146
885
  },
886
  {
887
  "epoch": 0.01,
888
- "learning_rate": 0.0002,
889
- "loss": 0.0728,
890
  "step": 147
891
  },
892
  {
893
  "epoch": 0.01,
894
- "learning_rate": 0.0002,
895
- "loss": 0.2178,
896
  "step": 148
897
  },
898
  {
899
  "epoch": 0.01,
900
- "learning_rate": 0.0002,
901
- "loss": 0.6712,
902
  "step": 149
903
  },
904
  {
905
  "epoch": 0.01,
906
- "learning_rate": 0.0002,
907
- "loss": 0.0917,
908
  "step": 150
909
  },
910
  {
911
  "epoch": 0.01,
912
- "learning_rate": 0.0002,
913
- "loss": 0.0679,
914
  "step": 151
915
  },
916
  {
917
  "epoch": 0.01,
918
- "learning_rate": 0.0002,
919
- "loss": 0.2296,
920
  "step": 152
921
  },
922
  {
923
  "epoch": 0.01,
924
- "learning_rate": 0.0002,
925
- "loss": 0.4093,
926
  "step": 153
927
  },
928
  {
929
  "epoch": 0.01,
930
- "learning_rate": 0.0002,
931
- "loss": 0.7889,
932
  "step": 154
933
  },
934
  {
935
  "epoch": 0.01,
936
- "learning_rate": 0.0002,
937
- "loss": 0.142,
938
  "step": 155
939
  },
940
  {
941
  "epoch": 0.01,
942
- "learning_rate": 0.0002,
943
- "loss": 0.16,
944
  "step": 156
945
  },
946
  {
947
  "epoch": 0.01,
948
- "learning_rate": 0.0002,
949
- "loss": 0.2812,
950
  "step": 157
951
  },
952
  {
953
  "epoch": 0.01,
954
- "learning_rate": 0.0002,
955
- "loss": 0.3536,
956
  "step": 158
957
  },
958
  {
959
  "epoch": 0.01,
960
- "learning_rate": 0.0002,
961
- "loss": 0.2734,
962
  "step": 159
963
  },
964
  {
965
  "epoch": 0.01,
966
- "learning_rate": 0.0002,
967
- "loss": 1.0048,
968
  "step": 160
969
  },
970
  {
971
  "epoch": 0.01,
972
- "learning_rate": 0.0002,
973
- "loss": 0.2911,
974
  "step": 161
975
  },
976
  {
977
  "epoch": 0.01,
978
- "learning_rate": 0.0002,
979
- "loss": 0.2417,
980
  "step": 162
981
  },
982
  {
983
  "epoch": 0.01,
984
- "learning_rate": 0.0002,
985
- "loss": 0.8293,
986
  "step": 163
987
  },
988
  {
989
  "epoch": 0.01,
990
- "learning_rate": 0.0002,
991
- "loss": 0.4375,
992
  "step": 164
993
  },
994
  {
995
  "epoch": 0.01,
996
- "learning_rate": 0.0002,
997
- "loss": 0.7972,
998
  "step": 165
999
  },
1000
  {
1001
  "epoch": 0.01,
1002
- "learning_rate": 0.0002,
1003
- "loss": 0.1297,
1004
  "step": 166
1005
  },
1006
  {
1007
  "epoch": 0.01,
1008
- "learning_rate": 0.0002,
1009
- "loss": 0.5533,
1010
  "step": 167
1011
  },
1012
  {
1013
  "epoch": 0.01,
1014
- "learning_rate": 0.0002,
1015
- "loss": 0.8447,
1016
  "step": 168
1017
  },
1018
  {
1019
  "epoch": 0.01,
1020
- "learning_rate": 0.0002,
1021
- "loss": 0.0787,
1022
  "step": 169
1023
  },
1024
  {
1025
  "epoch": 0.01,
1026
- "learning_rate": 0.0002,
1027
- "loss": 0.2196,
1028
  "step": 170
1029
  },
1030
  {
1031
  "epoch": 0.01,
1032
- "learning_rate": 0.0002,
1033
- "loss": 0.1463,
1034
  "step": 171
1035
  },
1036
  {
1037
  "epoch": 0.01,
1038
- "learning_rate": 0.0002,
1039
- "loss": 0.2969,
1040
  "step": 172
1041
  },
1042
  {
1043
  "epoch": 0.01,
1044
- "learning_rate": 0.0002,
1045
- "loss": 0.204,
1046
  "step": 173
1047
  },
1048
  {
1049
  "epoch": 0.01,
1050
- "learning_rate": 0.0002,
1051
- "loss": 0.5595,
1052
  "step": 174
1053
  },
1054
  {
1055
  "epoch": 0.01,
1056
- "learning_rate": 0.0002,
1057
- "loss": 0.1947,
1058
  "step": 175
1059
  },
1060
  {
1061
  "epoch": 0.01,
1062
- "learning_rate": 0.0002,
1063
- "loss": 0.239,
1064
  "step": 176
1065
  },
1066
  {
1067
  "epoch": 0.01,
1068
- "learning_rate": 0.0002,
1069
- "loss": 0.0937,
1070
  "step": 177
1071
  },
1072
  {
1073
  "epoch": 0.01,
1074
- "learning_rate": 0.0002,
1075
- "loss": 0.0284,
1076
  "step": 178
1077
  },
1078
  {
1079
  "epoch": 0.01,
1080
- "learning_rate": 0.0002,
1081
- "loss": 0.4115,
1082
  "step": 179
1083
  },
1084
  {
1085
  "epoch": 0.01,
1086
- "learning_rate": 0.0002,
1087
- "loss": 0.1322,
1088
  "step": 180
1089
  },
1090
  {
1091
  "epoch": 0.01,
1092
- "learning_rate": 0.0002,
1093
- "loss": 0.3562,
1094
  "step": 181
1095
  },
1096
  {
1097
  "epoch": 0.01,
1098
- "learning_rate": 0.0002,
1099
- "loss": 0.5618,
1100
  "step": 182
1101
  },
1102
  {
1103
  "epoch": 0.01,
1104
- "learning_rate": 0.0002,
1105
- "loss": 0.5469,
1106
  "step": 183
1107
  },
1108
  {
1109
  "epoch": 0.01,
1110
- "learning_rate": 0.0002,
1111
- "loss": 0.2538,
1112
  "step": 184
1113
  },
1114
  {
1115
  "epoch": 0.01,
1116
- "learning_rate": 0.0002,
1117
- "loss": 0.3875,
1118
  "step": 185
1119
  },
1120
  {
1121
  "epoch": 0.01,
1122
- "learning_rate": 0.0002,
1123
- "loss": 0.1755,
1124
  "step": 186
1125
  },
1126
  {
1127
  "epoch": 0.01,
1128
- "learning_rate": 0.0002,
1129
- "loss": 0.5634,
1130
  "step": 187
1131
  },
1132
  {
1133
  "epoch": 0.01,
1134
- "learning_rate": 0.0002,
1135
- "loss": 0.5176,
1136
  "step": 188
1137
  },
1138
  {
1139
  "epoch": 0.01,
1140
- "learning_rate": 0.0002,
1141
- "loss": 0.3164,
1142
  "step": 189
1143
  },
1144
  {
1145
  "epoch": 0.01,
1146
- "learning_rate": 0.0002,
1147
- "loss": 0.1107,
1148
  "step": 190
1149
  },
1150
  {
1151
  "epoch": 0.01,
1152
- "learning_rate": 0.0002,
1153
- "loss": 0.7371,
1154
  "step": 191
1155
  },
1156
  {
1157
  "epoch": 0.01,
1158
- "learning_rate": 0.0002,
1159
- "loss": 0.3597,
1160
  "step": 192
1161
  },
1162
  {
1163
  "epoch": 0.01,
1164
- "learning_rate": 0.0002,
1165
- "loss": 0.6858,
1166
  "step": 193
1167
  },
1168
  {
1169
  "epoch": 0.01,
1170
- "learning_rate": 0.0002,
1171
- "loss": 0.2797,
1172
  "step": 194
1173
  },
1174
  {
1175
  "epoch": 0.01,
1176
- "learning_rate": 0.0002,
1177
- "loss": 0.5096,
1178
  "step": 195
1179
  },
1180
  {
1181
  "epoch": 0.01,
1182
- "learning_rate": 0.0002,
1183
- "loss": 0.4265,
1184
  "step": 196
1185
  },
1186
  {
1187
  "epoch": 0.01,
1188
- "learning_rate": 0.0002,
1189
- "loss": 0.4173,
1190
  "step": 197
1191
  },
1192
  {
1193
  "epoch": 0.01,
1194
- "learning_rate": 0.0002,
1195
- "loss": 0.1054,
1196
  "step": 198
1197
  },
1198
  {
1199
  "epoch": 0.01,
1200
- "learning_rate": 0.0002,
1201
- "loss": 0.112,
1202
  "step": 199
1203
  },
1204
  {
1205
  "epoch": 0.01,
1206
- "learning_rate": 0.0002,
1207
- "loss": 0.316,
1208
  "step": 200
1209
  },
1210
  {
1211
  "epoch": 0.01,
1212
- "eval_loss": 0.3453182876110077,
1213
- "eval_runtime": 435.8836,
1214
- "eval_samples_per_second": 2.294,
1215
- "eval_steps_per_second": 1.147,
1216
  "step": 200
1217
  },
1218
  {
1219
  "epoch": 0.01,
1220
- "mmlu_eval_accuracy": 0.4811559812252676,
1221
- "mmlu_eval_accuracy_abstract_algebra": 0.45454545454545453,
1222
- "mmlu_eval_accuracy_anatomy": 0.5,
1223
- "mmlu_eval_accuracy_astronomy": 0.25,
1224
- "mmlu_eval_accuracy_business_ethics": 0.6363636363636364,
1225
  "mmlu_eval_accuracy_clinical_knowledge": 0.41379310344827586,
1226
- "mmlu_eval_accuracy_college_biology": 0.5,
1227
- "mmlu_eval_accuracy_college_chemistry": 0.125,
1228
- "mmlu_eval_accuracy_college_computer_science": 0.45454545454545453,
1229
- "mmlu_eval_accuracy_college_mathematics": 0.45454545454545453,
1230
- "mmlu_eval_accuracy_college_medicine": 0.5454545454545454,
1231
- "mmlu_eval_accuracy_college_physics": 0.36363636363636365,
1232
- "mmlu_eval_accuracy_computer_security": 0.6363636363636364,
1233
- "mmlu_eval_accuracy_conceptual_physics": 0.3076923076923077,
1234
- "mmlu_eval_accuracy_econometrics": 0.3333333333333333,
1235
  "mmlu_eval_accuracy_electrical_engineering": 0.25,
1236
- "mmlu_eval_accuracy_elementary_mathematics": 0.2926829268292683,
1237
  "mmlu_eval_accuracy_formal_logic": 0.21428571428571427,
1238
  "mmlu_eval_accuracy_global_facts": 0.0,
1239
- "mmlu_eval_accuracy_high_school_biology": 0.375,
1240
- "mmlu_eval_accuracy_high_school_chemistry": 0.3181818181818182,
1241
  "mmlu_eval_accuracy_high_school_computer_science": 0.5555555555555556,
1242
- "mmlu_eval_accuracy_high_school_european_history": 0.7222222222222222,
1243
- "mmlu_eval_accuracy_high_school_geography": 0.8181818181818182,
1244
- "mmlu_eval_accuracy_high_school_government_and_politics": 0.6666666666666666,
1245
- "mmlu_eval_accuracy_high_school_macroeconomics": 0.4186046511627907,
1246
- "mmlu_eval_accuracy_high_school_mathematics": 0.27586206896551724,
1247
- "mmlu_eval_accuracy_high_school_microeconomics": 0.5769230769230769,
1248
- "mmlu_eval_accuracy_high_school_physics": 0.4117647058823529,
1249
- "mmlu_eval_accuracy_high_school_psychology": 0.7166666666666667,
1250
- "mmlu_eval_accuracy_high_school_statistics": 0.34782608695652173,
1251
- "mmlu_eval_accuracy_high_school_us_history": 0.6818181818181818,
1252
- "mmlu_eval_accuracy_high_school_world_history": 0.5384615384615384,
1253
- "mmlu_eval_accuracy_human_aging": 0.6086956521739131,
1254
- "mmlu_eval_accuracy_human_sexuality": 0.3333333333333333,
1255
- "mmlu_eval_accuracy_international_law": 0.7692307692307693,
1256
- "mmlu_eval_accuracy_jurisprudence": 0.45454545454545453,
1257
- "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778,
1258
- "mmlu_eval_accuracy_machine_learning": 0.45454545454545453,
1259
- "mmlu_eval_accuracy_management": 0.7272727272727273,
1260
- "mmlu_eval_accuracy_marketing": 0.88,
1261
- "mmlu_eval_accuracy_medical_genetics": 0.7272727272727273,
1262
- "mmlu_eval_accuracy_miscellaneous": 0.5813953488372093,
1263
- "mmlu_eval_accuracy_moral_disputes": 0.42105263157894735,
1264
- "mmlu_eval_accuracy_moral_scenarios": 0.35,
1265
- "mmlu_eval_accuracy_nutrition": 0.6363636363636364,
1266
- "mmlu_eval_accuracy_philosophy": 0.47058823529411764,
1267
- "mmlu_eval_accuracy_prehistory": 0.34285714285714286,
1268
- "mmlu_eval_accuracy_professional_accounting": 0.22580645161290322,
1269
- "mmlu_eval_accuracy_professional_law": 0.3176470588235294,
1270
- "mmlu_eval_accuracy_professional_medicine": 0.2903225806451613,
1271
- "mmlu_eval_accuracy_professional_psychology": 0.4492753623188406,
1272
- "mmlu_eval_accuracy_public_relations": 0.4166666666666667,
1273
- "mmlu_eval_accuracy_security_studies": 0.5185185185185185,
1274
- "mmlu_eval_accuracy_sociology": 0.5909090909090909,
1275
- "mmlu_eval_accuracy_us_foreign_policy": 0.6363636363636364,
1276
- "mmlu_eval_accuracy_virology": 0.5,
1277
- "mmlu_eval_accuracy_world_religions": 0.7894736842105263,
1278
- "mmlu_loss": 0.9429792200577477,
1279
  "step": 200
1280
  },
1281
  {
1282
  "epoch": 0.01,
1283
- "learning_rate": 0.0002,
1284
- "loss": 0.2838,
1285
  "step": 201
1286
  },
1287
  {
1288
  "epoch": 0.01,
1289
- "learning_rate": 0.0002,
1290
- "loss": 0.2909,
1291
  "step": 202
1292
  },
1293
  {
1294
  "epoch": 0.01,
1295
- "learning_rate": 0.0002,
1296
- "loss": 0.5662,
1297
  "step": 203
1298
  },
1299
  {
1300
  "epoch": 0.01,
1301
- "learning_rate": 0.0002,
1302
- "loss": 0.1471,
1303
  "step": 204
1304
  },
1305
  {
1306
  "epoch": 0.01,
1307
- "learning_rate": 0.0002,
1308
- "loss": 0.3506,
1309
  "step": 205
1310
  },
1311
  {
1312
  "epoch": 0.01,
1313
- "learning_rate": 0.0002,
1314
- "loss": 0.3255,
1315
  "step": 206
1316
  },
1317
  {
1318
  "epoch": 0.01,
1319
- "learning_rate": 0.0002,
1320
- "loss": 0.2363,
1321
  "step": 207
1322
  },
1323
  {
1324
  "epoch": 0.01,
1325
- "learning_rate": 0.0002,
1326
- "loss": 0.0581,
1327
  "step": 208
1328
  },
1329
  {
1330
  "epoch": 0.01,
1331
- "learning_rate": 0.0002,
1332
- "loss": 0.0182,
1333
  "step": 209
1334
  },
1335
  {
1336
  "epoch": 0.01,
1337
- "learning_rate": 0.0002,
1338
- "loss": 0.4469,
1339
  "step": 210
1340
  },
1341
  {
1342
  "epoch": 0.01,
1343
- "learning_rate": 0.0002,
1344
- "loss": 0.6449,
1345
  "step": 211
1346
  },
1347
  {
1348
  "epoch": 0.01,
1349
- "learning_rate": 0.0002,
1350
- "loss": 1.0061,
1351
  "step": 212
1352
  },
1353
  {
1354
  "epoch": 0.01,
1355
- "learning_rate": 0.0002,
1356
- "loss": 0.2537,
1357
  "step": 213
1358
  },
1359
  {
1360
  "epoch": 0.01,
1361
- "learning_rate": 0.0002,
1362
- "loss": 1.3736,
1363
  "step": 214
1364
  },
1365
  {
1366
  "epoch": 0.01,
1367
- "learning_rate": 0.0002,
1368
- "loss": 0.3474,
1369
  "step": 215
1370
  },
1371
  {
1372
  "epoch": 0.01,
1373
- "learning_rate": 0.0002,
1374
- "loss": 0.6253,
1375
  "step": 216
1376
  },
1377
  {
1378
  "epoch": 0.01,
1379
- "learning_rate": 0.0002,
1380
- "loss": 0.5241,
1381
  "step": 217
1382
  },
1383
  {
1384
  "epoch": 0.01,
1385
- "learning_rate": 0.0002,
1386
- "loss": 0.0377,
1387
  "step": 218
1388
  },
1389
  {
1390
  "epoch": 0.01,
1391
- "learning_rate": 0.0002,
1392
- "loss": 0.2494,
1393
  "step": 219
1394
  },
1395
  {
1396
  "epoch": 0.01,
1397
- "learning_rate": 0.0002,
1398
- "loss": 0.152,
1399
  "step": 220
1400
  },
1401
  {
1402
  "epoch": 0.01,
1403
- "learning_rate": 0.0002,
1404
- "loss": 0.253,
1405
  "step": 221
1406
  },
1407
  {
1408
  "epoch": 0.01,
1409
- "learning_rate": 0.0002,
1410
- "loss": 0.075,
1411
  "step": 222
1412
  },
1413
  {
1414
  "epoch": 0.01,
1415
- "learning_rate": 0.0002,
1416
- "loss": 0.1513,
1417
  "step": 223
1418
  },
1419
  {
1420
  "epoch": 0.01,
1421
- "learning_rate": 0.0002,
1422
- "loss": 0.5925,
1423
  "step": 224
1424
  },
1425
  {
1426
  "epoch": 0.01,
1427
- "learning_rate": 0.0002,
1428
- "loss": 0.2344,
1429
  "step": 225
1430
  },
1431
  {
1432
  "epoch": 0.01,
1433
- "learning_rate": 0.0002,
1434
- "loss": 0.3119,
1435
  "step": 226
1436
  },
1437
  {
1438
  "epoch": 0.01,
1439
- "learning_rate": 0.0002,
1440
- "loss": 0.4142,
1441
  "step": 227
1442
  },
1443
  {
1444
  "epoch": 0.01,
1445
- "learning_rate": 0.0002,
1446
- "loss": 0.5266,
1447
  "step": 228
1448
  },
1449
  {
1450
  "epoch": 0.01,
1451
- "learning_rate": 0.0002,
1452
- "loss": 0.3029,
1453
  "step": 229
1454
  },
1455
  {
1456
  "epoch": 0.01,
1457
- "learning_rate": 0.0002,
1458
- "loss": 0.5502,
1459
  "step": 230
1460
  },
1461
  {
1462
  "epoch": 0.01,
1463
- "learning_rate": 0.0002,
1464
- "loss": 0.1675,
1465
  "step": 231
1466
  },
1467
  {
1468
  "epoch": 0.01,
1469
- "learning_rate": 0.0002,
1470
- "loss": 0.4987,
1471
  "step": 232
1472
  },
1473
  {
1474
  "epoch": 0.01,
1475
- "learning_rate": 0.0002,
1476
- "loss": 0.8938,
1477
  "step": 233
1478
  },
1479
  {
1480
  "epoch": 0.01,
1481
- "learning_rate": 0.0002,
1482
- "loss": 0.2379,
1483
  "step": 234
1484
  },
1485
  {
1486
  "epoch": 0.01,
1487
- "learning_rate": 0.0002,
1488
- "loss": 0.0423,
1489
  "step": 235
1490
  },
1491
  {
1492
  "epoch": 0.01,
1493
- "learning_rate": 0.0002,
1494
- "loss": 0.1419,
1495
  "step": 236
1496
  },
1497
  {
1498
  "epoch": 0.01,
1499
- "learning_rate": 0.0002,
1500
- "loss": 0.2125,
1501
  "step": 237
1502
  },
1503
  {
1504
  "epoch": 0.01,
1505
- "learning_rate": 0.0002,
1506
- "loss": 0.3397,
1507
  "step": 238
1508
  },
1509
  {
1510
  "epoch": 0.01,
1511
- "learning_rate": 0.0002,
1512
- "loss": 0.232,
1513
  "step": 239
1514
  },
1515
  {
1516
  "epoch": 0.01,
1517
- "learning_rate": 0.0002,
1518
- "loss": 0.7102,
1519
  "step": 240
1520
  },
1521
  {
1522
  "epoch": 0.01,
1523
- "learning_rate": 0.0002,
1524
- "loss": 0.3081,
1525
  "step": 241
1526
  },
1527
  {
1528
  "epoch": 0.01,
1529
- "learning_rate": 0.0002,
1530
- "loss": 0.2406,
1531
  "step": 242
1532
  },
1533
  {
1534
  "epoch": 0.01,
1535
- "learning_rate": 0.0002,
1536
- "loss": 0.1953,
1537
  "step": 243
1538
  },
1539
  {
1540
  "epoch": 0.01,
1541
- "learning_rate": 0.0002,
1542
- "loss": 0.197,
1543
  "step": 244
1544
  },
1545
  {
1546
  "epoch": 0.01,
1547
- "learning_rate": 0.0002,
1548
- "loss": 0.5018,
1549
  "step": 245
1550
  },
1551
  {
1552
  "epoch": 0.02,
1553
- "learning_rate": 0.0002,
1554
- "loss": 0.0011,
1555
  "step": 246
1556
  },
1557
  {
1558
  "epoch": 0.02,
1559
- "learning_rate": 0.0002,
1560
- "loss": 0.1112,
1561
  "step": 247
1562
  },
1563
  {
1564
  "epoch": 0.02,
1565
- "learning_rate": 0.0002,
1566
- "loss": 0.2433,
1567
  "step": 248
1568
  },
1569
  {
1570
  "epoch": 0.02,
1571
- "learning_rate": 0.0002,
1572
- "loss": 0.2467,
1573
  "step": 249
1574
  },
1575
  {
1576
  "epoch": 0.02,
1577
- "learning_rate": 0.0002,
1578
- "loss": 0.2748,
1579
  "step": 250
1580
  },
1581
  {
1582
  "epoch": 0.02,
1583
- "learning_rate": 0.0002,
1584
- "loss": 0.427,
1585
  "step": 251
1586
  },
1587
  {
1588
  "epoch": 0.02,
1589
- "learning_rate": 0.0002,
1590
- "loss": 0.5246,
1591
  "step": 252
1592
  },
1593
  {
1594
  "epoch": 0.02,
1595
- "learning_rate": 0.0002,
1596
- "loss": 0.8089,
1597
  "step": 253
1598
  },
1599
  {
1600
  "epoch": 0.02,
1601
- "learning_rate": 0.0002,
1602
- "loss": 0.5974,
1603
  "step": 254
1604
  },
1605
  {
1606
  "epoch": 0.02,
1607
- "learning_rate": 0.0002,
1608
- "loss": 0.4483,
1609
  "step": 255
1610
  },
1611
  {
1612
  "epoch": 0.02,
1613
- "learning_rate": 0.0002,
1614
- "loss": 0.5411,
1615
  "step": 256
1616
  },
1617
  {
1618
  "epoch": 0.02,
1619
- "learning_rate": 0.0002,
1620
- "loss": 1.0383,
1621
  "step": 257
1622
  },
1623
  {
1624
  "epoch": 0.02,
1625
- "learning_rate": 0.0002,
1626
- "loss": 0.3503,
1627
  "step": 258
1628
  },
1629
  {
1630
  "epoch": 0.02,
1631
- "learning_rate": 0.0002,
1632
- "loss": 0.4224,
1633
  "step": 259
1634
  },
1635
  {
1636
  "epoch": 0.02,
1637
- "learning_rate": 0.0002,
1638
- "loss": 0.3989,
1639
  "step": 260
1640
  },
1641
  {
1642
  "epoch": 0.02,
1643
- "learning_rate": 0.0002,
1644
- "loss": 0.0156,
1645
  "step": 261
1646
  },
1647
  {
1648
  "epoch": 0.02,
1649
- "learning_rate": 0.0002,
1650
- "loss": 0.059,
1651
  "step": 262
1652
  },
1653
  {
1654
  "epoch": 0.02,
1655
- "learning_rate": 0.0002,
1656
- "loss": 0.4875,
1657
  "step": 263
1658
  },
1659
  {
1660
  "epoch": 0.02,
1661
- "learning_rate": 0.0002,
1662
- "loss": 0.5285,
1663
  "step": 264
1664
  },
1665
  {
1666
  "epoch": 0.02,
1667
- "learning_rate": 0.0002,
1668
- "loss": 0.3905,
1669
  "step": 265
1670
  },
1671
  {
1672
  "epoch": 0.02,
1673
- "learning_rate": 0.0002,
1674
- "loss": 0.2485,
1675
  "step": 266
1676
  },
1677
  {
1678
  "epoch": 0.02,
1679
- "learning_rate": 0.0002,
1680
- "loss": 0.0871,
1681
  "step": 267
1682
  },
1683
  {
1684
  "epoch": 0.02,
1685
- "learning_rate": 0.0002,
1686
- "loss": 0.375,
1687
  "step": 268
1688
  },
1689
  {
1690
  "epoch": 0.02,
1691
- "learning_rate": 0.0002,
1692
- "loss": 0.6823,
1693
  "step": 269
1694
  },
1695
  {
1696
  "epoch": 0.02,
1697
- "learning_rate": 0.0002,
1698
- "loss": 0.1278,
1699
  "step": 270
1700
  },
1701
  {
1702
  "epoch": 0.02,
1703
- "learning_rate": 0.0002,
1704
- "loss": 0.4192,
1705
  "step": 271
1706
  },
1707
  {
1708
  "epoch": 0.02,
1709
- "learning_rate": 0.0002,
1710
- "loss": 0.369,
1711
  "step": 272
1712
  },
1713
  {
1714
  "epoch": 0.02,
1715
- "learning_rate": 0.0002,
1716
- "loss": 0.1667,
1717
  "step": 273
1718
  },
1719
  {
1720
  "epoch": 0.02,
1721
- "learning_rate": 0.0002,
1722
- "loss": 0.3167,
1723
  "step": 274
1724
  },
1725
  {
1726
  "epoch": 0.02,
1727
- "learning_rate": 0.0002,
1728
- "loss": 0.4002,
1729
  "step": 275
1730
  },
1731
  {
1732
  "epoch": 0.02,
1733
- "learning_rate": 0.0002,
1734
- "loss": 0.7081,
1735
  "step": 276
1736
  },
1737
  {
1738
  "epoch": 0.02,
1739
- "learning_rate": 0.0002,
1740
- "loss": 0.6772,
1741
  "step": 277
1742
  },
1743
  {
1744
  "epoch": 0.02,
1745
- "learning_rate": 0.0002,
1746
- "loss": 0.9273,
1747
  "step": 278
1748
  },
1749
  {
1750
  "epoch": 0.02,
1751
- "learning_rate": 0.0002,
1752
- "loss": 0.3571,
1753
  "step": 279
1754
  },
1755
  {
1756
  "epoch": 0.02,
1757
- "learning_rate": 0.0002,
1758
- "loss": 0.3216,
1759
  "step": 280
1760
  },
1761
  {
1762
  "epoch": 0.02,
1763
- "learning_rate": 0.0002,
1764
- "loss": 0.3049,
1765
  "step": 281
1766
  },
1767
  {
1768
  "epoch": 0.02,
1769
- "learning_rate": 0.0002,
1770
- "loss": 0.3644,
1771
  "step": 282
1772
  },
1773
  {
1774
  "epoch": 0.02,
1775
- "learning_rate": 0.0002,
1776
- "loss": 0.1136,
1777
  "step": 283
1778
  },
1779
  {
1780
  "epoch": 0.02,
1781
- "learning_rate": 0.0002,
1782
- "loss": 0.8281,
1783
  "step": 284
1784
  },
1785
  {
1786
  "epoch": 0.02,
1787
- "learning_rate": 0.0002,
1788
- "loss": 0.3866,
1789
  "step": 285
1790
  },
1791
  {
1792
  "epoch": 0.02,
1793
- "learning_rate": 0.0002,
1794
- "loss": 0.6077,
1795
  "step": 286
1796
  },
1797
  {
1798
  "epoch": 0.02,
1799
- "learning_rate": 0.0002,
1800
- "loss": 0.2542,
1801
  "step": 287
1802
  },
1803
  {
1804
  "epoch": 0.02,
1805
- "learning_rate": 0.0002,
1806
- "loss": 0.0754,
1807
  "step": 288
1808
  },
1809
  {
1810
  "epoch": 0.02,
1811
- "learning_rate": 0.0002,
1812
- "loss": 0.4549,
1813
  "step": 289
1814
  },
1815
  {
1816
  "epoch": 0.02,
1817
- "learning_rate": 0.0002,
1818
- "loss": 0.2535,
1819
  "step": 290
1820
  },
1821
  {
1822
  "epoch": 0.02,
1823
- "learning_rate": 0.0002,
1824
- "loss": 0.0623,
1825
  "step": 291
1826
  },
1827
  {
1828
  "epoch": 0.02,
1829
- "learning_rate": 0.0002,
1830
- "loss": 0.506,
1831
  "step": 292
1832
  },
1833
  {
1834
  "epoch": 0.02,
1835
- "learning_rate": 0.0002,
1836
- "loss": 0.4297,
1837
  "step": 293
1838
  },
1839
  {
1840
  "epoch": 0.02,
1841
- "learning_rate": 0.0002,
1842
- "loss": 0.2837,
1843
  "step": 294
1844
  },
1845
  {
1846
  "epoch": 0.02,
1847
- "learning_rate": 0.0002,
1848
- "loss": 0.123,
1849
  "step": 295
1850
  },
1851
  {
1852
  "epoch": 0.02,
1853
- "learning_rate": 0.0002,
1854
- "loss": 0.3171,
1855
  "step": 296
1856
  },
1857
  {
1858
  "epoch": 0.02,
1859
- "learning_rate": 0.0002,
1860
- "loss": 0.1956,
1861
  "step": 297
1862
  },
1863
  {
1864
  "epoch": 0.02,
1865
- "learning_rate": 0.0002,
1866
- "loss": 0.3334,
1867
  "step": 298
1868
  },
1869
  {
1870
  "epoch": 0.02,
1871
- "learning_rate": 0.0002,
1872
- "loss": 0.1935,
1873
  "step": 299
1874
  },
1875
  {
1876
  "epoch": 0.02,
1877
- "learning_rate": 0.0002,
1878
- "loss": 0.4596,
1879
  "step": 300
1880
  },
1881
  {
1882
  "epoch": 0.02,
1883
- "learning_rate": 0.0002,
1884
- "loss": 0.3046,
1885
  "step": 301
1886
  },
1887
  {
1888
  "epoch": 0.02,
1889
- "learning_rate": 0.0002,
1890
- "loss": 0.3804,
1891
  "step": 302
1892
  },
1893
  {
1894
  "epoch": 0.02,
1895
- "learning_rate": 0.0002,
1896
- "loss": 0.3248,
1897
  "step": 303
1898
  },
1899
  {
1900
  "epoch": 0.02,
1901
- "learning_rate": 0.0002,
1902
- "loss": 0.2898,
1903
  "step": 304
1904
  },
1905
  {
1906
  "epoch": 0.02,
1907
- "learning_rate": 0.0002,
1908
- "loss": 0.4411,
1909
  "step": 305
1910
  },
1911
  {
1912
  "epoch": 0.02,
1913
- "learning_rate": 0.0002,
1914
- "loss": 0.5006,
1915
  "step": 306
1916
  },
1917
  {
1918
  "epoch": 0.02,
1919
- "learning_rate": 0.0002,
1920
- "loss": 0.0616,
1921
  "step": 307
1922
  },
1923
  {
1924
  "epoch": 0.02,
1925
- "learning_rate": 0.0002,
1926
- "loss": 0.1974,
1927
  "step": 308
1928
  },
1929
  {
1930
  "epoch": 0.02,
1931
- "learning_rate": 0.0002,
1932
- "loss": 0.2575,
1933
  "step": 309
1934
  },
1935
  {
1936
  "epoch": 0.02,
1937
- "learning_rate": 0.0002,
1938
- "loss": 0.1852,
1939
  "step": 310
1940
  },
1941
  {
1942
  "epoch": 0.02,
1943
- "learning_rate": 0.0002,
1944
- "loss": 0.191,
1945
  "step": 311
1946
  },
1947
  {
1948
  "epoch": 0.02,
1949
- "learning_rate": 0.0002,
1950
- "loss": 0.205,
1951
  "step": 312
1952
  },
1953
  {
1954
  "epoch": 0.02,
1955
- "learning_rate": 0.0002,
1956
- "loss": 0.3353,
1957
  "step": 313
1958
  },
1959
  {
1960
  "epoch": 0.02,
1961
- "learning_rate": 0.0002,
1962
- "loss": 0.4007,
1963
  "step": 314
1964
  },
1965
  {
1966
  "epoch": 0.02,
1967
- "learning_rate": 0.0002,
1968
- "loss": 0.0527,
1969
  "step": 315
1970
  },
1971
  {
1972
  "epoch": 0.02,
1973
- "learning_rate": 0.0002,
1974
- "loss": 0.3113,
1975
  "step": 316
1976
  },
1977
  {
1978
  "epoch": 0.02,
1979
- "learning_rate": 0.0002,
1980
- "loss": 0.2557,
1981
  "step": 317
1982
  },
1983
  {
1984
  "epoch": 0.02,
1985
- "learning_rate": 0.0002,
1986
- "loss": 0.2492,
1987
  "step": 318
1988
  },
1989
  {
1990
  "epoch": 0.02,
1991
- "learning_rate": 0.0002,
1992
- "loss": 0.0202,
1993
  "step": 319
1994
  },
1995
  {
1996
  "epoch": 0.02,
1997
- "learning_rate": 0.0002,
1998
- "loss": 0.0006,
1999
  "step": 320
2000
  },
2001
  {
2002
  "epoch": 0.02,
2003
- "learning_rate": 0.0002,
2004
- "loss": 0.5835,
2005
  "step": 321
2006
  },
2007
  {
2008
  "epoch": 0.02,
2009
- "learning_rate": 0.0002,
2010
- "loss": 0.3835,
2011
  "step": 322
2012
  },
2013
  {
2014
  "epoch": 0.02,
2015
- "learning_rate": 0.0002,
2016
- "loss": 0.3948,
2017
  "step": 323
2018
  },
2019
  {
2020
  "epoch": 0.02,
2021
- "learning_rate": 0.0002,
2022
- "loss": 0.2367,
2023
  "step": 324
2024
  },
2025
  {
2026
  "epoch": 0.02,
2027
- "learning_rate": 0.0002,
2028
- "loss": 0.0857,
2029
  "step": 325
2030
  },
2031
  {
2032
  "epoch": 0.02,
2033
- "learning_rate": 0.0002,
2034
- "loss": 0.2721,
2035
  "step": 326
2036
  },
2037
  {
2038
  "epoch": 0.02,
2039
- "learning_rate": 0.0002,
2040
- "loss": 0.0837,
2041
  "step": 327
2042
  },
2043
  {
2044
  "epoch": 0.02,
2045
- "learning_rate": 0.0002,
2046
- "loss": 0.4804,
2047
  "step": 328
2048
  },
2049
  {
2050
  "epoch": 0.02,
2051
- "learning_rate": 0.0002,
2052
- "loss": 0.0626,
2053
  "step": 329
2054
  },
2055
  {
2056
  "epoch": 0.02,
2057
- "learning_rate": 0.0002,
2058
- "loss": 0.1362,
2059
  "step": 330
2060
  },
2061
  {
2062
  "epoch": 0.02,
2063
- "learning_rate": 0.0002,
2064
- "loss": 0.2887,
2065
  "step": 331
2066
  },
2067
  {
2068
  "epoch": 0.02,
2069
- "learning_rate": 0.0002,
2070
- "loss": 0.1514,
2071
  "step": 332
2072
  },
2073
  {
2074
  "epoch": 0.02,
2075
- "learning_rate": 0.0002,
2076
- "loss": 0.5356,
2077
  "step": 333
2078
  },
2079
  {
2080
  "epoch": 0.02,
2081
- "learning_rate": 0.0002,
2082
- "loss": 0.2528,
2083
  "step": 334
2084
  },
2085
  {
2086
  "epoch": 0.02,
2087
- "learning_rate": 0.0002,
2088
- "loss": 0.0625,
2089
  "step": 335
2090
  },
2091
  {
2092
  "epoch": 0.02,
2093
- "learning_rate": 0.0002,
2094
- "loss": 0.0133,
2095
  "step": 336
2096
  },
2097
  {
2098
  "epoch": 0.02,
2099
- "learning_rate": 0.0002,
2100
- "loss": 0.2942,
2101
  "step": 337
2102
  },
2103
  {
2104
  "epoch": 0.02,
2105
- "learning_rate": 0.0002,
2106
- "loss": 0.7313,
2107
  "step": 338
2108
  },
2109
  {
2110
  "epoch": 0.02,
2111
- "learning_rate": 0.0002,
2112
- "loss": 0.3594,
2113
  "step": 339
2114
  },
2115
  {
2116
  "epoch": 0.02,
2117
- "learning_rate": 0.0002,
2118
- "loss": 0.1863,
2119
  "step": 340
2120
  },
2121
  {
2122
  "epoch": 0.02,
2123
- "learning_rate": 0.0002,
2124
- "loss": 0.154,
2125
  "step": 341
2126
  },
2127
  {
2128
  "epoch": 0.02,
2129
- "learning_rate": 0.0002,
2130
- "loss": 0.4095,
2131
  "step": 342
2132
  },
2133
  {
2134
  "epoch": 0.02,
2135
- "learning_rate": 0.0002,
2136
- "loss": 0.0629,
2137
  "step": 343
2138
  },
2139
  {
2140
  "epoch": 0.02,
2141
- "learning_rate": 0.0002,
2142
- "loss": 0.7485,
2143
  "step": 344
2144
  },
2145
  {
2146
  "epoch": 0.02,
2147
- "learning_rate": 0.0002,
2148
- "loss": 0.0399,
2149
  "step": 345
2150
  },
2151
  {
2152
  "epoch": 0.02,
2153
- "learning_rate": 0.0002,
2154
- "loss": 0.8687,
2155
  "step": 346
2156
  },
2157
  {
2158
  "epoch": 0.02,
2159
- "learning_rate": 0.0002,
2160
- "loss": 0.6033,
2161
  "step": 347
2162
  },
2163
  {
2164
  "epoch": 0.02,
2165
- "learning_rate": 0.0002,
2166
- "loss": 0.1485,
2167
  "step": 348
2168
  },
2169
  {
2170
  "epoch": 0.02,
2171
- "learning_rate": 0.0002,
2172
- "loss": 0.3709,
2173
  "step": 349
2174
  },
2175
  {
2176
  "epoch": 0.02,
2177
- "learning_rate": 0.0002,
2178
- "loss": 0.0525,
2179
  "step": 350
2180
  },
2181
  {
2182
  "epoch": 0.02,
2183
- "learning_rate": 0.0002,
2184
- "loss": 0.1674,
2185
  "step": 351
2186
  },
2187
  {
2188
  "epoch": 0.02,
2189
- "learning_rate": 0.0002,
2190
- "loss": 0.0609,
2191
  "step": 352
2192
  },
2193
  {
2194
  "epoch": 0.02,
2195
- "learning_rate": 0.0002,
2196
- "loss": 0.3799,
2197
  "step": 353
2198
  },
2199
  {
2200
  "epoch": 0.02,
2201
- "learning_rate": 0.0002,
2202
- "loss": 0.3422,
2203
  "step": 354
2204
  },
2205
  {
2206
  "epoch": 0.02,
2207
- "learning_rate": 0.0002,
2208
- "loss": 0.4144,
2209
  "step": 355
2210
  },
2211
  {
2212
  "epoch": 0.02,
2213
- "learning_rate": 0.0002,
2214
- "loss": 0.2626,
2215
  "step": 356
2216
  },
2217
  {
2218
  "epoch": 0.02,
2219
- "learning_rate": 0.0002,
2220
- "loss": 0.687,
2221
  "step": 357
2222
  },
2223
  {
2224
  "epoch": 0.02,
2225
- "learning_rate": 0.0002,
2226
- "loss": 0.4491,
2227
  "step": 358
2228
  },
2229
  {
2230
  "epoch": 0.02,
2231
- "learning_rate": 0.0002,
2232
- "loss": 0.723,
2233
  "step": 359
2234
  },
2235
  {
2236
  "epoch": 0.02,
2237
- "learning_rate": 0.0002,
2238
- "loss": 0.3955,
2239
  "step": 360
2240
  },
2241
  {
2242
  "epoch": 0.02,
2243
- "learning_rate": 0.0002,
2244
- "loss": 0.2879,
2245
  "step": 361
2246
  },
2247
  {
2248
  "epoch": 0.02,
2249
- "learning_rate": 0.0002,
2250
- "loss": 0.7725,
2251
  "step": 362
2252
  },
2253
  {
2254
  "epoch": 0.02,
2255
- "learning_rate": 0.0002,
2256
- "loss": 0.2334,
2257
  "step": 363
2258
  },
2259
  {
2260
  "epoch": 0.02,
2261
- "learning_rate": 0.0002,
2262
- "loss": 0.7444,
2263
  "step": 364
2264
  },
2265
  {
2266
  "epoch": 0.02,
2267
- "learning_rate": 0.0002,
2268
- "loss": 0.837,
2269
  "step": 365
2270
  },
2271
  {
2272
  "epoch": 0.02,
2273
- "learning_rate": 0.0002,
2274
- "loss": 0.3277,
2275
  "step": 366
2276
  },
2277
  {
2278
  "epoch": 0.02,
2279
- "learning_rate": 0.0002,
2280
- "loss": 0.4366,
2281
  "step": 367
2282
  },
2283
  {
2284
  "epoch": 0.02,
2285
- "learning_rate": 0.0002,
2286
- "loss": 0.2013,
2287
  "step": 368
2288
  },
2289
  {
2290
  "epoch": 0.02,
2291
- "learning_rate": 0.0002,
2292
- "loss": 0.1617,
2293
  "step": 369
2294
  },
2295
  {
2296
  "epoch": 0.02,
2297
- "learning_rate": 0.0002,
2298
- "loss": 0.2804,
2299
  "step": 370
2300
  },
2301
  {
2302
  "epoch": 0.02,
2303
- "learning_rate": 0.0002,
2304
- "loss": 1.1655,
2305
  "step": 371
2306
  },
2307
  {
2308
  "epoch": 0.02,
2309
- "learning_rate": 0.0002,
2310
- "loss": 0.6786,
2311
  "step": 372
2312
  },
2313
  {
2314
  "epoch": 0.02,
2315
- "learning_rate": 0.0002,
2316
- "loss": 0.3358,
2317
  "step": 373
2318
  },
2319
  {
2320
  "epoch": 0.02,
2321
- "learning_rate": 0.0002,
2322
- "loss": 0.5243,
2323
  "step": 374
2324
  },
2325
  {
2326
  "epoch": 0.02,
2327
- "learning_rate": 0.0002,
2328
- "loss": 0.3309,
2329
  "step": 375
2330
  },
2331
  {
2332
  "epoch": 0.02,
2333
- "learning_rate": 0.0002,
2334
- "loss": 0.2393,
2335
  "step": 376
2336
  },
2337
  {
2338
  "epoch": 0.02,
2339
- "learning_rate": 0.0002,
2340
- "loss": 0.5474,
2341
  "step": 377
2342
  },
2343
  {
2344
  "epoch": 0.02,
2345
- "learning_rate": 0.0002,
2346
- "loss": 0.1695,
2347
  "step": 378
2348
  },
2349
  {
2350
  "epoch": 0.02,
2351
- "learning_rate": 0.0002,
2352
- "loss": 0.0975,
2353
  "step": 379
2354
  },
2355
  {
2356
  "epoch": 0.02,
2357
- "learning_rate": 0.0002,
2358
- "loss": 0.2748,
2359
  "step": 380
2360
  },
2361
  {
2362
  "epoch": 0.02,
2363
- "learning_rate": 0.0002,
2364
- "loss": 0.6588,
2365
  "step": 381
2366
  },
2367
  {
2368
  "epoch": 0.02,
2369
- "learning_rate": 0.0002,
2370
- "loss": 0.3226,
2371
  "step": 382
2372
  },
2373
  {
2374
  "epoch": 0.02,
2375
- "learning_rate": 0.0002,
2376
- "loss": 0.1443,
2377
  "step": 383
2378
  },
2379
  {
2380
  "epoch": 0.02,
2381
- "learning_rate": 0.0002,
2382
- "loss": 0.2284,
2383
  "step": 384
2384
  },
2385
  {
2386
  "epoch": 0.02,
2387
- "learning_rate": 0.0002,
2388
- "loss": 0.2532,
2389
  "step": 385
2390
  },
2391
  {
2392
  "epoch": 0.02,
2393
- "learning_rate": 0.0002,
2394
- "loss": 0.4315,
2395
  "step": 386
2396
  },
2397
  {
2398
  "epoch": 0.02,
2399
- "learning_rate": 0.0002,
2400
- "loss": 0.3583,
2401
  "step": 387
2402
  },
2403
  {
2404
  "epoch": 0.02,
2405
- "learning_rate": 0.0002,
2406
- "loss": 0.1068,
2407
  "step": 388
2408
  },
2409
  {
2410
  "epoch": 0.02,
2411
- "learning_rate": 0.0002,
2412
- "loss": 0.3752,
2413
  "step": 389
2414
  },
2415
  {
2416
  "epoch": 0.02,
2417
- "learning_rate": 0.0002,
2418
- "loss": 0.3231,
2419
  "step": 390
2420
  },
2421
  {
2422
  "epoch": 0.02,
2423
- "learning_rate": 0.0002,
2424
- "loss": 0.12,
2425
  "step": 391
2426
  },
2427
  {
2428
  "epoch": 0.02,
2429
- "learning_rate": 0.0002,
2430
- "loss": 0.2209,
2431
  "step": 392
2432
  },
2433
  {
2434
  "epoch": 0.02,
2435
- "learning_rate": 0.0002,
2436
- "loss": 0.3115,
2437
  "step": 393
2438
  },
2439
  {
2440
  "epoch": 0.02,
2441
- "learning_rate": 0.0002,
2442
- "loss": 0.4848,
2443
  "step": 394
2444
  },
2445
  {
2446
  "epoch": 0.02,
2447
- "learning_rate": 0.0002,
2448
- "loss": 0.2688,
2449
  "step": 395
2450
  },
2451
  {
2452
  "epoch": 0.02,
2453
- "learning_rate": 0.0002,
2454
- "loss": 0.2797,
2455
  "step": 396
2456
  },
2457
  {
2458
  "epoch": 0.02,
2459
- "learning_rate": 0.0002,
2460
- "loss": 0.4188,
2461
  "step": 397
2462
  },
2463
  {
2464
  "epoch": 0.02,
2465
- "learning_rate": 0.0002,
2466
- "loss": 0.1552,
2467
  "step": 398
2468
  },
2469
  {
2470
  "epoch": 0.02,
2471
- "learning_rate": 0.0002,
2472
- "loss": 0.1362,
2473
  "step": 399
2474
  },
2475
  {
2476
  "epoch": 0.02,
2477
- "learning_rate": 0.0002,
2478
- "loss": 0.287,
2479
  "step": 400
2480
  },
2481
  {
2482
  "epoch": 0.02,
2483
- "eval_loss": 0.3298754394054413,
2484
- "eval_runtime": 436.0672,
2485
- "eval_samples_per_second": 2.293,
2486
- "eval_steps_per_second": 1.147,
2487
  "step": 400
2488
  },
2489
  {
2490
  "epoch": 0.02,
2491
- "mmlu_eval_accuracy": 0.4953105974030981,
2492
- "mmlu_eval_accuracy_abstract_algebra": 0.45454545454545453,
2493
  "mmlu_eval_accuracy_anatomy": 0.5,
2494
  "mmlu_eval_accuracy_astronomy": 0.375,
2495
- "mmlu_eval_accuracy_business_ethics": 0.5454545454545454,
2496
  "mmlu_eval_accuracy_clinical_knowledge": 0.4827586206896552,
2497
- "mmlu_eval_accuracy_college_biology": 0.4375,
2498
  "mmlu_eval_accuracy_college_chemistry": 0.125,
2499
  "mmlu_eval_accuracy_college_computer_science": 0.2727272727272727,
2500
- "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365,
2501
- "mmlu_eval_accuracy_college_medicine": 0.5,
2502
- "mmlu_eval_accuracy_college_physics": 0.36363636363636365,
2503
- "mmlu_eval_accuracy_computer_security": 0.8181818181818182,
2504
- "mmlu_eval_accuracy_conceptual_physics": 0.2692307692307692,
2505
- "mmlu_eval_accuracy_econometrics": 0.3333333333333333,
2506
- "mmlu_eval_accuracy_electrical_engineering": 0.3125,
2507
- "mmlu_eval_accuracy_elementary_mathematics": 0.43902439024390244,
2508
- "mmlu_eval_accuracy_formal_logic": 0.2857142857142857,
2509
- "mmlu_eval_accuracy_global_facts": 0.1,
2510
- "mmlu_eval_accuracy_high_school_biology": 0.3125,
2511
  "mmlu_eval_accuracy_high_school_chemistry": 0.22727272727272727,
2512
- "mmlu_eval_accuracy_high_school_computer_science": 0.5555555555555556,
2513
- "mmlu_eval_accuracy_high_school_european_history": 0.6666666666666666,
2514
- "mmlu_eval_accuracy_high_school_geography": 0.8181818181818182,
2515
- "mmlu_eval_accuracy_high_school_government_and_politics": 0.7142857142857143,
2516
- "mmlu_eval_accuracy_high_school_macroeconomics": 0.46511627906976744,
2517
- "mmlu_eval_accuracy_high_school_mathematics": 0.27586206896551724,
2518
- "mmlu_eval_accuracy_high_school_microeconomics": 0.5,
2519
- "mmlu_eval_accuracy_high_school_physics": 0.29411764705882354,
2520
- "mmlu_eval_accuracy_high_school_psychology": 0.7333333333333333,
2521
- "mmlu_eval_accuracy_high_school_statistics": 0.2608695652173913,
2522
- "mmlu_eval_accuracy_high_school_us_history": 0.5909090909090909,
2523
- "mmlu_eval_accuracy_high_school_world_history": 0.46153846153846156,
2524
- "mmlu_eval_accuracy_human_aging": 0.5217391304347826,
2525
- "mmlu_eval_accuracy_human_sexuality": 0.4166666666666667,
2526
- "mmlu_eval_accuracy_international_law": 0.6923076923076923,
2527
- "mmlu_eval_accuracy_jurisprudence": 0.5454545454545454,
2528
- "mmlu_eval_accuracy_logical_fallacies": 0.7777777777777778,
2529
- "mmlu_eval_accuracy_machine_learning": 0.36363636363636365,
2530
- "mmlu_eval_accuracy_management": 0.8181818181818182,
2531
- "mmlu_eval_accuracy_marketing": 0.8,
2532
- "mmlu_eval_accuracy_medical_genetics": 0.8181818181818182,
2533
- "mmlu_eval_accuracy_miscellaneous": 0.6046511627906976,
2534
- "mmlu_eval_accuracy_moral_disputes": 0.42105263157894735,
2535
- "mmlu_eval_accuracy_moral_scenarios": 0.37,
2536
- "mmlu_eval_accuracy_nutrition": 0.696969696969697,
2537
- "mmlu_eval_accuracy_philosophy": 0.5882352941176471,
2538
- "mmlu_eval_accuracy_prehistory": 0.4,
2539
- "mmlu_eval_accuracy_professional_accounting": 0.3548387096774194,
2540
- "mmlu_eval_accuracy_professional_law": 0.3411764705882353,
2541
- "mmlu_eval_accuracy_professional_medicine": 0.4838709677419355,
2542
- "mmlu_eval_accuracy_professional_psychology": 0.463768115942029,
2543
- "mmlu_eval_accuracy_public_relations": 0.4166666666666667,
2544
- "mmlu_eval_accuracy_security_studies": 0.5925925925925926,
2545
- "mmlu_eval_accuracy_sociology": 0.8181818181818182,
2546
- "mmlu_eval_accuracy_us_foreign_policy": 0.7272727272727273,
2547
- "mmlu_eval_accuracy_virology": 0.5555555555555556,
2548
- "mmlu_eval_accuracy_world_religions": 0.7894736842105263,
2549
- "mmlu_loss": 0.7769390310382096,
2550
  "step": 400
2551
  }
2552
  ],
2553
  "max_steps": 5000,
2554
  "num_train_epochs": 1,
2555
- "total_flos": 7.124647112166605e+16,
2556
  "trial_name": null,
2557
  "trial_params": null
2558
  }
 
1
  {
2
+ "best_metric": 0.36855557560920715,
3
+ "best_model_checkpoint": "./output_v2/34bCodellama_CodeLlama-34b-Python-hf_unnatural-instructions_standardized/checkpoint-400",
4
  "epoch": 0.024444274692536856,
5
  "global_step": 400,
6
  "is_hyper_param_search": false,
 
9
  "log_history": [
10
  {
11
  "epoch": 0.0,
12
+ "learning_rate": 0.0004,
13
+ "loss": 0.1006,
14
  "step": 1
15
  },
16
  {
17
  "epoch": 0.0,
18
+ "learning_rate": 0.0004,
19
+ "loss": 1.2792,
20
  "step": 2
21
  },
22
  {
23
  "epoch": 0.0,
24
+ "learning_rate": 0.0004,
25
+ "loss": 0.3465,
26
  "step": 3
27
  },
28
  {
29
  "epoch": 0.0,
30
+ "learning_rate": 0.0004,
31
+ "loss": 0.4367,
32
  "step": 4
33
  },
34
  {
35
  "epoch": 0.0,
36
+ "learning_rate": 0.0004,
37
+ "loss": 0.1706,
38
  "step": 5
39
  },
40
  {
41
  "epoch": 0.0,
42
+ "learning_rate": 0.0004,
43
+ "loss": 0.632,
44
  "step": 6
45
  },
46
  {
47
  "epoch": 0.0,
48
+ "learning_rate": 0.0004,
49
+ "loss": 0.1233,
50
  "step": 7
51
  },
52
  {
53
  "epoch": 0.0,
54
+ "learning_rate": 0.0004,
55
+ "loss": 0.4661,
56
  "step": 8
57
  },
58
  {
59
  "epoch": 0.0,
60
+ "learning_rate": 0.0004,
61
+ "loss": 0.1672,
62
  "step": 9
63
  },
64
  {
65
  "epoch": 0.0,
66
+ "learning_rate": 0.0004,
67
+ "loss": 0.0641,
68
  "step": 10
69
  },
70
  {
71
  "epoch": 0.0,
72
+ "learning_rate": 0.0004,
73
+ "loss": 0.3908,
74
  "step": 11
75
  },
76
  {
77
  "epoch": 0.0,
78
+ "learning_rate": 0.0004,
79
+ "loss": 0.505,
80
  "step": 12
81
  },
82
  {
83
  "epoch": 0.0,
84
+ "learning_rate": 0.0004,
85
+ "loss": 0.7672,
86
  "step": 13
87
  },
88
  {
89
  "epoch": 0.0,
90
+ "learning_rate": 0.0004,
91
+ "loss": 0.2875,
92
  "step": 14
93
  },
94
  {
95
  "epoch": 0.0,
96
+ "learning_rate": 0.0004,
97
+ "loss": 0.4619,
98
  "step": 15
99
  },
100
  {
101
  "epoch": 0.0,
102
+ "learning_rate": 0.0004,
103
+ "loss": 0.4586,
104
  "step": 16
105
  },
106
  {
107
  "epoch": 0.0,
108
+ "learning_rate": 0.0004,
109
+ "loss": 0.7159,
110
  "step": 17
111
  },
112
  {
113
  "epoch": 0.0,
114
+ "learning_rate": 0.0004,
115
+ "loss": 0.207,
116
  "step": 18
117
  },
118
  {
119
  "epoch": 0.0,
120
+ "learning_rate": 0.0004,
121
+ "loss": 0.4808,
122
  "step": 19
123
  },
124
  {
125
  "epoch": 0.0,
126
+ "learning_rate": 0.0004,
127
+ "loss": 0.104,
128
  "step": 20
129
  },
130
  {
131
  "epoch": 0.0,
132
+ "learning_rate": 0.0004,
133
+ "loss": 0.5561,
134
  "step": 21
135
  },
136
  {
137
  "epoch": 0.0,
138
+ "learning_rate": 0.0004,
139
+ "loss": 0.4114,
140
  "step": 22
141
  },
142
  {
143
  "epoch": 0.0,
144
+ "learning_rate": 0.0004,
145
+ "loss": 0.3656,
146
  "step": 23
147
  },
148
  {
149
  "epoch": 0.0,
150
+ "learning_rate": 0.0004,
151
+ "loss": 0.6222,
152
  "step": 24
153
  },
154
  {
155
  "epoch": 0.0,
156
+ "learning_rate": 0.0004,
157
+ "loss": 0.502,
158
  "step": 25
159
  },
160
  {
161
  "epoch": 0.0,
162
+ "learning_rate": 0.0004,
163
+ "loss": 0.3339,
164
  "step": 26
165
  },
166
  {
167
  "epoch": 0.0,
168
+ "learning_rate": 0.0004,
169
+ "loss": 0.6282,
170
  "step": 27
171
  },
172
  {
173
  "epoch": 0.0,
174
+ "learning_rate": 0.0004,
175
+ "loss": 0.104,
176
  "step": 28
177
  },
178
  {
179
  "epoch": 0.0,
180
+ "learning_rate": 0.0004,
181
+ "loss": 0.7241,
182
  "step": 29
183
  },
184
  {
185
  "epoch": 0.0,
186
+ "learning_rate": 0.0004,
187
+ "loss": 0.7388,
188
  "step": 30
189
  },
190
  {
191
  "epoch": 0.0,
192
+ "learning_rate": 0.0004,
193
+ "loss": 0.1877,
194
  "step": 31
195
  },
196
  {
197
  "epoch": 0.0,
198
+ "learning_rate": 0.0004,
199
+ "loss": 0.5733,
200
  "step": 32
201
  },
202
  {
203
  "epoch": 0.0,
204
+ "learning_rate": 0.0004,
205
+ "loss": 0.6204,
206
  "step": 33
207
  },
208
  {
209
  "epoch": 0.0,
210
+ "learning_rate": 0.0004,
211
+ "loss": 0.9779,
212
  "step": 34
213
  },
214
  {
215
  "epoch": 0.0,
216
+ "learning_rate": 0.0004,
217
+ "loss": 1.0215,
218
  "step": 35
219
  },
220
  {
221
  "epoch": 0.0,
222
+ "learning_rate": 0.0004,
223
+ "loss": 0.2895,
224
  "step": 36
225
  },
226
  {
227
  "epoch": 0.0,
228
+ "learning_rate": 0.0004,
229
+ "loss": 0.3081,
230
  "step": 37
231
  },
232
  {
233
  "epoch": 0.0,
234
+ "learning_rate": 0.0004,
235
+ "loss": 0.6799,
236
  "step": 38
237
  },
238
  {
239
  "epoch": 0.0,
240
+ "learning_rate": 0.0004,
241
+ "loss": 0.6704,
242
  "step": 39
243
  },
244
  {
245
  "epoch": 0.0,
246
+ "learning_rate": 0.0004,
247
+ "loss": 0.6087,
248
  "step": 40
249
  },
250
  {
251
  "epoch": 0.0,
252
+ "learning_rate": 0.0004,
253
+ "loss": 0.8191,
254
  "step": 41
255
  },
256
  {
257
  "epoch": 0.0,
258
+ "learning_rate": 0.0004,
259
+ "loss": 0.307,
260
  "step": 42
261
  },
262
  {
263
  "epoch": 0.0,
264
+ "learning_rate": 0.0004,
265
+ "loss": 0.431,
266
  "step": 43
267
  },
268
  {
269
  "epoch": 0.0,
270
+ "learning_rate": 0.0004,
271
+ "loss": 0.2427,
272
  "step": 44
273
  },
274
  {
275
  "epoch": 0.0,
276
+ "learning_rate": 0.0004,
277
+ "loss": 0.8054,
278
  "step": 45
279
  },
280
  {
281
  "epoch": 0.0,
282
+ "learning_rate": 0.0004,
283
+ "loss": 1.0238,
284
  "step": 46
285
  },
286
  {
287
  "epoch": 0.0,
288
+ "learning_rate": 0.0004,
289
+ "loss": 0.4241,
290
  "step": 47
291
  },
292
  {
293
  "epoch": 0.0,
294
+ "learning_rate": 0.0004,
295
+ "loss": 0.1145,
296
  "step": 48
297
  },
298
  {
299
  "epoch": 0.0,
300
+ "learning_rate": 0.0004,
301
+ "loss": 1.069,
302
  "step": 49
303
  },
304
  {
305
  "epoch": 0.0,
306
+ "learning_rate": 0.0004,
307
+ "loss": 1.0728,
308
  "step": 50
309
  },
310
  {
311
  "epoch": 0.0,
312
+ "learning_rate": 0.0004,
313
+ "loss": 0.108,
314
  "step": 51
315
  },
316
  {
317
  "epoch": 0.0,
318
+ "learning_rate": 0.0004,
319
+ "loss": 0.2927,
320
  "step": 52
321
  },
322
  {
323
  "epoch": 0.0,
324
+ "learning_rate": 0.0004,
325
+ "loss": 0.2443,
326
  "step": 53
327
  },
328
  {
329
  "epoch": 0.0,
330
+ "learning_rate": 0.0004,
331
+ "loss": 0.0006,
332
  "step": 54
333
  },
334
  {
335
  "epoch": 0.0,
336
+ "learning_rate": 0.0004,
337
+ "loss": 0.2178,
338
  "step": 55
339
  },
340
  {
341
  "epoch": 0.0,
342
+ "learning_rate": 0.0004,
343
+ "loss": 0.2221,
344
  "step": 56
345
  },
346
  {
347
  "epoch": 0.0,
348
+ "learning_rate": 0.0004,
349
+ "loss": 0.0375,
350
  "step": 57
351
  },
352
  {
353
  "epoch": 0.0,
354
+ "learning_rate": 0.0004,
355
+ "loss": 0.1756,
356
  "step": 58
357
  },
358
  {
359
  "epoch": 0.0,
360
+ "learning_rate": 0.0004,
361
+ "loss": 0.4141,
362
  "step": 59
363
  },
364
  {
365
  "epoch": 0.0,
366
+ "learning_rate": 0.0004,
367
+ "loss": 0.154,
368
  "step": 60
369
  },
370
  {
371
  "epoch": 0.0,
372
+ "learning_rate": 0.0004,
373
+ "loss": 0.1159,
374
  "step": 61
375
  },
376
  {
377
  "epoch": 0.0,
378
+ "learning_rate": 0.0004,
379
+ "loss": 0.2163,
380
  "step": 62
381
  },
382
  {
383
  "epoch": 0.0,
384
+ "learning_rate": 0.0004,
385
+ "loss": 0.3193,
386
  "step": 63
387
  },
388
  {
389
  "epoch": 0.0,
390
+ "learning_rate": 0.0004,
391
+ "loss": 0.3983,
392
  "step": 64
393
  },
394
  {
395
  "epoch": 0.0,
396
+ "learning_rate": 0.0004,
397
+ "loss": 0.7675,
398
  "step": 65
399
  },
400
  {
401
  "epoch": 0.0,
402
+ "learning_rate": 0.0004,
403
+ "loss": 0.395,
404
  "step": 66
405
  },
406
  {
407
  "epoch": 0.0,
408
+ "learning_rate": 0.0004,
409
+ "loss": 0.4137,
410
  "step": 67
411
  },
412
  {
413
  "epoch": 0.0,
414
+ "learning_rate": 0.0004,
415
+ "loss": 0.1585,
416
  "step": 68
417
  },
418
  {
419
  "epoch": 0.0,
420
+ "learning_rate": 0.0004,
421
+ "loss": 0.0744,
422
  "step": 69
423
  },
424
  {
425
  "epoch": 0.0,
426
+ "learning_rate": 0.0004,
427
+ "loss": 0.2868,
428
  "step": 70
429
  },
430
  {
431
  "epoch": 0.0,
432
+ "learning_rate": 0.0004,
433
+ "loss": 0.6288,
434
  "step": 71
435
  },
436
  {
437
  "epoch": 0.0,
438
+ "learning_rate": 0.0004,
439
+ "loss": 0.2539,
440
  "step": 72
441
  },
442
  {
443
  "epoch": 0.0,
444
+ "learning_rate": 0.0004,
445
+ "loss": 0.9,
446
  "step": 73
447
  },
448
  {
449
  "epoch": 0.0,
450
+ "learning_rate": 0.0004,
451
+ "loss": 0.5689,
452
  "step": 74
453
  },
454
  {
455
  "epoch": 0.0,
456
+ "learning_rate": 0.0004,
457
+ "loss": 0.1503,
458
  "step": 75
459
  },
460
  {
461
  "epoch": 0.0,
462
+ "learning_rate": 0.0004,
463
+ "loss": 0.6418,
464
  "step": 76
465
  },
466
  {
467
  "epoch": 0.0,
468
+ "learning_rate": 0.0004,
469
+ "loss": 0.2353,
470
  "step": 77
471
  },
472
  {
473
  "epoch": 0.0,
474
+ "learning_rate": 0.0004,
475
+ "loss": 0.8223,
476
  "step": 78
477
  },
478
  {
479
  "epoch": 0.0,
480
+ "learning_rate": 0.0004,
481
+ "loss": 0.1297,
482
  "step": 79
483
  },
484
  {
485
  "epoch": 0.0,
486
+ "learning_rate": 0.0004,
487
+ "loss": 0.6385,
488
  "step": 80
489
  },
490
  {
491
  "epoch": 0.0,
492
+ "learning_rate": 0.0004,
493
+ "loss": 0.1623,
494
  "step": 81
495
  },
496
  {
497
  "epoch": 0.01,
498
+ "learning_rate": 0.0004,
499
+ "loss": 0.3846,
500
  "step": 82
501
  },
502
  {
503
  "epoch": 0.01,
504
+ "learning_rate": 0.0004,
505
+ "loss": 0.3152,
506
  "step": 83
507
  },
508
  {
509
  "epoch": 0.01,
510
+ "learning_rate": 0.0004,
511
+ "loss": 0.1425,
512
  "step": 84
513
  },
514
  {
515
  "epoch": 0.01,
516
+ "learning_rate": 0.0004,
517
+ "loss": 0.6978,
518
  "step": 85
519
  },
520
  {
521
  "epoch": 0.01,
522
+ "learning_rate": 0.0004,
523
+ "loss": 1.0012,
524
  "step": 86
525
  },
526
  {
527
  "epoch": 0.01,
528
+ "learning_rate": 0.0004,
529
+ "loss": 0.1544,
530
  "step": 87
531
  },
532
  {
533
  "epoch": 0.01,
534
+ "learning_rate": 0.0004,
535
+ "loss": 0.7167,
536
  "step": 88
537
  },
538
  {
539
  "epoch": 0.01,
540
+ "learning_rate": 0.0004,
541
+ "loss": 0.5173,
542
  "step": 89
543
  },
544
  {
545
  "epoch": 0.01,
546
+ "learning_rate": 0.0004,
547
+ "loss": 0.4471,
548
  "step": 90
549
  },
550
  {
551
  "epoch": 0.01,
552
+ "learning_rate": 0.0004,
553
+ "loss": 0.4159,
554
  "step": 91
555
  },
556
  {
557
  "epoch": 0.01,
558
+ "learning_rate": 0.0004,
559
+ "loss": 0.697,
560
  "step": 92
561
  },
562
  {
563
  "epoch": 0.01,
564
+ "learning_rate": 0.0004,
565
+ "loss": 0.2301,
566
  "step": 93
567
  },
568
  {
569
  "epoch": 0.01,
570
+ "learning_rate": 0.0004,
571
+ "loss": 0.9655,
572
  "step": 94
573
  },
574
  {
575
  "epoch": 0.01,
576
+ "learning_rate": 0.0004,
577
+ "loss": 0.2113,
578
  "step": 95
579
  },
580
  {
581
  "epoch": 0.01,
582
+ "learning_rate": 0.0004,
583
+ "loss": 1.5099,
584
  "step": 96
585
  },
586
  {
587
  "epoch": 0.01,
588
+ "learning_rate": 0.0004,
589
+ "loss": 0.6587,
590
  "step": 97
591
  },
592
  {
593
  "epoch": 0.01,
594
+ "learning_rate": 0.0004,
595
+ "loss": 0.677,
596
  "step": 98
597
  },
598
  {
599
  "epoch": 0.01,
600
+ "learning_rate": 0.0004,
601
+ "loss": 0.8563,
602
  "step": 99
603
  },
604
  {
605
  "epoch": 0.01,
606
+ "learning_rate": 0.0004,
607
+ "loss": 1.6579,
608
  "step": 100
609
  },
610
  {
611
  "epoch": 0.01,
612
+ "learning_rate": 0.0004,
613
+ "loss": 0.2976,
614
  "step": 101
615
  },
616
  {
617
  "epoch": 0.01,
618
+ "learning_rate": 0.0004,
619
+ "loss": 0.4181,
620
  "step": 102
621
  },
622
  {
623
  "epoch": 0.01,
624
+ "learning_rate": 0.0004,
625
+ "loss": 0.3141,
626
  "step": 103
627
  },
628
  {
629
  "epoch": 0.01,
630
+ "learning_rate": 0.0004,
631
+ "loss": 0.1189,
632
  "step": 104
633
  },
634
  {
635
  "epoch": 0.01,
636
+ "learning_rate": 0.0004,
637
+ "loss": 0.0589,
638
  "step": 105
639
  },
640
  {
641
  "epoch": 0.01,
642
+ "learning_rate": 0.0004,
643
+ "loss": 0.533,
644
  "step": 106
645
  },
646
  {
647
  "epoch": 0.01,
648
+ "learning_rate": 0.0004,
649
+ "loss": 0.4562,
650
  "step": 107
651
  },
652
  {
653
  "epoch": 0.01,
654
+ "learning_rate": 0.0004,
655
+ "loss": 0.2835,
656
  "step": 108
657
  },
658
  {
659
  "epoch": 0.01,
660
+ "learning_rate": 0.0004,
661
+ "loss": 0.5246,
662
  "step": 109
663
  },
664
  {
665
  "epoch": 0.01,
666
+ "learning_rate": 0.0004,
667
+ "loss": 0.2345,
668
  "step": 110
669
  },
670
  {
671
  "epoch": 0.01,
672
+ "learning_rate": 0.0004,
673
+ "loss": 0.1858,
674
  "step": 111
675
  },
676
  {
677
  "epoch": 0.01,
678
+ "learning_rate": 0.0004,
679
+ "loss": 0.5243,
680
  "step": 112
681
  },
682
  {
683
  "epoch": 0.01,
684
+ "learning_rate": 0.0004,
685
+ "loss": 0.3014,
686
  "step": 113
687
  },
688
  {
689
  "epoch": 0.01,
690
+ "learning_rate": 0.0004,
691
+ "loss": 0.0783,
692
  "step": 114
693
  },
694
  {
695
  "epoch": 0.01,
696
+ "learning_rate": 0.0004,
697
+ "loss": 0.1369,
698
  "step": 115
699
  },
700
  {
701
  "epoch": 0.01,
702
+ "learning_rate": 0.0004,
703
+ "loss": 0.1517,
704
  "step": 116
705
  },
706
  {
707
  "epoch": 0.01,
708
+ "learning_rate": 0.0004,
709
+ "loss": 0.4089,
710
  "step": 117
711
  },
712
  {
713
  "epoch": 0.01,
714
+ "learning_rate": 0.0004,
715
+ "loss": 0.184,
716
  "step": 118
717
  },
718
  {
719
  "epoch": 0.01,
720
+ "learning_rate": 0.0004,
721
+ "loss": 0.218,
722
  "step": 119
723
  },
724
  {
725
  "epoch": 0.01,
726
+ "learning_rate": 0.0004,
727
+ "loss": 0.2696,
728
  "step": 120
729
  },
730
  {
731
  "epoch": 0.01,
732
+ "learning_rate": 0.0004,
733
+ "loss": 0.0955,
734
  "step": 121
735
  },
736
  {
737
  "epoch": 0.01,
738
+ "learning_rate": 0.0004,
739
+ "loss": 0.3469,
740
  "step": 122
741
  },
742
  {
743
  "epoch": 0.01,
744
+ "learning_rate": 0.0004,
745
+ "loss": 0.2769,
746
  "step": 123
747
  },
748
  {
749
  "epoch": 0.01,
750
+ "learning_rate": 0.0004,
751
+ "loss": 0.2437,
752
  "step": 124
753
  },
754
  {
755
  "epoch": 0.01,
756
+ "learning_rate": 0.0004,
757
+ "loss": 0.2283,
758
  "step": 125
759
  },
760
  {
761
  "epoch": 0.01,
762
+ "learning_rate": 0.0004,
763
+ "loss": 0.5484,
764
  "step": 126
765
  },
766
  {
767
  "epoch": 0.01,
768
+ "learning_rate": 0.0004,
769
+ "loss": 0.3495,
770
  "step": 127
771
  },
772
  {
773
  "epoch": 0.01,
774
+ "learning_rate": 0.0004,
775
+ "loss": 0.7042,
776
  "step": 128
777
  },
778
  {
779
  "epoch": 0.01,
780
+ "learning_rate": 0.0004,
781
+ "loss": 0.3839,
782
  "step": 129
783
  },
784
  {
785
  "epoch": 0.01,
786
+ "learning_rate": 0.0004,
787
+ "loss": 0.3892,
788
  "step": 130
789
  },
790
  {
791
  "epoch": 0.01,
792
+ "learning_rate": 0.0004,
793
+ "loss": 0.2422,
794
  "step": 131
795
  },
796
  {
797
  "epoch": 0.01,
798
+ "learning_rate": 0.0004,
799
+ "loss": 0.3934,
800
  "step": 132
801
  },
802
  {
803
  "epoch": 0.01,
804
+ "learning_rate": 0.0004,
805
+ "loss": 0.4136,
806
  "step": 133
807
  },
808
  {
809
  "epoch": 0.01,
810
+ "learning_rate": 0.0004,
811
+ "loss": 0.0939,
812
  "step": 134
813
  },
814
  {
815
  "epoch": 0.01,
816
+ "learning_rate": 0.0004,
817
+ "loss": 0.508,
818
  "step": 135
819
  },
820
  {
821
  "epoch": 0.01,
822
+ "learning_rate": 0.0004,
823
+ "loss": 0.3331,
824
  "step": 136
825
  },
826
  {
827
  "epoch": 0.01,
828
+ "learning_rate": 0.0004,
829
+ "loss": 0.377,
830
  "step": 137
831
  },
832
  {
833
  "epoch": 0.01,
834
+ "learning_rate": 0.0004,
835
+ "loss": 0.8366,
836
  "step": 138
837
  },
838
  {
839
  "epoch": 0.01,
840
+ "learning_rate": 0.0004,
841
+ "loss": 0.2068,
842
  "step": 139
843
  },
844
  {
845
  "epoch": 0.01,
846
+ "learning_rate": 0.0004,
847
+ "loss": 0.484,
848
  "step": 140
849
  },
850
  {
851
  "epoch": 0.01,
852
+ "learning_rate": 0.0004,
853
+ "loss": 0.8796,
854
  "step": 141
855
  },
856
  {
857
  "epoch": 0.01,
858
+ "learning_rate": 0.0004,
859
+ "loss": 0.4984,
860
  "step": 142
861
  },
862
  {
863
  "epoch": 0.01,
864
+ "learning_rate": 0.0004,
865
+ "loss": 0.5241,
866
  "step": 143
867
  },
868
  {
869
  "epoch": 0.01,
870
+ "learning_rate": 0.0004,
871
+ "loss": 0.4839,
872
  "step": 144
873
  },
874
  {
875
  "epoch": 0.01,
876
+ "learning_rate": 0.0004,
877
+ "loss": 0.2773,
878
  "step": 145
879
  },
880
  {
881
  "epoch": 0.01,
882
+ "learning_rate": 0.0004,
883
+ "loss": 0.5004,
884
  "step": 146
885
  },
886
  {
887
  "epoch": 0.01,
888
+ "learning_rate": 0.0004,
889
+ "loss": 0.3029,
890
  "step": 147
891
  },
892
  {
893
  "epoch": 0.01,
894
+ "learning_rate": 0.0004,
895
+ "loss": 0.9682,
896
  "step": 148
897
  },
898
  {
899
  "epoch": 0.01,
900
+ "learning_rate": 0.0004,
901
+ "loss": 0.3496,
902
  "step": 149
903
  },
904
  {
905
  "epoch": 0.01,
906
+ "learning_rate": 0.0004,
907
+ "loss": 0.462,
908
  "step": 150
909
  },
910
  {
911
  "epoch": 0.01,
912
+ "learning_rate": 0.0004,
913
+ "loss": 0.1464,
914
  "step": 151
915
  },
916
  {
917
  "epoch": 0.01,
918
+ "learning_rate": 0.0004,
919
+ "loss": 0.1177,
920
  "step": 152
921
  },
922
  {
923
  "epoch": 0.01,
924
+ "learning_rate": 0.0004,
925
+ "loss": 0.3903,
926
  "step": 153
927
  },
928
  {
929
  "epoch": 0.01,
930
+ "learning_rate": 0.0004,
931
+ "loss": 0.2373,
932
  "step": 154
933
  },
934
  {
935
  "epoch": 0.01,
936
+ "learning_rate": 0.0004,
937
+ "loss": 0.1732,
938
  "step": 155
939
  },
940
  {
941
  "epoch": 0.01,
942
+ "learning_rate": 0.0004,
943
+ "loss": 0.5158,
944
  "step": 156
945
  },
946
  {
947
  "epoch": 0.01,
948
+ "learning_rate": 0.0004,
949
+ "loss": 0.3224,
950
  "step": 157
951
  },
952
  {
953
  "epoch": 0.01,
954
+ "learning_rate": 0.0004,
955
+ "loss": 0.2082,
956
  "step": 158
957
  },
958
  {
959
  "epoch": 0.01,
960
+ "learning_rate": 0.0004,
961
+ "loss": 0.2307,
962
  "step": 159
963
  },
964
  {
965
  "epoch": 0.01,
966
+ "learning_rate": 0.0004,
967
+ "loss": 0.1758,
968
  "step": 160
969
  },
970
  {
971
  "epoch": 0.01,
972
+ "learning_rate": 0.0004,
973
+ "loss": 0.2339,
974
  "step": 161
975
  },
976
  {
977
  "epoch": 0.01,
978
+ "learning_rate": 0.0004,
979
+ "loss": 0.0613,
980
  "step": 162
981
  },
982
  {
983
  "epoch": 0.01,
984
+ "learning_rate": 0.0004,
985
+ "loss": 0.1142,
986
  "step": 163
987
  },
988
  {
989
  "epoch": 0.01,
990
+ "learning_rate": 0.0004,
991
+ "loss": 0.3177,
992
  "step": 164
993
  },
994
  {
995
  "epoch": 0.01,
996
+ "learning_rate": 0.0004,
997
+ "loss": 0.4358,
998
  "step": 165
999
  },
1000
  {
1001
  "epoch": 0.01,
1002
+ "learning_rate": 0.0004,
1003
+ "loss": 1.3582,
1004
  "step": 166
1005
  },
1006
  {
1007
  "epoch": 0.01,
1008
+ "learning_rate": 0.0004,
1009
+ "loss": 0.5703,
1010
  "step": 167
1011
  },
1012
  {
1013
  "epoch": 0.01,
1014
+ "learning_rate": 0.0004,
1015
+ "loss": 0.3477,
1016
  "step": 168
1017
  },
1018
  {
1019
  "epoch": 0.01,
1020
+ "learning_rate": 0.0004,
1021
+ "loss": 0.4394,
1022
  "step": 169
1023
  },
1024
  {
1025
  "epoch": 0.01,
1026
+ "learning_rate": 0.0004,
1027
+ "loss": 0.3481,
1028
  "step": 170
1029
  },
1030
  {
1031
  "epoch": 0.01,
1032
+ "learning_rate": 0.0004,
1033
+ "loss": 0.1735,
1034
  "step": 171
1035
  },
1036
  {
1037
  "epoch": 0.01,
1038
+ "learning_rate": 0.0004,
1039
+ "loss": 0.0878,
1040
  "step": 172
1041
  },
1042
  {
1043
  "epoch": 0.01,
1044
+ "learning_rate": 0.0004,
1045
+ "loss": 0.0659,
1046
  "step": 173
1047
  },
1048
  {
1049
  "epoch": 0.01,
1050
+ "learning_rate": 0.0004,
1051
+ "loss": 0.3527,
1052
  "step": 174
1053
  },
1054
  {
1055
  "epoch": 0.01,
1056
+ "learning_rate": 0.0004,
1057
+ "loss": 0.1819,
1058
  "step": 175
1059
  },
1060
  {
1061
  "epoch": 0.01,
1062
+ "learning_rate": 0.0004,
1063
+ "loss": 0.379,
1064
  "step": 176
1065
  },
1066
  {
1067
  "epoch": 0.01,
1068
+ "learning_rate": 0.0004,
1069
+ "loss": 0.2146,
1070
  "step": 177
1071
  },
1072
  {
1073
  "epoch": 0.01,
1074
+ "learning_rate": 0.0004,
1075
+ "loss": 0.133,
1076
  "step": 178
1077
  },
1078
  {
1079
  "epoch": 0.01,
1080
+ "learning_rate": 0.0004,
1081
+ "loss": 0.5217,
1082
  "step": 179
1083
  },
1084
  {
1085
  "epoch": 0.01,
1086
+ "learning_rate": 0.0004,
1087
+ "loss": 0.3077,
1088
  "step": 180
1089
  },
1090
  {
1091
  "epoch": 0.01,
1092
+ "learning_rate": 0.0004,
1093
+ "loss": 0.0022,
1094
  "step": 181
1095
  },
1096
  {
1097
  "epoch": 0.01,
1098
+ "learning_rate": 0.0004,
1099
+ "loss": 0.1031,
1100
  "step": 182
1101
  },
1102
  {
1103
  "epoch": 0.01,
1104
+ "learning_rate": 0.0004,
1105
+ "loss": 0.681,
1106
  "step": 183
1107
  },
1108
  {
1109
  "epoch": 0.01,
1110
+ "learning_rate": 0.0004,
1111
+ "loss": 0.7839,
1112
  "step": 184
1113
  },
1114
  {
1115
  "epoch": 0.01,
1116
+ "learning_rate": 0.0004,
1117
+ "loss": 0.6465,
1118
  "step": 185
1119
  },
1120
  {
1121
  "epoch": 0.01,
1122
+ "learning_rate": 0.0004,
1123
+ "loss": 0.2607,
1124
  "step": 186
1125
  },
1126
  {
1127
  "epoch": 0.01,
1128
+ "learning_rate": 0.0004,
1129
+ "loss": 0.7913,
1130
  "step": 187
1131
  },
1132
  {
1133
  "epoch": 0.01,
1134
+ "learning_rate": 0.0004,
1135
+ "loss": 0.4266,
1136
  "step": 188
1137
  },
1138
  {
1139
  "epoch": 0.01,
1140
+ "learning_rate": 0.0004,
1141
+ "loss": 0.2851,
1142
  "step": 189
1143
  },
1144
  {
1145
  "epoch": 0.01,
1146
+ "learning_rate": 0.0004,
1147
+ "loss": 0.6628,
1148
  "step": 190
1149
  },
1150
  {
1151
  "epoch": 0.01,
1152
+ "learning_rate": 0.0004,
1153
+ "loss": 0.8151,
1154
  "step": 191
1155
  },
1156
  {
1157
  "epoch": 0.01,
1158
+ "learning_rate": 0.0004,
1159
+ "loss": 0.3577,
1160
  "step": 192
1161
  },
1162
  {
1163
  "epoch": 0.01,
1164
+ "learning_rate": 0.0004,
1165
+ "loss": 0.4329,
1166
  "step": 193
1167
  },
1168
  {
1169
  "epoch": 0.01,
1170
+ "learning_rate": 0.0004,
1171
+ "loss": 0.1639,
1172
  "step": 194
1173
  },
1174
  {
1175
  "epoch": 0.01,
1176
+ "learning_rate": 0.0004,
1177
+ "loss": 0.1394,
1178
  "step": 195
1179
  },
1180
  {
1181
  "epoch": 0.01,
1182
+ "learning_rate": 0.0004,
1183
+ "loss": 0.3146,
1184
  "step": 196
1185
  },
1186
  {
1187
  "epoch": 0.01,
1188
+ "learning_rate": 0.0004,
1189
+ "loss": 0.2623,
1190
  "step": 197
1191
  },
1192
  {
1193
  "epoch": 0.01,
1194
+ "learning_rate": 0.0004,
1195
+ "loss": 1.3405,
1196
  "step": 198
1197
  },
1198
  {
1199
  "epoch": 0.01,
1200
+ "learning_rate": 0.0004,
1201
+ "loss": 0.6208,
1202
  "step": 199
1203
  },
1204
  {
1205
  "epoch": 0.01,
1206
+ "learning_rate": 0.0004,
1207
+ "loss": 0.7118,
1208
  "step": 200
1209
  },
1210
  {
1211
  "epoch": 0.01,
1212
+ "eval_loss": 0.3938411474227905,
1213
+ "eval_runtime": 219.0899,
1214
+ "eval_samples_per_second": 2.282,
1215
+ "eval_steps_per_second": 1.141,
1216
  "step": 200
1217
  },
1218
  {
1219
  "epoch": 0.01,
1220
+ "mmlu_eval_accuracy": 0.3485764968358423,
1221
+ "mmlu_eval_accuracy_abstract_algebra": 0.36363636363636365,
1222
+ "mmlu_eval_accuracy_anatomy": 0.35714285714285715,
1223
+ "mmlu_eval_accuracy_astronomy": 0.5,
1224
+ "mmlu_eval_accuracy_business_ethics": 0.2727272727272727,
1225
  "mmlu_eval_accuracy_clinical_knowledge": 0.41379310344827586,
1226
+ "mmlu_eval_accuracy_college_biology": 0.4375,
1227
+ "mmlu_eval_accuracy_college_chemistry": 0.25,
1228
+ "mmlu_eval_accuracy_college_computer_science": 0.36363636363636365,
1229
+ "mmlu_eval_accuracy_college_mathematics": 0.36363636363636365,
1230
+ "mmlu_eval_accuracy_college_medicine": 0.4090909090909091,
1231
+ "mmlu_eval_accuracy_college_physics": 0.2727272727272727,
1232
+ "mmlu_eval_accuracy_computer_security": 0.45454545454545453,
1233
+ "mmlu_eval_accuracy_conceptual_physics": 0.34615384615384615,
1234
+ "mmlu_eval_accuracy_econometrics": 0.25,
1235
  "mmlu_eval_accuracy_electrical_engineering": 0.25,
1236
+ "mmlu_eval_accuracy_elementary_mathematics": 0.34146341463414637,
1237
  "mmlu_eval_accuracy_formal_logic": 0.21428571428571427,
1238
  "mmlu_eval_accuracy_global_facts": 0.0,
1239
+ "mmlu_eval_accuracy_high_school_biology": 0.3125,
1240
+ "mmlu_eval_accuracy_high_school_chemistry": 0.18181818181818182,
1241
  "mmlu_eval_accuracy_high_school_computer_science": 0.5555555555555556,
1242
+ "mmlu_eval_accuracy_high_school_european_history": 0.5555555555555556,
1243
+ "mmlu_eval_accuracy_high_school_geography": 0.7272727272727273,
1244
+ "mmlu_eval_accuracy_high_school_government_and_politics": 0.2857142857142857,
1245
+ "mmlu_eval_accuracy_high_school_macroeconomics": 0.5116279069767442,
1246
+ "mmlu_eval_accuracy_high_school_mathematics": 0.20689655172413793,
1247
+ "mmlu_eval_accuracy_high_school_microeconomics": 0.21428571428571427,
1248
+ "mmlu_loss": 0.783768397025764,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1249
  "step": 200
1250
  },
1251
  {
1252
  "epoch": 0.01,
1253
+ "learning_rate": 0.0004,
1254
+ "loss": 0.9429,
1255
  "step": 201
1256
  },
1257
  {
1258
  "epoch": 0.01,
1259
+ "learning_rate": 0.0004,
1260
+ "loss": 0.1329,
1261
  "step": 202
1262
  },
1263
  {
1264
  "epoch": 0.01,
1265
+ "learning_rate": 0.0004,
1266
+ "loss": 0.1037,
1267
  "step": 203
1268
  },
1269
  {
1270
  "epoch": 0.01,
1271
+ "learning_rate": 0.0004,
1272
+ "loss": 0.1569,
1273
  "step": 204
1274
  },
1275
  {
1276
  "epoch": 0.01,
1277
+ "learning_rate": 0.0004,
1278
+ "loss": 0.1435,
1279
  "step": 205
1280
  },
1281
  {
1282
  "epoch": 0.01,
1283
+ "learning_rate": 0.0004,
1284
+ "loss": 0.1809,
1285
  "step": 206
1286
  },
1287
  {
1288
  "epoch": 0.01,
1289
+ "learning_rate": 0.0004,
1290
+ "loss": 0.3113,
1291
  "step": 207
1292
  },
1293
  {
1294
  "epoch": 0.01,
1295
+ "learning_rate": 0.0004,
1296
+ "loss": 0.5093,
1297
  "step": 208
1298
  },
1299
  {
1300
  "epoch": 0.01,
1301
+ "learning_rate": 0.0004,
1302
+ "loss": 0.3999,
1303
  "step": 209
1304
  },
1305
  {
1306
  "epoch": 0.01,
1307
+ "learning_rate": 0.0004,
1308
+ "loss": 0.2678,
1309
  "step": 210
1310
  },
1311
  {
1312
  "epoch": 0.01,
1313
+ "learning_rate": 0.0004,
1314
+ "loss": 0.3196,
1315
  "step": 211
1316
  },
1317
  {
1318
  "epoch": 0.01,
1319
+ "learning_rate": 0.0004,
1320
+ "loss": 0.6502,
1321
  "step": 212
1322
  },
1323
  {
1324
  "epoch": 0.01,
1325
+ "learning_rate": 0.0004,
1326
+ "loss": 0.2296,
1327
  "step": 213
1328
  },
1329
  {
1330
  "epoch": 0.01,
1331
+ "learning_rate": 0.0004,
1332
+ "loss": 0.5807,
1333
  "step": 214
1334
  },
1335
  {
1336
  "epoch": 0.01,
1337
+ "learning_rate": 0.0004,
1338
+ "loss": 0.1532,
1339
  "step": 215
1340
  },
1341
  {
1342
  "epoch": 0.01,
1343
+ "learning_rate": 0.0004,
1344
+ "loss": 0.4757,
1345
  "step": 216
1346
  },
1347
  {
1348
  "epoch": 0.01,
1349
+ "learning_rate": 0.0004,
1350
+ "loss": 0.228,
1351
  "step": 217
1352
  },
1353
  {
1354
  "epoch": 0.01,
1355
+ "learning_rate": 0.0004,
1356
+ "loss": 0.9191,
1357
  "step": 218
1358
  },
1359
  {
1360
  "epoch": 0.01,
1361
+ "learning_rate": 0.0004,
1362
+ "loss": 0.1239,
1363
  "step": 219
1364
  },
1365
  {
1366
  "epoch": 0.01,
1367
+ "learning_rate": 0.0004,
1368
+ "loss": 0.6583,
1369
  "step": 220
1370
  },
1371
  {
1372
  "epoch": 0.01,
1373
+ "learning_rate": 0.0004,
1374
+ "loss": 0.1169,
1375
  "step": 221
1376
  },
1377
  {
1378
  "epoch": 0.01,
1379
+ "learning_rate": 0.0004,
1380
+ "loss": 0.4391,
1381
  "step": 222
1382
  },
1383
  {
1384
  "epoch": 0.01,
1385
+ "learning_rate": 0.0004,
1386
+ "loss": 0.2499,
1387
  "step": 223
1388
  },
1389
  {
1390
  "epoch": 0.01,
1391
+ "learning_rate": 0.0004,
1392
+ "loss": 0.3765,
1393
  "step": 224
1394
  },
1395
  {
1396
  "epoch": 0.01,
1397
+ "learning_rate": 0.0004,
1398
+ "loss": 0.3311,
1399
  "step": 225
1400
  },
1401
  {
1402
  "epoch": 0.01,
1403
+ "learning_rate": 0.0004,
1404
+ "loss": 0.4432,
1405
  "step": 226
1406
  },
1407
  {
1408
  "epoch": 0.01,
1409
+ "learning_rate": 0.0004,
1410
+ "loss": 0.273,
1411
  "step": 227
1412
  },
1413
  {
1414
  "epoch": 0.01,
1415
+ "learning_rate": 0.0004,
1416
+ "loss": 0.0578,
1417
  "step": 228
1418
  },
1419
  {
1420
  "epoch": 0.01,
1421
+ "learning_rate": 0.0004,
1422
+ "loss": 0.7053,
1423
  "step": 229
1424
  },
1425
  {
1426
  "epoch": 0.01,
1427
+ "learning_rate": 0.0004,
1428
+ "loss": 0.724,
1429
  "step": 230
1430
  },
1431
  {
1432
  "epoch": 0.01,
1433
+ "learning_rate": 0.0004,
1434
+ "loss": 0.0928,
1435
  "step": 231
1436
  },
1437
  {
1438
  "epoch": 0.01,
1439
+ "learning_rate": 0.0004,
1440
+ "loss": 0.903,
1441
  "step": 232
1442
  },
1443
  {
1444
  "epoch": 0.01,
1445
+ "learning_rate": 0.0004,
1446
+ "loss": 0.7901,
1447
  "step": 233
1448
  },
1449
  {
1450
  "epoch": 0.01,
1451
+ "learning_rate": 0.0004,
1452
+ "loss": 0.7745,
1453
  "step": 234
1454
  },
1455
  {
1456
  "epoch": 0.01,
1457
+ "learning_rate": 0.0004,
1458
+ "loss": 0.3911,
1459
  "step": 235
1460
  },
1461
  {
1462
  "epoch": 0.01,
1463
+ "learning_rate": 0.0004,
1464
+ "loss": 0.1124,
1465
  "step": 236
1466
  },
1467
  {
1468
  "epoch": 0.01,
1469
+ "learning_rate": 0.0004,
1470
+ "loss": 0.6872,
1471
  "step": 237
1472
  },
1473
  {
1474
  "epoch": 0.01,
1475
+ "learning_rate": 0.0004,
1476
+ "loss": 0.3603,
1477
  "step": 238
1478
  },
1479
  {
1480
  "epoch": 0.01,
1481
+ "learning_rate": 0.0004,
1482
+ "loss": 0.7547,
1483
  "step": 239
1484
  },
1485
  {
1486
  "epoch": 0.01,
1487
+ "learning_rate": 0.0004,
1488
+ "loss": 0.5801,
1489
  "step": 240
1490
  },
1491
  {
1492
  "epoch": 0.01,
1493
+ "learning_rate": 0.0004,
1494
+ "loss": 0.565,
1495
  "step": 241
1496
  },
1497
  {
1498
  "epoch": 0.01,
1499
+ "learning_rate": 0.0004,
1500
+ "loss": 0.5144,
1501
  "step": 242
1502
  },
1503
  {
1504
  "epoch": 0.01,
1505
+ "learning_rate": 0.0004,
1506
+ "loss": 0.7707,
1507
  "step": 243
1508
  },
1509
  {
1510
  "epoch": 0.01,
1511
+ "learning_rate": 0.0004,
1512
+ "loss": 0.3473,
1513
  "step": 244
1514
  },
1515
  {
1516
  "epoch": 0.01,
1517
+ "learning_rate": 0.0004,
1518
+ "loss": 0.7117,
1519
  "step": 245
1520
  },
1521
  {
1522
  "epoch": 0.02,
1523
+ "learning_rate": 0.0004,
1524
+ "loss": 1.2869,
1525
  "step": 246
1526
  },
1527
  {
1528
  "epoch": 0.02,
1529
+ "learning_rate": 0.0004,
1530
+ "loss": 0.835,
1531
  "step": 247
1532
  },
1533
  {
1534
  "epoch": 0.02,
1535
+ "learning_rate": 0.0004,
1536
+ "loss": 1.023,
1537
  "step": 248
1538
  },
1539
  {
1540
  "epoch": 0.02,
1541
+ "learning_rate": 0.0004,
1542
+ "loss": 0.3017,
1543
  "step": 249
1544
  },
1545
  {
1546
  "epoch": 0.02,
1547
+ "learning_rate": 0.0004,
1548
+ "loss": 1.5144,
1549
  "step": 250
1550
  },
1551
  {
1552
  "epoch": 0.02,
1553
+ "learning_rate": 0.0004,
1554
+ "loss": 0.2024,
1555
  "step": 251
1556
  },
1557
  {
1558
  "epoch": 0.02,
1559
+ "learning_rate": 0.0004,
1560
+ "loss": 0.1079,
1561
  "step": 252
1562
  },
1563
  {
1564
  "epoch": 0.02,
1565
+ "learning_rate": 0.0004,
1566
+ "loss": 0.2874,
1567
  "step": 253
1568
  },
1569
  {
1570
  "epoch": 0.02,
1571
+ "learning_rate": 0.0004,
1572
+ "loss": 0.0762,
1573
  "step": 254
1574
  },
1575
  {
1576
  "epoch": 0.02,
1577
+ "learning_rate": 0.0004,
1578
+ "loss": 0.0513,
1579
  "step": 255
1580
  },
1581
  {
1582
  "epoch": 0.02,
1583
+ "learning_rate": 0.0004,
1584
+ "loss": 0.6924,
1585
  "step": 256
1586
  },
1587
  {
1588
  "epoch": 0.02,
1589
+ "learning_rate": 0.0004,
1590
+ "loss": 0.3838,
1591
  "step": 257
1592
  },
1593
  {
1594
  "epoch": 0.02,
1595
+ "learning_rate": 0.0004,
1596
+ "loss": 0.2467,
1597
  "step": 258
1598
  },
1599
  {
1600
  "epoch": 0.02,
1601
+ "learning_rate": 0.0004,
1602
+ "loss": 0.3778,
1603
  "step": 259
1604
  },
1605
  {
1606
  "epoch": 0.02,
1607
+ "learning_rate": 0.0004,
1608
+ "loss": 0.1238,
1609
  "step": 260
1610
  },
1611
  {
1612
  "epoch": 0.02,
1613
+ "learning_rate": 0.0004,
1614
+ "loss": 0.336,
1615
  "step": 261
1616
  },
1617
  {
1618
  "epoch": 0.02,
1619
+ "learning_rate": 0.0004,
1620
+ "loss": 1.1311,
1621
  "step": 262
1622
  },
1623
  {
1624
  "epoch": 0.02,
1625
+ "learning_rate": 0.0004,
1626
+ "loss": 0.3756,
1627
  "step": 263
1628
  },
1629
  {
1630
  "epoch": 0.02,
1631
+ "learning_rate": 0.0004,
1632
+ "loss": 0.2104,
1633
  "step": 264
1634
  },
1635
  {
1636
  "epoch": 0.02,
1637
+ "learning_rate": 0.0004,
1638
+ "loss": 0.2373,
1639
  "step": 265
1640
  },
1641
  {
1642
  "epoch": 0.02,
1643
+ "learning_rate": 0.0004,
1644
+ "loss": 0.3603,
1645
  "step": 266
1646
  },
1647
  {
1648
  "epoch": 0.02,
1649
+ "learning_rate": 0.0004,
1650
+ "loss": 0.1986,
1651
  "step": 267
1652
  },
1653
  {
1654
  "epoch": 0.02,
1655
+ "learning_rate": 0.0004,
1656
+ "loss": 0.1257,
1657
  "step": 268
1658
  },
1659
  {
1660
  "epoch": 0.02,
1661
+ "learning_rate": 0.0004,
1662
+ "loss": 0.7065,
1663
  "step": 269
1664
  },
1665
  {
1666
  "epoch": 0.02,
1667
+ "learning_rate": 0.0004,
1668
+ "loss": 0.148,
1669
  "step": 270
1670
  },
1671
  {
1672
  "epoch": 0.02,
1673
+ "learning_rate": 0.0004,
1674
+ "loss": 0.0937,
1675
  "step": 271
1676
  },
1677
  {
1678
  "epoch": 0.02,
1679
+ "learning_rate": 0.0004,
1680
+ "loss": 0.4194,
1681
  "step": 272
1682
  },
1683
  {
1684
  "epoch": 0.02,
1685
+ "learning_rate": 0.0004,
1686
+ "loss": 0.5164,
1687
  "step": 273
1688
  },
1689
  {
1690
  "epoch": 0.02,
1691
+ "learning_rate": 0.0004,
1692
+ "loss": 0.1585,
1693
  "step": 274
1694
  },
1695
  {
1696
  "epoch": 0.02,
1697
+ "learning_rate": 0.0004,
1698
+ "loss": 0.4005,
1699
  "step": 275
1700
  },
1701
  {
1702
  "epoch": 0.02,
1703
+ "learning_rate": 0.0004,
1704
+ "loss": 0.2115,
1705
  "step": 276
1706
  },
1707
  {
1708
  "epoch": 0.02,
1709
+ "learning_rate": 0.0004,
1710
+ "loss": 0.0974,
1711
  "step": 277
1712
  },
1713
  {
1714
  "epoch": 0.02,
1715
+ "learning_rate": 0.0004,
1716
+ "loss": 0.2902,
1717
  "step": 278
1718
  },
1719
  {
1720
  "epoch": 0.02,
1721
+ "learning_rate": 0.0004,
1722
+ "loss": 0.5665,
1723
  "step": 279
1724
  },
1725
  {
1726
  "epoch": 0.02,
1727
+ "learning_rate": 0.0004,
1728
+ "loss": 0.4776,
1729
  "step": 280
1730
  },
1731
  {
1732
  "epoch": 0.02,
1733
+ "learning_rate": 0.0004,
1734
+ "loss": 0.4144,
1735
  "step": 281
1736
  },
1737
  {
1738
  "epoch": 0.02,
1739
+ "learning_rate": 0.0004,
1740
+ "loss": 0.2455,
1741
  "step": 282
1742
  },
1743
  {
1744
  "epoch": 0.02,
1745
+ "learning_rate": 0.0004,
1746
+ "loss": 0.1699,
1747
  "step": 283
1748
  },
1749
  {
1750
  "epoch": 0.02,
1751
+ "learning_rate": 0.0004,
1752
+ "loss": 0.1421,
1753
  "step": 284
1754
  },
1755
  {
1756
  "epoch": 0.02,
1757
+ "learning_rate": 0.0004,
1758
+ "loss": 0.1625,
1759
  "step": 285
1760
  },
1761
  {
1762
  "epoch": 0.02,
1763
+ "learning_rate": 0.0004,
1764
+ "loss": 0.1451,
1765
  "step": 286
1766
  },
1767
  {
1768
  "epoch": 0.02,
1769
+ "learning_rate": 0.0004,
1770
+ "loss": 0.2915,
1771
  "step": 287
1772
  },
1773
  {
1774
  "epoch": 0.02,
1775
+ "learning_rate": 0.0004,
1776
+ "loss": 0.5279,
1777
  "step": 288
1778
  },
1779
  {
1780
  "epoch": 0.02,
1781
+ "learning_rate": 0.0004,
1782
+ "loss": 0.3336,
1783
  "step": 289
1784
  },
1785
  {
1786
  "epoch": 0.02,
1787
+ "learning_rate": 0.0004,
1788
+ "loss": 0.2865,
1789
  "step": 290
1790
  },
1791
  {
1792
  "epoch": 0.02,
1793
+ "learning_rate": 0.0004,
1794
+ "loss": 0.3335,
1795
  "step": 291
1796
  },
1797
  {
1798
  "epoch": 0.02,
1799
+ "learning_rate": 0.0004,
1800
+ "loss": 0.5623,
1801
  "step": 292
1802
  },
1803
  {
1804
  "epoch": 0.02,
1805
+ "learning_rate": 0.0004,
1806
+ "loss": 0.4999,
1807
  "step": 293
1808
  },
1809
  {
1810
  "epoch": 0.02,
1811
+ "learning_rate": 0.0004,
1812
+ "loss": 0.3863,
1813
  "step": 294
1814
  },
1815
  {
1816
  "epoch": 0.02,
1817
+ "learning_rate": 0.0004,
1818
+ "loss": 0.8553,
1819
  "step": 295
1820
  },
1821
  {
1822
  "epoch": 0.02,
1823
+ "learning_rate": 0.0004,
1824
+ "loss": 0.5907,
1825
  "step": 296
1826
  },
1827
  {
1828
  "epoch": 0.02,
1829
+ "learning_rate": 0.0004,
1830
+ "loss": 0.411,
1831
  "step": 297
1832
  },
1833
  {
1834
  "epoch": 0.02,
1835
+ "learning_rate": 0.0004,
1836
+ "loss": 0.8062,
1837
  "step": 298
1838
  },
1839
  {
1840
  "epoch": 0.02,
1841
+ "learning_rate": 0.0004,
1842
+ "loss": 0.6118,
1843
  "step": 299
1844
  },
1845
  {
1846
  "epoch": 0.02,
1847
+ "learning_rate": 0.0004,
1848
+ "loss": 0.9868,
1849
  "step": 300
1850
  },
1851
  {
1852
  "epoch": 0.02,
1853
+ "learning_rate": 0.0004,
1854
+ "loss": 1.4436,
1855
  "step": 301
1856
  },
1857
  {
1858
  "epoch": 0.02,
1859
+ "learning_rate": 0.0004,
1860
+ "loss": 0.0785,
1861
  "step": 302
1862
  },
1863
  {
1864
  "epoch": 0.02,
1865
+ "learning_rate": 0.0004,
1866
+ "loss": 0.0027,
1867
  "step": 303
1868
  },
1869
  {
1870
  "epoch": 0.02,
1871
+ "learning_rate": 0.0004,
1872
+ "loss": 0.0584,
1873
  "step": 304
1874
  },
1875
  {
1876
  "epoch": 0.02,
1877
+ "learning_rate": 0.0004,
1878
+ "loss": 0.4422,
1879
  "step": 305
1880
  },
1881
  {
1882
  "epoch": 0.02,
1883
+ "learning_rate": 0.0004,
1884
+ "loss": 0.4163,
1885
  "step": 306
1886
  },
1887
  {
1888
  "epoch": 0.02,
1889
+ "learning_rate": 0.0004,
1890
+ "loss": 0.0861,
1891
  "step": 307
1892
  },
1893
  {
1894
  "epoch": 0.02,
1895
+ "learning_rate": 0.0004,
1896
+ "loss": 0.1849,
1897
  "step": 308
1898
  },
1899
  {
1900
  "epoch": 0.02,
1901
+ "learning_rate": 0.0004,
1902
+ "loss": 0.2862,
1903
  "step": 309
1904
  },
1905
  {
1906
  "epoch": 0.02,
1907
+ "learning_rate": 0.0004,
1908
+ "loss": 0.1654,
1909
  "step": 310
1910
  },
1911
  {
1912
  "epoch": 0.02,
1913
+ "learning_rate": 0.0004,
1914
+ "loss": 0.0498,
1915
  "step": 311
1916
  },
1917
  {
1918
  "epoch": 0.02,
1919
+ "learning_rate": 0.0004,
1920
+ "loss": 0.2044,
1921
  "step": 312
1922
  },
1923
  {
1924
  "epoch": 0.02,
1925
+ "learning_rate": 0.0004,
1926
+ "loss": 0.1615,
1927
  "step": 313
1928
  },
1929
  {
1930
  "epoch": 0.02,
1931
+ "learning_rate": 0.0004,
1932
+ "loss": 0.3443,
1933
  "step": 314
1934
  },
1935
  {
1936
  "epoch": 0.02,
1937
+ "learning_rate": 0.0004,
1938
+ "loss": 0.1133,
1939
  "step": 315
1940
  },
1941
  {
1942
  "epoch": 0.02,
1943
+ "learning_rate": 0.0004,
1944
+ "loss": 0.9935,
1945
  "step": 316
1946
  },
1947
  {
1948
  "epoch": 0.02,
1949
+ "learning_rate": 0.0004,
1950
+ "loss": 0.0738,
1951
  "step": 317
1952
  },
1953
  {
1954
  "epoch": 0.02,
1955
+ "learning_rate": 0.0004,
1956
+ "loss": 0.7676,
1957
  "step": 318
1958
  },
1959
  {
1960
  "epoch": 0.02,
1961
+ "learning_rate": 0.0004,
1962
+ "loss": 0.5102,
1963
  "step": 319
1964
  },
1965
  {
1966
  "epoch": 0.02,
1967
+ "learning_rate": 0.0004,
1968
+ "loss": 0.3776,
1969
  "step": 320
1970
  },
1971
  {
1972
  "epoch": 0.02,
1973
+ "learning_rate": 0.0004,
1974
+ "loss": 0.6596,
1975
  "step": 321
1976
  },
1977
  {
1978
  "epoch": 0.02,
1979
+ "learning_rate": 0.0004,
1980
+ "loss": 0.3674,
1981
  "step": 322
1982
  },
1983
  {
1984
  "epoch": 0.02,
1985
+ "learning_rate": 0.0004,
1986
+ "loss": 0.2252,
1987
  "step": 323
1988
  },
1989
  {
1990
  "epoch": 0.02,
1991
+ "learning_rate": 0.0004,
1992
+ "loss": 0.2687,
1993
  "step": 324
1994
  },
1995
  {
1996
  "epoch": 0.02,
1997
+ "learning_rate": 0.0004,
1998
+ "loss": 0.4756,
1999
  "step": 325
2000
  },
2001
  {
2002
  "epoch": 0.02,
2003
+ "learning_rate": 0.0004,
2004
+ "loss": 0.1923,
2005
  "step": 326
2006
  },
2007
  {
2008
  "epoch": 0.02,
2009
+ "learning_rate": 0.0004,
2010
+ "loss": 0.4282,
2011
  "step": 327
2012
  },
2013
  {
2014
  "epoch": 0.02,
2015
+ "learning_rate": 0.0004,
2016
+ "loss": 0.0104,
2017
  "step": 328
2018
  },
2019
  {
2020
  "epoch": 0.02,
2021
+ "learning_rate": 0.0004,
2022
+ "loss": 0.1326,
2023
  "step": 329
2024
  },
2025
  {
2026
  "epoch": 0.02,
2027
+ "learning_rate": 0.0004,
2028
+ "loss": 0.9252,
2029
  "step": 330
2030
  },
2031
  {
2032
  "epoch": 0.02,
2033
+ "learning_rate": 0.0004,
2034
+ "loss": 0.5622,
2035
  "step": 331
2036
  },
2037
  {
2038
  "epoch": 0.02,
2039
+ "learning_rate": 0.0004,
2040
+ "loss": 0.6532,
2041
  "step": 332
2042
  },
2043
  {
2044
  "epoch": 0.02,
2045
+ "learning_rate": 0.0004,
2046
+ "loss": 0.1987,
2047
  "step": 333
2048
  },
2049
  {
2050
  "epoch": 0.02,
2051
+ "learning_rate": 0.0004,
2052
+ "loss": 0.3073,
2053
  "step": 334
2054
  },
2055
  {
2056
  "epoch": 0.02,
2057
+ "learning_rate": 0.0004,
2058
+ "loss": 0.7015,
2059
  "step": 335
2060
  },
2061
  {
2062
  "epoch": 0.02,
2063
+ "learning_rate": 0.0004,
2064
+ "loss": 0.4761,
2065
  "step": 336
2066
  },
2067
  {
2068
  "epoch": 0.02,
2069
+ "learning_rate": 0.0004,
2070
+ "loss": 0.4269,
2071
  "step": 337
2072
  },
2073
  {
2074
  "epoch": 0.02,
2075
+ "learning_rate": 0.0004,
2076
+ "loss": 0.988,
2077
  "step": 338
2078
  },
2079
  {
2080
  "epoch": 0.02,
2081
+ "learning_rate": 0.0004,
2082
+ "loss": 0.3558,
2083
  "step": 339
2084
  },
2085
  {
2086
  "epoch": 0.02,
2087
+ "learning_rate": 0.0004,
2088
+ "loss": 0.1494,
2089
  "step": 340
2090
  },
2091
  {
2092
  "epoch": 0.02,
2093
+ "learning_rate": 0.0004,
2094
+ "loss": 1.1602,
2095
  "step": 341
2096
  },
2097
  {
2098
  "epoch": 0.02,
2099
+ "learning_rate": 0.0004,
2100
+ "loss": 0.8131,
2101
  "step": 342
2102
  },
2103
  {
2104
  "epoch": 0.02,
2105
+ "learning_rate": 0.0004,
2106
+ "loss": 0.1298,
2107
  "step": 343
2108
  },
2109
  {
2110
  "epoch": 0.02,
2111
+ "learning_rate": 0.0004,
2112
+ "loss": 0.3267,
2113
  "step": 344
2114
  },
2115
  {
2116
  "epoch": 0.02,
2117
+ "learning_rate": 0.0004,
2118
+ "loss": 0.9548,
2119
  "step": 345
2120
  },
2121
  {
2122
  "epoch": 0.02,
2123
+ "learning_rate": 0.0004,
2124
+ "loss": 0.62,
2125
  "step": 346
2126
  },
2127
  {
2128
  "epoch": 0.02,
2129
+ "learning_rate": 0.0004,
2130
+ "loss": 0.9897,
2131
  "step": 347
2132
  },
2133
  {
2134
  "epoch": 0.02,
2135
+ "learning_rate": 0.0004,
2136
+ "loss": 0.546,
2137
  "step": 348
2138
  },
2139
  {
2140
  "epoch": 0.02,
2141
+ "learning_rate": 0.0004,
2142
+ "loss": 0.8286,
2143
  "step": 349
2144
  },
2145
  {
2146
  "epoch": 0.02,
2147
+ "learning_rate": 0.0004,
2148
+ "loss": 1.0614,
2149
  "step": 350
2150
  },
2151
  {
2152
  "epoch": 0.02,
2153
+ "learning_rate": 0.0004,
2154
+ "loss": 0.0516,
2155
  "step": 351
2156
  },
2157
  {
2158
  "epoch": 0.02,
2159
+ "learning_rate": 0.0004,
2160
+ "loss": 0.4044,
2161
  "step": 352
2162
  },
2163
  {
2164
  "epoch": 0.02,
2165
+ "learning_rate": 0.0004,
2166
+ "loss": 0.1558,
2167
  "step": 353
2168
  },
2169
  {
2170
  "epoch": 0.02,
2171
+ "learning_rate": 0.0004,
2172
+ "loss": 0.149,
2173
  "step": 354
2174
  },
2175
  {
2176
  "epoch": 0.02,
2177
+ "learning_rate": 0.0004,
2178
+ "loss": 0.2533,
2179
  "step": 355
2180
  },
2181
  {
2182
  "epoch": 0.02,
2183
+ "learning_rate": 0.0004,
2184
+ "loss": 0.4241,
2185
  "step": 356
2186
  },
2187
  {
2188
  "epoch": 0.02,
2189
+ "learning_rate": 0.0004,
2190
+ "loss": 0.0167,
2191
  "step": 357
2192
  },
2193
  {
2194
  "epoch": 0.02,
2195
+ "learning_rate": 0.0004,
2196
+ "loss": 0.2235,
2197
  "step": 358
2198
  },
2199
  {
2200
  "epoch": 0.02,
2201
+ "learning_rate": 0.0004,
2202
+ "loss": 0.0712,
2203
  "step": 359
2204
  },
2205
  {
2206
  "epoch": 0.02,
2207
+ "learning_rate": 0.0004,
2208
+ "loss": 0.4779,
2209
  "step": 360
2210
  },
2211
  {
2212
  "epoch": 0.02,
2213
+ "learning_rate": 0.0004,
2214
+ "loss": 0.0887,
2215
  "step": 361
2216
  },
2217
  {
2218
  "epoch": 0.02,
2219
+ "learning_rate": 0.0004,
2220
+ "loss": 0.0534,
2221
  "step": 362
2222
  },
2223
  {
2224
  "epoch": 0.02,
2225
+ "learning_rate": 0.0004,
2226
+ "loss": 0.1561,
2227
  "step": 363
2228
  },
2229
  {
2230
  "epoch": 0.02,
2231
+ "learning_rate": 0.0004,
2232
+ "loss": 0.2777,
2233
  "step": 364
2234
  },
2235
  {
2236
  "epoch": 0.02,
2237
+ "learning_rate": 0.0004,
2238
+ "loss": 0.202,
2239
  "step": 365
2240
  },
2241
  {
2242
  "epoch": 0.02,
2243
+ "learning_rate": 0.0004,
2244
+ "loss": 0.002,
2245
  "step": 366
2246
  },
2247
  {
2248
  "epoch": 0.02,
2249
+ "learning_rate": 0.0004,
2250
+ "loss": 0.4045,
2251
  "step": 367
2252
  },
2253
  {
2254
  "epoch": 0.02,
2255
+ "learning_rate": 0.0004,
2256
+ "loss": 0.6756,
2257
  "step": 368
2258
  },
2259
  {
2260
  "epoch": 0.02,
2261
+ "learning_rate": 0.0004,
2262
+ "loss": 0.2138,
2263
  "step": 369
2264
  },
2265
  {
2266
  "epoch": 0.02,
2267
+ "learning_rate": 0.0004,
2268
+ "loss": 0.6403,
2269
  "step": 370
2270
  },
2271
  {
2272
  "epoch": 0.02,
2273
+ "learning_rate": 0.0004,
2274
+ "loss": 0.0688,
2275
  "step": 371
2276
  },
2277
  {
2278
  "epoch": 0.02,
2279
+ "learning_rate": 0.0004,
2280
+ "loss": 0.2767,
2281
  "step": 372
2282
  },
2283
  {
2284
  "epoch": 0.02,
2285
+ "learning_rate": 0.0004,
2286
+ "loss": 1.0905,
2287
  "step": 373
2288
  },
2289
  {
2290
  "epoch": 0.02,
2291
+ "learning_rate": 0.0004,
2292
+ "loss": 0.1871,
2293
  "step": 374
2294
  },
2295
  {
2296
  "epoch": 0.02,
2297
+ "learning_rate": 0.0004,
2298
+ "loss": 0.2342,
2299
  "step": 375
2300
  },
2301
  {
2302
  "epoch": 0.02,
2303
+ "learning_rate": 0.0004,
2304
+ "loss": 0.559,
2305
  "step": 376
2306
  },
2307
  {
2308
  "epoch": 0.02,
2309
+ "learning_rate": 0.0004,
2310
+ "loss": 0.1791,
2311
  "step": 377
2312
  },
2313
  {
2314
  "epoch": 0.02,
2315
+ "learning_rate": 0.0004,
2316
+ "loss": 0.311,
2317
  "step": 378
2318
  },
2319
  {
2320
  "epoch": 0.02,
2321
+ "learning_rate": 0.0004,
2322
+ "loss": 0.39,
2323
  "step": 379
2324
  },
2325
  {
2326
  "epoch": 0.02,
2327
+ "learning_rate": 0.0004,
2328
+ "loss": 0.7051,
2329
  "step": 380
2330
  },
2331
  {
2332
  "epoch": 0.02,
2333
+ "learning_rate": 0.0004,
2334
+ "loss": 0.2912,
2335
  "step": 381
2336
  },
2337
  {
2338
  "epoch": 0.02,
2339
+ "learning_rate": 0.0004,
2340
+ "loss": 0.1762,
2341
  "step": 382
2342
  },
2343
  {
2344
  "epoch": 0.02,
2345
+ "learning_rate": 0.0004,
2346
+ "loss": 0.5241,
2347
  "step": 383
2348
  },
2349
  {
2350
  "epoch": 0.02,
2351
+ "learning_rate": 0.0004,
2352
+ "loss": 0.5655,
2353
  "step": 384
2354
  },
2355
  {
2356
  "epoch": 0.02,
2357
+ "learning_rate": 0.0004,
2358
+ "loss": 0.1796,
2359
  "step": 385
2360
  },
2361
  {
2362
  "epoch": 0.02,
2363
+ "learning_rate": 0.0004,
2364
+ "loss": 0.9497,
2365
  "step": 386
2366
  },
2367
  {
2368
  "epoch": 0.02,
2369
+ "learning_rate": 0.0004,
2370
+ "loss": 0.5626,
2371
  "step": 387
2372
  },
2373
  {
2374
  "epoch": 0.02,
2375
+ "learning_rate": 0.0004,
2376
+ "loss": 0.3157,
2377
  "step": 388
2378
  },
2379
  {
2380
  "epoch": 0.02,
2381
+ "learning_rate": 0.0004,
2382
+ "loss": 0.5506,
2383
  "step": 389
2384
  },
2385
  {
2386
  "epoch": 0.02,
2387
+ "learning_rate": 0.0004,
2388
+ "loss": 0.7158,
2389
  "step": 390
2390
  },
2391
  {
2392
  "epoch": 0.02,
2393
+ "learning_rate": 0.0004,
2394
+ "loss": 0.7723,
2395
  "step": 391
2396
  },
2397
  {
2398
  "epoch": 0.02,
2399
+ "learning_rate": 0.0004,
2400
+ "loss": 0.3411,
2401
  "step": 392
2402
  },
2403
  {
2404
  "epoch": 0.02,
2405
+ "learning_rate": 0.0004,
2406
+ "loss": 0.3612,
2407
  "step": 393
2408
  },
2409
  {
2410
  "epoch": 0.02,
2411
+ "learning_rate": 0.0004,
2412
+ "loss": 0.9484,
2413
  "step": 394
2414
  },
2415
  {
2416
  "epoch": 0.02,
2417
+ "learning_rate": 0.0004,
2418
+ "loss": 0.7988,
2419
  "step": 395
2420
  },
2421
  {
2422
  "epoch": 0.02,
2423
+ "learning_rate": 0.0004,
2424
+ "loss": 0.6859,
2425
  "step": 396
2426
  },
2427
  {
2428
  "epoch": 0.02,
2429
+ "learning_rate": 0.0004,
2430
+ "loss": 0.1707,
2431
  "step": 397
2432
  },
2433
  {
2434
  "epoch": 0.02,
2435
+ "learning_rate": 0.0004,
2436
+ "loss": 0.5727,
2437
  "step": 398
2438
  },
2439
  {
2440
  "epoch": 0.02,
2441
+ "learning_rate": 0.0004,
2442
+ "loss": 0.7488,
2443
  "step": 399
2444
  },
2445
  {
2446
  "epoch": 0.02,
2447
+ "learning_rate": 0.0004,
2448
+ "loss": 1.3428,
2449
  "step": 400
2450
  },
2451
  {
2452
  "epoch": 0.02,
2453
+ "eval_loss": 0.36855557560920715,
2454
+ "eval_runtime": 218.1575,
2455
+ "eval_samples_per_second": 2.292,
2456
+ "eval_steps_per_second": 1.146,
2457
  "step": 400
2458
  },
2459
  {
2460
  "epoch": 0.02,
2461
+ "mmlu_eval_accuracy": 0.37785769008166176,
2462
+ "mmlu_eval_accuracy_abstract_algebra": 0.2727272727272727,
2463
  "mmlu_eval_accuracy_anatomy": 0.5,
2464
  "mmlu_eval_accuracy_astronomy": 0.375,
2465
+ "mmlu_eval_accuracy_business_ethics": 0.45454545454545453,
2466
  "mmlu_eval_accuracy_clinical_knowledge": 0.4827586206896552,
2467
+ "mmlu_eval_accuracy_college_biology": 0.25,
2468
  "mmlu_eval_accuracy_college_chemistry": 0.125,
2469
  "mmlu_eval_accuracy_college_computer_science": 0.2727272727272727,
2470
+ "mmlu_eval_accuracy_college_mathematics": 0.18181818181818182,
2471
+ "mmlu_eval_accuracy_college_medicine": 0.5909090909090909,
2472
+ "mmlu_eval_accuracy_college_physics": 0.5454545454545454,
2473
+ "mmlu_eval_accuracy_computer_security": 0.45454545454545453,
2474
+ "mmlu_eval_accuracy_conceptual_physics": 0.4230769230769231,
2475
+ "mmlu_eval_accuracy_econometrics": 0.25,
2476
+ "mmlu_eval_accuracy_electrical_engineering": 0.1875,
2477
+ "mmlu_eval_accuracy_elementary_mathematics": 0.34146341463414637,
2478
+ "mmlu_eval_accuracy_formal_logic": 0.14285714285714285,
2479
+ "mmlu_eval_accuracy_global_facts": 0.3,
2480
+ "mmlu_eval_accuracy_high_school_biology": 0.4375,
2481
  "mmlu_eval_accuracy_high_school_chemistry": 0.22727272727272727,
2482
+ "mmlu_eval_accuracy_high_school_computer_science": 0.6666666666666666,
2483
+ "mmlu_eval_accuracy_high_school_european_history": 0.5,
2484
+ "mmlu_eval_accuracy_high_school_geography": 0.7727272727272727,
2485
+ "mmlu_eval_accuracy_high_school_government_and_politics": 0.47619047619047616,
2486
+ "mmlu_eval_accuracy_high_school_macroeconomics": 0.4418604651162791,
2487
+ "mmlu_eval_accuracy_high_school_mathematics": 0.1724137931034483,
2488
+ "mmlu_eval_accuracy_high_school_microeconomics": 0.35714285714285715,
2489
+ "mmlu_loss": 0.7700759556740523,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2490
  "step": 400
2491
  }
2492
  ],
2493
  "max_steps": 5000,
2494
  "num_train_epochs": 1,
2495
+ "total_flos": 5.810756599893197e+16,
2496
  "trial_name": null,
2497
  "trial_params": null
2498
  }
checkpoint-400/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fd28a065deb906dd6787d5be775d7f7fef1c3352a93f2dc2266d20467a05b48d
3
  size 6011
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6221336348c810e346236bf80a362d1c36330d016829c5789d6e4b72e63969b6
3
  size 6011
tokenizer_config.json CHANGED
@@ -23,7 +23,6 @@
23
  "pad_token": null,
24
  "padding_side": "right",
25
  "sp_model_kwargs": {},
26
- "spaces_between_special_tokens": false,
27
  "tokenizer_class": "LlamaTokenizer",
28
  "unk_token": {
29
  "__type": "AddedToken",
@@ -32,6 +31,5 @@
32
  "normalized": true,
33
  "rstrip": false,
34
  "single_word": false
35
- },
36
- "use_default_system_prompt": true
37
  }
 
23
  "pad_token": null,
24
  "padding_side": "right",
25
  "sp_model_kwargs": {},
 
26
  "tokenizer_class": "LlamaTokenizer",
27
  "unk_token": {
28
  "__type": "AddedToken",
 
31
  "normalized": true,
32
  "rstrip": false,
33
  "single_word": false
34
+ }
 
35
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fd28a065deb906dd6787d5be775d7f7fef1c3352a93f2dc2266d20467a05b48d
3
  size 6011
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6221336348c810e346236bf80a362d1c36330d016829c5789d6e4b72e63969b6
3
  size 6011