|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 100, |
|
"global_step": 2230, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0004484304932735426, |
|
"grad_norm": 4.696539476451585, |
|
"learning_rate": 1.3452914798206278e-08, |
|
"loss": 0.9912, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.004484304932735426, |
|
"grad_norm": 5.089904667658368, |
|
"learning_rate": 1.345291479820628e-07, |
|
"loss": 1.0341, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.008968609865470852, |
|
"grad_norm": 5.546828630388097, |
|
"learning_rate": 2.690582959641256e-07, |
|
"loss": 1.0502, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.013452914798206279, |
|
"grad_norm": 4.113849381101499, |
|
"learning_rate": 4.0358744394618834e-07, |
|
"loss": 1.0386, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.017937219730941704, |
|
"grad_norm": 3.6548963622814887, |
|
"learning_rate": 5.381165919282512e-07, |
|
"loss": 1.0282, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.02242152466367713, |
|
"grad_norm": 2.157564670206396, |
|
"learning_rate": 6.72645739910314e-07, |
|
"loss": 0.9574, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.026905829596412557, |
|
"grad_norm": 2.0184475272019555, |
|
"learning_rate": 8.071748878923767e-07, |
|
"loss": 0.9263, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.03139013452914798, |
|
"grad_norm": 1.7894937443172652, |
|
"learning_rate": 9.417040358744395e-07, |
|
"loss": 0.9253, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.03587443946188341, |
|
"grad_norm": 1.6533764414432808, |
|
"learning_rate": 1.0762331838565023e-06, |
|
"loss": 0.9106, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.04035874439461883, |
|
"grad_norm": 1.9561381307359194, |
|
"learning_rate": 1.2107623318385651e-06, |
|
"loss": 0.8713, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.04484304932735426, |
|
"grad_norm": 1.5478472557018526, |
|
"learning_rate": 1.345291479820628e-06, |
|
"loss": 0.8741, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.04484304932735426, |
|
"eval_loss": 0.8599640727043152, |
|
"eval_runtime": 430.7233, |
|
"eval_samples_per_second": 116.263, |
|
"eval_steps_per_second": 1.818, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.04932735426008968, |
|
"grad_norm": 1.5759592930264636, |
|
"learning_rate": 1.4798206278026905e-06, |
|
"loss": 0.8381, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.053811659192825115, |
|
"grad_norm": 1.5446577353242628, |
|
"learning_rate": 1.6143497757847533e-06, |
|
"loss": 0.8151, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.05829596412556054, |
|
"grad_norm": 1.6899841974229757, |
|
"learning_rate": 1.7488789237668162e-06, |
|
"loss": 0.8309, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.06278026905829596, |
|
"grad_norm": 1.6274283098945213, |
|
"learning_rate": 1.883408071748879e-06, |
|
"loss": 0.8509, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.06726457399103139, |
|
"grad_norm": 1.7690619100525546, |
|
"learning_rate": 2.0179372197309418e-06, |
|
"loss": 0.8057, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.07174887892376682, |
|
"grad_norm": 1.866473004768342, |
|
"learning_rate": 2.1524663677130046e-06, |
|
"loss": 0.8236, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.07623318385650224, |
|
"grad_norm": 1.5528009019380091, |
|
"learning_rate": 2.2869955156950674e-06, |
|
"loss": 0.7936, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.08071748878923767, |
|
"grad_norm": 1.8924349879943885, |
|
"learning_rate": 2.4215246636771302e-06, |
|
"loss": 0.8054, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.08520179372197309, |
|
"grad_norm": 1.5998254884542162, |
|
"learning_rate": 2.556053811659193e-06, |
|
"loss": 0.7971, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.08968609865470852, |
|
"grad_norm": 1.553085624058612, |
|
"learning_rate": 2.690582959641256e-06, |
|
"loss": 0.8038, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.08968609865470852, |
|
"eval_loss": 0.8094644546508789, |
|
"eval_runtime": 412.1717, |
|
"eval_samples_per_second": 121.495, |
|
"eval_steps_per_second": 1.9, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.09417040358744394, |
|
"grad_norm": 1.6621378080442881, |
|
"learning_rate": 2.8251121076233187e-06, |
|
"loss": 0.7815, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.09865470852017937, |
|
"grad_norm": 1.5875832641605891, |
|
"learning_rate": 2.959641255605381e-06, |
|
"loss": 0.8088, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.1031390134529148, |
|
"grad_norm": 1.6006597094640902, |
|
"learning_rate": 2.99990995533251e-06, |
|
"loss": 0.8141, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.10762331838565023, |
|
"grad_norm": 1.7932554350094232, |
|
"learning_rate": 2.9994689462512194e-06, |
|
"loss": 0.7834, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.11210762331838565, |
|
"grad_norm": 1.6444723214299724, |
|
"learning_rate": 2.998660541859271e-06, |
|
"loss": 0.7797, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.11659192825112108, |
|
"grad_norm": 1.790145213655978, |
|
"learning_rate": 2.9974849402294452e-06, |
|
"loss": 0.8046, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.1210762331838565, |
|
"grad_norm": 1.8694283184605, |
|
"learning_rate": 2.9959424294040703e-06, |
|
"loss": 0.7802, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.12556053811659193, |
|
"grad_norm": 1.6030839509233756, |
|
"learning_rate": 2.9940333873244464e-06, |
|
"loss": 0.8032, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.13004484304932734, |
|
"grad_norm": 1.664910362160235, |
|
"learning_rate": 2.991758281738245e-06, |
|
"loss": 0.7802, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.13452914798206278, |
|
"grad_norm": 1.6726792291262853, |
|
"learning_rate": 2.989117670084902e-06, |
|
"loss": 0.7937, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.13452914798206278, |
|
"eval_loss": 0.7789004445075989, |
|
"eval_runtime": 410.6605, |
|
"eval_samples_per_second": 121.943, |
|
"eval_steps_per_second": 1.907, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.13901345291479822, |
|
"grad_norm": 1.4685211047526556, |
|
"learning_rate": 2.986112199359036e-06, |
|
"loss": 0.7486, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.14349775784753363, |
|
"grad_norm": 2.0076694355781575, |
|
"learning_rate": 2.9827426059519237e-06, |
|
"loss": 0.808, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.14798206278026907, |
|
"grad_norm": 1.557780179088859, |
|
"learning_rate": 2.9790097154710697e-06, |
|
"loss": 0.7849, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.15246636771300448, |
|
"grad_norm": 1.3610248283116362, |
|
"learning_rate": 2.9749144425379216e-06, |
|
"loss": 0.7696, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.15695067264573992, |
|
"grad_norm": 1.5050628258310632, |
|
"learning_rate": 2.9704577905637718e-06, |
|
"loss": 0.7497, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.16143497757847533, |
|
"grad_norm": 1.4313536098763806, |
|
"learning_rate": 2.9656408515039017e-06, |
|
"loss": 0.7544, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.16591928251121077, |
|
"grad_norm": 1.6003065628553548, |
|
"learning_rate": 2.9604648055900368e-06, |
|
"loss": 0.7648, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.17040358744394618, |
|
"grad_norm": 1.633334409956319, |
|
"learning_rate": 2.9549309210411697e-06, |
|
"loss": 0.7471, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.17488789237668162, |
|
"grad_norm": 1.5700271693529286, |
|
"learning_rate": 2.949040553752826e-06, |
|
"loss": 0.8009, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.17937219730941703, |
|
"grad_norm": 1.4854276734758955, |
|
"learning_rate": 2.9427951469648425e-06, |
|
"loss": 0.7712, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.17937219730941703, |
|
"eval_loss": 0.7643527388572693, |
|
"eval_runtime": 413.4678, |
|
"eval_samples_per_second": 121.115, |
|
"eval_steps_per_second": 1.894, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.18385650224215247, |
|
"grad_norm": 1.4160940764229815, |
|
"learning_rate": 2.936196230907755e-06, |
|
"loss": 0.7532, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.18834080717488788, |
|
"grad_norm": 1.4265290618310995, |
|
"learning_rate": 2.929245422427861e-06, |
|
"loss": 0.7703, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.19282511210762332, |
|
"grad_norm": 1.6899882763333507, |
|
"learning_rate": 2.9219444245910674e-06, |
|
"loss": 0.7919, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.19730941704035873, |
|
"grad_norm": 1.4186337044303068, |
|
"learning_rate": 2.9142950262656098e-06, |
|
"loss": 0.7477, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.20179372197309417, |
|
"grad_norm": 1.4178331376670448, |
|
"learning_rate": 2.9062991016837496e-06, |
|
"loss": 0.7734, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.2062780269058296, |
|
"grad_norm": 1.4503162574851487, |
|
"learning_rate": 2.897958609982556e-06, |
|
"loss": 0.7447, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.21076233183856502, |
|
"grad_norm": 1.558520612711291, |
|
"learning_rate": 2.8892755947238818e-06, |
|
"loss": 0.741, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.21524663677130046, |
|
"grad_norm": 1.4382572158325275, |
|
"learning_rate": 2.8802521833936595e-06, |
|
"loss": 0.7563, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.21973094170403587, |
|
"grad_norm": 1.5964216489171685, |
|
"learning_rate": 2.870890586880629e-06, |
|
"loss": 0.7554, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.2242152466367713, |
|
"grad_norm": 1.496069010720812, |
|
"learning_rate": 2.8611930989346322e-06, |
|
"loss": 0.7393, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.2242152466367713, |
|
"eval_loss": 0.7564548254013062, |
|
"eval_runtime": 408.8965, |
|
"eval_samples_per_second": 122.469, |
|
"eval_steps_per_second": 1.915, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.22869955156950672, |
|
"grad_norm": 1.4866290735466012, |
|
"learning_rate": 2.851162095604607e-06, |
|
"loss": 0.7499, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.23318385650224216, |
|
"grad_norm": 1.3341919240907245, |
|
"learning_rate": 2.8408000346564136e-06, |
|
"loss": 0.7524, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.23766816143497757, |
|
"grad_norm": 1.6374942242171213, |
|
"learning_rate": 2.8301094549706405e-06, |
|
"loss": 0.7386, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.242152466367713, |
|
"grad_norm": 1.6225803035616944, |
|
"learning_rate": 2.8190929759205366e-06, |
|
"loss": 0.7616, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.24663677130044842, |
|
"grad_norm": 1.4683777464043755, |
|
"learning_rate": 2.807753296730219e-06, |
|
"loss": 0.7564, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.25112107623318386, |
|
"grad_norm": 1.350460716883926, |
|
"learning_rate": 2.7960931958133183e-06, |
|
"loss": 0.7424, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.2556053811659193, |
|
"grad_norm": 1.522474854464212, |
|
"learning_rate": 2.7841155300922202e-06, |
|
"loss": 0.7331, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.2600896860986547, |
|
"grad_norm": 1.448720887976205, |
|
"learning_rate": 2.7718232342980693e-06, |
|
"loss": 0.7657, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.2645739910313901, |
|
"grad_norm": 1.6744619426337854, |
|
"learning_rate": 2.759219320251714e-06, |
|
"loss": 0.7363, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.26905829596412556, |
|
"grad_norm": 1.3585539591402243, |
|
"learning_rate": 2.7463068761257554e-06, |
|
"loss": 0.7458, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.26905829596412556, |
|
"eval_loss": 0.7505608797073364, |
|
"eval_runtime": 408.9234, |
|
"eval_samples_per_second": 122.461, |
|
"eval_steps_per_second": 1.915, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.273542600896861, |
|
"grad_norm": 1.580932873164111, |
|
"learning_rate": 2.7330890656878943e-06, |
|
"loss": 0.7565, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.27802690582959644, |
|
"grad_norm": 1.5329888412189265, |
|
"learning_rate": 2.7195691275257547e-06, |
|
"loss": 0.7457, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.2825112107623318, |
|
"grad_norm": 1.6754413400622026, |
|
"learning_rate": 2.7057503742533753e-06, |
|
"loss": 0.7392, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.28699551569506726, |
|
"grad_norm": 1.6247897070260917, |
|
"learning_rate": 2.691636191699562e-06, |
|
"loss": 0.758, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.2914798206278027, |
|
"grad_norm": 1.42356323236888, |
|
"learning_rate": 2.6772300380783013e-06, |
|
"loss": 0.7626, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.29596412556053814, |
|
"grad_norm": 1.4955853270730488, |
|
"learning_rate": 2.662535443141443e-06, |
|
"loss": 0.7355, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.3004484304932735, |
|
"grad_norm": 1.4879073313151545, |
|
"learning_rate": 2.647556007313847e-06, |
|
"loss": 0.7545, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.30493273542600896, |
|
"grad_norm": 1.4153755477305148, |
|
"learning_rate": 2.6322954008112213e-06, |
|
"loss": 0.7378, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.3094170403587444, |
|
"grad_norm": 1.4019993036978922, |
|
"learning_rate": 2.616757362740855e-06, |
|
"loss": 0.7387, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.31390134529147984, |
|
"grad_norm": 1.5335241758091316, |
|
"learning_rate": 2.600945700185474e-06, |
|
"loss": 0.7694, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.31390134529147984, |
|
"eval_loss": 0.7457958459854126, |
|
"eval_runtime": 408.7761, |
|
"eval_samples_per_second": 122.505, |
|
"eval_steps_per_second": 1.915, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.3183856502242152, |
|
"grad_norm": 1.47263429505246, |
|
"learning_rate": 2.5848642872704417e-06, |
|
"loss": 0.7246, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.32286995515695066, |
|
"grad_norm": 1.5062835613914285, |
|
"learning_rate": 2.5685170642145337e-06, |
|
"loss": 0.7338, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.3273542600896861, |
|
"grad_norm": 1.6182138547104117, |
|
"learning_rate": 2.5519080363645134e-06, |
|
"loss": 0.73, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.33183856502242154, |
|
"grad_norm": 1.3515300425343295, |
|
"learning_rate": 2.53504127321376e-06, |
|
"loss": 0.7299, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.336322869955157, |
|
"grad_norm": 1.5798782493243635, |
|
"learning_rate": 2.517920907405168e-06, |
|
"loss": 0.7293, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.34080717488789236, |
|
"grad_norm": 1.4549259580353344, |
|
"learning_rate": 2.5005511337185824e-06, |
|
"loss": 0.7621, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.3452914798206278, |
|
"grad_norm": 1.456599605633329, |
|
"learning_rate": 2.4829362080430077e-06, |
|
"loss": 0.7438, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.34977578475336324, |
|
"grad_norm": 1.4128813340833153, |
|
"learning_rate": 2.4650804463338406e-06, |
|
"loss": 0.7413, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.3542600896860987, |
|
"grad_norm": 1.5613737124434628, |
|
"learning_rate": 2.4469882235553887e-06, |
|
"loss": 0.7477, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.35874439461883406, |
|
"grad_norm": 1.6383373422678345, |
|
"learning_rate": 2.4286639726089293e-06, |
|
"loss": 0.713, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.35874439461883406, |
|
"eval_loss": 0.7421520352363586, |
|
"eval_runtime": 408.0589, |
|
"eval_samples_per_second": 122.72, |
|
"eval_steps_per_second": 1.919, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.3632286995515695, |
|
"grad_norm": 1.3492102003393152, |
|
"learning_rate": 2.4101121832465754e-06, |
|
"loss": 0.7185, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.36771300448430494, |
|
"grad_norm": 1.4117655797526263, |
|
"learning_rate": 2.3913374009712084e-06, |
|
"loss": 0.7379, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.3721973094170404, |
|
"grad_norm": 1.5281693242796246, |
|
"learning_rate": 2.3723442259227547e-06, |
|
"loss": 0.7406, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.37668161434977576, |
|
"grad_norm": 1.6990323130848894, |
|
"learning_rate": 2.3531373117510695e-06, |
|
"loss": 0.7388, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.3811659192825112, |
|
"grad_norm": 1.476162200960684, |
|
"learning_rate": 2.33372136447572e-06, |
|
"loss": 0.7434, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.38565022421524664, |
|
"grad_norm": 1.3930484173784414, |
|
"learning_rate": 2.3141011413329244e-06, |
|
"loss": 0.7372, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.3901345291479821, |
|
"grad_norm": 1.4071716332679987, |
|
"learning_rate": 2.2942814496099532e-06, |
|
"loss": 0.7531, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.39461883408071746, |
|
"grad_norm": 1.5479232446038012, |
|
"learning_rate": 2.274267145467259e-06, |
|
"loss": 0.7216, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.3991031390134529, |
|
"grad_norm": 1.4255077423798548, |
|
"learning_rate": 2.254063132748637e-06, |
|
"loss": 0.7343, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.40358744394618834, |
|
"grad_norm": 1.57276996130409, |
|
"learning_rate": 2.2336743617797006e-06, |
|
"loss": 0.7347, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.40358744394618834, |
|
"eval_loss": 0.7386789321899414, |
|
"eval_runtime": 408.1839, |
|
"eval_samples_per_second": 122.682, |
|
"eval_steps_per_second": 1.918, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.4080717488789238, |
|
"grad_norm": 1.4568107529063017, |
|
"learning_rate": 2.213105828154964e-06, |
|
"loss": 0.7266, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.4125560538116592, |
|
"grad_norm": 1.374198091231606, |
|
"learning_rate": 2.192362571513841e-06, |
|
"loss": 0.7465, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.4170403587443946, |
|
"grad_norm": 1.3925457206301284, |
|
"learning_rate": 2.171449674305846e-06, |
|
"loss": 0.7427, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.42152466367713004, |
|
"grad_norm": 1.4443502855856463, |
|
"learning_rate": 2.1503722605453083e-06, |
|
"loss": 0.7428, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.4260089686098655, |
|
"grad_norm": 1.5268146365443709, |
|
"learning_rate": 2.1291354945559004e-06, |
|
"loss": 0.7163, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.4304932735426009, |
|
"grad_norm": 1.5000325455240473, |
|
"learning_rate": 2.1077445797052945e-06, |
|
"loss": 0.7472, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.4349775784753363, |
|
"grad_norm": 1.4869091852092478, |
|
"learning_rate": 2.086204757130243e-06, |
|
"loss": 0.7427, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.43946188340807174, |
|
"grad_norm": 1.4430282256544564, |
|
"learning_rate": 2.0645213044524194e-06, |
|
"loss": 0.7174, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.4439461883408072, |
|
"grad_norm": 1.4822025498870304, |
|
"learning_rate": 2.0426995344853043e-06, |
|
"loss": 0.7538, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.4484304932735426, |
|
"grad_norm": 1.5186234240452396, |
|
"learning_rate": 2.0207447939324598e-06, |
|
"loss": 0.7243, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.4484304932735426, |
|
"eval_loss": 0.7356163859367371, |
|
"eval_runtime": 407.0139, |
|
"eval_samples_per_second": 123.035, |
|
"eval_steps_per_second": 1.924, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.452914798206278, |
|
"grad_norm": 1.5742685454152958, |
|
"learning_rate": 1.998662462077496e-06, |
|
"loss": 0.7475, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.45739910313901344, |
|
"grad_norm": 1.3834168469611057, |
|
"learning_rate": 1.976457949466054e-06, |
|
"loss": 0.7568, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.4618834080717489, |
|
"grad_norm": 1.4947961999330186, |
|
"learning_rate": 1.954136696580132e-06, |
|
"loss": 0.7464, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.4663677130044843, |
|
"grad_norm": 1.4284253764088304, |
|
"learning_rate": 1.9317041725050747e-06, |
|
"loss": 0.7456, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.47085201793721976, |
|
"grad_norm": 1.4247354157320633, |
|
"learning_rate": 1.909165873589554e-06, |
|
"loss": 0.7008, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.47533632286995514, |
|
"grad_norm": 1.4525308368306575, |
|
"learning_rate": 1.886527322098871e-06, |
|
"loss": 0.7121, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.4798206278026906, |
|
"grad_norm": 1.43738036112722, |
|
"learning_rate": 1.8637940648619065e-06, |
|
"loss": 0.7308, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.484304932735426, |
|
"grad_norm": 1.402086349899742, |
|
"learning_rate": 1.8409716719120561e-06, |
|
"loss": 0.7164, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.48878923766816146, |
|
"grad_norm": 1.5227358428935063, |
|
"learning_rate": 1.8180657351224739e-06, |
|
"loss": 0.732, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.49327354260089684, |
|
"grad_norm": 1.5813743714389112, |
|
"learning_rate": 1.7950818668359733e-06, |
|
"loss": 0.7161, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.49327354260089684, |
|
"eval_loss": 0.7330535054206848, |
|
"eval_runtime": 408.4081, |
|
"eval_samples_per_second": 122.615, |
|
"eval_steps_per_second": 1.917, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.4977578475336323, |
|
"grad_norm": 1.4881819590713468, |
|
"learning_rate": 1.772025698489903e-06, |
|
"loss": 0.7144, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.5022421524663677, |
|
"grad_norm": 1.4750319990458514, |
|
"learning_rate": 1.7489028792363549e-06, |
|
"loss": 0.7365, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.5067264573991032, |
|
"grad_norm": 1.4443590686278198, |
|
"learning_rate": 1.7257190745580209e-06, |
|
"loss": 0.7487, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.5112107623318386, |
|
"grad_norm": 1.4695293763109774, |
|
"learning_rate": 1.7024799648800555e-06, |
|
"loss": 0.7233, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.515695067264574, |
|
"grad_norm": 1.4328944860273993, |
|
"learning_rate": 1.679191244178278e-06, |
|
"loss": 0.7322, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.5201793721973094, |
|
"grad_norm": 1.4157130638413895, |
|
"learning_rate": 1.6558586185840473e-06, |
|
"loss": 0.728, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.5246636771300448, |
|
"grad_norm": 1.4117533616122613, |
|
"learning_rate": 1.6324878049861656e-06, |
|
"loss": 0.7331, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.5291479820627802, |
|
"grad_norm": 1.4255877674393056, |
|
"learning_rate": 1.609084529630145e-06, |
|
"loss": 0.7491, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.5336322869955157, |
|
"grad_norm": 1.4486300200418207, |
|
"learning_rate": 1.5856545267151759e-06, |
|
"loss": 0.7261, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.5381165919282511, |
|
"grad_norm": 1.4628618883782867, |
|
"learning_rate": 1.5622035369891561e-06, |
|
"loss": 0.7247, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.5381165919282511, |
|
"eval_loss": 0.7308038473129272, |
|
"eval_runtime": 406.6873, |
|
"eval_samples_per_second": 123.134, |
|
"eval_steps_per_second": 1.925, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.5426008968609866, |
|
"grad_norm": 1.4112256357672157, |
|
"learning_rate": 1.5387373063421062e-06, |
|
"loss": 0.7307, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.547085201793722, |
|
"grad_norm": 1.3994109954542429, |
|
"learning_rate": 1.515261584398333e-06, |
|
"loss": 0.7062, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.5515695067264574, |
|
"grad_norm": 1.5279436893984248, |
|
"learning_rate": 1.491782123107669e-06, |
|
"loss": 0.7314, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.5560538116591929, |
|
"grad_norm": 1.4092281762272858, |
|
"learning_rate": 1.4683046753361521e-06, |
|
"loss": 0.7044, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.5605381165919282, |
|
"grad_norm": 1.4363381867810665, |
|
"learning_rate": 1.4448349934564736e-06, |
|
"loss": 0.7287, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.5650224215246636, |
|
"grad_norm": 1.4913351223697051, |
|
"learning_rate": 1.421378827938549e-06, |
|
"loss": 0.7254, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.5695067264573991, |
|
"grad_norm": 1.5096384680619075, |
|
"learning_rate": 1.3979419259405563e-06, |
|
"loss": 0.7389, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.5739910313901345, |
|
"grad_norm": 1.3495144573299676, |
|
"learning_rate": 1.3745300299007856e-06, |
|
"loss": 0.7247, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.57847533632287, |
|
"grad_norm": 1.3641879848291365, |
|
"learning_rate": 1.3511488761306412e-06, |
|
"loss": 0.7312, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.5829596412556054, |
|
"grad_norm": 1.3879105033157129, |
|
"learning_rate": 1.3278041934091524e-06, |
|
"loss": 0.7477, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.5829596412556054, |
|
"eval_loss": 0.7287724018096924, |
|
"eval_runtime": 406.882, |
|
"eval_samples_per_second": 123.075, |
|
"eval_steps_per_second": 1.924, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.5874439461883408, |
|
"grad_norm": 1.3916697284582622, |
|
"learning_rate": 1.3045017015793217e-06, |
|
"loss": 0.7246, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.5919282511210763, |
|
"grad_norm": 1.4328511876779917, |
|
"learning_rate": 1.2812471101466687e-06, |
|
"loss": 0.7303, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.5964125560538116, |
|
"grad_norm": 1.4411092846252307, |
|
"learning_rate": 1.2580461168803038e-06, |
|
"loss": 0.7318, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.600896860986547, |
|
"grad_norm": 1.4703965551927338, |
|
"learning_rate": 1.2349044064168782e-06, |
|
"loss": 0.7375, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.6053811659192825, |
|
"grad_norm": 1.4319057117061509, |
|
"learning_rate": 1.21182764886775e-06, |
|
"loss": 0.7302, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.6098654708520179, |
|
"grad_norm": 1.5017976848926429, |
|
"learning_rate": 1.188821498429714e-06, |
|
"loss": 0.7262, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.6143497757847534, |
|
"grad_norm": 1.4553869576056546, |
|
"learning_rate": 1.165891591999626e-06, |
|
"loss": 0.7447, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.6188340807174888, |
|
"grad_norm": 1.4128744043127173, |
|
"learning_rate": 1.1430435477932646e-06, |
|
"loss": 0.7423, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.6233183856502242, |
|
"grad_norm": 1.3797159286061107, |
|
"learning_rate": 1.1202829639687785e-06, |
|
"loss": 0.744, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.6278026905829597, |
|
"grad_norm": 1.487304571595245, |
|
"learning_rate": 1.0976154172550408e-06, |
|
"loss": 0.7429, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.6278026905829597, |
|
"eval_loss": 0.7272571921348572, |
|
"eval_runtime": 406.7541, |
|
"eval_samples_per_second": 123.114, |
|
"eval_steps_per_second": 1.925, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.6322869955156951, |
|
"grad_norm": 1.544512062570189, |
|
"learning_rate": 1.0750464615852523e-06, |
|
"loss": 0.7251, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.6367713004484304, |
|
"grad_norm": 1.422563130817404, |
|
"learning_rate": 1.0525816267361398e-06, |
|
"loss": 0.712, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.6412556053811659, |
|
"grad_norm": 1.4937681764382644, |
|
"learning_rate": 1.0302264169730613e-06, |
|
"loss": 0.7203, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.6457399103139013, |
|
"grad_norm": 1.50738757049434, |
|
"learning_rate": 1.0079863097013722e-06, |
|
"loss": 0.7121, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.6502242152466368, |
|
"grad_norm": 1.286396172710849, |
|
"learning_rate": 9.85866754124367e-07, |
|
"loss": 0.7193, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.6547085201793722, |
|
"grad_norm": 1.4997539342741677, |
|
"learning_rate": 9.638731699081281e-07, |
|
"loss": 0.7288, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.6591928251121076, |
|
"grad_norm": 1.37434247409356, |
|
"learning_rate": 9.42010945853623e-07, |
|
"loss": 0.7597, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.6636771300448431, |
|
"grad_norm": 1.3869436283100607, |
|
"learning_rate": 9.202854385763502e-07, |
|
"loss": 0.7184, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.6681614349775785, |
|
"grad_norm": 1.3970067087387381, |
|
"learning_rate": 8.987019711938812e-07, |
|
"loss": 0.7326, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.672645739910314, |
|
"grad_norm": 1.553183464191494, |
|
"learning_rate": 8.772658320216047e-07, |
|
"loss": 0.7317, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.672645739910314, |
|
"eval_loss": 0.7256098389625549, |
|
"eval_runtime": 406.6132, |
|
"eval_samples_per_second": 123.156, |
|
"eval_steps_per_second": 1.926, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.6771300448430493, |
|
"grad_norm": 1.3357768297094936, |
|
"learning_rate": 8.55982273277002e-07, |
|
"loss": 0.7347, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.6816143497757847, |
|
"grad_norm": 1.3249788097985131, |
|
"learning_rate": 8.348565097927605e-07, |
|
"loss": 0.7496, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.6860986547085202, |
|
"grad_norm": 1.4578138220875878, |
|
"learning_rate": 8.13893717739056e-07, |
|
"loss": 0.7308, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.6905829596412556, |
|
"grad_norm": 1.3268077719441809, |
|
"learning_rate": 7.930990333553013e-07, |
|
"loss": 0.7094, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.695067264573991, |
|
"grad_norm": 1.47562182506043, |
|
"learning_rate": 7.72477551691678e-07, |
|
"loss": 0.697, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.6995515695067265, |
|
"grad_norm": 1.4850843190566259, |
|
"learning_rate": 7.520343253607677e-07, |
|
"loss": 0.7301, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.7040358744394619, |
|
"grad_norm": 1.5097763618083517, |
|
"learning_rate": 7.317743632995731e-07, |
|
"loss": 0.7217, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.7085201793721974, |
|
"grad_norm": 1.3914348509226637, |
|
"learning_rate": 7.117026295422425e-07, |
|
"loss": 0.6957, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.7130044843049327, |
|
"grad_norm": 1.5175208261545492, |
|
"learning_rate": 6.918240420038007e-07, |
|
"loss": 0.7317, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.7174887892376681, |
|
"grad_norm": 1.4947559578839034, |
|
"learning_rate": 6.721434712751745e-07, |
|
"loss": 0.7226, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.7174887892376681, |
|
"eval_loss": 0.7243176102638245, |
|
"eval_runtime": 406.7899, |
|
"eval_samples_per_second": 123.103, |
|
"eval_steps_per_second": 1.925, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.7219730941704036, |
|
"grad_norm": 1.5192098207309965, |
|
"learning_rate": 6.526657394298154e-07, |
|
"loss": 0.705, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.726457399103139, |
|
"grad_norm": 1.3665027387136646, |
|
"learning_rate": 6.333956188422088e-07, |
|
"loss": 0.706, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.7309417040358744, |
|
"grad_norm": 1.4974912840899435, |
|
"learning_rate": 6.143378310185643e-07, |
|
"loss": 0.6983, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.7354260089686099, |
|
"grad_norm": 1.5477574584643699, |
|
"learning_rate": 5.954970454399638e-07, |
|
"loss": 0.7252, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.7399103139013453, |
|
"grad_norm": 1.525090065151942, |
|
"learning_rate": 5.768778784182616e-07, |
|
"loss": 0.7087, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.7443946188340808, |
|
"grad_norm": 1.4837554579437873, |
|
"learning_rate": 5.584848919650069e-07, |
|
"loss": 0.7075, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.7488789237668162, |
|
"grad_norm": 1.3538329119260115, |
|
"learning_rate": 5.403225926736772e-07, |
|
"loss": 0.7057, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.7533632286995515, |
|
"grad_norm": 1.359895087573495, |
|
"learning_rate": 5.223954306154843e-07, |
|
"loss": 0.7306, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.757847533632287, |
|
"grad_norm": 1.4168148218595764, |
|
"learning_rate": 5.047077982490311e-07, |
|
"loss": 0.7424, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.7623318385650224, |
|
"grad_norm": 1.4815842671642683, |
|
"learning_rate": 4.872640293440861e-07, |
|
"loss": 0.695, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.7623318385650224, |
|
"eval_loss": 0.7233718633651733, |
|
"eval_runtime": 406.8015, |
|
"eval_samples_per_second": 123.099, |
|
"eval_steps_per_second": 1.925, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.7668161434977578, |
|
"grad_norm": 1.5501655544071418, |
|
"learning_rate": 4.7006839791973673e-07, |
|
"loss": 0.7327, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.7713004484304933, |
|
"grad_norm": 1.3834984705411, |
|
"learning_rate": 4.53125117197179e-07, |
|
"loss": 0.7245, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.7757847533632287, |
|
"grad_norm": 1.4041748328697374, |
|
"learning_rate": 4.364383385674112e-07, |
|
"loss": 0.7054, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.7802690582959642, |
|
"grad_norm": 1.443104622604103, |
|
"learning_rate": 4.2001215057407026e-07, |
|
"loss": 0.7037, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.7847533632286996, |
|
"grad_norm": 1.5632699202433824, |
|
"learning_rate": 4.038505779116687e-07, |
|
"loss": 0.705, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.7892376681614349, |
|
"grad_norm": 1.349615732583278, |
|
"learning_rate": 3.879575804394782e-07, |
|
"loss": 0.7071, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.7937219730941704, |
|
"grad_norm": 1.3657530768128234, |
|
"learning_rate": 3.7233705221129646e-07, |
|
"loss": 0.7273, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.7982062780269058, |
|
"grad_norm": 1.5107387856649341, |
|
"learning_rate": 3.569928205213354e-07, |
|
"loss": 0.6975, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.8026905829596412, |
|
"grad_norm": 1.4525568524987686, |
|
"learning_rate": 3.419286449664741e-07, |
|
"loss": 0.7095, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.8071748878923767, |
|
"grad_norm": 1.4847854049722584, |
|
"learning_rate": 3.2714821652508854e-07, |
|
"loss": 0.7167, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.8071748878923767, |
|
"eval_loss": 0.7225807309150696, |
|
"eval_runtime": 406.5326, |
|
"eval_samples_per_second": 123.181, |
|
"eval_steps_per_second": 1.926, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.8116591928251121, |
|
"grad_norm": 1.2447161837361285, |
|
"learning_rate": 3.126551566527036e-07, |
|
"loss": 0.7156, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.8161434977578476, |
|
"grad_norm": 1.4139333132454484, |
|
"learning_rate": 2.9845301639467284e-07, |
|
"loss": 0.7537, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.820627802690583, |
|
"grad_norm": 1.3663031642715642, |
|
"learning_rate": 2.8454527551611205e-07, |
|
"loss": 0.7238, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.8251121076233184, |
|
"grad_norm": 1.389263976301968, |
|
"learning_rate": 2.7093534164929904e-07, |
|
"loss": 0.738, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.8295964125560538, |
|
"grad_norm": 1.5068808968575202, |
|
"learning_rate": 2.576265494587458e-07, |
|
"loss": 0.7067, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.8340807174887892, |
|
"grad_norm": 1.4226178531466935, |
|
"learning_rate": 2.446221598241472e-07, |
|
"loss": 0.7143, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.8385650224215246, |
|
"grad_norm": 1.6881847148932905, |
|
"learning_rate": 2.319253590414132e-07, |
|
"loss": 0.7376, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.8430493273542601, |
|
"grad_norm": 1.4353283330892004, |
|
"learning_rate": 2.1953925804197056e-07, |
|
"loss": 0.7095, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.8475336322869955, |
|
"grad_norm": 1.4639605071750654, |
|
"learning_rate": 2.0746689163053113e-07, |
|
"loss": 0.7102, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.852017937219731, |
|
"grad_norm": 1.458703799588621, |
|
"learning_rate": 1.9571121774151545e-07, |
|
"loss": 0.686, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.852017937219731, |
|
"eval_loss": 0.7220604419708252, |
|
"eval_runtime": 406.5609, |
|
"eval_samples_per_second": 123.172, |
|
"eval_steps_per_second": 1.926, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.8565022421524664, |
|
"grad_norm": 1.470148783910905, |
|
"learning_rate": 1.8427511671430757e-07, |
|
"loss": 0.72, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.8609865470852018, |
|
"grad_norm": 1.3891242748262451, |
|
"learning_rate": 1.7316139058752194e-07, |
|
"loss": 0.7318, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.8654708520179372, |
|
"grad_norm": 1.2245069775705093, |
|
"learning_rate": 1.6237276241245867e-07, |
|
"loss": 0.7155, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.8699551569506726, |
|
"grad_norm": 1.360510189488915, |
|
"learning_rate": 1.519118755859084e-07, |
|
"loss": 0.7255, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.874439461883408, |
|
"grad_norm": 1.495119615923585, |
|
"learning_rate": 1.4178129320247486e-07, |
|
"loss": 0.7484, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.8789237668161435, |
|
"grad_norm": 1.3674856635367474, |
|
"learning_rate": 1.31983497426575e-07, |
|
"loss": 0.7366, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.8834080717488789, |
|
"grad_norm": 1.4494730150421093, |
|
"learning_rate": 1.2252088888426431e-07, |
|
"loss": 0.742, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.8878923766816144, |
|
"grad_norm": 1.4368197978682802, |
|
"learning_rate": 1.1339578607504536e-07, |
|
"loss": 0.7269, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.8923766816143498, |
|
"grad_norm": 1.4017197990051706, |
|
"learning_rate": 1.0461042480379402e-07, |
|
"loss": 0.7234, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.8968609865470852, |
|
"grad_norm": 1.426560347266084, |
|
"learning_rate": 9.616695763295007e-08, |
|
"loss": 0.7214, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.8968609865470852, |
|
"eval_loss": 0.721759557723999, |
|
"eval_runtime": 406.5838, |
|
"eval_samples_per_second": 123.165, |
|
"eval_steps_per_second": 1.926, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.9013452914798207, |
|
"grad_norm": 1.489947255967281, |
|
"learning_rate": 8.806745335510297e-08, |
|
"loss": 0.7341, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.905829596412556, |
|
"grad_norm": 1.4312716003053576, |
|
"learning_rate": 8.031389648610266e-08, |
|
"loss": 0.7264, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.9103139013452914, |
|
"grad_norm": 1.4764400641380824, |
|
"learning_rate": 7.290818677881966e-08, |
|
"loss": 0.7301, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.9147982062780269, |
|
"grad_norm": 1.4381108917682341, |
|
"learning_rate": 6.585213875767305e-08, |
|
"loss": 0.6997, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.9192825112107623, |
|
"grad_norm": 1.459723127188453, |
|
"learning_rate": 5.914748127404102e-08, |
|
"loss": 0.7168, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.9237668161434978, |
|
"grad_norm": 1.5776619173541433, |
|
"learning_rate": 5.2795857082663655e-08, |
|
"loss": 0.72, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.9282511210762332, |
|
"grad_norm": 1.438610611700907, |
|
"learning_rate": 4.6798822439140185e-08, |
|
"loss": 0.7035, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.9327354260089686, |
|
"grad_norm": 1.4350411032390504, |
|
"learning_rate": 4.115784671861916e-08, |
|
"loss": 0.735, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.9372197309417041, |
|
"grad_norm": 1.4822578142933729, |
|
"learning_rate": 3.587431205577713e-08, |
|
"loss": 0.7178, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.9417040358744395, |
|
"grad_norm": 1.5001233187138816, |
|
"learning_rate": 3.0949513006172325e-08, |
|
"loss": 0.7358, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.9417040358744395, |
|
"eval_loss": 0.7216091752052307, |
|
"eval_runtime": 406.6258, |
|
"eval_samples_per_second": 123.153, |
|
"eval_steps_per_second": 1.926, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.9461883408071748, |
|
"grad_norm": 1.4457564058059627, |
|
"learning_rate": 2.6384656229056946e-08, |
|
"loss": 0.7285, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.9506726457399103, |
|
"grad_norm": 1.6789172768348999, |
|
"learning_rate": 2.218086019172394e-08, |
|
"loss": 0.7027, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.9551569506726457, |
|
"grad_norm": 1.4039832008414181, |
|
"learning_rate": 1.8339154895464894e-08, |
|
"loss": 0.7285, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.9596412556053812, |
|
"grad_norm": 1.7674026844330886, |
|
"learning_rate": 1.4860481623201417e-08, |
|
"loss": 0.713, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.9641255605381166, |
|
"grad_norm": 1.531580121339593, |
|
"learning_rate": 1.1745692708855282e-08, |
|
"loss": 0.7328, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.968609865470852, |
|
"grad_norm": 1.455884868550825, |
|
"learning_rate": 8.99555132851232e-09, |
|
"loss": 0.7196, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.9730941704035875, |
|
"grad_norm": 1.3157536936429735, |
|
"learning_rate": 6.610731313430318e-09, |
|
"loss": 0.7277, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.9775784753363229, |
|
"grad_norm": 1.5586404477319191, |
|
"learning_rate": 4.5918169849406e-09, |
|
"loss": 0.7265, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.9820627802690582, |
|
"grad_norm": 1.3596393082767964, |
|
"learning_rate": 2.939303011277872e-09, |
|
"loss": 0.719, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.9865470852017937, |
|
"grad_norm": 1.3866642718972106, |
|
"learning_rate": 1.6535942863788456e-09, |
|
"loss": 0.7259, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.9865470852017937, |
|
"eval_loss": 0.7215752005577087, |
|
"eval_runtime": 408.9437, |
|
"eval_samples_per_second": 122.455, |
|
"eval_steps_per_second": 1.915, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.9910313901345291, |
|
"grad_norm": 1.6643780128489514, |
|
"learning_rate": 7.350058306764273e-10, |
|
"loss": 0.7044, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 0.9955156950672646, |
|
"grad_norm": 1.428221428067804, |
|
"learning_rate": 1.8376271391412624e-10, |
|
"loss": 0.7109, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 1.3882910125414851, |
|
"learning_rate": 0.0, |
|
"loss": 0.7123, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 2230, |
|
"total_flos": 250303561007104.0, |
|
"train_loss": 0.7492096503219262, |
|
"train_runtime": 18007.2993, |
|
"train_samples_per_second": 15.851, |
|
"train_steps_per_second": 0.124 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 2230, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 250303561007104.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|