|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 4000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.00125, |
|
"grad_norm": 5.53125, |
|
"learning_rate": 4.999980723453676e-05, |
|
"loss": 0.6803, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.0025, |
|
"grad_norm": 3.0, |
|
"learning_rate": 4.9999228941119745e-05, |
|
"loss": 0.6176, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.00375, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 4.999826512866693e-05, |
|
"loss": 0.5313, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.005, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 4.999691581204152e-05, |
|
"loss": 0.5659, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.00625, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 4.9995181012051625e-05, |
|
"loss": 0.5822, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.0075, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 4.9993060755450015e-05, |
|
"loss": 0.5849, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.00875, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 4.999055507493368e-05, |
|
"loss": 0.55, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 4.998766400914329e-05, |
|
"loss": 0.5323, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.01125, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 4.9984387602662675e-05, |
|
"loss": 0.5775, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.0125, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 4.9980725906018074e-05, |
|
"loss": 0.5523, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.01375, |
|
"grad_norm": 1.3984375, |
|
"learning_rate": 4.9976678975677376e-05, |
|
"loss": 0.6089, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.015, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 4.9972246874049254e-05, |
|
"loss": 0.5445, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.01625, |
|
"grad_norm": 1.375, |
|
"learning_rate": 4.996742966948219e-05, |
|
"loss": 0.5256, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.0175, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 4.9962227436263453e-05, |
|
"loss": 0.5118, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.01875, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 4.9956640254617906e-05, |
|
"loss": 0.5458, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 4.995066821070679e-05, |
|
"loss": 0.5946, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.02125, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 4.994431139662643e-05, |
|
"loss": 0.515, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.0225, |
|
"grad_norm": 1.4140625, |
|
"learning_rate": 4.9937569910406756e-05, |
|
"loss": 0.5501, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.02375, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 4.9930443856009826e-05, |
|
"loss": 0.5475, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.025, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 4.99229333433282e-05, |
|
"loss": 0.5625, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.02625, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 4.9915038488183295e-05, |
|
"loss": 0.5627, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.0275, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 4.990675941232353e-05, |
|
"loss": 0.5561, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.02875, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 4.989809624342251e-05, |
|
"loss": 0.5254, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 4.9889049115077005e-05, |
|
"loss": 0.5184, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.03125, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 4.987961816680492e-05, |
|
"loss": 0.5563, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.0325, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 4.9869803544043166e-05, |
|
"loss": 0.5536, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.03375, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 4.985960539814535e-05, |
|
"loss": 0.5544, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.035, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 4.98490238863795e-05, |
|
"loss": 0.5117, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.03625, |
|
"grad_norm": 1.421875, |
|
"learning_rate": 4.983805917192561e-05, |
|
"loss": 0.5125, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.0375, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 4.982671142387316e-05, |
|
"loss": 0.5563, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.03875, |
|
"grad_norm": 1.375, |
|
"learning_rate": 4.9814980817218447e-05, |
|
"loss": 0.5408, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 4.980286753286195e-05, |
|
"loss": 0.5249, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.04125, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 4.979037175760548e-05, |
|
"loss": 0.546, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.0425, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 4.9777493684149375e-05, |
|
"loss": 0.5019, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.04375, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 4.976423351108943e-05, |
|
"loss": 0.5364, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.045, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 4.975059144291394e-05, |
|
"loss": 0.5504, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.04625, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 4.973656769000046e-05, |
|
"loss": 0.4682, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.0475, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 4.972216246861262e-05, |
|
"loss": 0.5262, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.04875, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 4.9707376000896736e-05, |
|
"loss": 0.5343, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 4.9692208514878444e-05, |
|
"loss": 0.5171, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.05125, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 4.967666024445914e-05, |
|
"loss": 0.5454, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.0525, |
|
"grad_norm": 1.25, |
|
"learning_rate": 4.966073142941239e-05, |
|
"loss": 0.5378, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.05375, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 4.9644422315380225e-05, |
|
"loss": 0.4792, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.055, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 4.962773315386935e-05, |
|
"loss": 0.5336, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.05625, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 4.9610664202247294e-05, |
|
"loss": 0.5293, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.0575, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 4.9593215723738404e-05, |
|
"loss": 0.4678, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.05875, |
|
"grad_norm": 1.421875, |
|
"learning_rate": 4.957538798741979e-05, |
|
"loss": 0.549, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 4.9557181268217227e-05, |
|
"loss": 0.5642, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.06125, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 4.953859584690082e-05, |
|
"loss": 0.5544, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.0625, |
|
"grad_norm": 1.25, |
|
"learning_rate": 4.951963201008076e-05, |
|
"loss": 0.5604, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.06375, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 4.9500290050202894e-05, |
|
"loss": 0.5349, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.065, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 4.9480570265544144e-05, |
|
"loss": 0.5393, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.06625, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 4.9460472960208e-05, |
|
"loss": 0.527, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.0675, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 4.943999844411977e-05, |
|
"loss": 0.4982, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.06875, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 4.9419147033021814e-05, |
|
"loss": 0.4377, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 4.939791904846869e-05, |
|
"loss": 0.4919, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.07125, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 4.937631481782218e-05, |
|
"loss": 0.5107, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.0725, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 4.935433467424624e-05, |
|
"loss": 0.5611, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.07375, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 4.9331978956701875e-05, |
|
"loss": 0.534, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.075, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 4.9309248009941914e-05, |
|
"loss": 0.5319, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.07625, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 4.928614218450568e-05, |
|
"loss": 0.4805, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.0775, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 4.9262661836713564e-05, |
|
"loss": 0.4656, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.07875, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 4.923880732866159e-05, |
|
"loss": 0.5328, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.0, |
|
"learning_rate": 4.9214579028215776e-05, |
|
"loss": 0.4949, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.08125, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 4.9189977309006495e-05, |
|
"loss": 0.5222, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.0825, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 4.916500255042268e-05, |
|
"loss": 0.5029, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.08375, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 4.9139655137606015e-05, |
|
"loss": 0.5188, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.085, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 4.9113935461444955e-05, |
|
"loss": 0.5651, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.08625, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 4.908784391856872e-05, |
|
"loss": 0.4586, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.0875, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 4.906138091134118e-05, |
|
"loss": 0.539, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.08875, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 4.9034546847854656e-05, |
|
"loss": 0.5331, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 4.900734214192358e-05, |
|
"loss": 0.4227, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.09125, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 4.897976721307819e-05, |
|
"loss": 0.5005, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.0925, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 4.8951822486557986e-05, |
|
"loss": 0.5294, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.09375, |
|
"grad_norm": 1.375, |
|
"learning_rate": 4.892350839330522e-05, |
|
"loss": 0.5729, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.095, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 4.8894825369958255e-05, |
|
"loss": 0.4837, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.09625, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 4.8865773858844776e-05, |
|
"loss": 0.5266, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.0975, |
|
"grad_norm": 1.375, |
|
"learning_rate": 4.8836354307975026e-05, |
|
"loss": 0.5329, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.09875, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 4.880656717103489e-05, |
|
"loss": 0.5096, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 4.877641290737884e-05, |
|
"loss": 0.4919, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.10125, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 4.874589198202294e-05, |
|
"loss": 0.4633, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.1025, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 4.8715004865637614e-05, |
|
"loss": 0.4981, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.10375, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 4.868375203454041e-05, |
|
"loss": 0.4699, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.105, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 4.8652133970688636e-05, |
|
"loss": 0.5086, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.10625, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 4.862015116167196e-05, |
|
"loss": 0.5406, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.1075, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 4.8587804100704845e-05, |
|
"loss": 0.5456, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.10875, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 4.8555093286618995e-05, |
|
"loss": 0.5107, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 4.852201922385564e-05, |
|
"loss": 0.4078, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.11125, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 4.848858242245773e-05, |
|
"loss": 0.4958, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.1125, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 4.8454783398062106e-05, |
|
"loss": 0.4822, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.11375, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 4.8420622671891533e-05, |
|
"loss": 0.5489, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.115, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 4.838610077074669e-05, |
|
"loss": 0.4884, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.11625, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 4.835121822699796e-05, |
|
"loss": 0.529, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.1175, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 4.8315975578577355e-05, |
|
"loss": 0.5414, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.11875, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 4.828037336897009e-05, |
|
"loss": 0.4749, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 4.8244412147206284e-05, |
|
"loss": 0.5205, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.12125, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 4.820809246785247e-05, |
|
"loss": 0.5343, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.1225, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 4.817141489100302e-05, |
|
"loss": 0.5324, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.12375, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 4.8134379982271556e-05, |
|
"loss": 0.5451, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.125, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 4.8096988312782174e-05, |
|
"loss": 0.5428, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.12625, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 4.805924045916067e-05, |
|
"loss": 0.5002, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.1275, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 4.8021137003525664e-05, |
|
"loss": 0.5277, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.12875, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 4.7982678533479555e-05, |
|
"loss": 0.5185, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 4.794386564209953e-05, |
|
"loss": 0.501, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.13125, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 4.7904698927928406e-05, |
|
"loss": 0.4903, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.1325, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 4.7865178994965344e-05, |
|
"loss": 0.4764, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.13375, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 4.782530645265661e-05, |
|
"loss": 0.5046, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.135, |
|
"grad_norm": 1.25, |
|
"learning_rate": 4.7785081915886134e-05, |
|
"loss": 0.4849, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.13625, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 4.7744506004966025e-05, |
|
"loss": 0.4874, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.1375, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 4.7703579345627035e-05, |
|
"loss": 0.5632, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.13875, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 4.766230256900887e-05, |
|
"loss": 0.4894, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 4.762067631165049e-05, |
|
"loss": 0.4819, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.14125, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 4.7578701215480284e-05, |
|
"loss": 0.4872, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.1425, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 4.753637792780614e-05, |
|
"loss": 0.5274, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.14375, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 4.749370710130554e-05, |
|
"loss": 0.5052, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.145, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 4.745068939401539e-05, |
|
"loss": 0.4819, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.14625, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 4.740732546932197e-05, |
|
"loss": 0.5159, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.1475, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 4.7363615995950626e-05, |
|
"loss": 0.5338, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.14875, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 4.7319561647955526e-05, |
|
"loss": 0.4797, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 4.72751631047092e-05, |
|
"loss": 0.5453, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.15125, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 4.7230421050892116e-05, |
|
"loss": 0.5009, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.1525, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 4.718533617648209e-05, |
|
"loss": 0.4602, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.15375, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 4.713990917674365e-05, |
|
"loss": 0.5399, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.155, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 4.709414075221734e-05, |
|
"loss": 0.5006, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.15625, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 4.7048031608708876e-05, |
|
"loss": 0.4784, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.1575, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 4.7001582457278304e-05, |
|
"loss": 0.4764, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.15875, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 4.695479401422898e-05, |
|
"loss": 0.5003, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 1.9921875, |
|
"learning_rate": 4.690766700109659e-05, |
|
"loss": 0.4586, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.16125, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 4.686020214463798e-05, |
|
"loss": 0.5272, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.1625, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 4.681240017681993e-05, |
|
"loss": 0.5626, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.16375, |
|
"grad_norm": 1.3984375, |
|
"learning_rate": 4.676426183480794e-05, |
|
"loss": 0.5696, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.165, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 4.671578786095478e-05, |
|
"loss": 0.5391, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.16625, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 4.6666979002789105e-05, |
|
"loss": 0.5195, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.1675, |
|
"grad_norm": 1.5, |
|
"learning_rate": 4.661783601300388e-05, |
|
"loss": 0.4973, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.16875, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 4.65683596494448e-05, |
|
"loss": 0.4741, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 4.65185506750986e-05, |
|
"loss": 0.4684, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.17125, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 4.646840985808126e-05, |
|
"loss": 0.5307, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.1725, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 4.6417937971626245e-05, |
|
"loss": 0.5154, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.17375, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 4.636713579407245e-05, |
|
"loss": 0.5348, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.175, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 4.6316004108852305e-05, |
|
"loss": 0.477, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.17625, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 4.6264543704479655e-05, |
|
"loss": 0.4989, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.1775, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 4.6212755374537596e-05, |
|
"loss": 0.5109, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.17875, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 4.616063991766623e-05, |
|
"loss": 0.48, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 1.3984375, |
|
"learning_rate": 4.610819813755038e-05, |
|
"loss": 0.5151, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.18125, |
|
"grad_norm": 0.9453125, |
|
"learning_rate": 4.6055430842907167e-05, |
|
"loss": 0.5235, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.1825, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 4.600233884747355e-05, |
|
"loss": 0.5006, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.18375, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 4.594892296999378e-05, |
|
"loss": 0.479, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.185, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 4.5895184034206765e-05, |
|
"loss": 0.4807, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.18625, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 4.5841122868833364e-05, |
|
"loss": 0.5189, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.1875, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 4.5786740307563636e-05, |
|
"loss": 0.4768, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.18875, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 4.573203718904394e-05, |
|
"loss": 0.4747, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 4.567701435686404e-05, |
|
"loss": 0.4756, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.19125, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 4.562167265954409e-05, |
|
"loss": 0.5102, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.1925, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 4.55660129505215e-05, |
|
"loss": 0.5229, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.19375, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 4.551003608813784e-05, |
|
"loss": 0.5103, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.195, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 4.545374293562559e-05, |
|
"loss": 0.5216, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.19625, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 4.5397134361094786e-05, |
|
"loss": 0.5039, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.1975, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 4.534021123751968e-05, |
|
"loss": 0.4834, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.19875, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 4.528297444272525e-05, |
|
"loss": 0.4386, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 4.522542485937369e-05, |
|
"loss": 0.5097, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.20125, |
|
"grad_norm": 1.125, |
|
"learning_rate": 4.516756337495075e-05, |
|
"loss": 0.5574, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.2025, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 4.5109390881752114e-05, |
|
"loss": 0.5492, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.20375, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 4.5050908276869586e-05, |
|
"loss": 0.5281, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.205, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 4.499211646217727e-05, |
|
"loss": 0.5042, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.20625, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 4.493301634431768e-05, |
|
"loss": 0.4746, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.2075, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 4.487360883468775e-05, |
|
"loss": 0.5611, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.20875, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 4.481389484942478e-05, |
|
"loss": 0.5058, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 4.4753875309392266e-05, |
|
"loss": 0.4352, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.21125, |
|
"grad_norm": 1.125, |
|
"learning_rate": 4.469355114016577e-05, |
|
"loss": 0.4849, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.2125, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 4.463292327201862e-05, |
|
"loss": 0.5195, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.21375, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 4.4571992639907545e-05, |
|
"loss": 0.3864, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.215, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 4.451076018345825e-05, |
|
"loss": 0.4903, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.21625, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 4.444922684695097e-05, |
|
"loss": 0.5126, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.2175, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 4.4387393579305865e-05, |
|
"loss": 0.4958, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.21875, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 4.4325261334068426e-05, |
|
"loss": 0.5307, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 1.5, |
|
"learning_rate": 4.426283106939474e-05, |
|
"loss": 0.5048, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.22125, |
|
"grad_norm": 1.0, |
|
"learning_rate": 4.4200103748036695e-05, |
|
"loss": 0.4757, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.2225, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 4.4137080337327205e-05, |
|
"loss": 0.5525, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.22375, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 4.407376180916522e-05, |
|
"loss": 0.4781, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.225, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 4.401014914000078e-05, |
|
"loss": 0.4797, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.22625, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 4.3946243310819926e-05, |
|
"loss": 0.5529, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 0.2275, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 4.3882045307129594e-05, |
|
"loss": 0.4669, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.22875, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 4.3817556118942425e-05, |
|
"loss": 0.5328, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 1.4140625, |
|
"learning_rate": 4.375277674076149e-05, |
|
"loss": 0.5347, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.23125, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 4.3687708171564925e-05, |
|
"loss": 0.4615, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.2325, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 4.3622351414790554e-05, |
|
"loss": 0.5132, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.23375, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 4.355670747832042e-05, |
|
"loss": 0.4063, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 0.235, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 4.349077737446525e-05, |
|
"loss": 0.493, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.23625, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 4.3424562119948776e-05, |
|
"loss": 0.4826, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.2375, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 4.335806273589214e-05, |
|
"loss": 0.4726, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.23875, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 4.329128024779812e-05, |
|
"loss": 0.4672, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 4.3224215685535294e-05, |
|
"loss": 0.4467, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.24125, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 4.315687008332217e-05, |
|
"loss": 0.5019, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 0.2425, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 4.3089244479711236e-05, |
|
"loss": 0.526, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.24375, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 4.302133991757297e-05, |
|
"loss": 0.509, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.245, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 4.295315744407972e-05, |
|
"loss": 0.447, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.24625, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 4.2884698110689575e-05, |
|
"loss": 0.4927, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 0.2475, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 4.281596297313013e-05, |
|
"loss": 0.4891, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.24875, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 4.274695309138226e-05, |
|
"loss": 0.5046, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 4.267766952966369e-05, |
|
"loss": 0.4642, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.25125, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 4.260811335641266e-05, |
|
"loss": 0.4396, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 0.2525, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 4.25382856442714e-05, |
|
"loss": 0.4386, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.25375, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 4.2468187470069607e-05, |
|
"loss": 0.5335, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 0.255, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 4.2397819914807856e-05, |
|
"loss": 0.4703, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.25625, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 4.23271840636409e-05, |
|
"loss": 0.5011, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 0.2575, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 4.225628100586093e-05, |
|
"loss": 0.5234, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.25875, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 4.218511183488082e-05, |
|
"loss": 0.4749, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 4.211367764821722e-05, |
|
"loss": 0.5361, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.26125, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 4.2041979547473665e-05, |
|
"loss": 0.4458, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 0.2625, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 4.197001863832355e-05, |
|
"loss": 0.4517, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.26375, |
|
"grad_norm": 0.9921875, |
|
"learning_rate": 4.189779603049312e-05, |
|
"loss": 0.4571, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 0.265, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 4.182531283774434e-05, |
|
"loss": 0.487, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.26625, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 4.17525701778577e-05, |
|
"loss": 0.5186, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 0.2675, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 4.1679569172614996e-05, |
|
"loss": 0.4815, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.26875, |
|
"grad_norm": 0.98046875, |
|
"learning_rate": 4.1606310947782044e-05, |
|
"loss": 0.4563, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 1.0, |
|
"learning_rate": 4.1532796633091296e-05, |
|
"loss": 0.4585, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.27125, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 4.1459027362224436e-05, |
|
"loss": 0.4846, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 0.2725, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 4.138500427279485e-05, |
|
"loss": 0.505, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.27375, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 4.1310728506330174e-05, |
|
"loss": 0.4765, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 0.275, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 4.123620120825459e-05, |
|
"loss": 0.5105, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.27625, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 4.116142352787125e-05, |
|
"loss": 0.4193, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 0.2775, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 4.1086396618344476e-05, |
|
"loss": 0.4953, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.27875, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 4.101112163668203e-05, |
|
"loss": 0.4572, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 4.093559974371725e-05, |
|
"loss": 0.4247, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.28125, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 4.085983210409114e-05, |
|
"loss": 0.483, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 0.2825, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 4.0783819886234445e-05, |
|
"loss": 0.4787, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.28375, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 4.0707564262349595e-05, |
|
"loss": 0.4891, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 0.285, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 4.063106640839264e-05, |
|
"loss": 0.503, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.28625, |
|
"grad_norm": 1.4140625, |
|
"learning_rate": 4.05543275040551e-05, |
|
"loss": 0.5003, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 0.2875, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 4.047734873274586e-05, |
|
"loss": 0.5444, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.28875, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 4.040013128157275e-05, |
|
"loss": 0.4193, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 4.0322676341324415e-05, |
|
"loss": 0.497, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.29125, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 4.024498510645185e-05, |
|
"loss": 0.377, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 0.2925, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 4.0167058775049996e-05, |
|
"loss": 0.5118, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.29375, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 4.008889854883929e-05, |
|
"loss": 0.4941, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 0.295, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 4.0010505633147106e-05, |
|
"loss": 0.5302, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.29625, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 3.993188123688918e-05, |
|
"loss": 0.5273, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 0.2975, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 3.985302657255097e-05, |
|
"loss": 0.463, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.29875, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 3.977394285616893e-05, |
|
"loss": 0.5116, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 3.969463130731183e-05, |
|
"loss": 0.5089, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.30125, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 3.961509314906184e-05, |
|
"loss": 0.5043, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 0.3025, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 3.953532960799577e-05, |
|
"loss": 0.4877, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.30375, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 3.9455341914166075e-05, |
|
"loss": 0.5368, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 0.305, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 3.937513130108197e-05, |
|
"loss": 0.4303, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.30625, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 3.9294699005690305e-05, |
|
"loss": 0.4978, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 0.3075, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 3.92140462683566e-05, |
|
"loss": 0.4898, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.30875, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 3.913317433284582e-05, |
|
"loss": 0.4307, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 3.905208444630327e-05, |
|
"loss": 0.4599, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.31125, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 3.897077785923529e-05, |
|
"loss": 0.4449, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 0.3125, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 3.888925582549006e-05, |
|
"loss": 0.4508, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.31375, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 3.880751960223817e-05, |
|
"loss": 0.4523, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 0.315, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 3.87255704499533e-05, |
|
"loss": 0.4782, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.31625, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 3.864340963239275e-05, |
|
"loss": 0.4821, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 0.3175, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 3.856103841657797e-05, |
|
"loss": 0.393, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.31875, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 3.847845807277502e-05, |
|
"loss": 0.4731, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 3.8395669874474915e-05, |
|
"loss": 0.4644, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.32125, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 3.831267509837414e-05, |
|
"loss": 0.5069, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 0.3225, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 3.822947502435477e-05, |
|
"loss": 0.4767, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.32375, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 3.814607093546489e-05, |
|
"loss": 0.472, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 0.325, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 3.8062464117898724e-05, |
|
"loss": 0.4598, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.32625, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 3.7978655860976824e-05, |
|
"loss": 0.4794, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 0.3275, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 3.789464745712619e-05, |
|
"loss": 0.4728, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.32875, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 3.7810440201860334e-05, |
|
"loss": 0.4535, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 3.7726035393759285e-05, |
|
"loss": 0.4646, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.33125, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 3.764143433444962e-05, |
|
"loss": 0.4597, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 0.3325, |
|
"grad_norm": 0.9921875, |
|
"learning_rate": 3.755663832858432e-05, |
|
"loss": 0.516, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.33375, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 3.747164868382269e-05, |
|
"loss": 0.4492, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 0.335, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 3.7386466710810194e-05, |
|
"loss": 0.4644, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.33625, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 3.730109372315822e-05, |
|
"loss": 0.5028, |
|
"step": 1345 |
|
}, |
|
{ |
|
"epoch": 0.3375, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 3.721553103742388e-05, |
|
"loss": 0.424, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.33875, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 3.71297799730896e-05, |
|
"loss": 0.4592, |
|
"step": 1355 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 3.704384185254288e-05, |
|
"loss": 0.4678, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.34125, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 3.695771800105586e-05, |
|
"loss": 0.4809, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 0.3425, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 3.6871409746764865e-05, |
|
"loss": 0.5093, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.34375, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 3.678491842064995e-05, |
|
"loss": 0.4937, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 0.345, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 3.6698245356514335e-05, |
|
"loss": 0.4107, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.34625, |
|
"grad_norm": 1.0, |
|
"learning_rate": 3.661139189096391e-05, |
|
"loss": 0.4578, |
|
"step": 1385 |
|
}, |
|
{ |
|
"epoch": 0.3475, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 3.652435936338656e-05, |
|
"loss": 0.3964, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.34875, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 3.6437149115931514e-05, |
|
"loss": 0.5011, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 3.634976249348867e-05, |
|
"loss": 0.494, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.35125, |
|
"grad_norm": 1.125, |
|
"learning_rate": 3.626220084366786e-05, |
|
"loss": 0.4773, |
|
"step": 1405 |
|
}, |
|
{ |
|
"epoch": 0.3525, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 3.6174465516778035e-05, |
|
"loss": 0.4338, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.35375, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 3.608655786580647e-05, |
|
"loss": 0.4538, |
|
"step": 1415 |
|
}, |
|
{ |
|
"epoch": 0.355, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 3.599847924639788e-05, |
|
"loss": 0.4537, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.35625, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 3.591023101683355e-05, |
|
"loss": 0.448, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 0.3575, |
|
"grad_norm": 1.125, |
|
"learning_rate": 3.582181453801036e-05, |
|
"loss": 0.4645, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.35875, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 3.5733231173419754e-05, |
|
"loss": 0.4578, |
|
"step": 1435 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 1.0, |
|
"learning_rate": 3.564448228912682e-05, |
|
"loss": 0.4704, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.36125, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 3.555556925374914e-05, |
|
"loss": 0.4383, |
|
"step": 1445 |
|
}, |
|
{ |
|
"epoch": 0.3625, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 3.54664934384357e-05, |
|
"loss": 0.4192, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.36375, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 3.5377256216845785e-05, |
|
"loss": 0.5063, |
|
"step": 1455 |
|
}, |
|
{ |
|
"epoch": 0.365, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 3.528785896512772e-05, |
|
"loss": 0.4711, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.36625, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 3.519830306189773e-05, |
|
"loss": 0.4494, |
|
"step": 1465 |
|
}, |
|
{ |
|
"epoch": 0.3675, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 3.510858988821863e-05, |
|
"loss": 0.4972, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.36875, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 3.5018720827578524e-05, |
|
"loss": 0.4312, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 3.4928697265869515e-05, |
|
"loss": 0.4267, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.37125, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 3.483852059136629e-05, |
|
"loss": 0.4563, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 0.3725, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 3.474819219470471e-05, |
|
"loss": 0.4642, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.37375, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 3.4657713468860405e-05, |
|
"loss": 0.414, |
|
"step": 1495 |
|
}, |
|
{ |
|
"epoch": 0.375, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 3.456708580912725e-05, |
|
"loss": 0.4919, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.37625, |
|
"grad_norm": 1.25, |
|
"learning_rate": 3.447631061309587e-05, |
|
"loss": 0.5023, |
|
"step": 1505 |
|
}, |
|
{ |
|
"epoch": 0.3775, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 3.438538928063208e-05, |
|
"loss": 0.469, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.37875, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 3.4294323213855305e-05, |
|
"loss": 0.4322, |
|
"step": 1515 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 3.4203113817116957e-05, |
|
"loss": 0.4393, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.38125, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 3.411176249697875e-05, |
|
"loss": 0.4005, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 0.3825, |
|
"grad_norm": 1.4609375, |
|
"learning_rate": 3.402027066219105e-05, |
|
"loss": 0.4094, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.38375, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 3.392863972367114e-05, |
|
"loss": 0.4474, |
|
"step": 1535 |
|
}, |
|
{ |
|
"epoch": 0.385, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 3.383687109448143e-05, |
|
"loss": 0.399, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.38625, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 3.374496618980772e-05, |
|
"loss": 0.4342, |
|
"step": 1545 |
|
}, |
|
{ |
|
"epoch": 0.3875, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 3.365292642693732e-05, |
|
"loss": 0.4847, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.38875, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 3.356075322523725e-05, |
|
"loss": 0.4343, |
|
"step": 1555 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.9921875, |
|
"learning_rate": 3.346844800613229e-05, |
|
"loss": 0.498, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.39125, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 3.33760121930831e-05, |
|
"loss": 0.4737, |
|
"step": 1565 |
|
}, |
|
{ |
|
"epoch": 0.3925, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 3.3283447211564276e-05, |
|
"loss": 0.4965, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.39375, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 3.319075448904234e-05, |
|
"loss": 0.4626, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 0.395, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 3.309793545495374e-05, |
|
"loss": 0.5161, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.39625, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 3.3004991540682796e-05, |
|
"loss": 0.4371, |
|
"step": 1585 |
|
}, |
|
{ |
|
"epoch": 0.3975, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 3.2911924179539656e-05, |
|
"loss": 0.4427, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.39875, |
|
"grad_norm": 0.93359375, |
|
"learning_rate": 3.281873480673815e-05, |
|
"loss": 0.4318, |
|
"step": 1595 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 3.272542485937369e-05, |
|
"loss": 0.4756, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.40125, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 3.2631995776401094e-05, |
|
"loss": 0.4507, |
|
"step": 1605 |
|
}, |
|
{ |
|
"epoch": 0.4025, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 3.253844899861239e-05, |
|
"loss": 0.4444, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.40375, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 3.244478596861464e-05, |
|
"loss": 0.4291, |
|
"step": 1615 |
|
}, |
|
{ |
|
"epoch": 0.405, |
|
"grad_norm": 1.125, |
|
"learning_rate": 3.23510081308076e-05, |
|
"loss": 0.4615, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.40625, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 3.225711693136156e-05, |
|
"loss": 0.4347, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 0.4075, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 3.2163113818194964e-05, |
|
"loss": 0.4349, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.40875, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 3.206900024095208e-05, |
|
"loss": 0.4814, |
|
"step": 1635 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 3.1974777650980735e-05, |
|
"loss": 0.502, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.41125, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 3.188044750130979e-05, |
|
"loss": 0.4457, |
|
"step": 1645 |
|
}, |
|
{ |
|
"epoch": 0.4125, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 3.178601124662686e-05, |
|
"loss": 0.505, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.41375, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 3.169147034325582e-05, |
|
"loss": 0.4941, |
|
"step": 1655 |
|
}, |
|
{ |
|
"epoch": 0.415, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 3.1596826249134324e-05, |
|
"loss": 0.4524, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.41625, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 3.150208042379142e-05, |
|
"loss": 0.4826, |
|
"step": 1665 |
|
}, |
|
{ |
|
"epoch": 0.4175, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 3.140723432832492e-05, |
|
"loss": 0.4101, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.41875, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 3.131228942537895e-05, |
|
"loss": 0.4068, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 3.121724717912138e-05, |
|
"loss": 0.4341, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.42125, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 3.112210905522119e-05, |
|
"loss": 0.4197, |
|
"step": 1685 |
|
}, |
|
{ |
|
"epoch": 0.4225, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 3.102687652082597e-05, |
|
"loss": 0.4257, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.42375, |
|
"grad_norm": 1.25, |
|
"learning_rate": 3.0931551044539194e-05, |
|
"loss": 0.4513, |
|
"step": 1695 |
|
}, |
|
{ |
|
"epoch": 0.425, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 3.083613409639764e-05, |
|
"loss": 0.4757, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.42625, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 3.0740627147848675e-05, |
|
"loss": 0.441, |
|
"step": 1705 |
|
}, |
|
{ |
|
"epoch": 0.4275, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 3.06450316717276e-05, |
|
"loss": 0.4249, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.42875, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 3.05493491422349e-05, |
|
"loss": 0.4189, |
|
"step": 1715 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 3.045358103491357e-05, |
|
"loss": 0.4315, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.43125, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 3.035772882662627e-05, |
|
"loss": 0.4641, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 0.4325, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 3.026179399553264e-05, |
|
"loss": 0.4701, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.43375, |
|
"grad_norm": 1.421875, |
|
"learning_rate": 3.0165778021066453e-05, |
|
"loss": 0.4827, |
|
"step": 1735 |
|
}, |
|
{ |
|
"epoch": 0.435, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 3.0069682383912813e-05, |
|
"loss": 0.4439, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.43625, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 2.9973508565985313e-05, |
|
"loss": 0.4916, |
|
"step": 1745 |
|
}, |
|
{ |
|
"epoch": 0.4375, |
|
"grad_norm": 1.4140625, |
|
"learning_rate": 2.9877258050403212e-05, |
|
"loss": 0.464, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.43875, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 2.9780932321468515e-05, |
|
"loss": 0.4105, |
|
"step": 1755 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 2.9684532864643122e-05, |
|
"loss": 0.4312, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.44125, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 2.9588061166525914e-05, |
|
"loss": 0.4465, |
|
"step": 1765 |
|
}, |
|
{ |
|
"epoch": 0.4425, |
|
"grad_norm": 1.5, |
|
"learning_rate": 2.949151871482982e-05, |
|
"loss": 0.4136, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.44375, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 2.9394906998358868e-05, |
|
"loss": 0.4107, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 0.445, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 2.929822750698524e-05, |
|
"loss": 0.4327, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.44625, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 2.92014817316263e-05, |
|
"loss": 0.4597, |
|
"step": 1785 |
|
}, |
|
{ |
|
"epoch": 0.4475, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 2.9104671164221576e-05, |
|
"loss": 0.4685, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.44875, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 2.9007797297709782e-05, |
|
"loss": 0.451, |
|
"step": 1795 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 2.8910861626005776e-05, |
|
"loss": 0.4101, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.45125, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 2.8813865643977526e-05, |
|
"loss": 0.4775, |
|
"step": 1805 |
|
}, |
|
{ |
|
"epoch": 0.4525, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 2.871681084742308e-05, |
|
"loss": 0.4588, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.45375, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 2.8619698733047447e-05, |
|
"loss": 0.4476, |
|
"step": 1815 |
|
}, |
|
{ |
|
"epoch": 0.455, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 2.8522530798439567e-05, |
|
"loss": 0.4375, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.45625, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 2.8425308542049206e-05, |
|
"loss": 0.422, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 0.4575, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 2.832803346316381e-05, |
|
"loss": 0.4887, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.45875, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 2.8230707061885443e-05, |
|
"loss": 0.4136, |
|
"step": 1835 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 2.8133330839107608e-05, |
|
"loss": 0.4236, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.46125, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 2.803590629649212e-05, |
|
"loss": 0.4983, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 0.4625, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 2.7938434936445945e-05, |
|
"loss": 0.4988, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.46375, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 2.784091826209803e-05, |
|
"loss": 0.4337, |
|
"step": 1855 |
|
}, |
|
{ |
|
"epoch": 0.465, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 2.774335777727613e-05, |
|
"loss": 0.4574, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.46625, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 2.764575498648362e-05, |
|
"loss": 0.4606, |
|
"step": 1865 |
|
}, |
|
{ |
|
"epoch": 0.4675, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 2.754811139487625e-05, |
|
"loss": 0.4489, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.46875, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 2.7450428508239024e-05, |
|
"loss": 0.4016, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 2.7352707832962865e-05, |
|
"loss": 0.4191, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.47125, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 2.725495087602148e-05, |
|
"loss": 0.5397, |
|
"step": 1885 |
|
}, |
|
{ |
|
"epoch": 0.4725, |
|
"grad_norm": 1.4609375, |
|
"learning_rate": 2.7157159144948092e-05, |
|
"loss": 0.4646, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.47375, |
|
"grad_norm": 0.9453125, |
|
"learning_rate": 2.7059334147812142e-05, |
|
"loss": 0.4443, |
|
"step": 1895 |
|
}, |
|
{ |
|
"epoch": 0.475, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 2.6961477393196126e-05, |
|
"loss": 0.4943, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.47625, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 2.6863590390172243e-05, |
|
"loss": 0.4654, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 0.4775, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 2.6765674648279172e-05, |
|
"loss": 0.4517, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.47875, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 2.666773167749878e-05, |
|
"loss": 0.4525, |
|
"step": 1915 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 2.656976298823284e-05, |
|
"loss": 0.4676, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.48125, |
|
"grad_norm": 1.4609375, |
|
"learning_rate": 2.6471770091279724e-05, |
|
"loss": 0.495, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 0.4825, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 2.637375449781115e-05, |
|
"loss": 0.4322, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.48375, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 2.627571771934879e-05, |
|
"loss": 0.4147, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 0.485, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 2.6177661267741065e-05, |
|
"loss": 0.4204, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.48625, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 2.607958665513976e-05, |
|
"loss": 0.4245, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 0.4875, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 2.598149539397672e-05, |
|
"loss": 0.4582, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.48875, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 2.5883388996940534e-05, |
|
"loss": 0.4445, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 1.25, |
|
"learning_rate": 2.578526897695321e-05, |
|
"loss": 0.4533, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.49125, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 2.5687136847146838e-05, |
|
"loss": 0.4334, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 0.4925, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 2.558899412084026e-05, |
|
"loss": 0.434, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.49375, |
|
"grad_norm": 1.25, |
|
"learning_rate": 2.5490842311515707e-05, |
|
"loss": 0.4257, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 0.495, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 2.539268293279552e-05, |
|
"loss": 0.4503, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.49625, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 2.529451749841873e-05, |
|
"loss": 0.5045, |
|
"step": 1985 |
|
}, |
|
{ |
|
"epoch": 0.4975, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 2.5196347522217784e-05, |
|
"loss": 0.4307, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.49875, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 2.509817451809515e-05, |
|
"loss": 0.4701, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 2.5e-05, |
|
"loss": 0.4573, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.50125, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 2.4901825481904855e-05, |
|
"loss": 0.4304, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 0.5025, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 2.480365247778223e-05, |
|
"loss": 0.4334, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.50375, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 2.4705482501581266e-05, |
|
"loss": 0.4507, |
|
"step": 2015 |
|
}, |
|
{ |
|
"epoch": 0.505, |
|
"grad_norm": 1.3984375, |
|
"learning_rate": 2.460731706720449e-05, |
|
"loss": 0.4555, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.50625, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 2.4509157688484295e-05, |
|
"loss": 0.4791, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 0.5075, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 2.4411005879159753e-05, |
|
"loss": 0.4324, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.50875, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 2.4312863152853165e-05, |
|
"loss": 0.4534, |
|
"step": 2035 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 2.4214731023046793e-05, |
|
"loss": 0.4411, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.51125, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 2.4116611003059472e-05, |
|
"loss": 0.4333, |
|
"step": 2045 |
|
}, |
|
{ |
|
"epoch": 0.5125, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 2.4018504606023293e-05, |
|
"loss": 0.4231, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.51375, |
|
"grad_norm": 1.4140625, |
|
"learning_rate": 2.392041334486024e-05, |
|
"loss": 0.3752, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 0.515, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 2.3822338732258937e-05, |
|
"loss": 0.4876, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.51625, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 2.3724282280651214e-05, |
|
"loss": 0.3989, |
|
"step": 2065 |
|
}, |
|
{ |
|
"epoch": 0.5175, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 2.3626245502188864e-05, |
|
"loss": 0.4102, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.51875, |
|
"grad_norm": 0.9921875, |
|
"learning_rate": 2.3528229908720272e-05, |
|
"loss": 0.3997, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 2.3430237011767167e-05, |
|
"loss": 0.4009, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.52125, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 2.3332268322501228e-05, |
|
"loss": 0.4769, |
|
"step": 2085 |
|
}, |
|
{ |
|
"epoch": 0.5225, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 2.323432535172084e-05, |
|
"loss": 0.4405, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.52375, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 2.313640960982776e-05, |
|
"loss": 0.4436, |
|
"step": 2095 |
|
}, |
|
{ |
|
"epoch": 0.525, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 2.303852260680388e-05, |
|
"loss": 0.4027, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.52625, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 2.294066585218786e-05, |
|
"loss": 0.4086, |
|
"step": 2105 |
|
}, |
|
{ |
|
"epoch": 0.5275, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 2.284284085505192e-05, |
|
"loss": 0.4262, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.52875, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 2.274504912397852e-05, |
|
"loss": 0.4605, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 2.2647292167037144e-05, |
|
"loss": 0.4534, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.53125, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 2.2549571491760986e-05, |
|
"loss": 0.3628, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 0.5325, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 2.2451888605123754e-05, |
|
"loss": 0.4879, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.53375, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 2.2354245013516393e-05, |
|
"loss": 0.4517, |
|
"step": 2135 |
|
}, |
|
{ |
|
"epoch": 0.535, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 2.225664222272387e-05, |
|
"loss": 0.4303, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.53625, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 2.2159081737901975e-05, |
|
"loss": 0.4172, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 0.5375, |
|
"grad_norm": 1.125, |
|
"learning_rate": 2.2061565063554064e-05, |
|
"loss": 0.4169, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.53875, |
|
"grad_norm": 1.25, |
|
"learning_rate": 2.1964093703507893e-05, |
|
"loss": 0.4839, |
|
"step": 2155 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 2.186666916089239e-05, |
|
"loss": 0.3919, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.54125, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 2.1769292938114563e-05, |
|
"loss": 0.4435, |
|
"step": 2165 |
|
}, |
|
{ |
|
"epoch": 0.5425, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 2.1671966536836196e-05, |
|
"loss": 0.4902, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.54375, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 2.1574691457950803e-05, |
|
"loss": 0.4667, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 0.545, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 2.1477469201560435e-05, |
|
"loss": 0.3795, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.54625, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 2.1380301266952556e-05, |
|
"loss": 0.4658, |
|
"step": 2185 |
|
}, |
|
{ |
|
"epoch": 0.5475, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 2.1283189152576925e-05, |
|
"loss": 0.4589, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.54875, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 2.118613435602248e-05, |
|
"loss": 0.4394, |
|
"step": 2195 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 2.1089138373994223e-05, |
|
"loss": 0.4321, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.55125, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 2.0992202702290227e-05, |
|
"loss": 0.4084, |
|
"step": 2205 |
|
}, |
|
{ |
|
"epoch": 0.5525, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 2.089532883577843e-05, |
|
"loss": 0.4489, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 0.55375, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 2.0798518268373706e-05, |
|
"loss": 0.4403, |
|
"step": 2215 |
|
}, |
|
{ |
|
"epoch": 0.555, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 2.070177249301476e-05, |
|
"loss": 0.4286, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.55625, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 2.0605093001641138e-05, |
|
"loss": 0.4557, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 0.5575, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 2.0508481285170186e-05, |
|
"loss": 0.4686, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 0.55875, |
|
"grad_norm": 1.375, |
|
"learning_rate": 2.04119388334741e-05, |
|
"loss": 0.4402, |
|
"step": 2235 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 2.031546713535688e-05, |
|
"loss": 0.3973, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.56125, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 2.0219067678531494e-05, |
|
"loss": 0.4349, |
|
"step": 2245 |
|
}, |
|
{ |
|
"epoch": 0.5625, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 2.0122741949596797e-05, |
|
"loss": 0.4329, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.56375, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 2.002649143401469e-05, |
|
"loss": 0.4402, |
|
"step": 2255 |
|
}, |
|
{ |
|
"epoch": 0.565, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 1.9930317616087196e-05, |
|
"loss": 0.4342, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.56625, |
|
"grad_norm": 0.98046875, |
|
"learning_rate": 1.9834221978933543e-05, |
|
"loss": 0.4537, |
|
"step": 2265 |
|
}, |
|
{ |
|
"epoch": 0.5675, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 1.9738206004467363e-05, |
|
"loss": 0.4597, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 0.56875, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 1.9642271173373737e-05, |
|
"loss": 0.4372, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 1.9546418965086442e-05, |
|
"loss": 0.4062, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.57125, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 1.9450650857765102e-05, |
|
"loss": 0.4698, |
|
"step": 2285 |
|
}, |
|
{ |
|
"epoch": 0.5725, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 1.935496832827241e-05, |
|
"loss": 0.4312, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 0.57375, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 1.925937285215133e-05, |
|
"loss": 0.4643, |
|
"step": 2295 |
|
}, |
|
{ |
|
"epoch": 0.575, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 1.9163865903602374e-05, |
|
"loss": 0.4256, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.57625, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 1.9068448955460805e-05, |
|
"loss": 0.3879, |
|
"step": 2305 |
|
}, |
|
{ |
|
"epoch": 0.5775, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 1.897312347917404e-05, |
|
"loss": 0.4048, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.57875, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 1.8877890944778815e-05, |
|
"loss": 0.4572, |
|
"step": 2315 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 1.8782752820878634e-05, |
|
"loss": 0.4634, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.58125, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 1.868771057462105e-05, |
|
"loss": 0.3985, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 0.5825, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 1.8592765671675084e-05, |
|
"loss": 0.4463, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 0.58375, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 1.8497919576208585e-05, |
|
"loss": 0.4083, |
|
"step": 2335 |
|
}, |
|
{ |
|
"epoch": 0.585, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 1.8403173750865685e-05, |
|
"loss": 0.3929, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.58625, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 1.830852965674419e-05, |
|
"loss": 0.4659, |
|
"step": 2345 |
|
}, |
|
{ |
|
"epoch": 0.5875, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 1.8213988753373146e-05, |
|
"loss": 0.3986, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.58875, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 1.8119552498690215e-05, |
|
"loss": 0.4043, |
|
"step": 2355 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 1.802522234901927e-05, |
|
"loss": 0.449, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.59125, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 1.793099975904791e-05, |
|
"loss": 0.4178, |
|
"step": 2365 |
|
}, |
|
{ |
|
"epoch": 0.5925, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 1.783688618180504e-05, |
|
"loss": 0.4422, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 0.59375, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 1.7742883068638447e-05, |
|
"loss": 0.4666, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 0.595, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 1.7648991869192405e-05, |
|
"loss": 0.4226, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.59625, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 1.7555214031385375e-05, |
|
"loss": 0.408, |
|
"step": 2385 |
|
}, |
|
{ |
|
"epoch": 0.5975, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 1.746155100138761e-05, |
|
"loss": 0.4778, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 0.59875, |
|
"grad_norm": 1.25, |
|
"learning_rate": 1.7368004223598912e-05, |
|
"loss": 0.4059, |
|
"step": 2395 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 1.7274575140626318e-05, |
|
"loss": 0.4398, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.60125, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 1.7181265193261865e-05, |
|
"loss": 0.482, |
|
"step": 2405 |
|
}, |
|
{ |
|
"epoch": 0.6025, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 1.7088075820460346e-05, |
|
"loss": 0.4192, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 0.60375, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 1.6995008459317206e-05, |
|
"loss": 0.4748, |
|
"step": 2415 |
|
}, |
|
{ |
|
"epoch": 0.605, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 1.690206454504627e-05, |
|
"loss": 0.4276, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 0.60625, |
|
"grad_norm": 1.125, |
|
"learning_rate": 1.6809245510957665e-05, |
|
"loss": 0.3548, |
|
"step": 2425 |
|
}, |
|
{ |
|
"epoch": 0.6075, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 1.6716552788435724e-05, |
|
"loss": 0.4122, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 0.60875, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 1.66239878069169e-05, |
|
"loss": 0.432, |
|
"step": 2435 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 1.6531551993867717e-05, |
|
"loss": 0.4467, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 0.61125, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 1.643924677476276e-05, |
|
"loss": 0.4652, |
|
"step": 2445 |
|
}, |
|
{ |
|
"epoch": 0.6125, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 1.6347073573062672e-05, |
|
"loss": 0.4024, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.61375, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 1.6255033810192282e-05, |
|
"loss": 0.3974, |
|
"step": 2455 |
|
}, |
|
{ |
|
"epoch": 0.615, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 1.6163128905518578e-05, |
|
"loss": 0.3891, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 0.61625, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 1.6071360276328874e-05, |
|
"loss": 0.3499, |
|
"step": 2465 |
|
}, |
|
{ |
|
"epoch": 0.6175, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 1.5979729337808955e-05, |
|
"loss": 0.4386, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 0.61875, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 1.588823750302126e-05, |
|
"loss": 0.4494, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 1.5796886182883053e-05, |
|
"loss": 0.4025, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 0.62125, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 1.57056767861447e-05, |
|
"loss": 0.4192, |
|
"step": 2485 |
|
}, |
|
{ |
|
"epoch": 0.6225, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 1.561461071936792e-05, |
|
"loss": 0.4509, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 0.62375, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 1.552368938690414e-05, |
|
"loss": 0.3897, |
|
"step": 2495 |
|
}, |
|
{ |
|
"epoch": 0.625, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 1.5432914190872757e-05, |
|
"loss": 0.473, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.62625, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 1.5342286531139605e-05, |
|
"loss": 0.4333, |
|
"step": 2505 |
|
}, |
|
{ |
|
"epoch": 0.6275, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 1.5251807805295302e-05, |
|
"loss": 0.4245, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 0.62875, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 1.5161479408633713e-05, |
|
"loss": 0.4342, |
|
"step": 2515 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 1.5071302734130489e-05, |
|
"loss": 0.3951, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.63125, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 1.498127917242148e-05, |
|
"loss": 0.4367, |
|
"step": 2525 |
|
}, |
|
{ |
|
"epoch": 0.6325, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 1.4891410111781378e-05, |
|
"loss": 0.4766, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 0.63375, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 1.4801696938102272e-05, |
|
"loss": 0.373, |
|
"step": 2535 |
|
}, |
|
{ |
|
"epoch": 0.635, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 1.4712141034872282e-05, |
|
"loss": 0.3804, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 0.63625, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 1.4622743783154223e-05, |
|
"loss": 0.4206, |
|
"step": 2545 |
|
}, |
|
{ |
|
"epoch": 0.6375, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 1.4533506561564306e-05, |
|
"loss": 0.4585, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.63875, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 1.4444430746250867e-05, |
|
"loss": 0.3796, |
|
"step": 2555 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 1.4355517710873184e-05, |
|
"loss": 0.4296, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 0.64125, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 1.4266768826580257e-05, |
|
"loss": 0.5008, |
|
"step": 2565 |
|
}, |
|
{ |
|
"epoch": 0.6425, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 1.4178185461989662e-05, |
|
"loss": 0.3952, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 0.64375, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 1.4089768983166444e-05, |
|
"loss": 0.4494, |
|
"step": 2575 |
|
}, |
|
{ |
|
"epoch": 0.645, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 1.4001520753602121e-05, |
|
"loss": 0.3944, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 0.64625, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 1.3913442134193544e-05, |
|
"loss": 0.4276, |
|
"step": 2585 |
|
}, |
|
{ |
|
"epoch": 0.6475, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 1.3825534483221974e-05, |
|
"loss": 0.4433, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 0.64875, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 1.3737799156332143e-05, |
|
"loss": 0.3992, |
|
"step": 2595 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 1.3650237506511331e-05, |
|
"loss": 0.4488, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.65125, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 1.3562850884068487e-05, |
|
"loss": 0.4243, |
|
"step": 2605 |
|
}, |
|
{ |
|
"epoch": 0.6525, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 1.3475640636613446e-05, |
|
"loss": 0.3477, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 0.65375, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 1.3388608109036086e-05, |
|
"loss": 0.4413, |
|
"step": 2615 |
|
}, |
|
{ |
|
"epoch": 0.655, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 1.330175464348567e-05, |
|
"loss": 0.4487, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 0.65625, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 1.3215081579350058e-05, |
|
"loss": 0.4122, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 0.6575, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 1.312859025323514e-05, |
|
"loss": 0.424, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 0.65875, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 1.3042281998944151e-05, |
|
"loss": 0.4013, |
|
"step": 2635 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 1.9375, |
|
"learning_rate": 1.2956158147457115e-05, |
|
"loss": 0.5066, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 0.66125, |
|
"grad_norm": 1.25, |
|
"learning_rate": 1.2870220026910407e-05, |
|
"loss": 0.3935, |
|
"step": 2645 |
|
}, |
|
{ |
|
"epoch": 0.6625, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 1.2784468962576136e-05, |
|
"loss": 0.4039, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.66375, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 1.2698906276841776e-05, |
|
"loss": 0.4817, |
|
"step": 2655 |
|
}, |
|
{ |
|
"epoch": 0.665, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 1.261353328918981e-05, |
|
"loss": 0.3917, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 0.66625, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 1.2528351316177319e-05, |
|
"loss": 0.425, |
|
"step": 2665 |
|
}, |
|
{ |
|
"epoch": 0.6675, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 1.2443361671415687e-05, |
|
"loss": 0.4234, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 0.66875, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 1.235856566555039e-05, |
|
"loss": 0.4414, |
|
"step": 2675 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 1.2273964606240718e-05, |
|
"loss": 0.4563, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 0.67125, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 1.2189559798139682e-05, |
|
"loss": 0.4132, |
|
"step": 2685 |
|
}, |
|
{ |
|
"epoch": 0.6725, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 1.2105352542873815e-05, |
|
"loss": 0.4317, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 0.67375, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 1.2021344139023186e-05, |
|
"loss": 0.4073, |
|
"step": 2695 |
|
}, |
|
{ |
|
"epoch": 0.675, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 1.1937535882101281e-05, |
|
"loss": 0.4147, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.67625, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 1.1853929064535111e-05, |
|
"loss": 0.4394, |
|
"step": 2705 |
|
}, |
|
{ |
|
"epoch": 0.6775, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 1.1770524975645238e-05, |
|
"loss": 0.461, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 0.67875, |
|
"grad_norm": 1.0, |
|
"learning_rate": 1.1687324901625879e-05, |
|
"loss": 0.4279, |
|
"step": 2715 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 1.1604330125525079e-05, |
|
"loss": 0.4201, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 0.68125, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 1.1521541927224994e-05, |
|
"loss": 0.4392, |
|
"step": 2725 |
|
}, |
|
{ |
|
"epoch": 0.6825, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 1.1438961583422037e-05, |
|
"loss": 0.4064, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 0.68375, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 1.1356590367607252e-05, |
|
"loss": 0.4081, |
|
"step": 2735 |
|
}, |
|
{ |
|
"epoch": 0.685, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 1.1274429550046704e-05, |
|
"loss": 0.4629, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 0.68625, |
|
"grad_norm": 0.9453125, |
|
"learning_rate": 1.1192480397761837e-05, |
|
"loss": 0.3942, |
|
"step": 2745 |
|
}, |
|
{ |
|
"epoch": 0.6875, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 1.1110744174509952e-05, |
|
"loss": 0.4581, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.68875, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 1.1029222140764712e-05, |
|
"loss": 0.4079, |
|
"step": 2755 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 1.0947915553696742e-05, |
|
"loss": 0.3924, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 0.69125, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 1.0866825667154182e-05, |
|
"loss": 0.3715, |
|
"step": 2765 |
|
}, |
|
{ |
|
"epoch": 0.6925, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 1.07859537316434e-05, |
|
"loss": 0.4238, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 0.69375, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 1.0705300994309697e-05, |
|
"loss": 0.4465, |
|
"step": 2775 |
|
}, |
|
{ |
|
"epoch": 0.695, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 1.0624868698918045e-05, |
|
"loss": 0.4295, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 0.69625, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 1.0544658085833919e-05, |
|
"loss": 0.4527, |
|
"step": 2785 |
|
}, |
|
{ |
|
"epoch": 0.6975, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 1.0464670392004235e-05, |
|
"loss": 0.4721, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 0.69875, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 1.0384906850938166e-05, |
|
"loss": 0.4632, |
|
"step": 2795 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 1.0305368692688174e-05, |
|
"loss": 0.4382, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.70125, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 1.0226057143831064e-05, |
|
"loss": 0.4699, |
|
"step": 2805 |
|
}, |
|
{ |
|
"epoch": 0.7025, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 1.0146973427449038e-05, |
|
"loss": 0.4368, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 0.70375, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 1.0068118763110824e-05, |
|
"loss": 0.4513, |
|
"step": 2815 |
|
}, |
|
{ |
|
"epoch": 0.705, |
|
"grad_norm": 1.0, |
|
"learning_rate": 9.989494366852904e-06, |
|
"loss": 0.3863, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 0.70625, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 9.911101451160715e-06, |
|
"loss": 0.3907, |
|
"step": 2825 |
|
}, |
|
{ |
|
"epoch": 0.7075, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 9.832941224950012e-06, |
|
"loss": 0.4537, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 0.70875, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 9.755014893548157e-06, |
|
"loss": 0.4082, |
|
"step": 2835 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 9.677323658675594e-06, |
|
"loss": 0.3992, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 0.71125, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 9.599868718427257e-06, |
|
"loss": 0.4512, |
|
"step": 2845 |
|
}, |
|
{ |
|
"epoch": 0.7125, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 9.522651267254149e-06, |
|
"loss": 0.419, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.71375, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 9.445672495944899e-06, |
|
"loss": 0.4542, |
|
"step": 2855 |
|
}, |
|
{ |
|
"epoch": 0.715, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 9.368933591607378e-06, |
|
"loss": 0.4554, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 0.71625, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 9.292435737650407e-06, |
|
"loss": 0.4158, |
|
"step": 2865 |
|
}, |
|
{ |
|
"epoch": 0.7175, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 9.216180113765558e-06, |
|
"loss": 0.4145, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 0.71875, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 9.140167895908867e-06, |
|
"loss": 0.4276, |
|
"step": 2875 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 1.125, |
|
"learning_rate": 9.064400256282757e-06, |
|
"loss": 0.4477, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 0.72125, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 8.988878363317979e-06, |
|
"loss": 0.4563, |
|
"step": 2885 |
|
}, |
|
{ |
|
"epoch": 0.7225, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 8.913603381655528e-06, |
|
"loss": 0.4396, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 0.72375, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 8.838576472128756e-06, |
|
"loss": 0.4831, |
|
"step": 2895 |
|
}, |
|
{ |
|
"epoch": 0.725, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 8.763798791745411e-06, |
|
"loss": 0.4437, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.72625, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 8.689271493669837e-06, |
|
"loss": 0.4639, |
|
"step": 2905 |
|
}, |
|
{ |
|
"epoch": 0.7275, |
|
"grad_norm": 1.421875, |
|
"learning_rate": 8.614995727205156e-06, |
|
"loss": 0.4215, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 0.72875, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 8.540972637775572e-06, |
|
"loss": 0.4615, |
|
"step": 2915 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 8.467203366908707e-06, |
|
"loss": 0.4043, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 0.73125, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 8.393689052217966e-06, |
|
"loss": 0.4634, |
|
"step": 2925 |
|
}, |
|
{ |
|
"epoch": 0.7325, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 8.320430827385003e-06, |
|
"loss": 0.4411, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 0.73375, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 8.24742982214231e-06, |
|
"loss": 0.3556, |
|
"step": 2935 |
|
}, |
|
{ |
|
"epoch": 0.735, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 8.174687162255672e-06, |
|
"loss": 0.4456, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 0.73625, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 8.102203969506886e-06, |
|
"loss": 0.4254, |
|
"step": 2945 |
|
}, |
|
{ |
|
"epoch": 0.7375, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 8.029981361676456e-06, |
|
"loss": 0.427, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.73875, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 7.958020452526346e-06, |
|
"loss": 0.4323, |
|
"step": 2955 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 1.125, |
|
"learning_rate": 7.886322351782783e-06, |
|
"loss": 0.3968, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 0.74125, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 7.814888165119186e-06, |
|
"loss": 0.4628, |
|
"step": 2965 |
|
}, |
|
{ |
|
"epoch": 0.7425, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 7.743718994139071e-06, |
|
"loss": 0.4388, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 0.74375, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 7.672815936359107e-06, |
|
"loss": 0.4029, |
|
"step": 2975 |
|
}, |
|
{ |
|
"epoch": 0.745, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 7.602180085192143e-06, |
|
"loss": 0.4214, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 0.74625, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 7.531812529930398e-06, |
|
"loss": 0.4165, |
|
"step": 2985 |
|
}, |
|
{ |
|
"epoch": 0.7475, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 7.461714355728608e-06, |
|
"loss": 0.4016, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 0.74875, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 7.391886643587342e-06, |
|
"loss": 0.4527, |
|
"step": 2995 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 7.3223304703363135e-06, |
|
"loss": 0.4143, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.75125, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 7.253046908617747e-06, |
|
"loss": 0.4667, |
|
"step": 3005 |
|
}, |
|
{ |
|
"epoch": 0.7525, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 7.184037026869867e-06, |
|
"loss": 0.4032, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 0.75375, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 7.115301889310427e-06, |
|
"loss": 0.433, |
|
"step": 3015 |
|
}, |
|
{ |
|
"epoch": 0.755, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 7.046842555920283e-06, |
|
"loss": 0.4017, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 0.75625, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 6.9786600824270296e-06, |
|
"loss": 0.4006, |
|
"step": 3025 |
|
}, |
|
{ |
|
"epoch": 0.7575, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 6.91075552028877e-06, |
|
"loss": 0.4536, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 0.75875, |
|
"grad_norm": 1.4609375, |
|
"learning_rate": 6.84312991667784e-06, |
|
"loss": 0.4295, |
|
"step": 3035 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 6.775784314464717e-06, |
|
"loss": 0.4216, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 0.76125, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 6.708719752201884e-06, |
|
"loss": 0.4071, |
|
"step": 3045 |
|
}, |
|
{ |
|
"epoch": 0.7625, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 6.641937264107867e-06, |
|
"loss": 0.4518, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 0.76375, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 6.575437880051233e-06, |
|
"loss": 0.4776, |
|
"step": 3055 |
|
}, |
|
{ |
|
"epoch": 0.765, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 6.509222625534755e-06, |
|
"loss": 0.4084, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 0.76625, |
|
"grad_norm": 1.4609375, |
|
"learning_rate": 6.443292521679578e-06, |
|
"loss": 0.4825, |
|
"step": 3065 |
|
}, |
|
{ |
|
"epoch": 0.7675, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 6.377648585209456e-06, |
|
"loss": 0.4788, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 0.76875, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 6.312291828435077e-06, |
|
"loss": 0.4077, |
|
"step": 3075 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.98046875, |
|
"learning_rate": 6.247223259238511e-06, |
|
"loss": 0.4103, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 0.77125, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 6.182443881057576e-06, |
|
"loss": 0.4401, |
|
"step": 3085 |
|
}, |
|
{ |
|
"epoch": 0.7725, |
|
"grad_norm": 1.3984375, |
|
"learning_rate": 6.117954692870412e-06, |
|
"loss": 0.4628, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 0.77375, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 6.053756689180082e-06, |
|
"loss": 0.3789, |
|
"step": 3095 |
|
}, |
|
{ |
|
"epoch": 0.775, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 5.989850859999227e-06, |
|
"loss": 0.4261, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.77625, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 5.926238190834779e-06, |
|
"loss": 0.4548, |
|
"step": 3105 |
|
}, |
|
{ |
|
"epoch": 0.7775, |
|
"grad_norm": 1.125, |
|
"learning_rate": 5.8629196626728e-06, |
|
"loss": 0.4496, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 0.77875, |
|
"grad_norm": 0.984375, |
|
"learning_rate": 5.7998962519633045e-06, |
|
"loss": 0.3764, |
|
"step": 3115 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 5.737168930605272e-06, |
|
"loss": 0.3888, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 0.78125, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 5.674738665931575e-06, |
|
"loss": 0.4209, |
|
"step": 3125 |
|
}, |
|
{ |
|
"epoch": 0.7825, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 5.612606420694141e-06, |
|
"loss": 0.4727, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 0.78375, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 5.550773153049046e-06, |
|
"loss": 0.4365, |
|
"step": 3135 |
|
}, |
|
{ |
|
"epoch": 0.785, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 5.489239816541755e-06, |
|
"loss": 0.4403, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 0.78625, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 5.428007360092463e-06, |
|
"loss": 0.4521, |
|
"step": 3145 |
|
}, |
|
{ |
|
"epoch": 0.7875, |
|
"grad_norm": 1.25, |
|
"learning_rate": 5.367076727981382e-06, |
|
"loss": 0.4657, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.78875, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 5.306448859834228e-06, |
|
"loss": 0.4367, |
|
"step": 3155 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 5.24612469060774e-06, |
|
"loss": 0.4053, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 0.79125, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 5.186105150575232e-06, |
|
"loss": 0.3926, |
|
"step": 3165 |
|
}, |
|
{ |
|
"epoch": 0.7925, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 5.12639116531225e-06, |
|
"loss": 0.4534, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 0.79375, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 5.066983655682325e-06, |
|
"loss": 0.4551, |
|
"step": 3175 |
|
}, |
|
{ |
|
"epoch": 0.795, |
|
"grad_norm": 1.375, |
|
"learning_rate": 5.007883537822736e-06, |
|
"loss": 0.4066, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 0.79625, |
|
"grad_norm": 0.75, |
|
"learning_rate": 4.949091723130425e-06, |
|
"loss": 0.4247, |
|
"step": 3185 |
|
}, |
|
{ |
|
"epoch": 0.7975, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 4.890609118247888e-06, |
|
"loss": 0.4215, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 0.79875, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 4.832436625049256e-06, |
|
"loss": 0.4385, |
|
"step": 3195 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.98046875, |
|
"learning_rate": 4.7745751406263165e-06, |
|
"loss": 0.4393, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.80125, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 4.717025557274749e-06, |
|
"loss": 0.42, |
|
"step": 3205 |
|
}, |
|
{ |
|
"epoch": 0.8025, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 4.659788762480327e-06, |
|
"loss": 0.3758, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 0.80375, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 4.602865638905224e-06, |
|
"loss": 0.4448, |
|
"step": 3215 |
|
}, |
|
{ |
|
"epoch": 0.805, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 4.54625706437441e-06, |
|
"loss": 0.4453, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 0.80625, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 4.48996391186216e-06, |
|
"loss": 0.4359, |
|
"step": 3225 |
|
}, |
|
{ |
|
"epoch": 0.8075, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 4.433987049478508e-06, |
|
"loss": 0.3974, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 0.80875, |
|
"grad_norm": 1.125, |
|
"learning_rate": 4.378327340455915e-06, |
|
"loss": 0.4194, |
|
"step": 3235 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 4.322985643135952e-06, |
|
"loss": 0.4214, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 0.81125, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 4.267962810956061e-06, |
|
"loss": 0.3592, |
|
"step": 3245 |
|
}, |
|
{ |
|
"epoch": 0.8125, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 4.213259692436367e-06, |
|
"loss": 0.3997, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 0.81375, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 4.158877131166641e-06, |
|
"loss": 0.4471, |
|
"step": 3255 |
|
}, |
|
{ |
|
"epoch": 0.815, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 4.104815965793249e-06, |
|
"loss": 0.4293, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 0.81625, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 4.051077030006228e-06, |
|
"loss": 0.4562, |
|
"step": 3265 |
|
}, |
|
{ |
|
"epoch": 0.8175, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 3.9976611525264525e-06, |
|
"loss": 0.434, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 0.81875, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 3.944569157092839e-06, |
|
"loss": 0.4524, |
|
"step": 3275 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 3.891801862449629e-06, |
|
"loss": 0.4498, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 0.82125, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 3.839360082333771e-06, |
|
"loss": 0.4329, |
|
"step": 3285 |
|
}, |
|
{ |
|
"epoch": 0.8225, |
|
"grad_norm": 1.375, |
|
"learning_rate": 3.7872446254624104e-06, |
|
"loss": 0.3884, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 0.82375, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 3.735456295520348e-06, |
|
"loss": 0.4114, |
|
"step": 3295 |
|
}, |
|
{ |
|
"epoch": 0.825, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 3.6839958911476957e-06, |
|
"loss": 0.4404, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.82625, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 3.6328642059275524e-06, |
|
"loss": 0.4548, |
|
"step": 3305 |
|
}, |
|
{ |
|
"epoch": 0.8275, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 3.5820620283737616e-06, |
|
"loss": 0.4648, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 0.82875, |
|
"grad_norm": 0.875, |
|
"learning_rate": 3.5315901419187363e-06, |
|
"loss": 0.4233, |
|
"step": 3315 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 3.4814493249014116e-06, |
|
"loss": 0.4021, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 0.83125, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 3.431640350555204e-06, |
|
"loss": 0.4732, |
|
"step": 3325 |
|
}, |
|
{ |
|
"epoch": 0.8325, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 3.382163986996126e-06, |
|
"loss": 0.4174, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 0.83375, |
|
"grad_norm": 1.3984375, |
|
"learning_rate": 3.3330209972108976e-06, |
|
"loss": 0.4284, |
|
"step": 3335 |
|
}, |
|
{ |
|
"epoch": 0.835, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 3.284212139045223e-06, |
|
"loss": 0.4183, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 0.83625, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 3.2357381651920648e-06, |
|
"loss": 0.3996, |
|
"step": 3345 |
|
}, |
|
{ |
|
"epoch": 0.8375, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 3.187599823180071e-06, |
|
"loss": 0.4317, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 0.83875, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 3.139797855362031e-06, |
|
"loss": 0.4341, |
|
"step": 3355 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 3.092332998903416e-06, |
|
"loss": 0.4516, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 0.84125, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 3.0452059857710186e-06, |
|
"loss": 0.4371, |
|
"step": 3365 |
|
}, |
|
{ |
|
"epoch": 0.8425, |
|
"grad_norm": 0.9921875, |
|
"learning_rate": 2.9984175427217016e-06, |
|
"loss": 0.4346, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 0.84375, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 2.9519683912911266e-06, |
|
"loss": 0.3893, |
|
"step": 3375 |
|
}, |
|
{ |
|
"epoch": 0.845, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 2.9058592477826636e-06, |
|
"loss": 0.4086, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 0.84625, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 2.860090823256359e-06, |
|
"loss": 0.4211, |
|
"step": 3385 |
|
}, |
|
{ |
|
"epoch": 0.8475, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 2.8146638235179213e-06, |
|
"loss": 0.422, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 0.84875, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 2.769578949107893e-06, |
|
"loss": 0.4117, |
|
"step": 3395 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 2.7248368952908053e-06, |
|
"loss": 0.3836, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.85125, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 2.6804383520444815e-06, |
|
"loss": 0.3996, |
|
"step": 3405 |
|
}, |
|
{ |
|
"epoch": 0.8525, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 2.6363840040493747e-06, |
|
"loss": 0.4007, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 0.85375, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 2.5926745306780324e-06, |
|
"loss": 0.4431, |
|
"step": 3415 |
|
}, |
|
{ |
|
"epoch": 0.855, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 2.5493106059846116e-06, |
|
"loss": 0.4013, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 0.85625, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 2.506292898694468e-06, |
|
"loss": 0.4748, |
|
"step": 3425 |
|
}, |
|
{ |
|
"epoch": 0.8575, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 2.4636220721938554e-06, |
|
"loss": 0.4454, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 0.85875, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 2.421298784519724e-06, |
|
"loss": 0.3844, |
|
"step": 3435 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 2.379323688349516e-06, |
|
"loss": 0.4772, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 0.86125, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 2.3376974309911343e-06, |
|
"loss": 0.4668, |
|
"step": 3445 |
|
}, |
|
{ |
|
"epoch": 0.8625, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 2.296420654372966e-06, |
|
"loss": 0.4191, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.86375, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 2.2554939950339747e-06, |
|
"loss": 0.3971, |
|
"step": 3455 |
|
}, |
|
{ |
|
"epoch": 0.865, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 2.2149180841138676e-06, |
|
"loss": 0.4282, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 0.86625, |
|
"grad_norm": 1.0, |
|
"learning_rate": 2.1746935473433928e-06, |
|
"loss": 0.4406, |
|
"step": 3465 |
|
}, |
|
{ |
|
"epoch": 0.8675, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 2.1348210050346595e-06, |
|
"loss": 0.3914, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 0.86875, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 2.0953010720716037e-06, |
|
"loss": 0.3676, |
|
"step": 3475 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 2.0561343579004715e-06, |
|
"loss": 0.3973, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 0.87125, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 2.0173214665204555e-06, |
|
"loss": 0.4067, |
|
"step": 3485 |
|
}, |
|
{ |
|
"epoch": 0.8725, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 1.9788629964743455e-06, |
|
"loss": 0.4279, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 0.87375, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 1.940759540839329e-06, |
|
"loss": 0.4449, |
|
"step": 3495 |
|
}, |
|
{ |
|
"epoch": 0.875, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 1.9030116872178316e-06, |
|
"loss": 0.4364, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.87625, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 1.8656200177284505e-06, |
|
"loss": 0.3991, |
|
"step": 3505 |
|
}, |
|
{ |
|
"epoch": 0.8775, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 1.8285851089969802e-06, |
|
"loss": 0.3922, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 0.87875, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 1.7919075321475325e-06, |
|
"loss": 0.4559, |
|
"step": 3515 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 1.7555878527937164e-06, |
|
"loss": 0.3947, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 0.88125, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 1.7196266310299108e-06, |
|
"loss": 0.3853, |
|
"step": 3525 |
|
}, |
|
{ |
|
"epoch": 0.8825, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 1.6840244214226502e-06, |
|
"loss": 0.4429, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 0.88375, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 1.6487817730020365e-06, |
|
"loss": 0.4092, |
|
"step": 3535 |
|
}, |
|
{ |
|
"epoch": 0.885, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 1.6138992292533183e-06, |
|
"loss": 0.4348, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 0.88625, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 1.579377328108464e-06, |
|
"loss": 0.4362, |
|
"step": 3545 |
|
}, |
|
{ |
|
"epoch": 0.8875, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 1.5452166019378989e-06, |
|
"loss": 0.431, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 0.88875, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 1.5114175775422762e-06, |
|
"loss": 0.4164, |
|
"step": 3555 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 1.4779807761443636e-06, |
|
"loss": 0.4154, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 0.89125, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 1.4449067133810056e-06, |
|
"loss": 0.4108, |
|
"step": 3565 |
|
}, |
|
{ |
|
"epoch": 0.8925, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 1.4121958992951629e-06, |
|
"loss": 0.4024, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 0.89375, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 1.379848838328049e-06, |
|
"loss": 0.4041, |
|
"step": 3575 |
|
}, |
|
{ |
|
"epoch": 0.895, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 1.3478660293113676e-06, |
|
"loss": 0.4291, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 0.89625, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 1.3162479654595938e-06, |
|
"loss": 0.4561, |
|
"step": 3585 |
|
}, |
|
{ |
|
"epoch": 0.8975, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 1.284995134362385e-06, |
|
"loss": 0.4599, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 0.89875, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 1.2541080179770571e-06, |
|
"loss": 0.371, |
|
"step": 3595 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 1.2235870926211619e-06, |
|
"loss": 0.428, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.90125, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 1.193432828965113e-06, |
|
"loss": 0.4093, |
|
"step": 3605 |
|
}, |
|
{ |
|
"epoch": 0.9025, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 1.16364569202497e-06, |
|
"loss": 0.4123, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 0.90375, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 1.134226141155223e-06, |
|
"loss": 0.4212, |
|
"step": 3615 |
|
}, |
|
{ |
|
"epoch": 0.905, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 1.105174630041747e-06, |
|
"loss": 0.4379, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 0.90625, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 1.0764916066947794e-06, |
|
"loss": 0.4788, |
|
"step": 3625 |
|
}, |
|
{ |
|
"epoch": 0.9075, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 1.0481775134420225e-06, |
|
"loss": 0.4523, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 0.90875, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 1.020232786921821e-06, |
|
"loss": 0.4141, |
|
"step": 3635 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 9.926578580764234e-07, |
|
"loss": 0.3628, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 0.91125, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 9.654531521453513e-07, |
|
"loss": 0.4142, |
|
"step": 3645 |
|
}, |
|
{ |
|
"epoch": 0.9125, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 9.386190886588208e-07, |
|
"loss": 0.4348, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 0.91375, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 9.121560814312813e-07, |
|
"loss": 0.4138, |
|
"step": 3655 |
|
}, |
|
{ |
|
"epoch": 0.915, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 8.860645385550481e-07, |
|
"loss": 0.4332, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 0.91625, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 8.603448623939858e-07, |
|
"loss": 0.4577, |
|
"step": 3665 |
|
}, |
|
{ |
|
"epoch": 0.9175, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 8.349974495773183e-07, |
|
"loss": 0.4456, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 0.91875, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 8.10022690993506e-07, |
|
"loss": 0.3946, |
|
"step": 3675 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 7.854209717842231e-07, |
|
"loss": 0.478, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 0.92125, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 7.611926713384121e-07, |
|
"loss": 0.3592, |
|
"step": 3685 |
|
}, |
|
{ |
|
"epoch": 0.9225, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 7.373381632864384e-07, |
|
"loss": 0.4425, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 0.92375, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 7.138578154943288e-07, |
|
"loss": 0.4219, |
|
"step": 3695 |
|
}, |
|
{ |
|
"epoch": 0.925, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 6.907519900580861e-07, |
|
"loss": 0.4419, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.92625, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 6.680210432981254e-07, |
|
"loss": 0.3983, |
|
"step": 3705 |
|
}, |
|
{ |
|
"epoch": 0.9275, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 6.456653257537665e-07, |
|
"loss": 0.4417, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 0.92875, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 6.2368518217783e-07, |
|
"loss": 0.4469, |
|
"step": 3715 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 6.020809515313142e-07, |
|
"loss": 0.435, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 0.93125, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 5.808529669781904e-07, |
|
"loss": 0.3856, |
|
"step": 3725 |
|
}, |
|
{ |
|
"epoch": 0.9325, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 5.600015558802352e-07, |
|
"loss": 0.4587, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 0.93375, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 5.39527039792001e-07, |
|
"loss": 0.4325, |
|
"step": 3735 |
|
}, |
|
{ |
|
"epoch": 0.935, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 5.194297344558536e-07, |
|
"loss": 0.4166, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 0.93625, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 4.997099497971114e-07, |
|
"loss": 0.4347, |
|
"step": 3745 |
|
}, |
|
{ |
|
"epoch": 0.9375, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 4.803679899192392e-07, |
|
"loss": 0.4252, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.93875, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 4.614041530991903e-07, |
|
"loss": 0.4036, |
|
"step": 3755 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 1.375, |
|
"learning_rate": 4.4281873178278475e-07, |
|
"loss": 0.4359, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 0.94125, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 4.246120125802111e-07, |
|
"loss": 0.4566, |
|
"step": 3765 |
|
}, |
|
{ |
|
"epoch": 0.9425, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 4.067842762616014e-07, |
|
"loss": 0.4226, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 0.94375, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 3.8933579775271013e-07, |
|
"loss": 0.3903, |
|
"step": 3775 |
|
}, |
|
{ |
|
"epoch": 0.945, |
|
"grad_norm": 1.125, |
|
"learning_rate": 3.7226684613065333e-07, |
|
"loss": 0.4119, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 0.94625, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 3.555776846197817e-07, |
|
"loss": 0.4268, |
|
"step": 3785 |
|
}, |
|
{ |
|
"epoch": 0.9475, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 3.3926857058761417e-07, |
|
"loss": 0.405, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 0.94875, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 3.233397555408607e-07, |
|
"loss": 0.4161, |
|
"step": 3795 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 3.077914851215585e-07, |
|
"loss": 0.4293, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.95125, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 2.92623999103267e-07, |
|
"loss": 0.4066, |
|
"step": 3805 |
|
}, |
|
{ |
|
"epoch": 0.9525, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 2.778375313873871e-07, |
|
"loss": 0.4309, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 0.95375, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 2.634323099995395e-07, |
|
"loss": 0.4623, |
|
"step": 3815 |
|
}, |
|
{ |
|
"epoch": 0.955, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 2.494085570860616e-07, |
|
"loss": 0.3977, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 0.95625, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 2.3576648891056875e-07, |
|
"loss": 0.4135, |
|
"step": 3825 |
|
}, |
|
{ |
|
"epoch": 0.9575, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 2.2250631585063186e-07, |
|
"loss": 0.3874, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 0.95875, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 2.0962824239451894e-07, |
|
"loss": 0.4494, |
|
"step": 3835 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 1.9713246713805588e-07, |
|
"loss": 0.3946, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 0.96125, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 1.8501918278155393e-07, |
|
"loss": 0.4312, |
|
"step": 3845 |
|
}, |
|
{ |
|
"epoch": 0.9625, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 1.732885761268427e-07, |
|
"loss": 0.4453, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 0.96375, |
|
"grad_norm": 0.984375, |
|
"learning_rate": 1.619408280743917e-07, |
|
"loss": 0.3943, |
|
"step": 3855 |
|
}, |
|
{ |
|
"epoch": 0.965, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 1.509761136205101e-07, |
|
"loss": 0.4144, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 0.96625, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 1.4039460185465703e-07, |
|
"loss": 0.4026, |
|
"step": 3865 |
|
}, |
|
{ |
|
"epoch": 0.9675, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 1.3019645595683806e-07, |
|
"loss": 0.432, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 0.96875, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 1.2038183319507955e-07, |
|
"loss": 0.4384, |
|
"step": 3875 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 1.109508849230001e-07, |
|
"loss": 0.4622, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 0.97125, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 1.0190375657749274e-07, |
|
"loss": 0.4136, |
|
"step": 3885 |
|
}, |
|
{ |
|
"epoch": 0.9725, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 9.324058767646859e-08, |
|
"loss": 0.4417, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 0.97375, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 8.496151181670852e-08, |
|
"loss": 0.4316, |
|
"step": 3895 |
|
}, |
|
{ |
|
"epoch": 0.975, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 7.706665667180091e-08, |
|
"loss": 0.4246, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.97625, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 6.955614399018206e-08, |
|
"loss": 0.4481, |
|
"step": 3905 |
|
}, |
|
{ |
|
"epoch": 0.9775, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 6.243008959324892e-08, |
|
"loss": 0.4544, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 0.97875, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 5.568860337357151e-08, |
|
"loss": 0.4005, |
|
"step": 3915 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 4.9331789293211026e-08, |
|
"loss": 0.4955, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 0.98125, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 4.335974538210441e-08, |
|
"loss": 0.4102, |
|
"step": 3925 |
|
}, |
|
{ |
|
"epoch": 0.9825, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 3.7772563736551694e-08, |
|
"loss": 0.4542, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 0.98375, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 3.2570330517811555e-08, |
|
"loss": 0.4691, |
|
"step": 3935 |
|
}, |
|
{ |
|
"epoch": 0.985, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 2.7753125950752413e-08, |
|
"loss": 0.4155, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 0.98625, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 2.3321024322625617e-08, |
|
"loss": 0.4268, |
|
"step": 3945 |
|
}, |
|
{ |
|
"epoch": 0.9875, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 1.9274093981927478e-08, |
|
"loss": 0.4098, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 0.98875, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 1.5612397337325113e-08, |
|
"loss": 0.3965, |
|
"step": 3955 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 1.233599085671e-08, |
|
"loss": 0.4047, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 0.99125, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 9.444925066329213e-09, |
|
"loss": 0.4358, |
|
"step": 3965 |
|
}, |
|
{ |
|
"epoch": 0.9925, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 6.939244549986068e-09, |
|
"loss": 0.465, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 0.99375, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 4.818987948379539e-09, |
|
"loss": 0.4483, |
|
"step": 3975 |
|
}, |
|
{ |
|
"epoch": 0.995, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 3.0841879584853073e-09, |
|
"loss": 0.4986, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 0.99625, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 1.7348713330672671e-09, |
|
"loss": 0.4121, |
|
"step": 3985 |
|
}, |
|
{ |
|
"epoch": 0.9975, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 7.710588802584129e-10, |
|
"loss": 0.3764, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 0.99875, |
|
"grad_norm": 1.125, |
|
"learning_rate": 1.9276546323609978e-10, |
|
"loss": 0.4443, |
|
"step": 3995 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 0.0, |
|
"loss": 0.4357, |
|
"step": 4000 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 4000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 400, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.184022656018907e+18, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|