{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 530, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0037735849056603774, "grad_norm": 15.33187898922165, "learning_rate": 3.773584905660378e-07, "loss": 1.3947, "step": 1 }, { "epoch": 0.018867924528301886, "grad_norm": 14.587979298665214, "learning_rate": 1.8867924528301889e-06, "loss": 1.2879, "step": 5 }, { "epoch": 0.03773584905660377, "grad_norm": 5.156574643577073, "learning_rate": 3.7735849056603777e-06, "loss": 1.1356, "step": 10 }, { "epoch": 0.05660377358490566, "grad_norm": 2.4413584926483494, "learning_rate": 5.660377358490566e-06, "loss": 1.056, "step": 15 }, { "epoch": 0.07547169811320754, "grad_norm": 1.939557281184319, "learning_rate": 7.5471698113207555e-06, "loss": 1.0035, "step": 20 }, { "epoch": 0.09433962264150944, "grad_norm": 1.445408046384172, "learning_rate": 9.433962264150944e-06, "loss": 0.9666, "step": 25 }, { "epoch": 0.11320754716981132, "grad_norm": 1.5429900501633669, "learning_rate": 1.1320754716981132e-05, "loss": 0.9673, "step": 30 }, { "epoch": 0.1320754716981132, "grad_norm": 1.2999242817660164, "learning_rate": 1.320754716981132e-05, "loss": 0.9161, "step": 35 }, { "epoch": 0.1509433962264151, "grad_norm": 1.1690976259696864, "learning_rate": 1.5094339622641511e-05, "loss": 0.9029, "step": 40 }, { "epoch": 0.16981132075471697, "grad_norm": 1.2404008101715356, "learning_rate": 1.69811320754717e-05, "loss": 0.8976, "step": 45 }, { "epoch": 0.18867924528301888, "grad_norm": 1.2490209366453322, "learning_rate": 1.8867924528301888e-05, "loss": 0.9202, "step": 50 }, { "epoch": 0.20754716981132076, "grad_norm": 1.2687062081137546, "learning_rate": 1.9999132465602526e-05, "loss": 0.9125, "step": 55 }, { "epoch": 0.22641509433962265, "grad_norm": 1.2693770693981177, "learning_rate": 1.998937443221316e-05, "loss": 0.918, "step": 60 }, { "epoch": 0.24528301886792453, "grad_norm": 1.17511999372123, "learning_rate": 1.9968784563700586e-05, "loss": 0.8731, "step": 65 }, { "epoch": 0.2641509433962264, "grad_norm": 1.1148641046803303, "learning_rate": 1.9937385186393888e-05, "loss": 0.9063, "step": 70 }, { "epoch": 0.2830188679245283, "grad_norm": 1.2209470360517312, "learning_rate": 1.9895210347758233e-05, "loss": 0.8973, "step": 75 }, { "epoch": 0.3018867924528302, "grad_norm": 1.2489264571555163, "learning_rate": 1.984230577947597e-05, "loss": 0.9086, "step": 80 }, { "epoch": 0.32075471698113206, "grad_norm": 1.7088824250739199, "learning_rate": 1.977872884785815e-05, "loss": 0.891, "step": 85 }, { "epoch": 0.33962264150943394, "grad_norm": 1.1609543047434623, "learning_rate": 1.9704548491640195e-05, "loss": 0.9064, "step": 90 }, { "epoch": 0.3584905660377358, "grad_norm": 1.1548579761616171, "learning_rate": 1.961984514722914e-05, "loss": 0.886, "step": 95 }, { "epoch": 0.37735849056603776, "grad_norm": 1.2072363172467877, "learning_rate": 1.9524710661483594e-05, "loss": 0.914, "step": 100 }, { "epoch": 0.39622641509433965, "grad_norm": 1.2486639912473576, "learning_rate": 1.94192481921209e-05, "loss": 0.8858, "step": 105 }, { "epoch": 0.41509433962264153, "grad_norm": 1.1907067530146591, "learning_rate": 1.9303572095859545e-05, "loss": 0.883, "step": 110 }, { "epoch": 0.4339622641509434, "grad_norm": 1.205701711118422, "learning_rate": 1.91778078044181e-05, "loss": 0.903, "step": 115 }, { "epoch": 0.4528301886792453, "grad_norm": 1.4297409544875106, "learning_rate": 1.9042091688505104e-05, "loss": 0.8903, "step": 120 }, { "epoch": 0.4716981132075472, "grad_norm": 1.3015347726404307, "learning_rate": 1.8896570909947477e-05, "loss": 0.8975, "step": 125 }, { "epoch": 0.49056603773584906, "grad_norm": 1.1056494521961182, "learning_rate": 1.874140326211766e-05, "loss": 0.8973, "step": 130 }, { "epoch": 0.5094339622641509, "grad_norm": 1.3117079939596017, "learning_rate": 1.8576756998832667e-05, "loss": 0.8665, "step": 135 }, { "epoch": 0.5283018867924528, "grad_norm": 1.0536496719085662, "learning_rate": 1.8402810651910444e-05, "loss": 0.8638, "step": 140 }, { "epoch": 0.5471698113207547, "grad_norm": 2.702397102319974, "learning_rate": 1.8219752837581466e-05, "loss": 0.882, "step": 145 }, { "epoch": 0.5660377358490566, "grad_norm": 1.0422440813400673, "learning_rate": 1.8027782051965408e-05, "loss": 0.8657, "step": 150 }, { "epoch": 0.5849056603773585, "grad_norm": 1.3024995956700887, "learning_rate": 1.782710645583473e-05, "loss": 0.8599, "step": 155 }, { "epoch": 0.6037735849056604, "grad_norm": 1.1482394390531847, "learning_rate": 1.761794364889855e-05, "loss": 0.8926, "step": 160 }, { "epoch": 0.6226415094339622, "grad_norm": 1.079599574770899, "learning_rate": 1.7400520433851457e-05, "loss": 0.8428, "step": 165 }, { "epoch": 0.6415094339622641, "grad_norm": 1.0901592446951243, "learning_rate": 1.717507257044331e-05, "loss": 0.8496, "step": 170 }, { "epoch": 0.660377358490566, "grad_norm": 1.1077239121822327, "learning_rate": 1.694184451983651e-05, "loss": 0.9012, "step": 175 }, { "epoch": 0.6792452830188679, "grad_norm": 1.0853595621923493, "learning_rate": 1.6701089179528032e-05, "loss": 0.8681, "step": 180 }, { "epoch": 0.6981132075471698, "grad_norm": 1.0734841287772214, "learning_rate": 1.6453067609123656e-05, "loss": 0.856, "step": 185 }, { "epoch": 0.7169811320754716, "grad_norm": 1.088487035960174, "learning_rate": 1.619804874726171e-05, "loss": 0.8652, "step": 190 }, { "epoch": 0.7358490566037735, "grad_norm": 1.0825029747031027, "learning_rate": 1.5936309119993333e-05, "loss": 0.8565, "step": 195 }, { "epoch": 0.7547169811320755, "grad_norm": 1.200978191088004, "learning_rate": 1.566813254093538e-05, "loss": 0.8438, "step": 200 }, { "epoch": 0.7735849056603774, "grad_norm": 1.0577639466302755, "learning_rate": 1.5393809803521213e-05, "loss": 0.8681, "step": 205 }, { "epoch": 0.7924528301886793, "grad_norm": 1.1116902648533118, "learning_rate": 1.5113638365682996e-05, "loss": 0.8534, "step": 210 }, { "epoch": 0.8113207547169812, "grad_norm": 1.0957222267960907, "learning_rate": 1.482792202730745e-05, "loss": 0.8598, "step": 215 }, { "epoch": 0.8301886792452831, "grad_norm": 1.0870428672337984, "learning_rate": 1.4536970600814789e-05, "loss": 0.8854, "step": 220 }, { "epoch": 0.8490566037735849, "grad_norm": 1.0340161162672956, "learning_rate": 1.424109957521806e-05, "loss": 0.865, "step": 225 }, { "epoch": 0.8679245283018868, "grad_norm": 1.062102724438973, "learning_rate": 1.394062977402717e-05, "loss": 0.8664, "step": 230 }, { "epoch": 0.8867924528301887, "grad_norm": 1.0590868294575073, "learning_rate": 1.3635887007368467e-05, "loss": 0.8419, "step": 235 }, { "epoch": 0.9056603773584906, "grad_norm": 1.0864493134204916, "learning_rate": 1.3327201718697232e-05, "loss": 0.8519, "step": 240 }, { "epoch": 0.9245283018867925, "grad_norm": 1.0168196512793692, "learning_rate": 1.3014908626486032e-05, "loss": 0.8629, "step": 245 }, { "epoch": 0.9433962264150944, "grad_norm": 1.0404726364027557, "learning_rate": 1.2699346361277538e-05, "loss": 0.8337, "step": 250 }, { "epoch": 0.9622641509433962, "grad_norm": 1.018131650974526, "learning_rate": 1.2380857098495355e-05, "loss": 0.9745, "step": 255 }, { "epoch": 0.9811320754716981, "grad_norm": 1.0037862694896684, "learning_rate": 1.2059786187410984e-05, "loss": 0.8712, "step": 260 }, { "epoch": 1.0, "grad_norm": 0.989425132088932, "learning_rate": 1.1736481776669307e-05, "loss": 0.8495, "step": 265 }, { "epoch": 1.0, "eval_loss": 0.8464146852493286, "eval_runtime": 4.2653, "eval_samples_per_second": 40.091, "eval_steps_per_second": 0.703, "step": 265 }, { "epoch": 1.0188679245283019, "grad_norm": 3.0712777016007875, "learning_rate": 1.1411294436778562e-05, "loss": 0.5752, "step": 270 }, { "epoch": 1.0377358490566038, "grad_norm": 1.367908639519453, "learning_rate": 1.1084576779974257e-05, "loss": 0.5572, "step": 275 }, { "epoch": 1.0566037735849056, "grad_norm": 1.1027538267727532, "learning_rate": 1.0756683077869133e-05, "loss": 0.5489, "step": 280 }, { "epoch": 1.0754716981132075, "grad_norm": 1.1284473887078348, "learning_rate": 1.0427968877303809e-05, "loss": 0.549, "step": 285 }, { "epoch": 1.0943396226415094, "grad_norm": 1.0579765238733445, "learning_rate": 1.0098790614814658e-05, "loss": 0.5584, "step": 290 }, { "epoch": 1.1132075471698113, "grad_norm": 1.0510922170571144, "learning_rate": 9.769505230136962e-06, "loss": 0.5326, "step": 295 }, { "epoch": 1.1320754716981132, "grad_norm": 1.0257584558372113, "learning_rate": 9.440469779162407e-06, "loss": 0.5457, "step": 300 }, { "epoch": 1.150943396226415, "grad_norm": 1.0531732710979618, "learning_rate": 9.112041046770653e-06, "loss": 0.5378, "step": 305 }, { "epoch": 1.169811320754717, "grad_norm": 1.0708673795759969, "learning_rate": 8.784575159954748e-06, "loss": 0.5425, "step": 310 }, { "epoch": 1.1886792452830188, "grad_norm": 1.0132141363080573, "learning_rate": 8.458427201659926e-06, "loss": 0.5391, "step": 315 }, { "epoch": 1.2075471698113207, "grad_norm": 1.0680470294860878, "learning_rate": 8.133950825754511e-06, "loss": 0.5447, "step": 320 }, { "epoch": 1.2264150943396226, "grad_norm": 0.9666850547234715, "learning_rate": 7.81149787355039e-06, "loss": 0.5379, "step": 325 }, { "epoch": 1.2452830188679245, "grad_norm": 0.9710527007406873, "learning_rate": 7.491417992288927e-06, "loss": 0.5326, "step": 330 }, { "epoch": 1.2641509433962264, "grad_norm": 1.1620230853698044, "learning_rate": 7.174058256006012e-06, "loss": 0.5458, "step": 335 }, { "epoch": 1.2830188679245282, "grad_norm": 1.0264570630272518, "learning_rate": 6.859762789187259e-06, "loss": 0.5521, "step": 340 }, { "epoch": 1.3018867924528301, "grad_norm": 1.0303843487410083, "learning_rate": 6.548872393621578e-06, "loss": 0.5465, "step": 345 }, { "epoch": 1.320754716981132, "grad_norm": 0.9494639965897911, "learning_rate": 6.241724178857621e-06, "loss": 0.5531, "step": 350 }, { "epoch": 1.3396226415094339, "grad_norm": 0.998464274247554, "learning_rate": 5.938651196663865e-06, "loss": 0.5528, "step": 355 }, { "epoch": 1.3584905660377358, "grad_norm": 0.9719381109648194, "learning_rate": 5.6399820798887266e-06, "loss": 0.5351, "step": 360 }, { "epoch": 1.3773584905660377, "grad_norm": 0.9816121465159701, "learning_rate": 5.346040686112189e-06, "loss": 0.551, "step": 365 }, { "epoch": 1.3962264150943398, "grad_norm": 0.9743057773598173, "learning_rate": 5.0571457464755226e-06, "loss": 0.5345, "step": 370 }, { "epoch": 1.4150943396226414, "grad_norm": 0.9293281745891194, "learning_rate": 4.773610520069706e-06, "loss": 0.5455, "step": 375 }, { "epoch": 1.4339622641509435, "grad_norm": 0.9573637805384148, "learning_rate": 4.495742454257418e-06, "loss": 0.546, "step": 380 }, { "epoch": 1.4528301886792452, "grad_norm": 0.9286559221860932, "learning_rate": 4.223842851296907e-06, "loss": 0.5179, "step": 385 }, { "epoch": 1.4716981132075473, "grad_norm": 0.9387936040725331, "learning_rate": 3.9582065416291926e-06, "loss": 0.5561, "step": 390 }, { "epoch": 1.490566037735849, "grad_norm": 0.9706784805714281, "learning_rate": 3.6991215641828903e-06, "loss": 0.5471, "step": 395 }, { "epoch": 1.509433962264151, "grad_norm": 0.998331330776895, "learning_rate": 3.4468688540433425e-06, "loss": 0.5462, "step": 400 }, { "epoch": 1.5283018867924527, "grad_norm": 0.9387337728890874, "learning_rate": 3.2017219378246734e-06, "loss": 0.5658, "step": 405 }, { "epoch": 1.5471698113207548, "grad_norm": 0.9419101032950977, "learning_rate": 2.963946637075107e-06, "loss": 0.5416, "step": 410 }, { "epoch": 1.5660377358490565, "grad_norm": 0.9575101628921765, "learning_rate": 2.7338007800372024e-06, "loss": 0.5323, "step": 415 }, { "epoch": 1.5849056603773586, "grad_norm": 1.0034727804533015, "learning_rate": 2.5115339220754796e-06, "loss": 0.5402, "step": 420 }, { "epoch": 1.6037735849056602, "grad_norm": 7.5029005576343195, "learning_rate": 2.2973870750746253e-06, "loss": 0.5655, "step": 425 }, { "epoch": 1.6226415094339623, "grad_norm": 1.0027923672255161, "learning_rate": 2.09159244610172e-06, "loss": 0.5542, "step": 430 }, { "epoch": 1.641509433962264, "grad_norm": 0.9357231473507377, "learning_rate": 1.8943731856158299e-06, "loss": 0.6383, "step": 435 }, { "epoch": 1.6603773584905661, "grad_norm": 0.9686189698665412, "learning_rate": 1.7059431454979825e-06, "loss": 0.5562, "step": 440 }, { "epoch": 1.6792452830188678, "grad_norm": 0.9045883081564006, "learning_rate": 1.5265066471639701e-06, "loss": 0.5348, "step": 445 }, { "epoch": 1.6981132075471699, "grad_norm": 1.0056737797057134, "learning_rate": 1.3562582600113295e-06, "loss": 0.5352, "step": 450 }, { "epoch": 1.7169811320754715, "grad_norm": 0.9512535781198771, "learning_rate": 1.1953825904408033e-06, "loss": 0.5464, "step": 455 }, { "epoch": 1.7358490566037736, "grad_norm": 0.9930921183835165, "learning_rate": 1.0440540816810395e-06, "loss": 0.561, "step": 460 }, { "epoch": 1.7547169811320755, "grad_norm": 0.9410700940787784, "learning_rate": 9.024368246335735e-07, "loss": 0.5456, "step": 465 }, { "epoch": 1.7735849056603774, "grad_norm": 0.9323535925403078, "learning_rate": 7.706843799431985e-07, "loss": 0.5243, "step": 470 }, { "epoch": 1.7924528301886793, "grad_norm": 0.9689906420605915, "learning_rate": 6.489396114866942e-07, "loss": 0.5296, "step": 475 }, { "epoch": 1.8113207547169812, "grad_norm": 1.2229314873705728, "learning_rate": 5.373345314604206e-07, "loss": 0.53, "step": 480 }, { "epoch": 1.830188679245283, "grad_norm": 0.9545280104598107, "learning_rate": 4.359901572347758e-07, "loss": 0.5031, "step": 485 }, { "epoch": 1.849056603773585, "grad_norm": 0.9636644642636012, "learning_rate": 3.450163801307582e-07, "loss": 0.5159, "step": 490 }, { "epoch": 1.8679245283018868, "grad_norm": 0.9314616621215778, "learning_rate": 2.6451184626087646e-07, "loss": 0.5212, "step": 495 }, { "epoch": 1.8867924528301887, "grad_norm": 0.9019228681615893, "learning_rate": 1.9456384956365149e-07, "loss": 0.539, "step": 500 }, { "epoch": 1.9056603773584906, "grad_norm": 0.8782721096424342, "learning_rate": 1.3524823714768375e-07, "loss": 0.5462, "step": 505 }, { "epoch": 1.9245283018867925, "grad_norm": 0.9377029041620999, "learning_rate": 8.662932704792793e-08, "loss": 0.5352, "step": 510 }, { "epoch": 1.9433962264150944, "grad_norm": 0.9313929690259365, "learning_rate": 4.8759838483358745e-08, "loss": 0.5129, "step": 515 }, { "epoch": 1.9622641509433962, "grad_norm": 0.9729472297721098, "learning_rate": 2.1680834691628627e-08, "loss": 0.5358, "step": 520 }, { "epoch": 1.9811320754716981, "grad_norm": 0.9341412228490686, "learning_rate": 5.421678402741659e-09, "loss": 0.5717, "step": 525 }, { "epoch": 2.0, "grad_norm": 0.945793151587699, "learning_rate": 0.0, "loss": 0.5006, "step": 530 }, { "epoch": 2.0, "eval_loss": 0.852841854095459, "eval_runtime": 6.8692, "eval_samples_per_second": 24.894, "eval_steps_per_second": 0.437, "step": 530 }, { "epoch": 2.0, "step": 530, "total_flos": 110971217510400.0, "train_loss": 0.7231714307137256, "train_runtime": 3756.5168, "train_samples_per_second": 9.007, "train_steps_per_second": 0.141 } ], "logging_steps": 5, "max_steps": 530, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 110971217510400.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }