lesso02's picture
Training in progress, step 50, checkpoint
e515cf0 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.02021018593371059,
"eval_steps": 5,
"global_step": 50,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0004042037186742118,
"grad_norm": 1.263440728187561,
"learning_rate": 1e-05,
"loss": 4.0397,
"step": 1
},
{
"epoch": 0.0004042037186742118,
"eval_loss": 1.917946219444275,
"eval_runtime": 452.0131,
"eval_samples_per_second": 4.61,
"eval_steps_per_second": 0.577,
"step": 1
},
{
"epoch": 0.0008084074373484236,
"grad_norm": 1.0014753341674805,
"learning_rate": 2e-05,
"loss": 3.5685,
"step": 2
},
{
"epoch": 0.0012126111560226355,
"grad_norm": 0.9920203685760498,
"learning_rate": 3e-05,
"loss": 3.4815,
"step": 3
},
{
"epoch": 0.0016168148746968471,
"grad_norm": 0.9647438526153564,
"learning_rate": 4e-05,
"loss": 3.5763,
"step": 4
},
{
"epoch": 0.002021018593371059,
"grad_norm": 0.9799220561981201,
"learning_rate": 5e-05,
"loss": 3.5288,
"step": 5
},
{
"epoch": 0.002021018593371059,
"eval_loss": 1.8881479501724243,
"eval_runtime": 452.9863,
"eval_samples_per_second": 4.601,
"eval_steps_per_second": 0.576,
"step": 5
},
{
"epoch": 0.002425222312045271,
"grad_norm": 1.4729738235473633,
"learning_rate": 6e-05,
"loss": 3.8284,
"step": 6
},
{
"epoch": 0.0028294260307194824,
"grad_norm": 1.3433350324630737,
"learning_rate": 7e-05,
"loss": 3.8095,
"step": 7
},
{
"epoch": 0.0032336297493936943,
"grad_norm": 1.1986558437347412,
"learning_rate": 8e-05,
"loss": 3.8635,
"step": 8
},
{
"epoch": 0.003637833468067906,
"grad_norm": 1.181183934211731,
"learning_rate": 9e-05,
"loss": 3.4141,
"step": 9
},
{
"epoch": 0.004042037186742118,
"grad_norm": 1.2625378370285034,
"learning_rate": 0.0001,
"loss": 3.2466,
"step": 10
},
{
"epoch": 0.004042037186742118,
"eval_loss": 1.6723251342773438,
"eval_runtime": 453.0563,
"eval_samples_per_second": 4.6,
"eval_steps_per_second": 0.576,
"step": 10
},
{
"epoch": 0.00444624090541633,
"grad_norm": 1.3650546073913574,
"learning_rate": 9.98458666866564e-05,
"loss": 3.4225,
"step": 11
},
{
"epoch": 0.004850444624090542,
"grad_norm": 1.01077401638031,
"learning_rate": 9.938441702975689e-05,
"loss": 3.3738,
"step": 12
},
{
"epoch": 0.005254648342764754,
"grad_norm": 2.3417210578918457,
"learning_rate": 9.861849601988383e-05,
"loss": 3.4139,
"step": 13
},
{
"epoch": 0.005658852061438965,
"grad_norm": 1.6219801902770996,
"learning_rate": 9.755282581475769e-05,
"loss": 2.7161,
"step": 14
},
{
"epoch": 0.006063055780113177,
"grad_norm": 1.0307327508926392,
"learning_rate": 9.619397662556435e-05,
"loss": 2.8197,
"step": 15
},
{
"epoch": 0.006063055780113177,
"eval_loss": 1.4082988500595093,
"eval_runtime": 453.1209,
"eval_samples_per_second": 4.599,
"eval_steps_per_second": 0.576,
"step": 15
},
{
"epoch": 0.0064672594987873885,
"grad_norm": 1.0622483491897583,
"learning_rate": 9.45503262094184e-05,
"loss": 3.0642,
"step": 16
},
{
"epoch": 0.0068714632174616,
"grad_norm": 0.7077268362045288,
"learning_rate": 9.263200821770461e-05,
"loss": 2.7863,
"step": 17
},
{
"epoch": 0.007275666936135812,
"grad_norm": 0.8749211430549622,
"learning_rate": 9.045084971874738e-05,
"loss": 2.7657,
"step": 18
},
{
"epoch": 0.007679870654810024,
"grad_norm": 0.8615348935127258,
"learning_rate": 8.802029828000156e-05,
"loss": 2.6338,
"step": 19
},
{
"epoch": 0.008084074373484237,
"grad_norm": 0.8538162708282471,
"learning_rate": 8.535533905932738e-05,
"loss": 2.7195,
"step": 20
},
{
"epoch": 0.008084074373484237,
"eval_loss": 1.3105684518814087,
"eval_runtime": 453.3652,
"eval_samples_per_second": 4.597,
"eval_steps_per_second": 0.576,
"step": 20
},
{
"epoch": 0.008488278092158448,
"grad_norm": 0.7278123497962952,
"learning_rate": 8.247240241650918e-05,
"loss": 2.6497,
"step": 21
},
{
"epoch": 0.00889248181083266,
"grad_norm": 0.6712276935577393,
"learning_rate": 7.938926261462366e-05,
"loss": 2.6245,
"step": 22
},
{
"epoch": 0.009296685529506871,
"grad_norm": 0.5658913254737854,
"learning_rate": 7.612492823579745e-05,
"loss": 2.724,
"step": 23
},
{
"epoch": 0.009700889248181084,
"grad_norm": 0.7421402931213379,
"learning_rate": 7.269952498697734e-05,
"loss": 2.2569,
"step": 24
},
{
"epoch": 0.010105092966855295,
"grad_norm": 1.5091322660446167,
"learning_rate": 6.91341716182545e-05,
"loss": 2.7908,
"step": 25
},
{
"epoch": 0.010105092966855295,
"eval_loss": 1.2647299766540527,
"eval_runtime": 453.2429,
"eval_samples_per_second": 4.598,
"eval_steps_per_second": 0.576,
"step": 25
},
{
"epoch": 0.010509296685529508,
"grad_norm": 0.93433678150177,
"learning_rate": 6.545084971874738e-05,
"loss": 2.4645,
"step": 26
},
{
"epoch": 0.010913500404203719,
"grad_norm": 0.8287851810455322,
"learning_rate": 6.167226819279528e-05,
"loss": 2.6179,
"step": 27
},
{
"epoch": 0.01131770412287793,
"grad_norm": 0.8977319598197937,
"learning_rate": 5.782172325201155e-05,
"loss": 2.7171,
"step": 28
},
{
"epoch": 0.011721907841552142,
"grad_norm": 0.7713293433189392,
"learning_rate": 5.392295478639225e-05,
"loss": 2.6567,
"step": 29
},
{
"epoch": 0.012126111560226353,
"grad_norm": 0.7393156886100769,
"learning_rate": 5e-05,
"loss": 2.5136,
"step": 30
},
{
"epoch": 0.012126111560226353,
"eval_loss": 1.2371978759765625,
"eval_runtime": 453.0604,
"eval_samples_per_second": 4.6,
"eval_steps_per_second": 0.576,
"step": 30
},
{
"epoch": 0.012530315278900566,
"grad_norm": 0.6830640435218811,
"learning_rate": 4.607704521360776e-05,
"loss": 2.6676,
"step": 31
},
{
"epoch": 0.012934518997574777,
"grad_norm": 0.7319803237915039,
"learning_rate": 4.2178276747988446e-05,
"loss": 2.3856,
"step": 32
},
{
"epoch": 0.01333872271624899,
"grad_norm": 0.7554514408111572,
"learning_rate": 3.832773180720475e-05,
"loss": 2.5566,
"step": 33
},
{
"epoch": 0.0137429264349232,
"grad_norm": 0.6606221795082092,
"learning_rate": 3.4549150281252636e-05,
"loss": 2.3467,
"step": 34
},
{
"epoch": 0.014147130153597413,
"grad_norm": 0.6036873459815979,
"learning_rate": 3.086582838174551e-05,
"loss": 2.0719,
"step": 35
},
{
"epoch": 0.014147130153597413,
"eval_loss": 1.224223017692566,
"eval_runtime": 453.2044,
"eval_samples_per_second": 4.598,
"eval_steps_per_second": 0.576,
"step": 35
},
{
"epoch": 0.014551333872271624,
"grad_norm": 0.9149614572525024,
"learning_rate": 2.7300475013022663e-05,
"loss": 2.2007,
"step": 36
},
{
"epoch": 0.014955537590945837,
"grad_norm": 0.47336137294769287,
"learning_rate": 2.3875071764202563e-05,
"loss": 2.4277,
"step": 37
},
{
"epoch": 0.015359741309620048,
"grad_norm": 0.6594354510307312,
"learning_rate": 2.061073738537635e-05,
"loss": 2.8188,
"step": 38
},
{
"epoch": 0.01576394502829426,
"grad_norm": 0.6771719455718994,
"learning_rate": 1.7527597583490822e-05,
"loss": 2.3827,
"step": 39
},
{
"epoch": 0.016168148746968473,
"grad_norm": 0.7665150761604309,
"learning_rate": 1.4644660940672627e-05,
"loss": 2.6651,
"step": 40
},
{
"epoch": 0.016168148746968473,
"eval_loss": 1.2175811529159546,
"eval_runtime": 453.0444,
"eval_samples_per_second": 4.6,
"eval_steps_per_second": 0.576,
"step": 40
},
{
"epoch": 0.016572352465642683,
"grad_norm": 1.0414700508117676,
"learning_rate": 1.1979701719998453e-05,
"loss": 2.9944,
"step": 41
},
{
"epoch": 0.016976556184316895,
"grad_norm": 0.7823891043663025,
"learning_rate": 9.549150281252633e-06,
"loss": 2.3473,
"step": 42
},
{
"epoch": 0.017380759902991108,
"grad_norm": 0.49085983633995056,
"learning_rate": 7.367991782295391e-06,
"loss": 2.2603,
"step": 43
},
{
"epoch": 0.01778496362166532,
"grad_norm": 0.7287192344665527,
"learning_rate": 5.449673790581611e-06,
"loss": 2.5286,
"step": 44
},
{
"epoch": 0.01818916734033953,
"grad_norm": 0.5006673336029053,
"learning_rate": 3.8060233744356633e-06,
"loss": 2.0819,
"step": 45
},
{
"epoch": 0.01818916734033953,
"eval_loss": 1.215002417564392,
"eval_runtime": 453.0566,
"eval_samples_per_second": 4.6,
"eval_steps_per_second": 0.576,
"step": 45
},
{
"epoch": 0.018593371059013743,
"grad_norm": 0.6527900099754333,
"learning_rate": 2.4471741852423237e-06,
"loss": 2.2517,
"step": 46
},
{
"epoch": 0.018997574777687955,
"grad_norm": 1.0630096197128296,
"learning_rate": 1.3815039801161721e-06,
"loss": 2.9461,
"step": 47
},
{
"epoch": 0.019401778496362168,
"grad_norm": 0.761471152305603,
"learning_rate": 6.15582970243117e-07,
"loss": 2.2641,
"step": 48
},
{
"epoch": 0.019805982215036377,
"grad_norm": 0.7035945653915405,
"learning_rate": 1.5413331334360182e-07,
"loss": 2.2215,
"step": 49
},
{
"epoch": 0.02021018593371059,
"grad_norm": 0.7908831238746643,
"learning_rate": 0.0,
"loss": 2.6591,
"step": 50
},
{
"epoch": 0.02021018593371059,
"eval_loss": 1.2144694328308105,
"eval_runtime": 453.5575,
"eval_samples_per_second": 4.595,
"eval_steps_per_second": 0.575,
"step": 50
}
],
"logging_steps": 1,
"max_steps": 50,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 25,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 6.538264861212672e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}