{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.02021018593371059,
  "eval_steps": 5,
  "global_step": 50,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0004042037186742118,
      "grad_norm": 1.263440728187561,
      "learning_rate": 1e-05,
      "loss": 4.0397,
      "step": 1
    },
    {
      "epoch": 0.0004042037186742118,
      "eval_loss": 1.917946219444275,
      "eval_runtime": 452.0131,
      "eval_samples_per_second": 4.61,
      "eval_steps_per_second": 0.577,
      "step": 1
    },
    {
      "epoch": 0.0008084074373484236,
      "grad_norm": 1.0014753341674805,
      "learning_rate": 2e-05,
      "loss": 3.5685,
      "step": 2
    },
    {
      "epoch": 0.0012126111560226355,
      "grad_norm": 0.9920203685760498,
      "learning_rate": 3e-05,
      "loss": 3.4815,
      "step": 3
    },
    {
      "epoch": 0.0016168148746968471,
      "grad_norm": 0.9647438526153564,
      "learning_rate": 4e-05,
      "loss": 3.5763,
      "step": 4
    },
    {
      "epoch": 0.002021018593371059,
      "grad_norm": 0.9799220561981201,
      "learning_rate": 5e-05,
      "loss": 3.5288,
      "step": 5
    },
    {
      "epoch": 0.002021018593371059,
      "eval_loss": 1.8881479501724243,
      "eval_runtime": 452.9863,
      "eval_samples_per_second": 4.601,
      "eval_steps_per_second": 0.576,
      "step": 5
    },
    {
      "epoch": 0.002425222312045271,
      "grad_norm": 1.4729738235473633,
      "learning_rate": 6e-05,
      "loss": 3.8284,
      "step": 6
    },
    {
      "epoch": 0.0028294260307194824,
      "grad_norm": 1.3433350324630737,
      "learning_rate": 7e-05,
      "loss": 3.8095,
      "step": 7
    },
    {
      "epoch": 0.0032336297493936943,
      "grad_norm": 1.1986558437347412,
      "learning_rate": 8e-05,
      "loss": 3.8635,
      "step": 8
    },
    {
      "epoch": 0.003637833468067906,
      "grad_norm": 1.181183934211731,
      "learning_rate": 9e-05,
      "loss": 3.4141,
      "step": 9
    },
    {
      "epoch": 0.004042037186742118,
      "grad_norm": 1.2625378370285034,
      "learning_rate": 0.0001,
      "loss": 3.2466,
      "step": 10
    },
    {
      "epoch": 0.004042037186742118,
      "eval_loss": 1.6723251342773438,
      "eval_runtime": 453.0563,
      "eval_samples_per_second": 4.6,
      "eval_steps_per_second": 0.576,
      "step": 10
    },
    {
      "epoch": 0.00444624090541633,
      "grad_norm": 1.3650546073913574,
      "learning_rate": 9.98458666866564e-05,
      "loss": 3.4225,
      "step": 11
    },
    {
      "epoch": 0.004850444624090542,
      "grad_norm": 1.01077401638031,
      "learning_rate": 9.938441702975689e-05,
      "loss": 3.3738,
      "step": 12
    },
    {
      "epoch": 0.005254648342764754,
      "grad_norm": 2.3417210578918457,
      "learning_rate": 9.861849601988383e-05,
      "loss": 3.4139,
      "step": 13
    },
    {
      "epoch": 0.005658852061438965,
      "grad_norm": 1.6219801902770996,
      "learning_rate": 9.755282581475769e-05,
      "loss": 2.7161,
      "step": 14
    },
    {
      "epoch": 0.006063055780113177,
      "grad_norm": 1.0307327508926392,
      "learning_rate": 9.619397662556435e-05,
      "loss": 2.8197,
      "step": 15
    },
    {
      "epoch": 0.006063055780113177,
      "eval_loss": 1.4082988500595093,
      "eval_runtime": 453.1209,
      "eval_samples_per_second": 4.599,
      "eval_steps_per_second": 0.576,
      "step": 15
    },
    {
      "epoch": 0.0064672594987873885,
      "grad_norm": 1.0622483491897583,
      "learning_rate": 9.45503262094184e-05,
      "loss": 3.0642,
      "step": 16
    },
    {
      "epoch": 0.0068714632174616,
      "grad_norm": 0.7077268362045288,
      "learning_rate": 9.263200821770461e-05,
      "loss": 2.7863,
      "step": 17
    },
    {
      "epoch": 0.007275666936135812,
      "grad_norm": 0.8749211430549622,
      "learning_rate": 9.045084971874738e-05,
      "loss": 2.7657,
      "step": 18
    },
    {
      "epoch": 0.007679870654810024,
      "grad_norm": 0.8615348935127258,
      "learning_rate": 8.802029828000156e-05,
      "loss": 2.6338,
      "step": 19
    },
    {
      "epoch": 0.008084074373484237,
      "grad_norm": 0.8538162708282471,
      "learning_rate": 8.535533905932738e-05,
      "loss": 2.7195,
      "step": 20
    },
    {
      "epoch": 0.008084074373484237,
      "eval_loss": 1.3105684518814087,
      "eval_runtime": 453.3652,
      "eval_samples_per_second": 4.597,
      "eval_steps_per_second": 0.576,
      "step": 20
    },
    {
      "epoch": 0.008488278092158448,
      "grad_norm": 0.7278123497962952,
      "learning_rate": 8.247240241650918e-05,
      "loss": 2.6497,
      "step": 21
    },
    {
      "epoch": 0.00889248181083266,
      "grad_norm": 0.6712276935577393,
      "learning_rate": 7.938926261462366e-05,
      "loss": 2.6245,
      "step": 22
    },
    {
      "epoch": 0.009296685529506871,
      "grad_norm": 0.5658913254737854,
      "learning_rate": 7.612492823579745e-05,
      "loss": 2.724,
      "step": 23
    },
    {
      "epoch": 0.009700889248181084,
      "grad_norm": 0.7421402931213379,
      "learning_rate": 7.269952498697734e-05,
      "loss": 2.2569,
      "step": 24
    },
    {
      "epoch": 0.010105092966855295,
      "grad_norm": 1.5091322660446167,
      "learning_rate": 6.91341716182545e-05,
      "loss": 2.7908,
      "step": 25
    },
    {
      "epoch": 0.010105092966855295,
      "eval_loss": 1.2647299766540527,
      "eval_runtime": 453.2429,
      "eval_samples_per_second": 4.598,
      "eval_steps_per_second": 0.576,
      "step": 25
    },
    {
      "epoch": 0.010509296685529508,
      "grad_norm": 0.93433678150177,
      "learning_rate": 6.545084971874738e-05,
      "loss": 2.4645,
      "step": 26
    },
    {
      "epoch": 0.010913500404203719,
      "grad_norm": 0.8287851810455322,
      "learning_rate": 6.167226819279528e-05,
      "loss": 2.6179,
      "step": 27
    },
    {
      "epoch": 0.01131770412287793,
      "grad_norm": 0.8977319598197937,
      "learning_rate": 5.782172325201155e-05,
      "loss": 2.7171,
      "step": 28
    },
    {
      "epoch": 0.011721907841552142,
      "grad_norm": 0.7713293433189392,
      "learning_rate": 5.392295478639225e-05,
      "loss": 2.6567,
      "step": 29
    },
    {
      "epoch": 0.012126111560226353,
      "grad_norm": 0.7393156886100769,
      "learning_rate": 5e-05,
      "loss": 2.5136,
      "step": 30
    },
    {
      "epoch": 0.012126111560226353,
      "eval_loss": 1.2371978759765625,
      "eval_runtime": 453.0604,
      "eval_samples_per_second": 4.6,
      "eval_steps_per_second": 0.576,
      "step": 30
    },
    {
      "epoch": 0.012530315278900566,
      "grad_norm": 0.6830640435218811,
      "learning_rate": 4.607704521360776e-05,
      "loss": 2.6676,
      "step": 31
    },
    {
      "epoch": 0.012934518997574777,
      "grad_norm": 0.7319803237915039,
      "learning_rate": 4.2178276747988446e-05,
      "loss": 2.3856,
      "step": 32
    },
    {
      "epoch": 0.01333872271624899,
      "grad_norm": 0.7554514408111572,
      "learning_rate": 3.832773180720475e-05,
      "loss": 2.5566,
      "step": 33
    },
    {
      "epoch": 0.0137429264349232,
      "grad_norm": 0.6606221795082092,
      "learning_rate": 3.4549150281252636e-05,
      "loss": 2.3467,
      "step": 34
    },
    {
      "epoch": 0.014147130153597413,
      "grad_norm": 0.6036873459815979,
      "learning_rate": 3.086582838174551e-05,
      "loss": 2.0719,
      "step": 35
    },
    {
      "epoch": 0.014147130153597413,
      "eval_loss": 1.224223017692566,
      "eval_runtime": 453.2044,
      "eval_samples_per_second": 4.598,
      "eval_steps_per_second": 0.576,
      "step": 35
    },
    {
      "epoch": 0.014551333872271624,
      "grad_norm": 0.9149614572525024,
      "learning_rate": 2.7300475013022663e-05,
      "loss": 2.2007,
      "step": 36
    },
    {
      "epoch": 0.014955537590945837,
      "grad_norm": 0.47336137294769287,
      "learning_rate": 2.3875071764202563e-05,
      "loss": 2.4277,
      "step": 37
    },
    {
      "epoch": 0.015359741309620048,
      "grad_norm": 0.6594354510307312,
      "learning_rate": 2.061073738537635e-05,
      "loss": 2.8188,
      "step": 38
    },
    {
      "epoch": 0.01576394502829426,
      "grad_norm": 0.6771719455718994,
      "learning_rate": 1.7527597583490822e-05,
      "loss": 2.3827,
      "step": 39
    },
    {
      "epoch": 0.016168148746968473,
      "grad_norm": 0.7665150761604309,
      "learning_rate": 1.4644660940672627e-05,
      "loss": 2.6651,
      "step": 40
    },
    {
      "epoch": 0.016168148746968473,
      "eval_loss": 1.2175811529159546,
      "eval_runtime": 453.0444,
      "eval_samples_per_second": 4.6,
      "eval_steps_per_second": 0.576,
      "step": 40
    },
    {
      "epoch": 0.016572352465642683,
      "grad_norm": 1.0414700508117676,
      "learning_rate": 1.1979701719998453e-05,
      "loss": 2.9944,
      "step": 41
    },
    {
      "epoch": 0.016976556184316895,
      "grad_norm": 0.7823891043663025,
      "learning_rate": 9.549150281252633e-06,
      "loss": 2.3473,
      "step": 42
    },
    {
      "epoch": 0.017380759902991108,
      "grad_norm": 0.49085983633995056,
      "learning_rate": 7.367991782295391e-06,
      "loss": 2.2603,
      "step": 43
    },
    {
      "epoch": 0.01778496362166532,
      "grad_norm": 0.7287192344665527,
      "learning_rate": 5.449673790581611e-06,
      "loss": 2.5286,
      "step": 44
    },
    {
      "epoch": 0.01818916734033953,
      "grad_norm": 0.5006673336029053,
      "learning_rate": 3.8060233744356633e-06,
      "loss": 2.0819,
      "step": 45
    },
    {
      "epoch": 0.01818916734033953,
      "eval_loss": 1.215002417564392,
      "eval_runtime": 453.0566,
      "eval_samples_per_second": 4.6,
      "eval_steps_per_second": 0.576,
      "step": 45
    },
    {
      "epoch": 0.018593371059013743,
      "grad_norm": 0.6527900099754333,
      "learning_rate": 2.4471741852423237e-06,
      "loss": 2.2517,
      "step": 46
    },
    {
      "epoch": 0.018997574777687955,
      "grad_norm": 1.0630096197128296,
      "learning_rate": 1.3815039801161721e-06,
      "loss": 2.9461,
      "step": 47
    },
    {
      "epoch": 0.019401778496362168,
      "grad_norm": 0.761471152305603,
      "learning_rate": 6.15582970243117e-07,
      "loss": 2.2641,
      "step": 48
    },
    {
      "epoch": 0.019805982215036377,
      "grad_norm": 0.7035945653915405,
      "learning_rate": 1.5413331334360182e-07,
      "loss": 2.2215,
      "step": 49
    },
    {
      "epoch": 0.02021018593371059,
      "grad_norm": 0.7908831238746643,
      "learning_rate": 0.0,
      "loss": 2.6591,
      "step": 50
    },
    {
      "epoch": 0.02021018593371059,
      "eval_loss": 1.2144694328308105,
      "eval_runtime": 453.5575,
      "eval_samples_per_second": 4.595,
      "eval_steps_per_second": 0.575,
      "step": 50
    }
  ],
  "logging_steps": 1,
  "max_steps": 50,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 25,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 6.538264861212672e+16,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}