{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.13199142055766375, "eval_steps": 13, "global_step": 50, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002639828411153275, "grad_norm": 22.346176147460938, "learning_rate": 0.00015, "loss": 10.0087, "step": 1 }, { "epoch": 0.002639828411153275, "eval_loss": 10.013830184936523, "eval_runtime": 229.1847, "eval_samples_per_second": 5.572, "eval_steps_per_second": 2.788, "step": 1 }, { "epoch": 0.00527965682230655, "grad_norm": 22.456666946411133, "learning_rate": 0.0003, "loss": 10.0246, "step": 2 }, { "epoch": 0.007919485233459825, "grad_norm": 19.28063201904297, "learning_rate": 0.0002996788384857905, "loss": 7.27, "step": 3 }, { "epoch": 0.0105593136446131, "grad_norm": 16.660444259643555, "learning_rate": 0.00029871672920607153, "loss": 2.6075, "step": 4 }, { "epoch": 0.013199142055766375, "grad_norm": 6.3326215744018555, "learning_rate": 0.00029711779206048454, "loss": 0.4204, "step": 5 }, { "epoch": 0.01583897046691965, "grad_norm": 0.7938358783721924, "learning_rate": 0.0002948888739433602, "loss": 0.0209, "step": 6 }, { "epoch": 0.018478798878072925, "grad_norm": 0.027728406712412834, "learning_rate": 0.0002920395194242658, "loss": 0.0006, "step": 7 }, { "epoch": 0.0211186272892262, "grad_norm": 0.004830565769225359, "learning_rate": 0.000288581929876693, "loss": 0.0001, "step": 8 }, { "epoch": 0.023758455700379474, "grad_norm": 0.0023810293059796095, "learning_rate": 0.00028453091122990323, "loss": 0.0, "step": 9 }, { "epoch": 0.02639828411153275, "grad_norm": 0.035360973328351974, "learning_rate": 0.0002799038105676658, "loss": 0.0001, "step": 10 }, { "epoch": 0.029038112522686024, "grad_norm": 0.0016157212667167187, "learning_rate": 0.0002747204418453818, "loss": 0.0, "step": 11 }, { "epoch": 0.0316779409338393, "grad_norm": 0.0013053239090368152, "learning_rate": 0.00026900300104368524, "loss": 0.0, "step": 12 }, { "epoch": 0.034317769344992574, "grad_norm": 0.0008068020106293261, "learning_rate": 0.0002627759711218466, "loss": 0.0, "step": 13 }, { "epoch": 0.034317769344992574, "eval_loss": 4.657953468267806e-06, "eval_runtime": 229.4053, "eval_samples_per_second": 5.567, "eval_steps_per_second": 2.785, "step": 13 }, { "epoch": 0.03695759775614585, "grad_norm": 0.0009434845414943993, "learning_rate": 0.00025606601717798207, "loss": 0.0, "step": 14 }, { "epoch": 0.039597426167299124, "grad_norm": 0.0005841734819114208, "learning_rate": 0.0002489018722650103, "loss": 0.0, "step": 15 }, { "epoch": 0.0422372545784524, "grad_norm": 0.003180427709594369, "learning_rate": 0.00024131421435130807, "loss": 0.0, "step": 16 }, { "epoch": 0.044877082989605674, "grad_norm": 0.004339354578405619, "learning_rate": 0.0002333355349529403, "loss": 0.0, "step": 17 }, { "epoch": 0.04751691140075895, "grad_norm": 0.006588978227227926, "learning_rate": 0.000225, "loss": 0.0, "step": 18 }, { "epoch": 0.050156739811912224, "grad_norm": 0.0036406053695827723, "learning_rate": 0.00021634330353285017, "loss": 0.0, "step": 19 }, { "epoch": 0.0527965682230655, "grad_norm": 0.00797521322965622, "learning_rate": 0.00020740251485476345, "loss": 0.0, "step": 20 }, { "epoch": 0.055436396634218774, "grad_norm": 0.0004899859195575118, "learning_rate": 0.00019821591979547423, "loss": 0.0, "step": 21 }, { "epoch": 0.05807622504537205, "grad_norm": 0.0003694019978865981, "learning_rate": 0.0001888228567653781, "loss": 0.0, "step": 22 }, { "epoch": 0.060716053456525323, "grad_norm": 0.00016751833027228713, "learning_rate": 0.00017926354830241924, "loss": 0.0, "step": 23 }, { "epoch": 0.0633558818676786, "grad_norm": 0.00011875380005221814, "learning_rate": 0.00016957892883300775, "loss": 0.0, "step": 24 }, { "epoch": 0.06599571027883187, "grad_norm": 5.093684740131721e-05, "learning_rate": 0.00015981046938452146, "loss": 0.0, "step": 25 }, { "epoch": 0.06863553868998515, "grad_norm": 3.9787148125469685e-05, "learning_rate": 0.00015, "loss": 0.0, "step": 26 }, { "epoch": 0.06863553868998515, "eval_loss": 2.810793660046329e-07, "eval_runtime": 229.3285, "eval_samples_per_second": 5.568, "eval_steps_per_second": 2.786, "step": 26 }, { "epoch": 0.07127536710113842, "grad_norm": 3.462208769633435e-05, "learning_rate": 0.0001401895306154785, "loss": 0.0, "step": 27 }, { "epoch": 0.0739151955122917, "grad_norm": 2.7388192393118516e-05, "learning_rate": 0.00013042107116699228, "loss": 0.0, "step": 28 }, { "epoch": 0.07655502392344497, "grad_norm": 3.9361602830467746e-05, "learning_rate": 0.00012073645169758076, "loss": 0.0, "step": 29 }, { "epoch": 0.07919485233459825, "grad_norm": 2.9102855478413403e-05, "learning_rate": 0.00011117714323462186, "loss": 0.0, "step": 30 }, { "epoch": 0.08183468074575152, "grad_norm": 2.171610321966e-05, "learning_rate": 0.00010178408020452579, "loss": 0.0, "step": 31 }, { "epoch": 0.0844745091569048, "grad_norm": 2.040547587967012e-05, "learning_rate": 9.259748514523653e-05, "loss": 0.0, "step": 32 }, { "epoch": 0.08711433756805807, "grad_norm": 1.634558975638356e-05, "learning_rate": 8.365669646714983e-05, "loss": 0.0, "step": 33 }, { "epoch": 0.08975416597921135, "grad_norm": 2.2022310076863505e-05, "learning_rate": 7.500000000000002e-05, "loss": 0.0, "step": 34 }, { "epoch": 0.09239399439036462, "grad_norm": 1.6614567357464693e-05, "learning_rate": 6.66644650470597e-05, "loss": 0.0, "step": 35 }, { "epoch": 0.0950338228015179, "grad_norm": 1.5442792573594488e-05, "learning_rate": 5.8685785648691894e-05, "loss": 0.0, "step": 36 }, { "epoch": 0.09767365121267117, "grad_norm": 1.3897730241296813e-05, "learning_rate": 5.109812773498967e-05, "loss": 0.0, "step": 37 }, { "epoch": 0.10031347962382445, "grad_norm": 1.476151192036923e-05, "learning_rate": 4.3933982822017876e-05, "loss": 0.0, "step": 38 }, { "epoch": 0.10295330803497772, "grad_norm": 1.7724401914165355e-05, "learning_rate": 3.72240288781534e-05, "loss": 0.0, "step": 39 }, { "epoch": 0.10295330803497772, "eval_loss": 1.1696874935296364e-07, "eval_runtime": 229.2689, "eval_samples_per_second": 5.57, "eval_steps_per_second": 2.787, "step": 39 }, { "epoch": 0.105593136446131, "grad_norm": 1.175746729131788e-05, "learning_rate": 3.099699895631474e-05, "loss": 0.0, "step": 40 }, { "epoch": 0.10823296485728427, "grad_norm": 1.0341846973460633e-05, "learning_rate": 2.5279558154618197e-05, "loss": 0.0, "step": 41 }, { "epoch": 0.11087279326843755, "grad_norm": 1.2265077202755492e-05, "learning_rate": 2.009618943233419e-05, "loss": 0.0, "step": 42 }, { "epoch": 0.11351262167959082, "grad_norm": 1.1335401723044924e-05, "learning_rate": 1.546908877009676e-05, "loss": 0.0, "step": 43 }, { "epoch": 0.1161524500907441, "grad_norm": 1.6442403648397885e-05, "learning_rate": 1.1418070123306989e-05, "loss": 0.0, "step": 44 }, { "epoch": 0.11879227850189737, "grad_norm": 2.3461549062631093e-05, "learning_rate": 7.960480575734162e-06, "loss": 0.0, "step": 45 }, { "epoch": 0.12143210691305065, "grad_norm": 1.1922435987798963e-05, "learning_rate": 5.11112605663977e-06, "loss": 0.0, "step": 46 }, { "epoch": 0.12407193532420392, "grad_norm": 1.07811547422898e-05, "learning_rate": 2.882207939515435e-06, "loss": 0.0, "step": 47 }, { "epoch": 0.1267117637353572, "grad_norm": 4.0074228309094906e-05, "learning_rate": 1.2832707939284427e-06, "loss": 0.0, "step": 48 }, { "epoch": 0.12935159214651049, "grad_norm": 1.022117976390291e-05, "learning_rate": 3.211615142094781e-07, "loss": 0.0, "step": 49 }, { "epoch": 0.13199142055766375, "grad_norm": 9.782426786841825e-06, "learning_rate": 0.0, "loss": 0.0, "step": 50 } ], "logging_steps": 1, "max_steps": 50, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 13, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.935098268483584e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }