{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.99812382739212, "eval_steps": 100, "global_step": 133, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 33.946875, "epoch": 0.0375234521575985, "grad_norm": 1.5592833757400513, "kl": 0.013537660241127014, "learning_rate": 7.1428571428571436e-06, "loss": 0.0031, "reward": 0.03671875, "reward_std": 0.08486471865326166, "rewards/accuracy_reward": 0.0158203125, "rewards/format_reward": 0.0033203125, "rewards/relaxed_accuracy_reward": 0.017578125, "step": 5 }, { "completion_length": 7.9525390625, "epoch": 0.075046904315197, "grad_norm": 26.73110008239746, "kl": 17.372037601470947, "learning_rate": 1.4285714285714287e-05, "loss": 0.7047, "reward": 0.413671875, "reward_std": 0.19759062808007002, "rewards/accuracy_reward": 0.1939453125, "rewards/format_reward": 0.0005859375, "rewards/relaxed_accuracy_reward": 0.219140625, "step": 10 }, { "completion_length": 29.3978515625, "epoch": 0.1125703564727955, "grad_norm": 12.636091232299805, "kl": 2.432843017578125, "learning_rate": 1.9996515418688493e-05, "loss": 0.0908, "reward": 0.30703125, "reward_std": 0.24456602307036518, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.0, "rewards/relaxed_accuracy_reward": 0.16640625, "step": 15 }, { "completion_length": 27.5798828125, "epoch": 0.150093808630394, "grad_norm": 21.91870880126953, "kl": 1.253253173828125, "learning_rate": 1.9874809871741877e-05, "loss": 0.0532, "reward": 0.5099609375, "reward_std": 0.2849292915314436, "rewards/accuracy_reward": 0.22578125, "rewards/format_reward": 0.0, "rewards/relaxed_accuracy_reward": 0.2841796875, "step": 20 }, { "completion_length": 9.8216796875, "epoch": 0.18761726078799248, "grad_norm": 21.709964752197266, "kl": 1.0984130859375, "learning_rate": 1.9581296124106682e-05, "loss": 0.0578, "reward": 0.5490234375, "reward_std": 0.32321499213576316, "rewards/accuracy_reward": 0.2373046875, "rewards/format_reward": 0.0, "rewards/relaxed_accuracy_reward": 0.31171875, "step": 25 }, { "completion_length": 22.2578125, "epoch": 0.225140712945591, "grad_norm": 165.1543426513672, "kl": 3.56005859375, "learning_rate": 1.912108091398988e-05, "loss": 0.1402, "reward": 0.4880859375, "reward_std": 0.3410706129856408, "rewards/accuracy_reward": 0.222265625, "rewards/format_reward": 0.0, "rewards/relaxed_accuracy_reward": 0.2658203125, "step": 30 }, { "completion_length": 20.027734375, "epoch": 0.2626641651031895, "grad_norm": 1127.5472412109375, "kl": 6.7017822265625, "learning_rate": 1.8502171357296144e-05, "loss": 0.2507, "reward": 0.387890625, "reward_std": 0.3576823682524264, "rewards/accuracy_reward": 0.1626953125, "rewards/format_reward": 0.0, "rewards/relaxed_accuracy_reward": 0.2251953125, "step": 35 }, { "completion_length": 29.49453125, "epoch": 0.300187617260788, "grad_norm": 25.603010177612305, "kl": 1.09813232421875, "learning_rate": 1.773533563475053e-05, "loss": 0.0428, "reward": 0.5158203125, "reward_std": 0.36268206988461316, "rewards/accuracy_reward": 0.22578125, "rewards/format_reward": 0.0, "rewards/relaxed_accuracy_reward": 0.2900390625, "step": 40 }, { "completion_length": 17.1138671875, "epoch": 0.33771106941838647, "grad_norm": 6.592025279998779, "kl": 1.22408447265625, "learning_rate": 1.6833915640265485e-05, "loss": 0.0536, "reward": 0.4765625, "reward_std": 0.2620124928187579, "rewards/accuracy_reward": 0.2068359375, "rewards/format_reward": 0.0, "rewards/relaxed_accuracy_reward": 0.2697265625, "step": 45 }, { "completion_length": 35.35703125, "epoch": 0.37523452157598497, "grad_norm": 3.753960132598877, "kl": 1.41783447265625, "learning_rate": 1.58135948502146e-05, "loss": 0.0564, "reward": 0.5171875, "reward_std": 0.3441387979779392, "rewards/accuracy_reward": 0.232421875, "rewards/format_reward": 0.0, "rewards/relaxed_accuracy_reward": 0.284765625, "step": 50 }, { "completion_length": 19.3244140625, "epoch": 0.41275797373358347, "grad_norm": 8.08942985534668, "kl": 1.07467041015625, "learning_rate": 1.4692125452370664e-05, "loss": 0.038, "reward": 0.5447265625, "reward_std": 0.3162791552487761, "rewards/accuracy_reward": 0.2421875, "rewards/format_reward": 0.0, "rewards/relaxed_accuracy_reward": 0.3025390625, "step": 55 }, { "completion_length": 13.803515625, "epoch": 0.450281425891182, "grad_norm": 6688.68994140625, "kl": 11.382958984375, "learning_rate": 1.348901948209167e-05, "loss": 0.4542, "reward": 0.5509765625, "reward_std": 0.2770004874095321, "rewards/accuracy_reward": 0.24453125, "rewards/format_reward": 0.0, "rewards/relaxed_accuracy_reward": 0.3064453125, "step": 60 }, { "completion_length": 10.150390625, "epoch": 0.4878048780487805, "grad_norm": 22.93476104736328, "kl": 2.70247802734375, "learning_rate": 1.2225209339563144e-05, "loss": 0.1131, "reward": 0.58359375, "reward_std": 0.277045093011111, "rewards/accuracy_reward": 0.2546875, "rewards/format_reward": 0.0, "rewards/relaxed_accuracy_reward": 0.32890625, "step": 65 }, { "completion_length": 9.9724609375, "epoch": 0.525328330206379, "grad_norm": 4.137301445007324, "kl": 1.7162109375, "learning_rate": 1.092268359463302e-05, "loss": 0.0629, "reward": 0.524609375, "reward_std": 0.2948215680196881, "rewards/accuracy_reward": 0.2169921875, "rewards/format_reward": 0.0, "rewards/relaxed_accuracy_reward": 0.3076171875, "step": 70 }, { "completion_length": 8.46484375, "epoch": 0.5628517823639775, "grad_norm": 5.097509860992432, "kl": 2.0114501953125, "learning_rate": 9.604104415737309e-06, "loss": 0.0958, "reward": 0.5404296875, "reward_std": 0.3218729373533279, "rewards/accuracy_reward": 0.2349609375, "rewards/format_reward": 0.0, "rewards/relaxed_accuracy_reward": 0.30546875, "step": 75 }, { "completion_length": 8.421875, "epoch": 0.600375234521576, "grad_norm": 6.521908283233643, "kl": 2.06651611328125, "learning_rate": 8.292413279130625e-06, "loss": 0.0886, "reward": 0.528125, "reward_std": 0.32329283356666566, "rewards/accuracy_reward": 0.222265625, "rewards/format_reward": 0.0, "rewards/relaxed_accuracy_reward": 0.305859375, "step": 80 }, { "completion_length": 8.27578125, "epoch": 0.6378986866791745, "grad_norm": 3.548377513885498, "kl": 1.679052734375, "learning_rate": 7.010431818542298e-06, "loss": 0.0482, "reward": 0.4205078125, "reward_std": 0.3025111163035035, "rewards/accuracy_reward": 0.1734375, "rewards/format_reward": 0.0, "rewards/relaxed_accuracy_reward": 0.2470703125, "step": 85 }, { "completion_length": 8.621484375, "epoch": 0.6754221388367729, "grad_norm": 11.26955509185791, "kl": 2.084521484375, "learning_rate": 5.780464759928623e-06, "loss": 0.097, "reward": 0.566015625, "reward_std": 0.31781149725429714, "rewards/accuracy_reward": 0.239453125, "rewards/format_reward": 0.0, "rewards/relaxed_accuracy_reward": 0.3265625, "step": 90 }, { "completion_length": 8.6640625, "epoch": 0.7129455909943715, "grad_norm": 15.46524715423584, "kl": 2.5925048828125, "learning_rate": 4.623911849714226e-06, "loss": 0.0919, "reward": 0.546484375, "reward_std": 0.26226845681667327, "rewards/accuracy_reward": 0.22421875, "rewards/format_reward": 0.0, "rewards/relaxed_accuracy_reward": 0.322265625, "step": 95 }, { "completion_length": 8.1521484375, "epoch": 0.7504690431519699, "grad_norm": 8.370095252990723, "kl": 2.80860595703125, "learning_rate": 3.560895528440844e-06, "loss": 0.0876, "reward": 0.6302734375, "reward_std": 0.27545339791104195, "rewards/accuracy_reward": 0.275, "rewards/format_reward": 0.0, "rewards/relaxed_accuracy_reward": 0.3552734375, "step": 100 }, { "epoch": 0.7504690431519699, "eval_completion_length": 8.064242160278745, "eval_kl": 1.6110899390243902, "eval_loss": 0.05133385583758354, "eval_reward": 0.5677264808362369, "eval_reward_std": 0.37672981305716347, "eval_rewards/accuracy_reward": 0.24401132404181183, "eval_rewards/format_reward": 0.0, "eval_rewards/relaxed_accuracy_reward": 0.3237151567944251, "eval_runtime": 419.9905, "eval_samples_per_second": 2.726, "eval_steps_per_second": 0.683, "step": 100 }, { "completion_length": 8.2787109375, "epoch": 0.7879924953095685, "grad_norm": 5.566633224487305, "kl": 1.6791015625, "learning_rate": 2.6099108277934105e-06, "loss": 0.0743, "reward": 0.580859375, "reward_std": 0.3424753251951188, "rewards/accuracy_reward": 0.253125, "rewards/format_reward": 0.0, "rewards/relaxed_accuracy_reward": 0.327734375, "step": 105 }, { "completion_length": 8.3333984375, "epoch": 0.8255159474671669, "grad_norm": 4.05443000793457, "kl": 2.09364013671875, "learning_rate": 1.7875035823168641e-06, "loss": 0.0748, "reward": 0.5283203125, "reward_std": 0.3331008433829993, "rewards/accuracy_reward": 0.2318359375, "rewards/format_reward": 0.0, "rewards/relaxed_accuracy_reward": 0.296484375, "step": 110 }, { "completion_length": 8.5595703125, "epoch": 0.8630393996247655, "grad_norm": 20.012693405151367, "kl": 2.143017578125, "learning_rate": 1.1079825545001887e-06, "loss": 0.0681, "reward": 0.5373046875, "reward_std": 0.3182933186646551, "rewards/accuracy_reward": 0.23046875, "rewards/format_reward": 0.0, "rewards/relaxed_accuracy_reward": 0.3068359375, "step": 115 }, { "completion_length": 8.526171875, "epoch": 0.900562851782364, "grad_norm": 12.22225570678711, "kl": 1.8458984375, "learning_rate": 5.831704818578842e-07, "loss": 0.0716, "reward": 0.5498046875, "reward_std": 0.31468736389651897, "rewards/accuracy_reward": 0.237890625, "rewards/format_reward": 0.0, "rewards/relaxed_accuracy_reward": 0.3119140625, "step": 120 }, { "completion_length": 8.767578125, "epoch": 0.9380863039399625, "grad_norm": 4.848618507385254, "kl": 1.8501708984375, "learning_rate": 2.2219837744959284e-07, "loss": 0.0649, "reward": 0.5712890625, "reward_std": 0.33388876002281903, "rewards/accuracy_reward": 0.242578125, "rewards/format_reward": 0.0, "rewards/relaxed_accuracy_reward": 0.3287109375, "step": 125 }, { "completion_length": 8.5228515625, "epoch": 0.975609756097561, "grad_norm": 5.429755210876465, "kl": 1.84400634765625, "learning_rate": 3.134666272774034e-08, "loss": 0.0876, "reward": 0.564453125, "reward_std": 0.31874394970946013, "rewards/accuracy_reward": 0.2439453125, "rewards/format_reward": 0.0, "rewards/relaxed_accuracy_reward": 0.3205078125, "step": 130 }, { "completion_length": 8.5595703125, "epoch": 0.99812382739212, "kl": 1.7513020833333333, "reward": 0.55078125, "reward_std": 0.3167417396325618, "rewards/accuracy_reward": 0.2275390625, "rewards/format_reward": 0.0, "rewards/relaxed_accuracy_reward": 0.3232421875, "step": 133, "total_flos": 0.0, "train_loss": 0.11666255951964093, "train_runtime": 146747.14, "train_samples_per_second": 0.116, "train_steps_per_second": 0.001 } ], "logging_steps": 5, "max_steps": 133, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 15, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }