Safetensors
qwen2
trustalign_qwen2.5_1.5b / trainer_state.json
shanghong's picture
Upload folder using huggingface_hub
66473ae verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.6155917425310937,
"eval_steps": 10,
"global_step": 700,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004615976407231696,
"grad_norm": 60.83765068742266,
"learning_rate": 1.1494252873563218e-08,
"logits/chosen": 0.4711977541446686,
"logits/rejected": 0.4847034811973572,
"logps/chosen": -41.84939193725586,
"logps/rejected": -44.508792877197266,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 2
},
{
"epoch": 0.009231952814463392,
"grad_norm": 89.49857504360673,
"learning_rate": 2.2988505747126436e-08,
"logits/chosen": 0.4102262556552887,
"logits/rejected": 0.4489870071411133,
"logps/chosen": -33.33359909057617,
"logps/rejected": -48.11466979980469,
"loss": 0.6965,
"rewards/accuracies": 0.4722222089767456,
"rewards/chosen": 0.016991177573800087,
"rewards/margins": -0.00249446090310812,
"rewards/rejected": 0.01948563940823078,
"step": 4
},
{
"epoch": 0.01384792922169509,
"grad_norm": 91.503921376188,
"learning_rate": 3.448275862068965e-08,
"logits/chosen": 0.4212642312049866,
"logits/rejected": 0.448761522769928,
"logps/chosen": -39.75364685058594,
"logps/rejected": -51.98044967651367,
"loss": 0.7058,
"rewards/accuracies": 0.4305555522441864,
"rewards/chosen": 0.01352924108505249,
"rewards/margins": -0.021095700562000275,
"rewards/rejected": 0.034624941647052765,
"step": 6
},
{
"epoch": 0.018463905628926785,
"grad_norm": 74.67414376851612,
"learning_rate": 4.597701149425287e-08,
"logits/chosen": 0.3533351719379425,
"logits/rejected": 0.38716739416122437,
"logps/chosen": -42.66749954223633,
"logps/rejected": -59.93525695800781,
"loss": 0.682,
"rewards/accuracies": 0.5972222089767456,
"rewards/chosen": 0.15689438581466675,
"rewards/margins": 0.02920585870742798,
"rewards/rejected": 0.12768852710723877,
"step": 8
},
{
"epoch": 0.023079882036158482,
"grad_norm": 68.70984683419653,
"learning_rate": 5.747126436781609e-08,
"logits/chosen": 0.49728691577911377,
"logits/rejected": 0.5158182978630066,
"logps/chosen": -40.442108154296875,
"logps/rejected": -47.894962310791016,
"loss": 0.6784,
"rewards/accuracies": 0.5833333134651184,
"rewards/chosen": 0.20257243514060974,
"rewards/margins": 0.033093564212322235,
"rewards/rejected": 0.1694788932800293,
"step": 10
},
{
"epoch": 0.023079882036158482,
"eval_logits/chosen": 0.3284655511379242,
"eval_logits/rejected": 0.3523290753364563,
"eval_logps/chosen": -41.368160247802734,
"eval_logps/rejected": -47.68316650390625,
"eval_loss": 0.6900005340576172,
"eval_rewards/accuracies": 0.5040322542190552,
"eval_rewards/chosen": 0.18856020271778107,
"eval_rewards/margins": 0.010975954122841358,
"eval_rewards/rejected": 0.1775842159986496,
"eval_runtime": 223.5149,
"eval_samples_per_second": 7.758,
"eval_steps_per_second": 1.942,
"step": 10
},
{
"epoch": 0.02769585844339018,
"grad_norm": 83.08821357925031,
"learning_rate": 6.89655172413793e-08,
"logits/chosen": 0.39168137311935425,
"logits/rejected": 0.428312748670578,
"logps/chosen": -40.189659118652344,
"logps/rejected": -55.229732513427734,
"loss": 0.6825,
"rewards/accuracies": 0.5138888955116272,
"rewards/chosen": 0.10889428108930588,
"rewards/margins": 0.025906018912792206,
"rewards/rejected": 0.08298826217651367,
"step": 12
},
{
"epoch": 0.032311834850621876,
"grad_norm": 83.2788469670015,
"learning_rate": 8.045977011494252e-08,
"logits/chosen": 0.4244603216648102,
"logits/rejected": 0.45606857538223267,
"logps/chosen": -45.81875228881836,
"logps/rejected": -59.79555130004883,
"loss": 0.707,
"rewards/accuracies": 0.5277777910232544,
"rewards/chosen": 0.0007680323324166238,
"rewards/margins": -0.02245757356286049,
"rewards/rejected": 0.023225605487823486,
"step": 14
},
{
"epoch": 0.03692781125785357,
"grad_norm": 65.27464739827121,
"learning_rate": 9.195402298850574e-08,
"logits/chosen": 0.43778783082962036,
"logits/rejected": 0.47771337628364563,
"logps/chosen": -33.643489837646484,
"logps/rejected": -47.315940856933594,
"loss": 0.6907,
"rewards/accuracies": 0.4861111044883728,
"rewards/chosen": 0.16155114769935608,
"rewards/margins": 0.009526676498353481,
"rewards/rejected": 0.15202444791793823,
"step": 16
},
{
"epoch": 0.04154378766508527,
"grad_norm": 60.46601141846051,
"learning_rate": 1.0344827586206897e-07,
"logits/chosen": 0.4576772153377533,
"logits/rejected": 0.4669303894042969,
"logps/chosen": -49.01601791381836,
"logps/rejected": -44.165489196777344,
"loss": 0.7024,
"rewards/accuracies": 0.4166666567325592,
"rewards/chosen": 0.1639019399881363,
"rewards/margins": -0.013081331737339497,
"rewards/rejected": 0.1769832819700241,
"step": 18
},
{
"epoch": 0.046159764072316964,
"grad_norm": 79.31933330847342,
"learning_rate": 1.1494252873563217e-07,
"logits/chosen": 0.40101033449172974,
"logits/rejected": 0.4429229199886322,
"logps/chosen": -42.295860290527344,
"logps/rejected": -61.62363052368164,
"loss": 0.6993,
"rewards/accuracies": 0.5694444179534912,
"rewards/chosen": 0.1744026243686676,
"rewards/margins": -0.005564332008361816,
"rewards/rejected": 0.17996692657470703,
"step": 20
},
{
"epoch": 0.046159764072316964,
"eval_logits/chosen": 0.3300890624523163,
"eval_logits/rejected": 0.3539319634437561,
"eval_logps/chosen": -41.36879348754883,
"eval_logps/rejected": -47.67192840576172,
"eval_loss": 0.6927710771560669,
"eval_rewards/accuracies": 0.4694700539112091,
"eval_rewards/chosen": 0.1882432997226715,
"eval_rewards/margins": 0.005040565971285105,
"eval_rewards/rejected": 0.18320275843143463,
"eval_runtime": 220.5959,
"eval_samples_per_second": 7.861,
"eval_steps_per_second": 1.967,
"step": 20
},
{
"epoch": 0.05077574047954866,
"grad_norm": 74.95614553584633,
"learning_rate": 1.2643678160919542e-07,
"logits/chosen": 0.35644879937171936,
"logits/rejected": 0.39824995398521423,
"logps/chosen": -44.09666442871094,
"logps/rejected": -67.98532104492188,
"loss": 0.6849,
"rewards/accuracies": 0.4861111044883728,
"rewards/chosen": 0.061192478984594345,
"rewards/margins": 0.024405598640441895,
"rewards/rejected": 0.03678688034415245,
"step": 22
},
{
"epoch": 0.05539171688678036,
"grad_norm": 59.95529358393387,
"learning_rate": 1.379310344827586e-07,
"logits/chosen": 0.4076593816280365,
"logits/rejected": 0.4187220335006714,
"logps/chosen": -50.34169006347656,
"logps/rejected": -52.33488464355469,
"loss": 0.6737,
"rewards/accuracies": 0.5972222089767456,
"rewards/chosen": 0.18035806715488434,
"rewards/margins": 0.046983275562524796,
"rewards/rejected": 0.13337479531764984,
"step": 24
},
{
"epoch": 0.06000769329401205,
"grad_norm": 58.82337491053465,
"learning_rate": 1.4942528735632184e-07,
"logits/chosen": 0.38400429487228394,
"logits/rejected": 0.3896331191062927,
"logps/chosen": -45.30482482910156,
"logps/rejected": -38.63485336303711,
"loss": 0.6927,
"rewards/accuracies": 0.5416666865348816,
"rewards/chosen": 0.12515152990818024,
"rewards/margins": 0.005594419315457344,
"rewards/rejected": 0.11955711245536804,
"step": 26
},
{
"epoch": 0.06462366970124375,
"grad_norm": 79.835671840217,
"learning_rate": 1.6091954022988505e-07,
"logits/chosen": 0.38133352994918823,
"logits/rejected": 0.4193841814994812,
"logps/chosen": -46.66801452636719,
"logps/rejected": -66.68572998046875,
"loss": 0.6817,
"rewards/accuracies": 0.5416666865348816,
"rewards/chosen": 0.1615171581506729,
"rewards/margins": 0.03017430752515793,
"rewards/rejected": 0.1313428282737732,
"step": 28
},
{
"epoch": 0.06923964610847544,
"grad_norm": 63.4551490148668,
"learning_rate": 1.7241379310344828e-07,
"logits/chosen": 0.38135284185409546,
"logits/rejected": 0.4095006585121155,
"logps/chosen": -40.06434631347656,
"logps/rejected": -49.53153610229492,
"loss": 0.6732,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.18324324488639832,
"rewards/margins": 0.04405728355050087,
"rewards/rejected": 0.13918595016002655,
"step": 30
},
{
"epoch": 0.06923964610847544,
"eval_logits/chosen": 0.33052363991737366,
"eval_logits/rejected": 0.3544217050075531,
"eval_logps/chosen": -41.5091438293457,
"eval_logps/rejected": -47.86111068725586,
"eval_loss": 0.6812014579772949,
"eval_rewards/accuracies": 0.546658992767334,
"eval_rewards/chosen": 0.11806601285934448,
"eval_rewards/margins": 0.029455602169036865,
"eval_rewards/rejected": 0.08861041069030762,
"eval_runtime": 220.5898,
"eval_samples_per_second": 7.861,
"eval_steps_per_second": 1.967,
"step": 30
},
{
"epoch": 0.07385562251570714,
"grad_norm": 54.00177702879831,
"learning_rate": 1.839080459770115e-07,
"logits/chosen": 0.4309755563735962,
"logits/rejected": 0.45285335183143616,
"logps/chosen": -42.45962905883789,
"logps/rejected": -47.46916198730469,
"loss": 0.6778,
"rewards/accuracies": 0.5416666865348816,
"rewards/chosen": 0.059671804308891296,
"rewards/margins": 0.03700065612792969,
"rewards/rejected": 0.022671150043606758,
"step": 32
},
{
"epoch": 0.07847159892293884,
"grad_norm": 55.987754524641964,
"learning_rate": 1.9540229885057472e-07,
"logits/chosen": 0.3958838880062103,
"logits/rejected": 0.43136459589004517,
"logps/chosen": -37.61958694458008,
"logps/rejected": -52.296146392822266,
"loss": 0.6756,
"rewards/accuracies": 0.5138888955116272,
"rewards/chosen": 0.2221948355436325,
"rewards/margins": 0.04236772283911705,
"rewards/rejected": 0.17982712388038635,
"step": 34
},
{
"epoch": 0.08308757533017054,
"grad_norm": 67.31410028619514,
"learning_rate": 2.0689655172413793e-07,
"logits/chosen": 0.44812121987342834,
"logits/rejected": 0.46431127190589905,
"logps/chosen": -42.98078155517578,
"logps/rejected": -41.65153884887695,
"loss": 0.6493,
"rewards/accuracies": 0.6388888955116272,
"rewards/chosen": 0.30534830689430237,
"rewards/margins": 0.09969804435968399,
"rewards/rejected": 0.20565026998519897,
"step": 36
},
{
"epoch": 0.08770355173740223,
"grad_norm": 57.904024813693326,
"learning_rate": 2.1839080459770114e-07,
"logits/chosen": 0.49128374457359314,
"logits/rejected": 0.5145975351333618,
"logps/chosen": -44.50560760498047,
"logps/rejected": -49.38070297241211,
"loss": 0.6628,
"rewards/accuracies": 0.5972222089767456,
"rewards/chosen": 0.18226391077041626,
"rewards/margins": 0.0719500482082367,
"rewards/rejected": 0.11031384021043777,
"step": 38
},
{
"epoch": 0.09231952814463393,
"grad_norm": 64.05496800782066,
"learning_rate": 2.2988505747126435e-07,
"logits/chosen": 0.46414005756378174,
"logits/rejected": 0.47909700870513916,
"logps/chosen": -45.80656433105469,
"logps/rejected": -48.13614273071289,
"loss": 0.6614,
"rewards/accuracies": 0.6388888955116272,
"rewards/chosen": 0.1019454374909401,
"rewards/margins": 0.07323868572711945,
"rewards/rejected": 0.028706755489110947,
"step": 40
},
{
"epoch": 0.09231952814463393,
"eval_logits/chosen": 0.3304091989994049,
"eval_logits/rejected": 0.35432368516921997,
"eval_logps/chosen": -41.51032638549805,
"eval_logps/rejected": -47.951194763183594,
"eval_loss": 0.6623325347900391,
"eval_rewards/accuracies": 0.5748847723007202,
"eval_rewards/chosen": 0.11747448146343231,
"eval_rewards/margins": 0.07390521466732025,
"eval_rewards/rejected": 0.04356926307082176,
"eval_runtime": 220.5888,
"eval_samples_per_second": 7.861,
"eval_steps_per_second": 1.967,
"step": 40
},
{
"epoch": 0.09693550455186563,
"grad_norm": 57.52177717893154,
"learning_rate": 2.413793103448276e-07,
"logits/chosen": 0.40689817070961,
"logits/rejected": 0.427402138710022,
"logps/chosen": -38.75439453125,
"logps/rejected": -44.31669235229492,
"loss": 0.6302,
"rewards/accuracies": 0.7222222089767456,
"rewards/chosen": 0.2862703502178192,
"rewards/margins": 0.14747940003871918,
"rewards/rejected": 0.13879093527793884,
"step": 42
},
{
"epoch": 0.10155148095909731,
"grad_norm": 64.43087177898828,
"learning_rate": 2.5287356321839084e-07,
"logits/chosen": 0.38502392172813416,
"logits/rejected": 0.42915642261505127,
"logps/chosen": -44.23611831665039,
"logps/rejected": -70.150634765625,
"loss": 0.6523,
"rewards/accuracies": 0.5277777910232544,
"rewards/chosen": 0.29370981454849243,
"rewards/margins": 0.11090421676635742,
"rewards/rejected": 0.18280558288097382,
"step": 44
},
{
"epoch": 0.10616745736632902,
"grad_norm": 60.64457511313282,
"learning_rate": 2.64367816091954e-07,
"logits/chosen": 0.4625084698200226,
"logits/rejected": 0.47940170764923096,
"logps/chosen": -47.40989685058594,
"logps/rejected": -50.2266731262207,
"loss": 0.6586,
"rewards/accuracies": 0.5416666865348816,
"rewards/chosen": 0.19395428895950317,
"rewards/margins": 0.09819034487009048,
"rewards/rejected": 0.09576395153999329,
"step": 46
},
{
"epoch": 0.11078343377356072,
"grad_norm": 48.97275141927136,
"learning_rate": 2.758620689655172e-07,
"logits/chosen": 0.40377330780029297,
"logits/rejected": 0.4251302480697632,
"logps/chosen": -40.91835021972656,
"logps/rejected": -46.69221878051758,
"loss": 0.6553,
"rewards/accuracies": 0.5277777910232544,
"rewards/chosen": 0.19001546502113342,
"rewards/margins": 0.11491294950246811,
"rewards/rejected": 0.07510250806808472,
"step": 48
},
{
"epoch": 0.1153994101807924,
"grad_norm": 50.405334242894924,
"learning_rate": 2.873563218390804e-07,
"logits/chosen": 0.42986366152763367,
"logits/rejected": 0.4425734579563141,
"logps/chosen": -45.240882873535156,
"logps/rejected": -45.33219528198242,
"loss": 0.6545,
"rewards/accuracies": 0.5972222089767456,
"rewards/chosen": 0.27720221877098083,
"rewards/margins": 0.10149689018726349,
"rewards/rejected": 0.17570529878139496,
"step": 50
},
{
"epoch": 0.1153994101807924,
"eval_logits/chosen": 0.3320508301258087,
"eval_logits/rejected": 0.35591834783554077,
"eval_logps/chosen": -41.36655807495117,
"eval_logps/rejected": -48.028892517089844,
"eval_loss": 0.6255878210067749,
"eval_rewards/accuracies": 0.6278801560401917,
"eval_rewards/chosen": 0.1893603652715683,
"eval_rewards/margins": 0.18464061617851257,
"eval_rewards/rejected": 0.004719759337604046,
"eval_runtime": 220.5277,
"eval_samples_per_second": 7.863,
"eval_steps_per_second": 1.968,
"step": 50
},
{
"epoch": 0.1200153865880241,
"grad_norm": 59.45036930086866,
"learning_rate": 2.988505747126437e-07,
"logits/chosen": 0.4412320852279663,
"logits/rejected": 0.47745391726493835,
"logps/chosen": -38.808204650878906,
"logps/rejected": -57.61214828491211,
"loss": 0.6523,
"rewards/accuracies": 0.5277777910232544,
"rewards/chosen": 0.24403540790081024,
"rewards/margins": 0.14384596049785614,
"rewards/rejected": 0.10018942505121231,
"step": 52
},
{
"epoch": 0.1246313629952558,
"grad_norm": 54.50350019894718,
"learning_rate": 3.103448275862069e-07,
"logits/chosen": 0.305615097284317,
"logits/rejected": 0.3378358781337738,
"logps/chosen": -41.46311950683594,
"logps/rejected": -55.873138427734375,
"loss": 0.6345,
"rewards/accuracies": 0.5694444179534912,
"rewards/chosen": 0.2176976054906845,
"rewards/margins": 0.17212893068790436,
"rewards/rejected": 0.045568663626909256,
"step": 54
},
{
"epoch": 0.1292473394024875,
"grad_norm": 49.044811443941626,
"learning_rate": 3.218390804597701e-07,
"logits/chosen": 0.4806426763534546,
"logits/rejected": 0.5007810592651367,
"logps/chosen": -37.00300216674805,
"logps/rejected": -42.795040130615234,
"loss": 0.6005,
"rewards/accuracies": 0.6944444179534912,
"rewards/chosen": 0.41853082180023193,
"rewards/margins": 0.23157899081707,
"rewards/rejected": 0.18695180118083954,
"step": 56
},
{
"epoch": 0.1338633158097192,
"grad_norm": 54.19272761171978,
"learning_rate": 3.333333333333333e-07,
"logits/chosen": 0.4073159098625183,
"logits/rejected": 0.4315372109413147,
"logps/chosen": -39.63461685180664,
"logps/rejected": -41.75359344482422,
"loss": 0.5767,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.488341748714447,
"rewards/margins": 0.2964838743209839,
"rewards/rejected": 0.19185791909694672,
"step": 58
},
{
"epoch": 0.13847929221695088,
"grad_norm": 45.35161413256781,
"learning_rate": 3.4482758620689656e-07,
"logits/chosen": 0.3869187831878662,
"logits/rejected": 0.4154462218284607,
"logps/chosen": -40.21774673461914,
"logps/rejected": -49.05698013305664,
"loss": 0.5939,
"rewards/accuracies": 0.6944444179534912,
"rewards/chosen": 0.4058813452720642,
"rewards/margins": 0.2916874289512634,
"rewards/rejected": 0.11419390141963959,
"step": 60
},
{
"epoch": 0.13847929221695088,
"eval_logits/chosen": 0.3351307511329651,
"eval_logits/rejected": 0.35898876190185547,
"eval_logps/chosen": -40.8883171081543,
"eval_logps/rejected": -47.73066711425781,
"eval_loss": 0.5981891751289368,
"eval_rewards/accuracies": 0.6745391488075256,
"eval_rewards/chosen": 0.4284805655479431,
"eval_rewards/margins": 0.2746467888355255,
"eval_rewards/rejected": 0.15383380651474,
"eval_runtime": 220.5776,
"eval_samples_per_second": 7.861,
"eval_steps_per_second": 1.968,
"step": 60
},
{
"epoch": 0.1430952686241826,
"grad_norm": 43.3905484769897,
"learning_rate": 3.5632183908045977e-07,
"logits/chosen": 0.4458725154399872,
"logits/rejected": 0.46262863278388977,
"logps/chosen": -40.7205924987793,
"logps/rejected": -47.21548080444336,
"loss": 0.6053,
"rewards/accuracies": 0.6666666865348816,
"rewards/chosen": 0.42287477850914,
"rewards/margins": 0.2391357719898224,
"rewards/rejected": 0.18373897671699524,
"step": 62
},
{
"epoch": 0.14771124503141428,
"grad_norm": 46.32576522285691,
"learning_rate": 3.67816091954023e-07,
"logits/chosen": 0.42775771021842957,
"logits/rejected": 0.4581214487552643,
"logps/chosen": -42.59015655517578,
"logps/rejected": -51.6392822265625,
"loss": 0.5669,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.43508684635162354,
"rewards/margins": 0.3611146807670593,
"rewards/rejected": 0.0739721804857254,
"step": 64
},
{
"epoch": 0.152327221438646,
"grad_norm": 42.51968356117282,
"learning_rate": 3.793103448275862e-07,
"logits/chosen": 0.4170711636543274,
"logits/rejected": 0.45475757122039795,
"logps/chosen": -38.8193359375,
"logps/rejected": -59.24808120727539,
"loss": 0.5514,
"rewards/accuracies": 0.7222222089767456,
"rewards/chosen": 0.5248206257820129,
"rewards/margins": 0.48520517349243164,
"rewards/rejected": 0.03961547836661339,
"step": 66
},
{
"epoch": 0.15694319784587768,
"grad_norm": 59.1347578756685,
"learning_rate": 3.9080459770114945e-07,
"logits/chosen": 0.3444980978965759,
"logits/rejected": 0.38142290711402893,
"logps/chosen": -37.63268280029297,
"logps/rejected": -56.55868911743164,
"loss": 0.6421,
"rewards/accuracies": 0.6805555820465088,
"rewards/chosen": 0.28356897830963135,
"rewards/margins": 0.37217438220977783,
"rewards/rejected": -0.0886053591966629,
"step": 68
},
{
"epoch": 0.16155917425310937,
"grad_norm": 43.01512643240851,
"learning_rate": 4.0229885057471266e-07,
"logits/chosen": 0.47459664940834045,
"logits/rejected": 0.5048218369483948,
"logps/chosen": -37.06074905395508,
"logps/rejected": -41.83311462402344,
"loss": 0.4985,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.7238032221794128,
"rewards/margins": 0.6140663623809814,
"rewards/rejected": 0.10973668098449707,
"step": 70
},
{
"epoch": 0.16155917425310937,
"eval_logits/chosen": 0.3373754024505615,
"eval_logits/rejected": 0.361337274312973,
"eval_logps/chosen": -40.72093200683594,
"eval_logps/rejected": -47.9627685546875,
"eval_loss": 0.5673334002494812,
"eval_rewards/accuracies": 0.7073732614517212,
"eval_rewards/chosen": 0.512172520160675,
"eval_rewards/margins": 0.47438928484916687,
"eval_rewards/rejected": 0.03778325766324997,
"eval_runtime": 220.4667,
"eval_samples_per_second": 7.865,
"eval_steps_per_second": 1.969,
"step": 70
},
{
"epoch": 0.16617515066034108,
"grad_norm": 58.03313651952633,
"learning_rate": 4.1379310344827586e-07,
"logits/chosen": 0.47328370809555054,
"logits/rejected": 0.516916036605835,
"logps/chosen": -39.381927490234375,
"logps/rejected": -63.04606628417969,
"loss": 0.5446,
"rewards/accuracies": 0.7222222089767456,
"rewards/chosen": 0.5364831686019897,
"rewards/margins": 0.5998459458351135,
"rewards/rejected": -0.06336280703544617,
"step": 72
},
{
"epoch": 0.17079112706757277,
"grad_norm": 38.72118579621577,
"learning_rate": 4.25287356321839e-07,
"logits/chosen": 0.4743606746196747,
"logits/rejected": 0.48414406180381775,
"logps/chosen": -47.13395690917969,
"logps/rejected": -47.23988723754883,
"loss": 0.6296,
"rewards/accuracies": 0.6944444179534912,
"rewards/chosen": 0.46243613958358765,
"rewards/margins": 0.4000816345214844,
"rewards/rejected": 0.06235449016094208,
"step": 74
},
{
"epoch": 0.17540710347480445,
"grad_norm": 52.02457163056824,
"learning_rate": 4.367816091954023e-07,
"logits/chosen": 0.4869605302810669,
"logits/rejected": 0.5183277726173401,
"logps/chosen": -41.5470085144043,
"logps/rejected": -52.64150619506836,
"loss": 0.5554,
"rewards/accuracies": 0.7361111044883728,
"rewards/chosen": 0.6152575612068176,
"rewards/margins": 0.5033391118049622,
"rewards/rejected": 0.11191850155591965,
"step": 76
},
{
"epoch": 0.18002307988203617,
"grad_norm": 39.942306949985145,
"learning_rate": 4.482758620689655e-07,
"logits/chosen": 0.4678427278995514,
"logits/rejected": 0.4919649660587311,
"logps/chosen": -36.33488845825195,
"logps/rejected": -46.28294372558594,
"loss": 0.5509,
"rewards/accuracies": 0.6944444179534912,
"rewards/chosen": 0.7630440592765808,
"rewards/margins": 0.5350204110145569,
"rewards/rejected": 0.22802363336086273,
"step": 78
},
{
"epoch": 0.18463905628926786,
"grad_norm": 49.68265630941736,
"learning_rate": 4.597701149425287e-07,
"logits/chosen": 0.4118601381778717,
"logits/rejected": 0.4340742528438568,
"logps/chosen": -36.466026306152344,
"logps/rejected": -40.359230041503906,
"loss": 0.5161,
"rewards/accuracies": 0.7222222089767456,
"rewards/chosen": 0.7488323450088501,
"rewards/margins": 0.6430253982543945,
"rewards/rejected": 0.1058068722486496,
"step": 80
},
{
"epoch": 0.18463905628926786,
"eval_logits/chosen": 0.3424670994281769,
"eval_logits/rejected": 0.3665529489517212,
"eval_logps/chosen": -40.603668212890625,
"eval_logps/rejected": -48.121337890625,
"eval_loss": 0.5314013957977295,
"eval_rewards/accuracies": 0.7206221222877502,
"eval_rewards/chosen": 0.5708039999008179,
"eval_rewards/margins": 0.6123039126396179,
"eval_rewards/rejected": -0.04149990156292915,
"eval_runtime": 220.269,
"eval_samples_per_second": 7.872,
"eval_steps_per_second": 1.97,
"step": 80
},
{
"epoch": 0.18925503269649954,
"grad_norm": 40.18854063526523,
"learning_rate": 4.712643678160919e-07,
"logits/chosen": 0.4146896302700043,
"logits/rejected": 0.44372716546058655,
"logps/chosen": -44.112205505371094,
"logps/rejected": -54.97979736328125,
"loss": 0.5066,
"rewards/accuracies": 0.7916666865348816,
"rewards/chosen": 0.5255990624427795,
"rewards/margins": 0.6622204780578613,
"rewards/rejected": -0.1366213709115982,
"step": 82
},
{
"epoch": 0.19387100910373126,
"grad_norm": 35.8737356471814,
"learning_rate": 4.827586206896552e-07,
"logits/chosen": 0.46901315450668335,
"logits/rejected": 0.5202505588531494,
"logps/chosen": -37.11308288574219,
"logps/rejected": -64.51854705810547,
"loss": 0.4791,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": 0.6412889957427979,
"rewards/margins": 0.8545607924461365,
"rewards/rejected": -0.213271826505661,
"step": 84
},
{
"epoch": 0.19848698551096294,
"grad_norm": 38.06201763208911,
"learning_rate": 4.942528735632184e-07,
"logits/chosen": 0.4869195520877838,
"logits/rejected": 0.5158190727233887,
"logps/chosen": -41.02754592895508,
"logps/rejected": -52.270511627197266,
"loss": 0.4592,
"rewards/accuracies": 0.7777777910232544,
"rewards/chosen": 0.6756889820098877,
"rewards/margins": 0.815551221370697,
"rewards/rejected": -0.13986223936080933,
"step": 86
},
{
"epoch": 0.20310296191819463,
"grad_norm": 36.34414677933188,
"learning_rate": 4.999979670146248e-07,
"logits/chosen": 0.4226466119289398,
"logits/rejected": 0.4440222680568695,
"logps/chosen": -45.02897644042969,
"logps/rejected": -53.814674377441406,
"loss": 0.4665,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.5262306928634644,
"rewards/margins": 0.772502601146698,
"rewards/rejected": -0.24627192318439484,
"step": 88
},
{
"epoch": 0.20771893832542634,
"grad_norm": 47.62232189356859,
"learning_rate": 4.99981703330008e-07,
"logits/chosen": 0.43458905816078186,
"logits/rejected": 0.45631253719329834,
"logps/chosen": -39.44232177734375,
"logps/rejected": -49.5074462890625,
"loss": 0.508,
"rewards/accuracies": 0.7916666865348816,
"rewards/chosen": 0.6604294180870056,
"rewards/margins": 0.6735664010047913,
"rewards/rejected": -0.013136889785528183,
"step": 90
},
{
"epoch": 0.20771893832542634,
"eval_logits/chosen": 0.3509339988231659,
"eval_logits/rejected": 0.37502503395080566,
"eval_logps/chosen": -40.31688690185547,
"eval_logps/rejected": -48.16210174560547,
"eval_loss": 0.4914422631263733,
"eval_rewards/accuracies": 0.7263824939727783,
"eval_rewards/chosen": 0.7141958475112915,
"eval_rewards/margins": 0.7760785222053528,
"eval_rewards/rejected": -0.06188271939754486,
"eval_runtime": 220.2045,
"eval_samples_per_second": 7.874,
"eval_steps_per_second": 1.971,
"step": 90
},
{
"epoch": 0.21233491473265803,
"grad_norm": 33.53538998473167,
"learning_rate": 4.99949177018813e-07,
"logits/chosen": 0.4293578863143921,
"logits/rejected": 0.46211349964141846,
"logps/chosen": -34.20891571044922,
"logps/rejected": -45.82402420043945,
"loss": 0.4007,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": 0.9528428316116333,
"rewards/margins": 1.0176244974136353,
"rewards/rejected": -0.06478171050548553,
"step": 92
},
{
"epoch": 0.21695089113988972,
"grad_norm": 47.45137697847349,
"learning_rate": 4.999003901970474e-07,
"logits/chosen": 0.4385245442390442,
"logits/rejected": 0.45108774304389954,
"logps/chosen": -47.24710464477539,
"logps/rejected": -47.30147171020508,
"loss": 0.5534,
"rewards/accuracies": 0.6944444179534912,
"rewards/chosen": 0.644627034664154,
"rewards/margins": 0.6343204975128174,
"rewards/rejected": 0.010306484065949917,
"step": 94
},
{
"epoch": 0.22156686754712143,
"grad_norm": 33.39629568865361,
"learning_rate": 4.998353460385512e-07,
"logits/chosen": 0.4504711329936981,
"logits/rejected": 0.48663392663002014,
"logps/chosen": -40.03446578979492,
"logps/rejected": -55.506591796875,
"loss": 0.4222,
"rewards/accuracies": 0.7777777910232544,
"rewards/chosen": 0.6967446804046631,
"rewards/margins": 1.0778069496154785,
"rewards/rejected": -0.381062388420105,
"step": 96
},
{
"epoch": 0.22618284395435312,
"grad_norm": 34.18594601316725,
"learning_rate": 4.997540487747892e-07,
"logits/chosen": 0.38444679975509644,
"logits/rejected": 0.4130491614341736,
"logps/chosen": -37.72957992553711,
"logps/rejected": -57.71113967895508,
"loss": 0.4716,
"rewards/accuracies": 0.7916666865348816,
"rewards/chosen": 0.864948570728302,
"rewards/margins": 1.0170652866363525,
"rewards/rejected": -0.152116596698761,
"step": 98
},
{
"epoch": 0.2307988203615848,
"grad_norm": 31.852168197293704,
"learning_rate": 4.996565036945769e-07,
"logits/chosen": 0.4658397436141968,
"logits/rejected": 0.4849558472633362,
"logps/chosen": -44.069618225097656,
"logps/rejected": -46.06491470336914,
"loss": 0.4924,
"rewards/accuracies": 0.7777777910232544,
"rewards/chosen": 0.5598255395889282,
"rewards/margins": 0.8147852420806885,
"rewards/rejected": -0.25495976209640503,
"step": 100
},
{
"epoch": 0.2307988203615848,
"eval_logits/chosen": 0.3590577244758606,
"eval_logits/rejected": 0.38313183188438416,
"eval_logps/chosen": -40.04033660888672,
"eval_logps/rejected": -48.23354721069336,
"eval_loss": 0.4618569314479828,
"eval_rewards/accuracies": 0.7298387289047241,
"eval_rewards/chosen": 0.852470874786377,
"eval_rewards/margins": 0.9500778913497925,
"eval_rewards/rejected": -0.09760700911283493,
"eval_runtime": 220.4716,
"eval_samples_per_second": 7.865,
"eval_steps_per_second": 1.969,
"step": 100
},
{
"epoch": 0.23541479676881652,
"grad_norm": 32.563251146586694,
"learning_rate": 4.995427171437356e-07,
"logits/chosen": 0.41394177079200745,
"logits/rejected": 0.4560126066207886,
"logps/chosen": -36.68212890625,
"logps/rejected": -56.006553649902344,
"loss": 0.3851,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": 0.7943739891052246,
"rewards/margins": 1.1945956945419312,
"rewards/rejected": -0.40022173523902893,
"step": 102
},
{
"epoch": 0.2400307731760482,
"grad_norm": 35.159104202159625,
"learning_rate": 4.994126965246796e-07,
"logits/chosen": 0.43339937925338745,
"logits/rejected": 0.45789778232574463,
"logps/chosen": -40.00631332397461,
"logps/rejected": -48.161224365234375,
"loss": 0.4153,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": 0.7441484928131104,
"rewards/margins": 1.0307202339172363,
"rewards/rejected": -0.28657177090644836,
"step": 104
},
{
"epoch": 0.24464674958327992,
"grad_norm": 35.54279884741835,
"learning_rate": 4.992664502959351e-07,
"logits/chosen": 0.42503511905670166,
"logits/rejected": 0.48626741766929626,
"logps/chosen": -36.73310852050781,
"logps/rejected": -73.78736877441406,
"loss": 0.3536,
"rewards/accuracies": 0.8194444179534912,
"rewards/chosen": 0.8739730715751648,
"rewards/margins": 1.558694839477539,
"rewards/rejected": -0.6847219467163086,
"step": 106
},
{
"epoch": 0.2492627259905116,
"grad_norm": 45.173611856138976,
"learning_rate": 4.991039879715898e-07,
"logits/chosen": 0.4289478361606598,
"logits/rejected": 0.46912992000579834,
"logps/chosen": -40.94606399536133,
"logps/rejected": -58.62925338745117,
"loss": 0.4057,
"rewards/accuracies": 0.7777777910232544,
"rewards/chosen": 1.023528814315796,
"rewards/margins": 1.251956582069397,
"rewards/rejected": -0.22842761874198914,
"step": 108
},
{
"epoch": 0.2538787023977433,
"grad_norm": 25.213187587591246,
"learning_rate": 4.989253201206736e-07,
"logits/chosen": 0.4647282361984253,
"logits/rejected": 0.4716295003890991,
"logps/chosen": -40.334922790527344,
"logps/rejected": -41.65603256225586,
"loss": 0.4339,
"rewards/accuracies": 0.7222222089767456,
"rewards/chosen": 0.9747889637947083,
"rewards/margins": 1.0528353452682495,
"rewards/rejected": -0.07804636657238007,
"step": 110
},
{
"epoch": 0.2538787023977433,
"eval_logits/chosen": 0.36145398020744324,
"eval_logits/rejected": 0.38587653636932373,
"eval_logps/chosen": -39.77558135986328,
"eval_logps/rejected": -48.301231384277344,
"eval_loss": 0.43463748693466187,
"eval_rewards/accuracies": 0.7379032373428345,
"eval_rewards/chosen": 0.9848493337631226,
"eval_rewards/margins": 1.1162999868392944,
"eval_rewards/rejected": -0.1314505934715271,
"eval_runtime": 220.4446,
"eval_samples_per_second": 7.866,
"eval_steps_per_second": 1.969,
"step": 110
},
{
"epoch": 0.258494678804975,
"grad_norm": 39.895524747857486,
"learning_rate": 4.987304583664712e-07,
"logits/chosen": 0.4972270429134369,
"logits/rejected": 0.5156663060188293,
"logps/chosen": -46.859134674072266,
"logps/rejected": -53.12602996826172,
"loss": 0.4463,
"rewards/accuracies": 0.7361111044883728,
"rewards/chosen": 0.8810398578643799,
"rewards/margins": 0.9829990863800049,
"rewards/rejected": -0.10195919126272202,
"step": 112
},
{
"epoch": 0.26311065521220667,
"grad_norm": 36.88032065773003,
"learning_rate": 4.985194153857662e-07,
"logits/chosen": 0.4386284351348877,
"logits/rejected": 0.4557953476905823,
"logps/chosen": -36.74658203125,
"logps/rejected": -39.56464767456055,
"loss": 0.4788,
"rewards/accuracies": 0.7777777910232544,
"rewards/chosen": 0.9255303144454956,
"rewards/margins": 0.9156983494758606,
"rewards/rejected": 0.009831971488893032,
"step": 114
},
{
"epoch": 0.2677266316194384,
"grad_norm": 23.636821560598456,
"learning_rate": 4.982922049080163e-07,
"logits/chosen": 0.40630775690078735,
"logits/rejected": 0.4236665964126587,
"logps/chosen": -35.141971588134766,
"logps/rejected": -42.14583969116211,
"loss": 0.3872,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": 0.8281899690628052,
"rewards/margins": 1.215153455734253,
"rewards/rejected": -0.386963427066803,
"step": 116
},
{
"epoch": 0.2723426080266701,
"grad_norm": 38.873691089935114,
"learning_rate": 4.980488417144599e-07,
"logits/chosen": 0.37884485721588135,
"logits/rejected": 0.4280329644680023,
"logps/chosen": -41.57583999633789,
"logps/rejected": -71.53160095214844,
"loss": 0.4818,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.7444247603416443,
"rewards/margins": 1.1956461668014526,
"rewards/rejected": -0.4512213468551636,
"step": 118
},
{
"epoch": 0.27695858443390176,
"grad_norm": 27.126567445081033,
"learning_rate": 4.977893416371544e-07,
"logits/chosen": 0.4753592908382416,
"logits/rejected": 0.4997613728046417,
"logps/chosen": -34.07433319091797,
"logps/rejected": -45.33045959472656,
"loss": 0.3838,
"rewards/accuracies": 0.7916666865348816,
"rewards/chosen": 0.8865776062011719,
"rewards/margins": 1.4225349426269531,
"rewards/rejected": -0.5359571576118469,
"step": 120
},
{
"epoch": 0.27695858443390176,
"eval_logits/chosen": 0.3657575249671936,
"eval_logits/rejected": 0.39033937454223633,
"eval_logps/chosen": -39.95652770996094,
"eval_logps/rejected": -48.739437103271484,
"eval_loss": 0.410579651594162,
"eval_rewards/accuracies": 0.7540322542190552,
"eval_rewards/chosen": 0.8943750858306885,
"eval_rewards/margins": 1.2449262142181396,
"eval_rewards/rejected": -0.35055097937583923,
"eval_runtime": 220.2442,
"eval_samples_per_second": 7.873,
"eval_steps_per_second": 1.971,
"step": 120
},
{
"epoch": 0.28157456084113347,
"grad_norm": 26.815456443053705,
"learning_rate": 4.975137215579469e-07,
"logits/chosen": 0.5420396327972412,
"logits/rejected": 0.5500521659851074,
"logps/chosen": -45.788516998291016,
"logps/rejected": -42.21580505371094,
"loss": 0.4117,
"rewards/accuracies": 0.8194444179534912,
"rewards/chosen": 0.8019249439239502,
"rewards/margins": 1.2268595695495605,
"rewards/rejected": -0.42493465542793274,
"step": 122
},
{
"epoch": 0.2861905372483652,
"grad_norm": 30.749785890404876,
"learning_rate": 4.972219994073755e-07,
"logits/chosen": 0.49169254302978516,
"logits/rejected": 0.5404393672943115,
"logps/chosen": -38.644107818603516,
"logps/rejected": -67.01266479492188,
"loss": 0.3844,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.8940033316612244,
"rewards/margins": 1.6317830085754395,
"rewards/rejected": -0.7377796173095703,
"step": 124
},
{
"epoch": 0.2908065136555969,
"grad_norm": 29.538977791375373,
"learning_rate": 4.969141941635025e-07,
"logits/chosen": 0.47598233819007874,
"logits/rejected": 0.5060492753982544,
"logps/chosen": -40.60331344604492,
"logps/rejected": -59.37862014770508,
"loss": 0.4476,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.5469496250152588,
"rewards/margins": 1.4448275566101074,
"rewards/rejected": -0.8978776931762695,
"step": 126
},
{
"epoch": 0.29542249006282856,
"grad_norm": 50.663011631161446,
"learning_rate": 4.965903258506806e-07,
"logits/chosen": 0.49228647351264954,
"logits/rejected": 0.5329996943473816,
"logps/chosen": -39.90941619873047,
"logps/rejected": -61.23884963989258,
"loss": 0.347,
"rewards/accuracies": 0.8194444179534912,
"rewards/chosen": 0.8301137685775757,
"rewards/margins": 1.5421695709228516,
"rewards/rejected": -0.7120558619499207,
"step": 128
},
{
"epoch": 0.30003846647006027,
"grad_norm": 32.98345370505989,
"learning_rate": 4.962504155382493e-07,
"logits/chosen": 0.4239842891693115,
"logits/rejected": 0.44136151671409607,
"logps/chosen": -36.07121276855469,
"logps/rejected": -41.06203079223633,
"loss": 0.3667,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": 0.7973310351371765,
"rewards/margins": 1.2334084510803223,
"rewards/rejected": -0.4360772669315338,
"step": 130
},
{
"epoch": 0.30003846647006027,
"eval_logits/chosen": 0.3723231256008148,
"eval_logits/rejected": 0.3968786299228668,
"eval_logps/chosen": -39.925048828125,
"eval_logps/rejected": -48.9533576965332,
"eval_loss": 0.39173391461372375,
"eval_rewards/accuracies": 0.7753456234931946,
"eval_rewards/chosen": 0.9101160168647766,
"eval_rewards/margins": 1.3676302433013916,
"eval_rewards/rejected": -0.4575144052505493,
"eval_runtime": 220.267,
"eval_samples_per_second": 7.872,
"eval_steps_per_second": 1.97,
"step": 130
},
{
"epoch": 0.304654442877292,
"grad_norm": 28.392702162901728,
"learning_rate": 4.958944853391652e-07,
"logits/chosen": 0.520796537399292,
"logits/rejected": 0.5420558452606201,
"logps/chosen": -37.87763595581055,
"logps/rejected": -46.05318069458008,
"loss": 0.3819,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.932469367980957,
"rewards/margins": 1.2907413244247437,
"rewards/rejected": -0.3582719564437866,
"step": 132
},
{
"epoch": 0.30927041928452365,
"grad_norm": 27.83688192223066,
"learning_rate": 4.955225584085624e-07,
"logits/chosen": 0.42395105957984924,
"logits/rejected": 0.44882073998451233,
"logps/chosen": -36.98991775512695,
"logps/rejected": -51.79054260253906,
"loss": 0.3951,
"rewards/accuracies": 0.7638888955116272,
"rewards/chosen": 0.9578195214271545,
"rewards/margins": 1.4403272867202759,
"rewards/rejected": -0.48250770568847656,
"step": 134
},
{
"epoch": 0.31388639569175536,
"grad_norm": 27.432482792006017,
"learning_rate": 4.951346589422467e-07,
"logits/chosen": 0.483965128660202,
"logits/rejected": 0.5153691172599792,
"logps/chosen": -37.48245620727539,
"logps/rejected": -54.50342559814453,
"loss": 0.3942,
"rewards/accuracies": 0.7916666865348816,
"rewards/chosen": 1.0384331941604614,
"rewards/margins": 1.5820738077163696,
"rewards/rejected": -0.5436408519744873,
"step": 136
},
{
"epoch": 0.3185023720989871,
"grad_norm": 46.62557646275611,
"learning_rate": 4.94730812175122e-07,
"logits/chosen": 0.43841731548309326,
"logits/rejected": 0.4499746561050415,
"logps/chosen": -38.93119812011719,
"logps/rejected": -42.26424026489258,
"loss": 0.4384,
"rewards/accuracies": 0.6527777910232544,
"rewards/chosen": 0.8961164951324463,
"rewards/margins": 1.247178554534912,
"rewards/rejected": -0.3510621190071106,
"step": 138
},
{
"epoch": 0.32311834850621873,
"grad_norm": 34.05743924648359,
"learning_rate": 4.943110443795476e-07,
"logits/chosen": 0.49757227301597595,
"logits/rejected": 0.5091323852539062,
"logps/chosen": -42.93407440185547,
"logps/rejected": -45.01084899902344,
"loss": 0.4061,
"rewards/accuracies": 0.7916666865348816,
"rewards/chosen": 0.8557752370834351,
"rewards/margins": 1.3424351215362549,
"rewards/rejected": -0.48665979504585266,
"step": 140
},
{
"epoch": 0.32311834850621873,
"eval_logits/chosen": 0.3763599395751953,
"eval_logits/rejected": 0.4011194705963135,
"eval_logps/chosen": -39.798316955566406,
"eval_logps/rejected": -49.11299133300781,
"eval_loss": 0.3788905441761017,
"eval_rewards/accuracies": 0.764976978302002,
"eval_rewards/chosen": 0.9734821915626526,
"eval_rewards/margins": 1.5108132362365723,
"eval_rewards/rejected": -0.5373309850692749,
"eval_runtime": 220.3233,
"eval_samples_per_second": 7.87,
"eval_steps_per_second": 1.97,
"step": 140
},
{
"epoch": 0.32773432491345045,
"grad_norm": 36.481001944632766,
"learning_rate": 4.938753828636297e-07,
"logits/chosen": 0.4888935089111328,
"logits/rejected": 0.4963880777359009,
"logps/chosen": -46.02848815917969,
"logps/rejected": -44.94346237182617,
"loss": 0.4382,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.7767104506492615,
"rewards/margins": 1.235382080078125,
"rewards/rejected": -0.45867156982421875,
"step": 142
},
{
"epoch": 0.33235030132068216,
"grad_norm": 27.008693506029694,
"learning_rate": 4.934238559694447e-07,
"logits/chosen": 0.460690975189209,
"logits/rejected": 0.5057052969932556,
"logps/chosen": -38.473411560058594,
"logps/rejected": -54.91615295410156,
"loss": 0.3338,
"rewards/accuracies": 0.7916666865348816,
"rewards/chosen": 0.8698298335075378,
"rewards/margins": 1.6055673360824585,
"rewards/rejected": -0.7357374429702759,
"step": 144
},
{
"epoch": 0.3369662777279138,
"grad_norm": 32.261266015848825,
"learning_rate": 4.929564930711957e-07,
"logits/chosen": 0.4295574426651001,
"logits/rejected": 0.4522492587566376,
"logps/chosen": -39.829490661621094,
"logps/rejected": -44.733333587646484,
"loss": 0.3533,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.7346515655517578,
"rewards/margins": 1.3469676971435547,
"rewards/rejected": -0.6123161315917969,
"step": 146
},
{
"epoch": 0.34158225413514554,
"grad_norm": 28.797840444924386,
"learning_rate": 4.924733245733008e-07,
"logits/chosen": 0.5410254001617432,
"logits/rejected": 0.5485421419143677,
"logps/chosen": -43.81610870361328,
"logps/rejected": -40.52272033691406,
"loss": 0.3651,
"rewards/accuracies": 0.7638888955116272,
"rewards/chosen": 0.9063374996185303,
"rewards/margins": 1.2729685306549072,
"rewards/rejected": -0.366630882024765,
"step": 148
},
{
"epoch": 0.34619823054237725,
"grad_norm": 30.202896827963542,
"learning_rate": 4.91974381908416e-07,
"logits/chosen": 0.42066994309425354,
"logits/rejected": 0.4589553475379944,
"logps/chosen": -38.81809997558594,
"logps/rejected": -58.59386444091797,
"loss": 0.3446,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": 0.6800815463066101,
"rewards/margins": 1.928250789642334,
"rewards/rejected": -1.2481693029403687,
"step": 150
},
{
"epoch": 0.34619823054237725,
"eval_logits/chosen": 0.3821311295032501,
"eval_logits/rejected": 0.40684476494789124,
"eval_logps/chosen": -40.001861572265625,
"eval_logps/rejected": -49.4797477722168,
"eval_loss": 0.3633531332015991,
"eval_rewards/accuracies": 0.7724654674530029,
"eval_rewards/chosen": 0.8717083930969238,
"eval_rewards/margins": 1.5924171209335327,
"eval_rewards/rejected": -0.7207087278366089,
"eval_runtime": 220.1362,
"eval_samples_per_second": 7.877,
"eval_steps_per_second": 1.972,
"step": 150
},
{
"epoch": 0.3508142069496089,
"grad_norm": 26.385033894551455,
"learning_rate": 4.914596975353898e-07,
"logits/chosen": 0.4991176426410675,
"logits/rejected": 0.5242553353309631,
"logps/chosen": -38.974281311035156,
"logps/rejected": -48.54939270019531,
"loss": 0.3721,
"rewards/accuracies": 0.8055555820465088,
"rewards/chosen": 0.771294355392456,
"rewards/margins": 1.5243595838546753,
"rewards/rejected": -0.7530653476715088,
"step": 152
},
{
"epoch": 0.3554301833568406,
"grad_norm": 42.428423927932826,
"learning_rate": 4.909293049371519e-07,
"logits/chosen": 0.5288230180740356,
"logits/rejected": 0.5352779626846313,
"logps/chosen": -45.90478515625,
"logps/rejected": -44.53614044189453,
"loss": 0.3542,
"rewards/accuracies": 0.8055555820465088,
"rewards/chosen": 0.7464312314987183,
"rewards/margins": 1.5150079727172852,
"rewards/rejected": -0.7685766220092773,
"step": 154
},
{
"epoch": 0.36004615976407234,
"grad_norm": 36.75812549927479,
"learning_rate": 4.903832386185343e-07,
"logits/chosen": 0.47585126757621765,
"logits/rejected": 0.49040529131889343,
"logps/chosen": -44.172325134277344,
"logps/rejected": -43.98606872558594,
"loss": 0.3956,
"rewards/accuracies": 0.7361111044883728,
"rewards/chosen": 0.5973650813102722,
"rewards/margins": 1.340658187866211,
"rewards/rejected": -0.743293046951294,
"step": 156
},
{
"epoch": 0.364662136171304,
"grad_norm": 26.152211217958236,
"learning_rate": 4.89821534104028e-07,
"logits/chosen": 0.39484938979148865,
"logits/rejected": 0.42477357387542725,
"logps/chosen": -41.93134307861328,
"logps/rejected": -56.39106750488281,
"loss": 0.3275,
"rewards/accuracies": 0.8055555820465088,
"rewards/chosen": 0.827555239200592,
"rewards/margins": 1.9599329233169556,
"rewards/rejected": -1.1323776245117188,
"step": 158
},
{
"epoch": 0.3692781125785357,
"grad_norm": 29.041350828980583,
"learning_rate": 4.892442279354698e-07,
"logits/chosen": 0.4744550287723541,
"logits/rejected": 0.5093830227851868,
"logps/chosen": -42.794578552246094,
"logps/rejected": -59.93064498901367,
"loss": 0.3605,
"rewards/accuracies": 0.8055555820465088,
"rewards/chosen": 0.540644645690918,
"rewards/margins": 1.6665728092193604,
"rewards/rejected": -1.125928282737732,
"step": 160
},
{
"epoch": 0.3692781125785357,
"eval_logits/chosen": 0.38920047879219055,
"eval_logits/rejected": 0.41388043761253357,
"eval_logps/chosen": -40.3745231628418,
"eval_logps/rejected": -49.94725036621094,
"eval_loss": 0.3510279059410095,
"eval_rewards/accuracies": 0.7920507192611694,
"eval_rewards/chosen": 0.6853779554367065,
"eval_rewards/margins": 1.6398398876190186,
"eval_rewards/rejected": -0.9544618129730225,
"eval_runtime": 220.1812,
"eval_samples_per_second": 7.875,
"eval_steps_per_second": 1.971,
"step": 160
},
{
"epoch": 0.3738940889857674,
"grad_norm": 32.36067481486556,
"learning_rate": 4.886513576696673e-07,
"logits/chosen": 0.4680570960044861,
"logits/rejected": 0.5030277371406555,
"logps/chosen": -42.39280700683594,
"logps/rejected": -58.18678283691406,
"loss": 0.392,
"rewards/accuracies": 0.7916666865348816,
"rewards/chosen": 0.7217347621917725,
"rewards/margins": 1.8615412712097168,
"rewards/rejected": -1.1398065090179443,
"step": 162
},
{
"epoch": 0.3785100653929991,
"grad_norm": 27.802667550507227,
"learning_rate": 4.880429618759543e-07,
"logits/chosen": 0.46893131732940674,
"logits/rejected": 0.4787411093711853,
"logps/chosen": -45.52459716796875,
"logps/rejected": -46.459312438964844,
"loss": 0.3819,
"rewards/accuracies": 0.8055555820465088,
"rewards/chosen": 0.870037317276001,
"rewards/margins": 1.4820109605789185,
"rewards/rejected": -0.6119736433029175,
"step": 164
},
{
"epoch": 0.3831260418002308,
"grad_norm": 27.278325528930967,
"learning_rate": 4.874190801336817e-07,
"logits/chosen": 0.46610963344573975,
"logits/rejected": 0.4872422218322754,
"logps/chosen": -44.28363037109375,
"logps/rejected": -51.54701232910156,
"loss": 0.323,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": 0.6502636075019836,
"rewards/margins": 1.7216179370880127,
"rewards/rejected": -1.0713541507720947,
"step": 166
},
{
"epoch": 0.3877420182074625,
"grad_norm": 25.09454062223173,
"learning_rate": 4.867797530296431e-07,
"logits/chosen": 0.4582709074020386,
"logits/rejected": 0.48244646191596985,
"logps/chosen": -45.76988983154297,
"logps/rejected": -55.2458610534668,
"loss": 0.2842,
"rewards/accuracies": 0.8194444179534912,
"rewards/chosen": 0.6319215297698975,
"rewards/margins": 2.007154941558838,
"rewards/rejected": -1.3752332925796509,
"step": 168
},
{
"epoch": 0.39235799461469417,
"grad_norm": 25.014228656107395,
"learning_rate": 4.861250221554343e-07,
"logits/chosen": 0.4760267436504364,
"logits/rejected": 0.5161222219467163,
"logps/chosen": -36.09988021850586,
"logps/rejected": -58.49198913574219,
"loss": 0.317,
"rewards/accuracies": 0.8194444179534912,
"rewards/chosen": 0.6937295794487,
"rewards/margins": 2.0070507526397705,
"rewards/rejected": -1.3133213520050049,
"step": 170
},
{
"epoch": 0.39235799461469417,
"eval_logits/chosen": 0.39130648970603943,
"eval_logits/rejected": 0.41622862219810486,
"eval_logps/chosen": -40.4833984375,
"eval_logps/rejected": -50.18775177001953,
"eval_loss": 0.343056857585907,
"eval_rewards/accuracies": 0.796658992767334,
"eval_rewards/chosen": 0.6309407949447632,
"eval_rewards/margins": 1.7056493759155273,
"eval_rewards/rejected": -1.0747085809707642,
"eval_runtime": 220.3261,
"eval_samples_per_second": 7.87,
"eval_steps_per_second": 1.97,
"step": 170
},
{
"epoch": 0.3969739710219259,
"grad_norm": 21.660777806253456,
"learning_rate": 4.854549301047476e-07,
"logits/chosen": 0.5408195853233337,
"logits/rejected": 0.5565234422683716,
"logps/chosen": -42.90623474121094,
"logps/rejected": -43.590702056884766,
"loss": 0.373,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": 0.6533936262130737,
"rewards/margins": 1.5095248222351074,
"rewards/rejected": -0.8561312556266785,
"step": 172
},
{
"epoch": 0.4015899474291576,
"grad_norm": 32.27746768838142,
"learning_rate": 4.847695204706005e-07,
"logits/chosen": 0.47649839520454407,
"logits/rejected": 0.49190616607666016,
"logps/chosen": -38.49553680419922,
"logps/rejected": -40.65150451660156,
"loss": 0.3558,
"rewards/accuracies": 0.8194444179534912,
"rewards/chosen": 0.7918031811714172,
"rewards/margins": 1.4087355136871338,
"rewards/rejected": -0.6169323325157166,
"step": 174
},
{
"epoch": 0.40620592383638926,
"grad_norm": 31.844706703711985,
"learning_rate": 4.840688378425e-07,
"logits/chosen": 0.5188453793525696,
"logits/rejected": 0.5562708973884583,
"logps/chosen": -46.135372161865234,
"logps/rejected": -56.292930603027344,
"loss": 0.261,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": 0.7925480604171753,
"rewards/margins": 2.1678171157836914,
"rewards/rejected": -1.3752690553665161,
"step": 176
},
{
"epoch": 0.410821900243621,
"grad_norm": 26.376171346573187,
"learning_rate": 4.833529278035422e-07,
"logits/chosen": 0.357127845287323,
"logits/rejected": 0.4103134572505951,
"logps/chosen": -37.78556442260742,
"logps/rejected": -67.52072143554688,
"loss": 0.2899,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": 0.9015005826950073,
"rewards/margins": 2.719820261001587,
"rewards/rejected": -1.81831955909729,
"step": 178
},
{
"epoch": 0.4154378766508527,
"grad_norm": 26.0393680431703,
"learning_rate": 4.826218369274459e-07,
"logits/chosen": 0.4666251540184021,
"logits/rejected": 0.5160384178161621,
"logps/chosen": -39.356258392333984,
"logps/rejected": -62.83391571044922,
"loss": 0.3066,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": 0.8675535917282104,
"rewards/margins": 2.234145164489746,
"rewards/rejected": -1.3665915727615356,
"step": 180
},
{
"epoch": 0.4154378766508527,
"eval_logits/chosen": 0.3935144245624542,
"eval_logits/rejected": 0.41844189167022705,
"eval_logps/chosen": -39.861793518066406,
"eval_logps/rejected": -49.855037689208984,
"eval_loss": 0.3321295380592346,
"eval_rewards/accuracies": 0.7926267385482788,
"eval_rewards/chosen": 0.941743791103363,
"eval_rewards/margins": 1.8500969409942627,
"eval_rewards/rejected": -0.9083530902862549,
"eval_runtime": 220.3176,
"eval_samples_per_second": 7.87,
"eval_steps_per_second": 1.97,
"step": 180
},
{
"epoch": 0.42005385305808435,
"grad_norm": 23.061889635448846,
"learning_rate": 4.818756127755237e-07,
"logits/chosen": 0.49034425616264343,
"logits/rejected": 0.5069853663444519,
"logps/chosen": -37.846553802490234,
"logps/rejected": -41.30693817138672,
"loss": 0.2693,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": 1.0121811628341675,
"rewards/margins": 1.8432265520095825,
"rewards/rejected": -0.831045389175415,
"step": 182
},
{
"epoch": 0.42466982946531606,
"grad_norm": 22.17904586209137,
"learning_rate": 4.811143038935873e-07,
"logits/chosen": 0.5580455660820007,
"logits/rejected": 0.5748550295829773,
"logps/chosen": -42.32413101196289,
"logps/rejected": -46.0750732421875,
"loss": 0.3264,
"rewards/accuracies": 0.7777777910232544,
"rewards/chosen": 1.0455000400543213,
"rewards/margins": 1.93173086643219,
"rewards/rejected": -0.8862307667732239,
"step": 184
},
{
"epoch": 0.4292858058725478,
"grad_norm": 30.29917055573095,
"learning_rate": 4.803379598087899e-07,
"logits/chosen": 0.5174715518951416,
"logits/rejected": 0.5311744213104248,
"logps/chosen": -40.50711441040039,
"logps/rejected": -40.298824310302734,
"loss": 0.316,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": 0.9772326350212097,
"rewards/margins": 1.7723863124847412,
"rewards/rejected": -0.795153796672821,
"step": 186
},
{
"epoch": 0.43390178227977944,
"grad_norm": 40.38001852713412,
"learning_rate": 4.795466310264034e-07,
"logits/chosen": 0.42736437916755676,
"logits/rejected": 0.463912695646286,
"logps/chosen": -39.35895919799805,
"logps/rejected": -64.93545532226562,
"loss": 0.4185,
"rewards/accuracies": 0.7638888955116272,
"rewards/chosen": 0.5966134667396545,
"rewards/margins": 1.980704665184021,
"rewards/rejected": -1.3840913772583008,
"step": 188
},
{
"epoch": 0.43851775868701115,
"grad_norm": 17.949323810733784,
"learning_rate": 4.787403690265335e-07,
"logits/chosen": 0.5044853091239929,
"logits/rejected": 0.5284148454666138,
"logps/chosen": -39.47854995727539,
"logps/rejected": -49.92608642578125,
"loss": 0.3266,
"rewards/accuracies": 0.8194444179534912,
"rewards/chosen": 1.0101630687713623,
"rewards/margins": 1.9091652631759644,
"rewards/rejected": -0.8990020751953125,
"step": 190
},
{
"epoch": 0.43851775868701115,
"eval_logits/chosen": 0.3972060978412628,
"eval_logits/rejected": 0.4221220314502716,
"eval_logps/chosen": -39.831119537353516,
"eval_logps/rejected": -50.09023666381836,
"eval_loss": 0.3243154287338257,
"eval_rewards/accuracies": 0.7914746403694153,
"eval_rewards/chosen": 0.9570826292037964,
"eval_rewards/margins": 1.9830337762832642,
"eval_rewards/rejected": -1.0259510278701782,
"eval_runtime": 220.3237,
"eval_samples_per_second": 7.87,
"eval_steps_per_second": 1.97,
"step": 190
},
{
"epoch": 0.44313373509424286,
"grad_norm": 36.065620072852695,
"learning_rate": 4.779192262607702e-07,
"logits/chosen": 0.5138534903526306,
"logits/rejected": 0.544155478477478,
"logps/chosen": -43.310760498046875,
"logps/rejected": -59.56623840332031,
"loss": 0.3542,
"rewards/accuracies": 0.8055555820465088,
"rewards/chosen": 0.9537274837493896,
"rewards/margins": 2.111888885498047,
"rewards/rejected": -1.1581614017486572,
"step": 192
},
{
"epoch": 0.4477497115014745,
"grad_norm": 24.653058016123207,
"learning_rate": 4.770832561487758e-07,
"logits/chosen": 0.4504295885562897,
"logits/rejected": 0.46597781777381897,
"logps/chosen": -41.51498794555664,
"logps/rejected": -43.07120132446289,
"loss": 0.2587,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": 0.9096213579177856,
"rewards/margins": 2.131375551223755,
"rewards/rejected": -1.2217543125152588,
"step": 194
},
{
"epoch": 0.45236568790870624,
"grad_norm": 36.95305184003922,
"learning_rate": 4.762325130748097e-07,
"logits/chosen": 0.5585076808929443,
"logits/rejected": 0.5717556476593018,
"logps/chosen": -47.50046920776367,
"logps/rejected": -44.811973571777344,
"loss": 0.3412,
"rewards/accuracies": 0.8194444179534912,
"rewards/chosen": 0.9956084489822388,
"rewards/margins": 1.8879083395004272,
"rewards/rejected": -0.8922999501228333,
"step": 196
},
{
"epoch": 0.45698166431593795,
"grad_norm": 16.999205852011567,
"learning_rate": 4.7536705238418995e-07,
"logits/chosen": 0.47373294830322266,
"logits/rejected": 0.49137142300605774,
"logps/chosen": -42.69048309326172,
"logps/rejected": -50.26279067993164,
"loss": 0.275,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": 0.8595349788665771,
"rewards/margins": 2.2136645317077637,
"rewards/rejected": -1.3541297912597656,
"step": 198
},
{
"epoch": 0.4615976407231696,
"grad_norm": 33.06750404898565,
"learning_rate": 4.7448693037969336e-07,
"logits/chosen": 0.5136507749557495,
"logits/rejected": 0.527184247970581,
"logps/chosen": -41.794132232666016,
"logps/rejected": -48.2490119934082,
"loss": 0.2986,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": 0.8771340847015381,
"rewards/margins": 1.9797013998031616,
"rewards/rejected": -1.102567195892334,
"step": 200
},
{
"epoch": 0.4615976407231696,
"eval_logits/chosen": 0.39842745661735535,
"eval_logits/rejected": 0.42355066537857056,
"eval_logps/chosen": -40.12582778930664,
"eval_logps/rejected": -50.502620697021484,
"eval_loss": 0.3160472810268402,
"eval_rewards/accuracies": 0.7978110313415527,
"eval_rewards/chosen": 0.8097268342971802,
"eval_rewards/margins": 2.041868209838867,
"eval_rewards/rejected": -1.2321414947509766,
"eval_runtime": 220.4769,
"eval_samples_per_second": 7.865,
"eval_steps_per_second": 1.968,
"step": 200
},
{
"epoch": 0.4662136171304013,
"grad_norm": 27.6688088244827,
"learning_rate": 4.735922043178923e-07,
"logits/chosen": 0.5529847741127014,
"logits/rejected": 0.5818406939506531,
"logps/chosen": -42.29270553588867,
"logps/rejected": -57.84202575683594,
"loss": 0.2725,
"rewards/accuracies": 0.8194444179534912,
"rewards/chosen": 0.8104487061500549,
"rewards/margins": 2.3321969509124756,
"rewards/rejected": -1.521748423576355,
"step": 202
},
{
"epoch": 0.47082959353763304,
"grad_norm": 23.404484719369563,
"learning_rate": 4.7268293240543017e-07,
"logits/chosen": 0.48225533962249756,
"logits/rejected": 0.5109025239944458,
"logps/chosen": -40.953433990478516,
"logps/rejected": -55.026153564453125,
"loss": 0.3435,
"rewards/accuracies": 0.7916666865348816,
"rewards/chosen": 0.8147386908531189,
"rewards/margins": 2.057671546936035,
"rewards/rejected": -1.2429331541061401,
"step": 204
},
{
"epoch": 0.4754455699448647,
"grad_norm": 29.663210206611154,
"learning_rate": 4.717591737952344e-07,
"logits/chosen": 0.48208919167518616,
"logits/rejected": 0.517291247844696,
"logps/chosen": -36.30723190307617,
"logps/rejected": -54.3764533996582,
"loss": 0.3135,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.8137081861495972,
"rewards/margins": 2.101260185241699,
"rewards/rejected": -1.287551999092102,
"step": 206
},
{
"epoch": 0.4800615463520964,
"grad_norm": 29.39474251364716,
"learning_rate": 4.7082098858266837e-07,
"logits/chosen": 0.48040205240249634,
"logits/rejected": 0.5284512042999268,
"logps/chosen": -31.84227180480957,
"logps/rejected": -61.47830581665039,
"loss": 0.3821,
"rewards/accuracies": 0.7638888955116272,
"rewards/chosen": 0.455925315618515,
"rewards/margins": 2.105367422103882,
"rewards/rejected": -1.649442195892334,
"step": 208
},
{
"epoch": 0.4846775227593281,
"grad_norm": 15.879511628269139,
"learning_rate": 4.698684378016222e-07,
"logits/chosen": 0.4825616478919983,
"logits/rejected": 0.5131646394729614,
"logps/chosen": -43.97586441040039,
"logps/rejected": -58.62031936645508,
"loss": 0.271,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": 0.6879336833953857,
"rewards/margins": 2.212796211242676,
"rewards/rejected": -1.5248624086380005,
"step": 210
},
{
"epoch": 0.4846775227593281,
"eval_logits/chosen": 0.40579578280448914,
"eval_logits/rejected": 0.43089571595191956,
"eval_logps/chosen": -40.55123519897461,
"eval_logps/rejected": -51.04283905029297,
"eval_loss": 0.3111670911312103,
"eval_rewards/accuracies": 0.804147481918335,
"eval_rewards/chosen": 0.597020149230957,
"eval_rewards/margins": 2.099271535873413,
"eval_rewards/rejected": -1.502251386642456,
"eval_runtime": 220.3759,
"eval_samples_per_second": 7.868,
"eval_steps_per_second": 1.969,
"step": 210
},
{
"epoch": 0.48929349916655984,
"grad_norm": 33.220388342252136,
"learning_rate": 4.6890158342054174e-07,
"logits/chosen": 0.46122825145721436,
"logits/rejected": 0.48773014545440674,
"logps/chosen": -38.094722747802734,
"logps/rejected": -50.649871826171875,
"loss": 0.3288,
"rewards/accuracies": 0.7916666865348816,
"rewards/chosen": 0.5131194591522217,
"rewards/margins": 2.1884312629699707,
"rewards/rejected": -1.6753116846084595,
"step": 212
},
{
"epoch": 0.4939094755737915,
"grad_norm": 27.37607169791161,
"learning_rate": 4.679204883383973e-07,
"logits/chosen": 0.45677465200424194,
"logits/rejected": 0.5006839632987976,
"logps/chosen": -36.343292236328125,
"logps/rejected": -65.76275634765625,
"loss": 0.301,
"rewards/accuracies": 0.8055555820465088,
"rewards/chosen": 0.5972538590431213,
"rewards/margins": 2.6644716262817383,
"rewards/rejected": -2.0672178268432617,
"step": 214
},
{
"epoch": 0.4985254519810232,
"grad_norm": 28.712191033509406,
"learning_rate": 4.669252163805919e-07,
"logits/chosen": 0.48203393816947937,
"logits/rejected": 0.5129568576812744,
"logps/chosen": -40.263328552246094,
"logps/rejected": -53.96393966674805,
"loss": 0.3434,
"rewards/accuracies": 0.7777777910232544,
"rewards/chosen": 0.3674449920654297,
"rewards/margins": 2.094463348388672,
"rewards/rejected": -1.7270184755325317,
"step": 216
},
{
"epoch": 0.5031414283882549,
"grad_norm": 21.430060439194165,
"learning_rate": 4.65915832294809e-07,
"logits/chosen": 0.5647565722465515,
"logits/rejected": 0.6052375435829163,
"logps/chosen": -37.24385070800781,
"logps/rejected": -58.28202438354492,
"loss": 0.2945,
"rewards/accuracies": 0.8194444179534912,
"rewards/chosen": 0.5437911748886108,
"rewards/margins": 2.518171787261963,
"rewards/rejected": -1.9743802547454834,
"step": 218
},
{
"epoch": 0.5077574047954866,
"grad_norm": 24.194015322932014,
"learning_rate": 4.6489240174680026e-07,
"logits/chosen": 0.5365298390388489,
"logits/rejected": 0.5451048612594604,
"logps/chosen": -40.26055145263672,
"logps/rejected": -40.11984634399414,
"loss": 0.4064,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.4349411725997925,
"rewards/margins": 1.4253244400024414,
"rewards/rejected": -0.9903832674026489,
"step": 220
},
{
"epoch": 0.5077574047954866,
"eval_logits/chosen": 0.40611767768859863,
"eval_logits/rejected": 0.43133166432380676,
"eval_logps/chosen": -40.628150939941406,
"eval_logps/rejected": -51.22826385498047,
"eval_loss": 0.30713996291160583,
"eval_rewards/accuracies": 0.8018433451652527,
"eval_rewards/chosen": 0.5585668087005615,
"eval_rewards/margins": 2.153529644012451,
"eval_rewards/rejected": -1.5949628353118896,
"eval_runtime": 220.3416,
"eval_samples_per_second": 7.87,
"eval_steps_per_second": 1.97,
"step": 220
},
{
"epoch": 0.5123733812027182,
"grad_norm": 23.39715012730976,
"learning_rate": 4.638549913161138e-07,
"logits/chosen": 0.5600088834762573,
"logits/rejected": 0.5736495852470398,
"logps/chosen": -46.20627212524414,
"logps/rejected": -47.1099739074707,
"loss": 0.2227,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": 0.7162383794784546,
"rewards/margins": 2.4795196056365967,
"rewards/rejected": -1.763281226158142,
"step": 222
},
{
"epoch": 0.51698935760995,
"grad_norm": 23.70013936518676,
"learning_rate": 4.6280366849176267e-07,
"logits/chosen": 0.553576648235321,
"logits/rejected": 0.5800661444664001,
"logps/chosen": -41.73429870605469,
"logps/rejected": -47.09934997558594,
"loss": 0.2708,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.6174063682556152,
"rewards/margins": 2.10538649559021,
"rewards/rejected": -1.4879801273345947,
"step": 224
},
{
"epoch": 0.5216053340171817,
"grad_norm": 19.39438827436505,
"learning_rate": 4.6173850166783446e-07,
"logits/chosen": 0.5699052810668945,
"logits/rejected": 0.5908712148666382,
"logps/chosen": -40.74462127685547,
"logps/rejected": -53.7403450012207,
"loss": 0.2716,
"rewards/accuracies": 0.8055555820465088,
"rewards/chosen": 0.5502187609672546,
"rewards/margins": 2.0002176761627197,
"rewards/rejected": -1.4499988555908203,
"step": 226
},
{
"epoch": 0.5262213104244133,
"grad_norm": 24.49934372199594,
"learning_rate": 4.606595601390417e-07,
"logits/chosen": 0.46904435753822327,
"logits/rejected": 0.5106580257415771,
"logps/chosen": -39.85272979736328,
"logps/rejected": -61.70741653442383,
"loss": 0.2336,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.3319948613643646,
"rewards/margins": 2.6446897983551025,
"rewards/rejected": -2.312695026397705,
"step": 228
},
{
"epoch": 0.5308372868316451,
"grad_norm": 28.165420212664795,
"learning_rate": 4.595669140962143e-07,
"logits/chosen": 0.4127655625343323,
"logits/rejected": 0.479299396276474,
"logps/chosen": -34.939422607421875,
"logps/rejected": -78.63516235351562,
"loss": 0.3107,
"rewards/accuracies": 0.8194444179534912,
"rewards/chosen": 0.18619215488433838,
"rewards/margins": 2.8822548389434814,
"rewards/rejected": -2.6960630416870117,
"step": 230
},
{
"epoch": 0.5308372868316451,
"eval_logits/chosen": 0.4082220494747162,
"eval_logits/rejected": 0.4335884749889374,
"eval_logps/chosen": -40.824676513671875,
"eval_logps/rejected": -51.529090881347656,
"eval_loss": 0.30161648988723755,
"eval_rewards/accuracies": 0.8104838728904724,
"eval_rewards/chosen": 0.4603023827075958,
"eval_rewards/margins": 2.205678939819336,
"eval_rewards/rejected": -1.745376706123352,
"eval_runtime": 220.269,
"eval_samples_per_second": 7.872,
"eval_steps_per_second": 1.97,
"step": 230
},
{
"epoch": 0.5354532632388768,
"grad_norm": 16.564014282307436,
"learning_rate": 4.5846063462173284e-07,
"logits/chosen": 0.5141347050666809,
"logits/rejected": 0.5398997664451599,
"logps/chosen": -38.93478012084961,
"logps/rejected": -53.1637077331543,
"loss": 0.2932,
"rewards/accuracies": 0.8194444179534912,
"rewards/chosen": 0.3137105405330658,
"rewards/margins": 2.214162826538086,
"rewards/rejected": -1.9004522562026978,
"step": 232
},
{
"epoch": 0.5400692396461084,
"grad_norm": 30.180896923031582,
"learning_rate": 4.573407936849044e-07,
"logits/chosen": 0.49748367071151733,
"logits/rejected": 0.502750039100647,
"logps/chosen": -46.67736053466797,
"logps/rejected": -48.594566345214844,
"loss": 0.3143,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": 0.39324572682380676,
"rewards/margins": 1.9298076629638672,
"rewards/rejected": -1.5365619659423828,
"step": 234
},
{
"epoch": 0.5446852160533402,
"grad_norm": 43.03719615392396,
"learning_rate": 4.5620746413728063e-07,
"logits/chosen": 0.5845724940299988,
"logits/rejected": 0.5915371775627136,
"logps/chosen": -52.0160026550293,
"logps/rejected": -49.12672805786133,
"loss": 0.2833,
"rewards/accuracies": 0.8194444179534912,
"rewards/chosen": 0.14373371005058289,
"rewards/margins": 2.1639184951782227,
"rewards/rejected": -2.0201845169067383,
"step": 236
},
{
"epoch": 0.5493011924605719,
"grad_norm": 21.1030283537707,
"learning_rate": 4.550607197079185e-07,
"logits/chosen": 0.552834153175354,
"logits/rejected": 0.5818264484405518,
"logps/chosen": -38.04405212402344,
"logps/rejected": -46.87253189086914,
"loss": 0.2897,
"rewards/accuracies": 0.8194444179534912,
"rewards/chosen": 0.4428212344646454,
"rewards/margins": 1.7602063417434692,
"rewards/rejected": -1.317385196685791,
"step": 238
},
{
"epoch": 0.5539171688678035,
"grad_norm": 14.340136381864786,
"learning_rate": 4.5390063499858353e-07,
"logits/chosen": 0.5454181432723999,
"logits/rejected": 0.5769542455673218,
"logps/chosen": -47.16811752319336,
"logps/rejected": -62.15293884277344,
"loss": 0.2046,
"rewards/accuracies": 0.9166666865348816,
"rewards/chosen": 0.36544325947761536,
"rewards/margins": 2.6488418579101562,
"rewards/rejected": -2.2833986282348633,
"step": 240
},
{
"epoch": 0.5539171688678035,
"eval_logits/chosen": 0.41252779960632324,
"eval_logits/rejected": 0.4378991425037384,
"eval_logps/chosen": -40.974342346191406,
"eval_logps/rejected": -51.8930778503418,
"eval_loss": 0.2962896525859833,
"eval_rewards/accuracies": 0.8070276379585266,
"eval_rewards/chosen": 0.3854685127735138,
"eval_rewards/margins": 2.312840223312378,
"eval_rewards/rejected": -1.927371859550476,
"eval_runtime": 220.4271,
"eval_samples_per_second": 7.867,
"eval_steps_per_second": 1.969,
"step": 240
},
{
"epoch": 0.5585331452750353,
"grad_norm": 20.121912107871452,
"learning_rate": 4.5272728547889687e-07,
"logits/chosen": 0.5017317533493042,
"logits/rejected": 0.5252359509468079,
"logps/chosen": -43.418678283691406,
"logps/rejected": -51.78999710083008,
"loss": 0.2157,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.29254353046417236,
"rewards/margins": 2.571570873260498,
"rewards/rejected": -2.2790274620056152,
"step": 242
},
{
"epoch": 0.5631491216822669,
"grad_norm": 36.79556689673262,
"learning_rate": 4.5154074748142535e-07,
"logits/chosen": 0.5326908230781555,
"logits/rejected": 0.5592876672744751,
"logps/chosen": -45.176578521728516,
"logps/rejected": -55.26374053955078,
"loss": 0.2959,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": 0.26444554328918457,
"rewards/margins": 2.2498509883880615,
"rewards/rejected": -1.985405445098877,
"step": 244
},
{
"epoch": 0.5677650980894986,
"grad_norm": 30.279268688467162,
"learning_rate": 4.503410981967158e-07,
"logits/chosen": 0.508591890335083,
"logits/rejected": 0.5472189784049988,
"logps/chosen": -37.81255340576172,
"logps/rejected": -59.81355285644531,
"loss": 0.3387,
"rewards/accuracies": 0.7777777910232544,
"rewards/chosen": 0.324074387550354,
"rewards/margins": 2.479010581970215,
"rewards/rejected": -2.1549363136291504,
"step": 246
},
{
"epoch": 0.5723810744967304,
"grad_norm": 32.696656835575155,
"learning_rate": 4.4912841566827333e-07,
"logits/chosen": 0.5358154773712158,
"logits/rejected": 0.572979211807251,
"logps/chosen": -40.84016799926758,
"logps/rejected": -57.57326889038086,
"loss": 0.2559,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": 0.6461736559867859,
"rewards/margins": 2.717188835144043,
"rewards/rejected": -2.0710153579711914,
"step": 248
},
{
"epoch": 0.576997050903962,
"grad_norm": 26.864795137183627,
"learning_rate": 4.4790277878748415e-07,
"logits/chosen": 0.5129296779632568,
"logits/rejected": 0.543644368648529,
"logps/chosen": -36.90694046020508,
"logps/rejected": -51.41253662109375,
"loss": 0.2466,
"rewards/accuracies": 0.9027777910232544,
"rewards/chosen": 0.4497109651565552,
"rewards/margins": 2.559537172317505,
"rewards/rejected": -2.1098265647888184,
"step": 250
},
{
"epoch": 0.576997050903962,
"eval_logits/chosen": 0.4140053689479828,
"eval_logits/rejected": 0.43953680992126465,
"eval_logps/chosen": -40.92128372192383,
"eval_logps/rejected": -52.06728744506836,
"eval_loss": 0.29202744364738464,
"eval_rewards/accuracies": 0.8064516186714172,
"eval_rewards/chosen": 0.41199636459350586,
"eval_rewards/margins": 2.4264743328094482,
"eval_rewards/rejected": -2.0144779682159424,
"eval_runtime": 220.3958,
"eval_samples_per_second": 7.868,
"eval_steps_per_second": 1.969,
"step": 250
},
{
"epoch": 0.5816130273111938,
"grad_norm": 34.34355868179491,
"learning_rate": 4.466642672884835e-07,
"logits/chosen": 0.5273095965385437,
"logits/rejected": 0.5604310631752014,
"logps/chosen": -39.039512634277344,
"logps/rejected": -52.470951080322266,
"loss": 0.2676,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": 0.256040096282959,
"rewards/margins": 2.4306235313415527,
"rewards/rejected": -2.1745834350585938,
"step": 252
},
{
"epoch": 0.5862290037184255,
"grad_norm": 27.545044099293104,
"learning_rate": 4.454129617429682e-07,
"logits/chosen": 0.515310525894165,
"logits/rejected": 0.5264334678649902,
"logps/chosen": -41.25297546386719,
"logps/rejected": -44.831031799316406,
"loss": 0.2921,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": 0.2963744103908539,
"rewards/margins": 2.2201662063598633,
"rewards/rejected": -1.9237921237945557,
"step": 254
},
{
"epoch": 0.5908449801256571,
"grad_norm": 16.22258168997157,
"learning_rate": 4.441489435549551e-07,
"logits/chosen": 0.5497354865074158,
"logits/rejected": 0.5820472240447998,
"logps/chosen": -45.16104507446289,
"logps/rejected": -60.09016799926758,
"loss": 0.2492,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": 0.36222705245018005,
"rewards/margins": 2.6290435791015625,
"rewards/rejected": -2.2668166160583496,
"step": 256
},
{
"epoch": 0.5954609565328889,
"grad_norm": 22.519317936372268,
"learning_rate": 4.4287229495548573e-07,
"logits/chosen": 0.5290111303329468,
"logits/rejected": 0.550987184047699,
"logps/chosen": -45.896942138671875,
"logps/rejected": -57.38431930541992,
"loss": 0.2158,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": 0.3132680654525757,
"rewards/margins": 2.935549020767212,
"rewards/rejected": -2.622281074523926,
"step": 258
},
{
"epoch": 0.6000769329401205,
"grad_norm": 33.27879387908239,
"learning_rate": 4.415830989972761e-07,
"logits/chosen": 0.613827645778656,
"logits/rejected": 0.6395273208618164,
"logps/chosen": -40.98984146118164,
"logps/rejected": -48.8809700012207,
"loss": 0.3209,
"rewards/accuracies": 0.7916666865348816,
"rewards/chosen": 0.3634183704853058,
"rewards/margins": 2.285569190979004,
"rewards/rejected": -1.922150731086731,
"step": 260
},
{
"epoch": 0.6000769329401205,
"eval_logits/chosen": 0.41586774587631226,
"eval_logits/rejected": 0.4413994550704956,
"eval_logps/chosen": -41.435340881347656,
"eval_logps/rejected": -52.66230773925781,
"eval_loss": 0.28806936740875244,
"eval_rewards/accuracies": 0.8116359710693359,
"eval_rewards/chosen": 0.15496963262557983,
"eval_rewards/margins": 2.4669582843780518,
"eval_rewards/rejected": -2.3119888305664062,
"eval_runtime": 220.1153,
"eval_samples_per_second": 7.878,
"eval_steps_per_second": 1.972,
"step": 260
},
{
"epoch": 0.6046929093473522,
"grad_norm": 28.090703957454657,
"learning_rate": 4.402814395493142e-07,
"logits/chosen": 0.49612462520599365,
"logits/rejected": 0.4979320168495178,
"logps/chosen": -40.7058219909668,
"logps/rejected": -38.908050537109375,
"loss": 0.3653,
"rewards/accuracies": 0.7916666865348816,
"rewards/chosen": 0.15811699628829956,
"rewards/margins": 1.8890395164489746,
"rewards/rejected": -1.7309226989746094,
"step": 262
},
{
"epoch": 0.609308885754584,
"grad_norm": 20.963207734816056,
"learning_rate": 4.3896740129140354e-07,
"logits/chosen": 0.49926820397377014,
"logits/rejected": 0.518930196762085,
"logps/chosen": -41.947425842285156,
"logps/rejected": -42.273597717285156,
"loss": 0.2493,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": 0.2666120231151581,
"rewards/margins": 2.4279704093933105,
"rewards/rejected": -2.161358594894409,
"step": 264
},
{
"epoch": 0.6139248621618156,
"grad_norm": 24.847993356607933,
"learning_rate": 4.3764106970865456e-07,
"logits/chosen": 0.5007407665252686,
"logits/rejected": 0.5330516695976257,
"logps/chosen": -36.07570266723633,
"logps/rejected": -50.92935562133789,
"loss": 0.3174,
"rewards/accuracies": 0.8055555820465088,
"rewards/chosen": 0.02925288677215576,
"rewards/margins": 2.231614589691162,
"rewards/rejected": -2.202361583709717,
"step": 266
},
{
"epoch": 0.6185408385690473,
"grad_norm": 26.539349634561272,
"learning_rate": 4.3630253108592305e-07,
"logits/chosen": 0.5235443115234375,
"logits/rejected": 0.5463228821754456,
"logps/chosen": -48.52283477783203,
"logps/rejected": -54.78059387207031,
"loss": 0.2266,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": 0.005189484916627407,
"rewards/margins": 2.9114773273468018,
"rewards/rejected": -2.9062881469726562,
"step": 268
},
{
"epoch": 0.6231568149762791,
"grad_norm": 35.3397663590889,
"learning_rate": 4.3495187250219723e-07,
"logits/chosen": 0.4959086775779724,
"logits/rejected": 0.5330989360809326,
"logps/chosen": -37.50285339355469,
"logps/rejected": -56.99623489379883,
"loss": 0.2865,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": -0.16485626995563507,
"rewards/margins": 2.9254465103149414,
"rewards/rejected": -3.0903029441833496,
"step": 270
},
{
"epoch": 0.6231568149762791,
"eval_logits/chosen": 0.4182251989841461,
"eval_logits/rejected": 0.44391536712646484,
"eval_logps/chosen": -41.51067352294922,
"eval_logps/rejected": -52.77988052368164,
"eval_loss": 0.2869359254837036,
"eval_rewards/accuracies": 0.8116359710693359,
"eval_rewards/chosen": 0.11730305105447769,
"eval_rewards/margins": 2.488077163696289,
"eval_rewards/rejected": -2.3707735538482666,
"eval_runtime": 220.1579,
"eval_samples_per_second": 7.876,
"eval_steps_per_second": 1.971,
"step": 270
},
{
"epoch": 0.6277727913835107,
"grad_norm": 23.403340630174217,
"learning_rate": 4.3358918182493253e-07,
"logits/chosen": 0.5670427083969116,
"logits/rejected": 0.5846278071403503,
"logps/chosen": -41.197166442871094,
"logps/rejected": -48.75783920288086,
"loss": 0.229,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": -0.05103777348995209,
"rewards/margins": 2.2875313758850098,
"rewards/rejected": -2.338569164276123,
"step": 272
},
{
"epoch": 0.6323887677907424,
"grad_norm": 31.35543837574939,
"learning_rate": 4.3221454770433554e-07,
"logits/chosen": 0.5044899582862854,
"logits/rejected": 0.5252879858016968,
"logps/chosen": -46.43470764160156,
"logps/rejected": -50.872764587402344,
"loss": 0.2558,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": 0.030280061066150665,
"rewards/margins": 2.529269218444824,
"rewards/rejected": -2.4989893436431885,
"step": 274
},
{
"epoch": 0.6370047441979741,
"grad_norm": 27.239886684790495,
"learning_rate": 4.308280595675966e-07,
"logits/chosen": 0.5399680733680725,
"logits/rejected": 0.5539530515670776,
"logps/chosen": -45.22441101074219,
"logps/rejected": -51.61985397338867,
"loss": 0.3439,
"rewards/accuracies": 0.7638888955116272,
"rewards/chosen": -0.1256939023733139,
"rewards/margins": 2.2664339542388916,
"rewards/rejected": -2.392127752304077,
"step": 276
},
{
"epoch": 0.6416207206052058,
"grad_norm": 29.254953852014435,
"learning_rate": 4.2942980761307227e-07,
"logits/chosen": 0.5513600707054138,
"logits/rejected": 0.5763798356056213,
"logps/chosen": -42.95576477050781,
"logps/rejected": -53.852542877197266,
"loss": 0.2795,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": -0.3323157727718353,
"rewards/margins": 2.3478498458862305,
"rewards/rejected": -2.680166006088257,
"step": 278
},
{
"epoch": 0.6462366970124375,
"grad_norm": 16.01715280590405,
"learning_rate": 4.2801988280441765e-07,
"logits/chosen": 0.5487841367721558,
"logits/rejected": 0.5692893862724304,
"logps/chosen": -45.817508697509766,
"logps/rejected": -54.61252975463867,
"loss": 0.2162,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": -0.03073420189321041,
"rewards/margins": 2.809882402420044,
"rewards/rejected": -2.840616226196289,
"step": 280
},
{
"epoch": 0.6462366970124375,
"eval_logits/chosen": 0.41910773515701294,
"eval_logits/rejected": 0.44490164518356323,
"eval_logps/chosen": -41.43645477294922,
"eval_logps/rejected": -52.90102005004883,
"eval_loss": 0.2802717387676239,
"eval_rewards/accuracies": 0.8104838728904724,
"eval_rewards/chosen": 0.15440984070301056,
"eval_rewards/margins": 2.585754156112671,
"eval_rewards/rejected": -2.431344509124756,
"eval_runtime": 220.3099,
"eval_samples_per_second": 7.871,
"eval_steps_per_second": 1.97,
"step": 280
},
{
"epoch": 0.6508526734196692,
"grad_norm": 21.181113416054586,
"learning_rate": 4.2659837686466813e-07,
"logits/chosen": 0.498602032661438,
"logits/rejected": 0.5217832922935486,
"logps/chosen": -40.613285064697266,
"logps/rejected": -50.06806945800781,
"loss": 0.262,
"rewards/accuracies": 0.8194444179534912,
"rewards/chosen": 0.07628664374351501,
"rewards/margins": 2.542593240737915,
"rewards/rejected": -2.466306447982788,
"step": 282
},
{
"epoch": 0.6554686498269009,
"grad_norm": 27.465624654814576,
"learning_rate": 4.25165382270273e-07,
"logits/chosen": 0.5099713206291199,
"logits/rejected": 0.5337219834327698,
"logps/chosen": -37.57986831665039,
"logps/rejected": -45.39601516723633,
"loss": 0.2483,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": 0.15927743911743164,
"rewards/margins": 2.373776912689209,
"rewards/rejected": -2.2144994735717773,
"step": 284
},
{
"epoch": 0.6600846262341326,
"grad_norm": 24.232084058794833,
"learning_rate": 4.2372099224507875e-07,
"logits/chosen": 0.47430500388145447,
"logits/rejected": 0.5168524980545044,
"logps/chosen": -34.61323547363281,
"logps/rejected": -60.36859130859375,
"loss": 0.2904,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": -0.013289166614413261,
"rewards/margins": 2.84716534614563,
"rewards/rejected": -2.860454797744751,
"step": 286
},
{
"epoch": 0.6647006026413643,
"grad_norm": 28.26074226923709,
"learning_rate": 4.2226530075426503e-07,
"logits/chosen": 0.5559656620025635,
"logits/rejected": 0.562049150466919,
"logps/chosen": -48.77291488647461,
"logps/rejected": -52.30695343017578,
"loss": 0.2904,
"rewards/accuracies": 0.8055555820465088,
"rewards/chosen": 0.016986362636089325,
"rewards/margins": 2.4160873889923096,
"rewards/rejected": -2.3991012573242188,
"step": 288
},
{
"epoch": 0.669316579048596,
"grad_norm": 25.964047989048964,
"learning_rate": 4.2079840249823106e-07,
"logits/chosen": 0.5188059210777283,
"logits/rejected": 0.5476034879684448,
"logps/chosen": -43.39430236816406,
"logps/rejected": -63.02970886230469,
"loss": 0.2964,
"rewards/accuracies": 0.8194444179534912,
"rewards/chosen": -0.22233732044696808,
"rewards/margins": 2.6584837436676025,
"rewards/rejected": -2.8808212280273438,
"step": 290
},
{
"epoch": 0.669316579048596,
"eval_logits/chosen": 0.41873642802238464,
"eval_logits/rejected": 0.44454658031463623,
"eval_logps/chosen": -41.64173126220703,
"eval_logps/rejected": -53.234169006347656,
"eval_loss": 0.27578282356262207,
"eval_rewards/accuracies": 0.8127880096435547,
"eval_rewards/chosen": 0.05177304521203041,
"eval_rewards/margins": 2.6496896743774414,
"eval_rewards/rejected": -2.597916603088379,
"eval_runtime": 220.2319,
"eval_samples_per_second": 7.874,
"eval_steps_per_second": 1.971,
"step": 290
},
{
"epoch": 0.6739325554558276,
"grad_norm": 28.11981406671555,
"learning_rate": 4.193203929064353e-07,
"logits/chosen": 0.5352766513824463,
"logits/rejected": 0.5633915066719055,
"logps/chosen": -43.08574676513672,
"logps/rejected": -63.65277099609375,
"loss": 0.292,
"rewards/accuracies": 0.7777777910232544,
"rewards/chosen": -0.09769348800182343,
"rewards/margins": 2.7585980892181396,
"rewards/rejected": -2.8562917709350586,
"step": 292
},
{
"epoch": 0.6785485318630594,
"grad_norm": 22.159785280949862,
"learning_rate": 4.1783136813118705e-07,
"logits/chosen": 0.5104035139083862,
"logits/rejected": 0.5326347947120667,
"logps/chosen": -44.235877990722656,
"logps/rejected": -53.24985885620117,
"loss": 0.2764,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": -0.3315318822860718,
"rewards/margins": 2.574824810028076,
"rewards/rejected": -2.9063568115234375,
"step": 294
},
{
"epoch": 0.6831645082702911,
"grad_norm": 16.58376439365046,
"learning_rate": 4.163314250413913e-07,
"logits/chosen": 0.5757681131362915,
"logits/rejected": 0.6053035855293274,
"logps/chosen": -40.00181579589844,
"logps/rejected": -50.29273986816406,
"loss": 0.193,
"rewards/accuracies": 0.9166666865348816,
"rewards/chosen": 0.18691450357437134,
"rewards/margins": 2.6521503925323486,
"rewards/rejected": -2.465236186981201,
"step": 296
},
{
"epoch": 0.6877804846775227,
"grad_norm": 32.319500846176076,
"learning_rate": 4.1482066121624716e-07,
"logits/chosen": 0.5265994668006897,
"logits/rejected": 0.5376725792884827,
"logps/chosen": -42.3819580078125,
"logps/rejected": -43.448524475097656,
"loss": 0.3285,
"rewards/accuracies": 0.7916666865348816,
"rewards/chosen": 0.1531985104084015,
"rewards/margins": 2.268404245376587,
"rewards/rejected": -2.115206003189087,
"step": 298
},
{
"epoch": 0.6923964610847545,
"grad_norm": 23.349636529497012,
"learning_rate": 4.1329917493889933e-07,
"logits/chosen": 0.43518775701522827,
"logits/rejected": 0.46238911151885986,
"logps/chosen": -39.432003021240234,
"logps/rejected": -52.38154983520508,
"loss": 0.2382,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": -0.1465599089860916,
"rewards/margins": 2.628819704055786,
"rewards/rejected": -2.7753796577453613,
"step": 300
},
{
"epoch": 0.6923964610847545,
"eval_logits/chosen": 0.4236195683479309,
"eval_logits/rejected": 0.4493381381034851,
"eval_logps/chosen": -41.62788009643555,
"eval_logps/rejected": -53.235809326171875,
"eval_loss": 0.2743636965751648,
"eval_rewards/accuracies": 0.8122119903564453,
"eval_rewards/chosen": 0.05869903042912483,
"eval_rewards/margins": 2.6574366092681885,
"eval_rewards/rejected": -2.5987374782562256,
"eval_runtime": 220.281,
"eval_samples_per_second": 7.872,
"eval_steps_per_second": 1.97,
"step": 300
},
{
"epoch": 0.6970124374919862,
"grad_norm": 23.497513813632327,
"learning_rate": 4.117670651900446e-07,
"logits/chosen": 0.5692274570465088,
"logits/rejected": 0.5857737064361572,
"logps/chosen": -44.88375473022461,
"logps/rejected": -50.89904022216797,
"loss": 0.3059,
"rewards/accuracies": 0.8055555820465088,
"rewards/chosen": -0.18370471894741058,
"rewards/margins": 2.1322684288024902,
"rewards/rejected": -2.3159730434417725,
"step": 302
},
{
"epoch": 0.7016284138992178,
"grad_norm": 31.67224576363876,
"learning_rate": 4.1022443164149237e-07,
"logits/chosen": 0.48219427466392517,
"logits/rejected": 0.5107440948486328,
"logps/chosen": -46.37804412841797,
"logps/rejected": -62.33393859863281,
"loss": 0.2685,
"rewards/accuracies": 0.8194444179534912,
"rewards/chosen": -0.04567752406001091,
"rewards/margins": 2.84682559967041,
"rewards/rejected": -2.892503261566162,
"step": 304
},
{
"epoch": 0.7062443903064496,
"grad_norm": 19.857257644454698,
"learning_rate": 4.086713746496808e-07,
"logits/chosen": 0.5637336373329163,
"logits/rejected": 0.588976263999939,
"logps/chosen": -39.28482437133789,
"logps/rejected": -50.71957778930664,
"loss": 0.2575,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": 0.2317693531513214,
"rewards/margins": 2.6872549057006836,
"rewards/rejected": -2.4554860591888428,
"step": 306
},
{
"epoch": 0.7108603667136812,
"grad_norm": 17.71463775233371,
"learning_rate": 4.0710799524914805e-07,
"logits/chosen": 0.5934479832649231,
"logits/rejected": 0.6081465482711792,
"logps/chosen": -50.33334732055664,
"logps/rejected": -55.25143814086914,
"loss": 0.2103,
"rewards/accuracies": 0.9166666865348816,
"rewards/chosen": -0.034923017024993896,
"rewards/margins": 2.6961231231689453,
"rewards/rejected": -2.731046199798584,
"step": 308
},
{
"epoch": 0.7154763431209129,
"grad_norm": 19.132153588643654,
"learning_rate": 4.055343951459592e-07,
"logits/chosen": 0.5560102462768555,
"logits/rejected": 0.5947719812393188,
"logps/chosen": -37.43670654296875,
"logps/rejected": -57.06461715698242,
"loss": 0.226,
"rewards/accuracies": 0.8194444179534912,
"rewards/chosen": -0.07254935055971146,
"rewards/margins": 2.918682336807251,
"rewards/rejected": -2.991231918334961,
"step": 310
},
{
"epoch": 0.7154763431209129,
"eval_logits/chosen": 0.42303159832954407,
"eval_logits/rejected": 0.44889286160469055,
"eval_logps/chosen": -41.60685348510742,
"eval_logps/rejected": -53.284358978271484,
"eval_loss": 0.27253130078315735,
"eval_rewards/accuracies": 0.8133640289306641,
"eval_rewards/chosen": 0.06921074539422989,
"eval_rewards/margins": 2.692223072052002,
"eval_rewards/rejected": -2.6230127811431885,
"eval_runtime": 220.2961,
"eval_samples_per_second": 7.871,
"eval_steps_per_second": 1.97,
"step": 310
},
{
"epoch": 0.7200923195281447,
"grad_norm": 20.574269162073108,
"learning_rate": 4.0395067671108985e-07,
"logits/chosen": 0.47218936681747437,
"logits/rejected": 0.5014721155166626,
"logps/chosen": -35.916664123535156,
"logps/rejected": -44.856101989746094,
"loss": 0.2579,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": 0.182376891374588,
"rewards/margins": 2.569021701812744,
"rewards/rejected": -2.3866446018218994,
"step": 312
},
{
"epoch": 0.7247082959353763,
"grad_norm": 30.250167869534483,
"learning_rate": 4.0235694297376637e-07,
"logits/chosen": 0.5631113648414612,
"logits/rejected": 0.5769122242927551,
"logps/chosen": -49.87733459472656,
"logps/rejected": -55.8229866027832,
"loss": 0.2861,
"rewards/accuracies": 0.8194444179534912,
"rewards/chosen": 0.1988232433795929,
"rewards/margins": 2.635685443878174,
"rewards/rejected": -2.4368624687194824,
"step": 314
},
{
"epoch": 0.729324272342608,
"grad_norm": 32.09859733085628,
"learning_rate": 4.0075329761476347e-07,
"logits/chosen": 0.5582194924354553,
"logits/rejected": 0.5716796517372131,
"logps/chosen": -44.06077575683594,
"logps/rejected": -48.060577392578125,
"loss": 0.2637,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.14417774975299835,
"rewards/margins": 2.182429313659668,
"rewards/rejected": -2.3266072273254395,
"step": 316
},
{
"epoch": 0.7339402487498398,
"grad_norm": 20.839702603979845,
"learning_rate": 3.991398449596588e-07,
"logits/chosen": 0.5104639530181885,
"logits/rejected": 0.5302228331565857,
"logps/chosen": -46.450565338134766,
"logps/rejected": -56.8250732421875,
"loss": 0.2178,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": 0.05337013676762581,
"rewards/margins": 2.7899389266967773,
"rewards/rejected": -2.7365689277648926,
"step": 318
},
{
"epoch": 0.7385562251570714,
"grad_norm": 35.607964067039056,
"learning_rate": 3.9751668997204647e-07,
"logits/chosen": 0.573165774345398,
"logits/rejected": 0.592732310295105,
"logps/chosen": -46.10280990600586,
"logps/rejected": -53.3104248046875,
"loss": 0.238,
"rewards/accuracies": 0.9166666865348816,
"rewards/chosen": -0.08940169960260391,
"rewards/margins": 2.5656909942626953,
"rewards/rejected": -2.655092716217041,
"step": 320
},
{
"epoch": 0.7385562251570714,
"eval_logits/chosen": 0.4224054217338562,
"eval_logits/rejected": 0.4482380449771881,
"eval_logps/chosen": -41.65960693359375,
"eval_logps/rejected": -53.47556686401367,
"eval_loss": 0.2701371908187866,
"eval_rewards/accuracies": 0.8185483813285828,
"eval_rewards/chosen": 0.04283595457673073,
"eval_rewards/margins": 2.761453866958618,
"eval_rewards/rejected": -2.718618154525757,
"eval_runtime": 220.4956,
"eval_samples_per_second": 7.864,
"eval_steps_per_second": 1.968,
"step": 320
},
{
"epoch": 0.7431722015643031,
"grad_norm": 40.34998221595971,
"learning_rate": 3.958839382467084e-07,
"logits/chosen": 0.5077357888221741,
"logits/rejected": 0.5302278995513916,
"logps/chosen": -38.23583984375,
"logps/rejected": -49.62001037597656,
"loss": 0.2911,
"rewards/accuracies": 0.8194444179534912,
"rewards/chosen": 0.431808739900589,
"rewards/margins": 2.4383790493011475,
"rewards/rejected": -2.0065698623657227,
"step": 322
},
{
"epoch": 0.7477881779715349,
"grad_norm": 37.34949673704143,
"learning_rate": 3.9424169600274494e-07,
"logits/chosen": 0.5166856646537781,
"logits/rejected": 0.5311781167984009,
"logps/chosen": -43.24025344848633,
"logps/rejected": -48.49333190917969,
"loss": 0.3054,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": -0.1698003113269806,
"rewards/margins": 2.2522177696228027,
"rewards/rejected": -2.422018051147461,
"step": 324
},
{
"epoch": 0.7524041543787665,
"grad_norm": 25.91010722050029,
"learning_rate": 3.9259007007666436e-07,
"logits/chosen": 0.5167285203933716,
"logits/rejected": 0.5338759422302246,
"logps/chosen": -44.82267761230469,
"logps/rejected": -55.40620803833008,
"loss": 0.2723,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": 0.06528851389884949,
"rewards/margins": 2.759828805923462,
"rewards/rejected": -2.694540023803711,
"step": 326
},
{
"epoch": 0.7570201307859982,
"grad_norm": 30.862683948057615,
"learning_rate": 3.909291679154332e-07,
"logits/chosen": 0.5040656328201294,
"logits/rejected": 0.5386430025100708,
"logps/chosen": -42.25190734863281,
"logps/rejected": -62.51930618286133,
"loss": 0.2759,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": -0.2548324167728424,
"rewards/margins": 3.0783848762512207,
"rewards/rejected": -3.333217144012451,
"step": 328
},
{
"epoch": 0.7616361071932299,
"grad_norm": 19.125155732205084,
"learning_rate": 3.892590975694858e-07,
"logits/chosen": 0.49563461542129517,
"logits/rejected": 0.539116621017456,
"logps/chosen": -39.31736755371094,
"logps/rejected": -60.45228576660156,
"loss": 0.2182,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.16796639561653137,
"rewards/margins": 3.4695467948913574,
"rewards/rejected": -3.301579713821411,
"step": 330
},
{
"epoch": 0.7616361071932299,
"eval_logits/chosen": 0.4225333333015442,
"eval_logits/rejected": 0.44842836260795593,
"eval_logps/chosen": -41.670494079589844,
"eval_logps/rejected": -53.553314208984375,
"eval_loss": 0.2688952684402466,
"eval_rewards/accuracies": 0.8145161271095276,
"eval_rewards/chosen": 0.037393342703580856,
"eval_rewards/margins": 2.7948849201202393,
"eval_rewards/rejected": -2.7574915885925293,
"eval_runtime": 220.4734,
"eval_samples_per_second": 7.865,
"eval_steps_per_second": 1.968,
"step": 330
},
{
"epoch": 0.7662520836004616,
"grad_norm": 20.197390141727503,
"learning_rate": 3.875799676856952e-07,
"logits/chosen": 0.5481100082397461,
"logits/rejected": 0.5680783987045288,
"logps/chosen": -43.26856994628906,
"logps/rejected": -54.90293884277344,
"loss": 0.2148,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": -0.2920362651348114,
"rewards/margins": 2.9112956523895264,
"rewards/rejected": -3.20333194732666,
"step": 332
},
{
"epoch": 0.7708680600076933,
"grad_norm": 28.41138671183374,
"learning_rate": 3.858918875003053e-07,
"logits/chosen": 0.5375738143920898,
"logits/rejected": 0.5755133628845215,
"logps/chosen": -41.622859954833984,
"logps/rejected": -61.92311096191406,
"loss": 0.2733,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": -0.09028860926628113,
"rewards/margins": 3.286768674850464,
"rewards/rejected": -3.3770573139190674,
"step": 334
},
{
"epoch": 0.775484036414925,
"grad_norm": 16.265551276537238,
"learning_rate": 3.8419496683182396e-07,
"logits/chosen": 0.5556432604789734,
"logits/rejected": 0.5942565202713013,
"logps/chosen": -41.74842071533203,
"logps/rejected": -57.50096893310547,
"loss": 0.1896,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": -0.0623447448015213,
"rewards/margins": 2.878957748413086,
"rewards/rejected": -2.941302537918091,
"step": 336
},
{
"epoch": 0.7801000128221567,
"grad_norm": 26.59915287717055,
"learning_rate": 3.824893160738792e-07,
"logits/chosen": 0.5246456861495972,
"logits/rejected": 0.553848385810852,
"logps/chosen": -42.39156723022461,
"logps/rejected": -57.20592498779297,
"loss": 0.2682,
"rewards/accuracies": 0.8055555820465088,
"rewards/chosen": -0.07438618689775467,
"rewards/margins": 3.046879291534424,
"rewards/rejected": -3.1212656497955322,
"step": 338
},
{
"epoch": 0.7847159892293883,
"grad_norm": 23.023616857684974,
"learning_rate": 3.8077504618803737e-07,
"logits/chosen": 0.580450713634491,
"logits/rejected": 0.5835237503051758,
"logps/chosen": -48.9189567565918,
"logps/rejected": -47.836578369140625,
"loss": 0.2668,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": -0.06672815978527069,
"rewards/margins": 2.457933187484741,
"rewards/rejected": -2.5246615409851074,
"step": 340
},
{
"epoch": 0.7847159892293883,
"eval_logits/chosen": 0.4240727126598358,
"eval_logits/rejected": 0.4500102698802948,
"eval_logps/chosen": -41.714290618896484,
"eval_logps/rejected": -53.6696662902832,
"eval_loss": 0.2670309841632843,
"eval_rewards/accuracies": 0.8179723620414734,
"eval_rewards/chosen": 0.015493539161980152,
"eval_rewards/margins": 2.83115816116333,
"eval_rewards/rejected": -2.815664768218994,
"eval_runtime": 220.6721,
"eval_samples_per_second": 7.858,
"eval_steps_per_second": 1.967,
"step": 340
},
{
"epoch": 0.7893319656366201,
"grad_norm": 16.479244956266236,
"learning_rate": 3.7905226869658446e-07,
"logits/chosen": 0.4684799015522003,
"logits/rejected": 0.4874458909034729,
"logps/chosen": -43.62626647949219,
"logps/rejected": -55.70362854003906,
"loss": 0.2494,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": 0.13190358877182007,
"rewards/margins": 2.8091206550598145,
"rewards/rejected": -2.6772167682647705,
"step": 342
},
{
"epoch": 0.7939479420438518,
"grad_norm": 24.369877883114157,
"learning_rate": 3.773210956752709e-07,
"logits/chosen": 0.544243574142456,
"logits/rejected": 0.5578660368919373,
"logps/chosen": -40.1495246887207,
"logps/rejected": -44.17314910888672,
"loss": 0.2798,
"rewards/accuracies": 0.7777777910232544,
"rewards/chosen": 0.020625757053494453,
"rewards/margins": 2.502214193344116,
"rewards/rejected": -2.481588363647461,
"step": 344
},
{
"epoch": 0.7985639184510834,
"grad_norm": 25.623903462647995,
"learning_rate": 3.7558163974602093e-07,
"logits/chosen": 0.474899560213089,
"logits/rejected": 0.5161857008934021,
"logps/chosen": -37.74607467651367,
"logps/rejected": -55.48906707763672,
"loss": 0.2419,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": -0.001830246765166521,
"rewards/margins": 2.923034906387329,
"rewards/rejected": -2.924865245819092,
"step": 346
},
{
"epoch": 0.8031798948583152,
"grad_norm": 25.184522607734593,
"learning_rate": 3.73834014069605e-07,
"logits/chosen": 0.558302104473114,
"logits/rejected": 0.5833041667938232,
"logps/chosen": -48.4046630859375,
"logps/rejected": -61.20756149291992,
"loss": 0.2374,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": -0.1346227377653122,
"rewards/margins": 2.8843278884887695,
"rewards/rejected": -3.0189502239227295,
"step": 348
},
{
"epoch": 0.8077958712655469,
"grad_norm": 24.77024105098058,
"learning_rate": 3.7207833233827914e-07,
"logits/chosen": 0.4649287462234497,
"logits/rejected": 0.482571542263031,
"logps/chosen": -44.39389419555664,
"logps/rejected": -58.24624252319336,
"loss": 0.2534,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": -0.14530682563781738,
"rewards/margins": 3.2228527069091797,
"rewards/rejected": -3.368159532546997,
"step": 350
},
{
"epoch": 0.8077958712655469,
"eval_logits/chosen": 0.42746230959892273,
"eval_logits/rejected": 0.45336535573005676,
"eval_logps/chosen": -42.037269592285156,
"eval_logps/rejected": -54.03358459472656,
"eval_loss": 0.2634715437889099,
"eval_rewards/accuracies": 0.8168202638626099,
"eval_rewards/chosen": -0.1459963023662567,
"eval_rewards/margins": 2.8516335487365723,
"eval_rewards/rejected": -2.9976296424865723,
"eval_runtime": 220.3701,
"eval_samples_per_second": 7.869,
"eval_steps_per_second": 1.969,
"step": 350
},
{
"epoch": 0.8124118476727785,
"grad_norm": 26.201135314502036,
"learning_rate": 3.7031470876838786e-07,
"logits/chosen": 0.5293068289756775,
"logits/rejected": 0.5655782222747803,
"logps/chosen": -42.89842224121094,
"logps/rejected": -63.14483642578125,
"loss": 0.2516,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": -0.4706004559993744,
"rewards/margins": 2.8817062377929688,
"rewards/rejected": -3.352307081222534,
"step": 352
},
{
"epoch": 0.8170278240800103,
"grad_norm": 22.294887268242963,
"learning_rate": 3.6854325809293455e-07,
"logits/chosen": 0.49771615862846375,
"logits/rejected": 0.5413529276847839,
"logps/chosen": -36.90867233276367,
"logps/rejected": -64.4770278930664,
"loss": 0.2284,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.27428972721099854,
"rewards/margins": 3.501157522201538,
"rewards/rejected": -3.775447368621826,
"step": 354
},
{
"epoch": 0.821643800487242,
"grad_norm": 28.188753078893058,
"learning_rate": 3.6676409555411653e-07,
"logits/chosen": 0.5484297871589661,
"logits/rejected": 0.5813949704170227,
"logps/chosen": -45.460365295410156,
"logps/rejected": -60.86439895629883,
"loss": 0.2542,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.4246326684951782,
"rewards/margins": 3.2056918144226074,
"rewards/rejected": -3.630324363708496,
"step": 356
},
{
"epoch": 0.8262597768944736,
"grad_norm": 17.14121226520804,
"learning_rate": 3.6497733689582866e-07,
"logits/chosen": 0.48876845836639404,
"logits/rejected": 0.5145962238311768,
"logps/chosen": -39.37761688232422,
"logps/rejected": -49.643211364746094,
"loss": 0.2016,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.21149006485939026,
"rewards/margins": 2.893353223800659,
"rewards/rejected": -3.1048433780670166,
"step": 358
},
{
"epoch": 0.8308757533017054,
"grad_norm": 35.17955186267088,
"learning_rate": 3.631830983561335e-07,
"logits/chosen": 0.573662519454956,
"logits/rejected": 0.5948094725608826,
"logps/chosen": -47.85080337524414,
"logps/rejected": -52.225006103515625,
"loss": 0.2586,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": -0.3559052646160126,
"rewards/margins": 2.786222219467163,
"rewards/rejected": -3.142127513885498,
"step": 360
},
{
"epoch": 0.8308757533017054,
"eval_logits/chosen": 0.42756161093711853,
"eval_logits/rejected": 0.45349106192588806,
"eval_logps/chosen": -42.38340759277344,
"eval_logps/rejected": -54.44844436645508,
"eval_loss": 0.2630784213542938,
"eval_rewards/accuracies": 0.8179723620414734,
"eval_rewards/chosen": -0.31906506419181824,
"eval_rewards/margins": 2.8859920501708984,
"eval_rewards/rejected": -3.205056667327881,
"eval_runtime": 220.2057,
"eval_samples_per_second": 7.874,
"eval_steps_per_second": 1.971,
"step": 360
},
{
"epoch": 0.835491729708937,
"grad_norm": 36.03053976982613,
"learning_rate": 3.613814966596991e-07,
"logits/chosen": 0.5263631343841553,
"logits/rejected": 0.5573300123214722,
"logps/chosen": -43.24696731567383,
"logps/rejected": -57.23331069946289,
"loss": 0.2526,
"rewards/accuracies": 0.8194444179534912,
"rewards/chosen": -0.4683598279953003,
"rewards/margins": 3.082267999649048,
"rewards/rejected": -3.5506277084350586,
"step": 362
},
{
"epoch": 0.8401077061161687,
"grad_norm": 15.328563865471402,
"learning_rate": 3.595726490102059e-07,
"logits/chosen": 0.5707637071609497,
"logits/rejected": 0.6143693327903748,
"logps/chosen": -40.44147491455078,
"logps/rejected": -62.61209487915039,
"loss": 0.1294,
"rewards/accuracies": 0.9027777910232544,
"rewards/chosen": -0.3496915102005005,
"rewards/margins": 3.486618995666504,
"rewards/rejected": -3.836310863494873,
"step": 364
},
{
"epoch": 0.8447236825234005,
"grad_norm": 15.002635114989888,
"learning_rate": 3.577566730827214e-07,
"logits/chosen": 0.5126733779907227,
"logits/rejected": 0.5439874529838562,
"logps/chosen": -40.29549789428711,
"logps/rejected": -56.204898834228516,
"loss": 0.2951,
"rewards/accuracies": 0.7916666865348816,
"rewards/chosen": -0.3362084925174713,
"rewards/margins": 2.846021890640259,
"rewards/rejected": -3.182230234146118,
"step": 366
},
{
"epoch": 0.8493396589306321,
"grad_norm": 25.52691859216037,
"learning_rate": 3.559336870160453e-07,
"logits/chosen": 0.5128374099731445,
"logits/rejected": 0.5424924492835999,
"logps/chosen": -38.71543884277344,
"logps/rejected": -52.61689758300781,
"loss": 0.2084,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.28658950328826904,
"rewards/margins": 3.0817792415618896,
"rewards/rejected": -3.368368625640869,
"step": 368
},
{
"epoch": 0.8539556353378638,
"grad_norm": 30.283513234320385,
"learning_rate": 3.541038094050241e-07,
"logits/chosen": 0.515430212020874,
"logits/rejected": 0.5466374158859253,
"logps/chosen": -45.59136962890625,
"logps/rejected": -63.18849182128906,
"loss": 0.2378,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": -0.5768634676933289,
"rewards/margins": 3.5630674362182617,
"rewards/rejected": -4.139930725097656,
"step": 370
},
{
"epoch": 0.8539556353378638,
"eval_logits/chosen": 0.4274056553840637,
"eval_logits/rejected": 0.45338377356529236,
"eval_logps/chosen": -43.063682556152344,
"eval_logps/rejected": -55.225093841552734,
"eval_loss": 0.2617854177951813,
"eval_rewards/accuracies": 0.817396342754364,
"eval_rewards/chosen": -0.659203290939331,
"eval_rewards/margins": 2.9341788291931152,
"eval_rewards/rejected": -3.5933821201324463,
"eval_runtime": 220.2088,
"eval_samples_per_second": 7.874,
"eval_steps_per_second": 1.971,
"step": 370
},
{
"epoch": 0.8585716117450956,
"grad_norm": 30.9826241797592,
"learning_rate": 3.52267159292835e-07,
"logits/chosen": 0.4993041455745697,
"logits/rejected": 0.5248599052429199,
"logps/chosen": -44.83211898803711,
"logps/rejected": -61.29323959350586,
"loss": 0.2333,
"rewards/accuracies": 0.9027777910232544,
"rewards/chosen": -0.7047384977340698,
"rewards/margins": 3.358118772506714,
"rewards/rejected": -4.062856674194336,
"step": 372
},
{
"epoch": 0.8631875881523272,
"grad_norm": 16.52463887201103,
"learning_rate": 3.5042385616324236e-07,
"logits/chosen": 0.4287330210208893,
"logits/rejected": 0.46707651019096375,
"logps/chosen": -36.363590240478516,
"logps/rejected": -59.82657241821289,
"loss": 0.22,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.8720024228096008,
"rewards/margins": 3.389249086380005,
"rewards/rejected": -4.261251449584961,
"step": 374
},
{
"epoch": 0.8678035645595589,
"grad_norm": 15.500715269356169,
"learning_rate": 3.485740199328244e-07,
"logits/chosen": 0.5408291816711426,
"logits/rejected": 0.5578600764274597,
"logps/chosen": -50.285335540771484,
"logps/rejected": -54.07209014892578,
"loss": 0.1876,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": -0.5448592305183411,
"rewards/margins": 3.2346181869506836,
"rewards/rejected": -3.779477119445801,
"step": 376
},
{
"epoch": 0.8724195409667906,
"grad_norm": 12.222084345575727,
"learning_rate": 3.4671777094317196e-07,
"logits/chosen": 0.5013281106948853,
"logits/rejected": 0.5262949466705322,
"logps/chosen": -46.47956848144531,
"logps/rejected": -53.49814224243164,
"loss": 0.1677,
"rewards/accuracies": 0.9166666865348816,
"rewards/chosen": -0.7341945767402649,
"rewards/margins": 3.0543222427368164,
"rewards/rejected": -3.7885169982910156,
"step": 378
},
{
"epoch": 0.8770355173740223,
"grad_norm": 22.531696347522484,
"learning_rate": 3.448552299530595e-07,
"logits/chosen": 0.5649933218955994,
"logits/rejected": 0.5860426425933838,
"logps/chosen": -42.52098083496094,
"logps/rejected": -52.308616638183594,
"loss": 0.3071,
"rewards/accuracies": 0.7916666865348816,
"rewards/chosen": -0.9869860410690308,
"rewards/margins": 2.7113142013549805,
"rewards/rejected": -3.698300361633301,
"step": 380
},
{
"epoch": 0.8770355173740223,
"eval_logits/chosen": 0.4274827539920807,
"eval_logits/rejected": 0.45349830389022827,
"eval_logps/chosen": -43.129615783691406,
"eval_logps/rejected": -55.33893585205078,
"eval_loss": 0.2627149224281311,
"eval_rewards/accuracies": 0.8156682252883911,
"eval_rewards/chosen": -0.6921693086624146,
"eval_rewards/margins": 2.958131790161133,
"eval_rewards/rejected": -3.650301218032837,
"eval_runtime": 220.3046,
"eval_samples_per_second": 7.871,
"eval_steps_per_second": 1.97,
"step": 380
},
{
"epoch": 0.881651493781254,
"grad_norm": 39.03269809250303,
"learning_rate": 3.429865181305894e-07,
"logits/chosen": 0.5594089031219482,
"logits/rejected": 0.5762946605682373,
"logps/chosen": -46.85918045043945,
"logps/rejected": -55.68655776977539,
"loss": 0.2915,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": -0.9153691530227661,
"rewards/margins": 2.779404401779175,
"rewards/rejected": -3.694772958755493,
"step": 382
},
{
"epoch": 0.8862674701884857,
"grad_norm": 25.617922410092657,
"learning_rate": 3.411117570453091e-07,
"logits/chosen": 0.5484945774078369,
"logits/rejected": 0.5738579034805298,
"logps/chosen": -42.73631286621094,
"logps/rejected": -53.853271484375,
"loss": 0.2369,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": -0.7328565120697021,
"rewards/margins": 2.8266656398773193,
"rewards/rejected": -3.5595223903656006,
"step": 384
},
{
"epoch": 0.8908834465957174,
"grad_norm": 30.869961559508535,
"learning_rate": 3.392310686603025e-07,
"logits/chosen": 0.534080982208252,
"logits/rejected": 0.5444844365119934,
"logps/chosen": -42.41215515136719,
"logps/rejected": -50.85294723510742,
"loss": 0.2909,
"rewards/accuracies": 0.8055555820465088,
"rewards/chosen": -0.9006066918373108,
"rewards/margins": 2.361262559890747,
"rewards/rejected": -3.261868953704834,
"step": 386
},
{
"epoch": 0.895499423002949,
"grad_norm": 19.657432685783167,
"learning_rate": 3.3734457532425554e-07,
"logits/chosen": 0.5231594443321228,
"logits/rejected": 0.5530441403388977,
"logps/chosen": -42.48830795288086,
"logps/rejected": -57.00692367553711,
"loss": 0.2606,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": -0.6170899271965027,
"rewards/margins": 3.237041711807251,
"rewards/rejected": -3.8541314601898193,
"step": 388
},
{
"epoch": 0.9001153994101808,
"grad_norm": 24.399140672578795,
"learning_rate": 3.354523997634969e-07,
"logits/chosen": 0.540899932384491,
"logits/rejected": 0.5695917010307312,
"logps/chosen": -44.531185150146484,
"logps/rejected": -58.8494873046875,
"loss": 0.2251,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": -0.7790883183479309,
"rewards/margins": 3.128167152404785,
"rewards/rejected": -3.9072554111480713,
"step": 390
},
{
"epoch": 0.9001153994101808,
"eval_logits/chosen": 0.42857107520103455,
"eval_logits/rejected": 0.4546278119087219,
"eval_logps/chosen": -43.16852951049805,
"eval_logps/rejected": -55.42344665527344,
"eval_loss": 0.2621525228023529,
"eval_rewards/accuracies": 0.8179723620414734,
"eval_rewards/chosen": -0.7116276621818542,
"eval_rewards/margins": 2.980929374694824,
"eval_rewards/rejected": -3.6925570964813232,
"eval_runtime": 220.3143,
"eval_samples_per_second": 7.871,
"eval_steps_per_second": 1.97,
"step": 390
},
{
"epoch": 0.9047313758174125,
"grad_norm": 35.01908054863291,
"learning_rate": 3.3355466507401374e-07,
"logits/chosen": 0.5315423607826233,
"logits/rejected": 0.5454668998718262,
"logps/chosen": -42.16218185424805,
"logps/rejected": -44.85585403442383,
"loss": 0.372,
"rewards/accuracies": 0.7777777910232544,
"rewards/chosen": -0.805086612701416,
"rewards/margins": 2.338005542755127,
"rewards/rejected": -3.143092155456543,
"step": 392
},
{
"epoch": 0.9093473522246441,
"grad_norm": 21.288998506479572,
"learning_rate": 3.3165149471344394e-07,
"logits/chosen": 0.5552914142608643,
"logits/rejected": 0.5818530321121216,
"logps/chosen": -42.95904541015625,
"logps/rejected": -52.76212692260742,
"loss": 0.2934,
"rewards/accuracies": 0.8194444179534912,
"rewards/chosen": -0.9580552577972412,
"rewards/margins": 2.6676671504974365,
"rewards/rejected": -3.6257221698760986,
"step": 394
},
{
"epoch": 0.9139633286318759,
"grad_norm": 25.556003693396036,
"learning_rate": 3.297430124930444e-07,
"logits/chosen": 0.582655668258667,
"logits/rejected": 0.5952574014663696,
"logps/chosen": -48.771934509277344,
"logps/rejected": -54.426483154296875,
"loss": 0.3223,
"rewards/accuracies": 0.7361111044883728,
"rewards/chosen": -0.6146318912506104,
"rewards/margins": 2.4974234104156494,
"rewards/rejected": -3.112055540084839,
"step": 396
},
{
"epoch": 0.9185793050391076,
"grad_norm": 23.905362174336005,
"learning_rate": 3.2782934256963647e-07,
"logits/chosen": 0.5089656114578247,
"logits/rejected": 0.5398065447807312,
"logps/chosen": -45.75530242919922,
"logps/rejected": -61.64253234863281,
"loss": 0.2549,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": -0.6105983853340149,
"rewards/margins": 3.1589841842651367,
"rewards/rejected": -3.769582509994507,
"step": 398
},
{
"epoch": 0.9231952814463392,
"grad_norm": 24.17532494020093,
"learning_rate": 3.259106094375289e-07,
"logits/chosen": 0.539167046546936,
"logits/rejected": 0.5812445282936096,
"logps/chosen": -39.31736755371094,
"logps/rejected": -63.33793640136719,
"loss": 0.2698,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": -0.3948301374912262,
"rewards/margins": 3.442387819290161,
"rewards/rejected": -3.8372182846069336,
"step": 400
},
{
"epoch": 0.9231952814463392,
"eval_logits/chosen": 0.42656469345092773,
"eval_logits/rejected": 0.45276370644569397,
"eval_logps/chosen": -42.66855239868164,
"eval_logps/rejected": -55.0075798034668,
"eval_loss": 0.2560158371925354,
"eval_rewards/accuracies": 0.8231566548347473,
"eval_rewards/chosen": -0.46163854002952576,
"eval_rewards/margins": 3.0229856967926025,
"eval_rewards/rejected": -3.4846243858337402,
"eval_runtime": 220.2216,
"eval_samples_per_second": 7.874,
"eval_steps_per_second": 1.971,
"step": 400
},
{
"epoch": 0.927811257853571,
"grad_norm": 30.671620714098214,
"learning_rate": 3.239869379204189e-07,
"logits/chosen": 0.4974105656147003,
"logits/rejected": 0.5221477746963501,
"logps/chosen": -45.057281494140625,
"logps/rejected": -56.83816909790039,
"loss": 0.2017,
"rewards/accuracies": 0.9027777910232544,
"rewards/chosen": -0.5868238210678101,
"rewards/margins": 3.3964414596557617,
"rewards/rejected": -3.9832653999328613,
"step": 402
},
{
"epoch": 0.9324272342608027,
"grad_norm": 24.915176146115876,
"learning_rate": 3.2205845316327144e-07,
"logits/chosen": 0.5429517030715942,
"logits/rejected": 0.5683455467224121,
"logps/chosen": -34.97327423095703,
"logps/rejected": -46.666717529296875,
"loss": 0.3399,
"rewards/accuracies": 0.7222222089767456,
"rewards/chosen": -0.43591320514678955,
"rewards/margins": 2.185106039047241,
"rewards/rejected": -2.6210196018218994,
"step": 404
},
{
"epoch": 0.9370432106680343,
"grad_norm": 23.867375292949593,
"learning_rate": 3.2012528062417845e-07,
"logits/chosen": 0.5323294997215271,
"logits/rejected": 0.5459015369415283,
"logps/chosen": -43.10551071166992,
"logps/rejected": -47.71934127807617,
"loss": 0.2436,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.7240028977394104,
"rewards/margins": 2.4708030223846436,
"rewards/rejected": -3.1948060989379883,
"step": 406
},
{
"epoch": 0.9416591870752661,
"grad_norm": 15.007721932706033,
"learning_rate": 3.1818754606619643e-07,
"logits/chosen": 0.5331852436065674,
"logits/rejected": 0.564946174621582,
"logps/chosen": -36.540283203125,
"logps/rejected": -57.03317642211914,
"loss": 0.2822,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.16474466025829315,
"rewards/margins": 3.167923927307129,
"rewards/rejected": -3.3326683044433594,
"step": 408
},
{
"epoch": 0.9462751634824977,
"grad_norm": 22.364487052769828,
"learning_rate": 3.162453755491655e-07,
"logits/chosen": 0.49684393405914307,
"logits/rejected": 0.5316374897956848,
"logps/chosen": -38.39241027832031,
"logps/rejected": -59.15244674682617,
"loss": 0.1874,
"rewards/accuracies": 0.9166666865348816,
"rewards/chosen": -0.322665810585022,
"rewards/margins": 3.4969892501831055,
"rewards/rejected": -3.819655179977417,
"step": 410
},
{
"epoch": 0.9462751634824977,
"eval_logits/chosen": 0.4290708899497986,
"eval_logits/rejected": 0.45515918731689453,
"eval_logps/chosen": -42.679603576660156,
"eval_logps/rejected": -55.10276412963867,
"eval_loss": 0.2565246820449829,
"eval_rewards/accuracies": 0.8191244006156921,
"eval_rewards/chosen": -0.467162162065506,
"eval_rewards/margins": 3.065053939819336,
"eval_rewards/rejected": -3.5322158336639404,
"eval_runtime": 220.2891,
"eval_samples_per_second": 7.871,
"eval_steps_per_second": 1.97,
"step": 410
},
{
"epoch": 0.9508911398897294,
"grad_norm": 25.19862106785063,
"learning_rate": 3.142988954215079e-07,
"logits/chosen": 0.5264102816581726,
"logits/rejected": 0.5622512698173523,
"logps/chosen": -43.48373794555664,
"logps/rejected": -66.42120361328125,
"loss": 0.2996,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": -0.48827776312828064,
"rewards/margins": 3.3450686931610107,
"rewards/rejected": -3.833346128463745,
"step": 412
},
{
"epoch": 0.9555071162969612,
"grad_norm": 21.74301345510537,
"learning_rate": 3.1234823231200925e-07,
"logits/chosen": 0.5031583309173584,
"logits/rejected": 0.5540390014648438,
"logps/chosen": -40.93600845336914,
"logps/rejected": -66.30878448486328,
"loss": 0.2428,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": -0.5792509317398071,
"rewards/margins": 3.6368870735168457,
"rewards/rejected": -4.2161383628845215,
"step": 414
},
{
"epoch": 0.9601230927041928,
"grad_norm": 22.436508219334904,
"learning_rate": 3.1039351312157993e-07,
"logits/chosen": 0.56053227186203,
"logits/rejected": 0.590539813041687,
"logps/chosen": -41.67660140991211,
"logps/rejected": -58.28109359741211,
"loss": 0.2048,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": -0.6333367228507996,
"rewards/margins": 3.312451124191284,
"rewards/rejected": -3.9457881450653076,
"step": 416
},
{
"epoch": 0.9647390691114246,
"grad_norm": 36.50210265432233,
"learning_rate": 3.0843486501499967e-07,
"logits/chosen": 0.508413553237915,
"logits/rejected": 0.5429882407188416,
"logps/chosen": -42.58755111694336,
"logps/rejected": -52.10399627685547,
"loss": 0.3069,
"rewards/accuracies": 0.8194444179534912,
"rewards/chosen": -0.4402269721031189,
"rewards/margins": 2.6428239345550537,
"rewards/rejected": -3.0830507278442383,
"step": 418
},
{
"epoch": 0.9693550455186563,
"grad_norm": 19.432988353108243,
"learning_rate": 3.064724154126449e-07,
"logits/chosen": 0.48101869225502014,
"logits/rejected": 0.49470260739326477,
"logps/chosen": -43.99076461791992,
"logps/rejected": -47.8154411315918,
"loss": 0.2486,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": -0.6770768761634827,
"rewards/margins": 2.6182446479797363,
"rewards/rejected": -3.2953217029571533,
"step": 420
},
{
"epoch": 0.9693550455186563,
"eval_logits/chosen": 0.4298844337463379,
"eval_logits/rejected": 0.45596131682395935,
"eval_logps/chosen": -42.74457550048828,
"eval_logps/rejected": -55.1827278137207,
"eval_loss": 0.2540464699268341,
"eval_rewards/accuracies": 0.8231566548347473,
"eval_rewards/chosen": -0.4996483027935028,
"eval_rewards/margins": 3.072551727294922,
"eval_rewards/rejected": -3.572199821472168,
"eval_runtime": 220.4655,
"eval_samples_per_second": 7.865,
"eval_steps_per_second": 1.969,
"step": 420
},
{
"epoch": 0.9739710219258879,
"grad_norm": 21.396529357952137,
"learning_rate": 3.045062919821995e-07,
"logits/chosen": 0.5096142292022705,
"logits/rejected": 0.5509178638458252,
"logps/chosen": -40.65134811401367,
"logps/rejected": -64.13406372070312,
"loss": 0.2407,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": -0.25429394841194153,
"rewards/margins": 3.5406899452209473,
"rewards/rejected": -3.7949838638305664,
"step": 422
},
{
"epoch": 0.9785869983331197,
"grad_norm": 27.30197314549755,
"learning_rate": 3.0253662263034925e-07,
"logits/chosen": 0.5253940224647522,
"logits/rejected": 0.5617537498474121,
"logps/chosen": -44.63224792480469,
"logps/rejected": -62.29665756225586,
"loss": 0.2666,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": -0.6128353476524353,
"rewards/margins": 3.4666247367858887,
"rewards/rejected": -4.079460144042969,
"step": 424
},
{
"epoch": 0.9832029747403513,
"grad_norm": 40.51282949087652,
"learning_rate": 3.005635354944606e-07,
"logits/chosen": 0.5502428412437439,
"logits/rejected": 0.5616468787193298,
"logps/chosen": -46.97676467895508,
"logps/rejected": -46.36595153808594,
"loss": 0.2894,
"rewards/accuracies": 0.8194444179534912,
"rewards/chosen": -0.7273317575454712,
"rewards/margins": 2.6478114128112793,
"rewards/rejected": -3.375143051147461,
"step": 426
},
{
"epoch": 0.987818951147583,
"grad_norm": 23.92512657865844,
"learning_rate": 2.9858715893424504e-07,
"logits/chosen": 0.5228149890899658,
"logits/rejected": 0.5698718428611755,
"logps/chosen": -40.91889953613281,
"logps/rejected": -64.06893920898438,
"loss": 0.1794,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": -0.32393407821655273,
"rewards/margins": 3.8048884868621826,
"rewards/rejected": -4.128821849822998,
"step": 428
},
{
"epoch": 0.9924349275548148,
"grad_norm": 18.33017798245734,
"learning_rate": 2.966076215234082e-07,
"logits/chosen": 0.5833015441894531,
"logits/rejected": 0.6151509881019592,
"logps/chosen": -47.47243118286133,
"logps/rejected": -64.26097869873047,
"loss": 0.2098,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.2687421441078186,
"rewards/margins": 3.582411766052246,
"rewards/rejected": -3.85115385055542,
"step": 430
},
{
"epoch": 0.9924349275548148,
"eval_logits/chosen": 0.42911431193351746,
"eval_logits/rejected": 0.45535048842430115,
"eval_logps/chosen": -42.6432991027832,
"eval_logps/rejected": -55.0967903137207,
"eval_loss": 0.25298023223876953,
"eval_rewards/accuracies": 0.8237327337265015,
"eval_rewards/chosen": -0.4490084946155548,
"eval_rewards/margins": 3.0802206993103027,
"eval_rewards/rejected": -3.5292294025421143,
"eval_runtime": 220.5016,
"eval_samples_per_second": 7.864,
"eval_steps_per_second": 1.968,
"step": 430
},
{
"epoch": 0.9970509039620464,
"grad_norm": 24.845062608395242,
"learning_rate": 2.94625052041286e-07,
"logits/chosen": 0.529398500919342,
"logits/rejected": 0.5461426377296448,
"logps/chosen": -42.26673889160156,
"logps/rejected": -52.43321228027344,
"loss": 0.2582,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": -0.38506922125816345,
"rewards/margins": 2.947833299636841,
"rewards/rejected": -3.332902431488037,
"step": 432
},
{
"epoch": 1.001666880369278,
"grad_norm": 14.705625802608846,
"learning_rate": 2.926395794644665e-07,
"logits/chosen": 0.5060461759567261,
"logits/rejected": 0.5222041010856628,
"logps/chosen": -45.8979606628418,
"logps/rejected": -55.48097229003906,
"loss": 0.1798,
"rewards/accuracies": 0.9166666865348816,
"rewards/chosen": -0.3213649392127991,
"rewards/margins": 3.302720308303833,
"rewards/rejected": -3.6240854263305664,
"step": 434
},
{
"epoch": 1.0062828567765099,
"grad_norm": 24.90302953634143,
"learning_rate": 2.906513329583991e-07,
"logits/chosen": 0.5120677351951599,
"logits/rejected": 0.5406749844551086,
"logps/chosen": -40.07225036621094,
"logps/rejected": -54.882259368896484,
"loss": 0.2186,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": -0.5253066420555115,
"rewards/margins": 3.1281352043151855,
"rewards/rejected": -3.653441905975342,
"step": 436
},
{
"epoch": 1.0108988331837414,
"grad_norm": 20.006366802619794,
"learning_rate": 2.886604418689921e-07,
"logits/chosen": 0.48885577917099,
"logits/rejected": 0.5327137112617493,
"logps/chosen": -38.752708435058594,
"logps/rejected": -66.8874740600586,
"loss": 0.2705,
"rewards/accuracies": 0.7916666865348816,
"rewards/chosen": -0.5506837368011475,
"rewards/margins": 3.6388425827026367,
"rewards/rejected": -4.189526557922363,
"step": 438
},
{
"epoch": 1.0155148095909732,
"grad_norm": 11.538422039384988,
"learning_rate": 2.866670357141979e-07,
"logits/chosen": 0.5471632480621338,
"logits/rejected": 0.5706813931465149,
"logps/chosen": -44.1706428527832,
"logps/rejected": -54.80915832519531,
"loss": 0.2123,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": -0.5128348469734192,
"rewards/margins": 3.5640437602996826,
"rewards/rejected": -4.076879024505615,
"step": 440
},
{
"epoch": 1.0155148095909732,
"eval_logits/chosen": 0.42714568972587585,
"eval_logits/rejected": 0.4533489942550659,
"eval_logps/chosen": -42.395565032958984,
"eval_logps/rejected": -54.934104919433594,
"eval_loss": 0.2539977729320526,
"eval_rewards/accuracies": 0.8231566548347473,
"eval_rewards/chosen": -0.3251444697380066,
"eval_rewards/margins": 3.122741937637329,
"eval_rewards/rejected": -3.4478864669799805,
"eval_runtime": 220.3559,
"eval_samples_per_second": 7.869,
"eval_steps_per_second": 1.97,
"step": 440
},
{
"epoch": 1.020130785998205,
"grad_norm": 16.119320288131345,
"learning_rate": 2.8467124417558737e-07,
"logits/chosen": 0.5559278130531311,
"logits/rejected": 0.5782606601715088,
"logps/chosen": -43.08287048339844,
"logps/rejected": -55.4886474609375,
"loss": 0.2118,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": -0.22590351104736328,
"rewards/margins": 3.3553009033203125,
"rewards/rejected": -3.581204414367676,
"step": 442
},
{
"epoch": 1.0247467624054365,
"grad_norm": 21.10014479926061,
"learning_rate": 2.8267319708991253e-07,
"logits/chosen": 0.5570061206817627,
"logits/rejected": 0.5741885304450989,
"logps/chosen": -46.57928466796875,
"logps/rejected": -48.77629089355469,
"loss": 0.2203,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": -0.2255779653787613,
"rewards/margins": 2.8583762645721436,
"rewards/rejected": -3.083954334259033,
"step": 444
},
{
"epoch": 1.0293627388126683,
"grad_norm": 21.99323071947427,
"learning_rate": 2.806730244406612e-07,
"logits/chosen": 0.5444987416267395,
"logits/rejected": 0.5731097459793091,
"logps/chosen": -40.73080825805664,
"logps/rejected": -52.80342102050781,
"loss": 0.2407,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": -0.2986847758293152,
"rewards/margins": 3.0820257663726807,
"rewards/rejected": -3.3807103633880615,
"step": 446
},
{
"epoch": 1.0339787152199,
"grad_norm": 17.17450683483707,
"learning_rate": 2.786708563496001e-07,
"logits/chosen": 0.5541989207267761,
"logits/rejected": 0.5817456841468811,
"logps/chosen": -45.73213195800781,
"logps/rejected": -61.18666458129883,
"loss": 0.1772,
"rewards/accuracies": 0.9166666865348816,
"rewards/chosen": -0.05669987201690674,
"rewards/margins": 3.8165981769561768,
"rewards/rejected": -3.873298168182373,
"step": 448
},
{
"epoch": 1.0385946916271316,
"grad_norm": 27.653708636239905,
"learning_rate": 2.7666682306830994e-07,
"logits/chosen": 0.5207394957542419,
"logits/rejected": 0.5322983860969543,
"logps/chosen": -41.09166717529297,
"logps/rejected": -43.31468200683594,
"loss": 0.2544,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": -0.3381701707839966,
"rewards/margins": 2.6456761360168457,
"rewards/rejected": -2.9838459491729736,
"step": 450
},
{
"epoch": 1.0385946916271316,
"eval_logits/chosen": 0.43128177523612976,
"eval_logits/rejected": 0.4573296308517456,
"eval_logps/chosen": -42.16498565673828,
"eval_logps/rejected": -54.75392150878906,
"eval_loss": 0.2521970570087433,
"eval_rewards/accuracies": 0.8248847723007202,
"eval_rewards/chosen": -0.20985357463359833,
"eval_rewards/margins": 3.147939920425415,
"eval_rewards/rejected": -3.3577938079833984,
"eval_runtime": 220.2887,
"eval_samples_per_second": 7.871,
"eval_steps_per_second": 1.97,
"step": 450
},
{
"epoch": 1.0432106680343634,
"grad_norm": 26.863807248353726,
"learning_rate": 2.746610549697119e-07,
"logits/chosen": 0.5497666001319885,
"logits/rejected": 0.5746829509735107,
"logps/chosen": -42.95619583129883,
"logps/rejected": -57.17405700683594,
"loss": 0.2279,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": -0.33137860894203186,
"rewards/margins": 3.0671894550323486,
"rewards/rejected": -3.3985676765441895,
"step": 452
},
{
"epoch": 1.0478266444415951,
"grad_norm": 15.765922708965844,
"learning_rate": 2.7265368253958615e-07,
"logits/chosen": 0.5027904510498047,
"logits/rejected": 0.5187773108482361,
"logps/chosen": -40.01198959350586,
"logps/rejected": -49.16390609741211,
"loss": 0.1826,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.024355987086892128,
"rewards/margins": 3.001004219055176,
"rewards/rejected": -3.025360107421875,
"step": 454
},
{
"epoch": 1.0524426208488267,
"grad_norm": 13.117750938407347,
"learning_rate": 2.706448363680831e-07,
"logits/chosen": 0.5505272746086121,
"logits/rejected": 0.592627763748169,
"logps/chosen": -40.86323928833008,
"logps/rejected": -65.0215072631836,
"loss": 0.1182,
"rewards/accuracies": 0.9722222089767456,
"rewards/chosen": -0.19750367105007172,
"rewards/margins": 4.092833995819092,
"rewards/rejected": -4.290337562561035,
"step": 456
},
{
"epoch": 1.0570585972560584,
"grad_norm": 16.896591758231867,
"learning_rate": 2.686346471412277e-07,
"logits/chosen": 0.4872972071170807,
"logits/rejected": 0.5277370810508728,
"logps/chosen": -44.69199752807617,
"logps/rejected": -65.82919311523438,
"loss": 0.1481,
"rewards/accuracies": 0.9027777910232544,
"rewards/chosen": -0.253704696893692,
"rewards/margins": 3.8575947284698486,
"rewards/rejected": -4.111299514770508,
"step": 458
},
{
"epoch": 1.0616745736632902,
"grad_norm": 20.974972760985903,
"learning_rate": 2.6662324563241805e-07,
"logits/chosen": 0.5082690119743347,
"logits/rejected": 0.5304160118103027,
"logps/chosen": -39.70173263549805,
"logps/rejected": -50.749732971191406,
"loss": 0.218,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": -0.058096084743738174,
"rewards/margins": 2.925325632095337,
"rewards/rejected": -2.983421802520752,
"step": 460
},
{
"epoch": 1.0616745736632902,
"eval_logits/chosen": 0.42715081572532654,
"eval_logits/rejected": 0.45357510447502136,
"eval_logps/chosen": -41.917137145996094,
"eval_logps/rejected": -54.64493179321289,
"eval_loss": 0.2522634267807007,
"eval_rewards/accuracies": 0.8231566548347473,
"eval_rewards/chosen": -0.08592969179153442,
"eval_rewards/margins": 3.217369556427002,
"eval_rewards/rejected": -3.3032991886138916,
"eval_runtime": 220.2922,
"eval_samples_per_second": 7.871,
"eval_steps_per_second": 1.97,
"step": 460
},
{
"epoch": 1.0662905500705218,
"grad_norm": 14.344965515087893,
"learning_rate": 2.6461076269391713e-07,
"logits/chosen": 0.5723965167999268,
"logits/rejected": 0.6080074310302734,
"logps/chosen": -47.22536087036133,
"logps/rejected": -63.04933166503906,
"loss": 0.1633,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.08401741087436676,
"rewards/margins": 4.024357318878174,
"rewards/rejected": -4.10837459564209,
"step": 462
},
{
"epoch": 1.0709065264777535,
"grad_norm": 22.161377940303407,
"learning_rate": 2.625973292483409e-07,
"logits/chosen": 0.49575677514076233,
"logits/rejected": 0.5175695419311523,
"logps/chosen": -49.86793518066406,
"logps/rejected": -61.0032958984375,
"loss": 0.2086,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.2437991052865982,
"rewards/margins": 3.3475723266601562,
"rewards/rejected": -3.5913712978363037,
"step": 464
},
{
"epoch": 1.0755225028849853,
"grad_norm": 9.157546830395537,
"learning_rate": 2.6058307628014065e-07,
"logits/chosen": 0.5648156404495239,
"logits/rejected": 0.5903113484382629,
"logps/chosen": -47.16014099121094,
"logps/rejected": -58.00987243652344,
"loss": 0.1681,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": -0.20527897775173187,
"rewards/margins": 3.885181427001953,
"rewards/rejected": -4.090460777282715,
"step": 466
},
{
"epoch": 1.0801384792922168,
"grad_norm": 20.418800394750264,
"learning_rate": 2.5856813482708217e-07,
"logits/chosen": 0.5167273879051208,
"logits/rejected": 0.5341954827308655,
"logps/chosen": -44.03962707519531,
"logps/rejected": -48.64061737060547,
"loss": 0.205,
"rewards/accuracies": 0.9166666865348816,
"rewards/chosen": 0.06323742121458054,
"rewards/margins": 3.104510545730591,
"rewards/rejected": -3.041273355484009,
"step": 468
},
{
"epoch": 1.0847544556994486,
"grad_norm": 24.70628607742756,
"learning_rate": 2.565526359717206e-07,
"logits/chosen": 0.537581205368042,
"logits/rejected": 0.5596475005149841,
"logps/chosen": -37.46675109863281,
"logps/rejected": -45.9968147277832,
"loss": 0.3005,
"rewards/accuracies": 0.8194444179534912,
"rewards/chosen": -0.24194829165935516,
"rewards/margins": 2.6193909645080566,
"rewards/rejected": -2.8613390922546387,
"step": 470
},
{
"epoch": 1.0847544556994486,
"eval_logits/chosen": 0.4362466037273407,
"eval_logits/rejected": 0.4623866379261017,
"eval_logps/chosen": -42.15773010253906,
"eval_logps/rejected": -54.935401916503906,
"eval_loss": 0.24963192641735077,
"eval_rewards/accuracies": 0.8260368704795837,
"eval_rewards/chosen": -0.20622780919075012,
"eval_rewards/margins": 3.242306709289551,
"eval_rewards/rejected": -3.4485342502593994,
"eval_runtime": 220.4037,
"eval_samples_per_second": 7.867,
"eval_steps_per_second": 1.969,
"step": 470
},
{
"epoch": 1.0893704321066804,
"grad_norm": 27.430779359112005,
"learning_rate": 2.545367108328731e-07,
"logits/chosen": 0.5652859807014465,
"logits/rejected": 0.591205358505249,
"logps/chosen": -43.71979904174805,
"logps/rejected": -53.00830841064453,
"loss": 0.2156,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": -0.20083469152450562,
"rewards/margins": 3.2087488174438477,
"rewards/rejected": -3.409583330154419,
"step": 472
},
{
"epoch": 1.0939864085139122,
"grad_norm": 13.134510140867176,
"learning_rate": 2.525204905570889e-07,
"logits/chosen": 0.5791910290718079,
"logits/rejected": 0.6038353443145752,
"logps/chosen": -46.998390197753906,
"logps/rejected": -59.18220520019531,
"loss": 0.1707,
"rewards/accuracies": 0.9166666865348816,
"rewards/chosen": -0.05355483293533325,
"rewards/margins": 3.5535666942596436,
"rewards/rejected": -3.607121706008911,
"step": 474
},
{
"epoch": 1.0986023849211437,
"grad_norm": 19.90392742325827,
"learning_rate": 2.505041063101171e-07,
"logits/chosen": 0.5816848278045654,
"logits/rejected": 0.6008831858634949,
"logps/chosen": -47.19880676269531,
"logps/rejected": -51.822105407714844,
"loss": 0.2218,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.03883904218673706,
"rewards/margins": 3.348583221435547,
"rewards/rejected": -3.309744358062744,
"step": 476
},
{
"epoch": 1.1032183613283755,
"grad_norm": 17.00116980477646,
"learning_rate": 2.4848768926837466e-07,
"logits/chosen": 0.5338962078094482,
"logits/rejected": 0.5906614065170288,
"logps/chosen": -40.04157257080078,
"logps/rejected": -76.84749603271484,
"loss": 0.1893,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.1601162701845169,
"rewards/margins": 4.218037128448486,
"rewards/rejected": -4.378152847290039,
"step": 478
},
{
"epoch": 1.107834337735607,
"grad_norm": 15.038557815597683,
"learning_rate": 2.464713706104113e-07,
"logits/chosen": 0.5352125763893127,
"logits/rejected": 0.5612537264823914,
"logps/chosen": -43.91660690307617,
"logps/rejected": -56.44979476928711,
"loss": 0.1633,
"rewards/accuracies": 0.9305555820465088,
"rewards/chosen": -0.2793487310409546,
"rewards/margins": 3.6175765991210938,
"rewards/rejected": -3.896925210952759,
"step": 480
},
{
"epoch": 1.107834337735607,
"eval_logits/chosen": 0.43004509806632996,
"eval_logits/rejected": 0.4563468098640442,
"eval_logps/chosen": -42.171958923339844,
"eval_logps/rejected": -54.986507415771484,
"eval_loss": 0.24832715094089508,
"eval_rewards/accuracies": 0.8271889686584473,
"eval_rewards/chosen": -0.21334028244018555,
"eval_rewards/margins": 3.2607483863830566,
"eval_rewards/rejected": -3.474088668823242,
"eval_runtime": 220.2251,
"eval_samples_per_second": 7.874,
"eval_steps_per_second": 1.971,
"step": 480
},
{
"epoch": 1.1124503141428388,
"grad_norm": 22.9744657106464,
"learning_rate": 2.444552815083767e-07,
"logits/chosen": 0.6254298686981201,
"logits/rejected": 0.6373676061630249,
"logps/chosen": -42.673282623291016,
"logps/rejected": -45.563087463378906,
"loss": 0.2114,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": -0.057508740574121475,
"rewards/margins": 3.0235791206359863,
"rewards/rejected": -3.081087350845337,
"step": 482
},
{
"epoch": 1.1170662905500706,
"grad_norm": 17.674691508042564,
"learning_rate": 2.4243955311948693e-07,
"logits/chosen": 0.5245480537414551,
"logits/rejected": 0.5648095011711121,
"logps/chosen": -39.3298225402832,
"logps/rejected": -61.31127166748047,
"loss": 0.2236,
"rewards/accuracies": 0.9027777910232544,
"rewards/chosen": -0.1908557116985321,
"rewards/margins": 3.677870512008667,
"rewards/rejected": -3.8687260150909424,
"step": 484
},
{
"epoch": 1.1216822669573023,
"grad_norm": 19.4717194397301,
"learning_rate": 2.4042431657749115e-07,
"logits/chosen": 0.585620105266571,
"logits/rejected": 0.6345695853233337,
"logps/chosen": -41.645267486572266,
"logps/rejected": -72.78955078125,
"loss": 0.1703,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": -0.20467931032180786,
"rewards/margins": 4.08174991607666,
"rewards/rejected": -4.286429405212402,
"step": 486
},
{
"epoch": 1.1262982433645339,
"grad_norm": 30.909727917565508,
"learning_rate": 2.384097029841419e-07,
"logits/chosen": 0.4901224672794342,
"logits/rejected": 0.5071887969970703,
"logps/chosen": -43.30605697631836,
"logps/rejected": -50.992618560791016,
"loss": 0.2185,
"rewards/accuracies": 0.9166666865348816,
"rewards/chosen": -0.18728405237197876,
"rewards/margins": 2.9479784965515137,
"rewards/rejected": -3.1352624893188477,
"step": 488
},
{
"epoch": 1.1309142197717656,
"grad_norm": 16.93415094151409,
"learning_rate": 2.3639584340066544e-07,
"logits/chosen": 0.5211553573608398,
"logits/rejected": 0.5518543124198914,
"logps/chosen": -37.83938980102539,
"logps/rejected": -53.91053009033203,
"loss": 0.234,
"rewards/accuracies": 0.7916666865348816,
"rewards/chosen": 0.05988183990120888,
"rewards/margins": 3.5345206260681152,
"rewards/rejected": -3.4746387004852295,
"step": 490
},
{
"epoch": 1.1309142197717656,
"eval_logits/chosen": 0.43326738476753235,
"eval_logits/rejected": 0.45958051085472107,
"eval_logps/chosen": -41.84520721435547,
"eval_logps/rejected": -54.6281852722168,
"eval_loss": 0.24792973697185516,
"eval_rewards/accuracies": 0.8220046162605286,
"eval_rewards/chosen": -0.04996471852064133,
"eval_rewards/margins": 3.244964361190796,
"eval_rewards/rejected": -3.294929265975952,
"eval_runtime": 220.3046,
"eval_samples_per_second": 7.871,
"eval_steps_per_second": 1.97,
"step": 490
},
{
"epoch": 1.1355301961789972,
"grad_norm": 16.790260075155444,
"learning_rate": 2.3438286883923539e-07,
"logits/chosen": 0.5881079435348511,
"logits/rejected": 0.6105315685272217,
"logps/chosen": -46.794837951660156,
"logps/rejected": -53.43986511230469,
"loss": 0.2269,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": 0.08306831121444702,
"rewards/margins": 3.1719002723693848,
"rewards/rejected": -3.088831663131714,
"step": 492
},
{
"epoch": 1.140146172586229,
"grad_norm": 22.957641710400285,
"learning_rate": 2.323709102544506e-07,
"logits/chosen": 0.6002509593963623,
"logits/rejected": 0.6072889566421509,
"logps/chosen": -39.66600036621094,
"logps/rejected": -41.07653045654297,
"loss": 0.2857,
"rewards/accuracies": 0.8055555820465088,
"rewards/chosen": 0.20397840440273285,
"rewards/margins": 2.4769766330718994,
"rewards/rejected": -2.272998094558716,
"step": 494
},
{
"epoch": 1.1447621489934607,
"grad_norm": 27.504424003065566,
"learning_rate": 2.3036009853481474e-07,
"logits/chosen": 0.5301830768585205,
"logits/rejected": 0.5608452558517456,
"logps/chosen": -39.39542770385742,
"logps/rejected": -58.36659622192383,
"loss": 0.2681,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": -0.2189822793006897,
"rewards/margins": 3.4378933906555176,
"rewards/rejected": -3.6568756103515625,
"step": 496
},
{
"epoch": 1.1493781254006925,
"grad_norm": 16.835368907101664,
"learning_rate": 2.283505644942223e-07,
"logits/chosen": 0.5190525054931641,
"logits/rejected": 0.5493537783622742,
"logps/chosen": -34.43808364868164,
"logps/rejected": -54.84063720703125,
"loss": 0.1937,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": 0.13352231681346893,
"rewards/margins": 3.440141201019287,
"rewards/rejected": -3.3066186904907227,
"step": 498
},
{
"epoch": 1.153994101807924,
"grad_norm": 14.320814422051418,
"learning_rate": 2.2634243886344781e-07,
"logits/chosen": 0.5132643580436707,
"logits/rejected": 0.5381724834442139,
"logps/chosen": -41.94618225097656,
"logps/rejected": -54.74879455566406,
"loss": 0.243,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": 0.1846380978822708,
"rewards/margins": 3.523959159851074,
"rewards/rejected": -3.3393211364746094,
"step": 500
},
{
"epoch": 1.153994101807924,
"eval_logits/chosen": 0.43241602182388306,
"eval_logits/rejected": 0.45862025022506714,
"eval_logps/chosen": -41.512245178222656,
"eval_logps/rejected": -54.365325927734375,
"eval_loss": 0.24479356408119202,
"eval_rewards/accuracies": 0.8289170265197754,
"eval_rewards/chosen": 0.11651827394962311,
"eval_rewards/margins": 3.2800135612487793,
"eval_rewards/rejected": -3.1634950637817383,
"eval_runtime": 220.3257,
"eval_samples_per_second": 7.87,
"eval_steps_per_second": 1.97,
"step": 500
},
{
"epoch": 1.1586100782151558,
"grad_norm": 17.24901468893502,
"learning_rate": 2.2433585228164115e-07,
"logits/chosen": 0.5386977791786194,
"logits/rejected": 0.5774834156036377,
"logps/chosen": -43.753910064697266,
"logps/rejected": -65.60494232177734,
"loss": 0.1918,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": 0.19071653485298157,
"rewards/margins": 4.159061908721924,
"rewards/rejected": -3.9683446884155273,
"step": 502
},
{
"epoch": 1.1632260546223874,
"grad_norm": 22.994462305856853,
"learning_rate": 2.2233093528782938e-07,
"logits/chosen": 0.5429908037185669,
"logits/rejected": 0.5663915872573853,
"logps/chosen": -49.295047760009766,
"logps/rejected": -58.83778381347656,
"loss": 0.1741,
"rewards/accuracies": 0.9027777910232544,
"rewards/chosen": 0.27108439803123474,
"rewards/margins": 3.4974775314331055,
"rewards/rejected": -3.226392984390259,
"step": 504
},
{
"epoch": 1.1678420310296191,
"grad_norm": 19.749474882703815,
"learning_rate": 2.2032781831242367e-07,
"logits/chosen": 0.5360143184661865,
"logits/rejected": 0.5641200542449951,
"logps/chosen": -35.82609558105469,
"logps/rejected": -44.779361724853516,
"loss": 0.2253,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.4115668535232544,
"rewards/margins": 2.9376118183135986,
"rewards/rejected": -2.526045083999634,
"step": 506
},
{
"epoch": 1.172458007436851,
"grad_norm": 29.881557534524536,
"learning_rate": 2.183266316687347e-07,
"logits/chosen": 0.5799429416656494,
"logits/rejected": 0.5963388681411743,
"logps/chosen": -42.11252975463867,
"logps/rejected": -44.56486511230469,
"loss": 0.2905,
"rewards/accuracies": 0.7916666865348816,
"rewards/chosen": 0.2770005762577057,
"rewards/margins": 2.54060435295105,
"rewards/rejected": -2.263603687286377,
"step": 508
},
{
"epoch": 1.1770739838440827,
"grad_norm": 11.72889590765659,
"learning_rate": 2.16327505544495e-07,
"logits/chosen": 0.5231108069419861,
"logits/rejected": 0.5499060153961182,
"logps/chosen": -43.436798095703125,
"logps/rejected": -57.92034912109375,
"loss": 0.1472,
"rewards/accuracies": 0.9166666865348816,
"rewards/chosen": 0.47280406951904297,
"rewards/margins": 4.098244667053223,
"rewards/rejected": -3.625440835952759,
"step": 510
},
{
"epoch": 1.1770739838440827,
"eval_logits/chosen": 0.43323588371276855,
"eval_logits/rejected": 0.4594508111476898,
"eval_logps/chosen": -41.14154815673828,
"eval_logps/rejected": -54.075172424316406,
"eval_loss": 0.247583270072937,
"eval_rewards/accuracies": 0.828341007232666,
"eval_rewards/chosen": 0.30186572670936584,
"eval_rewards/margins": 3.3202853202819824,
"eval_rewards/rejected": -3.0184197425842285,
"eval_runtime": 220.3645,
"eval_samples_per_second": 7.869,
"eval_steps_per_second": 1.969,
"step": 510
},
{
"epoch": 1.1816899602513142,
"grad_norm": 19.02915371887465,
"learning_rate": 2.143305699933892e-07,
"logits/chosen": 0.5309435725212097,
"logits/rejected": 0.5609121322631836,
"logps/chosen": -39.10821533203125,
"logps/rejected": -55.85133743286133,
"loss": 0.2148,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": 0.35923802852630615,
"rewards/margins": 3.6412789821624756,
"rewards/rejected": -3.282041549682617,
"step": 512
},
{
"epoch": 1.186305936658546,
"grad_norm": 18.184730820886717,
"learning_rate": 2.1233595492659382e-07,
"logits/chosen": 0.6312618851661682,
"logits/rejected": 0.6453579068183899,
"logps/chosen": -48.93413543701172,
"logps/rejected": -50.58020782470703,
"loss": 0.1701,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": 0.28959882259368896,
"rewards/margins": 3.4992854595184326,
"rewards/rejected": -3.209686040878296,
"step": 514
},
{
"epoch": 1.1909219130657775,
"grad_norm": 21.115621604290848,
"learning_rate": 2.1034379010432542e-07,
"logits/chosen": 0.5738712549209595,
"logits/rejected": 0.5990296006202698,
"logps/chosen": -36.4149055480957,
"logps/rejected": -47.95274353027344,
"loss": 0.2192,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": 0.35762646794319153,
"rewards/margins": 3.1450395584106445,
"rewards/rejected": -2.7874133586883545,
"step": 516
},
{
"epoch": 1.1955378894730093,
"grad_norm": 18.313049973835163,
"learning_rate": 2.0835420512739957e-07,
"logits/chosen": 0.48849010467529297,
"logits/rejected": 0.5418619513511658,
"logps/chosen": -39.52627182006836,
"logps/rejected": -70.53701782226562,
"loss": 0.1678,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": 0.39579084515571594,
"rewards/margins": 4.528857231140137,
"rewards/rejected": -4.133066654205322,
"step": 518
},
{
"epoch": 1.200153865880241,
"grad_norm": 18.512425100692376,
"learning_rate": 2.0636732942879917e-07,
"logits/chosen": 0.5643823146820068,
"logits/rejected": 0.5917804837226868,
"logps/chosen": -43.44633483886719,
"logps/rejected": -56.26163101196289,
"loss": 0.166,
"rewards/accuracies": 0.9027777910232544,
"rewards/chosen": 0.33819130063056946,
"rewards/margins": 3.693488121032715,
"rewards/rejected": -3.3552963733673096,
"step": 520
},
{
"epoch": 1.200153865880241,
"eval_logits/chosen": 0.4335879981517792,
"eval_logits/rejected": 0.45994046330451965,
"eval_logps/chosen": -41.402774810791016,
"eval_logps/rejected": -54.35234451293945,
"eval_loss": 0.2449788749217987,
"eval_rewards/accuracies": 0.8317972421646118,
"eval_rewards/chosen": 0.1712525486946106,
"eval_rewards/margins": 3.328258991241455,
"eval_rewards/rejected": -3.1570065021514893,
"eval_runtime": 220.2998,
"eval_samples_per_second": 7.871,
"eval_steps_per_second": 1.97,
"step": 520
},
{
"epoch": 1.2047698422874729,
"grad_norm": 11.696545134195471,
"learning_rate": 2.0438329226525415e-07,
"logits/chosen": 0.5642399787902832,
"logits/rejected": 0.587860643863678,
"logps/chosen": -41.212337493896484,
"logps/rejected": -43.521636962890625,
"loss": 0.2246,
"rewards/accuracies": 0.8194444179534912,
"rewards/chosen": 0.5518161058425903,
"rewards/margins": 2.9677634239196777,
"rewards/rejected": -2.415947675704956,
"step": 522
},
{
"epoch": 1.2093858186947044,
"grad_norm": 24.196902238001236,
"learning_rate": 2.0240222270883288e-07,
"logits/chosen": 0.5227870941162109,
"logits/rejected": 0.5579611659049988,
"logps/chosen": -44.49864196777344,
"logps/rejected": -64.84123229980469,
"loss": 0.2314,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": 0.16053809225559235,
"rewards/margins": 3.896054267883301,
"rewards/rejected": -3.73551607131958,
"step": 524
},
{
"epoch": 1.2140017951019362,
"grad_norm": 12.971615376216704,
"learning_rate": 2.0042424963854542e-07,
"logits/chosen": 0.5063973665237427,
"logits/rejected": 0.5544097423553467,
"logps/chosen": -40.40736389160156,
"logps/rejected": -70.9152603149414,
"loss": 0.1526,
"rewards/accuracies": 0.9444444179534912,
"rewards/chosen": 0.3248124122619629,
"rewards/margins": 4.234506607055664,
"rewards/rejected": -3.9096946716308594,
"step": 526
},
{
"epoch": 1.2186177715091677,
"grad_norm": 14.0866861852398,
"learning_rate": 1.9844950173195883e-07,
"logits/chosen": 0.5182596445083618,
"logits/rejected": 0.549498975276947,
"logps/chosen": -39.39563751220703,
"logps/rejected": -54.05485153198242,
"loss": 0.1818,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": 0.22824376821517944,
"rewards/margins": 3.397740364074707,
"rewards/rejected": -3.169497013092041,
"step": 528
},
{
"epoch": 1.2232337479163995,
"grad_norm": 13.76916365285817,
"learning_rate": 1.964781074568265e-07,
"logits/chosen": 0.5031299591064453,
"logits/rejected": 0.5121724009513855,
"logps/chosen": -41.18108367919922,
"logps/rejected": -45.627994537353516,
"loss": 0.1945,
"rewards/accuracies": 0.9027777910232544,
"rewards/chosen": 0.03019801713526249,
"rewards/margins": 3.0934128761291504,
"rewards/rejected": -3.0632145404815674,
"step": 530
},
{
"epoch": 1.2232337479163995,
"eval_logits/chosen": 0.43405523896217346,
"eval_logits/rejected": 0.46039465069770813,
"eval_logps/chosen": -41.60369110107422,
"eval_logps/rejected": -54.51262664794922,
"eval_loss": 0.24258121848106384,
"eval_rewards/accuracies": 0.8335253596305847,
"eval_rewards/chosen": 0.07079467922449112,
"eval_rewards/margins": 3.3079416751861572,
"eval_rewards/rejected": -3.2371468544006348,
"eval_runtime": 220.2641,
"eval_samples_per_second": 7.872,
"eval_steps_per_second": 1.97,
"step": 530
},
{
"epoch": 1.2278497243236313,
"grad_norm": 16.411903473780164,
"learning_rate": 1.9451019506273018e-07,
"logits/chosen": 0.541588306427002,
"logits/rejected": 0.5615941286087036,
"logps/chosen": -36.563297271728516,
"logps/rejected": -48.32072448730469,
"loss": 0.2351,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": 0.17822687327861786,
"rewards/margins": 2.845065116882324,
"rewards/rejected": -2.6668384075164795,
"step": 532
},
{
"epoch": 1.232465700730863,
"grad_norm": 13.467269631637619,
"learning_rate": 1.9254589257273712e-07,
"logits/chosen": 0.5137292146682739,
"logits/rejected": 0.5505712032318115,
"logps/chosen": -36.598384857177734,
"logps/rejected": -57.48229217529297,
"loss": 0.1473,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": 0.19568167626857758,
"rewards/margins": 4.128161907196045,
"rewards/rejected": -3.9324798583984375,
"step": 534
},
{
"epoch": 1.2370816771380946,
"grad_norm": 24.645788661655104,
"learning_rate": 1.9058532777507141e-07,
"logits/chosen": 0.5294635891914368,
"logits/rejected": 0.5472697615623474,
"logps/chosen": -39.22220230102539,
"logps/rejected": -49.91395950317383,
"loss": 0.2172,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": 0.11992057412862778,
"rewards/margins": 3.224815845489502,
"rewards/rejected": -3.1048953533172607,
"step": 536
},
{
"epoch": 1.2416976535453264,
"grad_norm": 18.291984511184836,
"learning_rate": 1.886286282148002e-07,
"logits/chosen": 0.5298857688903809,
"logits/rejected": 0.5633623600006104,
"logps/chosen": -41.294647216796875,
"logps/rejected": -57.79304885864258,
"loss": 0.2731,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": -0.11145944148302078,
"rewards/margins": 3.1801443099975586,
"rewards/rejected": -3.2916040420532227,
"step": 538
},
{
"epoch": 1.246313629952558,
"grad_norm": 17.71916747448851,
"learning_rate": 1.8667592118553693e-07,
"logits/chosen": 0.5349301099777222,
"logits/rejected": 0.5512058734893799,
"logps/chosen": -43.72676467895508,
"logps/rejected": -52.80296325683594,
"loss": 0.2216,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.03689540922641754,
"rewards/margins": 3.2271673679351807,
"rewards/rejected": -3.2640628814697266,
"step": 540
},
{
"epoch": 1.246313629952558,
"eval_logits/chosen": 0.4325529932975769,
"eval_logits/rejected": 0.45892781019210815,
"eval_logps/chosen": -41.67875289916992,
"eval_logps/rejected": -54.59620666503906,
"eval_loss": 0.24205271899700165,
"eval_rewards/accuracies": 0.8277649879455566,
"eval_rewards/chosen": 0.03326287120580673,
"eval_rewards/margins": 3.312199115753174,
"eval_rewards/rejected": -3.2789359092712402,
"eval_runtime": 220.1774,
"eval_samples_per_second": 7.875,
"eval_steps_per_second": 1.971,
"step": 540
},
{
"epoch": 1.2509296063597897,
"grad_norm": 15.1063531754732,
"learning_rate": 1.8472733372115956e-07,
"logits/chosen": 0.4958040416240692,
"logits/rejected": 0.5259097814559937,
"logps/chosen": -43.43186950683594,
"logps/rejected": -60.27039337158203,
"loss": 0.1823,
"rewards/accuracies": 0.9305555820465088,
"rewards/chosen": -0.40173831582069397,
"rewards/margins": 3.8025894165039062,
"rewards/rejected": -4.2043280601501465,
"step": 542
},
{
"epoch": 1.2555455827670214,
"grad_norm": 23.60965925798032,
"learning_rate": 1.8278299258754692e-07,
"logits/chosen": 0.47050708532333374,
"logits/rejected": 0.5154716968536377,
"logps/chosen": -43.42805480957031,
"logps/rejected": -71.56327056884766,
"loss": 0.2284,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.35217729210853577,
"rewards/margins": 4.311697483062744,
"rewards/rejected": -4.663875102996826,
"step": 544
},
{
"epoch": 1.2601615591742532,
"grad_norm": 11.785150141913245,
"learning_rate": 1.808430242743316e-07,
"logits/chosen": 0.46195343136787415,
"logits/rejected": 0.4784909784793854,
"logps/chosen": -42.974945068359375,
"logps/rejected": -54.21615219116211,
"loss": 0.1867,
"rewards/accuracies": 0.9166666865348816,
"rewards/chosen": 0.31646448373794556,
"rewards/margins": 3.5641021728515625,
"rewards/rejected": -3.2476377487182617,
"step": 546
},
{
"epoch": 1.2647775355814848,
"grad_norm": 13.346160813344762,
"learning_rate": 1.7890755498667104e-07,
"logits/chosen": 0.5626040101051331,
"logits/rejected": 0.5980097651481628,
"logps/chosen": -36.59039306640625,
"logps/rejected": -55.57601547241211,
"loss": 0.182,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": 0.17459021508693695,
"rewards/margins": 3.451416015625,
"rewards/rejected": -3.2768259048461914,
"step": 548
},
{
"epoch": 1.2693935119887165,
"grad_norm": 25.621843956328824,
"learning_rate": 1.7697671063703756e-07,
"logits/chosen": 0.5085393786430359,
"logits/rejected": 0.5440909266471863,
"logps/chosen": -39.27238464355469,
"logps/rejected": -59.40525817871094,
"loss": 0.2243,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": 0.011964095756411552,
"rewards/margins": 3.6004185676574707,
"rewards/rejected": -3.588454246520996,
"step": 550
},
{
"epoch": 1.2693935119887165,
"eval_logits/chosen": 0.4355390965938568,
"eval_logits/rejected": 0.46181005239486694,
"eval_logps/chosen": -41.701602935791016,
"eval_logps/rejected": -54.663360595703125,
"eval_loss": 0.24010230600833893,
"eval_rewards/accuracies": 0.8260368704795837,
"eval_rewards/chosen": 0.0218377523124218,
"eval_rewards/margins": 3.3343515396118164,
"eval_rewards/rejected": -3.312513828277588,
"eval_runtime": 220.234,
"eval_samples_per_second": 7.873,
"eval_steps_per_second": 1.971,
"step": 550
},
{
"epoch": 1.274009488395948,
"grad_norm": 29.85339571581757,
"learning_rate": 1.750506168370267e-07,
"logits/chosen": 0.5484946370124817,
"logits/rejected": 0.5642725229263306,
"logps/chosen": -40.738338470458984,
"logps/rejected": -47.2222900390625,
"loss": 0.2665,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": 0.49148619174957275,
"rewards/margins": 3.0378835201263428,
"rewards/rejected": -2.5463972091674805,
"step": 552
},
{
"epoch": 1.2786254648031798,
"grad_norm": 11.606234417897845,
"learning_rate": 1.7312939888918594e-07,
"logits/chosen": 0.5540368556976318,
"logits/rejected": 0.5830137729644775,
"logps/chosen": -43.42100143432617,
"logps/rejected": -63.07583999633789,
"loss": 0.1529,
"rewards/accuracies": 0.9305555820465088,
"rewards/chosen": 0.060752179473638535,
"rewards/margins": 3.951368570327759,
"rewards/rejected": -3.8906164169311523,
"step": 554
},
{
"epoch": 1.2832414412104116,
"grad_norm": 8.195981315855988,
"learning_rate": 1.712131817788628e-07,
"logits/chosen": 0.5598903298377991,
"logits/rejected": 0.582931637763977,
"logps/chosen": -39.05931854248047,
"logps/rejected": -49.5858154296875,
"loss": 0.2278,
"rewards/accuracies": 0.8194444179534912,
"rewards/chosen": -0.035774629563093185,
"rewards/margins": 3.2900662422180176,
"rewards/rejected": -3.325840950012207,
"step": 556
},
{
"epoch": 1.2878574176176434,
"grad_norm": 10.58953396876903,
"learning_rate": 1.693020901660738e-07,
"logits/chosen": 0.5586022138595581,
"logits/rejected": 0.5835521221160889,
"logps/chosen": -46.566070556640625,
"logps/rejected": -56.1746940612793,
"loss": 0.1347,
"rewards/accuracies": 0.9583333134651184,
"rewards/chosen": 0.1323520541191101,
"rewards/margins": 3.951080322265625,
"rewards/rejected": -3.81872820854187,
"step": 558
},
{
"epoch": 1.292473394024875,
"grad_norm": 20.647672350132265,
"learning_rate": 1.6739624837739518e-07,
"logits/chosen": 0.4893258512020111,
"logits/rejected": 0.5065658092498779,
"logps/chosen": -46.70867919921875,
"logps/rejected": -53.02800369262695,
"loss": 0.2073,
"rewards/accuracies": 0.9166666865348816,
"rewards/chosen": -0.15436476469039917,
"rewards/margins": 3.050819158554077,
"rewards/rejected": -3.205183744430542,
"step": 560
},
{
"epoch": 1.292473394024875,
"eval_logits/chosen": 0.4335208237171173,
"eval_logits/rejected": 0.45989227294921875,
"eval_logps/chosen": -41.82432556152344,
"eval_logps/rejected": -54.859825134277344,
"eval_loss": 0.23924875259399414,
"eval_rewards/accuracies": 0.8312212228775024,
"eval_rewards/chosen": -0.03952277451753616,
"eval_rewards/margins": 3.371224880218506,
"eval_rewards/rejected": -3.410747766494751,
"eval_runtime": 220.3082,
"eval_samples_per_second": 7.871,
"eval_steps_per_second": 1.97,
"step": 560
},
{
"epoch": 1.2970893704321067,
"grad_norm": 15.328848187023517,
"learning_rate": 1.6549578039787434e-07,
"logits/chosen": 0.5223647356033325,
"logits/rejected": 0.5576710104942322,
"logps/chosen": -43.448875427246094,
"logps/rejected": -67.14339447021484,
"loss": 0.2405,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": -0.1683083474636078,
"rewards/margins": 3.6626782417297363,
"rewards/rejected": -3.830986499786377,
"step": 562
},
{
"epoch": 1.3017053468393383,
"grad_norm": 14.362719389125761,
"learning_rate": 1.6360080986296384e-07,
"logits/chosen": 0.5163556337356567,
"logits/rejected": 0.5569749474525452,
"logps/chosen": -37.78327941894531,
"logps/rejected": -64.23339080810547,
"loss": 0.186,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": -0.047993943095207214,
"rewards/margins": 4.109629154205322,
"rewards/rejected": -4.157623291015625,
"step": 564
},
{
"epoch": 1.30632132324657,
"grad_norm": 8.849930925918736,
"learning_rate": 1.6171146005047894e-07,
"logits/chosen": 0.5622715353965759,
"logits/rejected": 0.5891626477241516,
"logps/chosen": -46.50107955932617,
"logps/rejected": -63.37003707885742,
"loss": 0.1689,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": 0.1092919185757637,
"rewards/margins": 4.0769548416137695,
"rewards/rejected": -3.967662811279297,
"step": 566
},
{
"epoch": 1.3109372996538018,
"grad_norm": 16.110148125770678,
"learning_rate": 1.5982785387257694e-07,
"logits/chosen": 0.5649956464767456,
"logits/rejected": 0.5782197117805481,
"logps/chosen": -43.4311408996582,
"logps/rejected": -49.03315734863281,
"loss": 0.2002,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": -0.15342091023921967,
"rewards/margins": 2.909942150115967,
"rewards/rejected": -3.0633630752563477,
"step": 568
},
{
"epoch": 1.3155532760610336,
"grad_norm": 23.725153045927403,
"learning_rate": 1.5795011386776159e-07,
"logits/chosen": 0.5103439688682556,
"logits/rejected": 0.5300507545471191,
"logps/chosen": -42.80021667480469,
"logps/rejected": -47.7119255065918,
"loss": 0.2255,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.14677530527114868,
"rewards/margins": 3.0557618141174316,
"rewards/rejected": -3.2025370597839355,
"step": 570
},
{
"epoch": 1.3155532760610336,
"eval_logits/chosen": 0.43335986137390137,
"eval_logits/rejected": 0.4598417580127716,
"eval_logps/chosen": -41.851680755615234,
"eval_logps/rejected": -54.97309112548828,
"eval_loss": 0.23906731605529785,
"eval_rewards/accuracies": 0.835829496383667,
"eval_rewards/chosen": -0.05320117622613907,
"eval_rewards/margins": 3.4141783714294434,
"eval_rewards/rejected": -3.467379570007324,
"eval_runtime": 220.3588,
"eval_samples_per_second": 7.869,
"eval_steps_per_second": 1.97,
"step": 570
},
{
"epoch": 1.320169252468265,
"grad_norm": 16.172756609459842,
"learning_rate": 1.560783621929113e-07,
"logits/chosen": 0.5175637006759644,
"logits/rejected": 0.5229324102401733,
"logps/chosen": -49.446102142333984,
"logps/rejected": -55.164894104003906,
"loss": 0.1869,
"rewards/accuracies": 0.9166666865348816,
"rewards/chosen": 0.08015252649784088,
"rewards/margins": 3.3609066009521484,
"rewards/rejected": -3.2807538509368896,
"step": 572
},
{
"epoch": 1.3247852288754969,
"grad_norm": 24.72268513177688,
"learning_rate": 1.5421272061533177e-07,
"logits/chosen": 0.5066720247268677,
"logits/rejected": 0.5451788306236267,
"logps/chosen": -37.343570709228516,
"logps/rejected": -60.23046112060547,
"loss": 0.2949,
"rewards/accuracies": 0.7777777910232544,
"rewards/chosen": 0.1486133188009262,
"rewards/margins": 3.3898818492889404,
"rewards/rejected": -3.2412681579589844,
"step": 574
},
{
"epoch": 1.3294012052827284,
"grad_norm": 18.734543272703554,
"learning_rate": 1.5235331050483513e-07,
"logits/chosen": 0.5524860620498657,
"logits/rejected": 0.5772072672843933,
"logps/chosen": -43.33749771118164,
"logps/rejected": -56.5976676940918,
"loss": 0.2367,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": -0.3428743779659271,
"rewards/margins": 3.3112895488739014,
"rewards/rejected": -3.6541638374328613,
"step": 576
},
{
"epoch": 1.3340171816899602,
"grad_norm": 15.636365920242639,
"learning_rate": 1.5050025282584327e-07,
"logits/chosen": 0.5805926322937012,
"logits/rejected": 0.6090676188468933,
"logps/chosen": -49.13417434692383,
"logps/rejected": -64.1076431274414,
"loss": 0.1791,
"rewards/accuracies": 0.9305555820465088,
"rewards/chosen": -0.08167193830013275,
"rewards/margins": 3.955726146697998,
"rewards/rejected": -4.037397861480713,
"step": 578
},
{
"epoch": 1.338633158097192,
"grad_norm": 15.524132351808905,
"learning_rate": 1.4865366812951921e-07,
"logits/chosen": 0.598872721195221,
"logits/rejected": 0.62497878074646,
"logps/chosen": -36.58146667480469,
"logps/rejected": -46.25484085083008,
"loss": 0.1893,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": -0.01747778430581093,
"rewards/margins": 3.4903595447540283,
"rewards/rejected": -3.5078377723693848,
"step": 580
},
{
"epoch": 1.338633158097192,
"eval_logits/chosen": 0.4342789053916931,
"eval_logits/rejected": 0.46078288555145264,
"eval_logps/chosen": -42.1205940246582,
"eval_logps/rejected": -55.25835418701172,
"eval_loss": 0.2389531433582306,
"eval_rewards/accuracies": 0.8352534770965576,
"eval_rewards/chosen": -0.18765874207019806,
"eval_rewards/margins": 3.4223523139953613,
"eval_rewards/rejected": -3.610011339187622,
"eval_runtime": 220.361,
"eval_samples_per_second": 7.869,
"eval_steps_per_second": 1.969,
"step": 580
},
{
"epoch": 1.3432491345044237,
"grad_norm": 22.418640332294185,
"learning_rate": 1.4681367654592446e-07,
"logits/chosen": 0.583182692527771,
"logits/rejected": 0.596510112285614,
"logps/chosen": -45.08745574951172,
"logps/rejected": -52.57502746582031,
"loss": 0.1635,
"rewards/accuracies": 0.9166666865348816,
"rewards/chosen": -0.24202129244804382,
"rewards/margins": 3.0601682662963867,
"rewards/rejected": -3.302189826965332,
"step": 582
},
{
"epoch": 1.3478651109116553,
"grad_norm": 16.477398466397805,
"learning_rate": 1.4498039777620353e-07,
"logits/chosen": 0.5257098078727722,
"logits/rejected": 0.5561378598213196,
"logps/chosen": -49.92831039428711,
"logps/rejected": -66.70814514160156,
"loss": 0.1983,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": 0.07970259338617325,
"rewards/margins": 4.159069538116455,
"rewards/rejected": -4.079366683959961,
"step": 584
},
{
"epoch": 1.352481087318887,
"grad_norm": 21.638282072644653,
"learning_rate": 1.4315395108479728e-07,
"logits/chosen": 0.5448426008224487,
"logits/rejected": 0.5733739733695984,
"logps/chosen": -42.567203521728516,
"logps/rejected": -59.23841094970703,
"loss": 0.1872,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": -0.3566977083683014,
"rewards/margins": 3.441741943359375,
"rewards/rejected": -3.7984399795532227,
"step": 586
},
{
"epoch": 1.3570970637261186,
"grad_norm": 22.386629994354788,
"learning_rate": 1.4133445529168365e-07,
"logits/chosen": 0.5482079982757568,
"logits/rejected": 0.5674624443054199,
"logps/chosen": -47.31834030151367,
"logps/rejected": -59.47747802734375,
"loss": 0.1735,
"rewards/accuracies": 0.9444444179534912,
"rewards/chosen": -0.25350263714790344,
"rewards/margins": 3.711785316467285,
"rewards/rejected": -3.965287923812866,
"step": 588
},
{
"epoch": 1.3617130401333504,
"grad_norm": 14.716672759245373,
"learning_rate": 1.395220287646483e-07,
"logits/chosen": 0.5413531064987183,
"logits/rejected": 0.5619943141937256,
"logps/chosen": -45.74396514892578,
"logps/rejected": -54.50990295410156,
"loss": 0.1609,
"rewards/accuracies": 0.9027777910232544,
"rewards/chosen": -0.3855374753475189,
"rewards/margins": 3.439289093017578,
"rewards/rejected": -3.82482647895813,
"step": 590
},
{
"epoch": 1.3617130401333504,
"eval_logits/chosen": 0.43462061882019043,
"eval_logits/rejected": 0.461076945066452,
"eval_logps/chosen": -42.448509216308594,
"eval_logps/rejected": -55.58904266357422,
"eval_loss": 0.2393806427717209,
"eval_rewards/accuracies": 0.8317972421646118,
"eval_rewards/chosen": -0.3516136407852173,
"eval_rewards/margins": 3.4237425327301025,
"eval_rewards/rejected": -3.7753562927246094,
"eval_runtime": 220.4141,
"eval_samples_per_second": 7.867,
"eval_steps_per_second": 1.969,
"step": 590
},
{
"epoch": 1.3663290165405821,
"grad_norm": 21.200823940085225,
"learning_rate": 1.377167894115837e-07,
"logits/chosen": 0.562565803527832,
"logits/rejected": 0.6183031797409058,
"logps/chosen": -38.32450866699219,
"logps/rejected": -68.53689575195312,
"loss": 0.179,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": -0.12118612229824066,
"rewards/margins": 4.04473876953125,
"rewards/rejected": -4.165925025939941,
"step": 592
},
{
"epoch": 1.370944992947814,
"grad_norm": 13.082922810935031,
"learning_rate": 1.3591885467281877e-07,
"logits/chosen": 0.4695725440979004,
"logits/rejected": 0.4965362548828125,
"logps/chosen": -39.13195037841797,
"logps/rejected": -58.23176574707031,
"loss": 0.1861,
"rewards/accuracies": 0.9166666865348816,
"rewards/chosen": -0.30562350153923035,
"rewards/margins": 3.781522035598755,
"rewards/rejected": -4.0871453285217285,
"step": 594
},
{
"epoch": 1.3755609693550455,
"grad_norm": 34.97692684836387,
"learning_rate": 1.3412834151347896e-07,
"logits/chosen": 0.5469548106193542,
"logits/rejected": 0.5717971324920654,
"logps/chosen": -44.02994155883789,
"logps/rejected": -57.28227996826172,
"loss": 0.2084,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": -0.3421705365180969,
"rewards/margins": 3.692906379699707,
"rewards/rejected": -4.035076141357422,
"step": 596
},
{
"epoch": 1.3801769457622772,
"grad_norm": 14.254996050777464,
"learning_rate": 1.323453664158769e-07,
"logits/chosen": 0.5193799138069153,
"logits/rejected": 0.5635771155357361,
"logps/chosen": -40.06482696533203,
"logps/rejected": -67.0745620727539,
"loss": 0.2322,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": -0.5795915126800537,
"rewards/margins": 3.6668989658355713,
"rewards/rejected": -4.246490001678467,
"step": 598
},
{
"epoch": 1.3847929221695088,
"grad_norm": 18.46063830068681,
"learning_rate": 1.3057004537193422e-07,
"logits/chosen": 0.5273723602294922,
"logits/rejected": 0.5402401685714722,
"logps/chosen": -45.491241455078125,
"logps/rejected": -53.827972412109375,
"loss": 0.185,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": -0.32591530680656433,
"rewards/margins": 3.758335590362549,
"rewards/rejected": -4.084251403808594,
"step": 600
},
{
"epoch": 1.3847929221695088,
"eval_logits/chosen": 0.4299531877040863,
"eval_logits/rejected": 0.45649805665016174,
"eval_logps/chosen": -42.37248992919922,
"eval_logps/rejected": -55.565975189208984,
"eval_loss": 0.23996217548847198,
"eval_rewards/accuracies": 0.8300691246986389,
"eval_rewards/chosen": -0.31360533833503723,
"eval_rewards/margins": 3.450216054916382,
"eval_rewards/rejected": -3.7638211250305176,
"eval_runtime": 220.4449,
"eval_samples_per_second": 7.866,
"eval_steps_per_second": 1.969,
"step": 600
},
{
"epoch": 1.3894088985767405,
"grad_norm": 24.193490725343704,
"learning_rate": 1.2880249387563662e-07,
"logits/chosen": 0.5480252504348755,
"logits/rejected": 0.5805102586746216,
"logps/chosen": -43.4918098449707,
"logps/rejected": -62.1549072265625,
"loss": 0.1713,
"rewards/accuracies": 0.9027777910232544,
"rewards/chosen": -0.4662397801876068,
"rewards/margins": 3.974961280822754,
"rewards/rejected": -4.441201210021973,
"step": 602
},
{
"epoch": 1.3940248749839723,
"grad_norm": 8.975682909766576,
"learning_rate": 1.2704282691551938e-07,
"logits/chosen": 0.45732539892196655,
"logits/rejected": 0.5041163563728333,
"logps/chosen": -40.32965850830078,
"logps/rejected": -67.52854919433594,
"loss": 0.1754,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": 0.014653444290161133,
"rewards/margins": 4.295289993286133,
"rewards/rejected": -4.280636787414551,
"step": 604
},
{
"epoch": 1.398640851391204,
"grad_norm": 27.018968489026342,
"learning_rate": 1.2529115896718714e-07,
"logits/chosen": 0.5242836475372314,
"logits/rejected": 0.5399221777915955,
"logps/chosen": -45.72035217285156,
"logps/rejected": -52.612548828125,
"loss": 0.2076,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": -0.37530067563056946,
"rewards/margins": 3.2071659564971924,
"rewards/rejected": -3.5824666023254395,
"step": 606
},
{
"epoch": 1.4032568277984356,
"grad_norm": 13.414881670063712,
"learning_rate": 1.2354760398586708e-07,
"logits/chosen": 0.5383539199829102,
"logits/rejected": 0.5773718953132629,
"logps/chosen": -48.75130081176758,
"logps/rejected": -72.36872863769531,
"loss": 0.1511,
"rewards/accuracies": 0.9166666865348816,
"rewards/chosen": -0.44930893182754517,
"rewards/margins": 4.512818336486816,
"rewards/rejected": -4.962126731872559,
"step": 608
},
{
"epoch": 1.4078728042056674,
"grad_norm": 7.330900567316457,
"learning_rate": 1.2181227539899468e-07,
"logits/chosen": 0.5381309986114502,
"logits/rejected": 0.5586973428726196,
"logps/chosen": -45.09908676147461,
"logps/rejected": -58.20050811767578,
"loss": 0.1744,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": -0.2882673442363739,
"rewards/margins": 3.7085728645324707,
"rewards/rejected": -3.996840238571167,
"step": 610
},
{
"epoch": 1.4078728042056674,
"eval_logits/chosen": 0.4304519295692444,
"eval_logits/rejected": 0.45695292949676514,
"eval_logps/chosen": -42.44300842285156,
"eval_logps/rejected": -55.6538200378418,
"eval_loss": 0.238841712474823,
"eval_rewards/accuracies": 0.8352534770965576,
"eval_rewards/chosen": -0.34886524081230164,
"eval_rewards/margins": 3.4588773250579834,
"eval_rewards/rejected": -3.8077423572540283,
"eval_runtime": 220.5308,
"eval_samples_per_second": 7.863,
"eval_steps_per_second": 1.968,
"step": 610
},
{
"epoch": 1.412488780612899,
"grad_norm": 10.681757170937171,
"learning_rate": 1.2008528609883557e-07,
"logits/chosen": 0.5007774233818054,
"logits/rejected": 0.5296944379806519,
"logps/chosen": -47.22381591796875,
"logps/rejected": -64.06365966796875,
"loss": 0.1531,
"rewards/accuracies": 0.9305555820465088,
"rewards/chosen": -0.030609939247369766,
"rewards/margins": 4.320724010467529,
"rewards/rejected": -4.351334571838379,
"step": 612
},
{
"epoch": 1.4171047570201307,
"grad_norm": 10.655182313924602,
"learning_rate": 1.1836674843514042e-07,
"logits/chosen": 0.5347999930381775,
"logits/rejected": 0.564474880695343,
"logps/chosen": -37.77484893798828,
"logps/rejected": -54.86954879760742,
"loss": 0.175,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": -0.38236376643180847,
"rewards/margins": 3.763371706008911,
"rewards/rejected": -4.145735263824463,
"step": 614
},
{
"epoch": 1.4217207334273625,
"grad_norm": 4.808937007878847,
"learning_rate": 1.1665677420783671e-07,
"logits/chosen": 0.5504859089851379,
"logits/rejected": 0.5750877261161804,
"logps/chosen": -43.14183807373047,
"logps/rejected": -53.28805160522461,
"loss": 0.1417,
"rewards/accuracies": 0.9166666865348816,
"rewards/chosen": -0.12460337579250336,
"rewards/margins": 3.7694272994995117,
"rewards/rejected": -3.894031047821045,
"step": 616
},
{
"epoch": 1.4263367098345943,
"grad_norm": 25.84566759360446,
"learning_rate": 1.149554746597553e-07,
"logits/chosen": 0.5723487734794617,
"logits/rejected": 0.6003535389900208,
"logps/chosen": -45.33318328857422,
"logps/rejected": -59.90052795410156,
"loss": 0.262,
"rewards/accuracies": 0.8194444179534912,
"rewards/chosen": -0.3526383936405182,
"rewards/margins": 3.843003988265991,
"rewards/rejected": -4.195642471313477,
"step": 618
},
{
"epoch": 1.4309526862418258,
"grad_norm": 16.545628594299828,
"learning_rate": 1.1326296046939333e-07,
"logits/chosen": 0.5338951945304871,
"logits/rejected": 0.5544497966766357,
"logps/chosen": -39.78907775878906,
"logps/rejected": -49.23013687133789,
"loss": 0.2511,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": -0.12468406558036804,
"rewards/margins": 3.2776834964752197,
"rewards/rejected": -3.402367353439331,
"step": 620
},
{
"epoch": 1.4309526862418258,
"eval_logits/chosen": 0.43395209312438965,
"eval_logits/rejected": 0.46030664443969727,
"eval_logps/chosen": -42.41356658935547,
"eval_logps/rejected": -55.699623107910156,
"eval_loss": 0.23819313943386078,
"eval_rewards/accuracies": 0.8317972421646118,
"eval_rewards/chosen": -0.3341463804244995,
"eval_rewards/margins": 3.4965004920959473,
"eval_rewards/rejected": -3.8306467533111572,
"eval_runtime": 220.4261,
"eval_samples_per_second": 7.867,
"eval_steps_per_second": 1.969,
"step": 620
},
{
"epoch": 1.4355686626490576,
"grad_norm": 18.451086465748666,
"learning_rate": 1.1157934174371413e-07,
"logits/chosen": 0.497620165348053,
"logits/rejected": 0.5271977782249451,
"logps/chosen": -44.88563919067383,
"logps/rejected": -63.52084732055664,
"loss": 0.1973,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": -0.4545660614967346,
"rewards/margins": 4.014831066131592,
"rewards/rejected": -4.469396591186523,
"step": 622
},
{
"epoch": 1.4401846390562894,
"grad_norm": 15.41826391561629,
"learning_rate": 1.0990472801098419e-07,
"logits/chosen": 0.49964290857315063,
"logits/rejected": 0.5341427326202393,
"logps/chosen": -39.38306427001953,
"logps/rejected": -59.41951370239258,
"loss": 0.1465,
"rewards/accuracies": 0.9027777910232544,
"rewards/chosen": -0.07668253034353256,
"rewards/margins": 4.010004043579102,
"rewards/rejected": -4.086687088012695,
"step": 624
},
{
"epoch": 1.444800615463521,
"grad_norm": 13.657128245878823,
"learning_rate": 1.0823922821364795e-07,
"logits/chosen": 0.5488825440406799,
"logits/rejected": 0.5648425221443176,
"logps/chosen": -49.72515869140625,
"logps/rejected": -57.29216766357422,
"loss": 0.1844,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": -0.15428660809993744,
"rewards/margins": 3.7048492431640625,
"rewards/rejected": -3.859135627746582,
"step": 626
},
{
"epoch": 1.4494165918707527,
"grad_norm": 17.171702939592354,
"learning_rate": 1.0658295070124026e-07,
"logits/chosen": 0.5274313688278198,
"logits/rejected": 0.540188729763031,
"logps/chosen": -47.955406188964844,
"logps/rejected": -54.03617477416992,
"loss": 0.2187,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": -0.16990727186203003,
"rewards/margins": 3.60162091255188,
"rewards/rejected": -3.7715280055999756,
"step": 628
},
{
"epoch": 1.4540325682779844,
"grad_norm": 25.795693399142227,
"learning_rate": 1.0493600322333762e-07,
"logits/chosen": 0.5215524435043335,
"logits/rejected": 0.5590708255767822,
"logps/chosen": -44.3021354675293,
"logps/rejected": -73.55774688720703,
"loss": 0.141,
"rewards/accuracies": 0.9166666865348816,
"rewards/chosen": -0.3967975080013275,
"rewards/margins": 4.7301740646362305,
"rewards/rejected": -5.12697172164917,
"step": 630
},
{
"epoch": 1.4540325682779844,
"eval_logits/chosen": 0.43174034357070923,
"eval_logits/rejected": 0.4582732319831848,
"eval_logps/chosen": -42.194610595703125,
"eval_logps/rejected": -55.55934524536133,
"eval_loss": 0.23693177103996277,
"eval_rewards/accuracies": 0.8317972421646118,
"eval_rewards/chosen": -0.22466643154621124,
"eval_rewards/margins": 3.535839080810547,
"eval_rewards/rejected": -3.7605059146881104,
"eval_runtime": 220.3801,
"eval_samples_per_second": 7.868,
"eval_steps_per_second": 1.969,
"step": 630
},
{
"epoch": 1.458648544685216,
"grad_norm": 14.475820972948407,
"learning_rate": 1.0329849292254883e-07,
"logits/chosen": 0.596792995929718,
"logits/rejected": 0.624647855758667,
"logps/chosen": -45.63186264038086,
"logps/rejected": -62.25794982910156,
"loss": 0.1936,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": -0.2872418463230133,
"rewards/margins": 3.9080302715301514,
"rewards/rejected": -4.195271968841553,
"step": 632
},
{
"epoch": 1.4632645210924478,
"grad_norm": 26.862980766739724,
"learning_rate": 1.0167052632754458e-07,
"logits/chosen": 0.5725838541984558,
"logits/rejected": 0.5932745337486267,
"logps/chosen": -41.20800018310547,
"logps/rejected": -51.21732711791992,
"loss": 0.227,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": -0.39695149660110474,
"rewards/margins": 2.928715229034424,
"rewards/rejected": -3.325666666030884,
"step": 634
},
{
"epoch": 1.4678804974996795,
"grad_norm": 13.962052681918495,
"learning_rate": 1.0005220934612713e-07,
"logits/chosen": 0.6229636669158936,
"logits/rejected": 0.6402004361152649,
"logps/chosen": -46.95052719116211,
"logps/rejected": -53.86199951171875,
"loss": 0.1824,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": -0.3401051461696625,
"rewards/margins": 3.6175549030303955,
"rewards/rejected": -3.95766019821167,
"step": 636
},
{
"epoch": 1.472496473906911,
"grad_norm": 9.092245687630806,
"learning_rate": 9.844364725834056e-08,
"logits/chosen": 0.48213544487953186,
"logits/rejected": 0.5316063761711121,
"logps/chosen": -45.23646545410156,
"logps/rejected": -75.49991607666016,
"loss": 0.0997,
"rewards/accuracies": 0.9583333134651184,
"rewards/chosen": -0.1606331765651703,
"rewards/margins": 5.202739238739014,
"rewards/rejected": -5.363372802734375,
"step": 638
},
{
"epoch": 1.4771124503141428,
"grad_norm": 18.96340702396886,
"learning_rate": 9.68449447096217e-08,
"logits/chosen": 0.4373500943183899,
"logits/rejected": 0.4579113721847534,
"logps/chosen": -39.44499588012695,
"logps/rejected": -51.54633712768555,
"loss": 0.3299,
"rewards/accuracies": 0.7916666865348816,
"rewards/chosen": -0.2867163419723511,
"rewards/margins": 3.076793670654297,
"rewards/rejected": -3.3635098934173584,
"step": 640
},
{
"epoch": 1.4771124503141428,
"eval_logits/chosen": 0.4346330463886261,
"eval_logits/rejected": 0.461146742105484,
"eval_logps/chosen": -42.071449279785156,
"eval_logps/rejected": -55.46683883666992,
"eval_loss": 0.23784740269184113,
"eval_rewards/accuracies": 0.835829496383667,
"eval_rewards/chosen": -0.16308562457561493,
"eval_rewards/margins": 3.551164388656616,
"eval_rewards/rejected": -3.714250087738037,
"eval_runtime": 220.3881,
"eval_samples_per_second": 7.868,
"eval_steps_per_second": 1.969,
"step": 640
},
{
"epoch": 1.4817284267213746,
"grad_norm": 22.570461867090884,
"learning_rate": 9.525620570399259e-08,
"logits/chosen": 0.5038811564445496,
"logits/rejected": 0.5432533025741577,
"logps/chosen": -44.41080856323242,
"logps/rejected": -65.23593139648438,
"loss": 0.1275,
"rewards/accuracies": 0.9444444179534912,
"rewards/chosen": -0.2485545426607132,
"rewards/margins": 4.013004779815674,
"rewards/rejected": -4.261559009552002,
"step": 642
},
{
"epoch": 1.4863444031286062,
"grad_norm": 11.127499049370783,
"learning_rate": 9.36775335972943e-08,
"logits/chosen": 0.4518318772315979,
"logits/rejected": 0.531367838382721,
"logps/chosen": -39.415767669677734,
"logps/rejected": -98.71846771240234,
"loss": 0.1566,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": -0.000497970322612673,
"rewards/margins": 6.575231075286865,
"rewards/rejected": -6.575727939605713,
"step": 644
},
{
"epoch": 1.490960379535838,
"grad_norm": 24.53509661266678,
"learning_rate": 9.210903109046284e-08,
"logits/chosen": 0.46663856506347656,
"logits/rejected": 0.5147727727890015,
"logps/chosen": -43.30581283569336,
"logps/rejected": -63.16206741333008,
"loss": 0.1684,
"rewards/accuracies": 0.9166666865348816,
"rewards/chosen": -0.5338683128356934,
"rewards/margins": 4.3571882247924805,
"rewards/rejected": -4.89105749130249,
"step": 646
},
{
"epoch": 1.4955763559430697,
"grad_norm": 11.303027411423997,
"learning_rate": 9.05508002228485e-08,
"logits/chosen": 0.529050350189209,
"logits/rejected": 0.5628350377082825,
"logps/chosen": -38.363826751708984,
"logps/rejected": -53.06625747680664,
"loss": 0.2071,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": 0.031818799674510956,
"rewards/margins": 3.961611032485962,
"rewards/rejected": -3.929792642593384,
"step": 648
},
{
"epoch": 1.5001923323503012,
"grad_norm": 10.500286558923209,
"learning_rate": 8.900294236557707e-08,
"logits/chosen": 0.49337685108184814,
"logits/rejected": 0.5243138074874878,
"logps/chosen": -37.17765808105469,
"logps/rejected": -49.10523986816406,
"loss": 0.2143,
"rewards/accuracies": 0.8194444179534912,
"rewards/chosen": 0.008912450633943081,
"rewards/margins": 3.240175485610962,
"rewards/rejected": -3.2312631607055664,
"step": 650
},
{
"epoch": 1.5001923323503012,
"eval_logits/chosen": 0.4313080310821533,
"eval_logits/rejected": 0.45790737867355347,
"eval_logps/chosen": -42.17680740356445,
"eval_logps/rejected": -55.66178894042969,
"eval_loss": 0.2398524433374405,
"eval_rewards/accuracies": 0.8306451439857483,
"eval_rewards/chosen": -0.21576282382011414,
"eval_rewards/margins": 3.59596586227417,
"eval_rewards/rejected": -3.8117284774780273,
"eval_runtime": 220.4293,
"eval_samples_per_second": 7.866,
"eval_steps_per_second": 1.969,
"step": 650
},
{
"epoch": 1.504808308757533,
"grad_norm": 21.390880404408534,
"learning_rate": 8.746555821495561e-08,
"logits/chosen": 0.4801899492740631,
"logits/rejected": 0.5136987566947937,
"logps/chosen": -43.907596588134766,
"logps/rejected": -62.06863021850586,
"loss": 0.1972,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.21802374720573425,
"rewards/margins": 4.019637584686279,
"rewards/rejected": -4.237661361694336,
"step": 652
},
{
"epoch": 1.5094242851647648,
"grad_norm": 17.814740010117944,
"learning_rate": 8.593874778592122e-08,
"logits/chosen": 0.4772498309612274,
"logits/rejected": 0.5082363486289978,
"logps/chosen": -36.85258483886719,
"logps/rejected": -49.34876251220703,
"loss": 0.1537,
"rewards/accuracies": 0.9166666865348816,
"rewards/chosen": -0.038329627364873886,
"rewards/margins": 3.5393142700195312,
"rewards/rejected": -3.577643394470215,
"step": 654
},
{
"epoch": 1.5140402615719966,
"grad_norm": 24.684686325904988,
"learning_rate": 8.442261040553472e-08,
"logits/chosen": 0.5512763857841492,
"logits/rejected": 0.5618037581443787,
"logps/chosen": -44.694515228271484,
"logps/rejected": -49.48525619506836,
"loss": 0.1683,
"rewards/accuracies": 0.9027777910232544,
"rewards/chosen": 0.0919620469212532,
"rewards/margins": 3.498401403427124,
"rewards/rejected": -3.406439781188965,
"step": 656
},
{
"epoch": 1.518656237979228,
"grad_norm": 21.50701378180569,
"learning_rate": 8.291724470651903e-08,
"logits/chosen": 0.49069249629974365,
"logits/rejected": 0.5210825800895691,
"logps/chosen": -44.639766693115234,
"logps/rejected": -57.28916549682617,
"loss": 0.2335,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": -0.4066303074359894,
"rewards/margins": 3.4069387912750244,
"rewards/rejected": -3.813568592071533,
"step": 658
},
{
"epoch": 1.5232722143864597,
"grad_norm": 11.082339838552715,
"learning_rate": 8.14227486208423e-08,
"logits/chosen": 0.4665941596031189,
"logits/rejected": 0.4930134415626526,
"logps/chosen": -37.94073486328125,
"logps/rejected": -53.0433464050293,
"loss": 0.1797,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": 0.09727773815393448,
"rewards/margins": 3.91404128074646,
"rewards/rejected": -3.8167638778686523,
"step": 660
},
{
"epoch": 1.5232722143864597,
"eval_logits/chosen": 0.43500614166259766,
"eval_logits/rejected": 0.4616233706474304,
"eval_logps/chosen": -42.075767517089844,
"eval_logps/rejected": -55.58706283569336,
"eval_loss": 0.2391819953918457,
"eval_rewards/accuracies": 0.8306451439857483,
"eval_rewards/chosen": -0.1652439683675766,
"eval_rewards/margins": 3.609118938446045,
"eval_rewards/rejected": -3.774362802505493,
"eval_runtime": 220.4966,
"eval_samples_per_second": 7.864,
"eval_steps_per_second": 1.968,
"step": 660
},
{
"epoch": 1.5278881907936914,
"grad_norm": 17.884909353386927,
"learning_rate": 7.993921937334716e-08,
"logits/chosen": 0.5584304332733154,
"logits/rejected": 0.5700749754905701,
"logps/chosen": -41.323944091796875,
"logps/rejected": -49.892147064208984,
"loss": 0.2096,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": -0.27705806493759155,
"rewards/margins": 3.536667823791504,
"rewards/rejected": -3.813725709915161,
"step": 662
},
{
"epoch": 1.5325041672009232,
"grad_norm": 6.982953174746173,
"learning_rate": 7.846675347542578e-08,
"logits/chosen": 0.5807335376739502,
"logits/rejected": 0.6132792234420776,
"logps/chosen": -37.81986999511719,
"logps/rejected": -49.71797180175781,
"loss": 0.1272,
"rewards/accuracies": 0.9027777910232544,
"rewards/chosen": 0.3861154019832611,
"rewards/margins": 4.170031547546387,
"rewards/rejected": -3.783916473388672,
"step": 664
},
{
"epoch": 1.537120143608155,
"grad_norm": 18.18022469520284,
"learning_rate": 7.700544671874079e-08,
"logits/chosen": 0.6006969213485718,
"logits/rejected": 0.6162829995155334,
"logps/chosen": -47.33814239501953,
"logps/rejected": -52.70623016357422,
"loss": 0.1962,
"rewards/accuracies": 0.9166666865348816,
"rewards/chosen": -0.2818297743797302,
"rewards/margins": 3.495248317718506,
"rewards/rejected": -3.7770779132843018,
"step": 666
},
{
"epoch": 1.5417361200153867,
"grad_norm": 17.752568042598934,
"learning_rate": 7.555539416899437e-08,
"logits/chosen": 0.5043608546257019,
"logits/rejected": 0.535383939743042,
"logps/chosen": -37.40916442871094,
"logps/rejected": -52.42148971557617,
"loss": 0.2323,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": -0.4006814658641815,
"rewards/margins": 3.385708808898926,
"rewards/rejected": -3.7863900661468506,
"step": 668
},
{
"epoch": 1.5463520964226183,
"grad_norm": 14.165329854797266,
"learning_rate": 7.41166901597429e-08,
"logits/chosen": 0.5081818699836731,
"logits/rejected": 0.5341579914093018,
"logps/chosen": -42.154205322265625,
"logps/rejected": -55.97992706298828,
"loss": 0.1774,
"rewards/accuracies": 0.9027777910232544,
"rewards/chosen": -0.05981425940990448,
"rewards/margins": 3.988154172897339,
"rewards/rejected": -4.047967910766602,
"step": 670
},
{
"epoch": 1.5463520964226183,
"eval_logits/chosen": 0.4372006952762604,
"eval_logits/rejected": 0.46362602710723877,
"eval_logps/chosen": -42.13774490356445,
"eval_logps/rejected": -55.63636779785156,
"eval_loss": 0.23786574602127075,
"eval_rewards/accuracies": 0.8329492807388306,
"eval_rewards/chosen": -0.19623348116874695,
"eval_rewards/margins": 3.602783203125,
"eval_rewards/rejected": -3.7990164756774902,
"eval_runtime": 220.5205,
"eval_samples_per_second": 7.863,
"eval_steps_per_second": 1.968,
"step": 670
},
{
"epoch": 1.5509680728298498,
"grad_norm": 22.84931762442886,
"learning_rate": 7.268942828626046e-08,
"logits/chosen": 0.5015777349472046,
"logits/rejected": 0.5260412096977234,
"logps/chosen": -39.39936828613281,
"logps/rejected": -50.80826950073242,
"loss": 0.2259,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": -0.02117648348212242,
"rewards/margins": 3.6156790256500244,
"rewards/rejected": -3.6368556022644043,
"step": 672
},
{
"epoch": 1.5555840492370816,
"grad_norm": 10.729660784502734,
"learning_rate": 7.127370139945018e-08,
"logits/chosen": 0.5064399242401123,
"logits/rejected": 0.542765736579895,
"logps/chosen": -41.118350982666016,
"logps/rejected": -57.55162048339844,
"loss": 0.1581,
"rewards/accuracies": 0.9305555820465088,
"rewards/chosen": -0.18698811531066895,
"rewards/margins": 4.028824806213379,
"rewards/rejected": -4.215813159942627,
"step": 674
},
{
"epoch": 1.5602000256443134,
"grad_norm": 12.758336439580667,
"learning_rate": 6.986960159980326e-08,
"logits/chosen": 0.5471921563148499,
"logits/rejected": 0.5656020045280457,
"logps/chosen": -44.28984069824219,
"logps/rejected": -53.67868423461914,
"loss": 0.1621,
"rewards/accuracies": 0.9305555820465088,
"rewards/chosen": -0.007483018562197685,
"rewards/margins": 3.514232873916626,
"rewards/rejected": -3.5217158794403076,
"step": 676
},
{
"epoch": 1.5648160020515451,
"grad_norm": 25.743372698631337,
"learning_rate": 6.847722023140776e-08,
"logits/chosen": 0.5099420547485352,
"logits/rejected": 0.5306479930877686,
"logps/chosen": -38.24551773071289,
"logps/rejected": -46.37004470825195,
"loss": 0.2453,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": -0.13890628516674042,
"rewards/margins": 3.242166757583618,
"rewards/rejected": -3.381072998046875,
"step": 678
},
{
"epoch": 1.569431978458777,
"grad_norm": 29.001544411683714,
"learning_rate": 6.709664787600616e-08,
"logits/chosen": 0.5341071486473083,
"logits/rejected": 0.549387514591217,
"logps/chosen": -38.39107131958008,
"logps/rejected": -45.22284698486328,
"loss": 0.2519,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": -0.32282909750938416,
"rewards/margins": 2.876624822616577,
"rewards/rejected": -3.1994540691375732,
"step": 680
},
{
"epoch": 1.569431978458777,
"eval_logits/chosen": 0.4367799460887909,
"eval_logits/rejected": 0.46335569024086,
"eval_logps/chosen": -42.14803695678711,
"eval_logps/rejected": -55.68684005737305,
"eval_loss": 0.23701736330986023,
"eval_rewards/accuracies": 0.8335253596305847,
"eval_rewards/chosen": -0.20137952268123627,
"eval_rewards/margins": 3.622871160507202,
"eval_rewards/rejected": -3.8242506980895996,
"eval_runtime": 220.405,
"eval_samples_per_second": 7.867,
"eval_steps_per_second": 1.969,
"step": 680
},
{
"epoch": 1.5740479548660085,
"grad_norm": 26.57226192590101,
"learning_rate": 6.572797434710219e-08,
"logits/chosen": 0.47764989733695984,
"logits/rejected": 0.5231152772903442,
"logps/chosen": -39.2479362487793,
"logps/rejected": -67.22251892089844,
"loss": 0.1985,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": 0.03589929640293121,
"rewards/margins": 4.406409740447998,
"rewards/rejected": -4.370510578155518,
"step": 682
},
{
"epoch": 1.57866393127324,
"grad_norm": 7.8158043752344115,
"learning_rate": 6.437128868411856e-08,
"logits/chosen": 0.5327097177505493,
"logits/rejected": 0.5473262071609497,
"logps/chosen": -38.83921813964844,
"logps/rejected": -47.30848693847656,
"loss": 0.212,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": 0.002615167060866952,
"rewards/margins": 3.3942179679870605,
"rewards/rejected": -3.3916027545928955,
"step": 684
},
{
"epoch": 1.5832799076804718,
"grad_norm": 11.10012939486401,
"learning_rate": 6.302667914660384e-08,
"logits/chosen": 0.5219799280166626,
"logits/rejected": 0.55839604139328,
"logps/chosen": -37.46578598022461,
"logps/rejected": -54.46531295776367,
"loss": 0.2233,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": 0.02391706593334675,
"rewards/margins": 3.7034101486206055,
"rewards/rejected": -3.679492950439453,
"step": 686
},
{
"epoch": 1.5878958840877035,
"grad_norm": 19.67549763311113,
"learning_rate": 6.169423320849112e-08,
"logits/chosen": 0.5211795568466187,
"logits/rejected": 0.5298517346382141,
"logps/chosen": -45.8150520324707,
"logps/rejected": -47.33256149291992,
"loss": 0.2021,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.2716074287891388,
"rewards/margins": 3.559727191925049,
"rewards/rejected": -3.831334352493286,
"step": 688
},
{
"epoch": 1.5925118604949353,
"grad_norm": 15.711220514951888,
"learning_rate": 6.037403755240748e-08,
"logits/chosen": 0.5544189810752869,
"logits/rejected": 0.5787670612335205,
"logps/chosen": -45.216304779052734,
"logps/rejected": -59.76258850097656,
"loss": 0.1572,
"rewards/accuracies": 0.9305555820465088,
"rewards/chosen": -0.14509858191013336,
"rewards/margins": 3.88366436958313,
"rewards/rejected": -4.0287628173828125,
"step": 690
},
{
"epoch": 1.5925118604949353,
"eval_logits/chosen": 0.43276646733283997,
"eval_logits/rejected": 0.45934849977493286,
"eval_logps/chosen": -42.20445251464844,
"eval_logps/rejected": -55.753753662109375,
"eval_loss": 0.23724210262298584,
"eval_rewards/accuracies": 0.8317972421646118,
"eval_rewards/chosen": -0.2295861542224884,
"eval_rewards/margins": 3.6281206607818604,
"eval_rewards/rejected": -3.8577067852020264,
"eval_runtime": 220.4833,
"eval_samples_per_second": 7.865,
"eval_steps_per_second": 1.968,
"step": 690
},
{
"epoch": 1.597127836902167,
"grad_norm": 14.487508826565733,
"learning_rate": 5.9066178064034326e-08,
"logits/chosen": 0.4430210590362549,
"logits/rejected": 0.4965353012084961,
"logps/chosen": -33.27760696411133,
"logps/rejected": -71.74127197265625,
"loss": 0.2328,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.2861379384994507,
"rewards/margins": 4.55012321472168,
"rewards/rejected": -4.836262226104736,
"step": 692
},
{
"epoch": 1.6017438133093986,
"grad_norm": 23.580990452467088,
"learning_rate": 5.777073982652064e-08,
"logits/chosen": 0.5170236825942993,
"logits/rejected": 0.5521243214607239,
"logps/chosen": -35.71030044555664,
"logps/rejected": -52.74575424194336,
"loss": 0.2247,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": -0.3935600519180298,
"rewards/margins": 3.574741840362549,
"rewards/rejected": -3.96830153465271,
"step": 694
},
{
"epoch": 1.6063597897166302,
"grad_norm": 13.54068941517088,
"learning_rate": 5.6487807114947325e-08,
"logits/chosen": 0.551853358745575,
"logits/rejected": 0.5928479433059692,
"logps/chosen": -42.63957214355469,
"logps/rejected": -70.68295288085938,
"loss": 0.1803,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": -0.2251981645822525,
"rewards/margins": 4.277625560760498,
"rewards/rejected": -4.502823352813721,
"step": 696
},
{
"epoch": 1.610975766123862,
"grad_norm": 27.742044897151906,
"learning_rate": 5.521746339084532e-08,
"logits/chosen": 0.5765677094459534,
"logits/rejected": 0.5921374559402466,
"logps/chosen": -47.175655364990234,
"logps/rejected": -58.09642028808594,
"loss": 0.2516,
"rewards/accuracies": 0.8194444179534912,
"rewards/chosen": -0.3188338875770569,
"rewards/margins": 3.57645583152771,
"rewards/rejected": -3.895289897918701,
"step": 698
},
{
"epoch": 1.6155917425310937,
"grad_norm": 13.652878320465026,
"learning_rate": 5.39597912967652e-08,
"logits/chosen": 0.5359885692596436,
"logits/rejected": 0.575743556022644,
"logps/chosen": -38.843807220458984,
"logps/rejected": -61.49338150024414,
"loss": 0.1886,
"rewards/accuracies": 0.9166666865348816,
"rewards/chosen": -0.01514108944684267,
"rewards/margins": 4.108646392822266,
"rewards/rejected": -4.1237874031066895,
"step": 700
},
{
"epoch": 1.6155917425310937,
"eval_logits/chosen": 0.43191081285476685,
"eval_logits/rejected": 0.4585791528224945,
"eval_logps/chosen": -42.20844268798828,
"eval_logps/rejected": -55.773094177246094,
"eval_loss": 0.23592650890350342,
"eval_rewards/accuracies": 0.8364055156707764,
"eval_rewards/chosen": -0.23158276081085205,
"eval_rewards/margins": 3.635798692703247,
"eval_rewards/rejected": -3.8673815727233887,
"eval_runtime": 220.5019,
"eval_samples_per_second": 7.864,
"eval_steps_per_second": 1.968,
"step": 700
}
],
"logging_steps": 2,
"max_steps": 866,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 10,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}