{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 124, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.016260162601626018, "grad_norm": 18.177886962890625, "learning_rate": 2e-05, "logits/chosen": -0.3472236394882202, "logits/rejected": -0.13716036081314087, "logps/chosen": -780.8181762695312, "logps/rejected": -909.20263671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.032520325203252036, "grad_norm": 23.274246215820312, "learning_rate": 4e-05, "logits/chosen": -0.2127760350704193, "logits/rejected": -0.08323362469673157, "logps/chosen": -583.0169067382812, "logps/rejected": -715.5615234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.04878048780487805, "grad_norm": 20.149507522583008, "learning_rate": 6e-05, "logits/chosen": -0.18167662620544434, "logits/rejected": -0.04478086531162262, "logps/chosen": -941.0387573242188, "logps/rejected": -825.662841796875, "loss": 0.6976, "rewards/accuracies": 0.5, "rewards/chosen": 0.025517277419567108, "rewards/margins": 0.022285467013716698, "rewards/rejected": 0.0032318076118826866, "step": 3 }, { "epoch": 0.06504065040650407, "grad_norm": 16.67251205444336, "learning_rate": 8e-05, "logits/chosen": 0.6866837739944458, "logits/rejected": 0.971089243888855, "logps/chosen": -999.306640625, "logps/rejected": -386.5375671386719, "loss": 0.563, "rewards/accuracies": 1.0, "rewards/chosen": 0.2688583433628082, "rewards/margins": 0.3312031030654907, "rewards/rejected": -0.062344741076231, "step": 4 }, { "epoch": 0.08130081300813008, "grad_norm": 15.646084785461426, "learning_rate": 0.0001, "logits/chosen": 0.5107800364494324, "logits/rejected": 0.5942208766937256, "logps/chosen": -1051.1270751953125, "logps/rejected": -745.8003540039062, "loss": 0.647, "rewards/accuracies": 0.5, "rewards/chosen": 0.3622299134731293, "rewards/margins": 0.34313660860061646, "rewards/rejected": 0.01909332349896431, "step": 5 }, { "epoch": 0.0975609756097561, "grad_norm": 38.70280456542969, "learning_rate": 0.00012, "logits/chosen": -0.31406939029693604, "logits/rejected": -0.24293695390224457, "logps/chosen": -845.9321899414062, "logps/rejected": -932.499755859375, "loss": 0.5175, "rewards/accuracies": 0.75, "rewards/chosen": 0.5435073971748352, "rewards/margins": 0.47774890065193176, "rewards/rejected": 0.06575851887464523, "step": 6 }, { "epoch": 0.11382113821138211, "grad_norm": 23.665071487426758, "learning_rate": 0.00014, "logits/chosen": -0.2646118402481079, "logits/rejected": -0.11520399153232574, "logps/chosen": -866.503173828125, "logps/rejected": -975.55126953125, "loss": 0.5487, "rewards/accuracies": 0.5, "rewards/chosen": 0.6112838387489319, "rewards/margins": 0.4790405333042145, "rewards/rejected": 0.1322433352470398, "step": 7 }, { "epoch": 0.13008130081300814, "grad_norm": 15.794047355651855, "learning_rate": 0.00016, "logits/chosen": -0.8256000876426697, "logits/rejected": -0.8912097811698914, "logps/chosen": -523.3858032226562, "logps/rejected": -1084.9468994140625, "loss": 0.4442, "rewards/accuracies": 0.5, "rewards/chosen": 0.5804435610771179, "rewards/margins": 0.24081651866436005, "rewards/rejected": 0.33962705731391907, "step": 8 }, { "epoch": 0.14634146341463414, "grad_norm": 13.538564682006836, "learning_rate": 0.00018, "logits/chosen": -0.11683523654937744, "logits/rejected": -0.0632472038269043, "logps/chosen": -652.114501953125, "logps/rejected": -551.6069946289062, "loss": 0.1564, "rewards/accuracies": 1.0, "rewards/chosen": 1.6716469526290894, "rewards/margins": 2.151698350906372, "rewards/rejected": -0.4800514578819275, "step": 9 }, { "epoch": 0.16260162601626016, "grad_norm": 3.9652626514434814, "learning_rate": 0.0002, "logits/chosen": 0.4062778949737549, "logits/rejected": 0.5438919067382812, "logps/chosen": -771.1934814453125, "logps/rejected": -616.55908203125, "loss": 0.0792, "rewards/accuracies": 1.0, "rewards/chosen": 3.8721909523010254, "rewards/margins": 5.208758354187012, "rewards/rejected": -1.3365669250488281, "step": 10 }, { "epoch": 0.17886178861788618, "grad_norm": 0.18261243402957916, "learning_rate": 0.0001999911398855782, "logits/chosen": -0.7774271965026855, "logits/rejected": -0.8629493117332458, "logps/chosen": -601.1015014648438, "logps/rejected": -1039.275146484375, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": 1.0800025463104248, "rewards/margins": 6.853862762451172, "rewards/rejected": -5.773860454559326, "step": 11 }, { "epoch": 0.1951219512195122, "grad_norm": 0.1421748697757721, "learning_rate": 0.00019996456111234527, "logits/chosen": 0.7899215817451477, "logits/rejected": 1.119359016418457, "logps/chosen": -1416.412353515625, "logps/rejected": -827.2066650390625, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 3.7505874633789062, "rewards/margins": 15.09115982055664, "rewards/rejected": -11.340574264526367, "step": 12 }, { "epoch": 0.21138211382113822, "grad_norm": 3.4406840801239014, "learning_rate": 0.00019992026839012067, "logits/chosen": -0.8033453226089478, "logits/rejected": -0.877557098865509, "logps/chosen": -514.6026611328125, "logps/rejected": -1206.25537109375, "loss": 0.0102, "rewards/accuracies": 1.0, "rewards/chosen": 1.7983558177947998, "rewards/margins": 23.49526596069336, "rewards/rejected": -21.696908950805664, "step": 13 }, { "epoch": 0.22764227642276422, "grad_norm": 0.19398577511310577, "learning_rate": 0.0001998582695676762, "logits/chosen": 0.9254277944564819, "logits/rejected": 1.1634798049926758, "logps/chosen": -1028.993408203125, "logps/rejected": -955.4432983398438, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -0.5009795427322388, "rewards/margins": 17.867931365966797, "rewards/rejected": -18.368911743164062, "step": 14 }, { "epoch": 0.24390243902439024, "grad_norm": 0.00010074722376884893, "learning_rate": 0.000199778575631345, "logits/chosen": 0.3904605507850647, "logits/rejected": 0.3719422519207001, "logps/chosen": -884.9620361328125, "logps/rejected": -1075.615966796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.482113838195801, "rewards/margins": 21.95424461364746, "rewards/rejected": -24.436357498168945, "step": 15 }, { "epoch": 0.2601626016260163, "grad_norm": 3.7136353057576343e-05, "learning_rate": 0.000199681200703075, "logits/chosen": 0.2578551769256592, "logits/rejected": 0.5335351824760437, "logps/chosen": -1073.548828125, "logps/rejected": -992.4033813476562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.9434356689453125, "rewards/margins": 20.854663848876953, "rewards/rejected": -23.798099517822266, "step": 16 }, { "epoch": 0.2764227642276423, "grad_norm": 8.596338147981442e-07, "learning_rate": 0.00019956616203792635, "logits/chosen": 0.5267460346221924, "logits/rejected": 0.4893237352371216, "logps/chosen": -987.3567504882812, "logps/rejected": -1127.171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.0684036016464233, "rewards/margins": 32.558319091796875, "rewards/rejected": -33.62671661376953, "step": 17 }, { "epoch": 0.2926829268292683, "grad_norm": 0.004051027819514275, "learning_rate": 0.00019943348002101371, "logits/chosen": 1.0484071969985962, "logits/rejected": 1.1081664562225342, "logps/chosen": -1105.1634521484375, "logps/rejected": -898.9759521484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.1622314453125, "rewards/margins": 23.434669494628906, "rewards/rejected": -26.596900939941406, "step": 18 }, { "epoch": 0.3089430894308943, "grad_norm": 0.003306547412648797, "learning_rate": 0.00019928317816389417, "logits/chosen": 0.5566614866256714, "logits/rejected": 0.6963181495666504, "logps/chosen": -932.650390625, "logps/rejected": -1061.4989013671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.36033821105957, "rewards/margins": 30.25779914855957, "rewards/rejected": -34.61813735961914, "step": 19 }, { "epoch": 0.3252032520325203, "grad_norm": 1.3893560968369911e-08, "learning_rate": 0.00019911528310040074, "logits/chosen": 1.239579200744629, "logits/rejected": 1.046311855316162, "logps/chosen": -1079.0159912109375, "logps/rejected": -1033.2017822265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.044548749923706, "rewards/margins": 41.88936233520508, "rewards/rejected": -40.844810485839844, "step": 20 }, { "epoch": 0.34146341463414637, "grad_norm": 4.666223851756968e-09, "learning_rate": 0.00019892982458192288, "logits/chosen": 0.2726232409477234, "logits/rejected": 0.14665402472019196, "logps/chosen": -978.7222900390625, "logps/rejected": -1133.2047119140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 11.054238319396973, "rewards/margins": 54.86410140991211, "rewards/rejected": -43.80986404418945, "step": 21 }, { "epoch": 0.35772357723577236, "grad_norm": 4.876813477494579e-07, "learning_rate": 0.00019872683547213446, "logits/chosen": -0.16925190389156342, "logits/rejected": -0.19759103655815125, "logps/chosen": -965.187255859375, "logps/rejected": -1239.143798828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.977485656738281, "rewards/margins": 29.40732765197754, "rewards/rejected": -44.38481140136719, "step": 22 }, { "epoch": 0.37398373983739835, "grad_norm": 37.638973236083984, "learning_rate": 0.00019850635174117033, "logits/chosen": 0.437714159488678, "logits/rejected": 0.4761970639228821, "logps/chosen": -1137.6966552734375, "logps/rejected": -1166.5640869140625, "loss": 0.4393, "rewards/accuracies": 1.0, "rewards/chosen": -11.159793853759766, "rewards/margins": 32.14189529418945, "rewards/rejected": -43.301692962646484, "step": 23 }, { "epoch": 0.3902439024390244, "grad_norm": 1.8173747229344173e-11, "learning_rate": 0.00019826841245925212, "logits/chosen": -0.7153763175010681, "logits/rejected": -0.6940470933914185, "logps/chosen": -938.263916015625, "logps/rejected": -1608.4205322265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -24.817350387573242, "rewards/margins": 34.095001220703125, "rewards/rejected": -58.912349700927734, "step": 24 }, { "epoch": 0.4065040650406504, "grad_norm": 83.79772186279297, "learning_rate": 0.0001980130597897651, "logits/chosen": 1.1592888832092285, "logits/rejected": 1.1738824844360352, "logps/chosen": -948.4622802734375, "logps/rejected": -865.396728515625, "loss": 0.3825, "rewards/accuracies": 1.0, "rewards/chosen": -3.343675374984741, "rewards/margins": 26.49417495727539, "rewards/rejected": -29.837852478027344, "step": 25 }, { "epoch": 0.42276422764227645, "grad_norm": 2.6143006834900007e-06, "learning_rate": 0.00019774033898178667, "logits/chosen": 0.5444796085357666, "logits/rejected": 0.47586876153945923, "logps/chosen": -932.6605834960938, "logps/rejected": -1091.639892578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.2753777503967285, "rewards/margins": 34.133514404296875, "rewards/rejected": -38.40888977050781, "step": 26 }, { "epoch": 0.43902439024390244, "grad_norm": 0.0003061926399823278, "learning_rate": 0.00019745029836206813, "logits/chosen": -0.6794779896736145, "logits/rejected": -0.8602011203765869, "logps/chosen": -894.3270263671875, "logps/rejected": -1067.5921630859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.433198928833008, "rewards/margins": 17.333955764770508, "rewards/rejected": -30.767154693603516, "step": 27 }, { "epoch": 0.45528455284552843, "grad_norm": 3.805017101399244e-08, "learning_rate": 0.00019714298932647098, "logits/chosen": 0.4980026185512543, "logits/rejected": 0.6999194025993347, "logps/chosen": -911.8473510742188, "logps/rejected": -1126.07421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.5412168502807617, "rewards/margins": 29.520708084106445, "rewards/rejected": -30.06192398071289, "step": 28 }, { "epoch": 0.4715447154471545, "grad_norm": 5.17633900187775e-08, "learning_rate": 0.00019681846633085967, "logits/chosen": -0.5973828434944153, "logits/rejected": -0.8376109600067139, "logps/chosen": -711.66259765625, "logps/rejected": -1186.1884765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.467390537261963, "rewards/margins": 25.050704956054688, "rewards/rejected": -27.518096923828125, "step": 29 }, { "epoch": 0.4878048780487805, "grad_norm": 0.00011633769463514909, "learning_rate": 0.0001964767868814516, "logits/chosen": 1.3797093629837036, "logits/rejected": 1.5397391319274902, "logps/chosen": -877.42333984375, "logps/rejected": -1003.4732666015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.624107360839844, "rewards/margins": 29.784557342529297, "rewards/rejected": -25.160449981689453, "step": 30 }, { "epoch": 0.5040650406504065, "grad_norm": 6.257723228486611e-09, "learning_rate": 0.00019611801152462715, "logits/chosen": 1.2731826305389404, "logits/rejected": 1.6379995346069336, "logps/chosen": -1053.573486328125, "logps/rejected": -1010.915283203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 11.018058776855469, "rewards/margins": 32.15219497680664, "rewards/rejected": -21.13413429260254, "step": 31 }, { "epoch": 0.5203252032520326, "grad_norm": 0.00035472630406729877, "learning_rate": 0.00019574220383620055, "logits/chosen": 0.6649560928344727, "logits/rejected": 0.983564019203186, "logps/chosen": -872.1873168945312, "logps/rejected": -965.9480590820312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.504961967468262, "rewards/margins": 23.669071197509766, "rewards/rejected": -18.164108276367188, "step": 32 }, { "epoch": 0.5365853658536586, "grad_norm": 3.0934195820009336e-05, "learning_rate": 0.00019534943041015423, "logits/chosen": 0.49574941396713257, "logits/rejected": 0.5190873742103577, "logps/chosen": -708.9269409179688, "logps/rejected": -842.974365234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 7.209194660186768, "rewards/margins": 20.690357208251953, "rewards/rejected": -13.48116397857666, "step": 33 }, { "epoch": 0.5528455284552846, "grad_norm": 0.0006856573163531721, "learning_rate": 0.00019493976084683813, "logits/chosen": 0.992796778678894, "logits/rejected": 1.1291236877441406, "logps/chosen": -673.6188354492188, "logps/rejected": -723.4482421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.3715057373046875, "rewards/margins": 19.963485717773438, "rewards/rejected": -14.591980934143066, "step": 34 }, { "epoch": 0.5691056910569106, "grad_norm": 5.983891969663091e-05, "learning_rate": 0.00019451326774063636, "logits/chosen": 0.7630600929260254, "logits/rejected": 0.910960853099823, "logps/chosen": -993.23828125, "logps/rejected": -1011.3184204101562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 7.109509468078613, "rewards/margins": 24.603878021240234, "rewards/rejected": -17.494367599487305, "step": 35 }, { "epoch": 0.5853658536585366, "grad_norm": 1.9749455532291904e-05, "learning_rate": 0.00019407002666710336, "logits/chosen": 1.8401339054107666, "logits/rejected": 1.9955703020095825, "logps/chosen": -1152.950927734375, "logps/rejected": -827.0269775390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 15.768245697021484, "rewards/margins": 38.1776123046875, "rewards/rejected": -22.40936851501465, "step": 36 }, { "epoch": 0.6016260162601627, "grad_norm": 0.0017285533249378204, "learning_rate": 0.00019361011616957164, "logits/chosen": 2.153351306915283, "logits/rejected": 2.235447883605957, "logps/chosen": -1090.1943359375, "logps/rejected": -682.7992553710938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 11.726329803466797, "rewards/margins": 24.018630981445312, "rewards/rejected": -12.292303085327148, "step": 37 }, { "epoch": 0.6178861788617886, "grad_norm": 0.00919501855969429, "learning_rate": 0.00019313361774523385, "logits/chosen": 0.47314736247062683, "logits/rejected": 0.557833731174469, "logps/chosen": -691.4217529296875, "logps/rejected": -673.1847534179688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 6.087795257568359, "rewards/margins": 12.628225326538086, "rewards/rejected": -6.540430068969727, "step": 38 }, { "epoch": 0.6341463414634146, "grad_norm": 0.002680833451449871, "learning_rate": 0.00019264061583070127, "logits/chosen": 0.20066705346107483, "logits/rejected": 0.2085224837064743, "logps/chosen": -693.7376098632812, "logps/rejected": -982.19091796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 7.779763221740723, "rewards/margins": 22.904094696044922, "rewards/rejected": -15.124334335327148, "step": 39 }, { "epoch": 0.6504065040650406, "grad_norm": 8.798202907200903e-05, "learning_rate": 0.00019213119778704128, "logits/chosen": 1.3898746967315674, "logits/rejected": 1.5520107746124268, "logps/chosen": -1247.770263671875, "logps/rejected": -916.4830322265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 15.276836395263672, "rewards/margins": 34.69191360473633, "rewards/rejected": -19.415077209472656, "step": 40 }, { "epoch": 0.6666666666666666, "grad_norm": 0.0009758697124198079, "learning_rate": 0.00019160545388429708, "logits/chosen": 2.345059633255005, "logits/rejected": 2.5746054649353027, "logps/chosen": -1102.5548095703125, "logps/rejected": -722.4332885742188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 13.800348281860352, "rewards/margins": 32.747169494628906, "rewards/rejected": -18.946823120117188, "step": 41 }, { "epoch": 0.6829268292682927, "grad_norm": 0.0016077810432761908, "learning_rate": 0.00019106347728549135, "logits/chosen": 0.9104095697402954, "logits/rejected": 0.9921329021453857, "logps/chosen": -753.8040771484375, "logps/rejected": -886.5813598632812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 11.367500305175781, "rewards/margins": 27.856563568115234, "rewards/rejected": -16.489063262939453, "step": 42 }, { "epoch": 0.6991869918699187, "grad_norm": 0.0004074655589647591, "learning_rate": 0.0001905053640301176, "logits/chosen": 0.5256392955780029, "logits/rejected": 0.4733426570892334, "logps/chosen": -715.4669189453125, "logps/rejected": -565.0441284179688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 6.25009822845459, "rewards/margins": 21.391075134277344, "rewards/rejected": -15.14097785949707, "step": 43 }, { "epoch": 0.7154471544715447, "grad_norm": 0.013145952485501766, "learning_rate": 0.00018993121301712193, "logits/chosen": 0.9358551502227783, "logits/rejected": 0.8306156992912292, "logps/chosen": -867.1063232421875, "logps/rejected": -973.7214965820312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 7.3925018310546875, "rewards/margins": 21.35105323791504, "rewards/rejected": -13.958552360534668, "step": 44 }, { "epoch": 0.7317073170731707, "grad_norm": 8.829876605886966e-05, "learning_rate": 0.00018934112598737777, "logits/chosen": 2.2844998836517334, "logits/rejected": 2.831254482269287, "logps/chosen": -1142.8726806640625, "logps/rejected": -776.1110229492188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 17.17538833618164, "rewards/margins": 33.72625732421875, "rewards/rejected": -16.550867080688477, "step": 45 }, { "epoch": 0.7479674796747967, "grad_norm": 0.02624354511499405, "learning_rate": 0.00018873520750565718, "logits/chosen": 0.1806122362613678, "logits/rejected": 0.31054702401161194, "logps/chosen": -692.7060546875, "logps/rejected": -1032.708740234375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 6.434965133666992, "rewards/margins": 16.74932098388672, "rewards/rejected": -10.314356803894043, "step": 46 }, { "epoch": 0.7642276422764228, "grad_norm": 4.268178963684477e-05, "learning_rate": 0.00018811356494210165, "logits/chosen": 1.1679103374481201, "logits/rejected": 1.0418663024902344, "logps/chosen": -720.220703125, "logps/rejected": -911.58837890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 7.991888523101807, "rewards/margins": 21.064565658569336, "rewards/rejected": -13.072675704956055, "step": 47 }, { "epoch": 0.7804878048780488, "grad_norm": 0.0009461237932555377, "learning_rate": 0.00018747630845319612, "logits/chosen": 0.13339552283287048, "logits/rejected": 0.3655449151992798, "logps/chosen": -420.11431884765625, "logps/rejected": -786.4783325195312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 11.16606330871582, "rewards/margins": 30.41803741455078, "rewards/rejected": -19.251976013183594, "step": 48 }, { "epoch": 0.7967479674796748, "grad_norm": 0.0033115639816969633, "learning_rate": 0.00018682355096224872, "logits/chosen": 0.4472777247428894, "logits/rejected": 0.3390260934829712, "logps/chosen": -536.7960205078125, "logps/rejected": -901.3749389648438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 10.887458801269531, "rewards/margins": 27.701595306396484, "rewards/rejected": -16.814136505126953, "step": 49 }, { "epoch": 0.8130081300813008, "grad_norm": 0.01153454091399908, "learning_rate": 0.0001861554081393806, "logits/chosen": 0.6489148139953613, "logits/rejected": 0.689254105091095, "logps/chosen": -738.5593872070312, "logps/rejected": -755.362060546875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 10.205413818359375, "rewards/margins": 16.344358444213867, "rewards/rejected": -6.138944625854492, "step": 50 }, { "epoch": 0.8292682926829268, "grad_norm": 0.001985176932066679, "learning_rate": 0.00018547199838102904, "logits/chosen": 0.144524484872818, "logits/rejected": 0.26266002655029297, "logps/chosen": -893.19482421875, "logps/rejected": -1031.27294921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.087849617004395, "rewards/margins": 23.393884658813477, "rewards/rejected": -14.306035041809082, "step": 51 }, { "epoch": 0.8455284552845529, "grad_norm": 0.00042794409091584384, "learning_rate": 0.0001847734427889671, "logits/chosen": 0.5121033191680908, "logits/rejected": 1.0676312446594238, "logps/chosen": -987.8340454101562, "logps/rejected": -830.7366943359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 11.409669876098633, "rewards/margins": 19.569660186767578, "rewards/rejected": -8.159988403320312, "step": 52 }, { "epoch": 0.8617886178861789, "grad_norm": 0.0011688657104969025, "learning_rate": 0.00018405986514884434, "logits/chosen": 1.793473243713379, "logits/rejected": 1.9872632026672363, "logps/chosen": -926.424560546875, "logps/rejected": -618.4228515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 11.011417388916016, "rewards/margins": 22.01776123046875, "rewards/rejected": -11.006343841552734, "step": 53 }, { "epoch": 0.8780487804878049, "grad_norm": 0.005157554987818003, "learning_rate": 0.0001833313919082515, "logits/chosen": -0.02910199761390686, "logits/rejected": 0.14243453741073608, "logps/chosen": -725.36376953125, "logps/rejected": -997.5311279296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.557222366333008, "rewards/margins": 15.359309196472168, "rewards/rejected": -9.802087783813477, "step": 54 }, { "epoch": 0.8943089430894309, "grad_norm": 0.005044507794082165, "learning_rate": 0.00018258815215431396, "logits/chosen": 0.17898443341255188, "logits/rejected": 0.09989897906780243, "logps/chosen": -803.9798583984375, "logps/rejected": -925.3179321289062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 6.798739433288574, "rewards/margins": 17.492319107055664, "rewards/rejected": -10.69357967376709, "step": 55 }, { "epoch": 0.9105691056910569, "grad_norm": 0.0031374047975987196, "learning_rate": 0.0001818302775908169, "logits/chosen": 1.017639398574829, "logits/rejected": 1.2823631763458252, "logps/chosen": -824.6445922851562, "logps/rejected": -860.8942260742188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 6.019498825073242, "rewards/margins": 16.16924285888672, "rewards/rejected": -10.149742126464844, "step": 56 }, { "epoch": 0.926829268292683, "grad_norm": 0.00014241511235013604, "learning_rate": 0.0001810579025148674, "logits/chosen": 1.0959478616714478, "logits/rejected": 0.9008815288543701, "logps/chosen": -782.0526123046875, "logps/rejected": -916.8338623046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.443077087402344, "rewards/margins": 24.263744354248047, "rewards/rejected": -15.820667266845703, "step": 57 }, { "epoch": 0.943089430894309, "grad_norm": 5.913816494285129e-05, "learning_rate": 0.00018027116379309638, "logits/chosen": 0.2709883153438568, "logits/rejected": 0.29769933223724365, "logps/chosen": -735.5257568359375, "logps/rejected": -1044.0601806640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.65300178527832, "rewards/margins": 18.755083084106445, "rewards/rejected": -10.102080345153809, "step": 58 }, { "epoch": 0.959349593495935, "grad_norm": 0.01578771322965622, "learning_rate": 0.00017947020083740575, "logits/chosen": 1.5522100925445557, "logits/rejected": 1.7518442869186401, "logps/chosen": -1019.1099853515625, "logps/rejected": -624.6131591796875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 10.32003402709961, "rewards/margins": 23.75770378112793, "rewards/rejected": -13.43766975402832, "step": 59 }, { "epoch": 0.975609756097561, "grad_norm": 0.0010152229806408286, "learning_rate": 0.00017865515558026428, "logits/chosen": 0.8601479530334473, "logits/rejected": 0.819040060043335, "logps/chosen": -763.342041015625, "logps/rejected": -817.870849609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.2501859664917, "rewards/margins": 16.491539001464844, "rewards/rejected": -8.241353034973145, "step": 60 }, { "epoch": 0.991869918699187, "grad_norm": 0.008696873672306538, "learning_rate": 0.0001778261724495566, "logits/chosen": 0.7409014701843262, "logits/rejected": 0.9245580434799194, "logps/chosen": -888.8350830078125, "logps/rejected": -796.002685546875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 11.07230281829834, "rewards/margins": 22.53582000732422, "rewards/rejected": -11.463518142700195, "step": 61 }, { "epoch": 1.0, "grad_norm": 2.3132517526391894e-05, "learning_rate": 0.00017698339834299061, "logits/chosen": 0.962340772151947, "logits/rejected": 1.369040608406067, "logps/chosen": -843.8861083984375, "logps/rejected": -833.0137329101562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 7.60971736907959, "rewards/margins": 22.649456024169922, "rewards/rejected": -15.039739608764648, "step": 62 }, { "epoch": 1.016260162601626, "grad_norm": 3.0814584306426696e-07, "learning_rate": 0.00017612698260206666, "logits/chosen": 1.7351003885269165, "logits/rejected": 2.39410400390625, "logps/chosen": -1081.0841064453125, "logps/rejected": -664.132080078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 12.010480880737305, "rewards/margins": 23.851722717285156, "rewards/rejected": -11.841242790222168, "step": 63 }, { "epoch": 1.032520325203252, "grad_norm": 0.0014821357326582074, "learning_rate": 0.00017525707698561385, "logits/chosen": 0.8669869899749756, "logits/rejected": 1.2894644737243652, "logps/chosen": -794.047607421875, "logps/rejected": -812.5697631835938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 11.141783714294434, "rewards/margins": 23.891061782836914, "rewards/rejected": -12.749277114868164, "step": 64 }, { "epoch": 1.048780487804878, "grad_norm": 0.002492019208148122, "learning_rate": 0.00017437383564289816, "logits/chosen": 1.1617192029953003, "logits/rejected": 1.0443211793899536, "logps/chosen": -706.7365112304688, "logps/rejected": -834.9153442382812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 10.32893180847168, "rewards/margins": 23.380508422851562, "rewards/rejected": -13.0515775680542, "step": 65 }, { "epoch": 1.065040650406504, "grad_norm": 0.10320430248975754, "learning_rate": 0.00017347741508630672, "logits/chosen": 1.5734750032424927, "logits/rejected": 2.108652114868164, "logps/chosen": -919.78125, "logps/rejected": -843.049560546875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 14.794572830200195, "rewards/margins": 27.74661636352539, "rewards/rejected": -12.952045440673828, "step": 66 }, { "epoch": 1.08130081300813, "grad_norm": 0.00033748566056601703, "learning_rate": 0.00017256797416361362, "logits/chosen": 0.10465478897094727, "logits/rejected": 0.11954197287559509, "logps/chosen": -770.0354614257812, "logps/rejected": -705.5811767578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.188321113586426, "rewards/margins": 18.007652282714844, "rewards/rejected": -9.819330215454102, "step": 67 }, { "epoch": 1.0975609756097562, "grad_norm": 0.4934139549732208, "learning_rate": 0.00017164567402983152, "logits/chosen": 0.7908147573471069, "logits/rejected": 1.0772439241409302, "logps/chosen": -869.843017578125, "logps/rejected": -729.0626831054688, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": 8.537101745605469, "rewards/margins": 12.491724014282227, "rewards/rejected": -3.9546217918395996, "step": 68 }, { "epoch": 1.113821138211382, "grad_norm": 2.1183014098369313e-07, "learning_rate": 0.00017071067811865476, "logits/chosen": 0.6217237710952759, "logits/rejected": 0.5386490225791931, "logps/chosen": -799.1664428710938, "logps/rejected": -820.0735473632812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 12.295455932617188, "rewards/margins": 30.9702091217041, "rewards/rejected": -18.674753189086914, "step": 69 }, { "epoch": 1.1300813008130082, "grad_norm": 7.591093162773177e-05, "learning_rate": 0.0001697631521134985, "logits/chosen": 1.664866328239441, "logits/rejected": 1.980355978012085, "logps/chosen": -1113.451416015625, "logps/rejected": -825.9473876953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 11.451591491699219, "rewards/margins": 29.68605613708496, "rewards/rejected": -18.23446273803711, "step": 70 }, { "epoch": 1.146341463414634, "grad_norm": 4.4439241264626617e-07, "learning_rate": 0.00016880326391813916, "logits/chosen": -0.02196294069290161, "logits/rejected": 0.18253503739833832, "logps/chosen": -661.0505981445312, "logps/rejected": -834.158203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.791834831237793, "rewards/margins": 28.233205795288086, "rewards/rejected": -18.441370010375977, "step": 71 }, { "epoch": 1.1626016260162602, "grad_norm": 8.045230060815811e-05, "learning_rate": 0.00016783118362696163, "logits/chosen": 0.24465110898017883, "logits/rejected": 0.2313007265329361, "logps/chosen": -715.2831420898438, "logps/rejected": -1050.01171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.176504611968994, "rewards/margins": 19.875812530517578, "rewards/rejected": -15.699307441711426, "step": 72 }, { "epoch": 1.1788617886178863, "grad_norm": 5.927664005866973e-06, "learning_rate": 0.00016684708349481804, "logits/chosen": 1.5342342853546143, "logits/rejected": 2.0414443016052246, "logps/chosen": -1195.0989990234375, "logps/rejected": -652.9114990234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.883450508117676, "rewards/margins": 19.403560638427734, "rewards/rejected": -10.520109176635742, "step": 73 }, { "epoch": 1.1951219512195121, "grad_norm": 1.7679340089671314e-05, "learning_rate": 0.00016585113790650388, "logits/chosen": 0.13918209075927734, "logits/rejected": 0.21283580362796783, "logps/chosen": -937.8267211914062, "logps/rejected": -958.693115234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.578910827636719, "rewards/margins": 31.493125915527344, "rewards/rejected": -21.914215087890625, "step": 74 }, { "epoch": 1.2113821138211383, "grad_norm": 9.838218102231622e-05, "learning_rate": 0.00016484352334585653, "logits/chosen": 1.7902581691741943, "logits/rejected": 1.8008999824523926, "logps/chosen": -898.8333740234375, "logps/rejected": -869.8264770507812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.36214828491211, "rewards/margins": 23.546051025390625, "rewards/rejected": -15.183902740478516, "step": 75 }, { "epoch": 1.2276422764227641, "grad_norm": 0.00042859543464146554, "learning_rate": 0.00016382441836448202, "logits/chosen": 0.40593788027763367, "logits/rejected": 0.24162518978118896, "logps/chosen": -713.95263671875, "logps/rejected": -873.909423828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 3.870103359222412, "rewards/margins": 17.166872024536133, "rewards/rejected": -13.296768188476562, "step": 76 }, { "epoch": 1.2439024390243902, "grad_norm": 0.0007489994168281555, "learning_rate": 0.0001627940035501152, "logits/chosen": 1.2316575050354004, "logits/rejected": 1.2072526216506958, "logps/chosen": -961.4344482421875, "logps/rejected": -1073.3685302734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 6.6541852951049805, "rewards/margins": 27.57451057434082, "rewards/rejected": -20.920326232910156, "step": 77 }, { "epoch": 1.2601626016260163, "grad_norm": 3.269678200013004e-05, "learning_rate": 0.0001617524614946192, "logits/chosen": 0.06140974164009094, "logits/rejected": 0.11881747841835022, "logps/chosen": -900.48876953125, "logps/rejected": -1085.7061767578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.6411392688751221, "rewards/margins": 19.955745697021484, "rewards/rejected": -19.314605712890625, "step": 78 }, { "epoch": 1.2764227642276422, "grad_norm": 3.813441480815527e-06, "learning_rate": 0.0001606999767616298, "logits/chosen": 1.1457127332687378, "logits/rejected": 0.8977339267730713, "logps/chosen": -757.8355712890625, "logps/rejected": -838.0936279296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.651698112487793, "rewards/margins": 31.715707778930664, "rewards/rejected": -23.064010620117188, "step": 79 }, { "epoch": 1.2926829268292683, "grad_norm": 2.5300651032011956e-05, "learning_rate": 0.00015963673585385016, "logits/chosen": -0.5050560235977173, "logits/rejected": -0.5818659067153931, "logps/chosen": -833.4871826171875, "logps/rejected": -1177.144287109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.1878601312637329, "rewards/margins": 28.51848602294922, "rewards/rejected": -28.330625534057617, "step": 80 }, { "epoch": 1.3089430894308944, "grad_norm": 6.81912133586593e-05, "learning_rate": 0.00015856292718000235, "logits/chosen": 1.6245973110198975, "logits/rejected": 1.942758560180664, "logps/chosen": -925.15966796875, "logps/rejected": -746.8193969726562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.29654598236084, "rewards/margins": 26.77484893798828, "rewards/rejected": -17.478303909301758, "step": 81 }, { "epoch": 1.3252032520325203, "grad_norm": 1.1350484783179127e-06, "learning_rate": 0.0001574787410214407, "logits/chosen": 0.8831353187561035, "logits/rejected": 1.1747808456420898, "logps/chosen": -812.7021484375, "logps/rejected": -1058.893310546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 3.832669258117676, "rewards/margins": 33.81871795654297, "rewards/rejected": -29.986047744750977, "step": 82 }, { "epoch": 1.3414634146341464, "grad_norm": 7.43222301480273e-07, "learning_rate": 0.0001563843694984336, "logits/chosen": 1.199593424797058, "logits/rejected": 1.2259372472763062, "logps/chosen": -846.8779296875, "logps/rejected": -1035.00244140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.645470142364502, "rewards/margins": 35.18595886230469, "rewards/rejected": -30.540489196777344, "step": 83 }, { "epoch": 1.3577235772357723, "grad_norm": 4.4819596951128915e-05, "learning_rate": 0.00015528000653611935, "logits/chosen": 1.7928721904754639, "logits/rejected": 2.1661128997802734, "logps/chosen": -932.3726806640625, "logps/rejected": -844.2169189453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.103044509887695, "rewards/margins": 21.569711685180664, "rewards/rejected": -17.4666690826416, "step": 84 }, { "epoch": 1.3739837398373984, "grad_norm": 7.042069594120903e-09, "learning_rate": 0.0001541658478301421, "logits/chosen": 0.2531038522720337, "logits/rejected": 0.2639998197555542, "logps/chosen": -1010.8427734375, "logps/rejected": -1247.974609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.7464678287506104, "rewards/margins": 30.038406372070312, "rewards/rejected": -29.291942596435547, "step": 85 }, { "epoch": 1.3902439024390243, "grad_norm": 2.4762075057083166e-08, "learning_rate": 0.00015304209081197425, "logits/chosen": 2.228158473968506, "logits/rejected": 2.7146129608154297, "logps/chosen": -1221.494384765625, "logps/rejected": -882.4944458007812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 13.98241901397705, "rewards/margins": 33.62451171875, "rewards/rejected": -19.642091751098633, "step": 86 }, { "epoch": 1.4065040650406504, "grad_norm": 3.7480401715583866e-06, "learning_rate": 0.00015190893461393108, "logits/chosen": 1.5811924934387207, "logits/rejected": 2.0754153728485107, "logps/chosen": -958.1056518554688, "logps/rejected": -741.9910278320312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 14.536327362060547, "rewards/margins": 32.516456604003906, "rewards/rejected": -17.980131149291992, "step": 87 }, { "epoch": 1.4227642276422765, "grad_norm": 1.9098067696177168e-06, "learning_rate": 0.000150766580033884, "logits/chosen": 1.6907765865325928, "logits/rejected": 1.9654494524002075, "logps/chosen": -1132.77978515625, "logps/rejected": -908.571044921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.22573709487915, "rewards/margins": 34.5124626159668, "rewards/rejected": -29.286724090576172, "step": 88 }, { "epoch": 1.4390243902439024, "grad_norm": 1.1447126780694816e-05, "learning_rate": 0.00014961522949967886, "logits/chosen": 0.9937865734100342, "logits/rejected": 1.2049672603607178, "logps/chosen": -739.3209838867188, "logps/rejected": -1007.2611083984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 10.235821723937988, "rewards/margins": 34.75508499145508, "rewards/rejected": -24.51926040649414, "step": 89 }, { "epoch": 1.4552845528455285, "grad_norm": 1.5996234026260936e-07, "learning_rate": 0.00014845508703326504, "logits/chosen": 1.005773663520813, "logits/rejected": 0.9975143671035767, "logps/chosen": -912.9910278320312, "logps/rejected": -1205.926513671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 2.948190212249756, "rewards/margins": 31.25839614868164, "rewards/rejected": -28.310203552246094, "step": 90 }, { "epoch": 1.4715447154471546, "grad_norm": 1.9003784473170526e-05, "learning_rate": 0.00014728635821454255, "logits/chosen": 2.574889659881592, "logits/rejected": 2.5759711265563965, "logps/chosen": -915.0121459960938, "logps/rejected": -623.8654174804688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 15.099142074584961, "rewards/margins": 31.881959915161133, "rewards/rejected": -16.782817840576172, "step": 91 }, { "epoch": 1.4878048780487805, "grad_norm": 4.1650441318097364e-08, "learning_rate": 0.0001461092501449326, "logits/chosen": 1.0031987428665161, "logits/rejected": 1.2941582202911377, "logps/chosen": -823.1492309570312, "logps/rejected": -1055.567626953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 2.4376673698425293, "rewards/margins": 26.05483055114746, "rewards/rejected": -23.617162704467773, "step": 92 }, { "epoch": 1.5040650406504064, "grad_norm": 4.165614697626552e-08, "learning_rate": 0.00014492397141067887, "logits/chosen": 0.8133536577224731, "logits/rejected": 1.0407506227493286, "logps/chosen": -961.2422485351562, "logps/rejected": -1156.6856689453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.8701601028442383, "rewards/margins": 33.655277252197266, "rewards/rejected": -31.785114288330078, "step": 93 }, { "epoch": 1.5203252032520327, "grad_norm": 3.824939540209016e-06, "learning_rate": 0.00014373073204588556, "logits/chosen": 2.6779818534851074, "logits/rejected": 2.7686123847961426, "logps/chosen": -1121.3564453125, "logps/rejected": -698.586669921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 10.171032905578613, "rewards/margins": 27.788890838623047, "rewards/rejected": -17.617855072021484, "step": 94 }, { "epoch": 1.5365853658536586, "grad_norm": 3.954168641939759e-05, "learning_rate": 0.0001425297434952987, "logits/chosen": 0.22321929037570953, "logits/rejected": 0.2271191030740738, "logps/chosen": -671.6175537109375, "logps/rejected": -1141.6953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.185655355453491, "rewards/margins": 26.3375301361084, "rewards/rejected": -28.52318572998047, "step": 95 }, { "epoch": 1.5528455284552845, "grad_norm": 6.408844566152538e-10, "learning_rate": 0.00014132121857683783, "logits/chosen": 1.1100516319274902, "logits/rejected": 1.0310027599334717, "logps/chosen": -995.9828491210938, "logps/rejected": -1024.00244140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.543378829956055, "rewards/margins": 33.411643981933594, "rewards/rejected": -24.868263244628906, "step": 96 }, { "epoch": 1.5691056910569106, "grad_norm": 6.710484399263805e-07, "learning_rate": 0.00014010537144388416, "logits/chosen": 0.19941049814224243, "logits/rejected": 0.2904074490070343, "logps/chosen": -580.1328125, "logps/rejected": -1122.187744140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.563772439956665, "rewards/margins": 23.33687400817871, "rewards/rejected": -23.900646209716797, "step": 97 }, { "epoch": 1.5853658536585367, "grad_norm": 2.6136473252336145e-07, "learning_rate": 0.00013888241754733208, "logits/chosen": 0.8143081665039062, "logits/rejected": 1.183271050453186, "logps/chosen": -973.23583984375, "logps/rejected": -904.20556640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 3.3894622325897217, "rewards/margins": 23.915855407714844, "rewards/rejected": -20.526391983032227, "step": 98 }, { "epoch": 1.6016260162601625, "grad_norm": 1.735031582938973e-05, "learning_rate": 0.00013765257359741063, "logits/chosen": 0.8897725343704224, "logits/rejected": 0.8052040338516235, "logps/chosen": -771.9832763671875, "logps/rejected": -874.3773193359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 6.943796157836914, "rewards/margins": 29.497058868408203, "rewards/rejected": -22.55326271057129, "step": 99 }, { "epoch": 1.6178861788617886, "grad_norm": 1.2570103535836097e-07, "learning_rate": 0.00013641605752528224, "logits/chosen": 1.0415421724319458, "logits/rejected": 1.3014307022094727, "logps/chosen": -918.8525390625, "logps/rejected": -955.0538330078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 7.44915771484375, "rewards/margins": 33.4973258972168, "rewards/rejected": -26.04817008972168, "step": 100 }, { "epoch": 1.6341463414634148, "grad_norm": 3.719053154327412e-07, "learning_rate": 0.0001351730884444245, "logits/chosen": 0.4167521595954895, "logits/rejected": 0.3483416438102722, "logps/chosen": -604.3650512695312, "logps/rejected": -1362.02587890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.4617691040039062, "rewards/margins": 44.77275466918945, "rewards/rejected": -47.23452377319336, "step": 101 }, { "epoch": 1.6504065040650406, "grad_norm": 1.487089633656069e-07, "learning_rate": 0.00013392388661180303, "logits/chosen": 0.9698238968849182, "logits/rejected": 1.1324440240859985, "logps/chosen": -742.9386596679688, "logps/rejected": -905.581298828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.503021717071533, "rewards/margins": 32.864501953125, "rewards/rejected": -27.361482620239258, "step": 102 }, { "epoch": 1.6666666666666665, "grad_norm": 0.00015168750542216003, "learning_rate": 0.0001326686733888413, "logits/chosen": 2.734503746032715, "logits/rejected": 2.7868616580963135, "logps/chosen": -845.9635009765625, "logps/rejected": -674.9261474609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 6.455021858215332, "rewards/margins": 21.768619537353516, "rewards/rejected": -15.3135986328125, "step": 103 }, { "epoch": 1.6829268292682928, "grad_norm": 5.236762717686361e-06, "learning_rate": 0.0001314076712021949, "logits/chosen": 0.8474237322807312, "logits/rejected": 1.0795999765396118, "logps/chosen": -844.8881225585938, "logps/rejected": -1026.413818359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.01052474975586, "rewards/margins": 34.12953186035156, "rewards/rejected": -25.119007110595703, "step": 104 }, { "epoch": 1.6991869918699187, "grad_norm": 4.3044991571150604e-08, "learning_rate": 0.000130141103504337, "logits/chosen": 1.0104427337646484, "logits/rejected": 0.809540867805481, "logps/chosen": -806.0650634765625, "logps/rejected": -1019.7612915039062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 7.093156814575195, "rewards/margins": 29.144248962402344, "rewards/rejected": -22.051090240478516, "step": 105 }, { "epoch": 1.7154471544715446, "grad_norm": 6.236035243745164e-09, "learning_rate": 0.0001288691947339621, "logits/chosen": 0.26283663511276245, "logits/rejected": 0.21620601415634155, "logps/chosen": -764.7117919921875, "logps/rejected": -1384.037353515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.5661294460296631, "rewards/margins": 35.904212951660156, "rewards/rejected": -36.470340728759766, "step": 106 }, { "epoch": 1.7317073170731707, "grad_norm": 0.0002312189608346671, "learning_rate": 0.00012759217027621505, "logits/chosen": 0.8271576166152954, "logits/rejected": 0.8352835178375244, "logps/chosen": -639.9276123046875, "logps/rejected": -721.3944702148438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 3.1902108192443848, "rewards/margins": 19.32707977294922, "rewards/rejected": -16.13686752319336, "step": 107 }, { "epoch": 1.7479674796747968, "grad_norm": 5.53435963723814e-09, "learning_rate": 0.00012631025642275212, "logits/chosen": 0.9540997743606567, "logits/rejected": 1.0216646194458008, "logps/chosen": -920.1544189453125, "logps/rejected": -919.189453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.917628288269043, "rewards/margins": 31.62308692932129, "rewards/rejected": -22.705459594726562, "step": 108 }, { "epoch": 1.7642276422764227, "grad_norm": 5.7604488290508016e-08, "learning_rate": 0.00012502368033164176, "logits/chosen": 1.9378834962844849, "logits/rejected": 2.0527262687683105, "logps/chosen": -616.1436767578125, "logps/rejected": -781.5704956054688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.269429683685303, "rewards/margins": 27.761857986450195, "rewards/rejected": -23.492429733276367, "step": 109 }, { "epoch": 1.7804878048780488, "grad_norm": 3.0333463740817024e-08, "learning_rate": 0.0001237326699871115, "logits/chosen": 0.784665584564209, "logits/rejected": 1.0081039667129517, "logps/chosen": -864.7948608398438, "logps/rejected": -946.906982421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 6.097116470336914, "rewards/margins": 30.87978172302246, "rewards/rejected": -24.78266716003418, "step": 110 }, { "epoch": 1.796747967479675, "grad_norm": 3.1582476367475465e-07, "learning_rate": 0.00012243745415914883, "logits/chosen": -0.5353690385818481, "logits/rejected": -0.6592149138450623, "logps/chosen": -722.5419921875, "logps/rejected": -1070.7403564453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.3367981910705566, "rewards/margins": 27.85375213623047, "rewards/rejected": -29.190549850463867, "step": 111 }, { "epoch": 1.8130081300813008, "grad_norm": 2.334864745989762e-07, "learning_rate": 0.00012113826236296244, "logits/chosen": 1.986028790473938, "logits/rejected": 2.0000312328338623, "logps/chosen": -1034.116455078125, "logps/rejected": -924.2823486328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.337306022644043, "rewards/margins": 34.88032531738281, "rewards/rejected": -25.54302215576172, "step": 112 }, { "epoch": 1.8292682926829267, "grad_norm": 1.956110463652294e-05, "learning_rate": 0.0001198353248183118, "logits/chosen": 1.1676946878433228, "logits/rejected": 1.3392938375473022, "logps/chosen": -839.8267211914062, "logps/rejected": -966.1685180664062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.940967082977295, "rewards/margins": 33.268653869628906, "rewards/rejected": -28.327686309814453, "step": 113 }, { "epoch": 1.845528455284553, "grad_norm": 1.2582788144754886e-07, "learning_rate": 0.00011852887240871145, "logits/chosen": 1.7121946811676025, "logits/rejected": 1.834307074546814, "logps/chosen": -825.6591796875, "logps/rejected": -910.5638427734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.057826519012451, "rewards/margins": 26.722637176513672, "rewards/rejected": -21.664812088012695, "step": 114 }, { "epoch": 1.8617886178861789, "grad_norm": 3.8171506275830325e-06, "learning_rate": 0.00011721913664051813, "logits/chosen": 0.09213051199913025, "logits/rejected": 0.2805327773094177, "logps/chosen": -785.7156982421875, "logps/rejected": -1021.4864501953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.823834240436554, "rewards/margins": 25.152664184570312, "rewards/rejected": -24.32883071899414, "step": 115 }, { "epoch": 1.8780487804878048, "grad_norm": 2.6529932029006886e-08, "learning_rate": 0.00011590634960190721, "logits/chosen": -0.5069230198860168, "logits/rejected": -0.5888826847076416, "logps/chosen": -707.7698974609375, "logps/rejected": -1266.01904296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.027275919914245605, "rewards/margins": 27.478078842163086, "rewards/rejected": -27.450803756713867, "step": 116 }, { "epoch": 1.8943089430894309, "grad_norm": 9.935014304573997e-07, "learning_rate": 0.00011459074392174618, "logits/chosen": 1.5636107921600342, "logits/rejected": 1.8575186729431152, "logps/chosen": -1191.93359375, "logps/rejected": -990.843505859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 12.92037582397461, "rewards/margins": 39.89407730102539, "rewards/rejected": -26.973697662353516, "step": 117 }, { "epoch": 1.910569105691057, "grad_norm": 1.2037819942634087e-05, "learning_rate": 0.00011327255272837221, "logits/chosen": 1.0499224662780762, "logits/rejected": 0.9787989854812622, "logps/chosen": -971.0214233398438, "logps/rejected": -877.3848876953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 2.003582715988159, "rewards/margins": 20.236526489257812, "rewards/rejected": -18.23294448852539, "step": 118 }, { "epoch": 1.9268292682926829, "grad_norm": 1.8166872450819938e-06, "learning_rate": 0.00011195200960828139, "logits/chosen": 1.6961169242858887, "logits/rejected": 2.2738733291625977, "logps/chosen": -1074.953369140625, "logps/rejected": -778.5762939453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.411404609680176, "rewards/margins": 25.984111785888672, "rewards/rejected": -17.57270622253418, "step": 119 }, { "epoch": 1.943089430894309, "grad_norm": 0.002434302121400833, "learning_rate": 0.00011062934856473655, "logits/chosen": 0.24992449581623077, "logits/rejected": 0.18503600358963013, "logps/chosen": -811.4505615234375, "logps/rejected": -1088.271240234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 2.826874017715454, "rewards/margins": 32.1160888671875, "rewards/rejected": -29.289215087890625, "step": 120 }, { "epoch": 1.959349593495935, "grad_norm": 3.818647797970698e-08, "learning_rate": 0.00010930480397630145, "logits/chosen": 1.889555811882019, "logits/rejected": 2.055070400238037, "logps/chosen": -1008.6806640625, "logps/rejected": -997.8306884765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.727387428283691, "rewards/margins": 32.15311813354492, "rewards/rejected": -27.42573356628418, "step": 121 }, { "epoch": 1.975609756097561, "grad_norm": 4.203374359690315e-08, "learning_rate": 0.00010797861055530831, "logits/chosen": 0.33176711201667786, "logits/rejected": 0.2883341312408447, "logps/chosen": -764.9257202148438, "logps/rejected": -1157.33642578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.931965708732605, "rewards/margins": 29.445417404174805, "rewards/rejected": -30.377384185791016, "step": 122 }, { "epoch": 1.9918699186991868, "grad_norm": 0.0003661888767965138, "learning_rate": 0.00010665100330626625, "logits/chosen": 2.023690700531006, "logits/rejected": 2.543468475341797, "logps/chosen": -1341.046875, "logps/rejected": -852.0292358398438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 13.60735034942627, "rewards/margins": 33.2912483215332, "rewards/rejected": -19.68389892578125, "step": 123 }, { "epoch": 2.0, "grad_norm": 1.4813576854066923e-07, "learning_rate": 0.00010532221748421787, "logits/chosen": 2.4457969665527344, "logits/rejected": 2.6656110286712646, "logps/chosen": -1094.49560546875, "logps/rejected": -546.4738159179688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 12.480463027954102, "rewards/margins": 21.069480895996094, "rewards/rejected": -8.589018821716309, "step": 124 } ], "logging_steps": 1, "max_steps": 246, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 62, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }