{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9994111874386653, "eval_steps": 100, "global_step": 1273, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007850834151128558, "grad_norm": 8.785704612731934, "learning_rate": 3.90625e-08, "logits/chosen": -2.957148313522339, "logits/rejected": -2.900550365447998, "logps/chosen": -466.9051818847656, "logps/rejected": -502.35345458984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.007850834151128557, "grad_norm": 6.684790134429932, "learning_rate": 3.90625e-07, "logits/chosen": -3.0609865188598633, "logits/rejected": -2.9977359771728516, "logps/chosen": -308.3629455566406, "logps/rejected": -261.8404235839844, "loss": 0.6931, "rewards/accuracies": 0.3888888955116272, "rewards/chosen": 0.0009382184944115579, "rewards/margins": 5.389652869780548e-05, "rewards/rejected": 0.0008843218092806637, "step": 10 }, { "epoch": 0.015701668302257114, "grad_norm": 6.8678717613220215, "learning_rate": 7.8125e-07, "logits/chosen": -3.1096813678741455, "logits/rejected": -3.0798025131225586, "logps/chosen": -292.4770202636719, "logps/rejected": -254.6656494140625, "loss": 0.6924, "rewards/accuracies": 0.59375, "rewards/chosen": 0.007172191981226206, "rewards/margins": 0.001467574737034738, "rewards/rejected": 0.005704617593437433, "step": 20 }, { "epoch": 0.023552502453385672, "grad_norm": 6.647519588470459, "learning_rate": 1.1718750000000001e-06, "logits/chosen": -3.1090664863586426, "logits/rejected": -3.084791660308838, "logps/chosen": -265.77301025390625, "logps/rejected": -266.1317138671875, "loss": 0.691, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.01251781266182661, "rewards/margins": 0.0033119157887995243, "rewards/rejected": 0.009205898270010948, "step": 30 }, { "epoch": 0.03140333660451423, "grad_norm": 6.744068145751953, "learning_rate": 1.5625e-06, "logits/chosen": -3.081329107284546, "logits/rejected": -3.1170654296875, "logps/chosen": -297.75823974609375, "logps/rejected": -270.17462158203125, "loss": 0.6871, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.011317016556859016, "rewards/margins": 0.013026026077568531, "rewards/rejected": -0.00170900858938694, "step": 40 }, { "epoch": 0.03925417075564279, "grad_norm": 7.407871246337891, "learning_rate": 1.953125e-06, "logits/chosen": -3.1149449348449707, "logits/rejected": -3.066861629486084, "logps/chosen": -306.6522216796875, "logps/rejected": -255.38491821289062, "loss": 0.6829, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.015373636968433857, "rewards/margins": 0.022752460092306137, "rewards/rejected": -0.00737882312387228, "step": 50 }, { "epoch": 0.047105004906771344, "grad_norm": 7.203430652618408, "learning_rate": 2.3437500000000002e-06, "logits/chosen": -3.008836269378662, "logits/rejected": -3.026230573654175, "logps/chosen": -269.2247009277344, "logps/rejected": -276.4228820800781, "loss": 0.6811, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.006092413794249296, "rewards/margins": 0.021052923053503036, "rewards/rejected": -0.014960509724915028, "step": 60 }, { "epoch": 0.0549558390578999, "grad_norm": 6.94841194152832, "learning_rate": 2.7343750000000004e-06, "logits/chosen": -3.0492148399353027, "logits/rejected": -3.0904951095581055, "logps/chosen": -267.11553955078125, "logps/rejected": -257.773681640625, "loss": 0.668, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.009770817123353481, "rewards/margins": 0.04689077287912369, "rewards/rejected": -0.056661587208509445, "step": 70 }, { "epoch": 0.06280667320902845, "grad_norm": 9.025800704956055, "learning_rate": 3.125e-06, "logits/chosen": -3.1011645793914795, "logits/rejected": -3.1380457878112793, "logps/chosen": -308.1461486816406, "logps/rejected": -312.40869140625, "loss": 0.6652, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.09069164097309113, "rewards/margins": 0.052069295197725296, "rewards/rejected": -0.14276091754436493, "step": 80 }, { "epoch": 0.07065750736015702, "grad_norm": 10.733589172363281, "learning_rate": 3.5156250000000003e-06, "logits/chosen": -3.0480704307556152, "logits/rejected": -3.068376064300537, "logps/chosen": -309.16766357421875, "logps/rejected": -306.99627685546875, "loss": 0.6473, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.12756133079528809, "rewards/margins": 0.07411627471446991, "rewards/rejected": -0.20167763531208038, "step": 90 }, { "epoch": 0.07850834151128558, "grad_norm": 13.016451835632324, "learning_rate": 3.90625e-06, "logits/chosen": -2.9545440673828125, "logits/rejected": -2.923600435256958, "logps/chosen": -302.78509521484375, "logps/rejected": -304.83795166015625, "loss": 0.6438, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.2593812942504883, "rewards/margins": 0.14604052901268005, "rewards/rejected": -0.40542179346084595, "step": 100 }, { "epoch": 0.07850834151128558, "eval_logits/chosen": -3.014045238494873, "eval_logits/rejected": -3.0357654094696045, "eval_logps/chosen": -329.1207580566406, "eval_logps/rejected": -306.6942443847656, "eval_loss": 0.6424023509025574, "eval_rewards/accuracies": 0.6547619104385376, "eval_rewards/chosen": -0.40733060240745544, "eval_rewards/margins": 0.12344833463430405, "eval_rewards/rejected": -0.5307790040969849, "eval_runtime": 174.6916, "eval_samples_per_second": 11.449, "eval_steps_per_second": 0.481, "step": 100 }, { "epoch": 0.08635917566241413, "grad_norm": 14.229337692260742, "learning_rate": 4.296875e-06, "logits/chosen": -2.9414217472076416, "logits/rejected": -2.9711925983428955, "logps/chosen": -339.30914306640625, "logps/rejected": -324.44390869140625, "loss": 0.6339, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.3848220109939575, "rewards/margins": 0.16449818015098572, "rewards/rejected": -0.5493202209472656, "step": 110 }, { "epoch": 0.09421000981354269, "grad_norm": 20.014785766601562, "learning_rate": 4.6875000000000004e-06, "logits/chosen": -2.9944517612457275, "logits/rejected": -3.045173168182373, "logps/chosen": -391.199462890625, "logps/rejected": -376.3497009277344, "loss": 0.6211, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.5974650979042053, "rewards/margins": 0.2744296193122864, "rewards/rejected": -0.8718946576118469, "step": 120 }, { "epoch": 0.10206084396467124, "grad_norm": 12.38216781616211, "learning_rate": 4.999962359300416e-06, "logits/chosen": -2.9552016258239746, "logits/rejected": -2.9254870414733887, "logps/chosen": -402.8971252441406, "logps/rejected": -404.6396789550781, "loss": 0.6189, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.8715218305587769, "rewards/margins": 0.27080851793289185, "rewards/rejected": -1.1423304080963135, "step": 130 }, { "epoch": 0.1099116781157998, "grad_norm": 16.306636810302734, "learning_rate": 4.998645053824218e-06, "logits/chosen": -2.803802967071533, "logits/rejected": -2.8079888820648193, "logps/chosen": -379.4205017089844, "logps/rejected": -362.0006103515625, "loss": 0.6495, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.0367389917373657, "rewards/margins": 0.17243310809135437, "rewards/rejected": -1.2091721296310425, "step": 140 }, { "epoch": 0.11776251226692837, "grad_norm": 16.321983337402344, "learning_rate": 4.9954468466732145e-06, "logits/chosen": -2.8862144947052, "logits/rejected": -2.9243063926696777, "logps/chosen": -417.48272705078125, "logps/rejected": -420.72381591796875, "loss": 0.6151, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8957064747810364, "rewards/margins": 0.33516108989715576, "rewards/rejected": -1.2308675050735474, "step": 150 }, { "epoch": 0.1256133464180569, "grad_norm": 21.833602905273438, "learning_rate": 4.990370145357496e-06, "logits/chosen": -2.880340099334717, "logits/rejected": -2.8787879943847656, "logps/chosen": -374.5000305175781, "logps/rejected": -366.66619873046875, "loss": 0.6028, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.9459589719772339, "rewards/margins": 0.245649054646492, "rewards/rejected": -1.1916080713272095, "step": 160 }, { "epoch": 0.13346418056918546, "grad_norm": 22.123382568359375, "learning_rate": 4.983418771458684e-06, "logits/chosen": -2.9111855030059814, "logits/rejected": -2.8439784049987793, "logps/chosen": -384.2833251953125, "logps/rejected": -396.34326171875, "loss": 0.598, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.9746086001396179, "rewards/margins": 0.32514628767967224, "rewards/rejected": -1.2997548580169678, "step": 170 }, { "epoch": 0.14131501472031405, "grad_norm": 18.860288619995117, "learning_rate": 4.97459795775315e-06, "logits/chosen": -2.846890449523926, "logits/rejected": -2.8465495109558105, "logps/chosen": -371.8786315917969, "logps/rejected": -400.20501708984375, "loss": 0.5886, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.8063033819198608, "rewards/margins": 0.39446142315864563, "rewards/rejected": -1.200764775276184, "step": 180 }, { "epoch": 0.1491658488714426, "grad_norm": 17.285179138183594, "learning_rate": 4.963914344272961e-06, "logits/chosen": -2.9533636569976807, "logits/rejected": -2.9740447998046875, "logps/chosen": -379.0119934082031, "logps/rejected": -406.78936767578125, "loss": 0.6036, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.7768992781639099, "rewards/margins": 0.33595213294029236, "rewards/rejected": -1.1128513813018799, "step": 190 }, { "epoch": 0.15701668302257116, "grad_norm": 23.66827964782715, "learning_rate": 4.951375973307458e-06, "logits/chosen": -2.9775123596191406, "logits/rejected": -2.977674961090088, "logps/chosen": -389.47088623046875, "logps/rejected": -386.0644226074219, "loss": 0.5977, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8474963903427124, "rewards/margins": 0.37333354353904724, "rewards/rejected": -1.220829963684082, "step": 200 }, { "epoch": 0.15701668302257116, "eval_logits/chosen": -3.014695405960083, "eval_logits/rejected": -3.025944232940674, "eval_logps/chosen": -394.195068359375, "eval_logps/rejected": -395.111328125, "eval_loss": 0.5976593494415283, "eval_rewards/accuracies": 0.6666666865348816, "eval_rewards/chosen": -1.0580739974975586, "eval_rewards/margins": 0.35687559843063354, "eval_rewards/rejected": -1.414949655532837, "eval_runtime": 171.2039, "eval_samples_per_second": 11.682, "eval_steps_per_second": 0.491, "step": 200 }, { "epoch": 0.1648675171736997, "grad_norm": 21.703943252563477, "learning_rate": 4.93699228334928e-06, "logits/chosen": -3.014017343521118, "logits/rejected": -2.9310977458953857, "logps/chosen": -398.8971862792969, "logps/rejected": -423.201416015625, "loss": 0.5652, "rewards/accuracies": 0.75, "rewards/chosen": -1.1008360385894775, "rewards/margins": 0.518481433391571, "rewards/rejected": -1.6193174123764038, "step": 210 }, { "epoch": 0.17271835132482827, "grad_norm": 30.472820281982422, "learning_rate": 4.920774101989362e-06, "logits/chosen": -2.922285318374634, "logits/rejected": -2.8559417724609375, "logps/chosen": -394.4663391113281, "logps/rejected": -436.795166015625, "loss": 0.5943, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.094743013381958, "rewards/margins": 0.4840098023414612, "rewards/rejected": -1.578752875328064, "step": 220 }, { "epoch": 0.18056918547595682, "grad_norm": 19.1884708404541, "learning_rate": 4.902733637766261e-06, "logits/chosen": -2.8735547065734863, "logits/rejected": -2.8807244300842285, "logps/chosen": -358.8312072753906, "logps/rejected": -393.9620056152344, "loss": 0.5503, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.8047897219657898, "rewards/margins": 0.49235886335372925, "rewards/rejected": -1.297148585319519, "step": 230 }, { "epoch": 0.18842001962708538, "grad_norm": 25.822147369384766, "learning_rate": 4.882884470975954e-06, "logits/chosen": -2.733098030090332, "logits/rejected": -2.768909454345703, "logps/chosen": -396.49188232421875, "logps/rejected": -438.16455078125, "loss": 0.5768, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.028875708580017, "rewards/margins": 0.5807405710220337, "rewards/rejected": -1.6096162796020508, "step": 240 }, { "epoch": 0.19627085377821393, "grad_norm": 18.91808319091797, "learning_rate": 4.861241543449015e-06, "logits/chosen": -2.723087787628174, "logits/rejected": -2.6532533168792725, "logps/chosen": -397.3339538574219, "logps/rejected": -422.185302734375, "loss": 0.5773, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.094036340713501, "rewards/margins": 0.49098238348960876, "rewards/rejected": -1.5850186347961426, "step": 250 }, { "epoch": 0.2041216879293425, "grad_norm": 32.23611068725586, "learning_rate": 4.8378211473028755e-06, "logits/chosen": -2.828057289123535, "logits/rejected": -2.838313579559326, "logps/chosen": -390.9112243652344, "logps/rejected": -397.8682556152344, "loss": 0.573, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7329429984092712, "rewards/margins": 0.33645009994506836, "rewards/rejected": -1.0693930387496948, "step": 260 }, { "epoch": 0.21197252208047104, "grad_norm": 25.560338973999023, "learning_rate": 4.812640912677624e-06, "logits/chosen": -2.9140567779541016, "logits/rejected": -2.930488109588623, "logps/chosen": -346.61273193359375, "logps/rejected": -370.59771728515625, "loss": 0.5867, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6457637548446655, "rewards/margins": 0.3336387276649475, "rewards/rejected": -0.9794024229049683, "step": 270 }, { "epoch": 0.2198233562315996, "grad_norm": 21.532350540161133, "learning_rate": 4.785719794464596e-06, "logits/chosen": -2.7777903079986572, "logits/rejected": -2.7826027870178223, "logps/chosen": -360.8690490722656, "logps/rejected": -386.78753662109375, "loss": 0.5804, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.9706557393074036, "rewards/margins": 0.3798271715641022, "rewards/rejected": -1.3504829406738281, "step": 280 }, { "epoch": 0.22767419038272815, "grad_norm": 25.418230056762695, "learning_rate": 4.757078058037722e-06, "logits/chosen": -2.886289119720459, "logits/rejected": -2.813042402267456, "logps/chosen": -386.9649353027344, "logps/rejected": -436.1798400878906, "loss": 0.5789, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.1510156393051147, "rewards/margins": 0.4267166256904602, "rewards/rejected": -1.5777322053909302, "step": 290 }, { "epoch": 0.23552502453385674, "grad_norm": 26.089282989501953, "learning_rate": 4.72673726399839e-06, "logits/chosen": -2.6797690391540527, "logits/rejected": -2.7410836219787598, "logps/chosen": -347.84405517578125, "logps/rejected": -429.4021911621094, "loss": 0.5583, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.0482864379882812, "rewards/margins": 0.5523291826248169, "rewards/rejected": -1.6006155014038086, "step": 300 }, { "epoch": 0.23552502453385674, "eval_logits/chosen": -2.723576307296753, "eval_logits/rejected": -2.7299251556396484, "eval_logps/chosen": -384.55767822265625, "eval_logps/rejected": -390.4145812988281, "eval_loss": 0.5714064836502075, "eval_rewards/accuracies": 0.6711309552192688, "eval_rewards/chosen": -0.9617000818252563, "eval_rewards/margins": 0.40628206729888916, "eval_rewards/rejected": -1.3679821491241455, "eval_runtime": 171.2042, "eval_samples_per_second": 11.682, "eval_steps_per_second": 0.491, "step": 300 }, { "epoch": 0.2433758586849853, "grad_norm": 18.173839569091797, "learning_rate": 4.694720251945298e-06, "logits/chosen": -2.7685041427612305, "logits/rejected": -2.686394691467285, "logps/chosen": -381.9715881347656, "logps/rejected": -402.4884338378906, "loss": 0.5718, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9211880564689636, "rewards/margins": 0.3484678566455841, "rewards/rejected": -1.2696558237075806, "step": 310 }, { "epoch": 0.2512266928361138, "grad_norm": 20.20842933654785, "learning_rate": 4.661051123281528e-06, "logits/chosen": -2.535449504852295, "logits/rejected": -2.4344544410705566, "logps/chosen": -394.1264953613281, "logps/rejected": -438.5677185058594, "loss": 0.542, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.9613161087036133, "rewards/margins": 0.5449072122573853, "rewards/rejected": -1.5062233209609985, "step": 320 }, { "epoch": 0.2590775269872424, "grad_norm": 19.297094345092773, "learning_rate": 4.6257552230717536e-06, "logits/chosen": -2.40204119682312, "logits/rejected": -2.392609119415283, "logps/chosen": -459.760498046875, "logps/rejected": -468.84698486328125, "loss": 0.5285, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.178815245628357, "rewards/margins": 0.6503817439079285, "rewards/rejected": -1.8291969299316406, "step": 330 }, { "epoch": 0.26692836113837093, "grad_norm": 36.15755081176758, "learning_rate": 4.588859120963282e-06, "logits/chosen": -2.267246723175049, "logits/rejected": -2.1048290729522705, "logps/chosen": -382.4331359863281, "logps/rejected": -419.305908203125, "loss": 0.5572, "rewards/accuracies": 0.71875, "rewards/chosen": -1.2648835182189941, "rewards/margins": 0.6192021369934082, "rewards/rejected": -1.8840856552124023, "step": 340 }, { "epoch": 0.2747791952894995, "grad_norm": 15.106271743774414, "learning_rate": 4.5503905911852435e-06, "logits/chosen": -2.3543121814727783, "logits/rejected": -2.33532452583313, "logps/chosen": -404.1642761230469, "logps/rejected": -431.0093688964844, "loss": 0.5287, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0297491550445557, "rewards/margins": 0.6717931032180786, "rewards/rejected": -1.7015421390533447, "step": 350 }, { "epoch": 0.2826300294406281, "grad_norm": 20.0123348236084, "learning_rate": 4.510378591641036e-06, "logits/chosen": -2.2860474586486816, "logits/rejected": -2.3591558933258057, "logps/chosen": -394.79827880859375, "logps/rejected": -417.1219177246094, "loss": 0.5561, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.9084060788154602, "rewards/margins": 0.6691475510597229, "rewards/rejected": -1.5775535106658936, "step": 360 }, { "epoch": 0.2904808635917566, "grad_norm": 19.609752655029297, "learning_rate": 4.468853242109712e-06, "logits/chosen": -2.3907597064971924, "logits/rejected": -2.378951072692871, "logps/chosen": -362.96331787109375, "logps/rejected": -399.92401123046875, "loss": 0.5624, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.9224265217781067, "rewards/margins": 0.5320521593093872, "rewards/rejected": -1.4544788599014282, "step": 370 }, { "epoch": 0.2983316977428852, "grad_norm": 29.80910873413086, "learning_rate": 4.42584580157276e-06, "logits/chosen": -2.2916672229766846, "logits/rejected": -2.059715986251831, "logps/chosen": -365.691162109375, "logps/rejected": -418.39642333984375, "loss": 0.5196, "rewards/accuracies": 0.75, "rewards/chosen": -1.0353277921676636, "rewards/margins": 0.6206272840499878, "rewards/rejected": -1.6559550762176514, "step": 380 }, { "epoch": 0.30618253189401373, "grad_norm": 28.393800735473633, "learning_rate": 4.381388644683317e-06, "logits/chosen": -2.1753897666931152, "logits/rejected": -2.1332502365112305, "logps/chosen": -400.159423828125, "logps/rejected": -432.15777587890625, "loss": 0.5341, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.2140204906463623, "rewards/margins": 0.7265356779098511, "rewards/rejected": -1.9405561685562134, "step": 390 }, { "epoch": 0.3140333660451423, "grad_norm": 32.2076301574707, "learning_rate": 4.33551523739555e-06, "logits/chosen": -2.03031849861145, "logits/rejected": -2.0334537029266357, "logps/chosen": -369.33056640625, "logps/rejected": -458.2646484375, "loss": 0.5727, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2683178186416626, "rewards/margins": 0.6755903363227844, "rewards/rejected": -1.9439083337783813, "step": 400 }, { "epoch": 0.3140333660451423, "eval_logits/chosen": -2.2418928146362305, "eval_logits/rejected": -2.2240025997161865, "eval_logps/chosen": -389.8349609375, "eval_logps/rejected": -412.5816955566406, "eval_loss": 0.5430436730384827, "eval_rewards/accuracies": 0.6875, "eval_rewards/chosen": -1.0144727230072021, "eval_rewards/margins": 0.5751808285713196, "eval_rewards/rejected": -1.5896533727645874, "eval_runtime": 172.1182, "eval_samples_per_second": 11.62, "eval_steps_per_second": 0.488, "step": 400 }, { "epoch": 0.32188420019627084, "grad_norm": 22.76254653930664, "learning_rate": 4.288260111772535e-06, "logits/chosen": -2.275933027267456, "logits/rejected": -2.148829221725464, "logps/chosen": -390.9195861816406, "logps/rejected": -428.26226806640625, "loss": 0.5302, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.9567239880561829, "rewards/margins": 0.5892980694770813, "rewards/rejected": -1.5460221767425537, "step": 410 }, { "epoch": 0.3297350343473994, "grad_norm": 23.928640365600586, "learning_rate": 4.239658839991594e-06, "logits/chosen": -2.107412099838257, "logits/rejected": -2.1913232803344727, "logps/chosen": -405.66265869140625, "logps/rejected": -424.18377685546875, "loss": 0.5677, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.139583706855774, "rewards/margins": 0.5120341777801514, "rewards/rejected": -1.6516176462173462, "step": 420 }, { "epoch": 0.33758586849852795, "grad_norm": 21.068220138549805, "learning_rate": 4.189748007566686e-06, "logits/chosen": -2.05175518989563, "logits/rejected": -1.9536798000335693, "logps/chosen": -372.6251525878906, "logps/rejected": -439.80810546875, "loss": 0.5315, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.315185785293579, "rewards/margins": 0.6874850988388062, "rewards/rejected": -2.0026707649230957, "step": 430 }, { "epoch": 0.34543670264965654, "grad_norm": 30.113636016845703, "learning_rate": 4.138565185807972e-06, "logits/chosen": -2.102708339691162, "logits/rejected": -2.0921308994293213, "logps/chosen": -431.109375, "logps/rejected": -471.14532470703125, "loss": 0.5454, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.4245600700378418, "rewards/margins": 0.6852970719337463, "rewards/rejected": -2.1098570823669434, "step": 440 }, { "epoch": 0.35328753680078506, "grad_norm": 43.14057540893555, "learning_rate": 4.086148903539311e-06, "logits/chosen": -1.9374672174453735, "logits/rejected": -1.9135332107543945, "logps/chosen": -499.2344665527344, "logps/rejected": -536.7950439453125, "loss": 0.5821, "rewards/accuracies": 0.71875, "rewards/chosen": -1.975155234336853, "rewards/margins": 0.659866988658905, "rewards/rejected": -2.6350224018096924, "step": 450 }, { "epoch": 0.36113837095191365, "grad_norm": 38.40256881713867, "learning_rate": 4.032538618094972e-06, "logits/chosen": -2.0139780044555664, "logits/rejected": -1.9372785091400146, "logps/chosen": -458.24664306640625, "logps/rejected": -535.4005126953125, "loss": 0.4926, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.7541725635528564, "rewards/margins": 0.8262192010879517, "rewards/rejected": -2.5803914070129395, "step": 460 }, { "epoch": 0.3689892051030422, "grad_norm": 19.52273941040039, "learning_rate": 3.977774685617386e-06, "logits/chosen": -2.1808319091796875, "logits/rejected": -2.155151844024658, "logps/chosen": -449.31927490234375, "logps/rejected": -500.30242919921875, "loss": 0.4962, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5000309944152832, "rewards/margins": 0.7721298336982727, "rewards/rejected": -2.272160768508911, "step": 470 }, { "epoch": 0.37684003925417076, "grad_norm": 40.90033721923828, "learning_rate": 3.92189833067831e-06, "logits/chosen": -1.9630296230316162, "logits/rejected": -1.909574270248413, "logps/chosen": -471.18243408203125, "logps/rejected": -558.3318481445312, "loss": 0.5098, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.8825572729110718, "rewards/margins": 0.9354137182235718, "rewards/rejected": -2.8179707527160645, "step": 480 }, { "epoch": 0.38469087340529934, "grad_norm": 20.045015335083008, "learning_rate": 3.864951615246261e-06, "logits/chosen": -1.8974872827529907, "logits/rejected": -1.8500369787216187, "logps/chosen": -516.6534423828125, "logps/rejected": -588.5135498046875, "loss": 0.5681, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.3989312648773193, "rewards/margins": 0.9191252589225769, "rewards/rejected": -3.3180572986602783, "step": 490 }, { "epoch": 0.39254170755642787, "grad_norm": 24.87650489807129, "learning_rate": 3.806977407023581e-06, "logits/chosen": -2.218294143676758, "logits/rejected": -2.087562084197998, "logps/chosen": -463.06121826171875, "logps/rejected": -502.892333984375, "loss": 0.5178, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.609442949295044, "rewards/margins": 0.832965075969696, "rewards/rejected": -2.4424080848693848, "step": 500 }, { "epoch": 0.39254170755642787, "eval_logits/chosen": -2.3772380352020264, "eval_logits/rejected": -2.3567545413970947, "eval_logps/chosen": -410.2373046875, "eval_logps/rejected": -440.0260925292969, "eval_loss": 0.5367991328239441, "eval_rewards/accuracies": 0.6815476417541504, "eval_rewards/chosen": -1.2184962034225464, "eval_rewards/margins": 0.6456010937690735, "eval_rewards/rejected": -1.864097237586975, "eval_runtime": 164.1, "eval_samples_per_second": 12.188, "eval_steps_per_second": 0.512, "step": 500 }, { "epoch": 0.40039254170755645, "grad_norm": 17.09919548034668, "learning_rate": 3.7480193471769815e-06, "logits/chosen": -2.3634283542633057, "logits/rejected": -2.379462242126465, "logps/chosen": -422.3035583496094, "logps/rejected": -452.18951416015625, "loss": 0.559, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.2336976528167725, "rewards/margins": 0.5405682325363159, "rewards/rejected": -1.7742656469345093, "step": 510 }, { "epoch": 0.408243375858685, "grad_norm": 19.045442581176758, "learning_rate": 3.6881218174858354e-06, "logits/chosen": -2.298239231109619, "logits/rejected": -2.1397132873535156, "logps/chosen": -401.8265075683594, "logps/rejected": -456.30535888671875, "loss": 0.5242, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.240301251411438, "rewards/margins": 0.7864383459091187, "rewards/rejected": -2.0267395973205566, "step": 520 }, { "epoch": 0.41609421000981356, "grad_norm": 26.22776985168457, "learning_rate": 3.627329906932964e-06, "logits/chosen": -2.407930374145508, "logits/rejected": -2.3968963623046875, "logps/chosen": -411.4175720214844, "logps/rejected": -485.57379150390625, "loss": 0.5329, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.2162271738052368, "rewards/margins": 0.7498941421508789, "rewards/rejected": -1.9661214351654053, "step": 530 }, { "epoch": 0.4239450441609421, "grad_norm": 33.6424674987793, "learning_rate": 3.5656893777630686e-06, "logits/chosen": -2.208657741546631, "logits/rejected": -2.1544740200042725, "logps/chosen": -431.5694274902344, "logps/rejected": -502.0116271972656, "loss": 0.5605, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4970638751983643, "rewards/margins": 0.8648282885551453, "rewards/rejected": -2.361891984939575, "step": 540 }, { "epoch": 0.43179587831207067, "grad_norm": 32.2934684753418, "learning_rate": 3.503246631034345e-06, "logits/chosen": -2.119847297668457, "logits/rejected": -2.133668899536133, "logps/chosen": -413.0341796875, "logps/rejected": -459.9708557128906, "loss": 0.5818, "rewards/accuracies": 0.65625, "rewards/chosen": -1.7010523080825806, "rewards/margins": 0.699297308921814, "rewards/rejected": -2.4003493785858154, "step": 550 }, { "epoch": 0.4396467124631992, "grad_norm": 20.624055862426758, "learning_rate": 3.440048671689219e-06, "logits/chosen": -2.2201478481292725, "logits/rejected": -2.28852915763855, "logps/chosen": -394.3067321777344, "logps/rejected": -428.1966857910156, "loss": 0.5362, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.156294822692871, "rewards/margins": 0.6579602360725403, "rewards/rejected": -1.8142551183700562, "step": 560 }, { "epoch": 0.4474975466143278, "grad_norm": 20.51217269897461, "learning_rate": 3.3761430731705056e-06, "logits/chosen": -2.342036485671997, "logits/rejected": -2.3035025596618652, "logps/chosen": -397.56768798828125, "logps/rejected": -449.5596618652344, "loss": 0.526, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.1822260618209839, "rewards/margins": 0.667534589767456, "rewards/rejected": -1.84976065158844, "step": 570 }, { "epoch": 0.4553483807654563, "grad_norm": 23.517745971679688, "learning_rate": 3.311577941609604e-06, "logits/chosen": -2.2895524501800537, "logits/rejected": -2.30122447013855, "logps/chosen": -426.5897521972656, "logps/rejected": -487.896484375, "loss": 0.5231, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.1464052200317383, "rewards/margins": 0.7059827446937561, "rewards/rejected": -1.8523880243301392, "step": 580 }, { "epoch": 0.4631992149165849, "grad_norm": 28.418771743774414, "learning_rate": 3.2464018796137157e-06, "logits/chosen": -2.184406042098999, "logits/rejected": -2.1148581504821777, "logps/chosen": -412.546875, "logps/rejected": -503.6065368652344, "loss": 0.4968, "rewards/accuracies": 0.78125, "rewards/chosen": -1.3471360206604004, "rewards/margins": 0.9848724603652954, "rewards/rejected": -2.3320083618164062, "step": 590 }, { "epoch": 0.47105004906771347, "grad_norm": 30.563884735107422, "learning_rate": 3.1806639496793245e-06, "logits/chosen": -2.0617759227752686, "logits/rejected": -1.9668960571289062, "logps/chosen": -447.58984375, "logps/rejected": -517.9015502929688, "loss": 0.5238, "rewards/accuracies": 0.75, "rewards/chosen": -1.5449590682983398, "rewards/margins": 0.9296010732650757, "rewards/rejected": -2.474560260772705, "step": 600 }, { "epoch": 0.47105004906771347, "eval_logits/chosen": -2.120598793029785, "eval_logits/rejected": -2.074557065963745, "eval_logps/chosen": -434.86480712890625, "eval_logps/rejected": -479.731201171875, "eval_loss": 0.5332732200622559, "eval_rewards/accuracies": 0.6875, "eval_rewards/chosen": -1.4647715091705322, "eval_rewards/margins": 0.7963771820068359, "eval_rewards/rejected": -2.261148691177368, "eval_runtime": 168.5149, "eval_samples_per_second": 11.868, "eval_steps_per_second": 0.498, "step": 600 }, { "epoch": 0.478900883218842, "grad_norm": 26.9317626953125, "learning_rate": 3.114413637259484e-06, "logits/chosen": -2.065842628479004, "logits/rejected": -1.9007959365844727, "logps/chosen": -437.0047912597656, "logps/rejected": -493.7703552246094, "loss": 0.5562, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.6189963817596436, "rewards/margins": 0.897415816783905, "rewards/rejected": -2.5164122581481934, "step": 610 }, { "epoch": 0.4867517173699706, "grad_norm": 30.88678741455078, "learning_rate": 3.0477008135127247e-06, "logits/chosen": -2.133183002471924, "logits/rejected": -2.0338778495788574, "logps/chosen": -457.9064025878906, "logps/rejected": -531.5266723632812, "loss": 0.5087, "rewards/accuracies": 0.6875, "rewards/chosen": -1.429966688156128, "rewards/margins": 0.9474767446517944, "rewards/rejected": -2.377443552017212, "step": 620 }, { "epoch": 0.4946025515210991, "grad_norm": 34.801639556884766, "learning_rate": 2.980575697761603e-06, "logits/chosen": -2.0099399089813232, "logits/rejected": -1.8623266220092773, "logps/chosen": -441.07757568359375, "logps/rejected": -508.0874938964844, "loss": 0.5061, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.6641613245010376, "rewards/margins": 1.0001566410064697, "rewards/rejected": -2.664318084716797, "step": 630 }, { "epoch": 0.5024533856722276, "grad_norm": 30.205976486206055, "learning_rate": 2.9130888196891755e-06, "logits/chosen": -2.0108351707458496, "logits/rejected": -1.890523910522461, "logps/chosen": -568.1267700195312, "logps/rejected": -614.1755981445312, "loss": 0.5158, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.448976993560791, "rewards/margins": 0.9926842451095581, "rewards/rejected": -3.4416611194610596, "step": 640 }, { "epoch": 0.5103042198233563, "grad_norm": 27.39600372314453, "learning_rate": 2.845290981301834e-06, "logits/chosen": -1.7695420980453491, "logits/rejected": -1.7348365783691406, "logps/chosen": -495.6388244628906, "logps/rejected": -598.6192016601562, "loss": 0.5113, "rewards/accuracies": 0.71875, "rewards/chosen": -2.325334072113037, "rewards/margins": 1.1762292385101318, "rewards/rejected": -3.501563310623169, "step": 650 }, { "epoch": 0.5181550539744848, "grad_norm": 28.21457862854004, "learning_rate": 2.7772332186871464e-06, "logits/chosen": -1.947697401046753, "logits/rejected": -1.838045358657837, "logps/chosen": -504.62628173828125, "logps/rejected": -588.11669921875, "loss": 0.5176, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.0726189613342285, "rewards/margins": 0.9631049036979675, "rewards/rejected": -3.035723924636841, "step": 660 }, { "epoch": 0.5260058881256133, "grad_norm": 29.053319931030273, "learning_rate": 2.708966763595493e-06, "logits/chosen": -1.9613069295883179, "logits/rejected": -1.8020261526107788, "logps/chosen": -448.95977783203125, "logps/rejected": -519.1914672851562, "loss": 0.5175, "rewards/accuracies": 0.75, "rewards/chosen": -1.724962830543518, "rewards/margins": 1.0086156129837036, "rewards/rejected": -2.7335782051086426, "step": 670 }, { "epoch": 0.5338567222767419, "grad_norm": 34.93812561035156, "learning_rate": 2.640543004874409e-06, "logits/chosen": -2.0338661670684814, "logits/rejected": -1.964261770248413, "logps/chosen": -492.92205810546875, "logps/rejected": -533.3572998046875, "loss": 0.5076, "rewards/accuracies": 0.75, "rewards/chosen": -1.7126522064208984, "rewards/margins": 0.9582921862602234, "rewards/rejected": -2.6709446907043457, "step": 680 }, { "epoch": 0.5417075564278705, "grad_norm": 23.211416244506836, "learning_rate": 2.572013449784671e-06, "logits/chosen": -1.9940426349639893, "logits/rejected": -1.881670594215393, "logps/chosen": -523.5638427734375, "logps/rejected": -588.251220703125, "loss": 0.5368, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.3426578044891357, "rewards/margins": 1.0034016370773315, "rewards/rejected": -3.3460593223571777, "step": 690 }, { "epoch": 0.549558390578999, "grad_norm": 31.131181716918945, "learning_rate": 2.503429685227245e-06, "logits/chosen": -1.831365942955017, "logits/rejected": -1.7525676488876343, "logps/chosen": -545.9859619140625, "logps/rejected": -647.7813720703125, "loss": 0.5173, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.500175714492798, "rewards/margins": 1.1063227653503418, "rewards/rejected": -3.6064987182617188, "step": 700 }, { "epoch": 0.549558390578999, "eval_logits/chosen": -2.004182815551758, "eval_logits/rejected": -1.9400309324264526, "eval_logps/chosen": -563.7943725585938, "eval_logps/rejected": -608.2110595703125, "eval_loss": 0.5244275331497192, "eval_rewards/accuracies": 0.7038690447807312, "eval_rewards/chosen": -2.754066228866577, "eval_rewards/margins": 0.7918809056282043, "eval_rewards/rejected": -3.5459470748901367, "eval_runtime": 179.3578, "eval_samples_per_second": 11.151, "eval_steps_per_second": 0.468, "step": 700 }, { "epoch": 0.5574092247301276, "grad_norm": 37.639991760253906, "learning_rate": 2.434843338910286e-06, "logits/chosen": -1.9917552471160889, "logits/rejected": -1.9698021411895752, "logps/chosen": -578.1214599609375, "logps/rejected": -614.1402587890625, "loss": 0.5478, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.914405584335327, "rewards/margins": 0.6746307015419006, "rewards/rejected": -3.589036464691162, "step": 710 }, { "epoch": 0.5652600588812562, "grad_norm": 25.04204750061035, "learning_rate": 2.3663060404854155e-06, "logits/chosen": -1.9311301708221436, "logits/rejected": -1.946319818496704, "logps/chosen": -533.4403076171875, "logps/rejected": -599.1284790039062, "loss": 0.5323, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.5715718269348145, "rewards/margins": 0.9469249844551086, "rewards/rejected": -3.5184967517852783, "step": 720 }, { "epoch": 0.5731108930323847, "grad_norm": 30.593637466430664, "learning_rate": 2.2978693826825406e-06, "logits/chosen": -1.8591407537460327, "logits/rejected": -1.9342968463897705, "logps/chosen": -519.0078125, "logps/rejected": -567.246826171875, "loss": 0.5521, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.448857545852661, "rewards/margins": 0.8260825276374817, "rewards/rejected": -3.274940013885498, "step": 730 }, { "epoch": 0.5809617271835132, "grad_norm": 32.455841064453125, "learning_rate": 2.2295848824724612e-06, "logits/chosen": -2.01774263381958, "logits/rejected": -1.9122161865234375, "logps/chosen": -491.2975158691406, "logps/rejected": -555.1488037109375, "loss": 0.5371, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.1018691062927246, "rewards/margins": 0.892257571220398, "rewards/rejected": -2.994126796722412, "step": 740 }, { "epoch": 0.5888125613346418, "grad_norm": 19.341310501098633, "learning_rate": 2.1615039422865136e-06, "logits/chosen": -1.8771547079086304, "logits/rejected": -1.815799355506897, "logps/chosen": -499.349609375, "logps/rejected": -598.83935546875, "loss": 0.4899, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.9459705352783203, "rewards/margins": 1.2197866439819336, "rewards/rejected": -3.165757179260254, "step": 750 }, { "epoch": 0.5966633954857704, "grad_norm": 55.24733352661133, "learning_rate": 2.0936778113224253e-06, "logits/chosen": -1.9215799570083618, "logits/rejected": -1.8155832290649414, "logps/chosen": -542.361328125, "logps/rejected": -551.7185668945312, "loss": 0.5494, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.0699851512908936, "rewards/margins": 0.8438289761543274, "rewards/rejected": -2.913814067840576, "step": 760 }, { "epoch": 0.6045142296368989, "grad_norm": 37.531490325927734, "learning_rate": 2.0261575469655304e-06, "logits/chosen": -1.9638067483901978, "logits/rejected": -1.8803679943084717, "logps/chosen": -466.53143310546875, "logps/rejected": -552.6204833984375, "loss": 0.5412, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.0402417182922363, "rewards/margins": 1.0711690187454224, "rewards/rejected": -3.1114110946655273, "step": 770 }, { "epoch": 0.6123650637880275, "grad_norm": 22.25844383239746, "learning_rate": 1.9589939763543693e-06, "logits/chosen": -1.8626676797866821, "logits/rejected": -1.8624019622802734, "logps/chosen": -464.10333251953125, "logps/rejected": -532.6005249023438, "loss": 0.5502, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.867743730545044, "rewards/margins": 0.8303905725479126, "rewards/rejected": -2.698134183883667, "step": 780 }, { "epoch": 0.620215897939156, "grad_norm": 28.578536987304688, "learning_rate": 1.8922376581196107e-06, "logits/chosen": -2.015662670135498, "logits/rejected": -1.9723193645477295, "logps/chosen": -475.9444885253906, "logps/rejected": -536.0194091796875, "loss": 0.4799, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.8610671758651733, "rewards/margins": 0.9107308387756348, "rewards/rejected": -2.7717981338500977, "step": 790 }, { "epoch": 0.6280667320902846, "grad_norm": 21.485143661499023, "learning_rate": 1.8259388443250993e-06, "logits/chosen": -2.004772663116455, "logits/rejected": -1.8513364791870117, "logps/chosen": -469.4261779785156, "logps/rejected": -537.4896240234375, "loss": 0.5081, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.004453659057617, "rewards/margins": 0.9301109313964844, "rewards/rejected": -2.9345641136169434, "step": 800 }, { "epoch": 0.6280667320902846, "eval_logits/chosen": -1.9647265672683716, "eval_logits/rejected": -1.909649133682251, "eval_logps/chosen": -498.1996765136719, "eval_logps/rejected": -547.5287475585938, "eval_loss": 0.517790675163269, "eval_rewards/accuracies": 0.7008928656578064, "eval_rewards/chosen": -2.0981194972991943, "eval_rewards/margins": 0.8410041332244873, "eval_rewards/rejected": -2.9391238689422607, "eval_runtime": 177.4176, "eval_samples_per_second": 11.273, "eval_steps_per_second": 0.473, "step": 800 }, { "epoch": 0.6359175662414132, "grad_norm": 43.05495071411133, "learning_rate": 1.760147442639679e-06, "logits/chosen": -1.7117631435394287, "logits/rejected": -1.8081023693084717, "logps/chosen": -471.0235290527344, "logps/rejected": -578.5651245117188, "loss": 0.5044, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.020402193069458, "rewards/margins": 1.2799599170684814, "rewards/rejected": -3.3003621101379395, "step": 810 }, { "epoch": 0.6437684003925417, "grad_norm": 25.53011131286621, "learning_rate": 1.6949129787682628e-06, "logits/chosen": -1.8636391162872314, "logits/rejected": -1.7885582447052002, "logps/chosen": -535.2430419921875, "logps/rejected": -592.5420532226562, "loss": 0.5071, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.181544542312622, "rewards/margins": 1.0555063486099243, "rewards/rejected": -3.2370505332946777, "step": 820 }, { "epoch": 0.6516192345436702, "grad_norm": 32.84662628173828, "learning_rate": 1.6302845591704348e-06, "logits/chosen": -1.7528541088104248, "logits/rejected": -1.919858694076538, "logps/chosen": -471.3095703125, "logps/rejected": -554.0218505859375, "loss": 0.5015, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.969342589378357, "rewards/margins": 0.9811599850654602, "rewards/rejected": -2.950502872467041, "step": 830 }, { "epoch": 0.6594700686947988, "grad_norm": 37.13783264160156, "learning_rate": 1.5663108340946465e-06, "logits/chosen": -2.004257917404175, "logits/rejected": -1.7805702686309814, "logps/chosen": -476.3814392089844, "logps/rejected": -552.9000244140625, "loss": 0.496, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.8513377904891968, "rewards/margins": 0.9407541155815125, "rewards/rejected": -2.7920918464660645, "step": 840 }, { "epoch": 0.6673209028459274, "grad_norm": 66.06834411621094, "learning_rate": 1.5030399609558364e-06, "logits/chosen": -1.9352130889892578, "logits/rejected": -1.8171558380126953, "logps/chosen": -489.35052490234375, "logps/rejected": -586.8991088867188, "loss": 0.485, "rewards/accuracies": 0.75, "rewards/chosen": -2.1446428298950195, "rewards/margins": 1.0632911920547485, "rewards/rejected": -3.2079339027404785, "step": 850 }, { "epoch": 0.6751717369970559, "grad_norm": 32.76154708862305, "learning_rate": 1.4405195680840357e-06, "logits/chosen": -1.8590924739837646, "logits/rejected": -1.8191407918930054, "logps/chosen": -515.1978759765625, "logps/rejected": -582.213623046875, "loss": 0.5305, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.157810926437378, "rewards/margins": 0.9910067319869995, "rewards/rejected": -3.148818016052246, "step": 860 }, { "epoch": 0.6830225711481845, "grad_norm": 32.92315673828125, "learning_rate": 1.378796718871252e-06, "logits/chosen": -1.9760971069335938, "logits/rejected": -1.8940002918243408, "logps/chosen": -500.63360595703125, "logps/rejected": -580.5349731445312, "loss": 0.5018, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.069706916809082, "rewards/margins": 1.104552984237671, "rewards/rejected": -3.174259662628174, "step": 870 }, { "epoch": 0.6908734052993131, "grad_norm": 27.977630615234375, "learning_rate": 1.3179178763436302e-06, "logits/chosen": -1.713399887084961, "logits/rejected": -1.5991706848144531, "logps/chosen": -485.16693115234375, "logps/rejected": -589.6981201171875, "loss": 0.5245, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.5337507724761963, "rewards/margins": 0.9967278242111206, "rewards/rejected": -3.5304782390594482, "step": 880 }, { "epoch": 0.6987242394504416, "grad_norm": 46.519187927246094, "learning_rate": 1.2579288681855364e-06, "logits/chosen": -1.8697153329849243, "logits/rejected": -1.7676079273223877, "logps/chosen": -555.0260620117188, "logps/rejected": -671.7311401367188, "loss": 0.4779, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.7753043174743652, "rewards/margins": 1.084364414215088, "rewards/rejected": -3.859668731689453, "step": 890 }, { "epoch": 0.7065750736015701, "grad_norm": 31.31684684753418, "learning_rate": 1.1988748522419163e-06, "logits/chosen": -1.9314721822738647, "logits/rejected": -1.8384710550308228, "logps/chosen": -595.1455078125, "logps/rejected": -668.7490234375, "loss": 0.5197, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.979123592376709, "rewards/margins": 0.9926818609237671, "rewards/rejected": -3.9718050956726074, "step": 900 }, { "epoch": 0.7065750736015701, "eval_logits/chosen": -1.8656275272369385, "eval_logits/rejected": -1.7931705713272095, "eval_logps/chosen": -577.5418701171875, "eval_logps/rejected": -637.5369873046875, "eval_loss": 0.5191683173179626, "eval_rewards/accuracies": 0.7008928656578064, "eval_rewards/chosen": -2.891542434692383, "eval_rewards/margins": 0.9476642608642578, "eval_rewards/rejected": -3.8392069339752197, "eval_runtime": 255.7835, "eval_samples_per_second": 7.819, "eval_steps_per_second": 0.328, "step": 900 }, { "epoch": 0.7144259077526988, "grad_norm": 60.76858139038086, "learning_rate": 1.1408002825248842e-06, "logits/chosen": -1.8335750102996826, "logits/rejected": -1.7328205108642578, "logps/chosen": -567.2271728515625, "logps/rejected": -646.0481567382812, "loss": 0.5185, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.787205219268799, "rewards/margins": 1.058345079421997, "rewards/rejected": -3.845550537109375, "step": 910 }, { "epoch": 0.7222767419038273, "grad_norm": 42.74496078491211, "learning_rate": 1.0837488757501369e-06, "logits/chosen": -1.7031282186508179, "logits/rejected": -1.6774184703826904, "logps/chosen": -532.3548583984375, "logps/rejected": -636.7594604492188, "loss": 0.4887, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.610095262527466, "rewards/margins": 1.1858711242675781, "rewards/rejected": -3.795966386795044, "step": 920 }, { "epoch": 0.7301275760549558, "grad_norm": 32.94953155517578, "learning_rate": 1.027763578428379e-06, "logits/chosen": -1.7176014184951782, "logits/rejected": -1.7608709335327148, "logps/chosen": -563.7265625, "logps/rejected": -646.8751220703125, "loss": 0.4836, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.821300983428955, "rewards/margins": 1.0197052955627441, "rewards/rejected": -3.8410065174102783, "step": 930 }, { "epoch": 0.7379784102060843, "grad_norm": 74.49922943115234, "learning_rate": 9.728865345365379e-07, "logits/chosen": -1.7150166034698486, "logits/rejected": -1.5209593772888184, "logps/chosen": -534.5591430664062, "logps/rejected": -621.5565185546875, "loss": 0.5418, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.711378812789917, "rewards/margins": 1.1061863899230957, "rewards/rejected": -3.8175652027130127, "step": 940 }, { "epoch": 0.745829244357213, "grad_norm": 27.46148681640625, "learning_rate": 9.191590537930975e-07, "logits/chosen": -1.7130823135375977, "logits/rejected": -1.638779878616333, "logps/chosen": -529.4462280273438, "logps/rejected": -603.8697509765625, "loss": 0.536, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.627434015274048, "rewards/margins": 1.0126179456710815, "rewards/rejected": -3.6400516033172607, "step": 950 }, { "epoch": 0.7536800785083415, "grad_norm": 21.87665367126465, "learning_rate": 8.666215805614373e-07, "logits/chosen": -1.7968714237213135, "logits/rejected": -1.8486363887786865, "logps/chosen": -504.91571044921875, "logps/rejected": -589.1393432617188, "loss": 0.5057, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.2802155017852783, "rewards/margins": 1.0134499073028564, "rewards/rejected": -3.2936654090881348, "step": 960 }, { "epoch": 0.76153091265947, "grad_norm": 29.431264877319336, "learning_rate": 8.153136634045844e-07, "logits/chosen": -1.9010169506072998, "logits/rejected": -1.6634715795516968, "logps/chosen": -493.634765625, "logps/rejected": -557.65380859375, "loss": 0.4996, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.152984619140625, "rewards/margins": 1.0319383144378662, "rewards/rejected": -3.184922933578491, "step": 970 }, { "epoch": 0.7693817468105987, "grad_norm": 41.45183181762695, "learning_rate": 7.652739253142915e-07, "logits/chosen": -1.9328157901763916, "logits/rejected": -1.7516534328460693, "logps/chosen": -538.4470825195312, "logps/rejected": -577.069580078125, "loss": 0.5214, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.125819683074951, "rewards/margins": 1.0004959106445312, "rewards/rejected": -3.1263155937194824, "step": 980 }, { "epoch": 0.7772325809617272, "grad_norm": 21.71674346923828, "learning_rate": 7.165400346368648e-07, "logits/chosen": -1.9481573104858398, "logits/rejected": -1.8963590860366821, "logps/chosen": -547.48486328125, "logps/rejected": -585.6912231445312, "loss": 0.5278, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.2814033031463623, "rewards/margins": 0.8729672431945801, "rewards/rejected": -3.1543705463409424, "step": 990 }, { "epoch": 0.7850834151128557, "grad_norm": 60.18208312988281, "learning_rate": 6.691486767176092e-07, "logits/chosen": -1.7295516729354858, "logits/rejected": -1.773970365524292, "logps/chosen": -467.82049560546875, "logps/rejected": -562.5482177734375, "loss": 0.5008, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.1015613079071045, "rewards/margins": 0.9576795697212219, "rewards/rejected": -3.0592408180236816, "step": 1000 }, { "epoch": 0.7850834151128557, "eval_logits/chosen": -1.991379737854004, "eval_logits/rejected": -1.937352180480957, "eval_logps/chosen": -498.16119384765625, "eval_logps/rejected": -545.6578979492188, "eval_loss": 0.5102471709251404, "eval_rewards/accuracies": 0.7008928656578064, "eval_rewards/chosen": -2.0977351665496826, "eval_rewards/margins": 0.8226803541183472, "eval_rewards/rejected": -2.9204154014587402, "eval_runtime": 248.8844, "eval_samples_per_second": 8.036, "eval_steps_per_second": 0.338, "step": 1000 }, { "epoch": 0.7929342492639843, "grad_norm": 27.0752010345459, "learning_rate": 6.231355262852529e-07, "logits/chosen": -1.8228180408477783, "logits/rejected": -1.728371024131775, "logps/chosen": -497.906982421875, "logps/rejected": -574.6722412109375, "loss": 0.5178, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.1114232540130615, "rewards/margins": 1.0526831150054932, "rewards/rejected": -3.1641063690185547, "step": 1010 }, { "epoch": 0.8007850834151129, "grad_norm": 40.453643798828125, "learning_rate": 5.785352205971275e-07, "logits/chosen": -1.8827228546142578, "logits/rejected": -1.8348219394683838, "logps/chosen": -479.0231018066406, "logps/rejected": -544.406982421875, "loss": 0.4717, "rewards/accuracies": 0.71875, "rewards/chosen": -1.9858747720718384, "rewards/margins": 0.8169358372688293, "rewards/rejected": -2.8028104305267334, "step": 1020 }, { "epoch": 0.8086359175662414, "grad_norm": 26.58576774597168, "learning_rate": 5.353813333653287e-07, "logits/chosen": -1.9306774139404297, "logits/rejected": -1.9138189554214478, "logps/chosen": -529.3744506835938, "logps/rejected": -577.8673095703125, "loss": 0.5073, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.1622471809387207, "rewards/margins": 0.9140118360519409, "rewards/rejected": -3.076258659362793, "step": 1030 }, { "epoch": 0.81648675171737, "grad_norm": 23.61007308959961, "learning_rate": 4.937063494834774e-07, "logits/chosen": -1.814344048500061, "logits/rejected": -1.6967451572418213, "logps/chosen": -507.7666015625, "logps/rejected": -598.0667724609375, "loss": 0.5215, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.145608425140381, "rewards/margins": 0.9707077741622925, "rewards/rejected": -3.116316080093384, "step": 1040 }, { "epoch": 0.8243375858684985, "grad_norm": 28.008739471435547, "learning_rate": 4.5354164057310857e-07, "logits/chosen": -1.8821042776107788, "logits/rejected": -1.7559188604354858, "logps/chosen": -465.9667053222656, "logps/rejected": -576.3198852539062, "loss": 0.5257, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.1245594024658203, "rewards/margins": 1.131911039352417, "rewards/rejected": -3.256470203399658, "step": 1050 }, { "epoch": 0.8321884200196271, "grad_norm": 23.431196212768555, "learning_rate": 4.1491744136810066e-07, "logits/chosen": -1.8241643905639648, "logits/rejected": -1.5898910760879517, "logps/chosen": -494.38006591796875, "logps/rejected": -608.50048828125, "loss": 0.5239, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.2372994422912598, "rewards/margins": 1.0501940250396729, "rewards/rejected": -3.2874934673309326, "step": 1060 }, { "epoch": 0.8400392541707556, "grad_norm": 36.24497604370117, "learning_rate": 3.7786282695491313e-07, "logits/chosen": -1.7533372640609741, "logits/rejected": -1.780310034751892, "logps/chosen": -521.2637939453125, "logps/rejected": -594.5169067382812, "loss": 0.5173, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.1260104179382324, "rewards/margins": 1.0049241781234741, "rewards/rejected": -3.130934476852417, "step": 1070 }, { "epoch": 0.8478900883218842, "grad_norm": 28.115896224975586, "learning_rate": 3.4240569088577564e-07, "logits/chosen": -1.9627529382705688, "logits/rejected": -1.9232120513916016, "logps/chosen": -521.6199951171875, "logps/rejected": -590.34619140625, "loss": 0.5159, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.1085124015808105, "rewards/margins": 1.0188482999801636, "rewards/rejected": -3.1273605823516846, "step": 1080 }, { "epoch": 0.8557409224730128, "grad_norm": 25.046926498413086, "learning_rate": 3.0857272418129136e-07, "logits/chosen": -1.8483200073242188, "logits/rejected": -1.8257999420166016, "logps/chosen": -538.3873901367188, "logps/rejected": -620.02978515625, "loss": 0.5008, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.2884747982025146, "rewards/margins": 1.0311329364776611, "rewards/rejected": -3.3196074962615967, "step": 1090 }, { "epoch": 0.8635917566241413, "grad_norm": 25.578903198242188, "learning_rate": 2.7638939523827956e-07, "logits/chosen": -1.771712064743042, "logits/rejected": -1.6592738628387451, "logps/chosen": -536.8753662109375, "logps/rejected": -635.494384765625, "loss": 0.5223, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.2466344833374023, "rewards/margins": 1.0864031314849854, "rewards/rejected": -3.3330376148223877, "step": 1100 }, { "epoch": 0.8635917566241413, "eval_logits/chosen": -1.9598368406295776, "eval_logits/rejected": -1.8985047340393066, "eval_logps/chosen": -510.208984375, "eval_logps/rejected": -564.5363159179688, "eval_loss": 0.5109513401985168, "eval_rewards/accuracies": 0.6934523582458496, "eval_rewards/chosen": -2.218212604522705, "eval_rewards/margins": 0.8909867405891418, "eval_rewards/rejected": -3.1092000007629395, "eval_runtime": 178.9794, "eval_samples_per_second": 11.174, "eval_steps_per_second": 0.469, "step": 1100 }, { "epoch": 0.8714425907752699, "grad_norm": 24.112642288208008, "learning_rate": 2.4587993065795983e-07, "logits/chosen": -1.8837692737579346, "logits/rejected": -1.7314777374267578, "logps/chosen": -480.4740295410156, "logps/rejected": -563.0213623046875, "loss": 0.5227, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.0834219455718994, "rewards/margins": 1.149505853652954, "rewards/rejected": -3.2329280376434326, "step": 1110 }, { "epoch": 0.8792934249263984, "grad_norm": 24.728294372558594, "learning_rate": 2.170672970089291e-07, "logits/chosen": -1.8168354034423828, "logits/rejected": -1.7316901683807373, "logps/chosen": -536.4750366210938, "logps/rejected": -631.4368896484375, "loss": 0.4847, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.207212448120117, "rewards/margins": 1.1604888439178467, "rewards/rejected": -3.367701768875122, "step": 1120 }, { "epoch": 0.887144259077527, "grad_norm": 34.55753707885742, "learning_rate": 1.8997318353864673e-07, "logits/chosen": -1.887563943862915, "logits/rejected": -1.5958278179168701, "logps/chosen": -506.45294189453125, "logps/rejected": -567.0094604492188, "loss": 0.5052, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.201402187347412, "rewards/margins": 1.0240195989608765, "rewards/rejected": -3.225421905517578, "step": 1130 }, { "epoch": 0.8949950932286556, "grad_norm": 37.284019470214844, "learning_rate": 1.6461798584644944e-07, "logits/chosen": -1.940473198890686, "logits/rejected": -1.8656337261199951, "logps/chosen": -518.56494140625, "logps/rejected": -582.9520874023438, "loss": 0.4778, "rewards/accuracies": 0.8125, "rewards/chosen": -2.122587203979492, "rewards/margins": 1.1023415327072144, "rewards/rejected": -3.224928617477417, "step": 1140 }, { "epoch": 0.9028459273797841, "grad_norm": 48.473114013671875, "learning_rate": 1.4102079053038454e-07, "logits/chosen": -1.9566850662231445, "logits/rejected": -1.7725406885147095, "logps/chosen": -515.0001220703125, "logps/rejected": -587.2335205078125, "loss": 0.4947, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.1063010692596436, "rewards/margins": 1.121829628944397, "rewards/rejected": -3.22813081741333, "step": 1150 }, { "epoch": 0.9106967615309126, "grad_norm": 23.98328399658203, "learning_rate": 1.1919936081941585e-07, "logits/chosen": -1.9583518505096436, "logits/rejected": -1.8895307779312134, "logps/chosen": -528.2996215820312, "logps/rejected": -599.0931396484375, "loss": 0.5063, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.383150577545166, "rewards/margins": 0.8790243268013, "rewards/rejected": -3.2621750831604004, "step": 1160 }, { "epoch": 0.9185475956820413, "grad_norm": 30.729877471923828, "learning_rate": 9.917012320182245e-08, "logits/chosen": -1.8442468643188477, "logits/rejected": -1.7293345928192139, "logps/chosen": -530.6605224609375, "logps/rejected": -573.9486083984375, "loss": 0.5107, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.3617968559265137, "rewards/margins": 0.8581873774528503, "rewards/rejected": -3.2199840545654297, "step": 1170 }, { "epoch": 0.9263984298331698, "grad_norm": 29.362680435180664, "learning_rate": 8.094815505985315e-08, "logits/chosen": -1.898097276687622, "logits/rejected": -1.7420837879180908, "logps/chosen": -498.27874755859375, "logps/rejected": -638.7017211914062, "loss": 0.4825, "rewards/accuracies": 0.75, "rewards/chosen": -2.2651729583740234, "rewards/margins": 1.1845793724060059, "rewards/rejected": -3.44975209236145, "step": 1180 }, { "epoch": 0.9342492639842983, "grad_norm": 36.354610443115234, "learning_rate": 6.454717331994542e-08, "logits/chosen": -1.9377390146255493, "logits/rejected": -1.8412069082260132, "logps/chosen": -528.7586059570312, "logps/rejected": -617.6361083984375, "loss": 0.5211, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.173046350479126, "rewards/margins": 1.1210204362869263, "rewards/rejected": -3.2940666675567627, "step": 1190 }, { "epoch": 0.9421000981354269, "grad_norm": 37.27730178833008, "learning_rate": 4.9979524127052595e-08, "logits/chosen": -1.7879035472869873, "logits/rejected": -1.8019065856933594, "logps/chosen": -485.499755859375, "logps/rejected": -587.9569091796875, "loss": 0.4981, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.1999027729034424, "rewards/margins": 1.0023242235183716, "rewards/rejected": -3.2022266387939453, "step": 1200 }, { "epoch": 0.9421000981354269, "eval_logits/chosen": -1.9679957628250122, "eval_logits/rejected": -1.9060754776000977, "eval_logps/chosen": -509.53515625, "eval_logps/rejected": -565.4013061523438, "eval_loss": 0.5110836029052734, "eval_rewards/accuracies": 0.699404776096344, "eval_rewards/chosen": -2.211474895477295, "eval_rewards/margins": 0.9063741564750671, "eval_rewards/rejected": -3.117849349975586, "eval_runtime": 303.5083, "eval_samples_per_second": 6.59, "eval_steps_per_second": 0.277, "step": 1200 }, { "epoch": 0.9499509322865555, "grad_norm": 30.429931640625, "learning_rate": 3.725617355085476e-08, "logits/chosen": -1.7728469371795654, "logits/rejected": -1.6203314065933228, "logps/chosen": -476.9127502441406, "logps/rejected": -577.5582275390625, "loss": 0.507, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.16386079788208, "rewards/margins": 1.1731908321380615, "rewards/rejected": -3.3370513916015625, "step": 1210 }, { "epoch": 0.957801766437684, "grad_norm": 42.811119079589844, "learning_rate": 2.63866993308437e-08, "logits/chosen": -1.765027642250061, "logits/rejected": -1.6837198734283447, "logps/chosen": -484.285400390625, "logps/rejected": -537.3614501953125, "loss": 0.5262, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.178112506866455, "rewards/margins": 0.8579233884811401, "rewards/rejected": -3.0360360145568848, "step": 1220 }, { "epoch": 0.9656526005888125, "grad_norm": 28.079404830932617, "learning_rate": 1.737928366650099e-08, "logits/chosen": -1.9261119365692139, "logits/rejected": -1.853053092956543, "logps/chosen": -547.2498779296875, "logps/rejected": -600.8333129882812, "loss": 0.5182, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.244377851486206, "rewards/margins": 1.105455756187439, "rewards/rejected": -3.3498339653015137, "step": 1230 }, { "epoch": 0.9735034347399412, "grad_norm": 29.11058807373047, "learning_rate": 1.0240707057995735e-08, "logits/chosen": -1.7693697214126587, "logits/rejected": -1.5242459774017334, "logps/chosen": -488.11724853515625, "logps/rejected": -578.2257690429688, "loss": 0.4903, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.2136459350585938, "rewards/margins": 0.9566876292228699, "rewards/rejected": -3.1703333854675293, "step": 1240 }, { "epoch": 0.9813542688910697, "grad_norm": 24.037424087524414, "learning_rate": 4.976343202034717e-09, "logits/chosen": -1.754732370376587, "logits/rejected": -1.6457884311676025, "logps/chosen": -478.7969665527344, "logps/rejected": -566.3361206054688, "loss": 0.4716, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.199491024017334, "rewards/margins": 0.9993401765823364, "rewards/rejected": -3.198831081390381, "step": 1250 }, { "epoch": 0.9892051030421982, "grad_norm": 33.65019607543945, "learning_rate": 1.5901549467139953e-09, "logits/chosen": -1.9445594549179077, "logits/rejected": -1.8698110580444336, "logps/chosen": -522.216552734375, "logps/rejected": -589.295654296875, "loss": 0.5043, "rewards/accuracies": 0.71875, "rewards/chosen": -2.2183516025543213, "rewards/margins": 0.9545730352401733, "rewards/rejected": -3.172924757003784, "step": 1260 }, { "epoch": 0.9970559371933267, "grad_norm": 39.74230194091797, "learning_rate": 8.469130840960127e-11, "logits/chosen": -1.7422492504119873, "logits/rejected": -1.6215105056762695, "logps/chosen": -489.52642822265625, "logps/rejected": -590.7807006835938, "loss": 0.5169, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.154064655303955, "rewards/margins": 1.0220654010772705, "rewards/rejected": -3.176130533218384, "step": 1270 }, { "epoch": 0.9994111874386653, "step": 1273, "total_flos": 0.0, "train_loss": 0.0, "train_runtime": 0.0132, "train_samples_per_second": 4647380.664, "train_steps_per_second": 96772.918 } ], "logging_steps": 10, "max_steps": 1273, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }