zephyr-7b-dpo-qlora / trainer_state.json
awsuineg's picture
Model save
ba206a2 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9994111874386653,
"eval_steps": 100,
"global_step": 1273,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0007850834151128558,
"grad_norm": 8.785704612731934,
"learning_rate": 3.90625e-08,
"logits/chosen": -2.957148313522339,
"logits/rejected": -2.900550365447998,
"logps/chosen": -466.9051818847656,
"logps/rejected": -502.35345458984375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.007850834151128557,
"grad_norm": 6.684790134429932,
"learning_rate": 3.90625e-07,
"logits/chosen": -3.0609865188598633,
"logits/rejected": -2.9977359771728516,
"logps/chosen": -308.3629455566406,
"logps/rejected": -261.8404235839844,
"loss": 0.6931,
"rewards/accuracies": 0.3888888955116272,
"rewards/chosen": 0.0009382184944115579,
"rewards/margins": 5.389652869780548e-05,
"rewards/rejected": 0.0008843218092806637,
"step": 10
},
{
"epoch": 0.015701668302257114,
"grad_norm": 6.8678717613220215,
"learning_rate": 7.8125e-07,
"logits/chosen": -3.1096813678741455,
"logits/rejected": -3.0798025131225586,
"logps/chosen": -292.4770202636719,
"logps/rejected": -254.6656494140625,
"loss": 0.6924,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.007172191981226206,
"rewards/margins": 0.001467574737034738,
"rewards/rejected": 0.005704617593437433,
"step": 20
},
{
"epoch": 0.023552502453385672,
"grad_norm": 6.647519588470459,
"learning_rate": 1.1718750000000001e-06,
"logits/chosen": -3.1090664863586426,
"logits/rejected": -3.084791660308838,
"logps/chosen": -265.77301025390625,
"logps/rejected": -266.1317138671875,
"loss": 0.691,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": 0.01251781266182661,
"rewards/margins": 0.0033119157887995243,
"rewards/rejected": 0.009205898270010948,
"step": 30
},
{
"epoch": 0.03140333660451423,
"grad_norm": 6.744068145751953,
"learning_rate": 1.5625e-06,
"logits/chosen": -3.081329107284546,
"logits/rejected": -3.1170654296875,
"logps/chosen": -297.75823974609375,
"logps/rejected": -270.17462158203125,
"loss": 0.6871,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.011317016556859016,
"rewards/margins": 0.013026026077568531,
"rewards/rejected": -0.00170900858938694,
"step": 40
},
{
"epoch": 0.03925417075564279,
"grad_norm": 7.407871246337891,
"learning_rate": 1.953125e-06,
"logits/chosen": -3.1149449348449707,
"logits/rejected": -3.066861629486084,
"logps/chosen": -306.6522216796875,
"logps/rejected": -255.38491821289062,
"loss": 0.6829,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.015373636968433857,
"rewards/margins": 0.022752460092306137,
"rewards/rejected": -0.00737882312387228,
"step": 50
},
{
"epoch": 0.047105004906771344,
"grad_norm": 7.203430652618408,
"learning_rate": 2.3437500000000002e-06,
"logits/chosen": -3.008836269378662,
"logits/rejected": -3.026230573654175,
"logps/chosen": -269.2247009277344,
"logps/rejected": -276.4228820800781,
"loss": 0.6811,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": 0.006092413794249296,
"rewards/margins": 0.021052923053503036,
"rewards/rejected": -0.014960509724915028,
"step": 60
},
{
"epoch": 0.0549558390578999,
"grad_norm": 6.94841194152832,
"learning_rate": 2.7343750000000004e-06,
"logits/chosen": -3.0492148399353027,
"logits/rejected": -3.0904951095581055,
"logps/chosen": -267.11553955078125,
"logps/rejected": -257.773681640625,
"loss": 0.668,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.009770817123353481,
"rewards/margins": 0.04689077287912369,
"rewards/rejected": -0.056661587208509445,
"step": 70
},
{
"epoch": 0.06280667320902845,
"grad_norm": 9.025800704956055,
"learning_rate": 3.125e-06,
"logits/chosen": -3.1011645793914795,
"logits/rejected": -3.1380457878112793,
"logps/chosen": -308.1461486816406,
"logps/rejected": -312.40869140625,
"loss": 0.6652,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.09069164097309113,
"rewards/margins": 0.052069295197725296,
"rewards/rejected": -0.14276091754436493,
"step": 80
},
{
"epoch": 0.07065750736015702,
"grad_norm": 10.733589172363281,
"learning_rate": 3.5156250000000003e-06,
"logits/chosen": -3.0480704307556152,
"logits/rejected": -3.068376064300537,
"logps/chosen": -309.16766357421875,
"logps/rejected": -306.99627685546875,
"loss": 0.6473,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.12756133079528809,
"rewards/margins": 0.07411627471446991,
"rewards/rejected": -0.20167763531208038,
"step": 90
},
{
"epoch": 0.07850834151128558,
"grad_norm": 13.016451835632324,
"learning_rate": 3.90625e-06,
"logits/chosen": -2.9545440673828125,
"logits/rejected": -2.923600435256958,
"logps/chosen": -302.78509521484375,
"logps/rejected": -304.83795166015625,
"loss": 0.6438,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.2593812942504883,
"rewards/margins": 0.14604052901268005,
"rewards/rejected": -0.40542179346084595,
"step": 100
},
{
"epoch": 0.07850834151128558,
"eval_logits/chosen": -3.014045238494873,
"eval_logits/rejected": -3.0357654094696045,
"eval_logps/chosen": -329.1207580566406,
"eval_logps/rejected": -306.6942443847656,
"eval_loss": 0.6424023509025574,
"eval_rewards/accuracies": 0.6547619104385376,
"eval_rewards/chosen": -0.40733060240745544,
"eval_rewards/margins": 0.12344833463430405,
"eval_rewards/rejected": -0.5307790040969849,
"eval_runtime": 174.6916,
"eval_samples_per_second": 11.449,
"eval_steps_per_second": 0.481,
"step": 100
},
{
"epoch": 0.08635917566241413,
"grad_norm": 14.229337692260742,
"learning_rate": 4.296875e-06,
"logits/chosen": -2.9414217472076416,
"logits/rejected": -2.9711925983428955,
"logps/chosen": -339.30914306640625,
"logps/rejected": -324.44390869140625,
"loss": 0.6339,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.3848220109939575,
"rewards/margins": 0.16449818015098572,
"rewards/rejected": -0.5493202209472656,
"step": 110
},
{
"epoch": 0.09421000981354269,
"grad_norm": 20.014785766601562,
"learning_rate": 4.6875000000000004e-06,
"logits/chosen": -2.9944517612457275,
"logits/rejected": -3.045173168182373,
"logps/chosen": -391.199462890625,
"logps/rejected": -376.3497009277344,
"loss": 0.6211,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.5974650979042053,
"rewards/margins": 0.2744296193122864,
"rewards/rejected": -0.8718946576118469,
"step": 120
},
{
"epoch": 0.10206084396467124,
"grad_norm": 12.38216781616211,
"learning_rate": 4.999962359300416e-06,
"logits/chosen": -2.9552016258239746,
"logits/rejected": -2.9254870414733887,
"logps/chosen": -402.8971252441406,
"logps/rejected": -404.6396789550781,
"loss": 0.6189,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.8715218305587769,
"rewards/margins": 0.27080851793289185,
"rewards/rejected": -1.1423304080963135,
"step": 130
},
{
"epoch": 0.1099116781157998,
"grad_norm": 16.306636810302734,
"learning_rate": 4.998645053824218e-06,
"logits/chosen": -2.803802967071533,
"logits/rejected": -2.8079888820648193,
"logps/chosen": -379.4205017089844,
"logps/rejected": -362.0006103515625,
"loss": 0.6495,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -1.0367389917373657,
"rewards/margins": 0.17243310809135437,
"rewards/rejected": -1.2091721296310425,
"step": 140
},
{
"epoch": 0.11776251226692837,
"grad_norm": 16.321983337402344,
"learning_rate": 4.9954468466732145e-06,
"logits/chosen": -2.8862144947052,
"logits/rejected": -2.9243063926696777,
"logps/chosen": -417.48272705078125,
"logps/rejected": -420.72381591796875,
"loss": 0.6151,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.8957064747810364,
"rewards/margins": 0.33516108989715576,
"rewards/rejected": -1.2308675050735474,
"step": 150
},
{
"epoch": 0.1256133464180569,
"grad_norm": 21.833602905273438,
"learning_rate": 4.990370145357496e-06,
"logits/chosen": -2.880340099334717,
"logits/rejected": -2.8787879943847656,
"logps/chosen": -374.5000305175781,
"logps/rejected": -366.66619873046875,
"loss": 0.6028,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.9459589719772339,
"rewards/margins": 0.245649054646492,
"rewards/rejected": -1.1916080713272095,
"step": 160
},
{
"epoch": 0.13346418056918546,
"grad_norm": 22.123382568359375,
"learning_rate": 4.983418771458684e-06,
"logits/chosen": -2.9111855030059814,
"logits/rejected": -2.8439784049987793,
"logps/chosen": -384.2833251953125,
"logps/rejected": -396.34326171875,
"loss": 0.598,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.9746086001396179,
"rewards/margins": 0.32514628767967224,
"rewards/rejected": -1.2997548580169678,
"step": 170
},
{
"epoch": 0.14131501472031405,
"grad_norm": 18.860288619995117,
"learning_rate": 4.97459795775315e-06,
"logits/chosen": -2.846890449523926,
"logits/rejected": -2.8465495109558105,
"logps/chosen": -371.8786315917969,
"logps/rejected": -400.20501708984375,
"loss": 0.5886,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.8063033819198608,
"rewards/margins": 0.39446142315864563,
"rewards/rejected": -1.200764775276184,
"step": 180
},
{
"epoch": 0.1491658488714426,
"grad_norm": 17.285179138183594,
"learning_rate": 4.963914344272961e-06,
"logits/chosen": -2.9533636569976807,
"logits/rejected": -2.9740447998046875,
"logps/chosen": -379.0119934082031,
"logps/rejected": -406.78936767578125,
"loss": 0.6036,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.7768992781639099,
"rewards/margins": 0.33595213294029236,
"rewards/rejected": -1.1128513813018799,
"step": 190
},
{
"epoch": 0.15701668302257116,
"grad_norm": 23.66827964782715,
"learning_rate": 4.951375973307458e-06,
"logits/chosen": -2.9775123596191406,
"logits/rejected": -2.977674961090088,
"logps/chosen": -389.47088623046875,
"logps/rejected": -386.0644226074219,
"loss": 0.5977,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.8474963903427124,
"rewards/margins": 0.37333354353904724,
"rewards/rejected": -1.220829963684082,
"step": 200
},
{
"epoch": 0.15701668302257116,
"eval_logits/chosen": -3.014695405960083,
"eval_logits/rejected": -3.025944232940674,
"eval_logps/chosen": -394.195068359375,
"eval_logps/rejected": -395.111328125,
"eval_loss": 0.5976593494415283,
"eval_rewards/accuracies": 0.6666666865348816,
"eval_rewards/chosen": -1.0580739974975586,
"eval_rewards/margins": 0.35687559843063354,
"eval_rewards/rejected": -1.414949655532837,
"eval_runtime": 171.2039,
"eval_samples_per_second": 11.682,
"eval_steps_per_second": 0.491,
"step": 200
},
{
"epoch": 0.1648675171736997,
"grad_norm": 21.703943252563477,
"learning_rate": 4.93699228334928e-06,
"logits/chosen": -3.014017343521118,
"logits/rejected": -2.9310977458953857,
"logps/chosen": -398.8971862792969,
"logps/rejected": -423.201416015625,
"loss": 0.5652,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.1008360385894775,
"rewards/margins": 0.518481433391571,
"rewards/rejected": -1.6193174123764038,
"step": 210
},
{
"epoch": 0.17271835132482827,
"grad_norm": 30.472820281982422,
"learning_rate": 4.920774101989362e-06,
"logits/chosen": -2.922285318374634,
"logits/rejected": -2.8559417724609375,
"logps/chosen": -394.4663391113281,
"logps/rejected": -436.795166015625,
"loss": 0.5943,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.094743013381958,
"rewards/margins": 0.4840098023414612,
"rewards/rejected": -1.578752875328064,
"step": 220
},
{
"epoch": 0.18056918547595682,
"grad_norm": 19.1884708404541,
"learning_rate": 4.902733637766261e-06,
"logits/chosen": -2.8735547065734863,
"logits/rejected": -2.8807244300842285,
"logps/chosen": -358.8312072753906,
"logps/rejected": -393.9620056152344,
"loss": 0.5503,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.8047897219657898,
"rewards/margins": 0.49235886335372925,
"rewards/rejected": -1.297148585319519,
"step": 230
},
{
"epoch": 0.18842001962708538,
"grad_norm": 25.822147369384766,
"learning_rate": 4.882884470975954e-06,
"logits/chosen": -2.733098030090332,
"logits/rejected": -2.768909454345703,
"logps/chosen": -396.49188232421875,
"logps/rejected": -438.16455078125,
"loss": 0.5768,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.028875708580017,
"rewards/margins": 0.5807405710220337,
"rewards/rejected": -1.6096162796020508,
"step": 240
},
{
"epoch": 0.19627085377821393,
"grad_norm": 18.91808319091797,
"learning_rate": 4.861241543449015e-06,
"logits/chosen": -2.723087787628174,
"logits/rejected": -2.6532533168792725,
"logps/chosen": -397.3339538574219,
"logps/rejected": -422.185302734375,
"loss": 0.5773,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.094036340713501,
"rewards/margins": 0.49098238348960876,
"rewards/rejected": -1.5850186347961426,
"step": 250
},
{
"epoch": 0.2041216879293425,
"grad_norm": 32.23611068725586,
"learning_rate": 4.8378211473028755e-06,
"logits/chosen": -2.828057289123535,
"logits/rejected": -2.838313579559326,
"logps/chosen": -390.9112243652344,
"logps/rejected": -397.8682556152344,
"loss": 0.573,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.7329429984092712,
"rewards/margins": 0.33645009994506836,
"rewards/rejected": -1.0693930387496948,
"step": 260
},
{
"epoch": 0.21197252208047104,
"grad_norm": 25.560338973999023,
"learning_rate": 4.812640912677624e-06,
"logits/chosen": -2.9140567779541016,
"logits/rejected": -2.930488109588623,
"logps/chosen": -346.61273193359375,
"logps/rejected": -370.59771728515625,
"loss": 0.5867,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.6457637548446655,
"rewards/margins": 0.3336387276649475,
"rewards/rejected": -0.9794024229049683,
"step": 270
},
{
"epoch": 0.2198233562315996,
"grad_norm": 21.532350540161133,
"learning_rate": 4.785719794464596e-06,
"logits/chosen": -2.7777903079986572,
"logits/rejected": -2.7826027870178223,
"logps/chosen": -360.8690490722656,
"logps/rejected": -386.78753662109375,
"loss": 0.5804,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.9706557393074036,
"rewards/margins": 0.3798271715641022,
"rewards/rejected": -1.3504829406738281,
"step": 280
},
{
"epoch": 0.22767419038272815,
"grad_norm": 25.418230056762695,
"learning_rate": 4.757078058037722e-06,
"logits/chosen": -2.886289119720459,
"logits/rejected": -2.813042402267456,
"logps/chosen": -386.9649353027344,
"logps/rejected": -436.1798400878906,
"loss": 0.5789,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -1.1510156393051147,
"rewards/margins": 0.4267166256904602,
"rewards/rejected": -1.5777322053909302,
"step": 290
},
{
"epoch": 0.23552502453385674,
"grad_norm": 26.089282989501953,
"learning_rate": 4.72673726399839e-06,
"logits/chosen": -2.6797690391540527,
"logits/rejected": -2.7410836219787598,
"logps/chosen": -347.84405517578125,
"logps/rejected": -429.4021911621094,
"loss": 0.5583,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.0482864379882812,
"rewards/margins": 0.5523291826248169,
"rewards/rejected": -1.6006155014038086,
"step": 300
},
{
"epoch": 0.23552502453385674,
"eval_logits/chosen": -2.723576307296753,
"eval_logits/rejected": -2.7299251556396484,
"eval_logps/chosen": -384.55767822265625,
"eval_logps/rejected": -390.4145812988281,
"eval_loss": 0.5714064836502075,
"eval_rewards/accuracies": 0.6711309552192688,
"eval_rewards/chosen": -0.9617000818252563,
"eval_rewards/margins": 0.40628206729888916,
"eval_rewards/rejected": -1.3679821491241455,
"eval_runtime": 171.2042,
"eval_samples_per_second": 11.682,
"eval_steps_per_second": 0.491,
"step": 300
},
{
"epoch": 0.2433758586849853,
"grad_norm": 18.173839569091797,
"learning_rate": 4.694720251945298e-06,
"logits/chosen": -2.7685041427612305,
"logits/rejected": -2.686394691467285,
"logps/chosen": -381.9715881347656,
"logps/rejected": -402.4884338378906,
"loss": 0.5718,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.9211880564689636,
"rewards/margins": 0.3484678566455841,
"rewards/rejected": -1.2696558237075806,
"step": 310
},
{
"epoch": 0.2512266928361138,
"grad_norm": 20.20842933654785,
"learning_rate": 4.661051123281528e-06,
"logits/chosen": -2.535449504852295,
"logits/rejected": -2.4344544410705566,
"logps/chosen": -394.1264953613281,
"logps/rejected": -438.5677185058594,
"loss": 0.542,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.9613161087036133,
"rewards/margins": 0.5449072122573853,
"rewards/rejected": -1.5062233209609985,
"step": 320
},
{
"epoch": 0.2590775269872424,
"grad_norm": 19.297094345092773,
"learning_rate": 4.6257552230717536e-06,
"logits/chosen": -2.40204119682312,
"logits/rejected": -2.392609119415283,
"logps/chosen": -459.760498046875,
"logps/rejected": -468.84698486328125,
"loss": 0.5285,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.178815245628357,
"rewards/margins": 0.6503817439079285,
"rewards/rejected": -1.8291969299316406,
"step": 330
},
{
"epoch": 0.26692836113837093,
"grad_norm": 36.15755081176758,
"learning_rate": 4.588859120963282e-06,
"logits/chosen": -2.267246723175049,
"logits/rejected": -2.1048290729522705,
"logps/chosen": -382.4331359863281,
"logps/rejected": -419.305908203125,
"loss": 0.5572,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.2648835182189941,
"rewards/margins": 0.6192021369934082,
"rewards/rejected": -1.8840856552124023,
"step": 340
},
{
"epoch": 0.2747791952894995,
"grad_norm": 15.106271743774414,
"learning_rate": 4.5503905911852435e-06,
"logits/chosen": -2.3543121814727783,
"logits/rejected": -2.33532452583313,
"logps/chosen": -404.1642761230469,
"logps/rejected": -431.0093688964844,
"loss": 0.5287,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.0297491550445557,
"rewards/margins": 0.6717931032180786,
"rewards/rejected": -1.7015421390533447,
"step": 350
},
{
"epoch": 0.2826300294406281,
"grad_norm": 20.0123348236084,
"learning_rate": 4.510378591641036e-06,
"logits/chosen": -2.2860474586486816,
"logits/rejected": -2.3591558933258057,
"logps/chosen": -394.79827880859375,
"logps/rejected": -417.1219177246094,
"loss": 0.5561,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.9084060788154602,
"rewards/margins": 0.6691475510597229,
"rewards/rejected": -1.5775535106658936,
"step": 360
},
{
"epoch": 0.2904808635917566,
"grad_norm": 19.609752655029297,
"learning_rate": 4.468853242109712e-06,
"logits/chosen": -2.3907597064971924,
"logits/rejected": -2.378951072692871,
"logps/chosen": -362.96331787109375,
"logps/rejected": -399.92401123046875,
"loss": 0.5624,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.9224265217781067,
"rewards/margins": 0.5320521593093872,
"rewards/rejected": -1.4544788599014282,
"step": 370
},
{
"epoch": 0.2983316977428852,
"grad_norm": 29.80910873413086,
"learning_rate": 4.42584580157276e-06,
"logits/chosen": -2.2916672229766846,
"logits/rejected": -2.059715986251831,
"logps/chosen": -365.691162109375,
"logps/rejected": -418.39642333984375,
"loss": 0.5196,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.0353277921676636,
"rewards/margins": 0.6206272840499878,
"rewards/rejected": -1.6559550762176514,
"step": 380
},
{
"epoch": 0.30618253189401373,
"grad_norm": 28.393800735473633,
"learning_rate": 4.381388644683317e-06,
"logits/chosen": -2.1753897666931152,
"logits/rejected": -2.1332502365112305,
"logps/chosen": -400.159423828125,
"logps/rejected": -432.15777587890625,
"loss": 0.5341,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -1.2140204906463623,
"rewards/margins": 0.7265356779098511,
"rewards/rejected": -1.9405561685562134,
"step": 390
},
{
"epoch": 0.3140333660451423,
"grad_norm": 32.2076301574707,
"learning_rate": 4.33551523739555e-06,
"logits/chosen": -2.03031849861145,
"logits/rejected": -2.0334537029266357,
"logps/chosen": -369.33056640625,
"logps/rejected": -458.2646484375,
"loss": 0.5727,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.2683178186416626,
"rewards/margins": 0.6755903363227844,
"rewards/rejected": -1.9439083337783813,
"step": 400
},
{
"epoch": 0.3140333660451423,
"eval_logits/chosen": -2.2418928146362305,
"eval_logits/rejected": -2.2240025997161865,
"eval_logps/chosen": -389.8349609375,
"eval_logps/rejected": -412.5816955566406,
"eval_loss": 0.5430436730384827,
"eval_rewards/accuracies": 0.6875,
"eval_rewards/chosen": -1.0144727230072021,
"eval_rewards/margins": 0.5751808285713196,
"eval_rewards/rejected": -1.5896533727645874,
"eval_runtime": 172.1182,
"eval_samples_per_second": 11.62,
"eval_steps_per_second": 0.488,
"step": 400
},
{
"epoch": 0.32188420019627084,
"grad_norm": 22.76254653930664,
"learning_rate": 4.288260111772535e-06,
"logits/chosen": -2.275933027267456,
"logits/rejected": -2.148829221725464,
"logps/chosen": -390.9195861816406,
"logps/rejected": -428.26226806640625,
"loss": 0.5302,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.9567239880561829,
"rewards/margins": 0.5892980694770813,
"rewards/rejected": -1.5460221767425537,
"step": 410
},
{
"epoch": 0.3297350343473994,
"grad_norm": 23.928640365600586,
"learning_rate": 4.239658839991594e-06,
"logits/chosen": -2.107412099838257,
"logits/rejected": -2.1913232803344727,
"logps/chosen": -405.66265869140625,
"logps/rejected": -424.18377685546875,
"loss": 0.5677,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.139583706855774,
"rewards/margins": 0.5120341777801514,
"rewards/rejected": -1.6516176462173462,
"step": 420
},
{
"epoch": 0.33758586849852795,
"grad_norm": 21.068220138549805,
"learning_rate": 4.189748007566686e-06,
"logits/chosen": -2.05175518989563,
"logits/rejected": -1.9536798000335693,
"logps/chosen": -372.6251525878906,
"logps/rejected": -439.80810546875,
"loss": 0.5315,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.315185785293579,
"rewards/margins": 0.6874850988388062,
"rewards/rejected": -2.0026707649230957,
"step": 430
},
{
"epoch": 0.34543670264965654,
"grad_norm": 30.113636016845703,
"learning_rate": 4.138565185807972e-06,
"logits/chosen": -2.102708339691162,
"logits/rejected": -2.0921308994293213,
"logps/chosen": -431.109375,
"logps/rejected": -471.14532470703125,
"loss": 0.5454,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -1.4245600700378418,
"rewards/margins": 0.6852970719337463,
"rewards/rejected": -2.1098570823669434,
"step": 440
},
{
"epoch": 0.35328753680078506,
"grad_norm": 43.14057540893555,
"learning_rate": 4.086148903539311e-06,
"logits/chosen": -1.9374672174453735,
"logits/rejected": -1.9135332107543945,
"logps/chosen": -499.2344665527344,
"logps/rejected": -536.7950439453125,
"loss": 0.5821,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.975155234336853,
"rewards/margins": 0.659866988658905,
"rewards/rejected": -2.6350224018096924,
"step": 450
},
{
"epoch": 0.36113837095191365,
"grad_norm": 38.40256881713867,
"learning_rate": 4.032538618094972e-06,
"logits/chosen": -2.0139780044555664,
"logits/rejected": -1.9372785091400146,
"logps/chosen": -458.24664306640625,
"logps/rejected": -535.4005126953125,
"loss": 0.4926,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.7541725635528564,
"rewards/margins": 0.8262192010879517,
"rewards/rejected": -2.5803914070129395,
"step": 460
},
{
"epoch": 0.3689892051030422,
"grad_norm": 19.52273941040039,
"learning_rate": 3.977774685617386e-06,
"logits/chosen": -2.1808319091796875,
"logits/rejected": -2.155151844024658,
"logps/chosen": -449.31927490234375,
"logps/rejected": -500.30242919921875,
"loss": 0.4962,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.5000309944152832,
"rewards/margins": 0.7721298336982727,
"rewards/rejected": -2.272160768508911,
"step": 470
},
{
"epoch": 0.37684003925417076,
"grad_norm": 40.90033721923828,
"learning_rate": 3.92189833067831e-06,
"logits/chosen": -1.9630296230316162,
"logits/rejected": -1.909574270248413,
"logps/chosen": -471.18243408203125,
"logps/rejected": -558.3318481445312,
"loss": 0.5098,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.8825572729110718,
"rewards/margins": 0.9354137182235718,
"rewards/rejected": -2.8179707527160645,
"step": 480
},
{
"epoch": 0.38469087340529934,
"grad_norm": 20.045015335083008,
"learning_rate": 3.864951615246261e-06,
"logits/chosen": -1.8974872827529907,
"logits/rejected": -1.8500369787216187,
"logps/chosen": -516.6534423828125,
"logps/rejected": -588.5135498046875,
"loss": 0.5681,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -2.3989312648773193,
"rewards/margins": 0.9191252589225769,
"rewards/rejected": -3.3180572986602783,
"step": 490
},
{
"epoch": 0.39254170755642787,
"grad_norm": 24.87650489807129,
"learning_rate": 3.806977407023581e-06,
"logits/chosen": -2.218294143676758,
"logits/rejected": -2.087562084197998,
"logps/chosen": -463.06121826171875,
"logps/rejected": -502.892333984375,
"loss": 0.5178,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.609442949295044,
"rewards/margins": 0.832965075969696,
"rewards/rejected": -2.4424080848693848,
"step": 500
},
{
"epoch": 0.39254170755642787,
"eval_logits/chosen": -2.3772380352020264,
"eval_logits/rejected": -2.3567545413970947,
"eval_logps/chosen": -410.2373046875,
"eval_logps/rejected": -440.0260925292969,
"eval_loss": 0.5367991328239441,
"eval_rewards/accuracies": 0.6815476417541504,
"eval_rewards/chosen": -1.2184962034225464,
"eval_rewards/margins": 0.6456010937690735,
"eval_rewards/rejected": -1.864097237586975,
"eval_runtime": 164.1,
"eval_samples_per_second": 12.188,
"eval_steps_per_second": 0.512,
"step": 500
},
{
"epoch": 0.40039254170755645,
"grad_norm": 17.09919548034668,
"learning_rate": 3.7480193471769815e-06,
"logits/chosen": -2.3634283542633057,
"logits/rejected": -2.379462242126465,
"logps/chosen": -422.3035583496094,
"logps/rejected": -452.18951416015625,
"loss": 0.559,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -1.2336976528167725,
"rewards/margins": 0.5405682325363159,
"rewards/rejected": -1.7742656469345093,
"step": 510
},
{
"epoch": 0.408243375858685,
"grad_norm": 19.045442581176758,
"learning_rate": 3.6881218174858354e-06,
"logits/chosen": -2.298239231109619,
"logits/rejected": -2.1397132873535156,
"logps/chosen": -401.8265075683594,
"logps/rejected": -456.30535888671875,
"loss": 0.5242,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.240301251411438,
"rewards/margins": 0.7864383459091187,
"rewards/rejected": -2.0267395973205566,
"step": 520
},
{
"epoch": 0.41609421000981356,
"grad_norm": 26.22776985168457,
"learning_rate": 3.627329906932964e-06,
"logits/chosen": -2.407930374145508,
"logits/rejected": -2.3968963623046875,
"logps/chosen": -411.4175720214844,
"logps/rejected": -485.57379150390625,
"loss": 0.5329,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.2162271738052368,
"rewards/margins": 0.7498941421508789,
"rewards/rejected": -1.9661214351654053,
"step": 530
},
{
"epoch": 0.4239450441609421,
"grad_norm": 33.6424674987793,
"learning_rate": 3.5656893777630686e-06,
"logits/chosen": -2.208657741546631,
"logits/rejected": -2.1544740200042725,
"logps/chosen": -431.5694274902344,
"logps/rejected": -502.0116271972656,
"loss": 0.5605,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.4970638751983643,
"rewards/margins": 0.8648282885551453,
"rewards/rejected": -2.361891984939575,
"step": 540
},
{
"epoch": 0.43179587831207067,
"grad_norm": 32.2934684753418,
"learning_rate": 3.503246631034345e-06,
"logits/chosen": -2.119847297668457,
"logits/rejected": -2.133668899536133,
"logps/chosen": -413.0341796875,
"logps/rejected": -459.9708557128906,
"loss": 0.5818,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.7010523080825806,
"rewards/margins": 0.699297308921814,
"rewards/rejected": -2.4003493785858154,
"step": 550
},
{
"epoch": 0.4396467124631992,
"grad_norm": 20.624055862426758,
"learning_rate": 3.440048671689219e-06,
"logits/chosen": -2.2201478481292725,
"logits/rejected": -2.28852915763855,
"logps/chosen": -394.3067321777344,
"logps/rejected": -428.1966857910156,
"loss": 0.5362,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.156294822692871,
"rewards/margins": 0.6579602360725403,
"rewards/rejected": -1.8142551183700562,
"step": 560
},
{
"epoch": 0.4474975466143278,
"grad_norm": 20.51217269897461,
"learning_rate": 3.3761430731705056e-06,
"logits/chosen": -2.342036485671997,
"logits/rejected": -2.3035025596618652,
"logps/chosen": -397.56768798828125,
"logps/rejected": -449.5596618652344,
"loss": 0.526,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.1822260618209839,
"rewards/margins": 0.667534589767456,
"rewards/rejected": -1.84976065158844,
"step": 570
},
{
"epoch": 0.4553483807654563,
"grad_norm": 23.517745971679688,
"learning_rate": 3.311577941609604e-06,
"logits/chosen": -2.2895524501800537,
"logits/rejected": -2.30122447013855,
"logps/chosen": -426.5897521972656,
"logps/rejected": -487.896484375,
"loss": 0.5231,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.1464052200317383,
"rewards/margins": 0.7059827446937561,
"rewards/rejected": -1.8523880243301392,
"step": 580
},
{
"epoch": 0.4631992149165849,
"grad_norm": 28.418771743774414,
"learning_rate": 3.2464018796137157e-06,
"logits/chosen": -2.184406042098999,
"logits/rejected": -2.1148581504821777,
"logps/chosen": -412.546875,
"logps/rejected": -503.6065368652344,
"loss": 0.4968,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.3471360206604004,
"rewards/margins": 0.9848724603652954,
"rewards/rejected": -2.3320083618164062,
"step": 590
},
{
"epoch": 0.47105004906771347,
"grad_norm": 30.563884735107422,
"learning_rate": 3.1806639496793245e-06,
"logits/chosen": -2.0617759227752686,
"logits/rejected": -1.9668960571289062,
"logps/chosen": -447.58984375,
"logps/rejected": -517.9015502929688,
"loss": 0.5238,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.5449590682983398,
"rewards/margins": 0.9296010732650757,
"rewards/rejected": -2.474560260772705,
"step": 600
},
{
"epoch": 0.47105004906771347,
"eval_logits/chosen": -2.120598793029785,
"eval_logits/rejected": -2.074557065963745,
"eval_logps/chosen": -434.86480712890625,
"eval_logps/rejected": -479.731201171875,
"eval_loss": 0.5332732200622559,
"eval_rewards/accuracies": 0.6875,
"eval_rewards/chosen": -1.4647715091705322,
"eval_rewards/margins": 0.7963771820068359,
"eval_rewards/rejected": -2.261148691177368,
"eval_runtime": 168.5149,
"eval_samples_per_second": 11.868,
"eval_steps_per_second": 0.498,
"step": 600
},
{
"epoch": 0.478900883218842,
"grad_norm": 26.9317626953125,
"learning_rate": 3.114413637259484e-06,
"logits/chosen": -2.065842628479004,
"logits/rejected": -1.9007959365844727,
"logps/chosen": -437.0047912597656,
"logps/rejected": -493.7703552246094,
"loss": 0.5562,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.6189963817596436,
"rewards/margins": 0.897415816783905,
"rewards/rejected": -2.5164122581481934,
"step": 610
},
{
"epoch": 0.4867517173699706,
"grad_norm": 30.88678741455078,
"learning_rate": 3.0477008135127247e-06,
"logits/chosen": -2.133183002471924,
"logits/rejected": -2.0338778495788574,
"logps/chosen": -457.9064025878906,
"logps/rejected": -531.5266723632812,
"loss": 0.5087,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.429966688156128,
"rewards/margins": 0.9474767446517944,
"rewards/rejected": -2.377443552017212,
"step": 620
},
{
"epoch": 0.4946025515210991,
"grad_norm": 34.801639556884766,
"learning_rate": 2.980575697761603e-06,
"logits/chosen": -2.0099399089813232,
"logits/rejected": -1.8623266220092773,
"logps/chosen": -441.07757568359375,
"logps/rejected": -508.0874938964844,
"loss": 0.5061,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -1.6641613245010376,
"rewards/margins": 1.0001566410064697,
"rewards/rejected": -2.664318084716797,
"step": 630
},
{
"epoch": 0.5024533856722276,
"grad_norm": 30.205976486206055,
"learning_rate": 2.9130888196891755e-06,
"logits/chosen": -2.0108351707458496,
"logits/rejected": -1.890523910522461,
"logps/chosen": -568.1267700195312,
"logps/rejected": -614.1755981445312,
"loss": 0.5158,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.448976993560791,
"rewards/margins": 0.9926842451095581,
"rewards/rejected": -3.4416611194610596,
"step": 640
},
{
"epoch": 0.5103042198233563,
"grad_norm": 27.39600372314453,
"learning_rate": 2.845290981301834e-06,
"logits/chosen": -1.7695420980453491,
"logits/rejected": -1.7348365783691406,
"logps/chosen": -495.6388244628906,
"logps/rejected": -598.6192016601562,
"loss": 0.5113,
"rewards/accuracies": 0.71875,
"rewards/chosen": -2.325334072113037,
"rewards/margins": 1.1762292385101318,
"rewards/rejected": -3.501563310623169,
"step": 650
},
{
"epoch": 0.5181550539744848,
"grad_norm": 28.21457862854004,
"learning_rate": 2.7772332186871464e-06,
"logits/chosen": -1.947697401046753,
"logits/rejected": -1.838045358657837,
"logps/chosen": -504.62628173828125,
"logps/rejected": -588.11669921875,
"loss": 0.5176,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.0726189613342285,
"rewards/margins": 0.9631049036979675,
"rewards/rejected": -3.035723924636841,
"step": 660
},
{
"epoch": 0.5260058881256133,
"grad_norm": 29.053319931030273,
"learning_rate": 2.708966763595493e-06,
"logits/chosen": -1.9613069295883179,
"logits/rejected": -1.8020261526107788,
"logps/chosen": -448.95977783203125,
"logps/rejected": -519.1914672851562,
"loss": 0.5175,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.724962830543518,
"rewards/margins": 1.0086156129837036,
"rewards/rejected": -2.7335782051086426,
"step": 670
},
{
"epoch": 0.5338567222767419,
"grad_norm": 34.93812561035156,
"learning_rate": 2.640543004874409e-06,
"logits/chosen": -2.0338661670684814,
"logits/rejected": -1.964261770248413,
"logps/chosen": -492.92205810546875,
"logps/rejected": -533.3572998046875,
"loss": 0.5076,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.7126522064208984,
"rewards/margins": 0.9582921862602234,
"rewards/rejected": -2.6709446907043457,
"step": 680
},
{
"epoch": 0.5417075564278705,
"grad_norm": 23.211416244506836,
"learning_rate": 2.572013449784671e-06,
"logits/chosen": -1.9940426349639893,
"logits/rejected": -1.881670594215393,
"logps/chosen": -523.5638427734375,
"logps/rejected": -588.251220703125,
"loss": 0.5368,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -2.3426578044891357,
"rewards/margins": 1.0034016370773315,
"rewards/rejected": -3.3460593223571777,
"step": 690
},
{
"epoch": 0.549558390578999,
"grad_norm": 31.131181716918945,
"learning_rate": 2.503429685227245e-06,
"logits/chosen": -1.831365942955017,
"logits/rejected": -1.7525676488876343,
"logps/chosen": -545.9859619140625,
"logps/rejected": -647.7813720703125,
"loss": 0.5173,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.500175714492798,
"rewards/margins": 1.1063227653503418,
"rewards/rejected": -3.6064987182617188,
"step": 700
},
{
"epoch": 0.549558390578999,
"eval_logits/chosen": -2.004182815551758,
"eval_logits/rejected": -1.9400309324264526,
"eval_logps/chosen": -563.7943725585938,
"eval_logps/rejected": -608.2110595703125,
"eval_loss": 0.5244275331497192,
"eval_rewards/accuracies": 0.7038690447807312,
"eval_rewards/chosen": -2.754066228866577,
"eval_rewards/margins": 0.7918809056282043,
"eval_rewards/rejected": -3.5459470748901367,
"eval_runtime": 179.3578,
"eval_samples_per_second": 11.151,
"eval_steps_per_second": 0.468,
"step": 700
},
{
"epoch": 0.5574092247301276,
"grad_norm": 37.639991760253906,
"learning_rate": 2.434843338910286e-06,
"logits/chosen": -1.9917552471160889,
"logits/rejected": -1.9698021411895752,
"logps/chosen": -578.1214599609375,
"logps/rejected": -614.1402587890625,
"loss": 0.5478,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -2.914405584335327,
"rewards/margins": 0.6746307015419006,
"rewards/rejected": -3.589036464691162,
"step": 710
},
{
"epoch": 0.5652600588812562,
"grad_norm": 25.04204750061035,
"learning_rate": 2.3663060404854155e-06,
"logits/chosen": -1.9311301708221436,
"logits/rejected": -1.946319818496704,
"logps/chosen": -533.4403076171875,
"logps/rejected": -599.1284790039062,
"loss": 0.5323,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.5715718269348145,
"rewards/margins": 0.9469249844551086,
"rewards/rejected": -3.5184967517852783,
"step": 720
},
{
"epoch": 0.5731108930323847,
"grad_norm": 30.593637466430664,
"learning_rate": 2.2978693826825406e-06,
"logits/chosen": -1.8591407537460327,
"logits/rejected": -1.9342968463897705,
"logps/chosen": -519.0078125,
"logps/rejected": -567.246826171875,
"loss": 0.5521,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.448857545852661,
"rewards/margins": 0.8260825276374817,
"rewards/rejected": -3.274940013885498,
"step": 730
},
{
"epoch": 0.5809617271835132,
"grad_norm": 32.455841064453125,
"learning_rate": 2.2295848824724612e-06,
"logits/chosen": -2.01774263381958,
"logits/rejected": -1.9122161865234375,
"logps/chosen": -491.2975158691406,
"logps/rejected": -555.1488037109375,
"loss": 0.5371,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.1018691062927246,
"rewards/margins": 0.892257571220398,
"rewards/rejected": -2.994126796722412,
"step": 740
},
{
"epoch": 0.5888125613346418,
"grad_norm": 19.341310501098633,
"learning_rate": 2.1615039422865136e-06,
"logits/chosen": -1.8771547079086304,
"logits/rejected": -1.815799355506897,
"logps/chosen": -499.349609375,
"logps/rejected": -598.83935546875,
"loss": 0.4899,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -1.9459705352783203,
"rewards/margins": 1.2197866439819336,
"rewards/rejected": -3.165757179260254,
"step": 750
},
{
"epoch": 0.5966633954857704,
"grad_norm": 55.24733352661133,
"learning_rate": 2.0936778113224253e-06,
"logits/chosen": -1.9215799570083618,
"logits/rejected": -1.8155832290649414,
"logps/chosen": -542.361328125,
"logps/rejected": -551.7185668945312,
"loss": 0.5494,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.0699851512908936,
"rewards/margins": 0.8438289761543274,
"rewards/rejected": -2.913814067840576,
"step": 760
},
{
"epoch": 0.6045142296368989,
"grad_norm": 37.531490325927734,
"learning_rate": 2.0261575469655304e-06,
"logits/chosen": -1.9638067483901978,
"logits/rejected": -1.8803679943084717,
"logps/chosen": -466.53143310546875,
"logps/rejected": -552.6204833984375,
"loss": 0.5412,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -2.0402417182922363,
"rewards/margins": 1.0711690187454224,
"rewards/rejected": -3.1114110946655273,
"step": 770
},
{
"epoch": 0.6123650637880275,
"grad_norm": 22.25844383239746,
"learning_rate": 1.9589939763543693e-06,
"logits/chosen": -1.8626676797866821,
"logits/rejected": -1.8624019622802734,
"logps/chosen": -464.10333251953125,
"logps/rejected": -532.6005249023438,
"loss": 0.5502,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.867743730545044,
"rewards/margins": 0.8303905725479126,
"rewards/rejected": -2.698134183883667,
"step": 780
},
{
"epoch": 0.620215897939156,
"grad_norm": 28.578536987304688,
"learning_rate": 1.8922376581196107e-06,
"logits/chosen": -2.015662670135498,
"logits/rejected": -1.9723193645477295,
"logps/chosen": -475.9444885253906,
"logps/rejected": -536.0194091796875,
"loss": 0.4799,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -1.8610671758651733,
"rewards/margins": 0.9107308387756348,
"rewards/rejected": -2.7717981338500977,
"step": 790
},
{
"epoch": 0.6280667320902846,
"grad_norm": 21.485143661499023,
"learning_rate": 1.8259388443250993e-06,
"logits/chosen": -2.004772663116455,
"logits/rejected": -1.8513364791870117,
"logps/chosen": -469.4261779785156,
"logps/rejected": -537.4896240234375,
"loss": 0.5081,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -2.004453659057617,
"rewards/margins": 0.9301109313964844,
"rewards/rejected": -2.9345641136169434,
"step": 800
},
{
"epoch": 0.6280667320902846,
"eval_logits/chosen": -1.9647265672683716,
"eval_logits/rejected": -1.909649133682251,
"eval_logps/chosen": -498.1996765136719,
"eval_logps/rejected": -547.5287475585938,
"eval_loss": 0.517790675163269,
"eval_rewards/accuracies": 0.7008928656578064,
"eval_rewards/chosen": -2.0981194972991943,
"eval_rewards/margins": 0.8410041332244873,
"eval_rewards/rejected": -2.9391238689422607,
"eval_runtime": 177.4176,
"eval_samples_per_second": 11.273,
"eval_steps_per_second": 0.473,
"step": 800
},
{
"epoch": 0.6359175662414132,
"grad_norm": 43.05495071411133,
"learning_rate": 1.760147442639679e-06,
"logits/chosen": -1.7117631435394287,
"logits/rejected": -1.8081023693084717,
"logps/chosen": -471.0235290527344,
"logps/rejected": -578.5651245117188,
"loss": 0.5044,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.020402193069458,
"rewards/margins": 1.2799599170684814,
"rewards/rejected": -3.3003621101379395,
"step": 810
},
{
"epoch": 0.6437684003925417,
"grad_norm": 25.53011131286621,
"learning_rate": 1.6949129787682628e-06,
"logits/chosen": -1.8636391162872314,
"logits/rejected": -1.7885582447052002,
"logps/chosen": -535.2430419921875,
"logps/rejected": -592.5420532226562,
"loss": 0.5071,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.181544542312622,
"rewards/margins": 1.0555063486099243,
"rewards/rejected": -3.2370505332946777,
"step": 820
},
{
"epoch": 0.6516192345436702,
"grad_norm": 32.84662628173828,
"learning_rate": 1.6302845591704348e-06,
"logits/chosen": -1.7528541088104248,
"logits/rejected": -1.919858694076538,
"logps/chosen": -471.3095703125,
"logps/rejected": -554.0218505859375,
"loss": 0.5015,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.969342589378357,
"rewards/margins": 0.9811599850654602,
"rewards/rejected": -2.950502872467041,
"step": 830
},
{
"epoch": 0.6594700686947988,
"grad_norm": 37.13783264160156,
"learning_rate": 1.5663108340946465e-06,
"logits/chosen": -2.004257917404175,
"logits/rejected": -1.7805702686309814,
"logps/chosen": -476.3814392089844,
"logps/rejected": -552.9000244140625,
"loss": 0.496,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.8513377904891968,
"rewards/margins": 0.9407541155815125,
"rewards/rejected": -2.7920918464660645,
"step": 840
},
{
"epoch": 0.6673209028459274,
"grad_norm": 66.06834411621094,
"learning_rate": 1.5030399609558364e-06,
"logits/chosen": -1.9352130889892578,
"logits/rejected": -1.8171558380126953,
"logps/chosen": -489.35052490234375,
"logps/rejected": -586.8991088867188,
"loss": 0.485,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.1446428298950195,
"rewards/margins": 1.0632911920547485,
"rewards/rejected": -3.2079339027404785,
"step": 850
},
{
"epoch": 0.6751717369970559,
"grad_norm": 32.76154708862305,
"learning_rate": 1.4405195680840357e-06,
"logits/chosen": -1.8590924739837646,
"logits/rejected": -1.8191407918930054,
"logps/chosen": -515.1978759765625,
"logps/rejected": -582.213623046875,
"loss": 0.5305,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -2.157810926437378,
"rewards/margins": 0.9910067319869995,
"rewards/rejected": -3.148818016052246,
"step": 860
},
{
"epoch": 0.6830225711481845,
"grad_norm": 32.92315673828125,
"learning_rate": 1.378796718871252e-06,
"logits/chosen": -1.9760971069335938,
"logits/rejected": -1.8940002918243408,
"logps/chosen": -500.63360595703125,
"logps/rejected": -580.5349731445312,
"loss": 0.5018,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.069706916809082,
"rewards/margins": 1.104552984237671,
"rewards/rejected": -3.174259662628174,
"step": 870
},
{
"epoch": 0.6908734052993131,
"grad_norm": 27.977630615234375,
"learning_rate": 1.3179178763436302e-06,
"logits/chosen": -1.713399887084961,
"logits/rejected": -1.5991706848144531,
"logps/chosen": -485.16693115234375,
"logps/rejected": -589.6981201171875,
"loss": 0.5245,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.5337507724761963,
"rewards/margins": 0.9967278242111206,
"rewards/rejected": -3.5304782390594482,
"step": 880
},
{
"epoch": 0.6987242394504416,
"grad_norm": 46.519187927246094,
"learning_rate": 1.2579288681855364e-06,
"logits/chosen": -1.8697153329849243,
"logits/rejected": -1.7676079273223877,
"logps/chosen": -555.0260620117188,
"logps/rejected": -671.7311401367188,
"loss": 0.4779,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.7753043174743652,
"rewards/margins": 1.084364414215088,
"rewards/rejected": -3.859668731689453,
"step": 890
},
{
"epoch": 0.7065750736015701,
"grad_norm": 31.31684684753418,
"learning_rate": 1.1988748522419163e-06,
"logits/chosen": -1.9314721822738647,
"logits/rejected": -1.8384710550308228,
"logps/chosen": -595.1455078125,
"logps/rejected": -668.7490234375,
"loss": 0.5197,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -2.979123592376709,
"rewards/margins": 0.9926818609237671,
"rewards/rejected": -3.9718050956726074,
"step": 900
},
{
"epoch": 0.7065750736015701,
"eval_logits/chosen": -1.8656275272369385,
"eval_logits/rejected": -1.7931705713272095,
"eval_logps/chosen": -577.5418701171875,
"eval_logps/rejected": -637.5369873046875,
"eval_loss": 0.5191683173179626,
"eval_rewards/accuracies": 0.7008928656578064,
"eval_rewards/chosen": -2.891542434692383,
"eval_rewards/margins": 0.9476642608642578,
"eval_rewards/rejected": -3.8392069339752197,
"eval_runtime": 255.7835,
"eval_samples_per_second": 7.819,
"eval_steps_per_second": 0.328,
"step": 900
},
{
"epoch": 0.7144259077526988,
"grad_norm": 60.76858139038086,
"learning_rate": 1.1408002825248842e-06,
"logits/chosen": -1.8335750102996826,
"logits/rejected": -1.7328205108642578,
"logps/chosen": -567.2271728515625,
"logps/rejected": -646.0481567382812,
"loss": 0.5185,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -2.787205219268799,
"rewards/margins": 1.058345079421997,
"rewards/rejected": -3.845550537109375,
"step": 910
},
{
"epoch": 0.7222767419038273,
"grad_norm": 42.74496078491211,
"learning_rate": 1.0837488757501369e-06,
"logits/chosen": -1.7031282186508179,
"logits/rejected": -1.6774184703826904,
"logps/chosen": -532.3548583984375,
"logps/rejected": -636.7594604492188,
"loss": 0.4887,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -2.610095262527466,
"rewards/margins": 1.1858711242675781,
"rewards/rejected": -3.795966386795044,
"step": 920
},
{
"epoch": 0.7301275760549558,
"grad_norm": 32.94953155517578,
"learning_rate": 1.027763578428379e-06,
"logits/chosen": -1.7176014184951782,
"logits/rejected": -1.7608709335327148,
"logps/chosen": -563.7265625,
"logps/rejected": -646.8751220703125,
"loss": 0.4836,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -2.821300983428955,
"rewards/margins": 1.0197052955627441,
"rewards/rejected": -3.8410065174102783,
"step": 930
},
{
"epoch": 0.7379784102060843,
"grad_norm": 74.49922943115234,
"learning_rate": 9.728865345365379e-07,
"logits/chosen": -1.7150166034698486,
"logits/rejected": -1.5209593772888184,
"logps/chosen": -534.5591430664062,
"logps/rejected": -621.5565185546875,
"loss": 0.5418,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.711378812789917,
"rewards/margins": 1.1061863899230957,
"rewards/rejected": -3.8175652027130127,
"step": 940
},
{
"epoch": 0.745829244357213,
"grad_norm": 27.46148681640625,
"learning_rate": 9.191590537930975e-07,
"logits/chosen": -1.7130823135375977,
"logits/rejected": -1.638779878616333,
"logps/chosen": -529.4462280273438,
"logps/rejected": -603.8697509765625,
"loss": 0.536,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.627434015274048,
"rewards/margins": 1.0126179456710815,
"rewards/rejected": -3.6400516033172607,
"step": 950
},
{
"epoch": 0.7536800785083415,
"grad_norm": 21.87665367126465,
"learning_rate": 8.666215805614373e-07,
"logits/chosen": -1.7968714237213135,
"logits/rejected": -1.8486363887786865,
"logps/chosen": -504.91571044921875,
"logps/rejected": -589.1393432617188,
"loss": 0.5057,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.2802155017852783,
"rewards/margins": 1.0134499073028564,
"rewards/rejected": -3.2936654090881348,
"step": 960
},
{
"epoch": 0.76153091265947,
"grad_norm": 29.431264877319336,
"learning_rate": 8.153136634045844e-07,
"logits/chosen": -1.9010169506072998,
"logits/rejected": -1.6634715795516968,
"logps/chosen": -493.634765625,
"logps/rejected": -557.65380859375,
"loss": 0.4996,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.152984619140625,
"rewards/margins": 1.0319383144378662,
"rewards/rejected": -3.184922933578491,
"step": 970
},
{
"epoch": 0.7693817468105987,
"grad_norm": 41.45183181762695,
"learning_rate": 7.652739253142915e-07,
"logits/chosen": -1.9328157901763916,
"logits/rejected": -1.7516534328460693,
"logps/chosen": -538.4470825195312,
"logps/rejected": -577.069580078125,
"loss": 0.5214,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.125819683074951,
"rewards/margins": 1.0004959106445312,
"rewards/rejected": -3.1263155937194824,
"step": 980
},
{
"epoch": 0.7772325809617272,
"grad_norm": 21.71674346923828,
"learning_rate": 7.165400346368648e-07,
"logits/chosen": -1.9481573104858398,
"logits/rejected": -1.8963590860366821,
"logps/chosen": -547.48486328125,
"logps/rejected": -585.6912231445312,
"loss": 0.5278,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -2.2814033031463623,
"rewards/margins": 0.8729672431945801,
"rewards/rejected": -3.1543705463409424,
"step": 990
},
{
"epoch": 0.7850834151128557,
"grad_norm": 60.18208312988281,
"learning_rate": 6.691486767176092e-07,
"logits/chosen": -1.7295516729354858,
"logits/rejected": -1.773970365524292,
"logps/chosen": -467.82049560546875,
"logps/rejected": -562.5482177734375,
"loss": 0.5008,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -2.1015613079071045,
"rewards/margins": 0.9576795697212219,
"rewards/rejected": -3.0592408180236816,
"step": 1000
},
{
"epoch": 0.7850834151128557,
"eval_logits/chosen": -1.991379737854004,
"eval_logits/rejected": -1.937352180480957,
"eval_logps/chosen": -498.16119384765625,
"eval_logps/rejected": -545.6578979492188,
"eval_loss": 0.5102471709251404,
"eval_rewards/accuracies": 0.7008928656578064,
"eval_rewards/chosen": -2.0977351665496826,
"eval_rewards/margins": 0.8226803541183472,
"eval_rewards/rejected": -2.9204154014587402,
"eval_runtime": 248.8844,
"eval_samples_per_second": 8.036,
"eval_steps_per_second": 0.338,
"step": 1000
},
{
"epoch": 0.7929342492639843,
"grad_norm": 27.0752010345459,
"learning_rate": 6.231355262852529e-07,
"logits/chosen": -1.8228180408477783,
"logits/rejected": -1.728371024131775,
"logps/chosen": -497.906982421875,
"logps/rejected": -574.6722412109375,
"loss": 0.5178,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.1114232540130615,
"rewards/margins": 1.0526831150054932,
"rewards/rejected": -3.1641063690185547,
"step": 1010
},
{
"epoch": 0.8007850834151129,
"grad_norm": 40.453643798828125,
"learning_rate": 5.785352205971275e-07,
"logits/chosen": -1.8827228546142578,
"logits/rejected": -1.8348219394683838,
"logps/chosen": -479.0231018066406,
"logps/rejected": -544.406982421875,
"loss": 0.4717,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.9858747720718384,
"rewards/margins": 0.8169358372688293,
"rewards/rejected": -2.8028104305267334,
"step": 1020
},
{
"epoch": 0.8086359175662414,
"grad_norm": 26.58576774597168,
"learning_rate": 5.353813333653287e-07,
"logits/chosen": -1.9306774139404297,
"logits/rejected": -1.9138189554214478,
"logps/chosen": -529.3744506835938,
"logps/rejected": -577.8673095703125,
"loss": 0.5073,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -2.1622471809387207,
"rewards/margins": 0.9140118360519409,
"rewards/rejected": -3.076258659362793,
"step": 1030
},
{
"epoch": 0.81648675171737,
"grad_norm": 23.61007308959961,
"learning_rate": 4.937063494834774e-07,
"logits/chosen": -1.814344048500061,
"logits/rejected": -1.6967451572418213,
"logps/chosen": -507.7666015625,
"logps/rejected": -598.0667724609375,
"loss": 0.5215,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.145608425140381,
"rewards/margins": 0.9707077741622925,
"rewards/rejected": -3.116316080093384,
"step": 1040
},
{
"epoch": 0.8243375858684985,
"grad_norm": 28.008739471435547,
"learning_rate": 4.5354164057310857e-07,
"logits/chosen": -1.8821042776107788,
"logits/rejected": -1.7559188604354858,
"logps/chosen": -465.9667053222656,
"logps/rejected": -576.3198852539062,
"loss": 0.5257,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.1245594024658203,
"rewards/margins": 1.131911039352417,
"rewards/rejected": -3.256470203399658,
"step": 1050
},
{
"epoch": 0.8321884200196271,
"grad_norm": 23.431196212768555,
"learning_rate": 4.1491744136810066e-07,
"logits/chosen": -1.8241643905639648,
"logits/rejected": -1.5898910760879517,
"logps/chosen": -494.38006591796875,
"logps/rejected": -608.50048828125,
"loss": 0.5239,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.2372994422912598,
"rewards/margins": 1.0501940250396729,
"rewards/rejected": -3.2874934673309326,
"step": 1060
},
{
"epoch": 0.8400392541707556,
"grad_norm": 36.24497604370117,
"learning_rate": 3.7786282695491313e-07,
"logits/chosen": -1.7533372640609741,
"logits/rejected": -1.780310034751892,
"logps/chosen": -521.2637939453125,
"logps/rejected": -594.5169067382812,
"loss": 0.5173,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.1260104179382324,
"rewards/margins": 1.0049241781234741,
"rewards/rejected": -3.130934476852417,
"step": 1070
},
{
"epoch": 0.8478900883218842,
"grad_norm": 28.115896224975586,
"learning_rate": 3.4240569088577564e-07,
"logits/chosen": -1.9627529382705688,
"logits/rejected": -1.9232120513916016,
"logps/chosen": -521.6199951171875,
"logps/rejected": -590.34619140625,
"loss": 0.5159,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -2.1085124015808105,
"rewards/margins": 1.0188482999801636,
"rewards/rejected": -3.1273605823516846,
"step": 1080
},
{
"epoch": 0.8557409224730128,
"grad_norm": 25.046926498413086,
"learning_rate": 3.0857272418129136e-07,
"logits/chosen": -1.8483200073242188,
"logits/rejected": -1.8257999420166016,
"logps/chosen": -538.3873901367188,
"logps/rejected": -620.02978515625,
"loss": 0.5008,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -2.2884747982025146,
"rewards/margins": 1.0311329364776611,
"rewards/rejected": -3.3196074962615967,
"step": 1090
},
{
"epoch": 0.8635917566241413,
"grad_norm": 25.578903198242188,
"learning_rate": 2.7638939523827956e-07,
"logits/chosen": -1.771712064743042,
"logits/rejected": -1.6592738628387451,
"logps/chosen": -536.8753662109375,
"logps/rejected": -635.494384765625,
"loss": 0.5223,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.2466344833374023,
"rewards/margins": 1.0864031314849854,
"rewards/rejected": -3.3330376148223877,
"step": 1100
},
{
"epoch": 0.8635917566241413,
"eval_logits/chosen": -1.9598368406295776,
"eval_logits/rejected": -1.8985047340393066,
"eval_logps/chosen": -510.208984375,
"eval_logps/rejected": -564.5363159179688,
"eval_loss": 0.5109513401985168,
"eval_rewards/accuracies": 0.6934523582458496,
"eval_rewards/chosen": -2.218212604522705,
"eval_rewards/margins": 0.8909867405891418,
"eval_rewards/rejected": -3.1092000007629395,
"eval_runtime": 178.9794,
"eval_samples_per_second": 11.174,
"eval_steps_per_second": 0.469,
"step": 1100
},
{
"epoch": 0.8714425907752699,
"grad_norm": 24.112642288208008,
"learning_rate": 2.4587993065795983e-07,
"logits/chosen": -1.8837692737579346,
"logits/rejected": -1.7314777374267578,
"logps/chosen": -480.4740295410156,
"logps/rejected": -563.0213623046875,
"loss": 0.5227,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.0834219455718994,
"rewards/margins": 1.149505853652954,
"rewards/rejected": -3.2329280376434326,
"step": 1110
},
{
"epoch": 0.8792934249263984,
"grad_norm": 24.728294372558594,
"learning_rate": 2.170672970089291e-07,
"logits/chosen": -1.8168354034423828,
"logits/rejected": -1.7316901683807373,
"logps/chosen": -536.4750366210938,
"logps/rejected": -631.4368896484375,
"loss": 0.4847,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.207212448120117,
"rewards/margins": 1.1604888439178467,
"rewards/rejected": -3.367701768875122,
"step": 1120
},
{
"epoch": 0.887144259077527,
"grad_norm": 34.55753707885742,
"learning_rate": 1.8997318353864673e-07,
"logits/chosen": -1.887563943862915,
"logits/rejected": -1.5958278179168701,
"logps/chosen": -506.45294189453125,
"logps/rejected": -567.0094604492188,
"loss": 0.5052,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.201402187347412,
"rewards/margins": 1.0240195989608765,
"rewards/rejected": -3.225421905517578,
"step": 1130
},
{
"epoch": 0.8949950932286556,
"grad_norm": 37.284019470214844,
"learning_rate": 1.6461798584644944e-07,
"logits/chosen": -1.940473198890686,
"logits/rejected": -1.8656337261199951,
"logps/chosen": -518.56494140625,
"logps/rejected": -582.9520874023438,
"loss": 0.4778,
"rewards/accuracies": 0.8125,
"rewards/chosen": -2.122587203979492,
"rewards/margins": 1.1023415327072144,
"rewards/rejected": -3.224928617477417,
"step": 1140
},
{
"epoch": 0.9028459273797841,
"grad_norm": 48.473114013671875,
"learning_rate": 1.4102079053038454e-07,
"logits/chosen": -1.9566850662231445,
"logits/rejected": -1.7725406885147095,
"logps/chosen": -515.0001220703125,
"logps/rejected": -587.2335205078125,
"loss": 0.4947,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.1063010692596436,
"rewards/margins": 1.121829628944397,
"rewards/rejected": -3.22813081741333,
"step": 1150
},
{
"epoch": 0.9106967615309126,
"grad_norm": 23.98328399658203,
"learning_rate": 1.1919936081941585e-07,
"logits/chosen": -1.9583518505096436,
"logits/rejected": -1.8895307779312134,
"logps/chosen": -528.2996215820312,
"logps/rejected": -599.0931396484375,
"loss": 0.5063,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -2.383150577545166,
"rewards/margins": 0.8790243268013,
"rewards/rejected": -3.2621750831604004,
"step": 1160
},
{
"epoch": 0.9185475956820413,
"grad_norm": 30.729877471923828,
"learning_rate": 9.917012320182245e-08,
"logits/chosen": -1.8442468643188477,
"logits/rejected": -1.7293345928192139,
"logps/chosen": -530.6605224609375,
"logps/rejected": -573.9486083984375,
"loss": 0.5107,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -2.3617968559265137,
"rewards/margins": 0.8581873774528503,
"rewards/rejected": -3.2199840545654297,
"step": 1170
},
{
"epoch": 0.9263984298331698,
"grad_norm": 29.362680435180664,
"learning_rate": 8.094815505985315e-08,
"logits/chosen": -1.898097276687622,
"logits/rejected": -1.7420837879180908,
"logps/chosen": -498.27874755859375,
"logps/rejected": -638.7017211914062,
"loss": 0.4825,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.2651729583740234,
"rewards/margins": 1.1845793724060059,
"rewards/rejected": -3.44975209236145,
"step": 1180
},
{
"epoch": 0.9342492639842983,
"grad_norm": 36.354610443115234,
"learning_rate": 6.454717331994542e-08,
"logits/chosen": -1.9377390146255493,
"logits/rejected": -1.8412069082260132,
"logps/chosen": -528.7586059570312,
"logps/rejected": -617.6361083984375,
"loss": 0.5211,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.173046350479126,
"rewards/margins": 1.1210204362869263,
"rewards/rejected": -3.2940666675567627,
"step": 1190
},
{
"epoch": 0.9421000981354269,
"grad_norm": 37.27730178833008,
"learning_rate": 4.9979524127052595e-08,
"logits/chosen": -1.7879035472869873,
"logits/rejected": -1.8019065856933594,
"logps/chosen": -485.499755859375,
"logps/rejected": -587.9569091796875,
"loss": 0.4981,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -2.1999027729034424,
"rewards/margins": 1.0023242235183716,
"rewards/rejected": -3.2022266387939453,
"step": 1200
},
{
"epoch": 0.9421000981354269,
"eval_logits/chosen": -1.9679957628250122,
"eval_logits/rejected": -1.9060754776000977,
"eval_logps/chosen": -509.53515625,
"eval_logps/rejected": -565.4013061523438,
"eval_loss": 0.5110836029052734,
"eval_rewards/accuracies": 0.699404776096344,
"eval_rewards/chosen": -2.211474895477295,
"eval_rewards/margins": 0.9063741564750671,
"eval_rewards/rejected": -3.117849349975586,
"eval_runtime": 303.5083,
"eval_samples_per_second": 6.59,
"eval_steps_per_second": 0.277,
"step": 1200
},
{
"epoch": 0.9499509322865555,
"grad_norm": 30.429931640625,
"learning_rate": 3.725617355085476e-08,
"logits/chosen": -1.7728469371795654,
"logits/rejected": -1.6203314065933228,
"logps/chosen": -476.9127502441406,
"logps/rejected": -577.5582275390625,
"loss": 0.507,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.16386079788208,
"rewards/margins": 1.1731908321380615,
"rewards/rejected": -3.3370513916015625,
"step": 1210
},
{
"epoch": 0.957801766437684,
"grad_norm": 42.811119079589844,
"learning_rate": 2.63866993308437e-08,
"logits/chosen": -1.765027642250061,
"logits/rejected": -1.6837198734283447,
"logps/chosen": -484.285400390625,
"logps/rejected": -537.3614501953125,
"loss": 0.5262,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.178112506866455,
"rewards/margins": 0.8579233884811401,
"rewards/rejected": -3.0360360145568848,
"step": 1220
},
{
"epoch": 0.9656526005888125,
"grad_norm": 28.079404830932617,
"learning_rate": 1.737928366650099e-08,
"logits/chosen": -1.9261119365692139,
"logits/rejected": -1.853053092956543,
"logps/chosen": -547.2498779296875,
"logps/rejected": -600.8333129882812,
"loss": 0.5182,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.244377851486206,
"rewards/margins": 1.105455756187439,
"rewards/rejected": -3.3498339653015137,
"step": 1230
},
{
"epoch": 0.9735034347399412,
"grad_norm": 29.11058807373047,
"learning_rate": 1.0240707057995735e-08,
"logits/chosen": -1.7693697214126587,
"logits/rejected": -1.5242459774017334,
"logps/chosen": -488.11724853515625,
"logps/rejected": -578.2257690429688,
"loss": 0.4903,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.2136459350585938,
"rewards/margins": 0.9566876292228699,
"rewards/rejected": -3.1703333854675293,
"step": 1240
},
{
"epoch": 0.9813542688910697,
"grad_norm": 24.037424087524414,
"learning_rate": 4.976343202034717e-09,
"logits/chosen": -1.754732370376587,
"logits/rejected": -1.6457884311676025,
"logps/chosen": -478.7969665527344,
"logps/rejected": -566.3361206054688,
"loss": 0.4716,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.199491024017334,
"rewards/margins": 0.9993401765823364,
"rewards/rejected": -3.198831081390381,
"step": 1250
},
{
"epoch": 0.9892051030421982,
"grad_norm": 33.65019607543945,
"learning_rate": 1.5901549467139953e-09,
"logits/chosen": -1.9445594549179077,
"logits/rejected": -1.8698110580444336,
"logps/chosen": -522.216552734375,
"logps/rejected": -589.295654296875,
"loss": 0.5043,
"rewards/accuracies": 0.71875,
"rewards/chosen": -2.2183516025543213,
"rewards/margins": 0.9545730352401733,
"rewards/rejected": -3.172924757003784,
"step": 1260
},
{
"epoch": 0.9970559371933267,
"grad_norm": 39.74230194091797,
"learning_rate": 8.469130840960127e-11,
"logits/chosen": -1.7422492504119873,
"logits/rejected": -1.6215105056762695,
"logps/chosen": -489.52642822265625,
"logps/rejected": -590.7807006835938,
"loss": 0.5169,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.154064655303955,
"rewards/margins": 1.0220654010772705,
"rewards/rejected": -3.176130533218384,
"step": 1270
},
{
"epoch": 0.9994111874386653,
"step": 1273,
"total_flos": 0.0,
"train_loss": 0.0,
"train_runtime": 0.0132,
"train_samples_per_second": 4647380.664,
"train_steps_per_second": 96772.918
}
],
"logging_steps": 10,
"max_steps": 1273,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}