Safetensors
qwen2
trustalign_qwen2.5_7b / trainer_state.json
shanghong's picture
Upload folder using huggingface_hub
791a99b verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.3129888447236826,
"eval_steps": 10,
"global_step": 320,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.008206180279523016,
"grad_norm": 72.29982535271158,
"learning_rate": 2.040816326530612e-08,
"logits/chosen": -2.1581597328186035,
"logits/rejected": -2.159653902053833,
"logps/chosen": -25.12261962890625,
"logps/rejected": -43.09302520751953,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 2
},
{
"epoch": 0.016412360559046033,
"grad_norm": 96.10955860968517,
"learning_rate": 4.081632653061224e-08,
"logits/chosen": -2.0733723640441895,
"logits/rejected": -2.0702476501464844,
"logps/chosen": -24.637685775756836,
"logps/rejected": -41.43503189086914,
"loss": 0.6963,
"rewards/accuracies": 0.453125,
"rewards/chosen": 0.008512299507856369,
"rewards/margins": -0.013849787414073944,
"rewards/rejected": 0.022362088784575462,
"step": 4
},
{
"epoch": 0.02461854083856905,
"grad_norm": 68.94505414065074,
"learning_rate": 6.122448979591837e-08,
"logits/chosen": -2.118948459625244,
"logits/rejected": -2.109750270843506,
"logps/chosen": -20.342554092407227,
"logps/rejected": -29.361812591552734,
"loss": 0.6878,
"rewards/accuracies": 0.484375,
"rewards/chosen": 0.028929362073540688,
"rewards/margins": 0.029945004731416702,
"rewards/rejected": -0.0010156440548598766,
"step": 6
},
{
"epoch": 0.032824721118092065,
"grad_norm": 77.97306429217313,
"learning_rate": 8.163265306122448e-08,
"logits/chosen": -2.0574400424957275,
"logits/rejected": -2.0550637245178223,
"logps/chosen": -27.057085037231445,
"logps/rejected": -39.5283088684082,
"loss": 0.7013,
"rewards/accuracies": 0.40625,
"rewards/chosen": -0.014948742464184761,
"rewards/margins": -0.01355709508061409,
"rewards/rejected": -0.0013916483148932457,
"step": 8
},
{
"epoch": 0.04103090139761508,
"grad_norm": 67.40204277563684,
"learning_rate": 1.0204081632653061e-07,
"logits/chosen": -2.0784454345703125,
"logits/rejected": -2.072957992553711,
"logps/chosen": -25.560945510864258,
"logps/rejected": -31.469083786010742,
"loss": 0.6968,
"rewards/accuracies": 0.453125,
"rewards/chosen": -0.004186006262898445,
"rewards/margins": 0.010740835219621658,
"rewards/rejected": -0.014926840551197529,
"step": 10
},
{
"epoch": 0.04103090139761508,
"eval_logits/chosen": -2.0721328258514404,
"eval_logits/rejected": -2.069610834121704,
"eval_logps/chosen": -24.65077018737793,
"eval_logps/rejected": -31.145498275756836,
"eval_loss": 0.6956828236579895,
"eval_rewards/accuracies": 0.40668201446533203,
"eval_rewards/chosen": -0.005178123712539673,
"eval_rewards/margins": -0.0014642790192738175,
"eval_rewards/rejected": -0.003713843412697315,
"eval_runtime": 391.6625,
"eval_samples_per_second": 4.427,
"eval_steps_per_second": 1.108,
"step": 10
},
{
"epoch": 0.0492370816771381,
"grad_norm": 75.23897935562478,
"learning_rate": 1.2244897959183673e-07,
"logits/chosen": -2.105541229248047,
"logits/rejected": -2.110215187072754,
"logps/chosen": -25.037534713745117,
"logps/rejected": -51.45681381225586,
"loss": 0.691,
"rewards/accuracies": 0.515625,
"rewards/chosen": -0.02754499390721321,
"rewards/margins": -0.003392305225133896,
"rewards/rejected": -0.024152684956789017,
"step": 12
},
{
"epoch": 0.057443261956661114,
"grad_norm": 62.15854308442268,
"learning_rate": 1.4285714285714285e-07,
"logits/chosen": -2.0920844078063965,
"logits/rejected": -2.0932013988494873,
"logps/chosen": -30.580596923828125,
"logps/rejected": -43.68434143066406,
"loss": 0.693,
"rewards/accuracies": 0.578125,
"rewards/chosen": -0.03211332485079765,
"rewards/margins": 0.012334156781435013,
"rewards/rejected": -0.044447485357522964,
"step": 14
},
{
"epoch": 0.06564944223618413,
"grad_norm": 70.35846634784929,
"learning_rate": 1.6326530612244896e-07,
"logits/chosen": -2.058406114578247,
"logits/rejected": -2.0612599849700928,
"logps/chosen": -24.555591583251953,
"logps/rejected": -47.54515838623047,
"loss": 0.6833,
"rewards/accuracies": 0.546875,
"rewards/chosen": -0.06411759555339813,
"rewards/margins": 0.02048414759337902,
"rewards/rejected": -0.0846017450094223,
"step": 16
},
{
"epoch": 0.07385562251570714,
"grad_norm": 53.416196200679686,
"learning_rate": 1.836734693877551e-07,
"logits/chosen": -2.0910069942474365,
"logits/rejected": -2.0841658115386963,
"logps/chosen": -24.07425308227539,
"logps/rejected": -23.863197326660156,
"loss": 0.6845,
"rewards/accuracies": 0.515625,
"rewards/chosen": -0.09408427029848099,
"rewards/margins": 0.020395735278725624,
"rewards/rejected": -0.11447998881340027,
"step": 18
},
{
"epoch": 0.08206180279523016,
"grad_norm": 81.67543770250495,
"learning_rate": 2.0408163265306121e-07,
"logits/chosen": -2.0422399044036865,
"logits/rejected": -2.0412869453430176,
"logps/chosen": -23.748153686523438,
"logps/rejected": -41.940765380859375,
"loss": 0.6754,
"rewards/accuracies": 0.546875,
"rewards/chosen": -0.08865831792354584,
"rewards/margins": 0.042564887553453445,
"rewards/rejected": -0.13122320175170898,
"step": 20
},
{
"epoch": 0.08206180279523016,
"eval_logits/chosen": -2.073920726776123,
"eval_logits/rejected": -2.071418285369873,
"eval_logps/chosen": -24.912931442260742,
"eval_logps/rejected": -31.505783081054688,
"eval_loss": 0.6749772429466248,
"eval_rewards/accuracies": 0.559907853603363,
"eval_rewards/chosen": -0.13625794649124146,
"eval_rewards/margins": 0.04759809002280235,
"eval_rewards/rejected": -0.1838560253381729,
"eval_runtime": 388.9647,
"eval_samples_per_second": 4.458,
"eval_steps_per_second": 1.116,
"step": 20
},
{
"epoch": 0.09026798307475317,
"grad_norm": 58.1111337202034,
"learning_rate": 2.2448979591836733e-07,
"logits/chosen": -2.141969680786133,
"logits/rejected": -2.139296054840088,
"logps/chosen": -28.556930541992188,
"logps/rejected": -33.20477294921875,
"loss": 0.6809,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.1592944860458374,
"rewards/margins": 0.038385238498449326,
"rewards/rejected": -0.19767972826957703,
"step": 22
},
{
"epoch": 0.0984741633542762,
"grad_norm": 59.55094100468827,
"learning_rate": 2.4489795918367347e-07,
"logits/chosen": -2.0954370498657227,
"logits/rejected": -2.0884788036346436,
"logps/chosen": -24.139230728149414,
"logps/rejected": -28.622846603393555,
"loss": 0.6507,
"rewards/accuracies": 0.640625,
"rewards/chosen": -0.26595112681388855,
"rewards/margins": 0.08658240735530853,
"rewards/rejected": -0.3525335192680359,
"step": 24
},
{
"epoch": 0.1066803436337992,
"grad_norm": 57.19794639459796,
"learning_rate": 2.653061224489796e-07,
"logits/chosen": -2.137075662612915,
"logits/rejected": -2.142050266265869,
"logps/chosen": -29.15459442138672,
"logps/rejected": -42.837894439697266,
"loss": 0.6393,
"rewards/accuracies": 0.578125,
"rewards/chosen": -0.3813813328742981,
"rewards/margins": 0.10138601809740067,
"rewards/rejected": -0.48276740312576294,
"step": 26
},
{
"epoch": 0.11488652391332223,
"grad_norm": 49.34854220939512,
"learning_rate": 2.857142857142857e-07,
"logits/chosen": -2.078376531600952,
"logits/rejected": -2.0755605697631836,
"logps/chosen": -25.93030548095703,
"logps/rejected": -38.2554817199707,
"loss": 0.6617,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.49894633889198303,
"rewards/margins": 0.13289065659046173,
"rewards/rejected": -0.631837010383606,
"step": 28
},
{
"epoch": 0.12309270419284524,
"grad_norm": 57.358296790419175,
"learning_rate": 3.0612244897959183e-07,
"logits/chosen": -2.1170706748962402,
"logits/rejected": -2.123375415802002,
"logps/chosen": -21.049701690673828,
"logps/rejected": -53.651187896728516,
"loss": 0.6385,
"rewards/accuracies": 0.546875,
"rewards/chosen": -0.6054391264915466,
"rewards/margins": 0.20123842358589172,
"rewards/rejected": -0.806677520275116,
"step": 30
},
{
"epoch": 0.12309270419284524,
"eval_logits/chosen": -2.072676420211792,
"eval_logits/rejected": -2.0702009201049805,
"eval_logps/chosen": -25.986181259155273,
"eval_logps/rejected": -32.866546630859375,
"eval_loss": 0.6259192824363708,
"eval_rewards/accuracies": 0.5967742204666138,
"eval_rewards/chosen": -0.672883927822113,
"eval_rewards/margins": 0.1913554072380066,
"eval_rewards/rejected": -0.8642393946647644,
"eval_runtime": 383.4842,
"eval_samples_per_second": 4.522,
"eval_steps_per_second": 1.132,
"step": 30
},
{
"epoch": 0.13129888447236826,
"grad_norm": 51.59090790007893,
"learning_rate": 3.265306122448979e-07,
"logits/chosen": -2.065584182739258,
"logits/rejected": -2.059347629547119,
"logps/chosen": -21.224966049194336,
"logps/rejected": -31.50775718688965,
"loss": 0.6031,
"rewards/accuracies": 0.609375,
"rewards/chosen": -0.6435321569442749,
"rewards/margins": 0.2527439296245575,
"rewards/rejected": -0.8962759971618652,
"step": 32
},
{
"epoch": 0.13950506475189126,
"grad_norm": 48.81049837247277,
"learning_rate": 3.4693877551020406e-07,
"logits/chosen": -2.044010639190674,
"logits/rejected": -2.04213285446167,
"logps/chosen": -21.99913787841797,
"logps/rejected": -41.94310760498047,
"loss": 0.5998,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.8395999670028687,
"rewards/margins": 0.3455100953578949,
"rewards/rejected": -1.1851099729537964,
"step": 34
},
{
"epoch": 0.14771124503141428,
"grad_norm": 51.08931493056027,
"learning_rate": 3.673469387755102e-07,
"logits/chosen": -2.0850181579589844,
"logits/rejected": -2.0847809314727783,
"logps/chosen": -31.404415130615234,
"logps/rejected": -38.55030059814453,
"loss": 0.5904,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.2278621196746826,
"rewards/margins": 0.26516327261924744,
"rewards/rejected": -1.493025541305542,
"step": 36
},
{
"epoch": 0.1559174253109373,
"grad_norm": 59.35651114893968,
"learning_rate": 3.877551020408163e-07,
"logits/chosen": -2.114025831222534,
"logits/rejected": -2.1199615001678467,
"logps/chosen": -24.901527404785156,
"logps/rejected": -50.99317932128906,
"loss": 0.5923,
"rewards/accuracies": 0.640625,
"rewards/chosen": -1.1870838403701782,
"rewards/margins": 0.41857296228408813,
"rewards/rejected": -1.6056568622589111,
"step": 38
},
{
"epoch": 0.16412360559046033,
"grad_norm": 46.92283583915801,
"learning_rate": 4.0816326530612243e-07,
"logits/chosen": -2.109259843826294,
"logits/rejected": -2.1076252460479736,
"logps/chosen": -27.03592300415039,
"logps/rejected": -42.91762161254883,
"loss": 0.5115,
"rewards/accuracies": 0.671875,
"rewards/chosen": -1.1980061531066895,
"rewards/margins": 0.5265197157859802,
"rewards/rejected": -1.7245259284973145,
"step": 40
},
{
"epoch": 0.16412360559046033,
"eval_logits/chosen": -2.0738561153411865,
"eval_logits/rejected": -2.0714197158813477,
"eval_logps/chosen": -27.073177337646484,
"eval_logps/rejected": -34.44426345825195,
"eval_loss": 0.5755711197853088,
"eval_rewards/accuracies": 0.6682027578353882,
"eval_rewards/chosen": -1.2163803577423096,
"eval_rewards/margins": 0.43671703338623047,
"eval_rewards/rejected": -1.6530975103378296,
"eval_runtime": 383.5104,
"eval_samples_per_second": 4.521,
"eval_steps_per_second": 1.132,
"step": 40
},
{
"epoch": 0.17232978586998332,
"grad_norm": 53.83140343188913,
"learning_rate": 4.285714285714285e-07,
"logits/chosen": -2.0352187156677246,
"logits/rejected": -2.037541627883911,
"logps/chosen": -32.00979232788086,
"logps/rejected": -34.965049743652344,
"loss": 0.5958,
"rewards/accuracies": 0.609375,
"rewards/chosen": -1.6347800493240356,
"rewards/margins": 0.25150731205940247,
"rewards/rejected": -1.8862874507904053,
"step": 42
},
{
"epoch": 0.18053596614950634,
"grad_norm": 47.972074857166355,
"learning_rate": 4.4897959183673465e-07,
"logits/chosen": -2.075744390487671,
"logits/rejected": -2.080559253692627,
"logps/chosen": -27.046289443969727,
"logps/rejected": -43.155860900878906,
"loss": 0.5475,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.4715561866760254,
"rewards/margins": 0.3980383574962616,
"rewards/rejected": -1.8695944547653198,
"step": 44
},
{
"epoch": 0.18874214642902937,
"grad_norm": 49.55763090785352,
"learning_rate": 4.693877551020408e-07,
"logits/chosen": -2.0551533699035645,
"logits/rejected": -2.0602352619171143,
"logps/chosen": -33.108123779296875,
"logps/rejected": -42.747528076171875,
"loss": 0.5183,
"rewards/accuracies": 0.640625,
"rewards/chosen": -1.2049014568328857,
"rewards/margins": 0.6831133961677551,
"rewards/rejected": -1.888014793395996,
"step": 46
},
{
"epoch": 0.1969483267085524,
"grad_norm": 36.14217009270528,
"learning_rate": 4.897959183673469e-07,
"logits/chosen": -2.178698778152466,
"logits/rejected": -2.178926467895508,
"logps/chosen": -23.19949722290039,
"logps/rejected": -47.667236328125,
"loss": 0.4744,
"rewards/accuracies": 0.703125,
"rewards/chosen": -1.3382686376571655,
"rewards/margins": 0.744179368019104,
"rewards/rejected": -2.0824477672576904,
"step": 48
},
{
"epoch": 0.20515450698807539,
"grad_norm": 44.92936419641385,
"learning_rate": 4.999935398141225e-07,
"logits/chosen": -2.0898728370666504,
"logits/rejected": -2.086491107940674,
"logps/chosen": -28.257888793945312,
"logps/rejected": -45.15388870239258,
"loss": 0.493,
"rewards/accuracies": 0.796875,
"rewards/chosen": -1.5506025552749634,
"rewards/margins": 0.6771446466445923,
"rewards/rejected": -2.2277474403381348,
"step": 50
},
{
"epoch": 0.20515450698807539,
"eval_logits/chosen": -2.0701448917388916,
"eval_logits/rejected": -2.067728042602539,
"eval_logps/chosen": -26.948427200317383,
"eval_logps/rejected": -35.04188919067383,
"eval_loss": 0.49889206886291504,
"eval_rewards/accuracies": 0.7396313548088074,
"eval_rewards/chosen": -1.154005527496338,
"eval_rewards/margins": 0.797903835773468,
"eval_rewards/rejected": -1.9519096612930298,
"eval_runtime": 383.3286,
"eval_samples_per_second": 4.524,
"eval_steps_per_second": 1.132,
"step": 50
},
{
"epoch": 0.2133606872675984,
"grad_norm": 39.90879769142747,
"learning_rate": 4.999418603303176e-07,
"logits/chosen": -2.044602394104004,
"logits/rejected": -2.043879985809326,
"logps/chosen": -20.71453094482422,
"logps/rejected": -39.783756256103516,
"loss": 0.4715,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.9839560985565186,
"rewards/margins": 0.9620079398155212,
"rewards/rejected": -1.9459640979766846,
"step": 52
},
{
"epoch": 0.22156686754712143,
"grad_norm": 37.33476295758473,
"learning_rate": 4.998385120460602e-07,
"logits/chosen": -2.080956220626831,
"logits/rejected": -2.0807886123657227,
"logps/chosen": -26.196861267089844,
"logps/rejected": -39.63607406616211,
"loss": 0.51,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.140933871269226,
"rewards/margins": 0.8466982841491699,
"rewards/rejected": -1.9876320362091064,
"step": 54
},
{
"epoch": 0.22977304782664446,
"grad_norm": 45.971175914221135,
"learning_rate": 4.996835163258461e-07,
"logits/chosen": -2.098681688308716,
"logits/rejected": -2.0977959632873535,
"logps/chosen": -27.12312126159668,
"logps/rejected": -44.47397994995117,
"loss": 0.5346,
"rewards/accuracies": 0.671875,
"rewards/chosen": -1.1570792198181152,
"rewards/margins": 0.7375643253326416,
"rewards/rejected": -1.8946435451507568,
"step": 56
},
{
"epoch": 0.23797922810616745,
"grad_norm": 37.460312981010176,
"learning_rate": 4.994769052108987e-07,
"logits/chosen": -2.0555946826934814,
"logits/rejected": -2.0529699325561523,
"logps/chosen": -23.68335723876953,
"logps/rejected": -39.27627944946289,
"loss": 0.3812,
"rewards/accuracies": 0.84375,
"rewards/chosen": -1.066185712814331,
"rewards/margins": 1.065974235534668,
"rewards/rejected": -2.13215970993042,
"step": 58
},
{
"epoch": 0.24618540838569047,
"grad_norm": 34.303283256725926,
"learning_rate": 4.992187214125447e-07,
"logits/chosen": -2.10715651512146,
"logits/rejected": -2.1082205772399902,
"logps/chosen": -23.896142959594727,
"logps/rejected": -50.47355270385742,
"loss": 0.3707,
"rewards/accuracies": 0.84375,
"rewards/chosen": -0.9780565500259399,
"rewards/margins": 1.4628773927688599,
"rewards/rejected": -2.4409339427948,
"step": 60
},
{
"epoch": 0.24618540838569047,
"eval_logits/chosen": -2.0698177814483643,
"eval_logits/rejected": -2.0673775672912598,
"eval_logps/chosen": -26.22644805908203,
"eval_logps/rejected": -34.93669128417969,
"eval_loss": 0.44575488567352295,
"eval_rewards/accuracies": 0.7695852518081665,
"eval_rewards/chosen": -0.7930165529251099,
"eval_rewards/margins": 1.106292963027954,
"eval_rewards/rejected": -1.8993093967437744,
"eval_runtime": 383.2498,
"eval_samples_per_second": 4.524,
"eval_steps_per_second": 1.132,
"step": 60
},
{
"epoch": 0.2543915886652135,
"grad_norm": 34.73750630342744,
"learning_rate": 4.98909018303385e-07,
"logits/chosen": -2.096700429916382,
"logits/rejected": -2.0969526767730713,
"logps/chosen": -28.021564483642578,
"logps/rejected": -44.515228271484375,
"loss": 0.4361,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.700480580329895,
"rewards/margins": 1.3456040620803833,
"rewards/rejected": -2.046084403991699,
"step": 62
},
{
"epoch": 0.2625977689447365,
"grad_norm": 32.81062577153495,
"learning_rate": 4.985478599062611e-07,
"logits/chosen": -2.087684154510498,
"logits/rejected": -2.088871717453003,
"logps/chosen": -26.94491958618164,
"logps/rejected": -34.76374435424805,
"loss": 0.4479,
"rewards/accuracies": 0.734375,
"rewards/chosen": -0.528798520565033,
"rewards/margins": 0.9001777172088623,
"rewards/rejected": -1.42897629737854,
"step": 64
},
{
"epoch": 0.27080394922425954,
"grad_norm": 29.052321436937024,
"learning_rate": 4.981353208810206e-07,
"logits/chosen": -2.0988636016845703,
"logits/rejected": -2.104152202606201,
"logps/chosen": -20.718381881713867,
"logps/rejected": -51.553409576416016,
"loss": 0.4029,
"rewards/accuracies": 0.828125,
"rewards/chosen": -0.7514103055000305,
"rewards/margins": 1.5007989406585693,
"rewards/rejected": -2.252209186553955,
"step": 66
},
{
"epoch": 0.2790101295037825,
"grad_norm": 37.90898283460309,
"learning_rate": 4.976714865090826e-07,
"logits/chosen": -2.0126729011535645,
"logits/rejected": -2.007772445678711,
"logps/chosen": -27.219005584716797,
"logps/rejected": -26.84417152404785,
"loss": 0.45,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.7038444876670837,
"rewards/margins": 0.9012500047683716,
"rewards/rejected": -1.6050945520401,
"step": 68
},
{
"epoch": 0.28721630978330553,
"grad_norm": 31.01097368955644,
"learning_rate": 4.971564526758087e-07,
"logits/chosen": -2.0806331634521484,
"logits/rejected": -2.090015411376953,
"logps/chosen": -26.418888092041016,
"logps/rejected": -61.58074951171875,
"loss": 0.3921,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.6639503240585327,
"rewards/margins": 2.018562078475952,
"rewards/rejected": -2.6825122833251953,
"step": 70
},
{
"epoch": 0.28721630978330553,
"eval_logits/chosen": -2.0629022121429443,
"eval_logits/rejected": -2.0604288578033447,
"eval_logps/chosen": -25.214706420898438,
"eval_logps/rejected": -34.3575553894043,
"eval_loss": 0.4037153124809265,
"eval_rewards/accuracies": 0.7776497602462769,
"eval_rewards/chosen": -0.2871449589729309,
"eval_rewards/margins": 1.322598934173584,
"eval_rewards/rejected": -1.6097438335418701,
"eval_runtime": 383.2946,
"eval_samples_per_second": 4.524,
"eval_steps_per_second": 1.132,
"step": 70
},
{
"epoch": 0.29542249006282856,
"grad_norm": 35.944172042258,
"learning_rate": 4.965903258506806e-07,
"logits/chosen": -1.982763409614563,
"logits/rejected": -1.9825458526611328,
"logps/chosen": -26.120790481567383,
"logps/rejected": -47.724666595458984,
"loss": 0.3965,
"rewards/accuracies": 0.765625,
"rewards/chosen": -0.5441683530807495,
"rewards/margins": 1.4394950866699219,
"rewards/rejected": -1.9836633205413818,
"step": 72
},
{
"epoch": 0.3036286703423516,
"grad_norm": 30.076414397329636,
"learning_rate": 4.959732230652907e-07,
"logits/chosen": -2.0936832427978516,
"logits/rejected": -2.0875070095062256,
"logps/chosen": -21.951000213623047,
"logps/rejected": -35.42924499511719,
"loss": 0.3786,
"rewards/accuracies": 0.828125,
"rewards/chosen": -0.6129394769668579,
"rewards/margins": 1.359676718711853,
"rewards/rejected": -1.9726160764694214,
"step": 74
},
{
"epoch": 0.3118348506218746,
"grad_norm": 31.92687509033969,
"learning_rate": 4.953052718891494e-07,
"logits/chosen": -2.0927772521972656,
"logits/rejected": -2.0937466621398926,
"logps/chosen": -24.76601791381836,
"logps/rejected": -40.51527786254883,
"loss": 0.3999,
"rewards/accuracies": 0.765625,
"rewards/chosen": -0.39051324129104614,
"rewards/margins": 1.7075785398483276,
"rewards/rejected": -2.0980916023254395,
"step": 76
},
{
"epoch": 0.3200410309013976,
"grad_norm": 39.97329898507408,
"learning_rate": 4.945866104033126e-07,
"logits/chosen": -2.07487416267395,
"logits/rejected": -2.0698864459991455,
"logps/chosen": -28.91614532470703,
"logps/rejected": -32.64835739135742,
"loss": 0.3902,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.8651710152626038,
"rewards/margins": 1.3195717334747314,
"rewards/rejected": -2.1847424507141113,
"step": 78
},
{
"epoch": 0.32824721118092065,
"grad_norm": 35.23935142184965,
"learning_rate": 4.938173871718379e-07,
"logits/chosen": -2.1075708866119385,
"logits/rejected": -2.108260154724121,
"logps/chosen": -30.674692153930664,
"logps/rejected": -45.75242233276367,
"loss": 0.4197,
"rewards/accuracies": 0.765625,
"rewards/chosen": -0.8922288417816162,
"rewards/margins": 1.6468966007232666,
"rewards/rejected": -2.5391252040863037,
"step": 80
},
{
"epoch": 0.32824721118092065,
"eval_logits/chosen": -2.0568766593933105,
"eval_logits/rejected": -2.0545151233673096,
"eval_logps/chosen": -26.29727554321289,
"eval_logps/rejected": -36.041969299316406,
"eval_loss": 0.3652815520763397,
"eval_rewards/accuracies": 0.7937787771224976,
"eval_rewards/chosen": -0.8284297585487366,
"eval_rewards/margins": 1.6235177516937256,
"eval_rewards/rejected": -2.4519472122192383,
"eval_runtime": 383.0876,
"eval_samples_per_second": 4.526,
"eval_steps_per_second": 1.133,
"step": 80
},
{
"epoch": 0.3364533914604437,
"grad_norm": 27.98699458231919,
"learning_rate": 4.929977612110723e-07,
"logits/chosen": -2.0166802406311035,
"logits/rejected": -2.009007215499878,
"logps/chosen": -27.796003341674805,
"logps/rejected": -37.83144760131836,
"loss": 0.3548,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.278414249420166,
"rewards/margins": 1.387697696685791,
"rewards/rejected": -2.666111946105957,
"step": 82
},
{
"epoch": 0.34465957173996664,
"grad_norm": 29.909892482214758,
"learning_rate": 4.921279019567806e-07,
"logits/chosen": -2.060328483581543,
"logits/rejected": -2.0567514896392822,
"logps/chosen": -22.3341064453125,
"logps/rejected": -27.066940307617188,
"loss": 0.3357,
"rewards/accuracies": 0.828125,
"rewards/chosen": -1.3317604064941406,
"rewards/margins": 1.3481371402740479,
"rewards/rejected": -2.6798975467681885,
"step": 84
},
{
"epoch": 0.35286575201948966,
"grad_norm": 28.894675574314675,
"learning_rate": 4.912079892291184e-07,
"logits/chosen": -2.0049288272857666,
"logits/rejected": -2.0062108039855957,
"logps/chosen": -30.222366333007812,
"logps/rejected": -37.637451171875,
"loss": 0.3523,
"rewards/accuracies": 0.890625,
"rewards/chosen": -1.1566193103790283,
"rewards/margins": 1.645595669746399,
"rewards/rejected": -2.8022148609161377,
"step": 86
},
{
"epoch": 0.3610719322990127,
"grad_norm": 33.499254927144385,
"learning_rate": 4.902382131954594e-07,
"logits/chosen": -2.0440762042999268,
"logits/rejected": -2.041065216064453,
"logps/chosen": -28.95600700378418,
"logps/rejected": -32.45362091064453,
"loss": 0.3969,
"rewards/accuracies": 0.859375,
"rewards/chosen": -1.4144505262374878,
"rewards/margins": 1.526777982711792,
"rewards/rejected": -2.9412283897399902,
"step": 88
},
{
"epoch": 0.3692781125785357,
"grad_norm": 36.672895433055324,
"learning_rate": 4.892187743310834e-07,
"logits/chosen": -2.1121721267700195,
"logits/rejected": -2.1142220497131348,
"logps/chosen": -33.17682647705078,
"logps/rejected": -55.22991180419922,
"loss": 0.3216,
"rewards/accuracies": 0.859375,
"rewards/chosen": -0.8327025175094604,
"rewards/margins": 2.4360129833221436,
"rewards/rejected": -3.2687156200408936,
"step": 90
},
{
"epoch": 0.3692781125785357,
"eval_logits/chosen": -2.052072048187256,
"eval_logits/rejected": -2.0496795177459717,
"eval_logps/chosen": -25.994354248046875,
"eval_logps/rejected": -35.990543365478516,
"eval_loss": 0.33759820461273193,
"eval_rewards/accuracies": 0.8029953837394714,
"eval_rewards/chosen": -0.6769699454307556,
"eval_rewards/margins": 1.7492659091949463,
"eval_rewards/rejected": -2.426236152648926,
"eval_runtime": 382.9061,
"eval_samples_per_second": 4.529,
"eval_steps_per_second": 1.133,
"step": 90
},
{
"epoch": 0.37748429285805873,
"grad_norm": 27.350771381278054,
"learning_rate": 4.881498833777333e-07,
"logits/chosen": -2.1142969131469727,
"logits/rejected": -2.1127796173095703,
"logps/chosen": -27.94344711303711,
"logps/rejected": -36.454994201660156,
"loss": 0.3337,
"rewards/accuracies": 0.828125,
"rewards/chosen": -0.4834941327571869,
"rewards/margins": 1.6755785942077637,
"rewards/rejected": -2.1590728759765625,
"step": 92
},
{
"epoch": 0.38569047313758176,
"grad_norm": 21.635416471698683,
"learning_rate": 4.870317613000496e-07,
"logits/chosen": -2.016702890396118,
"logits/rejected": -2.017988681793213,
"logps/chosen": -29.998384475708008,
"logps/rejected": -37.61553192138672,
"loss": 0.3289,
"rewards/accuracies": 0.890625,
"rewards/chosen": -0.3838534653186798,
"rewards/margins": 1.9279775619506836,
"rewards/rejected": -2.311830997467041,
"step": 94
},
{
"epoch": 0.3938966534171048,
"grad_norm": 32.26464274668422,
"learning_rate": 4.858646392398927e-07,
"logits/chosen": -2.1012330055236816,
"logits/rejected": -2.1029739379882812,
"logps/chosen": -24.2839298248291,
"logps/rejected": -52.811363220214844,
"loss": 0.3288,
"rewards/accuracies": 0.765625,
"rewards/chosen": -0.1970413625240326,
"rewards/margins": 2.4278342723846436,
"rewards/rejected": -2.624875545501709,
"step": 96
},
{
"epoch": 0.40210283369662775,
"grad_norm": 24.11098061020347,
"learning_rate": 4.846487584685594e-07,
"logits/chosen": -2.0802366733551025,
"logits/rejected": -2.073582649230957,
"logps/chosen": -24.776100158691406,
"logps/rejected": -30.90226936340332,
"loss": 0.3069,
"rewards/accuracies": 0.796875,
"rewards/chosen": -0.5730211734771729,
"rewards/margins": 1.4600439071655273,
"rewards/rejected": -2.0330650806427,
"step": 98
},
{
"epoch": 0.41030901397615077,
"grad_norm": 24.386233800584957,
"learning_rate": 4.833843703369075e-07,
"logits/chosen": -2.056903839111328,
"logits/rejected": -2.0560696125030518,
"logps/chosen": -25.943012237548828,
"logps/rejected": -47.84954071044922,
"loss": 0.2756,
"rewards/accuracies": 0.890625,
"rewards/chosen": -0.26147836446762085,
"rewards/margins": 2.6134910583496094,
"rewards/rejected": -2.874969482421875,
"step": 100
},
{
"epoch": 0.41030901397615077,
"eval_logits/chosen": -2.048205852508545,
"eval_logits/rejected": -2.0459136962890625,
"eval_logps/chosen": -25.047216415405273,
"eval_logps/rejected": -35.27001953125,
"eval_loss": 0.3195771872997284,
"eval_rewards/accuracies": 0.804147481918335,
"eval_rewards/chosen": -0.20340144634246826,
"eval_rewards/margins": 1.8625727891921997,
"eval_rewards/rejected": -2.065974235534668,
"eval_runtime": 383.1037,
"eval_samples_per_second": 4.526,
"eval_steps_per_second": 1.133,
"step": 100
},
{
"epoch": 0.4185151942556738,
"grad_norm": 20.26348597387277,
"learning_rate": 4.82071736223395e-07,
"logits/chosen": -2.0507657527923584,
"logits/rejected": -2.044565200805664,
"logps/chosen": -23.989782333374023,
"logps/rejected": -31.029855728149414,
"loss": 0.2811,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.2594943940639496,
"rewards/margins": 1.7268887758255005,
"rewards/rejected": -1.9863829612731934,
"step": 102
},
{
"epoch": 0.4267213745351968,
"grad_norm": 19.535827187227056,
"learning_rate": 4.807111274800475e-07,
"logits/chosen": -2.0995054244995117,
"logits/rejected": -2.091494083404541,
"logps/chosen": -20.125661849975586,
"logps/rejected": -26.041656494140625,
"loss": 0.2717,
"rewards/accuracies": 0.765625,
"rewards/chosen": -0.21148215234279633,
"rewards/margins": 1.814781665802002,
"rewards/rejected": -2.02626371383667,
"step": 104
},
{
"epoch": 0.43492755481471984,
"grad_norm": 33.73160324782582,
"learning_rate": 4.793028253763632e-07,
"logits/chosen": -2.0714871883392334,
"logits/rejected": -2.073652744293213,
"logps/chosen": -29.02633285522461,
"logps/rejected": -45.91259765625,
"loss": 0.3285,
"rewards/accuracies": 0.84375,
"rewards/chosen": -0.4463399350643158,
"rewards/margins": 2.3804402351379395,
"rewards/rejected": -2.826780080795288,
"step": 106
},
{
"epoch": 0.44313373509424286,
"grad_norm": 27.500884484426393,
"learning_rate": 4.778471210411683e-07,
"logits/chosen": -2.1328721046447754,
"logits/rejected": -2.128079414367676,
"logps/chosen": -26.338151931762695,
"logps/rejected": -39.00925064086914,
"loss": 0.2889,
"rewards/accuracies": 0.859375,
"rewards/chosen": -0.20531219244003296,
"rewards/margins": 2.1164348125457764,
"rewards/rejected": -2.321747064590454,
"step": 108
},
{
"epoch": 0.4513399153737659,
"grad_norm": 19.395398725174257,
"learning_rate": 4.763443154024334e-07,
"logits/chosen": -2.010795831680298,
"logits/rejected": -2.004256010055542,
"logps/chosen": -21.2674560546875,
"logps/rejected": -29.27115249633789,
"loss": 0.2402,
"rewards/accuracies": 0.828125,
"rewards/chosen": -0.35923194885253906,
"rewards/margins": 2.286337375640869,
"rewards/rejected": -2.645569324493408,
"step": 110
},
{
"epoch": 0.4513399153737659,
"eval_logits/chosen": -2.047652244567871,
"eval_logits/rejected": -2.0454225540161133,
"eval_logps/chosen": -25.388172149658203,
"eval_logps/rejected": -36.19011306762695,
"eval_loss": 0.29904791712760925,
"eval_rewards/accuracies": 0.8064516186714172,
"eval_rewards/chosen": -0.37387850880622864,
"eval_rewards/margins": 2.152141571044922,
"eval_rewards/rejected": -2.526020050048828,
"eval_runtime": 382.9313,
"eval_samples_per_second": 4.528,
"eval_steps_per_second": 1.133,
"step": 110
},
{
"epoch": 0.4595460956532889,
"grad_norm": 19.214229030497684,
"learning_rate": 4.74794719125065e-07,
"logits/chosen": -2.0643627643585205,
"logits/rejected": -2.0607378482818604,
"logps/chosen": -28.60050392150879,
"logps/rejected": -30.703428268432617,
"loss": 0.2887,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.28185003995895386,
"rewards/margins": 1.8889483213424683,
"rewards/rejected": -2.1707985401153564,
"step": 112
},
{
"epoch": 0.4677522759328119,
"grad_norm": 26.967072996600148,
"learning_rate": 4.731986525466836e-07,
"logits/chosen": -2.0277886390686035,
"logits/rejected": -2.0253236293792725,
"logps/chosen": -28.50118064880371,
"logps/rejected": -39.31520462036133,
"loss": 0.2738,
"rewards/accuracies": 0.765625,
"rewards/chosen": -0.7399206757545471,
"rewards/margins": 2.0028390884399414,
"rewards/rejected": -2.7427597045898438,
"step": 114
},
{
"epoch": 0.4759584562123349,
"grad_norm": 23.78521498468923,
"learning_rate": 4.7155644561140293e-07,
"logits/chosen": -2.0169668197631836,
"logits/rejected": -2.0144312381744385,
"logps/chosen": -17.960556030273438,
"logps/rejected": -44.15462875366211,
"loss": 0.308,
"rewards/accuracies": 0.796875,
"rewards/chosen": -0.5678286552429199,
"rewards/margins": 2.510011672973633,
"rewards/rejected": -3.077840805053711,
"step": 116
},
{
"epoch": 0.4841646364918579,
"grad_norm": 24.124459622722974,
"learning_rate": 4.698684378016222e-07,
"logits/chosen": -1.9945940971374512,
"logits/rejected": -1.9939186573028564,
"logps/chosen": -27.118621826171875,
"logps/rejected": -47.901485443115234,
"loss": 0.3078,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.848014235496521,
"rewards/margins": 2.41487455368042,
"rewards/rejected": -3.2628891468048096,
"step": 118
},
{
"epoch": 0.49237081677138095,
"grad_norm": 21.029226154824478,
"learning_rate": 4.681349780678478e-07,
"logits/chosen": -2.069658041000366,
"logits/rejected": -2.066652774810791,
"logps/chosen": -19.98298454284668,
"logps/rejected": -35.198486328125,
"loss": 0.2684,
"rewards/accuracies": 0.828125,
"rewards/chosen": -0.4263419508934021,
"rewards/margins": 2.379479169845581,
"rewards/rejected": -2.805821418762207,
"step": 120
},
{
"epoch": 0.49237081677138095,
"eval_logits/chosen": -2.0455994606018066,
"eval_logits/rejected": -2.0433595180511475,
"eval_logps/chosen": -25.59235954284668,
"eval_logps/rejected": -36.88100051879883,
"eval_loss": 0.2864527106285095,
"eval_rewards/accuracies": 0.8122119903564453,
"eval_rewards/chosen": -0.47597208619117737,
"eval_rewards/margins": 2.3954925537109375,
"eval_rewards/rejected": -2.871464729309082,
"eval_runtime": 382.8408,
"eval_samples_per_second": 4.529,
"eval_steps_per_second": 1.134,
"step": 120
},
{
"epoch": 0.500576997050904,
"grad_norm": 22.882841908986034,
"learning_rate": 4.6635642475655643e-07,
"logits/chosen": -2.0903265476226807,
"logits/rejected": -2.091627836227417,
"logps/chosen": -27.477466583251953,
"logps/rejected": -39.44534683227539,
"loss": 0.3338,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.455081045627594,
"rewards/margins": 2.4550111293792725,
"rewards/rejected": -2.9100921154022217,
"step": 122
},
{
"epoch": 0.508783177330427,
"grad_norm": 25.08346996002156,
"learning_rate": 4.6453314553611724e-07,
"logits/chosen": -2.058607816696167,
"logits/rejected": -2.0590226650238037,
"logps/chosen": -26.310922622680664,
"logps/rejected": -41.810604095458984,
"loss": 0.3438,
"rewards/accuracies": 0.828125,
"rewards/chosen": -0.7647377848625183,
"rewards/margins": 2.2874107360839844,
"rewards/rejected": -3.0521485805511475,
"step": 124
},
{
"epoch": 0.51698935760995,
"grad_norm": 19.245560055238066,
"learning_rate": 4.626655173207856e-07,
"logits/chosen": -2.0874435901641846,
"logits/rejected": -2.0808544158935547,
"logps/chosen": -34.59534454345703,
"logps/rejected": -37.33736801147461,
"loss": 0.2242,
"rewards/accuracies": 0.890625,
"rewards/chosen": -0.46927499771118164,
"rewards/margins": 2.6450273990631104,
"rewards/rejected": -3.114302396774292,
"step": 126
},
{
"epoch": 0.525195537889473,
"grad_norm": 26.78927560536762,
"learning_rate": 4.607539261927868e-07,
"logits/chosen": -2.038646936416626,
"logits/rejected": -2.036555290222168,
"logps/chosen": -25.151229858398438,
"logps/rejected": -41.385738372802734,
"loss": 0.258,
"rewards/accuracies": 0.84375,
"rewards/chosen": -0.7142379879951477,
"rewards/margins": 2.4793665409088135,
"rewards/rejected": -3.1936047077178955,
"step": 128
},
{
"epoch": 0.5334017181689961,
"grad_norm": 28.211129877400257,
"learning_rate": 4.587987673225031e-07,
"logits/chosen": -2.0317800045013428,
"logits/rejected": -2.029090642929077,
"logps/chosen": -23.62847137451172,
"logps/rejected": -41.40718460083008,
"loss": 0.314,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.7062588930130005,
"rewards/margins": 2.260392904281616,
"rewards/rejected": -2.9666519165039062,
"step": 130
},
{
"epoch": 0.5334017181689961,
"eval_logits/chosen": -2.046064615249634,
"eval_logits/rejected": -2.0438313484191895,
"eval_logps/chosen": -24.87871742248535,
"eval_logps/rejected": -36.286964416503906,
"eval_loss": 0.2757984399795532,
"eval_rewards/accuracies": 0.8225806355476379,
"eval_rewards/chosen": -0.11915161460638046,
"eval_rewards/margins": 2.4552958011627197,
"eval_rewards/rejected": -2.5744473934173584,
"eval_runtime": 382.9112,
"eval_samples_per_second": 4.528,
"eval_steps_per_second": 1.133,
"step": 130
},
{
"epoch": 0.5416078984485191,
"grad_norm": 19.634258213686227,
"learning_rate": 4.568004448867836e-07,
"logits/chosen": -1.9866001605987549,
"logits/rejected": -1.983536720275879,
"logps/chosen": -31.62383460998535,
"logps/rejected": -39.09525680541992,
"loss": 0.2353,
"rewards/accuracies": 0.828125,
"rewards/chosen": -0.31665635108947754,
"rewards/margins": 2.6990866661071777,
"rewards/rejected": -3.0157430171966553,
"step": 132
},
{
"epoch": 0.5498140787280421,
"grad_norm": 18.53508760026935,
"learning_rate": 4.547593719853908e-07,
"logits/chosen": -2.117701530456543,
"logits/rejected": -2.1072885990142822,
"logps/chosen": -18.488718032836914,
"logps/rejected": -28.42927360534668,
"loss": 0.2533,
"rewards/accuracies": 0.796875,
"rewards/chosen": -0.2543734908103943,
"rewards/margins": 1.9832816123962402,
"rewards/rejected": -2.2376551628112793,
"step": 134
},
{
"epoch": 0.558020259007565,
"grad_norm": 16.806992595132815,
"learning_rate": 4.526759705556037e-07,
"logits/chosen": -2.1042609214782715,
"logits/rejected": -2.106780767440796,
"logps/chosen": -25.879810333251953,
"logps/rejected": -48.42536163330078,
"loss": 0.1949,
"rewards/accuracies": 0.828125,
"rewards/chosen": 0.10162217915058136,
"rewards/margins": 3.190650224685669,
"rewards/rejected": -3.0890283584594727,
"step": 136
},
{
"epoch": 0.566226439287088,
"grad_norm": 29.13623442121838,
"learning_rate": 4.5055067128499336e-07,
"logits/chosen": -2.1067519187927246,
"logits/rejected": -2.0996322631835938,
"logps/chosen": -24.035615921020508,
"logps/rejected": -33.068504333496094,
"loss": 0.3053,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.14914892613887787,
"rewards/margins": 2.2678146362304688,
"rewards/rejected": -2.4169633388519287,
"step": 138
},
{
"epoch": 0.5744326195666111,
"grad_norm": 14.892524709426025,
"learning_rate": 4.483839135223899e-07,
"logits/chosen": -2.0495522022247314,
"logits/rejected": -2.049654960632324,
"logps/chosen": -22.117572784423828,
"logps/rejected": -45.10372543334961,
"loss": 0.213,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.1762857437133789,
"rewards/margins": 3.072883129119873,
"rewards/rejected": -3.249169111251831,
"step": 140
},
{
"epoch": 0.5744326195666111,
"eval_logits/chosen": -2.043985605239868,
"eval_logits/rejected": -2.041775703430176,
"eval_logps/chosen": -24.56585693359375,
"eval_logps/rejected": -36.1696662902832,
"eval_loss": 0.26956623792648315,
"eval_rewards/accuracies": 0.8214285969734192,
"eval_rewards/chosen": 0.03727945312857628,
"eval_rewards/margins": 2.55307674407959,
"eval_rewards/rejected": -2.5157971382141113,
"eval_runtime": 382.8923,
"eval_samples_per_second": 4.529,
"eval_steps_per_second": 1.133,
"step": 140
},
{
"epoch": 0.5826387998461341,
"grad_norm": 26.214083971395798,
"learning_rate": 4.461761451870586e-07,
"logits/chosen": -2.0746335983276367,
"logits/rejected": -2.0725696086883545,
"logps/chosen": -28.98033332824707,
"logps/rejected": -33.45741653442383,
"loss": 0.2844,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.3142589330673218,
"rewards/margins": 2.325613498687744,
"rewards/rejected": -2.6398725509643555,
"step": 142
},
{
"epoch": 0.5908449801256571,
"grad_norm": 19.801373156192714,
"learning_rate": 4.4392782267610495e-07,
"logits/chosen": -2.026793956756592,
"logits/rejected": -2.02146315574646,
"logps/chosen": -27.090116500854492,
"logps/rejected": -36.820072174072266,
"loss": 0.2405,
"rewards/accuracies": 0.84375,
"rewards/chosen": 0.05273272842168808,
"rewards/margins": 2.6351218223571777,
"rewards/rejected": -2.5823891162872314,
"step": 144
},
{
"epoch": 0.5990511604051801,
"grad_norm": 16.912833507875153,
"learning_rate": 4.416394107701263e-07,
"logits/chosen": -2.0721688270568848,
"logits/rejected": -2.0710277557373047,
"logps/chosen": -29.055322647094727,
"logps/rejected": -35.73569107055664,
"loss": 0.22,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.04920143634080887,
"rewards/margins": 2.790663957595825,
"rewards/rejected": -2.7414627075195312,
"step": 146
},
{
"epoch": 0.6072573406847032,
"grad_norm": 18.48986000965074,
"learning_rate": 4.393113825371312e-07,
"logits/chosen": -2.088228225708008,
"logits/rejected": -2.0804646015167236,
"logps/chosen": -22.193986892700195,
"logps/rejected": -28.600019454956055,
"loss": 0.2584,
"rewards/accuracies": 0.84375,
"rewards/chosen": -0.23887695372104645,
"rewards/margins": 2.352029323577881,
"rewards/rejected": -2.5909059047698975,
"step": 148
},
{
"epoch": 0.6154635209642262,
"grad_norm": 20.195764906566346,
"learning_rate": 4.3694421923474523e-07,
"logits/chosen": -2.0307724475860596,
"logits/rejected": -2.0267581939697266,
"logps/chosen": -25.377315521240234,
"logps/rejected": -26.69452667236328,
"loss": 0.2466,
"rewards/accuracies": 0.859375,
"rewards/chosen": 0.02581462636590004,
"rewards/margins": 2.615370512008667,
"rewards/rejected": -2.5895559787750244,
"step": 150
},
{
"epoch": 0.6154635209642262,
"eval_logits/chosen": -2.0484068393707275,
"eval_logits/rejected": -2.046243906021118,
"eval_logps/chosen": -24.641366958618164,
"eval_logps/rejected": -36.52920150756836,
"eval_loss": 0.2627074420452118,
"eval_rewards/accuracies": 0.8179723620414734,
"eval_rewards/chosen": -0.00047618892858736217,
"eval_rewards/margins": 2.6950886249542236,
"eval_rewards/rejected": -2.6955649852752686,
"eval_runtime": 383.0874,
"eval_samples_per_second": 4.526,
"eval_steps_per_second": 1.133,
"step": 150
},
{
"epoch": 0.6236697012437492,
"grad_norm": 19.84271445630352,
"learning_rate": 4.3453841021072367e-07,
"logits/chosen": -2.0682671070098877,
"logits/rejected": -2.0678606033325195,
"logps/chosen": -29.11172866821289,
"logps/rejected": -43.93864059448242,
"loss": 0.2393,
"rewards/accuracies": 0.84375,
"rewards/chosen": -0.21894294023513794,
"rewards/margins": 2.8336007595062256,
"rewards/rejected": -3.0525436401367188,
"step": 152
},
{
"epoch": 0.6318758815232722,
"grad_norm": 23.803208123467336,
"learning_rate": 4.32094452801792e-07,
"logits/chosen": -2.120655059814453,
"logits/rejected": -2.113632917404175,
"logps/chosen": -23.940746307373047,
"logps/rejected": -40.54450988769531,
"loss": 0.2101,
"rewards/accuracies": 0.890625,
"rewards/chosen": 0.22472774982452393,
"rewards/margins": 3.2544140815734863,
"rewards/rejected": -3.029686450958252,
"step": 154
},
{
"epoch": 0.6400820618027953,
"grad_norm": 27.831688353015224,
"learning_rate": 4.29612852230835e-07,
"logits/chosen": -2.1510727405548096,
"logits/rejected": -2.1424570083618164,
"logps/chosen": -29.35157012939453,
"logps/rejected": -39.825775146484375,
"loss": 0.3136,
"rewards/accuracies": 0.84375,
"rewards/chosen": -0.25963878631591797,
"rewards/margins": 2.4727556705474854,
"rewards/rejected": -2.7323946952819824,
"step": 156
},
{
"epoch": 0.6482882420823183,
"grad_norm": 16.129032160167743,
"learning_rate": 4.270941215024551e-07,
"logits/chosen": -2.0342090129852295,
"logits/rejected": -2.030592918395996,
"logps/chosen": -25.4481201171875,
"logps/rejected": -30.034128189086914,
"loss": 0.2383,
"rewards/accuracies": 0.859375,
"rewards/chosen": -0.3485499918460846,
"rewards/margins": 2.4843502044677734,
"rewards/rejected": -2.832900047302246,
"step": 158
},
{
"epoch": 0.6564944223618413,
"grad_norm": 22.363661877277583,
"learning_rate": 4.2453878129692257e-07,
"logits/chosen": -2.0489790439605713,
"logits/rejected": -2.0505762100219727,
"logps/chosen": -20.61872673034668,
"logps/rejected": -39.31790542602539,
"loss": 0.236,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.36551517248153687,
"rewards/margins": 2.786931037902832,
"rewards/rejected": -3.1524462699890137,
"step": 160
},
{
"epoch": 0.6564944223618413,
"eval_logits/chosen": -2.048050880432129,
"eval_logits/rejected": -2.0458953380584717,
"eval_logps/chosen": -25.27681541442871,
"eval_logps/rejected": -37.62437438964844,
"eval_loss": 0.2534183859825134,
"eval_rewards/accuracies": 0.8260368704795837,
"eval_rewards/chosen": -0.31819990277290344,
"eval_rewards/margins": 2.9249520301818848,
"eval_rewards/rejected": -3.243151903152466,
"eval_runtime": 383.1245,
"eval_samples_per_second": 4.526,
"eval_steps_per_second": 1.133,
"step": 160
},
{
"epoch": 0.6647006026413643,
"grad_norm": 16.96720912691383,
"learning_rate": 4.2194735986253894e-07,
"logits/chosen": -2.0488674640655518,
"logits/rejected": -2.04327130317688,
"logps/chosen": -29.53526496887207,
"logps/rejected": -29.405216217041016,
"loss": 0.2198,
"rewards/accuracies": 0.890625,
"rewards/chosen": -0.7364621162414551,
"rewards/margins": 2.2279775142669678,
"rewards/rejected": -2.9644393920898438,
"step": 162
},
{
"epoch": 0.6729067829208873,
"grad_norm": 21.306381619736708,
"learning_rate": 4.193203929064353e-07,
"logits/chosen": -2.098163366317749,
"logits/rejected": -2.09775710105896,
"logps/chosen": -27.31165313720703,
"logps/rejected": -49.05406951904297,
"loss": 0.2626,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.6689335107803345,
"rewards/margins": 3.3616225719451904,
"rewards/rejected": -4.030555725097656,
"step": 164
},
{
"epoch": 0.6811129632004103,
"grad_norm": 16.744872149563246,
"learning_rate": 4.1665842348382974e-07,
"logits/chosen": -2.076477289199829,
"logits/rejected": -2.078810453414917,
"logps/chosen": -21.170326232910156,
"logps/rejected": -47.55445098876953,
"loss": 0.2274,
"rewards/accuracies": 0.90625,
"rewards/chosen": -0.5238662958145142,
"rewards/margins": 3.466320514678955,
"rewards/rejected": -3.9901866912841797,
"step": 166
},
{
"epoch": 0.6893191434799333,
"grad_norm": 14.185305520407544,
"learning_rate": 4.139620018857648e-07,
"logits/chosen": -2.003122091293335,
"logits/rejected": -2.000248908996582,
"logps/chosen": -22.870935440063477,
"logps/rejected": -39.74565505981445,
"loss": 0.2638,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.6974708437919617,
"rewards/margins": 2.8220083713531494,
"rewards/rejected": -3.519479513168335,
"step": 168
},
{
"epoch": 0.6975253237594563,
"grad_norm": 18.022997094570332,
"learning_rate": 4.1123168552534983e-07,
"logits/chosen": -2.025099277496338,
"logits/rejected": -2.0219833850860596,
"logps/chosen": -28.584171295166016,
"logps/rejected": -38.055633544921875,
"loss": 0.2503,
"rewards/accuracies": 0.859375,
"rewards/chosen": -0.9480264186859131,
"rewards/margins": 2.6562623977661133,
"rewards/rejected": -3.6042890548706055,
"step": 170
},
{
"epoch": 0.6975253237594563,
"eval_logits/chosen": -2.047826051712036,
"eval_logits/rejected": -2.0456759929656982,
"eval_logps/chosen": -25.28836441040039,
"eval_logps/rejected": -37.87636184692383,
"eval_loss": 0.2495637983083725,
"eval_rewards/accuracies": 0.8271889686584473,
"eval_rewards/chosen": -0.32397639751434326,
"eval_rewards/margins": 3.0451700687408447,
"eval_rewards/rejected": -3.3691465854644775,
"eval_runtime": 383.2321,
"eval_samples_per_second": 4.525,
"eval_steps_per_second": 1.132,
"step": 170
},
{
"epoch": 0.7057315040389793,
"grad_norm": 30.201529803311132,
"learning_rate": 4.084680388225302e-07,
"logits/chosen": -2.0860209465026855,
"logits/rejected": -2.0855419635772705,
"logps/chosen": -25.88998031616211,
"logps/rejected": -35.96123504638672,
"loss": 0.2004,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.12960225343704224,
"rewards/margins": 3.401216745376587,
"rewards/rejected": -3.5308187007904053,
"step": 172
},
{
"epoch": 0.7139376843185024,
"grad_norm": 13.521536443635728,
"learning_rate": 4.0567163308740925e-07,
"logits/chosen": -2.061061143875122,
"logits/rejected": -2.0628788471221924,
"logps/chosen": -28.78243637084961,
"logps/rejected": -46.390567779541016,
"loss": 0.1995,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.46569639444351196,
"rewards/margins": 3.489924430847168,
"rewards/rejected": -3.9556210041046143,
"step": 174
},
{
"epoch": 0.7221438645980254,
"grad_norm": 12.241119865111436,
"learning_rate": 4.028430464021445e-07,
"logits/chosen": -2.0593080520629883,
"logits/rejected": -2.0476720333099365,
"logps/chosen": -25.62759780883789,
"logps/rejected": -28.63144302368164,
"loss": 0.2311,
"rewards/accuracies": 0.828125,
"rewards/chosen": -0.2619730532169342,
"rewards/margins": 2.6522488594055176,
"rewards/rejected": -2.91422176361084,
"step": 176
},
{
"epoch": 0.7303500448775484,
"grad_norm": 19.69681362187802,
"learning_rate": 3.9998286350144517e-07,
"logits/chosen": -2.127859592437744,
"logits/rejected": -2.124366283416748,
"logps/chosen": -27.991050720214844,
"logps/rejected": -32.49789810180664,
"loss": 0.1931,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.4937363862991333,
"rewards/margins": 2.826824188232422,
"rewards/rejected": -3.3205604553222656,
"step": 178
},
{
"epoch": 0.7385562251570714,
"grad_norm": 18.465447655230893,
"learning_rate": 3.970916756516936e-07,
"logits/chosen": -2.0612165927886963,
"logits/rejected": -2.0634703636169434,
"logps/chosen": -31.92318344116211,
"logps/rejected": -47.82712173461914,
"loss": 0.1869,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.8367382287979126,
"rewards/margins": 3.4318110942840576,
"rewards/rejected": -4.26854944229126,
"step": 180
},
{
"epoch": 0.7385562251570714,
"eval_logits/chosen": -2.047463893890381,
"eval_logits/rejected": -2.0453338623046875,
"eval_logps/chosen": -25.764009475708008,
"eval_logps/rejected": -38.632667541503906,
"eval_loss": 0.24548786878585815,
"eval_rewards/accuracies": 0.8248847723007202,
"eval_rewards/chosen": -0.5617985725402832,
"eval_rewards/margins": 3.185500383377075,
"eval_rewards/rejected": -3.7472991943359375,
"eval_runtime": 383.1735,
"eval_samples_per_second": 4.525,
"eval_steps_per_second": 1.133,
"step": 180
},
{
"epoch": 0.7467624054365944,
"grad_norm": 28.11697685801471,
"learning_rate": 3.941700805287168e-07,
"logits/chosen": -2.100497245788574,
"logits/rejected": -2.0966544151306152,
"logps/chosen": -22.02035140991211,
"logps/rejected": -34.0717658996582,
"loss": 0.2677,
"rewards/accuracies": 0.796875,
"rewards/chosen": -0.21246086061000824,
"rewards/margins": 2.8322882652282715,
"rewards/rejected": -3.0447492599487305,
"step": 182
},
{
"epoch": 0.7549685857161175,
"grad_norm": 21.185462762042597,
"learning_rate": 3.912186820942329e-07,
"logits/chosen": -2.07576847076416,
"logits/rejected": -2.076803684234619,
"logps/chosen": -28.578994750976562,
"logps/rejected": -53.69257354736328,
"loss": 0.2491,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.507973849773407,
"rewards/margins": 4.095354080200195,
"rewards/rejected": -4.603327751159668,
"step": 184
},
{
"epoch": 0.7631747659956405,
"grad_norm": 15.467370455908538,
"learning_rate": 3.8823809047099844e-07,
"logits/chosen": -2.091827630996704,
"logits/rejected": -2.0913453102111816,
"logps/chosen": -22.403928756713867,
"logps/rejected": -48.54361343383789,
"loss": 0.214,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.5807015895843506,
"rewards/margins": 3.9955556392669678,
"rewards/rejected": -4.57625675201416,
"step": 186
},
{
"epoch": 0.7713809462751635,
"grad_norm": 21.449176063404654,
"learning_rate": 3.8522892181668145e-07,
"logits/chosen": -2.073751449584961,
"logits/rejected": -2.0747601985931396,
"logps/chosen": -25.900894165039062,
"logps/rejected": -46.20444107055664,
"loss": 0.2199,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.7566344141960144,
"rewards/margins": 3.398766040802002,
"rewards/rejected": -4.15540075302124,
"step": 188
},
{
"epoch": 0.7795871265546865,
"grad_norm": 17.55165479512368,
"learning_rate": 3.821917981964873e-07,
"logits/chosen": -2.0060808658599854,
"logits/rejected": -2.0017528533935547,
"logps/chosen": -22.145849227905273,
"logps/rejected": -36.78887939453125,
"loss": 0.2346,
"rewards/accuracies": 0.84375,
"rewards/chosen": -0.17031463980674744,
"rewards/margins": 2.9387521743774414,
"rewards/rejected": -3.1090667247772217,
"step": 190
},
{
"epoch": 0.7795871265546865,
"eval_logits/chosen": -2.0468969345092773,
"eval_logits/rejected": -2.044738292694092,
"eval_logps/chosen": -24.78700065612793,
"eval_logps/rejected": -37.58818054199219,
"eval_loss": 0.2414015829563141,
"eval_rewards/accuracies": 0.8329492807388306,
"eval_rewards/chosen": -0.07329300791025162,
"eval_rewards/margins": 3.1517624855041504,
"eval_rewards/rejected": -3.225055456161499,
"eval_runtime": 383.2411,
"eval_samples_per_second": 4.525,
"eval_steps_per_second": 1.132,
"step": 190
},
{
"epoch": 0.7877933068342096,
"grad_norm": 17.368985354953686,
"learning_rate": 3.79127347454564e-07,
"logits/chosen": -2.0460448265075684,
"logits/rejected": -2.0370802879333496,
"logps/chosen": -26.563547134399414,
"logps/rejected": -38.11757278442383,
"loss": 0.2211,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.37561261653900146,
"rewards/margins": 2.6861519813537598,
"rewards/rejected": -3.0617644786834717,
"step": 192
},
{
"epoch": 0.7959994871137326,
"grad_norm": 24.935734742960335,
"learning_rate": 3.760362030842113e-07,
"logits/chosen": -2.0645647048950195,
"logits/rejected": -2.0566651821136475,
"logps/chosen": -25.446151733398438,
"logps/rejected": -29.0418758392334,
"loss": 0.2469,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.27131855487823486,
"rewards/margins": 2.571014165878296,
"rewards/rejected": -2.842332124710083,
"step": 194
},
{
"epoch": 0.8042056673932555,
"grad_norm": 30.86901898716295,
"learning_rate": 3.7291900409692346e-07,
"logits/chosen": -2.1032328605651855,
"logits/rejected": -2.1033365726470947,
"logps/chosen": -25.201629638671875,
"logps/rejected": -39.323097229003906,
"loss": 0.2264,
"rewards/accuracies": 0.828125,
"rewards/chosen": -0.08912299573421478,
"rewards/margins": 3.298488140106201,
"rewards/rejected": -3.387611150741577,
"step": 196
},
{
"epoch": 0.8124118476727785,
"grad_norm": 21.758022770529394,
"learning_rate": 3.6977639489029056e-07,
"logits/chosen": -2.0734946727752686,
"logits/rejected": -2.0731263160705566,
"logps/chosen": -23.40597915649414,
"logps/rejected": -54.18486022949219,
"loss": 0.2027,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.06441287696361542,
"rewards/margins": 3.9613521099090576,
"rewards/rejected": -3.896939277648926,
"step": 198
},
{
"epoch": 0.8206180279523015,
"grad_norm": 24.294166941995968,
"learning_rate": 3.666090251147864e-07,
"logits/chosen": -2.0556201934814453,
"logits/rejected": -2.056826114654541,
"logps/chosen": -22.517078399658203,
"logps/rejected": -50.51237487792969,
"loss": 0.2229,
"rewards/accuracies": 0.84375,
"rewards/chosen": -0.15975169837474823,
"rewards/margins": 4.061436176300049,
"rewards/rejected": -4.221188068389893,
"step": 200
},
{
"epoch": 0.8206180279523015,
"eval_logits/chosen": -2.0442752838134766,
"eval_logits/rejected": -2.0421106815338135,
"eval_logps/chosen": -24.409955978393555,
"eval_logps/rejected": -37.24555206298828,
"eval_loss": 0.23997244238853455,
"eval_rewards/accuracies": 0.8341013789176941,
"eval_rewards/chosen": 0.1152293011546135,
"eval_rewards/margins": 3.168970823287964,
"eval_rewards/rejected": -3.053741216659546,
"eval_runtime": 383.2231,
"eval_samples_per_second": 4.525,
"eval_steps_per_second": 1.132,
"step": 200
},
{
"epoch": 0.8288242082318246,
"grad_norm": 16.34097836225302,
"learning_rate": 3.6341754953947074e-07,
"logits/chosen": -1.9979509115219116,
"logits/rejected": -1.9910781383514404,
"logps/chosen": -19.41321563720703,
"logps/rejected": -31.22482681274414,
"loss": 0.1755,
"rewards/accuracies": 0.859375,
"rewards/chosen": 0.12338539958000183,
"rewards/margins": 3.176830291748047,
"rewards/rejected": -3.0534448623657227,
"step": 202
},
{
"epoch": 0.8370303885113476,
"grad_norm": 12.694366662014156,
"learning_rate": 3.6020262791663334e-07,
"logits/chosen": -2.0971240997314453,
"logits/rejected": -2.098155975341797,
"logps/chosen": -28.991458892822266,
"logps/rejected": -45.771728515625,
"loss": 0.1939,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.10578002035617828,
"rewards/margins": 3.6688270568847656,
"rewards/rejected": -3.563046932220459,
"step": 204
},
{
"epoch": 0.8452365687908706,
"grad_norm": 14.104950698059591,
"learning_rate": 3.569649248454077e-07,
"logits/chosen": -2.0265486240386963,
"logits/rejected": -2.023569107055664,
"logps/chosen": -19.130268096923828,
"logps/rejected": -45.600608825683594,
"loss": 0.1853,
"rewards/accuracies": 0.921875,
"rewards/chosen": 0.10110447555780411,
"rewards/margins": 3.614145040512085,
"rewards/rejected": -3.513040542602539,
"step": 206
},
{
"epoch": 0.8534427490703936,
"grad_norm": 17.487336280708007,
"learning_rate": 3.53705109634383e-07,
"logits/chosen": -2.0617876052856445,
"logits/rejected": -2.059702157974243,
"logps/chosen": -22.653236389160156,
"logps/rejected": -33.451725006103516,
"loss": 0.2036,
"rewards/accuracies": 0.859375,
"rewards/chosen": -0.21188408136367798,
"rewards/margins": 2.7427585124969482,
"rewards/rejected": -2.9546422958374023,
"step": 208
},
{
"epoch": 0.8616489293499167,
"grad_norm": 9.723870774955257,
"learning_rate": 3.5042385616324236e-07,
"logits/chosen": -2.030812978744507,
"logits/rejected": -2.0279781818389893,
"logps/chosen": -24.24312973022461,
"logps/rejected": -41.24238204956055,
"loss": 0.1987,
"rewards/accuracies": 0.859375,
"rewards/chosen": -0.33003199100494385,
"rewards/margins": 3.4097635746002197,
"rewards/rejected": -3.739795684814453,
"step": 210
},
{
"epoch": 0.8616489293499167,
"eval_logits/chosen": -2.044862985610962,
"eval_logits/rejected": -2.042698621749878,
"eval_logps/chosen": -24.65873146057129,
"eval_logps/rejected": -37.76519012451172,
"eval_loss": 0.23567576706409454,
"eval_rewards/accuracies": 0.8375576138496399,
"eval_rewards/chosen": -0.009159128181636333,
"eval_rewards/margins": 3.304400682449341,
"eval_rewards/rejected": -3.3135595321655273,
"eval_runtime": 383.2234,
"eval_samples_per_second": 4.525,
"eval_steps_per_second": 1.132,
"step": 210
},
{
"epoch": 0.8698551096294397,
"grad_norm": 21.492517186963823,
"learning_rate": 3.471218427434564e-07,
"logits/chosen": -2.080448865890503,
"logits/rejected": -2.077627182006836,
"logps/chosen": -28.046445846557617,
"logps/rejected": -39.041099548339844,
"loss": 0.2058,
"rewards/accuracies": 0.84375,
"rewards/chosen": -0.2396007478237152,
"rewards/margins": 3.291498899459839,
"rewards/rejected": -3.531099557876587,
"step": 212
},
{
"epoch": 0.8780612899089627,
"grad_norm": 13.459743982109678,
"learning_rate": 3.4379975197806025e-07,
"logits/chosen": -2.0183067321777344,
"logits/rejected": -2.0159990787506104,
"logps/chosen": -30.474184036254883,
"logps/rejected": -43.47400665283203,
"loss": 0.1623,
"rewards/accuracies": 0.890625,
"rewards/chosen": -0.49888333678245544,
"rewards/margins": 3.724670886993408,
"rewards/rejected": -4.2235541343688965,
"step": 214
},
{
"epoch": 0.8862674701884857,
"grad_norm": 21.39701909954645,
"learning_rate": 3.404582706205438e-07,
"logits/chosen": -2.0548624992370605,
"logits/rejected": -2.0510313510894775,
"logps/chosen": -25.710582733154297,
"logps/rejected": -47.18820571899414,
"loss": 0.2419,
"rewards/accuracies": 0.84375,
"rewards/chosen": -0.6691687107086182,
"rewards/margins": 3.2068655490875244,
"rewards/rejected": -3.8760342597961426,
"step": 216
},
{
"epoch": 0.8944736504680088,
"grad_norm": 15.742259895322631,
"learning_rate": 3.370980894328836e-07,
"logits/chosen": -2.0632827281951904,
"logits/rejected": -2.0607898235321045,
"logps/chosen": -29.016246795654297,
"logps/rejected": -29.224483489990234,
"loss": 0.231,
"rewards/accuracies": 0.859375,
"rewards/chosen": -0.02165369875729084,
"rewards/margins": 2.8292150497436523,
"rewards/rejected": -2.8508687019348145,
"step": 218
},
{
"epoch": 0.9026798307475318,
"grad_norm": 15.763481233552632,
"learning_rate": 3.337199030427465e-07,
"logits/chosen": -2.0306642055511475,
"logits/rejected": -2.0277583599090576,
"logps/chosen": -24.812223434448242,
"logps/rejected": -42.606937408447266,
"loss": 0.2242,
"rewards/accuracies": 0.796875,
"rewards/chosen": -0.5038759112358093,
"rewards/margins": 3.393310070037842,
"rewards/rejected": -3.897186040878296,
"step": 220
},
{
"epoch": 0.9026798307475318,
"eval_logits/chosen": -2.0421266555786133,
"eval_logits/rejected": -2.0399889945983887,
"eval_logps/chosen": -24.929790496826172,
"eval_logps/rejected": -38.282798767089844,
"eval_loss": 0.23096635937690735,
"eval_rewards/accuracies": 0.8398617506027222,
"eval_rewards/chosen": -0.144687682390213,
"eval_rewards/margins": 3.4276747703552246,
"eval_rewards/rejected": -3.5723624229431152,
"eval_runtime": 383.2137,
"eval_samples_per_second": 4.525,
"eval_steps_per_second": 1.133,
"step": 220
},
{
"epoch": 0.9108860110270548,
"grad_norm": 21.58130767879211,
"learning_rate": 3.303244097998937e-07,
"logits/chosen": -2.059748888015747,
"logits/rejected": -2.055283546447754,
"logps/chosen": -28.7065372467041,
"logps/rejected": -32.10883331298828,
"loss": 0.2483,
"rewards/accuracies": 0.90625,
"rewards/chosen": -0.3254169821739197,
"rewards/margins": 2.9877233505249023,
"rewards/rejected": -3.313140392303467,
"step": 222
},
{
"epoch": 0.9190921913065778,
"grad_norm": 22.43367663307141,
"learning_rate": 3.2691231163181577e-07,
"logits/chosen": -2.03068470954895,
"logits/rejected": -2.029240608215332,
"logps/chosen": -30.380455017089844,
"logps/rejected": -51.02494430541992,
"loss": 0.2287,
"rewards/accuracies": 0.828125,
"rewards/chosen": -0.5234890580177307,
"rewards/margins": 3.933694362640381,
"rewards/rejected": -4.457183361053467,
"step": 224
},
{
"epoch": 0.9272983715861007,
"grad_norm": 18.763839267544366,
"learning_rate": 3.2348431389862775e-07,
"logits/chosen": -2.0352532863616943,
"logits/rejected": -2.0309362411499023,
"logps/chosen": -26.602890014648438,
"logps/rejected": -50.593746185302734,
"loss": 0.2013,
"rewards/accuracies": 0.890625,
"rewards/chosen": -0.3150665760040283,
"rewards/margins": 4.31168270111084,
"rewards/rejected": -4.626749515533447,
"step": 226
},
{
"epoch": 0.9355045518656238,
"grad_norm": 12.546473609091086,
"learning_rate": 3.2004112524725485e-07,
"logits/chosen": -2.102660894393921,
"logits/rejected": -2.098243474960327,
"logps/chosen": -18.035892486572266,
"logps/rejected": -29.32090187072754,
"loss": 0.22,
"rewards/accuracies": 0.84375,
"rewards/chosen": -0.6185750365257263,
"rewards/margins": 2.496063232421875,
"rewards/rejected": -3.114638328552246,
"step": 228
},
{
"epoch": 0.9437107321451468,
"grad_norm": 15.34640763010342,
"learning_rate": 3.16583457464939e-07,
"logits/chosen": -2.1066555976867676,
"logits/rejected": -2.102388381958008,
"logps/chosen": -22.569570541381836,
"logps/rejected": -35.02971267700195,
"loss": 0.2468,
"rewards/accuracies": 0.84375,
"rewards/chosen": 0.08270835876464844,
"rewards/margins": 3.306253433227539,
"rewards/rejected": -3.2235450744628906,
"step": 230
},
{
"epoch": 0.9437107321451468,
"eval_logits/chosen": -2.0403828620910645,
"eval_logits/rejected": -2.0383219718933105,
"eval_logps/chosen": -25.11418342590332,
"eval_logps/rejected": -38.591522216796875,
"eval_loss": 0.22682493925094604,
"eval_rewards/accuracies": 0.8364055156707764,
"eval_rewards/chosen": -0.23688402771949768,
"eval_rewards/margins": 3.4898440837860107,
"eval_rewards/rejected": -3.7267279624938965,
"eval_runtime": 383.2213,
"eval_samples_per_second": 4.525,
"eval_steps_per_second": 1.133,
"step": 230
},
{
"epoch": 0.9519169124246698,
"grad_norm": 20.317292755951325,
"learning_rate": 3.1311202533209516e-07,
"logits/chosen": -2.066338300704956,
"logits/rejected": -2.0640790462493896,
"logps/chosen": -18.971635818481445,
"logps/rejected": -37.304718017578125,
"loss": 0.2079,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.2257971614599228,
"rewards/margins": 3.128026008605957,
"rewards/rejected": -3.353823184967041,
"step": 232
},
{
"epoch": 0.9601230927041928,
"grad_norm": 21.602603549490688,
"learning_rate": 3.096275464745501e-07,
"logits/chosen": -2.0534098148345947,
"logits/rejected": -2.059464931488037,
"logps/chosen": -27.803630828857422,
"logps/rejected": -51.089698791503906,
"loss": 0.1848,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.1761884093284607,
"rewards/margins": 3.8570480346679688,
"rewards/rejected": -4.033236503601074,
"step": 234
},
{
"epoch": 0.9683292729837158,
"grad_norm": 25.9781205793325,
"learning_rate": 3.061307412151922e-07,
"logits/chosen": -2.063019037246704,
"logits/rejected": -2.0604867935180664,
"logps/chosen": -30.086076736450195,
"logps/rejected": -38.32453536987305,
"loss": 0.2619,
"rewards/accuracies": 0.796875,
"rewards/chosen": -0.35281646251678467,
"rewards/margins": 3.2657690048217773,
"rewards/rejected": -3.6185855865478516,
"step": 236
},
{
"epoch": 0.9765354532632389,
"grad_norm": 21.253354056577106,
"learning_rate": 3.0262233242506414e-07,
"logits/chosen": -2.0784614086151123,
"logits/rejected": -2.0770914554595947,
"logps/chosen": -21.169662475585938,
"logps/rejected": -38.09800720214844,
"loss": 0.2181,
"rewards/accuracies": 0.890625,
"rewards/chosen": 0.11809039115905762,
"rewards/margins": 3.477996587753296,
"rewards/rejected": -3.359905958175659,
"step": 238
},
{
"epoch": 0.9847416335427619,
"grad_norm": 19.6268618703958,
"learning_rate": 2.9910304537392837e-07,
"logits/chosen": -1.9946519136428833,
"logits/rejected": -1.9943212270736694,
"logps/chosen": -25.262042999267578,
"logps/rejected": -37.486366271972656,
"loss": 0.218,
"rewards/accuracies": 0.796875,
"rewards/chosen": -0.47168225049972534,
"rewards/margins": 3.64669132232666,
"rewards/rejected": -4.118373394012451,
"step": 240
},
{
"epoch": 0.9847416335427619,
"eval_logits/chosen": -2.043826103210449,
"eval_logits/rejected": -2.0417795181274414,
"eval_logps/chosen": -24.943058013916016,
"eval_logps/rejected": -38.50017166137695,
"eval_loss": 0.22486171126365662,
"eval_rewards/accuracies": 0.8387096524238586,
"eval_rewards/chosen": -0.1513207107782364,
"eval_rewards/margins": 3.5297298431396484,
"eval_rewards/rejected": -3.6810505390167236,
"eval_runtime": 383.2373,
"eval_samples_per_second": 4.525,
"eval_steps_per_second": 1.132,
"step": 240
},
{
"epoch": 0.9929478138222849,
"grad_norm": 14.682865028202697,
"learning_rate": 2.955736075803371e-07,
"logits/chosen": -2.0690362453460693,
"logits/rejected": -2.070676326751709,
"logps/chosen": -24.48103904724121,
"logps/rejected": -47.019752502441406,
"loss": 0.1536,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.06840521097183228,
"rewards/margins": 4.232391357421875,
"rewards/rejected": -4.163986682891846,
"step": 242
},
{
"epoch": 1.001153994101808,
"grad_norm": 23.69289209855517,
"learning_rate": 2.9203474866123756e-07,
"logits/chosen": -2.050558090209961,
"logits/rejected": -2.045598030090332,
"logps/chosen": -26.37921142578125,
"logps/rejected": -40.620906829833984,
"loss": 0.2308,
"rewards/accuracies": 0.84375,
"rewards/chosen": -0.41915470361709595,
"rewards/margins": 3.705007314682007,
"rewards/rejected": -4.124162197113037,
"step": 244
},
{
"epoch": 1.009360174381331,
"grad_norm": 17.446487056986353,
"learning_rate": 2.884872001811425e-07,
"logits/chosen": -1.9810688495635986,
"logits/rejected": -1.9750444889068604,
"logps/chosen": -21.791378021240234,
"logps/rejected": -34.808170318603516,
"loss": 0.1961,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.5856488943099976,
"rewards/margins": 3.6117939949035645,
"rewards/rejected": -4.197443008422852,
"step": 246
},
{
"epoch": 1.017566354660854,
"grad_norm": 10.922108855871292,
"learning_rate": 2.849316955008996e-07,
"logits/chosen": -2.0840566158294678,
"logits/rejected": -2.0829105377197266,
"logps/chosen": -20.662445068359375,
"logps/rejected": -45.1578254699707,
"loss": 0.1894,
"rewards/accuracies": 0.828125,
"rewards/chosen": -0.044170308858156204,
"rewards/margins": 4.373213768005371,
"rewards/rejected": -4.417383670806885,
"step": 248
},
{
"epoch": 1.025772534940377,
"grad_norm": 7.713913555670714,
"learning_rate": 2.8136896962608785e-07,
"logits/chosen": -2.0515904426574707,
"logits/rejected": -2.051260232925415,
"logps/chosen": -25.802305221557617,
"logps/rejected": -45.2052116394043,
"loss": 0.1496,
"rewards/accuracies": 0.890625,
"rewards/chosen": -0.37995997071266174,
"rewards/margins": 4.365400314331055,
"rewards/rejected": -4.745360374450684,
"step": 250
},
{
"epoch": 1.025772534940377,
"eval_logits/chosen": -2.042832612991333,
"eval_logits/rejected": -2.040806293487549,
"eval_logps/chosen": -25.576231002807617,
"eval_logps/rejected": -39.28297805786133,
"eval_loss": 0.2241121530532837,
"eval_rewards/accuracies": 0.8352534770965576,
"eval_rewards/chosen": -0.4679082930088043,
"eval_rewards/margins": 3.604546546936035,
"eval_rewards/rejected": -4.072454452514648,
"eval_runtime": 383.2982,
"eval_samples_per_second": 4.524,
"eval_steps_per_second": 1.132,
"step": 250
},
{
"epoch": 1.0339787152199,
"grad_norm": 13.7751082342933,
"learning_rate": 2.777997590550758e-07,
"logits/chosen": -2.155553102493286,
"logits/rejected": -2.1575121879577637,
"logps/chosen": -27.36958122253418,
"logps/rejected": -42.855403900146484,
"loss": 0.1621,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.4879440367221832,
"rewards/margins": 3.895385980606079,
"rewards/rejected": -4.38332986831665,
"step": 252
},
{
"epoch": 1.042184895499423,
"grad_norm": 16.01605616077575,
"learning_rate": 2.742248016267692e-07,
"logits/chosen": -2.085308790206909,
"logits/rejected": -2.082420825958252,
"logps/chosen": -27.64177703857422,
"logps/rejected": -43.7150993347168,
"loss": 0.1802,
"rewards/accuracies": 0.890625,
"rewards/chosen": -0.7673605680465698,
"rewards/margins": 4.039832592010498,
"rewards/rejected": -4.807192325592041,
"step": 254
},
{
"epoch": 1.050391075778946,
"grad_norm": 10.928437797325595,
"learning_rate": 2.706448363680831e-07,
"logits/chosen": -2.0981714725494385,
"logits/rejected": -2.0977556705474854,
"logps/chosen": -25.607622146606445,
"logps/rejected": -46.82646179199219,
"loss": 0.1185,
"rewards/accuracies": 0.96875,
"rewards/chosen": -0.45319244265556335,
"rewards/margins": 4.636317253112793,
"rewards/rejected": -5.089509010314941,
"step": 256
},
{
"epoch": 1.058597256058469,
"grad_norm": 8.863927238249516,
"learning_rate": 2.6706060334116775e-07,
"logits/chosen": -2.0557029247283936,
"logits/rejected": -2.058215379714966,
"logps/chosen": -24.904468536376953,
"logps/rejected": -52.995845794677734,
"loss": 0.1153,
"rewards/accuracies": 0.953125,
"rewards/chosen": -0.11815226078033447,
"rewards/margins": 5.022463321685791,
"rewards/rejected": -5.140615940093994,
"step": 258
},
{
"epoch": 1.0668034363379921,
"grad_norm": 6.912079383205965,
"learning_rate": 2.634728434904204e-07,
"logits/chosen": -2.0943830013275146,
"logits/rejected": -2.0922083854675293,
"logps/chosen": -27.497676849365234,
"logps/rejected": -48.15060806274414,
"loss": 0.1458,
"rewards/accuracies": 0.890625,
"rewards/chosen": -0.2815995514392853,
"rewards/margins": 4.458585739135742,
"rewards/rejected": -4.740184783935547,
"step": 260
},
{
"epoch": 1.0668034363379921,
"eval_logits/chosen": -2.0431203842163086,
"eval_logits/rejected": -2.0411202907562256,
"eval_logps/chosen": -25.558095932006836,
"eval_logps/rejected": -39.460453033447266,
"eval_loss": 0.2211485058069229,
"eval_rewards/accuracies": 0.8364055156707764,
"eval_rewards/chosen": -0.4588410556316376,
"eval_rewards/margins": 3.702350378036499,
"eval_rewards/rejected": -4.161191463470459,
"eval_runtime": 383.5467,
"eval_samples_per_second": 4.521,
"eval_steps_per_second": 1.132,
"step": 260
},
{
"epoch": 1.0750096166175152,
"grad_norm": 15.832890184225922,
"learning_rate": 2.5988229848931483e-07,
"logits/chosen": -2.0925068855285645,
"logits/rejected": -2.0973916053771973,
"logps/chosen": -35.409889221191406,
"logps/rejected": -56.27671813964844,
"loss": 0.1349,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.4806308150291443,
"rewards/margins": 4.840456008911133,
"rewards/rejected": -5.321086883544922,
"step": 262
},
{
"epoch": 1.0832157968970382,
"grad_norm": 10.693776832442103,
"learning_rate": 2.562897105870801e-07,
"logits/chosen": -2.0722289085388184,
"logits/rejected": -2.0738039016723633,
"logps/chosen": -22.13025665283203,
"logps/rejected": -36.53873825073242,
"loss": 0.1738,
"rewards/accuracies": 0.828125,
"rewards/chosen": -0.3039347231388092,
"rewards/margins": 3.8571736812591553,
"rewards/rejected": -4.161108016967773,
"step": 264
},
{
"epoch": 1.0914219771765612,
"grad_norm": 19.182488661584227,
"learning_rate": 2.5269582245526096e-07,
"logits/chosen": -2.0609676837921143,
"logits/rejected": -2.0527615547180176,
"logps/chosen": -25.158451080322266,
"logps/rejected": -37.255897521972656,
"loss": 0.1578,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.35766327381134033,
"rewards/margins": 3.5247013568878174,
"rewards/rejected": -3.882364273071289,
"step": 266
},
{
"epoch": 1.0996281574560842,
"grad_norm": 9.44923086833568,
"learning_rate": 2.4910137703418926e-07,
"logits/chosen": -2.0382041931152344,
"logits/rejected": -2.0330657958984375,
"logps/chosen": -27.744306564331055,
"logps/rejected": -37.20608901977539,
"loss": 0.1527,
"rewards/accuracies": 0.890625,
"rewards/chosen": 0.0014753900468349457,
"rewards/margins": 4.225948333740234,
"rewards/rejected": -4.224472522735596,
"step": 268
},
{
"epoch": 1.107834337735607,
"grad_norm": 10.39281903644009,
"learning_rate": 2.4550711737940205e-07,
"logits/chosen": -2.0400283336639404,
"logits/rejected": -2.0371220111846924,
"logps/chosen": -26.864028930664062,
"logps/rejected": -52.77448272705078,
"loss": 0.1428,
"rewards/accuracies": 0.921875,
"rewards/chosen": -0.7589187026023865,
"rewards/margins": 4.930204391479492,
"rewards/rejected": -5.689122200012207,
"step": 270
},
{
"epoch": 1.107834337735607,
"eval_logits/chosen": -2.043691635131836,
"eval_logits/rejected": -2.0416359901428223,
"eval_logps/chosen": -25.027318954467773,
"eval_logps/rejected": -38.982704162597656,
"eval_loss": 0.22408966720104218,
"eval_rewards/accuracies": 0.8364055156707764,
"eval_rewards/chosen": -0.19345209002494812,
"eval_rewards/margins": 3.7288661003112793,
"eval_rewards/rejected": -3.9223177433013916,
"eval_runtime": 383.6577,
"eval_samples_per_second": 4.52,
"eval_steps_per_second": 1.131,
"step": 270
},
{
"epoch": 1.11604051801513,
"grad_norm": 6.39182395545924,
"learning_rate": 2.419137865080337e-07,
"logits/chosen": -2.069648027420044,
"logits/rejected": -2.0665249824523926,
"logps/chosen": -22.426170349121094,
"logps/rejected": -28.420143127441406,
"loss": 0.1774,
"rewards/accuracies": 0.765625,
"rewards/chosen": -0.1105562299489975,
"rewards/margins": 3.127821683883667,
"rewards/rejected": -3.238377571105957,
"step": 272
},
{
"epoch": 1.124246698294653,
"grad_norm": 11.027723946191974,
"learning_rate": 2.383221272452178e-07,
"logits/chosen": -2.0779690742492676,
"logits/rejected": -2.087660312652588,
"logps/chosen": -24.797609329223633,
"logps/rejected": -65.99850463867188,
"loss": 0.1436,
"rewards/accuracies": 0.921875,
"rewards/chosen": -0.11033198237419128,
"rewards/margins": 5.425482749938965,
"rewards/rejected": -5.53581428527832,
"step": 274
},
{
"epoch": 1.132452878574176,
"grad_norm": 17.805145954377792,
"learning_rate": 2.3473288207052741e-07,
"logits/chosen": -2.08833384513855,
"logits/rejected": -2.0850491523742676,
"logps/chosen": -25.857656478881836,
"logps/rejected": -32.381996154785156,
"loss": 0.1588,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.1286620795726776,
"rewards/margins": 3.531426191329956,
"rewards/rejected": -3.660088062286377,
"step": 276
},
{
"epoch": 1.1406590588536991,
"grad_norm": 17.682382824702852,
"learning_rate": 2.3114679296448726e-07,
"logits/chosen": -2.131256341934204,
"logits/rejected": -2.125654697418213,
"logps/chosen": -24.452953338623047,
"logps/rejected": -31.313426971435547,
"loss": 0.2115,
"rewards/accuracies": 0.828125,
"rewards/chosen": -0.033705681562423706,
"rewards/margins": 3.4805080890655518,
"rewards/rejected": -3.514213800430298,
"step": 278
},
{
"epoch": 1.1488652391332221,
"grad_norm": 16.145295646746973,
"learning_rate": 2.2756460125518942e-07,
"logits/chosen": -2.0623483657836914,
"logits/rejected": -2.056093692779541,
"logps/chosen": -21.44537353515625,
"logps/rejected": -35.502716064453125,
"loss": 0.1726,
"rewards/accuracies": 0.859375,
"rewards/chosen": -0.28275609016418457,
"rewards/margins": 3.6864452362060547,
"rewards/rejected": -3.96920108795166,
"step": 280
},
{
"epoch": 1.1488652391332221,
"eval_logits/chosen": -2.045440196990967,
"eval_logits/rejected": -2.0434725284576416,
"eval_logps/chosen": -25.390390396118164,
"eval_logps/rejected": -39.54319763183594,
"eval_loss": 0.2192622274160385,
"eval_rewards/accuracies": 0.8352534770965576,
"eval_rewards/chosen": -0.37498798966407776,
"eval_rewards/margins": 3.8275747299194336,
"eval_rewards/rejected": -4.2025628089904785,
"eval_runtime": 383.5119,
"eval_samples_per_second": 4.521,
"eval_steps_per_second": 1.132,
"step": 280
},
{
"epoch": 1.1570714194127452,
"grad_norm": 18.875320195670874,
"learning_rate": 2.2398704746504318e-07,
"logits/chosen": -1.9832574129104614,
"logits/rejected": -1.9829756021499634,
"logps/chosen": -31.6018123626709,
"logps/rejected": -41.83821105957031,
"loss": 0.1804,
"rewards/accuracies": 0.859375,
"rewards/chosen": -0.4063544273376465,
"rewards/margins": 4.4296417236328125,
"rewards/rejected": -4.835995674133301,
"step": 282
},
{
"epoch": 1.1652775996922682,
"grad_norm": 14.790336639999394,
"learning_rate": 2.20414871157692e-07,
"logits/chosen": -2.0224382877349854,
"logits/rejected": -2.027156352996826,
"logps/chosen": -25.788434982299805,
"logps/rejected": -50.23531723022461,
"loss": 0.1406,
"rewards/accuracies": 0.90625,
"rewards/chosen": -0.5073741674423218,
"rewards/margins": 4.580756664276123,
"rewards/rejected": -5.088130474090576,
"step": 284
},
{
"epoch": 1.1734837799717912,
"grad_norm": 14.288848495922638,
"learning_rate": 2.1684881078512867e-07,
"logits/chosen": -2.0773634910583496,
"logits/rejected": -2.0686299800872803,
"logps/chosen": -25.211549758911133,
"logps/rejected": -30.360422134399414,
"loss": 0.1824,
"rewards/accuracies": 0.84375,
"rewards/chosen": -0.5571584105491638,
"rewards/margins": 3.3241891860961914,
"rewards/rejected": -3.881347417831421,
"step": 286
},
{
"epoch": 1.1816899602513142,
"grad_norm": 17.171906139091153,
"learning_rate": 2.1328960353503978e-07,
"logits/chosen": -2.085387945175171,
"logits/rejected": -2.0828468799591064,
"logps/chosen": -23.766998291015625,
"logps/rejected": -33.4593620300293,
"loss": 0.1488,
"rewards/accuracies": 0.90625,
"rewards/chosen": -0.5340551137924194,
"rewards/margins": 3.7335057258605957,
"rewards/rejected": -4.267560958862305,
"step": 288
},
{
"epoch": 1.1898961405308373,
"grad_norm": 13.043172605616002,
"learning_rate": 2.0973798517841173e-07,
"logits/chosen": -2.018440008163452,
"logits/rejected": -2.0149004459381104,
"logps/chosen": -31.094905853271484,
"logps/rejected": -34.58868408203125,
"loss": 0.1415,
"rewards/accuracies": 0.90625,
"rewards/chosen": -0.8485018610954285,
"rewards/margins": 3.8137242794036865,
"rewards/rejected": -4.66222620010376,
"step": 290
},
{
"epoch": 1.1898961405308373,
"eval_logits/chosen": -2.0464136600494385,
"eval_logits/rejected": -2.044499158859253,
"eval_logps/chosen": -25.900218963623047,
"eval_logps/rejected": -40.12266159057617,
"eval_loss": 0.21777255833148956,
"eval_rewards/accuracies": 0.8364055156707764,
"eval_rewards/chosen": -0.6299027800559998,
"eval_rewards/margins": 3.8623909950256348,
"eval_rewards/rejected": -4.492293834686279,
"eval_runtime": 383.3374,
"eval_samples_per_second": 4.523,
"eval_steps_per_second": 1.132,
"step": 290
},
{
"epoch": 1.1981023208103603,
"grad_norm": 20.638807271142973,
"learning_rate": 2.0619468991743042e-07,
"logits/chosen": -2.061304807662964,
"logits/rejected": -2.0597338676452637,
"logps/chosen": -24.799753189086914,
"logps/rejected": -57.012210845947266,
"loss": 0.1738,
"rewards/accuracies": 0.921875,
"rewards/chosen": -1.1362073421478271,
"rewards/margins": 5.481666564941406,
"rewards/rejected": -6.617873191833496,
"step": 292
},
{
"epoch": 1.2063085010898833,
"grad_norm": 15.936474764605512,
"learning_rate": 2.026604502337039e-07,
"logits/chosen": -2.047367572784424,
"logits/rejected": -2.044090747833252,
"logps/chosen": -27.630414962768555,
"logps/rejected": -50.361202239990234,
"loss": 0.1453,
"rewards/accuracies": 0.890625,
"rewards/chosen": 0.04436264932155609,
"rewards/margins": 4.908329486846924,
"rewards/rejected": -4.863966941833496,
"step": 294
},
{
"epoch": 1.2145146813694063,
"grad_norm": 15.408621452000459,
"learning_rate": 1.9913599673684159e-07,
"logits/chosen": -2.0377581119537354,
"logits/rejected": -2.0361804962158203,
"logps/chosen": -22.501596450805664,
"logps/rejected": -40.86521530151367,
"loss": 0.1384,
"rewards/accuracies": 0.890625,
"rewards/chosen": -0.5628668069839478,
"rewards/margins": 3.820146083831787,
"rewards/rejected": -4.383012771606445,
"step": 296
},
{
"epoch": 1.2227208616489293,
"grad_norm": 12.285332508425714,
"learning_rate": 1.9562205801342034e-07,
"logits/chosen": -2.061429738998413,
"logits/rejected": -2.0591816902160645,
"logps/chosen": -29.394742965698242,
"logps/rejected": -43.32746505737305,
"loss": 0.1526,
"rewards/accuracies": 0.921875,
"rewards/chosen": -0.384346067905426,
"rewards/margins": 4.4763689041137695,
"rewards/rejected": -4.860714912414551,
"step": 298
},
{
"epoch": 1.2309270419284524,
"grad_norm": 5.583982281384929,
"learning_rate": 1.9211936047636867e-07,
"logits/chosen": -2.0710768699645996,
"logits/rejected": -2.0669403076171875,
"logps/chosen": -20.023929595947266,
"logps/rejected": -34.831573486328125,
"loss": 0.1595,
"rewards/accuracies": 0.84375,
"rewards/chosen": -0.22750067710876465,
"rewards/margins": 4.123189926147461,
"rewards/rejected": -4.350690841674805,
"step": 300
},
{
"epoch": 1.2309270419284524,
"eval_logits/chosen": -2.0440096855163574,
"eval_logits/rejected": -2.042032480239868,
"eval_logps/chosen": -24.7576961517334,
"eval_logps/rejected": -38.71461486816406,
"eval_loss": 0.2164347618818283,
"eval_rewards/accuracies": 0.8387096524238586,
"eval_rewards/chosen": -0.058640193194150925,
"eval_rewards/margins": 3.729631185531616,
"eval_rewards/rejected": -3.788270950317383,
"eval_runtime": 383.42,
"eval_samples_per_second": 4.522,
"eval_steps_per_second": 1.132,
"step": 300
},
{
"epoch": 1.2391332222079754,
"grad_norm": 11.637212026135893,
"learning_rate": 1.886286282148002e-07,
"logits/chosen": -2.0720925331115723,
"logits/rejected": -2.06878399848938,
"logps/chosen": -21.090686798095703,
"logps/rejected": -28.20726776123047,
"loss": 0.156,
"rewards/accuracies": 0.859375,
"rewards/chosen": 0.15111806988716125,
"rewards/margins": 3.5836377143859863,
"rewards/rejected": -3.4325199127197266,
"step": 302
},
{
"epoch": 1.2473394024874984,
"grad_norm": 6.71334576605166,
"learning_rate": 1.8515058284432743e-07,
"logits/chosen": -2.1127841472625732,
"logits/rejected": -2.111618757247925,
"logps/chosen": -23.5858097076416,
"logps/rejected": -42.50680923461914,
"loss": 0.1521,
"rewards/accuracies": 0.890625,
"rewards/chosen": -0.2578216791152954,
"rewards/margins": 4.158515453338623,
"rewards/rejected": -4.416337013244629,
"step": 304
},
{
"epoch": 1.2555455827670214,
"grad_norm": 16.588064814380605,
"learning_rate": 1.8168594335788728e-07,
"logits/chosen": -2.042862892150879,
"logits/rejected": -2.0439772605895996,
"logps/chosen": -24.17473030090332,
"logps/rejected": -53.81690216064453,
"loss": 0.177,
"rewards/accuracies": 0.90625,
"rewards/chosen": -0.44858402013778687,
"rewards/margins": 4.805813789367676,
"rewards/rejected": -5.254397392272949,
"step": 306
},
{
"epoch": 1.2637517630465445,
"grad_norm": 6.647490191841678,
"learning_rate": 1.7823542597710832e-07,
"logits/chosen": -2.0013082027435303,
"logits/rejected": -2.004103183746338,
"logps/chosen": -23.131669998168945,
"logps/rejected": -43.91994857788086,
"loss": 0.1689,
"rewards/accuracies": 0.859375,
"rewards/chosen": 0.4323474168777466,
"rewards/margins": 4.400308609008789,
"rewards/rejected": -3.967961311340332,
"step": 308
},
{
"epoch": 1.2719579433260675,
"grad_norm": 13.202795454945317,
"learning_rate": 1.7479974400425123e-07,
"logits/chosen": -2.046172857284546,
"logits/rejected": -2.0446221828460693,
"logps/chosen": -23.60657501220703,
"logps/rejected": -39.958396911621094,
"loss": 0.1706,
"rewards/accuracies": 0.84375,
"rewards/chosen": 0.3769833445549011,
"rewards/margins": 4.0399041175842285,
"rewards/rejected": -3.66292142868042,
"step": 310
},
{
"epoch": 1.2719579433260675,
"eval_logits/chosen": -2.043919086456299,
"eval_logits/rejected": -2.04194974899292,
"eval_logps/chosen": -24.46892547607422,
"eval_logps/rejected": -38.424713134765625,
"eval_loss": 0.21722769737243652,
"eval_rewards/accuracies": 0.8375576138496399,
"eval_rewards/chosen": 0.08574579656124115,
"eval_rewards/margins": 3.7290680408477783,
"eval_rewards/rejected": -3.643322467803955,
"eval_runtime": 383.1854,
"eval_samples_per_second": 4.525,
"eval_steps_per_second": 1.133,
"step": 310
},
{
"epoch": 1.2801641236055905,
"grad_norm": 10.439998982466557,
"learning_rate": 1.7137960767475263e-07,
"logits/chosen": -2.062668561935425,
"logits/rejected": -2.058307647705078,
"logps/chosen": -20.401418685913086,
"logps/rejected": -39.55351257324219,
"loss": 0.179,
"rewards/accuracies": 0.84375,
"rewards/chosen": 0.12827420234680176,
"rewards/margins": 4.090062141418457,
"rewards/rejected": -3.9617881774902344,
"step": 312
},
{
"epoch": 1.2883703038851135,
"grad_norm": 7.172405069379902,
"learning_rate": 1.6797572401040244e-07,
"logits/chosen": -2.0692923069000244,
"logits/rejected": -2.067417860031128,
"logps/chosen": -30.879615783691406,
"logps/rejected": -40.37434387207031,
"loss": 0.1199,
"rewards/accuracies": 0.90625,
"rewards/chosen": 0.12180179357528687,
"rewards/margins": 4.198694229125977,
"rewards/rejected": -4.076892375946045,
"step": 314
},
{
"epoch": 1.2965764841646366,
"grad_norm": 24.11773202321173,
"learning_rate": 1.6458879667318687e-07,
"logits/chosen": -2.064960479736328,
"logits/rejected": -2.0638976097106934,
"logps/chosen": -29.61989974975586,
"logps/rejected": -38.073875427246094,
"loss": 0.1542,
"rewards/accuracies": 0.90625,
"rewards/chosen": -0.3140927255153656,
"rewards/margins": 3.569211721420288,
"rewards/rejected": -3.8833043575286865,
"step": 316
},
{
"epoch": 1.3047826644441596,
"grad_norm": 15.353619034619824,
"learning_rate": 1.612195258198243e-07,
"logits/chosen": -2.1528408527374268,
"logits/rejected": -2.1589419841766357,
"logps/chosen": -25.104019165039062,
"logps/rejected": -56.826202392578125,
"loss": 0.1502,
"rewards/accuracies": 0.859375,
"rewards/chosen": 0.11969700455665588,
"rewards/margins": 5.174037933349609,
"rewards/rejected": -5.054340362548828,
"step": 318
},
{
"epoch": 1.3129888447236826,
"grad_norm": 6.821222616952283,
"learning_rate": 1.57868607957027e-07,
"logits/chosen": -2.0985031127929688,
"logits/rejected": -2.0935959815979004,
"logps/chosen": -30.2029972076416,
"logps/rejected": -43.66718292236328,
"loss": 0.1404,
"rewards/accuracies": 0.921875,
"rewards/chosen": -0.6867274641990662,
"rewards/margins": 4.196517467498779,
"rewards/rejected": -4.883245468139648,
"step": 320
},
{
"epoch": 1.3129888447236826,
"eval_logits/chosen": -2.0446114540100098,
"eval_logits/rejected": -2.042722702026367,
"eval_logps/chosen": -25.383127212524414,
"eval_logps/rejected": -39.56816101074219,
"eval_loss": 0.21271604299545288,
"eval_rewards/accuracies": 0.8375576138496399,
"eval_rewards/chosen": -0.37135595083236694,
"eval_rewards/margins": 3.84369158744812,
"eval_rewards/rejected": -4.215047359466553,
"eval_runtime": 383.232,
"eval_samples_per_second": 4.525,
"eval_steps_per_second": 1.132,
"step": 320
}
],
"logging_steps": 2,
"max_steps": 486,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 10,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}