{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9992793658419409,
"eval_steps": 500,
"global_step": 780,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0025622547842101048,
"grad_norm": 4.871487140655518,
"learning_rate": 5.128205128205128e-07,
"loss": 0.7593,
"step": 2
},
{
"epoch": 0.0051245095684202095,
"grad_norm": 4.878956317901611,
"learning_rate": 1.0256410256410257e-06,
"loss": 0.8076,
"step": 4
},
{
"epoch": 0.007686764352630314,
"grad_norm": 4.183067321777344,
"learning_rate": 1.5384615384615387e-06,
"loss": 0.7059,
"step": 6
},
{
"epoch": 0.010249019136840419,
"grad_norm": 4.679640769958496,
"learning_rate": 2.0512820512820513e-06,
"loss": 0.715,
"step": 8
},
{
"epoch": 0.012811273921050524,
"grad_norm": 4.590462684631348,
"learning_rate": 2.564102564102564e-06,
"loss": 0.7175,
"step": 10
},
{
"epoch": 0.015373528705260629,
"grad_norm": 4.5435638427734375,
"learning_rate": 3.0769230769230774e-06,
"loss": 0.6976,
"step": 12
},
{
"epoch": 0.017935783489470735,
"grad_norm": 4.649476051330566,
"learning_rate": 3.58974358974359e-06,
"loss": 0.7615,
"step": 14
},
{
"epoch": 0.020498038273680838,
"grad_norm": 3.847956418991089,
"learning_rate": 4.102564102564103e-06,
"loss": 0.6735,
"step": 16
},
{
"epoch": 0.023060293057890945,
"grad_norm": 4.92044734954834,
"learning_rate": 4.615384615384616e-06,
"loss": 0.6885,
"step": 18
},
{
"epoch": 0.025622547842101048,
"grad_norm": 4.817592620849609,
"learning_rate": 5.128205128205128e-06,
"loss": 0.768,
"step": 20
},
{
"epoch": 0.028184802626311154,
"grad_norm": 3.8268470764160156,
"learning_rate": 5.641025641025641e-06,
"loss": 0.7092,
"step": 22
},
{
"epoch": 0.030747057410521257,
"grad_norm": 4.103015899658203,
"learning_rate": 6.153846153846155e-06,
"loss": 0.6764,
"step": 24
},
{
"epoch": 0.033309312194731364,
"grad_norm": 4.125541687011719,
"learning_rate": 6.666666666666667e-06,
"loss": 0.6386,
"step": 26
},
{
"epoch": 0.03587156697894147,
"grad_norm": 4.19417142868042,
"learning_rate": 7.17948717948718e-06,
"loss": 0.6218,
"step": 28
},
{
"epoch": 0.03843382176315158,
"grad_norm": 3.5323216915130615,
"learning_rate": 7.692307692307694e-06,
"loss": 0.5853,
"step": 30
},
{
"epoch": 0.040996076547361676,
"grad_norm": 3.1467161178588867,
"learning_rate": 8.205128205128205e-06,
"loss": 0.5441,
"step": 32
},
{
"epoch": 0.04355833133157178,
"grad_norm": 3.0700371265411377,
"learning_rate": 8.717948717948719e-06,
"loss": 0.5458,
"step": 34
},
{
"epoch": 0.04612058611578189,
"grad_norm": 2.553177833557129,
"learning_rate": 9.230769230769232e-06,
"loss": 0.4589,
"step": 36
},
{
"epoch": 0.048682840899991996,
"grad_norm": 2.5199780464172363,
"learning_rate": 9.743589743589744e-06,
"loss": 0.4386,
"step": 38
},
{
"epoch": 0.051245095684202095,
"grad_norm": 1.7382951974868774,
"learning_rate": 9.99995506314361e-06,
"loss": 0.4215,
"step": 40
},
{
"epoch": 0.0538073504684122,
"grad_norm": 0.9749733209609985,
"learning_rate": 9.999595573138845e-06,
"loss": 0.3888,
"step": 42
},
{
"epoch": 0.05636960525262231,
"grad_norm": 0.9746177196502686,
"learning_rate": 9.99887661897616e-06,
"loss": 0.3749,
"step": 44
},
{
"epoch": 0.058931860036832415,
"grad_norm": 0.7504925727844238,
"learning_rate": 9.997798252347382e-06,
"loss": 0.3543,
"step": 46
},
{
"epoch": 0.061494114821042514,
"grad_norm": 0.7129773497581482,
"learning_rate": 9.996360550785619e-06,
"loss": 0.3565,
"step": 48
},
{
"epoch": 0.06405636960525263,
"grad_norm": 0.6482123732566833,
"learning_rate": 9.994563617659665e-06,
"loss": 0.3242,
"step": 50
},
{
"epoch": 0.06661862438946273,
"grad_norm": 0.5225902199745178,
"learning_rate": 9.992407582166582e-06,
"loss": 0.3334,
"step": 52
},
{
"epoch": 0.06918087917367283,
"grad_norm": 0.5128389596939087,
"learning_rate": 9.989892599322404e-06,
"loss": 0.3741,
"step": 54
},
{
"epoch": 0.07174313395788294,
"grad_norm": 0.3568147122859955,
"learning_rate": 9.987018849950996e-06,
"loss": 0.3331,
"step": 56
},
{
"epoch": 0.07430538874209304,
"grad_norm": 0.5258967280387878,
"learning_rate": 9.983786540671052e-06,
"loss": 0.3335,
"step": 58
},
{
"epoch": 0.07686764352630315,
"grad_norm": 0.5380641222000122,
"learning_rate": 9.980195903881231e-06,
"loss": 0.3344,
"step": 60
},
{
"epoch": 0.07942989831051325,
"grad_norm": 0.3320980668067932,
"learning_rate": 9.976247197743465e-06,
"loss": 0.3055,
"step": 62
},
{
"epoch": 0.08199215309472335,
"grad_norm": 0.3006751835346222,
"learning_rate": 9.97194070616438e-06,
"loss": 0.3187,
"step": 64
},
{
"epoch": 0.08455440787893347,
"grad_norm": 0.26902303099632263,
"learning_rate": 9.967276738774897e-06,
"loss": 0.2998,
"step": 66
},
{
"epoch": 0.08711666266314357,
"grad_norm": 0.2527882158756256,
"learning_rate": 9.962255630907964e-06,
"loss": 0.3251,
"step": 68
},
{
"epoch": 0.08967891744735366,
"grad_norm": 0.24817310273647308,
"learning_rate": 9.956877743574437e-06,
"loss": 0.317,
"step": 70
},
{
"epoch": 0.09224117223156378,
"grad_norm": 0.25589698553085327,
"learning_rate": 9.951143463437145e-06,
"loss": 0.31,
"step": 72
},
{
"epoch": 0.09480342701577388,
"grad_norm": 0.23160037398338318,
"learning_rate": 9.94505320278307e-06,
"loss": 0.3158,
"step": 74
},
{
"epoch": 0.09736568179998399,
"grad_norm": 0.26378345489501953,
"learning_rate": 9.938607399493714e-06,
"loss": 0.3163,
"step": 76
},
{
"epoch": 0.09992793658419409,
"grad_norm": 0.21700599789619446,
"learning_rate": 9.931806517013612e-06,
"loss": 0.3132,
"step": 78
},
{
"epoch": 0.10249019136840419,
"grad_norm": 0.29619359970092773,
"learning_rate": 9.924651044317017e-06,
"loss": 0.315,
"step": 80
},
{
"epoch": 0.1050524461526143,
"grad_norm": 0.33124956488609314,
"learning_rate": 9.917141495872733e-06,
"loss": 0.2851,
"step": 82
},
{
"epoch": 0.1076147009368244,
"grad_norm": 0.202985018491745,
"learning_rate": 9.909278411607134e-06,
"loss": 0.3036,
"step": 84
},
{
"epoch": 0.11017695572103452,
"grad_norm": 0.24660180509090424,
"learning_rate": 9.90106235686534e-06,
"loss": 0.3374,
"step": 86
},
{
"epoch": 0.11273921050524462,
"grad_norm": 0.2552855312824249,
"learning_rate": 9.892493922370575e-06,
"loss": 0.316,
"step": 88
},
{
"epoch": 0.11530146528945472,
"grad_norm": 0.29509437084198,
"learning_rate": 9.883573724181683e-06,
"loss": 0.3284,
"step": 90
},
{
"epoch": 0.11786372007366483,
"grad_norm": 0.4660441279411316,
"learning_rate": 9.87430240364885e-06,
"loss": 0.3266,
"step": 92
},
{
"epoch": 0.12042597485787493,
"grad_norm": 0.31215617060661316,
"learning_rate": 9.864680627367476e-06,
"loss": 0.3104,
"step": 94
},
{
"epoch": 0.12298822964208503,
"grad_norm": 0.6009628176689148,
"learning_rate": 9.854709087130261e-06,
"loss": 0.3221,
"step": 96
},
{
"epoch": 0.12555048442629513,
"grad_norm": 0.30782508850097656,
"learning_rate": 9.844388499877457e-06,
"loss": 0.2913,
"step": 98
},
{
"epoch": 0.12811273921050526,
"grad_norm": 0.3109281659126282,
"learning_rate": 9.833719607645325e-06,
"loss": 0.2974,
"step": 100
},
{
"epoch": 0.13067499399471535,
"grad_norm": 0.2832891345024109,
"learning_rate": 9.822703177512783e-06,
"loss": 0.3698,
"step": 102
},
{
"epoch": 0.13323724877892545,
"grad_norm": 0.2483058124780655,
"learning_rate": 9.811340001546252e-06,
"loss": 0.2995,
"step": 104
},
{
"epoch": 0.13579950356313555,
"grad_norm": 0.2657499611377716,
"learning_rate": 9.799630896742716e-06,
"loss": 0.304,
"step": 106
},
{
"epoch": 0.13836175834734565,
"grad_norm": 0.4393406808376312,
"learning_rate": 9.787576704970965e-06,
"loss": 0.3032,
"step": 108
},
{
"epoch": 0.14092401313155578,
"grad_norm": 0.3503418266773224,
"learning_rate": 9.77517829291108e-06,
"loss": 0.2915,
"step": 110
},
{
"epoch": 0.14348626791576588,
"grad_norm": 0.28331002593040466,
"learning_rate": 9.762436551992117e-06,
"loss": 0.3192,
"step": 112
},
{
"epoch": 0.14604852269997598,
"grad_norm": 0.28841540217399597,
"learning_rate": 9.74935239832801e-06,
"loss": 0.2999,
"step": 114
},
{
"epoch": 0.14861077748418608,
"grad_norm": 0.3377619683742523,
"learning_rate": 9.735926772651703e-06,
"loss": 0.31,
"step": 116
},
{
"epoch": 0.15117303226839618,
"grad_norm": 0.358359158039093,
"learning_rate": 9.722160640247523e-06,
"loss": 0.292,
"step": 118
},
{
"epoch": 0.1537352870526063,
"grad_norm": 0.24913446605205536,
"learning_rate": 9.708054990881763e-06,
"loss": 0.3077,
"step": 120
},
{
"epoch": 0.1562975418368164,
"grad_norm": 0.34343913197517395,
"learning_rate": 9.693610838731532e-06,
"loss": 0.3132,
"step": 122
},
{
"epoch": 0.1588597966210265,
"grad_norm": 0.2604675590991974,
"learning_rate": 9.678829222311827e-06,
"loss": 0.3139,
"step": 124
},
{
"epoch": 0.1614220514052366,
"grad_norm": 0.30086928606033325,
"learning_rate": 9.663711204400872e-06,
"loss": 0.2992,
"step": 126
},
{
"epoch": 0.1639843061894467,
"grad_norm": 0.3241061270236969,
"learning_rate": 9.6482578719637e-06,
"loss": 0.3066,
"step": 128
},
{
"epoch": 0.1665465609736568,
"grad_norm": 0.26830941438674927,
"learning_rate": 9.632470336074009e-06,
"loss": 0.3121,
"step": 130
},
{
"epoch": 0.16910881575786693,
"grad_norm": 0.203893780708313,
"learning_rate": 9.616349731834271e-06,
"loss": 0.2991,
"step": 132
},
{
"epoch": 0.17167107054207703,
"grad_norm": 0.3675401210784912,
"learning_rate": 9.599897218294122e-06,
"loss": 0.3227,
"step": 134
},
{
"epoch": 0.17423332532628713,
"grad_norm": 0.21370336413383484,
"learning_rate": 9.583113978367026e-06,
"loss": 0.3025,
"step": 136
},
{
"epoch": 0.17679558011049723,
"grad_norm": 0.25403013825416565,
"learning_rate": 9.56600121874523e-06,
"loss": 0.3179,
"step": 138
},
{
"epoch": 0.17935783489470733,
"grad_norm": 0.3012063503265381,
"learning_rate": 9.548560169812997e-06,
"loss": 0.2906,
"step": 140
},
{
"epoch": 0.18192008967891746,
"grad_norm": 0.24593935906887054,
"learning_rate": 9.530792085558151e-06,
"loss": 0.2968,
"step": 142
},
{
"epoch": 0.18448234446312756,
"grad_norm": 0.267528235912323,
"learning_rate": 9.512698243481914e-06,
"loss": 0.3076,
"step": 144
},
{
"epoch": 0.18704459924733766,
"grad_norm": 0.4075755774974823,
"learning_rate": 9.49427994450705e-06,
"loss": 0.292,
"step": 146
},
{
"epoch": 0.18960685403154776,
"grad_norm": 0.37276849150657654,
"learning_rate": 9.47553851288434e-06,
"loss": 0.3337,
"step": 148
},
{
"epoch": 0.19216910881575786,
"grad_norm": 0.34166908264160156,
"learning_rate": 9.45647529609736e-06,
"loss": 0.2854,
"step": 150
},
{
"epoch": 0.19473136359996798,
"grad_norm": 0.3679031431674957,
"learning_rate": 9.437091664765611e-06,
"loss": 0.328,
"step": 152
},
{
"epoch": 0.19729361838417808,
"grad_norm": 0.2564798891544342,
"learning_rate": 9.41738901254596e-06,
"loss": 0.2831,
"step": 154
},
{
"epoch": 0.19985587316838818,
"grad_norm": 0.39898496866226196,
"learning_rate": 9.397368756032445e-06,
"loss": 0.2899,
"step": 156
},
{
"epoch": 0.20241812795259828,
"grad_norm": 0.2926347255706787,
"learning_rate": 9.37703233465443e-06,
"loss": 0.2796,
"step": 158
},
{
"epoch": 0.20498038273680838,
"grad_norm": 0.3333691656589508,
"learning_rate": 9.356381210573092e-06,
"loss": 0.2965,
"step": 160
},
{
"epoch": 0.2075426375210185,
"grad_norm": 0.2890892028808594,
"learning_rate": 9.33541686857632e-06,
"loss": 0.2884,
"step": 162
},
{
"epoch": 0.2101048923052286,
"grad_norm": 0.27766191959381104,
"learning_rate": 9.31414081597194e-06,
"loss": 0.297,
"step": 164
},
{
"epoch": 0.2126671470894387,
"grad_norm": 0.3218678832054138,
"learning_rate": 9.292554582479349e-06,
"loss": 0.2862,
"step": 166
},
{
"epoch": 0.2152294018736488,
"grad_norm": 0.3139230012893677,
"learning_rate": 9.270659720119533e-06,
"loss": 0.2958,
"step": 168
},
{
"epoch": 0.2177916566578589,
"grad_norm": 0.2383907586336136,
"learning_rate": 9.248457803103476e-06,
"loss": 0.2988,
"step": 170
},
{
"epoch": 0.22035391144206903,
"grad_norm": 0.32504117488861084,
"learning_rate": 9.225950427718974e-06,
"loss": 0.2803,
"step": 172
},
{
"epoch": 0.22291616622627913,
"grad_norm": 0.2564990818500519,
"learning_rate": 9.203139212215868e-06,
"loss": 0.2957,
"step": 174
},
{
"epoch": 0.22547842101048923,
"grad_norm": 0.282103568315506,
"learning_rate": 9.180025796689692e-06,
"loss": 0.2933,
"step": 176
},
{
"epoch": 0.22804067579469933,
"grad_norm": 0.3701488971710205,
"learning_rate": 9.156611842963753e-06,
"loss": 0.2926,
"step": 178
},
{
"epoch": 0.23060293057890943,
"grad_norm": 0.3153334856033325,
"learning_rate": 9.132899034469648e-06,
"loss": 0.3111,
"step": 180
},
{
"epoch": 0.23316518536311953,
"grad_norm": 0.31320127844810486,
"learning_rate": 9.108889076126226e-06,
"loss": 0.2948,
"step": 182
},
{
"epoch": 0.23572744014732966,
"grad_norm": 0.3407798409461975,
"learning_rate": 9.084583694217012e-06,
"loss": 0.2872,
"step": 184
},
{
"epoch": 0.23828969493153976,
"grad_norm": 0.5061964988708496,
"learning_rate": 9.059984636266082e-06,
"loss": 0.3073,
"step": 186
},
{
"epoch": 0.24085194971574986,
"grad_norm": 0.342929482460022,
"learning_rate": 9.035093670912424e-06,
"loss": 0.2866,
"step": 188
},
{
"epoch": 0.24341420449995996,
"grad_norm": 0.38549765944480896,
"learning_rate": 9.009912587782772e-06,
"loss": 0.298,
"step": 190
},
{
"epoch": 0.24597645928417006,
"grad_norm": 0.339372843503952,
"learning_rate": 8.984443197362938e-06,
"loss": 0.2644,
"step": 192
},
{
"epoch": 0.24853871406838018,
"grad_norm": 0.26157572865486145,
"learning_rate": 8.958687330867634e-06,
"loss": 0.2986,
"step": 194
},
{
"epoch": 0.25110096885259026,
"grad_norm": 0.307921826839447,
"learning_rate": 8.932646840108818e-06,
"loss": 0.2826,
"step": 196
},
{
"epoch": 0.2536632236368004,
"grad_norm": 0.48844948410987854,
"learning_rate": 8.906323597362547e-06,
"loss": 0.2824,
"step": 198
},
{
"epoch": 0.2562254784210105,
"grad_norm": 0.3046979308128357,
"learning_rate": 8.879719495234363e-06,
"loss": 0.2836,
"step": 200
},
{
"epoch": 0.2587877332052206,
"grad_norm": 0.37873372435569763,
"learning_rate": 8.852836446523213e-06,
"loss": 0.2799,
"step": 202
},
{
"epoch": 0.2613499879894307,
"grad_norm": 0.5752015709877014,
"learning_rate": 8.825676384083936e-06,
"loss": 0.3027,
"step": 204
},
{
"epoch": 0.2639122427736408,
"grad_norm": 0.403952956199646,
"learning_rate": 8.798241260688273e-06,
"loss": 0.3032,
"step": 206
},
{
"epoch": 0.2664744975578509,
"grad_norm": 0.36202457547187805,
"learning_rate": 8.770533048884483e-06,
"loss": 0.3044,
"step": 208
},
{
"epoch": 0.26903675234206104,
"grad_norm": 0.34956708550453186,
"learning_rate": 8.742553740855507e-06,
"loss": 0.2784,
"step": 210
},
{
"epoch": 0.2715990071262711,
"grad_norm": 0.44058695435523987,
"learning_rate": 8.71430534827574e-06,
"loss": 0.3142,
"step": 212
},
{
"epoch": 0.27416126191048124,
"grad_norm": 0.3903171420097351,
"learning_rate": 8.685789902166395e-06,
"loss": 0.2592,
"step": 214
},
{
"epoch": 0.2767235166946913,
"grad_norm": 0.34790173172950745,
"learning_rate": 8.657009452749466e-06,
"loss": 0.2881,
"step": 216
},
{
"epoch": 0.27928577147890143,
"grad_norm": 0.3779347240924835,
"learning_rate": 8.627966069300332e-06,
"loss": 0.3017,
"step": 218
},
{
"epoch": 0.28184802626311156,
"grad_norm": 0.40141528844833374,
"learning_rate": 8.598661839998972e-06,
"loss": 0.2781,
"step": 220
},
{
"epoch": 0.28441028104732163,
"grad_norm": 0.30786147713661194,
"learning_rate": 8.569098871779828e-06,
"loss": 0.296,
"step": 222
},
{
"epoch": 0.28697253583153176,
"grad_norm": 0.2742227017879486,
"learning_rate": 8.539279290180315e-06,
"loss": 0.3161,
"step": 224
},
{
"epoch": 0.28953479061574183,
"grad_norm": 0.5068826675415039,
"learning_rate": 8.509205239188017e-06,
"loss": 0.2948,
"step": 226
},
{
"epoch": 0.29209704539995196,
"grad_norm": 0.3508552610874176,
"learning_rate": 8.478878881086505e-06,
"loss": 0.2736,
"step": 228
},
{
"epoch": 0.2946593001841621,
"grad_norm": 0.47813767194747925,
"learning_rate": 8.448302396299906e-06,
"loss": 0.2954,
"step": 230
},
{
"epoch": 0.29722155496837216,
"grad_norm": 0.29084405303001404,
"learning_rate": 8.417477983236107e-06,
"loss": 0.3134,
"step": 232
},
{
"epoch": 0.2997838097525823,
"grad_norm": 0.41805362701416016,
"learning_rate": 8.386407858128707e-06,
"loss": 0.2767,
"step": 234
},
{
"epoch": 0.30234606453679236,
"grad_norm": 0.32367441058158875,
"learning_rate": 8.355094254877665e-06,
"loss": 0.2783,
"step": 236
},
{
"epoch": 0.3049083193210025,
"grad_norm": 0.31607088446617126,
"learning_rate": 8.323539424888695e-06,
"loss": 0.2871,
"step": 238
},
{
"epoch": 0.3074705741052126,
"grad_norm": 0.3964040279388428,
"learning_rate": 8.291745636911382e-06,
"loss": 0.2747,
"step": 240
},
{
"epoch": 0.3100328288894227,
"grad_norm": 0.3582654595375061,
"learning_rate": 8.259715176876069e-06,
"loss": 0.2737,
"step": 242
},
{
"epoch": 0.3125950836736328,
"grad_norm": 0.38515010476112366,
"learning_rate": 8.2274503477295e-06,
"loss": 0.2889,
"step": 244
},
{
"epoch": 0.3151573384578429,
"grad_norm": 0.3744358718395233,
"learning_rate": 8.19495346926924e-06,
"loss": 0.2822,
"step": 246
},
{
"epoch": 0.317719593242053,
"grad_norm": 0.3402256369590759,
"learning_rate": 8.162226877976886e-06,
"loss": 0.284,
"step": 248
},
{
"epoch": 0.32028184802626314,
"grad_norm": 0.4301615059375763,
"learning_rate": 8.129272926850079e-06,
"loss": 0.2915,
"step": 250
},
{
"epoch": 0.3228441028104732,
"grad_norm": 0.3376031816005707,
"learning_rate": 8.096093985233323e-06,
"loss": 0.2842,
"step": 252
},
{
"epoch": 0.32540635759468334,
"grad_norm": 0.546100378036499,
"learning_rate": 8.062692438647628e-06,
"loss": 0.3203,
"step": 254
},
{
"epoch": 0.3279686123788934,
"grad_norm": 0.37469664216041565,
"learning_rate": 8.029070688619013e-06,
"loss": 0.2828,
"step": 256
},
{
"epoch": 0.33053086716310354,
"grad_norm": 0.31530773639678955,
"learning_rate": 7.995231152505815e-06,
"loss": 0.2672,
"step": 258
},
{
"epoch": 0.3330931219473136,
"grad_norm": 0.47679194808006287,
"learning_rate": 7.961176263324902e-06,
"loss": 0.292,
"step": 260
},
{
"epoch": 0.33565537673152374,
"grad_norm": 0.7583074569702148,
"learning_rate": 7.92690846957673e-06,
"loss": 0.2987,
"step": 262
},
{
"epoch": 0.33821763151573386,
"grad_norm": 0.4478585124015808,
"learning_rate": 7.892430235069317e-06,
"loss": 0.2881,
"step": 264
},
{
"epoch": 0.34077988629994393,
"grad_norm": 0.49820685386657715,
"learning_rate": 7.857744038741076e-06,
"loss": 0.2912,
"step": 266
},
{
"epoch": 0.34334214108415406,
"grad_norm": 0.42809927463531494,
"learning_rate": 7.822852374482597e-06,
"loss": 0.2672,
"step": 268
},
{
"epoch": 0.34590439586836413,
"grad_norm": 0.3707646131515503,
"learning_rate": 7.787757750957335e-06,
"loss": 0.2921,
"step": 270
},
{
"epoch": 0.34846665065257426,
"grad_norm": 0.3849372863769531,
"learning_rate": 7.752462691421245e-06,
"loss": 0.2676,
"step": 272
},
{
"epoch": 0.3510289054367844,
"grad_norm": 0.34830930829048157,
"learning_rate": 7.716969733541357e-06,
"loss": 0.2576,
"step": 274
},
{
"epoch": 0.35359116022099446,
"grad_norm": 0.4144101142883301,
"learning_rate": 7.681281429213328e-06,
"loss": 0.2686,
"step": 276
},
{
"epoch": 0.3561534150052046,
"grad_norm": 0.30803945660591125,
"learning_rate": 7.645400344377953e-06,
"loss": 0.2678,
"step": 278
},
{
"epoch": 0.35871566978941466,
"grad_norm": 0.40825673937797546,
"learning_rate": 7.609329058836694e-06,
"loss": 0.2907,
"step": 280
},
{
"epoch": 0.3612779245736248,
"grad_norm": 0.38340067863464355,
"learning_rate": 7.5730701660661795e-06,
"loss": 0.298,
"step": 282
},
{
"epoch": 0.3638401793578349,
"grad_norm": 0.3731997013092041,
"learning_rate": 7.536626273031747e-06,
"loss": 0.263,
"step": 284
},
{
"epoch": 0.366402434142045,
"grad_norm": 0.3588733375072479,
"learning_rate": 7.500000000000001e-06,
"loss": 0.2733,
"step": 286
},
{
"epoch": 0.3689646889262551,
"grad_norm": 0.4146881699562073,
"learning_rate": 7.4631939803504215e-06,
"loss": 0.3159,
"step": 288
},
{
"epoch": 0.3715269437104652,
"grad_norm": 0.4735972285270691,
"learning_rate": 7.426210860386032e-06,
"loss": 0.2878,
"step": 290
},
{
"epoch": 0.3740891984946753,
"grad_norm": 0.5484066009521484,
"learning_rate": 7.3890532991431174e-06,
"loss": 0.2829,
"step": 292
},
{
"epoch": 0.37665145327888544,
"grad_norm": 0.3961395025253296,
"learning_rate": 7.3517239682000675e-06,
"loss": 0.2646,
"step": 294
},
{
"epoch": 0.3792137080630955,
"grad_norm": 0.43453872203826904,
"learning_rate": 7.314225551485273e-06,
"loss": 0.301,
"step": 296
},
{
"epoch": 0.38177596284730564,
"grad_norm": 0.45246270298957825,
"learning_rate": 7.276560745084167e-06,
"loss": 0.2622,
"step": 298
},
{
"epoch": 0.3843382176315157,
"grad_norm": 0.4539019763469696,
"learning_rate": 7.2387322570453724e-06,
"loss": 0.2901,
"step": 300
},
{
"epoch": 0.38690047241572584,
"grad_norm": 0.4333208203315735,
"learning_rate": 7.2007428071860045e-06,
"loss": 0.2576,
"step": 302
},
{
"epoch": 0.38946272719993597,
"grad_norm": 0.3936616778373718,
"learning_rate": 7.162595126896111e-06,
"loss": 0.2716,
"step": 304
},
{
"epoch": 0.39202498198414604,
"grad_norm": 0.40865668654441833,
"learning_rate": 7.1242919589422974e-06,
"loss": 0.2716,
"step": 306
},
{
"epoch": 0.39458723676835616,
"grad_norm": 0.5468711256980896,
"learning_rate": 7.085836057270521e-06,
"loss": 0.2978,
"step": 308
},
{
"epoch": 0.39714949155256624,
"grad_norm": 0.469566285610199,
"learning_rate": 7.047230186808085e-06,
"loss": 0.2499,
"step": 310
},
{
"epoch": 0.39971174633677636,
"grad_norm": 0.5449560880661011,
"learning_rate": 7.008477123264849e-06,
"loss": 0.3018,
"step": 312
},
{
"epoch": 0.4022740011209865,
"grad_norm": 0.48154890537261963,
"learning_rate": 6.96957965293365e-06,
"loss": 0.2834,
"step": 314
},
{
"epoch": 0.40483625590519656,
"grad_norm": 0.3875851035118103,
"learning_rate": 6.9305405724899876e-06,
"loss": 0.3008,
"step": 316
},
{
"epoch": 0.4073985106894067,
"grad_norm": 0.5583494305610657,
"learning_rate": 6.891362688790925e-06,
"loss": 0.2753,
"step": 318
},
{
"epoch": 0.40996076547361676,
"grad_norm": 0.47610044479370117,
"learning_rate": 6.8520488186733e-06,
"loss": 0.2943,
"step": 320
},
{
"epoch": 0.4125230202578269,
"grad_norm": 0.33989906311035156,
"learning_rate": 6.812601788751192e-06,
"loss": 0.2692,
"step": 322
},
{
"epoch": 0.415085275042037,
"grad_norm": 0.4737338125705719,
"learning_rate": 6.773024435212678e-06,
"loss": 0.2961,
"step": 324
},
{
"epoch": 0.4176475298262471,
"grad_norm": 0.538935124874115,
"learning_rate": 6.733319603615941e-06,
"loss": 0.2898,
"step": 326
},
{
"epoch": 0.4202097846104572,
"grad_norm": 0.4021223187446594,
"learning_rate": 6.693490148684654e-06,
"loss": 0.2555,
"step": 328
},
{
"epoch": 0.4227720393946673,
"grad_norm": 0.330159991979599,
"learning_rate": 6.653538934102743e-06,
"loss": 0.3043,
"step": 330
},
{
"epoch": 0.4253342941788774,
"grad_norm": 0.39451590180397034,
"learning_rate": 6.6134688323084884e-06,
"loss": 0.3098,
"step": 332
},
{
"epoch": 0.42789654896308754,
"grad_norm": 0.3512692451477051,
"learning_rate": 6.573282724288001e-06,
"loss": 0.276,
"step": 334
},
{
"epoch": 0.4304588037472976,
"grad_norm": 0.3749544322490692,
"learning_rate": 6.532983499368078e-06,
"loss": 0.2893,
"step": 336
},
{
"epoch": 0.43302105853150774,
"grad_norm": 0.35993286967277527,
"learning_rate": 6.492574055008474e-06,
"loss": 0.2522,
"step": 338
},
{
"epoch": 0.4355833133157178,
"grad_norm": 0.3857017457485199,
"learning_rate": 6.452057296593568e-06,
"loss": 0.2556,
"step": 340
},
{
"epoch": 0.43814556809992794,
"grad_norm": 0.36345577239990234,
"learning_rate": 6.411436137223479e-06,
"loss": 0.2795,
"step": 342
},
{
"epoch": 0.44070782288413807,
"grad_norm": 0.40086713433265686,
"learning_rate": 6.370713497504607e-06,
"loss": 0.2619,
"step": 344
},
{
"epoch": 0.44327007766834814,
"grad_norm": 0.4900248944759369,
"learning_rate": 6.329892305339659e-06,
"loss": 0.2748,
"step": 346
},
{
"epoch": 0.44583233245255827,
"grad_norm": 0.6341924071311951,
"learning_rate": 6.288975495717124e-06,
"loss": 0.2731,
"step": 348
},
{
"epoch": 0.44839458723676834,
"grad_norm": 0.5340880751609802,
"learning_rate": 6.247966010500258e-06,
"loss": 0.2797,
"step": 350
},
{
"epoch": 0.45095684202097847,
"grad_norm": 0.37570691108703613,
"learning_rate": 6.206866798215571e-06,
"loss": 0.2724,
"step": 352
},
{
"epoch": 0.4535190968051886,
"grad_norm": 0.4172237515449524,
"learning_rate": 6.165680813840822e-06,
"loss": 0.2728,
"step": 354
},
{
"epoch": 0.45608135158939866,
"grad_norm": 0.36990782618522644,
"learning_rate": 6.124411018592568e-06,
"loss": 0.2733,
"step": 356
},
{
"epoch": 0.4586436063736088,
"grad_norm": 0.35491085052490234,
"learning_rate": 6.0830603797132574e-06,
"loss": 0.2688,
"step": 358
},
{
"epoch": 0.46120586115781886,
"grad_norm": 0.36608174443244934,
"learning_rate": 6.041631870257882e-06,
"loss": 0.2505,
"step": 360
},
{
"epoch": 0.463768115942029,
"grad_norm": 0.3670680820941925,
"learning_rate": 6.000128468880223e-06,
"loss": 0.2749,
"step": 362
},
{
"epoch": 0.46633037072623906,
"grad_norm": 0.40972089767456055,
"learning_rate": 5.958553159618693e-06,
"loss": 0.2541,
"step": 364
},
{
"epoch": 0.4688926255104492,
"grad_norm": 0.40942203998565674,
"learning_rate": 5.916908931681781e-06,
"loss": 0.2721,
"step": 366
},
{
"epoch": 0.4714548802946593,
"grad_norm": 0.508773922920227,
"learning_rate": 5.8751987792331365e-06,
"loss": 0.2774,
"step": 368
},
{
"epoch": 0.4740171350788694,
"grad_norm": 0.38248467445373535,
"learning_rate": 5.833425701176294e-06,
"loss": 0.2497,
"step": 370
},
{
"epoch": 0.4765793898630795,
"grad_norm": 0.42881184816360474,
"learning_rate": 5.79159270093905e-06,
"loss": 0.2686,
"step": 372
},
{
"epoch": 0.4791416446472896,
"grad_norm": 0.4207112491130829,
"learning_rate": 5.749702786257529e-06,
"loss": 0.2797,
"step": 374
},
{
"epoch": 0.4817038994314997,
"grad_norm": 0.4612100124359131,
"learning_rate": 5.707758968959923e-06,
"loss": 0.2665,
"step": 376
},
{
"epoch": 0.48426615421570984,
"grad_norm": 0.471349835395813,
"learning_rate": 5.6657642647499545e-06,
"loss": 0.2753,
"step": 378
},
{
"epoch": 0.4868284089999199,
"grad_norm": 0.4658471643924713,
"learning_rate": 5.62372169299004e-06,
"loss": 0.2445,
"step": 380
},
{
"epoch": 0.48939066378413004,
"grad_norm": 0.48692232370376587,
"learning_rate": 5.581634276484211e-06,
"loss": 0.2933,
"step": 382
},
{
"epoch": 0.4919529185683401,
"grad_norm": 0.44437411427497864,
"learning_rate": 5.539505041260779e-06,
"loss": 0.2502,
"step": 384
},
{
"epoch": 0.49451517335255024,
"grad_norm": 0.4907655119895935,
"learning_rate": 5.497337016354757e-06,
"loss": 0.263,
"step": 386
},
{
"epoch": 0.49707742813676037,
"grad_norm": 0.4633347690105438,
"learning_rate": 5.45513323359009e-06,
"loss": 0.2494,
"step": 388
},
{
"epoch": 0.49963968292097044,
"grad_norm": 0.5105425715446472,
"learning_rate": 5.412896727361663e-06,
"loss": 0.2431,
"step": 390
},
{
"epoch": 0.5022019377051805,
"grad_norm": 0.43711456656455994,
"learning_rate": 5.370630534417133e-06,
"loss": 0.248,
"step": 392
},
{
"epoch": 0.5047641924893906,
"grad_norm": 0.44248372316360474,
"learning_rate": 5.328337693638591e-06,
"loss": 0.2522,
"step": 394
},
{
"epoch": 0.5073264472736008,
"grad_norm": 0.41455918550491333,
"learning_rate": 5.286021245824075e-06,
"loss": 0.2856,
"step": 396
},
{
"epoch": 0.5098887020578109,
"grad_norm": 0.36339160799980164,
"learning_rate": 5.243684233468933e-06,
"loss": 0.2626,
"step": 398
},
{
"epoch": 0.512450956842021,
"grad_norm": 0.4179689288139343,
"learning_rate": 5.201329700547077e-06,
"loss": 0.2738,
"step": 400
},
{
"epoch": 0.515013211626231,
"grad_norm": 0.3756559193134308,
"learning_rate": 5.158960692292122e-06,
"loss": 0.2511,
"step": 402
},
{
"epoch": 0.5175754664104412,
"grad_norm": 0.5741788148880005,
"learning_rate": 5.116580254978447e-06,
"loss": 0.2957,
"step": 404
},
{
"epoch": 0.5201377211946513,
"grad_norm": 0.4136016070842743,
"learning_rate": 5.074191435702155e-06,
"loss": 0.2704,
"step": 406
},
{
"epoch": 0.5226999759788614,
"grad_norm": 0.5152673125267029,
"learning_rate": 5.031797282162007e-06,
"loss": 0.3206,
"step": 408
},
{
"epoch": 0.5252622307630715,
"grad_norm": 0.4879305958747864,
"learning_rate": 4.98940084244029e-06,
"loss": 0.2536,
"step": 410
},
{
"epoch": 0.5278244855472816,
"grad_norm": 0.36677488684654236,
"learning_rate": 4.947005164783661e-06,
"loss": 0.2517,
"step": 412
},
{
"epoch": 0.5303867403314917,
"grad_norm": 0.4830959141254425,
"learning_rate": 4.9046132973839895e-06,
"loss": 0.2751,
"step": 414
},
{
"epoch": 0.5329489951157018,
"grad_norm": 0.39130493998527527,
"learning_rate": 4.862228288159191e-06,
"loss": 0.2583,
"step": 416
},
{
"epoch": 0.535511249899912,
"grad_norm": 0.45581528544425964,
"learning_rate": 4.819853184534085e-06,
"loss": 0.3033,
"step": 418
},
{
"epoch": 0.5380735046841221,
"grad_norm": 0.552720308303833,
"learning_rate": 4.7774910332213005e-06,
"loss": 0.2679,
"step": 420
},
{
"epoch": 0.5406357594683321,
"grad_norm": 0.5465298295021057,
"learning_rate": 4.735144880002199e-06,
"loss": 0.2765,
"step": 422
},
{
"epoch": 0.5431980142525422,
"grad_norm": 0.452952116727829,
"learning_rate": 4.692817769507912e-06,
"loss": 0.2629,
"step": 424
},
{
"epoch": 0.5457602690367523,
"grad_norm": 0.5454785227775574,
"learning_rate": 4.6505127450004216e-06,
"loss": 0.292,
"step": 426
},
{
"epoch": 0.5483225238209625,
"grad_norm": 0.36023062467575073,
"learning_rate": 4.608232848153757e-06,
"loss": 0.2388,
"step": 428
},
{
"epoch": 0.5508847786051726,
"grad_norm": 0.3965865969657898,
"learning_rate": 4.565981118835299e-06,
"loss": 0.2683,
"step": 430
},
{
"epoch": 0.5534470333893826,
"grad_norm": 0.47152435779571533,
"learning_rate": 4.523760594887228e-06,
"loss": 0.265,
"step": 432
},
{
"epoch": 0.5560092881735927,
"grad_norm": 0.5159929394721985,
"learning_rate": 4.481574311908096e-06,
"loss": 0.2823,
"step": 434
},
{
"epoch": 0.5585715429578029,
"grad_norm": 0.371762752532959,
"learning_rate": 4.439425303034576e-06,
"loss": 0.2942,
"step": 436
},
{
"epoch": 0.561133797742013,
"grad_norm": 0.4925728440284729,
"learning_rate": 4.397316598723385e-06,
"loss": 0.2983,
"step": 438
},
{
"epoch": 0.5636960525262231,
"grad_norm": 0.3970510959625244,
"learning_rate": 4.355251226533396e-06,
"loss": 0.2435,
"step": 440
},
{
"epoch": 0.5662583073104331,
"grad_norm": 0.4432925283908844,
"learning_rate": 4.313232210907959e-06,
"loss": 0.2615,
"step": 442
},
{
"epoch": 0.5688205620946433,
"grad_norm": 0.39295539259910583,
"learning_rate": 4.271262572957453e-06,
"loss": 0.2603,
"step": 444
},
{
"epoch": 0.5713828168788534,
"grad_norm": 0.3533722460269928,
"learning_rate": 4.229345330242067e-06,
"loss": 0.246,
"step": 446
},
{
"epoch": 0.5739450716630635,
"grad_norm": 0.4501621127128601,
"learning_rate": 4.187483496554844e-06,
"loss": 0.2679,
"step": 448
},
{
"epoch": 0.5765073264472736,
"grad_norm": 0.4579297602176666,
"learning_rate": 4.145680081704989e-06,
"loss": 0.2616,
"step": 450
},
{
"epoch": 0.5790695812314837,
"grad_norm": 0.43312978744506836,
"learning_rate": 4.103938091301479e-06,
"loss": 0.2534,
"step": 452
},
{
"epoch": 0.5816318360156938,
"grad_norm": 0.45154210925102234,
"learning_rate": 4.062260526536955e-06,
"loss": 0.2909,
"step": 454
},
{
"epoch": 0.5841940907999039,
"grad_norm": 0.34377482533454895,
"learning_rate": 4.0206503839719335e-06,
"loss": 0.261,
"step": 456
},
{
"epoch": 0.586756345584114,
"grad_norm": 0.4153713881969452,
"learning_rate": 3.9791106553193746e-06,
"loss": 0.2669,
"step": 458
},
{
"epoch": 0.5893186003683242,
"grad_norm": 0.5368139743804932,
"learning_rate": 3.937644327229572e-06,
"loss": 0.251,
"step": 460
},
{
"epoch": 0.5918808551525342,
"grad_norm": 0.4761441946029663,
"learning_rate": 3.896254381075416e-06,
"loss": 0.2595,
"step": 462
},
{
"epoch": 0.5944431099367443,
"grad_norm": 0.597135603427887,
"learning_rate": 3.854943792738037e-06,
"loss": 0.2866,
"step": 464
},
{
"epoch": 0.5970053647209544,
"grad_norm": 0.6271767616271973,
"learning_rate": 3.8137155323928526e-06,
"loss": 0.2832,
"step": 466
},
{
"epoch": 0.5995676195051646,
"grad_norm": 0.3820246458053589,
"learning_rate": 3.7725725642960047e-06,
"loss": 0.2548,
"step": 468
},
{
"epoch": 0.6021298742893747,
"grad_norm": 0.5720183849334717,
"learning_rate": 3.7315178465712364e-06,
"loss": 0.2603,
"step": 470
},
{
"epoch": 0.6046921290735847,
"grad_norm": 0.4225583076477051,
"learning_rate": 3.690554330997215e-06,
"loss": 0.2685,
"step": 472
},
{
"epoch": 0.6072543838577948,
"grad_norm": 0.3530130386352539,
"learning_rate": 3.6496849627952875e-06,
"loss": 0.2723,
"step": 474
},
{
"epoch": 0.609816638642005,
"grad_norm": 0.3795667290687561,
"learning_rate": 3.6089126804177373e-06,
"loss": 0.2691,
"step": 476
},
{
"epoch": 0.6123788934262151,
"grad_norm": 0.43652230501174927,
"learning_rate": 3.568240415336509e-06,
"loss": 0.2838,
"step": 478
},
{
"epoch": 0.6149411482104252,
"grad_norm": 0.4311392903327942,
"learning_rate": 3.52767109183244e-06,
"loss": 0.2847,
"step": 480
},
{
"epoch": 0.6175034029946352,
"grad_norm": 0.42163416743278503,
"learning_rate": 3.4872076267850015e-06,
"loss": 0.2488,
"step": 482
},
{
"epoch": 0.6200656577788454,
"grad_norm": 0.4223015308380127,
"learning_rate": 3.4468529294625895e-06,
"loss": 0.2621,
"step": 484
},
{
"epoch": 0.6226279125630555,
"grad_norm": 0.4520999491214752,
"learning_rate": 3.406609901313349e-06,
"loss": 0.2543,
"step": 486
},
{
"epoch": 0.6251901673472656,
"grad_norm": 0.5905027985572815,
"learning_rate": 3.36648143575656e-06,
"loss": 0.271,
"step": 488
},
{
"epoch": 0.6277524221314758,
"grad_norm": 0.5310239195823669,
"learning_rate": 3.326470417974604e-06,
"loss": 0.2794,
"step": 490
},
{
"epoch": 0.6303146769156858,
"grad_norm": 0.43746617436408997,
"learning_rate": 3.2865797247055354e-06,
"loss": 0.2716,
"step": 492
},
{
"epoch": 0.6328769316998959,
"grad_norm": 0.4661629796028137,
"learning_rate": 3.2468122240362287e-06,
"loss": 0.243,
"step": 494
},
{
"epoch": 0.635439186484106,
"grad_norm": 0.44793224334716797,
"learning_rate": 3.2071707751961838e-06,
"loss": 0.2808,
"step": 496
},
{
"epoch": 0.6380014412683161,
"grad_norm": 0.5625908970832825,
"learning_rate": 3.1676582283519454e-06,
"loss": 0.265,
"step": 498
},
{
"epoch": 0.6405636960525263,
"grad_norm": 0.44215095043182373,
"learning_rate": 3.1282774244021717e-06,
"loss": 0.2858,
"step": 500
},
{
"epoch": 0.6405636960525263,
"eval_loss": 0.2639869451522827,
"eval_runtime": 270.7894,
"eval_samples_per_second": 19.421,
"eval_steps_per_second": 2.43,
"step": 500
},
{
"epoch": 0.6431259508367363,
"grad_norm": 0.47866004705429077,
"learning_rate": 3.089031194773392e-06,
"loss": 0.2879,
"step": 502
},
{
"epoch": 0.6456882056209464,
"grad_norm": 0.5291287302970886,
"learning_rate": 3.049922361216422e-06,
"loss": 0.2501,
"step": 504
},
{
"epoch": 0.6482504604051565,
"grad_norm": 0.4798702895641327,
"learning_rate": 3.0109537356034856e-06,
"loss": 0.2691,
"step": 506
},
{
"epoch": 0.6508127151893667,
"grad_norm": 0.7165606617927551,
"learning_rate": 2.9721281197260427e-06,
"loss": 0.3519,
"step": 508
},
{
"epoch": 0.6533749699735768,
"grad_norm": 0.6769598126411438,
"learning_rate": 2.9334483050933506e-06,
"loss": 0.281,
"step": 510
},
{
"epoch": 0.6559372247577868,
"grad_norm": 0.47096380591392517,
"learning_rate": 2.894917072731753e-06,
"loss": 0.2677,
"step": 512
},
{
"epoch": 0.658499479541997,
"grad_norm": 0.6711763739585876,
"learning_rate": 2.8565371929847286e-06,
"loss": 0.2707,
"step": 514
},
{
"epoch": 0.6610617343262071,
"grad_norm": 0.44064444303512573,
"learning_rate": 2.81831142531371e-06,
"loss": 0.2654,
"step": 516
},
{
"epoch": 0.6636239891104172,
"grad_norm": 0.42236313223838806,
"learning_rate": 2.780242518099675e-06,
"loss": 0.2601,
"step": 518
},
{
"epoch": 0.6661862438946272,
"grad_norm": 0.4029591381549835,
"learning_rate": 2.7423332084455543e-06,
"loss": 0.2648,
"step": 520
},
{
"epoch": 0.6687484986788373,
"grad_norm": 0.47852271795272827,
"learning_rate": 2.704586221979422e-06,
"loss": 0.2744,
"step": 522
},
{
"epoch": 0.6713107534630475,
"grad_norm": 0.44856366515159607,
"learning_rate": 2.667004272658541e-06,
"loss": 0.2499,
"step": 524
},
{
"epoch": 0.6738730082472576,
"grad_norm": 0.4645158648490906,
"learning_rate": 2.629590062574221e-06,
"loss": 0.2716,
"step": 526
},
{
"epoch": 0.6764352630314677,
"grad_norm": 0.5160189867019653,
"learning_rate": 2.592346281757552e-06,
"loss": 0.2361,
"step": 528
},
{
"epoch": 0.6789975178156777,
"grad_norm": 0.3944529891014099,
"learning_rate": 2.5552756079859904e-06,
"loss": 0.2476,
"step": 530
},
{
"epoch": 0.6815597725998879,
"grad_norm": 0.5633410811424255,
"learning_rate": 2.5183807065908296e-06,
"loss": 0.2287,
"step": 532
},
{
"epoch": 0.684122027384098,
"grad_norm": 0.3865067958831787,
"learning_rate": 2.4816642302655634e-06,
"loss": 0.2644,
"step": 534
},
{
"epoch": 0.6866842821683081,
"grad_norm": 0.5245662331581116,
"learning_rate": 2.445128818875166e-06,
"loss": 0.2354,
"step": 536
},
{
"epoch": 0.6892465369525183,
"grad_norm": 0.4881504774093628,
"learning_rate": 2.408777099266291e-06,
"loss": 0.2779,
"step": 538
},
{
"epoch": 0.6918087917367283,
"grad_norm": 0.5840505957603455,
"learning_rate": 2.3726116850783987e-06,
"loss": 0.2742,
"step": 540
},
{
"epoch": 0.6943710465209384,
"grad_norm": 0.4902634918689728,
"learning_rate": 2.3366351765558437e-06,
"loss": 0.2818,
"step": 542
},
{
"epoch": 0.6969333013051485,
"grad_norm": 0.4141348600387573,
"learning_rate": 2.3008501603609147e-06,
"loss": 0.2542,
"step": 544
},
{
"epoch": 0.6994955560893586,
"grad_norm": 0.3754000663757324,
"learning_rate": 2.265259209387867e-06,
"loss": 0.2664,
"step": 546
},
{
"epoch": 0.7020578108735688,
"grad_norm": 0.6529264450073242,
"learning_rate": 2.229864882577921e-06,
"loss": 0.2678,
"step": 548
},
{
"epoch": 0.7046200656577788,
"grad_norm": 0.3764033615589142,
"learning_rate": 2.194669724735296e-06,
"loss": 0.2668,
"step": 550
},
{
"epoch": 0.7071823204419889,
"grad_norm": 0.3769323229789734,
"learning_rate": 2.159676266344222e-06,
"loss": 0.2663,
"step": 552
},
{
"epoch": 0.709744575226199,
"grad_norm": 0.3979746103286743,
"learning_rate": 2.124887023387017e-06,
"loss": 0.2666,
"step": 554
},
{
"epoch": 0.7123068300104092,
"grad_norm": 0.4987868070602417,
"learning_rate": 2.0903044971631854e-06,
"loss": 0.2292,
"step": 556
},
{
"epoch": 0.7148690847946193,
"grad_norm": 0.6058522462844849,
"learning_rate": 2.055931174109579e-06,
"loss": 0.2354,
"step": 558
},
{
"epoch": 0.7174313395788293,
"grad_norm": 0.5615466237068176,
"learning_rate": 2.02176952562162e-06,
"loss": 0.2557,
"step": 560
},
{
"epoch": 0.7199935943630394,
"grad_norm": 0.5051982998847961,
"learning_rate": 1.987822007875617e-06,
"loss": 0.2706,
"step": 562
},
{
"epoch": 0.7225558491472496,
"grad_norm": 0.571441650390625,
"learning_rate": 1.954091061652172e-06,
"loss": 0.2815,
"step": 564
},
{
"epoch": 0.7251181039314597,
"grad_norm": 0.5101485252380371,
"learning_rate": 1.920579112160685e-06,
"loss": 0.2314,
"step": 566
},
{
"epoch": 0.7276803587156698,
"grad_norm": 0.4810335040092468,
"learning_rate": 1.8872885688649879e-06,
"loss": 0.2812,
"step": 568
},
{
"epoch": 0.7302426134998798,
"grad_norm": 0.49377724528312683,
"learning_rate": 1.854221825310103e-06,
"loss": 0.2656,
"step": 570
},
{
"epoch": 0.73280486828409,
"grad_norm": 0.5363904237747192,
"learning_rate": 1.8213812589501611e-06,
"loss": 0.265,
"step": 572
},
{
"epoch": 0.7353671230683001,
"grad_norm": 0.5577176213264465,
"learning_rate": 1.78876923097745e-06,
"loss": 0.2652,
"step": 574
},
{
"epoch": 0.7379293778525102,
"grad_norm": 0.44135797023773193,
"learning_rate": 1.7563880861526656e-06,
"loss": 0.2748,
"step": 576
},
{
"epoch": 0.7404916326367204,
"grad_norm": 0.41491812467575073,
"learning_rate": 1.7242401526363095e-06,
"loss": 0.2847,
"step": 578
},
{
"epoch": 0.7430538874209304,
"grad_norm": 0.4843028783798218,
"learning_rate": 1.692327741821312e-06,
"loss": 0.2792,
"step": 580
},
{
"epoch": 0.7456161422051405,
"grad_norm": 0.5842957496643066,
"learning_rate": 1.6606531481668364e-06,
"loss": 0.2784,
"step": 582
},
{
"epoch": 0.7481783969893506,
"grad_norm": 0.572831928730011,
"learning_rate": 1.6292186490333172e-06,
"loss": 0.2862,
"step": 584
},
{
"epoch": 0.7507406517735608,
"grad_norm": 0.5212300419807434,
"learning_rate": 1.5980265045187139e-06,
"loss": 0.2637,
"step": 586
},
{
"epoch": 0.7533029065577709,
"grad_norm": 0.5278065800666809,
"learning_rate": 1.567078957296016e-06,
"loss": 0.2617,
"step": 588
},
{
"epoch": 0.7558651613419809,
"grad_norm": 0.5063283443450928,
"learning_rate": 1.5363782324520033e-06,
"loss": 0.2569,
"step": 590
},
{
"epoch": 0.758427416126191,
"grad_norm": 0.40898391604423523,
"learning_rate": 1.5059265373272574e-06,
"loss": 0.2558,
"step": 592
},
{
"epoch": 0.7609896709104012,
"grad_norm": 0.5030636191368103,
"learning_rate": 1.475726061357463e-06,
"loss": 0.2547,
"step": 594
},
{
"epoch": 0.7635519256946113,
"grad_norm": 0.5822692513465881,
"learning_rate": 1.4457789759159813e-06,
"loss": 0.2266,
"step": 596
},
{
"epoch": 0.7661141804788214,
"grad_norm": 0.5503767132759094,
"learning_rate": 1.4160874341577447e-06,
"loss": 0.269,
"step": 598
},
{
"epoch": 0.7686764352630314,
"grad_norm": 0.4649931788444519,
"learning_rate": 1.3866535708644335e-06,
"loss": 0.2536,
"step": 600
},
{
"epoch": 0.7712386900472415,
"grad_norm": 0.6687978506088257,
"learning_rate": 1.3574795022910014e-06,
"loss": 0.3012,
"step": 602
},
{
"epoch": 0.7738009448314517,
"grad_norm": 0.5331063866615295,
"learning_rate": 1.3285673260135073e-06,
"loss": 0.2453,
"step": 604
},
{
"epoch": 0.7763631996156618,
"grad_norm": 0.46101680397987366,
"learning_rate": 1.2999191207783129e-06,
"loss": 0.2308,
"step": 606
},
{
"epoch": 0.7789254543998719,
"grad_norm": 0.4032719135284424,
"learning_rate": 1.2715369463526173e-06,
"loss": 0.2534,
"step": 608
},
{
"epoch": 0.781487709184082,
"grad_norm": 0.7435618042945862,
"learning_rate": 1.2434228433763657e-06,
"loss": 0.2331,
"step": 610
},
{
"epoch": 0.7840499639682921,
"grad_norm": 0.6071492433547974,
"learning_rate": 1.215578833215526e-06,
"loss": 0.2695,
"step": 612
},
{
"epoch": 0.7866122187525022,
"grad_norm": 0.4534173011779785,
"learning_rate": 1.1880069178167586e-06,
"loss": 0.2654,
"step": 614
},
{
"epoch": 0.7891744735367123,
"grad_norm": 0.48930707573890686,
"learning_rate": 1.1607090795634802e-06,
"loss": 0.2597,
"step": 616
},
{
"epoch": 0.7917367283209225,
"grad_norm": 0.43963509798049927,
"learning_rate": 1.133687281133331e-06,
"loss": 0.2454,
"step": 618
},
{
"epoch": 0.7942989831051325,
"grad_norm": 0.45418596267700195,
"learning_rate": 1.1069434653570633e-06,
"loss": 0.2541,
"step": 620
},
{
"epoch": 0.7968612378893426,
"grad_norm": 0.41048523783683777,
"learning_rate": 1.0804795550788473e-06,
"loss": 0.2743,
"step": 622
},
{
"epoch": 0.7994234926735527,
"grad_norm": 0.516132116317749,
"learning_rate": 1.0542974530180327e-06,
"loss": 0.2736,
"step": 624
},
{
"epoch": 0.8019857474577629,
"grad_norm": 0.412601113319397,
"learning_rate": 1.0283990416323336e-06,
"loss": 0.2503,
"step": 626
},
{
"epoch": 0.804548002241973,
"grad_norm": 0.5029380917549133,
"learning_rate": 1.0027861829824953e-06,
"loss": 0.232,
"step": 628
},
{
"epoch": 0.807110257026183,
"grad_norm": 0.4999438226222992,
"learning_rate": 9.774607185984004e-07,
"loss": 0.2549,
"step": 630
},
{
"epoch": 0.8096725118103931,
"grad_norm": 0.44878801703453064,
"learning_rate": 9.524244693466773e-07,
"loss": 0.2355,
"step": 632
},
{
"epoch": 0.8122347665946033,
"grad_norm": 0.4290701150894165,
"learning_rate": 9.276792352997782e-07,
"loss": 0.2579,
"step": 634
},
{
"epoch": 0.8147970213788134,
"grad_norm": 0.5716743469238281,
"learning_rate": 9.032267956065516e-07,
"loss": 0.2833,
"step": 636
},
{
"epoch": 0.8173592761630235,
"grad_norm": 0.4765143394470215,
"learning_rate": 8.790689083643328e-07,
"loss": 0.2473,
"step": 638
},
{
"epoch": 0.8199215309472335,
"grad_norm": 0.4390144646167755,
"learning_rate": 8.552073104925296e-07,
"loss": 0.2711,
"step": 640
},
{
"epoch": 0.8224837857314437,
"grad_norm": 0.5272576808929443,
"learning_rate": 8.316437176077491e-07,
"loss": 0.2749,
"step": 642
},
{
"epoch": 0.8250460405156538,
"grad_norm": 0.44547039270401,
"learning_rate": 8.083798239004408e-07,
"loss": 0.259,
"step": 644
},
{
"epoch": 0.8276082952998639,
"grad_norm": 0.578179121017456,
"learning_rate": 7.854173020130906e-07,
"loss": 0.2946,
"step": 646
},
{
"epoch": 0.830170550084074,
"grad_norm": 0.4996013641357422,
"learning_rate": 7.627578029199562e-07,
"loss": 0.2573,
"step": 648
},
{
"epoch": 0.832732804868284,
"grad_norm": 0.5044499039649963,
"learning_rate": 7.404029558083653e-07,
"loss": 0.2461,
"step": 650
},
{
"epoch": 0.8352950596524942,
"grad_norm": 0.42843055725097656,
"learning_rate": 7.183543679615834e-07,
"loss": 0.2578,
"step": 652
},
{
"epoch": 0.8378573144367043,
"grad_norm": 0.5041942596435547,
"learning_rate": 6.966136246432492e-07,
"loss": 0.2412,
"step": 654
},
{
"epoch": 0.8404195692209144,
"grad_norm": 0.454973042011261,
"learning_rate": 6.751822889833926e-07,
"loss": 0.265,
"step": 656
},
{
"epoch": 0.8429818240051246,
"grad_norm": 0.4820737838745117,
"learning_rate": 6.540619018660555e-07,
"loss": 0.226,
"step": 658
},
{
"epoch": 0.8455440787893346,
"grad_norm": 0.5445938110351562,
"learning_rate": 6.332539818184985e-07,
"loss": 0.2501,
"step": 660
},
{
"epoch": 0.8481063335735447,
"grad_norm": 0.5699609518051147,
"learning_rate": 6.127600249020216e-07,
"loss": 0.2747,
"step": 662
},
{
"epoch": 0.8506685883577548,
"grad_norm": 0.46571552753448486,
"learning_rate": 5.925815046044026e-07,
"loss": 0.2612,
"step": 664
},
{
"epoch": 0.853230843141965,
"grad_norm": 0.47914472222328186,
"learning_rate": 5.727198717339511e-07,
"loss": 0.2574,
"step": 666
},
{
"epoch": 0.8557930979261751,
"grad_norm": 0.40852856636047363,
"learning_rate": 5.531765543152002e-07,
"loss": 0.2734,
"step": 668
},
{
"epoch": 0.8583553527103851,
"grad_norm": 0.3702560067176819,
"learning_rate": 5.33952957486234e-07,
"loss": 0.2539,
"step": 670
},
{
"epoch": 0.8609176074945952,
"grad_norm": 0.5180298686027527,
"learning_rate": 5.150504633976572e-07,
"loss": 0.3682,
"step": 672
},
{
"epoch": 0.8634798622788054,
"grad_norm": 0.7016831040382385,
"learning_rate": 4.964704311132224e-07,
"loss": 0.2265,
"step": 674
},
{
"epoch": 0.8660421170630155,
"grad_norm": 0.5376434922218323,
"learning_rate": 4.782141965121129e-07,
"loss": 0.2676,
"step": 676
},
{
"epoch": 0.8686043718472256,
"grad_norm": 0.47063949704170227,
"learning_rate": 4.602830721928997e-07,
"loss": 0.2606,
"step": 678
},
{
"epoch": 0.8711666266314356,
"grad_norm": 0.4991367757320404,
"learning_rate": 4.4267834737916295e-07,
"loss": 0.2414,
"step": 680
},
{
"epoch": 0.8737288814156458,
"grad_norm": 0.4373914301395416,
"learning_rate": 4.2540128782679934e-07,
"loss": 0.2287,
"step": 682
},
{
"epoch": 0.8762911361998559,
"grad_norm": 0.39528214931488037,
"learning_rate": 4.0845313573301736e-07,
"loss": 0.2404,
"step": 684
},
{
"epoch": 0.878853390984066,
"grad_norm": 0.5945621132850647,
"learning_rate": 3.9183510964702463e-07,
"loss": 0.2719,
"step": 686
},
{
"epoch": 0.8814156457682761,
"grad_norm": 0.6032932996749878,
"learning_rate": 3.755484043824131e-07,
"loss": 0.2608,
"step": 688
},
{
"epoch": 0.8839779005524862,
"grad_norm": 0.49754688143730164,
"learning_rate": 3.595941909312595e-07,
"loss": 0.2852,
"step": 690
},
{
"epoch": 0.8865401553366963,
"grad_norm": 0.49544405937194824,
"learning_rate": 3.439736163799251e-07,
"loss": 0.2693,
"step": 692
},
{
"epoch": 0.8891024101209064,
"grad_norm": 0.4462824761867523,
"learning_rate": 3.2868780382658895e-07,
"loss": 0.2443,
"step": 694
},
{
"epoch": 0.8916646649051165,
"grad_norm": 0.4302297532558441,
"learning_rate": 3.1373785230049356e-07,
"loss": 0.2515,
"step": 696
},
{
"epoch": 0.8942269196893267,
"grad_norm": 0.4883180856704712,
"learning_rate": 2.991248366829291e-07,
"loss": 0.2757,
"step": 698
},
{
"epoch": 0.8967891744735367,
"grad_norm": 0.7474163174629211,
"learning_rate": 2.848498076299483e-07,
"loss": 0.2963,
"step": 700
},
{
"epoch": 0.8993514292577468,
"grad_norm": 0.4648323059082031,
"learning_rate": 2.7091379149682683e-07,
"loss": 0.2361,
"step": 702
},
{
"epoch": 0.9019136840419569,
"grad_norm": 0.4341067373752594,
"learning_rate": 2.573177902642726e-07,
"loss": 0.2555,
"step": 704
},
{
"epoch": 0.9044759388261671,
"grad_norm": 0.47577670216560364,
"learning_rate": 2.440627814663804e-07,
"loss": 0.2772,
"step": 706
},
{
"epoch": 0.9070381936103772,
"grad_norm": 0.47802722454071045,
"learning_rate": 2.3114971812034981e-07,
"loss": 0.2544,
"step": 708
},
{
"epoch": 0.9096004483945872,
"grad_norm": 0.4585348665714264,
"learning_rate": 2.1857952865796616e-07,
"loss": 0.2424,
"step": 710
},
{
"epoch": 0.9121627031787973,
"grad_norm": 0.4453139305114746,
"learning_rate": 2.0635311685884675e-07,
"loss": 0.2424,
"step": 712
},
{
"epoch": 0.9147249579630075,
"grad_norm": 0.4780106544494629,
"learning_rate": 1.9447136178545766e-07,
"loss": 0.2475,
"step": 714
},
{
"epoch": 0.9172872127472176,
"grad_norm": 0.47332102060317993,
"learning_rate": 1.8293511771991624e-07,
"loss": 0.2414,
"step": 716
},
{
"epoch": 0.9198494675314277,
"grad_norm": 0.5608975887298584,
"learning_rate": 1.7174521410256162e-07,
"loss": 0.2733,
"step": 718
},
{
"epoch": 0.9224117223156377,
"grad_norm": 0.611322283744812,
"learning_rate": 1.6090245547232707e-07,
"loss": 0.2195,
"step": 720
},
{
"epoch": 0.9249739770998479,
"grad_norm": 0.37321174144744873,
"learning_rate": 1.5040762140888843e-07,
"loss": 0.2496,
"step": 722
},
{
"epoch": 0.927536231884058,
"grad_norm": 0.394593745470047,
"learning_rate": 1.402614664766172e-07,
"loss": 0.2521,
"step": 724
},
{
"epoch": 0.9300984866682681,
"grad_norm": 0.5954830646514893,
"learning_rate": 1.3046472017032685e-07,
"loss": 0.2742,
"step": 726
},
{
"epoch": 0.9326607414524781,
"grad_norm": 0.3724110722541809,
"learning_rate": 1.210180868628219e-07,
"loss": 0.2359,
"step": 728
},
{
"epoch": 0.9352229962366883,
"grad_norm": 0.42592036724090576,
"learning_rate": 1.1192224575425848e-07,
"loss": 0.2428,
"step": 730
},
{
"epoch": 0.9377852510208984,
"grad_norm": 0.481985479593277,
"learning_rate": 1.0317785082330555e-07,
"loss": 0.2982,
"step": 732
},
{
"epoch": 0.9403475058051085,
"grad_norm": 0.5069997906684875,
"learning_rate": 9.478553078013042e-08,
"loss": 0.2659,
"step": 734
},
{
"epoch": 0.9429097605893186,
"grad_norm": 0.41395503282546997,
"learning_rate": 8.674588902118919e-08,
"loss": 0.2429,
"step": 736
},
{
"epoch": 0.9454720153735287,
"grad_norm": 0.5499728322029114,
"learning_rate": 7.905950358584768e-08,
"loss": 0.2665,
"step": 738
},
{
"epoch": 0.9480342701577388,
"grad_norm": 0.5084072351455688,
"learning_rate": 7.172692711482022e-08,
"loss": 0.2768,
"step": 740
},
{
"epoch": 0.9505965249419489,
"grad_norm": 0.6337217688560486,
"learning_rate": 6.474868681043578e-08,
"loss": 0.264,
"step": 742
},
{
"epoch": 0.953158779726159,
"grad_norm": 0.47787654399871826,
"learning_rate": 5.8125284398730666e-08,
"loss": 0.2686,
"step": 744
},
{
"epoch": 0.9557210345103692,
"grad_norm": 0.46294164657592773,
"learning_rate": 5.185719609337836e-08,
"loss": 0.2709,
"step": 746
},
{
"epoch": 0.9582832892945792,
"grad_norm": 0.3983994126319885,
"learning_rate": 4.5944872561448084e-08,
"loss": 0.2542,
"step": 748
},
{
"epoch": 0.9608455440787893,
"grad_norm": 0.5090007781982422,
"learning_rate": 4.038873889100237e-08,
"loss": 0.2613,
"step": 750
},
{
"epoch": 0.9634077988629994,
"grad_norm": 0.6989894509315491,
"learning_rate": 3.518919456053649e-08,
"loss": 0.2394,
"step": 752
},
{
"epoch": 0.9659700536472096,
"grad_norm": 0.5098798871040344,
"learning_rate": 3.034661341025258e-08,
"loss": 0.2581,
"step": 754
},
{
"epoch": 0.9685323084314197,
"grad_norm": 0.49127092957496643,
"learning_rate": 2.5861343615184997e-08,
"loss": 0.2271,
"step": 756
},
{
"epoch": 0.9710945632156297,
"grad_norm": 0.42872855067253113,
"learning_rate": 2.173370766016314e-08,
"loss": 0.2569,
"step": 758
},
{
"epoch": 0.9736568179998398,
"grad_norm": 0.5179227590560913,
"learning_rate": 1.7964002316628316e-08,
"loss": 0.2569,
"step": 760
},
{
"epoch": 0.97621907278405,
"grad_norm": 0.6316475868225098,
"learning_rate": 1.4552498621295264e-08,
"loss": 0.2667,
"step": 762
},
{
"epoch": 0.9787813275682601,
"grad_norm": 0.4547966718673706,
"learning_rate": 1.1499441856663296e-08,
"loss": 0.2743,
"step": 764
},
{
"epoch": 0.9813435823524702,
"grad_norm": 0.3960263431072235,
"learning_rate": 8.805051533384846e-09,
"loss": 0.2321,
"step": 766
},
{
"epoch": 0.9839058371366802,
"grad_norm": 0.4860779345035553,
"learning_rate": 6.469521374477539e-09,
"loss": 0.2375,
"step": 768
},
{
"epoch": 0.9864680919208904,
"grad_norm": 0.4424307346343994,
"learning_rate": 4.493019301401447e-09,
"loss": 0.2373,
"step": 770
},
{
"epoch": 0.9890303467051005,
"grad_norm": 0.6010534167289734,
"learning_rate": 2.875687421980966e-09,
"loss": 0.2703,
"step": 772
},
{
"epoch": 0.9915926014893106,
"grad_norm": 0.4469178318977356,
"learning_rate": 1.6176420201902132e-09,
"loss": 0.2426,
"step": 774
},
{
"epoch": 0.9941548562735207,
"grad_norm": 0.3937220573425293,
"learning_rate": 7.189735477913795e-10,
"loss": 0.2662,
"step": 776
},
{
"epoch": 0.9967171110577308,
"grad_norm": 0.44991588592529297,
"learning_rate": 1.797466178327101e-10,
"loss": 0.242,
"step": 778
},
{
"epoch": 0.9992793658419409,
"grad_norm": 0.7540487051010132,
"learning_rate": 0.0,
"loss": 0.308,
"step": 780
},
{
"epoch": 0.9992793658419409,
"step": 780,
"total_flos": 2.0573294793064448e+18,
"train_loss": 0.29625688539101525,
"train_runtime": 10334.3449,
"train_samples_per_second": 9.668,
"train_steps_per_second": 0.075
}
],
"logging_steps": 2,
"max_steps": 780,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 400,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.0573294793064448e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}