|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9992793658419409, |
|
"eval_steps": 500, |
|
"global_step": 780, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0025622547842101048, |
|
"grad_norm": 4.871487140655518, |
|
"learning_rate": 5.128205128205128e-07, |
|
"loss": 0.7593, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.0051245095684202095, |
|
"grad_norm": 4.878956317901611, |
|
"learning_rate": 1.0256410256410257e-06, |
|
"loss": 0.8076, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.007686764352630314, |
|
"grad_norm": 4.183067321777344, |
|
"learning_rate": 1.5384615384615387e-06, |
|
"loss": 0.7059, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.010249019136840419, |
|
"grad_norm": 4.679640769958496, |
|
"learning_rate": 2.0512820512820513e-06, |
|
"loss": 0.715, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.012811273921050524, |
|
"grad_norm": 4.590462684631348, |
|
"learning_rate": 2.564102564102564e-06, |
|
"loss": 0.7175, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.015373528705260629, |
|
"grad_norm": 4.5435638427734375, |
|
"learning_rate": 3.0769230769230774e-06, |
|
"loss": 0.6976, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.017935783489470735, |
|
"grad_norm": 4.649476051330566, |
|
"learning_rate": 3.58974358974359e-06, |
|
"loss": 0.7615, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.020498038273680838, |
|
"grad_norm": 3.847956418991089, |
|
"learning_rate": 4.102564102564103e-06, |
|
"loss": 0.6735, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.023060293057890945, |
|
"grad_norm": 4.92044734954834, |
|
"learning_rate": 4.615384615384616e-06, |
|
"loss": 0.6885, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.025622547842101048, |
|
"grad_norm": 4.817592620849609, |
|
"learning_rate": 5.128205128205128e-06, |
|
"loss": 0.768, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.028184802626311154, |
|
"grad_norm": 3.8268470764160156, |
|
"learning_rate": 5.641025641025641e-06, |
|
"loss": 0.7092, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.030747057410521257, |
|
"grad_norm": 4.103015899658203, |
|
"learning_rate": 6.153846153846155e-06, |
|
"loss": 0.6764, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.033309312194731364, |
|
"grad_norm": 4.125541687011719, |
|
"learning_rate": 6.666666666666667e-06, |
|
"loss": 0.6386, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.03587156697894147, |
|
"grad_norm": 4.19417142868042, |
|
"learning_rate": 7.17948717948718e-06, |
|
"loss": 0.6218, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.03843382176315158, |
|
"grad_norm": 3.5323216915130615, |
|
"learning_rate": 7.692307692307694e-06, |
|
"loss": 0.5853, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.040996076547361676, |
|
"grad_norm": 3.1467161178588867, |
|
"learning_rate": 8.205128205128205e-06, |
|
"loss": 0.5441, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.04355833133157178, |
|
"grad_norm": 3.0700371265411377, |
|
"learning_rate": 8.717948717948719e-06, |
|
"loss": 0.5458, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.04612058611578189, |
|
"grad_norm": 2.553177833557129, |
|
"learning_rate": 9.230769230769232e-06, |
|
"loss": 0.4589, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.048682840899991996, |
|
"grad_norm": 2.5199780464172363, |
|
"learning_rate": 9.743589743589744e-06, |
|
"loss": 0.4386, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.051245095684202095, |
|
"grad_norm": 1.7382951974868774, |
|
"learning_rate": 9.99995506314361e-06, |
|
"loss": 0.4215, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.0538073504684122, |
|
"grad_norm": 0.9749733209609985, |
|
"learning_rate": 9.999595573138845e-06, |
|
"loss": 0.3888, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.05636960525262231, |
|
"grad_norm": 0.9746177196502686, |
|
"learning_rate": 9.99887661897616e-06, |
|
"loss": 0.3749, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.058931860036832415, |
|
"grad_norm": 0.7504925727844238, |
|
"learning_rate": 9.997798252347382e-06, |
|
"loss": 0.3543, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.061494114821042514, |
|
"grad_norm": 0.7129773497581482, |
|
"learning_rate": 9.996360550785619e-06, |
|
"loss": 0.3565, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.06405636960525263, |
|
"grad_norm": 0.6482123732566833, |
|
"learning_rate": 9.994563617659665e-06, |
|
"loss": 0.3242, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.06661862438946273, |
|
"grad_norm": 0.5225902199745178, |
|
"learning_rate": 9.992407582166582e-06, |
|
"loss": 0.3334, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.06918087917367283, |
|
"grad_norm": 0.5128389596939087, |
|
"learning_rate": 9.989892599322404e-06, |
|
"loss": 0.3741, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.07174313395788294, |
|
"grad_norm": 0.3568147122859955, |
|
"learning_rate": 9.987018849950996e-06, |
|
"loss": 0.3331, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.07430538874209304, |
|
"grad_norm": 0.5258967280387878, |
|
"learning_rate": 9.983786540671052e-06, |
|
"loss": 0.3335, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.07686764352630315, |
|
"grad_norm": 0.5380641222000122, |
|
"learning_rate": 9.980195903881231e-06, |
|
"loss": 0.3344, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.07942989831051325, |
|
"grad_norm": 0.3320980668067932, |
|
"learning_rate": 9.976247197743465e-06, |
|
"loss": 0.3055, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.08199215309472335, |
|
"grad_norm": 0.3006751835346222, |
|
"learning_rate": 9.97194070616438e-06, |
|
"loss": 0.3187, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.08455440787893347, |
|
"grad_norm": 0.26902303099632263, |
|
"learning_rate": 9.967276738774897e-06, |
|
"loss": 0.2998, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.08711666266314357, |
|
"grad_norm": 0.2527882158756256, |
|
"learning_rate": 9.962255630907964e-06, |
|
"loss": 0.3251, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.08967891744735366, |
|
"grad_norm": 0.24817310273647308, |
|
"learning_rate": 9.956877743574437e-06, |
|
"loss": 0.317, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.09224117223156378, |
|
"grad_norm": 0.25589698553085327, |
|
"learning_rate": 9.951143463437145e-06, |
|
"loss": 0.31, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.09480342701577388, |
|
"grad_norm": 0.23160037398338318, |
|
"learning_rate": 9.94505320278307e-06, |
|
"loss": 0.3158, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.09736568179998399, |
|
"grad_norm": 0.26378345489501953, |
|
"learning_rate": 9.938607399493714e-06, |
|
"loss": 0.3163, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.09992793658419409, |
|
"grad_norm": 0.21700599789619446, |
|
"learning_rate": 9.931806517013612e-06, |
|
"loss": 0.3132, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.10249019136840419, |
|
"grad_norm": 0.29619359970092773, |
|
"learning_rate": 9.924651044317017e-06, |
|
"loss": 0.315, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.1050524461526143, |
|
"grad_norm": 0.33124956488609314, |
|
"learning_rate": 9.917141495872733e-06, |
|
"loss": 0.2851, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.1076147009368244, |
|
"grad_norm": 0.202985018491745, |
|
"learning_rate": 9.909278411607134e-06, |
|
"loss": 0.3036, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.11017695572103452, |
|
"grad_norm": 0.24660180509090424, |
|
"learning_rate": 9.90106235686534e-06, |
|
"loss": 0.3374, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.11273921050524462, |
|
"grad_norm": 0.2552855312824249, |
|
"learning_rate": 9.892493922370575e-06, |
|
"loss": 0.316, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.11530146528945472, |
|
"grad_norm": 0.29509437084198, |
|
"learning_rate": 9.883573724181683e-06, |
|
"loss": 0.3284, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.11786372007366483, |
|
"grad_norm": 0.4660441279411316, |
|
"learning_rate": 9.87430240364885e-06, |
|
"loss": 0.3266, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.12042597485787493, |
|
"grad_norm": 0.31215617060661316, |
|
"learning_rate": 9.864680627367476e-06, |
|
"loss": 0.3104, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.12298822964208503, |
|
"grad_norm": 0.6009628176689148, |
|
"learning_rate": 9.854709087130261e-06, |
|
"loss": 0.3221, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.12555048442629513, |
|
"grad_norm": 0.30782508850097656, |
|
"learning_rate": 9.844388499877457e-06, |
|
"loss": 0.2913, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.12811273921050526, |
|
"grad_norm": 0.3109281659126282, |
|
"learning_rate": 9.833719607645325e-06, |
|
"loss": 0.2974, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.13067499399471535, |
|
"grad_norm": 0.2832891345024109, |
|
"learning_rate": 9.822703177512783e-06, |
|
"loss": 0.3698, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.13323724877892545, |
|
"grad_norm": 0.2483058124780655, |
|
"learning_rate": 9.811340001546252e-06, |
|
"loss": 0.2995, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.13579950356313555, |
|
"grad_norm": 0.2657499611377716, |
|
"learning_rate": 9.799630896742716e-06, |
|
"loss": 0.304, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.13836175834734565, |
|
"grad_norm": 0.4393406808376312, |
|
"learning_rate": 9.787576704970965e-06, |
|
"loss": 0.3032, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.14092401313155578, |
|
"grad_norm": 0.3503418266773224, |
|
"learning_rate": 9.77517829291108e-06, |
|
"loss": 0.2915, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.14348626791576588, |
|
"grad_norm": 0.28331002593040466, |
|
"learning_rate": 9.762436551992117e-06, |
|
"loss": 0.3192, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.14604852269997598, |
|
"grad_norm": 0.28841540217399597, |
|
"learning_rate": 9.74935239832801e-06, |
|
"loss": 0.2999, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.14861077748418608, |
|
"grad_norm": 0.3377619683742523, |
|
"learning_rate": 9.735926772651703e-06, |
|
"loss": 0.31, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.15117303226839618, |
|
"grad_norm": 0.358359158039093, |
|
"learning_rate": 9.722160640247523e-06, |
|
"loss": 0.292, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.1537352870526063, |
|
"grad_norm": 0.24913446605205536, |
|
"learning_rate": 9.708054990881763e-06, |
|
"loss": 0.3077, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.1562975418368164, |
|
"grad_norm": 0.34343913197517395, |
|
"learning_rate": 9.693610838731532e-06, |
|
"loss": 0.3132, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.1588597966210265, |
|
"grad_norm": 0.2604675590991974, |
|
"learning_rate": 9.678829222311827e-06, |
|
"loss": 0.3139, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.1614220514052366, |
|
"grad_norm": 0.30086928606033325, |
|
"learning_rate": 9.663711204400872e-06, |
|
"loss": 0.2992, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.1639843061894467, |
|
"grad_norm": 0.3241061270236969, |
|
"learning_rate": 9.6482578719637e-06, |
|
"loss": 0.3066, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.1665465609736568, |
|
"grad_norm": 0.26830941438674927, |
|
"learning_rate": 9.632470336074009e-06, |
|
"loss": 0.3121, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.16910881575786693, |
|
"grad_norm": 0.203893780708313, |
|
"learning_rate": 9.616349731834271e-06, |
|
"loss": 0.2991, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.17167107054207703, |
|
"grad_norm": 0.3675401210784912, |
|
"learning_rate": 9.599897218294122e-06, |
|
"loss": 0.3227, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.17423332532628713, |
|
"grad_norm": 0.21370336413383484, |
|
"learning_rate": 9.583113978367026e-06, |
|
"loss": 0.3025, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.17679558011049723, |
|
"grad_norm": 0.25403013825416565, |
|
"learning_rate": 9.56600121874523e-06, |
|
"loss": 0.3179, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.17935783489470733, |
|
"grad_norm": 0.3012063503265381, |
|
"learning_rate": 9.548560169812997e-06, |
|
"loss": 0.2906, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.18192008967891746, |
|
"grad_norm": 0.24593935906887054, |
|
"learning_rate": 9.530792085558151e-06, |
|
"loss": 0.2968, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.18448234446312756, |
|
"grad_norm": 0.267528235912323, |
|
"learning_rate": 9.512698243481914e-06, |
|
"loss": 0.3076, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.18704459924733766, |
|
"grad_norm": 0.4075755774974823, |
|
"learning_rate": 9.49427994450705e-06, |
|
"loss": 0.292, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.18960685403154776, |
|
"grad_norm": 0.37276849150657654, |
|
"learning_rate": 9.47553851288434e-06, |
|
"loss": 0.3337, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.19216910881575786, |
|
"grad_norm": 0.34166908264160156, |
|
"learning_rate": 9.45647529609736e-06, |
|
"loss": 0.2854, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.19473136359996798, |
|
"grad_norm": 0.3679031431674957, |
|
"learning_rate": 9.437091664765611e-06, |
|
"loss": 0.328, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.19729361838417808, |
|
"grad_norm": 0.2564798891544342, |
|
"learning_rate": 9.41738901254596e-06, |
|
"loss": 0.2831, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.19985587316838818, |
|
"grad_norm": 0.39898496866226196, |
|
"learning_rate": 9.397368756032445e-06, |
|
"loss": 0.2899, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.20241812795259828, |
|
"grad_norm": 0.2926347255706787, |
|
"learning_rate": 9.37703233465443e-06, |
|
"loss": 0.2796, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.20498038273680838, |
|
"grad_norm": 0.3333691656589508, |
|
"learning_rate": 9.356381210573092e-06, |
|
"loss": 0.2965, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.2075426375210185, |
|
"grad_norm": 0.2890892028808594, |
|
"learning_rate": 9.33541686857632e-06, |
|
"loss": 0.2884, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.2101048923052286, |
|
"grad_norm": 0.27766191959381104, |
|
"learning_rate": 9.31414081597194e-06, |
|
"loss": 0.297, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.2126671470894387, |
|
"grad_norm": 0.3218678832054138, |
|
"learning_rate": 9.292554582479349e-06, |
|
"loss": 0.2862, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.2152294018736488, |
|
"grad_norm": 0.3139230012893677, |
|
"learning_rate": 9.270659720119533e-06, |
|
"loss": 0.2958, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.2177916566578589, |
|
"grad_norm": 0.2383907586336136, |
|
"learning_rate": 9.248457803103476e-06, |
|
"loss": 0.2988, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.22035391144206903, |
|
"grad_norm": 0.32504117488861084, |
|
"learning_rate": 9.225950427718974e-06, |
|
"loss": 0.2803, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.22291616622627913, |
|
"grad_norm": 0.2564990818500519, |
|
"learning_rate": 9.203139212215868e-06, |
|
"loss": 0.2957, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.22547842101048923, |
|
"grad_norm": 0.282103568315506, |
|
"learning_rate": 9.180025796689692e-06, |
|
"loss": 0.2933, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.22804067579469933, |
|
"grad_norm": 0.3701488971710205, |
|
"learning_rate": 9.156611842963753e-06, |
|
"loss": 0.2926, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.23060293057890943, |
|
"grad_norm": 0.3153334856033325, |
|
"learning_rate": 9.132899034469648e-06, |
|
"loss": 0.3111, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.23316518536311953, |
|
"grad_norm": 0.31320127844810486, |
|
"learning_rate": 9.108889076126226e-06, |
|
"loss": 0.2948, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.23572744014732966, |
|
"grad_norm": 0.3407798409461975, |
|
"learning_rate": 9.084583694217012e-06, |
|
"loss": 0.2872, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.23828969493153976, |
|
"grad_norm": 0.5061964988708496, |
|
"learning_rate": 9.059984636266082e-06, |
|
"loss": 0.3073, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.24085194971574986, |
|
"grad_norm": 0.342929482460022, |
|
"learning_rate": 9.035093670912424e-06, |
|
"loss": 0.2866, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.24341420449995996, |
|
"grad_norm": 0.38549765944480896, |
|
"learning_rate": 9.009912587782772e-06, |
|
"loss": 0.298, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.24597645928417006, |
|
"grad_norm": 0.339372843503952, |
|
"learning_rate": 8.984443197362938e-06, |
|
"loss": 0.2644, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.24853871406838018, |
|
"grad_norm": 0.26157572865486145, |
|
"learning_rate": 8.958687330867634e-06, |
|
"loss": 0.2986, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.25110096885259026, |
|
"grad_norm": 0.307921826839447, |
|
"learning_rate": 8.932646840108818e-06, |
|
"loss": 0.2826, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.2536632236368004, |
|
"grad_norm": 0.48844948410987854, |
|
"learning_rate": 8.906323597362547e-06, |
|
"loss": 0.2824, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.2562254784210105, |
|
"grad_norm": 0.3046979308128357, |
|
"learning_rate": 8.879719495234363e-06, |
|
"loss": 0.2836, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.2587877332052206, |
|
"grad_norm": 0.37873372435569763, |
|
"learning_rate": 8.852836446523213e-06, |
|
"loss": 0.2799, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.2613499879894307, |
|
"grad_norm": 0.5752015709877014, |
|
"learning_rate": 8.825676384083936e-06, |
|
"loss": 0.3027, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.2639122427736408, |
|
"grad_norm": 0.403952956199646, |
|
"learning_rate": 8.798241260688273e-06, |
|
"loss": 0.3032, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.2664744975578509, |
|
"grad_norm": 0.36202457547187805, |
|
"learning_rate": 8.770533048884483e-06, |
|
"loss": 0.3044, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.26903675234206104, |
|
"grad_norm": 0.34956708550453186, |
|
"learning_rate": 8.742553740855507e-06, |
|
"loss": 0.2784, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.2715990071262711, |
|
"grad_norm": 0.44058695435523987, |
|
"learning_rate": 8.71430534827574e-06, |
|
"loss": 0.3142, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.27416126191048124, |
|
"grad_norm": 0.3903171420097351, |
|
"learning_rate": 8.685789902166395e-06, |
|
"loss": 0.2592, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.2767235166946913, |
|
"grad_norm": 0.34790173172950745, |
|
"learning_rate": 8.657009452749466e-06, |
|
"loss": 0.2881, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.27928577147890143, |
|
"grad_norm": 0.3779347240924835, |
|
"learning_rate": 8.627966069300332e-06, |
|
"loss": 0.3017, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.28184802626311156, |
|
"grad_norm": 0.40141528844833374, |
|
"learning_rate": 8.598661839998972e-06, |
|
"loss": 0.2781, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.28441028104732163, |
|
"grad_norm": 0.30786147713661194, |
|
"learning_rate": 8.569098871779828e-06, |
|
"loss": 0.296, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.28697253583153176, |
|
"grad_norm": 0.2742227017879486, |
|
"learning_rate": 8.539279290180315e-06, |
|
"loss": 0.3161, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.28953479061574183, |
|
"grad_norm": 0.5068826675415039, |
|
"learning_rate": 8.509205239188017e-06, |
|
"loss": 0.2948, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.29209704539995196, |
|
"grad_norm": 0.3508552610874176, |
|
"learning_rate": 8.478878881086505e-06, |
|
"loss": 0.2736, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.2946593001841621, |
|
"grad_norm": 0.47813767194747925, |
|
"learning_rate": 8.448302396299906e-06, |
|
"loss": 0.2954, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.29722155496837216, |
|
"grad_norm": 0.29084405303001404, |
|
"learning_rate": 8.417477983236107e-06, |
|
"loss": 0.3134, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.2997838097525823, |
|
"grad_norm": 0.41805362701416016, |
|
"learning_rate": 8.386407858128707e-06, |
|
"loss": 0.2767, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.30234606453679236, |
|
"grad_norm": 0.32367441058158875, |
|
"learning_rate": 8.355094254877665e-06, |
|
"loss": 0.2783, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.3049083193210025, |
|
"grad_norm": 0.31607088446617126, |
|
"learning_rate": 8.323539424888695e-06, |
|
"loss": 0.2871, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.3074705741052126, |
|
"grad_norm": 0.3964040279388428, |
|
"learning_rate": 8.291745636911382e-06, |
|
"loss": 0.2747, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.3100328288894227, |
|
"grad_norm": 0.3582654595375061, |
|
"learning_rate": 8.259715176876069e-06, |
|
"loss": 0.2737, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.3125950836736328, |
|
"grad_norm": 0.38515010476112366, |
|
"learning_rate": 8.2274503477295e-06, |
|
"loss": 0.2889, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.3151573384578429, |
|
"grad_norm": 0.3744358718395233, |
|
"learning_rate": 8.19495346926924e-06, |
|
"loss": 0.2822, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.317719593242053, |
|
"grad_norm": 0.3402256369590759, |
|
"learning_rate": 8.162226877976886e-06, |
|
"loss": 0.284, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.32028184802626314, |
|
"grad_norm": 0.4301615059375763, |
|
"learning_rate": 8.129272926850079e-06, |
|
"loss": 0.2915, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.3228441028104732, |
|
"grad_norm": 0.3376031816005707, |
|
"learning_rate": 8.096093985233323e-06, |
|
"loss": 0.2842, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.32540635759468334, |
|
"grad_norm": 0.546100378036499, |
|
"learning_rate": 8.062692438647628e-06, |
|
"loss": 0.3203, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.3279686123788934, |
|
"grad_norm": 0.37469664216041565, |
|
"learning_rate": 8.029070688619013e-06, |
|
"loss": 0.2828, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.33053086716310354, |
|
"grad_norm": 0.31530773639678955, |
|
"learning_rate": 7.995231152505815e-06, |
|
"loss": 0.2672, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.3330931219473136, |
|
"grad_norm": 0.47679194808006287, |
|
"learning_rate": 7.961176263324902e-06, |
|
"loss": 0.292, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.33565537673152374, |
|
"grad_norm": 0.7583074569702148, |
|
"learning_rate": 7.92690846957673e-06, |
|
"loss": 0.2987, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.33821763151573386, |
|
"grad_norm": 0.4478585124015808, |
|
"learning_rate": 7.892430235069317e-06, |
|
"loss": 0.2881, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.34077988629994393, |
|
"grad_norm": 0.49820685386657715, |
|
"learning_rate": 7.857744038741076e-06, |
|
"loss": 0.2912, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.34334214108415406, |
|
"grad_norm": 0.42809927463531494, |
|
"learning_rate": 7.822852374482597e-06, |
|
"loss": 0.2672, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.34590439586836413, |
|
"grad_norm": 0.3707646131515503, |
|
"learning_rate": 7.787757750957335e-06, |
|
"loss": 0.2921, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.34846665065257426, |
|
"grad_norm": 0.3849372863769531, |
|
"learning_rate": 7.752462691421245e-06, |
|
"loss": 0.2676, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.3510289054367844, |
|
"grad_norm": 0.34830930829048157, |
|
"learning_rate": 7.716969733541357e-06, |
|
"loss": 0.2576, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.35359116022099446, |
|
"grad_norm": 0.4144101142883301, |
|
"learning_rate": 7.681281429213328e-06, |
|
"loss": 0.2686, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.3561534150052046, |
|
"grad_norm": 0.30803945660591125, |
|
"learning_rate": 7.645400344377953e-06, |
|
"loss": 0.2678, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.35871566978941466, |
|
"grad_norm": 0.40825673937797546, |
|
"learning_rate": 7.609329058836694e-06, |
|
"loss": 0.2907, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.3612779245736248, |
|
"grad_norm": 0.38340067863464355, |
|
"learning_rate": 7.5730701660661795e-06, |
|
"loss": 0.298, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.3638401793578349, |
|
"grad_norm": 0.3731997013092041, |
|
"learning_rate": 7.536626273031747e-06, |
|
"loss": 0.263, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.366402434142045, |
|
"grad_norm": 0.3588733375072479, |
|
"learning_rate": 7.500000000000001e-06, |
|
"loss": 0.2733, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.3689646889262551, |
|
"grad_norm": 0.4146881699562073, |
|
"learning_rate": 7.4631939803504215e-06, |
|
"loss": 0.3159, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.3715269437104652, |
|
"grad_norm": 0.4735972285270691, |
|
"learning_rate": 7.426210860386032e-06, |
|
"loss": 0.2878, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.3740891984946753, |
|
"grad_norm": 0.5484066009521484, |
|
"learning_rate": 7.3890532991431174e-06, |
|
"loss": 0.2829, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.37665145327888544, |
|
"grad_norm": 0.3961395025253296, |
|
"learning_rate": 7.3517239682000675e-06, |
|
"loss": 0.2646, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.3792137080630955, |
|
"grad_norm": 0.43453872203826904, |
|
"learning_rate": 7.314225551485273e-06, |
|
"loss": 0.301, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.38177596284730564, |
|
"grad_norm": 0.45246270298957825, |
|
"learning_rate": 7.276560745084167e-06, |
|
"loss": 0.2622, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.3843382176315157, |
|
"grad_norm": 0.4539019763469696, |
|
"learning_rate": 7.2387322570453724e-06, |
|
"loss": 0.2901, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.38690047241572584, |
|
"grad_norm": 0.4333208203315735, |
|
"learning_rate": 7.2007428071860045e-06, |
|
"loss": 0.2576, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.38946272719993597, |
|
"grad_norm": 0.3936616778373718, |
|
"learning_rate": 7.162595126896111e-06, |
|
"loss": 0.2716, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.39202498198414604, |
|
"grad_norm": 0.40865668654441833, |
|
"learning_rate": 7.1242919589422974e-06, |
|
"loss": 0.2716, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.39458723676835616, |
|
"grad_norm": 0.5468711256980896, |
|
"learning_rate": 7.085836057270521e-06, |
|
"loss": 0.2978, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.39714949155256624, |
|
"grad_norm": 0.469566285610199, |
|
"learning_rate": 7.047230186808085e-06, |
|
"loss": 0.2499, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.39971174633677636, |
|
"grad_norm": 0.5449560880661011, |
|
"learning_rate": 7.008477123264849e-06, |
|
"loss": 0.3018, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.4022740011209865, |
|
"grad_norm": 0.48154890537261963, |
|
"learning_rate": 6.96957965293365e-06, |
|
"loss": 0.2834, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.40483625590519656, |
|
"grad_norm": 0.3875851035118103, |
|
"learning_rate": 6.9305405724899876e-06, |
|
"loss": 0.3008, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.4073985106894067, |
|
"grad_norm": 0.5583494305610657, |
|
"learning_rate": 6.891362688790925e-06, |
|
"loss": 0.2753, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.40996076547361676, |
|
"grad_norm": 0.47610044479370117, |
|
"learning_rate": 6.8520488186733e-06, |
|
"loss": 0.2943, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.4125230202578269, |
|
"grad_norm": 0.33989906311035156, |
|
"learning_rate": 6.812601788751192e-06, |
|
"loss": 0.2692, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.415085275042037, |
|
"grad_norm": 0.4737338125705719, |
|
"learning_rate": 6.773024435212678e-06, |
|
"loss": 0.2961, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.4176475298262471, |
|
"grad_norm": 0.538935124874115, |
|
"learning_rate": 6.733319603615941e-06, |
|
"loss": 0.2898, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.4202097846104572, |
|
"grad_norm": 0.4021223187446594, |
|
"learning_rate": 6.693490148684654e-06, |
|
"loss": 0.2555, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.4227720393946673, |
|
"grad_norm": 0.330159991979599, |
|
"learning_rate": 6.653538934102743e-06, |
|
"loss": 0.3043, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.4253342941788774, |
|
"grad_norm": 0.39451590180397034, |
|
"learning_rate": 6.6134688323084884e-06, |
|
"loss": 0.3098, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.42789654896308754, |
|
"grad_norm": 0.3512692451477051, |
|
"learning_rate": 6.573282724288001e-06, |
|
"loss": 0.276, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.4304588037472976, |
|
"grad_norm": 0.3749544322490692, |
|
"learning_rate": 6.532983499368078e-06, |
|
"loss": 0.2893, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.43302105853150774, |
|
"grad_norm": 0.35993286967277527, |
|
"learning_rate": 6.492574055008474e-06, |
|
"loss": 0.2522, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.4355833133157178, |
|
"grad_norm": 0.3857017457485199, |
|
"learning_rate": 6.452057296593568e-06, |
|
"loss": 0.2556, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.43814556809992794, |
|
"grad_norm": 0.36345577239990234, |
|
"learning_rate": 6.411436137223479e-06, |
|
"loss": 0.2795, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.44070782288413807, |
|
"grad_norm": 0.40086713433265686, |
|
"learning_rate": 6.370713497504607e-06, |
|
"loss": 0.2619, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.44327007766834814, |
|
"grad_norm": 0.4900248944759369, |
|
"learning_rate": 6.329892305339659e-06, |
|
"loss": 0.2748, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 0.44583233245255827, |
|
"grad_norm": 0.6341924071311951, |
|
"learning_rate": 6.288975495717124e-06, |
|
"loss": 0.2731, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.44839458723676834, |
|
"grad_norm": 0.5340880751609802, |
|
"learning_rate": 6.247966010500258e-06, |
|
"loss": 0.2797, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.45095684202097847, |
|
"grad_norm": 0.37570691108703613, |
|
"learning_rate": 6.206866798215571e-06, |
|
"loss": 0.2724, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.4535190968051886, |
|
"grad_norm": 0.4172237515449524, |
|
"learning_rate": 6.165680813840822e-06, |
|
"loss": 0.2728, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 0.45608135158939866, |
|
"grad_norm": 0.36990782618522644, |
|
"learning_rate": 6.124411018592568e-06, |
|
"loss": 0.2733, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.4586436063736088, |
|
"grad_norm": 0.35491085052490234, |
|
"learning_rate": 6.0830603797132574e-06, |
|
"loss": 0.2688, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 0.46120586115781886, |
|
"grad_norm": 0.36608174443244934, |
|
"learning_rate": 6.041631870257882e-06, |
|
"loss": 0.2505, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.463768115942029, |
|
"grad_norm": 0.3670680820941925, |
|
"learning_rate": 6.000128468880223e-06, |
|
"loss": 0.2749, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 0.46633037072623906, |
|
"grad_norm": 0.40972089767456055, |
|
"learning_rate": 5.958553159618693e-06, |
|
"loss": 0.2541, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.4688926255104492, |
|
"grad_norm": 0.40942203998565674, |
|
"learning_rate": 5.916908931681781e-06, |
|
"loss": 0.2721, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.4714548802946593, |
|
"grad_norm": 0.508773922920227, |
|
"learning_rate": 5.8751987792331365e-06, |
|
"loss": 0.2774, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.4740171350788694, |
|
"grad_norm": 0.38248467445373535, |
|
"learning_rate": 5.833425701176294e-06, |
|
"loss": 0.2497, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.4765793898630795, |
|
"grad_norm": 0.42881184816360474, |
|
"learning_rate": 5.79159270093905e-06, |
|
"loss": 0.2686, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.4791416446472896, |
|
"grad_norm": 0.4207112491130829, |
|
"learning_rate": 5.749702786257529e-06, |
|
"loss": 0.2797, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.4817038994314997, |
|
"grad_norm": 0.4612100124359131, |
|
"learning_rate": 5.707758968959923e-06, |
|
"loss": 0.2665, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.48426615421570984, |
|
"grad_norm": 0.471349835395813, |
|
"learning_rate": 5.6657642647499545e-06, |
|
"loss": 0.2753, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.4868284089999199, |
|
"grad_norm": 0.4658471643924713, |
|
"learning_rate": 5.62372169299004e-06, |
|
"loss": 0.2445, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.48939066378413004, |
|
"grad_norm": 0.48692232370376587, |
|
"learning_rate": 5.581634276484211e-06, |
|
"loss": 0.2933, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.4919529185683401, |
|
"grad_norm": 0.44437411427497864, |
|
"learning_rate": 5.539505041260779e-06, |
|
"loss": 0.2502, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.49451517335255024, |
|
"grad_norm": 0.4907655119895935, |
|
"learning_rate": 5.497337016354757e-06, |
|
"loss": 0.263, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.49707742813676037, |
|
"grad_norm": 0.4633347690105438, |
|
"learning_rate": 5.45513323359009e-06, |
|
"loss": 0.2494, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.49963968292097044, |
|
"grad_norm": 0.5105425715446472, |
|
"learning_rate": 5.412896727361663e-06, |
|
"loss": 0.2431, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.5022019377051805, |
|
"grad_norm": 0.43711456656455994, |
|
"learning_rate": 5.370630534417133e-06, |
|
"loss": 0.248, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.5047641924893906, |
|
"grad_norm": 0.44248372316360474, |
|
"learning_rate": 5.328337693638591e-06, |
|
"loss": 0.2522, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.5073264472736008, |
|
"grad_norm": 0.41455918550491333, |
|
"learning_rate": 5.286021245824075e-06, |
|
"loss": 0.2856, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.5098887020578109, |
|
"grad_norm": 0.36339160799980164, |
|
"learning_rate": 5.243684233468933e-06, |
|
"loss": 0.2626, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.512450956842021, |
|
"grad_norm": 0.4179689288139343, |
|
"learning_rate": 5.201329700547077e-06, |
|
"loss": 0.2738, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.515013211626231, |
|
"grad_norm": 0.3756559193134308, |
|
"learning_rate": 5.158960692292122e-06, |
|
"loss": 0.2511, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 0.5175754664104412, |
|
"grad_norm": 0.5741788148880005, |
|
"learning_rate": 5.116580254978447e-06, |
|
"loss": 0.2957, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.5201377211946513, |
|
"grad_norm": 0.4136016070842743, |
|
"learning_rate": 5.074191435702155e-06, |
|
"loss": 0.2704, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 0.5226999759788614, |
|
"grad_norm": 0.5152673125267029, |
|
"learning_rate": 5.031797282162007e-06, |
|
"loss": 0.3206, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.5252622307630715, |
|
"grad_norm": 0.4879305958747864, |
|
"learning_rate": 4.98940084244029e-06, |
|
"loss": 0.2536, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.5278244855472816, |
|
"grad_norm": 0.36677488684654236, |
|
"learning_rate": 4.947005164783661e-06, |
|
"loss": 0.2517, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 0.5303867403314917, |
|
"grad_norm": 0.4830959141254425, |
|
"learning_rate": 4.9046132973839895e-06, |
|
"loss": 0.2751, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 0.5329489951157018, |
|
"grad_norm": 0.39130493998527527, |
|
"learning_rate": 4.862228288159191e-06, |
|
"loss": 0.2583, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 0.535511249899912, |
|
"grad_norm": 0.45581528544425964, |
|
"learning_rate": 4.819853184534085e-06, |
|
"loss": 0.3033, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 0.5380735046841221, |
|
"grad_norm": 0.552720308303833, |
|
"learning_rate": 4.7774910332213005e-06, |
|
"loss": 0.2679, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.5406357594683321, |
|
"grad_norm": 0.5465298295021057, |
|
"learning_rate": 4.735144880002199e-06, |
|
"loss": 0.2765, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 0.5431980142525422, |
|
"grad_norm": 0.452952116727829, |
|
"learning_rate": 4.692817769507912e-06, |
|
"loss": 0.2629, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.5457602690367523, |
|
"grad_norm": 0.5454785227775574, |
|
"learning_rate": 4.6505127450004216e-06, |
|
"loss": 0.292, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 0.5483225238209625, |
|
"grad_norm": 0.36023062467575073, |
|
"learning_rate": 4.608232848153757e-06, |
|
"loss": 0.2388, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 0.5508847786051726, |
|
"grad_norm": 0.3965865969657898, |
|
"learning_rate": 4.565981118835299e-06, |
|
"loss": 0.2683, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.5534470333893826, |
|
"grad_norm": 0.47152435779571533, |
|
"learning_rate": 4.523760594887228e-06, |
|
"loss": 0.265, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.5560092881735927, |
|
"grad_norm": 0.5159929394721985, |
|
"learning_rate": 4.481574311908096e-06, |
|
"loss": 0.2823, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 0.5585715429578029, |
|
"grad_norm": 0.371762752532959, |
|
"learning_rate": 4.439425303034576e-06, |
|
"loss": 0.2942, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 0.561133797742013, |
|
"grad_norm": 0.4925728440284729, |
|
"learning_rate": 4.397316598723385e-06, |
|
"loss": 0.2983, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 0.5636960525262231, |
|
"grad_norm": 0.3970510959625244, |
|
"learning_rate": 4.355251226533396e-06, |
|
"loss": 0.2435, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.5662583073104331, |
|
"grad_norm": 0.4432925283908844, |
|
"learning_rate": 4.313232210907959e-06, |
|
"loss": 0.2615, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 0.5688205620946433, |
|
"grad_norm": 0.39295539259910583, |
|
"learning_rate": 4.271262572957453e-06, |
|
"loss": 0.2603, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 0.5713828168788534, |
|
"grad_norm": 0.3533722460269928, |
|
"learning_rate": 4.229345330242067e-06, |
|
"loss": 0.246, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 0.5739450716630635, |
|
"grad_norm": 0.4501621127128601, |
|
"learning_rate": 4.187483496554844e-06, |
|
"loss": 0.2679, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 0.5765073264472736, |
|
"grad_norm": 0.4579297602176666, |
|
"learning_rate": 4.145680081704989e-06, |
|
"loss": 0.2616, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.5790695812314837, |
|
"grad_norm": 0.43312978744506836, |
|
"learning_rate": 4.103938091301479e-06, |
|
"loss": 0.2534, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 0.5816318360156938, |
|
"grad_norm": 0.45154210925102234, |
|
"learning_rate": 4.062260526536955e-06, |
|
"loss": 0.2909, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 0.5841940907999039, |
|
"grad_norm": 0.34377482533454895, |
|
"learning_rate": 4.0206503839719335e-06, |
|
"loss": 0.261, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 0.586756345584114, |
|
"grad_norm": 0.4153713881969452, |
|
"learning_rate": 3.9791106553193746e-06, |
|
"loss": 0.2669, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 0.5893186003683242, |
|
"grad_norm": 0.5368139743804932, |
|
"learning_rate": 3.937644327229572e-06, |
|
"loss": 0.251, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.5918808551525342, |
|
"grad_norm": 0.4761441946029663, |
|
"learning_rate": 3.896254381075416e-06, |
|
"loss": 0.2595, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 0.5944431099367443, |
|
"grad_norm": 0.597135603427887, |
|
"learning_rate": 3.854943792738037e-06, |
|
"loss": 0.2866, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 0.5970053647209544, |
|
"grad_norm": 0.6271767616271973, |
|
"learning_rate": 3.8137155323928526e-06, |
|
"loss": 0.2832, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 0.5995676195051646, |
|
"grad_norm": 0.3820246458053589, |
|
"learning_rate": 3.7725725642960047e-06, |
|
"loss": 0.2548, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 0.6021298742893747, |
|
"grad_norm": 0.5720183849334717, |
|
"learning_rate": 3.7315178465712364e-06, |
|
"loss": 0.2603, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.6046921290735847, |
|
"grad_norm": 0.4225583076477051, |
|
"learning_rate": 3.690554330997215e-06, |
|
"loss": 0.2685, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 0.6072543838577948, |
|
"grad_norm": 0.3530130386352539, |
|
"learning_rate": 3.6496849627952875e-06, |
|
"loss": 0.2723, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 0.609816638642005, |
|
"grad_norm": 0.3795667290687561, |
|
"learning_rate": 3.6089126804177373e-06, |
|
"loss": 0.2691, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 0.6123788934262151, |
|
"grad_norm": 0.43652230501174927, |
|
"learning_rate": 3.568240415336509e-06, |
|
"loss": 0.2838, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 0.6149411482104252, |
|
"grad_norm": 0.4311392903327942, |
|
"learning_rate": 3.52767109183244e-06, |
|
"loss": 0.2847, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.6175034029946352, |
|
"grad_norm": 0.42163416743278503, |
|
"learning_rate": 3.4872076267850015e-06, |
|
"loss": 0.2488, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 0.6200656577788454, |
|
"grad_norm": 0.4223015308380127, |
|
"learning_rate": 3.4468529294625895e-06, |
|
"loss": 0.2621, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 0.6226279125630555, |
|
"grad_norm": 0.4520999491214752, |
|
"learning_rate": 3.406609901313349e-06, |
|
"loss": 0.2543, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 0.6251901673472656, |
|
"grad_norm": 0.5905027985572815, |
|
"learning_rate": 3.36648143575656e-06, |
|
"loss": 0.271, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 0.6277524221314758, |
|
"grad_norm": 0.5310239195823669, |
|
"learning_rate": 3.326470417974604e-06, |
|
"loss": 0.2794, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.6303146769156858, |
|
"grad_norm": 0.43746617436408997, |
|
"learning_rate": 3.2865797247055354e-06, |
|
"loss": 0.2716, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 0.6328769316998959, |
|
"grad_norm": 0.4661629796028137, |
|
"learning_rate": 3.2468122240362287e-06, |
|
"loss": 0.243, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 0.635439186484106, |
|
"grad_norm": 0.44793224334716797, |
|
"learning_rate": 3.2071707751961838e-06, |
|
"loss": 0.2808, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 0.6380014412683161, |
|
"grad_norm": 0.5625908970832825, |
|
"learning_rate": 3.1676582283519454e-06, |
|
"loss": 0.265, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 0.6405636960525263, |
|
"grad_norm": 0.44215095043182373, |
|
"learning_rate": 3.1282774244021717e-06, |
|
"loss": 0.2858, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.6405636960525263, |
|
"eval_loss": 0.2639869451522827, |
|
"eval_runtime": 270.7894, |
|
"eval_samples_per_second": 19.421, |
|
"eval_steps_per_second": 2.43, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.6431259508367363, |
|
"grad_norm": 0.47866004705429077, |
|
"learning_rate": 3.089031194773392e-06, |
|
"loss": 0.2879, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 0.6456882056209464, |
|
"grad_norm": 0.5291287302970886, |
|
"learning_rate": 3.049922361216422e-06, |
|
"loss": 0.2501, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 0.6482504604051565, |
|
"grad_norm": 0.4798702895641327, |
|
"learning_rate": 3.0109537356034856e-06, |
|
"loss": 0.2691, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 0.6508127151893667, |
|
"grad_norm": 0.7165606617927551, |
|
"learning_rate": 2.9721281197260427e-06, |
|
"loss": 0.3519, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 0.6533749699735768, |
|
"grad_norm": 0.6769598126411438, |
|
"learning_rate": 2.9334483050933506e-06, |
|
"loss": 0.281, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.6559372247577868, |
|
"grad_norm": 0.47096380591392517, |
|
"learning_rate": 2.894917072731753e-06, |
|
"loss": 0.2677, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 0.658499479541997, |
|
"grad_norm": 0.6711763739585876, |
|
"learning_rate": 2.8565371929847286e-06, |
|
"loss": 0.2707, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 0.6610617343262071, |
|
"grad_norm": 0.44064444303512573, |
|
"learning_rate": 2.81831142531371e-06, |
|
"loss": 0.2654, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 0.6636239891104172, |
|
"grad_norm": 0.42236313223838806, |
|
"learning_rate": 2.780242518099675e-06, |
|
"loss": 0.2601, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 0.6661862438946272, |
|
"grad_norm": 0.4029591381549835, |
|
"learning_rate": 2.7423332084455543e-06, |
|
"loss": 0.2648, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.6687484986788373, |
|
"grad_norm": 0.47852271795272827, |
|
"learning_rate": 2.704586221979422e-06, |
|
"loss": 0.2744, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 0.6713107534630475, |
|
"grad_norm": 0.44856366515159607, |
|
"learning_rate": 2.667004272658541e-06, |
|
"loss": 0.2499, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 0.6738730082472576, |
|
"grad_norm": 0.4645158648490906, |
|
"learning_rate": 2.629590062574221e-06, |
|
"loss": 0.2716, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 0.6764352630314677, |
|
"grad_norm": 0.5160189867019653, |
|
"learning_rate": 2.592346281757552e-06, |
|
"loss": 0.2361, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 0.6789975178156777, |
|
"grad_norm": 0.3944529891014099, |
|
"learning_rate": 2.5552756079859904e-06, |
|
"loss": 0.2476, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.6815597725998879, |
|
"grad_norm": 0.5633410811424255, |
|
"learning_rate": 2.5183807065908296e-06, |
|
"loss": 0.2287, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 0.684122027384098, |
|
"grad_norm": 0.3865067958831787, |
|
"learning_rate": 2.4816642302655634e-06, |
|
"loss": 0.2644, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 0.6866842821683081, |
|
"grad_norm": 0.5245662331581116, |
|
"learning_rate": 2.445128818875166e-06, |
|
"loss": 0.2354, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 0.6892465369525183, |
|
"grad_norm": 0.4881504774093628, |
|
"learning_rate": 2.408777099266291e-06, |
|
"loss": 0.2779, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 0.6918087917367283, |
|
"grad_norm": 0.5840505957603455, |
|
"learning_rate": 2.3726116850783987e-06, |
|
"loss": 0.2742, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.6943710465209384, |
|
"grad_norm": 0.4902634918689728, |
|
"learning_rate": 2.3366351765558437e-06, |
|
"loss": 0.2818, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 0.6969333013051485, |
|
"grad_norm": 0.4141348600387573, |
|
"learning_rate": 2.3008501603609147e-06, |
|
"loss": 0.2542, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 0.6994955560893586, |
|
"grad_norm": 0.3754000663757324, |
|
"learning_rate": 2.265259209387867e-06, |
|
"loss": 0.2664, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 0.7020578108735688, |
|
"grad_norm": 0.6529264450073242, |
|
"learning_rate": 2.229864882577921e-06, |
|
"loss": 0.2678, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 0.7046200656577788, |
|
"grad_norm": 0.3764033615589142, |
|
"learning_rate": 2.194669724735296e-06, |
|
"loss": 0.2668, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.7071823204419889, |
|
"grad_norm": 0.3769323229789734, |
|
"learning_rate": 2.159676266344222e-06, |
|
"loss": 0.2663, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 0.709744575226199, |
|
"grad_norm": 0.3979746103286743, |
|
"learning_rate": 2.124887023387017e-06, |
|
"loss": 0.2666, |
|
"step": 554 |
|
}, |
|
{ |
|
"epoch": 0.7123068300104092, |
|
"grad_norm": 0.4987868070602417, |
|
"learning_rate": 2.0903044971631854e-06, |
|
"loss": 0.2292, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 0.7148690847946193, |
|
"grad_norm": 0.6058522462844849, |
|
"learning_rate": 2.055931174109579e-06, |
|
"loss": 0.2354, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 0.7174313395788293, |
|
"grad_norm": 0.5615466237068176, |
|
"learning_rate": 2.02176952562162e-06, |
|
"loss": 0.2557, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.7199935943630394, |
|
"grad_norm": 0.5051982998847961, |
|
"learning_rate": 1.987822007875617e-06, |
|
"loss": 0.2706, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 0.7225558491472496, |
|
"grad_norm": 0.571441650390625, |
|
"learning_rate": 1.954091061652172e-06, |
|
"loss": 0.2815, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 0.7251181039314597, |
|
"grad_norm": 0.5101485252380371, |
|
"learning_rate": 1.920579112160685e-06, |
|
"loss": 0.2314, |
|
"step": 566 |
|
}, |
|
{ |
|
"epoch": 0.7276803587156698, |
|
"grad_norm": 0.4810335040092468, |
|
"learning_rate": 1.8872885688649879e-06, |
|
"loss": 0.2812, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 0.7302426134998798, |
|
"grad_norm": 0.49377724528312683, |
|
"learning_rate": 1.854221825310103e-06, |
|
"loss": 0.2656, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.73280486828409, |
|
"grad_norm": 0.5363904237747192, |
|
"learning_rate": 1.8213812589501611e-06, |
|
"loss": 0.265, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 0.7353671230683001, |
|
"grad_norm": 0.5577176213264465, |
|
"learning_rate": 1.78876923097745e-06, |
|
"loss": 0.2652, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 0.7379293778525102, |
|
"grad_norm": 0.44135797023773193, |
|
"learning_rate": 1.7563880861526656e-06, |
|
"loss": 0.2748, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 0.7404916326367204, |
|
"grad_norm": 0.41491812467575073, |
|
"learning_rate": 1.7242401526363095e-06, |
|
"loss": 0.2847, |
|
"step": 578 |
|
}, |
|
{ |
|
"epoch": 0.7430538874209304, |
|
"grad_norm": 0.4843028783798218, |
|
"learning_rate": 1.692327741821312e-06, |
|
"loss": 0.2792, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.7456161422051405, |
|
"grad_norm": 0.5842957496643066, |
|
"learning_rate": 1.6606531481668364e-06, |
|
"loss": 0.2784, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 0.7481783969893506, |
|
"grad_norm": 0.572831928730011, |
|
"learning_rate": 1.6292186490333172e-06, |
|
"loss": 0.2862, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 0.7507406517735608, |
|
"grad_norm": 0.5212300419807434, |
|
"learning_rate": 1.5980265045187139e-06, |
|
"loss": 0.2637, |
|
"step": 586 |
|
}, |
|
{ |
|
"epoch": 0.7533029065577709, |
|
"grad_norm": 0.5278065800666809, |
|
"learning_rate": 1.567078957296016e-06, |
|
"loss": 0.2617, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 0.7558651613419809, |
|
"grad_norm": 0.5063283443450928, |
|
"learning_rate": 1.5363782324520033e-06, |
|
"loss": 0.2569, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.758427416126191, |
|
"grad_norm": 0.40898391604423523, |
|
"learning_rate": 1.5059265373272574e-06, |
|
"loss": 0.2558, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 0.7609896709104012, |
|
"grad_norm": 0.5030636191368103, |
|
"learning_rate": 1.475726061357463e-06, |
|
"loss": 0.2547, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 0.7635519256946113, |
|
"grad_norm": 0.5822692513465881, |
|
"learning_rate": 1.4457789759159813e-06, |
|
"loss": 0.2266, |
|
"step": 596 |
|
}, |
|
{ |
|
"epoch": 0.7661141804788214, |
|
"grad_norm": 0.5503767132759094, |
|
"learning_rate": 1.4160874341577447e-06, |
|
"loss": 0.269, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 0.7686764352630314, |
|
"grad_norm": 0.4649931788444519, |
|
"learning_rate": 1.3866535708644335e-06, |
|
"loss": 0.2536, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.7712386900472415, |
|
"grad_norm": 0.6687978506088257, |
|
"learning_rate": 1.3574795022910014e-06, |
|
"loss": 0.3012, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 0.7738009448314517, |
|
"grad_norm": 0.5331063866615295, |
|
"learning_rate": 1.3285673260135073e-06, |
|
"loss": 0.2453, |
|
"step": 604 |
|
}, |
|
{ |
|
"epoch": 0.7763631996156618, |
|
"grad_norm": 0.46101680397987366, |
|
"learning_rate": 1.2999191207783129e-06, |
|
"loss": 0.2308, |
|
"step": 606 |
|
}, |
|
{ |
|
"epoch": 0.7789254543998719, |
|
"grad_norm": 0.4032719135284424, |
|
"learning_rate": 1.2715369463526173e-06, |
|
"loss": 0.2534, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 0.781487709184082, |
|
"grad_norm": 0.7435618042945862, |
|
"learning_rate": 1.2434228433763657e-06, |
|
"loss": 0.2331, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.7840499639682921, |
|
"grad_norm": 0.6071492433547974, |
|
"learning_rate": 1.215578833215526e-06, |
|
"loss": 0.2695, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 0.7866122187525022, |
|
"grad_norm": 0.4534173011779785, |
|
"learning_rate": 1.1880069178167586e-06, |
|
"loss": 0.2654, |
|
"step": 614 |
|
}, |
|
{ |
|
"epoch": 0.7891744735367123, |
|
"grad_norm": 0.48930707573890686, |
|
"learning_rate": 1.1607090795634802e-06, |
|
"loss": 0.2597, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 0.7917367283209225, |
|
"grad_norm": 0.43963509798049927, |
|
"learning_rate": 1.133687281133331e-06, |
|
"loss": 0.2454, |
|
"step": 618 |
|
}, |
|
{ |
|
"epoch": 0.7942989831051325, |
|
"grad_norm": 0.45418596267700195, |
|
"learning_rate": 1.1069434653570633e-06, |
|
"loss": 0.2541, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.7968612378893426, |
|
"grad_norm": 0.41048523783683777, |
|
"learning_rate": 1.0804795550788473e-06, |
|
"loss": 0.2743, |
|
"step": 622 |
|
}, |
|
{ |
|
"epoch": 0.7994234926735527, |
|
"grad_norm": 0.516132116317749, |
|
"learning_rate": 1.0542974530180327e-06, |
|
"loss": 0.2736, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 0.8019857474577629, |
|
"grad_norm": 0.412601113319397, |
|
"learning_rate": 1.0283990416323336e-06, |
|
"loss": 0.2503, |
|
"step": 626 |
|
}, |
|
{ |
|
"epoch": 0.804548002241973, |
|
"grad_norm": 0.5029380917549133, |
|
"learning_rate": 1.0027861829824953e-06, |
|
"loss": 0.232, |
|
"step": 628 |
|
}, |
|
{ |
|
"epoch": 0.807110257026183, |
|
"grad_norm": 0.4999438226222992, |
|
"learning_rate": 9.774607185984004e-07, |
|
"loss": 0.2549, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.8096725118103931, |
|
"grad_norm": 0.44878801703453064, |
|
"learning_rate": 9.524244693466773e-07, |
|
"loss": 0.2355, |
|
"step": 632 |
|
}, |
|
{ |
|
"epoch": 0.8122347665946033, |
|
"grad_norm": 0.4290701150894165, |
|
"learning_rate": 9.276792352997782e-07, |
|
"loss": 0.2579, |
|
"step": 634 |
|
}, |
|
{ |
|
"epoch": 0.8147970213788134, |
|
"grad_norm": 0.5716743469238281, |
|
"learning_rate": 9.032267956065516e-07, |
|
"loss": 0.2833, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 0.8173592761630235, |
|
"grad_norm": 0.4765143394470215, |
|
"learning_rate": 8.790689083643328e-07, |
|
"loss": 0.2473, |
|
"step": 638 |
|
}, |
|
{ |
|
"epoch": 0.8199215309472335, |
|
"grad_norm": 0.4390144646167755, |
|
"learning_rate": 8.552073104925296e-07, |
|
"loss": 0.2711, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.8224837857314437, |
|
"grad_norm": 0.5272576808929443, |
|
"learning_rate": 8.316437176077491e-07, |
|
"loss": 0.2749, |
|
"step": 642 |
|
}, |
|
{ |
|
"epoch": 0.8250460405156538, |
|
"grad_norm": 0.44547039270401, |
|
"learning_rate": 8.083798239004408e-07, |
|
"loss": 0.259, |
|
"step": 644 |
|
}, |
|
{ |
|
"epoch": 0.8276082952998639, |
|
"grad_norm": 0.578179121017456, |
|
"learning_rate": 7.854173020130906e-07, |
|
"loss": 0.2946, |
|
"step": 646 |
|
}, |
|
{ |
|
"epoch": 0.830170550084074, |
|
"grad_norm": 0.4996013641357422, |
|
"learning_rate": 7.627578029199562e-07, |
|
"loss": 0.2573, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 0.832732804868284, |
|
"grad_norm": 0.5044499039649963, |
|
"learning_rate": 7.404029558083653e-07, |
|
"loss": 0.2461, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.8352950596524942, |
|
"grad_norm": 0.42843055725097656, |
|
"learning_rate": 7.183543679615834e-07, |
|
"loss": 0.2578, |
|
"step": 652 |
|
}, |
|
{ |
|
"epoch": 0.8378573144367043, |
|
"grad_norm": 0.5041942596435547, |
|
"learning_rate": 6.966136246432492e-07, |
|
"loss": 0.2412, |
|
"step": 654 |
|
}, |
|
{ |
|
"epoch": 0.8404195692209144, |
|
"grad_norm": 0.454973042011261, |
|
"learning_rate": 6.751822889833926e-07, |
|
"loss": 0.265, |
|
"step": 656 |
|
}, |
|
{ |
|
"epoch": 0.8429818240051246, |
|
"grad_norm": 0.4820737838745117, |
|
"learning_rate": 6.540619018660555e-07, |
|
"loss": 0.226, |
|
"step": 658 |
|
}, |
|
{ |
|
"epoch": 0.8455440787893346, |
|
"grad_norm": 0.5445938110351562, |
|
"learning_rate": 6.332539818184985e-07, |
|
"loss": 0.2501, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.8481063335735447, |
|
"grad_norm": 0.5699609518051147, |
|
"learning_rate": 6.127600249020216e-07, |
|
"loss": 0.2747, |
|
"step": 662 |
|
}, |
|
{ |
|
"epoch": 0.8506685883577548, |
|
"grad_norm": 0.46571552753448486, |
|
"learning_rate": 5.925815046044026e-07, |
|
"loss": 0.2612, |
|
"step": 664 |
|
}, |
|
{ |
|
"epoch": 0.853230843141965, |
|
"grad_norm": 0.47914472222328186, |
|
"learning_rate": 5.727198717339511e-07, |
|
"loss": 0.2574, |
|
"step": 666 |
|
}, |
|
{ |
|
"epoch": 0.8557930979261751, |
|
"grad_norm": 0.40852856636047363, |
|
"learning_rate": 5.531765543152002e-07, |
|
"loss": 0.2734, |
|
"step": 668 |
|
}, |
|
{ |
|
"epoch": 0.8583553527103851, |
|
"grad_norm": 0.3702560067176819, |
|
"learning_rate": 5.33952957486234e-07, |
|
"loss": 0.2539, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.8609176074945952, |
|
"grad_norm": 0.5180298686027527, |
|
"learning_rate": 5.150504633976572e-07, |
|
"loss": 0.3682, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 0.8634798622788054, |
|
"grad_norm": 0.7016831040382385, |
|
"learning_rate": 4.964704311132224e-07, |
|
"loss": 0.2265, |
|
"step": 674 |
|
}, |
|
{ |
|
"epoch": 0.8660421170630155, |
|
"grad_norm": 0.5376434922218323, |
|
"learning_rate": 4.782141965121129e-07, |
|
"loss": 0.2676, |
|
"step": 676 |
|
}, |
|
{ |
|
"epoch": 0.8686043718472256, |
|
"grad_norm": 0.47063949704170227, |
|
"learning_rate": 4.602830721928997e-07, |
|
"loss": 0.2606, |
|
"step": 678 |
|
}, |
|
{ |
|
"epoch": 0.8711666266314356, |
|
"grad_norm": 0.4991367757320404, |
|
"learning_rate": 4.4267834737916295e-07, |
|
"loss": 0.2414, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.8737288814156458, |
|
"grad_norm": 0.4373914301395416, |
|
"learning_rate": 4.2540128782679934e-07, |
|
"loss": 0.2287, |
|
"step": 682 |
|
}, |
|
{ |
|
"epoch": 0.8762911361998559, |
|
"grad_norm": 0.39528214931488037, |
|
"learning_rate": 4.0845313573301736e-07, |
|
"loss": 0.2404, |
|
"step": 684 |
|
}, |
|
{ |
|
"epoch": 0.878853390984066, |
|
"grad_norm": 0.5945621132850647, |
|
"learning_rate": 3.9183510964702463e-07, |
|
"loss": 0.2719, |
|
"step": 686 |
|
}, |
|
{ |
|
"epoch": 0.8814156457682761, |
|
"grad_norm": 0.6032932996749878, |
|
"learning_rate": 3.755484043824131e-07, |
|
"loss": 0.2608, |
|
"step": 688 |
|
}, |
|
{ |
|
"epoch": 0.8839779005524862, |
|
"grad_norm": 0.49754688143730164, |
|
"learning_rate": 3.595941909312595e-07, |
|
"loss": 0.2852, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.8865401553366963, |
|
"grad_norm": 0.49544405937194824, |
|
"learning_rate": 3.439736163799251e-07, |
|
"loss": 0.2693, |
|
"step": 692 |
|
}, |
|
{ |
|
"epoch": 0.8891024101209064, |
|
"grad_norm": 0.4462824761867523, |
|
"learning_rate": 3.2868780382658895e-07, |
|
"loss": 0.2443, |
|
"step": 694 |
|
}, |
|
{ |
|
"epoch": 0.8916646649051165, |
|
"grad_norm": 0.4302297532558441, |
|
"learning_rate": 3.1373785230049356e-07, |
|
"loss": 0.2515, |
|
"step": 696 |
|
}, |
|
{ |
|
"epoch": 0.8942269196893267, |
|
"grad_norm": 0.4883180856704712, |
|
"learning_rate": 2.991248366829291e-07, |
|
"loss": 0.2757, |
|
"step": 698 |
|
}, |
|
{ |
|
"epoch": 0.8967891744735367, |
|
"grad_norm": 0.7474163174629211, |
|
"learning_rate": 2.848498076299483e-07, |
|
"loss": 0.2963, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.8993514292577468, |
|
"grad_norm": 0.4648323059082031, |
|
"learning_rate": 2.7091379149682683e-07, |
|
"loss": 0.2361, |
|
"step": 702 |
|
}, |
|
{ |
|
"epoch": 0.9019136840419569, |
|
"grad_norm": 0.4341067373752594, |
|
"learning_rate": 2.573177902642726e-07, |
|
"loss": 0.2555, |
|
"step": 704 |
|
}, |
|
{ |
|
"epoch": 0.9044759388261671, |
|
"grad_norm": 0.47577670216560364, |
|
"learning_rate": 2.440627814663804e-07, |
|
"loss": 0.2772, |
|
"step": 706 |
|
}, |
|
{ |
|
"epoch": 0.9070381936103772, |
|
"grad_norm": 0.47802722454071045, |
|
"learning_rate": 2.3114971812034981e-07, |
|
"loss": 0.2544, |
|
"step": 708 |
|
}, |
|
{ |
|
"epoch": 0.9096004483945872, |
|
"grad_norm": 0.4585348665714264, |
|
"learning_rate": 2.1857952865796616e-07, |
|
"loss": 0.2424, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.9121627031787973, |
|
"grad_norm": 0.4453139305114746, |
|
"learning_rate": 2.0635311685884675e-07, |
|
"loss": 0.2424, |
|
"step": 712 |
|
}, |
|
{ |
|
"epoch": 0.9147249579630075, |
|
"grad_norm": 0.4780106544494629, |
|
"learning_rate": 1.9447136178545766e-07, |
|
"loss": 0.2475, |
|
"step": 714 |
|
}, |
|
{ |
|
"epoch": 0.9172872127472176, |
|
"grad_norm": 0.47332102060317993, |
|
"learning_rate": 1.8293511771991624e-07, |
|
"loss": 0.2414, |
|
"step": 716 |
|
}, |
|
{ |
|
"epoch": 0.9198494675314277, |
|
"grad_norm": 0.5608975887298584, |
|
"learning_rate": 1.7174521410256162e-07, |
|
"loss": 0.2733, |
|
"step": 718 |
|
}, |
|
{ |
|
"epoch": 0.9224117223156377, |
|
"grad_norm": 0.611322283744812, |
|
"learning_rate": 1.6090245547232707e-07, |
|
"loss": 0.2195, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.9249739770998479, |
|
"grad_norm": 0.37321174144744873, |
|
"learning_rate": 1.5040762140888843e-07, |
|
"loss": 0.2496, |
|
"step": 722 |
|
}, |
|
{ |
|
"epoch": 0.927536231884058, |
|
"grad_norm": 0.394593745470047, |
|
"learning_rate": 1.402614664766172e-07, |
|
"loss": 0.2521, |
|
"step": 724 |
|
}, |
|
{ |
|
"epoch": 0.9300984866682681, |
|
"grad_norm": 0.5954830646514893, |
|
"learning_rate": 1.3046472017032685e-07, |
|
"loss": 0.2742, |
|
"step": 726 |
|
}, |
|
{ |
|
"epoch": 0.9326607414524781, |
|
"grad_norm": 0.3724110722541809, |
|
"learning_rate": 1.210180868628219e-07, |
|
"loss": 0.2359, |
|
"step": 728 |
|
}, |
|
{ |
|
"epoch": 0.9352229962366883, |
|
"grad_norm": 0.42592036724090576, |
|
"learning_rate": 1.1192224575425848e-07, |
|
"loss": 0.2428, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.9377852510208984, |
|
"grad_norm": 0.481985479593277, |
|
"learning_rate": 1.0317785082330555e-07, |
|
"loss": 0.2982, |
|
"step": 732 |
|
}, |
|
{ |
|
"epoch": 0.9403475058051085, |
|
"grad_norm": 0.5069997906684875, |
|
"learning_rate": 9.478553078013042e-08, |
|
"loss": 0.2659, |
|
"step": 734 |
|
}, |
|
{ |
|
"epoch": 0.9429097605893186, |
|
"grad_norm": 0.41395503282546997, |
|
"learning_rate": 8.674588902118919e-08, |
|
"loss": 0.2429, |
|
"step": 736 |
|
}, |
|
{ |
|
"epoch": 0.9454720153735287, |
|
"grad_norm": 0.5499728322029114, |
|
"learning_rate": 7.905950358584768e-08, |
|
"loss": 0.2665, |
|
"step": 738 |
|
}, |
|
{ |
|
"epoch": 0.9480342701577388, |
|
"grad_norm": 0.5084072351455688, |
|
"learning_rate": 7.172692711482022e-08, |
|
"loss": 0.2768, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.9505965249419489, |
|
"grad_norm": 0.6337217688560486, |
|
"learning_rate": 6.474868681043578e-08, |
|
"loss": 0.264, |
|
"step": 742 |
|
}, |
|
{ |
|
"epoch": 0.953158779726159, |
|
"grad_norm": 0.47787654399871826, |
|
"learning_rate": 5.8125284398730666e-08, |
|
"loss": 0.2686, |
|
"step": 744 |
|
}, |
|
{ |
|
"epoch": 0.9557210345103692, |
|
"grad_norm": 0.46294164657592773, |
|
"learning_rate": 5.185719609337836e-08, |
|
"loss": 0.2709, |
|
"step": 746 |
|
}, |
|
{ |
|
"epoch": 0.9582832892945792, |
|
"grad_norm": 0.3983994126319885, |
|
"learning_rate": 4.5944872561448084e-08, |
|
"loss": 0.2542, |
|
"step": 748 |
|
}, |
|
{ |
|
"epoch": 0.9608455440787893, |
|
"grad_norm": 0.5090007781982422, |
|
"learning_rate": 4.038873889100237e-08, |
|
"loss": 0.2613, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.9634077988629994, |
|
"grad_norm": 0.6989894509315491, |
|
"learning_rate": 3.518919456053649e-08, |
|
"loss": 0.2394, |
|
"step": 752 |
|
}, |
|
{ |
|
"epoch": 0.9659700536472096, |
|
"grad_norm": 0.5098798871040344, |
|
"learning_rate": 3.034661341025258e-08, |
|
"loss": 0.2581, |
|
"step": 754 |
|
}, |
|
{ |
|
"epoch": 0.9685323084314197, |
|
"grad_norm": 0.49127092957496643, |
|
"learning_rate": 2.5861343615184997e-08, |
|
"loss": 0.2271, |
|
"step": 756 |
|
}, |
|
{ |
|
"epoch": 0.9710945632156297, |
|
"grad_norm": 0.42872855067253113, |
|
"learning_rate": 2.173370766016314e-08, |
|
"loss": 0.2569, |
|
"step": 758 |
|
}, |
|
{ |
|
"epoch": 0.9736568179998398, |
|
"grad_norm": 0.5179227590560913, |
|
"learning_rate": 1.7964002316628316e-08, |
|
"loss": 0.2569, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.97621907278405, |
|
"grad_norm": 0.6316475868225098, |
|
"learning_rate": 1.4552498621295264e-08, |
|
"loss": 0.2667, |
|
"step": 762 |
|
}, |
|
{ |
|
"epoch": 0.9787813275682601, |
|
"grad_norm": 0.4547966718673706, |
|
"learning_rate": 1.1499441856663296e-08, |
|
"loss": 0.2743, |
|
"step": 764 |
|
}, |
|
{ |
|
"epoch": 0.9813435823524702, |
|
"grad_norm": 0.3960263431072235, |
|
"learning_rate": 8.805051533384846e-09, |
|
"loss": 0.2321, |
|
"step": 766 |
|
}, |
|
{ |
|
"epoch": 0.9839058371366802, |
|
"grad_norm": 0.4860779345035553, |
|
"learning_rate": 6.469521374477539e-09, |
|
"loss": 0.2375, |
|
"step": 768 |
|
}, |
|
{ |
|
"epoch": 0.9864680919208904, |
|
"grad_norm": 0.4424307346343994, |
|
"learning_rate": 4.493019301401447e-09, |
|
"loss": 0.2373, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.9890303467051005, |
|
"grad_norm": 0.6010534167289734, |
|
"learning_rate": 2.875687421980966e-09, |
|
"loss": 0.2703, |
|
"step": 772 |
|
}, |
|
{ |
|
"epoch": 0.9915926014893106, |
|
"grad_norm": 0.4469178318977356, |
|
"learning_rate": 1.6176420201902132e-09, |
|
"loss": 0.2426, |
|
"step": 774 |
|
}, |
|
{ |
|
"epoch": 0.9941548562735207, |
|
"grad_norm": 0.3937220573425293, |
|
"learning_rate": 7.189735477913795e-10, |
|
"loss": 0.2662, |
|
"step": 776 |
|
}, |
|
{ |
|
"epoch": 0.9967171110577308, |
|
"grad_norm": 0.44991588592529297, |
|
"learning_rate": 1.797466178327101e-10, |
|
"loss": 0.242, |
|
"step": 778 |
|
}, |
|
{ |
|
"epoch": 0.9992793658419409, |
|
"grad_norm": 0.7540487051010132, |
|
"learning_rate": 0.0, |
|
"loss": 0.308, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.9992793658419409, |
|
"step": 780, |
|
"total_flos": 2.0573294793064448e+18, |
|
"train_loss": 0.29625688539101525, |
|
"train_runtime": 10334.3449, |
|
"train_samples_per_second": 9.668, |
|
"train_steps_per_second": 0.075 |
|
} |
|
], |
|
"logging_steps": 2, |
|
"max_steps": 780, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 400, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.0573294793064448e+18, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|