{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9992793658419409, "eval_steps": 500, "global_step": 780, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0025622547842101048, "grad_norm": 4.871487140655518, "learning_rate": 5.128205128205128e-07, "loss": 0.7593, "step": 2 }, { "epoch": 0.0051245095684202095, "grad_norm": 4.878956317901611, "learning_rate": 1.0256410256410257e-06, "loss": 0.8076, "step": 4 }, { "epoch": 0.007686764352630314, "grad_norm": 4.183067321777344, "learning_rate": 1.5384615384615387e-06, "loss": 0.7059, "step": 6 }, { "epoch": 0.010249019136840419, "grad_norm": 4.679640769958496, "learning_rate": 2.0512820512820513e-06, "loss": 0.715, "step": 8 }, { "epoch": 0.012811273921050524, "grad_norm": 4.590462684631348, "learning_rate": 2.564102564102564e-06, "loss": 0.7175, "step": 10 }, { "epoch": 0.015373528705260629, "grad_norm": 4.5435638427734375, "learning_rate": 3.0769230769230774e-06, "loss": 0.6976, "step": 12 }, { "epoch": 0.017935783489470735, "grad_norm": 4.649476051330566, "learning_rate": 3.58974358974359e-06, "loss": 0.7615, "step": 14 }, { "epoch": 0.020498038273680838, "grad_norm": 3.847956418991089, "learning_rate": 4.102564102564103e-06, "loss": 0.6735, "step": 16 }, { "epoch": 0.023060293057890945, "grad_norm": 4.92044734954834, "learning_rate": 4.615384615384616e-06, "loss": 0.6885, "step": 18 }, { "epoch": 0.025622547842101048, "grad_norm": 4.817592620849609, "learning_rate": 5.128205128205128e-06, "loss": 0.768, "step": 20 }, { "epoch": 0.028184802626311154, "grad_norm": 3.8268470764160156, "learning_rate": 5.641025641025641e-06, "loss": 0.7092, "step": 22 }, { "epoch": 0.030747057410521257, "grad_norm": 4.103015899658203, "learning_rate": 6.153846153846155e-06, "loss": 0.6764, "step": 24 }, { "epoch": 0.033309312194731364, "grad_norm": 4.125541687011719, "learning_rate": 6.666666666666667e-06, "loss": 0.6386, "step": 26 }, { "epoch": 0.03587156697894147, "grad_norm": 4.19417142868042, "learning_rate": 7.17948717948718e-06, "loss": 0.6218, "step": 28 }, { "epoch": 0.03843382176315158, "grad_norm": 3.5323216915130615, "learning_rate": 7.692307692307694e-06, "loss": 0.5853, "step": 30 }, { "epoch": 0.040996076547361676, "grad_norm": 3.1467161178588867, "learning_rate": 8.205128205128205e-06, "loss": 0.5441, "step": 32 }, { "epoch": 0.04355833133157178, "grad_norm": 3.0700371265411377, "learning_rate": 8.717948717948719e-06, "loss": 0.5458, "step": 34 }, { "epoch": 0.04612058611578189, "grad_norm": 2.553177833557129, "learning_rate": 9.230769230769232e-06, "loss": 0.4589, "step": 36 }, { "epoch": 0.048682840899991996, "grad_norm": 2.5199780464172363, "learning_rate": 9.743589743589744e-06, "loss": 0.4386, "step": 38 }, { "epoch": 0.051245095684202095, "grad_norm": 1.7382951974868774, "learning_rate": 9.99995506314361e-06, "loss": 0.4215, "step": 40 }, { "epoch": 0.0538073504684122, "grad_norm": 0.9749733209609985, "learning_rate": 9.999595573138845e-06, "loss": 0.3888, "step": 42 }, { "epoch": 0.05636960525262231, "grad_norm": 0.9746177196502686, "learning_rate": 9.99887661897616e-06, "loss": 0.3749, "step": 44 }, { "epoch": 0.058931860036832415, "grad_norm": 0.7504925727844238, "learning_rate": 9.997798252347382e-06, "loss": 0.3543, "step": 46 }, { "epoch": 0.061494114821042514, "grad_norm": 0.7129773497581482, "learning_rate": 9.996360550785619e-06, "loss": 0.3565, "step": 48 }, { "epoch": 0.06405636960525263, "grad_norm": 0.6482123732566833, "learning_rate": 9.994563617659665e-06, "loss": 0.3242, "step": 50 }, { "epoch": 0.06661862438946273, "grad_norm": 0.5225902199745178, "learning_rate": 9.992407582166582e-06, "loss": 0.3334, "step": 52 }, { "epoch": 0.06918087917367283, "grad_norm": 0.5128389596939087, "learning_rate": 9.989892599322404e-06, "loss": 0.3741, "step": 54 }, { "epoch": 0.07174313395788294, "grad_norm": 0.3568147122859955, "learning_rate": 9.987018849950996e-06, "loss": 0.3331, "step": 56 }, { "epoch": 0.07430538874209304, "grad_norm": 0.5258967280387878, "learning_rate": 9.983786540671052e-06, "loss": 0.3335, "step": 58 }, { "epoch": 0.07686764352630315, "grad_norm": 0.5380641222000122, "learning_rate": 9.980195903881231e-06, "loss": 0.3344, "step": 60 }, { "epoch": 0.07942989831051325, "grad_norm": 0.3320980668067932, "learning_rate": 9.976247197743465e-06, "loss": 0.3055, "step": 62 }, { "epoch": 0.08199215309472335, "grad_norm": 0.3006751835346222, "learning_rate": 9.97194070616438e-06, "loss": 0.3187, "step": 64 }, { "epoch": 0.08455440787893347, "grad_norm": 0.26902303099632263, "learning_rate": 9.967276738774897e-06, "loss": 0.2998, "step": 66 }, { "epoch": 0.08711666266314357, "grad_norm": 0.2527882158756256, "learning_rate": 9.962255630907964e-06, "loss": 0.3251, "step": 68 }, { "epoch": 0.08967891744735366, "grad_norm": 0.24817310273647308, "learning_rate": 9.956877743574437e-06, "loss": 0.317, "step": 70 }, { "epoch": 0.09224117223156378, "grad_norm": 0.25589698553085327, "learning_rate": 9.951143463437145e-06, "loss": 0.31, "step": 72 }, { "epoch": 0.09480342701577388, "grad_norm": 0.23160037398338318, "learning_rate": 9.94505320278307e-06, "loss": 0.3158, "step": 74 }, { "epoch": 0.09736568179998399, "grad_norm": 0.26378345489501953, "learning_rate": 9.938607399493714e-06, "loss": 0.3163, "step": 76 }, { "epoch": 0.09992793658419409, "grad_norm": 0.21700599789619446, "learning_rate": 9.931806517013612e-06, "loss": 0.3132, "step": 78 }, { "epoch": 0.10249019136840419, "grad_norm": 0.29619359970092773, "learning_rate": 9.924651044317017e-06, "loss": 0.315, "step": 80 }, { "epoch": 0.1050524461526143, "grad_norm": 0.33124956488609314, "learning_rate": 9.917141495872733e-06, "loss": 0.2851, "step": 82 }, { "epoch": 0.1076147009368244, "grad_norm": 0.202985018491745, "learning_rate": 9.909278411607134e-06, "loss": 0.3036, "step": 84 }, { "epoch": 0.11017695572103452, "grad_norm": 0.24660180509090424, "learning_rate": 9.90106235686534e-06, "loss": 0.3374, "step": 86 }, { "epoch": 0.11273921050524462, "grad_norm": 0.2552855312824249, "learning_rate": 9.892493922370575e-06, "loss": 0.316, "step": 88 }, { "epoch": 0.11530146528945472, "grad_norm": 0.29509437084198, "learning_rate": 9.883573724181683e-06, "loss": 0.3284, "step": 90 }, { "epoch": 0.11786372007366483, "grad_norm": 0.4660441279411316, "learning_rate": 9.87430240364885e-06, "loss": 0.3266, "step": 92 }, { "epoch": 0.12042597485787493, "grad_norm": 0.31215617060661316, "learning_rate": 9.864680627367476e-06, "loss": 0.3104, "step": 94 }, { "epoch": 0.12298822964208503, "grad_norm": 0.6009628176689148, "learning_rate": 9.854709087130261e-06, "loss": 0.3221, "step": 96 }, { "epoch": 0.12555048442629513, "grad_norm": 0.30782508850097656, "learning_rate": 9.844388499877457e-06, "loss": 0.2913, "step": 98 }, { "epoch": 0.12811273921050526, "grad_norm": 0.3109281659126282, "learning_rate": 9.833719607645325e-06, "loss": 0.2974, "step": 100 }, { "epoch": 0.13067499399471535, "grad_norm": 0.2832891345024109, "learning_rate": 9.822703177512783e-06, "loss": 0.3698, "step": 102 }, { "epoch": 0.13323724877892545, "grad_norm": 0.2483058124780655, "learning_rate": 9.811340001546252e-06, "loss": 0.2995, "step": 104 }, { "epoch": 0.13579950356313555, "grad_norm": 0.2657499611377716, "learning_rate": 9.799630896742716e-06, "loss": 0.304, "step": 106 }, { "epoch": 0.13836175834734565, "grad_norm": 0.4393406808376312, "learning_rate": 9.787576704970965e-06, "loss": 0.3032, "step": 108 }, { "epoch": 0.14092401313155578, "grad_norm": 0.3503418266773224, "learning_rate": 9.77517829291108e-06, "loss": 0.2915, "step": 110 }, { "epoch": 0.14348626791576588, "grad_norm": 0.28331002593040466, "learning_rate": 9.762436551992117e-06, "loss": 0.3192, "step": 112 }, { "epoch": 0.14604852269997598, "grad_norm": 0.28841540217399597, "learning_rate": 9.74935239832801e-06, "loss": 0.2999, "step": 114 }, { "epoch": 0.14861077748418608, "grad_norm": 0.3377619683742523, "learning_rate": 9.735926772651703e-06, "loss": 0.31, "step": 116 }, { "epoch": 0.15117303226839618, "grad_norm": 0.358359158039093, "learning_rate": 9.722160640247523e-06, "loss": 0.292, "step": 118 }, { "epoch": 0.1537352870526063, "grad_norm": 0.24913446605205536, "learning_rate": 9.708054990881763e-06, "loss": 0.3077, "step": 120 }, { "epoch": 0.1562975418368164, "grad_norm": 0.34343913197517395, "learning_rate": 9.693610838731532e-06, "loss": 0.3132, "step": 122 }, { "epoch": 0.1588597966210265, "grad_norm": 0.2604675590991974, "learning_rate": 9.678829222311827e-06, "loss": 0.3139, "step": 124 }, { "epoch": 0.1614220514052366, "grad_norm": 0.30086928606033325, "learning_rate": 9.663711204400872e-06, "loss": 0.2992, "step": 126 }, { "epoch": 0.1639843061894467, "grad_norm": 0.3241061270236969, "learning_rate": 9.6482578719637e-06, "loss": 0.3066, "step": 128 }, { "epoch": 0.1665465609736568, "grad_norm": 0.26830941438674927, "learning_rate": 9.632470336074009e-06, "loss": 0.3121, "step": 130 }, { "epoch": 0.16910881575786693, "grad_norm": 0.203893780708313, "learning_rate": 9.616349731834271e-06, "loss": 0.2991, "step": 132 }, { "epoch": 0.17167107054207703, "grad_norm": 0.3675401210784912, "learning_rate": 9.599897218294122e-06, "loss": 0.3227, "step": 134 }, { "epoch": 0.17423332532628713, "grad_norm": 0.21370336413383484, "learning_rate": 9.583113978367026e-06, "loss": 0.3025, "step": 136 }, { "epoch": 0.17679558011049723, "grad_norm": 0.25403013825416565, "learning_rate": 9.56600121874523e-06, "loss": 0.3179, "step": 138 }, { "epoch": 0.17935783489470733, "grad_norm": 0.3012063503265381, "learning_rate": 9.548560169812997e-06, "loss": 0.2906, "step": 140 }, { "epoch": 0.18192008967891746, "grad_norm": 0.24593935906887054, "learning_rate": 9.530792085558151e-06, "loss": 0.2968, "step": 142 }, { "epoch": 0.18448234446312756, "grad_norm": 0.267528235912323, "learning_rate": 9.512698243481914e-06, "loss": 0.3076, "step": 144 }, { "epoch": 0.18704459924733766, "grad_norm": 0.4075755774974823, "learning_rate": 9.49427994450705e-06, "loss": 0.292, "step": 146 }, { "epoch": 0.18960685403154776, "grad_norm": 0.37276849150657654, "learning_rate": 9.47553851288434e-06, "loss": 0.3337, "step": 148 }, { "epoch": 0.19216910881575786, "grad_norm": 0.34166908264160156, "learning_rate": 9.45647529609736e-06, "loss": 0.2854, "step": 150 }, { "epoch": 0.19473136359996798, "grad_norm": 0.3679031431674957, "learning_rate": 9.437091664765611e-06, "loss": 0.328, "step": 152 }, { "epoch": 0.19729361838417808, "grad_norm": 0.2564798891544342, "learning_rate": 9.41738901254596e-06, "loss": 0.2831, "step": 154 }, { "epoch": 0.19985587316838818, "grad_norm": 0.39898496866226196, "learning_rate": 9.397368756032445e-06, "loss": 0.2899, "step": 156 }, { "epoch": 0.20241812795259828, "grad_norm": 0.2926347255706787, "learning_rate": 9.37703233465443e-06, "loss": 0.2796, "step": 158 }, { "epoch": 0.20498038273680838, "grad_norm": 0.3333691656589508, "learning_rate": 9.356381210573092e-06, "loss": 0.2965, "step": 160 }, { "epoch": 0.2075426375210185, "grad_norm": 0.2890892028808594, "learning_rate": 9.33541686857632e-06, "loss": 0.2884, "step": 162 }, { "epoch": 0.2101048923052286, "grad_norm": 0.27766191959381104, "learning_rate": 9.31414081597194e-06, "loss": 0.297, "step": 164 }, { "epoch": 0.2126671470894387, "grad_norm": 0.3218678832054138, "learning_rate": 9.292554582479349e-06, "loss": 0.2862, "step": 166 }, { "epoch": 0.2152294018736488, "grad_norm": 0.3139230012893677, "learning_rate": 9.270659720119533e-06, "loss": 0.2958, "step": 168 }, { "epoch": 0.2177916566578589, "grad_norm": 0.2383907586336136, "learning_rate": 9.248457803103476e-06, "loss": 0.2988, "step": 170 }, { "epoch": 0.22035391144206903, "grad_norm": 0.32504117488861084, "learning_rate": 9.225950427718974e-06, "loss": 0.2803, "step": 172 }, { "epoch": 0.22291616622627913, "grad_norm": 0.2564990818500519, "learning_rate": 9.203139212215868e-06, "loss": 0.2957, "step": 174 }, { "epoch": 0.22547842101048923, "grad_norm": 0.282103568315506, "learning_rate": 9.180025796689692e-06, "loss": 0.2933, "step": 176 }, { "epoch": 0.22804067579469933, "grad_norm": 0.3701488971710205, "learning_rate": 9.156611842963753e-06, "loss": 0.2926, "step": 178 }, { "epoch": 0.23060293057890943, "grad_norm": 0.3153334856033325, "learning_rate": 9.132899034469648e-06, "loss": 0.3111, "step": 180 }, { "epoch": 0.23316518536311953, "grad_norm": 0.31320127844810486, "learning_rate": 9.108889076126226e-06, "loss": 0.2948, "step": 182 }, { "epoch": 0.23572744014732966, "grad_norm": 0.3407798409461975, "learning_rate": 9.084583694217012e-06, "loss": 0.2872, "step": 184 }, { "epoch": 0.23828969493153976, "grad_norm": 0.5061964988708496, "learning_rate": 9.059984636266082e-06, "loss": 0.3073, "step": 186 }, { "epoch": 0.24085194971574986, "grad_norm": 0.342929482460022, "learning_rate": 9.035093670912424e-06, "loss": 0.2866, "step": 188 }, { "epoch": 0.24341420449995996, "grad_norm": 0.38549765944480896, "learning_rate": 9.009912587782772e-06, "loss": 0.298, "step": 190 }, { "epoch": 0.24597645928417006, "grad_norm": 0.339372843503952, "learning_rate": 8.984443197362938e-06, "loss": 0.2644, "step": 192 }, { "epoch": 0.24853871406838018, "grad_norm": 0.26157572865486145, "learning_rate": 8.958687330867634e-06, "loss": 0.2986, "step": 194 }, { "epoch": 0.25110096885259026, "grad_norm": 0.307921826839447, "learning_rate": 8.932646840108818e-06, "loss": 0.2826, "step": 196 }, { "epoch": 0.2536632236368004, "grad_norm": 0.48844948410987854, "learning_rate": 8.906323597362547e-06, "loss": 0.2824, "step": 198 }, { "epoch": 0.2562254784210105, "grad_norm": 0.3046979308128357, "learning_rate": 8.879719495234363e-06, "loss": 0.2836, "step": 200 }, { "epoch": 0.2587877332052206, "grad_norm": 0.37873372435569763, "learning_rate": 8.852836446523213e-06, "loss": 0.2799, "step": 202 }, { "epoch": 0.2613499879894307, "grad_norm": 0.5752015709877014, "learning_rate": 8.825676384083936e-06, "loss": 0.3027, "step": 204 }, { "epoch": 0.2639122427736408, "grad_norm": 0.403952956199646, "learning_rate": 8.798241260688273e-06, "loss": 0.3032, "step": 206 }, { "epoch": 0.2664744975578509, "grad_norm": 0.36202457547187805, "learning_rate": 8.770533048884483e-06, "loss": 0.3044, "step": 208 }, { "epoch": 0.26903675234206104, "grad_norm": 0.34956708550453186, "learning_rate": 8.742553740855507e-06, "loss": 0.2784, "step": 210 }, { "epoch": 0.2715990071262711, "grad_norm": 0.44058695435523987, "learning_rate": 8.71430534827574e-06, "loss": 0.3142, "step": 212 }, { "epoch": 0.27416126191048124, "grad_norm": 0.3903171420097351, "learning_rate": 8.685789902166395e-06, "loss": 0.2592, "step": 214 }, { "epoch": 0.2767235166946913, "grad_norm": 0.34790173172950745, "learning_rate": 8.657009452749466e-06, "loss": 0.2881, "step": 216 }, { "epoch": 0.27928577147890143, "grad_norm": 0.3779347240924835, "learning_rate": 8.627966069300332e-06, "loss": 0.3017, "step": 218 }, { "epoch": 0.28184802626311156, "grad_norm": 0.40141528844833374, "learning_rate": 8.598661839998972e-06, "loss": 0.2781, "step": 220 }, { "epoch": 0.28441028104732163, "grad_norm": 0.30786147713661194, "learning_rate": 8.569098871779828e-06, "loss": 0.296, "step": 222 }, { "epoch": 0.28697253583153176, "grad_norm": 0.2742227017879486, "learning_rate": 8.539279290180315e-06, "loss": 0.3161, "step": 224 }, { "epoch": 0.28953479061574183, "grad_norm": 0.5068826675415039, "learning_rate": 8.509205239188017e-06, "loss": 0.2948, "step": 226 }, { "epoch": 0.29209704539995196, "grad_norm": 0.3508552610874176, "learning_rate": 8.478878881086505e-06, "loss": 0.2736, "step": 228 }, { "epoch": 0.2946593001841621, "grad_norm": 0.47813767194747925, "learning_rate": 8.448302396299906e-06, "loss": 0.2954, "step": 230 }, { "epoch": 0.29722155496837216, "grad_norm": 0.29084405303001404, "learning_rate": 8.417477983236107e-06, "loss": 0.3134, "step": 232 }, { "epoch": 0.2997838097525823, "grad_norm": 0.41805362701416016, "learning_rate": 8.386407858128707e-06, "loss": 0.2767, "step": 234 }, { "epoch": 0.30234606453679236, "grad_norm": 0.32367441058158875, "learning_rate": 8.355094254877665e-06, "loss": 0.2783, "step": 236 }, { "epoch": 0.3049083193210025, "grad_norm": 0.31607088446617126, "learning_rate": 8.323539424888695e-06, "loss": 0.2871, "step": 238 }, { "epoch": 0.3074705741052126, "grad_norm": 0.3964040279388428, "learning_rate": 8.291745636911382e-06, "loss": 0.2747, "step": 240 }, { "epoch": 0.3100328288894227, "grad_norm": 0.3582654595375061, "learning_rate": 8.259715176876069e-06, "loss": 0.2737, "step": 242 }, { "epoch": 0.3125950836736328, "grad_norm": 0.38515010476112366, "learning_rate": 8.2274503477295e-06, "loss": 0.2889, "step": 244 }, { "epoch": 0.3151573384578429, "grad_norm": 0.3744358718395233, "learning_rate": 8.19495346926924e-06, "loss": 0.2822, "step": 246 }, { "epoch": 0.317719593242053, "grad_norm": 0.3402256369590759, "learning_rate": 8.162226877976886e-06, "loss": 0.284, "step": 248 }, { "epoch": 0.32028184802626314, "grad_norm": 0.4301615059375763, "learning_rate": 8.129272926850079e-06, "loss": 0.2915, "step": 250 }, { "epoch": 0.3228441028104732, "grad_norm": 0.3376031816005707, "learning_rate": 8.096093985233323e-06, "loss": 0.2842, "step": 252 }, { "epoch": 0.32540635759468334, "grad_norm": 0.546100378036499, "learning_rate": 8.062692438647628e-06, "loss": 0.3203, "step": 254 }, { "epoch": 0.3279686123788934, "grad_norm": 0.37469664216041565, "learning_rate": 8.029070688619013e-06, "loss": 0.2828, "step": 256 }, { "epoch": 0.33053086716310354, "grad_norm": 0.31530773639678955, "learning_rate": 7.995231152505815e-06, "loss": 0.2672, "step": 258 }, { "epoch": 0.3330931219473136, "grad_norm": 0.47679194808006287, "learning_rate": 7.961176263324902e-06, "loss": 0.292, "step": 260 }, { "epoch": 0.33565537673152374, "grad_norm": 0.7583074569702148, "learning_rate": 7.92690846957673e-06, "loss": 0.2987, "step": 262 }, { "epoch": 0.33821763151573386, "grad_norm": 0.4478585124015808, "learning_rate": 7.892430235069317e-06, "loss": 0.2881, "step": 264 }, { "epoch": 0.34077988629994393, "grad_norm": 0.49820685386657715, "learning_rate": 7.857744038741076e-06, "loss": 0.2912, "step": 266 }, { "epoch": 0.34334214108415406, "grad_norm": 0.42809927463531494, "learning_rate": 7.822852374482597e-06, "loss": 0.2672, "step": 268 }, { "epoch": 0.34590439586836413, "grad_norm": 0.3707646131515503, "learning_rate": 7.787757750957335e-06, "loss": 0.2921, "step": 270 }, { "epoch": 0.34846665065257426, "grad_norm": 0.3849372863769531, "learning_rate": 7.752462691421245e-06, "loss": 0.2676, "step": 272 }, { "epoch": 0.3510289054367844, "grad_norm": 0.34830930829048157, "learning_rate": 7.716969733541357e-06, "loss": 0.2576, "step": 274 }, { "epoch": 0.35359116022099446, "grad_norm": 0.4144101142883301, "learning_rate": 7.681281429213328e-06, "loss": 0.2686, "step": 276 }, { "epoch": 0.3561534150052046, "grad_norm": 0.30803945660591125, "learning_rate": 7.645400344377953e-06, "loss": 0.2678, "step": 278 }, { "epoch": 0.35871566978941466, "grad_norm": 0.40825673937797546, "learning_rate": 7.609329058836694e-06, "loss": 0.2907, "step": 280 }, { "epoch": 0.3612779245736248, "grad_norm": 0.38340067863464355, "learning_rate": 7.5730701660661795e-06, "loss": 0.298, "step": 282 }, { "epoch": 0.3638401793578349, "grad_norm": 0.3731997013092041, "learning_rate": 7.536626273031747e-06, "loss": 0.263, "step": 284 }, { "epoch": 0.366402434142045, "grad_norm": 0.3588733375072479, "learning_rate": 7.500000000000001e-06, "loss": 0.2733, "step": 286 }, { "epoch": 0.3689646889262551, "grad_norm": 0.4146881699562073, "learning_rate": 7.4631939803504215e-06, "loss": 0.3159, "step": 288 }, { "epoch": 0.3715269437104652, "grad_norm": 0.4735972285270691, "learning_rate": 7.426210860386032e-06, "loss": 0.2878, "step": 290 }, { "epoch": 0.3740891984946753, "grad_norm": 0.5484066009521484, "learning_rate": 7.3890532991431174e-06, "loss": 0.2829, "step": 292 }, { "epoch": 0.37665145327888544, "grad_norm": 0.3961395025253296, "learning_rate": 7.3517239682000675e-06, "loss": 0.2646, "step": 294 }, { "epoch": 0.3792137080630955, "grad_norm": 0.43453872203826904, "learning_rate": 7.314225551485273e-06, "loss": 0.301, "step": 296 }, { "epoch": 0.38177596284730564, "grad_norm": 0.45246270298957825, "learning_rate": 7.276560745084167e-06, "loss": 0.2622, "step": 298 }, { "epoch": 0.3843382176315157, "grad_norm": 0.4539019763469696, "learning_rate": 7.2387322570453724e-06, "loss": 0.2901, "step": 300 }, { "epoch": 0.38690047241572584, "grad_norm": 0.4333208203315735, "learning_rate": 7.2007428071860045e-06, "loss": 0.2576, "step": 302 }, { "epoch": 0.38946272719993597, "grad_norm": 0.3936616778373718, "learning_rate": 7.162595126896111e-06, "loss": 0.2716, "step": 304 }, { "epoch": 0.39202498198414604, "grad_norm": 0.40865668654441833, "learning_rate": 7.1242919589422974e-06, "loss": 0.2716, "step": 306 }, { "epoch": 0.39458723676835616, "grad_norm": 0.5468711256980896, "learning_rate": 7.085836057270521e-06, "loss": 0.2978, "step": 308 }, { "epoch": 0.39714949155256624, "grad_norm": 0.469566285610199, "learning_rate": 7.047230186808085e-06, "loss": 0.2499, "step": 310 }, { "epoch": 0.39971174633677636, "grad_norm": 0.5449560880661011, "learning_rate": 7.008477123264849e-06, "loss": 0.3018, "step": 312 }, { "epoch": 0.4022740011209865, "grad_norm": 0.48154890537261963, "learning_rate": 6.96957965293365e-06, "loss": 0.2834, "step": 314 }, { "epoch": 0.40483625590519656, "grad_norm": 0.3875851035118103, "learning_rate": 6.9305405724899876e-06, "loss": 0.3008, "step": 316 }, { "epoch": 0.4073985106894067, "grad_norm": 0.5583494305610657, "learning_rate": 6.891362688790925e-06, "loss": 0.2753, "step": 318 }, { "epoch": 0.40996076547361676, "grad_norm": 0.47610044479370117, "learning_rate": 6.8520488186733e-06, "loss": 0.2943, "step": 320 }, { "epoch": 0.4125230202578269, "grad_norm": 0.33989906311035156, "learning_rate": 6.812601788751192e-06, "loss": 0.2692, "step": 322 }, { "epoch": 0.415085275042037, "grad_norm": 0.4737338125705719, "learning_rate": 6.773024435212678e-06, "loss": 0.2961, "step": 324 }, { "epoch": 0.4176475298262471, "grad_norm": 0.538935124874115, "learning_rate": 6.733319603615941e-06, "loss": 0.2898, "step": 326 }, { "epoch": 0.4202097846104572, "grad_norm": 0.4021223187446594, "learning_rate": 6.693490148684654e-06, "loss": 0.2555, "step": 328 }, { "epoch": 0.4227720393946673, "grad_norm": 0.330159991979599, "learning_rate": 6.653538934102743e-06, "loss": 0.3043, "step": 330 }, { "epoch": 0.4253342941788774, "grad_norm": 0.39451590180397034, "learning_rate": 6.6134688323084884e-06, "loss": 0.3098, "step": 332 }, { "epoch": 0.42789654896308754, "grad_norm": 0.3512692451477051, "learning_rate": 6.573282724288001e-06, "loss": 0.276, "step": 334 }, { "epoch": 0.4304588037472976, "grad_norm": 0.3749544322490692, "learning_rate": 6.532983499368078e-06, "loss": 0.2893, "step": 336 }, { "epoch": 0.43302105853150774, "grad_norm": 0.35993286967277527, "learning_rate": 6.492574055008474e-06, "loss": 0.2522, "step": 338 }, { "epoch": 0.4355833133157178, "grad_norm": 0.3857017457485199, "learning_rate": 6.452057296593568e-06, "loss": 0.2556, "step": 340 }, { "epoch": 0.43814556809992794, "grad_norm": 0.36345577239990234, "learning_rate": 6.411436137223479e-06, "loss": 0.2795, "step": 342 }, { "epoch": 0.44070782288413807, "grad_norm": 0.40086713433265686, "learning_rate": 6.370713497504607e-06, "loss": 0.2619, "step": 344 }, { "epoch": 0.44327007766834814, "grad_norm": 0.4900248944759369, "learning_rate": 6.329892305339659e-06, "loss": 0.2748, "step": 346 }, { "epoch": 0.44583233245255827, "grad_norm": 0.6341924071311951, "learning_rate": 6.288975495717124e-06, "loss": 0.2731, "step": 348 }, { "epoch": 0.44839458723676834, "grad_norm": 0.5340880751609802, "learning_rate": 6.247966010500258e-06, "loss": 0.2797, "step": 350 }, { "epoch": 0.45095684202097847, "grad_norm": 0.37570691108703613, "learning_rate": 6.206866798215571e-06, "loss": 0.2724, "step": 352 }, { "epoch": 0.4535190968051886, "grad_norm": 0.4172237515449524, "learning_rate": 6.165680813840822e-06, "loss": 0.2728, "step": 354 }, { "epoch": 0.45608135158939866, "grad_norm": 0.36990782618522644, "learning_rate": 6.124411018592568e-06, "loss": 0.2733, "step": 356 }, { "epoch": 0.4586436063736088, "grad_norm": 0.35491085052490234, "learning_rate": 6.0830603797132574e-06, "loss": 0.2688, "step": 358 }, { "epoch": 0.46120586115781886, "grad_norm": 0.36608174443244934, "learning_rate": 6.041631870257882e-06, "loss": 0.2505, "step": 360 }, { "epoch": 0.463768115942029, "grad_norm": 0.3670680820941925, "learning_rate": 6.000128468880223e-06, "loss": 0.2749, "step": 362 }, { "epoch": 0.46633037072623906, "grad_norm": 0.40972089767456055, "learning_rate": 5.958553159618693e-06, "loss": 0.2541, "step": 364 }, { "epoch": 0.4688926255104492, "grad_norm": 0.40942203998565674, "learning_rate": 5.916908931681781e-06, "loss": 0.2721, "step": 366 }, { "epoch": 0.4714548802946593, "grad_norm": 0.508773922920227, "learning_rate": 5.8751987792331365e-06, "loss": 0.2774, "step": 368 }, { "epoch": 0.4740171350788694, "grad_norm": 0.38248467445373535, "learning_rate": 5.833425701176294e-06, "loss": 0.2497, "step": 370 }, { "epoch": 0.4765793898630795, "grad_norm": 0.42881184816360474, "learning_rate": 5.79159270093905e-06, "loss": 0.2686, "step": 372 }, { "epoch": 0.4791416446472896, "grad_norm": 0.4207112491130829, "learning_rate": 5.749702786257529e-06, "loss": 0.2797, "step": 374 }, { "epoch": 0.4817038994314997, "grad_norm": 0.4612100124359131, "learning_rate": 5.707758968959923e-06, "loss": 0.2665, "step": 376 }, { "epoch": 0.48426615421570984, "grad_norm": 0.471349835395813, "learning_rate": 5.6657642647499545e-06, "loss": 0.2753, "step": 378 }, { "epoch": 0.4868284089999199, "grad_norm": 0.4658471643924713, "learning_rate": 5.62372169299004e-06, "loss": 0.2445, "step": 380 }, { "epoch": 0.48939066378413004, "grad_norm": 0.48692232370376587, "learning_rate": 5.581634276484211e-06, "loss": 0.2933, "step": 382 }, { "epoch": 0.4919529185683401, "grad_norm": 0.44437411427497864, "learning_rate": 5.539505041260779e-06, "loss": 0.2502, "step": 384 }, { "epoch": 0.49451517335255024, "grad_norm": 0.4907655119895935, "learning_rate": 5.497337016354757e-06, "loss": 0.263, "step": 386 }, { "epoch": 0.49707742813676037, "grad_norm": 0.4633347690105438, "learning_rate": 5.45513323359009e-06, "loss": 0.2494, "step": 388 }, { "epoch": 0.49963968292097044, "grad_norm": 0.5105425715446472, "learning_rate": 5.412896727361663e-06, "loss": 0.2431, "step": 390 }, { "epoch": 0.5022019377051805, "grad_norm": 0.43711456656455994, "learning_rate": 5.370630534417133e-06, "loss": 0.248, "step": 392 }, { "epoch": 0.5047641924893906, "grad_norm": 0.44248372316360474, "learning_rate": 5.328337693638591e-06, "loss": 0.2522, "step": 394 }, { "epoch": 0.5073264472736008, "grad_norm": 0.41455918550491333, "learning_rate": 5.286021245824075e-06, "loss": 0.2856, "step": 396 }, { "epoch": 0.5098887020578109, "grad_norm": 0.36339160799980164, "learning_rate": 5.243684233468933e-06, "loss": 0.2626, "step": 398 }, { "epoch": 0.512450956842021, "grad_norm": 0.4179689288139343, "learning_rate": 5.201329700547077e-06, "loss": 0.2738, "step": 400 }, { "epoch": 0.515013211626231, "grad_norm": 0.3756559193134308, "learning_rate": 5.158960692292122e-06, "loss": 0.2511, "step": 402 }, { "epoch": 0.5175754664104412, "grad_norm": 0.5741788148880005, "learning_rate": 5.116580254978447e-06, "loss": 0.2957, "step": 404 }, { "epoch": 0.5201377211946513, "grad_norm": 0.4136016070842743, "learning_rate": 5.074191435702155e-06, "loss": 0.2704, "step": 406 }, { "epoch": 0.5226999759788614, "grad_norm": 0.5152673125267029, "learning_rate": 5.031797282162007e-06, "loss": 0.3206, "step": 408 }, { "epoch": 0.5252622307630715, "grad_norm": 0.4879305958747864, "learning_rate": 4.98940084244029e-06, "loss": 0.2536, "step": 410 }, { "epoch": 0.5278244855472816, "grad_norm": 0.36677488684654236, "learning_rate": 4.947005164783661e-06, "loss": 0.2517, "step": 412 }, { "epoch": 0.5303867403314917, "grad_norm": 0.4830959141254425, "learning_rate": 4.9046132973839895e-06, "loss": 0.2751, "step": 414 }, { "epoch": 0.5329489951157018, "grad_norm": 0.39130493998527527, "learning_rate": 4.862228288159191e-06, "loss": 0.2583, "step": 416 }, { "epoch": 0.535511249899912, "grad_norm": 0.45581528544425964, "learning_rate": 4.819853184534085e-06, "loss": 0.3033, "step": 418 }, { "epoch": 0.5380735046841221, "grad_norm": 0.552720308303833, "learning_rate": 4.7774910332213005e-06, "loss": 0.2679, "step": 420 }, { "epoch": 0.5406357594683321, "grad_norm": 0.5465298295021057, "learning_rate": 4.735144880002199e-06, "loss": 0.2765, "step": 422 }, { "epoch": 0.5431980142525422, "grad_norm": 0.452952116727829, "learning_rate": 4.692817769507912e-06, "loss": 0.2629, "step": 424 }, { "epoch": 0.5457602690367523, "grad_norm": 0.5454785227775574, "learning_rate": 4.6505127450004216e-06, "loss": 0.292, "step": 426 }, { "epoch": 0.5483225238209625, "grad_norm": 0.36023062467575073, "learning_rate": 4.608232848153757e-06, "loss": 0.2388, "step": 428 }, { "epoch": 0.5508847786051726, "grad_norm": 0.3965865969657898, "learning_rate": 4.565981118835299e-06, "loss": 0.2683, "step": 430 }, { "epoch": 0.5534470333893826, "grad_norm": 0.47152435779571533, "learning_rate": 4.523760594887228e-06, "loss": 0.265, "step": 432 }, { "epoch": 0.5560092881735927, "grad_norm": 0.5159929394721985, "learning_rate": 4.481574311908096e-06, "loss": 0.2823, "step": 434 }, { "epoch": 0.5585715429578029, "grad_norm": 0.371762752532959, "learning_rate": 4.439425303034576e-06, "loss": 0.2942, "step": 436 }, { "epoch": 0.561133797742013, "grad_norm": 0.4925728440284729, "learning_rate": 4.397316598723385e-06, "loss": 0.2983, "step": 438 }, { "epoch": 0.5636960525262231, "grad_norm": 0.3970510959625244, "learning_rate": 4.355251226533396e-06, "loss": 0.2435, "step": 440 }, { "epoch": 0.5662583073104331, "grad_norm": 0.4432925283908844, "learning_rate": 4.313232210907959e-06, "loss": 0.2615, "step": 442 }, { "epoch": 0.5688205620946433, "grad_norm": 0.39295539259910583, "learning_rate": 4.271262572957453e-06, "loss": 0.2603, "step": 444 }, { "epoch": 0.5713828168788534, "grad_norm": 0.3533722460269928, "learning_rate": 4.229345330242067e-06, "loss": 0.246, "step": 446 }, { "epoch": 0.5739450716630635, "grad_norm": 0.4501621127128601, "learning_rate": 4.187483496554844e-06, "loss": 0.2679, "step": 448 }, { "epoch": 0.5765073264472736, "grad_norm": 0.4579297602176666, "learning_rate": 4.145680081704989e-06, "loss": 0.2616, "step": 450 }, { "epoch": 0.5790695812314837, "grad_norm": 0.43312978744506836, "learning_rate": 4.103938091301479e-06, "loss": 0.2534, "step": 452 }, { "epoch": 0.5816318360156938, "grad_norm": 0.45154210925102234, "learning_rate": 4.062260526536955e-06, "loss": 0.2909, "step": 454 }, { "epoch": 0.5841940907999039, "grad_norm": 0.34377482533454895, "learning_rate": 4.0206503839719335e-06, "loss": 0.261, "step": 456 }, { "epoch": 0.586756345584114, "grad_norm": 0.4153713881969452, "learning_rate": 3.9791106553193746e-06, "loss": 0.2669, "step": 458 }, { "epoch": 0.5893186003683242, "grad_norm": 0.5368139743804932, "learning_rate": 3.937644327229572e-06, "loss": 0.251, "step": 460 }, { "epoch": 0.5918808551525342, "grad_norm": 0.4761441946029663, "learning_rate": 3.896254381075416e-06, "loss": 0.2595, "step": 462 }, { "epoch": 0.5944431099367443, "grad_norm": 0.597135603427887, "learning_rate": 3.854943792738037e-06, "loss": 0.2866, "step": 464 }, { "epoch": 0.5970053647209544, "grad_norm": 0.6271767616271973, "learning_rate": 3.8137155323928526e-06, "loss": 0.2832, "step": 466 }, { "epoch": 0.5995676195051646, "grad_norm": 0.3820246458053589, "learning_rate": 3.7725725642960047e-06, "loss": 0.2548, "step": 468 }, { "epoch": 0.6021298742893747, "grad_norm": 0.5720183849334717, "learning_rate": 3.7315178465712364e-06, "loss": 0.2603, "step": 470 }, { "epoch": 0.6046921290735847, "grad_norm": 0.4225583076477051, "learning_rate": 3.690554330997215e-06, "loss": 0.2685, "step": 472 }, { "epoch": 0.6072543838577948, "grad_norm": 0.3530130386352539, "learning_rate": 3.6496849627952875e-06, "loss": 0.2723, "step": 474 }, { "epoch": 0.609816638642005, "grad_norm": 0.3795667290687561, "learning_rate": 3.6089126804177373e-06, "loss": 0.2691, "step": 476 }, { "epoch": 0.6123788934262151, "grad_norm": 0.43652230501174927, "learning_rate": 3.568240415336509e-06, "loss": 0.2838, "step": 478 }, { "epoch": 0.6149411482104252, "grad_norm": 0.4311392903327942, "learning_rate": 3.52767109183244e-06, "loss": 0.2847, "step": 480 }, { "epoch": 0.6175034029946352, "grad_norm": 0.42163416743278503, "learning_rate": 3.4872076267850015e-06, "loss": 0.2488, "step": 482 }, { "epoch": 0.6200656577788454, "grad_norm": 0.4223015308380127, "learning_rate": 3.4468529294625895e-06, "loss": 0.2621, "step": 484 }, { "epoch": 0.6226279125630555, "grad_norm": 0.4520999491214752, "learning_rate": 3.406609901313349e-06, "loss": 0.2543, "step": 486 }, { "epoch": 0.6251901673472656, "grad_norm": 0.5905027985572815, "learning_rate": 3.36648143575656e-06, "loss": 0.271, "step": 488 }, { "epoch": 0.6277524221314758, "grad_norm": 0.5310239195823669, "learning_rate": 3.326470417974604e-06, "loss": 0.2794, "step": 490 }, { "epoch": 0.6303146769156858, "grad_norm": 0.43746617436408997, "learning_rate": 3.2865797247055354e-06, "loss": 0.2716, "step": 492 }, { "epoch": 0.6328769316998959, "grad_norm": 0.4661629796028137, "learning_rate": 3.2468122240362287e-06, "loss": 0.243, "step": 494 }, { "epoch": 0.635439186484106, "grad_norm": 0.44793224334716797, "learning_rate": 3.2071707751961838e-06, "loss": 0.2808, "step": 496 }, { "epoch": 0.6380014412683161, "grad_norm": 0.5625908970832825, "learning_rate": 3.1676582283519454e-06, "loss": 0.265, "step": 498 }, { "epoch": 0.6405636960525263, "grad_norm": 0.44215095043182373, "learning_rate": 3.1282774244021717e-06, "loss": 0.2858, "step": 500 }, { "epoch": 0.6405636960525263, "eval_loss": 0.2639869451522827, "eval_runtime": 270.7894, "eval_samples_per_second": 19.421, "eval_steps_per_second": 2.43, "step": 500 }, { "epoch": 0.6431259508367363, "grad_norm": 0.47866004705429077, "learning_rate": 3.089031194773392e-06, "loss": 0.2879, "step": 502 }, { "epoch": 0.6456882056209464, "grad_norm": 0.5291287302970886, "learning_rate": 3.049922361216422e-06, "loss": 0.2501, "step": 504 }, { "epoch": 0.6482504604051565, "grad_norm": 0.4798702895641327, "learning_rate": 3.0109537356034856e-06, "loss": 0.2691, "step": 506 }, { "epoch": 0.6508127151893667, "grad_norm": 0.7165606617927551, "learning_rate": 2.9721281197260427e-06, "loss": 0.3519, "step": 508 }, { "epoch": 0.6533749699735768, "grad_norm": 0.6769598126411438, "learning_rate": 2.9334483050933506e-06, "loss": 0.281, "step": 510 }, { "epoch": 0.6559372247577868, "grad_norm": 0.47096380591392517, "learning_rate": 2.894917072731753e-06, "loss": 0.2677, "step": 512 }, { "epoch": 0.658499479541997, "grad_norm": 0.6711763739585876, "learning_rate": 2.8565371929847286e-06, "loss": 0.2707, "step": 514 }, { "epoch": 0.6610617343262071, "grad_norm": 0.44064444303512573, "learning_rate": 2.81831142531371e-06, "loss": 0.2654, "step": 516 }, { "epoch": 0.6636239891104172, "grad_norm": 0.42236313223838806, "learning_rate": 2.780242518099675e-06, "loss": 0.2601, "step": 518 }, { "epoch": 0.6661862438946272, "grad_norm": 0.4029591381549835, "learning_rate": 2.7423332084455543e-06, "loss": 0.2648, "step": 520 }, { "epoch": 0.6687484986788373, "grad_norm": 0.47852271795272827, "learning_rate": 2.704586221979422e-06, "loss": 0.2744, "step": 522 }, { "epoch": 0.6713107534630475, "grad_norm": 0.44856366515159607, "learning_rate": 2.667004272658541e-06, "loss": 0.2499, "step": 524 }, { "epoch": 0.6738730082472576, "grad_norm": 0.4645158648490906, "learning_rate": 2.629590062574221e-06, "loss": 0.2716, "step": 526 }, { "epoch": 0.6764352630314677, "grad_norm": 0.5160189867019653, "learning_rate": 2.592346281757552e-06, "loss": 0.2361, "step": 528 }, { "epoch": 0.6789975178156777, "grad_norm": 0.3944529891014099, "learning_rate": 2.5552756079859904e-06, "loss": 0.2476, "step": 530 }, { "epoch": 0.6815597725998879, "grad_norm": 0.5633410811424255, "learning_rate": 2.5183807065908296e-06, "loss": 0.2287, "step": 532 }, { "epoch": 0.684122027384098, "grad_norm": 0.3865067958831787, "learning_rate": 2.4816642302655634e-06, "loss": 0.2644, "step": 534 }, { "epoch": 0.6866842821683081, "grad_norm": 0.5245662331581116, "learning_rate": 2.445128818875166e-06, "loss": 0.2354, "step": 536 }, { "epoch": 0.6892465369525183, "grad_norm": 0.4881504774093628, "learning_rate": 2.408777099266291e-06, "loss": 0.2779, "step": 538 }, { "epoch": 0.6918087917367283, "grad_norm": 0.5840505957603455, "learning_rate": 2.3726116850783987e-06, "loss": 0.2742, "step": 540 }, { "epoch": 0.6943710465209384, "grad_norm": 0.4902634918689728, "learning_rate": 2.3366351765558437e-06, "loss": 0.2818, "step": 542 }, { "epoch": 0.6969333013051485, "grad_norm": 0.4141348600387573, "learning_rate": 2.3008501603609147e-06, "loss": 0.2542, "step": 544 }, { "epoch": 0.6994955560893586, "grad_norm": 0.3754000663757324, "learning_rate": 2.265259209387867e-06, "loss": 0.2664, "step": 546 }, { "epoch": 0.7020578108735688, "grad_norm": 0.6529264450073242, "learning_rate": 2.229864882577921e-06, "loss": 0.2678, "step": 548 }, { "epoch": 0.7046200656577788, "grad_norm": 0.3764033615589142, "learning_rate": 2.194669724735296e-06, "loss": 0.2668, "step": 550 }, { "epoch": 0.7071823204419889, "grad_norm": 0.3769323229789734, "learning_rate": 2.159676266344222e-06, "loss": 0.2663, "step": 552 }, { "epoch": 0.709744575226199, "grad_norm": 0.3979746103286743, "learning_rate": 2.124887023387017e-06, "loss": 0.2666, "step": 554 }, { "epoch": 0.7123068300104092, "grad_norm": 0.4987868070602417, "learning_rate": 2.0903044971631854e-06, "loss": 0.2292, "step": 556 }, { "epoch": 0.7148690847946193, "grad_norm": 0.6058522462844849, "learning_rate": 2.055931174109579e-06, "loss": 0.2354, "step": 558 }, { "epoch": 0.7174313395788293, "grad_norm": 0.5615466237068176, "learning_rate": 2.02176952562162e-06, "loss": 0.2557, "step": 560 }, { "epoch": 0.7199935943630394, "grad_norm": 0.5051982998847961, "learning_rate": 1.987822007875617e-06, "loss": 0.2706, "step": 562 }, { "epoch": 0.7225558491472496, "grad_norm": 0.571441650390625, "learning_rate": 1.954091061652172e-06, "loss": 0.2815, "step": 564 }, { "epoch": 0.7251181039314597, "grad_norm": 0.5101485252380371, "learning_rate": 1.920579112160685e-06, "loss": 0.2314, "step": 566 }, { "epoch": 0.7276803587156698, "grad_norm": 0.4810335040092468, "learning_rate": 1.8872885688649879e-06, "loss": 0.2812, "step": 568 }, { "epoch": 0.7302426134998798, "grad_norm": 0.49377724528312683, "learning_rate": 1.854221825310103e-06, "loss": 0.2656, "step": 570 }, { "epoch": 0.73280486828409, "grad_norm": 0.5363904237747192, "learning_rate": 1.8213812589501611e-06, "loss": 0.265, "step": 572 }, { "epoch": 0.7353671230683001, "grad_norm": 0.5577176213264465, "learning_rate": 1.78876923097745e-06, "loss": 0.2652, "step": 574 }, { "epoch": 0.7379293778525102, "grad_norm": 0.44135797023773193, "learning_rate": 1.7563880861526656e-06, "loss": 0.2748, "step": 576 }, { "epoch": 0.7404916326367204, "grad_norm": 0.41491812467575073, "learning_rate": 1.7242401526363095e-06, "loss": 0.2847, "step": 578 }, { "epoch": 0.7430538874209304, "grad_norm": 0.4843028783798218, "learning_rate": 1.692327741821312e-06, "loss": 0.2792, "step": 580 }, { "epoch": 0.7456161422051405, "grad_norm": 0.5842957496643066, "learning_rate": 1.6606531481668364e-06, "loss": 0.2784, "step": 582 }, { "epoch": 0.7481783969893506, "grad_norm": 0.572831928730011, "learning_rate": 1.6292186490333172e-06, "loss": 0.2862, "step": 584 }, { "epoch": 0.7507406517735608, "grad_norm": 0.5212300419807434, "learning_rate": 1.5980265045187139e-06, "loss": 0.2637, "step": 586 }, { "epoch": 0.7533029065577709, "grad_norm": 0.5278065800666809, "learning_rate": 1.567078957296016e-06, "loss": 0.2617, "step": 588 }, { "epoch": 0.7558651613419809, "grad_norm": 0.5063283443450928, "learning_rate": 1.5363782324520033e-06, "loss": 0.2569, "step": 590 }, { "epoch": 0.758427416126191, "grad_norm": 0.40898391604423523, "learning_rate": 1.5059265373272574e-06, "loss": 0.2558, "step": 592 }, { "epoch": 0.7609896709104012, "grad_norm": 0.5030636191368103, "learning_rate": 1.475726061357463e-06, "loss": 0.2547, "step": 594 }, { "epoch": 0.7635519256946113, "grad_norm": 0.5822692513465881, "learning_rate": 1.4457789759159813e-06, "loss": 0.2266, "step": 596 }, { "epoch": 0.7661141804788214, "grad_norm": 0.5503767132759094, "learning_rate": 1.4160874341577447e-06, "loss": 0.269, "step": 598 }, { "epoch": 0.7686764352630314, "grad_norm": 0.4649931788444519, "learning_rate": 1.3866535708644335e-06, "loss": 0.2536, "step": 600 }, { "epoch": 0.7712386900472415, "grad_norm": 0.6687978506088257, "learning_rate": 1.3574795022910014e-06, "loss": 0.3012, "step": 602 }, { "epoch": 0.7738009448314517, "grad_norm": 0.5331063866615295, "learning_rate": 1.3285673260135073e-06, "loss": 0.2453, "step": 604 }, { "epoch": 0.7763631996156618, "grad_norm": 0.46101680397987366, "learning_rate": 1.2999191207783129e-06, "loss": 0.2308, "step": 606 }, { "epoch": 0.7789254543998719, "grad_norm": 0.4032719135284424, "learning_rate": 1.2715369463526173e-06, "loss": 0.2534, "step": 608 }, { "epoch": 0.781487709184082, "grad_norm": 0.7435618042945862, "learning_rate": 1.2434228433763657e-06, "loss": 0.2331, "step": 610 }, { "epoch": 0.7840499639682921, "grad_norm": 0.6071492433547974, "learning_rate": 1.215578833215526e-06, "loss": 0.2695, "step": 612 }, { "epoch": 0.7866122187525022, "grad_norm": 0.4534173011779785, "learning_rate": 1.1880069178167586e-06, "loss": 0.2654, "step": 614 }, { "epoch": 0.7891744735367123, "grad_norm": 0.48930707573890686, "learning_rate": 1.1607090795634802e-06, "loss": 0.2597, "step": 616 }, { "epoch": 0.7917367283209225, "grad_norm": 0.43963509798049927, "learning_rate": 1.133687281133331e-06, "loss": 0.2454, "step": 618 }, { "epoch": 0.7942989831051325, "grad_norm": 0.45418596267700195, "learning_rate": 1.1069434653570633e-06, "loss": 0.2541, "step": 620 }, { "epoch": 0.7968612378893426, "grad_norm": 0.41048523783683777, "learning_rate": 1.0804795550788473e-06, "loss": 0.2743, "step": 622 }, { "epoch": 0.7994234926735527, "grad_norm": 0.516132116317749, "learning_rate": 1.0542974530180327e-06, "loss": 0.2736, "step": 624 }, { "epoch": 0.8019857474577629, "grad_norm": 0.412601113319397, "learning_rate": 1.0283990416323336e-06, "loss": 0.2503, "step": 626 }, { "epoch": 0.804548002241973, "grad_norm": 0.5029380917549133, "learning_rate": 1.0027861829824953e-06, "loss": 0.232, "step": 628 }, { "epoch": 0.807110257026183, "grad_norm": 0.4999438226222992, "learning_rate": 9.774607185984004e-07, "loss": 0.2549, "step": 630 }, { "epoch": 0.8096725118103931, "grad_norm": 0.44878801703453064, "learning_rate": 9.524244693466773e-07, "loss": 0.2355, "step": 632 }, { "epoch": 0.8122347665946033, "grad_norm": 0.4290701150894165, "learning_rate": 9.276792352997782e-07, "loss": 0.2579, "step": 634 }, { "epoch": 0.8147970213788134, "grad_norm": 0.5716743469238281, "learning_rate": 9.032267956065516e-07, "loss": 0.2833, "step": 636 }, { "epoch": 0.8173592761630235, "grad_norm": 0.4765143394470215, "learning_rate": 8.790689083643328e-07, "loss": 0.2473, "step": 638 }, { "epoch": 0.8199215309472335, "grad_norm": 0.4390144646167755, "learning_rate": 8.552073104925296e-07, "loss": 0.2711, "step": 640 }, { "epoch": 0.8224837857314437, "grad_norm": 0.5272576808929443, "learning_rate": 8.316437176077491e-07, "loss": 0.2749, "step": 642 }, { "epoch": 0.8250460405156538, "grad_norm": 0.44547039270401, "learning_rate": 8.083798239004408e-07, "loss": 0.259, "step": 644 }, { "epoch": 0.8276082952998639, "grad_norm": 0.578179121017456, "learning_rate": 7.854173020130906e-07, "loss": 0.2946, "step": 646 }, { "epoch": 0.830170550084074, "grad_norm": 0.4996013641357422, "learning_rate": 7.627578029199562e-07, "loss": 0.2573, "step": 648 }, { "epoch": 0.832732804868284, "grad_norm": 0.5044499039649963, "learning_rate": 7.404029558083653e-07, "loss": 0.2461, "step": 650 }, { "epoch": 0.8352950596524942, "grad_norm": 0.42843055725097656, "learning_rate": 7.183543679615834e-07, "loss": 0.2578, "step": 652 }, { "epoch": 0.8378573144367043, "grad_norm": 0.5041942596435547, "learning_rate": 6.966136246432492e-07, "loss": 0.2412, "step": 654 }, { "epoch": 0.8404195692209144, "grad_norm": 0.454973042011261, "learning_rate": 6.751822889833926e-07, "loss": 0.265, "step": 656 }, { "epoch": 0.8429818240051246, "grad_norm": 0.4820737838745117, "learning_rate": 6.540619018660555e-07, "loss": 0.226, "step": 658 }, { "epoch": 0.8455440787893346, "grad_norm": 0.5445938110351562, "learning_rate": 6.332539818184985e-07, "loss": 0.2501, "step": 660 }, { "epoch": 0.8481063335735447, "grad_norm": 0.5699609518051147, "learning_rate": 6.127600249020216e-07, "loss": 0.2747, "step": 662 }, { "epoch": 0.8506685883577548, "grad_norm": 0.46571552753448486, "learning_rate": 5.925815046044026e-07, "loss": 0.2612, "step": 664 }, { "epoch": 0.853230843141965, "grad_norm": 0.47914472222328186, "learning_rate": 5.727198717339511e-07, "loss": 0.2574, "step": 666 }, { "epoch": 0.8557930979261751, "grad_norm": 0.40852856636047363, "learning_rate": 5.531765543152002e-07, "loss": 0.2734, "step": 668 }, { "epoch": 0.8583553527103851, "grad_norm": 0.3702560067176819, "learning_rate": 5.33952957486234e-07, "loss": 0.2539, "step": 670 }, { "epoch": 0.8609176074945952, "grad_norm": 0.5180298686027527, "learning_rate": 5.150504633976572e-07, "loss": 0.3682, "step": 672 }, { "epoch": 0.8634798622788054, "grad_norm": 0.7016831040382385, "learning_rate": 4.964704311132224e-07, "loss": 0.2265, "step": 674 }, { "epoch": 0.8660421170630155, "grad_norm": 0.5376434922218323, "learning_rate": 4.782141965121129e-07, "loss": 0.2676, "step": 676 }, { "epoch": 0.8686043718472256, "grad_norm": 0.47063949704170227, "learning_rate": 4.602830721928997e-07, "loss": 0.2606, "step": 678 }, { "epoch": 0.8711666266314356, "grad_norm": 0.4991367757320404, "learning_rate": 4.4267834737916295e-07, "loss": 0.2414, "step": 680 }, { "epoch": 0.8737288814156458, "grad_norm": 0.4373914301395416, "learning_rate": 4.2540128782679934e-07, "loss": 0.2287, "step": 682 }, { "epoch": 0.8762911361998559, "grad_norm": 0.39528214931488037, "learning_rate": 4.0845313573301736e-07, "loss": 0.2404, "step": 684 }, { "epoch": 0.878853390984066, "grad_norm": 0.5945621132850647, "learning_rate": 3.9183510964702463e-07, "loss": 0.2719, "step": 686 }, { "epoch": 0.8814156457682761, "grad_norm": 0.6032932996749878, "learning_rate": 3.755484043824131e-07, "loss": 0.2608, "step": 688 }, { "epoch": 0.8839779005524862, "grad_norm": 0.49754688143730164, "learning_rate": 3.595941909312595e-07, "loss": 0.2852, "step": 690 }, { "epoch": 0.8865401553366963, "grad_norm": 0.49544405937194824, "learning_rate": 3.439736163799251e-07, "loss": 0.2693, "step": 692 }, { "epoch": 0.8891024101209064, "grad_norm": 0.4462824761867523, "learning_rate": 3.2868780382658895e-07, "loss": 0.2443, "step": 694 }, { "epoch": 0.8916646649051165, "grad_norm": 0.4302297532558441, "learning_rate": 3.1373785230049356e-07, "loss": 0.2515, "step": 696 }, { "epoch": 0.8942269196893267, "grad_norm": 0.4883180856704712, "learning_rate": 2.991248366829291e-07, "loss": 0.2757, "step": 698 }, { "epoch": 0.8967891744735367, "grad_norm": 0.7474163174629211, "learning_rate": 2.848498076299483e-07, "loss": 0.2963, "step": 700 }, { "epoch": 0.8993514292577468, "grad_norm": 0.4648323059082031, "learning_rate": 2.7091379149682683e-07, "loss": 0.2361, "step": 702 }, { "epoch": 0.9019136840419569, "grad_norm": 0.4341067373752594, "learning_rate": 2.573177902642726e-07, "loss": 0.2555, "step": 704 }, { "epoch": 0.9044759388261671, "grad_norm": 0.47577670216560364, "learning_rate": 2.440627814663804e-07, "loss": 0.2772, "step": 706 }, { "epoch": 0.9070381936103772, "grad_norm": 0.47802722454071045, "learning_rate": 2.3114971812034981e-07, "loss": 0.2544, "step": 708 }, { "epoch": 0.9096004483945872, "grad_norm": 0.4585348665714264, "learning_rate": 2.1857952865796616e-07, "loss": 0.2424, "step": 710 }, { "epoch": 0.9121627031787973, "grad_norm": 0.4453139305114746, "learning_rate": 2.0635311685884675e-07, "loss": 0.2424, "step": 712 }, { "epoch": 0.9147249579630075, "grad_norm": 0.4780106544494629, "learning_rate": 1.9447136178545766e-07, "loss": 0.2475, "step": 714 }, { "epoch": 0.9172872127472176, "grad_norm": 0.47332102060317993, "learning_rate": 1.8293511771991624e-07, "loss": 0.2414, "step": 716 }, { "epoch": 0.9198494675314277, "grad_norm": 0.5608975887298584, "learning_rate": 1.7174521410256162e-07, "loss": 0.2733, "step": 718 }, { "epoch": 0.9224117223156377, "grad_norm": 0.611322283744812, "learning_rate": 1.6090245547232707e-07, "loss": 0.2195, "step": 720 }, { "epoch": 0.9249739770998479, "grad_norm": 0.37321174144744873, "learning_rate": 1.5040762140888843e-07, "loss": 0.2496, "step": 722 }, { "epoch": 0.927536231884058, "grad_norm": 0.394593745470047, "learning_rate": 1.402614664766172e-07, "loss": 0.2521, "step": 724 }, { "epoch": 0.9300984866682681, "grad_norm": 0.5954830646514893, "learning_rate": 1.3046472017032685e-07, "loss": 0.2742, "step": 726 }, { "epoch": 0.9326607414524781, "grad_norm": 0.3724110722541809, "learning_rate": 1.210180868628219e-07, "loss": 0.2359, "step": 728 }, { "epoch": 0.9352229962366883, "grad_norm": 0.42592036724090576, "learning_rate": 1.1192224575425848e-07, "loss": 0.2428, "step": 730 }, { "epoch": 0.9377852510208984, "grad_norm": 0.481985479593277, "learning_rate": 1.0317785082330555e-07, "loss": 0.2982, "step": 732 }, { "epoch": 0.9403475058051085, "grad_norm": 0.5069997906684875, "learning_rate": 9.478553078013042e-08, "loss": 0.2659, "step": 734 }, { "epoch": 0.9429097605893186, "grad_norm": 0.41395503282546997, "learning_rate": 8.674588902118919e-08, "loss": 0.2429, "step": 736 }, { "epoch": 0.9454720153735287, "grad_norm": 0.5499728322029114, "learning_rate": 7.905950358584768e-08, "loss": 0.2665, "step": 738 }, { "epoch": 0.9480342701577388, "grad_norm": 0.5084072351455688, "learning_rate": 7.172692711482022e-08, "loss": 0.2768, "step": 740 }, { "epoch": 0.9505965249419489, "grad_norm": 0.6337217688560486, "learning_rate": 6.474868681043578e-08, "loss": 0.264, "step": 742 }, { "epoch": 0.953158779726159, "grad_norm": 0.47787654399871826, "learning_rate": 5.8125284398730666e-08, "loss": 0.2686, "step": 744 }, { "epoch": 0.9557210345103692, "grad_norm": 0.46294164657592773, "learning_rate": 5.185719609337836e-08, "loss": 0.2709, "step": 746 }, { "epoch": 0.9582832892945792, "grad_norm": 0.3983994126319885, "learning_rate": 4.5944872561448084e-08, "loss": 0.2542, "step": 748 }, { "epoch": 0.9608455440787893, "grad_norm": 0.5090007781982422, "learning_rate": 4.038873889100237e-08, "loss": 0.2613, "step": 750 }, { "epoch": 0.9634077988629994, "grad_norm": 0.6989894509315491, "learning_rate": 3.518919456053649e-08, "loss": 0.2394, "step": 752 }, { "epoch": 0.9659700536472096, "grad_norm": 0.5098798871040344, "learning_rate": 3.034661341025258e-08, "loss": 0.2581, "step": 754 }, { "epoch": 0.9685323084314197, "grad_norm": 0.49127092957496643, "learning_rate": 2.5861343615184997e-08, "loss": 0.2271, "step": 756 }, { "epoch": 0.9710945632156297, "grad_norm": 0.42872855067253113, "learning_rate": 2.173370766016314e-08, "loss": 0.2569, "step": 758 }, { "epoch": 0.9736568179998398, "grad_norm": 0.5179227590560913, "learning_rate": 1.7964002316628316e-08, "loss": 0.2569, "step": 760 }, { "epoch": 0.97621907278405, "grad_norm": 0.6316475868225098, "learning_rate": 1.4552498621295264e-08, "loss": 0.2667, "step": 762 }, { "epoch": 0.9787813275682601, "grad_norm": 0.4547966718673706, "learning_rate": 1.1499441856663296e-08, "loss": 0.2743, "step": 764 }, { "epoch": 0.9813435823524702, "grad_norm": 0.3960263431072235, "learning_rate": 8.805051533384846e-09, "loss": 0.2321, "step": 766 }, { "epoch": 0.9839058371366802, "grad_norm": 0.4860779345035553, "learning_rate": 6.469521374477539e-09, "loss": 0.2375, "step": 768 }, { "epoch": 0.9864680919208904, "grad_norm": 0.4424307346343994, "learning_rate": 4.493019301401447e-09, "loss": 0.2373, "step": 770 }, { "epoch": 0.9890303467051005, "grad_norm": 0.6010534167289734, "learning_rate": 2.875687421980966e-09, "loss": 0.2703, "step": 772 }, { "epoch": 0.9915926014893106, "grad_norm": 0.4469178318977356, "learning_rate": 1.6176420201902132e-09, "loss": 0.2426, "step": 774 }, { "epoch": 0.9941548562735207, "grad_norm": 0.3937220573425293, "learning_rate": 7.189735477913795e-10, "loss": 0.2662, "step": 776 }, { "epoch": 0.9967171110577308, "grad_norm": 0.44991588592529297, "learning_rate": 1.797466178327101e-10, "loss": 0.242, "step": 778 }, { "epoch": 0.9992793658419409, "grad_norm": 0.7540487051010132, "learning_rate": 0.0, "loss": 0.308, "step": 780 }, { "epoch": 0.9992793658419409, "step": 780, "total_flos": 2.0573294793064448e+18, "train_loss": 0.29625688539101525, "train_runtime": 10334.3449, "train_samples_per_second": 9.668, "train_steps_per_second": 0.075 } ], "logging_steps": 2, "max_steps": 780, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 400, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.0573294793064448e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }