{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.2499194847020934, "eval_steps": 500, "global_step": 582, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00042941492216854533, "grad_norm": 0.11985349655151367, "learning_rate": 2e-05, "loss": 1.3519, "step": 1 }, { "epoch": 0.0008588298443370907, "grad_norm": 0.10720210522413254, "learning_rate": 4e-05, "loss": 1.226, "step": 2 }, { "epoch": 0.0012882447665056361, "grad_norm": 0.13469132781028748, "learning_rate": 6e-05, "loss": 1.4841, "step": 3 }, { "epoch": 0.0017176596886741813, "grad_norm": 0.1580151468515396, "learning_rate": 8e-05, "loss": 1.6834, "step": 4 }, { "epoch": 0.0021470746108427268, "grad_norm": 0.1583908349275589, "learning_rate": 0.0001, "loss": 1.5718, "step": 5 }, { "epoch": 0.0025764895330112722, "grad_norm": 0.1486492156982422, "learning_rate": 0.00012, "loss": 1.4708, "step": 6 }, { "epoch": 0.0030059044551798177, "grad_norm": 0.15314875543117523, "learning_rate": 0.00014, "loss": 1.3917, "step": 7 }, { "epoch": 0.0034353193773483627, "grad_norm": 0.1677706390619278, "learning_rate": 0.00016, "loss": 1.4053, "step": 8 }, { "epoch": 0.003864734299516908, "grad_norm": 0.17734883725643158, "learning_rate": 0.00018, "loss": 1.4917, "step": 9 }, { "epoch": 0.0042941492216854536, "grad_norm": 0.15920934081077576, "learning_rate": 0.0002, "loss": 1.369, "step": 10 }, { "epoch": 0.0047235641438539986, "grad_norm": 0.14339257776737213, "learning_rate": 0.00019999990815768547, "loss": 1.5381, "step": 11 }, { "epoch": 0.0051529790660225444, "grad_norm": 0.18288248777389526, "learning_rate": 0.00019999963263091051, "loss": 1.6338, "step": 12 }, { "epoch": 0.0055823939881910895, "grad_norm": 0.13021744787693024, "learning_rate": 0.00019999917342018129, "loss": 1.047, "step": 13 }, { "epoch": 0.006011808910359635, "grad_norm": 0.14156687259674072, "learning_rate": 0.00019999853052634123, "loss": 1.2525, "step": 14 }, { "epoch": 0.00644122383252818, "grad_norm": 0.1400100141763687, "learning_rate": 0.0001999977039505713, "loss": 1.2294, "step": 15 }, { "epoch": 0.006870638754696725, "grad_norm": 0.16356173157691956, "learning_rate": 0.00019999669369438975, "loss": 1.3664, "step": 16 }, { "epoch": 0.007300053676865271, "grad_norm": 0.17197328805923462, "learning_rate": 0.00019999549975965227, "loss": 1.3922, "step": 17 }, { "epoch": 0.007729468599033816, "grad_norm": 0.1663227528333664, "learning_rate": 0.00019999412214855196, "loss": 1.1996, "step": 18 }, { "epoch": 0.008158883521202361, "grad_norm": 0.1358145773410797, "learning_rate": 0.00019999256086361924, "loss": 0.9447, "step": 19 }, { "epoch": 0.008588298443370907, "grad_norm": 0.14678195118904114, "learning_rate": 0.000199990815907722, "loss": 1.3465, "step": 20 }, { "epoch": 0.009017713365539453, "grad_norm": 0.14393630623817444, "learning_rate": 0.00019998888728406543, "loss": 1.0345, "step": 21 }, { "epoch": 0.009447128287707997, "grad_norm": 0.1871100217103958, "learning_rate": 0.00019998677499619206, "loss": 1.1669, "step": 22 }, { "epoch": 0.009876543209876543, "grad_norm": 0.12790684401988983, "learning_rate": 0.00019998447904798195, "loss": 0.9759, "step": 23 }, { "epoch": 0.010305958132045089, "grad_norm": 0.1504671722650528, "learning_rate": 0.00019998199944365236, "loss": 1.3362, "step": 24 }, { "epoch": 0.010735373054213635, "grad_norm": 0.14933271706104279, "learning_rate": 0.00019997933618775787, "loss": 1.2592, "step": 25 }, { "epoch": 0.011164787976382179, "grad_norm": 0.1384006291627884, "learning_rate": 0.00019997648928519055, "loss": 1.0959, "step": 26 }, { "epoch": 0.011594202898550725, "grad_norm": 0.12688492238521576, "learning_rate": 0.00019997345874117972, "loss": 1.1937, "step": 27 }, { "epoch": 0.01202361782071927, "grad_norm": 0.14218132197856903, "learning_rate": 0.00019997024456129195, "loss": 1.3615, "step": 28 }, { "epoch": 0.012453032742887815, "grad_norm": 0.16093435883522034, "learning_rate": 0.0001999668467514313, "loss": 1.0413, "step": 29 }, { "epoch": 0.01288244766505636, "grad_norm": 0.14973227679729462, "learning_rate": 0.00019996326531783898, "loss": 1.0408, "step": 30 }, { "epoch": 0.013311862587224907, "grad_norm": 0.12071070075035095, "learning_rate": 0.00019995950026709353, "loss": 1.0642, "step": 31 }, { "epoch": 0.01374127750939345, "grad_norm": 0.1471056491136551, "learning_rate": 0.00019995555160611073, "loss": 1.2353, "step": 32 }, { "epoch": 0.014170692431561997, "grad_norm": 0.14476723968982697, "learning_rate": 0.00019995141934214372, "loss": 1.1288, "step": 33 }, { "epoch": 0.014600107353730542, "grad_norm": 0.1581466645002365, "learning_rate": 0.0001999471034827828, "loss": 1.2426, "step": 34 }, { "epoch": 0.015029522275899088, "grad_norm": 0.15047816932201385, "learning_rate": 0.0001999426040359556, "loss": 1.044, "step": 35 }, { "epoch": 0.015458937198067632, "grad_norm": 0.13698647916316986, "learning_rate": 0.00019993792100992682, "loss": 1.0759, "step": 36 }, { "epoch": 0.015888352120236177, "grad_norm": 0.16587479412555695, "learning_rate": 0.0001999330544132985, "loss": 1.1251, "step": 37 }, { "epoch": 0.016317767042404722, "grad_norm": 0.14546941220760345, "learning_rate": 0.00019992800425500988, "loss": 1.0911, "step": 38 }, { "epoch": 0.01674718196457327, "grad_norm": 0.137843519449234, "learning_rate": 0.00019992277054433727, "loss": 1.1183, "step": 39 }, { "epoch": 0.017176596886741814, "grad_norm": 0.14544665813446045, "learning_rate": 0.00019991735329089416, "loss": 1.1161, "step": 40 }, { "epoch": 0.01760601180891036, "grad_norm": 0.16081300377845764, "learning_rate": 0.00019991175250463127, "loss": 1.2546, "step": 41 }, { "epoch": 0.018035426731078906, "grad_norm": 0.15027405321598053, "learning_rate": 0.0001999059681958364, "loss": 1.0595, "step": 42 }, { "epoch": 0.018464841653247452, "grad_norm": 0.14687219262123108, "learning_rate": 0.00019990000037513437, "loss": 1.2931, "step": 43 }, { "epoch": 0.018894256575415994, "grad_norm": 0.1763402223587036, "learning_rate": 0.0001998938490534872, "loss": 1.4514, "step": 44 }, { "epoch": 0.01932367149758454, "grad_norm": 0.17130351066589355, "learning_rate": 0.00019988751424219388, "loss": 1.3405, "step": 45 }, { "epoch": 0.019753086419753086, "grad_norm": 0.14724081754684448, "learning_rate": 0.00019988099595289054, "loss": 0.9397, "step": 46 }, { "epoch": 0.020182501341921632, "grad_norm": 0.14184130728244781, "learning_rate": 0.00019987429419755022, "loss": 1.1355, "step": 47 }, { "epoch": 0.020611916264090178, "grad_norm": 0.1490873247385025, "learning_rate": 0.00019986740898848306, "loss": 1.1162, "step": 48 }, { "epoch": 0.021041331186258724, "grad_norm": 0.1417856514453888, "learning_rate": 0.00019986034033833613, "loss": 1.0778, "step": 49 }, { "epoch": 0.02147074610842727, "grad_norm": 0.14795203506946564, "learning_rate": 0.00019985308826009338, "loss": 1.0645, "step": 50 }, { "epoch": 0.021900161030595812, "grad_norm": 0.18487784266471863, "learning_rate": 0.00019984565276707583, "loss": 1.0634, "step": 51 }, { "epoch": 0.022329575952764358, "grad_norm": 0.15679900348186493, "learning_rate": 0.00019983803387294135, "loss": 1.2826, "step": 52 }, { "epoch": 0.022758990874932904, "grad_norm": 0.1397986263036728, "learning_rate": 0.00019983023159168465, "loss": 1.1745, "step": 53 }, { "epoch": 0.02318840579710145, "grad_norm": 0.13861894607543945, "learning_rate": 0.00019982224593763733, "loss": 0.9461, "step": 54 }, { "epoch": 0.023617820719269995, "grad_norm": 0.1317225992679596, "learning_rate": 0.00019981407692546777, "loss": 0.9315, "step": 55 }, { "epoch": 0.02404723564143854, "grad_norm": 0.1468420773744583, "learning_rate": 0.00019980572457018123, "loss": 1.2609, "step": 56 }, { "epoch": 0.024476650563607084, "grad_norm": 0.14463701844215393, "learning_rate": 0.0001997971888871197, "loss": 1.1092, "step": 57 }, { "epoch": 0.02490606548577563, "grad_norm": 0.14022503793239594, "learning_rate": 0.0001997884698919619, "loss": 0.9528, "step": 58 }, { "epoch": 0.025335480407944175, "grad_norm": 0.14202667772769928, "learning_rate": 0.00019977956760072334, "loss": 1.1813, "step": 59 }, { "epoch": 0.02576489533011272, "grad_norm": 0.1546659618616104, "learning_rate": 0.00019977048202975608, "loss": 1.348, "step": 60 }, { "epoch": 0.026194310252281267, "grad_norm": 0.1386214941740036, "learning_rate": 0.00019976121319574896, "loss": 1.1747, "step": 61 }, { "epoch": 0.026623725174449813, "grad_norm": 0.1513381004333496, "learning_rate": 0.00019975176111572743, "loss": 1.0845, "step": 62 }, { "epoch": 0.02705314009661836, "grad_norm": 0.1494988650083542, "learning_rate": 0.00019974212580705345, "loss": 1.1647, "step": 63 }, { "epoch": 0.0274825550187869, "grad_norm": 0.16360332071781158, "learning_rate": 0.0001997323072874256, "loss": 1.0523, "step": 64 }, { "epoch": 0.027911969940955447, "grad_norm": 0.17121770977973938, "learning_rate": 0.00019972230557487906, "loss": 1.3142, "step": 65 }, { "epoch": 0.028341384863123993, "grad_norm": 0.15700650215148926, "learning_rate": 0.0001997121206877854, "loss": 1.0519, "step": 66 }, { "epoch": 0.02877079978529254, "grad_norm": 0.15610812604427338, "learning_rate": 0.00019970175264485266, "loss": 1.2066, "step": 67 }, { "epoch": 0.029200214707461085, "grad_norm": 0.13125644624233246, "learning_rate": 0.00019969120146512542, "loss": 0.9134, "step": 68 }, { "epoch": 0.02962962962962963, "grad_norm": 0.16931581497192383, "learning_rate": 0.00019968046716798449, "loss": 1.0536, "step": 69 }, { "epoch": 0.030059044551798177, "grad_norm": 0.14404140412807465, "learning_rate": 0.00019966954977314715, "loss": 1.1876, "step": 70 }, { "epoch": 0.03048845947396672, "grad_norm": 0.18353833258152008, "learning_rate": 0.000199658449300667, "loss": 1.1881, "step": 71 }, { "epoch": 0.030917874396135265, "grad_norm": 0.1493215709924698, "learning_rate": 0.00019964716577093388, "loss": 1.2907, "step": 72 }, { "epoch": 0.031347289318303814, "grad_norm": 0.1731230616569519, "learning_rate": 0.0001996356992046739, "loss": 1.2771, "step": 73 }, { "epoch": 0.03177670424047235, "grad_norm": 0.15955105423927307, "learning_rate": 0.00019962404962294944, "loss": 1.1304, "step": 74 }, { "epoch": 0.0322061191626409, "grad_norm": 0.1388455629348755, "learning_rate": 0.00019961221704715886, "loss": 0.9874, "step": 75 }, { "epoch": 0.032635534084809445, "grad_norm": 0.16745209693908691, "learning_rate": 0.0001996002014990369, "loss": 1.1035, "step": 76 }, { "epoch": 0.03306494900697799, "grad_norm": 0.17726710438728333, "learning_rate": 0.00019958800300065425, "loss": 1.2322, "step": 77 }, { "epoch": 0.03349436392914654, "grad_norm": 0.16995428502559662, "learning_rate": 0.00019957562157441765, "loss": 1.2029, "step": 78 }, { "epoch": 0.03392377885131508, "grad_norm": 0.14299820363521576, "learning_rate": 0.00019956305724306986, "loss": 1.0119, "step": 79 }, { "epoch": 0.03435319377348363, "grad_norm": 0.15954792499542236, "learning_rate": 0.00019955031002968972, "loss": 1.127, "step": 80 }, { "epoch": 0.034782608695652174, "grad_norm": 0.166239395737648, "learning_rate": 0.00019953737995769179, "loss": 1.185, "step": 81 }, { "epoch": 0.03521202361782072, "grad_norm": 0.17462775111198425, "learning_rate": 0.0001995242670508267, "loss": 1.3376, "step": 82 }, { "epoch": 0.035641438539989266, "grad_norm": 0.16347193717956543, "learning_rate": 0.00019951097133318076, "loss": 1.1657, "step": 83 }, { "epoch": 0.03607085346215781, "grad_norm": 0.1850813329219818, "learning_rate": 0.00019949749282917626, "loss": 1.1724, "step": 84 }, { "epoch": 0.03650026838432636, "grad_norm": 0.16961267590522766, "learning_rate": 0.00019948383156357112, "loss": 1.1548, "step": 85 }, { "epoch": 0.036929683306494904, "grad_norm": 0.18874776363372803, "learning_rate": 0.0001994699875614589, "loss": 1.0729, "step": 86 }, { "epoch": 0.03735909822866344, "grad_norm": 0.17659211158752441, "learning_rate": 0.000199455960848269, "loss": 1.2371, "step": 87 }, { "epoch": 0.03778851315083199, "grad_norm": 0.16227173805236816, "learning_rate": 0.0001994417514497663, "loss": 1.0381, "step": 88 }, { "epoch": 0.038217928073000534, "grad_norm": 0.14537280797958374, "learning_rate": 0.0001994273593920513, "loss": 1.0392, "step": 89 }, { "epoch": 0.03864734299516908, "grad_norm": 0.1782526969909668, "learning_rate": 0.00019941278470155994, "loss": 1.1891, "step": 90 }, { "epoch": 0.039076757917337626, "grad_norm": 0.15369926393032074, "learning_rate": 0.00019939802740506375, "loss": 0.8279, "step": 91 }, { "epoch": 0.03950617283950617, "grad_norm": 0.1525738388299942, "learning_rate": 0.00019938308752966957, "loss": 1.1378, "step": 92 }, { "epoch": 0.03993558776167472, "grad_norm": 0.14440616965293884, "learning_rate": 0.0001993679651028197, "loss": 0.9707, "step": 93 }, { "epoch": 0.040365002683843264, "grad_norm": 0.1944921761751175, "learning_rate": 0.00019935266015229166, "loss": 1.2753, "step": 94 }, { "epoch": 0.04079441760601181, "grad_norm": 0.17704033851623535, "learning_rate": 0.00019933717270619833, "loss": 1.215, "step": 95 }, { "epoch": 0.041223832528180356, "grad_norm": 0.16801829636096954, "learning_rate": 0.00019932150279298777, "loss": 1.2177, "step": 96 }, { "epoch": 0.0416532474503489, "grad_norm": 0.14935865998268127, "learning_rate": 0.00019930565044144318, "loss": 1.0213, "step": 97 }, { "epoch": 0.04208266237251745, "grad_norm": 0.16046607494354248, "learning_rate": 0.0001992896156806829, "loss": 1.0529, "step": 98 }, { "epoch": 0.04251207729468599, "grad_norm": 0.16249270737171173, "learning_rate": 0.00019927339854016037, "loss": 1.0861, "step": 99 }, { "epoch": 0.04294149221685454, "grad_norm": 0.16730612516403198, "learning_rate": 0.0001992569990496639, "loss": 0.9681, "step": 100 }, { "epoch": 0.04337090713902308, "grad_norm": 0.17123740911483765, "learning_rate": 0.00019924041723931688, "loss": 0.9648, "step": 101 }, { "epoch": 0.043800322061191624, "grad_norm": 0.15978355705738068, "learning_rate": 0.00019922365313957752, "loss": 1.0962, "step": 102 }, { "epoch": 0.04422973698336017, "grad_norm": 0.18542608618736267, "learning_rate": 0.00019920670678123893, "loss": 1.1831, "step": 103 }, { "epoch": 0.044659151905528716, "grad_norm": 0.17981840670108795, "learning_rate": 0.00019918957819542893, "loss": 1.2029, "step": 104 }, { "epoch": 0.04508856682769726, "grad_norm": 0.16533541679382324, "learning_rate": 0.00019917226741361015, "loss": 1.2239, "step": 105 }, { "epoch": 0.04551798174986581, "grad_norm": 0.1770992875099182, "learning_rate": 0.0001991547744675798, "loss": 1.103, "step": 106 }, { "epoch": 0.04594739667203435, "grad_norm": 0.15934127569198608, "learning_rate": 0.00019913709938946972, "loss": 0.9117, "step": 107 }, { "epoch": 0.0463768115942029, "grad_norm": 0.1818443238735199, "learning_rate": 0.00019911924221174636, "loss": 1.149, "step": 108 }, { "epoch": 0.046806226516371445, "grad_norm": 0.17105095088481903, "learning_rate": 0.00019910120296721053, "loss": 1.3834, "step": 109 }, { "epoch": 0.04723564143853999, "grad_norm": 0.1493517905473709, "learning_rate": 0.00019908298168899765, "loss": 0.9976, "step": 110 }, { "epoch": 0.04766505636070854, "grad_norm": 0.17170068621635437, "learning_rate": 0.00019906457841057732, "loss": 1.0791, "step": 111 }, { "epoch": 0.04809447128287708, "grad_norm": 0.17287380993366241, "learning_rate": 0.00019904599316575357, "loss": 1.108, "step": 112 }, { "epoch": 0.04852388620504563, "grad_norm": 0.15946826338768005, "learning_rate": 0.00019902722598866466, "loss": 1.0462, "step": 113 }, { "epoch": 0.04895330112721417, "grad_norm": 0.18682260811328888, "learning_rate": 0.00019900827691378298, "loss": 1.0757, "step": 114 }, { "epoch": 0.04938271604938271, "grad_norm": 0.15951935946941376, "learning_rate": 0.00019898914597591506, "loss": 1.3103, "step": 115 }, { "epoch": 0.04981213097155126, "grad_norm": 0.16503126919269562, "learning_rate": 0.0001989698332102015, "loss": 1.1521, "step": 116 }, { "epoch": 0.050241545893719805, "grad_norm": 0.15713706612586975, "learning_rate": 0.0001989503386521169, "loss": 1.2906, "step": 117 }, { "epoch": 0.05067096081588835, "grad_norm": 0.1533653736114502, "learning_rate": 0.00019893066233746978, "loss": 1.0389, "step": 118 }, { "epoch": 0.0511003757380569, "grad_norm": 0.16496874392032623, "learning_rate": 0.0001989108043024025, "loss": 1.2676, "step": 119 }, { "epoch": 0.05152979066022544, "grad_norm": 0.14784802496433258, "learning_rate": 0.00019889076458339116, "loss": 0.9091, "step": 120 }, { "epoch": 0.05195920558239399, "grad_norm": 0.1391952782869339, "learning_rate": 0.00019887054321724565, "loss": 0.7391, "step": 121 }, { "epoch": 0.052388620504562534, "grad_norm": 0.16542598605155945, "learning_rate": 0.0001988501402411096, "loss": 1.26, "step": 122 }, { "epoch": 0.05281803542673108, "grad_norm": 0.1864759474992752, "learning_rate": 0.00019882955569246007, "loss": 1.1248, "step": 123 }, { "epoch": 0.053247450348899626, "grad_norm": 0.19127963483333588, "learning_rate": 0.00019880878960910772, "loss": 1.2209, "step": 124 }, { "epoch": 0.05367686527106817, "grad_norm": 0.18262384831905365, "learning_rate": 0.00019878784202919666, "loss": 1.2114, "step": 125 }, { "epoch": 0.05410628019323672, "grad_norm": 0.16955001652240753, "learning_rate": 0.0001987667129912044, "loss": 1.133, "step": 126 }, { "epoch": 0.05453569511540526, "grad_norm": 0.17882367968559265, "learning_rate": 0.00019874540253394168, "loss": 1.3044, "step": 127 }, { "epoch": 0.0549651100375738, "grad_norm": 0.20200395584106445, "learning_rate": 0.00019872391069655258, "loss": 1.1933, "step": 128 }, { "epoch": 0.05539452495974235, "grad_norm": 0.17120778560638428, "learning_rate": 0.00019870223751851428, "loss": 1.0102, "step": 129 }, { "epoch": 0.055823939881910895, "grad_norm": 0.19138963520526886, "learning_rate": 0.0001986803830396371, "loss": 1.4741, "step": 130 }, { "epoch": 0.05625335480407944, "grad_norm": 0.181193545460701, "learning_rate": 0.00019865834730006433, "loss": 1.1563, "step": 131 }, { "epoch": 0.056682769726247986, "grad_norm": 0.16531504690647125, "learning_rate": 0.00019863613034027224, "loss": 1.1427, "step": 132 }, { "epoch": 0.05711218464841653, "grad_norm": 0.1994440257549286, "learning_rate": 0.00019861373220106997, "loss": 1.3541, "step": 133 }, { "epoch": 0.05754159957058508, "grad_norm": 0.18033157289028168, "learning_rate": 0.0001985911529235995, "loss": 0.9477, "step": 134 }, { "epoch": 0.057971014492753624, "grad_norm": 0.17404161393642426, "learning_rate": 0.00019856839254933544, "loss": 1.1277, "step": 135 }, { "epoch": 0.05840042941492217, "grad_norm": 0.17261551320552826, "learning_rate": 0.00019854545112008514, "loss": 1.2953, "step": 136 }, { "epoch": 0.058829844337090716, "grad_norm": 0.1669391393661499, "learning_rate": 0.00019852232867798844, "loss": 1.2108, "step": 137 }, { "epoch": 0.05925925925925926, "grad_norm": 0.1854487657546997, "learning_rate": 0.00019849902526551772, "loss": 1.5342, "step": 138 }, { "epoch": 0.05968867418142781, "grad_norm": 0.18810135126113892, "learning_rate": 0.0001984755409254778, "loss": 1.0847, "step": 139 }, { "epoch": 0.06011808910359635, "grad_norm": 0.15636786818504333, "learning_rate": 0.00019845187570100573, "loss": 1.1426, "step": 140 }, { "epoch": 0.06054750402576489, "grad_norm": 0.15283016860485077, "learning_rate": 0.000198428029635571, "loss": 0.9389, "step": 141 }, { "epoch": 0.06097691894793344, "grad_norm": 0.1785784810781479, "learning_rate": 0.00019840400277297508, "loss": 0.8145, "step": 142 }, { "epoch": 0.061406333870101984, "grad_norm": 0.19488206505775452, "learning_rate": 0.00019837979515735166, "loss": 1.1245, "step": 143 }, { "epoch": 0.06183574879227053, "grad_norm": 0.1749604046344757, "learning_rate": 0.00019835540683316638, "loss": 1.0823, "step": 144 }, { "epoch": 0.062265163714439076, "grad_norm": 0.14947979152202606, "learning_rate": 0.00019833083784521688, "loss": 0.9827, "step": 145 }, { "epoch": 0.06269457863660763, "grad_norm": 0.18214192986488342, "learning_rate": 0.00019830608823863258, "loss": 1.1311, "step": 146 }, { "epoch": 0.06312399355877617, "grad_norm": 0.15751980245113373, "learning_rate": 0.0001982811580588747, "loss": 1.126, "step": 147 }, { "epoch": 0.0635534084809447, "grad_norm": 0.17060008645057678, "learning_rate": 0.0001982560473517362, "loss": 1.0999, "step": 148 }, { "epoch": 0.06398282340311326, "grad_norm": 0.15626037120819092, "learning_rate": 0.00019823075616334155, "loss": 1.1292, "step": 149 }, { "epoch": 0.0644122383252818, "grad_norm": 0.17362122237682343, "learning_rate": 0.00019820528454014678, "loss": 1.0831, "step": 150 }, { "epoch": 0.06484165324745035, "grad_norm": 0.17661671340465546, "learning_rate": 0.00019817963252893934, "loss": 1.0467, "step": 151 }, { "epoch": 0.06527106816961889, "grad_norm": 0.1770239919424057, "learning_rate": 0.00019815380017683805, "loss": 1.3296, "step": 152 }, { "epoch": 0.06570048309178744, "grad_norm": 0.1600884646177292, "learning_rate": 0.00019812778753129295, "loss": 1.1975, "step": 153 }, { "epoch": 0.06612989801395598, "grad_norm": 0.14404766261577606, "learning_rate": 0.0001981015946400853, "loss": 1.0152, "step": 154 }, { "epoch": 0.06655931293612453, "grad_norm": 0.15787601470947266, "learning_rate": 0.0001980752215513274, "loss": 0.8621, "step": 155 }, { "epoch": 0.06698872785829307, "grad_norm": 0.16410237550735474, "learning_rate": 0.00019804866831346253, "loss": 1.1043, "step": 156 }, { "epoch": 0.06741814278046163, "grad_norm": 0.14886626601219177, "learning_rate": 0.00019802193497526496, "loss": 1.0065, "step": 157 }, { "epoch": 0.06784755770263017, "grad_norm": 0.18639588356018066, "learning_rate": 0.00019799502158583966, "loss": 1.1146, "step": 158 }, { "epoch": 0.06827697262479872, "grad_norm": 0.1470535844564438, "learning_rate": 0.00019796792819462246, "loss": 0.9775, "step": 159 }, { "epoch": 0.06870638754696726, "grad_norm": 0.177282452583313, "learning_rate": 0.0001979406548513797, "loss": 1.316, "step": 160 }, { "epoch": 0.0691358024691358, "grad_norm": 0.17426224052906036, "learning_rate": 0.00019791320160620837, "loss": 1.2854, "step": 161 }, { "epoch": 0.06956521739130435, "grad_norm": 0.16735795140266418, "learning_rate": 0.0001978855685095358, "loss": 1.2184, "step": 162 }, { "epoch": 0.06999463231347289, "grad_norm": 0.18738149106502533, "learning_rate": 0.00019785775561211976, "loss": 1.1342, "step": 163 }, { "epoch": 0.07042404723564144, "grad_norm": 0.17026057839393616, "learning_rate": 0.00019782976296504835, "loss": 1.0973, "step": 164 }, { "epoch": 0.07085346215780998, "grad_norm": 0.14129336178302765, "learning_rate": 0.00019780159061973964, "loss": 0.8889, "step": 165 }, { "epoch": 0.07128287707997853, "grad_norm": 0.19238591194152832, "learning_rate": 0.00019777323862794192, "loss": 1.0827, "step": 166 }, { "epoch": 0.07171229200214707, "grad_norm": 0.17041011154651642, "learning_rate": 0.00019774470704173353, "loss": 1.2057, "step": 167 }, { "epoch": 0.07214170692431562, "grad_norm": 0.18856163322925568, "learning_rate": 0.00019771599591352252, "loss": 1.1693, "step": 168 }, { "epoch": 0.07257112184648416, "grad_norm": 0.17438524961471558, "learning_rate": 0.00019768710529604686, "loss": 1.1714, "step": 169 }, { "epoch": 0.07300053676865272, "grad_norm": 0.17283211648464203, "learning_rate": 0.00019765803524237417, "loss": 1.34, "step": 170 }, { "epoch": 0.07342995169082125, "grad_norm": 0.15461453795433044, "learning_rate": 0.00019762878580590162, "loss": 1.1, "step": 171 }, { "epoch": 0.07385936661298981, "grad_norm": 0.1745782196521759, "learning_rate": 0.00019759935704035598, "loss": 1.1485, "step": 172 }, { "epoch": 0.07428878153515835, "grad_norm": 0.19017790257930756, "learning_rate": 0.0001975697489997934, "loss": 1.2036, "step": 173 }, { "epoch": 0.07471819645732689, "grad_norm": 0.14983102679252625, "learning_rate": 0.0001975399617385992, "loss": 0.9465, "step": 174 }, { "epoch": 0.07514761137949544, "grad_norm": 0.1556852161884308, "learning_rate": 0.0001975099953114881, "loss": 0.941, "step": 175 }, { "epoch": 0.07557702630166398, "grad_norm": 0.1680162101984024, "learning_rate": 0.00019747984977350379, "loss": 1.2423, "step": 176 }, { "epoch": 0.07600644122383253, "grad_norm": 0.17990583181381226, "learning_rate": 0.00019744952518001893, "loss": 1.0285, "step": 177 }, { "epoch": 0.07643585614600107, "grad_norm": 0.18733762204647064, "learning_rate": 0.00019741902158673522, "loss": 1.3571, "step": 178 }, { "epoch": 0.07686527106816962, "grad_norm": 0.14356885850429535, "learning_rate": 0.00019738833904968302, "loss": 0.8155, "step": 179 }, { "epoch": 0.07729468599033816, "grad_norm": 0.19046086072921753, "learning_rate": 0.00019735747762522147, "loss": 1.0226, "step": 180 }, { "epoch": 0.07772410091250671, "grad_norm": 0.14588217437267303, "learning_rate": 0.00019732643737003827, "loss": 0.8774, "step": 181 }, { "epoch": 0.07815351583467525, "grad_norm": 0.16085247695446014, "learning_rate": 0.00019729521834114952, "loss": 1.1483, "step": 182 }, { "epoch": 0.0785829307568438, "grad_norm": 0.1786722093820572, "learning_rate": 0.00019726382059589986, "loss": 1.0986, "step": 183 }, { "epoch": 0.07901234567901234, "grad_norm": 0.1842159777879715, "learning_rate": 0.0001972322441919621, "loss": 1.1254, "step": 184 }, { "epoch": 0.0794417606011809, "grad_norm": 0.1684993952512741, "learning_rate": 0.00019720048918733723, "loss": 0.9512, "step": 185 }, { "epoch": 0.07987117552334944, "grad_norm": 0.18039727210998535, "learning_rate": 0.0001971685556403543, "loss": 1.2037, "step": 186 }, { "epoch": 0.08030059044551799, "grad_norm": 0.16253158450126648, "learning_rate": 0.0001971364436096703, "loss": 1.1042, "step": 187 }, { "epoch": 0.08073000536768653, "grad_norm": 0.17348501086235046, "learning_rate": 0.00019710415315427022, "loss": 1.0384, "step": 188 }, { "epoch": 0.08115942028985507, "grad_norm": 0.19116544723510742, "learning_rate": 0.00019707168433346655, "loss": 1.1186, "step": 189 }, { "epoch": 0.08158883521202362, "grad_norm": 0.17228098213672638, "learning_rate": 0.00019703903720689954, "loss": 1.0421, "step": 190 }, { "epoch": 0.08201825013419216, "grad_norm": 0.15176887810230255, "learning_rate": 0.00019700621183453695, "loss": 1.1865, "step": 191 }, { "epoch": 0.08244766505636071, "grad_norm": 0.16815736889839172, "learning_rate": 0.00019697320827667398, "loss": 1.3136, "step": 192 }, { "epoch": 0.08287707997852925, "grad_norm": 0.18581236898899078, "learning_rate": 0.00019694002659393305, "loss": 1.2243, "step": 193 }, { "epoch": 0.0833064949006978, "grad_norm": 0.19139103591442108, "learning_rate": 0.00019690666684726382, "loss": 1.1882, "step": 194 }, { "epoch": 0.08373590982286634, "grad_norm": 0.15718159079551697, "learning_rate": 0.00019687312909794305, "loss": 1.0329, "step": 195 }, { "epoch": 0.0841653247450349, "grad_norm": 0.1583366096019745, "learning_rate": 0.00019683941340757434, "loss": 0.9521, "step": 196 }, { "epoch": 0.08459473966720343, "grad_norm": 0.17986145615577698, "learning_rate": 0.00019680551983808836, "loss": 1.3057, "step": 197 }, { "epoch": 0.08502415458937199, "grad_norm": 0.14667508006095886, "learning_rate": 0.00019677144845174226, "loss": 1.204, "step": 198 }, { "epoch": 0.08545356951154053, "grad_norm": 0.16105642914772034, "learning_rate": 0.00019673719931112004, "loss": 1.2272, "step": 199 }, { "epoch": 0.08588298443370908, "grad_norm": 0.17806339263916016, "learning_rate": 0.00019670277247913205, "loss": 0.9928, "step": 200 }, { "epoch": 0.08631239935587762, "grad_norm": 0.15053167939186096, "learning_rate": 0.0001966681680190151, "loss": 0.8566, "step": 201 }, { "epoch": 0.08674181427804616, "grad_norm": 0.13740143179893494, "learning_rate": 0.00019663338599433227, "loss": 0.7979, "step": 202 }, { "epoch": 0.08717122920021471, "grad_norm": 0.17480605840682983, "learning_rate": 0.00019659842646897282, "loss": 0.9794, "step": 203 }, { "epoch": 0.08760064412238325, "grad_norm": 0.192199245095253, "learning_rate": 0.00019656328950715194, "loss": 1.2525, "step": 204 }, { "epoch": 0.0880300590445518, "grad_norm": 0.18914753198623657, "learning_rate": 0.00019652797517341096, "loss": 1.2156, "step": 205 }, { "epoch": 0.08845947396672034, "grad_norm": 0.19193218648433685, "learning_rate": 0.00019649248353261674, "loss": 1.385, "step": 206 }, { "epoch": 0.08888888888888889, "grad_norm": 0.19617465138435364, "learning_rate": 0.00019645681464996206, "loss": 1.2991, "step": 207 }, { "epoch": 0.08931830381105743, "grad_norm": 0.16679921746253967, "learning_rate": 0.00019642096859096516, "loss": 1.0183, "step": 208 }, { "epoch": 0.08974771873322598, "grad_norm": 0.1839999556541443, "learning_rate": 0.00019638494542146973, "loss": 1.2098, "step": 209 }, { "epoch": 0.09017713365539452, "grad_norm": 0.17847347259521484, "learning_rate": 0.0001963487452076448, "loss": 1.1791, "step": 210 }, { "epoch": 0.09060654857756308, "grad_norm": 0.1537715196609497, "learning_rate": 0.00019631236801598458, "loss": 1.307, "step": 211 }, { "epoch": 0.09103596349973161, "grad_norm": 0.16377565264701843, "learning_rate": 0.0001962758139133084, "loss": 0.9766, "step": 212 }, { "epoch": 0.09146537842190017, "grad_norm": 0.1567695438861847, "learning_rate": 0.0001962390829667605, "loss": 1.1082, "step": 213 }, { "epoch": 0.0918947933440687, "grad_norm": 0.14198783040046692, "learning_rate": 0.00019620217524381005, "loss": 1.0773, "step": 214 }, { "epoch": 0.09232420826623725, "grad_norm": 0.16413229703903198, "learning_rate": 0.0001961650908122508, "loss": 1.1947, "step": 215 }, { "epoch": 0.0927536231884058, "grad_norm": 0.15348884463310242, "learning_rate": 0.00019612782974020118, "loss": 0.7186, "step": 216 }, { "epoch": 0.09318303811057434, "grad_norm": 0.1820840686559677, "learning_rate": 0.00019609039209610404, "loss": 1.0661, "step": 217 }, { "epoch": 0.09361245303274289, "grad_norm": 0.1551450490951538, "learning_rate": 0.00019605277794872657, "loss": 0.8472, "step": 218 }, { "epoch": 0.09404186795491143, "grad_norm": 0.19438843429088593, "learning_rate": 0.00019601498736716017, "loss": 1.2454, "step": 219 }, { "epoch": 0.09447128287707998, "grad_norm": 0.16173028945922852, "learning_rate": 0.00019597702042082037, "loss": 0.8713, "step": 220 }, { "epoch": 0.09490069779924852, "grad_norm": 0.18918974697589874, "learning_rate": 0.00019593887717944659, "loss": 1.2559, "step": 221 }, { "epoch": 0.09533011272141707, "grad_norm": 0.1581108570098877, "learning_rate": 0.00019590055771310212, "loss": 0.7194, "step": 222 }, { "epoch": 0.09575952764358561, "grad_norm": 0.13984139263629913, "learning_rate": 0.0001958620620921739, "loss": 0.7027, "step": 223 }, { "epoch": 0.09618894256575417, "grad_norm": 0.1842825710773468, "learning_rate": 0.00019582339038737247, "loss": 1.2838, "step": 224 }, { "epoch": 0.0966183574879227, "grad_norm": 0.16079159080982208, "learning_rate": 0.00019578454266973183, "loss": 1.0553, "step": 225 }, { "epoch": 0.09704777241009126, "grad_norm": 0.16030196845531464, "learning_rate": 0.00019574551901060922, "loss": 1.0496, "step": 226 }, { "epoch": 0.0974771873322598, "grad_norm": 0.16699260473251343, "learning_rate": 0.0001957063194816852, "loss": 1.3505, "step": 227 }, { "epoch": 0.09790660225442833, "grad_norm": 0.1571999043226242, "learning_rate": 0.00019566694415496316, "loss": 1.2156, "step": 228 }, { "epoch": 0.09833601717659689, "grad_norm": 0.15415778756141663, "learning_rate": 0.0001956273931027696, "loss": 1.0225, "step": 229 }, { "epoch": 0.09876543209876543, "grad_norm": 0.16700062155723572, "learning_rate": 0.0001955876663977537, "loss": 1.0049, "step": 230 }, { "epoch": 0.09919484702093398, "grad_norm": 0.16353946924209595, "learning_rate": 0.00019554776411288732, "loss": 1.2387, "step": 231 }, { "epoch": 0.09962426194310252, "grad_norm": 0.16290371119976044, "learning_rate": 0.00019550768632146484, "loss": 1.044, "step": 232 }, { "epoch": 0.10005367686527107, "grad_norm": 0.15819229185581207, "learning_rate": 0.00019546743309710297, "loss": 1.13, "step": 233 }, { "epoch": 0.10048309178743961, "grad_norm": 0.18955904245376587, "learning_rate": 0.00019542700451374067, "loss": 1.1663, "step": 234 }, { "epoch": 0.10091250670960816, "grad_norm": 0.14698690176010132, "learning_rate": 0.0001953864006456391, "loss": 1.1295, "step": 235 }, { "epoch": 0.1013419216317767, "grad_norm": 0.1734054684638977, "learning_rate": 0.00019534562156738129, "loss": 0.8559, "step": 236 }, { "epoch": 0.10177133655394525, "grad_norm": 0.16847679018974304, "learning_rate": 0.00019530466735387213, "loss": 1.0313, "step": 237 }, { "epoch": 0.1022007514761138, "grad_norm": 0.1666480153799057, "learning_rate": 0.00019526353808033825, "loss": 1.0825, "step": 238 }, { "epoch": 0.10263016639828235, "grad_norm": 0.14294366538524628, "learning_rate": 0.0001952222338223278, "loss": 0.9846, "step": 239 }, { "epoch": 0.10305958132045089, "grad_norm": 0.1204523891210556, "learning_rate": 0.00019518075465571028, "loss": 0.9862, "step": 240 }, { "epoch": 0.10348899624261942, "grad_norm": 0.14956791698932648, "learning_rate": 0.00019513910065667664, "loss": 1.0975, "step": 241 }, { "epoch": 0.10391841116478798, "grad_norm": 0.16827872395515442, "learning_rate": 0.00019509727190173884, "loss": 1.3116, "step": 242 }, { "epoch": 0.10434782608695652, "grad_norm": 0.16410714387893677, "learning_rate": 0.00019505526846772984, "loss": 0.9231, "step": 243 }, { "epoch": 0.10477724100912507, "grad_norm": 0.19388873875141144, "learning_rate": 0.00019501309043180352, "loss": 1.0604, "step": 244 }, { "epoch": 0.10520665593129361, "grad_norm": 0.17403458058834076, "learning_rate": 0.00019497073787143446, "loss": 1.0757, "step": 245 }, { "epoch": 0.10563607085346216, "grad_norm": 0.1442354917526245, "learning_rate": 0.0001949282108644178, "loss": 0.6964, "step": 246 }, { "epoch": 0.1060654857756307, "grad_norm": 0.1477101892232895, "learning_rate": 0.0001948855094888691, "loss": 1.2497, "step": 247 }, { "epoch": 0.10649490069779925, "grad_norm": 0.1691221445798874, "learning_rate": 0.0001948426338232242, "loss": 1.1567, "step": 248 }, { "epoch": 0.10692431561996779, "grad_norm": 0.16259369254112244, "learning_rate": 0.00019479958394623913, "loss": 0.9878, "step": 249 }, { "epoch": 0.10735373054213634, "grad_norm": 0.17605777084827423, "learning_rate": 0.00019475635993698994, "loss": 1.0964, "step": 250 }, { "epoch": 0.10778314546430488, "grad_norm": 0.17357371747493744, "learning_rate": 0.0001947129618748724, "loss": 1.0984, "step": 251 }, { "epoch": 0.10821256038647344, "grad_norm": 0.16604338586330414, "learning_rate": 0.00019466938983960218, "loss": 1.2584, "step": 252 }, { "epoch": 0.10864197530864197, "grad_norm": 0.15120381116867065, "learning_rate": 0.00019462564391121436, "loss": 0.7606, "step": 253 }, { "epoch": 0.10907139023081051, "grad_norm": 0.18790557980537415, "learning_rate": 0.00019458172417006347, "loss": 1.1506, "step": 254 }, { "epoch": 0.10950080515297907, "grad_norm": 0.17807306349277496, "learning_rate": 0.00019453763069682335, "loss": 1.1895, "step": 255 }, { "epoch": 0.1099302200751476, "grad_norm": 0.18234007060527802, "learning_rate": 0.00019449336357248696, "loss": 1.1112, "step": 256 }, { "epoch": 0.11035963499731616, "grad_norm": 0.1744687557220459, "learning_rate": 0.00019444892287836613, "loss": 1.042, "step": 257 }, { "epoch": 0.1107890499194847, "grad_norm": 0.15671797096729279, "learning_rate": 0.00019440430869609166, "loss": 1.1334, "step": 258 }, { "epoch": 0.11121846484165325, "grad_norm": 0.17378878593444824, "learning_rate": 0.00019435952110761289, "loss": 1.1142, "step": 259 }, { "epoch": 0.11164787976382179, "grad_norm": 0.17875009775161743, "learning_rate": 0.00019431456019519775, "loss": 1.0393, "step": 260 }, { "epoch": 0.11207729468599034, "grad_norm": 0.15020230412483215, "learning_rate": 0.00019426942604143253, "loss": 1.2424, "step": 261 }, { "epoch": 0.11250670960815888, "grad_norm": 0.17647111415863037, "learning_rate": 0.00019422411872922171, "loss": 1.1036, "step": 262 }, { "epoch": 0.11293612453032743, "grad_norm": 0.1858074814081192, "learning_rate": 0.00019417863834178794, "loss": 1.1087, "step": 263 }, { "epoch": 0.11336553945249597, "grad_norm": 0.18380528688430786, "learning_rate": 0.0001941329849626716, "loss": 1.1344, "step": 264 }, { "epoch": 0.11379495437466453, "grad_norm": 0.1671726554632187, "learning_rate": 0.000194087158675731, "loss": 0.8795, "step": 265 }, { "epoch": 0.11422436929683306, "grad_norm": 0.17651990056037903, "learning_rate": 0.00019404115956514194, "loss": 1.1036, "step": 266 }, { "epoch": 0.11465378421900162, "grad_norm": 0.17102883756160736, "learning_rate": 0.00019399498771539774, "loss": 1.0949, "step": 267 }, { "epoch": 0.11508319914117016, "grad_norm": 0.18060144782066345, "learning_rate": 0.000193948643211309, "loss": 1.1315, "step": 268 }, { "epoch": 0.1155126140633387, "grad_norm": 0.15454426407814026, "learning_rate": 0.0001939021261380034, "loss": 1.057, "step": 269 }, { "epoch": 0.11594202898550725, "grad_norm": 0.14077837765216827, "learning_rate": 0.0001938554365809257, "loss": 0.8064, "step": 270 }, { "epoch": 0.11637144390767579, "grad_norm": 0.17142775654792786, "learning_rate": 0.00019380857462583743, "loss": 1.156, "step": 271 }, { "epoch": 0.11680085882984434, "grad_norm": 0.1670989692211151, "learning_rate": 0.0001937615403588168, "loss": 0.9589, "step": 272 }, { "epoch": 0.11723027375201288, "grad_norm": 0.19140732288360596, "learning_rate": 0.00019371433386625856, "loss": 0.9871, "step": 273 }, { "epoch": 0.11765968867418143, "grad_norm": 0.18820329010486603, "learning_rate": 0.00019366695523487368, "loss": 1.0285, "step": 274 }, { "epoch": 0.11808910359634997, "grad_norm": 0.17042939364910126, "learning_rate": 0.00019361940455168956, "loss": 1.0943, "step": 275 }, { "epoch": 0.11851851851851852, "grad_norm": 0.16640831530094147, "learning_rate": 0.00019357168190404936, "loss": 1.1504, "step": 276 }, { "epoch": 0.11894793344068706, "grad_norm": 0.16726379096508026, "learning_rate": 0.00019352378737961235, "loss": 1.3996, "step": 277 }, { "epoch": 0.11937734836285561, "grad_norm": 0.1757480800151825, "learning_rate": 0.00019347572106635335, "loss": 1.1903, "step": 278 }, { "epoch": 0.11980676328502415, "grad_norm": 0.1531904935836792, "learning_rate": 0.00019342748305256285, "loss": 1.0287, "step": 279 }, { "epoch": 0.1202361782071927, "grad_norm": 0.19600524008274078, "learning_rate": 0.0001933790734268466, "loss": 1.1248, "step": 280 }, { "epoch": 0.12066559312936125, "grad_norm": 0.1654789000749588, "learning_rate": 0.0001933304922781257, "loss": 1.2959, "step": 281 }, { "epoch": 0.12109500805152978, "grad_norm": 0.16465742886066437, "learning_rate": 0.0001932817396956362, "loss": 0.9625, "step": 282 }, { "epoch": 0.12152442297369834, "grad_norm": 0.16723015904426575, "learning_rate": 0.00019323281576892916, "loss": 1.034, "step": 283 }, { "epoch": 0.12195383789586688, "grad_norm": 0.15436948835849762, "learning_rate": 0.00019318372058787025, "loss": 1.085, "step": 284 }, { "epoch": 0.12238325281803543, "grad_norm": 0.17568649351596832, "learning_rate": 0.00019313445424263978, "loss": 1.1922, "step": 285 }, { "epoch": 0.12281266774020397, "grad_norm": 0.15134669840335846, "learning_rate": 0.0001930850168237325, "loss": 1.1783, "step": 286 }, { "epoch": 0.12324208266237252, "grad_norm": 0.19426967203617096, "learning_rate": 0.00019303540842195732, "loss": 1.2244, "step": 287 }, { "epoch": 0.12367149758454106, "grad_norm": 0.17754550278186798, "learning_rate": 0.00019298562912843724, "loss": 0.9266, "step": 288 }, { "epoch": 0.12410091250670961, "grad_norm": 0.18942666053771973, "learning_rate": 0.00019293567903460918, "loss": 1.0538, "step": 289 }, { "epoch": 0.12453032742887815, "grad_norm": 0.14974556863307953, "learning_rate": 0.0001928855582322238, "loss": 0.8825, "step": 290 }, { "epoch": 0.1249597423510467, "grad_norm": 0.16468919813632965, "learning_rate": 0.0001928352668133453, "loss": 1.2179, "step": 291 }, { "epoch": 0.12538915727321526, "grad_norm": 0.18979178369045258, "learning_rate": 0.00019278480487035126, "loss": 1.0274, "step": 292 }, { "epoch": 0.12581857219538378, "grad_norm": 0.1661735624074936, "learning_rate": 0.00019273417249593256, "loss": 1.0588, "step": 293 }, { "epoch": 0.12624798711755233, "grad_norm": 0.18528646230697632, "learning_rate": 0.00019268336978309303, "loss": 1.1263, "step": 294 }, { "epoch": 0.1266774020397209, "grad_norm": 0.16602130234241486, "learning_rate": 0.00019263239682514952, "loss": 0.7833, "step": 295 }, { "epoch": 0.1271068169618894, "grad_norm": 0.18867306411266327, "learning_rate": 0.00019258125371573144, "loss": 1.1295, "step": 296 }, { "epoch": 0.12753623188405797, "grad_norm": 0.1883901059627533, "learning_rate": 0.00019252994054878088, "loss": 1.0669, "step": 297 }, { "epoch": 0.12796564680622652, "grad_norm": 0.1632394641637802, "learning_rate": 0.00019247845741855222, "loss": 1.0846, "step": 298 }, { "epoch": 0.12839506172839507, "grad_norm": 0.18154770135879517, "learning_rate": 0.00019242680441961205, "loss": 1.1138, "step": 299 }, { "epoch": 0.1288244766505636, "grad_norm": 0.16086812317371368, "learning_rate": 0.00019237498164683897, "loss": 0.9613, "step": 300 }, { "epoch": 0.12925389157273215, "grad_norm": 0.19330988824367523, "learning_rate": 0.0001923229891954235, "loss": 0.7739, "step": 301 }, { "epoch": 0.1296833064949007, "grad_norm": 0.1668129414319992, "learning_rate": 0.00019227082716086777, "loss": 1.0718, "step": 302 }, { "epoch": 0.13011272141706925, "grad_norm": 0.1654328554868698, "learning_rate": 0.00019221849563898536, "loss": 0.9797, "step": 303 }, { "epoch": 0.13054213633923778, "grad_norm": 0.1601610779762268, "learning_rate": 0.00019216599472590134, "loss": 1.0867, "step": 304 }, { "epoch": 0.13097155126140633, "grad_norm": 0.16391853988170624, "learning_rate": 0.0001921133245180517, "loss": 0.8036, "step": 305 }, { "epoch": 0.13140096618357489, "grad_norm": 0.18757081031799316, "learning_rate": 0.0001920604851121836, "loss": 1.3174, "step": 306 }, { "epoch": 0.13183038110574344, "grad_norm": 0.18147063255310059, "learning_rate": 0.00019200747660535488, "loss": 1.1763, "step": 307 }, { "epoch": 0.13225979602791196, "grad_norm": 0.16341471672058105, "learning_rate": 0.000191954299094934, "loss": 1.0075, "step": 308 }, { "epoch": 0.13268921095008052, "grad_norm": 0.183994323015213, "learning_rate": 0.00019190095267859988, "loss": 1.144, "step": 309 }, { "epoch": 0.13311862587224907, "grad_norm": 0.1656254529953003, "learning_rate": 0.0001918474374543417, "loss": 1.0775, "step": 310 }, { "epoch": 0.1335480407944176, "grad_norm": 0.15094861388206482, "learning_rate": 0.0001917937535204587, "loss": 0.6977, "step": 311 }, { "epoch": 0.13397745571658615, "grad_norm": 0.1565057784318924, "learning_rate": 0.00019173990097556002, "loss": 1.1004, "step": 312 }, { "epoch": 0.1344068706387547, "grad_norm": 0.18779979646205902, "learning_rate": 0.00019168587991856448, "loss": 1.257, "step": 313 }, { "epoch": 0.13483628556092325, "grad_norm": 0.15053409337997437, "learning_rate": 0.0001916316904487005, "loss": 0.8913, "step": 314 }, { "epoch": 0.13526570048309178, "grad_norm": 0.16636574268341064, "learning_rate": 0.00019157733266550575, "loss": 0.8063, "step": 315 }, { "epoch": 0.13569511540526033, "grad_norm": 0.19238772988319397, "learning_rate": 0.00019152280666882718, "loss": 1.2016, "step": 316 }, { "epoch": 0.13612453032742888, "grad_norm": 0.17583003640174866, "learning_rate": 0.00019146811255882064, "loss": 1.0703, "step": 317 }, { "epoch": 0.13655394524959744, "grad_norm": 0.1871437430381775, "learning_rate": 0.0001914132504359508, "loss": 1.2822, "step": 318 }, { "epoch": 0.13698336017176596, "grad_norm": 0.15960069000720978, "learning_rate": 0.00019135822040099095, "loss": 0.9356, "step": 319 }, { "epoch": 0.1374127750939345, "grad_norm": 0.17675542831420898, "learning_rate": 0.0001913030225550228, "loss": 1.1216, "step": 320 }, { "epoch": 0.13784219001610307, "grad_norm": 0.18341028690338135, "learning_rate": 0.00019124765699943632, "loss": 1.1436, "step": 321 }, { "epoch": 0.1382716049382716, "grad_norm": 0.1786155104637146, "learning_rate": 0.00019119212383592954, "loss": 1.1862, "step": 322 }, { "epoch": 0.13870101986044014, "grad_norm": 0.15550769865512848, "learning_rate": 0.0001911364231665083, "loss": 1.107, "step": 323 }, { "epoch": 0.1391304347826087, "grad_norm": 0.16558977961540222, "learning_rate": 0.00019108055509348623, "loss": 1.1584, "step": 324 }, { "epoch": 0.13955984970477725, "grad_norm": 0.15727491676807404, "learning_rate": 0.0001910245197194843, "loss": 1.1332, "step": 325 }, { "epoch": 0.13998926462694578, "grad_norm": 0.16455912590026855, "learning_rate": 0.00019096831714743098, "loss": 0.8548, "step": 326 }, { "epoch": 0.14041867954911433, "grad_norm": 0.16871945559978485, "learning_rate": 0.00019091194748056172, "loss": 0.9473, "step": 327 }, { "epoch": 0.14084809447128288, "grad_norm": 0.18946193158626556, "learning_rate": 0.0001908554108224189, "loss": 1.1623, "step": 328 }, { "epoch": 0.14127750939345143, "grad_norm": 0.18290971219539642, "learning_rate": 0.0001907987072768517, "loss": 1.0757, "step": 329 }, { "epoch": 0.14170692431561996, "grad_norm": 0.17551882565021515, "learning_rate": 0.0001907418369480158, "loss": 1.0275, "step": 330 }, { "epoch": 0.1421363392377885, "grad_norm": 0.1738695502281189, "learning_rate": 0.00019068479994037327, "loss": 1.0504, "step": 331 }, { "epoch": 0.14256575415995706, "grad_norm": 0.18197093904018402, "learning_rate": 0.00019062759635869232, "loss": 1.2005, "step": 332 }, { "epoch": 0.14299516908212562, "grad_norm": 0.16323554515838623, "learning_rate": 0.00019057022630804716, "loss": 1.1509, "step": 333 }, { "epoch": 0.14342458400429414, "grad_norm": 0.1790863275527954, "learning_rate": 0.00019051268989381771, "loss": 0.9633, "step": 334 }, { "epoch": 0.1438539989264627, "grad_norm": 0.17193441092967987, "learning_rate": 0.00019045498722168955, "loss": 1.0501, "step": 335 }, { "epoch": 0.14428341384863125, "grad_norm": 0.18548649549484253, "learning_rate": 0.0001903971183976536, "loss": 1.2305, "step": 336 }, { "epoch": 0.14471282877079977, "grad_norm": 0.16440680623054504, "learning_rate": 0.00019033908352800608, "loss": 1.1256, "step": 337 }, { "epoch": 0.14514224369296833, "grad_norm": 0.18403667211532593, "learning_rate": 0.00019028088271934798, "loss": 1.2889, "step": 338 }, { "epoch": 0.14557165861513688, "grad_norm": 0.16041843593120575, "learning_rate": 0.0001902225160785853, "loss": 1.0806, "step": 339 }, { "epoch": 0.14600107353730543, "grad_norm": 0.15153127908706665, "learning_rate": 0.00019016398371292864, "loss": 0.7621, "step": 340 }, { "epoch": 0.14643048845947396, "grad_norm": 0.14983665943145752, "learning_rate": 0.0001901052857298929, "loss": 0.9134, "step": 341 }, { "epoch": 0.1468599033816425, "grad_norm": 0.17730404436588287, "learning_rate": 0.00019004642223729727, "loss": 1.2925, "step": 342 }, { "epoch": 0.14728931830381106, "grad_norm": 0.1685967743396759, "learning_rate": 0.00018998739334326494, "loss": 1.1359, "step": 343 }, { "epoch": 0.14771873322597961, "grad_norm": 0.15899759531021118, "learning_rate": 0.00018992819915622291, "loss": 1.0883, "step": 344 }, { "epoch": 0.14814814814814814, "grad_norm": 0.1822543740272522, "learning_rate": 0.00018986883978490182, "loss": 1.1186, "step": 345 }, { "epoch": 0.1485775630703167, "grad_norm": 0.17298339307308197, "learning_rate": 0.00018980931533833567, "loss": 0.8858, "step": 346 }, { "epoch": 0.14900697799248525, "grad_norm": 0.17505380511283875, "learning_rate": 0.00018974962592586178, "loss": 1.1411, "step": 347 }, { "epoch": 0.14943639291465377, "grad_norm": 0.1915581226348877, "learning_rate": 0.00018968977165712036, "loss": 1.1323, "step": 348 }, { "epoch": 0.14986580783682232, "grad_norm": 0.17531049251556396, "learning_rate": 0.00018962975264205455, "loss": 0.886, "step": 349 }, { "epoch": 0.15029522275899088, "grad_norm": 0.1736138015985489, "learning_rate": 0.00018956956899091003, "loss": 1.1875, "step": 350 }, { "epoch": 0.15072463768115943, "grad_norm": 0.16522866487503052, "learning_rate": 0.00018950922081423493, "loss": 0.9511, "step": 351 }, { "epoch": 0.15115405260332795, "grad_norm": 0.15171727538108826, "learning_rate": 0.00018944870822287956, "loss": 1.1202, "step": 352 }, { "epoch": 0.1515834675254965, "grad_norm": 0.18102163076400757, "learning_rate": 0.00018938803132799626, "loss": 1.2382, "step": 353 }, { "epoch": 0.15201288244766506, "grad_norm": 0.1564633846282959, "learning_rate": 0.0001893271902410392, "loss": 0.9987, "step": 354 }, { "epoch": 0.1524422973698336, "grad_norm": 0.17558157444000244, "learning_rate": 0.00018926618507376399, "loss": 1.274, "step": 355 }, { "epoch": 0.15287171229200214, "grad_norm": 0.1743505746126175, "learning_rate": 0.00018920501593822789, "loss": 0.8533, "step": 356 }, { "epoch": 0.1533011272141707, "grad_norm": 0.19371235370635986, "learning_rate": 0.0001891436829467891, "loss": 1.2622, "step": 357 }, { "epoch": 0.15373054213633924, "grad_norm": 0.16197408735752106, "learning_rate": 0.00018908218621210688, "loss": 0.7451, "step": 358 }, { "epoch": 0.1541599570585078, "grad_norm": 0.2163006216287613, "learning_rate": 0.00018902052584714136, "loss": 1.2091, "step": 359 }, { "epoch": 0.15458937198067632, "grad_norm": 0.1739387959241867, "learning_rate": 0.00018895870196515314, "loss": 0.9003, "step": 360 }, { "epoch": 0.15501878690284487, "grad_norm": 0.16117063164710999, "learning_rate": 0.00018889671467970317, "loss": 1.0175, "step": 361 }, { "epoch": 0.15544820182501343, "grad_norm": 0.16463720798492432, "learning_rate": 0.0001888345641046525, "loss": 1.2892, "step": 362 }, { "epoch": 0.15587761674718195, "grad_norm": 0.19594573974609375, "learning_rate": 0.0001887722503541623, "loss": 1.1554, "step": 363 }, { "epoch": 0.1563070316693505, "grad_norm": 0.15671700239181519, "learning_rate": 0.00018870977354269326, "loss": 0.9604, "step": 364 }, { "epoch": 0.15673644659151906, "grad_norm": 0.16734743118286133, "learning_rate": 0.00018864713378500574, "loss": 1.0694, "step": 365 }, { "epoch": 0.1571658615136876, "grad_norm": 0.13222168385982513, "learning_rate": 0.0001885843311961593, "loss": 0.6987, "step": 366 }, { "epoch": 0.15759527643585614, "grad_norm": 0.17755256593227386, "learning_rate": 0.00018852136589151268, "loss": 1.0576, "step": 367 }, { "epoch": 0.1580246913580247, "grad_norm": 0.17115449905395508, "learning_rate": 0.00018845823798672347, "loss": 1.2332, "step": 368 }, { "epoch": 0.15845410628019324, "grad_norm": 0.17211580276489258, "learning_rate": 0.00018839494759774787, "loss": 1.0443, "step": 369 }, { "epoch": 0.1588835212023618, "grad_norm": 0.16635645925998688, "learning_rate": 0.00018833149484084066, "loss": 1.3116, "step": 370 }, { "epoch": 0.15931293612453032, "grad_norm": 0.13584615290164948, "learning_rate": 0.00018826787983255473, "loss": 0.816, "step": 371 }, { "epoch": 0.15974235104669887, "grad_norm": 0.15319599211215973, "learning_rate": 0.00018820410268974115, "loss": 1.3403, "step": 372 }, { "epoch": 0.16017176596886742, "grad_norm": 0.1778756082057953, "learning_rate": 0.00018814016352954873, "loss": 0.9581, "step": 373 }, { "epoch": 0.16060118089103598, "grad_norm": 0.17817425727844238, "learning_rate": 0.00018807606246942383, "loss": 1.0942, "step": 374 }, { "epoch": 0.1610305958132045, "grad_norm": 0.19471527636051178, "learning_rate": 0.00018801179962711019, "loss": 1.1226, "step": 375 }, { "epoch": 0.16146001073537306, "grad_norm": 0.1694117933511734, "learning_rate": 0.0001879473751206489, "loss": 1.1468, "step": 376 }, { "epoch": 0.1618894256575416, "grad_norm": 0.18657226860523224, "learning_rate": 0.0001878827890683778, "loss": 1.3482, "step": 377 }, { "epoch": 0.16231884057971013, "grad_norm": 0.17072419822216034, "learning_rate": 0.0001878180415889316, "loss": 1.1668, "step": 378 }, { "epoch": 0.16274825550187869, "grad_norm": 0.15484756231307983, "learning_rate": 0.00018775313280124142, "loss": 1.1584, "step": 379 }, { "epoch": 0.16317767042404724, "grad_norm": 0.1646227240562439, "learning_rate": 0.00018768806282453467, "loss": 1.1282, "step": 380 }, { "epoch": 0.1636070853462158, "grad_norm": 0.18709446489810944, "learning_rate": 0.000187622831778335, "loss": 1.1701, "step": 381 }, { "epoch": 0.16403650026838432, "grad_norm": 0.1889953762292862, "learning_rate": 0.0001875574397824618, "loss": 1.1496, "step": 382 }, { "epoch": 0.16446591519055287, "grad_norm": 0.16929011046886444, "learning_rate": 0.00018749188695703006, "loss": 0.8927, "step": 383 }, { "epoch": 0.16489533011272142, "grad_norm": 0.16205012798309326, "learning_rate": 0.0001874261734224503, "loss": 1.135, "step": 384 }, { "epoch": 0.16532474503488997, "grad_norm": 0.16252653300762177, "learning_rate": 0.00018736029929942812, "loss": 0.9563, "step": 385 }, { "epoch": 0.1657541599570585, "grad_norm": 0.18884459137916565, "learning_rate": 0.0001872942647089642, "loss": 0.8866, "step": 386 }, { "epoch": 0.16618357487922705, "grad_norm": 0.1668461114168167, "learning_rate": 0.00018722806977235391, "loss": 1.0448, "step": 387 }, { "epoch": 0.1666129898013956, "grad_norm": 0.17943502962589264, "learning_rate": 0.0001871617146111872, "loss": 1.1933, "step": 388 }, { "epoch": 0.16704240472356413, "grad_norm": 0.16244441270828247, "learning_rate": 0.0001870951993473483, "loss": 1.0513, "step": 389 }, { "epoch": 0.16747181964573268, "grad_norm": 0.18279998004436493, "learning_rate": 0.00018702852410301554, "loss": 1.3546, "step": 390 }, { "epoch": 0.16790123456790124, "grad_norm": 0.174489825963974, "learning_rate": 0.00018696168900066105, "loss": 1.1154, "step": 391 }, { "epoch": 0.1683306494900698, "grad_norm": 0.19099275767803192, "learning_rate": 0.00018689469416305067, "loss": 1.3016, "step": 392 }, { "epoch": 0.16876006441223831, "grad_norm": 0.1332124024629593, "learning_rate": 0.00018682753971324358, "loss": 0.8249, "step": 393 }, { "epoch": 0.16918947933440687, "grad_norm": 0.17980900406837463, "learning_rate": 0.00018676022577459225, "loss": 1.2107, "step": 394 }, { "epoch": 0.16961889425657542, "grad_norm": 0.1861777901649475, "learning_rate": 0.000186692752470742, "loss": 1.1602, "step": 395 }, { "epoch": 0.17004830917874397, "grad_norm": 0.1574292778968811, "learning_rate": 0.0001866251199256309, "loss": 0.758, "step": 396 }, { "epoch": 0.1704777241009125, "grad_norm": 0.17709052562713623, "learning_rate": 0.00018655732826348956, "loss": 0.965, "step": 397 }, { "epoch": 0.17090713902308105, "grad_norm": 0.18563103675842285, "learning_rate": 0.00018648937760884084, "loss": 1.14, "step": 398 }, { "epoch": 0.1713365539452496, "grad_norm": 0.19391857087612152, "learning_rate": 0.00018642126808649968, "loss": 0.8621, "step": 399 }, { "epoch": 0.17176596886741816, "grad_norm": 0.13754752278327942, "learning_rate": 0.00018635299982157274, "loss": 0.8559, "step": 400 }, { "epoch": 0.17219538378958668, "grad_norm": 0.17602375149726868, "learning_rate": 0.0001862845729394584, "loss": 1.0353, "step": 401 }, { "epoch": 0.17262479871175523, "grad_norm": 0.1522264927625656, "learning_rate": 0.00018621598756584623, "loss": 1.0975, "step": 402 }, { "epoch": 0.1730542136339238, "grad_norm": 0.13852877914905548, "learning_rate": 0.00018614724382671712, "loss": 0.8971, "step": 403 }, { "epoch": 0.1734836285560923, "grad_norm": 0.16204625368118286, "learning_rate": 0.0001860783418483427, "loss": 0.8758, "step": 404 }, { "epoch": 0.17391304347826086, "grad_norm": 0.17039796710014343, "learning_rate": 0.00018600928175728534, "loss": 0.9861, "step": 405 }, { "epoch": 0.17434245840042942, "grad_norm": 0.13860173523426056, "learning_rate": 0.00018594006368039779, "loss": 0.9373, "step": 406 }, { "epoch": 0.17477187332259797, "grad_norm": 0.16568392515182495, "learning_rate": 0.00018587068774482299, "loss": 1.1601, "step": 407 }, { "epoch": 0.1752012882447665, "grad_norm": 0.15709200501441956, "learning_rate": 0.00018580115407799394, "loss": 1.0979, "step": 408 }, { "epoch": 0.17563070316693505, "grad_norm": 0.1760331690311432, "learning_rate": 0.00018573146280763324, "loss": 0.9153, "step": 409 }, { "epoch": 0.1760601180891036, "grad_norm": 0.16068683564662933, "learning_rate": 0.00018566161406175308, "loss": 0.9569, "step": 410 }, { "epoch": 0.17648953301127215, "grad_norm": 0.19457021355628967, "learning_rate": 0.00018559160796865484, "loss": 1.0332, "step": 411 }, { "epoch": 0.17691894793344068, "grad_norm": 0.18924041092395782, "learning_rate": 0.00018552144465692897, "loss": 1.0282, "step": 412 }, { "epoch": 0.17734836285560923, "grad_norm": 0.17188721895217896, "learning_rate": 0.0001854511242554547, "loss": 1.1342, "step": 413 }, { "epoch": 0.17777777777777778, "grad_norm": 0.1609194427728653, "learning_rate": 0.0001853806468933997, "loss": 1.0553, "step": 414 }, { "epoch": 0.1782071926999463, "grad_norm": 0.16070395708084106, "learning_rate": 0.00018531001270022022, "loss": 1.2386, "step": 415 }, { "epoch": 0.17863660762211486, "grad_norm": 0.17878350615501404, "learning_rate": 0.00018523922180566028, "loss": 1.0539, "step": 416 }, { "epoch": 0.17906602254428342, "grad_norm": 0.19119922816753387, "learning_rate": 0.00018516827433975194, "loss": 1.105, "step": 417 }, { "epoch": 0.17949543746645197, "grad_norm": 0.19245749711990356, "learning_rate": 0.00018509717043281479, "loss": 0.9197, "step": 418 }, { "epoch": 0.1799248523886205, "grad_norm": 0.1675061136484146, "learning_rate": 0.00018502591021545573, "loss": 1.1746, "step": 419 }, { "epoch": 0.18035426731078905, "grad_norm": 0.1748921126127243, "learning_rate": 0.00018495449381856886, "loss": 1.2055, "step": 420 }, { "epoch": 0.1807836822329576, "grad_norm": 0.1709417849779129, "learning_rate": 0.00018488292137333514, "loss": 1.2112, "step": 421 }, { "epoch": 0.18121309715512615, "grad_norm": 0.16465428471565247, "learning_rate": 0.0001848111930112221, "loss": 0.9713, "step": 422 }, { "epoch": 0.18164251207729468, "grad_norm": 0.14309629797935486, "learning_rate": 0.00018473930886398377, "loss": 0.7619, "step": 423 }, { "epoch": 0.18207192699946323, "grad_norm": 0.15775880217552185, "learning_rate": 0.0001846672690636602, "loss": 0.9245, "step": 424 }, { "epoch": 0.18250134192163178, "grad_norm": 0.18402914702892303, "learning_rate": 0.00018459507374257755, "loss": 1.0844, "step": 425 }, { "epoch": 0.18293075684380034, "grad_norm": 0.15407468378543854, "learning_rate": 0.00018452272303334742, "loss": 0.9946, "step": 426 }, { "epoch": 0.18336017176596886, "grad_norm": 0.19107265770435333, "learning_rate": 0.000184450217068867, "loss": 1.2696, "step": 427 }, { "epoch": 0.1837895866881374, "grad_norm": 0.16658765077590942, "learning_rate": 0.00018437755598231856, "loss": 1.2813, "step": 428 }, { "epoch": 0.18421900161030597, "grad_norm": 0.1602768748998642, "learning_rate": 0.0001843047399071694, "loss": 1.1808, "step": 429 }, { "epoch": 0.1846484165324745, "grad_norm": 0.16247111558914185, "learning_rate": 0.00018423176897717141, "loss": 0.9986, "step": 430 }, { "epoch": 0.18507783145464304, "grad_norm": 0.152525395154953, "learning_rate": 0.00018415864332636104, "loss": 1.0343, "step": 431 }, { "epoch": 0.1855072463768116, "grad_norm": 0.17383332550525665, "learning_rate": 0.00018408536308905878, "loss": 0.981, "step": 432 }, { "epoch": 0.18593666129898015, "grad_norm": 0.17568951845169067, "learning_rate": 0.0001840119283998692, "loss": 1.1869, "step": 433 }, { "epoch": 0.18636607622114867, "grad_norm": 0.18272657692432404, "learning_rate": 0.00018393833939368056, "loss": 1.0451, "step": 434 }, { "epoch": 0.18679549114331723, "grad_norm": 0.1720953732728958, "learning_rate": 0.0001838645962056645, "loss": 0.914, "step": 435 }, { "epoch": 0.18722490606548578, "grad_norm": 0.20161637663841248, "learning_rate": 0.00018379069897127601, "loss": 1.189, "step": 436 }, { "epoch": 0.18765432098765433, "grad_norm": 0.17120416462421417, "learning_rate": 0.00018371664782625287, "loss": 1.0226, "step": 437 }, { "epoch": 0.18808373590982286, "grad_norm": 0.19251450896263123, "learning_rate": 0.00018364244290661568, "loss": 1.1604, "step": 438 }, { "epoch": 0.1885131508319914, "grad_norm": 0.16157999634742737, "learning_rate": 0.00018356808434866748, "loss": 1.1928, "step": 439 }, { "epoch": 0.18894256575415996, "grad_norm": 0.16121311485767365, "learning_rate": 0.00018349357228899347, "loss": 0.8092, "step": 440 }, { "epoch": 0.18937198067632852, "grad_norm": 0.18607012927532196, "learning_rate": 0.0001834189068644609, "loss": 1.0936, "step": 441 }, { "epoch": 0.18980139559849704, "grad_norm": 0.15668633580207825, "learning_rate": 0.00018334408821221864, "loss": 1.1534, "step": 442 }, { "epoch": 0.1902308105206656, "grad_norm": 0.1856255829334259, "learning_rate": 0.0001832691164696971, "loss": 1.0586, "step": 443 }, { "epoch": 0.19066022544283415, "grad_norm": 0.14413128793239594, "learning_rate": 0.0001831939917746078, "loss": 0.9904, "step": 444 }, { "epoch": 0.19108964036500267, "grad_norm": 0.15035253763198853, "learning_rate": 0.0001831187142649433, "loss": 0.9658, "step": 445 }, { "epoch": 0.19151905528717122, "grad_norm": 0.19175738096237183, "learning_rate": 0.00018304328407897676, "loss": 1.1088, "step": 446 }, { "epoch": 0.19194847020933978, "grad_norm": 0.1885284036397934, "learning_rate": 0.0001829677013552619, "loss": 1.233, "step": 447 }, { "epoch": 0.19237788513150833, "grad_norm": 0.16992244124412537, "learning_rate": 0.00018289196623263253, "loss": 0.9719, "step": 448 }, { "epoch": 0.19280730005367686, "grad_norm": 0.17281030118465424, "learning_rate": 0.00018281607885020242, "loss": 0.9497, "step": 449 }, { "epoch": 0.1932367149758454, "grad_norm": 0.18136782944202423, "learning_rate": 0.00018274003934736505, "loss": 1.0897, "step": 450 }, { "epoch": 0.19366612989801396, "grad_norm": 0.15827056765556335, "learning_rate": 0.0001826638478637933, "loss": 0.9363, "step": 451 }, { "epoch": 0.19409554482018251, "grad_norm": 0.20995981991291046, "learning_rate": 0.00018258750453943918, "loss": 1.049, "step": 452 }, { "epoch": 0.19452495974235104, "grad_norm": 0.17867140471935272, "learning_rate": 0.00018251100951453367, "loss": 1.0149, "step": 453 }, { "epoch": 0.1949543746645196, "grad_norm": 0.1835739016532898, "learning_rate": 0.00018243436292958638, "loss": 1.1985, "step": 454 }, { "epoch": 0.19538378958668814, "grad_norm": 0.17710070312023163, "learning_rate": 0.0001823575649253853, "loss": 0.9616, "step": 455 }, { "epoch": 0.19581320450885667, "grad_norm": 0.16101765632629395, "learning_rate": 0.0001822806156429965, "loss": 1.2936, "step": 456 }, { "epoch": 0.19624261943102522, "grad_norm": 0.1469978541135788, "learning_rate": 0.00018220351522376407, "loss": 1.1137, "step": 457 }, { "epoch": 0.19667203435319378, "grad_norm": 0.17269261181354523, "learning_rate": 0.00018212626380930967, "loss": 1.35, "step": 458 }, { "epoch": 0.19710144927536233, "grad_norm": 0.18232795596122742, "learning_rate": 0.0001820488615415321, "loss": 1.0693, "step": 459 }, { "epoch": 0.19753086419753085, "grad_norm": 0.19020916521549225, "learning_rate": 0.00018197130856260758, "loss": 1.085, "step": 460 }, { "epoch": 0.1979602791196994, "grad_norm": 0.1793365776538849, "learning_rate": 0.00018189360501498896, "loss": 1.1711, "step": 461 }, { "epoch": 0.19838969404186796, "grad_norm": 0.17583267390727997, "learning_rate": 0.00018181575104140568, "loss": 1.2276, "step": 462 }, { "epoch": 0.1988191089640365, "grad_norm": 0.16527873277664185, "learning_rate": 0.00018173774678486356, "loss": 1.1692, "step": 463 }, { "epoch": 0.19924852388620504, "grad_norm": 0.15330368280410767, "learning_rate": 0.00018165959238864446, "loss": 1.0472, "step": 464 }, { "epoch": 0.1996779388083736, "grad_norm": 0.18043364584445953, "learning_rate": 0.00018158128799630594, "loss": 1.1462, "step": 465 }, { "epoch": 0.20010735373054214, "grad_norm": 0.1676676869392395, "learning_rate": 0.00018150283375168114, "loss": 1.1693, "step": 466 }, { "epoch": 0.2005367686527107, "grad_norm": 0.17557865381240845, "learning_rate": 0.00018142422979887848, "loss": 0.9993, "step": 467 }, { "epoch": 0.20096618357487922, "grad_norm": 0.17406152188777924, "learning_rate": 0.00018134547628228132, "loss": 1.2718, "step": 468 }, { "epoch": 0.20139559849704777, "grad_norm": 0.16246803104877472, "learning_rate": 0.00018126657334654772, "loss": 0.906, "step": 469 }, { "epoch": 0.20182501341921633, "grad_norm": 0.19664785265922546, "learning_rate": 0.00018118752113661034, "loss": 1.1194, "step": 470 }, { "epoch": 0.20225442834138485, "grad_norm": 0.17243239283561707, "learning_rate": 0.00018110831979767586, "loss": 0.9779, "step": 471 }, { "epoch": 0.2026838432635534, "grad_norm": 0.1569763720035553, "learning_rate": 0.000181028969475225, "loss": 1.2128, "step": 472 }, { "epoch": 0.20311325818572196, "grad_norm": 0.17845910787582397, "learning_rate": 0.0001809494703150121, "loss": 1.087, "step": 473 }, { "epoch": 0.2035426731078905, "grad_norm": 0.15362991392612457, "learning_rate": 0.0001808698224630649, "loss": 0.8389, "step": 474 }, { "epoch": 0.20397208803005903, "grad_norm": 0.1604796200990677, "learning_rate": 0.00018079002606568426, "loss": 0.9256, "step": 475 }, { "epoch": 0.2044015029522276, "grad_norm": 0.16644595563411713, "learning_rate": 0.00018071008126944386, "loss": 1.0327, "step": 476 }, { "epoch": 0.20483091787439614, "grad_norm": 0.1740645319223404, "learning_rate": 0.00018062998822119007, "loss": 1.0971, "step": 477 }, { "epoch": 0.2052603327965647, "grad_norm": 0.17992867529392242, "learning_rate": 0.00018054974706804147, "loss": 0.8937, "step": 478 }, { "epoch": 0.20568974771873322, "grad_norm": 0.16396278142929077, "learning_rate": 0.00018046935795738872, "loss": 0.8748, "step": 479 }, { "epoch": 0.20611916264090177, "grad_norm": 0.16882237792015076, "learning_rate": 0.00018038882103689426, "loss": 0.859, "step": 480 }, { "epoch": 0.20654857756307032, "grad_norm": 0.142868772149086, "learning_rate": 0.00018030813645449208, "loss": 0.8051, "step": 481 }, { "epoch": 0.20697799248523885, "grad_norm": 0.17199325561523438, "learning_rate": 0.00018022730435838727, "loss": 1.1636, "step": 482 }, { "epoch": 0.2074074074074074, "grad_norm": 0.17648378014564514, "learning_rate": 0.00018014632489705604, "loss": 1.1394, "step": 483 }, { "epoch": 0.20783682232957595, "grad_norm": 0.1827528178691864, "learning_rate": 0.0001800651982192452, "loss": 1.1095, "step": 484 }, { "epoch": 0.2082662372517445, "grad_norm": 0.13080927729606628, "learning_rate": 0.00017998392447397197, "loss": 0.7807, "step": 485 }, { "epoch": 0.20869565217391303, "grad_norm": 0.17123474180698395, "learning_rate": 0.00017990250381052372, "loss": 1.2197, "step": 486 }, { "epoch": 0.20912506709608158, "grad_norm": 0.17640285193920135, "learning_rate": 0.00017982093637845768, "loss": 1.1285, "step": 487 }, { "epoch": 0.20955448201825014, "grad_norm": 0.1964927464723587, "learning_rate": 0.00017973922232760074, "loss": 1.3984, "step": 488 }, { "epoch": 0.2099838969404187, "grad_norm": 0.18344812095165253, "learning_rate": 0.00017965736180804905, "loss": 0.8897, "step": 489 }, { "epoch": 0.21041331186258722, "grad_norm": 0.17509503662586212, "learning_rate": 0.00017957535497016772, "loss": 1.0808, "step": 490 }, { "epoch": 0.21084272678475577, "grad_norm": 0.16462327539920807, "learning_rate": 0.00017949320196459077, "loss": 0.982, "step": 491 }, { "epoch": 0.21127214170692432, "grad_norm": 0.17547428607940674, "learning_rate": 0.00017941090294222066, "loss": 1.0466, "step": 492 }, { "epoch": 0.21170155662909287, "grad_norm": 0.18705184757709503, "learning_rate": 0.000179328458054228, "loss": 1.1574, "step": 493 }, { "epoch": 0.2121309715512614, "grad_norm": 0.17873774468898773, "learning_rate": 0.00017924586745205143, "loss": 1.3599, "step": 494 }, { "epoch": 0.21256038647342995, "grad_norm": 0.1929023265838623, "learning_rate": 0.0001791631312873971, "loss": 1.2727, "step": 495 }, { "epoch": 0.2129898013955985, "grad_norm": 0.1473141312599182, "learning_rate": 0.00017908024971223876, "loss": 1.0392, "step": 496 }, { "epoch": 0.21341921631776703, "grad_norm": 0.1641705185174942, "learning_rate": 0.00017899722287881699, "loss": 0.9458, "step": 497 }, { "epoch": 0.21384863123993558, "grad_norm": 0.16218411922454834, "learning_rate": 0.00017891405093963938, "loss": 0.8449, "step": 498 }, { "epoch": 0.21427804616210414, "grad_norm": 0.15134935081005096, "learning_rate": 0.00017883073404748002, "loss": 1.0388, "step": 499 }, { "epoch": 0.2147074610842727, "grad_norm": 0.13633696734905243, "learning_rate": 0.00017874727235537918, "loss": 0.6724, "step": 500 }, { "epoch": 0.2151368760064412, "grad_norm": 0.18835188448429108, "learning_rate": 0.0001786636660166432, "loss": 1.2972, "step": 501 }, { "epoch": 0.21556629092860977, "grad_norm": 0.16085697710514069, "learning_rate": 0.00017857991518484406, "loss": 1.0825, "step": 502 }, { "epoch": 0.21599570585077832, "grad_norm": 0.17221853137016296, "learning_rate": 0.00017849602001381918, "loss": 1.2739, "step": 503 }, { "epoch": 0.21642512077294687, "grad_norm": 0.1634456366300583, "learning_rate": 0.00017841198065767107, "loss": 0.9839, "step": 504 }, { "epoch": 0.2168545356951154, "grad_norm": 0.18110795319080353, "learning_rate": 0.00017832779727076708, "loss": 1.3229, "step": 505 }, { "epoch": 0.21728395061728395, "grad_norm": 0.13345003128051758, "learning_rate": 0.00017824347000773927, "loss": 0.8383, "step": 506 }, { "epoch": 0.2177133655394525, "grad_norm": 0.15196914970874786, "learning_rate": 0.00017815899902348377, "loss": 1.0096, "step": 507 }, { "epoch": 0.21814278046162103, "grad_norm": 0.17290259897708893, "learning_rate": 0.00017807438447316076, "loss": 0.8173, "step": 508 }, { "epoch": 0.21857219538378958, "grad_norm": 0.16334594786167145, "learning_rate": 0.00017798962651219424, "loss": 1.0307, "step": 509 }, { "epoch": 0.21900161030595813, "grad_norm": 0.16071034967899323, "learning_rate": 0.00017790472529627152, "loss": 1.0597, "step": 510 }, { "epoch": 0.21943102522812669, "grad_norm": 0.14360260963439941, "learning_rate": 0.0001778196809813431, "loss": 0.9411, "step": 511 }, { "epoch": 0.2198604401502952, "grad_norm": 0.1717967838048935, "learning_rate": 0.0001777344937236223, "loss": 1.1883, "step": 512 }, { "epoch": 0.22028985507246376, "grad_norm": 0.1511518657207489, "learning_rate": 0.00017764916367958502, "loss": 0.9472, "step": 513 }, { "epoch": 0.22071926999463232, "grad_norm": 0.1570175439119339, "learning_rate": 0.00017756369100596942, "loss": 0.8677, "step": 514 }, { "epoch": 0.22114868491680087, "grad_norm": 0.17275646328926086, "learning_rate": 0.00017747807585977575, "loss": 1.1496, "step": 515 }, { "epoch": 0.2215780998389694, "grad_norm": 0.16934038698673248, "learning_rate": 0.00017739231839826575, "loss": 0.9445, "step": 516 }, { "epoch": 0.22200751476113795, "grad_norm": 0.18247805535793304, "learning_rate": 0.00017730641877896275, "loss": 1.2478, "step": 517 }, { "epoch": 0.2224369296833065, "grad_norm": 0.17023034393787384, "learning_rate": 0.00017722037715965115, "loss": 1.0587, "step": 518 }, { "epoch": 0.22286634460547505, "grad_norm": 0.17108768224716187, "learning_rate": 0.00017713419369837617, "loss": 1.2587, "step": 519 }, { "epoch": 0.22329575952764358, "grad_norm": 0.16779127717018127, "learning_rate": 0.00017704786855344363, "loss": 0.8168, "step": 520 }, { "epoch": 0.22372517444981213, "grad_norm": 0.17807330191135406, "learning_rate": 0.00017696140188341945, "loss": 1.2265, "step": 521 }, { "epoch": 0.22415458937198068, "grad_norm": 0.15085840225219727, "learning_rate": 0.0001768747938471297, "loss": 0.9862, "step": 522 }, { "epoch": 0.2245840042941492, "grad_norm": 0.16962507367134094, "learning_rate": 0.00017678804460366, "loss": 1.2014, "step": 523 }, { "epoch": 0.22501341921631776, "grad_norm": 0.20221249759197235, "learning_rate": 0.00017670115431235538, "loss": 1.15, "step": 524 }, { "epoch": 0.22544283413848631, "grad_norm": 0.1703234761953354, "learning_rate": 0.00017661412313281995, "loss": 1.1397, "step": 525 }, { "epoch": 0.22587224906065487, "grad_norm": 0.15764622390270233, "learning_rate": 0.00017652695122491663, "loss": 1.0963, "step": 526 }, { "epoch": 0.2263016639828234, "grad_norm": 0.1757158637046814, "learning_rate": 0.00017643963874876677, "loss": 1.2059, "step": 527 }, { "epoch": 0.22673107890499195, "grad_norm": 0.17365393042564392, "learning_rate": 0.00017635218586474998, "loss": 1.0233, "step": 528 }, { "epoch": 0.2271604938271605, "grad_norm": 0.1677040010690689, "learning_rate": 0.0001762645927335038, "loss": 1.1272, "step": 529 }, { "epoch": 0.22758990874932905, "grad_norm": 0.1669892817735672, "learning_rate": 0.0001761768595159233, "loss": 0.9677, "step": 530 }, { "epoch": 0.22801932367149758, "grad_norm": 0.19120194017887115, "learning_rate": 0.00017608898637316096, "loss": 1.2069, "step": 531 }, { "epoch": 0.22844873859366613, "grad_norm": 0.15439291298389435, "learning_rate": 0.00017600097346662623, "loss": 0.8796, "step": 532 }, { "epoch": 0.22887815351583468, "grad_norm": 0.1759713590145111, "learning_rate": 0.00017591282095798526, "loss": 0.7718, "step": 533 }, { "epoch": 0.22930756843800323, "grad_norm": 0.17327053844928741, "learning_rate": 0.00017582452900916063, "loss": 1.4072, "step": 534 }, { "epoch": 0.22973698336017176, "grad_norm": 0.1783333122730255, "learning_rate": 0.0001757360977823312, "loss": 1.4336, "step": 535 }, { "epoch": 0.2301663982823403, "grad_norm": 0.16632091999053955, "learning_rate": 0.00017564752743993143, "loss": 0.9684, "step": 536 }, { "epoch": 0.23059581320450886, "grad_norm": 0.17739808559417725, "learning_rate": 0.00017555881814465148, "loss": 0.9855, "step": 537 }, { "epoch": 0.2310252281266774, "grad_norm": 0.16482579708099365, "learning_rate": 0.00017546997005943665, "loss": 1.1435, "step": 538 }, { "epoch": 0.23145464304884594, "grad_norm": 0.19359920918941498, "learning_rate": 0.00017538098334748722, "loss": 1.2677, "step": 539 }, { "epoch": 0.2318840579710145, "grad_norm": 0.1723766326904297, "learning_rate": 0.00017529185817225816, "loss": 1.3, "step": 540 }, { "epoch": 0.23231347289318305, "grad_norm": 0.18761831521987915, "learning_rate": 0.00017520259469745866, "loss": 1.2971, "step": 541 }, { "epoch": 0.23274288781535157, "grad_norm": 0.139839306473732, "learning_rate": 0.00017511319308705198, "loss": 0.975, "step": 542 }, { "epoch": 0.23317230273752013, "grad_norm": 0.17375217378139496, "learning_rate": 0.00017502365350525524, "loss": 0.9755, "step": 543 }, { "epoch": 0.23360171765968868, "grad_norm": 0.1978386640548706, "learning_rate": 0.00017493397611653875, "loss": 1.3327, "step": 544 }, { "epoch": 0.23403113258185723, "grad_norm": 0.21363678574562073, "learning_rate": 0.0001748441610856262, "loss": 1.1973, "step": 545 }, { "epoch": 0.23446054750402576, "grad_norm": 0.18306796252727509, "learning_rate": 0.00017475420857749398, "loss": 1.0939, "step": 546 }, { "epoch": 0.2348899624261943, "grad_norm": 0.1709376573562622, "learning_rate": 0.00017466411875737098, "loss": 1.1383, "step": 547 }, { "epoch": 0.23531937734836286, "grad_norm": 0.19025692343711853, "learning_rate": 0.0001745738917907384, "loss": 0.9749, "step": 548 }, { "epoch": 0.2357487922705314, "grad_norm": 0.1548996865749359, "learning_rate": 0.00017448352784332926, "loss": 1.1391, "step": 549 }, { "epoch": 0.23617820719269994, "grad_norm": 0.15124543011188507, "learning_rate": 0.00017439302708112826, "loss": 1.0438, "step": 550 }, { "epoch": 0.2366076221148685, "grad_norm": 0.178885355591774, "learning_rate": 0.00017430238967037137, "loss": 1.2482, "step": 551 }, { "epoch": 0.23703703703703705, "grad_norm": 0.16636434197425842, "learning_rate": 0.00017421161577754564, "loss": 1.079, "step": 552 }, { "epoch": 0.23746645195920557, "grad_norm": 0.16374240815639496, "learning_rate": 0.00017412070556938872, "loss": 1.1511, "step": 553 }, { "epoch": 0.23789586688137412, "grad_norm": 0.15488043427467346, "learning_rate": 0.00017402965921288865, "loss": 1.1565, "step": 554 }, { "epoch": 0.23832528180354268, "grad_norm": 0.16751627624034882, "learning_rate": 0.00017393847687528367, "loss": 1.1209, "step": 555 }, { "epoch": 0.23875469672571123, "grad_norm": 0.17798767983913422, "learning_rate": 0.00017384715872406168, "loss": 1.2118, "step": 556 }, { "epoch": 0.23918411164787975, "grad_norm": 0.17087987065315247, "learning_rate": 0.00017375570492696009, "loss": 0.9564, "step": 557 }, { "epoch": 0.2396135265700483, "grad_norm": 0.14827404916286469, "learning_rate": 0.00017366411565196543, "loss": 0.9969, "step": 558 }, { "epoch": 0.24004294149221686, "grad_norm": 0.16151390969753265, "learning_rate": 0.00017357239106731317, "loss": 1.0805, "step": 559 }, { "epoch": 0.2404723564143854, "grad_norm": 0.20443901419639587, "learning_rate": 0.00017348053134148727, "loss": 1.1291, "step": 560 }, { "epoch": 0.24090177133655394, "grad_norm": 0.15805144608020782, "learning_rate": 0.00017338853664321992, "loss": 1.067, "step": 561 }, { "epoch": 0.2413311862587225, "grad_norm": 0.17929919064044952, "learning_rate": 0.00017329640714149123, "loss": 1.1768, "step": 562 }, { "epoch": 0.24176060118089104, "grad_norm": 0.15413890779018402, "learning_rate": 0.00017320414300552893, "loss": 1.1613, "step": 563 }, { "epoch": 0.24219001610305957, "grad_norm": 0.16163668036460876, "learning_rate": 0.0001731117444048081, "loss": 1.0257, "step": 564 }, { "epoch": 0.24261943102522812, "grad_norm": 0.17742857336997986, "learning_rate": 0.0001730192115090507, "loss": 1.0139, "step": 565 }, { "epoch": 0.24304884594739667, "grad_norm": 0.1430206149816513, "learning_rate": 0.0001729265444882255, "loss": 0.8641, "step": 566 }, { "epoch": 0.24347826086956523, "grad_norm": 0.1846974492073059, "learning_rate": 0.00017283374351254754, "loss": 1.3239, "step": 567 }, { "epoch": 0.24390767579173375, "grad_norm": 0.16652631759643555, "learning_rate": 0.00017274080875247794, "loss": 1.0221, "step": 568 }, { "epoch": 0.2443370907139023, "grad_norm": 0.1801396608352661, "learning_rate": 0.00017264774037872358, "loss": 1.2199, "step": 569 }, { "epoch": 0.24476650563607086, "grad_norm": 0.1728580743074417, "learning_rate": 0.00017255453856223675, "loss": 1.0899, "step": 570 }, { "epoch": 0.2451959205582394, "grad_norm": 0.1778605431318283, "learning_rate": 0.00017246120347421488, "loss": 0.949, "step": 571 }, { "epoch": 0.24562533548040794, "grad_norm": 0.16379563510417938, "learning_rate": 0.00017236773528610017, "loss": 1.2364, "step": 572 }, { "epoch": 0.2460547504025765, "grad_norm": 0.15087537467479706, "learning_rate": 0.0001722741341695793, "loss": 0.9602, "step": 573 }, { "epoch": 0.24648416532474504, "grad_norm": 0.18357989192008972, "learning_rate": 0.00017218040029658315, "loss": 1.2449, "step": 574 }, { "epoch": 0.24691358024691357, "grad_norm": 0.1720157265663147, "learning_rate": 0.00017208653383928642, "loss": 1.1534, "step": 575 }, { "epoch": 0.24734299516908212, "grad_norm": 0.19645382463932037, "learning_rate": 0.00017199253497010743, "loss": 1.0639, "step": 576 }, { "epoch": 0.24777241009125067, "grad_norm": 0.1753363013267517, "learning_rate": 0.00017189840386170756, "loss": 0.8053, "step": 577 }, { "epoch": 0.24820182501341922, "grad_norm": 0.19694557785987854, "learning_rate": 0.00017180414068699126, "loss": 1.0593, "step": 578 }, { "epoch": 0.24863123993558775, "grad_norm": 0.20301617681980133, "learning_rate": 0.00017170974561910542, "loss": 1.2998, "step": 579 }, { "epoch": 0.2490606548577563, "grad_norm": 0.18933315575122833, "learning_rate": 0.00017161521883143934, "loss": 1.2534, "step": 580 }, { "epoch": 0.24949006977992486, "grad_norm": 0.17308446764945984, "learning_rate": 0.00017152056049762418, "loss": 1.2115, "step": 581 }, { "epoch": 0.2499194847020934, "grad_norm": 0.17606200277805328, "learning_rate": 0.0001714257707915327, "loss": 1.0521, "step": 582 } ], "logging_steps": 1, "max_steps": 2328, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 291, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.7796911480465e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }