nicoboss's picture
Upload folder using huggingface_hub
c48814e verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.2499194847020934,
"eval_steps": 500,
"global_step": 582,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00042941492216854533,
"grad_norm": 0.11985349655151367,
"learning_rate": 2e-05,
"loss": 1.3519,
"step": 1
},
{
"epoch": 0.0008588298443370907,
"grad_norm": 0.10720210522413254,
"learning_rate": 4e-05,
"loss": 1.226,
"step": 2
},
{
"epoch": 0.0012882447665056361,
"grad_norm": 0.13469132781028748,
"learning_rate": 6e-05,
"loss": 1.4841,
"step": 3
},
{
"epoch": 0.0017176596886741813,
"grad_norm": 0.1580151468515396,
"learning_rate": 8e-05,
"loss": 1.6834,
"step": 4
},
{
"epoch": 0.0021470746108427268,
"grad_norm": 0.1583908349275589,
"learning_rate": 0.0001,
"loss": 1.5718,
"step": 5
},
{
"epoch": 0.0025764895330112722,
"grad_norm": 0.1486492156982422,
"learning_rate": 0.00012,
"loss": 1.4708,
"step": 6
},
{
"epoch": 0.0030059044551798177,
"grad_norm": 0.15314875543117523,
"learning_rate": 0.00014,
"loss": 1.3917,
"step": 7
},
{
"epoch": 0.0034353193773483627,
"grad_norm": 0.1677706390619278,
"learning_rate": 0.00016,
"loss": 1.4053,
"step": 8
},
{
"epoch": 0.003864734299516908,
"grad_norm": 0.17734883725643158,
"learning_rate": 0.00018,
"loss": 1.4917,
"step": 9
},
{
"epoch": 0.0042941492216854536,
"grad_norm": 0.15920934081077576,
"learning_rate": 0.0002,
"loss": 1.369,
"step": 10
},
{
"epoch": 0.0047235641438539986,
"grad_norm": 0.14339257776737213,
"learning_rate": 0.00019999990815768547,
"loss": 1.5381,
"step": 11
},
{
"epoch": 0.0051529790660225444,
"grad_norm": 0.18288248777389526,
"learning_rate": 0.00019999963263091051,
"loss": 1.6338,
"step": 12
},
{
"epoch": 0.0055823939881910895,
"grad_norm": 0.13021744787693024,
"learning_rate": 0.00019999917342018129,
"loss": 1.047,
"step": 13
},
{
"epoch": 0.006011808910359635,
"grad_norm": 0.14156687259674072,
"learning_rate": 0.00019999853052634123,
"loss": 1.2525,
"step": 14
},
{
"epoch": 0.00644122383252818,
"grad_norm": 0.1400100141763687,
"learning_rate": 0.0001999977039505713,
"loss": 1.2294,
"step": 15
},
{
"epoch": 0.006870638754696725,
"grad_norm": 0.16356173157691956,
"learning_rate": 0.00019999669369438975,
"loss": 1.3664,
"step": 16
},
{
"epoch": 0.007300053676865271,
"grad_norm": 0.17197328805923462,
"learning_rate": 0.00019999549975965227,
"loss": 1.3922,
"step": 17
},
{
"epoch": 0.007729468599033816,
"grad_norm": 0.1663227528333664,
"learning_rate": 0.00019999412214855196,
"loss": 1.1996,
"step": 18
},
{
"epoch": 0.008158883521202361,
"grad_norm": 0.1358145773410797,
"learning_rate": 0.00019999256086361924,
"loss": 0.9447,
"step": 19
},
{
"epoch": 0.008588298443370907,
"grad_norm": 0.14678195118904114,
"learning_rate": 0.000199990815907722,
"loss": 1.3465,
"step": 20
},
{
"epoch": 0.009017713365539453,
"grad_norm": 0.14393630623817444,
"learning_rate": 0.00019998888728406543,
"loss": 1.0345,
"step": 21
},
{
"epoch": 0.009447128287707997,
"grad_norm": 0.1871100217103958,
"learning_rate": 0.00019998677499619206,
"loss": 1.1669,
"step": 22
},
{
"epoch": 0.009876543209876543,
"grad_norm": 0.12790684401988983,
"learning_rate": 0.00019998447904798195,
"loss": 0.9759,
"step": 23
},
{
"epoch": 0.010305958132045089,
"grad_norm": 0.1504671722650528,
"learning_rate": 0.00019998199944365236,
"loss": 1.3362,
"step": 24
},
{
"epoch": 0.010735373054213635,
"grad_norm": 0.14933271706104279,
"learning_rate": 0.00019997933618775787,
"loss": 1.2592,
"step": 25
},
{
"epoch": 0.011164787976382179,
"grad_norm": 0.1384006291627884,
"learning_rate": 0.00019997648928519055,
"loss": 1.0959,
"step": 26
},
{
"epoch": 0.011594202898550725,
"grad_norm": 0.12688492238521576,
"learning_rate": 0.00019997345874117972,
"loss": 1.1937,
"step": 27
},
{
"epoch": 0.01202361782071927,
"grad_norm": 0.14218132197856903,
"learning_rate": 0.00019997024456129195,
"loss": 1.3615,
"step": 28
},
{
"epoch": 0.012453032742887815,
"grad_norm": 0.16093435883522034,
"learning_rate": 0.0001999668467514313,
"loss": 1.0413,
"step": 29
},
{
"epoch": 0.01288244766505636,
"grad_norm": 0.14973227679729462,
"learning_rate": 0.00019996326531783898,
"loss": 1.0408,
"step": 30
},
{
"epoch": 0.013311862587224907,
"grad_norm": 0.12071070075035095,
"learning_rate": 0.00019995950026709353,
"loss": 1.0642,
"step": 31
},
{
"epoch": 0.01374127750939345,
"grad_norm": 0.1471056491136551,
"learning_rate": 0.00019995555160611073,
"loss": 1.2353,
"step": 32
},
{
"epoch": 0.014170692431561997,
"grad_norm": 0.14476723968982697,
"learning_rate": 0.00019995141934214372,
"loss": 1.1288,
"step": 33
},
{
"epoch": 0.014600107353730542,
"grad_norm": 0.1581466645002365,
"learning_rate": 0.0001999471034827828,
"loss": 1.2426,
"step": 34
},
{
"epoch": 0.015029522275899088,
"grad_norm": 0.15047816932201385,
"learning_rate": 0.0001999426040359556,
"loss": 1.044,
"step": 35
},
{
"epoch": 0.015458937198067632,
"grad_norm": 0.13698647916316986,
"learning_rate": 0.00019993792100992682,
"loss": 1.0759,
"step": 36
},
{
"epoch": 0.015888352120236177,
"grad_norm": 0.16587479412555695,
"learning_rate": 0.0001999330544132985,
"loss": 1.1251,
"step": 37
},
{
"epoch": 0.016317767042404722,
"grad_norm": 0.14546941220760345,
"learning_rate": 0.00019992800425500988,
"loss": 1.0911,
"step": 38
},
{
"epoch": 0.01674718196457327,
"grad_norm": 0.137843519449234,
"learning_rate": 0.00019992277054433727,
"loss": 1.1183,
"step": 39
},
{
"epoch": 0.017176596886741814,
"grad_norm": 0.14544665813446045,
"learning_rate": 0.00019991735329089416,
"loss": 1.1161,
"step": 40
},
{
"epoch": 0.01760601180891036,
"grad_norm": 0.16081300377845764,
"learning_rate": 0.00019991175250463127,
"loss": 1.2546,
"step": 41
},
{
"epoch": 0.018035426731078906,
"grad_norm": 0.15027405321598053,
"learning_rate": 0.0001999059681958364,
"loss": 1.0595,
"step": 42
},
{
"epoch": 0.018464841653247452,
"grad_norm": 0.14687219262123108,
"learning_rate": 0.00019990000037513437,
"loss": 1.2931,
"step": 43
},
{
"epoch": 0.018894256575415994,
"grad_norm": 0.1763402223587036,
"learning_rate": 0.0001998938490534872,
"loss": 1.4514,
"step": 44
},
{
"epoch": 0.01932367149758454,
"grad_norm": 0.17130351066589355,
"learning_rate": 0.00019988751424219388,
"loss": 1.3405,
"step": 45
},
{
"epoch": 0.019753086419753086,
"grad_norm": 0.14724081754684448,
"learning_rate": 0.00019988099595289054,
"loss": 0.9397,
"step": 46
},
{
"epoch": 0.020182501341921632,
"grad_norm": 0.14184130728244781,
"learning_rate": 0.00019987429419755022,
"loss": 1.1355,
"step": 47
},
{
"epoch": 0.020611916264090178,
"grad_norm": 0.1490873247385025,
"learning_rate": 0.00019986740898848306,
"loss": 1.1162,
"step": 48
},
{
"epoch": 0.021041331186258724,
"grad_norm": 0.1417856514453888,
"learning_rate": 0.00019986034033833613,
"loss": 1.0778,
"step": 49
},
{
"epoch": 0.02147074610842727,
"grad_norm": 0.14795203506946564,
"learning_rate": 0.00019985308826009338,
"loss": 1.0645,
"step": 50
},
{
"epoch": 0.021900161030595812,
"grad_norm": 0.18487784266471863,
"learning_rate": 0.00019984565276707583,
"loss": 1.0634,
"step": 51
},
{
"epoch": 0.022329575952764358,
"grad_norm": 0.15679900348186493,
"learning_rate": 0.00019983803387294135,
"loss": 1.2826,
"step": 52
},
{
"epoch": 0.022758990874932904,
"grad_norm": 0.1397986263036728,
"learning_rate": 0.00019983023159168465,
"loss": 1.1745,
"step": 53
},
{
"epoch": 0.02318840579710145,
"grad_norm": 0.13861894607543945,
"learning_rate": 0.00019982224593763733,
"loss": 0.9461,
"step": 54
},
{
"epoch": 0.023617820719269995,
"grad_norm": 0.1317225992679596,
"learning_rate": 0.00019981407692546777,
"loss": 0.9315,
"step": 55
},
{
"epoch": 0.02404723564143854,
"grad_norm": 0.1468420773744583,
"learning_rate": 0.00019980572457018123,
"loss": 1.2609,
"step": 56
},
{
"epoch": 0.024476650563607084,
"grad_norm": 0.14463701844215393,
"learning_rate": 0.0001997971888871197,
"loss": 1.1092,
"step": 57
},
{
"epoch": 0.02490606548577563,
"grad_norm": 0.14022503793239594,
"learning_rate": 0.0001997884698919619,
"loss": 0.9528,
"step": 58
},
{
"epoch": 0.025335480407944175,
"grad_norm": 0.14202667772769928,
"learning_rate": 0.00019977956760072334,
"loss": 1.1813,
"step": 59
},
{
"epoch": 0.02576489533011272,
"grad_norm": 0.1546659618616104,
"learning_rate": 0.00019977048202975608,
"loss": 1.348,
"step": 60
},
{
"epoch": 0.026194310252281267,
"grad_norm": 0.1386214941740036,
"learning_rate": 0.00019976121319574896,
"loss": 1.1747,
"step": 61
},
{
"epoch": 0.026623725174449813,
"grad_norm": 0.1513381004333496,
"learning_rate": 0.00019975176111572743,
"loss": 1.0845,
"step": 62
},
{
"epoch": 0.02705314009661836,
"grad_norm": 0.1494988650083542,
"learning_rate": 0.00019974212580705345,
"loss": 1.1647,
"step": 63
},
{
"epoch": 0.0274825550187869,
"grad_norm": 0.16360332071781158,
"learning_rate": 0.0001997323072874256,
"loss": 1.0523,
"step": 64
},
{
"epoch": 0.027911969940955447,
"grad_norm": 0.17121770977973938,
"learning_rate": 0.00019972230557487906,
"loss": 1.3142,
"step": 65
},
{
"epoch": 0.028341384863123993,
"grad_norm": 0.15700650215148926,
"learning_rate": 0.0001997121206877854,
"loss": 1.0519,
"step": 66
},
{
"epoch": 0.02877079978529254,
"grad_norm": 0.15610812604427338,
"learning_rate": 0.00019970175264485266,
"loss": 1.2066,
"step": 67
},
{
"epoch": 0.029200214707461085,
"grad_norm": 0.13125644624233246,
"learning_rate": 0.00019969120146512542,
"loss": 0.9134,
"step": 68
},
{
"epoch": 0.02962962962962963,
"grad_norm": 0.16931581497192383,
"learning_rate": 0.00019968046716798449,
"loss": 1.0536,
"step": 69
},
{
"epoch": 0.030059044551798177,
"grad_norm": 0.14404140412807465,
"learning_rate": 0.00019966954977314715,
"loss": 1.1876,
"step": 70
},
{
"epoch": 0.03048845947396672,
"grad_norm": 0.18353833258152008,
"learning_rate": 0.000199658449300667,
"loss": 1.1881,
"step": 71
},
{
"epoch": 0.030917874396135265,
"grad_norm": 0.1493215709924698,
"learning_rate": 0.00019964716577093388,
"loss": 1.2907,
"step": 72
},
{
"epoch": 0.031347289318303814,
"grad_norm": 0.1731230616569519,
"learning_rate": 0.0001996356992046739,
"loss": 1.2771,
"step": 73
},
{
"epoch": 0.03177670424047235,
"grad_norm": 0.15955105423927307,
"learning_rate": 0.00019962404962294944,
"loss": 1.1304,
"step": 74
},
{
"epoch": 0.0322061191626409,
"grad_norm": 0.1388455629348755,
"learning_rate": 0.00019961221704715886,
"loss": 0.9874,
"step": 75
},
{
"epoch": 0.032635534084809445,
"grad_norm": 0.16745209693908691,
"learning_rate": 0.0001996002014990369,
"loss": 1.1035,
"step": 76
},
{
"epoch": 0.03306494900697799,
"grad_norm": 0.17726710438728333,
"learning_rate": 0.00019958800300065425,
"loss": 1.2322,
"step": 77
},
{
"epoch": 0.03349436392914654,
"grad_norm": 0.16995428502559662,
"learning_rate": 0.00019957562157441765,
"loss": 1.2029,
"step": 78
},
{
"epoch": 0.03392377885131508,
"grad_norm": 0.14299820363521576,
"learning_rate": 0.00019956305724306986,
"loss": 1.0119,
"step": 79
},
{
"epoch": 0.03435319377348363,
"grad_norm": 0.15954792499542236,
"learning_rate": 0.00019955031002968972,
"loss": 1.127,
"step": 80
},
{
"epoch": 0.034782608695652174,
"grad_norm": 0.166239395737648,
"learning_rate": 0.00019953737995769179,
"loss": 1.185,
"step": 81
},
{
"epoch": 0.03521202361782072,
"grad_norm": 0.17462775111198425,
"learning_rate": 0.0001995242670508267,
"loss": 1.3376,
"step": 82
},
{
"epoch": 0.035641438539989266,
"grad_norm": 0.16347193717956543,
"learning_rate": 0.00019951097133318076,
"loss": 1.1657,
"step": 83
},
{
"epoch": 0.03607085346215781,
"grad_norm": 0.1850813329219818,
"learning_rate": 0.00019949749282917626,
"loss": 1.1724,
"step": 84
},
{
"epoch": 0.03650026838432636,
"grad_norm": 0.16961267590522766,
"learning_rate": 0.00019948383156357112,
"loss": 1.1548,
"step": 85
},
{
"epoch": 0.036929683306494904,
"grad_norm": 0.18874776363372803,
"learning_rate": 0.0001994699875614589,
"loss": 1.0729,
"step": 86
},
{
"epoch": 0.03735909822866344,
"grad_norm": 0.17659211158752441,
"learning_rate": 0.000199455960848269,
"loss": 1.2371,
"step": 87
},
{
"epoch": 0.03778851315083199,
"grad_norm": 0.16227173805236816,
"learning_rate": 0.0001994417514497663,
"loss": 1.0381,
"step": 88
},
{
"epoch": 0.038217928073000534,
"grad_norm": 0.14537280797958374,
"learning_rate": 0.0001994273593920513,
"loss": 1.0392,
"step": 89
},
{
"epoch": 0.03864734299516908,
"grad_norm": 0.1782526969909668,
"learning_rate": 0.00019941278470155994,
"loss": 1.1891,
"step": 90
},
{
"epoch": 0.039076757917337626,
"grad_norm": 0.15369926393032074,
"learning_rate": 0.00019939802740506375,
"loss": 0.8279,
"step": 91
},
{
"epoch": 0.03950617283950617,
"grad_norm": 0.1525738388299942,
"learning_rate": 0.00019938308752966957,
"loss": 1.1378,
"step": 92
},
{
"epoch": 0.03993558776167472,
"grad_norm": 0.14440616965293884,
"learning_rate": 0.0001993679651028197,
"loss": 0.9707,
"step": 93
},
{
"epoch": 0.040365002683843264,
"grad_norm": 0.1944921761751175,
"learning_rate": 0.00019935266015229166,
"loss": 1.2753,
"step": 94
},
{
"epoch": 0.04079441760601181,
"grad_norm": 0.17704033851623535,
"learning_rate": 0.00019933717270619833,
"loss": 1.215,
"step": 95
},
{
"epoch": 0.041223832528180356,
"grad_norm": 0.16801829636096954,
"learning_rate": 0.00019932150279298777,
"loss": 1.2177,
"step": 96
},
{
"epoch": 0.0416532474503489,
"grad_norm": 0.14935865998268127,
"learning_rate": 0.00019930565044144318,
"loss": 1.0213,
"step": 97
},
{
"epoch": 0.04208266237251745,
"grad_norm": 0.16046607494354248,
"learning_rate": 0.0001992896156806829,
"loss": 1.0529,
"step": 98
},
{
"epoch": 0.04251207729468599,
"grad_norm": 0.16249270737171173,
"learning_rate": 0.00019927339854016037,
"loss": 1.0861,
"step": 99
},
{
"epoch": 0.04294149221685454,
"grad_norm": 0.16730612516403198,
"learning_rate": 0.0001992569990496639,
"loss": 0.9681,
"step": 100
},
{
"epoch": 0.04337090713902308,
"grad_norm": 0.17123740911483765,
"learning_rate": 0.00019924041723931688,
"loss": 0.9648,
"step": 101
},
{
"epoch": 0.043800322061191624,
"grad_norm": 0.15978355705738068,
"learning_rate": 0.00019922365313957752,
"loss": 1.0962,
"step": 102
},
{
"epoch": 0.04422973698336017,
"grad_norm": 0.18542608618736267,
"learning_rate": 0.00019920670678123893,
"loss": 1.1831,
"step": 103
},
{
"epoch": 0.044659151905528716,
"grad_norm": 0.17981840670108795,
"learning_rate": 0.00019918957819542893,
"loss": 1.2029,
"step": 104
},
{
"epoch": 0.04508856682769726,
"grad_norm": 0.16533541679382324,
"learning_rate": 0.00019917226741361015,
"loss": 1.2239,
"step": 105
},
{
"epoch": 0.04551798174986581,
"grad_norm": 0.1770992875099182,
"learning_rate": 0.0001991547744675798,
"loss": 1.103,
"step": 106
},
{
"epoch": 0.04594739667203435,
"grad_norm": 0.15934127569198608,
"learning_rate": 0.00019913709938946972,
"loss": 0.9117,
"step": 107
},
{
"epoch": 0.0463768115942029,
"grad_norm": 0.1818443238735199,
"learning_rate": 0.00019911924221174636,
"loss": 1.149,
"step": 108
},
{
"epoch": 0.046806226516371445,
"grad_norm": 0.17105095088481903,
"learning_rate": 0.00019910120296721053,
"loss": 1.3834,
"step": 109
},
{
"epoch": 0.04723564143853999,
"grad_norm": 0.1493517905473709,
"learning_rate": 0.00019908298168899765,
"loss": 0.9976,
"step": 110
},
{
"epoch": 0.04766505636070854,
"grad_norm": 0.17170068621635437,
"learning_rate": 0.00019906457841057732,
"loss": 1.0791,
"step": 111
},
{
"epoch": 0.04809447128287708,
"grad_norm": 0.17287380993366241,
"learning_rate": 0.00019904599316575357,
"loss": 1.108,
"step": 112
},
{
"epoch": 0.04852388620504563,
"grad_norm": 0.15946826338768005,
"learning_rate": 0.00019902722598866466,
"loss": 1.0462,
"step": 113
},
{
"epoch": 0.04895330112721417,
"grad_norm": 0.18682260811328888,
"learning_rate": 0.00019900827691378298,
"loss": 1.0757,
"step": 114
},
{
"epoch": 0.04938271604938271,
"grad_norm": 0.15951935946941376,
"learning_rate": 0.00019898914597591506,
"loss": 1.3103,
"step": 115
},
{
"epoch": 0.04981213097155126,
"grad_norm": 0.16503126919269562,
"learning_rate": 0.0001989698332102015,
"loss": 1.1521,
"step": 116
},
{
"epoch": 0.050241545893719805,
"grad_norm": 0.15713706612586975,
"learning_rate": 0.0001989503386521169,
"loss": 1.2906,
"step": 117
},
{
"epoch": 0.05067096081588835,
"grad_norm": 0.1533653736114502,
"learning_rate": 0.00019893066233746978,
"loss": 1.0389,
"step": 118
},
{
"epoch": 0.0511003757380569,
"grad_norm": 0.16496874392032623,
"learning_rate": 0.0001989108043024025,
"loss": 1.2676,
"step": 119
},
{
"epoch": 0.05152979066022544,
"grad_norm": 0.14784802496433258,
"learning_rate": 0.00019889076458339116,
"loss": 0.9091,
"step": 120
},
{
"epoch": 0.05195920558239399,
"grad_norm": 0.1391952782869339,
"learning_rate": 0.00019887054321724565,
"loss": 0.7391,
"step": 121
},
{
"epoch": 0.052388620504562534,
"grad_norm": 0.16542598605155945,
"learning_rate": 0.0001988501402411096,
"loss": 1.26,
"step": 122
},
{
"epoch": 0.05281803542673108,
"grad_norm": 0.1864759474992752,
"learning_rate": 0.00019882955569246007,
"loss": 1.1248,
"step": 123
},
{
"epoch": 0.053247450348899626,
"grad_norm": 0.19127963483333588,
"learning_rate": 0.00019880878960910772,
"loss": 1.2209,
"step": 124
},
{
"epoch": 0.05367686527106817,
"grad_norm": 0.18262384831905365,
"learning_rate": 0.00019878784202919666,
"loss": 1.2114,
"step": 125
},
{
"epoch": 0.05410628019323672,
"grad_norm": 0.16955001652240753,
"learning_rate": 0.0001987667129912044,
"loss": 1.133,
"step": 126
},
{
"epoch": 0.05453569511540526,
"grad_norm": 0.17882367968559265,
"learning_rate": 0.00019874540253394168,
"loss": 1.3044,
"step": 127
},
{
"epoch": 0.0549651100375738,
"grad_norm": 0.20200395584106445,
"learning_rate": 0.00019872391069655258,
"loss": 1.1933,
"step": 128
},
{
"epoch": 0.05539452495974235,
"grad_norm": 0.17120778560638428,
"learning_rate": 0.00019870223751851428,
"loss": 1.0102,
"step": 129
},
{
"epoch": 0.055823939881910895,
"grad_norm": 0.19138963520526886,
"learning_rate": 0.0001986803830396371,
"loss": 1.4741,
"step": 130
},
{
"epoch": 0.05625335480407944,
"grad_norm": 0.181193545460701,
"learning_rate": 0.00019865834730006433,
"loss": 1.1563,
"step": 131
},
{
"epoch": 0.056682769726247986,
"grad_norm": 0.16531504690647125,
"learning_rate": 0.00019863613034027224,
"loss": 1.1427,
"step": 132
},
{
"epoch": 0.05711218464841653,
"grad_norm": 0.1994440257549286,
"learning_rate": 0.00019861373220106997,
"loss": 1.3541,
"step": 133
},
{
"epoch": 0.05754159957058508,
"grad_norm": 0.18033157289028168,
"learning_rate": 0.0001985911529235995,
"loss": 0.9477,
"step": 134
},
{
"epoch": 0.057971014492753624,
"grad_norm": 0.17404161393642426,
"learning_rate": 0.00019856839254933544,
"loss": 1.1277,
"step": 135
},
{
"epoch": 0.05840042941492217,
"grad_norm": 0.17261551320552826,
"learning_rate": 0.00019854545112008514,
"loss": 1.2953,
"step": 136
},
{
"epoch": 0.058829844337090716,
"grad_norm": 0.1669391393661499,
"learning_rate": 0.00019852232867798844,
"loss": 1.2108,
"step": 137
},
{
"epoch": 0.05925925925925926,
"grad_norm": 0.1854487657546997,
"learning_rate": 0.00019849902526551772,
"loss": 1.5342,
"step": 138
},
{
"epoch": 0.05968867418142781,
"grad_norm": 0.18810135126113892,
"learning_rate": 0.0001984755409254778,
"loss": 1.0847,
"step": 139
},
{
"epoch": 0.06011808910359635,
"grad_norm": 0.15636786818504333,
"learning_rate": 0.00019845187570100573,
"loss": 1.1426,
"step": 140
},
{
"epoch": 0.06054750402576489,
"grad_norm": 0.15283016860485077,
"learning_rate": 0.000198428029635571,
"loss": 0.9389,
"step": 141
},
{
"epoch": 0.06097691894793344,
"grad_norm": 0.1785784810781479,
"learning_rate": 0.00019840400277297508,
"loss": 0.8145,
"step": 142
},
{
"epoch": 0.061406333870101984,
"grad_norm": 0.19488206505775452,
"learning_rate": 0.00019837979515735166,
"loss": 1.1245,
"step": 143
},
{
"epoch": 0.06183574879227053,
"grad_norm": 0.1749604046344757,
"learning_rate": 0.00019835540683316638,
"loss": 1.0823,
"step": 144
},
{
"epoch": 0.062265163714439076,
"grad_norm": 0.14947979152202606,
"learning_rate": 0.00019833083784521688,
"loss": 0.9827,
"step": 145
},
{
"epoch": 0.06269457863660763,
"grad_norm": 0.18214192986488342,
"learning_rate": 0.00019830608823863258,
"loss": 1.1311,
"step": 146
},
{
"epoch": 0.06312399355877617,
"grad_norm": 0.15751980245113373,
"learning_rate": 0.0001982811580588747,
"loss": 1.126,
"step": 147
},
{
"epoch": 0.0635534084809447,
"grad_norm": 0.17060008645057678,
"learning_rate": 0.0001982560473517362,
"loss": 1.0999,
"step": 148
},
{
"epoch": 0.06398282340311326,
"grad_norm": 0.15626037120819092,
"learning_rate": 0.00019823075616334155,
"loss": 1.1292,
"step": 149
},
{
"epoch": 0.0644122383252818,
"grad_norm": 0.17362122237682343,
"learning_rate": 0.00019820528454014678,
"loss": 1.0831,
"step": 150
},
{
"epoch": 0.06484165324745035,
"grad_norm": 0.17661671340465546,
"learning_rate": 0.00019817963252893934,
"loss": 1.0467,
"step": 151
},
{
"epoch": 0.06527106816961889,
"grad_norm": 0.1770239919424057,
"learning_rate": 0.00019815380017683805,
"loss": 1.3296,
"step": 152
},
{
"epoch": 0.06570048309178744,
"grad_norm": 0.1600884646177292,
"learning_rate": 0.00019812778753129295,
"loss": 1.1975,
"step": 153
},
{
"epoch": 0.06612989801395598,
"grad_norm": 0.14404766261577606,
"learning_rate": 0.0001981015946400853,
"loss": 1.0152,
"step": 154
},
{
"epoch": 0.06655931293612453,
"grad_norm": 0.15787601470947266,
"learning_rate": 0.0001980752215513274,
"loss": 0.8621,
"step": 155
},
{
"epoch": 0.06698872785829307,
"grad_norm": 0.16410237550735474,
"learning_rate": 0.00019804866831346253,
"loss": 1.1043,
"step": 156
},
{
"epoch": 0.06741814278046163,
"grad_norm": 0.14886626601219177,
"learning_rate": 0.00019802193497526496,
"loss": 1.0065,
"step": 157
},
{
"epoch": 0.06784755770263017,
"grad_norm": 0.18639588356018066,
"learning_rate": 0.00019799502158583966,
"loss": 1.1146,
"step": 158
},
{
"epoch": 0.06827697262479872,
"grad_norm": 0.1470535844564438,
"learning_rate": 0.00019796792819462246,
"loss": 0.9775,
"step": 159
},
{
"epoch": 0.06870638754696726,
"grad_norm": 0.177282452583313,
"learning_rate": 0.0001979406548513797,
"loss": 1.316,
"step": 160
},
{
"epoch": 0.0691358024691358,
"grad_norm": 0.17426224052906036,
"learning_rate": 0.00019791320160620837,
"loss": 1.2854,
"step": 161
},
{
"epoch": 0.06956521739130435,
"grad_norm": 0.16735795140266418,
"learning_rate": 0.0001978855685095358,
"loss": 1.2184,
"step": 162
},
{
"epoch": 0.06999463231347289,
"grad_norm": 0.18738149106502533,
"learning_rate": 0.00019785775561211976,
"loss": 1.1342,
"step": 163
},
{
"epoch": 0.07042404723564144,
"grad_norm": 0.17026057839393616,
"learning_rate": 0.00019782976296504835,
"loss": 1.0973,
"step": 164
},
{
"epoch": 0.07085346215780998,
"grad_norm": 0.14129336178302765,
"learning_rate": 0.00019780159061973964,
"loss": 0.8889,
"step": 165
},
{
"epoch": 0.07128287707997853,
"grad_norm": 0.19238591194152832,
"learning_rate": 0.00019777323862794192,
"loss": 1.0827,
"step": 166
},
{
"epoch": 0.07171229200214707,
"grad_norm": 0.17041011154651642,
"learning_rate": 0.00019774470704173353,
"loss": 1.2057,
"step": 167
},
{
"epoch": 0.07214170692431562,
"grad_norm": 0.18856163322925568,
"learning_rate": 0.00019771599591352252,
"loss": 1.1693,
"step": 168
},
{
"epoch": 0.07257112184648416,
"grad_norm": 0.17438524961471558,
"learning_rate": 0.00019768710529604686,
"loss": 1.1714,
"step": 169
},
{
"epoch": 0.07300053676865272,
"grad_norm": 0.17283211648464203,
"learning_rate": 0.00019765803524237417,
"loss": 1.34,
"step": 170
},
{
"epoch": 0.07342995169082125,
"grad_norm": 0.15461453795433044,
"learning_rate": 0.00019762878580590162,
"loss": 1.1,
"step": 171
},
{
"epoch": 0.07385936661298981,
"grad_norm": 0.1745782196521759,
"learning_rate": 0.00019759935704035598,
"loss": 1.1485,
"step": 172
},
{
"epoch": 0.07428878153515835,
"grad_norm": 0.19017790257930756,
"learning_rate": 0.0001975697489997934,
"loss": 1.2036,
"step": 173
},
{
"epoch": 0.07471819645732689,
"grad_norm": 0.14983102679252625,
"learning_rate": 0.0001975399617385992,
"loss": 0.9465,
"step": 174
},
{
"epoch": 0.07514761137949544,
"grad_norm": 0.1556852161884308,
"learning_rate": 0.0001975099953114881,
"loss": 0.941,
"step": 175
},
{
"epoch": 0.07557702630166398,
"grad_norm": 0.1680162101984024,
"learning_rate": 0.00019747984977350379,
"loss": 1.2423,
"step": 176
},
{
"epoch": 0.07600644122383253,
"grad_norm": 0.17990583181381226,
"learning_rate": 0.00019744952518001893,
"loss": 1.0285,
"step": 177
},
{
"epoch": 0.07643585614600107,
"grad_norm": 0.18733762204647064,
"learning_rate": 0.00019741902158673522,
"loss": 1.3571,
"step": 178
},
{
"epoch": 0.07686527106816962,
"grad_norm": 0.14356885850429535,
"learning_rate": 0.00019738833904968302,
"loss": 0.8155,
"step": 179
},
{
"epoch": 0.07729468599033816,
"grad_norm": 0.19046086072921753,
"learning_rate": 0.00019735747762522147,
"loss": 1.0226,
"step": 180
},
{
"epoch": 0.07772410091250671,
"grad_norm": 0.14588217437267303,
"learning_rate": 0.00019732643737003827,
"loss": 0.8774,
"step": 181
},
{
"epoch": 0.07815351583467525,
"grad_norm": 0.16085247695446014,
"learning_rate": 0.00019729521834114952,
"loss": 1.1483,
"step": 182
},
{
"epoch": 0.0785829307568438,
"grad_norm": 0.1786722093820572,
"learning_rate": 0.00019726382059589986,
"loss": 1.0986,
"step": 183
},
{
"epoch": 0.07901234567901234,
"grad_norm": 0.1842159777879715,
"learning_rate": 0.0001972322441919621,
"loss": 1.1254,
"step": 184
},
{
"epoch": 0.0794417606011809,
"grad_norm": 0.1684993952512741,
"learning_rate": 0.00019720048918733723,
"loss": 0.9512,
"step": 185
},
{
"epoch": 0.07987117552334944,
"grad_norm": 0.18039727210998535,
"learning_rate": 0.0001971685556403543,
"loss": 1.2037,
"step": 186
},
{
"epoch": 0.08030059044551799,
"grad_norm": 0.16253158450126648,
"learning_rate": 0.0001971364436096703,
"loss": 1.1042,
"step": 187
},
{
"epoch": 0.08073000536768653,
"grad_norm": 0.17348501086235046,
"learning_rate": 0.00019710415315427022,
"loss": 1.0384,
"step": 188
},
{
"epoch": 0.08115942028985507,
"grad_norm": 0.19116544723510742,
"learning_rate": 0.00019707168433346655,
"loss": 1.1186,
"step": 189
},
{
"epoch": 0.08158883521202362,
"grad_norm": 0.17228098213672638,
"learning_rate": 0.00019703903720689954,
"loss": 1.0421,
"step": 190
},
{
"epoch": 0.08201825013419216,
"grad_norm": 0.15176887810230255,
"learning_rate": 0.00019700621183453695,
"loss": 1.1865,
"step": 191
},
{
"epoch": 0.08244766505636071,
"grad_norm": 0.16815736889839172,
"learning_rate": 0.00019697320827667398,
"loss": 1.3136,
"step": 192
},
{
"epoch": 0.08287707997852925,
"grad_norm": 0.18581236898899078,
"learning_rate": 0.00019694002659393305,
"loss": 1.2243,
"step": 193
},
{
"epoch": 0.0833064949006978,
"grad_norm": 0.19139103591442108,
"learning_rate": 0.00019690666684726382,
"loss": 1.1882,
"step": 194
},
{
"epoch": 0.08373590982286634,
"grad_norm": 0.15718159079551697,
"learning_rate": 0.00019687312909794305,
"loss": 1.0329,
"step": 195
},
{
"epoch": 0.0841653247450349,
"grad_norm": 0.1583366096019745,
"learning_rate": 0.00019683941340757434,
"loss": 0.9521,
"step": 196
},
{
"epoch": 0.08459473966720343,
"grad_norm": 0.17986145615577698,
"learning_rate": 0.00019680551983808836,
"loss": 1.3057,
"step": 197
},
{
"epoch": 0.08502415458937199,
"grad_norm": 0.14667508006095886,
"learning_rate": 0.00019677144845174226,
"loss": 1.204,
"step": 198
},
{
"epoch": 0.08545356951154053,
"grad_norm": 0.16105642914772034,
"learning_rate": 0.00019673719931112004,
"loss": 1.2272,
"step": 199
},
{
"epoch": 0.08588298443370908,
"grad_norm": 0.17806339263916016,
"learning_rate": 0.00019670277247913205,
"loss": 0.9928,
"step": 200
},
{
"epoch": 0.08631239935587762,
"grad_norm": 0.15053167939186096,
"learning_rate": 0.0001966681680190151,
"loss": 0.8566,
"step": 201
},
{
"epoch": 0.08674181427804616,
"grad_norm": 0.13740143179893494,
"learning_rate": 0.00019663338599433227,
"loss": 0.7979,
"step": 202
},
{
"epoch": 0.08717122920021471,
"grad_norm": 0.17480605840682983,
"learning_rate": 0.00019659842646897282,
"loss": 0.9794,
"step": 203
},
{
"epoch": 0.08760064412238325,
"grad_norm": 0.192199245095253,
"learning_rate": 0.00019656328950715194,
"loss": 1.2525,
"step": 204
},
{
"epoch": 0.0880300590445518,
"grad_norm": 0.18914753198623657,
"learning_rate": 0.00019652797517341096,
"loss": 1.2156,
"step": 205
},
{
"epoch": 0.08845947396672034,
"grad_norm": 0.19193218648433685,
"learning_rate": 0.00019649248353261674,
"loss": 1.385,
"step": 206
},
{
"epoch": 0.08888888888888889,
"grad_norm": 0.19617465138435364,
"learning_rate": 0.00019645681464996206,
"loss": 1.2991,
"step": 207
},
{
"epoch": 0.08931830381105743,
"grad_norm": 0.16679921746253967,
"learning_rate": 0.00019642096859096516,
"loss": 1.0183,
"step": 208
},
{
"epoch": 0.08974771873322598,
"grad_norm": 0.1839999556541443,
"learning_rate": 0.00019638494542146973,
"loss": 1.2098,
"step": 209
},
{
"epoch": 0.09017713365539452,
"grad_norm": 0.17847347259521484,
"learning_rate": 0.0001963487452076448,
"loss": 1.1791,
"step": 210
},
{
"epoch": 0.09060654857756308,
"grad_norm": 0.1537715196609497,
"learning_rate": 0.00019631236801598458,
"loss": 1.307,
"step": 211
},
{
"epoch": 0.09103596349973161,
"grad_norm": 0.16377565264701843,
"learning_rate": 0.0001962758139133084,
"loss": 0.9766,
"step": 212
},
{
"epoch": 0.09146537842190017,
"grad_norm": 0.1567695438861847,
"learning_rate": 0.0001962390829667605,
"loss": 1.1082,
"step": 213
},
{
"epoch": 0.0918947933440687,
"grad_norm": 0.14198783040046692,
"learning_rate": 0.00019620217524381005,
"loss": 1.0773,
"step": 214
},
{
"epoch": 0.09232420826623725,
"grad_norm": 0.16413229703903198,
"learning_rate": 0.0001961650908122508,
"loss": 1.1947,
"step": 215
},
{
"epoch": 0.0927536231884058,
"grad_norm": 0.15348884463310242,
"learning_rate": 0.00019612782974020118,
"loss": 0.7186,
"step": 216
},
{
"epoch": 0.09318303811057434,
"grad_norm": 0.1820840686559677,
"learning_rate": 0.00019609039209610404,
"loss": 1.0661,
"step": 217
},
{
"epoch": 0.09361245303274289,
"grad_norm": 0.1551450490951538,
"learning_rate": 0.00019605277794872657,
"loss": 0.8472,
"step": 218
},
{
"epoch": 0.09404186795491143,
"grad_norm": 0.19438843429088593,
"learning_rate": 0.00019601498736716017,
"loss": 1.2454,
"step": 219
},
{
"epoch": 0.09447128287707998,
"grad_norm": 0.16173028945922852,
"learning_rate": 0.00019597702042082037,
"loss": 0.8713,
"step": 220
},
{
"epoch": 0.09490069779924852,
"grad_norm": 0.18918974697589874,
"learning_rate": 0.00019593887717944659,
"loss": 1.2559,
"step": 221
},
{
"epoch": 0.09533011272141707,
"grad_norm": 0.1581108570098877,
"learning_rate": 0.00019590055771310212,
"loss": 0.7194,
"step": 222
},
{
"epoch": 0.09575952764358561,
"grad_norm": 0.13984139263629913,
"learning_rate": 0.0001958620620921739,
"loss": 0.7027,
"step": 223
},
{
"epoch": 0.09618894256575417,
"grad_norm": 0.1842825710773468,
"learning_rate": 0.00019582339038737247,
"loss": 1.2838,
"step": 224
},
{
"epoch": 0.0966183574879227,
"grad_norm": 0.16079159080982208,
"learning_rate": 0.00019578454266973183,
"loss": 1.0553,
"step": 225
},
{
"epoch": 0.09704777241009126,
"grad_norm": 0.16030196845531464,
"learning_rate": 0.00019574551901060922,
"loss": 1.0496,
"step": 226
},
{
"epoch": 0.0974771873322598,
"grad_norm": 0.16699260473251343,
"learning_rate": 0.0001957063194816852,
"loss": 1.3505,
"step": 227
},
{
"epoch": 0.09790660225442833,
"grad_norm": 0.1571999043226242,
"learning_rate": 0.00019566694415496316,
"loss": 1.2156,
"step": 228
},
{
"epoch": 0.09833601717659689,
"grad_norm": 0.15415778756141663,
"learning_rate": 0.0001956273931027696,
"loss": 1.0225,
"step": 229
},
{
"epoch": 0.09876543209876543,
"grad_norm": 0.16700062155723572,
"learning_rate": 0.0001955876663977537,
"loss": 1.0049,
"step": 230
},
{
"epoch": 0.09919484702093398,
"grad_norm": 0.16353946924209595,
"learning_rate": 0.00019554776411288732,
"loss": 1.2387,
"step": 231
},
{
"epoch": 0.09962426194310252,
"grad_norm": 0.16290371119976044,
"learning_rate": 0.00019550768632146484,
"loss": 1.044,
"step": 232
},
{
"epoch": 0.10005367686527107,
"grad_norm": 0.15819229185581207,
"learning_rate": 0.00019546743309710297,
"loss": 1.13,
"step": 233
},
{
"epoch": 0.10048309178743961,
"grad_norm": 0.18955904245376587,
"learning_rate": 0.00019542700451374067,
"loss": 1.1663,
"step": 234
},
{
"epoch": 0.10091250670960816,
"grad_norm": 0.14698690176010132,
"learning_rate": 0.0001953864006456391,
"loss": 1.1295,
"step": 235
},
{
"epoch": 0.1013419216317767,
"grad_norm": 0.1734054684638977,
"learning_rate": 0.00019534562156738129,
"loss": 0.8559,
"step": 236
},
{
"epoch": 0.10177133655394525,
"grad_norm": 0.16847679018974304,
"learning_rate": 0.00019530466735387213,
"loss": 1.0313,
"step": 237
},
{
"epoch": 0.1022007514761138,
"grad_norm": 0.1666480153799057,
"learning_rate": 0.00019526353808033825,
"loss": 1.0825,
"step": 238
},
{
"epoch": 0.10263016639828235,
"grad_norm": 0.14294366538524628,
"learning_rate": 0.0001952222338223278,
"loss": 0.9846,
"step": 239
},
{
"epoch": 0.10305958132045089,
"grad_norm": 0.1204523891210556,
"learning_rate": 0.00019518075465571028,
"loss": 0.9862,
"step": 240
},
{
"epoch": 0.10348899624261942,
"grad_norm": 0.14956791698932648,
"learning_rate": 0.00019513910065667664,
"loss": 1.0975,
"step": 241
},
{
"epoch": 0.10391841116478798,
"grad_norm": 0.16827872395515442,
"learning_rate": 0.00019509727190173884,
"loss": 1.3116,
"step": 242
},
{
"epoch": 0.10434782608695652,
"grad_norm": 0.16410714387893677,
"learning_rate": 0.00019505526846772984,
"loss": 0.9231,
"step": 243
},
{
"epoch": 0.10477724100912507,
"grad_norm": 0.19388873875141144,
"learning_rate": 0.00019501309043180352,
"loss": 1.0604,
"step": 244
},
{
"epoch": 0.10520665593129361,
"grad_norm": 0.17403458058834076,
"learning_rate": 0.00019497073787143446,
"loss": 1.0757,
"step": 245
},
{
"epoch": 0.10563607085346216,
"grad_norm": 0.1442354917526245,
"learning_rate": 0.0001949282108644178,
"loss": 0.6964,
"step": 246
},
{
"epoch": 0.1060654857756307,
"grad_norm": 0.1477101892232895,
"learning_rate": 0.0001948855094888691,
"loss": 1.2497,
"step": 247
},
{
"epoch": 0.10649490069779925,
"grad_norm": 0.1691221445798874,
"learning_rate": 0.0001948426338232242,
"loss": 1.1567,
"step": 248
},
{
"epoch": 0.10692431561996779,
"grad_norm": 0.16259369254112244,
"learning_rate": 0.00019479958394623913,
"loss": 0.9878,
"step": 249
},
{
"epoch": 0.10735373054213634,
"grad_norm": 0.17605777084827423,
"learning_rate": 0.00019475635993698994,
"loss": 1.0964,
"step": 250
},
{
"epoch": 0.10778314546430488,
"grad_norm": 0.17357371747493744,
"learning_rate": 0.0001947129618748724,
"loss": 1.0984,
"step": 251
},
{
"epoch": 0.10821256038647344,
"grad_norm": 0.16604338586330414,
"learning_rate": 0.00019466938983960218,
"loss": 1.2584,
"step": 252
},
{
"epoch": 0.10864197530864197,
"grad_norm": 0.15120381116867065,
"learning_rate": 0.00019462564391121436,
"loss": 0.7606,
"step": 253
},
{
"epoch": 0.10907139023081051,
"grad_norm": 0.18790557980537415,
"learning_rate": 0.00019458172417006347,
"loss": 1.1506,
"step": 254
},
{
"epoch": 0.10950080515297907,
"grad_norm": 0.17807306349277496,
"learning_rate": 0.00019453763069682335,
"loss": 1.1895,
"step": 255
},
{
"epoch": 0.1099302200751476,
"grad_norm": 0.18234007060527802,
"learning_rate": 0.00019449336357248696,
"loss": 1.1112,
"step": 256
},
{
"epoch": 0.11035963499731616,
"grad_norm": 0.1744687557220459,
"learning_rate": 0.00019444892287836613,
"loss": 1.042,
"step": 257
},
{
"epoch": 0.1107890499194847,
"grad_norm": 0.15671797096729279,
"learning_rate": 0.00019440430869609166,
"loss": 1.1334,
"step": 258
},
{
"epoch": 0.11121846484165325,
"grad_norm": 0.17378878593444824,
"learning_rate": 0.00019435952110761289,
"loss": 1.1142,
"step": 259
},
{
"epoch": 0.11164787976382179,
"grad_norm": 0.17875009775161743,
"learning_rate": 0.00019431456019519775,
"loss": 1.0393,
"step": 260
},
{
"epoch": 0.11207729468599034,
"grad_norm": 0.15020230412483215,
"learning_rate": 0.00019426942604143253,
"loss": 1.2424,
"step": 261
},
{
"epoch": 0.11250670960815888,
"grad_norm": 0.17647111415863037,
"learning_rate": 0.00019422411872922171,
"loss": 1.1036,
"step": 262
},
{
"epoch": 0.11293612453032743,
"grad_norm": 0.1858074814081192,
"learning_rate": 0.00019417863834178794,
"loss": 1.1087,
"step": 263
},
{
"epoch": 0.11336553945249597,
"grad_norm": 0.18380528688430786,
"learning_rate": 0.0001941329849626716,
"loss": 1.1344,
"step": 264
},
{
"epoch": 0.11379495437466453,
"grad_norm": 0.1671726554632187,
"learning_rate": 0.000194087158675731,
"loss": 0.8795,
"step": 265
},
{
"epoch": 0.11422436929683306,
"grad_norm": 0.17651990056037903,
"learning_rate": 0.00019404115956514194,
"loss": 1.1036,
"step": 266
},
{
"epoch": 0.11465378421900162,
"grad_norm": 0.17102883756160736,
"learning_rate": 0.00019399498771539774,
"loss": 1.0949,
"step": 267
},
{
"epoch": 0.11508319914117016,
"grad_norm": 0.18060144782066345,
"learning_rate": 0.000193948643211309,
"loss": 1.1315,
"step": 268
},
{
"epoch": 0.1155126140633387,
"grad_norm": 0.15454426407814026,
"learning_rate": 0.0001939021261380034,
"loss": 1.057,
"step": 269
},
{
"epoch": 0.11594202898550725,
"grad_norm": 0.14077837765216827,
"learning_rate": 0.0001938554365809257,
"loss": 0.8064,
"step": 270
},
{
"epoch": 0.11637144390767579,
"grad_norm": 0.17142775654792786,
"learning_rate": 0.00019380857462583743,
"loss": 1.156,
"step": 271
},
{
"epoch": 0.11680085882984434,
"grad_norm": 0.1670989692211151,
"learning_rate": 0.0001937615403588168,
"loss": 0.9589,
"step": 272
},
{
"epoch": 0.11723027375201288,
"grad_norm": 0.19140732288360596,
"learning_rate": 0.00019371433386625856,
"loss": 0.9871,
"step": 273
},
{
"epoch": 0.11765968867418143,
"grad_norm": 0.18820329010486603,
"learning_rate": 0.00019366695523487368,
"loss": 1.0285,
"step": 274
},
{
"epoch": 0.11808910359634997,
"grad_norm": 0.17042939364910126,
"learning_rate": 0.00019361940455168956,
"loss": 1.0943,
"step": 275
},
{
"epoch": 0.11851851851851852,
"grad_norm": 0.16640831530094147,
"learning_rate": 0.00019357168190404936,
"loss": 1.1504,
"step": 276
},
{
"epoch": 0.11894793344068706,
"grad_norm": 0.16726379096508026,
"learning_rate": 0.00019352378737961235,
"loss": 1.3996,
"step": 277
},
{
"epoch": 0.11937734836285561,
"grad_norm": 0.1757480800151825,
"learning_rate": 0.00019347572106635335,
"loss": 1.1903,
"step": 278
},
{
"epoch": 0.11980676328502415,
"grad_norm": 0.1531904935836792,
"learning_rate": 0.00019342748305256285,
"loss": 1.0287,
"step": 279
},
{
"epoch": 0.1202361782071927,
"grad_norm": 0.19600524008274078,
"learning_rate": 0.0001933790734268466,
"loss": 1.1248,
"step": 280
},
{
"epoch": 0.12066559312936125,
"grad_norm": 0.1654789000749588,
"learning_rate": 0.0001933304922781257,
"loss": 1.2959,
"step": 281
},
{
"epoch": 0.12109500805152978,
"grad_norm": 0.16465742886066437,
"learning_rate": 0.0001932817396956362,
"loss": 0.9625,
"step": 282
},
{
"epoch": 0.12152442297369834,
"grad_norm": 0.16723015904426575,
"learning_rate": 0.00019323281576892916,
"loss": 1.034,
"step": 283
},
{
"epoch": 0.12195383789586688,
"grad_norm": 0.15436948835849762,
"learning_rate": 0.00019318372058787025,
"loss": 1.085,
"step": 284
},
{
"epoch": 0.12238325281803543,
"grad_norm": 0.17568649351596832,
"learning_rate": 0.00019313445424263978,
"loss": 1.1922,
"step": 285
},
{
"epoch": 0.12281266774020397,
"grad_norm": 0.15134669840335846,
"learning_rate": 0.0001930850168237325,
"loss": 1.1783,
"step": 286
},
{
"epoch": 0.12324208266237252,
"grad_norm": 0.19426967203617096,
"learning_rate": 0.00019303540842195732,
"loss": 1.2244,
"step": 287
},
{
"epoch": 0.12367149758454106,
"grad_norm": 0.17754550278186798,
"learning_rate": 0.00019298562912843724,
"loss": 0.9266,
"step": 288
},
{
"epoch": 0.12410091250670961,
"grad_norm": 0.18942666053771973,
"learning_rate": 0.00019293567903460918,
"loss": 1.0538,
"step": 289
},
{
"epoch": 0.12453032742887815,
"grad_norm": 0.14974556863307953,
"learning_rate": 0.0001928855582322238,
"loss": 0.8825,
"step": 290
},
{
"epoch": 0.1249597423510467,
"grad_norm": 0.16468919813632965,
"learning_rate": 0.0001928352668133453,
"loss": 1.2179,
"step": 291
},
{
"epoch": 0.12538915727321526,
"grad_norm": 0.18979178369045258,
"learning_rate": 0.00019278480487035126,
"loss": 1.0274,
"step": 292
},
{
"epoch": 0.12581857219538378,
"grad_norm": 0.1661735624074936,
"learning_rate": 0.00019273417249593256,
"loss": 1.0588,
"step": 293
},
{
"epoch": 0.12624798711755233,
"grad_norm": 0.18528646230697632,
"learning_rate": 0.00019268336978309303,
"loss": 1.1263,
"step": 294
},
{
"epoch": 0.1266774020397209,
"grad_norm": 0.16602130234241486,
"learning_rate": 0.00019263239682514952,
"loss": 0.7833,
"step": 295
},
{
"epoch": 0.1271068169618894,
"grad_norm": 0.18867306411266327,
"learning_rate": 0.00019258125371573144,
"loss": 1.1295,
"step": 296
},
{
"epoch": 0.12753623188405797,
"grad_norm": 0.1883901059627533,
"learning_rate": 0.00019252994054878088,
"loss": 1.0669,
"step": 297
},
{
"epoch": 0.12796564680622652,
"grad_norm": 0.1632394641637802,
"learning_rate": 0.00019247845741855222,
"loss": 1.0846,
"step": 298
},
{
"epoch": 0.12839506172839507,
"grad_norm": 0.18154770135879517,
"learning_rate": 0.00019242680441961205,
"loss": 1.1138,
"step": 299
},
{
"epoch": 0.1288244766505636,
"grad_norm": 0.16086812317371368,
"learning_rate": 0.00019237498164683897,
"loss": 0.9613,
"step": 300
},
{
"epoch": 0.12925389157273215,
"grad_norm": 0.19330988824367523,
"learning_rate": 0.0001923229891954235,
"loss": 0.7739,
"step": 301
},
{
"epoch": 0.1296833064949007,
"grad_norm": 0.1668129414319992,
"learning_rate": 0.00019227082716086777,
"loss": 1.0718,
"step": 302
},
{
"epoch": 0.13011272141706925,
"grad_norm": 0.1654328554868698,
"learning_rate": 0.00019221849563898536,
"loss": 0.9797,
"step": 303
},
{
"epoch": 0.13054213633923778,
"grad_norm": 0.1601610779762268,
"learning_rate": 0.00019216599472590134,
"loss": 1.0867,
"step": 304
},
{
"epoch": 0.13097155126140633,
"grad_norm": 0.16391853988170624,
"learning_rate": 0.0001921133245180517,
"loss": 0.8036,
"step": 305
},
{
"epoch": 0.13140096618357489,
"grad_norm": 0.18757081031799316,
"learning_rate": 0.0001920604851121836,
"loss": 1.3174,
"step": 306
},
{
"epoch": 0.13183038110574344,
"grad_norm": 0.18147063255310059,
"learning_rate": 0.00019200747660535488,
"loss": 1.1763,
"step": 307
},
{
"epoch": 0.13225979602791196,
"grad_norm": 0.16341471672058105,
"learning_rate": 0.000191954299094934,
"loss": 1.0075,
"step": 308
},
{
"epoch": 0.13268921095008052,
"grad_norm": 0.183994323015213,
"learning_rate": 0.00019190095267859988,
"loss": 1.144,
"step": 309
},
{
"epoch": 0.13311862587224907,
"grad_norm": 0.1656254529953003,
"learning_rate": 0.0001918474374543417,
"loss": 1.0775,
"step": 310
},
{
"epoch": 0.1335480407944176,
"grad_norm": 0.15094861388206482,
"learning_rate": 0.0001917937535204587,
"loss": 0.6977,
"step": 311
},
{
"epoch": 0.13397745571658615,
"grad_norm": 0.1565057784318924,
"learning_rate": 0.00019173990097556002,
"loss": 1.1004,
"step": 312
},
{
"epoch": 0.1344068706387547,
"grad_norm": 0.18779979646205902,
"learning_rate": 0.00019168587991856448,
"loss": 1.257,
"step": 313
},
{
"epoch": 0.13483628556092325,
"grad_norm": 0.15053409337997437,
"learning_rate": 0.0001916316904487005,
"loss": 0.8913,
"step": 314
},
{
"epoch": 0.13526570048309178,
"grad_norm": 0.16636574268341064,
"learning_rate": 0.00019157733266550575,
"loss": 0.8063,
"step": 315
},
{
"epoch": 0.13569511540526033,
"grad_norm": 0.19238772988319397,
"learning_rate": 0.00019152280666882718,
"loss": 1.2016,
"step": 316
},
{
"epoch": 0.13612453032742888,
"grad_norm": 0.17583003640174866,
"learning_rate": 0.00019146811255882064,
"loss": 1.0703,
"step": 317
},
{
"epoch": 0.13655394524959744,
"grad_norm": 0.1871437430381775,
"learning_rate": 0.0001914132504359508,
"loss": 1.2822,
"step": 318
},
{
"epoch": 0.13698336017176596,
"grad_norm": 0.15960069000720978,
"learning_rate": 0.00019135822040099095,
"loss": 0.9356,
"step": 319
},
{
"epoch": 0.1374127750939345,
"grad_norm": 0.17675542831420898,
"learning_rate": 0.0001913030225550228,
"loss": 1.1216,
"step": 320
},
{
"epoch": 0.13784219001610307,
"grad_norm": 0.18341028690338135,
"learning_rate": 0.00019124765699943632,
"loss": 1.1436,
"step": 321
},
{
"epoch": 0.1382716049382716,
"grad_norm": 0.1786155104637146,
"learning_rate": 0.00019119212383592954,
"loss": 1.1862,
"step": 322
},
{
"epoch": 0.13870101986044014,
"grad_norm": 0.15550769865512848,
"learning_rate": 0.0001911364231665083,
"loss": 1.107,
"step": 323
},
{
"epoch": 0.1391304347826087,
"grad_norm": 0.16558977961540222,
"learning_rate": 0.00019108055509348623,
"loss": 1.1584,
"step": 324
},
{
"epoch": 0.13955984970477725,
"grad_norm": 0.15727491676807404,
"learning_rate": 0.0001910245197194843,
"loss": 1.1332,
"step": 325
},
{
"epoch": 0.13998926462694578,
"grad_norm": 0.16455912590026855,
"learning_rate": 0.00019096831714743098,
"loss": 0.8548,
"step": 326
},
{
"epoch": 0.14041867954911433,
"grad_norm": 0.16871945559978485,
"learning_rate": 0.00019091194748056172,
"loss": 0.9473,
"step": 327
},
{
"epoch": 0.14084809447128288,
"grad_norm": 0.18946193158626556,
"learning_rate": 0.0001908554108224189,
"loss": 1.1623,
"step": 328
},
{
"epoch": 0.14127750939345143,
"grad_norm": 0.18290971219539642,
"learning_rate": 0.0001907987072768517,
"loss": 1.0757,
"step": 329
},
{
"epoch": 0.14170692431561996,
"grad_norm": 0.17551882565021515,
"learning_rate": 0.0001907418369480158,
"loss": 1.0275,
"step": 330
},
{
"epoch": 0.1421363392377885,
"grad_norm": 0.1738695502281189,
"learning_rate": 0.00019068479994037327,
"loss": 1.0504,
"step": 331
},
{
"epoch": 0.14256575415995706,
"grad_norm": 0.18197093904018402,
"learning_rate": 0.00019062759635869232,
"loss": 1.2005,
"step": 332
},
{
"epoch": 0.14299516908212562,
"grad_norm": 0.16323554515838623,
"learning_rate": 0.00019057022630804716,
"loss": 1.1509,
"step": 333
},
{
"epoch": 0.14342458400429414,
"grad_norm": 0.1790863275527954,
"learning_rate": 0.00019051268989381771,
"loss": 0.9633,
"step": 334
},
{
"epoch": 0.1438539989264627,
"grad_norm": 0.17193441092967987,
"learning_rate": 0.00019045498722168955,
"loss": 1.0501,
"step": 335
},
{
"epoch": 0.14428341384863125,
"grad_norm": 0.18548649549484253,
"learning_rate": 0.0001903971183976536,
"loss": 1.2305,
"step": 336
},
{
"epoch": 0.14471282877079977,
"grad_norm": 0.16440680623054504,
"learning_rate": 0.00019033908352800608,
"loss": 1.1256,
"step": 337
},
{
"epoch": 0.14514224369296833,
"grad_norm": 0.18403667211532593,
"learning_rate": 0.00019028088271934798,
"loss": 1.2889,
"step": 338
},
{
"epoch": 0.14557165861513688,
"grad_norm": 0.16041843593120575,
"learning_rate": 0.0001902225160785853,
"loss": 1.0806,
"step": 339
},
{
"epoch": 0.14600107353730543,
"grad_norm": 0.15153127908706665,
"learning_rate": 0.00019016398371292864,
"loss": 0.7621,
"step": 340
},
{
"epoch": 0.14643048845947396,
"grad_norm": 0.14983665943145752,
"learning_rate": 0.0001901052857298929,
"loss": 0.9134,
"step": 341
},
{
"epoch": 0.1468599033816425,
"grad_norm": 0.17730404436588287,
"learning_rate": 0.00019004642223729727,
"loss": 1.2925,
"step": 342
},
{
"epoch": 0.14728931830381106,
"grad_norm": 0.1685967743396759,
"learning_rate": 0.00018998739334326494,
"loss": 1.1359,
"step": 343
},
{
"epoch": 0.14771873322597961,
"grad_norm": 0.15899759531021118,
"learning_rate": 0.00018992819915622291,
"loss": 1.0883,
"step": 344
},
{
"epoch": 0.14814814814814814,
"grad_norm": 0.1822543740272522,
"learning_rate": 0.00018986883978490182,
"loss": 1.1186,
"step": 345
},
{
"epoch": 0.1485775630703167,
"grad_norm": 0.17298339307308197,
"learning_rate": 0.00018980931533833567,
"loss": 0.8858,
"step": 346
},
{
"epoch": 0.14900697799248525,
"grad_norm": 0.17505380511283875,
"learning_rate": 0.00018974962592586178,
"loss": 1.1411,
"step": 347
},
{
"epoch": 0.14943639291465377,
"grad_norm": 0.1915581226348877,
"learning_rate": 0.00018968977165712036,
"loss": 1.1323,
"step": 348
},
{
"epoch": 0.14986580783682232,
"grad_norm": 0.17531049251556396,
"learning_rate": 0.00018962975264205455,
"loss": 0.886,
"step": 349
},
{
"epoch": 0.15029522275899088,
"grad_norm": 0.1736138015985489,
"learning_rate": 0.00018956956899091003,
"loss": 1.1875,
"step": 350
},
{
"epoch": 0.15072463768115943,
"grad_norm": 0.16522866487503052,
"learning_rate": 0.00018950922081423493,
"loss": 0.9511,
"step": 351
},
{
"epoch": 0.15115405260332795,
"grad_norm": 0.15171727538108826,
"learning_rate": 0.00018944870822287956,
"loss": 1.1202,
"step": 352
},
{
"epoch": 0.1515834675254965,
"grad_norm": 0.18102163076400757,
"learning_rate": 0.00018938803132799626,
"loss": 1.2382,
"step": 353
},
{
"epoch": 0.15201288244766506,
"grad_norm": 0.1564633846282959,
"learning_rate": 0.0001893271902410392,
"loss": 0.9987,
"step": 354
},
{
"epoch": 0.1524422973698336,
"grad_norm": 0.17558157444000244,
"learning_rate": 0.00018926618507376399,
"loss": 1.274,
"step": 355
},
{
"epoch": 0.15287171229200214,
"grad_norm": 0.1743505746126175,
"learning_rate": 0.00018920501593822789,
"loss": 0.8533,
"step": 356
},
{
"epoch": 0.1533011272141707,
"grad_norm": 0.19371235370635986,
"learning_rate": 0.0001891436829467891,
"loss": 1.2622,
"step": 357
},
{
"epoch": 0.15373054213633924,
"grad_norm": 0.16197408735752106,
"learning_rate": 0.00018908218621210688,
"loss": 0.7451,
"step": 358
},
{
"epoch": 0.1541599570585078,
"grad_norm": 0.2163006216287613,
"learning_rate": 0.00018902052584714136,
"loss": 1.2091,
"step": 359
},
{
"epoch": 0.15458937198067632,
"grad_norm": 0.1739387959241867,
"learning_rate": 0.00018895870196515314,
"loss": 0.9003,
"step": 360
},
{
"epoch": 0.15501878690284487,
"grad_norm": 0.16117063164710999,
"learning_rate": 0.00018889671467970317,
"loss": 1.0175,
"step": 361
},
{
"epoch": 0.15544820182501343,
"grad_norm": 0.16463720798492432,
"learning_rate": 0.0001888345641046525,
"loss": 1.2892,
"step": 362
},
{
"epoch": 0.15587761674718195,
"grad_norm": 0.19594573974609375,
"learning_rate": 0.0001887722503541623,
"loss": 1.1554,
"step": 363
},
{
"epoch": 0.1563070316693505,
"grad_norm": 0.15671700239181519,
"learning_rate": 0.00018870977354269326,
"loss": 0.9604,
"step": 364
},
{
"epoch": 0.15673644659151906,
"grad_norm": 0.16734743118286133,
"learning_rate": 0.00018864713378500574,
"loss": 1.0694,
"step": 365
},
{
"epoch": 0.1571658615136876,
"grad_norm": 0.13222168385982513,
"learning_rate": 0.0001885843311961593,
"loss": 0.6987,
"step": 366
},
{
"epoch": 0.15759527643585614,
"grad_norm": 0.17755256593227386,
"learning_rate": 0.00018852136589151268,
"loss": 1.0576,
"step": 367
},
{
"epoch": 0.1580246913580247,
"grad_norm": 0.17115449905395508,
"learning_rate": 0.00018845823798672347,
"loss": 1.2332,
"step": 368
},
{
"epoch": 0.15845410628019324,
"grad_norm": 0.17211580276489258,
"learning_rate": 0.00018839494759774787,
"loss": 1.0443,
"step": 369
},
{
"epoch": 0.1588835212023618,
"grad_norm": 0.16635645925998688,
"learning_rate": 0.00018833149484084066,
"loss": 1.3116,
"step": 370
},
{
"epoch": 0.15931293612453032,
"grad_norm": 0.13584615290164948,
"learning_rate": 0.00018826787983255473,
"loss": 0.816,
"step": 371
},
{
"epoch": 0.15974235104669887,
"grad_norm": 0.15319599211215973,
"learning_rate": 0.00018820410268974115,
"loss": 1.3403,
"step": 372
},
{
"epoch": 0.16017176596886742,
"grad_norm": 0.1778756082057953,
"learning_rate": 0.00018814016352954873,
"loss": 0.9581,
"step": 373
},
{
"epoch": 0.16060118089103598,
"grad_norm": 0.17817425727844238,
"learning_rate": 0.00018807606246942383,
"loss": 1.0942,
"step": 374
},
{
"epoch": 0.1610305958132045,
"grad_norm": 0.19471527636051178,
"learning_rate": 0.00018801179962711019,
"loss": 1.1226,
"step": 375
},
{
"epoch": 0.16146001073537306,
"grad_norm": 0.1694117933511734,
"learning_rate": 0.0001879473751206489,
"loss": 1.1468,
"step": 376
},
{
"epoch": 0.1618894256575416,
"grad_norm": 0.18657226860523224,
"learning_rate": 0.0001878827890683778,
"loss": 1.3482,
"step": 377
},
{
"epoch": 0.16231884057971013,
"grad_norm": 0.17072419822216034,
"learning_rate": 0.0001878180415889316,
"loss": 1.1668,
"step": 378
},
{
"epoch": 0.16274825550187869,
"grad_norm": 0.15484756231307983,
"learning_rate": 0.00018775313280124142,
"loss": 1.1584,
"step": 379
},
{
"epoch": 0.16317767042404724,
"grad_norm": 0.1646227240562439,
"learning_rate": 0.00018768806282453467,
"loss": 1.1282,
"step": 380
},
{
"epoch": 0.1636070853462158,
"grad_norm": 0.18709446489810944,
"learning_rate": 0.000187622831778335,
"loss": 1.1701,
"step": 381
},
{
"epoch": 0.16403650026838432,
"grad_norm": 0.1889953762292862,
"learning_rate": 0.0001875574397824618,
"loss": 1.1496,
"step": 382
},
{
"epoch": 0.16446591519055287,
"grad_norm": 0.16929011046886444,
"learning_rate": 0.00018749188695703006,
"loss": 0.8927,
"step": 383
},
{
"epoch": 0.16489533011272142,
"grad_norm": 0.16205012798309326,
"learning_rate": 0.0001874261734224503,
"loss": 1.135,
"step": 384
},
{
"epoch": 0.16532474503488997,
"grad_norm": 0.16252653300762177,
"learning_rate": 0.00018736029929942812,
"loss": 0.9563,
"step": 385
},
{
"epoch": 0.1657541599570585,
"grad_norm": 0.18884459137916565,
"learning_rate": 0.0001872942647089642,
"loss": 0.8866,
"step": 386
},
{
"epoch": 0.16618357487922705,
"grad_norm": 0.1668461114168167,
"learning_rate": 0.00018722806977235391,
"loss": 1.0448,
"step": 387
},
{
"epoch": 0.1666129898013956,
"grad_norm": 0.17943502962589264,
"learning_rate": 0.0001871617146111872,
"loss": 1.1933,
"step": 388
},
{
"epoch": 0.16704240472356413,
"grad_norm": 0.16244441270828247,
"learning_rate": 0.0001870951993473483,
"loss": 1.0513,
"step": 389
},
{
"epoch": 0.16747181964573268,
"grad_norm": 0.18279998004436493,
"learning_rate": 0.00018702852410301554,
"loss": 1.3546,
"step": 390
},
{
"epoch": 0.16790123456790124,
"grad_norm": 0.174489825963974,
"learning_rate": 0.00018696168900066105,
"loss": 1.1154,
"step": 391
},
{
"epoch": 0.1683306494900698,
"grad_norm": 0.19099275767803192,
"learning_rate": 0.00018689469416305067,
"loss": 1.3016,
"step": 392
},
{
"epoch": 0.16876006441223831,
"grad_norm": 0.1332124024629593,
"learning_rate": 0.00018682753971324358,
"loss": 0.8249,
"step": 393
},
{
"epoch": 0.16918947933440687,
"grad_norm": 0.17980900406837463,
"learning_rate": 0.00018676022577459225,
"loss": 1.2107,
"step": 394
},
{
"epoch": 0.16961889425657542,
"grad_norm": 0.1861777901649475,
"learning_rate": 0.000186692752470742,
"loss": 1.1602,
"step": 395
},
{
"epoch": 0.17004830917874397,
"grad_norm": 0.1574292778968811,
"learning_rate": 0.0001866251199256309,
"loss": 0.758,
"step": 396
},
{
"epoch": 0.1704777241009125,
"grad_norm": 0.17709052562713623,
"learning_rate": 0.00018655732826348956,
"loss": 0.965,
"step": 397
},
{
"epoch": 0.17090713902308105,
"grad_norm": 0.18563103675842285,
"learning_rate": 0.00018648937760884084,
"loss": 1.14,
"step": 398
},
{
"epoch": 0.1713365539452496,
"grad_norm": 0.19391857087612152,
"learning_rate": 0.00018642126808649968,
"loss": 0.8621,
"step": 399
},
{
"epoch": 0.17176596886741816,
"grad_norm": 0.13754752278327942,
"learning_rate": 0.00018635299982157274,
"loss": 0.8559,
"step": 400
},
{
"epoch": 0.17219538378958668,
"grad_norm": 0.17602375149726868,
"learning_rate": 0.0001862845729394584,
"loss": 1.0353,
"step": 401
},
{
"epoch": 0.17262479871175523,
"grad_norm": 0.1522264927625656,
"learning_rate": 0.00018621598756584623,
"loss": 1.0975,
"step": 402
},
{
"epoch": 0.1730542136339238,
"grad_norm": 0.13852877914905548,
"learning_rate": 0.00018614724382671712,
"loss": 0.8971,
"step": 403
},
{
"epoch": 0.1734836285560923,
"grad_norm": 0.16204625368118286,
"learning_rate": 0.0001860783418483427,
"loss": 0.8758,
"step": 404
},
{
"epoch": 0.17391304347826086,
"grad_norm": 0.17039796710014343,
"learning_rate": 0.00018600928175728534,
"loss": 0.9861,
"step": 405
},
{
"epoch": 0.17434245840042942,
"grad_norm": 0.13860173523426056,
"learning_rate": 0.00018594006368039779,
"loss": 0.9373,
"step": 406
},
{
"epoch": 0.17477187332259797,
"grad_norm": 0.16568392515182495,
"learning_rate": 0.00018587068774482299,
"loss": 1.1601,
"step": 407
},
{
"epoch": 0.1752012882447665,
"grad_norm": 0.15709200501441956,
"learning_rate": 0.00018580115407799394,
"loss": 1.0979,
"step": 408
},
{
"epoch": 0.17563070316693505,
"grad_norm": 0.1760331690311432,
"learning_rate": 0.00018573146280763324,
"loss": 0.9153,
"step": 409
},
{
"epoch": 0.1760601180891036,
"grad_norm": 0.16068683564662933,
"learning_rate": 0.00018566161406175308,
"loss": 0.9569,
"step": 410
},
{
"epoch": 0.17648953301127215,
"grad_norm": 0.19457021355628967,
"learning_rate": 0.00018559160796865484,
"loss": 1.0332,
"step": 411
},
{
"epoch": 0.17691894793344068,
"grad_norm": 0.18924041092395782,
"learning_rate": 0.00018552144465692897,
"loss": 1.0282,
"step": 412
},
{
"epoch": 0.17734836285560923,
"grad_norm": 0.17188721895217896,
"learning_rate": 0.0001854511242554547,
"loss": 1.1342,
"step": 413
},
{
"epoch": 0.17777777777777778,
"grad_norm": 0.1609194427728653,
"learning_rate": 0.0001853806468933997,
"loss": 1.0553,
"step": 414
},
{
"epoch": 0.1782071926999463,
"grad_norm": 0.16070395708084106,
"learning_rate": 0.00018531001270022022,
"loss": 1.2386,
"step": 415
},
{
"epoch": 0.17863660762211486,
"grad_norm": 0.17878350615501404,
"learning_rate": 0.00018523922180566028,
"loss": 1.0539,
"step": 416
},
{
"epoch": 0.17906602254428342,
"grad_norm": 0.19119922816753387,
"learning_rate": 0.00018516827433975194,
"loss": 1.105,
"step": 417
},
{
"epoch": 0.17949543746645197,
"grad_norm": 0.19245749711990356,
"learning_rate": 0.00018509717043281479,
"loss": 0.9197,
"step": 418
},
{
"epoch": 0.1799248523886205,
"grad_norm": 0.1675061136484146,
"learning_rate": 0.00018502591021545573,
"loss": 1.1746,
"step": 419
},
{
"epoch": 0.18035426731078905,
"grad_norm": 0.1748921126127243,
"learning_rate": 0.00018495449381856886,
"loss": 1.2055,
"step": 420
},
{
"epoch": 0.1807836822329576,
"grad_norm": 0.1709417849779129,
"learning_rate": 0.00018488292137333514,
"loss": 1.2112,
"step": 421
},
{
"epoch": 0.18121309715512615,
"grad_norm": 0.16465428471565247,
"learning_rate": 0.0001848111930112221,
"loss": 0.9713,
"step": 422
},
{
"epoch": 0.18164251207729468,
"grad_norm": 0.14309629797935486,
"learning_rate": 0.00018473930886398377,
"loss": 0.7619,
"step": 423
},
{
"epoch": 0.18207192699946323,
"grad_norm": 0.15775880217552185,
"learning_rate": 0.0001846672690636602,
"loss": 0.9245,
"step": 424
},
{
"epoch": 0.18250134192163178,
"grad_norm": 0.18402914702892303,
"learning_rate": 0.00018459507374257755,
"loss": 1.0844,
"step": 425
},
{
"epoch": 0.18293075684380034,
"grad_norm": 0.15407468378543854,
"learning_rate": 0.00018452272303334742,
"loss": 0.9946,
"step": 426
},
{
"epoch": 0.18336017176596886,
"grad_norm": 0.19107265770435333,
"learning_rate": 0.000184450217068867,
"loss": 1.2696,
"step": 427
},
{
"epoch": 0.1837895866881374,
"grad_norm": 0.16658765077590942,
"learning_rate": 0.00018437755598231856,
"loss": 1.2813,
"step": 428
},
{
"epoch": 0.18421900161030597,
"grad_norm": 0.1602768748998642,
"learning_rate": 0.0001843047399071694,
"loss": 1.1808,
"step": 429
},
{
"epoch": 0.1846484165324745,
"grad_norm": 0.16247111558914185,
"learning_rate": 0.00018423176897717141,
"loss": 0.9986,
"step": 430
},
{
"epoch": 0.18507783145464304,
"grad_norm": 0.152525395154953,
"learning_rate": 0.00018415864332636104,
"loss": 1.0343,
"step": 431
},
{
"epoch": 0.1855072463768116,
"grad_norm": 0.17383332550525665,
"learning_rate": 0.00018408536308905878,
"loss": 0.981,
"step": 432
},
{
"epoch": 0.18593666129898015,
"grad_norm": 0.17568951845169067,
"learning_rate": 0.0001840119283998692,
"loss": 1.1869,
"step": 433
},
{
"epoch": 0.18636607622114867,
"grad_norm": 0.18272657692432404,
"learning_rate": 0.00018393833939368056,
"loss": 1.0451,
"step": 434
},
{
"epoch": 0.18679549114331723,
"grad_norm": 0.1720953732728958,
"learning_rate": 0.0001838645962056645,
"loss": 0.914,
"step": 435
},
{
"epoch": 0.18722490606548578,
"grad_norm": 0.20161637663841248,
"learning_rate": 0.00018379069897127601,
"loss": 1.189,
"step": 436
},
{
"epoch": 0.18765432098765433,
"grad_norm": 0.17120416462421417,
"learning_rate": 0.00018371664782625287,
"loss": 1.0226,
"step": 437
},
{
"epoch": 0.18808373590982286,
"grad_norm": 0.19251450896263123,
"learning_rate": 0.00018364244290661568,
"loss": 1.1604,
"step": 438
},
{
"epoch": 0.1885131508319914,
"grad_norm": 0.16157999634742737,
"learning_rate": 0.00018356808434866748,
"loss": 1.1928,
"step": 439
},
{
"epoch": 0.18894256575415996,
"grad_norm": 0.16121311485767365,
"learning_rate": 0.00018349357228899347,
"loss": 0.8092,
"step": 440
},
{
"epoch": 0.18937198067632852,
"grad_norm": 0.18607012927532196,
"learning_rate": 0.0001834189068644609,
"loss": 1.0936,
"step": 441
},
{
"epoch": 0.18980139559849704,
"grad_norm": 0.15668633580207825,
"learning_rate": 0.00018334408821221864,
"loss": 1.1534,
"step": 442
},
{
"epoch": 0.1902308105206656,
"grad_norm": 0.1856255829334259,
"learning_rate": 0.0001832691164696971,
"loss": 1.0586,
"step": 443
},
{
"epoch": 0.19066022544283415,
"grad_norm": 0.14413128793239594,
"learning_rate": 0.0001831939917746078,
"loss": 0.9904,
"step": 444
},
{
"epoch": 0.19108964036500267,
"grad_norm": 0.15035253763198853,
"learning_rate": 0.0001831187142649433,
"loss": 0.9658,
"step": 445
},
{
"epoch": 0.19151905528717122,
"grad_norm": 0.19175738096237183,
"learning_rate": 0.00018304328407897676,
"loss": 1.1088,
"step": 446
},
{
"epoch": 0.19194847020933978,
"grad_norm": 0.1885284036397934,
"learning_rate": 0.0001829677013552619,
"loss": 1.233,
"step": 447
},
{
"epoch": 0.19237788513150833,
"grad_norm": 0.16992244124412537,
"learning_rate": 0.00018289196623263253,
"loss": 0.9719,
"step": 448
},
{
"epoch": 0.19280730005367686,
"grad_norm": 0.17281030118465424,
"learning_rate": 0.00018281607885020242,
"loss": 0.9497,
"step": 449
},
{
"epoch": 0.1932367149758454,
"grad_norm": 0.18136782944202423,
"learning_rate": 0.00018274003934736505,
"loss": 1.0897,
"step": 450
},
{
"epoch": 0.19366612989801396,
"grad_norm": 0.15827056765556335,
"learning_rate": 0.0001826638478637933,
"loss": 0.9363,
"step": 451
},
{
"epoch": 0.19409554482018251,
"grad_norm": 0.20995981991291046,
"learning_rate": 0.00018258750453943918,
"loss": 1.049,
"step": 452
},
{
"epoch": 0.19452495974235104,
"grad_norm": 0.17867140471935272,
"learning_rate": 0.00018251100951453367,
"loss": 1.0149,
"step": 453
},
{
"epoch": 0.1949543746645196,
"grad_norm": 0.1835739016532898,
"learning_rate": 0.00018243436292958638,
"loss": 1.1985,
"step": 454
},
{
"epoch": 0.19538378958668814,
"grad_norm": 0.17710070312023163,
"learning_rate": 0.0001823575649253853,
"loss": 0.9616,
"step": 455
},
{
"epoch": 0.19581320450885667,
"grad_norm": 0.16101765632629395,
"learning_rate": 0.0001822806156429965,
"loss": 1.2936,
"step": 456
},
{
"epoch": 0.19624261943102522,
"grad_norm": 0.1469978541135788,
"learning_rate": 0.00018220351522376407,
"loss": 1.1137,
"step": 457
},
{
"epoch": 0.19667203435319378,
"grad_norm": 0.17269261181354523,
"learning_rate": 0.00018212626380930967,
"loss": 1.35,
"step": 458
},
{
"epoch": 0.19710144927536233,
"grad_norm": 0.18232795596122742,
"learning_rate": 0.0001820488615415321,
"loss": 1.0693,
"step": 459
},
{
"epoch": 0.19753086419753085,
"grad_norm": 0.19020916521549225,
"learning_rate": 0.00018197130856260758,
"loss": 1.085,
"step": 460
},
{
"epoch": 0.1979602791196994,
"grad_norm": 0.1793365776538849,
"learning_rate": 0.00018189360501498896,
"loss": 1.1711,
"step": 461
},
{
"epoch": 0.19838969404186796,
"grad_norm": 0.17583267390727997,
"learning_rate": 0.00018181575104140568,
"loss": 1.2276,
"step": 462
},
{
"epoch": 0.1988191089640365,
"grad_norm": 0.16527873277664185,
"learning_rate": 0.00018173774678486356,
"loss": 1.1692,
"step": 463
},
{
"epoch": 0.19924852388620504,
"grad_norm": 0.15330368280410767,
"learning_rate": 0.00018165959238864446,
"loss": 1.0472,
"step": 464
},
{
"epoch": 0.1996779388083736,
"grad_norm": 0.18043364584445953,
"learning_rate": 0.00018158128799630594,
"loss": 1.1462,
"step": 465
},
{
"epoch": 0.20010735373054214,
"grad_norm": 0.1676676869392395,
"learning_rate": 0.00018150283375168114,
"loss": 1.1693,
"step": 466
},
{
"epoch": 0.2005367686527107,
"grad_norm": 0.17557865381240845,
"learning_rate": 0.00018142422979887848,
"loss": 0.9993,
"step": 467
},
{
"epoch": 0.20096618357487922,
"grad_norm": 0.17406152188777924,
"learning_rate": 0.00018134547628228132,
"loss": 1.2718,
"step": 468
},
{
"epoch": 0.20139559849704777,
"grad_norm": 0.16246803104877472,
"learning_rate": 0.00018126657334654772,
"loss": 0.906,
"step": 469
},
{
"epoch": 0.20182501341921633,
"grad_norm": 0.19664785265922546,
"learning_rate": 0.00018118752113661034,
"loss": 1.1194,
"step": 470
},
{
"epoch": 0.20225442834138485,
"grad_norm": 0.17243239283561707,
"learning_rate": 0.00018110831979767586,
"loss": 0.9779,
"step": 471
},
{
"epoch": 0.2026838432635534,
"grad_norm": 0.1569763720035553,
"learning_rate": 0.000181028969475225,
"loss": 1.2128,
"step": 472
},
{
"epoch": 0.20311325818572196,
"grad_norm": 0.17845910787582397,
"learning_rate": 0.0001809494703150121,
"loss": 1.087,
"step": 473
},
{
"epoch": 0.2035426731078905,
"grad_norm": 0.15362991392612457,
"learning_rate": 0.0001808698224630649,
"loss": 0.8389,
"step": 474
},
{
"epoch": 0.20397208803005903,
"grad_norm": 0.1604796200990677,
"learning_rate": 0.00018079002606568426,
"loss": 0.9256,
"step": 475
},
{
"epoch": 0.2044015029522276,
"grad_norm": 0.16644595563411713,
"learning_rate": 0.00018071008126944386,
"loss": 1.0327,
"step": 476
},
{
"epoch": 0.20483091787439614,
"grad_norm": 0.1740645319223404,
"learning_rate": 0.00018062998822119007,
"loss": 1.0971,
"step": 477
},
{
"epoch": 0.2052603327965647,
"grad_norm": 0.17992867529392242,
"learning_rate": 0.00018054974706804147,
"loss": 0.8937,
"step": 478
},
{
"epoch": 0.20568974771873322,
"grad_norm": 0.16396278142929077,
"learning_rate": 0.00018046935795738872,
"loss": 0.8748,
"step": 479
},
{
"epoch": 0.20611916264090177,
"grad_norm": 0.16882237792015076,
"learning_rate": 0.00018038882103689426,
"loss": 0.859,
"step": 480
},
{
"epoch": 0.20654857756307032,
"grad_norm": 0.142868772149086,
"learning_rate": 0.00018030813645449208,
"loss": 0.8051,
"step": 481
},
{
"epoch": 0.20697799248523885,
"grad_norm": 0.17199325561523438,
"learning_rate": 0.00018022730435838727,
"loss": 1.1636,
"step": 482
},
{
"epoch": 0.2074074074074074,
"grad_norm": 0.17648378014564514,
"learning_rate": 0.00018014632489705604,
"loss": 1.1394,
"step": 483
},
{
"epoch": 0.20783682232957595,
"grad_norm": 0.1827528178691864,
"learning_rate": 0.0001800651982192452,
"loss": 1.1095,
"step": 484
},
{
"epoch": 0.2082662372517445,
"grad_norm": 0.13080927729606628,
"learning_rate": 0.00017998392447397197,
"loss": 0.7807,
"step": 485
},
{
"epoch": 0.20869565217391303,
"grad_norm": 0.17123474180698395,
"learning_rate": 0.00017990250381052372,
"loss": 1.2197,
"step": 486
},
{
"epoch": 0.20912506709608158,
"grad_norm": 0.17640285193920135,
"learning_rate": 0.00017982093637845768,
"loss": 1.1285,
"step": 487
},
{
"epoch": 0.20955448201825014,
"grad_norm": 0.1964927464723587,
"learning_rate": 0.00017973922232760074,
"loss": 1.3984,
"step": 488
},
{
"epoch": 0.2099838969404187,
"grad_norm": 0.18344812095165253,
"learning_rate": 0.00017965736180804905,
"loss": 0.8897,
"step": 489
},
{
"epoch": 0.21041331186258722,
"grad_norm": 0.17509503662586212,
"learning_rate": 0.00017957535497016772,
"loss": 1.0808,
"step": 490
},
{
"epoch": 0.21084272678475577,
"grad_norm": 0.16462327539920807,
"learning_rate": 0.00017949320196459077,
"loss": 0.982,
"step": 491
},
{
"epoch": 0.21127214170692432,
"grad_norm": 0.17547428607940674,
"learning_rate": 0.00017941090294222066,
"loss": 1.0466,
"step": 492
},
{
"epoch": 0.21170155662909287,
"grad_norm": 0.18705184757709503,
"learning_rate": 0.000179328458054228,
"loss": 1.1574,
"step": 493
},
{
"epoch": 0.2121309715512614,
"grad_norm": 0.17873774468898773,
"learning_rate": 0.00017924586745205143,
"loss": 1.3599,
"step": 494
},
{
"epoch": 0.21256038647342995,
"grad_norm": 0.1929023265838623,
"learning_rate": 0.0001791631312873971,
"loss": 1.2727,
"step": 495
},
{
"epoch": 0.2129898013955985,
"grad_norm": 0.1473141312599182,
"learning_rate": 0.00017908024971223876,
"loss": 1.0392,
"step": 496
},
{
"epoch": 0.21341921631776703,
"grad_norm": 0.1641705185174942,
"learning_rate": 0.00017899722287881699,
"loss": 0.9458,
"step": 497
},
{
"epoch": 0.21384863123993558,
"grad_norm": 0.16218411922454834,
"learning_rate": 0.00017891405093963938,
"loss": 0.8449,
"step": 498
},
{
"epoch": 0.21427804616210414,
"grad_norm": 0.15134935081005096,
"learning_rate": 0.00017883073404748002,
"loss": 1.0388,
"step": 499
},
{
"epoch": 0.2147074610842727,
"grad_norm": 0.13633696734905243,
"learning_rate": 0.00017874727235537918,
"loss": 0.6724,
"step": 500
},
{
"epoch": 0.2151368760064412,
"grad_norm": 0.18835188448429108,
"learning_rate": 0.0001786636660166432,
"loss": 1.2972,
"step": 501
},
{
"epoch": 0.21556629092860977,
"grad_norm": 0.16085697710514069,
"learning_rate": 0.00017857991518484406,
"loss": 1.0825,
"step": 502
},
{
"epoch": 0.21599570585077832,
"grad_norm": 0.17221853137016296,
"learning_rate": 0.00017849602001381918,
"loss": 1.2739,
"step": 503
},
{
"epoch": 0.21642512077294687,
"grad_norm": 0.1634456366300583,
"learning_rate": 0.00017841198065767107,
"loss": 0.9839,
"step": 504
},
{
"epoch": 0.2168545356951154,
"grad_norm": 0.18110795319080353,
"learning_rate": 0.00017832779727076708,
"loss": 1.3229,
"step": 505
},
{
"epoch": 0.21728395061728395,
"grad_norm": 0.13345003128051758,
"learning_rate": 0.00017824347000773927,
"loss": 0.8383,
"step": 506
},
{
"epoch": 0.2177133655394525,
"grad_norm": 0.15196914970874786,
"learning_rate": 0.00017815899902348377,
"loss": 1.0096,
"step": 507
},
{
"epoch": 0.21814278046162103,
"grad_norm": 0.17290259897708893,
"learning_rate": 0.00017807438447316076,
"loss": 0.8173,
"step": 508
},
{
"epoch": 0.21857219538378958,
"grad_norm": 0.16334594786167145,
"learning_rate": 0.00017798962651219424,
"loss": 1.0307,
"step": 509
},
{
"epoch": 0.21900161030595813,
"grad_norm": 0.16071034967899323,
"learning_rate": 0.00017790472529627152,
"loss": 1.0597,
"step": 510
},
{
"epoch": 0.21943102522812669,
"grad_norm": 0.14360260963439941,
"learning_rate": 0.0001778196809813431,
"loss": 0.9411,
"step": 511
},
{
"epoch": 0.2198604401502952,
"grad_norm": 0.1717967838048935,
"learning_rate": 0.0001777344937236223,
"loss": 1.1883,
"step": 512
},
{
"epoch": 0.22028985507246376,
"grad_norm": 0.1511518657207489,
"learning_rate": 0.00017764916367958502,
"loss": 0.9472,
"step": 513
},
{
"epoch": 0.22071926999463232,
"grad_norm": 0.1570175439119339,
"learning_rate": 0.00017756369100596942,
"loss": 0.8677,
"step": 514
},
{
"epoch": 0.22114868491680087,
"grad_norm": 0.17275646328926086,
"learning_rate": 0.00017747807585977575,
"loss": 1.1496,
"step": 515
},
{
"epoch": 0.2215780998389694,
"grad_norm": 0.16934038698673248,
"learning_rate": 0.00017739231839826575,
"loss": 0.9445,
"step": 516
},
{
"epoch": 0.22200751476113795,
"grad_norm": 0.18247805535793304,
"learning_rate": 0.00017730641877896275,
"loss": 1.2478,
"step": 517
},
{
"epoch": 0.2224369296833065,
"grad_norm": 0.17023034393787384,
"learning_rate": 0.00017722037715965115,
"loss": 1.0587,
"step": 518
},
{
"epoch": 0.22286634460547505,
"grad_norm": 0.17108768224716187,
"learning_rate": 0.00017713419369837617,
"loss": 1.2587,
"step": 519
},
{
"epoch": 0.22329575952764358,
"grad_norm": 0.16779127717018127,
"learning_rate": 0.00017704786855344363,
"loss": 0.8168,
"step": 520
},
{
"epoch": 0.22372517444981213,
"grad_norm": 0.17807330191135406,
"learning_rate": 0.00017696140188341945,
"loss": 1.2265,
"step": 521
},
{
"epoch": 0.22415458937198068,
"grad_norm": 0.15085840225219727,
"learning_rate": 0.0001768747938471297,
"loss": 0.9862,
"step": 522
},
{
"epoch": 0.2245840042941492,
"grad_norm": 0.16962507367134094,
"learning_rate": 0.00017678804460366,
"loss": 1.2014,
"step": 523
},
{
"epoch": 0.22501341921631776,
"grad_norm": 0.20221249759197235,
"learning_rate": 0.00017670115431235538,
"loss": 1.15,
"step": 524
},
{
"epoch": 0.22544283413848631,
"grad_norm": 0.1703234761953354,
"learning_rate": 0.00017661412313281995,
"loss": 1.1397,
"step": 525
},
{
"epoch": 0.22587224906065487,
"grad_norm": 0.15764622390270233,
"learning_rate": 0.00017652695122491663,
"loss": 1.0963,
"step": 526
},
{
"epoch": 0.2263016639828234,
"grad_norm": 0.1757158637046814,
"learning_rate": 0.00017643963874876677,
"loss": 1.2059,
"step": 527
},
{
"epoch": 0.22673107890499195,
"grad_norm": 0.17365393042564392,
"learning_rate": 0.00017635218586474998,
"loss": 1.0233,
"step": 528
},
{
"epoch": 0.2271604938271605,
"grad_norm": 0.1677040010690689,
"learning_rate": 0.0001762645927335038,
"loss": 1.1272,
"step": 529
},
{
"epoch": 0.22758990874932905,
"grad_norm": 0.1669892817735672,
"learning_rate": 0.0001761768595159233,
"loss": 0.9677,
"step": 530
},
{
"epoch": 0.22801932367149758,
"grad_norm": 0.19120194017887115,
"learning_rate": 0.00017608898637316096,
"loss": 1.2069,
"step": 531
},
{
"epoch": 0.22844873859366613,
"grad_norm": 0.15439291298389435,
"learning_rate": 0.00017600097346662623,
"loss": 0.8796,
"step": 532
},
{
"epoch": 0.22887815351583468,
"grad_norm": 0.1759713590145111,
"learning_rate": 0.00017591282095798526,
"loss": 0.7718,
"step": 533
},
{
"epoch": 0.22930756843800323,
"grad_norm": 0.17327053844928741,
"learning_rate": 0.00017582452900916063,
"loss": 1.4072,
"step": 534
},
{
"epoch": 0.22973698336017176,
"grad_norm": 0.1783333122730255,
"learning_rate": 0.0001757360977823312,
"loss": 1.4336,
"step": 535
},
{
"epoch": 0.2301663982823403,
"grad_norm": 0.16632091999053955,
"learning_rate": 0.00017564752743993143,
"loss": 0.9684,
"step": 536
},
{
"epoch": 0.23059581320450886,
"grad_norm": 0.17739808559417725,
"learning_rate": 0.00017555881814465148,
"loss": 0.9855,
"step": 537
},
{
"epoch": 0.2310252281266774,
"grad_norm": 0.16482579708099365,
"learning_rate": 0.00017546997005943665,
"loss": 1.1435,
"step": 538
},
{
"epoch": 0.23145464304884594,
"grad_norm": 0.19359920918941498,
"learning_rate": 0.00017538098334748722,
"loss": 1.2677,
"step": 539
},
{
"epoch": 0.2318840579710145,
"grad_norm": 0.1723766326904297,
"learning_rate": 0.00017529185817225816,
"loss": 1.3,
"step": 540
},
{
"epoch": 0.23231347289318305,
"grad_norm": 0.18761831521987915,
"learning_rate": 0.00017520259469745866,
"loss": 1.2971,
"step": 541
},
{
"epoch": 0.23274288781535157,
"grad_norm": 0.139839306473732,
"learning_rate": 0.00017511319308705198,
"loss": 0.975,
"step": 542
},
{
"epoch": 0.23317230273752013,
"grad_norm": 0.17375217378139496,
"learning_rate": 0.00017502365350525524,
"loss": 0.9755,
"step": 543
},
{
"epoch": 0.23360171765968868,
"grad_norm": 0.1978386640548706,
"learning_rate": 0.00017493397611653875,
"loss": 1.3327,
"step": 544
},
{
"epoch": 0.23403113258185723,
"grad_norm": 0.21363678574562073,
"learning_rate": 0.0001748441610856262,
"loss": 1.1973,
"step": 545
},
{
"epoch": 0.23446054750402576,
"grad_norm": 0.18306796252727509,
"learning_rate": 0.00017475420857749398,
"loss": 1.0939,
"step": 546
},
{
"epoch": 0.2348899624261943,
"grad_norm": 0.1709376573562622,
"learning_rate": 0.00017466411875737098,
"loss": 1.1383,
"step": 547
},
{
"epoch": 0.23531937734836286,
"grad_norm": 0.19025692343711853,
"learning_rate": 0.0001745738917907384,
"loss": 0.9749,
"step": 548
},
{
"epoch": 0.2357487922705314,
"grad_norm": 0.1548996865749359,
"learning_rate": 0.00017448352784332926,
"loss": 1.1391,
"step": 549
},
{
"epoch": 0.23617820719269994,
"grad_norm": 0.15124543011188507,
"learning_rate": 0.00017439302708112826,
"loss": 1.0438,
"step": 550
},
{
"epoch": 0.2366076221148685,
"grad_norm": 0.178885355591774,
"learning_rate": 0.00017430238967037137,
"loss": 1.2482,
"step": 551
},
{
"epoch": 0.23703703703703705,
"grad_norm": 0.16636434197425842,
"learning_rate": 0.00017421161577754564,
"loss": 1.079,
"step": 552
},
{
"epoch": 0.23746645195920557,
"grad_norm": 0.16374240815639496,
"learning_rate": 0.00017412070556938872,
"loss": 1.1511,
"step": 553
},
{
"epoch": 0.23789586688137412,
"grad_norm": 0.15488043427467346,
"learning_rate": 0.00017402965921288865,
"loss": 1.1565,
"step": 554
},
{
"epoch": 0.23832528180354268,
"grad_norm": 0.16751627624034882,
"learning_rate": 0.00017393847687528367,
"loss": 1.1209,
"step": 555
},
{
"epoch": 0.23875469672571123,
"grad_norm": 0.17798767983913422,
"learning_rate": 0.00017384715872406168,
"loss": 1.2118,
"step": 556
},
{
"epoch": 0.23918411164787975,
"grad_norm": 0.17087987065315247,
"learning_rate": 0.00017375570492696009,
"loss": 0.9564,
"step": 557
},
{
"epoch": 0.2396135265700483,
"grad_norm": 0.14827404916286469,
"learning_rate": 0.00017366411565196543,
"loss": 0.9969,
"step": 558
},
{
"epoch": 0.24004294149221686,
"grad_norm": 0.16151390969753265,
"learning_rate": 0.00017357239106731317,
"loss": 1.0805,
"step": 559
},
{
"epoch": 0.2404723564143854,
"grad_norm": 0.20443901419639587,
"learning_rate": 0.00017348053134148727,
"loss": 1.1291,
"step": 560
},
{
"epoch": 0.24090177133655394,
"grad_norm": 0.15805144608020782,
"learning_rate": 0.00017338853664321992,
"loss": 1.067,
"step": 561
},
{
"epoch": 0.2413311862587225,
"grad_norm": 0.17929919064044952,
"learning_rate": 0.00017329640714149123,
"loss": 1.1768,
"step": 562
},
{
"epoch": 0.24176060118089104,
"grad_norm": 0.15413890779018402,
"learning_rate": 0.00017320414300552893,
"loss": 1.1613,
"step": 563
},
{
"epoch": 0.24219001610305957,
"grad_norm": 0.16163668036460876,
"learning_rate": 0.0001731117444048081,
"loss": 1.0257,
"step": 564
},
{
"epoch": 0.24261943102522812,
"grad_norm": 0.17742857336997986,
"learning_rate": 0.0001730192115090507,
"loss": 1.0139,
"step": 565
},
{
"epoch": 0.24304884594739667,
"grad_norm": 0.1430206149816513,
"learning_rate": 0.0001729265444882255,
"loss": 0.8641,
"step": 566
},
{
"epoch": 0.24347826086956523,
"grad_norm": 0.1846974492073059,
"learning_rate": 0.00017283374351254754,
"loss": 1.3239,
"step": 567
},
{
"epoch": 0.24390767579173375,
"grad_norm": 0.16652631759643555,
"learning_rate": 0.00017274080875247794,
"loss": 1.0221,
"step": 568
},
{
"epoch": 0.2443370907139023,
"grad_norm": 0.1801396608352661,
"learning_rate": 0.00017264774037872358,
"loss": 1.2199,
"step": 569
},
{
"epoch": 0.24476650563607086,
"grad_norm": 0.1728580743074417,
"learning_rate": 0.00017255453856223675,
"loss": 1.0899,
"step": 570
},
{
"epoch": 0.2451959205582394,
"grad_norm": 0.1778605431318283,
"learning_rate": 0.00017246120347421488,
"loss": 0.949,
"step": 571
},
{
"epoch": 0.24562533548040794,
"grad_norm": 0.16379563510417938,
"learning_rate": 0.00017236773528610017,
"loss": 1.2364,
"step": 572
},
{
"epoch": 0.2460547504025765,
"grad_norm": 0.15087537467479706,
"learning_rate": 0.0001722741341695793,
"loss": 0.9602,
"step": 573
},
{
"epoch": 0.24648416532474504,
"grad_norm": 0.18357989192008972,
"learning_rate": 0.00017218040029658315,
"loss": 1.2449,
"step": 574
},
{
"epoch": 0.24691358024691357,
"grad_norm": 0.1720157265663147,
"learning_rate": 0.00017208653383928642,
"loss": 1.1534,
"step": 575
},
{
"epoch": 0.24734299516908212,
"grad_norm": 0.19645382463932037,
"learning_rate": 0.00017199253497010743,
"loss": 1.0639,
"step": 576
},
{
"epoch": 0.24777241009125067,
"grad_norm": 0.1753363013267517,
"learning_rate": 0.00017189840386170756,
"loss": 0.8053,
"step": 577
},
{
"epoch": 0.24820182501341922,
"grad_norm": 0.19694557785987854,
"learning_rate": 0.00017180414068699126,
"loss": 1.0593,
"step": 578
},
{
"epoch": 0.24863123993558775,
"grad_norm": 0.20301617681980133,
"learning_rate": 0.00017170974561910542,
"loss": 1.2998,
"step": 579
},
{
"epoch": 0.2490606548577563,
"grad_norm": 0.18933315575122833,
"learning_rate": 0.00017161521883143934,
"loss": 1.2534,
"step": 580
},
{
"epoch": 0.24949006977992486,
"grad_norm": 0.17308446764945984,
"learning_rate": 0.00017152056049762418,
"loss": 1.2115,
"step": 581
},
{
"epoch": 0.2499194847020934,
"grad_norm": 0.17606200277805328,
"learning_rate": 0.0001714257707915327,
"loss": 1.0521,
"step": 582
}
],
"logging_steps": 1,
"max_steps": 2328,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 291,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.7796911480465e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}