{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9984871406959153,
  "eval_steps": 42,
  "global_step": 165,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 0.006051437216338881, "grad_norm": 2485872861586.4854, "learning_rate": 4.000000000000001e-06, "loss": 1.1824, "step": 1 },
    { "epoch": 0.012102874432677761, "grad_norm": 2374258153298.8555, "learning_rate": 8.000000000000001e-06, "loss": 1.2088, "step": 2 },
    { "epoch": 0.018154311649016642, "grad_norm": 396.1724444566197, "learning_rate": 1.2e-05, "loss": 1.2027, "step": 3 },
    { "epoch": 0.024205748865355523, "grad_norm": 7397.331100311796, "learning_rate": 1.6000000000000003e-05, "loss": 1.1208, "step": 4 },
    { "epoch": 0.030257186081694403, "grad_norm": 84.86083374215458, "learning_rate": 2e-05, "loss": 0.9113, "step": 5 },
    { "epoch": 0.036308623298033284, "grad_norm": 978.6354359559489, "learning_rate": 1.9998072404820648e-05, "loss": 0.9526, "step": 6 },
    { "epoch": 0.04236006051437216, "grad_norm": 21.90749318992135, "learning_rate": 1.9992290362407232e-05, "loss": 0.8599, "step": 7 },
    { "epoch": 0.048411497730711045, "grad_norm": 74.82087090537057, "learning_rate": 1.998265610184716e-05, "loss": 1.1003, "step": 8 },
    { "epoch": 0.05446293494704992, "grad_norm": 4.882285623570468, "learning_rate": 1.9969173337331283e-05, "loss": 0.8362, "step": 9 },
    { "epoch": 0.060514372163388806, "grad_norm": 7.899288581834525, "learning_rate": 1.995184726672197e-05, "loss": 0.8923, "step": 10 },
    { "epoch": 0.06656580937972768, "grad_norm": 3.4242116473646225, "learning_rate": 1.9930684569549265e-05, "loss": 0.8299, "step": 11 },
    { "epoch": 0.07261724659606657, "grad_norm": 2.5233147164579126, "learning_rate": 1.990569340443577e-05, "loss": 0.8059, "step": 12 },
    { "epoch": 0.07866868381240545, "grad_norm": 3.051443861824001, "learning_rate": 1.9876883405951378e-05, "loss": 0.7651, "step": 13 },
    { "epoch": 0.08472012102874432, "grad_norm": 2.6097710859490357, "learning_rate": 1.9844265680898917e-05, "loss": 0.7637, "step": 14 },
    { "epoch": 0.0907715582450832, "grad_norm": 2.1938445776094744, "learning_rate": 1.9807852804032306e-05, "loss": 0.7701, "step": 15 },
    { "epoch": 0.09682299546142209, "grad_norm": 3.039971331985702, "learning_rate": 1.9767658813208725e-05, "loss": 0.7326, "step": 16 },
    { "epoch": 0.10287443267776097, "grad_norm": 2.3582600372243494, "learning_rate": 1.9723699203976768e-05, "loss": 0.7477, "step": 17 },
    { "epoch": 0.10892586989409984, "grad_norm": 2.044504064644785, "learning_rate": 1.96759909236026e-05, "loss": 0.7279, "step": 18 },
    { "epoch": 0.11497730711043873, "grad_norm": 2.110191157985126, "learning_rate": 1.9624552364536472e-05, "loss": 0.7295, "step": 19 },
    { "epoch": 0.12102874432677761, "grad_norm": 2.0796944865165714, "learning_rate": 1.956940335732209e-05, "loss": 0.7085, "step": 20 },
    { "epoch": 0.12708018154311648, "grad_norm": 2.522674133160586, "learning_rate": 1.9510565162951538e-05, "loss": 0.7311, "step": 21 },
    { "epoch": 0.13313161875945537, "grad_norm": 2.0811107085167437, "learning_rate": 1.944806046466878e-05, "loss": 0.7146, "step": 22 },
    { "epoch": 0.13918305597579425, "grad_norm": 2.265003278256734, "learning_rate": 1.9381913359224844e-05, "loss": 0.7091, "step": 23 },
    { "epoch": 0.14523449319213314, "grad_norm": 1.8790084467988264, "learning_rate": 1.9312149347588035e-05, "loss": 0.7004, "step": 24 },
    { "epoch": 0.15128593040847202, "grad_norm": 2.1570176309321036, "learning_rate": 1.9238795325112867e-05, "loss": 0.7147, "step": 25 },
    { "epoch": 0.1573373676248109, "grad_norm": 2.044429433610023, "learning_rate": 1.916187957117136e-05, "loss": 0.683, "step": 26 },
    { "epoch": 0.16338880484114976, "grad_norm": 1.9500024997189427, "learning_rate": 1.9081431738250815e-05, "loss": 0.6534, "step": 27 },
    { "epoch": 0.16944024205748864, "grad_norm": 1.8107376027349673, "learning_rate": 1.8997482840522218e-05, "loss": 0.658, "step": 28 },
    { "epoch": 0.17549167927382753, "grad_norm": 1.893340947855662, "learning_rate": 1.891006524188368e-05, "loss": 0.6909, "step": 29 },
    { "epoch": 0.1815431164901664, "grad_norm": 1.7401143745241425, "learning_rate": 1.881921264348355e-05, "loss": 0.6714, "step": 30 },
    { "epoch": 0.1875945537065053, "grad_norm": 1.6745153162006545, "learning_rate": 1.8724960070727974e-05, "loss": 0.6977, "step": 31 },
    { "epoch": 0.19364599092284418, "grad_norm": 1.7140573912873658, "learning_rate": 1.862734385977792e-05, "loss": 0.6804, "step": 32 },
    { "epoch": 0.19969742813918306, "grad_norm": 1.6077423149312056, "learning_rate": 1.8526401643540924e-05, "loss": 0.6716, "step": 33 },
    { "epoch": 0.20574886535552195, "grad_norm": 1.4358147690664171, "learning_rate": 1.8422172337162865e-05, "loss": 0.6664, "step": 34 },
    { "epoch": 0.2118003025718608, "grad_norm": 2.812059503039093, "learning_rate": 1.8314696123025456e-05, "loss": 0.667, "step": 35 },
    { "epoch": 0.2178517397881997, "grad_norm": 1.7686345457161963, "learning_rate": 1.8204014435255136e-05, "loss": 0.6973, "step": 36 },
    { "epoch": 0.22390317700453857, "grad_norm": 1.470592147128763, "learning_rate": 1.8090169943749477e-05, "loss": 0.6708, "step": 37 },
    { "epoch": 0.22995461422087746, "grad_norm": 1.4325487355932993, "learning_rate": 1.797320653772707e-05, "loss": 0.636, "step": 38 },
    { "epoch": 0.23600605143721634, "grad_norm": 1.5158022833486517, "learning_rate": 1.785316930880745e-05, "loss": 0.632, "step": 39 },
    { "epoch": 0.24205748865355523, "grad_norm": 1.4035210760893406, "learning_rate": 1.773010453362737e-05, "loss": 0.654, "step": 40 },
    { "epoch": 0.2481089258698941, "grad_norm": 1.344913257545713, "learning_rate": 1.7604059656000313e-05, "loss": 0.6524, "step": 41 },
    { "epoch": 0.25416036308623297, "grad_norm": 1.5331768604954814, "learning_rate": 1.747508326862597e-05, "loss": 0.6752, "step": 42 },
    { "epoch": 0.25416036308623297, "eval_loss": 0.6497564315795898, "eval_runtime": 240.9191, "eval_samples_per_second": 17.558, "eval_steps_per_second": 0.278, "step": 42 },
    { "epoch": 0.26021180030257185, "grad_norm": 1.3453112551443027, "learning_rate": 1.7343225094356857e-05, "loss": 0.6378, "step": 43 },
    { "epoch": 0.26626323751891073, "grad_norm": 1.2901427277827926, "learning_rate": 1.720853596702919e-05, "loss": 0.6535, "step": 44 },
    { "epoch": 0.2723146747352496, "grad_norm": 1.279593413533858, "learning_rate": 1.7071067811865477e-05, "loss": 0.6482, "step": 45 },
    { "epoch": 0.2783661119515885, "grad_norm": 1.485821081722045, "learning_rate": 1.6930873625456362e-05, "loss": 0.6701, "step": 46 },
    { "epoch": 0.2844175491679274, "grad_norm": 1.1782047694848008, "learning_rate": 1.678800745532942e-05, "loss": 0.639, "step": 47 },
    { "epoch": 0.29046898638426627, "grad_norm": 1.182935652668345, "learning_rate": 1.664252437911282e-05, "loss": 0.6353, "step": 48 },
    { "epoch": 0.29652042360060515, "grad_norm": 1.3371967812238952, "learning_rate": 1.6494480483301836e-05, "loss": 0.6513, "step": 49 },
    { "epoch": 0.30257186081694404, "grad_norm": 1.3464560857933685, "learning_rate": 1.6343932841636455e-05, "loss": 0.637, "step": 50 },
    { "epoch": 0.3086232980332829, "grad_norm": 1.2225531139436927, "learning_rate": 1.6190939493098344e-05, "loss": 0.6313, "step": 51 },
    { "epoch": 0.3146747352496218, "grad_norm": 1.2321828901853542, "learning_rate": 1.6035559419535714e-05, "loss": 0.6469, "step": 52 },
    { "epoch": 0.3207261724659607, "grad_norm": 1.3402583020833378, "learning_rate": 1.5877852522924733e-05, "loss": 0.6366, "step": 53 },
    { "epoch": 0.3267776096822995, "grad_norm": 1.1557802726800193, "learning_rate": 1.5717879602276123e-05, "loss": 0.6178, "step": 54 },
    { "epoch": 0.3328290468986384, "grad_norm": 1.2101508308008404, "learning_rate": 1.5555702330196024e-05, "loss": 0.6479, "step": 55 },
    { "epoch": 0.3388804841149773, "grad_norm": 1.42141235053728, "learning_rate": 1.5391383229110005e-05, "loss": 0.6458, "step": 56 },
    { "epoch": 0.34493192133131617, "grad_norm": 1.2999993867502941, "learning_rate": 1.5224985647159489e-05, "loss": 0.6347, "step": 57 },
    { "epoch": 0.35098335854765506, "grad_norm": 1.3406194889045318, "learning_rate": 1.5056573733779848e-05, "loss": 0.6391, "step": 58 },
    { "epoch": 0.35703479576399394, "grad_norm": 1.3433628422807478, "learning_rate": 1.4886212414969551e-05, "loss": 0.6468, "step": 59 },
    { "epoch": 0.3630862329803328, "grad_norm": 1.2918891896945706, "learning_rate": 1.4713967368259981e-05, "loss": 0.6263, "step": 60 },
    { "epoch": 0.3691376701966717, "grad_norm": 2.773236659243322, "learning_rate": 1.4539904997395468e-05, "loss": 0.6446, "step": 61 },
    { "epoch": 0.3751891074130106, "grad_norm": 1.4437223516048685, "learning_rate": 1.436409240673342e-05, "loss": 0.6379, "step": 62 },
    { "epoch": 0.3812405446293495, "grad_norm": 1.4841854042584977, "learning_rate": 1.4186597375374283e-05, "loss": 0.6222, "step": 63 },
    { "epoch": 0.38729198184568836, "grad_norm": 1.2593547293298168, "learning_rate": 1.4007488331031409e-05, "loss": 0.6166, "step": 64 },
    { "epoch": 0.39334341906202724, "grad_norm": 1.2697035696692358, "learning_rate": 1.3826834323650899e-05, "loss": 0.654, "step": 65 },
    { "epoch": 0.39939485627836613, "grad_norm": 1.5125181803042715, "learning_rate": 1.3644704998791501e-05, "loss": 0.6491, "step": 66 },
    { "epoch": 0.405446293494705, "grad_norm": 1.1671607547969556, "learning_rate": 1.346117057077493e-05, "loss": 0.6456, "step": 67 },
    { "epoch": 0.4114977307110439, "grad_norm": 1.1858707062216176, "learning_rate": 1.3276301795616937e-05, "loss": 0.6409, "step": 68 },
    { "epoch": 0.4175491679273828, "grad_norm": 1.274887597351961, "learning_rate": 1.3090169943749475e-05, "loss": 0.6314, "step": 69 },
    { "epoch": 0.4236006051437216, "grad_norm": 1.1952319371476656, "learning_rate": 1.2902846772544625e-05, "loss": 0.6338, "step": 70 },
    { "epoch": 0.4296520423600605, "grad_norm": 1.2039189391326874, "learning_rate": 1.2714404498650743e-05, "loss": 0.6367, "step": 71 },
    { "epoch": 0.4357034795763994, "grad_norm": 1.2493232936493541, "learning_rate": 1.252491577015158e-05, "loss": 0.6143, "step": 72 },
    { "epoch": 0.44175491679273826, "grad_norm": 1.2124826419990378, "learning_rate": 1.2334453638559057e-05, "loss": 0.6281, "step": 73 },
    { "epoch": 0.44780635400907715, "grad_norm": 1.754723638913915, "learning_rate": 1.2143091530650508e-05, "loss": 0.637, "step": 74 },
    { "epoch": 0.45385779122541603, "grad_norm": 1.2667778628244222, "learning_rate": 1.1950903220161286e-05, "loss": 0.6477, "step": 75 },
    { "epoch": 0.4599092284417549, "grad_norm": 1.1693479009083907, "learning_rate": 1.1757962799343548e-05, "loss": 0.6133, "step": 76 },
    { "epoch": 0.4659606656580938, "grad_norm": 1.1085663874308165, "learning_rate": 1.156434465040231e-05, "loss": 0.6053, "step": 77 },
    { "epoch": 0.4720121028744327, "grad_norm": 1.2275476075576535, "learning_rate": 1.1370123416819683e-05, "loss": 0.6222, "step": 78 },
    { "epoch": 0.47806354009077157, "grad_norm": 1.2913381715274397, "learning_rate": 1.1175373974578378e-05, "loss": 0.6225, "step": 79 },
    { "epoch": 0.48411497730711045, "grad_norm": 1.1929623616502596, "learning_rate": 1.098017140329561e-05, "loss": 0.6149, "step": 80 },
    { "epoch": 0.49016641452344933, "grad_norm": 1.1929734366136742, "learning_rate": 1.0784590957278452e-05, "loss": 0.6109, "step": 81 },
    { "epoch": 0.4962178517397882, "grad_norm": 1.166286080240029, "learning_rate": 1.058870803651189e-05, "loss": 0.6197, "step": 82 },
    { "epoch": 0.5022692889561271, "grad_norm": 1.251236063248938, "learning_rate": 1.0392598157590687e-05, "loss": 0.615, "step": 83 },
    { "epoch": 0.5083207261724659, "grad_norm": 1.1723348633981843, "learning_rate": 1.0196336924606282e-05, "loss": 0.6243, "step": 84 },
    { "epoch": 0.5083207261724659, "eval_loss": 0.6196625232696533, "eval_runtime": 229.3836, "eval_samples_per_second": 18.441, "eval_steps_per_second": 0.292, "step": 84 },
    { "epoch": 0.5143721633888049, "grad_norm": 1.264238673034934, "learning_rate": 1e-05, "loss": 0.601, "step": 85 },
    { "epoch": 0.5204236006051437, "grad_norm": 1.1869923729073102, "learning_rate": 9.80366307539372e-06, "loss": 0.635, "step": 86 },
    { "epoch": 0.5264750378214826, "grad_norm": 1.155605537137944, "learning_rate": 9.607401842409318e-06, "loss": 0.6086, "step": 87 },
    { "epoch": 0.5325264750378215, "grad_norm": 1.1810059527345225, "learning_rate": 9.41129196348811e-06, "loss": 0.6066, "step": 88 },
    { "epoch": 0.5385779122541604, "grad_norm": 1.48288275051847, "learning_rate": 9.215409042721553e-06, "loss": 0.6074, "step": 89 },
    { "epoch": 0.5446293494704992, "grad_norm": 1.2236480314592646, "learning_rate": 9.019828596704394e-06, "loss": 0.6005, "step": 90 },
    { "epoch": 0.5506807866868382, "grad_norm": 1.225104434933414, "learning_rate": 8.824626025421625e-06, "loss": 0.6209, "step": 91 },
    { "epoch": 0.556732223903177, "grad_norm": 1.3753239293064896, "learning_rate": 8.629876583180322e-06, "loss": 0.6284, "step": 92 },
    { "epoch": 0.5627836611195158, "grad_norm": 1.1165345494257204, "learning_rate": 8.43565534959769e-06, "loss": 0.5922, "step": 93 },
    { "epoch": 0.5688350983358548, "grad_norm": 1.1302938437987953, "learning_rate": 8.242037200656455e-06, "loss": 0.5945, "step": 94 },
    { "epoch": 0.5748865355521936, "grad_norm": 1.0925454165360629, "learning_rate": 8.04909677983872e-06, "loss": 0.603, "step": 95 },
    { "epoch": 0.5809379727685325, "grad_norm": 1.257734967726645, "learning_rate": 7.856908469349495e-06, "loss": 0.614, "step": 96 },
    { "epoch": 0.5869894099848714, "grad_norm": 1.128472303920492, "learning_rate": 7.66554636144095e-06, "loss": 0.6285, "step": 97 },
    { "epoch": 0.5930408472012103, "grad_norm": 1.1421760663082452, "learning_rate": 7.4750842298484205e-06, "loss": 0.6185, "step": 98 },
    { "epoch": 0.5990922844175491, "grad_norm": 1.1243624229579912, "learning_rate": 7.285595501349259e-06, "loss": 0.6312, "step": 99 },
    { "epoch": 0.6051437216338881, "grad_norm": 1.2053726091706483, "learning_rate": 7.097153227455379e-06, "loss": 0.6184, "step": 100 },
    { "epoch": 0.6111951588502269, "grad_norm": 1.2905397800780836, "learning_rate": 6.909830056250527e-06, "loss": 0.6016, "step": 101 },
    { "epoch": 0.6172465960665658, "grad_norm": 1.1119701235429014, "learning_rate": 6.723698204383067e-06, "loss": 0.5947, "step": 102 },
    { "epoch": 0.6232980332829047, "grad_norm": 1.0439743919075113, "learning_rate": 6.538829429225068e-06, "loss": 0.5838, "step": 103 },
    { "epoch": 0.6293494704992436, "grad_norm": 1.0887711124033599, "learning_rate": 6.355295001208504e-06, "loss": 0.6027, "step": 104 },
    { "epoch": 0.6354009077155824, "grad_norm": 1.0882356853978903, "learning_rate": 6.173165676349103e-06, "loss": 0.5957, "step": 105 },
    { "epoch": 0.6414523449319214, "grad_norm": 1.098158642309974, "learning_rate": 5.9925116689685925e-06, "loss": 0.6121, "step": 106 },
    { "epoch": 0.6475037821482602, "grad_norm": 1.118376343327207, "learning_rate": 5.813402624625722e-06, "loss": 0.5984, "step": 107 },
    { "epoch": 0.653555219364599, "grad_norm": 1.1834759273908468, "learning_rate": 5.635907593266578e-06, "loss": 0.6317, "step": 108 },
    { "epoch": 0.659606656580938, "grad_norm": 1.0902626718892547, "learning_rate": 5.460095002604533e-06, "loss": 0.598, "step": 109 },
    { "epoch": 0.6656580937972768, "grad_norm": 1.0904339069847084, "learning_rate": 5.286032631740023e-06, "loss": 0.595, "step": 110 },
    { "epoch": 0.6717095310136157, "grad_norm": 1.1365339473040479, "learning_rate": 5.1137875850304545e-06, "loss": 0.6077, "step": 111 },
    { "epoch": 0.6777609682299546, "grad_norm": 1.2850035932836996, "learning_rate": 4.943426266220156e-06, "loss": 0.5951, "step": 112 },
    { "epoch": 0.6838124054462935, "grad_norm": 1.1128153520291575, "learning_rate": 4.775014352840512e-06, "loss": 0.6092, "step": 113 },
    { "epoch": 0.6898638426626323, "grad_norm": 1.106251089348545, "learning_rate": 4.608616770889998e-06, "loss": 0.61, "step": 114 },
    { "epoch": 0.6959152798789713, "grad_norm": 1.1431262362719048, "learning_rate": 4.444297669803981e-06, "loss": 0.602, "step": 115 },
    { "epoch": 0.7019667170953101, "grad_norm": 1.2142474830783798, "learning_rate": 4.282120397723879e-06, "loss": 0.5988, "step": 116 },
    { "epoch": 0.708018154311649, "grad_norm": 1.1882349202638232, "learning_rate": 4.12214747707527e-06, "loss": 0.6238, "step": 117 },
    { "epoch": 0.7140695915279879, "grad_norm": 1.109339846325976, "learning_rate": 3.964440580464286e-06, "loss": 0.5895, "step": 118 },
    { "epoch": 0.7201210287443268, "grad_norm": 1.1110191612272007, "learning_rate": 3.8090605069016596e-06, "loss": 0.6105, "step": 119 },
    { "epoch": 0.7261724659606656, "grad_norm": 1.1179333014320294, "learning_rate": 3.6560671583635467e-06, "loss": 0.5932, "step": 120 },
    { "epoch": 0.7322239031770046, "grad_norm": 1.1265078817662724, "learning_rate": 3.505519516698165e-06, "loss": 0.5999, "step": 121 },
    { "epoch": 0.7382753403933434, "grad_norm": 1.075578490050491, "learning_rate": 3.3574756208871862e-06, "loss": 0.5915, "step": 122 },
    { "epoch": 0.7443267776096822, "grad_norm": 1.1242426718753973, "learning_rate": 3.2119925446705824e-06, "loss": 0.5989, "step": 123 },
    { "epoch": 0.7503782148260212, "grad_norm": 1.1091583294027973, "learning_rate": 3.069126374543643e-06, "loss": 0.5961, "step": 124 },
    { "epoch": 0.75642965204236, "grad_norm": 1.0901305589473516, "learning_rate": 2.9289321881345257e-06, "loss": 0.6161, "step": 125 },
    { "epoch": 0.762481089258699, "grad_norm": 1.099855491597428, "learning_rate": 2.791464032970812e-06, "loss": 0.6126, "step": 126 },
    { "epoch": 0.762481089258699, "eval_loss": 0.5977321863174438, "eval_runtime": 231.0076, "eval_samples_per_second": 18.311, "eval_steps_per_second": 0.29, "step": 126 },
    { "epoch": 0.7685325264750378, "grad_norm": 1.0434061101718985, "learning_rate": 2.656774905643147e-06, "loss": 0.5642, "step": 127 },
    { "epoch": 0.7745839636913767, "grad_norm": 1.221163997302398, "learning_rate": 2.5249167313740307e-06, "loss": 0.5941, "step": 128 },
    { "epoch": 0.7806354009077155, "grad_norm": 1.10442870611896, "learning_rate": 2.395940343999691e-06, "loss": 0.5968, "step": 129 },
    { "epoch": 0.7866868381240545, "grad_norm": 1.0758064420406537, "learning_rate": 2.26989546637263e-06, "loss": 0.5608, "step": 130 },
    { "epoch": 0.7927382753403933, "grad_norm": 1.1099638697418817, "learning_rate": 2.146830691192553e-06, "loss": 0.6061, "step": 131 },
    { "epoch": 0.7987897125567323, "grad_norm": 1.1012628722358688, "learning_rate": 2.02679346227293e-06, "loss": 0.6086, "step": 132 },
    { "epoch": 0.8048411497730711, "grad_norm": 1.0825334967123086, "learning_rate": 1.9098300562505266e-06, "loss": 0.5999, "step": 133 },
    { "epoch": 0.81089258698941, "grad_norm": 1.0920843483364964, "learning_rate": 1.7959855647448642e-06, "loss": 0.5851, "step": 134 },
    { "epoch": 0.8169440242057489, "grad_norm": 1.0722929205729042, "learning_rate": 1.6853038769745466e-06, "loss": 0.5943, "step": 135 },
    { "epoch": 0.8229954614220878, "grad_norm": 1.0785356189051434, "learning_rate": 1.577827662837136e-06, "loss": 0.5939, "step": 136 },
    { "epoch": 0.8290468986384266, "grad_norm": 1.105404459630531, "learning_rate": 1.4735983564590784e-06, "loss": 0.5972, "step": 137 },
    { "epoch": 0.8350983358547656, "grad_norm": 1.064384582646943, "learning_rate": 1.3726561402220818e-06, "loss": 0.5834, "step": 138 },
    { "epoch": 0.8411497730711044, "grad_norm": 1.0873361026145811, "learning_rate": 1.2750399292720284e-06, "loss": 0.5727, "step": 139 },
    { "epoch": 0.8472012102874432, "grad_norm": 1.0146195797904674, "learning_rate": 1.1807873565164507e-06, "loss": 0.5555, "step": 140 },
    { "epoch": 0.8532526475037822, "grad_norm": 1.1102017699410431, "learning_rate": 1.0899347581163222e-06, "loss": 0.6126, "step": 141 },
    { "epoch": 0.859304084720121, "grad_norm": 1.046330613810264, "learning_rate": 1.0025171594777872e-06, "loss": 0.5866, "step": 142 },
    { "epoch": 0.8653555219364599, "grad_norm": 1.054167660384435, "learning_rate": 9.185682617491865e-07, "loss": 0.6009, "step": 143 },
    { "epoch": 0.8714069591527988, "grad_norm": 1.0445275144423711, "learning_rate": 8.381204288286415e-07, "loss": 0.5756, "step": 144 },
    { "epoch": 0.8774583963691377, "grad_norm": 1.046910850171513, "learning_rate": 7.612046748871327e-07, "loss": 0.569, "step": 145 },
    { "epoch": 0.8835098335854765, "grad_norm": 1.0770440342943608, "learning_rate": 6.878506524119644e-07, "loss": 0.6137, "step": 146 },
    { "epoch": 0.8895612708018155, "grad_norm": 1.0749107136634029, "learning_rate": 6.180866407751595e-07, "loss": 0.5798, "step": 147 },
    { "epoch": 0.8956127080181543, "grad_norm": 1.1573025313733445, "learning_rate": 5.519395353312195e-07, "loss": 0.5959, "step": 148 },
    { "epoch": 0.9016641452344932, "grad_norm": 1.0236891796646002, "learning_rate": 4.894348370484648e-07, "loss": 0.5574, "step": 149 },
    { "epoch": 0.9077155824508321, "grad_norm": 1.0420631935147442, "learning_rate": 4.305966426779118e-07, "loss": 0.5898, "step": 150 },
    { "epoch": 0.913767019667171, "grad_norm": 1.0706305440714106, "learning_rate": 3.7544763546352834e-07, "loss": 0.6051, "step": 151 },
    { "epoch": 0.9198184568835098, "grad_norm": 1.0849945955512819, "learning_rate": 3.2400907639740243e-07, "loss": 0.5875, "step": 152 },
    { "epoch": 0.9258698940998488, "grad_norm": 1.10481174468143, "learning_rate": 2.7630079602323447e-07, "loss": 0.5891, "step": 153 },
    { "epoch": 0.9319213313161876, "grad_norm": 1.0696523504505417, "learning_rate": 2.3234118679127615e-07, "loss": 0.5979, "step": 154 },
    { "epoch": 0.9379727685325264, "grad_norm": 1.0717642752483758, "learning_rate": 1.921471959676957e-07, "loss": 0.6036, "step": 155 },
    { "epoch": 0.9440242057488654, "grad_norm": 3.074194566505756, "learning_rate": 1.5573431910108404e-07, "loss": 0.5895, "step": 156 },
    { "epoch": 0.9500756429652042, "grad_norm": 1.0642248909874026, "learning_rate": 1.231165940486234e-07, "loss": 0.5872, "step": 157 },
    { "epoch": 0.9561270801815431, "grad_norm": 1.116298645041687, "learning_rate": 9.43065955642275e-08, "loss": 0.5891, "step": 158 },
    { "epoch": 0.962178517397882, "grad_norm": 1.0761046903826774, "learning_rate": 6.931543045073708e-08, "loss": 0.5885, "step": 159 },
    { "epoch": 0.9682299546142209, "grad_norm": 1.098459824451828, "learning_rate": 4.815273327803183e-08, "loss": 0.589, "step": 160 },
    { "epoch": 0.9742813918305597, "grad_norm": 1.1259065136256217, "learning_rate": 3.082666266872036e-08, "loss": 0.6024, "step": 161 },
    { "epoch": 0.9803328290468987, "grad_norm": 1.0518319114128796, "learning_rate": 1.7343898152841765e-08, "loss": 0.6053, "step": 162 },
    { "epoch": 0.9863842662632375, "grad_norm": 1.1340594173533018, "learning_rate": 7.70963759277099e-09, "loss": 0.5828, "step": 163 },
    { "epoch": 0.9924357034795764, "grad_norm": 1.0611854225309354, "learning_rate": 1.9275951793518154e-09, "loss": 0.6039, "step": 164 },
    { "epoch": 0.9984871406959153, "grad_norm": 1.0395934059040783, "learning_rate": 0.0, "loss": 0.5772, "step": 165 },
    { "epoch": 0.9984871406959153, "step": 165, "total_flos": 275962386186240.0, "train_loss": 0.6538035761226307, "train_runtime": 8048.9961, "train_samples_per_second": 5.256, "train_steps_per_second": 0.02 }
  ],
  "logging_steps": 1.0,
  "max_steps": 165,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 42,
  "total_flos": 275962386186240.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}