|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.3328340822100183, |
|
"eval_steps": 500, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0006656681644200366, |
|
      "grad_norm": null,
|
"learning_rate": 0.0, |
|
"loss": 1.9001, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0013313363288400732, |
|
      "grad_norm": null,
|
"learning_rate": 0.0, |
|
"loss": 1.1836, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.00199700449326011, |
|
      "grad_norm": null,
|
"learning_rate": 0.0, |
|
"loss": 2.0432, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.0026626726576801465, |
|
      "grad_norm": null,
|
"learning_rate": 0.0, |
|
"loss": 2.7364, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.003328340822100183, |
|
"grad_norm": 305.05010986328125, |
|
"learning_rate": 0.0001, |
|
"loss": 2.7278, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.00399400898652022, |
|
"grad_norm": 195.2991180419922, |
|
"learning_rate": 0.0002, |
|
"loss": 1.5935, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.004659677150940256, |
|
"grad_norm": 135.4779815673828, |
|
"learning_rate": 0.00019995559502664298, |
|
"loss": 1.558, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.005325345315360293, |
|
"grad_norm": 175.23486328125, |
|
"learning_rate": 0.00019991119005328598, |
|
"loss": 1.8278, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.005991013479780329, |
|
"grad_norm": 70.18768310546875, |
|
"learning_rate": 0.00019986678507992895, |
|
"loss": 1.2821, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.006656681644200366, |
|
"grad_norm": 91.9547348022461, |
|
"learning_rate": 0.00019982238010657195, |
|
"loss": 1.3009, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.007322349808620403, |
|
"grad_norm": 140.80667114257812, |
|
"learning_rate": 0.00019977797513321492, |
|
"loss": 1.4739, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.00798801797304044, |
|
"grad_norm": 117.13510131835938, |
|
"learning_rate": 0.00019973357015985792, |
|
"loss": 1.0922, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.008653686137460476, |
|
"grad_norm": 87.54082489013672, |
|
"learning_rate": 0.0001996891651865009, |
|
"loss": 1.6633, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.009319354301880512, |
|
"grad_norm": 178.7975616455078, |
|
"learning_rate": 0.0001996447602131439, |
|
"loss": 1.7614, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.00998502246630055, |
|
"grad_norm": 199.108154296875, |
|
"learning_rate": 0.00019960035523978686, |
|
"loss": 1.4873, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.010650690630720586, |
|
"grad_norm": 97.36341857910156, |
|
"learning_rate": 0.00019955595026642986, |
|
"loss": 1.2921, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.011316358795140622, |
|
"grad_norm": 145.5021209716797, |
|
"learning_rate": 0.00019951154529307283, |
|
"loss": 1.4667, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.011982026959560658, |
|
"grad_norm": 163.09628295898438, |
|
"learning_rate": 0.00019946714031971583, |
|
"loss": 1.4959, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.012647695123980696, |
|
"grad_norm": 181.53123474121094, |
|
"learning_rate": 0.0001994227353463588, |
|
"loss": 1.4931, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.013313363288400732, |
|
"grad_norm": 134.01087951660156, |
|
"learning_rate": 0.00019937833037300177, |
|
"loss": 1.1797, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.013979031452820768, |
|
"grad_norm": 138.4213409423828, |
|
"learning_rate": 0.00019933392539964477, |
|
"loss": 1.3021, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.014644699617240806, |
|
"grad_norm": 94.12645721435547, |
|
"learning_rate": 0.00019928952042628774, |
|
"loss": 1.2099, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.015310367781660842, |
|
"grad_norm": 68.36018371582031, |
|
"learning_rate": 0.00019924511545293074, |
|
"loss": 1.0593, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.01597603594608088, |
|
"grad_norm": 83.23944854736328, |
|
"learning_rate": 0.0001992007104795737, |
|
"loss": 1.1152, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.016641704110500914, |
|
"grad_norm": 145.29953002929688, |
|
"learning_rate": 0.0001991563055062167, |
|
"loss": 1.194, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.017307372274920952, |
|
"grad_norm": 71.01487731933594, |
|
"learning_rate": 0.00019911190053285968, |
|
"loss": 1.0201, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.01797304043934099, |
|
"grad_norm": 82.99989318847656, |
|
"learning_rate": 0.00019906749555950268, |
|
"loss": 1.3319, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.018638708603761024, |
|
"grad_norm": 51.65462112426758, |
|
"learning_rate": 0.00019902309058614565, |
|
"loss": 0.8622, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.019304376768181062, |
|
"grad_norm": 106.69747161865234, |
|
"learning_rate": 0.00019897868561278865, |
|
"loss": 1.0051, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.0199700449326011, |
|
"grad_norm": 85.53784942626953, |
|
"learning_rate": 0.00019893428063943162, |
|
"loss": 0.7687, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.020635713097021134, |
|
"grad_norm": 141.33482360839844, |
|
"learning_rate": 0.00019888987566607462, |
|
"loss": 0.9697, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.021301381261441172, |
|
"grad_norm": 110.15571594238281, |
|
"learning_rate": 0.0001988454706927176, |
|
"loss": 0.6955, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.02196704942586121, |
|
"grad_norm": 123.28990173339844, |
|
"learning_rate": 0.00019880106571936056, |
|
"loss": 0.9405, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.022632717590281244, |
|
"grad_norm": 104.15376281738281, |
|
"learning_rate": 0.00019875666074600356, |
|
"loss": 1.1353, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.023298385754701282, |
|
"grad_norm": 122.83297729492188, |
|
"learning_rate": 0.00019871225577264653, |
|
"loss": 0.9912, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.023964053919121316, |
|
"grad_norm": 137.1550750732422, |
|
"learning_rate": 0.00019866785079928953, |
|
"loss": 1.0156, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.024629722083541354, |
|
"grad_norm": 95.7591781616211, |
|
"learning_rate": 0.0001986234458259325, |
|
"loss": 0.9295, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.025295390247961392, |
|
"grad_norm": 163.35743713378906, |
|
"learning_rate": 0.0001985790408525755, |
|
"loss": 0.9697, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.025961058412381426, |
|
"grad_norm": 42.57448959350586, |
|
"learning_rate": 0.00019853463587921847, |
|
"loss": 0.8329, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.026626726576801464, |
|
"grad_norm": 161.05844116210938, |
|
"learning_rate": 0.00019849023090586147, |
|
"loss": 0.8588, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.027292394741221502, |
|
"grad_norm": 123.78165435791016, |
|
"learning_rate": 0.00019844582593250444, |
|
"loss": 0.8082, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.027958062905641536, |
|
"grad_norm": 117.62382507324219, |
|
"learning_rate": 0.00019840142095914744, |
|
"loss": 1.0692, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.028623731070061574, |
|
      "grad_norm": null,
|
"learning_rate": 0.00019840142095914744, |
|
"loss": 1.316, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.029289399234481612, |
|
"grad_norm": 63.35919952392578, |
|
"learning_rate": 0.0001983570159857904, |
|
"loss": 0.6028, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.029955067398901646, |
|
"grad_norm": 277.1460266113281, |
|
"learning_rate": 0.00019831261101243338, |
|
"loss": 1.4349, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.030620735563321684, |
|
      "grad_norm": null,
|
"learning_rate": 0.00019831261101243338, |
|
"loss": 1.816, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.03128640372774172, |
|
"grad_norm": 206.0849151611328, |
|
"learning_rate": 0.00019826820603907638, |
|
"loss": 1.7035, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.03195207189216176, |
|
"grad_norm": 154.10183715820312, |
|
"learning_rate": 0.00019822380106571935, |
|
"loss": 0.7897, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.032617740056581794, |
|
"grad_norm": 182.32650756835938, |
|
"learning_rate": 0.00019817939609236235, |
|
"loss": 1.2611, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.03328340822100183, |
|
"grad_norm": 100.30379486083984, |
|
"learning_rate": 0.00019813499111900532, |
|
"loss": 0.8746, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.03394907638542187, |
|
"grad_norm": 99.05520629882812, |
|
"learning_rate": 0.00019809058614564832, |
|
"loss": 0.6544, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.034614744549841904, |
|
"grad_norm": 75.6047592163086, |
|
"learning_rate": 0.0001980461811722913, |
|
"loss": 0.5308, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.03528041271426194, |
|
"grad_norm": 81.81549835205078, |
|
"learning_rate": 0.0001980017761989343, |
|
"loss": 1.0103, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.03594608087868198, |
|
"grad_norm": 305.5655517578125, |
|
"learning_rate": 0.00019795737122557726, |
|
"loss": 2.0811, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.036611749043102014, |
|
"grad_norm": 254.49461364746094, |
|
"learning_rate": 0.00019791296625222026, |
|
"loss": 1.5316, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.03727741720752205, |
|
"grad_norm": 249.2161407470703, |
|
"learning_rate": 0.00019786856127886323, |
|
"loss": 1.4851, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.03794308537194209, |
|
"grad_norm": 187.69805908203125, |
|
"learning_rate": 0.00019782415630550623, |
|
"loss": 1.0635, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.038608753536362124, |
|
"grad_norm": 135.42141723632812, |
|
"learning_rate": 0.0001977797513321492, |
|
"loss": 0.8954, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.03927442170078216, |
|
"grad_norm": 198.7463836669922, |
|
"learning_rate": 0.0001977353463587922, |
|
"loss": 1.2765, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.0399400898652022, |
|
"grad_norm": 256.67279052734375, |
|
"learning_rate": 0.00019769094138543517, |
|
"loss": 1.4927, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.040605758029622234, |
|
"grad_norm": 64.07623291015625, |
|
"learning_rate": 0.00019764653641207817, |
|
"loss": 0.7874, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.04127142619404227, |
|
"grad_norm": 147.7261962890625, |
|
"learning_rate": 0.00019760213143872114, |
|
"loss": 0.7924, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.04193709435846231, |
|
"grad_norm": 122.39730072021484, |
|
"learning_rate": 0.00019755772646536413, |
|
"loss": 0.7656, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.042602762522882344, |
|
"grad_norm": 94.13729095458984, |
|
"learning_rate": 0.0001975133214920071, |
|
"loss": 0.9051, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.04326843068730238, |
|
"grad_norm": 48.220184326171875, |
|
"learning_rate": 0.0001974689165186501, |
|
"loss": 0.6262, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.04393409885172242, |
|
"grad_norm": 32.33949661254883, |
|
"learning_rate": 0.00019742451154529308, |
|
"loss": 0.544, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.044599767016142454, |
|
"grad_norm": 133.78097534179688, |
|
"learning_rate": 0.00019738010657193607, |
|
"loss": 0.6472, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.04526543518056249, |
|
"grad_norm": 105.52882385253906, |
|
"learning_rate": 0.00019733570159857907, |
|
"loss": 0.7272, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.04593110334498253, |
|
"grad_norm": 40.91939926147461, |
|
"learning_rate": 0.00019729129662522204, |
|
"loss": 0.6232, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.046596771509402564, |
|
"grad_norm": 48.693763732910156, |
|
"learning_rate": 0.00019724689165186504, |
|
"loss": 0.3698, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.0472624396738226, |
|
"grad_norm": 98.95121765136719, |
|
"learning_rate": 0.00019720248667850801, |
|
"loss": 0.4914, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.04792810783824263, |
|
"grad_norm": 217.00401306152344, |
|
"learning_rate": 0.00019715808170515098, |
|
"loss": 1.3837, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.048593776002662674, |
|
"grad_norm": 97.31861877441406, |
|
"learning_rate": 0.00019711367673179398, |
|
"loss": 0.7037, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.04925944416708271, |
|
"grad_norm": 101.28430938720703, |
|
"learning_rate": 0.00019706927175843695, |
|
"loss": 1.3999, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.04992511233150274, |
|
"grad_norm": 86.35800170898438, |
|
"learning_rate": 0.00019702486678507995, |
|
"loss": 0.8045, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.050590780495922784, |
|
"grad_norm": 102.23155212402344, |
|
"learning_rate": 0.00019698046181172292, |
|
"loss": 0.7521, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.05125644866034282, |
|
"grad_norm": 125.80261993408203, |
|
"learning_rate": 0.00019693605683836592, |
|
"loss": 0.7933, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.05192211682476285, |
|
"grad_norm": 160.28123474121094, |
|
"learning_rate": 0.0001968916518650089, |
|
"loss": 0.6355, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.052587784989182894, |
|
"grad_norm": 42.919464111328125, |
|
"learning_rate": 0.0001968472468916519, |
|
"loss": 0.2693, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.05325345315360293, |
|
"grad_norm": 56.61810302734375, |
|
"learning_rate": 0.00019680284191829486, |
|
"loss": 0.3141, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.05391912131802296, |
|
"grad_norm": 299.3166198730469, |
|
"learning_rate": 0.00019675843694493786, |
|
"loss": 1.7119, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.054584789482443004, |
|
"grad_norm": 133.17459106445312, |
|
"learning_rate": 0.00019671403197158083, |
|
"loss": 1.2885, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.05525045764686304, |
|
"grad_norm": 131.6356964111328, |
|
"learning_rate": 0.00019666962699822383, |
|
"loss": 0.7976, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.05591612581128307, |
|
"grad_norm": 122.43846130371094, |
|
"learning_rate": 0.0001966252220248668, |
|
"loss": 0.7031, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.056581793975703114, |
|
"grad_norm": 51.365901947021484, |
|
"learning_rate": 0.00019658081705150977, |
|
"loss": 0.6684, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.05724746214012315, |
|
"grad_norm": 251.70664978027344, |
|
"learning_rate": 0.00019653641207815277, |
|
"loss": 1.2329, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.05791313030454318, |
|
"grad_norm": 94.94196319580078, |
|
"learning_rate": 0.00019649200710479574, |
|
"loss": 0.746, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.058578798468963224, |
|
"grad_norm": 106.89917755126953, |
|
"learning_rate": 0.00019644760213143874, |
|
"loss": 0.8735, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.05924446663338326, |
|
"grad_norm": 130.65577697753906, |
|
"learning_rate": 0.00019640319715808171, |
|
"loss": 1.0905, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.05991013479780329, |
|
"grad_norm": 37.6524543762207, |
|
"learning_rate": 0.0001963587921847247, |
|
"loss": 0.447, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.060575802962223334, |
|
"grad_norm": 55.81675720214844, |
|
"learning_rate": 0.00019631438721136768, |
|
"loss": 0.8305, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.06124147112664337, |
|
"grad_norm": 56.32381057739258, |
|
"learning_rate": 0.00019626998223801068, |
|
"loss": 0.6437, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.0619071392910634, |
|
"grad_norm": 107.72248840332031, |
|
"learning_rate": 0.00019622557726465365, |
|
"loss": 0.7331, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.06257280745548344, |
|
"grad_norm": 60.965084075927734, |
|
"learning_rate": 0.00019618117229129665, |
|
"loss": 0.1975, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.06323847561990348, |
|
"grad_norm": 236.83071899414062, |
|
"learning_rate": 0.00019613676731793962, |
|
"loss": 1.3465, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.06390414378432352, |
|
"grad_norm": 32.12284851074219, |
|
"learning_rate": 0.0001960923623445826, |
|
"loss": 0.4818, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.06456981194874355, |
|
"grad_norm": 139.45631408691406, |
|
"learning_rate": 0.0001960479573712256, |
|
"loss": 0.7907, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.06523548011316359, |
|
"grad_norm": 85.41361999511719, |
|
"learning_rate": 0.00019600355239786856, |
|
"loss": 0.5648, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.06590114827758363, |
|
"grad_norm": 79.85189056396484, |
|
"learning_rate": 0.00019595914742451156, |
|
"loss": 0.7735, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.06656681644200366, |
|
"grad_norm": 64.68962860107422, |
|
"learning_rate": 0.00019591474245115453, |
|
"loss": 0.4128, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.0672324846064237, |
|
"grad_norm": 58.707054138183594, |
|
"learning_rate": 0.00019587033747779753, |
|
"loss": 0.8143, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.06789815277084374, |
|
"grad_norm": 107.83698272705078, |
|
"learning_rate": 0.0001958259325044405, |
|
"loss": 0.8662, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.06856382093526377, |
|
"grad_norm": 221.64857482910156, |
|
"learning_rate": 0.0001957815275310835, |
|
"loss": 1.2667, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.06922948909968381, |
|
"grad_norm": 75.43016815185547, |
|
"learning_rate": 0.00019573712255772647, |
|
"loss": 0.6056, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.06989515726410385, |
|
"grad_norm": 81.48358917236328, |
|
"learning_rate": 0.00019569271758436947, |
|
"loss": 0.4954, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.07056082542852388, |
|
"grad_norm": 100.88373565673828, |
|
"learning_rate": 0.00019564831261101244, |
|
"loss": 0.8129, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.07122649359294392, |
|
"grad_norm": 56.06926727294922, |
|
"learning_rate": 0.00019560390763765544, |
|
"loss": 0.5694, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.07189216175736396, |
|
"grad_norm": 119.89936828613281, |
|
"learning_rate": 0.0001955595026642984, |
|
"loss": 0.8492, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.07255782992178399, |
|
"grad_norm": 226.14344787597656, |
|
"learning_rate": 0.00019551509769094138, |
|
"loss": 1.5915, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.07322349808620403, |
|
"grad_norm": 167.6849365234375, |
|
"learning_rate": 0.00019547069271758438, |
|
"loss": 0.8734, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.07388916625062407, |
|
"grad_norm": 191.29388427734375, |
|
"learning_rate": 0.00019542628774422735, |
|
"loss": 1.1915, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.0745548344150441, |
|
"grad_norm": 74.29916381835938, |
|
"learning_rate": 0.00019538188277087035, |
|
"loss": 0.399, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.07522050257946414, |
|
"grad_norm": 193.20899963378906, |
|
"learning_rate": 0.00019533747779751332, |
|
"loss": 1.0812, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.07588617074388418, |
|
"grad_norm": 104.37834930419922, |
|
"learning_rate": 0.00019529307282415632, |
|
"loss": 0.7821, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.0765518389083042, |
|
"grad_norm": 63.434146881103516, |
|
"learning_rate": 0.0001952486678507993, |
|
"loss": 0.5636, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.07721750707272425, |
|
"grad_norm": 163.28578186035156, |
|
"learning_rate": 0.0001952042628774423, |
|
"loss": 0.8423, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.07788317523714429, |
|
"grad_norm": 94.21780395507812, |
|
"learning_rate": 0.00019515985790408526, |
|
"loss": 0.6704, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.07854884340156432, |
|
"grad_norm": 97.0186767578125, |
|
"learning_rate": 0.00019511545293072826, |
|
"loss": 0.4906, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.07921451156598436, |
|
"grad_norm": 67.47974395751953, |
|
"learning_rate": 0.00019507104795737123, |
|
"loss": 0.5328, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.0798801797304044, |
|
"grad_norm": 113.68756103515625, |
|
"learning_rate": 0.00019502664298401423, |
|
"loss": 0.7459, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.08054584789482443, |
|
"grad_norm": 71.1500244140625, |
|
"learning_rate": 0.0001949822380106572, |
|
"loss": 0.6177, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.08121151605924447, |
|
"grad_norm": 131.37168884277344, |
|
"learning_rate": 0.00019493783303730017, |
|
"loss": 0.7955, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.08187718422366451, |
|
"grad_norm": 72.7484359741211, |
|
"learning_rate": 0.00019489342806394317, |
|
"loss": 0.7026, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.08254285238808454, |
|
"grad_norm": 34.95873260498047, |
|
"learning_rate": 0.00019484902309058614, |
|
"loss": 0.3607, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.08320852055250458, |
|
"grad_norm": 23.322694778442383, |
|
"learning_rate": 0.00019480461811722914, |
|
"loss": 0.4757, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.08387418871692462, |
|
"grad_norm": 102.11334228515625, |
|
"learning_rate": 0.0001947602131438721, |
|
"loss": 0.9689, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.08453985688134465, |
|
"grad_norm": 70.20091247558594, |
|
"learning_rate": 0.0001947158081705151, |
|
"loss": 0.5523, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.08520552504576469, |
|
"grad_norm": 108.84575653076172, |
|
"learning_rate": 0.00019467140319715808, |
|
"loss": 0.7221, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.08587119321018473, |
|
"grad_norm": 74.28465270996094, |
|
"learning_rate": 0.00019462699822380108, |
|
"loss": 0.3408, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.08653686137460476, |
|
"grad_norm": 45.54611587524414, |
|
"learning_rate": 0.00019458259325044405, |
|
"loss": 0.3064, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.0872025295390248, |
|
"grad_norm": 107.5720443725586, |
|
"learning_rate": 0.00019453818827708705, |
|
"loss": 0.4285, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.08786819770344484, |
|
"grad_norm": 113.14404296875, |
|
"learning_rate": 0.00019449378330373002, |
|
"loss": 0.7893, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.08853386586786487, |
|
"grad_norm": 41.36758041381836, |
|
"learning_rate": 0.00019444937833037302, |
|
"loss": 0.2135, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.08919953403228491, |
|
"grad_norm": 111.0009765625, |
|
"learning_rate": 0.000194404973357016, |
|
"loss": 0.4909, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.08986520219670495, |
|
"grad_norm": 119.50690460205078, |
|
"learning_rate": 0.00019436056838365896, |
|
"loss": 0.8744, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.09053087036112498, |
|
"grad_norm": 83.16583251953125, |
|
"learning_rate": 0.00019431616341030196, |
|
"loss": 0.5229, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.09119653852554502, |
|
"grad_norm": 96.94984436035156, |
|
"learning_rate": 0.00019427175843694493, |
|
"loss": 0.6746, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.09186220668996506, |
|
"grad_norm": 65.90393829345703, |
|
"learning_rate": 0.00019422735346358793, |
|
"loss": 0.3112, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.09252787485438509, |
|
"grad_norm": 92.0575180053711, |
|
"learning_rate": 0.0001941829484902309, |
|
"loss": 0.5264, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.09319354301880513, |
|
"grad_norm": 63.846744537353516, |
|
"learning_rate": 0.0001941385435168739, |
|
"loss": 0.8869, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.09385921118322517, |
|
"grad_norm": 129.04086303710938, |
|
"learning_rate": 0.00019409413854351687, |
|
"loss": 0.7077, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.0945248793476452, |
|
"grad_norm": 38.49999237060547, |
|
"learning_rate": 0.00019404973357015987, |
|
"loss": 0.1854, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.09519054751206524, |
|
"grad_norm": 142.83619689941406, |
|
"learning_rate": 0.00019400532859680284, |
|
"loss": 1.3549, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.09585621567648527, |
|
"grad_norm": 58.420223236083984, |
|
"learning_rate": 0.00019396092362344584, |
|
"loss": 0.3978, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.0965218838409053, |
|
"grad_norm": 82.57951354980469, |
|
"learning_rate": 0.0001939165186500888, |
|
"loss": 0.9771, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.09718755200532535, |
|
"grad_norm": 52.69244384765625, |
|
"learning_rate": 0.00019387211367673178, |
|
"loss": 0.389, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.09785322016974538, |
|
"grad_norm": 99.57772827148438, |
|
"learning_rate": 0.00019382770870337478, |
|
"loss": 0.938, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.09851888833416542, |
|
"grad_norm": 66.55644226074219, |
|
"learning_rate": 0.00019378330373001775, |
|
"loss": 0.4984, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.09918455649858546, |
|
"grad_norm": 64.8500747680664, |
|
"learning_rate": 0.00019373889875666075, |
|
"loss": 0.4189, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.09985022466300549, |
|
"grad_norm": 113.6292495727539, |
|
"learning_rate": 0.00019369449378330372, |
|
"loss": 0.486, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.10051589282742553, |
|
"grad_norm": 83.64993286132812, |
|
"learning_rate": 0.00019365008880994672, |
|
"loss": 0.3288, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.10118156099184557, |
|
"grad_norm": 70.0993423461914, |
|
"learning_rate": 0.0001936056838365897, |
|
"loss": 0.7637, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.1018472291562656, |
|
"grad_norm": 68.05122375488281, |
|
"learning_rate": 0.0001935612788632327, |
|
"loss": 0.3255, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.10251289732068564, |
|
"grad_norm": 56.31193923950195, |
|
"learning_rate": 0.00019351687388987566, |
|
"loss": 0.1978, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.10317856548510568, |
|
"grad_norm": 131.5350799560547, |
|
"learning_rate": 0.00019347246891651866, |
|
"loss": 0.7034, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.1038442336495257, |
|
"grad_norm": 143.9272003173828, |
|
"learning_rate": 0.00019342806394316163, |
|
"loss": 0.6122, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.10450990181394575, |
|
"grad_norm": 56.10287094116211, |
|
"learning_rate": 0.00019338365896980463, |
|
"loss": 0.5982, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.10517556997836579, |
|
"grad_norm": 113.03327941894531, |
|
"learning_rate": 0.0001933392539964476, |
|
"loss": 0.5578, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.10584123814278582, |
|
"grad_norm": 156.54730224609375, |
|
"learning_rate": 0.0001932948490230906, |
|
"loss": 0.9944, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.10650690630720586, |
|
"grad_norm": 117.35420989990234, |
|
"learning_rate": 0.00019325044404973357, |
|
"loss": 0.9612, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.1071725744716259, |
|
"grad_norm": 137.27517700195312, |
|
"learning_rate": 0.00019320603907637657, |
|
"loss": 0.8892, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.10783824263604593, |
|
"grad_norm": 121.9662094116211, |
|
"learning_rate": 0.00019316163410301954, |
|
"loss": 0.8541, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.10850391080046597, |
|
"grad_norm": 50.96897506713867, |
|
"learning_rate": 0.00019311722912966254, |
|
"loss": 0.4626, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.10916957896488601, |
|
"grad_norm": 86.26678466796875, |
|
"learning_rate": 0.0001930728241563055, |
|
"loss": 0.4941, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.10983524712930604, |
|
"grad_norm": 92.44398498535156, |
|
"learning_rate": 0.0001930284191829485, |
|
"loss": 0.6016, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.11050091529372608, |
|
"grad_norm": 76.84557342529297, |
|
"learning_rate": 0.00019298401420959148, |
|
"loss": 0.465, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.11116658345814612, |
|
"grad_norm": 120.3193588256836, |
|
"learning_rate": 0.00019293960923623448, |
|
"loss": 0.7597, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.11183225162256615, |
|
"grad_norm": 98.53234100341797, |
|
"learning_rate": 0.00019289520426287745, |
|
"loss": 0.4161, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.11249791978698619, |
|
"grad_norm": 136.8874053955078, |
|
"learning_rate": 0.00019285079928952045, |
|
"loss": 1.0431, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.11316358795140623, |
|
"grad_norm": 145.96075439453125, |
|
"learning_rate": 0.00019280639431616342, |
|
"loss": 0.8223, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.11382925611582626, |
|
"grad_norm": 95.09919738769531, |
|
"learning_rate": 0.00019276198934280642, |
|
"loss": 0.4007, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.1144949242802463, |
|
"grad_norm": 95.6208267211914, |
|
"learning_rate": 0.0001927175843694494, |
|
"loss": 0.4496, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.11516059244466634, |
|
"grad_norm": 131.4598846435547, |
|
"learning_rate": 0.00019267317939609239, |
|
"loss": 0.7203, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.11582626060908637, |
|
"grad_norm": 183.2200469970703, |
|
"learning_rate": 0.00019262877442273536, |
|
"loss": 0.8879, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.1164919287735064, |
|
"grad_norm": 65.30191802978516, |
|
"learning_rate": 0.00019258436944937836, |
|
"loss": 0.2161, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.11715759693792645, |
|
"grad_norm": 100.97784423828125, |
|
"learning_rate": 0.00019253996447602133, |
|
"loss": 0.8095, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.11782326510234647, |
|
"grad_norm": 57.705963134765625, |
|
"learning_rate": 0.00019249555950266432, |
|
"loss": 0.3389, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.11848893326676652, |
|
"grad_norm": 119.22857666015625, |
|
"learning_rate": 0.0001924511545293073, |
|
"loss": 0.5287, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.11915460143118656, |
|
"grad_norm": 97.59465026855469, |
|
"learning_rate": 0.0001924067495559503, |
|
"loss": 0.4567, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.11982026959560658, |
|
"grad_norm": 47.80794906616211, |
|
"learning_rate": 0.00019236234458259327, |
|
"loss": 0.3795, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.12048593776002663, |
|
"grad_norm": 93.19955444335938, |
|
"learning_rate": 0.00019231793960923626, |
|
"loss": 1.0376, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.12115160592444667, |
|
"grad_norm": 57.24619674682617, |
|
"learning_rate": 0.00019227353463587924, |
|
"loss": 0.3167, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.1218172740888667, |
|
"grad_norm": 71.7004165649414, |
|
"learning_rate": 0.00019222912966252223, |
|
"loss": 0.3515, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.12248294225328674, |
|
"grad_norm": 42.93682098388672, |
|
"learning_rate": 0.0001921847246891652, |
|
"loss": 0.3272, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.12314861041770678, |
|
"grad_norm": 88.22802734375, |
|
"learning_rate": 0.00019214031971580818, |
|
"loss": 0.5751, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.1238142785821268, |
|
"grad_norm": 88.80204772949219, |
|
"learning_rate": 0.00019209591474245117, |
|
"loss": 0.5306, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.12447994674654685, |
|
"grad_norm": 92.46751403808594, |
|
"learning_rate": 0.00019205150976909415, |
|
"loss": 0.4262, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.12514561491096687, |
|
"grad_norm": 55.72347640991211, |
|
"learning_rate": 0.00019200710479573714, |
|
"loss": 0.3939, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.12581128307538691, |
|
"grad_norm": 72.36625671386719, |
|
"learning_rate": 0.00019196269982238012, |
|
"loss": 0.3739, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.12647695123980696, |
|
"grad_norm": 96.56300354003906, |
|
"learning_rate": 0.00019191829484902311, |
|
"loss": 0.4574, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.127142619404227, |
|
"grad_norm": 122.78588104248047, |
|
"learning_rate": 0.00019187388987566609, |
|
"loss": 0.7646, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.12780828756864704, |
|
"grad_norm": 74.9114990234375, |
|
"learning_rate": 0.00019182948490230908, |
|
"loss": 0.2156, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.12847395573306705, |
|
"grad_norm": 61.17184829711914, |
|
"learning_rate": 0.00019178507992895206, |
|
"loss": 0.7067, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.1291396238974871, |
|
"grad_norm": 120.3477783203125, |
|
"learning_rate": 0.00019174067495559505, |
|
"loss": 0.7562, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.12980529206190713, |
|
"grad_norm": 204.0528564453125, |
|
"learning_rate": 0.00019169626998223802, |
|
"loss": 1.2893, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.13047096022632718, |
|
"grad_norm": 91.55975341796875, |
|
"learning_rate": 0.000191651865008881, |
|
"loss": 1.025, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.13113662839074722, |
|
"grad_norm": 111.15852355957031, |
|
"learning_rate": 0.000191607460035524, |
|
"loss": 0.4063, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.13180229655516726, |
|
"grad_norm": 90.34394073486328, |
|
"learning_rate": 0.00019156305506216697, |
|
"loss": 0.6954, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.13246796471958727, |
|
"grad_norm": 95.49404907226562, |
|
"learning_rate": 0.00019151865008880996, |
|
"loss": 0.6267, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.1331336328840073, |
|
"grad_norm": 41.02389144897461, |
|
"learning_rate": 0.00019147424511545294, |
|
"loss": 0.2385, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.13379930104842735, |
|
"grad_norm": 91.21604919433594, |
|
"learning_rate": 0.00019142984014209593, |
|
"loss": 0.4531, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.1344649692128474, |
|
"grad_norm": 49.900184631347656, |
|
"learning_rate": 0.0001913854351687389, |
|
"loss": 0.1657, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.13513063737726744, |
|
"grad_norm": 144.78623962402344, |
|
"learning_rate": 0.0001913410301953819, |
|
"loss": 0.7326, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.13579630554168748, |
|
"grad_norm": 143.92132568359375, |
|
"learning_rate": 0.00019129662522202487, |
|
"loss": 1.0342, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.1364619737061075, |
|
"grad_norm": 107.93486022949219, |
|
"learning_rate": 0.00019125222024866787, |
|
"loss": 0.3643, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.13712764187052753, |
|
"grad_norm": 69.4767074584961, |
|
"learning_rate": 0.00019120781527531084, |
|
"loss": 0.4302, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.13779331003494757, |
|
"grad_norm": 102.22624206542969, |
|
"learning_rate": 0.00019116341030195384, |
|
"loss": 0.7252, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.13845897819936762, |
|
"grad_norm": 39.15114974975586, |
|
"learning_rate": 0.00019111900532859681, |
|
"loss": 0.376, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.13912464636378766, |
|
"grad_norm": 65.12492370605469, |
|
"learning_rate": 0.00019107460035523979, |
|
"loss": 0.6005, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.1397903145282077, |
|
"grad_norm": 95.04318237304688, |
|
"learning_rate": 0.00019103019538188278, |
|
"loss": 0.8371, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.1404559826926277, |
|
"grad_norm": 119.12702941894531, |
|
"learning_rate": 0.00019098579040852576, |
|
"loss": 0.5701, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.14112165085704775, |
|
"grad_norm": 56.38259506225586, |
|
"learning_rate": 0.00019094138543516875, |
|
"loss": 0.6285, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.1417873190214678, |
|
"grad_norm": 29.10639762878418, |
|
"learning_rate": 0.00019089698046181172, |
|
"loss": 0.4639, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.14245298718588784, |
|
"grad_norm": 60.445865631103516, |
|
"learning_rate": 0.00019085257548845472, |
|
"loss": 0.6099, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.14311865535030788, |
|
"grad_norm": 83.33357238769531, |
|
"learning_rate": 0.0001908081705150977, |
|
"loss": 0.6992, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.14378432351472792, |
|
"grad_norm": 50.539451599121094, |
|
"learning_rate": 0.0001907637655417407, |
|
"loss": 0.673, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.14444999167914793, |
|
"grad_norm": 108.4279556274414, |
|
"learning_rate": 0.00019071936056838366, |
|
"loss": 0.7867, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.14511565984356797, |
|
"grad_norm": 71.25292205810547, |
|
"learning_rate": 0.00019067495559502666, |
|
"loss": 0.2414, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.14578132800798801, |
|
"grad_norm": 96.37142944335938, |
|
"learning_rate": 0.00019063055062166963, |
|
"loss": 0.4556, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.14644699617240806, |
|
"grad_norm": 105.84346008300781, |
|
"learning_rate": 0.00019058614564831263, |
|
"loss": 0.5288, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.1471126643368281, |
|
"grad_norm": 53.66656494140625, |
|
"learning_rate": 0.0001905417406749556, |
|
"loss": 0.2665, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.14777833250124814, |
|
"grad_norm": 73.43135070800781, |
|
"learning_rate": 0.00019049733570159857, |
|
"loss": 0.465, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.14844400066566815, |
|
"grad_norm": 108.94127655029297, |
|
"learning_rate": 0.00019045293072824157, |
|
"loss": 0.6516, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.1491096688300882, |
|
"grad_norm": 23.10163688659668, |
|
"learning_rate": 0.00019040852575488454, |
|
"loss": 0.1401, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.14977533699450823, |
|
"grad_norm": 182.62322998046875, |
|
"learning_rate": 0.00019036412078152754, |
|
"loss": 0.7953, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.15044100515892828, |
|
"grad_norm": 104.31300354003906, |
|
"learning_rate": 0.00019031971580817051, |
|
"loss": 0.5117, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.15110667332334832, |
|
"grad_norm": 122.88026428222656, |
|
"learning_rate": 0.0001902753108348135, |
|
"loss": 0.8029, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.15177234148776836, |
|
"grad_norm": 72.17605590820312, |
|
"learning_rate": 0.00019023090586145648, |
|
"loss": 0.3403, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.15243800965218837, |
|
"grad_norm": 55.70381164550781, |
|
"learning_rate": 0.00019018650088809948, |
|
"loss": 0.2811, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.1531036778166084, |
|
"grad_norm": 80.8816146850586, |
|
"learning_rate": 0.00019014209591474245, |
|
"loss": 1.0904, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.15376934598102845, |
|
"grad_norm": 77.0992431640625, |
|
"learning_rate": 0.00019009769094138545, |
|
"loss": 0.8617, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.1544350141454485, |
|
"grad_norm": 47.77476119995117, |
|
"learning_rate": 0.00019005328596802842, |
|
"loss": 0.3185, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.15510068230986854, |
|
"grad_norm": 77.98711395263672, |
|
"learning_rate": 0.00019000888099467142, |
|
"loss": 0.2655, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.15576635047428858, |
|
"grad_norm": 63.54255294799805, |
|
"learning_rate": 0.0001899644760213144, |
|
"loss": 0.3955, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.1564320186387086, |
|
"grad_norm": 190.26271057128906, |
|
"learning_rate": 0.00018992007104795736, |
|
"loss": 1.2886, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.15709768680312863, |
|
"grad_norm": 117.44766998291016, |
|
"learning_rate": 0.00018987566607460036, |
|
"loss": 0.7142, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.15776335496754867, |
|
"grad_norm": 133.50717163085938, |
|
"learning_rate": 0.00018983126110124333, |
|
"loss": 0.8898, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.15842902313196872, |
|
"grad_norm": 62.771507263183594, |
|
"learning_rate": 0.00018978685612788633, |
|
"loss": 0.3062, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.15909469129638876, |
|
"grad_norm": 74.36737060546875, |
|
"learning_rate": 0.0001897424511545293, |
|
"loss": 0.5336, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.1597603594608088, |
|
"grad_norm": 137.45458984375, |
|
"learning_rate": 0.0001896980461811723, |
|
"loss": 0.7025, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.1604260276252288, |
|
"grad_norm": 54.84712600708008, |
|
"learning_rate": 0.00018965364120781527, |
|
"loss": 0.338, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.16109169578964885, |
|
"grad_norm": 127.16575622558594, |
|
"learning_rate": 0.00018960923623445827, |
|
"loss": 1.0382, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.1617573639540689, |
|
"grad_norm": 93.72176361083984, |
|
"learning_rate": 0.00018956483126110124, |
|
"loss": 1.0084, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.16242303211848894, |
|
"grad_norm": 56.54580307006836, |
|
"learning_rate": 0.00018952042628774424, |
|
"loss": 0.5353, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.16308870028290898, |
|
"grad_norm": 76.96385955810547, |
|
"learning_rate": 0.0001894760213143872, |
|
"loss": 0.397, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.16375436844732902, |
|
"grad_norm": 91.15630340576172, |
|
"learning_rate": 0.00018943161634103018, |
|
"loss": 0.4533, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.16442003661174903, |
|
"grad_norm": 103.4432373046875, |
|
"learning_rate": 0.00018938721136767318, |
|
"loss": 0.6514, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.16508570477616907, |
|
"grad_norm": 60.81359100341797, |
|
"learning_rate": 0.00018934280639431615, |
|
"loss": 0.4884, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.16575137294058911, |
|
"grad_norm": 46.649139404296875, |
|
"learning_rate": 0.00018929840142095915, |
|
"loss": 0.271, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.16641704110500916, |
|
"grad_norm": 58.7072868347168, |
|
"learning_rate": 0.00018925399644760212, |
|
"loss": 0.27, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.1670827092694292, |
|
"grad_norm": 79.08338928222656, |
|
"learning_rate": 0.00018920959147424512, |
|
"loss": 0.4671, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.16774837743384924, |
|
"grad_norm": 98.57723236083984, |
|
"learning_rate": 0.0001891651865008881, |
|
"loss": 0.3235, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.16841404559826925, |
|
"grad_norm": 131.836181640625, |
|
"learning_rate": 0.0001891207815275311, |
|
"loss": 0.4688, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.1690797137626893, |
|
"grad_norm": 67.51493835449219, |
|
"learning_rate": 0.00018907637655417406, |
|
"loss": 0.6911, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.16974538192710933, |
|
"grad_norm": 51.76738357543945, |
|
"learning_rate": 0.00018903197158081706, |
|
"loss": 0.1837, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.17041105009152938, |
|
"grad_norm": 52.23995590209961, |
|
"learning_rate": 0.00018898756660746003, |
|
"loss": 0.71, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.17107671825594942, |
|
"grad_norm": 49.984336853027344, |
|
"learning_rate": 0.00018894316163410303, |
|
"loss": 0.3221, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.17174238642036946, |
|
"grad_norm": 48.08108901977539, |
|
"learning_rate": 0.000188898756660746, |
|
"loss": 0.3334, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.17240805458478947, |
|
"grad_norm": 24.143003463745117, |
|
"learning_rate": 0.00018885435168738897, |
|
"loss": 0.2096, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.1730737227492095, |
|
"grad_norm": 116.61510467529297, |
|
"learning_rate": 0.00018880994671403197, |
|
"loss": 0.8852, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.17373939091362955, |
|
"grad_norm": 41.168052673339844, |
|
"learning_rate": 0.00018876554174067494, |
|
"loss": 0.2541, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.1744050590780496, |
|
"grad_norm": 19.256343841552734, |
|
"learning_rate": 0.00018872113676731794, |
|
"loss": 0.2278, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.17507072724246964, |
|
"grad_norm": 51.543418884277344, |
|
"learning_rate": 0.0001886767317939609, |
|
"loss": 0.2494, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.17573639540688968, |
|
"grad_norm": 44.67826461791992, |
|
"learning_rate": 0.0001886323268206039, |
|
"loss": 0.3112, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.1764020635713097, |
|
"grad_norm": 41.30339431762695, |
|
"learning_rate": 0.00018858792184724688, |
|
"loss": 0.5114, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.17706773173572973, |
|
"grad_norm": 12.575431823730469, |
|
"learning_rate": 0.00018854351687388988, |
|
"loss": 0.0375, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.17773339990014977, |
|
"grad_norm": 71.91178894042969, |
|
"learning_rate": 0.00018849911190053285, |
|
"loss": 0.5989, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.17839906806456982, |
|
"grad_norm": 83.31620788574219, |
|
"learning_rate": 0.00018845470692717585, |
|
"loss": 0.51, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.17906473622898986, |
|
"grad_norm": 83.13764190673828, |
|
"learning_rate": 0.00018841030195381885, |
|
"loss": 1.1146, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.1797304043934099, |
|
"grad_norm": 151.9849853515625, |
|
"learning_rate": 0.00018836589698046182, |
|
"loss": 1.1219, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.1803960725578299, |
|
"grad_norm": 121.70146179199219, |
|
"learning_rate": 0.00018832149200710482, |
|
"loss": 0.9397, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.18106174072224995, |
|
"grad_norm": 51.62641906738281, |
|
"learning_rate": 0.0001882770870337478, |
|
"loss": 0.2968, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.18172740888667, |
|
"grad_norm": 66.82881164550781, |
|
"learning_rate": 0.0001882326820603908, |
|
"loss": 0.5365, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.18239307705109004, |
|
"grad_norm": 78.71537017822266, |
|
"learning_rate": 0.00018818827708703376, |
|
"loss": 0.8149, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.18305874521551008, |
|
"grad_norm": 137.4581756591797, |
|
"learning_rate": 0.00018814387211367676, |
|
"loss": 0.6777, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.18372441337993012, |
|
"grad_norm": 64.81407165527344, |
|
"learning_rate": 0.00018809946714031973, |
|
"loss": 0.7677, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.18439008154435013, |
|
"grad_norm": 96.37416076660156, |
|
"learning_rate": 0.00018805506216696273, |
|
"loss": 1.332, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.18505574970877017, |
|
"grad_norm": 47.11737823486328, |
|
"learning_rate": 0.0001880106571936057, |
|
"loss": 0.29, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.18572141787319021, |
|
"grad_norm": 111.5100326538086, |
|
"learning_rate": 0.0001879662522202487, |
|
"loss": 0.5466, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.18638708603761026, |
|
"grad_norm": 123.91747283935547, |
|
"learning_rate": 0.00018792184724689167, |
|
"loss": 0.6733, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.1870527542020303, |
|
"grad_norm": 107.9887924194336, |
|
"learning_rate": 0.00018787744227353467, |
|
"loss": 0.4885, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.18771842236645034, |
|
"grad_norm": 76.66865539550781, |
|
"learning_rate": 0.00018783303730017764, |
|
"loss": 0.6393, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.18838409053087035, |
|
"grad_norm": 67.36553192138672, |
|
"learning_rate": 0.00018778863232682064, |
|
"loss": 0.4087, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.1890497586952904, |
|
"grad_norm": 50.0120849609375, |
|
"learning_rate": 0.0001877442273534636, |
|
"loss": 0.3911, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.18971542685971043, |
|
"grad_norm": 109.88864135742188, |
|
"learning_rate": 0.00018769982238010658, |
|
"loss": 0.6727, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.19038109502413048, |
|
"grad_norm": 114.64707946777344, |
|
"learning_rate": 0.00018765541740674958, |
|
"loss": 0.8044, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.19104676318855052, |
|
"grad_norm": 130.69219970703125, |
|
"learning_rate": 0.00018761101243339255, |
|
"loss": 0.6119, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.19171243135297053, |
|
"grad_norm": 50.73808670043945, |
|
"learning_rate": 0.00018756660746003555, |
|
"loss": 0.4887, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.19237809951739057, |
|
"grad_norm": 143.2826385498047, |
|
"learning_rate": 0.00018752220248667852, |
|
"loss": 0.8228, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.1930437676818106, |
|
"grad_norm": 132.3501739501953, |
|
"learning_rate": 0.00018747779751332152, |
|
"loss": 0.808, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.19370943584623065, |
|
"grad_norm": 74.29531860351562, |
|
"learning_rate": 0.0001874333925399645, |
|
"loss": 0.7815, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.1943751040106507, |
|
"grad_norm": 81.61593627929688, |
|
"learning_rate": 0.00018738898756660749, |
|
"loss": 0.5732, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.19504077217507074, |
|
"grad_norm": 86.1830825805664, |
|
"learning_rate": 0.00018734458259325046, |
|
"loss": 0.5226, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.19570644033949075, |
|
"grad_norm": 23.495864868164062, |
|
"learning_rate": 0.00018730017761989346, |
|
"loss": 0.1476, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.1963721085039108, |
|
"grad_norm": 45.22040939331055, |
|
"learning_rate": 0.00018725577264653643, |
|
"loss": 0.4335, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.19703777666833083, |
|
"grad_norm": 102.33810424804688, |
|
"learning_rate": 0.0001872113676731794, |
|
"loss": 0.6778, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.19770344483275087, |
|
"grad_norm": 90.49777221679688, |
|
"learning_rate": 0.0001871669626998224, |
|
"loss": 0.7652, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.19836911299717092, |
|
"grad_norm": 14.38054084777832, |
|
"learning_rate": 0.00018712255772646537, |
|
"loss": 0.0598, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.19903478116159096, |
|
"grad_norm": 24.415199279785156, |
|
"learning_rate": 0.00018707815275310837, |
|
"loss": 0.3666, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.19970044932601097, |
|
"grad_norm": 37.869869232177734, |
|
"learning_rate": 0.00018703374777975134, |
|
"loss": 0.1511, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.200366117490431, |
|
"grad_norm": 115.4464111328125, |
|
"learning_rate": 0.00018698934280639434, |
|
"loss": 0.7441, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.20103178565485105, |
|
"grad_norm": 124.49742889404297, |
|
"learning_rate": 0.0001869449378330373, |
|
"loss": 0.6898, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.2016974538192711, |
|
"grad_norm": 45.529197692871094, |
|
"learning_rate": 0.0001869005328596803, |
|
"loss": 0.2891, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.20236312198369114, |
|
"grad_norm": 107.60458374023438, |
|
"learning_rate": 0.00018685612788632328, |
|
"loss": 0.6037, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.20302879014811118, |
|
"grad_norm": 72.92047119140625, |
|
"learning_rate": 0.00018681172291296628, |
|
"loss": 0.8364, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.2036944583125312, |
|
"grad_norm": 80.35264587402344, |
|
"learning_rate": 0.00018676731793960925, |
|
"loss": 0.6218, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.20436012647695123, |
|
"grad_norm": 181.0091552734375, |
|
"learning_rate": 0.00018672291296625225, |
|
"loss": 0.8943, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.20502579464137127, |
|
"grad_norm": 54.508758544921875, |
|
"learning_rate": 0.00018667850799289522, |
|
"loss": 0.3581, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.20569146280579131, |
|
"grad_norm": 22.76518440246582, |
|
"learning_rate": 0.0001866341030195382, |
|
"loss": 0.0707, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.20635713097021136, |
|
"grad_norm": 144.45138549804688, |
|
"learning_rate": 0.00018658969804618119, |
|
"loss": 1.199, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.2070227991346314, |
|
"grad_norm": 51.15176010131836, |
|
"learning_rate": 0.00018654529307282416, |
|
"loss": 0.2966, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.2076884672990514, |
|
"grad_norm": 84.0188980102539, |
|
"learning_rate": 0.00018650088809946716, |
|
"loss": 0.3478, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.20835413546347145, |
|
"grad_norm": 74.65904235839844, |
|
"learning_rate": 0.00018645648312611013, |
|
"loss": 0.5906, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.2090198036278915, |
|
"grad_norm": 76.40025329589844, |
|
"learning_rate": 0.00018641207815275313, |
|
"loss": 0.5313, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.20968547179231153, |
|
"grad_norm": 67.46442413330078, |
|
"learning_rate": 0.0001863676731793961, |
|
"loss": 0.5431, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.21035113995673158, |
|
"grad_norm": 153.51947021484375, |
|
"learning_rate": 0.0001863232682060391, |
|
"loss": 1.4613, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.21101680812115162, |
|
"grad_norm": 141.46417236328125, |
|
"learning_rate": 0.00018627886323268207, |
|
"loss": 0.6173, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.21168247628557163, |
|
"grad_norm": 117.94355773925781, |
|
"learning_rate": 0.00018623445825932506, |
|
"loss": 0.703, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.21234814444999167, |
|
"grad_norm": 72.00027465820312, |
|
"learning_rate": 0.00018619005328596804, |
|
"loss": 0.3786, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.2130138126144117, |
|
"grad_norm": 54.18503189086914, |
|
"learning_rate": 0.00018614564831261103, |
|
"loss": 0.3633, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.21367948077883175, |
|
"grad_norm": 121.49830627441406, |
|
"learning_rate": 0.000186101243339254, |
|
"loss": 0.7459, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.2143451489432518, |
|
"grad_norm": 176.50872802734375, |
|
"learning_rate": 0.00018605683836589698, |
|
"loss": 0.8166, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.21501081710767184, |
|
"grad_norm": 151.5113067626953, |
|
"learning_rate": 0.00018601243339253998, |
|
"loss": 0.7919, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 0.21567648527209185, |
|
"grad_norm": 84.63894653320312, |
|
"learning_rate": 0.00018596802841918295, |
|
"loss": 0.7106, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.2163421534365119, |
|
"grad_norm": 102.90968322753906, |
|
"learning_rate": 0.00018592362344582595, |
|
"loss": 0.689, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.21700782160093193, |
|
"grad_norm": 61.89885330200195, |
|
"learning_rate": 0.00018587921847246892, |
|
"loss": 0.3393, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.21767348976535197, |
|
"grad_norm": 106.49290466308594, |
|
"learning_rate": 0.00018583481349911191, |
|
"loss": 0.3928, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.21833915792977202, |
|
"grad_norm": 43.59565353393555, |
|
"learning_rate": 0.00018579040852575489, |
|
"loss": 0.3696, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.21900482609419206, |
|
"grad_norm": 67.05834197998047, |
|
"learning_rate": 0.00018574600355239788, |
|
"loss": 0.8861, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.21967049425861207, |
|
"grad_norm": 108.7596435546875, |
|
"learning_rate": 0.00018570159857904086, |
|
"loss": 1.1147, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.2203361624230321, |
|
"grad_norm": 45.871212005615234, |
|
"learning_rate": 0.00018565719360568385, |
|
"loss": 0.2634, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 0.22100183058745215, |
|
"grad_norm": 97.05673217773438, |
|
"learning_rate": 0.00018561278863232683, |
|
"loss": 0.7176, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.2216674987518722, |
|
"grad_norm": 37.64607238769531, |
|
"learning_rate": 0.00018556838365896982, |
|
"loss": 0.405, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.22233316691629224, |
|
"grad_norm": 82.15957641601562, |
|
"learning_rate": 0.0001855239786856128, |
|
"loss": 0.3589, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.22299883508071228, |
|
"grad_norm": 48.32238006591797, |
|
"learning_rate": 0.00018547957371225577, |
|
"loss": 0.2946, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.2236645032451323, |
|
"grad_norm": 33.3898811340332, |
|
"learning_rate": 0.00018543516873889876, |
|
"loss": 0.388, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.22433017140955233, |
|
"grad_norm": 42.87260818481445, |
|
"learning_rate": 0.00018539076376554174, |
|
"loss": 0.2302, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 0.22499583957397237, |
|
"grad_norm": 58.45392608642578, |
|
"learning_rate": 0.00018534635879218473, |
|
"loss": 0.6616, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.22566150773839241, |
|
"grad_norm": 28.114885330200195, |
|
"learning_rate": 0.0001853019538188277, |
|
"loss": 0.1042, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 0.22632717590281246, |
|
"grad_norm": 51.35744857788086, |
|
"learning_rate": 0.0001852575488454707, |
|
"loss": 0.4782, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.2269928440672325, |
|
"grad_norm": 82.15337371826172, |
|
"learning_rate": 0.00018521314387211368, |
|
"loss": 0.3746, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 0.2276585122316525, |
|
"grad_norm": 109.32012176513672, |
|
"learning_rate": 0.00018516873889875667, |
|
"loss": 0.6589, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.22832418039607255, |
|
"grad_norm": 155.4207000732422, |
|
"learning_rate": 0.00018512433392539965, |
|
"loss": 1.1968, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 0.2289898485604926, |
|
"grad_norm": 33.03417205810547, |
|
"learning_rate": 0.00018507992895204264, |
|
"loss": 0.3845, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.22965551672491263, |
|
"grad_norm": 37.060585021972656, |
|
"learning_rate": 0.00018503552397868561, |
|
"loss": 0.1341, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.23032118488933268, |
|
"grad_norm": 106.52214050292969, |
|
"learning_rate": 0.00018499111900532859, |
|
"loss": 0.7823, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 0.23098685305375272, |
|
"grad_norm": 48.00297164916992, |
|
"learning_rate": 0.00018494671403197158, |
|
"loss": 0.6375, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 0.23165252121817273, |
|
"grad_norm": 51.31806564331055, |
|
"learning_rate": 0.00018490230905861456, |
|
"loss": 0.4176, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.23231818938259277, |
|
"grad_norm": 57.65751647949219, |
|
"learning_rate": 0.00018485790408525755, |
|
"loss": 0.2368, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 0.2329838575470128, |
|
"grad_norm": 46.041908264160156, |
|
"learning_rate": 0.00018481349911190053, |
|
"loss": 0.4219, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.23364952571143285, |
|
"grad_norm": 97.23726654052734, |
|
"learning_rate": 0.00018476909413854352, |
|
"loss": 0.8618, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 0.2343151938758529, |
|
"grad_norm": 39.60933303833008, |
|
"learning_rate": 0.0001847246891651865, |
|
"loss": 0.2446, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.23498086204027294, |
|
"grad_norm": 83.39789581298828, |
|
"learning_rate": 0.0001846802841918295, |
|
"loss": 0.5283, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 0.23564653020469295, |
|
"grad_norm": 23.28626823425293, |
|
"learning_rate": 0.00018463587921847246, |
|
"loss": 0.2269, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 0.236312198369113, |
|
"grad_norm": 75.43800354003906, |
|
"learning_rate": 0.00018459147424511546, |
|
"loss": 0.5959, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.23697786653353303, |
|
"grad_norm": 33.72787094116211, |
|
"learning_rate": 0.00018454706927175843, |
|
"loss": 0.2795, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.23764353469795307, |
|
"grad_norm": 56.4429817199707, |
|
"learning_rate": 0.00018450266429840143, |
|
"loss": 0.6191, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 0.23830920286237312, |
|
"grad_norm": 97.20005798339844, |
|
"learning_rate": 0.0001844582593250444, |
|
"loss": 0.6252, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 0.23897487102679316, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0001844582593250444, |
|
"loss": 1.1109, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 0.23964053919121317, |
|
"grad_norm": 103.0977783203125, |
|
"learning_rate": 0.00018441385435168738, |
|
"loss": 0.5326, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.2403062073556332, |
|
"grad_norm": 33.12345504760742, |
|
"learning_rate": 0.00018436944937833037, |
|
"loss": 0.4465, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 0.24097187552005325, |
|
"grad_norm": 29.757606506347656, |
|
"learning_rate": 0.00018432504440497335, |
|
"loss": 0.3853, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 0.2416375436844733, |
|
"grad_norm": 52.76167678833008, |
|
"learning_rate": 0.00018428063943161634, |
|
"loss": 0.3521, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 0.24230321184889334, |
|
"grad_norm": 80.51199340820312, |
|
"learning_rate": 0.00018423623445825931, |
|
"loss": 0.4551, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.24296888001331338, |
|
"grad_norm": 47.3682861328125, |
|
"learning_rate": 0.0001841918294849023, |
|
"loss": 0.2393, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.2436345481777334, |
|
"grad_norm": 80.18375396728516, |
|
"learning_rate": 0.00018414742451154528, |
|
"loss": 0.9122, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.24430021634215343, |
|
"grad_norm": 65.6572265625, |
|
"learning_rate": 0.00018410301953818828, |
|
"loss": 0.3345, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 0.24496588450657347, |
|
"grad_norm": 108.21941375732422, |
|
"learning_rate": 0.00018405861456483125, |
|
"loss": 0.5405, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.2456315526709935, |
|
"grad_norm": 57.63541030883789, |
|
"learning_rate": 0.00018401420959147425, |
|
"loss": 0.2502, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.24629722083541356, |
|
"grad_norm": 35.54652786254883, |
|
"learning_rate": 0.00018396980461811722, |
|
"loss": 0.5846, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.2469628889998336, |
|
"grad_norm": 55.58504867553711, |
|
"learning_rate": 0.00018392539964476022, |
|
"loss": 0.2154, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.2476285571642536, |
|
"grad_norm": 55.62077331542969, |
|
"learning_rate": 0.0001838809946714032, |
|
"loss": 0.4984, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.24829422532867365, |
|
"grad_norm": 87.00872039794922, |
|
"learning_rate": 0.0001838365896980462, |
|
"loss": 0.4538, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.2489598934930937, |
|
"grad_norm": 46.597068786621094, |
|
"learning_rate": 0.00018379218472468916, |
|
"loss": 0.648, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.24962556165751373, |
|
"grad_norm": 99.84630584716797, |
|
"learning_rate": 0.00018374777975133216, |
|
"loss": 0.7931, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.25029122982193375, |
|
"grad_norm": 54.02254104614258, |
|
"learning_rate": 0.00018370337477797513, |
|
"loss": 0.2499, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.2509568979863538, |
|
"grad_norm": 29.319000244140625, |
|
"learning_rate": 0.00018365896980461813, |
|
"loss": 0.4007, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.25162256615077383, |
|
"grad_norm": 113.27352905273438, |
|
"learning_rate": 0.0001836145648312611, |
|
"loss": 0.7379, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.25228823431519387, |
|
"grad_norm": 112.12903594970703, |
|
"learning_rate": 0.0001835701598579041, |
|
"loss": 0.6326, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 0.2529539024796139, |
|
"grad_norm": 123.96865844726562, |
|
"learning_rate": 0.00018352575488454707, |
|
"loss": 0.5521, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.25361957064403395, |
|
"grad_norm": 42.07058334350586, |
|
"learning_rate": 0.00018348134991119007, |
|
"loss": 0.4259, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.254285238808454, |
|
"grad_norm": 55.002960205078125, |
|
"learning_rate": 0.00018343694493783304, |
|
"loss": 0.4797, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.25495090697287404, |
|
"grad_norm": 33.028038024902344, |
|
"learning_rate": 0.00018339253996447604, |
|
"loss": 0.2893, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.2556165751372941, |
|
"grad_norm": 54.003570556640625, |
|
"learning_rate": 0.000183348134991119, |
|
"loss": 0.4897, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.2562822433017141, |
|
"grad_norm": 80.12110137939453, |
|
"learning_rate": 0.000183303730017762, |
|
"loss": 0.2902, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.2569479114661341, |
|
"grad_norm": 46.586753845214844, |
|
"learning_rate": 0.00018325932504440498, |
|
"loss": 0.3721, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.25761357963055415, |
|
"grad_norm": 45.596553802490234, |
|
"learning_rate": 0.00018321492007104798, |
|
"loss": 0.2719, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.2582792477949742, |
|
"grad_norm": 90.06937408447266, |
|
"learning_rate": 0.00018317051509769095, |
|
"loss": 0.6424, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.25894491595939423, |
|
"grad_norm": 44.34123611450195, |
|
"learning_rate": 0.00018312611012433395, |
|
"loss": 0.2926, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 0.25961058412381427, |
|
"grad_norm": 123.05774688720703, |
|
"learning_rate": 0.00018308170515097692, |
|
"loss": 0.8703, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.2602762522882343, |
|
"grad_norm": 132.67784118652344, |
|
"learning_rate": 0.00018303730017761992, |
|
"loss": 0.492, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 0.26094192045265435, |
|
"grad_norm": 65.75527954101562, |
|
"learning_rate": 0.0001829928952042629, |
|
"loss": 0.2371, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.2616075886170744, |
|
"grad_norm": 70.54750061035156, |
|
"learning_rate": 0.0001829484902309059, |
|
"loss": 0.5192, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.26227325678149443, |
|
"grad_norm": 85.00897216796875, |
|
"learning_rate": 0.00018290408525754886, |
|
"loss": 0.7114, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.2629389249459145, |
|
"grad_norm": 96.78849792480469, |
|
"learning_rate": 0.00018285968028419186, |
|
"loss": 0.4515, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.2636045931103345, |
|
"grad_norm": 21.13523292541504, |
|
"learning_rate": 0.00018281527531083483, |
|
"loss": 0.3526, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.26427026127475456, |
|
"grad_norm": 29.27351188659668, |
|
"learning_rate": 0.0001827708703374778, |
|
"loss": 0.5151, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 0.26493592943917454, |
|
"grad_norm": 35.63692855834961, |
|
"learning_rate": 0.0001827264653641208, |
|
"loss": 0.2374, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.2656015976035946, |
|
"grad_norm": 66.08139038085938, |
|
"learning_rate": 0.00018268206039076377, |
|
"loss": 0.4133, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.2662672657680146, |
|
"grad_norm": 34.3465461730957, |
|
"learning_rate": 0.00018263765541740677, |
|
"loss": 0.3082, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.26693293393243467, |
|
"grad_norm": 100.62875366210938, |
|
"learning_rate": 0.00018259325044404974, |
|
"loss": 0.6884, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 0.2675986020968547, |
|
"grad_norm": 139.31932067871094, |
|
"learning_rate": 0.00018254884547069274, |
|
"loss": 1.0101, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 0.26826427026127475, |
|
"grad_norm": 118.92940521240234, |
|
"learning_rate": 0.0001825044404973357, |
|
"loss": 0.9706, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 0.2689299384256948, |
|
"grad_norm": 151.63421630859375, |
|
"learning_rate": 0.0001824600355239787, |
|
"loss": 0.961, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.26959560659011483, |
|
"grad_norm": 27.14082145690918, |
|
"learning_rate": 0.00018241563055062168, |
|
"loss": 0.227, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.2702612747545349, |
|
"grad_norm": 80.59782409667969, |
|
"learning_rate": 0.00018237122557726468, |
|
"loss": 0.6828, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 0.2709269429189549, |
|
"grad_norm": 49.46958541870117, |
|
"learning_rate": 0.00018232682060390765, |
|
"loss": 0.3184, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 0.27159261108337496, |
|
"grad_norm": 73.98738098144531, |
|
"learning_rate": 0.00018228241563055065, |
|
"loss": 0.6858, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.272258279247795, |
|
"grad_norm": 86.27637481689453, |
|
"learning_rate": 0.00018223801065719362, |
|
"loss": 0.5358, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 0.272923947412215, |
|
"grad_norm": 145.42340087890625, |
|
"learning_rate": 0.0001821936056838366, |
|
"loss": 1.0992, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.273589615576635, |
|
"grad_norm": 41.17599105834961, |
|
"learning_rate": 0.0001821492007104796, |
|
"loss": 0.369, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 0.27425528374105507, |
|
"grad_norm": 55.49694061279297, |
|
"learning_rate": 0.00018210479573712256, |
|
"loss": 0.2842, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 0.2749209519054751, |
|
"grad_norm": 47.01858139038086, |
|
"learning_rate": 0.00018206039076376556, |
|
"loss": 0.3368, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 0.27558662006989515, |
|
"grad_norm": 40.550880432128906, |
|
"learning_rate": 0.00018201598579040853, |
|
"loss": 0.2099, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 0.2762522882343152, |
|
"grad_norm": 38.81869125366211, |
|
"learning_rate": 0.00018197158081705153, |
|
"loss": 0.28, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.27691795639873523, |
|
"grad_norm": 65.78675842285156, |
|
"learning_rate": 0.0001819271758436945, |
|
"loss": 0.521, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 0.2775836245631553, |
|
"grad_norm": 73.13219451904297, |
|
"learning_rate": 0.0001818827708703375, |
|
"loss": 0.7937, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 0.2782492927275753, |
|
"grad_norm": 65.54695129394531, |
|
"learning_rate": 0.00018183836589698047, |
|
"loss": 0.4079, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 0.27891496089199536, |
|
"grad_norm": 34.30622863769531, |
|
"learning_rate": 0.00018179396092362347, |
|
"loss": 0.0964, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 0.2795806290564154, |
|
"grad_norm": 67.12702941894531, |
|
"learning_rate": 0.00018174955595026644, |
|
"loss": 1.4374, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.28024629722083544, |
|
"grad_norm": 33.8336181640625, |
|
"learning_rate": 0.00018170515097690944, |
|
"loss": 0.1509, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 0.2809119653852554, |
|
"grad_norm": 66.7548599243164, |
|
"learning_rate": 0.0001816607460035524, |
|
"loss": 0.2623, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 0.28157763354967547, |
|
"grad_norm": 57.463253021240234, |
|
"learning_rate": 0.00018161634103019538, |
|
"loss": 0.3713, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 0.2822433017140955, |
|
"grad_norm": 32.50680923461914, |
|
"learning_rate": 0.00018157193605683838, |
|
"loss": 0.1945, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.28290896987851555, |
|
"grad_norm": 72.0036849975586, |
|
"learning_rate": 0.00018152753108348135, |
|
"loss": 0.3324, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.2835746380429356, |
|
"grad_norm": 38.52781295776367, |
|
"learning_rate": 0.00018148312611012435, |
|
"loss": 0.185, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 0.28424030620735563, |
|
"grad_norm": 74.13616943359375, |
|
"learning_rate": 0.00018143872113676732, |
|
"loss": 0.2684, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 0.28490597437177567, |
|
"grad_norm": 76.92218017578125, |
|
"learning_rate": 0.00018139431616341032, |
|
"loss": 0.1456, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 0.2855716425361957, |
|
"grad_norm": 131.1004638671875, |
|
"learning_rate": 0.0001813499111900533, |
|
"loss": 0.6252, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 0.28623731070061575, |
|
"grad_norm": 112.32926940917969, |
|
"learning_rate": 0.0001813055062166963, |
|
"loss": 0.8337, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.2869029788650358, |
|
"grad_norm": 125.7912368774414, |
|
"learning_rate": 0.00018126110124333926, |
|
"loss": 0.3741, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 0.28756864702945584, |
|
"grad_norm": 164.3785858154297, |
|
"learning_rate": 0.00018121669626998226, |
|
"loss": 0.9616, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.2882343151938759, |
|
"grad_norm": 191.42367553710938, |
|
"learning_rate": 0.00018117229129662523, |
|
"loss": 0.8257, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 0.28889998335829586, |
|
"grad_norm": 79.08208465576172, |
|
"learning_rate": 0.00018112788632326823, |
|
"loss": 0.5202, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 0.2895656515227159, |
|
"grad_norm": 109.46979522705078, |
|
"learning_rate": 0.0001810834813499112, |
|
"loss": 0.941, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.29023131968713595, |
|
"grad_norm": 74.53866577148438, |
|
"learning_rate": 0.00018103907637655417, |
|
"loss": 0.7279, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 0.290896987851556, |
|
"grad_norm": 137.80848693847656, |
|
"learning_rate": 0.00018099467140319717, |
|
"loss": 1.4923, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 0.29156265601597603, |
|
"grad_norm": 126.38627624511719, |
|
"learning_rate": 0.00018095026642984014, |
|
"loss": 0.721, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 0.29222832418039607, |
|
"grad_norm": 98.57434844970703, |
|
"learning_rate": 0.00018090586145648314, |
|
"loss": 0.5846, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 0.2928939923448161, |
|
"grad_norm": 33.60892868041992, |
|
"learning_rate": 0.0001808614564831261, |
|
"loss": 0.4007, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.29355966050923615, |
|
"grad_norm": 84.4782943725586, |
|
"learning_rate": 0.0001808170515097691, |
|
"loss": 0.6742, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 0.2942253286736562, |
|
"grad_norm": 78.77970886230469, |
|
"learning_rate": 0.00018077264653641208, |
|
"loss": 0.5823, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 0.29489099683807624, |
|
"grad_norm": 157.4933319091797, |
|
"learning_rate": 0.00018072824156305508, |
|
"loss": 0.7795, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 0.2955566650024963, |
|
"grad_norm": 18.716060638427734, |
|
"learning_rate": 0.00018068383658969805, |
|
"loss": 0.3715, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 0.2962223331669163, |
|
"grad_norm": 85.15080261230469, |
|
"learning_rate": 0.00018063943161634105, |
|
"loss": 0.606, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.2968880013313363, |
|
"grad_norm": 83.07491302490234, |
|
"learning_rate": 0.00018059502664298402, |
|
"loss": 0.4614, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 0.29755366949575635, |
|
"grad_norm": 52.511085510253906, |
|
"learning_rate": 0.000180550621669627, |
|
"loss": 0.4219, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 0.2982193376601764, |
|
"grad_norm": 126.2627944946289, |
|
"learning_rate": 0.00018050621669627, |
|
"loss": 0.6889, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 0.29888500582459643, |
|
"grad_norm": 73.53312683105469, |
|
"learning_rate": 0.00018046181172291296, |
|
"loss": 0.577, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 0.29955067398901647, |
|
"grad_norm": 103.77356719970703, |
|
"learning_rate": 0.00018041740674955596, |
|
"loss": 0.7924, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.3002163421534365, |
|
"grad_norm": 89.16803741455078, |
|
"learning_rate": 0.00018037300177619893, |
|
"loss": 0.5117, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 0.30088201031785655, |
|
"grad_norm": 116.20187377929688, |
|
"learning_rate": 0.00018032859680284193, |
|
"loss": 0.6893, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 0.3015476784822766, |
|
"grad_norm": 130.5649871826172, |
|
"learning_rate": 0.0001802841918294849, |
|
"loss": 0.6782, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 0.30221334664669663, |
|
"grad_norm": 71.50609588623047, |
|
"learning_rate": 0.0001802397868561279, |
|
"loss": 0.4608, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 0.3028790148111167, |
|
"grad_norm": 101.877197265625, |
|
"learning_rate": 0.00018019538188277087, |
|
"loss": 0.5512, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.3035446829755367, |
|
"grad_norm": 135.80545043945312, |
|
"learning_rate": 0.00018015097690941387, |
|
"loss": 0.6684, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 0.30421035113995676, |
|
"grad_norm": 83.38865661621094, |
|
"learning_rate": 0.00018010657193605684, |
|
"loss": 0.4582, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 0.30487601930437674, |
|
"grad_norm": 93.57701110839844, |
|
"learning_rate": 0.00018006216696269984, |
|
"loss": 0.7277, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 0.3055416874687968, |
|
"grad_norm": 72.99111938476562, |
|
"learning_rate": 0.0001800177619893428, |
|
"loss": 0.5737, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 0.3062073556332168, |
|
"grad_norm": 64.9205551147461, |
|
"learning_rate": 0.00017997335701598578, |
|
"loss": 0.3831, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.30687302379763687, |
|
"grad_norm": 36.749935150146484, |
|
"learning_rate": 0.00017992895204262878, |
|
"loss": 0.4547, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 0.3075386919620569, |
|
"grad_norm": 77.0653076171875, |
|
"learning_rate": 0.00017988454706927175, |
|
"loss": 0.8629, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 0.30820436012647695, |
|
"grad_norm": 48.22278594970703, |
|
"learning_rate": 0.00017984014209591475, |
|
"loss": 0.3634, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 0.308870028290897, |
|
"grad_norm": 39.0225715637207, |
|
"learning_rate": 0.00017979573712255772, |
|
"loss": 0.317, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 0.30953569645531703, |
|
"grad_norm": 68.747314453125, |
|
"learning_rate": 0.00017975133214920072, |
|
"loss": 0.3624, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.3102013646197371, |
|
"grad_norm": 71.55371856689453, |
|
"learning_rate": 0.0001797069271758437, |
|
"loss": 0.3992, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 0.3108670327841571, |
|
"grad_norm": 36.87495422363281, |
|
"learning_rate": 0.00017966252220248669, |
|
"loss": 0.2098, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 0.31153270094857716, |
|
"grad_norm": 10.664894104003906, |
|
"learning_rate": 0.00017961811722912966, |
|
"loss": 0.0542, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 0.3121983691129972, |
|
"grad_norm": 34.32514953613281, |
|
"learning_rate": 0.00017957371225577265, |
|
"loss": 0.3147, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 0.3128640372774172, |
|
"grad_norm": 24.026748657226562, |
|
"learning_rate": 0.00017952930728241563, |
|
"loss": 0.0797, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.3135297054418372, |
|
"grad_norm": 23.764530181884766, |
|
"learning_rate": 0.00017948490230905862, |
|
"loss": 0.0965, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 0.31419537360625727, |
|
"grad_norm": 5.237178325653076, |
|
"learning_rate": 0.0001794404973357016, |
|
"loss": 0.0106, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 0.3148610417706773, |
|
"grad_norm": 20.264196395874023, |
|
"learning_rate": 0.0001793960923623446, |
|
"loss": 0.0546, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 0.31552670993509735, |
|
"grad_norm": 79.56272888183594, |
|
"learning_rate": 0.00017935168738898757, |
|
"loss": 0.5564, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 0.3161923780995174, |
|
"grad_norm": 37.59955596923828, |
|
"learning_rate": 0.00017930728241563056, |
|
"loss": 0.3443, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.31685804626393743, |
|
"grad_norm": 49.22467803955078, |
|
"learning_rate": 0.00017926287744227354, |
|
"loss": 0.3749, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 0.3175237144283575, |
|
"grad_norm": 175.10850524902344, |
|
"learning_rate": 0.00017921847246891653, |
|
"loss": 1.7856, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 0.3181893825927775, |
|
"grad_norm": 47.71009826660156, |
|
"learning_rate": 0.0001791740674955595, |
|
"loss": 0.7082, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 0.31885505075719756, |
|
"grad_norm": 82.27182006835938, |
|
"learning_rate": 0.0001791296625222025, |
|
"loss": 0.9351, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 0.3195207189216176, |
|
"grad_norm": 79.33952331542969, |
|
"learning_rate": 0.00017908525754884547, |
|
"loss": 0.3707, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.3201863870860376, |
|
"grad_norm": 88.69058227539062, |
|
"learning_rate": 0.00017904085257548847, |
|
"loss": 0.85, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 0.3208520552504576, |
|
"grad_norm": 110.95427703857422, |
|
"learning_rate": 0.00017899644760213144, |
|
"loss": 0.3507, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 0.32151772341487767, |
|
"grad_norm": 101.72648620605469, |
|
"learning_rate": 0.00017895204262877444, |
|
"loss": 0.8123, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 0.3221833915792977, |
|
"grad_norm": 172.9845733642578, |
|
"learning_rate": 0.00017890763765541741, |
|
"loss": 1.1193, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 0.32284905974371775, |
|
"grad_norm": 40.887081146240234, |
|
"learning_rate": 0.0001788632326820604, |
|
"loss": 0.1791, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.3235147279081378, |
|
"grad_norm": 89.45999145507812, |
|
"learning_rate": 0.00017881882770870338, |
|
"loss": 0.6104, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 0.32418039607255783, |
|
"grad_norm": 101.5306167602539, |
|
"learning_rate": 0.00017877442273534638, |
|
"loss": 0.5197, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 0.32484606423697787, |
|
"grad_norm": 25.401044845581055, |
|
"learning_rate": 0.00017873001776198935, |
|
"loss": 0.2328, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 0.3255117324013979, |
|
"grad_norm": 122.61588287353516, |
|
"learning_rate": 0.00017868561278863235, |
|
"loss": 0.9357, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 0.32617740056581795, |
|
"grad_norm": 51.50986099243164, |
|
"learning_rate": 0.00017864120781527532, |
|
"loss": 0.506, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.326843068730238, |
|
"grad_norm": 64.19971466064453, |
|
"learning_rate": 0.00017859680284191832, |
|
"loss": 0.2713, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 0.32750873689465804, |
|
"grad_norm": 19.61275863647461, |
|
"learning_rate": 0.0001785523978685613, |
|
"loss": 0.1013, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 0.328174405059078, |
|
"grad_norm": 66.36785125732422, |
|
"learning_rate": 0.0001785079928952043, |
|
"loss": 0.3726, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 0.32884007322349806, |
|
"grad_norm": 87.43013763427734, |
|
"learning_rate": 0.00017846358792184726, |
|
"loss": 0.4851, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 0.3295057413879181, |
|
"grad_norm": 63.96004867553711, |
|
"learning_rate": 0.00017841918294849026, |
|
"loss": 0.2111, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.33017140955233815, |
|
"grad_norm": 81.01036834716797, |
|
"learning_rate": 0.00017837477797513323, |
|
"loss": 0.6494, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 0.3308370777167582, |
|
"grad_norm": 63.75148010253906, |
|
"learning_rate": 0.0001783303730017762, |
|
"loss": 0.3041, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 0.33150274588117823, |
|
"grad_norm": 91.487060546875, |
|
"learning_rate": 0.0001782859680284192, |
|
"loss": 0.689, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 0.33216841404559827, |
|
"grad_norm": 58.02032470703125, |
|
"learning_rate": 0.00017824156305506217, |
|
"loss": 0.5346, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 0.3328340822100183, |
|
"grad_norm": 73.02722930908203, |
|
"learning_rate": 0.00017819715808170517, |
|
"loss": 0.541, |
|
"step": 500 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 4506, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 9.22820987830272e+16, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|