{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 20, "global_step": 376, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0, "eval_accuracy": 0.7505882352941177, "eval_f1": 0.12396694214876033, "eval_loss": 0.6111783981323242, "eval_precision": 0.36585365853658536, "eval_recall": 0.07462686567164178, "eval_runtime": 34.3787, "eval_samples_per_second": 6.516, "eval_steps_per_second": 0.204, "step": 0 }, { "epoch": 0.0026595744680851063, "grad_norm": 3.310136556625366, "learning_rate": 5.263157894736843e-07, "loss": 0.6542, "step": 1 }, { "epoch": 0.005319148936170213, "grad_norm": 2.5591301918029785, "learning_rate": 1.0526315789473685e-06, "loss": 0.6609, "step": 2 }, { "epoch": 0.007978723404255319, "grad_norm": 2.7341604232788086, "learning_rate": 1.5789473684210526e-06, "loss": 0.6752, "step": 3 }, { "epoch": 0.010638297872340425, "grad_norm": 2.8091554641723633, "learning_rate": 2.105263157894737e-06, "loss": 0.618, "step": 4 }, { "epoch": 0.013297872340425532, "grad_norm": 2.5653722286224365, "learning_rate": 2.631578947368421e-06, "loss": 0.6752, "step": 5 }, { "epoch": 0.015957446808510637, "grad_norm": 3.4398417472839355, "learning_rate": 3.157894736842105e-06, "loss": 0.6783, "step": 6 }, { "epoch": 0.018617021276595744, "grad_norm": 2.5178332328796387, "learning_rate": 3.6842105263157896e-06, "loss": 0.631, "step": 7 }, { "epoch": 0.02127659574468085, "grad_norm": 2.8207452297210693, "learning_rate": 4.210526315789474e-06, "loss": 0.6361, "step": 8 }, { "epoch": 0.023936170212765957, "grad_norm": 2.5525949001312256, "learning_rate": 4.736842105263158e-06, "loss": 0.6088, "step": 9 }, { "epoch": 0.026595744680851064, "grad_norm": 2.415248155593872, "learning_rate": 5.263157894736842e-06, "loss": 0.6556, "step": 10 }, { "epoch": 0.02925531914893617, "grad_norm": 3.0247888565063477, "learning_rate": 5.789473684210527e-06, "loss": 0.6039, "step": 11 }, { "epoch": 0.031914893617021274, "grad_norm": 2.5060417652130127, "learning_rate": 6.31578947368421e-06, "loss": 0.6012, "step": 12 }, { "epoch": 0.034574468085106384, "grad_norm": 2.770636558532715, "learning_rate": 6.842105263157896e-06, "loss": 0.6775, "step": 13 }, { "epoch": 0.03723404255319149, "grad_norm": 2.582097291946411, "learning_rate": 7.368421052631579e-06, "loss": 0.5993, "step": 14 }, { "epoch": 0.0398936170212766, "grad_norm": 2.5408666133880615, "learning_rate": 7.894736842105265e-06, "loss": 0.596, "step": 15 }, { "epoch": 0.0425531914893617, "grad_norm": 2.478731870651245, "learning_rate": 8.421052631578948e-06, "loss": 0.5919, "step": 16 }, { "epoch": 0.04521276595744681, "grad_norm": 2.5440561771392822, "learning_rate": 8.947368421052632e-06, "loss": 0.5223, "step": 17 }, { "epoch": 0.047872340425531915, "grad_norm": 2.1620945930480957, "learning_rate": 9.473684210526315e-06, "loss": 0.5412, "step": 18 }, { "epoch": 0.05053191489361702, "grad_norm": 2.2398860454559326, "learning_rate": 1e-05, "loss": 0.5847, "step": 19 }, { "epoch": 0.05319148936170213, "grad_norm": 2.1689343452453613, "learning_rate": 1.0526315789473684e-05, "loss": 0.5265, "step": 20 }, { "epoch": 0.05319148936170213, "eval_accuracy": 0.7647058823529411, "eval_f1": 0.05660377358490566, "eval_loss": 0.6081312894821167, "eval_precision": 0.5454545454545454, "eval_recall": 0.029850746268656716, "eval_runtime": 34.9953, "eval_samples_per_second": 6.401, "eval_steps_per_second": 0.2, "step": 20 }, { "epoch": 0.05585106382978723, "grad_norm": 1.8998128175735474, "learning_rate": 1.105263157894737e-05, "loss": 0.5347, "step": 21 }, { "epoch": 0.05851063829787234, "grad_norm": 2.079780340194702, "learning_rate": 1.1578947368421053e-05, "loss": 0.4814, "step": 22 }, { "epoch": 0.061170212765957445, "grad_norm": 1.8792980909347534, "learning_rate": 1.2105263157894737e-05, "loss": 0.5084, "step": 23 }, { "epoch": 0.06382978723404255, "grad_norm": 1.9132519960403442, "learning_rate": 1.263157894736842e-05, "loss": 0.5027, "step": 24 }, { "epoch": 0.06648936170212766, "grad_norm": 1.3962018489837646, "learning_rate": 1.3157894736842108e-05, "loss": 0.5136, "step": 25 }, { "epoch": 0.06914893617021277, "grad_norm": 1.4877433776855469, "learning_rate": 1.3684210526315791e-05, "loss": 0.4567, "step": 26 }, { "epoch": 0.07180851063829788, "grad_norm": 1.5485683679580688, "learning_rate": 1.4210526315789475e-05, "loss": 0.4365, "step": 27 }, { "epoch": 0.07446808510638298, "grad_norm": 1.164844036102295, "learning_rate": 1.4736842105263159e-05, "loss": 0.4142, "step": 28 }, { "epoch": 0.07712765957446809, "grad_norm": 1.354490876197815, "learning_rate": 1.5263157894736846e-05, "loss": 0.4492, "step": 29 }, { "epoch": 0.0797872340425532, "grad_norm": 1.067051649093628, "learning_rate": 1.578947368421053e-05, "loss": 0.4294, "step": 30 }, { "epoch": 0.08244680851063829, "grad_norm": 1.3097209930419922, "learning_rate": 1.6315789473684213e-05, "loss": 0.452, "step": 31 }, { "epoch": 0.0851063829787234, "grad_norm": 0.9226462244987488, "learning_rate": 1.6842105263157896e-05, "loss": 0.3848, "step": 32 }, { "epoch": 0.08776595744680851, "grad_norm": 1.1755656003952026, "learning_rate": 1.736842105263158e-05, "loss": 0.4307, "step": 33 }, { "epoch": 0.09042553191489362, "grad_norm": 1.2210921049118042, "learning_rate": 1.7894736842105264e-05, "loss": 0.4232, "step": 34 }, { "epoch": 0.09308510638297872, "grad_norm": 0.9078745245933533, "learning_rate": 1.8421052631578947e-05, "loss": 0.3752, "step": 35 }, { "epoch": 0.09574468085106383, "grad_norm": 0.936310350894928, "learning_rate": 1.894736842105263e-05, "loss": 0.3655, "step": 36 }, { "epoch": 0.09840425531914894, "grad_norm": 1.5738509893417358, "learning_rate": 1.9473684210526318e-05, "loss": 0.4547, "step": 37 }, { "epoch": 0.10106382978723404, "grad_norm": 1.1838228702545166, "learning_rate": 2e-05, "loss": 0.4347, "step": 38 }, { "epoch": 0.10372340425531915, "grad_norm": 1.4948188066482544, "learning_rate": 1.9999568050254373e-05, "loss": 0.4135, "step": 39 }, { "epoch": 0.10638297872340426, "grad_norm": 1.1098586320877075, "learning_rate": 1.9998272238333606e-05, "loss": 0.4127, "step": 40 }, { "epoch": 0.10638297872340426, "eval_accuracy": 0.768235294117647, "eval_f1": 0.06635071090047394, "eval_loss": 0.5411638021469116, "eval_precision": 0.7, "eval_recall": 0.03482587064676617, "eval_runtime": 34.6313, "eval_samples_per_second": 6.468, "eval_steps_per_second": 0.202, "step": 40 }, { "epoch": 0.10904255319148937, "grad_norm": 0.779417872428894, "learning_rate": 1.999611267618283e-05, "loss": 0.3893, "step": 41 }, { "epoch": 0.11170212765957446, "grad_norm": 1.021106481552124, "learning_rate": 1.99930895503665e-05, "loss": 0.3104, "step": 42 }, { "epoch": 0.11436170212765957, "grad_norm": 1.174867868423462, "learning_rate": 1.998920312205231e-05, "loss": 0.4124, "step": 43 }, { "epoch": 0.11702127659574468, "grad_norm": 0.8697633743286133, "learning_rate": 1.99844537269886e-05, "loss": 0.3785, "step": 44 }, { "epoch": 0.1196808510638298, "grad_norm": 0.9835452437400818, "learning_rate": 1.9978841775475368e-05, "loss": 0.4014, "step": 45 }, { "epoch": 0.12234042553191489, "grad_norm": 1.1810511350631714, "learning_rate": 1.9972367752328824e-05, "loss": 0.3518, "step": 46 }, { "epoch": 0.125, "grad_norm": 0.9265549778938293, "learning_rate": 1.9965032216839493e-05, "loss": 0.4024, "step": 47 }, { "epoch": 0.1276595744680851, "grad_norm": 1.186259150505066, "learning_rate": 1.9956835802723916e-05, "loss": 0.3599, "step": 48 }, { "epoch": 0.13031914893617022, "grad_norm": 1.2196171283721924, "learning_rate": 1.994777921806989e-05, "loss": 0.3411, "step": 49 }, { "epoch": 0.13297872340425532, "grad_norm": 1.1862437725067139, "learning_rate": 1.9937863245275303e-05, "loss": 0.362, "step": 50 }, { "epoch": 0.1356382978723404, "grad_norm": 1.319501280784607, "learning_rate": 1.992708874098054e-05, "loss": 0.4189, "step": 51 }, { "epoch": 0.13829787234042554, "grad_norm": 0.9766789674758911, "learning_rate": 1.991545663599448e-05, "loss": 0.358, "step": 52 }, { "epoch": 0.14095744680851063, "grad_norm": 1.0482966899871826, "learning_rate": 1.990296793521408e-05, "loss": 0.3736, "step": 53 }, { "epoch": 0.14361702127659576, "grad_norm": 0.8634902834892273, "learning_rate": 1.9889623717537564e-05, "loss": 0.3582, "step": 54 }, { "epoch": 0.14627659574468085, "grad_norm": 0.9416165947914124, "learning_rate": 1.987542513577122e-05, "loss": 0.3495, "step": 55 }, { "epoch": 0.14893617021276595, "grad_norm": 0.9823614358901978, "learning_rate": 1.9860373416529804e-05, "loss": 0.3446, "step": 56 }, { "epoch": 0.15159574468085107, "grad_norm": 0.8403105139732361, "learning_rate": 1.984446986013057e-05, "loss": 0.3177, "step": 57 }, { "epoch": 0.15425531914893617, "grad_norm": 1.0707823038101196, "learning_rate": 1.9827715840480962e-05, "loss": 0.323, "step": 58 }, { "epoch": 0.15691489361702127, "grad_norm": 0.933045506477356, "learning_rate": 1.9810112804959867e-05, "loss": 0.3123, "step": 59 }, { "epoch": 0.1595744680851064, "grad_norm": 0.9361464977264404, "learning_rate": 1.9791662274292638e-05, "loss": 0.3347, "step": 60 }, { "epoch": 0.1595744680851064, "eval_accuracy": 0.7741176470588236, "eval_f1": 0.12727272727272726, "eval_loss": 0.5019634962081909, "eval_precision": 0.7368421052631579, "eval_recall": 0.06965174129353234, "eval_runtime": 34.7325, "eval_samples_per_second": 6.449, "eval_steps_per_second": 0.202, "step": 60 }, { "epoch": 0.1622340425531915, "grad_norm": 0.8992587327957153, "learning_rate": 1.977236584241968e-05, "loss": 0.3457, "step": 61 }, { "epoch": 0.16489361702127658, "grad_norm": 1.282809853553772, "learning_rate": 1.9752225176358757e-05, "loss": 0.3226, "step": 62 }, { "epoch": 0.1675531914893617, "grad_norm": 2.4324252605438232, "learning_rate": 1.9731242016060985e-05, "loss": 0.4227, "step": 63 }, { "epoch": 0.1702127659574468, "grad_norm": 1.0456701517105103, "learning_rate": 1.9709418174260523e-05, "loss": 0.3102, "step": 64 }, { "epoch": 0.17287234042553193, "grad_norm": 1.2882471084594727, "learning_rate": 1.9686755536317945e-05, "loss": 0.3145, "step": 65 }, { "epoch": 0.17553191489361702, "grad_norm": 1.1312603950500488, "learning_rate": 1.9663256060057395e-05, "loss": 0.3353, "step": 66 }, { "epoch": 0.17819148936170212, "grad_norm": 1.0174272060394287, "learning_rate": 1.9638921775597428e-05, "loss": 0.2845, "step": 67 }, { "epoch": 0.18085106382978725, "grad_norm": 1.241572380065918, "learning_rate": 1.961375478517564e-05, "loss": 0.3015, "step": 68 }, { "epoch": 0.18351063829787234, "grad_norm": 1.3726611137390137, "learning_rate": 1.958775726296706e-05, "loss": 0.3671, "step": 69 }, { "epoch": 0.18617021276595744, "grad_norm": 1.2311499118804932, "learning_rate": 1.95609314548963e-05, "loss": 0.2902, "step": 70 }, { "epoch": 0.18882978723404256, "grad_norm": 1.3199646472930908, "learning_rate": 1.953327967844356e-05, "loss": 0.3594, "step": 71 }, { "epoch": 0.19148936170212766, "grad_norm": 1.6513502597808838, "learning_rate": 1.95048043224444e-05, "loss": 0.2831, "step": 72 }, { "epoch": 0.19414893617021275, "grad_norm": 1.763235330581665, "learning_rate": 1.9475507846883377e-05, "loss": 0.3675, "step": 73 }, { "epoch": 0.19680851063829788, "grad_norm": 1.8195736408233643, "learning_rate": 1.9445392782681523e-05, "loss": 0.398, "step": 74 }, { "epoch": 0.19946808510638298, "grad_norm": 1.9659175872802734, "learning_rate": 1.94144617314777e-05, "loss": 0.353, "step": 75 }, { "epoch": 0.20212765957446807, "grad_norm": 1.60419762134552, "learning_rate": 1.9382717365403854e-05, "loss": 0.3565, "step": 76 }, { "epoch": 0.2047872340425532, "grad_norm": 1.5443696975708008, "learning_rate": 1.9350162426854152e-05, "loss": 0.3246, "step": 77 }, { "epoch": 0.2074468085106383, "grad_norm": 1.8536072969436646, "learning_rate": 1.9316799728248074e-05, "loss": 0.3491, "step": 78 }, { "epoch": 0.21010638297872342, "grad_norm": 2.2563788890838623, "learning_rate": 1.9282632151787462e-05, "loss": 0.4211, "step": 79 }, { "epoch": 0.2127659574468085, "grad_norm": 1.3425776958465576, "learning_rate": 1.924766264920751e-05, "loss": 0.3077, "step": 80 }, { "epoch": 0.2127659574468085, "eval_accuracy": 0.7964705882352942, "eval_f1": 0.3663003663003663, "eval_loss": 0.4462856650352478, "eval_precision": 0.6944444444444444, "eval_recall": 0.24875621890547264, "eval_runtime": 34.8097, "eval_samples_per_second": 6.435, "eval_steps_per_second": 0.201, "step": 80 }, { "epoch": 0.2154255319148936, "grad_norm": 2.540194272994995, "learning_rate": 1.9211894241521757e-05, "loss": 0.3127, "step": 81 }, { "epoch": 0.21808510638297873, "grad_norm": 1.8769720792770386, "learning_rate": 1.917533001876113e-05, "loss": 0.2998, "step": 82 }, { "epoch": 0.22074468085106383, "grad_norm": 1.4883919954299927, "learning_rate": 1.9137973139706973e-05, "loss": 0.3245, "step": 83 }, { "epoch": 0.22340425531914893, "grad_norm": 1.6703698635101318, "learning_rate": 1.9099826831618168e-05, "loss": 0.3199, "step": 84 }, { "epoch": 0.22606382978723405, "grad_norm": 2.8918988704681396, "learning_rate": 1.9060894389952328e-05, "loss": 0.2825, "step": 85 }, { "epoch": 0.22872340425531915, "grad_norm": 1.5494073629379272, "learning_rate": 1.9021179178081107e-05, "loss": 0.3213, "step": 86 }, { "epoch": 0.23138297872340424, "grad_norm": 1.331063151359558, "learning_rate": 1.898068462699964e-05, "loss": 0.2572, "step": 87 }, { "epoch": 0.23404255319148937, "grad_norm": 1.5478427410125732, "learning_rate": 1.8939414235030137e-05, "loss": 0.3001, "step": 88 }, { "epoch": 0.23670212765957446, "grad_norm": 2.469545602798462, "learning_rate": 1.889737156751965e-05, "loss": 0.3199, "step": 89 }, { "epoch": 0.2393617021276596, "grad_norm": 2.134981155395508, "learning_rate": 1.8854560256532098e-05, "loss": 0.3192, "step": 90 }, { "epoch": 0.24202127659574468, "grad_norm": 1.727616548538208, "learning_rate": 1.8810984000534457e-05, "loss": 0.3072, "step": 91 }, { "epoch": 0.24468085106382978, "grad_norm": 2.0483591556549072, "learning_rate": 1.8766646564077265e-05, "loss": 0.297, "step": 92 }, { "epoch": 0.2473404255319149, "grad_norm": 1.8018875122070312, "learning_rate": 1.8721551777469397e-05, "loss": 0.2798, "step": 93 }, { "epoch": 0.25, "grad_norm": 1.7120018005371094, "learning_rate": 1.8675703536447178e-05, "loss": 0.2438, "step": 94 }, { "epoch": 0.2526595744680851, "grad_norm": 1.8456470966339111, "learning_rate": 1.862910580183782e-05, "loss": 0.333, "step": 95 }, { "epoch": 0.2553191489361702, "grad_norm": 2.701077461242676, "learning_rate": 1.858176259921724e-05, "loss": 0.3214, "step": 96 }, { "epoch": 0.2579787234042553, "grad_norm": 1.6109999418258667, "learning_rate": 1.853367801856231e-05, "loss": 0.2701, "step": 97 }, { "epoch": 0.26063829787234044, "grad_norm": 1.524688482284546, "learning_rate": 1.8484856213897496e-05, "loss": 0.2455, "step": 98 }, { "epoch": 0.2632978723404255, "grad_norm": 1.8650296926498413, "learning_rate": 1.843530140293603e-05, "loss": 0.273, "step": 99 }, { "epoch": 0.26595744680851063, "grad_norm": 1.5295664072036743, "learning_rate": 1.8385017866715507e-05, "loss": 0.307, "step": 100 }, { "epoch": 0.26595744680851063, "eval_accuracy": 0.8, "eval_f1": 0.4097222222222222, "eval_loss": 0.44977959990501404, "eval_precision": 0.6781609195402298, "eval_recall": 0.2935323383084577, "eval_runtime": 34.7061, "eval_samples_per_second": 6.454, "eval_steps_per_second": 0.202, "step": 100 }, { "epoch": 0.26861702127659576, "grad_norm": 2.1255381107330322, "learning_rate": 1.833400994922806e-05, "loss": 0.2532, "step": 101 }, { "epoch": 0.2712765957446808, "grad_norm": 2.4879391193389893, "learning_rate": 1.8282282057045087e-05, "loss": 0.3593, "step": 102 }, { "epoch": 0.27393617021276595, "grad_norm": 2.0561375617980957, "learning_rate": 1.8229838658936566e-05, "loss": 0.266, "step": 103 }, { "epoch": 0.2765957446808511, "grad_norm": 2.101980447769165, "learning_rate": 1.8176684285484985e-05, "loss": 0.3686, "step": 104 }, { "epoch": 0.27925531914893614, "grad_norm": 2.0041894912719727, "learning_rate": 1.8122823528693966e-05, "loss": 0.2551, "step": 105 }, { "epoch": 0.28191489361702127, "grad_norm": 1.981961727142334, "learning_rate": 1.8068261041591548e-05, "loss": 0.2932, "step": 106 }, { "epoch": 0.2845744680851064, "grad_norm": 2.636021614074707, "learning_rate": 1.8013001537828213e-05, "loss": 0.2584, "step": 107 }, { "epoch": 0.2872340425531915, "grad_norm": 2.6354217529296875, "learning_rate": 1.7957049791269684e-05, "loss": 0.3208, "step": 108 }, { "epoch": 0.2898936170212766, "grad_norm": 3.6334121227264404, "learning_rate": 1.79004106355845e-05, "loss": 0.3142, "step": 109 }, { "epoch": 0.2925531914893617, "grad_norm": 2.6944894790649414, "learning_rate": 1.7843088963826437e-05, "loss": 0.2854, "step": 110 }, { "epoch": 0.29521276595744683, "grad_norm": 4.576889514923096, "learning_rate": 1.7785089728011798e-05, "loss": 0.2685, "step": 111 }, { "epoch": 0.2978723404255319, "grad_norm": 2.23494029045105, "learning_rate": 1.772641793869162e-05, "loss": 0.2604, "step": 112 }, { "epoch": 0.300531914893617, "grad_norm": 3.0733425617218018, "learning_rate": 1.7667078664518796e-05, "loss": 0.2542, "step": 113 }, { "epoch": 0.30319148936170215, "grad_norm": 1.9046289920806885, "learning_rate": 1.7607077031810204e-05, "loss": 0.2879, "step": 114 }, { "epoch": 0.3058510638297872, "grad_norm": 2.2374041080474854, "learning_rate": 1.7546418224103838e-05, "loss": 0.2998, "step": 115 }, { "epoch": 0.30851063829787234, "grad_norm": 5.9824395179748535, "learning_rate": 1.7485107481711014e-05, "loss": 0.3637, "step": 116 }, { "epoch": 0.31117021276595747, "grad_norm": 3.0998919010162354, "learning_rate": 1.7423150101263645e-05, "loss": 0.2746, "step": 117 }, { "epoch": 0.31382978723404253, "grad_norm": 2.05523419380188, "learning_rate": 1.7360551435256673e-05, "loss": 0.2776, "step": 118 }, { "epoch": 0.31648936170212766, "grad_norm": 2.1908273696899414, "learning_rate": 1.729731689158568e-05, "loss": 0.3184, "step": 119 }, { "epoch": 0.3191489361702128, "grad_norm": 2.3177342414855957, "learning_rate": 1.7233451933079663e-05, "loss": 0.2413, "step": 120 }, { "epoch": 0.3191489361702128, "eval_accuracy": 0.8141176470588235, "eval_f1": 0.48026315789473684, "eval_loss": 0.43155437707901, "eval_precision": 0.7087378640776699, "eval_recall": 0.36318407960199006, "eval_runtime": 34.0012, "eval_samples_per_second": 6.588, "eval_steps_per_second": 0.206, "step": 120 }, { "epoch": 0.32180851063829785, "grad_norm": 2.1571784019470215, "learning_rate": 1.7168962077029146e-05, "loss": 0.3229, "step": 121 }, { "epoch": 0.324468085106383, "grad_norm": 3.056910991668701, "learning_rate": 1.7103852894709517e-05, "loss": 0.3116, "step": 122 }, { "epoch": 0.3271276595744681, "grad_norm": 1.9665093421936035, "learning_rate": 1.7038130010899716e-05, "loss": 0.2743, "step": 123 }, { "epoch": 0.32978723404255317, "grad_norm": 2.3583879470825195, "learning_rate": 1.6971799103396332e-05, "loss": 0.2776, "step": 124 }, { "epoch": 0.3324468085106383, "grad_norm": 2.8476576805114746, "learning_rate": 1.6904865902523098e-05, "loss": 0.3213, "step": 125 }, { "epoch": 0.3351063829787234, "grad_norm": 1.9458303451538086, "learning_rate": 1.6837336190635824e-05, "loss": 0.2771, "step": 126 }, { "epoch": 0.3377659574468085, "grad_norm": 2.4472289085388184, "learning_rate": 1.6769215801622884e-05, "loss": 0.2924, "step": 127 }, { "epoch": 0.3404255319148936, "grad_norm": 2.520463228225708, "learning_rate": 1.6700510620401223e-05, "loss": 0.269, "step": 128 }, { "epoch": 0.34308510638297873, "grad_norm": 2.2465851306915283, "learning_rate": 1.6631226582407954e-05, "loss": 0.3043, "step": 129 }, { "epoch": 0.34574468085106386, "grad_norm": 2.4705588817596436, "learning_rate": 1.6561369673087588e-05, "loss": 0.3375, "step": 130 }, { "epoch": 0.3484042553191489, "grad_norm": 2.332902669906616, "learning_rate": 1.649094592737497e-05, "loss": 0.2313, "step": 131 }, { "epoch": 0.35106382978723405, "grad_norm": 2.050671100616455, "learning_rate": 1.641996142917391e-05, "loss": 0.3066, "step": 132 }, { "epoch": 0.3537234042553192, "grad_norm": 3.541461706161499, "learning_rate": 1.63484223108316e-05, "loss": 0.2937, "step": 133 }, { "epoch": 0.35638297872340424, "grad_norm": 2.344451665878296, "learning_rate": 1.6276334752608823e-05, "loss": 0.2666, "step": 134 }, { "epoch": 0.35904255319148937, "grad_norm": 2.1711394786834717, "learning_rate": 1.6203704982146073e-05, "loss": 0.2457, "step": 135 }, { "epoch": 0.3617021276595745, "grad_norm": 3.414870023727417, "learning_rate": 1.613053927392553e-05, "loss": 0.331, "step": 136 }, { "epoch": 0.36436170212765956, "grad_norm": 3.037440299987793, "learning_rate": 1.6056843948729e-05, "loss": 0.3025, "step": 137 }, { "epoch": 0.3670212765957447, "grad_norm": 3.548393726348877, "learning_rate": 1.5982625373091877e-05, "loss": 0.3203, "step": 138 }, { "epoch": 0.3696808510638298, "grad_norm": 2.598219633102417, "learning_rate": 1.5907889958753134e-05, "loss": 0.3155, "step": 139 }, { "epoch": 0.3723404255319149, "grad_norm": 2.790419101715088, "learning_rate": 1.5832644162101417e-05, "loss": 0.326, "step": 140 }, { "epoch": 0.3723404255319149, "eval_accuracy": 0.8235294117647058, "eval_f1": 0.5222929936305732, "eval_loss": 0.4106709063053131, "eval_precision": 0.7256637168141593, "eval_recall": 0.4079601990049751, "eval_runtime": 33.9867, "eval_samples_per_second": 6.591, "eval_steps_per_second": 0.206, "step": 140 }, { "epoch": 0.375, "grad_norm": 3.642287492752075, "learning_rate": 1.5756894483617268e-05, "loss": 0.2809, "step": 141 }, { "epoch": 0.3776595744680851, "grad_norm": 2.40323805809021, "learning_rate": 1.568064746731156e-05, "loss": 0.2835, "step": 142 }, { "epoch": 0.3803191489361702, "grad_norm": 1.9183332920074463, "learning_rate": 1.560390970016015e-05, "loss": 0.2534, "step": 143 }, { "epoch": 0.3829787234042553, "grad_norm": 3.2929575443267822, "learning_rate": 1.552668781153484e-05, "loss": 0.373, "step": 144 }, { "epoch": 0.38563829787234044, "grad_norm": 2.27150559425354, "learning_rate": 1.5448988472630654e-05, "loss": 0.2783, "step": 145 }, { "epoch": 0.3882978723404255, "grad_norm": 2.780089855194092, "learning_rate": 1.5370818395889536e-05, "loss": 0.322, "step": 146 }, { "epoch": 0.39095744680851063, "grad_norm": 2.2651729583740234, "learning_rate": 1.5292184334420434e-05, "loss": 0.3145, "step": 147 }, { "epoch": 0.39361702127659576, "grad_norm": 2.8416588306427, "learning_rate": 1.521309308141592e-05, "loss": 0.2979, "step": 148 }, { "epoch": 0.3962765957446808, "grad_norm": 2.6914663314819336, "learning_rate": 1.5133551469565313e-05, "loss": 0.3314, "step": 149 }, { "epoch": 0.39893617021276595, "grad_norm": 4.730180740356445, "learning_rate": 1.5053566370464416e-05, "loss": 0.2545, "step": 150 }, { "epoch": 0.4015957446808511, "grad_norm": 2.2047128677368164, "learning_rate": 1.4973144694021874e-05, "loss": 0.2487, "step": 151 }, { "epoch": 0.40425531914893614, "grad_norm": 2.841487407684326, "learning_rate": 1.4892293387862221e-05, "loss": 0.3067, "step": 152 }, { "epoch": 0.40691489361702127, "grad_norm": 5.28929328918457, "learning_rate": 1.4811019436725684e-05, "loss": 0.242, "step": 153 }, { "epoch": 0.4095744680851064, "grad_norm": 3.347501039505005, "learning_rate": 1.472932986186477e-05, "loss": 0.207, "step": 154 }, { "epoch": 0.4122340425531915, "grad_norm": 3.1569905281066895, "learning_rate": 1.4647231720437687e-05, "loss": 0.3062, "step": 155 }, { "epoch": 0.4148936170212766, "grad_norm": 2.134598970413208, "learning_rate": 1.4564732104898702e-05, "loss": 0.2443, "step": 156 }, { "epoch": 0.4175531914893617, "grad_norm": 2.528136968612671, "learning_rate": 1.4481838142385403e-05, "loss": 0.2308, "step": 157 }, { "epoch": 0.42021276595744683, "grad_norm": 2.756695032119751, "learning_rate": 1.4398556994102996e-05, "loss": 0.2461, "step": 158 }, { "epoch": 0.4228723404255319, "grad_norm": 4.9117631912231445, "learning_rate": 1.4314895854705641e-05, "loss": 0.2911, "step": 159 }, { "epoch": 0.425531914893617, "grad_norm": 2.877560615539551, "learning_rate": 1.4230861951674914e-05, "loss": 0.2404, "step": 160 }, { "epoch": 0.425531914893617, "eval_accuracy": 0.8094117647058824, "eval_f1": 0.40875912408759124, "eval_loss": 0.46145251393318176, "eval_precision": 0.7671232876712328, "eval_recall": 0.27860696517412936, "eval_runtime": 34.0326, "eval_samples_per_second": 6.582, "eval_steps_per_second": 0.206, "step": 160 }, { "epoch": 0.42819148936170215, "grad_norm": 4.159635066986084, "learning_rate": 1.4146462544695428e-05, "loss": 0.2858, "step": 161 }, { "epoch": 0.4308510638297872, "grad_norm": 2.716390609741211, "learning_rate": 1.4061704925027653e-05, "loss": 0.2299, "step": 162 }, { "epoch": 0.43351063829787234, "grad_norm": 2.3737223148345947, "learning_rate": 1.3976596414878044e-05, "loss": 0.2371, "step": 163 }, { "epoch": 0.43617021276595747, "grad_norm": 3.5703928470611572, "learning_rate": 1.3891144366766457e-05, "loss": 0.3007, "step": 164 }, { "epoch": 0.43882978723404253, "grad_norm": 2.449308156967163, "learning_rate": 1.380535616289099e-05, "loss": 0.2414, "step": 165 }, { "epoch": 0.44148936170212766, "grad_norm": 3.272531509399414, "learning_rate": 1.3719239214490203e-05, "loss": 0.2961, "step": 166 }, { "epoch": 0.4441489361702128, "grad_norm": 3.6306636333465576, "learning_rate": 1.363280096120289e-05, "loss": 0.2923, "step": 167 }, { "epoch": 0.44680851063829785, "grad_norm": 2.5956878662109375, "learning_rate": 1.3546048870425356e-05, "loss": 0.251, "step": 168 }, { "epoch": 0.449468085106383, "grad_norm": 5.468013286590576, "learning_rate": 1.3458990436666313e-05, "loss": 0.287, "step": 169 }, { "epoch": 0.4521276595744681, "grad_norm": 2.5763583183288574, "learning_rate": 1.3371633180899417e-05, "loss": 0.2779, "step": 170 }, { "epoch": 0.45478723404255317, "grad_norm": 3.8822455406188965, "learning_rate": 1.3283984649913552e-05, "loss": 0.2197, "step": 171 }, { "epoch": 0.4574468085106383, "grad_norm": 2.4867823123931885, "learning_rate": 1.3196052415660856e-05, "loss": 0.2875, "step": 172 }, { "epoch": 0.4601063829787234, "grad_norm": 2.161820888519287, "learning_rate": 1.3107844074602566e-05, "loss": 0.2416, "step": 173 }, { "epoch": 0.4627659574468085, "grad_norm": 3.0401649475097656, "learning_rate": 1.3019367247052781e-05, "loss": 0.2634, "step": 174 }, { "epoch": 0.4654255319148936, "grad_norm": 2.273088216781616, "learning_rate": 1.2930629576520133e-05, "loss": 0.2709, "step": 175 }, { "epoch": 0.46808510638297873, "grad_norm": 3.001025438308716, "learning_rate": 1.2841638729047463e-05, "loss": 0.2806, "step": 176 }, { "epoch": 0.47074468085106386, "grad_norm": 2.348917245864868, "learning_rate": 1.2752402392549556e-05, "loss": 0.2702, "step": 177 }, { "epoch": 0.4734042553191489, "grad_norm": 2.713019847869873, "learning_rate": 1.2662928276148985e-05, "loss": 0.2588, "step": 178 }, { "epoch": 0.47606382978723405, "grad_norm": 3.061501979827881, "learning_rate": 1.2573224109510112e-05, "loss": 0.2701, "step": 179 }, { "epoch": 0.4787234042553192, "grad_norm": 5.120430946350098, "learning_rate": 1.2483297642171332e-05, "loss": 0.2962, "step": 180 }, { "epoch": 0.4787234042553192, "eval_accuracy": 0.8282352941176471, "eval_f1": 0.5228758169934641, "eval_loss": 0.42048707604408264, "eval_precision": 0.7619047619047619, "eval_recall": 0.39800995024875624, "eval_runtime": 34.4467, "eval_samples_per_second": 6.503, "eval_steps_per_second": 0.203, "step": 180 }, { "epoch": 0.48138297872340424, "grad_norm": 2.8563108444213867, "learning_rate": 1.2393156642875579e-05, "loss": 0.2855, "step": 181 }, { "epoch": 0.48404255319148937, "grad_norm": 3.6837549209594727, "learning_rate": 1.23028088988992e-05, "loss": 0.2976, "step": 182 }, { "epoch": 0.4867021276595745, "grad_norm": 3.085362434387207, "learning_rate": 1.2212262215379199e-05, "loss": 0.2775, "step": 183 }, { "epoch": 0.48936170212765956, "grad_norm": 3.395561695098877, "learning_rate": 1.2121524414638958e-05, "loss": 0.3076, "step": 184 }, { "epoch": 0.4920212765957447, "grad_norm": 3.6867411136627197, "learning_rate": 1.2030603335512467e-05, "loss": 0.2402, "step": 185 }, { "epoch": 0.4946808510638298, "grad_norm": 5.76826810836792, "learning_rate": 1.1939506832667129e-05, "loss": 0.2715, "step": 186 }, { "epoch": 0.4973404255319149, "grad_norm": 3.938023328781128, "learning_rate": 1.1848242775925188e-05, "loss": 0.2773, "step": 187 }, { "epoch": 0.5, "grad_norm": 3.6675262451171875, "learning_rate": 1.1756819049583861e-05, "loss": 0.2752, "step": 188 }, { "epoch": 0.5026595744680851, "grad_norm": 2.274174213409424, "learning_rate": 1.166524355173422e-05, "loss": 0.2545, "step": 189 }, { "epoch": 0.5053191489361702, "grad_norm": 3.854417562484741, "learning_rate": 1.1573524193578863e-05, "loss": 0.2804, "step": 190 }, { "epoch": 0.5079787234042553, "grad_norm": 5.1708550453186035, "learning_rate": 1.1481668898748474e-05, "loss": 0.2371, "step": 191 }, { "epoch": 0.5106382978723404, "grad_norm": 4.153345584869385, "learning_rate": 1.1389685602617302e-05, "loss": 0.2405, "step": 192 }, { "epoch": 0.5132978723404256, "grad_norm": 3.244084119796753, "learning_rate": 1.1297582251617618e-05, "loss": 0.2737, "step": 193 }, { "epoch": 0.5159574468085106, "grad_norm": 2.50569486618042, "learning_rate": 1.1205366802553231e-05, "loss": 0.2647, "step": 194 }, { "epoch": 0.5186170212765957, "grad_norm": 2.3251872062683105, "learning_rate": 1.1113047221912097e-05, "loss": 0.1958, "step": 195 }, { "epoch": 0.5212765957446809, "grad_norm": 2.288127899169922, "learning_rate": 1.1020631485178084e-05, "loss": 0.2109, "step": 196 }, { "epoch": 0.523936170212766, "grad_norm": 4.095820426940918, "learning_rate": 1.0928127576141992e-05, "loss": 0.2998, "step": 197 }, { "epoch": 0.526595744680851, "grad_norm": 5.008273601531982, "learning_rate": 1.0835543486211815e-05, "loss": 0.2841, "step": 198 }, { "epoch": 0.5292553191489362, "grad_norm": 5.711911678314209, "learning_rate": 1.0742887213722372e-05, "loss": 0.2488, "step": 199 }, { "epoch": 0.5319148936170213, "grad_norm": 5.29080867767334, "learning_rate": 1.065016676324433e-05, "loss": 0.2727, "step": 200 }, { "epoch": 0.5319148936170213, "eval_accuracy": 0.8, "eval_f1": 0.34615384615384615, "eval_loss": 0.4829849600791931, "eval_precision": 0.7627118644067796, "eval_recall": 0.22388059701492538, "eval_runtime": 33.8087, "eval_samples_per_second": 6.626, "eval_steps_per_second": 0.207, "step": 200 }, { "epoch": 0.5345744680851063, "grad_norm": 6.333003044128418, "learning_rate": 1.0557390144892684e-05, "loss": 0.3334, "step": 201 }, { "epoch": 0.5372340425531915, "grad_norm": 2.1432178020477295, "learning_rate": 1.0464565373634784e-05, "loss": 0.2513, "step": 202 }, { "epoch": 0.5398936170212766, "grad_norm": 5.119022369384766, "learning_rate": 1.0371700468597886e-05, "loss": 0.2566, "step": 203 }, { "epoch": 0.5425531914893617, "grad_norm": 3.5691733360290527, "learning_rate": 1.0278803452376416e-05, "loss": 0.3084, "step": 204 }, { "epoch": 0.5452127659574468, "grad_norm": 3.0961036682128906, "learning_rate": 1.018588235033888e-05, "loss": 0.2085, "step": 205 }, { "epoch": 0.5478723404255319, "grad_norm": 2.27486515045166, "learning_rate": 1.0092945189934558e-05, "loss": 0.2524, "step": 206 }, { "epoch": 0.550531914893617, "grad_norm": 2.3716437816619873, "learning_rate": 1e-05, "loss": 0.2011, "step": 207 }, { "epoch": 0.5531914893617021, "grad_norm": 2.6007697582244873, "learning_rate": 9.907054810065446e-06, "loss": 0.2451, "step": 208 }, { "epoch": 0.5558510638297872, "grad_norm": 2.5963995456695557, "learning_rate": 9.81411764966112e-06, "loss": 0.2705, "step": 209 }, { "epoch": 0.5585106382978723, "grad_norm": 2.1203646659851074, "learning_rate": 9.721196547623585e-06, "loss": 0.2101, "step": 210 }, { "epoch": 0.5611702127659575, "grad_norm": 3.2986724376678467, "learning_rate": 9.628299531402118e-06, "loss": 0.2659, "step": 211 }, { "epoch": 0.5638297872340425, "grad_norm": 2.127525568008423, "learning_rate": 9.535434626365221e-06, "loss": 0.251, "step": 212 }, { "epoch": 0.5664893617021277, "grad_norm": 3.1327059268951416, "learning_rate": 9.442609855107317e-06, "loss": 0.2255, "step": 213 }, { "epoch": 0.5691489361702128, "grad_norm": 2.0999770164489746, "learning_rate": 9.349833236755675e-06, "loss": 0.2549, "step": 214 }, { "epoch": 0.5718085106382979, "grad_norm": 2.7766880989074707, "learning_rate": 9.257112786277631e-06, "loss": 0.2224, "step": 215 }, { "epoch": 0.574468085106383, "grad_norm": 2.451842784881592, "learning_rate": 9.164456513788186e-06, "loss": 0.2599, "step": 216 }, { "epoch": 0.5771276595744681, "grad_norm": 2.7746975421905518, "learning_rate": 9.07187242385801e-06, "loss": 0.2601, "step": 217 }, { "epoch": 0.5797872340425532, "grad_norm": 2.561441421508789, "learning_rate": 8.979368514821917e-06, "loss": 0.284, "step": 218 }, { "epoch": 0.5824468085106383, "grad_norm": 2.425262928009033, "learning_rate": 8.88695277808791e-06, "loss": 0.2593, "step": 219 }, { "epoch": 0.5851063829787234, "grad_norm": 3.180457830429077, "learning_rate": 8.79463319744677e-06, "loss": 0.2844, "step": 220 }, { "epoch": 0.5851063829787234, "eval_accuracy": 0.8258823529411765, "eval_f1": 0.5163398692810458, "eval_loss": 0.41871950030326843, "eval_precision": 0.7523809523809524, "eval_recall": 0.39303482587064675, "eval_runtime": 34.0471, "eval_samples_per_second": 6.579, "eval_steps_per_second": 0.206, "step": 220 }, { "epoch": 0.5877659574468085, "grad_norm": 2.8783645629882812, "learning_rate": 8.702417748382384e-06, "loss": 0.2458, "step": 221 }, { "epoch": 0.5904255319148937, "grad_norm": 2.950291395187378, "learning_rate": 8.610314397382701e-06, "loss": 0.3062, "step": 222 }, { "epoch": 0.5930851063829787, "grad_norm": 2.8430628776550293, "learning_rate": 8.51833110125153e-06, "loss": 0.2913, "step": 223 }, { "epoch": 0.5957446808510638, "grad_norm": 6.691501617431641, "learning_rate": 8.426475806421139e-06, "loss": 0.3716, "step": 224 }, { "epoch": 0.598404255319149, "grad_norm": 2.705397367477417, "learning_rate": 8.334756448265782e-06, "loss": 0.2692, "step": 225 }, { "epoch": 0.601063829787234, "grad_norm": 2.276686429977417, "learning_rate": 8.243180950416142e-06, "loss": 0.214, "step": 226 }, { "epoch": 0.6037234042553191, "grad_norm": 4.622035980224609, "learning_rate": 8.151757224074815e-06, "loss": 0.1863, "step": 227 }, { "epoch": 0.6063829787234043, "grad_norm": 2.3402657508850098, "learning_rate": 8.060493167332874e-06, "loss": 0.2895, "step": 228 }, { "epoch": 0.6090425531914894, "grad_norm": 4.533783912658691, "learning_rate": 7.969396664487534e-06, "loss": 0.256, "step": 229 }, { "epoch": 0.6117021276595744, "grad_norm": 4.254709243774414, "learning_rate": 7.878475585361045e-06, "loss": 0.2798, "step": 230 }, { "epoch": 0.6143617021276596, "grad_norm": 2.4173777103424072, "learning_rate": 7.787737784620803e-06, "loss": 0.3046, "step": 231 }, { "epoch": 0.6170212765957447, "grad_norm": 2.9640042781829834, "learning_rate": 7.697191101100802e-06, "loss": 0.2893, "step": 232 }, { "epoch": 0.6196808510638298, "grad_norm": 2.9573986530303955, "learning_rate": 7.606843357124426e-06, "loss": 0.2764, "step": 233 }, { "epoch": 0.6223404255319149, "grad_norm": 3.9960691928863525, "learning_rate": 7.516702357828672e-06, "loss": 0.3243, "step": 234 }, { "epoch": 0.625, "grad_norm": 2.9117209911346436, "learning_rate": 7.42677589048989e-06, "loss": 0.2863, "step": 235 }, { "epoch": 0.6276595744680851, "grad_norm": 2.57856822013855, "learning_rate": 7.337071723851018e-06, "loss": 0.2433, "step": 236 }, { "epoch": 0.6303191489361702, "grad_norm": 3.1635406017303467, "learning_rate": 7.247597607450446e-06, "loss": 0.2622, "step": 237 }, { "epoch": 0.6329787234042553, "grad_norm": 3.4039433002471924, "learning_rate": 7.1583612709525405e-06, "loss": 0.2313, "step": 238 }, { "epoch": 0.6356382978723404, "grad_norm": 3.072800397872925, "learning_rate": 7.06937042347987e-06, "loss": 0.3117, "step": 239 }, { "epoch": 0.6382978723404256, "grad_norm": 3.175246000289917, "learning_rate": 6.980632752947221e-06, "loss": 0.2632, "step": 240 }, { "epoch": 0.6382978723404256, "eval_accuracy": 0.8235294117647058, "eval_f1": 0.5161290322580645, "eval_loss": 0.4037013053894043, "eval_precision": 0.7339449541284404, "eval_recall": 0.39800995024875624, "eval_runtime": 34.0215, "eval_samples_per_second": 6.584, "eval_steps_per_second": 0.206, "step": 240 }, { "epoch": 0.6409574468085106, "grad_norm": 2.5714304447174072, "learning_rate": 6.892155925397437e-06, "loss": 0.2749, "step": 241 }, { "epoch": 0.6436170212765957, "grad_norm": 3.128525733947754, "learning_rate": 6.803947584339148e-06, "loss": 0.3527, "step": 242 }, { "epoch": 0.6462765957446809, "grad_norm": 3.6604840755462646, "learning_rate": 6.716015350086449e-06, "loss": 0.2686, "step": 243 }, { "epoch": 0.648936170212766, "grad_norm": 2.6133296489715576, "learning_rate": 6.628366819100586e-06, "loss": 0.2836, "step": 244 }, { "epoch": 0.651595744680851, "grad_norm": 2.5161774158477783, "learning_rate": 6.54100956333369e-06, "loss": 0.2395, "step": 245 }, { "epoch": 0.6542553191489362, "grad_norm": 2.824259042739868, "learning_rate": 6.453951129574644e-06, "loss": 0.2906, "step": 246 }, { "epoch": 0.6569148936170213, "grad_norm": 2.747422456741333, "learning_rate": 6.3671990387971096e-06, "loss": 0.2368, "step": 247 }, { "epoch": 0.6595744680851063, "grad_norm": 2.540599822998047, "learning_rate": 6.280760785509802e-06, "loss": 0.3036, "step": 248 }, { "epoch": 0.6622340425531915, "grad_norm": 2.4649527072906494, "learning_rate": 6.194643837109015e-06, "loss": 0.2935, "step": 249 }, { "epoch": 0.6648936170212766, "grad_norm": 2.2564632892608643, "learning_rate": 6.108855633233546e-06, "loss": 0.2276, "step": 250 }, { "epoch": 0.6675531914893617, "grad_norm": 2.5052363872528076, "learning_rate": 6.0234035851219604e-06, "loss": 0.2464, "step": 251 }, { "epoch": 0.6702127659574468, "grad_norm": 3.091642141342163, "learning_rate": 5.93829507497235e-06, "loss": 0.2766, "step": 252 }, { "epoch": 0.6728723404255319, "grad_norm": 3.3672595024108887, "learning_rate": 5.853537455304575e-06, "loss": 0.2567, "step": 253 }, { "epoch": 0.675531914893617, "grad_norm": 2.4779727458953857, "learning_rate": 5.769138048325087e-06, "loss": 0.2628, "step": 254 }, { "epoch": 0.6781914893617021, "grad_norm": 2.5639469623565674, "learning_rate": 5.685104145294364e-06, "loss": 0.2204, "step": 255 }, { "epoch": 0.6808510638297872, "grad_norm": 3.3351776599884033, "learning_rate": 5.601443005897012e-06, "loss": 0.2535, "step": 256 }, { "epoch": 0.6835106382978723, "grad_norm": 2.3642754554748535, "learning_rate": 5.5181618576146e-06, "loss": 0.2234, "step": 257 }, { "epoch": 0.6861702127659575, "grad_norm": 2.9997129440307617, "learning_rate": 5.435267895101303e-06, "loss": 0.2643, "step": 258 }, { "epoch": 0.6888297872340425, "grad_norm": 2.4532787799835205, "learning_rate": 5.352768279562315e-06, "loss": 0.2621, "step": 259 }, { "epoch": 0.6914893617021277, "grad_norm": 2.572538137435913, "learning_rate": 5.270670138135234e-06, "loss": 0.2499, "step": 260 }, { "epoch": 0.6914893617021277, "eval_accuracy": 0.8247058823529412, "eval_f1": 0.5299684542586751, "eval_loss": 0.3885125517845154, "eval_precision": 0.7241379310344828, "eval_recall": 0.417910447761194, "eval_runtime": 33.8843, "eval_samples_per_second": 6.611, "eval_steps_per_second": 0.207, "step": 260 }, { "epoch": 0.6941489361702128, "grad_norm": 2.906144618988037, "learning_rate": 5.188980563274315e-06, "loss": 0.3095, "step": 261 }, { "epoch": 0.6968085106382979, "grad_norm": 2.319133996963501, "learning_rate": 5.107706612137776e-06, "loss": 0.2388, "step": 262 }, { "epoch": 0.699468085106383, "grad_norm": 3.162642478942871, "learning_rate": 5.026855305978129e-06, "loss": 0.2462, "step": 263 }, { "epoch": 0.7021276595744681, "grad_norm": 2.749540090560913, "learning_rate": 4.946433629535585e-06, "loss": 0.2659, "step": 264 }, { "epoch": 0.7047872340425532, "grad_norm": 2.891836643218994, "learning_rate": 4.866448530434692e-06, "loss": 0.2332, "step": 265 }, { "epoch": 0.7074468085106383, "grad_norm": 2.4717514514923096, "learning_rate": 4.786906918584083e-06, "loss": 0.2136, "step": 266 }, { "epoch": 0.7101063829787234, "grad_norm": 2.679591655731201, "learning_rate": 4.707815665579569e-06, "loss": 0.3036, "step": 267 }, { "epoch": 0.7127659574468085, "grad_norm": 2.3344614505767822, "learning_rate": 4.629181604110464e-06, "loss": 0.2853, "step": 268 }, { "epoch": 0.7154255319148937, "grad_norm": 2.839320182800293, "learning_rate": 4.551011527369348e-06, "loss": 0.2394, "step": 269 }, { "epoch": 0.7180851063829787, "grad_norm": 2.27245831489563, "learning_rate": 4.4733121884651665e-06, "loss": 0.2496, "step": 270 }, { "epoch": 0.7207446808510638, "grad_norm": 3.038536548614502, "learning_rate": 4.3960902998398524e-06, "loss": 0.2787, "step": 271 }, { "epoch": 0.723404255319149, "grad_norm": 3.1204025745391846, "learning_rate": 4.319352532688444e-06, "loss": 0.2678, "step": 272 }, { "epoch": 0.726063829787234, "grad_norm": 3.8436288833618164, "learning_rate": 4.243105516382732e-06, "loss": 0.2405, "step": 273 }, { "epoch": 0.7287234042553191, "grad_norm": 3.1559836864471436, "learning_rate": 4.167355837898585e-06, "loss": 0.2881, "step": 274 }, { "epoch": 0.7313829787234043, "grad_norm": 2.5084681510925293, "learning_rate": 4.092110041246865e-06, "loss": 0.2365, "step": 275 }, { "epoch": 0.7340425531914894, "grad_norm": 3.0584487915039062, "learning_rate": 4.017374626908125e-06, "loss": 0.2808, "step": 276 }, { "epoch": 0.7367021276595744, "grad_norm": 3.6234519481658936, "learning_rate": 3.943156051271003e-06, "loss": 0.2993, "step": 277 }, { "epoch": 0.7393617021276596, "grad_norm": 1.8584307432174683, "learning_rate": 3.8694607260744745e-06, "loss": 0.2012, "step": 278 }, { "epoch": 0.7420212765957447, "grad_norm": 2.4248085021972656, "learning_rate": 3.7962950178539282e-06, "loss": 0.2352, "step": 279 }, { "epoch": 0.7446808510638298, "grad_norm": 2.5359675884246826, "learning_rate": 3.7236652473911817e-06, "loss": 0.2121, "step": 280 }, { "epoch": 0.7446808510638298, "eval_accuracy": 0.8223529411764706, "eval_f1": 0.5175718849840255, "eval_loss": 0.3953240215778351, "eval_precision": 0.7232142857142857, "eval_recall": 0.40298507462686567, "eval_runtime": 34.1139, "eval_samples_per_second": 6.566, "eval_steps_per_second": 0.205, "step": 280 }, { "epoch": 0.7473404255319149, "grad_norm": 2.3844354152679443, "learning_rate": 3.651577689168405e-06, "loss": 0.2212, "step": 281 }, { "epoch": 0.75, "grad_norm": 2.8635263442993164, "learning_rate": 3.580038570826093e-06, "loss": 0.2259, "step": 282 }, { "epoch": 0.7526595744680851, "grad_norm": 3.1672933101654053, "learning_rate": 3.509054072625031e-06, "loss": 0.2691, "step": 283 }, { "epoch": 0.7553191489361702, "grad_norm": 3.298377752304077, "learning_rate": 3.4386303269124142e-06, "loss": 0.261, "step": 284 }, { "epoch": 0.7579787234042553, "grad_norm": 3.3718481063842773, "learning_rate": 3.3687734175920505e-06, "loss": 0.2842, "step": 285 }, { "epoch": 0.7606382978723404, "grad_norm": 2.822702646255493, "learning_rate": 3.299489379598777e-06, "loss": 0.2416, "step": 286 }, { "epoch": 0.7632978723404256, "grad_norm": 3.209895372390747, "learning_rate": 3.2307841983771182e-06, "loss": 0.2706, "step": 287 }, { "epoch": 0.7659574468085106, "grad_norm": 2.953824996948242, "learning_rate": 3.162663809364178e-06, "loss": 0.2629, "step": 288 }, { "epoch": 0.7686170212765957, "grad_norm": 4.190698623657227, "learning_rate": 3.095134097476904e-06, "loss": 0.2609, "step": 289 }, { "epoch": 0.7712765957446809, "grad_norm": 4.36337423324585, "learning_rate": 3.0282008966036647e-06, "loss": 0.2549, "step": 290 }, { "epoch": 0.773936170212766, "grad_norm": 2.8681600093841553, "learning_rate": 2.9618699891002843e-06, "loss": 0.2464, "step": 291 }, { "epoch": 0.776595744680851, "grad_norm": 3.781843900680542, "learning_rate": 2.8961471052904855e-06, "loss": 0.3261, "step": 292 }, { "epoch": 0.7792553191489362, "grad_norm": 3.1815481185913086, "learning_rate": 2.831037922970855e-06, "loss": 0.2659, "step": 293 }, { "epoch": 0.7819148936170213, "grad_norm": 3.2825517654418945, "learning_rate": 2.7665480669203383e-06, "loss": 0.2239, "step": 294 }, { "epoch": 0.7845744680851063, "grad_norm": 2.418006420135498, "learning_rate": 2.702683108414326e-06, "loss": 0.2476, "step": 295 }, { "epoch": 0.7872340425531915, "grad_norm": 3.483743906021118, "learning_rate": 2.639448564743328e-06, "loss": 0.2306, "step": 296 }, { "epoch": 0.7898936170212766, "grad_norm": 3.201629638671875, "learning_rate": 2.57684989873636e-06, "loss": 0.2562, "step": 297 }, { "epoch": 0.7925531914893617, "grad_norm": 2.7855303287506104, "learning_rate": 2.514892518288988e-06, "loss": 0.2245, "step": 298 }, { "epoch": 0.7952127659574468, "grad_norm": 3.742940664291382, "learning_rate": 2.4535817758961644e-06, "loss": 0.3192, "step": 299 }, { "epoch": 0.7978723404255319, "grad_norm": 2.966266393661499, "learning_rate": 2.3929229681898005e-06, "loss": 0.2704, "step": 300 }, { "epoch": 0.7978723404255319, "eval_accuracy": 0.8329411764705882, "eval_f1": 0.5644171779141104, "eval_loss": 0.38487711548805237, "eval_precision": 0.736, "eval_recall": 0.4577114427860697, "eval_runtime": 33.5166, "eval_samples_per_second": 6.683, "eval_steps_per_second": 0.209, "step": 300 }, { "epoch": 0.800531914893617, "grad_norm": 3.4099960327148438, "learning_rate": 2.332921335481205e-06, "loss": 0.2715, "step": 301 }, { "epoch": 0.8031914893617021, "grad_norm": 4.202554702758789, "learning_rate": 2.2735820613083837e-06, "loss": 0.2616, "step": 302 }, { "epoch": 0.8058510638297872, "grad_norm": 2.95456862449646, "learning_rate": 2.2149102719882044e-06, "loss": 0.2455, "step": 303 }, { "epoch": 0.8085106382978723, "grad_norm": 2.7879536151885986, "learning_rate": 2.156911036173568e-06, "loss": 0.2054, "step": 304 }, { "epoch": 0.8111702127659575, "grad_norm": 2.4969985485076904, "learning_rate": 2.0995893644155007e-06, "loss": 0.2814, "step": 305 }, { "epoch": 0.8138297872340425, "grad_norm": 3.3959643840789795, "learning_rate": 2.0429502087303164e-06, "loss": 0.2382, "step": 306 }, { "epoch": 0.8164893617021277, "grad_norm": 2.825615882873535, "learning_rate": 1.9869984621717888e-06, "loss": 0.2808, "step": 307 }, { "epoch": 0.8191489361702128, "grad_norm": 2.766301155090332, "learning_rate": 1.931738958408457e-06, "loss": 0.2371, "step": 308 }, { "epoch": 0.8218085106382979, "grad_norm": 3.683234930038452, "learning_rate": 1.8771764713060359e-06, "loss": 0.2617, "step": 309 }, { "epoch": 0.824468085106383, "grad_norm": 3.0581727027893066, "learning_rate": 1.8233157145150183e-06, "loss": 0.254, "step": 310 }, { "epoch": 0.8271276595744681, "grad_norm": 3.316701889038086, "learning_rate": 1.7701613410634367e-06, "loss": 0.2596, "step": 311 }, { "epoch": 0.8297872340425532, "grad_norm": 2.8315346240997314, "learning_rate": 1.717717942954914e-06, "loss": 0.222, "step": 312 }, { "epoch": 0.8324468085106383, "grad_norm": 2.781020164489746, "learning_rate": 1.6659900507719406e-06, "loss": 0.2643, "step": 313 }, { "epoch": 0.8351063829787234, "grad_norm": 2.389970302581787, "learning_rate": 1.614982133284495e-06, "loss": 0.2161, "step": 314 }, { "epoch": 0.8377659574468085, "grad_norm": 3.4777987003326416, "learning_rate": 1.5646985970639717e-06, "loss": 0.3309, "step": 315 }, { "epoch": 0.8404255319148937, "grad_norm": 4.487973690032959, "learning_rate": 1.5151437861025032e-06, "loss": 0.3284, "step": 316 }, { "epoch": 0.8430851063829787, "grad_norm": 4.822957515716553, "learning_rate": 1.466321981437694e-06, "loss": 0.2033, "step": 317 }, { "epoch": 0.8457446808510638, "grad_norm": 2.9255247116088867, "learning_rate": 1.4182374007827605e-06, "loss": 0.2528, "step": 318 }, { "epoch": 0.848404255319149, "grad_norm": 2.9784889221191406, "learning_rate": 1.3708941981621814e-06, "loss": 0.2151, "step": 319 }, { "epoch": 0.851063829787234, "grad_norm": 2.522810459136963, "learning_rate": 1.324296463552821e-06, "loss": 0.2333, "step": 320 }, { "epoch": 0.851063829787234, "eval_accuracy": 0.831764705882353, "eval_f1": 0.5545171339563862, "eval_loss": 0.38777896761894226, "eval_precision": 0.7416666666666667, "eval_recall": 0.4427860696517413, "eval_runtime": 34.5031, "eval_samples_per_second": 6.492, "eval_steps_per_second": 0.203, "step": 320 }, { "epoch": 0.8537234042553191, "grad_norm": 2.794802665710449, "learning_rate": 1.2784482225306061e-06, "loss": 0.2338, "step": 321 }, { "epoch": 0.8563829787234043, "grad_norm": 2.8740601539611816, "learning_rate": 1.2333534359227383e-06, "loss": 0.2526, "step": 322 }, { "epoch": 0.8590425531914894, "grad_norm": 2.600721597671509, "learning_rate": 1.1890159994655425e-06, "loss": 0.2165, "step": 323 }, { "epoch": 0.8617021276595744, "grad_norm": 2.781907796859741, "learning_rate": 1.1454397434679022e-06, "loss": 0.2414, "step": 324 }, { "epoch": 0.8643617021276596, "grad_norm": 2.8299474716186523, "learning_rate": 1.1026284324803493e-06, "loss": 0.2389, "step": 325 }, { "epoch": 0.8670212765957447, "grad_norm": 2.6625523567199707, "learning_rate": 1.060585764969867e-06, "loss": 0.2444, "step": 326 }, { "epoch": 0.8696808510638298, "grad_norm": 3.0182435512542725, "learning_rate": 1.0193153730003603e-06, "loss": 0.2967, "step": 327 }, { "epoch": 0.8723404255319149, "grad_norm": 2.5358083248138428, "learning_rate": 9.788208219188932e-07, "loss": 0.2091, "step": 328 }, { "epoch": 0.875, "grad_norm": 3.2480201721191406, "learning_rate": 9.391056100476736e-07, "loss": 0.2195, "step": 329 }, { "epoch": 0.8776595744680851, "grad_norm": 2.449801445007324, "learning_rate": 9.001731683818338e-07, "loss": 0.2316, "step": 330 }, { "epoch": 0.8803191489361702, "grad_norm": 3.304652690887451, "learning_rate": 8.620268602930271e-07, "loss": 0.2719, "step": 331 }, { "epoch": 0.8829787234042553, "grad_norm": 3.1013834476470947, "learning_rate": 8.246699812388714e-07, "loss": 0.2412, "step": 332 }, { "epoch": 0.8856382978723404, "grad_norm": 2.4398679733276367, "learning_rate": 7.881057584782448e-07, "loss": 0.1909, "step": 333 }, { "epoch": 0.8882978723404256, "grad_norm": 3.296792984008789, "learning_rate": 7.523373507924947e-07, "loss": 0.2592, "step": 334 }, { "epoch": 0.8909574468085106, "grad_norm": 3.5089118480682373, "learning_rate": 7.17367848212539e-07, "loss": 0.2341, "step": 335 }, { "epoch": 0.8936170212765957, "grad_norm": 2.9826953411102295, "learning_rate": 6.83200271751927e-07, "loss": 0.239, "step": 336 }, { "epoch": 0.8962765957446809, "grad_norm": 2.965322732925415, "learning_rate": 6.498375731458529e-07, "loss": 0.242, "step": 337 }, { "epoch": 0.898936170212766, "grad_norm": 2.855252504348755, "learning_rate": 6.17282634596148e-07, "loss": 0.2503, "step": 338 }, { "epoch": 0.901595744680851, "grad_norm": 5.112611293792725, "learning_rate": 5.85538268522301e-07, "loss": 0.2665, "step": 339 }, { "epoch": 0.9042553191489362, "grad_norm": 3.4850215911865234, "learning_rate": 5.546072173184791e-07, "loss": 0.2896, "step": 340 }, { "epoch": 0.9042553191489362, "eval_accuracy": 0.8305882352941176, "eval_f1": 0.55, "eval_loss": 0.38858291506767273, "eval_precision": 0.7394957983193278, "eval_recall": 0.43781094527363185, "eval_runtime": 34.3336, "eval_samples_per_second": 6.524, "eval_steps_per_second": 0.204, "step": 340 }, { "epoch": 0.9069148936170213, "grad_norm": 2.3722422122955322, "learning_rate": 5.244921531166247e-07, "loss": 0.2334, "step": 341 }, { "epoch": 0.9095744680851063, "grad_norm": 2.8881895542144775, "learning_rate": 4.951956775556e-07, "loss": 0.2339, "step": 342 }, { "epoch": 0.9122340425531915, "grad_norm": 4.109971046447754, "learning_rate": 4.667203215564431e-07, "loss": 0.2837, "step": 343 }, { "epoch": 0.9148936170212766, "grad_norm": 3.7027337551116943, "learning_rate": 4.3906854510370245e-07, "loss": 0.2862, "step": 344 }, { "epoch": 0.9175531914893617, "grad_norm": 3.069493532180786, "learning_rate": 4.1224273703294515e-07, "loss": 0.2456, "step": 345 }, { "epoch": 0.9202127659574468, "grad_norm": 2.9162609577178955, "learning_rate": 3.862452148243623e-07, "loss": 0.2633, "step": 346 }, { "epoch": 0.9228723404255319, "grad_norm": 3.10223388671875, "learning_rate": 3.610782244025768e-07, "loss": 0.2165, "step": 347 }, { "epoch": 0.925531914893617, "grad_norm": 3.3466663360595703, "learning_rate": 3.367439399426087e-07, "loss": 0.2748, "step": 348 }, { "epoch": 0.9281914893617021, "grad_norm": 3.4505677223205566, "learning_rate": 3.132444636820575e-07, "loss": 0.2789, "step": 349 }, { "epoch": 0.9308510638297872, "grad_norm": 3.7714152336120605, "learning_rate": 2.905818257394799e-07, "loss": 0.233, "step": 350 }, { "epoch": 0.9335106382978723, "grad_norm": 5.176234722137451, "learning_rate": 2.687579839390153e-07, "loss": 0.2933, "step": 351 }, { "epoch": 0.9361702127659575, "grad_norm": 2.8145923614501953, "learning_rate": 2.4777482364124695e-07, "loss": 0.2916, "step": 352 }, { "epoch": 0.9388297872340425, "grad_norm": 2.452026605606079, "learning_rate": 2.2763415758032316e-07, "loss": 0.2072, "step": 353 }, { "epoch": 0.9414893617021277, "grad_norm": 2.741774559020996, "learning_rate": 2.0833772570736376e-07, "loss": 0.2365, "step": 354 }, { "epoch": 0.9441489361702128, "grad_norm": 2.6265206336975098, "learning_rate": 1.8988719504013375e-07, "loss": 0.2226, "step": 355 }, { "epoch": 0.9468085106382979, "grad_norm": 4.149282932281494, "learning_rate": 1.7228415951904165e-07, "loss": 0.1923, "step": 356 }, { "epoch": 0.949468085106383, "grad_norm": 2.389505624771118, "learning_rate": 1.5553013986942645e-07, "loss": 0.21, "step": 357 }, { "epoch": 0.9521276595744681, "grad_norm": 4.067861557006836, "learning_rate": 1.3962658347019819e-07, "loss": 0.2497, "step": 358 }, { "epoch": 0.9547872340425532, "grad_norm": 2.5128250122070312, "learning_rate": 1.245748642287814e-07, "loss": 0.2559, "step": 359 }, { "epoch": 0.9574468085106383, "grad_norm": 2.755162477493286, "learning_rate": 1.103762824624377e-07, "loss": 0.2398, "step": 360 }, { "epoch": 0.9574468085106383, "eval_accuracy": 0.8329411764705882, "eval_f1": 0.5617283950617284, "eval_loss": 0.38481393456459045, "eval_precision": 0.7398373983739838, "eval_recall": 0.4527363184079602, "eval_runtime": 34.7008, "eval_samples_per_second": 6.455, "eval_steps_per_second": 0.202, "step": 360 }, { "epoch": 0.9601063829787234, "grad_norm": 3.078138828277588, "learning_rate": 9.70320647859213e-08, "loss": 0.2091, "step": 361 }, { "epoch": 0.9627659574468085, "grad_norm": 2.8632972240448, "learning_rate": 8.454336400552154e-08, "loss": 0.2513, "step": 362 }, { "epoch": 0.9654255319148937, "grad_norm": 2.500767469406128, "learning_rate": 7.291125901946027e-08, "loss": 0.2346, "step": 363 }, { "epoch": 0.9680851063829787, "grad_norm": 4.420257091522217, "learning_rate": 6.21367547246976e-08, "loss": 0.2701, "step": 364 }, { "epoch": 0.9707446808510638, "grad_norm": 2.459460973739624, "learning_rate": 5.2220781930111263e-08, "loss": 0.2441, "step": 365 }, { "epoch": 0.973404255319149, "grad_norm": 3.661996841430664, "learning_rate": 4.316419727608434e-08, "loss": 0.2704, "step": 366 }, { "epoch": 0.976063829787234, "grad_norm": 3.0439155101776123, "learning_rate": 3.4967783160507753e-08, "loss": 0.2187, "step": 367 }, { "epoch": 0.9787234042553191, "grad_norm": 3.629185914993286, "learning_rate": 2.763224767117767e-08, "loss": 0.3418, "step": 368 }, { "epoch": 0.9813829787234043, "grad_norm": 2.30877423286438, "learning_rate": 2.115822452463223e-08, "loss": 0.2607, "step": 369 }, { "epoch": 0.9840425531914894, "grad_norm": 3.398482084274292, "learning_rate": 1.554627301140199e-08, "loss": 0.2494, "step": 370 }, { "epoch": 0.9867021276595744, "grad_norm": 3.0833022594451904, "learning_rate": 1.0796877947691909e-08, "loss": 0.2924, "step": 371 }, { "epoch": 0.9893617021276596, "grad_norm": 2.702519655227661, "learning_rate": 6.910449633501515e-09, "loss": 0.2222, "step": 372 }, { "epoch": 0.9920212765957447, "grad_norm": 3.0397112369537354, "learning_rate": 3.887323817173272e-09, "loss": 0.2145, "step": 373 }, { "epoch": 0.9946808510638298, "grad_norm": 2.342505931854248, "learning_rate": 1.7277616663946562e-09, "loss": 0.2471, "step": 374 }, { "epoch": 0.9973404255319149, "grad_norm": 2.674713611602783, "learning_rate": 4.319497456273247e-10, "loss": 0.2519, "step": 375 }, { "epoch": 1.0, "grad_norm": 4.508094310760498, "learning_rate": 0.0, "loss": 0.3025, "step": 376 } ], "logging_steps": 1, "max_steps": 376, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.2170791543740826e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }