{
"best_metric": 0.20430698990821838,
"best_model_checkpoint": "miner_id_24/checkpoint-200",
"epoch": 0.07931786634939521,
"eval_steps": 50,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.000396589331746976,
"grad_norm": 6.674250602722168,
"learning_rate": 1.001e-05,
"loss": 4.3034,
"step": 1
},
{
"epoch": 0.000396589331746976,
"eval_loss": 3.849916934967041,
"eval_runtime": 46.9826,
"eval_samples_per_second": 22.604,
"eval_steps_per_second": 5.662,
"step": 1
},
{
"epoch": 0.000793178663493952,
"grad_norm": 13.111907958984375,
"learning_rate": 2.002e-05,
"loss": 6.024,
"step": 2
},
{
"epoch": 0.001189767995240928,
"grad_norm": 9.942648887634277,
"learning_rate": 3.0029999999999995e-05,
"loss": 5.3876,
"step": 3
},
{
"epoch": 0.001586357326987904,
"grad_norm": 8.252339363098145,
"learning_rate": 4.004e-05,
"loss": 2.8694,
"step": 4
},
{
"epoch": 0.00198294665873488,
"grad_norm": 10.045882225036621,
"learning_rate": 5.005e-05,
"loss": 2.5194,
"step": 5
},
{
"epoch": 0.002379535990481856,
"grad_norm": 6.507627487182617,
"learning_rate": 6.005999999999999e-05,
"loss": 2.0718,
"step": 6
},
{
"epoch": 0.002776125322228832,
"grad_norm": 7.497224807739258,
"learning_rate": 7.006999999999998e-05,
"loss": 2.275,
"step": 7
},
{
"epoch": 0.003172714653975808,
"grad_norm": 5.831888675689697,
"learning_rate": 8.008e-05,
"loss": 1.198,
"step": 8
},
{
"epoch": 0.003569303985722784,
"grad_norm": 6.313211917877197,
"learning_rate": 9.009e-05,
"loss": 1.3938,
"step": 9
},
{
"epoch": 0.00396589331746976,
"grad_norm": 4.629086017608643,
"learning_rate": 0.0001001,
"loss": 1.3091,
"step": 10
},
{
"epoch": 0.004362482649216736,
"grad_norm": 4.553199291229248,
"learning_rate": 9.957315789473684e-05,
"loss": 1.1133,
"step": 11
},
{
"epoch": 0.004759071980963712,
"grad_norm": 5.859588623046875,
"learning_rate": 9.904631578947367e-05,
"loss": 0.9577,
"step": 12
},
{
"epoch": 0.005155661312710688,
"grad_norm": 4.0679707527160645,
"learning_rate": 9.851947368421052e-05,
"loss": 0.8043,
"step": 13
},
{
"epoch": 0.005552250644457664,
"grad_norm": 3.8273086547851562,
"learning_rate": 9.799263157894736e-05,
"loss": 0.6524,
"step": 14
},
{
"epoch": 0.00594883997620464,
"grad_norm": 5.04181432723999,
"learning_rate": 9.746578947368421e-05,
"loss": 0.785,
"step": 15
},
{
"epoch": 0.006345429307951616,
"grad_norm": 4.491847038269043,
"learning_rate": 9.693894736842104e-05,
"loss": 0.8275,
"step": 16
},
{
"epoch": 0.0067420186396985925,
"grad_norm": 7.865641117095947,
"learning_rate": 9.641210526315789e-05,
"loss": 1.1221,
"step": 17
},
{
"epoch": 0.007138607971445568,
"grad_norm": 5.287400722503662,
"learning_rate": 9.588526315789473e-05,
"loss": 0.7424,
"step": 18
},
{
"epoch": 0.007535197303192544,
"grad_norm": 5.461188793182373,
"learning_rate": 9.535842105263157e-05,
"loss": 0.4901,
"step": 19
},
{
"epoch": 0.00793178663493952,
"grad_norm": 3.19498348236084,
"learning_rate": 9.483157894736841e-05,
"loss": 0.5121,
"step": 20
},
{
"epoch": 0.008328375966686495,
"grad_norm": 6.649216175079346,
"learning_rate": 9.430473684210526e-05,
"loss": 0.203,
"step": 21
},
{
"epoch": 0.008724965298433472,
"grad_norm": 6.187960624694824,
"learning_rate": 9.37778947368421e-05,
"loss": 1.141,
"step": 22
},
{
"epoch": 0.009121554630180448,
"grad_norm": 3.0972740650177,
"learning_rate": 9.325105263157894e-05,
"loss": 0.5017,
"step": 23
},
{
"epoch": 0.009518143961927425,
"grad_norm": 4.420783042907715,
"learning_rate": 9.272421052631578e-05,
"loss": 0.7917,
"step": 24
},
{
"epoch": 0.009914733293674401,
"grad_norm": 2.6894378662109375,
"learning_rate": 9.219736842105263e-05,
"loss": 0.2605,
"step": 25
},
{
"epoch": 0.010311322625421376,
"grad_norm": 2.54012131690979,
"learning_rate": 9.167052631578946e-05,
"loss": 0.4831,
"step": 26
},
{
"epoch": 0.010707911957168352,
"grad_norm": 2.170597791671753,
"learning_rate": 9.114368421052632e-05,
"loss": 0.3522,
"step": 27
},
{
"epoch": 0.011104501288915328,
"grad_norm": 5.271856307983398,
"learning_rate": 9.061684210526315e-05,
"loss": 1.2457,
"step": 28
},
{
"epoch": 0.011501090620662305,
"grad_norm": 2.632899522781372,
"learning_rate": 9.009e-05,
"loss": 0.4753,
"step": 29
},
{
"epoch": 0.01189767995240928,
"grad_norm": 3.3594906330108643,
"learning_rate": 8.956315789473683e-05,
"loss": 0.6309,
"step": 30
},
{
"epoch": 0.012294269284156256,
"grad_norm": 2.8951773643493652,
"learning_rate": 8.903631578947368e-05,
"loss": 0.439,
"step": 31
},
{
"epoch": 0.012690858615903232,
"grad_norm": 4.174351215362549,
"learning_rate": 8.850947368421052e-05,
"loss": 0.5533,
"step": 32
},
{
"epoch": 0.013087447947650209,
"grad_norm": 4.82841157913208,
"learning_rate": 8.798263157894736e-05,
"loss": 0.8201,
"step": 33
},
{
"epoch": 0.013484037279397185,
"grad_norm": 4.0411858558654785,
"learning_rate": 8.745578947368422e-05,
"loss": 0.5246,
"step": 34
},
{
"epoch": 0.01388062661114416,
"grad_norm": 3.1116089820861816,
"learning_rate": 8.692894736842105e-05,
"loss": 0.5387,
"step": 35
},
{
"epoch": 0.014277215942891136,
"grad_norm": 3.7147281169891357,
"learning_rate": 8.64021052631579e-05,
"loss": 0.4417,
"step": 36
},
{
"epoch": 0.014673805274638112,
"grad_norm": 3.4639275074005127,
"learning_rate": 8.587526315789473e-05,
"loss": 0.6997,
"step": 37
},
{
"epoch": 0.015070394606385089,
"grad_norm": 5.578957557678223,
"learning_rate": 8.534842105263157e-05,
"loss": 1.3211,
"step": 38
},
{
"epoch": 0.015466983938132063,
"grad_norm": 2.867260456085205,
"learning_rate": 8.482157894736842e-05,
"loss": 0.5126,
"step": 39
},
{
"epoch": 0.01586357326987904,
"grad_norm": 3.506054401397705,
"learning_rate": 8.429473684210525e-05,
"loss": 0.4488,
"step": 40
},
{
"epoch": 0.016260162601626018,
"grad_norm": 3.3142287731170654,
"learning_rate": 8.376789473684211e-05,
"loss": 0.373,
"step": 41
},
{
"epoch": 0.01665675193337299,
"grad_norm": 1.8013468980789185,
"learning_rate": 8.324105263157894e-05,
"loss": 0.1376,
"step": 42
},
{
"epoch": 0.017053341265119967,
"grad_norm": 2.2930386066436768,
"learning_rate": 8.271421052631579e-05,
"loss": 0.0869,
"step": 43
},
{
"epoch": 0.017449930596866944,
"grad_norm": 0.5169830918312073,
"learning_rate": 8.218736842105262e-05,
"loss": 0.0196,
"step": 44
},
{
"epoch": 0.01784651992861392,
"grad_norm": 1.5832732915878296,
"learning_rate": 8.166052631578947e-05,
"loss": 0.0804,
"step": 45
},
{
"epoch": 0.018243109260360896,
"grad_norm": 2.339470386505127,
"learning_rate": 8.113368421052631e-05,
"loss": 0.4453,
"step": 46
},
{
"epoch": 0.018639698592107873,
"grad_norm": 2.9261984825134277,
"learning_rate": 8.060684210526315e-05,
"loss": 0.2987,
"step": 47
},
{
"epoch": 0.01903628792385485,
"grad_norm": 0.43078291416168213,
"learning_rate": 8.008e-05,
"loss": 0.0167,
"step": 48
},
{
"epoch": 0.019432877255601826,
"grad_norm": 25.611412048339844,
"learning_rate": 7.955315789473684e-05,
"loss": 0.0732,
"step": 49
},
{
"epoch": 0.019829466587348802,
"grad_norm": 2.956575393676758,
"learning_rate": 7.902631578947368e-05,
"loss": 0.1045,
"step": 50
},
{
"epoch": 0.019829466587348802,
"eval_loss": 0.5711997747421265,
"eval_runtime": 47.4495,
"eval_samples_per_second": 22.382,
"eval_steps_per_second": 5.606,
"step": 50
},
{
"epoch": 0.020226055919095775,
"grad_norm": 8.624287605285645,
"learning_rate": 7.849947368421052e-05,
"loss": 1.3873,
"step": 51
},
{
"epoch": 0.02062264525084275,
"grad_norm": 7.417449951171875,
"learning_rate": 7.797263157894736e-05,
"loss": 0.8735,
"step": 52
},
{
"epoch": 0.021019234582589728,
"grad_norm": 4.000467300415039,
"learning_rate": 7.744578947368421e-05,
"loss": 0.8792,
"step": 53
},
{
"epoch": 0.021415823914336704,
"grad_norm": 1.1016407012939453,
"learning_rate": 7.691894736842104e-05,
"loss": 0.1425,
"step": 54
},
{
"epoch": 0.02181241324608368,
"grad_norm": 1.845973014831543,
"learning_rate": 7.63921052631579e-05,
"loss": 0.1722,
"step": 55
},
{
"epoch": 0.022209002577830657,
"grad_norm": 2.5237021446228027,
"learning_rate": 7.586526315789473e-05,
"loss": 0.5188,
"step": 56
},
{
"epoch": 0.022605591909577633,
"grad_norm": 1.7162082195281982,
"learning_rate": 7.533842105263158e-05,
"loss": 0.2142,
"step": 57
},
{
"epoch": 0.02300218124132461,
"grad_norm": 3.1163854598999023,
"learning_rate": 7.481157894736841e-05,
"loss": 0.8212,
"step": 58
},
{
"epoch": 0.023398770573071586,
"grad_norm": 1.6111253499984741,
"learning_rate": 7.428473684210526e-05,
"loss": 0.302,
"step": 59
},
{
"epoch": 0.02379535990481856,
"grad_norm": 2.3448877334594727,
"learning_rate": 7.375789473684209e-05,
"loss": 0.5013,
"step": 60
},
{
"epoch": 0.024191949236565535,
"grad_norm": 1.4863951206207275,
"learning_rate": 7.323105263157895e-05,
"loss": 0.0785,
"step": 61
},
{
"epoch": 0.02458853856831251,
"grad_norm": 0.46337777376174927,
"learning_rate": 7.270421052631578e-05,
"loss": 0.0252,
"step": 62
},
{
"epoch": 0.024985127900059488,
"grad_norm": 1.6508891582489014,
"learning_rate": 7.217736842105263e-05,
"loss": 0.2864,
"step": 63
},
{
"epoch": 0.025381717231806464,
"grad_norm": 1.8168219327926636,
"learning_rate": 7.165052631578947e-05,
"loss": 0.1634,
"step": 64
},
{
"epoch": 0.02577830656355344,
"grad_norm": 1.4724931716918945,
"learning_rate": 7.11236842105263e-05,
"loss": 0.1565,
"step": 65
},
{
"epoch": 0.026174895895300417,
"grad_norm": 1.3949257135391235,
"learning_rate": 7.059684210526315e-05,
"loss": 0.2016,
"step": 66
},
{
"epoch": 0.026571485227047394,
"grad_norm": 5.898593902587891,
"learning_rate": 7.006999999999998e-05,
"loss": 0.2504,
"step": 67
},
{
"epoch": 0.02696807455879437,
"grad_norm": 1.9633386135101318,
"learning_rate": 6.954315789473684e-05,
"loss": 0.221,
"step": 68
},
{
"epoch": 0.027364663890541343,
"grad_norm": 2.800323963165283,
"learning_rate": 6.901631578947368e-05,
"loss": 0.457,
"step": 69
},
{
"epoch": 0.02776125322228832,
"grad_norm": 1.5002562999725342,
"learning_rate": 6.848947368421052e-05,
"loss": 0.0964,
"step": 70
},
{
"epoch": 0.028157842554035296,
"grad_norm": 2.7115468978881836,
"learning_rate": 6.796263157894737e-05,
"loss": 0.4686,
"step": 71
},
{
"epoch": 0.028554431885782272,
"grad_norm": 4.224088668823242,
"learning_rate": 6.74357894736842e-05,
"loss": 0.9701,
"step": 72
},
{
"epoch": 0.02895102121752925,
"grad_norm": 4.742822647094727,
"learning_rate": 6.690894736842105e-05,
"loss": 0.7263,
"step": 73
},
{
"epoch": 0.029347610549276225,
"grad_norm": 2.388258934020996,
"learning_rate": 6.638210526315788e-05,
"loss": 0.3153,
"step": 74
},
{
"epoch": 0.0297441998810232,
"grad_norm": 2.4255616664886475,
"learning_rate": 6.585526315789474e-05,
"loss": 0.2614,
"step": 75
},
{
"epoch": 0.030140789212770178,
"grad_norm": 1.9446436166763306,
"learning_rate": 6.532842105263157e-05,
"loss": 0.1088,
"step": 76
},
{
"epoch": 0.030537378544517154,
"grad_norm": 2.5515191555023193,
"learning_rate": 6.480157894736842e-05,
"loss": 0.4212,
"step": 77
},
{
"epoch": 0.030933967876264127,
"grad_norm": 1.997567892074585,
"learning_rate": 6.427473684210526e-05,
"loss": 0.2342,
"step": 78
},
{
"epoch": 0.0313305572080111,
"grad_norm": 2.1094398498535156,
"learning_rate": 6.37478947368421e-05,
"loss": 0.2247,
"step": 79
},
{
"epoch": 0.03172714653975808,
"grad_norm": 2.6329636573791504,
"learning_rate": 6.322105263157894e-05,
"loss": 0.431,
"step": 80
},
{
"epoch": 0.032123735871505056,
"grad_norm": 2.7955644130706787,
"learning_rate": 6.269421052631577e-05,
"loss": 0.2616,
"step": 81
},
{
"epoch": 0.032520325203252036,
"grad_norm": 2.9589076042175293,
"learning_rate": 6.216736842105263e-05,
"loss": 0.3463,
"step": 82
},
{
"epoch": 0.03291691453499901,
"grad_norm": 2.4867746829986572,
"learning_rate": 6.164052631578947e-05,
"loss": 0.4058,
"step": 83
},
{
"epoch": 0.03331350386674598,
"grad_norm": 12.15013313293457,
"learning_rate": 6.111368421052631e-05,
"loss": 0.845,
"step": 84
},
{
"epoch": 0.03371009319849296,
"grad_norm": 1.4896100759506226,
"learning_rate": 6.058684210526315e-05,
"loss": 0.0909,
"step": 85
},
{
"epoch": 0.034106682530239935,
"grad_norm": 3.909092426300049,
"learning_rate": 6.005999999999999e-05,
"loss": 0.7803,
"step": 86
},
{
"epoch": 0.034503271861986914,
"grad_norm": 3.4637069702148438,
"learning_rate": 5.953315789473684e-05,
"loss": 0.2718,
"step": 87
},
{
"epoch": 0.03489986119373389,
"grad_norm": 3.0589916706085205,
"learning_rate": 5.9006315789473676e-05,
"loss": 0.4081,
"step": 88
},
{
"epoch": 0.03529645052548087,
"grad_norm": 0.2933317720890045,
"learning_rate": 5.847947368421053e-05,
"loss": 0.0098,
"step": 89
},
{
"epoch": 0.03569303985722784,
"grad_norm": 2.7271687984466553,
"learning_rate": 5.795263157894737e-05,
"loss": 0.1337,
"step": 90
},
{
"epoch": 0.03608962918897482,
"grad_norm": 0.08103972673416138,
"learning_rate": 5.742578947368421e-05,
"loss": 0.0037,
"step": 91
},
{
"epoch": 0.03648621852072179,
"grad_norm": 2.7736382484436035,
"learning_rate": 5.6898947368421046e-05,
"loss": 0.2963,
"step": 92
},
{
"epoch": 0.036882807852468766,
"grad_norm": 2.4266488552093506,
"learning_rate": 5.6372105263157886e-05,
"loss": 0.1306,
"step": 93
},
{
"epoch": 0.037279397184215746,
"grad_norm": 1.565101981163025,
"learning_rate": 5.584526315789473e-05,
"loss": 0.0939,
"step": 94
},
{
"epoch": 0.03767598651596272,
"grad_norm": 0.204191654920578,
"learning_rate": 5.531842105263158e-05,
"loss": 0.0056,
"step": 95
},
{
"epoch": 0.0380725758477097,
"grad_norm": 1.7860132455825806,
"learning_rate": 5.4791578947368424e-05,
"loss": 0.2133,
"step": 96
},
{
"epoch": 0.03846916517945667,
"grad_norm": 3.4425370693206787,
"learning_rate": 5.426473684210526e-05,
"loss": 0.2545,
"step": 97
},
{
"epoch": 0.03886575451120365,
"grad_norm": 2.9134604930877686,
"learning_rate": 5.37378947368421e-05,
"loss": 0.282,
"step": 98
},
{
"epoch": 0.039262343842950624,
"grad_norm": 2.0041611194610596,
"learning_rate": 5.321105263157894e-05,
"loss": 0.175,
"step": 99
},
{
"epoch": 0.039658933174697604,
"grad_norm": 2.66727352142334,
"learning_rate": 5.268421052631578e-05,
"loss": 0.2079,
"step": 100
},
{
"epoch": 0.039658933174697604,
"eval_loss": 0.2756502330303192,
"eval_runtime": 47.4815,
"eval_samples_per_second": 22.367,
"eval_steps_per_second": 5.602,
"step": 100
},
{
"epoch": 0.04005552250644458,
"grad_norm": 4.0711822509765625,
"learning_rate": 5.2157368421052626e-05,
"loss": 0.6342,
"step": 101
},
{
"epoch": 0.04045211183819155,
"grad_norm": 1.8220709562301636,
"learning_rate": 5.163052631578947e-05,
"loss": 0.2614,
"step": 102
},
{
"epoch": 0.04084870116993853,
"grad_norm": 0.1610630750656128,
"learning_rate": 5.110368421052632e-05,
"loss": 0.0077,
"step": 103
},
{
"epoch": 0.0412452905016855,
"grad_norm": 1.660254955291748,
"learning_rate": 5.057684210526316e-05,
"loss": 0.2139,
"step": 104
},
{
"epoch": 0.04164187983343248,
"grad_norm": 1.2034050226211548,
"learning_rate": 5.005e-05,
"loss": 0.1488,
"step": 105
},
{
"epoch": 0.042038469165179455,
"grad_norm": 1.123096227645874,
"learning_rate": 4.9523157894736836e-05,
"loss": 0.1183,
"step": 106
},
{
"epoch": 0.042435058496926435,
"grad_norm": 1.3603556156158447,
"learning_rate": 4.899631578947368e-05,
"loss": 0.2229,
"step": 107
},
{
"epoch": 0.04283164782867341,
"grad_norm": 1.2517480850219727,
"learning_rate": 4.846947368421052e-05,
"loss": 0.1483,
"step": 108
},
{
"epoch": 0.04322823716042039,
"grad_norm": 0.9318723082542419,
"learning_rate": 4.794263157894737e-05,
"loss": 0.0662,
"step": 109
},
{
"epoch": 0.04362482649216736,
"grad_norm": 1.0203633308410645,
"learning_rate": 4.7415789473684206e-05,
"loss": 0.1234,
"step": 110
},
{
"epoch": 0.044021415823914334,
"grad_norm": 1.8892359733581543,
"learning_rate": 4.688894736842105e-05,
"loss": 0.1998,
"step": 111
},
{
"epoch": 0.044418005155661314,
"grad_norm": 0.6476733088493347,
"learning_rate": 4.636210526315789e-05,
"loss": 0.039,
"step": 112
},
{
"epoch": 0.04481459448740829,
"grad_norm": 1.5910590887069702,
"learning_rate": 4.583526315789473e-05,
"loss": 0.2231,
"step": 113
},
{
"epoch": 0.045211183819155266,
"grad_norm": 1.6052759885787964,
"learning_rate": 4.530842105263158e-05,
"loss": 0.2969,
"step": 114
},
{
"epoch": 0.04560777315090224,
"grad_norm": 2.3403847217559814,
"learning_rate": 4.4781578947368416e-05,
"loss": 0.1473,
"step": 115
},
{
"epoch": 0.04600436248264922,
"grad_norm": 3.6207292079925537,
"learning_rate": 4.425473684210526e-05,
"loss": 0.5701,
"step": 116
},
{
"epoch": 0.04640095181439619,
"grad_norm": 0.6984104514122009,
"learning_rate": 4.372789473684211e-05,
"loss": 0.0504,
"step": 117
},
{
"epoch": 0.04679754114614317,
"grad_norm": 2.5241734981536865,
"learning_rate": 4.320105263157895e-05,
"loss": 0.2509,
"step": 118
},
{
"epoch": 0.047194130477890145,
"grad_norm": 1.5402919054031372,
"learning_rate": 4.2674210526315786e-05,
"loss": 0.1996,
"step": 119
},
{
"epoch": 0.04759071980963712,
"grad_norm": 2.3560476303100586,
"learning_rate": 4.2147368421052626e-05,
"loss": 0.3034,
"step": 120
},
{
"epoch": 0.0479873091413841,
"grad_norm": 1.9385439157485962,
"learning_rate": 4.162052631578947e-05,
"loss": 0.3015,
"step": 121
},
{
"epoch": 0.04838389847313107,
"grad_norm": 1.4906829595565796,
"learning_rate": 4.109368421052631e-05,
"loss": 0.1302,
"step": 122
},
{
"epoch": 0.04878048780487805,
"grad_norm": 2.4340176582336426,
"learning_rate": 4.056684210526316e-05,
"loss": 0.4329,
"step": 123
},
{
"epoch": 0.04917707713662502,
"grad_norm": 2.110347032546997,
"learning_rate": 4.004e-05,
"loss": 0.3068,
"step": 124
},
{
"epoch": 0.049573666468372,
"grad_norm": 1.2856132984161377,
"learning_rate": 3.951315789473684e-05,
"loss": 0.1518,
"step": 125
},
{
"epoch": 0.049970255800118976,
"grad_norm": 2.3550281524658203,
"learning_rate": 3.898631578947368e-05,
"loss": 0.117,
"step": 126
},
{
"epoch": 0.050366845131865956,
"grad_norm": 2.4217026233673096,
"learning_rate": 3.845947368421052e-05,
"loss": 0.4333,
"step": 127
},
{
"epoch": 0.05076343446361293,
"grad_norm": 1.6932317018508911,
"learning_rate": 3.7932631578947367e-05,
"loss": 0.148,
"step": 128
},
{
"epoch": 0.0511600237953599,
"grad_norm": 1.6000185012817383,
"learning_rate": 3.7405789473684206e-05,
"loss": 0.0932,
"step": 129
},
{
"epoch": 0.05155661312710688,
"grad_norm": 1.805180311203003,
"learning_rate": 3.6878947368421045e-05,
"loss": 0.2301,
"step": 130
},
{
"epoch": 0.051953202458853855,
"grad_norm": 2.063298463821411,
"learning_rate": 3.635210526315789e-05,
"loss": 0.1673,
"step": 131
},
{
"epoch": 0.052349791790600834,
"grad_norm": 2.706456422805786,
"learning_rate": 3.582526315789474e-05,
"loss": 0.389,
"step": 132
},
{
"epoch": 0.05274638112234781,
"grad_norm": 2.0284225940704346,
"learning_rate": 3.5298421052631576e-05,
"loss": 0.1468,
"step": 133
},
{
"epoch": 0.05314297045409479,
"grad_norm": 1.6976109743118286,
"learning_rate": 3.477157894736842e-05,
"loss": 0.0762,
"step": 134
},
{
"epoch": 0.05353955978584176,
"grad_norm": 1.7858450412750244,
"learning_rate": 3.424473684210526e-05,
"loss": 0.2102,
"step": 135
},
{
"epoch": 0.05393614911758874,
"grad_norm": 1.9341448545455933,
"learning_rate": 3.37178947368421e-05,
"loss": 0.2661,
"step": 136
},
{
"epoch": 0.05433273844933571,
"grad_norm": 3.936488628387451,
"learning_rate": 3.319105263157894e-05,
"loss": 0.6706,
"step": 137
},
{
"epoch": 0.054729327781082686,
"grad_norm": 1.7080624103546143,
"learning_rate": 3.2664210526315786e-05,
"loss": 0.1062,
"step": 138
},
{
"epoch": 0.055125917112829666,
"grad_norm": 1.5581916570663452,
"learning_rate": 3.213736842105263e-05,
"loss": 0.0934,
"step": 139
},
{
"epoch": 0.05552250644457664,
"grad_norm": 1.3080987930297852,
"learning_rate": 3.161052631578947e-05,
"loss": 0.0814,
"step": 140
},
{
"epoch": 0.05591909577632362,
"grad_norm": 2.823137044906616,
"learning_rate": 3.108368421052632e-05,
"loss": 0.4453,
"step": 141
},
{
"epoch": 0.05631568510807059,
"grad_norm": 0.21419651806354523,
"learning_rate": 3.0556842105263156e-05,
"loss": 0.0073,
"step": 142
},
{
"epoch": 0.05671227443981757,
"grad_norm": 0.22324123978614807,
"learning_rate": 3.0029999999999995e-05,
"loss": 0.0051,
"step": 143
},
{
"epoch": 0.057108863771564544,
"grad_norm": 0.2457597404718399,
"learning_rate": 2.9503157894736838e-05,
"loss": 0.0057,
"step": 144
},
{
"epoch": 0.057505453103311524,
"grad_norm": 3.4714629650115967,
"learning_rate": 2.8976315789473684e-05,
"loss": 0.5372,
"step": 145
},
{
"epoch": 0.0579020424350585,
"grad_norm": 0.028041277080774307,
"learning_rate": 2.8449473684210523e-05,
"loss": 0.0009,
"step": 146
},
{
"epoch": 0.05829863176680547,
"grad_norm": 1.089476466178894,
"learning_rate": 2.7922631578947366e-05,
"loss": 0.0318,
"step": 147
},
{
"epoch": 0.05869522109855245,
"grad_norm": 2.4500927925109863,
"learning_rate": 2.7395789473684212e-05,
"loss": 0.4794,
"step": 148
},
{
"epoch": 0.05909181043029942,
"grad_norm": 0.0174604132771492,
"learning_rate": 2.686894736842105e-05,
"loss": 0.0006,
"step": 149
},
{
"epoch": 0.0594883997620464,
"grad_norm": 2.791302442550659,
"learning_rate": 2.634210526315789e-05,
"loss": 0.4109,
"step": 150
},
{
"epoch": 0.0594883997620464,
"eval_loss": 0.23389868438243866,
"eval_runtime": 47.3967,
"eval_samples_per_second": 22.407,
"eval_steps_per_second": 5.612,
"step": 150
},
{
"epoch": 0.059884989093793375,
"grad_norm": 4.477522373199463,
"learning_rate": 2.5815263157894736e-05,
"loss": 0.5733,
"step": 151
},
{
"epoch": 0.060281578425540355,
"grad_norm": 3.334212303161621,
"learning_rate": 2.528842105263158e-05,
"loss": 0.3337,
"step": 152
},
{
"epoch": 0.06067816775728733,
"grad_norm": 2.710191011428833,
"learning_rate": 2.4761578947368418e-05,
"loss": 0.1571,
"step": 153
},
{
"epoch": 0.06107475708903431,
"grad_norm": 1.1137654781341553,
"learning_rate": 2.423473684210526e-05,
"loss": 0.1697,
"step": 154
},
{
"epoch": 0.06147134642078128,
"grad_norm": 0.505480945110321,
"learning_rate": 2.3707894736842103e-05,
"loss": 0.0237,
"step": 155
},
{
"epoch": 0.061867935752528254,
"grad_norm": 1.122707724571228,
"learning_rate": 2.3181052631578946e-05,
"loss": 0.0803,
"step": 156
},
{
"epoch": 0.062264525084275234,
"grad_norm": 0.782159149646759,
"learning_rate": 2.265421052631579e-05,
"loss": 0.063,
"step": 157
},
{
"epoch": 0.0626611144160222,
"grad_norm": 2.235445499420166,
"learning_rate": 2.212736842105263e-05,
"loss": 0.2019,
"step": 158
},
{
"epoch": 0.06305770374776919,
"grad_norm": 3.1838295459747314,
"learning_rate": 2.1600526315789474e-05,
"loss": 0.3564,
"step": 159
},
{
"epoch": 0.06345429307951617,
"grad_norm": 1.417310118675232,
"learning_rate": 2.1073684210526313e-05,
"loss": 0.1137,
"step": 160
},
{
"epoch": 0.06385088241126313,
"grad_norm": 1.3451471328735352,
"learning_rate": 2.0546842105263155e-05,
"loss": 0.1321,
"step": 161
},
{
"epoch": 0.06424747174301011,
"grad_norm": 3.381690740585327,
"learning_rate": 2.002e-05,
"loss": 0.3365,
"step": 162
},
{
"epoch": 0.06464406107475709,
"grad_norm": 1.9907605648040771,
"learning_rate": 1.949315789473684e-05,
"loss": 0.1682,
"step": 163
},
{
"epoch": 0.06504065040650407,
"grad_norm": 1.1349059343338013,
"learning_rate": 1.8966315789473683e-05,
"loss": 0.0735,
"step": 164
},
{
"epoch": 0.06543723973825104,
"grad_norm": 1.4078447818756104,
"learning_rate": 1.8439473684210522e-05,
"loss": 0.1848,
"step": 165
},
{
"epoch": 0.06583382906999802,
"grad_norm": 2.635918617248535,
"learning_rate": 1.791263157894737e-05,
"loss": 0.288,
"step": 166
},
{
"epoch": 0.066230418401745,
"grad_norm": 2.0039432048797607,
"learning_rate": 1.738578947368421e-05,
"loss": 0.4102,
"step": 167
},
{
"epoch": 0.06662700773349196,
"grad_norm": 2.923654794692993,
"learning_rate": 1.685894736842105e-05,
"loss": 0.8078,
"step": 168
},
{
"epoch": 0.06702359706523894,
"grad_norm": 2.5879900455474854,
"learning_rate": 1.6332105263157893e-05,
"loss": 0.3793,
"step": 169
},
{
"epoch": 0.06742018639698592,
"grad_norm": 1.7998006343841553,
"learning_rate": 1.5805263157894735e-05,
"loss": 0.1745,
"step": 170
},
{
"epoch": 0.0678167757287329,
"grad_norm": 2.5468177795410156,
"learning_rate": 1.5278421052631578e-05,
"loss": 0.1744,
"step": 171
},
{
"epoch": 0.06821336506047987,
"grad_norm": 2.898380994796753,
"learning_rate": 1.4751578947368419e-05,
"loss": 0.379,
"step": 172
},
{
"epoch": 0.06860995439222685,
"grad_norm": 3.7562787532806396,
"learning_rate": 1.4224736842105262e-05,
"loss": 0.5993,
"step": 173
},
{
"epoch": 0.06900654372397383,
"grad_norm": 1.9443773031234741,
"learning_rate": 1.3697894736842106e-05,
"loss": 0.1486,
"step": 174
},
{
"epoch": 0.0694031330557208,
"grad_norm": 2.690262794494629,
"learning_rate": 1.3171052631578945e-05,
"loss": 0.2218,
"step": 175
},
{
"epoch": 0.06979972238746777,
"grad_norm": 2.491971492767334,
"learning_rate": 1.264421052631579e-05,
"loss": 0.1893,
"step": 176
},
{
"epoch": 0.07019631171921475,
"grad_norm": 3.372171401977539,
"learning_rate": 1.211736842105263e-05,
"loss": 0.5962,
"step": 177
},
{
"epoch": 0.07059290105096173,
"grad_norm": 2.8883213996887207,
"learning_rate": 1.1590526315789473e-05,
"loss": 0.3897,
"step": 178
},
{
"epoch": 0.0709894903827087,
"grad_norm": 2.1615426540374756,
"learning_rate": 1.1063684210526316e-05,
"loss": 0.1384,
"step": 179
},
{
"epoch": 0.07138607971445568,
"grad_norm": 3.077579975128174,
"learning_rate": 1.0536842105263156e-05,
"loss": 0.6491,
"step": 180
},
{
"epoch": 0.07178266904620266,
"grad_norm": 1.228853464126587,
"learning_rate": 1.001e-05,
"loss": 0.06,
"step": 181
},
{
"epoch": 0.07217925837794964,
"grad_norm": 1.8013077974319458,
"learning_rate": 9.483157894736842e-06,
"loss": 0.1734,
"step": 182
},
{
"epoch": 0.0725758477096966,
"grad_norm": 0.8816391229629517,
"learning_rate": 8.956315789473684e-06,
"loss": 0.0388,
"step": 183
},
{
"epoch": 0.07297243704144359,
"grad_norm": 1.7668546438217163,
"learning_rate": 8.429473684210525e-06,
"loss": 0.217,
"step": 184
},
{
"epoch": 0.07336902637319057,
"grad_norm": 3.179753541946411,
"learning_rate": 7.902631578947368e-06,
"loss": 0.28,
"step": 185
},
{
"epoch": 0.07376561570493753,
"grad_norm": 3.8374545574188232,
"learning_rate": 7.3757894736842095e-06,
"loss": 0.4457,
"step": 186
},
{
"epoch": 0.07416220503668451,
"grad_norm": 3.593108892440796,
"learning_rate": 6.848947368421053e-06,
"loss": 1.1263,
"step": 187
},
{
"epoch": 0.07455879436843149,
"grad_norm": 3.4441983699798584,
"learning_rate": 6.322105263157895e-06,
"loss": 0.4889,
"step": 188
},
{
"epoch": 0.07495538370017847,
"grad_norm": 2.782144546508789,
"learning_rate": 5.7952631578947365e-06,
"loss": 0.3363,
"step": 189
},
{
"epoch": 0.07535197303192544,
"grad_norm": 0.27459532022476196,
"learning_rate": 5.268421052631578e-06,
"loss": 0.0108,
"step": 190
},
{
"epoch": 0.07574856236367242,
"grad_norm": 2.1841888427734375,
"learning_rate": 4.741578947368421e-06,
"loss": 0.3438,
"step": 191
},
{
"epoch": 0.0761451516954194,
"grad_norm": 0.02204926311969757,
"learning_rate": 4.2147368421052626e-06,
"loss": 0.0007,
"step": 192
},
{
"epoch": 0.07654174102716636,
"grad_norm": 1.9425251483917236,
"learning_rate": 3.6878947368421047e-06,
"loss": 0.2348,
"step": 193
},
{
"epoch": 0.07693833035891334,
"grad_norm": 0.04739471897482872,
"learning_rate": 3.1610526315789474e-06,
"loss": 0.0012,
"step": 194
},
{
"epoch": 0.07733491969066032,
"grad_norm": 0.06455172598361969,
"learning_rate": 2.634210526315789e-06,
"loss": 0.0017,
"step": 195
},
{
"epoch": 0.0777315090224073,
"grad_norm": 3.44230318069458,
"learning_rate": 2.1073684210526313e-06,
"loss": 0.4222,
"step": 196
},
{
"epoch": 0.07812809835415427,
"grad_norm": 2.755849838256836,
"learning_rate": 1.5805263157894737e-06,
"loss": 0.4239,
"step": 197
},
{
"epoch": 0.07852468768590125,
"grad_norm": 1.2250853776931763,
"learning_rate": 1.0536842105263156e-06,
"loss": 0.0493,
"step": 198
},
{
"epoch": 0.07892127701764823,
"grad_norm": 2.685635566711426,
"learning_rate": 5.268421052631578e-07,
"loss": 0.3463,
"step": 199
},
{
"epoch": 0.07931786634939521,
"grad_norm": 4.368965148925781,
"learning_rate": 0.0,
"loss": 0.6687,
"step": 200
},
{
"epoch": 0.07931786634939521,
"eval_loss": 0.20430698990821838,
"eval_runtime": 47.6165,
"eval_samples_per_second": 22.303,
"eval_steps_per_second": 5.586,
"step": 200
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.39474174476288e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}
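
The JSON above is the trainer_state.json that the Hugging Face Trainer writes next to the checkpoint-200 weights. As a minimal sketch of how its log_history and best-checkpoint fields could be read back offline with only the standard library (the local file path used here is an assumption, not something recorded in the checkpoint):

```python
import json

# Assumed local path to the checkpoint directory named in best_model_checkpoint;
# adjust to wherever checkpoint-200 was downloaded.
with open("miner_id_24/checkpoint-200/trainer_state.json") as f:
    state = json.load(f)

# log_history mixes per-step training records ("loss") and periodic
# evaluation records ("eval_loss"); split them for inspection.
train_log = [e for e in state["log_history"] if "loss" in e]
eval_log = [e for e in state["log_history"] if "eval_loss" in e]

print(f"best_metric (eval_loss): {state['best_metric']:.4f}")
print(f"best checkpoint:         {state['best_model_checkpoint']}")
print(f"final train loss @ step {train_log[-1]['step']}: {train_log[-1]['loss']}")
for e in eval_log:
    print(f"step {e['step']:>3}  eval_loss={e['eval_loss']:.4f}")
```

Run against this file, the loop would print the four evaluation points logged every 50 steps (eval_steps), ending with the 0.2043 eval_loss that was recorded as best_metric at step 200.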