mtzig commited on
Commit
991c0fd
·
verified ·
1 Parent(s): f113ea9

Training in progress, step 376, checkpoint

Browse files
last-checkpoint/optimizer_0/.metadata CHANGED
Binary files a/last-checkpoint/optimizer_0/.metadata and b/last-checkpoint/optimizer_0/.metadata differ
 
last-checkpoint/optimizer_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bcabfd92be31b6499987f272f2e66282673bbf0f3477e3b95d3ea40fdce5a631
3
  size 13934748
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2341607ce9b81fbddc316e1d8ed745adebc0924533d3f31d7116e1338ca52548
3
  size 13934748
last-checkpoint/optimizer_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:45cb6bcfdabd08869c69097cfa5bee5aef04ef8c28f157d1f489ca78f5fe777b
3
  size 13999412
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d2d76e6d82da5288a2ef5759e596842c9b010e3c843177c0a0569fcc85be1fa7
3
  size 13999412
last-checkpoint/optimizer_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:596eeeb9c5038b414042e03c790c4d8a3ba4d45df7e2d23dd912b3398d87b3c1
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c690ffe3102be826e29c5633cf8620ca47d1b8b819efc2742c27a8604739ff49
3
  size 13990904
last-checkpoint/optimizer_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1034c58512d1793e1275c069dc6457aa4efaaf3eead8bde0452447a4e033790a
3
  size 13990904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1485dbdd93d559396936d60e442a160868c25454e2f8eb093acbfacc2547006b
3
  size 13990904
last-checkpoint/pytorch_model_fsdp_0/.metadata CHANGED
Binary files a/last-checkpoint/pytorch_model_fsdp_0/.metadata and b/last-checkpoint/pytorch_model_fsdp_0/.metadata differ
 
last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4790eab0dde508fbf6099ce52ddbe518d5cf97627bbdf3949e06dde5e08e25fd
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9f43d12007991353f51361573d6d7482f2e62e2ba4187b198fad307fac606fa9
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:29186fb25040ec4572ae0b84469b79877a09d72bc3dfd7003bd296fc03a5510c
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d7e0918853c587646eb55ae02c94dac10dd95a4e905a8656aa368dc6e541224d
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6bd437dc3c0c22d9c434de5ec29821436fc23d3c711bd99c0f72ce1ee249cbd7
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0158d971ba71808f5326cd56710c7e448844128b673df1a6f529bff95750524c
3
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7c1fbfb8b80209395e13448bf1015ddfce9474a48c7701f6ac933493628333aa
3
  size 6966784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:78def40bb72508d5352010e3b2abe8d73620bbc2697a530d2c483328a80c449f
3
  size 6966784
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ca0f8832d1e0a99012ddffa0912becb483c91f8a60016c8ffce71b49b64e355b
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e302c9460dd4b2d18e32dcb2207c4813128526fb6cb1fc5ceb7324259f0491ba
3
  size 14960
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:05e4871c2d84ecbc1efac64854d25ee3f52b310cdc9e7aa704123b1be1e82dc1
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:96559dc2d5bf69154ad885b0fecd6a00ab728919e684f3c7a11e872f73da62b1
3
  size 14960
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2224282bdf68451bf16fa617356ee085cc44369ceb8aeadcfdd169e4f1c53cda
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:844c660102e4d575fd724e57758d180804c47275487517ac2966e44f0456ff72
3
  size 14960
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d400bf947081cbe3449eeae2e2bf5d09fede9253207279d467443269b9ffad31
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:20c48845b93131d80f6356e44142d40faf3a38bdc6caa9bdebf4e90c2b7ceda2
3
  size 14960
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f19c1c760c44b1d56e40c2b6486967adae47f40de9c950993205f9a2e7e66c38
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8979d33fb7a17f61e829a30bf98bf52a2f74ab1c472a4e63d6f1ec93d04d0c66
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.7978723404255319,
5
  "eval_steps": 20,
6
- "global_step": 300,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -2299,6 +2299,574 @@
2299
  "eval_samples_per_second": 6.683,
2300
  "eval_steps_per_second": 0.209,
2301
  "step": 300
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2302
  }
2303
  ],
2304
  "logging_steps": 1,
@@ -2313,12 +2881,12 @@
2313
  "should_evaluate": false,
2314
  "should_log": false,
2315
  "should_save": true,
2316
- "should_training_stop": false
2317
  },
2318
  "attributes": {}
2319
  }
2320
  },
2321
- "total_flos": 9.717414664287027e+16,
2322
  "train_batch_size": 8,
2323
  "trial_name": null,
2324
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
  "eval_steps": 20,
6
+ "global_step": 376,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
2299
  "eval_samples_per_second": 6.683,
2300
  "eval_steps_per_second": 0.209,
2301
  "step": 300
2302
+ },
2303
+ {
2304
+ "epoch": 0.800531914893617,
2305
+ "grad_norm": 3.4099960327148438,
2306
+ "learning_rate": 2.332921335481205e-06,
2307
+ "loss": 0.2715,
2308
+ "step": 301
2309
+ },
2310
+ {
2311
+ "epoch": 0.8031914893617021,
2312
+ "grad_norm": 4.202554702758789,
2313
+ "learning_rate": 2.2735820613083837e-06,
2314
+ "loss": 0.2616,
2315
+ "step": 302
2316
+ },
2317
+ {
2318
+ "epoch": 0.8058510638297872,
2319
+ "grad_norm": 2.95456862449646,
2320
+ "learning_rate": 2.2149102719882044e-06,
2321
+ "loss": 0.2455,
2322
+ "step": 303
2323
+ },
2324
+ {
2325
+ "epoch": 0.8085106382978723,
2326
+ "grad_norm": 2.7879536151885986,
2327
+ "learning_rate": 2.156911036173568e-06,
2328
+ "loss": 0.2054,
2329
+ "step": 304
2330
+ },
2331
+ {
2332
+ "epoch": 0.8111702127659575,
2333
+ "grad_norm": 2.4969985485076904,
2334
+ "learning_rate": 2.0995893644155007e-06,
2335
+ "loss": 0.2814,
2336
+ "step": 305
2337
+ },
2338
+ {
2339
+ "epoch": 0.8138297872340425,
2340
+ "grad_norm": 3.3959643840789795,
2341
+ "learning_rate": 2.0429502087303164e-06,
2342
+ "loss": 0.2382,
2343
+ "step": 306
2344
+ },
2345
+ {
2346
+ "epoch": 0.8164893617021277,
2347
+ "grad_norm": 2.825615882873535,
2348
+ "learning_rate": 1.9869984621717888e-06,
2349
+ "loss": 0.2808,
2350
+ "step": 307
2351
+ },
2352
+ {
2353
+ "epoch": 0.8191489361702128,
2354
+ "grad_norm": 2.766301155090332,
2355
+ "learning_rate": 1.931738958408457e-06,
2356
+ "loss": 0.2371,
2357
+ "step": 308
2358
+ },
2359
+ {
2360
+ "epoch": 0.8218085106382979,
2361
+ "grad_norm": 3.683234930038452,
2362
+ "learning_rate": 1.8771764713060359e-06,
2363
+ "loss": 0.2617,
2364
+ "step": 309
2365
+ },
2366
+ {
2367
+ "epoch": 0.824468085106383,
2368
+ "grad_norm": 3.0581727027893066,
2369
+ "learning_rate": 1.8233157145150183e-06,
2370
+ "loss": 0.254,
2371
+ "step": 310
2372
+ },
2373
+ {
2374
+ "epoch": 0.8271276595744681,
2375
+ "grad_norm": 3.316701889038086,
2376
+ "learning_rate": 1.7701613410634367e-06,
2377
+ "loss": 0.2596,
2378
+ "step": 311
2379
+ },
2380
+ {
2381
+ "epoch": 0.8297872340425532,
2382
+ "grad_norm": 2.8315346240997314,
2383
+ "learning_rate": 1.717717942954914e-06,
2384
+ "loss": 0.222,
2385
+ "step": 312
2386
+ },
2387
+ {
2388
+ "epoch": 0.8324468085106383,
2389
+ "grad_norm": 2.781020164489746,
2390
+ "learning_rate": 1.6659900507719406e-06,
2391
+ "loss": 0.2643,
2392
+ "step": 313
2393
+ },
2394
+ {
2395
+ "epoch": 0.8351063829787234,
2396
+ "grad_norm": 2.389970302581787,
2397
+ "learning_rate": 1.614982133284495e-06,
2398
+ "loss": 0.2161,
2399
+ "step": 314
2400
+ },
2401
+ {
2402
+ "epoch": 0.8377659574468085,
2403
+ "grad_norm": 3.4777987003326416,
2404
+ "learning_rate": 1.5646985970639717e-06,
2405
+ "loss": 0.3309,
2406
+ "step": 315
2407
+ },
2408
+ {
2409
+ "epoch": 0.8404255319148937,
2410
+ "grad_norm": 4.487973690032959,
2411
+ "learning_rate": 1.5151437861025032e-06,
2412
+ "loss": 0.3284,
2413
+ "step": 316
2414
+ },
2415
+ {
2416
+ "epoch": 0.8430851063829787,
2417
+ "grad_norm": 4.822957515716553,
2418
+ "learning_rate": 1.466321981437694e-06,
2419
+ "loss": 0.2033,
2420
+ "step": 317
2421
+ },
2422
+ {
2423
+ "epoch": 0.8457446808510638,
2424
+ "grad_norm": 2.9255247116088867,
2425
+ "learning_rate": 1.4182374007827605e-06,
2426
+ "loss": 0.2528,
2427
+ "step": 318
2428
+ },
2429
+ {
2430
+ "epoch": 0.848404255319149,
2431
+ "grad_norm": 2.9784889221191406,
2432
+ "learning_rate": 1.3708941981621814e-06,
2433
+ "loss": 0.2151,
2434
+ "step": 319
2435
+ },
2436
+ {
2437
+ "epoch": 0.851063829787234,
2438
+ "grad_norm": 2.522810459136963,
2439
+ "learning_rate": 1.324296463552821e-06,
2440
+ "loss": 0.2333,
2441
+ "step": 320
2442
+ },
2443
+ {
2444
+ "epoch": 0.851063829787234,
2445
+ "eval_accuracy": 0.831764705882353,
2446
+ "eval_f1": 0.5545171339563862,
2447
+ "eval_loss": 0.38777896761894226,
2448
+ "eval_precision": 0.7416666666666667,
2449
+ "eval_recall": 0.4427860696517413,
2450
+ "eval_runtime": 34.5031,
2451
+ "eval_samples_per_second": 6.492,
2452
+ "eval_steps_per_second": 0.203,
2453
+ "step": 320
2454
+ },
2455
+ {
2456
+ "epoch": 0.8537234042553191,
2457
+ "grad_norm": 2.794802665710449,
2458
+ "learning_rate": 1.2784482225306061e-06,
2459
+ "loss": 0.2338,
2460
+ "step": 321
2461
+ },
2462
+ {
2463
+ "epoch": 0.8563829787234043,
2464
+ "grad_norm": 2.8740601539611816,
2465
+ "learning_rate": 1.2333534359227383e-06,
2466
+ "loss": 0.2526,
2467
+ "step": 322
2468
+ },
2469
+ {
2470
+ "epoch": 0.8590425531914894,
2471
+ "grad_norm": 2.600721597671509,
2472
+ "learning_rate": 1.1890159994655425e-06,
2473
+ "loss": 0.2165,
2474
+ "step": 323
2475
+ },
2476
+ {
2477
+ "epoch": 0.8617021276595744,
2478
+ "grad_norm": 2.781907796859741,
2479
+ "learning_rate": 1.1454397434679022e-06,
2480
+ "loss": 0.2414,
2481
+ "step": 324
2482
+ },
2483
+ {
2484
+ "epoch": 0.8643617021276596,
2485
+ "grad_norm": 2.8299474716186523,
2486
+ "learning_rate": 1.1026284324803493e-06,
2487
+ "loss": 0.2389,
2488
+ "step": 325
2489
+ },
2490
+ {
2491
+ "epoch": 0.8670212765957447,
2492
+ "grad_norm": 2.6625523567199707,
2493
+ "learning_rate": 1.060585764969867e-06,
2494
+ "loss": 0.2444,
2495
+ "step": 326
2496
+ },
2497
+ {
2498
+ "epoch": 0.8696808510638298,
2499
+ "grad_norm": 3.0182435512542725,
2500
+ "learning_rate": 1.0193153730003603e-06,
2501
+ "loss": 0.2967,
2502
+ "step": 327
2503
+ },
2504
+ {
2505
+ "epoch": 0.8723404255319149,
2506
+ "grad_norm": 2.5358083248138428,
2507
+ "learning_rate": 9.788208219188932e-07,
2508
+ "loss": 0.2091,
2509
+ "step": 328
2510
+ },
2511
+ {
2512
+ "epoch": 0.875,
2513
+ "grad_norm": 3.2480201721191406,
2514
+ "learning_rate": 9.391056100476736e-07,
2515
+ "loss": 0.2195,
2516
+ "step": 329
2517
+ },
2518
+ {
2519
+ "epoch": 0.8776595744680851,
2520
+ "grad_norm": 2.449801445007324,
2521
+ "learning_rate": 9.001731683818338e-07,
2522
+ "loss": 0.2316,
2523
+ "step": 330
2524
+ },
2525
+ {
2526
+ "epoch": 0.8803191489361702,
2527
+ "grad_norm": 3.304652690887451,
2528
+ "learning_rate": 8.620268602930271e-07,
2529
+ "loss": 0.2719,
2530
+ "step": 331
2531
+ },
2532
+ {
2533
+ "epoch": 0.8829787234042553,
2534
+ "grad_norm": 3.1013834476470947,
2535
+ "learning_rate": 8.246699812388714e-07,
2536
+ "loss": 0.2412,
2537
+ "step": 332
2538
+ },
2539
+ {
2540
+ "epoch": 0.8856382978723404,
2541
+ "grad_norm": 2.4398679733276367,
2542
+ "learning_rate": 7.881057584782448e-07,
2543
+ "loss": 0.1909,
2544
+ "step": 333
2545
+ },
2546
+ {
2547
+ "epoch": 0.8882978723404256,
2548
+ "grad_norm": 3.296792984008789,
2549
+ "learning_rate": 7.523373507924947e-07,
2550
+ "loss": 0.2592,
2551
+ "step": 334
2552
+ },
2553
+ {
2554
+ "epoch": 0.8909574468085106,
2555
+ "grad_norm": 3.5089118480682373,
2556
+ "learning_rate": 7.17367848212539e-07,
2557
+ "loss": 0.2341,
2558
+ "step": 335
2559
+ },
2560
+ {
2561
+ "epoch": 0.8936170212765957,
2562
+ "grad_norm": 2.9826953411102295,
2563
+ "learning_rate": 6.83200271751927e-07,
2564
+ "loss": 0.239,
2565
+ "step": 336
2566
+ },
2567
+ {
2568
+ "epoch": 0.8962765957446809,
2569
+ "grad_norm": 2.965322732925415,
2570
+ "learning_rate": 6.498375731458529e-07,
2571
+ "loss": 0.242,
2572
+ "step": 337
2573
+ },
2574
+ {
2575
+ "epoch": 0.898936170212766,
2576
+ "grad_norm": 2.855252504348755,
2577
+ "learning_rate": 6.17282634596148e-07,
2578
+ "loss": 0.2503,
2579
+ "step": 338
2580
+ },
2581
+ {
2582
+ "epoch": 0.901595744680851,
2583
+ "grad_norm": 5.112611293792725,
2584
+ "learning_rate": 5.85538268522301e-07,
2585
+ "loss": 0.2665,
2586
+ "step": 339
2587
+ },
2588
+ {
2589
+ "epoch": 0.9042553191489362,
2590
+ "grad_norm": 3.4850215911865234,
2591
+ "learning_rate": 5.546072173184791e-07,
2592
+ "loss": 0.2896,
2593
+ "step": 340
2594
+ },
2595
+ {
2596
+ "epoch": 0.9042553191489362,
2597
+ "eval_accuracy": 0.8305882352941176,
2598
+ "eval_f1": 0.55,
2599
+ "eval_loss": 0.38858291506767273,
2600
+ "eval_precision": 0.7394957983193278,
2601
+ "eval_recall": 0.43781094527363185,
2602
+ "eval_runtime": 34.3336,
2603
+ "eval_samples_per_second": 6.524,
2604
+ "eval_steps_per_second": 0.204,
2605
+ "step": 340
2606
+ },
2607
+ {
2608
+ "epoch": 0.9069148936170213,
2609
+ "grad_norm": 2.3722422122955322,
2610
+ "learning_rate": 5.244921531166247e-07,
2611
+ "loss": 0.2334,
2612
+ "step": 341
2613
+ },
2614
+ {
2615
+ "epoch": 0.9095744680851063,
2616
+ "grad_norm": 2.8881895542144775,
2617
+ "learning_rate": 4.951956775556e-07,
2618
+ "loss": 0.2339,
2619
+ "step": 342
2620
+ },
2621
+ {
2622
+ "epoch": 0.9122340425531915,
2623
+ "grad_norm": 4.109971046447754,
2624
+ "learning_rate": 4.667203215564431e-07,
2625
+ "loss": 0.2837,
2626
+ "step": 343
2627
+ },
2628
+ {
2629
+ "epoch": 0.9148936170212766,
2630
+ "grad_norm": 3.7027337551116943,
2631
+ "learning_rate": 4.3906854510370245e-07,
2632
+ "loss": 0.2862,
2633
+ "step": 344
2634
+ },
2635
+ {
2636
+ "epoch": 0.9175531914893617,
2637
+ "grad_norm": 3.069493532180786,
2638
+ "learning_rate": 4.1224273703294515e-07,
2639
+ "loss": 0.2456,
2640
+ "step": 345
2641
+ },
2642
+ {
2643
+ "epoch": 0.9202127659574468,
2644
+ "grad_norm": 2.9162609577178955,
2645
+ "learning_rate": 3.862452148243623e-07,
2646
+ "loss": 0.2633,
2647
+ "step": 346
2648
+ },
2649
+ {
2650
+ "epoch": 0.9228723404255319,
2651
+ "grad_norm": 3.10223388671875,
2652
+ "learning_rate": 3.610782244025768e-07,
2653
+ "loss": 0.2165,
2654
+ "step": 347
2655
+ },
2656
+ {
2657
+ "epoch": 0.925531914893617,
2658
+ "grad_norm": 3.3466663360595703,
2659
+ "learning_rate": 3.367439399426087e-07,
2660
+ "loss": 0.2748,
2661
+ "step": 348
2662
+ },
2663
+ {
2664
+ "epoch": 0.9281914893617021,
2665
+ "grad_norm": 3.4505677223205566,
2666
+ "learning_rate": 3.132444636820575e-07,
2667
+ "loss": 0.2789,
2668
+ "step": 349
2669
+ },
2670
+ {
2671
+ "epoch": 0.9308510638297872,
2672
+ "grad_norm": 3.7714152336120605,
2673
+ "learning_rate": 2.905818257394799e-07,
2674
+ "loss": 0.233,
2675
+ "step": 350
2676
+ },
2677
+ {
2678
+ "epoch": 0.9335106382978723,
2679
+ "grad_norm": 5.176234722137451,
2680
+ "learning_rate": 2.687579839390153e-07,
2681
+ "loss": 0.2933,
2682
+ "step": 351
2683
+ },
2684
+ {
2685
+ "epoch": 0.9361702127659575,
2686
+ "grad_norm": 2.8145923614501953,
2687
+ "learning_rate": 2.4777482364124695e-07,
2688
+ "loss": 0.2916,
2689
+ "step": 352
2690
+ },
2691
+ {
2692
+ "epoch": 0.9388297872340425,
2693
+ "grad_norm": 2.452026605606079,
2694
+ "learning_rate": 2.2763415758032316e-07,
2695
+ "loss": 0.2072,
2696
+ "step": 353
2697
+ },
2698
+ {
2699
+ "epoch": 0.9414893617021277,
2700
+ "grad_norm": 2.741774559020996,
2701
+ "learning_rate": 2.0833772570736376e-07,
2702
+ "loss": 0.2365,
2703
+ "step": 354
2704
+ },
2705
+ {
2706
+ "epoch": 0.9441489361702128,
2707
+ "grad_norm": 2.6265206336975098,
2708
+ "learning_rate": 1.8988719504013375e-07,
2709
+ "loss": 0.2226,
2710
+ "step": 355
2711
+ },
2712
+ {
2713
+ "epoch": 0.9468085106382979,
2714
+ "grad_norm": 4.149282932281494,
2715
+ "learning_rate": 1.7228415951904165e-07,
2716
+ "loss": 0.1923,
2717
+ "step": 356
2718
+ },
2719
+ {
2720
+ "epoch": 0.949468085106383,
2721
+ "grad_norm": 2.389505624771118,
2722
+ "learning_rate": 1.5553013986942645e-07,
2723
+ "loss": 0.21,
2724
+ "step": 357
2725
+ },
2726
+ {
2727
+ "epoch": 0.9521276595744681,
2728
+ "grad_norm": 4.067861557006836,
2729
+ "learning_rate": 1.3962658347019819e-07,
2730
+ "loss": 0.2497,
2731
+ "step": 358
2732
+ },
2733
+ {
2734
+ "epoch": 0.9547872340425532,
2735
+ "grad_norm": 2.5128250122070312,
2736
+ "learning_rate": 1.245748642287814e-07,
2737
+ "loss": 0.2559,
2738
+ "step": 359
2739
+ },
2740
+ {
2741
+ "epoch": 0.9574468085106383,
2742
+ "grad_norm": 2.755162477493286,
2743
+ "learning_rate": 1.103762824624377e-07,
2744
+ "loss": 0.2398,
2745
+ "step": 360
2746
+ },
2747
+ {
2748
+ "epoch": 0.9574468085106383,
2749
+ "eval_accuracy": 0.8329411764705882,
2750
+ "eval_f1": 0.5617283950617284,
2751
+ "eval_loss": 0.38481393456459045,
2752
+ "eval_precision": 0.7398373983739838,
2753
+ "eval_recall": 0.4527363184079602,
2754
+ "eval_runtime": 34.7008,
2755
+ "eval_samples_per_second": 6.455,
2756
+ "eval_steps_per_second": 0.202,
2757
+ "step": 360
2758
+ },
2759
+ {
2760
+ "epoch": 0.9601063829787234,
2761
+ "grad_norm": 3.078138828277588,
2762
+ "learning_rate": 9.70320647859213e-08,
2763
+ "loss": 0.2091,
2764
+ "step": 361
2765
+ },
2766
+ {
2767
+ "epoch": 0.9627659574468085,
2768
+ "grad_norm": 2.8632972240448,
2769
+ "learning_rate": 8.454336400552154e-08,
2770
+ "loss": 0.2513,
2771
+ "step": 362
2772
+ },
2773
+ {
2774
+ "epoch": 0.9654255319148937,
2775
+ "grad_norm": 2.500767469406128,
2776
+ "learning_rate": 7.291125901946027e-08,
2777
+ "loss": 0.2346,
2778
+ "step": 363
2779
+ },
2780
+ {
2781
+ "epoch": 0.9680851063829787,
2782
+ "grad_norm": 4.420257091522217,
2783
+ "learning_rate": 6.21367547246976e-08,
2784
+ "loss": 0.2701,
2785
+ "step": 364
2786
+ },
2787
+ {
2788
+ "epoch": 0.9707446808510638,
2789
+ "grad_norm": 2.459460973739624,
2790
+ "learning_rate": 5.2220781930111263e-08,
2791
+ "loss": 0.2441,
2792
+ "step": 365
2793
+ },
2794
+ {
2795
+ "epoch": 0.973404255319149,
2796
+ "grad_norm": 3.661996841430664,
2797
+ "learning_rate": 4.316419727608434e-08,
2798
+ "loss": 0.2704,
2799
+ "step": 366
2800
+ },
2801
+ {
2802
+ "epoch": 0.976063829787234,
2803
+ "grad_norm": 3.0439155101776123,
2804
+ "learning_rate": 3.4967783160507753e-08,
2805
+ "loss": 0.2187,
2806
+ "step": 367
2807
+ },
2808
+ {
2809
+ "epoch": 0.9787234042553191,
2810
+ "grad_norm": 3.629185914993286,
2811
+ "learning_rate": 2.763224767117767e-08,
2812
+ "loss": 0.3418,
2813
+ "step": 368
2814
+ },
2815
+ {
2816
+ "epoch": 0.9813829787234043,
2817
+ "grad_norm": 2.30877423286438,
2818
+ "learning_rate": 2.115822452463223e-08,
2819
+ "loss": 0.2607,
2820
+ "step": 369
2821
+ },
2822
+ {
2823
+ "epoch": 0.9840425531914894,
2824
+ "grad_norm": 3.398482084274292,
2825
+ "learning_rate": 1.554627301140199e-08,
2826
+ "loss": 0.2494,
2827
+ "step": 370
2828
+ },
2829
+ {
2830
+ "epoch": 0.9867021276595744,
2831
+ "grad_norm": 3.0833022594451904,
2832
+ "learning_rate": 1.0796877947691909e-08,
2833
+ "loss": 0.2924,
2834
+ "step": 371
2835
+ },
2836
+ {
2837
+ "epoch": 0.9893617021276596,
2838
+ "grad_norm": 2.702519655227661,
2839
+ "learning_rate": 6.910449633501515e-09,
2840
+ "loss": 0.2222,
2841
+ "step": 372
2842
+ },
2843
+ {
2844
+ "epoch": 0.9920212765957447,
2845
+ "grad_norm": 3.0397112369537354,
2846
+ "learning_rate": 3.887323817173272e-09,
2847
+ "loss": 0.2145,
2848
+ "step": 373
2849
+ },
2850
+ {
2851
+ "epoch": 0.9946808510638298,
2852
+ "grad_norm": 2.342505931854248,
2853
+ "learning_rate": 1.7277616663946562e-09,
2854
+ "loss": 0.2471,
2855
+ "step": 374
2856
+ },
2857
+ {
2858
+ "epoch": 0.9973404255319149,
2859
+ "grad_norm": 2.674713611602783,
2860
+ "learning_rate": 4.319497456273247e-10,
2861
+ "loss": 0.2519,
2862
+ "step": 375
2863
+ },
2864
+ {
2865
+ "epoch": 1.0,
2866
+ "grad_norm": 4.508094310760498,
2867
+ "learning_rate": 0.0,
2868
+ "loss": 0.3025,
2869
+ "step": 376
2870
  }
2871
  ],
2872
  "logging_steps": 1,
 
2881
  "should_evaluate": false,
2882
  "should_log": false,
2883
  "should_save": true,
2884
+ "should_training_stop": true
2885
  },
2886
  "attributes": {}
2887
  }
2888
  },
2889
+ "total_flos": 1.2170791543740826e+17,
2890
  "train_batch_size": 8,
2891
  "trial_name": null,
2892
  "trial_params": null