mtzig committed
Commit f113ea9 · verified · 1 Parent(s): ee7a537

Training in progress, step 300, checkpoint

last-checkpoint/optimizer_0/.metadata CHANGED
Binary files a/last-checkpoint/optimizer_0/.metadata and b/last-checkpoint/optimizer_0/.metadata differ
 
last-checkpoint/optimizer_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:3843927cedc4d683f3269d495867dcd7b2405c910617a503028f960732f07e6c
+ oid sha256:bcabfd92be31b6499987f272f2e66282673bbf0f3477e3b95d3ea40fdce5a631
  size 13934748
last-checkpoint/optimizer_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:47fd5ac1aedf65d1e43923149a90aa599911c73d408d5994466b0ae9f9c88c76
+ oid sha256:45cb6bcfdabd08869c69097cfa5bee5aef04ef8c28f157d1f489ca78f5fe777b
  size 13999412
last-checkpoint/optimizer_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:df9266203b37a0254a86248c42410caf65ae1b76706802247a77e92c1d88e294
+ oid sha256:596eeeb9c5038b414042e03c790c4d8a3ba4d45df7e2d23dd912b3398d87b3c1
  size 13990904
last-checkpoint/optimizer_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:564a75b416e4e48eb1f76e79505f22d2d45bf4fc8b254f4c8ce9b3ce5890dc81
+ oid sha256:1034c58512d1793e1275c069dc6457aa4efaaf3eead8bde0452447a4e033790a
  size 13990904
last-checkpoint/pytorch_model_fsdp_0/.metadata CHANGED
Binary files a/last-checkpoint/pytorch_model_fsdp_0/.metadata and b/last-checkpoint/pytorch_model_fsdp_0/.metadata differ
 
last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:fd41c478f550639121d913c62a920bd1cb03accab9182666486c78e6e8a330ba
+ oid sha256:4790eab0dde508fbf6099ce52ddbe518d5cf97627bbdf3949e06dde5e08e25fd
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:6e2ea2567791532b38b5fab7ba9b89492d30645a423a9f0f1bc21e98535902c9
+ oid sha256:29186fb25040ec4572ae0b84469b79877a09d72bc3dfd7003bd296fc03a5510c
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:d6cd518b6b383ab04cec632660a3cece5e59683a59a2ffb32a3f0ca2075f162e
+ oid sha256:6bd437dc3c0c22d9c434de5ec29821436fc23d3c711bd99c0f72ce1ee249cbd7
  size 6966784
last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:655a2930a3b9fe448e6f767f8b25b9ebdd3f906d256322c915a95c99f18bba8f
+ oid sha256:7c1fbfb8b80209395e13448bf1015ddfce9474a48c7701f6ac933493628333aa
  size 6966784
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:ab8cdd08f60cbd3036bbd610c5a42dde3ec47637b7e45c85683a417a9d360a6f
+ oid sha256:ca0f8832d1e0a99012ddffa0912becb483c91f8a60016c8ffce71b49b64e355b
  size 14960
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:59a8af46ddb45218bc7cbc9b3f81796f6f16e1bc3531c4213c3b740a3fa6722a
+ oid sha256:f19c1c760c44b1d56e40c2b6486967adae47f40de9c950993205f9a2e7e66c38
  size 1064
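
Each file above is stored as a Git LFS pointer, so these diffs only swap the pointer's oid sha256 while the size stays the same. As a minimal, illustrative sketch (not part of this repository; the helper name and paths are assumptions), a downloaded blob can be checked against its pointer:

import hashlib

def lfs_pointer_matches(pointer_path: str, blob_path: str) -> bool:
    # Parse the "key value" lines of a Git LFS pointer file
    # (version, oid sha256:<hex>, size <bytes>).
    fields = dict(
        line.split(" ", 1)
        for line in open(pointer_path).read().splitlines()
        if " " in line
    )
    expected_oid = fields["oid"].removeprefix("sha256:")
    expected_size = int(fields["size"])

    # Hash the blob in 1 MiB chunks and compare digest and byte count.
    digest = hashlib.sha256()
    total = 0
    with open(blob_path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
            total += len(chunk)
    return digest.hexdigest() == expected_oid and total == expected_size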
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
  {
  "best_metric": null,
  "best_model_checkpoint": null,
- "epoch": 0.5319148936170213,
+ "epoch": 0.7978723404255319,
  "eval_steps": 20,
- "global_step": 200,
+ "global_step": 300,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -1539,6 +1539,766 @@
  "eval_samples_per_second": 6.626,
  "eval_steps_per_second": 0.207,
  "step": 200
1542
+ },
1543
+ {
1544
+ "epoch": 0.5345744680851063,
1545
+ "grad_norm": 6.333003044128418,
1546
+ "learning_rate": 1.0557390144892684e-05,
1547
+ "loss": 0.3334,
1548
+ "step": 201
1549
+ },
1550
+ {
1551
+ "epoch": 0.5372340425531915,
1552
+ "grad_norm": 2.1432178020477295,
1553
+ "learning_rate": 1.0464565373634784e-05,
1554
+ "loss": 0.2513,
1555
+ "step": 202
1556
+ },
1557
+ {
1558
+ "epoch": 0.5398936170212766,
1559
+ "grad_norm": 5.119022369384766,
1560
+ "learning_rate": 1.0371700468597886e-05,
1561
+ "loss": 0.2566,
1562
+ "step": 203
1563
+ },
1564
+ {
1565
+ "epoch": 0.5425531914893617,
1566
+ "grad_norm": 3.5691733360290527,
1567
+ "learning_rate": 1.0278803452376416e-05,
1568
+ "loss": 0.3084,
1569
+ "step": 204
1570
+ },
1571
+ {
1572
+ "epoch": 0.5452127659574468,
1573
+ "grad_norm": 3.0961036682128906,
1574
+ "learning_rate": 1.018588235033888e-05,
1575
+ "loss": 0.2085,
1576
+ "step": 205
1577
+ },
1578
+ {
1579
+ "epoch": 0.5478723404255319,
1580
+ "grad_norm": 2.27486515045166,
1581
+ "learning_rate": 1.0092945189934558e-05,
1582
+ "loss": 0.2524,
1583
+ "step": 206
1584
+ },
1585
+ {
1586
+ "epoch": 0.550531914893617,
1587
+ "grad_norm": 2.3716437816619873,
1588
+ "learning_rate": 1e-05,
1589
+ "loss": 0.2011,
1590
+ "step": 207
1591
+ },
1592
+ {
1593
+ "epoch": 0.5531914893617021,
1594
+ "grad_norm": 2.6007697582244873,
1595
+ "learning_rate": 9.907054810065446e-06,
1596
+ "loss": 0.2451,
1597
+ "step": 208
1598
+ },
1599
+ {
1600
+ "epoch": 0.5558510638297872,
1601
+ "grad_norm": 2.5963995456695557,
1602
+ "learning_rate": 9.81411764966112e-06,
1603
+ "loss": 0.2705,
1604
+ "step": 209
1605
+ },
1606
+ {
1607
+ "epoch": 0.5585106382978723,
1608
+ "grad_norm": 2.1203646659851074,
1609
+ "learning_rate": 9.721196547623585e-06,
1610
+ "loss": 0.2101,
1611
+ "step": 210
1612
+ },
1613
+ {
1614
+ "epoch": 0.5611702127659575,
1615
+ "grad_norm": 3.2986724376678467,
1616
+ "learning_rate": 9.628299531402118e-06,
1617
+ "loss": 0.2659,
1618
+ "step": 211
1619
+ },
1620
+ {
1621
+ "epoch": 0.5638297872340425,
1622
+ "grad_norm": 2.127525568008423,
1623
+ "learning_rate": 9.535434626365221e-06,
1624
+ "loss": 0.251,
1625
+ "step": 212
1626
+ },
1627
+ {
1628
+ "epoch": 0.5664893617021277,
1629
+ "grad_norm": 3.1327059268951416,
1630
+ "learning_rate": 9.442609855107317e-06,
1631
+ "loss": 0.2255,
1632
+ "step": 213
1633
+ },
1634
+ {
1635
+ "epoch": 0.5691489361702128,
1636
+ "grad_norm": 2.0999770164489746,
1637
+ "learning_rate": 9.349833236755675e-06,
1638
+ "loss": 0.2549,
1639
+ "step": 214
1640
+ },
1641
+ {
1642
+ "epoch": 0.5718085106382979,
1643
+ "grad_norm": 2.7766880989074707,
1644
+ "learning_rate": 9.257112786277631e-06,
1645
+ "loss": 0.2224,
1646
+ "step": 215
1647
+ },
1648
+ {
1649
+ "epoch": 0.574468085106383,
1650
+ "grad_norm": 2.451842784881592,
1651
+ "learning_rate": 9.164456513788186e-06,
1652
+ "loss": 0.2599,
1653
+ "step": 216
1654
+ },
1655
+ {
1656
+ "epoch": 0.5771276595744681,
1657
+ "grad_norm": 2.7746975421905518,
1658
+ "learning_rate": 9.07187242385801e-06,
1659
+ "loss": 0.2601,
1660
+ "step": 217
1661
+ },
1662
+ {
1663
+ "epoch": 0.5797872340425532,
1664
+ "grad_norm": 2.561441421508789,
1665
+ "learning_rate": 8.979368514821917e-06,
1666
+ "loss": 0.284,
1667
+ "step": 218
1668
+ },
1669
+ {
1670
+ "epoch": 0.5824468085106383,
1671
+ "grad_norm": 2.425262928009033,
1672
+ "learning_rate": 8.88695277808791e-06,
1673
+ "loss": 0.2593,
1674
+ "step": 219
1675
+ },
1676
+ {
1677
+ "epoch": 0.5851063829787234,
1678
+ "grad_norm": 3.180457830429077,
1679
+ "learning_rate": 8.79463319744677e-06,
1680
+ "loss": 0.2844,
1681
+ "step": 220
1682
+ },
1683
+ {
1684
+ "epoch": 0.5851063829787234,
1685
+ "eval_accuracy": 0.8258823529411765,
1686
+ "eval_f1": 0.5163398692810458,
1687
+ "eval_loss": 0.41871950030326843,
1688
+ "eval_precision": 0.7523809523809524,
1689
+ "eval_recall": 0.39303482587064675,
1690
+ "eval_runtime": 34.0471,
1691
+ "eval_samples_per_second": 6.579,
1692
+ "eval_steps_per_second": 0.206,
1693
+ "step": 220
1694
+ },
1695
+ {
1696
+ "epoch": 0.5877659574468085,
1697
+ "grad_norm": 2.8783645629882812,
1698
+ "learning_rate": 8.702417748382384e-06,
1699
+ "loss": 0.2458,
1700
+ "step": 221
1701
+ },
1702
+ {
1703
+ "epoch": 0.5904255319148937,
1704
+ "grad_norm": 2.950291395187378,
1705
+ "learning_rate": 8.610314397382701e-06,
1706
+ "loss": 0.3062,
1707
+ "step": 222
1708
+ },
1709
+ {
1710
+ "epoch": 0.5930851063829787,
1711
+ "grad_norm": 2.8430628776550293,
1712
+ "learning_rate": 8.51833110125153e-06,
1713
+ "loss": 0.2913,
1714
+ "step": 223
1715
+ },
1716
+ {
1717
+ "epoch": 0.5957446808510638,
1718
+ "grad_norm": 6.691501617431641,
1719
+ "learning_rate": 8.426475806421139e-06,
1720
+ "loss": 0.3716,
1721
+ "step": 224
1722
+ },
1723
+ {
1724
+ "epoch": 0.598404255319149,
1725
+ "grad_norm": 2.705397367477417,
1726
+ "learning_rate": 8.334756448265782e-06,
1727
+ "loss": 0.2692,
1728
+ "step": 225
1729
+ },
1730
+ {
1731
+ "epoch": 0.601063829787234,
1732
+ "grad_norm": 2.276686429977417,
1733
+ "learning_rate": 8.243180950416142e-06,
1734
+ "loss": 0.214,
1735
+ "step": 226
1736
+ },
1737
+ {
1738
+ "epoch": 0.6037234042553191,
1739
+ "grad_norm": 4.622035980224609,
1740
+ "learning_rate": 8.151757224074815e-06,
1741
+ "loss": 0.1863,
1742
+ "step": 227
1743
+ },
1744
+ {
1745
+ "epoch": 0.6063829787234043,
1746
+ "grad_norm": 2.3402657508850098,
1747
+ "learning_rate": 8.060493167332874e-06,
1748
+ "loss": 0.2895,
1749
+ "step": 228
1750
+ },
1751
+ {
1752
+ "epoch": 0.6090425531914894,
1753
+ "grad_norm": 4.533783912658691,
1754
+ "learning_rate": 7.969396664487534e-06,
1755
+ "loss": 0.256,
1756
+ "step": 229
1757
+ },
1758
+ {
1759
+ "epoch": 0.6117021276595744,
1760
+ "grad_norm": 4.254709243774414,
1761
+ "learning_rate": 7.878475585361045e-06,
1762
+ "loss": 0.2798,
1763
+ "step": 230
1764
+ },
1765
+ {
1766
+ "epoch": 0.6143617021276596,
1767
+ "grad_norm": 2.4173777103424072,
1768
+ "learning_rate": 7.787737784620803e-06,
1769
+ "loss": 0.3046,
1770
+ "step": 231
1771
+ },
1772
+ {
1773
+ "epoch": 0.6170212765957447,
1774
+ "grad_norm": 2.9640042781829834,
1775
+ "learning_rate": 7.697191101100802e-06,
1776
+ "loss": 0.2893,
1777
+ "step": 232
1778
+ },
1779
+ {
1780
+ "epoch": 0.6196808510638298,
1781
+ "grad_norm": 2.9573986530303955,
1782
+ "learning_rate": 7.606843357124426e-06,
1783
+ "loss": 0.2764,
1784
+ "step": 233
1785
+ },
1786
+ {
1787
+ "epoch": 0.6223404255319149,
1788
+ "grad_norm": 3.9960691928863525,
1789
+ "learning_rate": 7.516702357828672e-06,
1790
+ "loss": 0.3243,
1791
+ "step": 234
1792
+ },
1793
+ {
1794
+ "epoch": 0.625,
1795
+ "grad_norm": 2.9117209911346436,
1796
+ "learning_rate": 7.42677589048989e-06,
1797
+ "loss": 0.2863,
1798
+ "step": 235
1799
+ },
1800
+ {
1801
+ "epoch": 0.6276595744680851,
1802
+ "grad_norm": 2.57856822013855,
1803
+ "learning_rate": 7.337071723851018e-06,
1804
+ "loss": 0.2433,
1805
+ "step": 236
1806
+ },
1807
+ {
1808
+ "epoch": 0.6303191489361702,
1809
+ "grad_norm": 3.1635406017303467,
1810
+ "learning_rate": 7.247597607450446e-06,
1811
+ "loss": 0.2622,
1812
+ "step": 237
1813
+ },
1814
+ {
1815
+ "epoch": 0.6329787234042553,
1816
+ "grad_norm": 3.4039433002471924,
1817
+ "learning_rate": 7.1583612709525405e-06,
1818
+ "loss": 0.2313,
1819
+ "step": 238
1820
+ },
1821
+ {
1822
+ "epoch": 0.6356382978723404,
1823
+ "grad_norm": 3.072800397872925,
1824
+ "learning_rate": 7.06937042347987e-06,
1825
+ "loss": 0.3117,
1826
+ "step": 239
1827
+ },
1828
+ {
1829
+ "epoch": 0.6382978723404256,
1830
+ "grad_norm": 3.175246000289917,
1831
+ "learning_rate": 6.980632752947221e-06,
1832
+ "loss": 0.2632,
1833
+ "step": 240
1834
+ },
1835
+ {
1836
+ "epoch": 0.6382978723404256,
1837
+ "eval_accuracy": 0.8235294117647058,
1838
+ "eval_f1": 0.5161290322580645,
1839
+ "eval_loss": 0.4037013053894043,
1840
+ "eval_precision": 0.7339449541284404,
1841
+ "eval_recall": 0.39800995024875624,
1842
+ "eval_runtime": 34.0215,
1843
+ "eval_samples_per_second": 6.584,
1844
+ "eval_steps_per_second": 0.206,
1845
+ "step": 240
1846
+ },
1847
+ {
1848
+ "epoch": 0.6409574468085106,
1849
+ "grad_norm": 2.5714304447174072,
1850
+ "learning_rate": 6.892155925397437e-06,
1851
+ "loss": 0.2749,
1852
+ "step": 241
1853
+ },
1854
+ {
1855
+ "epoch": 0.6436170212765957,
1856
+ "grad_norm": 3.128525733947754,
1857
+ "learning_rate": 6.803947584339148e-06,
1858
+ "loss": 0.3527,
1859
+ "step": 242
1860
+ },
1861
+ {
1862
+ "epoch": 0.6462765957446809,
1863
+ "grad_norm": 3.6604840755462646,
1864
+ "learning_rate": 6.716015350086449e-06,
1865
+ "loss": 0.2686,
1866
+ "step": 243
1867
+ },
1868
+ {
1869
+ "epoch": 0.648936170212766,
1870
+ "grad_norm": 2.6133296489715576,
1871
+ "learning_rate": 6.628366819100586e-06,
1872
+ "loss": 0.2836,
1873
+ "step": 244
1874
+ },
1875
+ {
1876
+ "epoch": 0.651595744680851,
1877
+ "grad_norm": 2.5161774158477783,
1878
+ "learning_rate": 6.54100956333369e-06,
1879
+ "loss": 0.2395,
1880
+ "step": 245
1881
+ },
1882
+ {
1883
+ "epoch": 0.6542553191489362,
1884
+ "grad_norm": 2.824259042739868,
1885
+ "learning_rate": 6.453951129574644e-06,
1886
+ "loss": 0.2906,
1887
+ "step": 246
1888
+ },
1889
+ {
1890
+ "epoch": 0.6569148936170213,
1891
+ "grad_norm": 2.747422456741333,
1892
+ "learning_rate": 6.3671990387971096e-06,
1893
+ "loss": 0.2368,
1894
+ "step": 247
1895
+ },
1896
+ {
1897
+ "epoch": 0.6595744680851063,
1898
+ "grad_norm": 2.540599822998047,
1899
+ "learning_rate": 6.280760785509802e-06,
1900
+ "loss": 0.3036,
1901
+ "step": 248
1902
+ },
1903
+ {
1904
+ "epoch": 0.6622340425531915,
1905
+ "grad_norm": 2.4649527072906494,
1906
+ "learning_rate": 6.194643837109015e-06,
1907
+ "loss": 0.2935,
1908
+ "step": 249
1909
+ },
1910
+ {
1911
+ "epoch": 0.6648936170212766,
1912
+ "grad_norm": 2.2564632892608643,
1913
+ "learning_rate": 6.108855633233546e-06,
1914
+ "loss": 0.2276,
1915
+ "step": 250
1916
+ },
1917
+ {
1918
+ "epoch": 0.6675531914893617,
1919
+ "grad_norm": 2.5052363872528076,
1920
+ "learning_rate": 6.0234035851219604e-06,
1921
+ "loss": 0.2464,
1922
+ "step": 251
1923
+ },
1924
+ {
1925
+ "epoch": 0.6702127659574468,
1926
+ "grad_norm": 3.091642141342163,
1927
+ "learning_rate": 5.93829507497235e-06,
1928
+ "loss": 0.2766,
1929
+ "step": 252
1930
+ },
1931
+ {
1932
+ "epoch": 0.6728723404255319,
1933
+ "grad_norm": 3.3672595024108887,
1934
+ "learning_rate": 5.853537455304575e-06,
1935
+ "loss": 0.2567,
1936
+ "step": 253
1937
+ },
1938
+ {
1939
+ "epoch": 0.675531914893617,
1940
+ "grad_norm": 2.4779727458953857,
1941
+ "learning_rate": 5.769138048325087e-06,
1942
+ "loss": 0.2628,
1943
+ "step": 254
1944
+ },
1945
+ {
1946
+ "epoch": 0.6781914893617021,
1947
+ "grad_norm": 2.5639469623565674,
1948
+ "learning_rate": 5.685104145294364e-06,
1949
+ "loss": 0.2204,
1950
+ "step": 255
1951
+ },
1952
+ {
1953
+ "epoch": 0.6808510638297872,
1954
+ "grad_norm": 3.3351776599884033,
1955
+ "learning_rate": 5.601443005897012e-06,
1956
+ "loss": 0.2535,
1957
+ "step": 256
1958
+ },
1959
+ {
1960
+ "epoch": 0.6835106382978723,
1961
+ "grad_norm": 2.3642754554748535,
1962
+ "learning_rate": 5.5181618576146e-06,
1963
+ "loss": 0.2234,
1964
+ "step": 257
1965
+ },
1966
+ {
1967
+ "epoch": 0.6861702127659575,
1968
+ "grad_norm": 2.9997129440307617,
1969
+ "learning_rate": 5.435267895101303e-06,
1970
+ "loss": 0.2643,
1971
+ "step": 258
1972
+ },
1973
+ {
1974
+ "epoch": 0.6888297872340425,
1975
+ "grad_norm": 2.4532787799835205,
1976
+ "learning_rate": 5.352768279562315e-06,
1977
+ "loss": 0.2621,
1978
+ "step": 259
1979
+ },
1980
+ {
1981
+ "epoch": 0.6914893617021277,
1982
+ "grad_norm": 2.572538137435913,
1983
+ "learning_rate": 5.270670138135234e-06,
1984
+ "loss": 0.2499,
1985
+ "step": 260
1986
+ },
1987
+ {
1988
+ "epoch": 0.6914893617021277,
1989
+ "eval_accuracy": 0.8247058823529412,
1990
+ "eval_f1": 0.5299684542586751,
1991
+ "eval_loss": 0.3885125517845154,
1992
+ "eval_precision": 0.7241379310344828,
1993
+ "eval_recall": 0.417910447761194,
1994
+ "eval_runtime": 33.8843,
1995
+ "eval_samples_per_second": 6.611,
1996
+ "eval_steps_per_second": 0.207,
1997
+ "step": 260
1998
+ },
1999
+ {
2000
+ "epoch": 0.6941489361702128,
2001
+ "grad_norm": 2.906144618988037,
2002
+ "learning_rate": 5.188980563274315e-06,
2003
+ "loss": 0.3095,
2004
+ "step": 261
2005
+ },
2006
+ {
2007
+ "epoch": 0.6968085106382979,
2008
+ "grad_norm": 2.319133996963501,
2009
+ "learning_rate": 5.107706612137776e-06,
2010
+ "loss": 0.2388,
2011
+ "step": 262
2012
+ },
2013
+ {
2014
+ "epoch": 0.699468085106383,
2015
+ "grad_norm": 3.162642478942871,
2016
+ "learning_rate": 5.026855305978129e-06,
2017
+ "loss": 0.2462,
2018
+ "step": 263
2019
+ },
2020
+ {
2021
+ "epoch": 0.7021276595744681,
2022
+ "grad_norm": 2.749540090560913,
2023
+ "learning_rate": 4.946433629535585e-06,
2024
+ "loss": 0.2659,
2025
+ "step": 264
2026
+ },
2027
+ {
2028
+ "epoch": 0.7047872340425532,
2029
+ "grad_norm": 2.891836643218994,
2030
+ "learning_rate": 4.866448530434692e-06,
2031
+ "loss": 0.2332,
2032
+ "step": 265
2033
+ },
2034
+ {
2035
+ "epoch": 0.7074468085106383,
2036
+ "grad_norm": 2.4717514514923096,
2037
+ "learning_rate": 4.786906918584083e-06,
2038
+ "loss": 0.2136,
2039
+ "step": 266
2040
+ },
2041
+ {
2042
+ "epoch": 0.7101063829787234,
2043
+ "grad_norm": 2.679591655731201,
2044
+ "learning_rate": 4.707815665579569e-06,
2045
+ "loss": 0.3036,
2046
+ "step": 267
2047
+ },
2048
+ {
2049
+ "epoch": 0.7127659574468085,
2050
+ "grad_norm": 2.3344614505767822,
2051
+ "learning_rate": 4.629181604110464e-06,
2052
+ "loss": 0.2853,
2053
+ "step": 268
2054
+ },
2055
+ {
2056
+ "epoch": 0.7154255319148937,
2057
+ "grad_norm": 2.839320182800293,
2058
+ "learning_rate": 4.551011527369348e-06,
2059
+ "loss": 0.2394,
2060
+ "step": 269
2061
+ },
2062
+ {
2063
+ "epoch": 0.7180851063829787,
2064
+ "grad_norm": 2.27245831489563,
2065
+ "learning_rate": 4.4733121884651665e-06,
2066
+ "loss": 0.2496,
2067
+ "step": 270
2068
+ },
2069
+ {
2070
+ "epoch": 0.7207446808510638,
2071
+ "grad_norm": 3.038536548614502,
2072
+ "learning_rate": 4.3960902998398524e-06,
2073
+ "loss": 0.2787,
2074
+ "step": 271
2075
+ },
2076
+ {
2077
+ "epoch": 0.723404255319149,
2078
+ "grad_norm": 3.1204025745391846,
2079
+ "learning_rate": 4.319352532688444e-06,
2080
+ "loss": 0.2678,
2081
+ "step": 272
2082
+ },
2083
+ {
2084
+ "epoch": 0.726063829787234,
2085
+ "grad_norm": 3.8436288833618164,
2086
+ "learning_rate": 4.243105516382732e-06,
2087
+ "loss": 0.2405,
2088
+ "step": 273
2089
+ },
2090
+ {
2091
+ "epoch": 0.7287234042553191,
2092
+ "grad_norm": 3.1559836864471436,
2093
+ "learning_rate": 4.167355837898585e-06,
2094
+ "loss": 0.2881,
2095
+ "step": 274
2096
+ },
2097
+ {
2098
+ "epoch": 0.7313829787234043,
2099
+ "grad_norm": 2.5084681510925293,
2100
+ "learning_rate": 4.092110041246865e-06,
2101
+ "loss": 0.2365,
2102
+ "step": 275
2103
+ },
2104
+ {
2105
+ "epoch": 0.7340425531914894,
2106
+ "grad_norm": 3.0584487915039062,
2107
+ "learning_rate": 4.017374626908125e-06,
2108
+ "loss": 0.2808,
2109
+ "step": 276
2110
+ },
2111
+ {
2112
+ "epoch": 0.7367021276595744,
2113
+ "grad_norm": 3.6234519481658936,
2114
+ "learning_rate": 3.943156051271003e-06,
2115
+ "loss": 0.2993,
2116
+ "step": 277
2117
+ },
2118
+ {
2119
+ "epoch": 0.7393617021276596,
2120
+ "grad_norm": 1.8584307432174683,
2121
+ "learning_rate": 3.8694607260744745e-06,
2122
+ "loss": 0.2012,
2123
+ "step": 278
2124
+ },
2125
+ {
2126
+ "epoch": 0.7420212765957447,
2127
+ "grad_norm": 2.4248085021972656,
2128
+ "learning_rate": 3.7962950178539282e-06,
2129
+ "loss": 0.2352,
2130
+ "step": 279
2131
+ },
2132
+ {
2133
+ "epoch": 0.7446808510638298,
2134
+ "grad_norm": 2.5359675884246826,
2135
+ "learning_rate": 3.7236652473911817e-06,
2136
+ "loss": 0.2121,
2137
+ "step": 280
2138
+ },
2139
+ {
2140
+ "epoch": 0.7446808510638298,
2141
+ "eval_accuracy": 0.8223529411764706,
2142
+ "eval_f1": 0.5175718849840255,
2143
+ "eval_loss": 0.3953240215778351,
2144
+ "eval_precision": 0.7232142857142857,
2145
+ "eval_recall": 0.40298507462686567,
2146
+ "eval_runtime": 34.1139,
2147
+ "eval_samples_per_second": 6.566,
2148
+ "eval_steps_per_second": 0.205,
2149
+ "step": 280
2150
+ },
2151
+ {
2152
+ "epoch": 0.7473404255319149,
2153
+ "grad_norm": 2.3844354152679443,
2154
+ "learning_rate": 3.651577689168405e-06,
2155
+ "loss": 0.2212,
2156
+ "step": 281
2157
+ },
2158
+ {
2159
+ "epoch": 0.75,
2160
+ "grad_norm": 2.8635263442993164,
2161
+ "learning_rate": 3.580038570826093e-06,
2162
+ "loss": 0.2259,
2163
+ "step": 282
2164
+ },
2165
+ {
2166
+ "epoch": 0.7526595744680851,
2167
+ "grad_norm": 3.1672933101654053,
2168
+ "learning_rate": 3.509054072625031e-06,
2169
+ "loss": 0.2691,
2170
+ "step": 283
2171
+ },
2172
+ {
2173
+ "epoch": 0.7553191489361702,
2174
+ "grad_norm": 3.298377752304077,
2175
+ "learning_rate": 3.4386303269124142e-06,
2176
+ "loss": 0.261,
2177
+ "step": 284
2178
+ },
2179
+ {
2180
+ "epoch": 0.7579787234042553,
2181
+ "grad_norm": 3.3718481063842773,
2182
+ "learning_rate": 3.3687734175920505e-06,
2183
+ "loss": 0.2842,
2184
+ "step": 285
2185
+ },
2186
+ {
2187
+ "epoch": 0.7606382978723404,
2188
+ "grad_norm": 2.822702646255493,
2189
+ "learning_rate": 3.299489379598777e-06,
2190
+ "loss": 0.2416,
2191
+ "step": 286
2192
+ },
2193
+ {
2194
+ "epoch": 0.7632978723404256,
2195
+ "grad_norm": 3.209895372390747,
2196
+ "learning_rate": 3.2307841983771182e-06,
2197
+ "loss": 0.2706,
2198
+ "step": 287
2199
+ },
2200
+ {
2201
+ "epoch": 0.7659574468085106,
2202
+ "grad_norm": 2.953824996948242,
2203
+ "learning_rate": 3.162663809364178e-06,
2204
+ "loss": 0.2629,
2205
+ "step": 288
2206
+ },
2207
+ {
2208
+ "epoch": 0.7686170212765957,
2209
+ "grad_norm": 4.190698623657227,
2210
+ "learning_rate": 3.095134097476904e-06,
2211
+ "loss": 0.2609,
2212
+ "step": 289
2213
+ },
2214
+ {
2215
+ "epoch": 0.7712765957446809,
2216
+ "grad_norm": 4.36337423324585,
2217
+ "learning_rate": 3.0282008966036647e-06,
2218
+ "loss": 0.2549,
2219
+ "step": 290
2220
+ },
2221
+ {
2222
+ "epoch": 0.773936170212766,
2223
+ "grad_norm": 2.8681600093841553,
2224
+ "learning_rate": 2.9618699891002843e-06,
2225
+ "loss": 0.2464,
2226
+ "step": 291
2227
+ },
2228
+ {
2229
+ "epoch": 0.776595744680851,
2230
+ "grad_norm": 3.781843900680542,
2231
+ "learning_rate": 2.8961471052904855e-06,
2232
+ "loss": 0.3261,
2233
+ "step": 292
2234
+ },
2235
+ {
2236
+ "epoch": 0.7792553191489362,
2237
+ "grad_norm": 3.1815481185913086,
2238
+ "learning_rate": 2.831037922970855e-06,
2239
+ "loss": 0.2659,
2240
+ "step": 293
2241
+ },
2242
+ {
2243
+ "epoch": 0.7819148936170213,
2244
+ "grad_norm": 3.2825517654418945,
2245
+ "learning_rate": 2.7665480669203383e-06,
2246
+ "loss": 0.2239,
2247
+ "step": 294
2248
+ },
2249
+ {
2250
+ "epoch": 0.7845744680851063,
2251
+ "grad_norm": 2.418006420135498,
2252
+ "learning_rate": 2.702683108414326e-06,
2253
+ "loss": 0.2476,
2254
+ "step": 295
2255
+ },
2256
+ {
2257
+ "epoch": 0.7872340425531915,
2258
+ "grad_norm": 3.483743906021118,
2259
+ "learning_rate": 2.639448564743328e-06,
2260
+ "loss": 0.2306,
2261
+ "step": 296
2262
+ },
2263
+ {
2264
+ "epoch": 0.7898936170212766,
2265
+ "grad_norm": 3.201629638671875,
2266
+ "learning_rate": 2.57684989873636e-06,
2267
+ "loss": 0.2562,
2268
+ "step": 297
2269
+ },
2270
+ {
2271
+ "epoch": 0.7925531914893617,
2272
+ "grad_norm": 2.7855303287506104,
2273
+ "learning_rate": 2.514892518288988e-06,
2274
+ "loss": 0.2245,
2275
+ "step": 298
2276
+ },
2277
+ {
2278
+ "epoch": 0.7952127659574468,
2279
+ "grad_norm": 3.742940664291382,
2280
+ "learning_rate": 2.4535817758961644e-06,
2281
+ "loss": 0.3192,
2282
+ "step": 299
2283
+ },
2284
+ {
2285
+ "epoch": 0.7978723404255319,
2286
+ "grad_norm": 2.966266393661499,
2287
+ "learning_rate": 2.3929229681898005e-06,
2288
+ "loss": 0.2704,
2289
+ "step": 300
2290
+ },
2291
+ {
2292
+ "epoch": 0.7978723404255319,
2293
+ "eval_accuracy": 0.8329411764705882,
2294
+ "eval_f1": 0.5644171779141104,
2295
+ "eval_loss": 0.38487711548805237,
2296
+ "eval_precision": 0.736,
2297
+ "eval_recall": 0.4577114427860697,
2298
+ "eval_runtime": 33.5166,
2299
+ "eval_samples_per_second": 6.683,
2300
+ "eval_steps_per_second": 0.209,
2301
+ "step": 300
2302
  }
  ],
  "logging_steps": 1,
@@ -1558,7 +2318,7 @@
  "attributes": {}
  }
  },
- "total_flos": 6.492182455884186e+16,
+ "total_flos": 9.717414664287027e+16,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null