cotysong113 commited on
Commit
f0db936
·
verified ·
1 Parent(s): 62feee5

Training in progress, step 1590

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. README.md +9 -9
  2. config.json +1 -1
  3. model.safetensors +1 -1
  4. run-1/checkpoint-2862/config.json +1 -1
  5. run-1/checkpoint-2862/model.safetensors +1 -1
  6. run-1/checkpoint-2862/optimizer.pt +2 -2
  7. run-1/checkpoint-2862/rng_state.pth +2 -2
  8. run-1/checkpoint-2862/scheduler.pt +1 -1
  9. run-1/checkpoint-2862/trainer_state.json +60 -60
  10. run-1/checkpoint-2862/training_args.bin +2 -2
  11. run-10/checkpoint-3180/config.json +1 -1
  12. run-10/checkpoint-3180/model.safetensors +1 -1
  13. run-10/checkpoint-3180/optimizer.pt +2 -2
  14. run-10/checkpoint-3180/rng_state.pth +2 -2
  15. run-10/checkpoint-3180/scheduler.pt +1 -1
  16. run-10/checkpoint-3180/trainer_state.json +65 -65
  17. run-10/checkpoint-3180/training_args.bin +2 -2
  18. run-11/checkpoint-3180/config.json +1 -1
  19. run-11/checkpoint-3180/model.safetensors +1 -1
  20. run-11/checkpoint-3180/optimizer.pt +2 -2
  21. run-11/checkpoint-3180/rng_state.pth +2 -2
  22. run-11/checkpoint-3180/scheduler.pt +1 -1
  23. run-11/checkpoint-3180/trainer_state.json +63 -63
  24. run-11/checkpoint-3180/training_args.bin +2 -2
  25. run-12/checkpoint-2862/config.json +1 -1
  26. run-12/checkpoint-2862/model.safetensors +1 -1
  27. run-12/checkpoint-2862/optimizer.pt +2 -2
  28. run-12/checkpoint-2862/rng_state.pth +2 -2
  29. run-12/checkpoint-2862/scheduler.pt +1 -1
  30. run-12/checkpoint-2862/trainer_state.json +58 -58
  31. run-12/checkpoint-2862/training_args.bin +2 -2
  32. run-12/checkpoint-3180/config.json +1 -1
  33. run-12/checkpoint-3180/model.safetensors +1 -1
  34. run-12/checkpoint-3180/optimizer.pt +2 -2
  35. run-12/checkpoint-3180/rng_state.pth +2 -2
  36. run-12/checkpoint-3180/scheduler.pt +1 -1
  37. run-12/checkpoint-3180/trainer_state.json +65 -65
  38. run-12/checkpoint-3180/training_args.bin +2 -2
  39. run-17/checkpoint-2862/config.json +1 -1
  40. run-17/checkpoint-2862/model.safetensors +1 -1
  41. run-17/checkpoint-2862/optimizer.pt +2 -2
  42. run-17/checkpoint-2862/rng_state.pth +2 -2
  43. run-17/checkpoint-2862/scheduler.pt +1 -1
  44. run-17/checkpoint-2862/trainer_state.json +61 -61
  45. run-17/checkpoint-2862/training_args.bin +2 -2
  46. run-2/checkpoint-2544/config.json +1 -1
  47. run-2/checkpoint-2544/model.safetensors +1 -1
  48. run-2/checkpoint-2544/optimizer.pt +2 -2
  49. run-2/checkpoint-2544/rng_state.pth +2 -2
  50. run-2/checkpoint-2544/scheduler.pt +1 -1
README.md CHANGED
@@ -18,8 +18,8 @@ should probably proofread and complete it, then remove this comment. -->
18
 
19
  This model is a fine-tuned version of [distilbert-base-uncased](https://huggingface.co/distilbert-base-uncased) on an unknown dataset.
20
  It achieves the following results on the evaluation set:
21
- - Loss: 0.7825
22
- - Accuracy: 0.9152
23
 
24
  ## Model description
25
 
@@ -50,16 +50,16 @@ The following hyperparameters were used during training:
50
 
51
  | Training Loss | Epoch | Step | Validation Loss | Accuracy |
52
  |:-------------:|:-----:|:----:|:---------------:|:--------:|
53
- | 4.3026 | 1.0 | 318 | 3.2852 | 0.7316 |
54
- | 2.6284 | 2.0 | 636 | 1.8741 | 0.8510 |
55
- | 1.5524 | 3.0 | 954 | 1.1640 | 0.8910 |
56
- | 1.0182 | 4.0 | 1272 | 0.8668 | 0.9106 |
57
- | 0.8045 | 5.0 | 1590 | 0.7825 | 0.9152 |
58
 
59
 
60
  ### Framework versions
61
 
62
- - Transformers 4.47.0
63
- - Pytorch 2.5.1
64
  - Datasets 3.2.0
65
  - Tokenizers 0.21.0
 
18
 
19
  This model is a fine-tuned version of [distilbert-base-uncased](https://huggingface.co/distilbert-base-uncased) on an unknown dataset.
20
  It achieves the following results on the evaluation set:
21
+ - Loss: 0.8083
22
+ - Accuracy: 0.9161
23
 
24
  ## Model description
25
 
 
50
 
51
  | Training Loss | Epoch | Step | Validation Loss | Accuracy |
52
  |:-------------:|:-----:|:----:|:---------------:|:--------:|
53
+ | 4.3211 | 1.0 | 318 | 3.3248 | 0.7210 |
54
+ | 2.6732 | 2.0 | 636 | 1.9187 | 0.8406 |
55
+ | 1.5914 | 3.0 | 954 | 1.1971 | 0.8877 |
56
+ | 1.0497 | 4.0 | 1272 | 0.8947 | 0.9090 |
57
+ | 0.8283 | 5.0 | 1590 | 0.8083 | 0.9161 |
58
 
59
 
60
  ### Framework versions
61
 
62
+ - Transformers 4.47.1
63
+ - Pytorch 2.5.1+cu124
64
  - Datasets 3.2.0
65
  - Tokenizers 0.21.0
config.json CHANGED
@@ -326,6 +326,6 @@
326
  "sinusoidal_pos_embds": false,
327
  "tie_weights_": true,
328
  "torch_dtype": "float32",
329
- "transformers_version": "4.47.0",
330
  "vocab_size": 30522
331
  }
 
326
  "sinusoidal_pos_embds": false,
327
  "tie_weights_": true,
328
  "torch_dtype": "float32",
329
+ "transformers_version": "4.47.1",
330
  "vocab_size": 30522
331
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6dba1231926faf59ba8e66e75038141c586d1a2ad892362af3b9ae8429919f80
3
  size 268290900
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0ae88eff2997cf8baa4f19bc76eed6ee56ba9dca3d564bfe316860bf65c80beb
3
  size 268290900
run-1/checkpoint-2862/config.json CHANGED
@@ -326,6 +326,6 @@
326
  "sinusoidal_pos_embds": false,
327
  "tie_weights_": true,
328
  "torch_dtype": "float32",
329
- "transformers_version": "4.47.0",
330
  "vocab_size": 30522
331
  }
 
326
  "sinusoidal_pos_embds": false,
327
  "tie_weights_": true,
328
  "torch_dtype": "float32",
329
+ "transformers_version": "4.47.1",
330
  "vocab_size": 30522
331
  }
run-1/checkpoint-2862/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9485849fc0f0619157aa7247a4b8c39d8dfc23d577ec0b7a1bea0cf750df421f
3
  size 268290900
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd49d9fed4d63ff70ccf669cdd06027bd63c93903a136a53cb3b5045948a815c
3
  size 268290900
run-1/checkpoint-2862/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:585c9dc0ed78d00cdde852275b7d2821654d6ad9b4d8ec79438d49ec18d9dfcc
3
- size 536641018
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9070031d06060d90440e64a4923632ac3124825b3b707313668acd353160cdbd
3
+ size 536643898
run-1/checkpoint-2862/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e29353d4954fcf585947caa6a0a3fa59c7379d49f3bc56c6a4bd276fef18ca98
3
- size 13990
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:41b5282f1cde964bf5a382eb03f1cdd87f1c8e2f60e43277b3453f63947f5933
3
+ size 14244
run-1/checkpoint-2862/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3c64bfaed33db311081ee0596530a03699ac7659f0cf4f57a488fea34df98816
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2a738824491ff8b261b6db2a842e15bf80355d04a07892a16df2c4715b2ee556
3
  size 1064
run-1/checkpoint-2862/trainer_state.json CHANGED
@@ -10,137 +10,137 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.9968553459119497,
13
- "grad_norm": 0.6216608881950378,
14
  "learning_rate": 1.7784765897973445e-05,
15
- "loss": 0.3805,
16
  "step": 317
17
  },
18
  {
19
  "epoch": 1.0,
20
- "eval_accuracy": 0.5790322580645161,
21
- "eval_loss": 0.18826216459274292,
22
- "eval_runtime": 2.6452,
23
- "eval_samples_per_second": 1171.927,
24
- "eval_steps_per_second": 24.573,
25
  "step": 318
26
  },
27
  {
28
  "epoch": 1.9937106918238994,
29
- "grad_norm": 0.5106726288795471,
30
  "learning_rate": 1.556953179594689e-05,
31
- "loss": 0.1584,
32
  "step": 634
33
  },
34
  {
35
  "epoch": 2.0,
36
- "eval_accuracy": 0.8203225806451613,
37
- "eval_loss": 0.0928473100066185,
38
- "eval_runtime": 2.7805,
39
- "eval_samples_per_second": 1114.911,
40
- "eval_steps_per_second": 23.377,
41
  "step": 636
42
  },
43
  {
44
  "epoch": 2.990566037735849,
45
- "grad_norm": 0.4919030964374542,
46
  "learning_rate": 1.3354297693920338e-05,
47
- "loss": 0.1001,
48
  "step": 951
49
  },
50
  {
51
  "epoch": 3.0,
52
- "eval_accuracy": 0.8787096774193548,
53
- "eval_loss": 0.06263605505228043,
54
- "eval_runtime": 2.6683,
55
- "eval_samples_per_second": 1161.795,
56
- "eval_steps_per_second": 24.36,
57
  "step": 954
58
  },
59
  {
60
  "epoch": 3.9874213836477987,
61
- "grad_norm": 0.3679712116718292,
62
  "learning_rate": 1.1139063591893781e-05,
63
- "loss": 0.0754,
64
  "step": 1268
65
  },
66
  {
67
  "epoch": 4.0,
68
- "eval_accuracy": 0.8974193548387097,
69
- "eval_loss": 0.048369407653808594,
70
- "eval_runtime": 2.749,
71
- "eval_samples_per_second": 1127.668,
72
- "eval_steps_per_second": 23.645,
73
  "step": 1272
74
  },
75
  {
76
  "epoch": 4.984276729559748,
77
- "grad_norm": 0.27457061409950256,
78
  "learning_rate": 8.923829489867226e-06,
79
- "loss": 0.0627,
80
  "step": 1585
81
  },
82
  {
83
  "epoch": 5.0,
84
- "eval_accuracy": 0.9064516129032258,
85
- "eval_loss": 0.04010023921728134,
86
- "eval_runtime": 2.7621,
87
- "eval_samples_per_second": 1122.344,
88
- "eval_steps_per_second": 23.533,
89
  "step": 1590
90
  },
91
  {
92
  "epoch": 5.981132075471698,
93
- "grad_norm": 0.29231536388397217,
94
  "learning_rate": 6.708595387840672e-06,
95
- "loss": 0.0551,
96
  "step": 1902
97
  },
98
  {
99
  "epoch": 6.0,
100
- "eval_accuracy": 0.9119354838709678,
101
- "eval_loss": 0.035356562584638596,
102
- "eval_runtime": 3.1487,
103
- "eval_samples_per_second": 984.536,
104
- "eval_steps_per_second": 20.643,
105
  "step": 1908
106
  },
107
  {
108
  "epoch": 6.977987421383648,
109
- "grad_norm": 0.30922219157218933,
110
  "learning_rate": 4.4933612858141165e-06,
111
- "loss": 0.0503,
112
  "step": 2219
113
  },
114
  {
115
  "epoch": 7.0,
116
- "eval_accuracy": 0.9180645161290323,
117
- "eval_loss": 0.03271542862057686,
118
- "eval_runtime": 3.1361,
119
- "eval_samples_per_second": 988.49,
120
- "eval_steps_per_second": 20.726,
121
  "step": 2226
122
  },
123
  {
124
  "epoch": 7.9748427672955975,
125
- "grad_norm": 0.27570685744285583,
126
  "learning_rate": 2.2781271837875614e-06,
127
- "loss": 0.0474,
128
  "step": 2536
129
  },
130
  {
131
  "epoch": 8.0,
132
- "eval_accuracy": 0.9203225806451613,
133
- "eval_loss": 0.031012877821922302,
134
- "eval_runtime": 2.9764,
135
- "eval_samples_per_second": 1041.529,
136
- "eval_steps_per_second": 21.839,
137
  "step": 2544
138
  },
139
  {
140
  "epoch": 8.971698113207546,
141
- "grad_norm": 0.40512949228286743,
142
  "learning_rate": 6.289308176100629e-08,
143
- "loss": 0.046,
144
  "step": 2853
145
  }
146
  ],
@@ -165,8 +165,8 @@
165
  "train_batch_size": 48,
166
  "trial_name": null,
167
  "trial_params": {
168
- "alpha": 0.006547213182126477,
169
  "num_train_epochs": 9,
170
- "temperature": 20
171
  }
172
  }
 
10
  "log_history": [
11
  {
12
  "epoch": 0.9968553459119497,
13
+ "grad_norm": 0.6337465047836304,
14
  "learning_rate": 1.7784765897973445e-05,
15
+ "loss": 0.3867,
16
  "step": 317
17
  },
18
  {
19
  "epoch": 1.0,
20
+ "eval_accuracy": 0.5703225806451613,
21
+ "eval_loss": 0.19408121705055237,
22
+ "eval_runtime": 1.3792,
23
+ "eval_samples_per_second": 2247.653,
24
+ "eval_steps_per_second": 47.128,
25
  "step": 318
26
  },
27
  {
28
  "epoch": 1.9937106918238994,
29
+ "grad_norm": 0.5281593799591064,
30
  "learning_rate": 1.556953179594689e-05,
31
+ "loss": 0.1621,
32
  "step": 634
33
  },
34
  {
35
  "epoch": 2.0,
36
+ "eval_accuracy": 0.8274193548387097,
37
+ "eval_loss": 0.0953042134642601,
38
+ "eval_runtime": 1.5805,
39
+ "eval_samples_per_second": 1961.446,
40
+ "eval_steps_per_second": 41.127,
41
  "step": 636
42
  },
43
  {
44
  "epoch": 2.990566037735849,
45
+ "grad_norm": 0.4498799741268158,
46
  "learning_rate": 1.3354297693920338e-05,
47
+ "loss": 0.1018,
48
  "step": 951
49
  },
50
  {
51
  "epoch": 3.0,
52
+ "eval_accuracy": 0.8806451612903226,
53
+ "eval_loss": 0.06378939002752304,
54
+ "eval_runtime": 1.5977,
55
+ "eval_samples_per_second": 1940.328,
56
+ "eval_steps_per_second": 40.684,
57
  "step": 954
58
  },
59
  {
60
  "epoch": 3.9874213836477987,
61
+ "grad_norm": 0.3946130573749542,
62
  "learning_rate": 1.1139063591893781e-05,
63
+ "loss": 0.0769,
64
  "step": 1268
65
  },
66
  {
67
  "epoch": 4.0,
68
+ "eval_accuracy": 0.8980645161290323,
69
+ "eval_loss": 0.04853447526693344,
70
+ "eval_runtime": 1.6017,
71
+ "eval_samples_per_second": 1935.5,
72
+ "eval_steps_per_second": 40.583,
73
  "step": 1272
74
  },
75
  {
76
  "epoch": 4.984276729559748,
77
+ "grad_norm": 0.2938636541366577,
78
  "learning_rate": 8.923829489867226e-06,
79
+ "loss": 0.0634,
80
  "step": 1585
81
  },
82
  {
83
  "epoch": 5.0,
84
+ "eval_accuracy": 0.9067741935483871,
85
+ "eval_loss": 0.04064928740262985,
86
+ "eval_runtime": 1.4207,
87
+ "eval_samples_per_second": 2182.039,
88
+ "eval_steps_per_second": 45.752,
89
  "step": 1590
90
  },
91
  {
92
  "epoch": 5.981132075471698,
93
+ "grad_norm": 0.3579629063606262,
94
  "learning_rate": 6.708595387840672e-06,
95
+ "loss": 0.0555,
96
  "step": 1902
97
  },
98
  {
99
  "epoch": 6.0,
100
+ "eval_accuracy": 0.9135483870967742,
101
+ "eval_loss": 0.03552273288369179,
102
+ "eval_runtime": 1.617,
103
+ "eval_samples_per_second": 1917.16,
104
+ "eval_steps_per_second": 40.199,
105
  "step": 1908
106
  },
107
  {
108
  "epoch": 6.977987421383648,
109
+ "grad_norm": 0.2372261881828308,
110
  "learning_rate": 4.4933612858141165e-06,
111
+ "loss": 0.0508,
112
  "step": 2219
113
  },
114
  {
115
  "epoch": 7.0,
116
+ "eval_accuracy": 0.9193548387096774,
117
+ "eval_loss": 0.033018212765455246,
118
+ "eval_runtime": 1.8153,
119
+ "eval_samples_per_second": 1707.728,
120
+ "eval_steps_per_second": 35.807,
121
  "step": 2226
122
  },
123
  {
124
  "epoch": 7.9748427672955975,
125
+ "grad_norm": 0.26516783237457275,
126
  "learning_rate": 2.2781271837875614e-06,
127
+ "loss": 0.0479,
128
  "step": 2536
129
  },
130
  {
131
  "epoch": 8.0,
132
+ "eval_accuracy": 0.9219354838709677,
133
+ "eval_loss": 0.03124028816819191,
134
+ "eval_runtime": 1.622,
135
+ "eval_samples_per_second": 1911.172,
136
+ "eval_steps_per_second": 40.073,
137
  "step": 2544
138
  },
139
  {
140
  "epoch": 8.971698113207546,
141
+ "grad_norm": 0.26064929366111755,
142
  "learning_rate": 6.289308176100629e-08,
143
+ "loss": 0.0462,
144
  "step": 2853
145
  }
146
  ],
 
165
  "train_batch_size": 48,
166
  "trial_name": null,
167
  "trial_params": {
168
+ "alpha": 0.26432685644138476,
169
  "num_train_epochs": 9,
170
+ "temperature": 15
171
  }
172
  }
run-1/checkpoint-2862/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8b47f481f10cd643596d93df6439f6f373ebc1e96283a4fb4c2f3edf61ab3458
3
- size 5432
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a4cd3bd3d7b199838f7cc7ce3902cc5b5197f609f99417e50920c79fb92f710
3
+ size 5368
run-10/checkpoint-3180/config.json CHANGED
@@ -326,6 +326,6 @@
326
  "sinusoidal_pos_embds": false,
327
  "tie_weights_": true,
328
  "torch_dtype": "float32",
329
- "transformers_version": "4.47.0",
330
  "vocab_size": 30522
331
  }
 
326
  "sinusoidal_pos_embds": false,
327
  "tie_weights_": true,
328
  "torch_dtype": "float32",
329
+ "transformers_version": "4.47.1",
330
  "vocab_size": 30522
331
  }
run-10/checkpoint-3180/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a1eae9dab05be60513a1795635af24cccfa3fdfdead0509ebf734e5c1958177e
3
  size 268290900
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c11c8d8fd5dab07dc0da9a774d4ac9892b28391145c860a4321a6d78b2468e5
3
  size 268290900
run-10/checkpoint-3180/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b4b7f84d1c4f025034ce217fd706647c2f0fc4969a47e7cde0547efba8570105
3
- size 536641018
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf56a6a19086ea10089ac80d4b475b67fd4477ea849dc7ec47b9c3add6cf41d2
3
+ size 536643898
run-10/checkpoint-3180/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2fbdf6070082bd7f064003b0a80093bbcf8031eab17e8484cd03c2f330dae634
3
- size 13990
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4929abc25ddcb4d4986245c01bee45e03155019fd32282d1467b43fbdcdaed02
3
+ size 14244
run-10/checkpoint-3180/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7377b4a081dc59fc9c5a604a8fe62d7aa1f698f549e73e5c90d42a32e2a0d766
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:33fadc11bb5c6bd0a5ea603f5e48cedcd72384fa2714656ecf6f8da629f7ae05
3
  size 1064
run-10/checkpoint-3180/trainer_state.json CHANGED
@@ -10,153 +10,153 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.9968553459119497,
13
- "grad_norm": 0.8045943975448608,
14
  "learning_rate": 1.8006289308176103e-05,
15
- "loss": 0.8034,
16
  "step": 317
17
  },
18
  {
19
  "epoch": 1.0,
20
- "eval_accuracy": 0.6796774193548387,
21
- "eval_loss": 0.4008762240409851,
22
- "eval_runtime": 2.8369,
23
- "eval_samples_per_second": 1092.731,
24
- "eval_steps_per_second": 22.912,
25
  "step": 318
26
  },
27
  {
28
  "epoch": 1.9937106918238994,
29
- "grad_norm": 0.7318870425224304,
30
  "learning_rate": 1.6012578616352204e-05,
31
- "loss": 0.3002,
32
  "step": 634
33
  },
34
  {
35
  "epoch": 2.0,
36
- "eval_accuracy": 0.8464516129032258,
37
- "eval_loss": 0.13422441482543945,
38
- "eval_runtime": 2.9793,
39
- "eval_samples_per_second": 1040.529,
40
- "eval_steps_per_second": 21.818,
41
  "step": 636
42
  },
43
  {
44
  "epoch": 2.990566037735849,
45
- "grad_norm": 0.8440456986427307,
46
  "learning_rate": 1.4018867924528304e-05,
47
- "loss": 0.1385,
48
  "step": 951
49
  },
50
  {
51
  "epoch": 3.0,
52
- "eval_accuracy": 0.9006451612903226,
53
- "eval_loss": 0.06945940852165222,
54
- "eval_runtime": 3.7653,
55
- "eval_samples_per_second": 823.311,
56
- "eval_steps_per_second": 17.263,
57
  "step": 954
58
  },
59
  {
60
  "epoch": 3.9874213836477987,
61
- "grad_norm": 0.5139324069023132,
62
  "learning_rate": 1.2025157232704403e-05,
63
- "loss": 0.0883,
64
  "step": 1268
65
  },
66
  {
67
  "epoch": 4.0,
68
- "eval_accuracy": 0.9164516129032259,
69
- "eval_loss": 0.04914101958274841,
70
- "eval_runtime": 3.6075,
71
- "eval_samples_per_second": 859.331,
72
- "eval_steps_per_second": 18.018,
73
  "step": 1272
74
  },
75
  {
76
  "epoch": 4.984276729559748,
77
- "grad_norm": 0.36288055777549744,
78
  "learning_rate": 1.0031446540880504e-05,
79
  "loss": 0.0692,
80
  "step": 1585
81
  },
82
  {
83
  "epoch": 5.0,
84
- "eval_accuracy": 0.925483870967742,
85
- "eval_loss": 0.04096318036317825,
86
- "eval_runtime": 3.6209,
87
- "eval_samples_per_second": 856.144,
88
- "eval_steps_per_second": 17.951,
89
  "step": 1590
90
  },
91
  {
92
  "epoch": 5.981132075471698,
93
- "grad_norm": 0.3991723954677582,
94
  "learning_rate": 8.037735849056606e-06,
95
- "loss": 0.0594,
96
  "step": 1902
97
  },
98
  {
99
  "epoch": 6.0,
100
- "eval_accuracy": 0.9274193548387096,
101
- "eval_loss": 0.03642675653100014,
102
- "eval_runtime": 3.4564,
103
- "eval_samples_per_second": 896.887,
104
- "eval_steps_per_second": 18.806,
105
  "step": 1908
106
  },
107
  {
108
  "epoch": 6.977987421383648,
109
- "grad_norm": 0.4134936034679413,
110
  "learning_rate": 6.044025157232704e-06,
111
- "loss": 0.0537,
112
  "step": 2219
113
  },
114
  {
115
  "epoch": 7.0,
116
- "eval_accuracy": 0.932258064516129,
117
- "eval_loss": 0.03399330750107765,
118
- "eval_runtime": 3.4401,
119
- "eval_samples_per_second": 901.146,
120
- "eval_steps_per_second": 18.895,
121
  "step": 2226
122
  },
123
  {
124
  "epoch": 7.9748427672955975,
125
- "grad_norm": 0.4197172522544861,
126
  "learning_rate": 4.0503144654088055e-06,
127
- "loss": 0.0502,
128
  "step": 2536
129
  },
130
  {
131
  "epoch": 8.0,
132
- "eval_accuracy": 0.9316129032258065,
133
- "eval_loss": 0.03201091289520264,
134
- "eval_runtime": 3.284,
135
- "eval_samples_per_second": 943.982,
136
- "eval_steps_per_second": 19.793,
137
  "step": 2544
138
  },
139
  {
140
  "epoch": 8.971698113207546,
141
- "grad_norm": 0.7050578594207764,
142
  "learning_rate": 2.056603773584906e-06,
143
  "loss": 0.0479,
144
  "step": 2853
145
  },
146
  {
147
  "epoch": 9.0,
148
- "eval_accuracy": 0.9354838709677419,
149
- "eval_loss": 0.030981773510575294,
150
- "eval_runtime": 3.2488,
151
- "eval_samples_per_second": 954.185,
152
- "eval_steps_per_second": 20.007,
153
  "step": 2862
154
  },
155
  {
156
  "epoch": 9.968553459119496,
157
- "grad_norm": 0.3551309406757355,
158
  "learning_rate": 6.289308176100629e-08,
159
- "loss": 0.0469,
160
  "step": 3170
161
  }
162
  ],
@@ -177,11 +177,11 @@
177
  "attributes": {}
178
  }
179
  },
180
- "total_flos": 825404033099184.0,
181
  "train_batch_size": 48,
182
  "trial_name": null,
183
  "trial_params": {
184
- "alpha": 0.8684692121006535,
185
  "num_train_epochs": 10,
186
  "temperature": 2
187
  }
 
10
  "log_history": [
11
  {
12
  "epoch": 0.9968553459119497,
13
+ "grad_norm": 0.9062672853469849,
14
  "learning_rate": 1.8006289308176103e-05,
15
+ "loss": 0.8042,
16
  "step": 317
17
  },
18
  {
19
  "epoch": 1.0,
20
+ "eval_accuracy": 0.6709677419354839,
21
+ "eval_loss": 0.4064599573612213,
22
+ "eval_runtime": 1.4275,
23
+ "eval_samples_per_second": 2171.572,
24
+ "eval_steps_per_second": 45.533,
25
  "step": 318
26
  },
27
  {
28
  "epoch": 1.9937106918238994,
29
+ "grad_norm": 0.7205497026443481,
30
  "learning_rate": 1.6012578616352204e-05,
31
+ "loss": 0.3038,
32
  "step": 634
33
  },
34
  {
35
  "epoch": 2.0,
36
+ "eval_accuracy": 0.847741935483871,
37
+ "eval_loss": 0.1362968385219574,
38
+ "eval_runtime": 1.6105,
39
+ "eval_samples_per_second": 1924.868,
40
+ "eval_steps_per_second": 40.36,
41
  "step": 636
42
  },
43
  {
44
  "epoch": 2.990566037735849,
45
+ "grad_norm": 0.5987477898597717,
46
  "learning_rate": 1.4018867924528304e-05,
47
+ "loss": 0.1395,
48
  "step": 951
49
  },
50
  {
51
  "epoch": 3.0,
52
+ "eval_accuracy": 0.8990322580645161,
53
+ "eval_loss": 0.07024983316659927,
54
+ "eval_runtime": 1.6105,
55
+ "eval_samples_per_second": 1924.879,
56
+ "eval_steps_per_second": 40.36,
57
  "step": 954
58
  },
59
  {
60
  "epoch": 3.9874213836477987,
61
+ "grad_norm": 0.5540674924850464,
62
  "learning_rate": 1.2025157232704403e-05,
63
+ "loss": 0.0891,
64
  "step": 1268
65
  },
66
  {
67
  "epoch": 4.0,
68
+ "eval_accuracy": 0.9187096774193548,
69
+ "eval_loss": 0.04933710768818855,
70
+ "eval_runtime": 1.7991,
71
+ "eval_samples_per_second": 1723.062,
72
+ "eval_steps_per_second": 36.129,
73
  "step": 1272
74
  },
75
  {
76
  "epoch": 4.984276729559748,
77
+ "grad_norm": 0.42864474654197693,
78
  "learning_rate": 1.0031446540880504e-05,
79
  "loss": 0.0692,
80
  "step": 1585
81
  },
82
  {
83
  "epoch": 5.0,
84
+ "eval_accuracy": 0.9241935483870968,
85
+ "eval_loss": 0.04158218950033188,
86
+ "eval_runtime": 1.6087,
87
+ "eval_samples_per_second": 1927.079,
88
+ "eval_steps_per_second": 40.406,
89
  "step": 1590
90
  },
91
  {
92
  "epoch": 5.981132075471698,
93
+ "grad_norm": 0.4545074999332428,
94
  "learning_rate": 8.037735849056606e-06,
95
+ "loss": 0.0595,
96
  "step": 1902
97
  },
98
  {
99
  "epoch": 6.0,
100
+ "eval_accuracy": 0.9270967741935484,
101
+ "eval_loss": 0.03682653605937958,
102
+ "eval_runtime": 1.4287,
103
+ "eval_samples_per_second": 2169.74,
104
+ "eval_steps_per_second": 45.495,
105
  "step": 1908
106
  },
107
  {
108
  "epoch": 6.977987421383648,
109
+ "grad_norm": 0.34796932339668274,
110
  "learning_rate": 6.044025157232704e-06,
111
+ "loss": 0.0538,
112
  "step": 2219
113
  },
114
  {
115
  "epoch": 7.0,
116
+ "eval_accuracy": 0.9316129032258065,
117
+ "eval_loss": 0.03404370695352554,
118
+ "eval_runtime": 1.6077,
119
+ "eval_samples_per_second": 1928.227,
120
+ "eval_steps_per_second": 40.431,
121
  "step": 2226
122
  },
123
  {
124
  "epoch": 7.9748427672955975,
125
+ "grad_norm": 0.3150351941585541,
126
  "learning_rate": 4.0503144654088055e-06,
127
+ "loss": 0.0503,
128
  "step": 2536
129
  },
130
  {
131
  "epoch": 8.0,
132
+ "eval_accuracy": 0.9341935483870968,
133
+ "eval_loss": 0.03234480321407318,
134
+ "eval_runtime": 1.4217,
135
+ "eval_samples_per_second": 2180.525,
136
+ "eval_steps_per_second": 45.721,
137
  "step": 2544
138
  },
139
  {
140
  "epoch": 8.971698113207546,
141
+ "grad_norm": 0.3166097104549408,
142
  "learning_rate": 2.056603773584906e-06,
143
  "loss": 0.0479,
144
  "step": 2853
145
  },
146
  {
147
  "epoch": 9.0,
148
+ "eval_accuracy": 0.9341935483870968,
149
+ "eval_loss": 0.03134315088391304,
150
+ "eval_runtime": 1.6077,
151
+ "eval_samples_per_second": 1928.261,
152
+ "eval_steps_per_second": 40.431,
153
  "step": 2862
154
  },
155
  {
156
  "epoch": 9.968553459119496,
157
+ "grad_norm": 0.27577438950538635,
158
  "learning_rate": 6.289308176100629e-08,
159
+ "loss": 0.0467,
160
  "step": 3170
161
  }
162
  ],
 
177
  "attributes": {}
178
  }
179
  },
180
+ "total_flos": 827333546055996.0,
181
  "train_batch_size": 48,
182
  "trial_name": null,
183
  "trial_params": {
184
+ "alpha": 0.5781383032678951,
185
  "num_train_epochs": 10,
186
  "temperature": 2
187
  }
run-10/checkpoint-3180/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:31136b152d8bec823af46e8f97f2f8bbe5859cef952eba4c6121a11c30bcfdf6
3
- size 5432
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d588f08c0e85333fa2a4adcf5ec378e9adea9df69c799ff80677ef0f82a3e48a
3
+ size 5368
run-11/checkpoint-3180/config.json CHANGED
@@ -326,6 +326,6 @@
326
  "sinusoidal_pos_embds": false,
327
  "tie_weights_": true,
328
  "torch_dtype": "float32",
329
- "transformers_version": "4.47.0",
330
  "vocab_size": 30522
331
  }
 
326
  "sinusoidal_pos_embds": false,
327
  "tie_weights_": true,
328
  "torch_dtype": "float32",
329
+ "transformers_version": "4.47.1",
330
  "vocab_size": 30522
331
  }
run-11/checkpoint-3180/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b7b4592ac0c8c2621aa0bf6cc48358a9a7733b828ad086371bcfd4495a46541b
3
  size 268290900
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c11c8d8fd5dab07dc0da9a774d4ac9892b28391145c860a4321a6d78b2468e5
3
  size 268290900
run-11/checkpoint-3180/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1bfa0ebb9d3c136bf9e300c370d03a101413666475a585e27fe5bfe859b7caae
3
- size 536641018
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf56a6a19086ea10089ac80d4b475b67fd4477ea849dc7ec47b9c3add6cf41d2
3
+ size 536643898
run-11/checkpoint-3180/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2fbdf6070082bd7f064003b0a80093bbcf8031eab17e8484cd03c2f330dae634
3
- size 13990
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4929abc25ddcb4d4986245c01bee45e03155019fd32282d1467b43fbdcdaed02
3
+ size 14244
run-11/checkpoint-3180/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7377b4a081dc59fc9c5a604a8fe62d7aa1f698f549e73e5c90d42a32e2a0d766
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:33fadc11bb5c6bd0a5ea603f5e48cedcd72384fa2714656ecf6f8da629f7ae05
3
  size 1064
run-11/checkpoint-3180/trainer_state.json CHANGED
@@ -10,153 +10,153 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.9968553459119497,
13
- "grad_norm": 0.8055088520050049,
14
  "learning_rate": 1.8006289308176103e-05,
15
- "loss": 0.8034,
16
  "step": 317
17
  },
18
  {
19
  "epoch": 1.0,
20
- "eval_accuracy": 0.6793548387096774,
21
- "eval_loss": 0.4008634388446808,
22
- "eval_runtime": 2.7556,
23
- "eval_samples_per_second": 1124.967,
24
- "eval_steps_per_second": 23.588,
25
  "step": 318
26
  },
27
  {
28
  "epoch": 1.9937106918238994,
29
- "grad_norm": 0.731418788433075,
30
  "learning_rate": 1.6012578616352204e-05,
31
- "loss": 0.3002,
32
  "step": 634
33
  },
34
  {
35
  "epoch": 2.0,
36
- "eval_accuracy": 0.8464516129032258,
37
- "eval_loss": 0.1342196762561798,
38
- "eval_runtime": 2.8872,
39
- "eval_samples_per_second": 1073.695,
40
- "eval_steps_per_second": 22.513,
41
  "step": 636
42
  },
43
  {
44
  "epoch": 2.990566037735849,
45
- "grad_norm": 0.8417670726776123,
46
  "learning_rate": 1.4018867924528304e-05,
47
- "loss": 0.1385,
48
  "step": 951
49
  },
50
  {
51
  "epoch": 3.0,
52
- "eval_accuracy": 0.9006451612903226,
53
- "eval_loss": 0.06945406645536423,
54
- "eval_runtime": 2.7998,
55
- "eval_samples_per_second": 1107.216,
56
- "eval_steps_per_second": 23.216,
57
  "step": 954
58
  },
59
  {
60
  "epoch": 3.9874213836477987,
61
- "grad_norm": 0.5138720273971558,
62
  "learning_rate": 1.2025157232704403e-05,
63
- "loss": 0.0883,
64
  "step": 1268
65
  },
66
  {
67
  "epoch": 4.0,
68
- "eval_accuracy": 0.9167741935483871,
69
- "eval_loss": 0.049133624881505966,
70
- "eval_runtime": 3.0876,
71
- "eval_samples_per_second": 1004.012,
72
- "eval_steps_per_second": 21.052,
73
  "step": 1272
74
  },
75
  {
76
  "epoch": 4.984276729559748,
77
- "grad_norm": 0.3620692193508148,
78
  "learning_rate": 1.0031446540880504e-05,
79
  "loss": 0.0692,
80
  "step": 1585
81
  },
82
  {
83
  "epoch": 5.0,
84
- "eval_accuracy": 0.925483870967742,
85
- "eval_loss": 0.04096338152885437,
86
- "eval_runtime": 3.4962,
87
- "eval_samples_per_second": 886.683,
88
- "eval_steps_per_second": 18.592,
89
  "step": 1590
90
  },
91
  {
92
  "epoch": 5.981132075471698,
93
- "grad_norm": 0.3995964229106903,
94
  "learning_rate": 8.037735849056606e-06,
95
- "loss": 0.0594,
96
  "step": 1902
97
  },
98
  {
99
  "epoch": 6.0,
100
  "eval_accuracy": 0.9270967741935484,
101
- "eval_loss": 0.03643520548939705,
102
- "eval_runtime": 4.6927,
103
- "eval_samples_per_second": 660.602,
104
- "eval_steps_per_second": 13.851,
105
  "step": 1908
106
  },
107
  {
108
  "epoch": 6.977987421383648,
109
- "grad_norm": 0.4138696789741516,
110
  "learning_rate": 6.044025157232704e-06,
111
- "loss": 0.0537,
112
  "step": 2219
113
  },
114
  {
115
  "epoch": 7.0,
116
- "eval_accuracy": 0.932258064516129,
117
- "eval_loss": 0.033999357372522354,
118
- "eval_runtime": 3.3936,
119
- "eval_samples_per_second": 913.496,
120
- "eval_steps_per_second": 19.154,
121
  "step": 2226
122
  },
123
  {
124
  "epoch": 7.9748427672955975,
125
- "grad_norm": 0.4184480905532837,
126
  "learning_rate": 4.0503144654088055e-06,
127
- "loss": 0.0502,
128
  "step": 2536
129
  },
130
  {
131
  "epoch": 8.0,
132
- "eval_accuracy": 0.9316129032258065,
133
- "eval_loss": 0.03201307728886604,
134
- "eval_runtime": 3.2498,
135
- "eval_samples_per_second": 953.913,
136
- "eval_steps_per_second": 20.001,
137
  "step": 2544
138
  },
139
  {
140
  "epoch": 8.971698113207546,
141
- "grad_norm": 0.7048718929290771,
142
  "learning_rate": 2.056603773584906e-06,
143
  "loss": 0.0479,
144
  "step": 2853
145
  },
146
  {
147
  "epoch": 9.0,
148
- "eval_accuracy": 0.9358064516129032,
149
- "eval_loss": 0.030989257618784904,
150
- "eval_runtime": 3.1686,
151
- "eval_samples_per_second": 978.342,
152
- "eval_steps_per_second": 20.514,
153
  "step": 2862
154
  },
155
  {
156
  "epoch": 9.968553459119496,
157
- "grad_norm": 0.35616278648376465,
158
  "learning_rate": 6.289308176100629e-08,
159
- "loss": 0.0469,
160
  "step": 3170
161
  }
162
  ],
@@ -181,7 +181,7 @@
181
  "train_batch_size": 48,
182
  "trial_name": null,
183
  "trial_params": {
184
- "alpha": 0.8773153648687787,
185
  "num_train_epochs": 10,
186
  "temperature": 2
187
  }
 
10
  "log_history": [
11
  {
12
  "epoch": 0.9968553459119497,
13
+ "grad_norm": 0.9062672853469849,
14
  "learning_rate": 1.8006289308176103e-05,
15
+ "loss": 0.8042,
16
  "step": 317
17
  },
18
  {
19
  "epoch": 1.0,
20
+ "eval_accuracy": 0.6709677419354839,
21
+ "eval_loss": 0.4064599573612213,
22
+ "eval_runtime": 1.3737,
23
+ "eval_samples_per_second": 2256.722,
24
+ "eval_steps_per_second": 47.318,
25
  "step": 318
26
  },
27
  {
28
  "epoch": 1.9937106918238994,
29
+ "grad_norm": 0.7205497026443481,
30
  "learning_rate": 1.6012578616352204e-05,
31
+ "loss": 0.3038,
32
  "step": 634
33
  },
34
  {
35
  "epoch": 2.0,
36
+ "eval_accuracy": 0.847741935483871,
37
+ "eval_loss": 0.1362968385219574,
38
+ "eval_runtime": 1.392,
39
+ "eval_samples_per_second": 2227.027,
40
+ "eval_steps_per_second": 46.696,
41
  "step": 636
42
  },
43
  {
44
  "epoch": 2.990566037735849,
45
+ "grad_norm": 0.5987477898597717,
46
  "learning_rate": 1.4018867924528304e-05,
47
+ "loss": 0.1395,
48
  "step": 951
49
  },
50
  {
51
  "epoch": 3.0,
52
+ "eval_accuracy": 0.8990322580645161,
53
+ "eval_loss": 0.07024983316659927,
54
+ "eval_runtime": 1.4088,
55
+ "eval_samples_per_second": 2200.388,
56
+ "eval_steps_per_second": 46.137,
57
  "step": 954
58
  },
59
  {
60
  "epoch": 3.9874213836477987,
61
+ "grad_norm": 0.5540674924850464,
62
  "learning_rate": 1.2025157232704403e-05,
63
+ "loss": 0.0891,
64
  "step": 1268
65
  },
66
  {
67
  "epoch": 4.0,
68
+ "eval_accuracy": 0.9187096774193548,
69
+ "eval_loss": 0.04933710768818855,
70
+ "eval_runtime": 1.6037,
71
+ "eval_samples_per_second": 1932.992,
72
+ "eval_steps_per_second": 40.53,
73
  "step": 1272
74
  },
75
  {
76
  "epoch": 4.984276729559748,
77
+ "grad_norm": 0.42864474654197693,
78
  "learning_rate": 1.0031446540880504e-05,
79
  "loss": 0.0692,
80
  "step": 1585
81
  },
82
  {
83
  "epoch": 5.0,
84
+ "eval_accuracy": 0.9241935483870968,
85
+ "eval_loss": 0.04158218950033188,
86
+ "eval_runtime": 1.4224,
87
+ "eval_samples_per_second": 2179.482,
88
+ "eval_steps_per_second": 45.699,
89
  "step": 1590
90
  },
91
  {
92
  "epoch": 5.981132075471698,
93
+ "grad_norm": 0.4545074999332428,
94
  "learning_rate": 8.037735849056606e-06,
95
+ "loss": 0.0595,
96
  "step": 1902
97
  },
98
  {
99
  "epoch": 6.0,
100
  "eval_accuracy": 0.9270967741935484,
101
+ "eval_loss": 0.03682653605937958,
102
+ "eval_runtime": 1.4223,
103
+ "eval_samples_per_second": 2179.524,
104
+ "eval_steps_per_second": 45.7,
105
  "step": 1908
106
  },
107
  {
108
  "epoch": 6.977987421383648,
109
+ "grad_norm": 0.34796932339668274,
110
  "learning_rate": 6.044025157232704e-06,
111
+ "loss": 0.0538,
112
  "step": 2219
113
  },
114
  {
115
  "epoch": 7.0,
116
+ "eval_accuracy": 0.9316129032258065,
117
+ "eval_loss": 0.03404370695352554,
118
+ "eval_runtime": 1.6095,
119
+ "eval_samples_per_second": 1926.117,
120
+ "eval_steps_per_second": 40.386,
121
  "step": 2226
122
  },
123
  {
124
  "epoch": 7.9748427672955975,
125
+ "grad_norm": 0.3150351941585541,
126
  "learning_rate": 4.0503144654088055e-06,
127
+ "loss": 0.0503,
128
  "step": 2536
129
  },
130
  {
131
  "epoch": 8.0,
132
+ "eval_accuracy": 0.9341935483870968,
133
+ "eval_loss": 0.03234480321407318,
134
+ "eval_runtime": 1.4255,
135
+ "eval_samples_per_second": 2174.649,
136
+ "eval_steps_per_second": 45.597,
137
  "step": 2544
138
  },
139
  {
140
  "epoch": 8.971698113207546,
141
+ "grad_norm": 0.3166097104549408,
142
  "learning_rate": 2.056603773584906e-06,
143
  "loss": 0.0479,
144
  "step": 2853
145
  },
146
  {
147
  "epoch": 9.0,
148
+ "eval_accuracy": 0.9341935483870968,
149
+ "eval_loss": 0.03134315088391304,
150
+ "eval_runtime": 1.6252,
151
+ "eval_samples_per_second": 1907.451,
152
+ "eval_steps_per_second": 39.995,
153
  "step": 2862
154
  },
155
  {
156
  "epoch": 9.968553459119496,
157
+ "grad_norm": 0.27577438950538635,
158
  "learning_rate": 6.289308176100629e-08,
159
+ "loss": 0.0467,
160
  "step": 3170
161
  }
162
  ],
 
181
  "train_batch_size": 48,
182
  "trial_name": null,
183
  "trial_params": {
184
+ "alpha": 0.5555856325738124,
185
  "num_train_epochs": 10,
186
  "temperature": 2
187
  }
run-11/checkpoint-3180/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:31136b152d8bec823af46e8f97f2f8bbe5859cef952eba4c6121a11c30bcfdf6
3
- size 5432
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d588f08c0e85333fa2a4adcf5ec378e9adea9df69c799ff80677ef0f82a3e48a
3
+ size 5368
run-12/checkpoint-2862/config.json CHANGED
@@ -326,6 +326,6 @@
326
  "sinusoidal_pos_embds": false,
327
  "tie_weights_": true,
328
  "torch_dtype": "float32",
329
- "transformers_version": "4.47.0",
330
  "vocab_size": 30522
331
  }
 
326
  "sinusoidal_pos_embds": false,
327
  "tie_weights_": true,
328
  "torch_dtype": "float32",
329
+ "transformers_version": "4.47.1",
330
  "vocab_size": 30522
331
  }
run-12/checkpoint-2862/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:82af3ef589f6291c576d8820bf63a25e447969298df3a25793ba2cd28a0924bc
3
  size 268290900
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c18c4a133656b004cd6c65fc4815e9ef62f1ec3522774d37fb11c5daa779d3e2
3
  size 268290900
run-12/checkpoint-2862/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e483cfc5c026a865fba1ae6f8e9a6aa2b014022ee836e4170842567b06c878ef
3
- size 536641018
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a81143e1443ee75c0758f59ffc911f4898c4f4e07b40479ce205da7301c876f7
3
+ size 536643898
run-12/checkpoint-2862/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e29353d4954fcf585947caa6a0a3fa59c7379d49f3bc56c6a4bd276fef18ca98
3
- size 13990
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:41b5282f1cde964bf5a382eb03f1cdd87f1c8e2f60e43277b3453f63947f5933
3
+ size 14244
run-12/checkpoint-2862/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3c64bfaed33db311081ee0596530a03699ac7659f0cf4f57a488fea34df98816
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2a738824491ff8b261b6db2a842e15bf80355d04a07892a16df2c4715b2ee556
3
  size 1064
run-12/checkpoint-2862/trainer_state.json CHANGED
@@ -10,135 +10,135 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.9968553459119497,
13
- "grad_norm": 0.8072113394737244,
14
  "learning_rate": 1.7784765897973445e-05,
15
- "loss": 0.8045,
16
  "step": 317
17
  },
18
  {
19
  "epoch": 1.0,
20
- "eval_accuracy": 0.677741935483871,
21
- "eval_loss": 0.4035729169845581,
22
- "eval_runtime": 3.1602,
23
- "eval_samples_per_second": 980.961,
24
- "eval_steps_per_second": 20.569,
25
  "step": 318
26
  },
27
  {
28
  "epoch": 1.9937106918238994,
29
- "grad_norm": 0.7372269034385681,
30
  "learning_rate": 1.556953179594689e-05,
31
- "loss": 0.3038,
32
  "step": 634
33
  },
34
  {
35
  "epoch": 2.0,
36
- "eval_accuracy": 0.8451612903225807,
37
- "eval_loss": 0.13743256032466888,
38
- "eval_runtime": 2.8178,
39
- "eval_samples_per_second": 1100.139,
40
- "eval_steps_per_second": 23.067,
41
  "step": 636
42
  },
43
  {
44
  "epoch": 2.990566037735849,
45
- "grad_norm": 0.8540640473365784,
46
  "learning_rate": 1.3354297693920338e-05,
47
- "loss": 0.1417,
48
  "step": 951
49
  },
50
  {
51
  "epoch": 3.0,
52
- "eval_accuracy": 0.8974193548387097,
53
- "eval_loss": 0.07147924602031708,
54
- "eval_runtime": 3.2365,
55
- "eval_samples_per_second": 957.827,
56
- "eval_steps_per_second": 20.083,
57
  "step": 954
58
  },
59
  {
60
  "epoch": 3.9874213836477987,
61
- "grad_norm": 0.5231291651725769,
62
  "learning_rate": 1.1139063591893781e-05,
63
- "loss": 0.0909,
64
  "step": 1268
65
  },
66
  {
67
  "epoch": 4.0,
68
- "eval_accuracy": 0.9158064516129032,
69
- "eval_loss": 0.05066521465778351,
70
- "eval_runtime": 4.0727,
71
- "eval_samples_per_second": 761.171,
72
- "eval_steps_per_second": 15.96,
73
  "step": 1272
74
  },
75
  {
76
  "epoch": 4.984276729559748,
77
- "grad_norm": 0.37206166982650757,
78
  "learning_rate": 8.923829489867226e-06,
79
- "loss": 0.0714,
80
  "step": 1585
81
  },
82
  {
83
  "epoch": 5.0,
84
- "eval_accuracy": 0.9238709677419354,
85
- "eval_loss": 0.04229098558425903,
86
- "eval_runtime": 3.7158,
87
- "eval_samples_per_second": 834.269,
88
- "eval_steps_per_second": 17.493,
89
  "step": 1590
90
  },
91
  {
92
  "epoch": 5.981132075471698,
93
- "grad_norm": 0.40888988971710205,
94
  "learning_rate": 6.708595387840672e-06,
95
- "loss": 0.0616,
96
  "step": 1902
97
  },
98
  {
99
  "epoch": 6.0,
100
- "eval_accuracy": 0.9264516129032258,
101
- "eval_loss": 0.037750471383333206,
102
- "eval_runtime": 3.3556,
103
- "eval_samples_per_second": 923.839,
104
- "eval_steps_per_second": 19.371,
105
  "step": 1908
106
  },
107
  {
108
  "epoch": 6.977987421383648,
109
- "grad_norm": 0.4326569139957428,
110
  "learning_rate": 4.4933612858141165e-06,
111
- "loss": 0.056,
112
  "step": 2219
113
  },
114
  {
115
  "epoch": 7.0,
116
- "eval_accuracy": 0.9293548387096774,
117
- "eval_loss": 0.035395026206970215,
118
- "eval_runtime": 3.2793,
119
- "eval_samples_per_second": 945.317,
120
- "eval_steps_per_second": 19.821,
121
  "step": 2226
122
  },
123
  {
124
  "epoch": 7.9748427672955975,
125
- "grad_norm": 0.42392992973327637,
126
  "learning_rate": 2.2781271837875614e-06,
127
- "loss": 0.0528,
128
  "step": 2536
129
  },
130
  {
131
  "epoch": 8.0,
132
- "eval_accuracy": 0.9303225806451613,
133
- "eval_loss": 0.0337347649037838,
134
- "eval_runtime": 3.2707,
135
- "eval_samples_per_second": 947.797,
136
- "eval_steps_per_second": 19.873,
137
  "step": 2544
138
  },
139
  {
140
  "epoch": 8.971698113207546,
141
- "grad_norm": 0.7320842742919922,
142
  "learning_rate": 6.289308176100629e-08,
143
  "loss": 0.0511,
144
  "step": 2853
@@ -165,7 +165,7 @@
165
  "train_batch_size": 48,
166
  "trial_name": null,
167
  "trial_params": {
168
- "alpha": 0.9918433773634413,
169
  "num_train_epochs": 9,
170
  "temperature": 2
171
  }
 
10
  "log_history": [
11
  {
12
  "epoch": 0.9968553459119497,
13
+ "grad_norm": 0.9016032814979553,
14
  "learning_rate": 1.7784765897973445e-05,
15
+ "loss": 0.8053,
16
  "step": 317
17
  },
18
  {
19
  "epoch": 1.0,
20
+ "eval_accuracy": 0.6696774193548387,
21
+ "eval_loss": 0.40919938683509827,
22
+ "eval_runtime": 1.401,
23
+ "eval_samples_per_second": 2212.634,
24
+ "eval_steps_per_second": 46.394,
25
  "step": 318
26
  },
27
  {
28
  "epoch": 1.9937106918238994,
29
+ "grad_norm": 0.7222766876220703,
30
  "learning_rate": 1.556953179594689e-05,
31
+ "loss": 0.3073,
32
  "step": 634
33
  },
34
  {
35
  "epoch": 2.0,
36
+ "eval_accuracy": 0.844516129032258,
37
+ "eval_loss": 0.13953416049480438,
38
+ "eval_runtime": 1.4134,
39
+ "eval_samples_per_second": 2193.232,
40
+ "eval_steps_per_second": 45.987,
41
  "step": 636
42
  },
43
  {
44
  "epoch": 2.990566037735849,
45
+ "grad_norm": 0.6133838891983032,
46
  "learning_rate": 1.3354297693920338e-05,
47
+ "loss": 0.1428,
48
  "step": 951
49
  },
50
  {
51
  "epoch": 3.0,
52
+ "eval_accuracy": 0.8964516129032258,
53
+ "eval_loss": 0.0722731500864029,
54
+ "eval_runtime": 1.4182,
55
+ "eval_samples_per_second": 2185.887,
56
+ "eval_steps_per_second": 45.833,
57
  "step": 954
58
  },
59
  {
60
  "epoch": 3.9874213836477987,
61
+ "grad_norm": 0.5777781009674072,
62
  "learning_rate": 1.1139063591893781e-05,
63
+ "loss": 0.0917,
64
  "step": 1268
65
  },
66
  {
67
  "epoch": 4.0,
68
+ "eval_accuracy": 0.9180645161290323,
69
+ "eval_loss": 0.050948865711688995,
70
+ "eval_runtime": 1.447,
71
+ "eval_samples_per_second": 2142.316,
72
+ "eval_steps_per_second": 44.92,
73
  "step": 1272
74
  },
75
  {
76
  "epoch": 4.984276729559748,
77
+ "grad_norm": 0.4380759596824646,
78
  "learning_rate": 8.923829489867226e-06,
79
+ "loss": 0.0715,
80
  "step": 1585
81
  },
82
  {
83
  "epoch": 5.0,
84
+ "eval_accuracy": 0.9232258064516129,
85
+ "eval_loss": 0.042863838374614716,
86
+ "eval_runtime": 1.4275,
87
+ "eval_samples_per_second": 2171.562,
88
+ "eval_steps_per_second": 45.533,
89
  "step": 1590
90
  },
91
  {
92
  "epoch": 5.981132075471698,
93
+ "grad_norm": 0.46454158425331116,
94
  "learning_rate": 6.708595387840672e-06,
95
+ "loss": 0.0617,
96
  "step": 1902
97
  },
98
  {
99
  "epoch": 6.0,
100
+ "eval_accuracy": 0.9261290322580645,
101
+ "eval_loss": 0.03817891329526901,
102
+ "eval_runtime": 1.4469,
103
+ "eval_samples_per_second": 2142.441,
104
+ "eval_steps_per_second": 44.922,
105
  "step": 1908
106
  },
107
  {
108
  "epoch": 6.977987421383648,
109
+ "grad_norm": 0.34600403904914856,
110
  "learning_rate": 4.4933612858141165e-06,
111
+ "loss": 0.0562,
112
  "step": 2219
113
  },
114
  {
115
  "epoch": 7.0,
116
+ "eval_accuracy": 0.9303225806451613,
117
+ "eval_loss": 0.035540465265512466,
118
+ "eval_runtime": 1.6133,
119
+ "eval_samples_per_second": 1921.568,
120
+ "eval_steps_per_second": 40.291,
121
  "step": 2226
122
  },
123
  {
124
  "epoch": 7.9748427672955975,
125
+ "grad_norm": 0.32331326603889465,
126
  "learning_rate": 2.2781271837875614e-06,
127
+ "loss": 0.053,
128
  "step": 2536
129
  },
130
  {
131
  "epoch": 8.0,
132
+ "eval_accuracy": 0.932258064516129,
133
+ "eval_loss": 0.034126147627830505,
134
+ "eval_runtime": 1.6193,
135
+ "eval_samples_per_second": 1914.441,
136
+ "eval_steps_per_second": 40.142,
137
  "step": 2544
138
  },
139
  {
140
  "epoch": 8.971698113207546,
141
+ "grad_norm": 0.3256802558898926,
142
  "learning_rate": 6.289308176100629e-08,
143
  "loss": 0.0511,
144
  "step": 2853
 
165
  "train_batch_size": 48,
166
  "trial_name": null,
167
  "trial_params": {
168
+ "alpha": 0.5906402738898417,
169
  "num_train_epochs": 9,
170
  "temperature": 2
171
  }
run-12/checkpoint-2862/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3f32cdf1fa208c9f12320f360dbe5c36de2cb881a09479e7736bdfb115e4bb72
3
- size 5432
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fc3aa49ba06431aba065e94edc972c0714cf8bbe8237b664cb1781900dce380
3
+ size 5368
run-12/checkpoint-3180/config.json CHANGED
@@ -326,6 +326,6 @@
326
  "sinusoidal_pos_embds": false,
327
  "tie_weights_": true,
328
  "torch_dtype": "float32",
329
- "transformers_version": "4.47.0",
330
  "vocab_size": 30522
331
  }
 
326
  "sinusoidal_pos_embds": false,
327
  "tie_weights_": true,
328
  "torch_dtype": "float32",
329
+ "transformers_version": "4.47.1",
330
  "vocab_size": 30522
331
  }
run-12/checkpoint-3180/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cca0793fae2bb07821bf9c3521dd1505cf3de7bae593ae1b13b608a1e2b8f303
3
  size 268290900
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:43a6cedd76d3a1b3e0bb77e4d00c341877299c5525fce1d5e4626930c66c2446
3
  size 268290900
run-12/checkpoint-3180/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d488d2f32cb307527b74b933e5757bcb7ff1bdafde1955d8b6bda02006e57a23
3
- size 536641018
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0b65b32663111aef62cead559fa75318ca4c1b5ee0fefefc8732da15c1c2e734
3
+ size 536643898
run-12/checkpoint-3180/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2fbdf6070082bd7f064003b0a80093bbcf8031eab17e8484cd03c2f330dae634
3
- size 13990
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4929abc25ddcb4d4986245c01bee45e03155019fd32282d1467b43fbdcdaed02
3
+ size 14244
run-12/checkpoint-3180/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7377b4a081dc59fc9c5a604a8fe62d7aa1f698f549e73e5c90d42a32e2a0d766
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:33fadc11bb5c6bd0a5ea603f5e48cedcd72384fa2714656ecf6f8da629f7ae05
3
  size 1064
run-12/checkpoint-3180/trainer_state.json CHANGED
@@ -10,153 +10,153 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.9968553459119497,
13
- "grad_norm": 0.6610699892044067,
14
  "learning_rate": 1.8006289308176103e-05,
15
- "loss": 0.4374,
16
  "step": 317
17
  },
18
  {
19
  "epoch": 1.0,
20
- "eval_accuracy": 0.6170967741935484,
21
- "eval_loss": 0.21603462100028992,
22
- "eval_runtime": 2.8338,
23
- "eval_samples_per_second": 1093.928,
24
- "eval_steps_per_second": 22.937,
25
  "step": 318
26
  },
27
  {
28
  "epoch": 1.9937106918238994,
29
- "grad_norm": 0.5326525568962097,
30
  "learning_rate": 1.6012578616352204e-05,
31
- "loss": 0.1787,
32
  "step": 634
33
  },
34
  {
35
  "epoch": 2.0,
36
- "eval_accuracy": 0.8351612903225807,
37
- "eval_loss": 0.10006385296583176,
38
- "eval_runtime": 2.9113,
39
- "eval_samples_per_second": 1064.812,
40
- "eval_steps_per_second": 22.327,
41
  "step": 636
42
  },
43
  {
44
  "epoch": 2.990566037735849,
45
- "grad_norm": 0.5324532985687256,
46
  "learning_rate": 1.4018867924528304e-05,
47
- "loss": 0.1078,
48
  "step": 951
49
  },
50
  {
51
  "epoch": 3.0,
52
  "eval_accuracy": 0.8874193548387097,
53
- "eval_loss": 0.06386292725801468,
54
- "eval_runtime": 2.748,
55
- "eval_samples_per_second": 1128.087,
56
- "eval_steps_per_second": 23.653,
57
  "step": 954
58
  },
59
  {
60
  "epoch": 3.9874213836477987,
61
- "grad_norm": 0.3831179141998291,
62
  "learning_rate": 1.2025157232704403e-05,
63
- "loss": 0.0784,
64
  "step": 1268
65
  },
66
  {
67
  "epoch": 4.0,
68
- "eval_accuracy": 0.9006451612903226,
69
- "eval_loss": 0.04731517285108566,
70
- "eval_runtime": 2.9085,
71
- "eval_samples_per_second": 1065.824,
72
- "eval_steps_per_second": 22.348,
73
  "step": 1272
74
  },
75
  {
76
  "epoch": 4.984276729559748,
77
- "grad_norm": 0.2874238193035126,
78
  "learning_rate": 1.0031446540880504e-05,
79
- "loss": 0.0637,
80
  "step": 1585
81
  },
82
  {
83
  "epoch": 5.0,
84
- "eval_accuracy": 0.9119354838709678,
85
- "eval_loss": 0.038367290049791336,
86
- "eval_runtime": 3.1834,
87
- "eval_samples_per_second": 973.805,
88
- "eval_steps_per_second": 20.418,
89
  "step": 1590
90
  },
91
  {
92
  "epoch": 5.981132075471698,
93
- "grad_norm": 0.30679208040237427,
94
  "learning_rate": 8.037735849056606e-06,
95
- "loss": 0.0551,
96
  "step": 1902
97
  },
98
  {
99
  "epoch": 6.0,
100
- "eval_accuracy": 0.9193548387096774,
101
- "eval_loss": 0.033336855471134186,
102
- "eval_runtime": 3.3073,
103
- "eval_samples_per_second": 937.327,
104
- "eval_steps_per_second": 19.654,
105
  "step": 1908
106
  },
107
  {
108
  "epoch": 6.977987421383648,
109
- "grad_norm": 0.3407283425331116,
110
  "learning_rate": 6.044025157232704e-06,
111
- "loss": 0.0496,
112
  "step": 2219
113
  },
114
  {
115
  "epoch": 7.0,
116
- "eval_accuracy": 0.9245161290322581,
117
- "eval_loss": 0.030338862910866737,
118
- "eval_runtime": 3.2295,
119
- "eval_samples_per_second": 959.913,
120
- "eval_steps_per_second": 20.127,
121
  "step": 2226
122
  },
123
  {
124
  "epoch": 7.9748427672955975,
125
- "grad_norm": 0.3018116354942322,
126
  "learning_rate": 4.0503144654088055e-06,
127
- "loss": 0.0462,
128
  "step": 2536
129
  },
130
  {
131
  "epoch": 8.0,
132
- "eval_accuracy": 0.9258064516129032,
133
- "eval_loss": 0.028306515887379646,
134
- "eval_runtime": 3.1561,
135
- "eval_samples_per_second": 982.229,
136
- "eval_steps_per_second": 20.595,
137
  "step": 2544
138
  },
139
  {
140
  "epoch": 8.971698113207546,
141
- "grad_norm": 0.46791136264801025,
142
  "learning_rate": 2.056603773584906e-06,
143
- "loss": 0.0441,
144
  "step": 2853
145
  },
146
  {
147
  "epoch": 9.0,
148
  "eval_accuracy": 0.9303225806451613,
149
- "eval_loss": 0.027270827442407608,
150
- "eval_runtime": 3.0569,
151
- "eval_samples_per_second": 1014.096,
152
- "eval_steps_per_second": 21.263,
153
  "step": 2862
154
  },
155
  {
156
  "epoch": 9.968553459119496,
157
- "grad_norm": 0.2770797610282898,
158
  "learning_rate": 6.289308176100629e-08,
159
- "loss": 0.043,
160
  "step": 3170
161
  }
162
  ],
@@ -181,8 +181,8 @@
181
  "train_batch_size": 48,
182
  "trial_name": null,
183
  "trial_params": {
184
- "alpha": 0.32219587849011416,
185
  "num_train_epochs": 10,
186
- "temperature": 6
187
  }
188
  }
 
10
  "log_history": [
11
  {
12
  "epoch": 0.9968553459119497,
13
+ "grad_norm": 0.6776733994483948,
14
  "learning_rate": 1.8006289308176103e-05,
15
+ "loss": 0.4584,
16
  "step": 317
17
  },
18
  {
19
  "epoch": 1.0,
20
+ "eval_accuracy": 0.6125806451612903,
21
+ "eval_loss": 0.229416161775589,
22
+ "eval_runtime": 1.4059,
23
+ "eval_samples_per_second": 2205.025,
24
+ "eval_steps_per_second": 46.234,
25
  "step": 318
26
  },
27
  {
28
  "epoch": 1.9937106918238994,
29
+ "grad_norm": 0.5552124977111816,
30
  "learning_rate": 1.6012578616352204e-05,
31
+ "loss": 0.1874,
32
  "step": 634
33
  },
34
  {
35
  "epoch": 2.0,
36
+ "eval_accuracy": 0.8419354838709677,
37
+ "eval_loss": 0.10401267558336258,
38
+ "eval_runtime": 1.4225,
39
+ "eval_samples_per_second": 2179.306,
40
+ "eval_steps_per_second": 45.695,
41
  "step": 636
42
  },
43
  {
44
  "epoch": 2.990566037735849,
45
+ "grad_norm": 0.4675765037536621,
46
  "learning_rate": 1.4018867924528304e-05,
47
+ "loss": 0.1109,
48
  "step": 951
49
  },
50
  {
51
  "epoch": 3.0,
52
  "eval_accuracy": 0.8874193548387097,
53
+ "eval_loss": 0.0649406909942627,
54
+ "eval_runtime": 1.4308,
55
+ "eval_samples_per_second": 2166.649,
56
+ "eval_steps_per_second": 45.43,
57
  "step": 954
58
  },
59
  {
60
  "epoch": 3.9874213836477987,
61
+ "grad_norm": 0.41055992245674133,
62
  "learning_rate": 1.2025157232704403e-05,
63
+ "loss": 0.0802,
64
  "step": 1268
65
  },
66
  {
67
  "epoch": 4.0,
68
+ "eval_accuracy": 0.9067741935483871,
69
+ "eval_loss": 0.04709634184837341,
70
+ "eval_runtime": 1.6361,
71
+ "eval_samples_per_second": 1894.729,
72
+ "eval_steps_per_second": 39.728,
73
  "step": 1272
74
  },
75
  {
76
  "epoch": 4.984276729559748,
77
+ "grad_norm": 0.31869634985923767,
78
  "learning_rate": 1.0031446540880504e-05,
79
+ "loss": 0.0644,
80
  "step": 1585
81
  },
82
  {
83
  "epoch": 5.0,
84
+ "eval_accuracy": 0.915483870967742,
85
+ "eval_loss": 0.03851104900240898,
86
+ "eval_runtime": 1.4501,
87
+ "eval_samples_per_second": 2137.847,
88
+ "eval_steps_per_second": 44.826,
89
  "step": 1590
90
  },
91
  {
92
  "epoch": 5.981132075471698,
93
+ "grad_norm": 0.3909931480884552,
94
  "learning_rate": 8.037735849056606e-06,
95
+ "loss": 0.0554,
96
  "step": 1902
97
  },
98
  {
99
  "epoch": 6.0,
100
+ "eval_accuracy": 0.9212903225806451,
101
+ "eval_loss": 0.033167432993650436,
102
+ "eval_runtime": 1.8412,
103
+ "eval_samples_per_second": 1683.68,
104
+ "eval_steps_per_second": 35.303,
105
  "step": 1908
106
  },
107
  {
108
  "epoch": 6.977987421383648,
109
+ "grad_norm": 0.2565544545650482,
110
  "learning_rate": 6.044025157232704e-06,
111
+ "loss": 0.05,
112
  "step": 2219
113
  },
114
  {
115
  "epoch": 7.0,
116
+ "eval_accuracy": 0.927741935483871,
117
+ "eval_loss": 0.03042704612016678,
118
+ "eval_runtime": 1.4492,
119
+ "eval_samples_per_second": 2139.078,
120
+ "eval_steps_per_second": 44.852,
121
  "step": 2226
122
  },
123
  {
124
  "epoch": 7.9748427672955975,
125
+ "grad_norm": 0.27845093607902527,
126
  "learning_rate": 4.0503144654088055e-06,
127
+ "loss": 0.0467,
128
  "step": 2536
129
  },
130
  {
131
  "epoch": 8.0,
132
+ "eval_accuracy": 0.9293548387096774,
133
+ "eval_loss": 0.028363477438688278,
134
+ "eval_runtime": 1.6476,
135
+ "eval_samples_per_second": 1881.537,
136
+ "eval_steps_per_second": 39.452,
137
  "step": 2544
138
  },
139
  {
140
  "epoch": 8.971698113207546,
141
+ "grad_norm": 0.2760983407497406,
142
  "learning_rate": 2.056603773584906e-06,
143
+ "loss": 0.0443,
144
  "step": 2853
145
  },
146
  {
147
  "epoch": 9.0,
148
  "eval_accuracy": 0.9303225806451613,
149
+ "eval_loss": 0.027360040694475174,
150
+ "eval_runtime": 1.4532,
151
+ "eval_samples_per_second": 2133.234,
152
+ "eval_steps_per_second": 44.729,
153
  "step": 2862
154
  },
155
  {
156
  "epoch": 9.968553459119496,
157
+ "grad_norm": 0.22213682532310486,
158
  "learning_rate": 6.289308176100629e-08,
159
+ "loss": 0.0432,
160
  "step": 3170
161
  }
162
  ],
 
181
  "train_batch_size": 48,
182
  "trial_name": null,
183
  "trial_params": {
184
+ "alpha": 0.7838482405834947,
185
  "num_train_epochs": 10,
186
+ "temperature": 5
187
  }
188
  }
run-12/checkpoint-3180/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c0121f554575978ba0cbc17b2f3d309476aa4613bc5d4f33897d2e50955c438a
3
- size 5432
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a44a54d5eb36e142309417d7c1e2047bbc008d9022a6a64a3ca97fdac4d430f2
3
+ size 5368
run-17/checkpoint-2862/config.json CHANGED
@@ -326,6 +326,6 @@
326
  "sinusoidal_pos_embds": false,
327
  "tie_weights_": true,
328
  "torch_dtype": "float32",
329
- "transformers_version": "4.47.0",
330
  "vocab_size": 30522
331
  }
 
326
  "sinusoidal_pos_embds": false,
327
  "tie_weights_": true,
328
  "torch_dtype": "float32",
329
+ "transformers_version": "4.47.1",
330
  "vocab_size": 30522
331
  }
run-17/checkpoint-2862/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:653b0e018e72d692bf7628138f5d4a36847a2a672b4dbcdcc6ed9cf77b0428aa
3
  size 268290900
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c18c4a133656b004cd6c65fc4815e9ef62f1ec3522774d37fb11c5daa779d3e2
3
  size 268290900
run-17/checkpoint-2862/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c9431bd5195dc5efa71f284143d1f44afc0195adc6d056b4cefa0075f55e7845
3
- size 536641018
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a81143e1443ee75c0758f59ffc911f4898c4f4e07b40479ce205da7301c876f7
3
+ size 536643898
run-17/checkpoint-2862/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e29353d4954fcf585947caa6a0a3fa59c7379d49f3bc56c6a4bd276fef18ca98
3
- size 13990
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:41b5282f1cde964bf5a382eb03f1cdd87f1c8e2f60e43277b3453f63947f5933
3
+ size 14244
run-17/checkpoint-2862/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3c64bfaed33db311081ee0596530a03699ac7659f0cf4f57a488fea34df98816
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2a738824491ff8b261b6db2a842e15bf80355d04a07892a16df2c4715b2ee556
3
  size 1064
run-17/checkpoint-2862/trainer_state.json CHANGED
@@ -10,137 +10,137 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.9968553459119497,
13
- "grad_norm": 0.6738103628158569,
14
  "learning_rate": 1.7784765897973445e-05,
15
- "loss": 0.4958,
16
  "step": 317
17
  },
18
  {
19
  "epoch": 1.0,
20
- "eval_accuracy": 0.6290322580645161,
21
- "eval_loss": 0.24582983553409576,
22
- "eval_runtime": 3.3797,
23
- "eval_samples_per_second": 917.252,
24
- "eval_steps_per_second": 19.233,
25
  "step": 318
26
  },
27
  {
28
  "epoch": 1.9937106918238994,
29
- "grad_norm": 0.5609093308448792,
30
  "learning_rate": 1.556953179594689e-05,
31
- "loss": 0.2002,
32
  "step": 634
33
  },
34
  {
35
  "epoch": 2.0,
36
- "eval_accuracy": 0.8429032258064516,
37
- "eval_loss": 0.10813045501708984,
38
- "eval_runtime": 3.0019,
39
- "eval_samples_per_second": 1032.674,
40
- "eval_steps_per_second": 21.653,
41
  "step": 636
42
  },
43
  {
44
  "epoch": 2.990566037735849,
45
- "grad_norm": 0.587248682975769,
46
  "learning_rate": 1.3354297693920338e-05,
47
- "loss": 0.1161,
48
  "step": 951
49
  },
50
  {
51
  "epoch": 3.0,
52
- "eval_accuracy": 0.8893548387096775,
53
- "eval_loss": 0.06641113013029099,
54
- "eval_runtime": 3.0082,
55
- "eval_samples_per_second": 1030.502,
56
- "eval_steps_per_second": 21.607,
57
  "step": 954
58
  },
59
  {
60
  "epoch": 3.9874213836477987,
61
- "grad_norm": 0.4174334406852722,
62
  "learning_rate": 1.1139063591893781e-05,
63
- "loss": 0.0829,
64
  "step": 1268
65
  },
66
  {
67
  "epoch": 4.0,
68
- "eval_accuracy": 0.9048387096774193,
69
- "eval_loss": 0.04835071042180061,
70
- "eval_runtime": 3.0244,
71
- "eval_samples_per_second": 1025.01,
72
- "eval_steps_per_second": 21.492,
73
  "step": 1272
74
  },
75
  {
76
  "epoch": 4.984276729559748,
77
- "grad_norm": 0.3136546015739441,
78
  "learning_rate": 8.923829489867226e-06,
79
- "loss": 0.0669,
80
  "step": 1585
81
  },
82
  {
83
  "epoch": 5.0,
84
- "eval_accuracy": 0.9148387096774193,
85
- "eval_loss": 0.039326973259449005,
86
- "eval_runtime": 3.0031,
87
- "eval_samples_per_second": 1032.25,
88
- "eval_steps_per_second": 21.644,
89
  "step": 1590
90
  },
91
  {
92
  "epoch": 5.981132075471698,
93
- "grad_norm": 0.3310624957084656,
94
  "learning_rate": 6.708595387840672e-06,
95
- "loss": 0.058,
96
  "step": 1902
97
  },
98
  {
99
  "epoch": 6.0,
100
- "eval_accuracy": 0.9212903225806451,
101
- "eval_loss": 0.03441833332180977,
102
- "eval_runtime": 3.0173,
103
- "eval_samples_per_second": 1027.407,
104
- "eval_steps_per_second": 21.542,
105
  "step": 1908
106
  },
107
  {
108
  "epoch": 6.977987421383648,
109
- "grad_norm": 0.366222620010376,
110
  "learning_rate": 4.4933612858141165e-06,
111
- "loss": 0.0526,
112
  "step": 2219
113
  },
114
  {
115
  "epoch": 7.0,
116
- "eval_accuracy": 0.9241935483870968,
117
- "eval_loss": 0.03173591569066048,
118
- "eval_runtime": 3.0023,
119
- "eval_samples_per_second": 1032.538,
120
- "eval_steps_per_second": 21.65,
121
  "step": 2226
122
  },
123
  {
124
  "epoch": 7.9748427672955975,
125
- "grad_norm": 0.3408018946647644,
126
  "learning_rate": 2.2781271837875614e-06,
127
- "loss": 0.0495,
128
  "step": 2536
129
  },
130
  {
131
  "epoch": 8.0,
132
- "eval_accuracy": 0.9258064516129032,
133
- "eval_loss": 0.03004557453095913,
134
- "eval_runtime": 3.0017,
135
- "eval_samples_per_second": 1032.734,
136
- "eval_steps_per_second": 21.654,
137
  "step": 2544
138
  },
139
  {
140
  "epoch": 8.971698113207546,
141
- "grad_norm": 0.5218478441238403,
142
  "learning_rate": 6.289308176100629e-08,
143
- "loss": 0.0479,
144
  "step": 2853
145
  }
146
  ],
@@ -161,12 +161,12 @@
161
  "attributes": {}
162
  }
163
  },
164
- "total_flos": 745151547572796.0,
165
  "train_batch_size": 48,
166
  "trial_name": null,
167
  "trial_params": {
168
- "alpha": 0.8675533183457913,
169
  "num_train_epochs": 9,
170
- "temperature": 4
171
  }
172
  }
 
10
  "log_history": [
11
  {
12
  "epoch": 0.9968553459119497,
13
+ "grad_norm": 0.9016032814979553,
14
  "learning_rate": 1.7784765897973445e-05,
15
+ "loss": 0.8053,
16
  "step": 317
17
  },
18
  {
19
  "epoch": 1.0,
20
+ "eval_accuracy": 0.6696774193548387,
21
+ "eval_loss": 0.40919938683509827,
22
+ "eval_runtime": 1.6439,
23
+ "eval_samples_per_second": 1885.749,
24
+ "eval_steps_per_second": 39.54,
25
  "step": 318
26
  },
27
  {
28
  "epoch": 1.9937106918238994,
29
+ "grad_norm": 0.7222766876220703,
30
  "learning_rate": 1.556953179594689e-05,
31
+ "loss": 0.3073,
32
  "step": 634
33
  },
34
  {
35
  "epoch": 2.0,
36
+ "eval_accuracy": 0.844516129032258,
37
+ "eval_loss": 0.13953416049480438,
38
+ "eval_runtime": 1.6434,
39
+ "eval_samples_per_second": 1886.366,
40
+ "eval_steps_per_second": 39.553,
41
  "step": 636
42
  },
43
  {
44
  "epoch": 2.990566037735849,
45
+ "grad_norm": 0.6133838891983032,
46
  "learning_rate": 1.3354297693920338e-05,
47
+ "loss": 0.1428,
48
  "step": 951
49
  },
50
  {
51
  "epoch": 3.0,
52
+ "eval_accuracy": 0.8964516129032258,
53
+ "eval_loss": 0.0722731500864029,
54
+ "eval_runtime": 1.6496,
55
+ "eval_samples_per_second": 1879.251,
56
+ "eval_steps_per_second": 39.404,
57
  "step": 954
58
  },
59
  {
60
  "epoch": 3.9874213836477987,
61
+ "grad_norm": 0.5777781009674072,
62
  "learning_rate": 1.1139063591893781e-05,
63
+ "loss": 0.0917,
64
  "step": 1268
65
  },
66
  {
67
  "epoch": 4.0,
68
+ "eval_accuracy": 0.9180645161290323,
69
+ "eval_loss": 0.050948865711688995,
70
+ "eval_runtime": 1.4494,
71
+ "eval_samples_per_second": 2138.806,
72
+ "eval_steps_per_second": 44.846,
73
  "step": 1272
74
  },
75
  {
76
  "epoch": 4.984276729559748,
77
+ "grad_norm": 0.4380759596824646,
78
  "learning_rate": 8.923829489867226e-06,
79
+ "loss": 0.0715,
80
  "step": 1585
81
  },
82
  {
83
  "epoch": 5.0,
84
+ "eval_accuracy": 0.9232258064516129,
85
+ "eval_loss": 0.042863838374614716,
86
+ "eval_runtime": 1.6395,
87
+ "eval_samples_per_second": 1890.819,
88
+ "eval_steps_per_second": 39.646,
89
  "step": 1590
90
  },
91
  {
92
  "epoch": 5.981132075471698,
93
+ "grad_norm": 0.46454158425331116,
94
  "learning_rate": 6.708595387840672e-06,
95
+ "loss": 0.0617,
96
  "step": 1902
97
  },
98
  {
99
  "epoch": 6.0,
100
+ "eval_accuracy": 0.9261290322580645,
101
+ "eval_loss": 0.03817891329526901,
102
+ "eval_runtime": 1.8355,
103
+ "eval_samples_per_second": 1688.925,
104
+ "eval_steps_per_second": 35.413,
105
  "step": 1908
106
  },
107
  {
108
  "epoch": 6.977987421383648,
109
+ "grad_norm": 0.34600403904914856,
110
  "learning_rate": 4.4933612858141165e-06,
111
+ "loss": 0.0562,
112
  "step": 2219
113
  },
114
  {
115
  "epoch": 7.0,
116
+ "eval_accuracy": 0.9303225806451613,
117
+ "eval_loss": 0.035540465265512466,
118
+ "eval_runtime": 1.8369,
119
+ "eval_samples_per_second": 1687.632,
120
+ "eval_steps_per_second": 35.386,
121
  "step": 2226
122
  },
123
  {
124
  "epoch": 7.9748427672955975,
125
+ "grad_norm": 0.32331326603889465,
126
  "learning_rate": 2.2781271837875614e-06,
127
+ "loss": 0.053,
128
  "step": 2536
129
  },
130
  {
131
  "epoch": 8.0,
132
+ "eval_accuracy": 0.932258064516129,
133
+ "eval_loss": 0.034126147627830505,
134
+ "eval_runtime": 1.8294,
135
+ "eval_samples_per_second": 1694.566,
136
+ "eval_steps_per_second": 35.531,
137
  "step": 2544
138
  },
139
  {
140
  "epoch": 8.971698113207546,
141
+ "grad_norm": 0.3256802558898926,
142
  "learning_rate": 6.289308176100629e-08,
143
+ "loss": 0.0511,
144
  "step": 2853
145
  }
146
  ],
 
161
  "attributes": {}
162
  }
163
  },
164
+ "total_flos": 744918075986196.0,
165
  "train_batch_size": 48,
166
  "trial_name": null,
167
  "trial_params": {
168
+ "alpha": 0.8523684644932737,
169
  "num_train_epochs": 9,
170
+ "temperature": 2
171
  }
172
  }
run-17/checkpoint-2862/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d55f72d033e4da62205f611e22d64151820ef3799975ce2ff32a7cb5cd94ac1d
3
- size 5432
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2604ed4b06bd9bd0b94d7de27f6549fd8ad42a818c36e531355fafadbef23c48
3
+ size 5368
run-2/checkpoint-2544/config.json CHANGED
@@ -326,6 +326,6 @@
326
  "sinusoidal_pos_embds": false,
327
  "tie_weights_": true,
328
  "torch_dtype": "float32",
329
- "transformers_version": "4.47.0",
330
  "vocab_size": 30522
331
  }
 
326
  "sinusoidal_pos_embds": false,
327
  "tie_weights_": true,
328
  "torch_dtype": "float32",
329
+ "transformers_version": "4.47.1",
330
  "vocab_size": 30522
331
  }
run-2/checkpoint-2544/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f9427d39678336d95e13fdfea952f516d9bf056e71e1bd693f83f616f91335ff
3
  size 268290900
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cd73bf9fd9b9d0345f573706c8fc13e5a050e48878125734642ca11863038952
3
  size 268290900
run-2/checkpoint-2544/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:108c29621e1212a97dfff6d6c0c1199af3d6396b985ada5243ac291de4e9816b
3
- size 536641018
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1e4afc7f349f20b3ff0fee9612c271bf763a89d659466a6339f2187f36aee8d7
3
+ size 536643898
run-2/checkpoint-2544/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b8d01b59a9b9b5afe359a210e91ba85ed7f8ef7e1040c96bc5a7bb0068433710
3
- size 13990
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:940bb167a15c246bdedc32635c80794df47530e35ccbba4b4e50737052f0263e
3
+ size 14244
run-2/checkpoint-2544/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e37a4a51a7ddb49778a03f6fd8e39edcf3619749659cd78720dfb097e7c796b4
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:214eac02be0209c50c951afc980c6dcbdad93d966fbfca385bff547659925bf8
3
  size 1064