Training in progress, step 1590
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- README.md +9 -9
- config.json +1 -1
- model.safetensors +1 -1
- run-1/checkpoint-2862/config.json +1 -1
- run-1/checkpoint-2862/model.safetensors +1 -1
- run-1/checkpoint-2862/optimizer.pt +2 -2
- run-1/checkpoint-2862/rng_state.pth +2 -2
- run-1/checkpoint-2862/scheduler.pt +1 -1
- run-1/checkpoint-2862/trainer_state.json +60 -60
- run-1/checkpoint-2862/training_args.bin +2 -2
- run-10/checkpoint-3180/config.json +1 -1
- run-10/checkpoint-3180/model.safetensors +1 -1
- run-10/checkpoint-3180/optimizer.pt +2 -2
- run-10/checkpoint-3180/rng_state.pth +2 -2
- run-10/checkpoint-3180/scheduler.pt +1 -1
- run-10/checkpoint-3180/trainer_state.json +65 -65
- run-10/checkpoint-3180/training_args.bin +2 -2
- run-11/checkpoint-3180/config.json +1 -1
- run-11/checkpoint-3180/model.safetensors +1 -1
- run-11/checkpoint-3180/optimizer.pt +2 -2
- run-11/checkpoint-3180/rng_state.pth +2 -2
- run-11/checkpoint-3180/scheduler.pt +1 -1
- run-11/checkpoint-3180/trainer_state.json +63 -63
- run-11/checkpoint-3180/training_args.bin +2 -2
- run-12/checkpoint-2862/config.json +1 -1
- run-12/checkpoint-2862/model.safetensors +1 -1
- run-12/checkpoint-2862/optimizer.pt +2 -2
- run-12/checkpoint-2862/rng_state.pth +2 -2
- run-12/checkpoint-2862/scheduler.pt +1 -1
- run-12/checkpoint-2862/trainer_state.json +58 -58
- run-12/checkpoint-2862/training_args.bin +2 -2
- run-12/checkpoint-3180/config.json +1 -1
- run-12/checkpoint-3180/model.safetensors +1 -1
- run-12/checkpoint-3180/optimizer.pt +2 -2
- run-12/checkpoint-3180/rng_state.pth +2 -2
- run-12/checkpoint-3180/scheduler.pt +1 -1
- run-12/checkpoint-3180/trainer_state.json +65 -65
- run-12/checkpoint-3180/training_args.bin +2 -2
- run-17/checkpoint-2862/config.json +1 -1
- run-17/checkpoint-2862/model.safetensors +1 -1
- run-17/checkpoint-2862/optimizer.pt +2 -2
- run-17/checkpoint-2862/rng_state.pth +2 -2
- run-17/checkpoint-2862/scheduler.pt +1 -1
- run-17/checkpoint-2862/trainer_state.json +61 -61
- run-17/checkpoint-2862/training_args.bin +2 -2
- run-2/checkpoint-2544/config.json +1 -1
- run-2/checkpoint-2544/model.safetensors +1 -1
- run-2/checkpoint-2544/optimizer.pt +2 -2
- run-2/checkpoint-2544/rng_state.pth +2 -2
- run-2/checkpoint-2544/scheduler.pt +1 -1
README.md
CHANGED
@@ -18,8 +18,8 @@ should probably proofread and complete it, then remove this comment. -->
|
|
18 |
|
19 |
This model is a fine-tuned version of [distilbert-base-uncased](https://huggingface.co/distilbert-base-uncased) on an unknown dataset.
|
20 |
It achieves the following results on the evaluation set:
|
21 |
-
- Loss: 0.
|
22 |
-
- Accuracy: 0.
|
23 |
|
24 |
## Model description
|
25 |
|
@@ -50,16 +50,16 @@ The following hyperparameters were used during training:
|
|
50 |
|
51 |
| Training Loss | Epoch | Step | Validation Loss | Accuracy |
|
52 |
|:-------------:|:-----:|:----:|:---------------:|:--------:|
|
53 |
-
| 4.
|
54 |
-
| 2.
|
55 |
-
| 1.
|
56 |
-
| 1.
|
57 |
-
| 0.
|
58 |
|
59 |
|
60 |
### Framework versions
|
61 |
|
62 |
-
- Transformers 4.47.
|
63 |
-
- Pytorch 2.5.1
|
64 |
- Datasets 3.2.0
|
65 |
- Tokenizers 0.21.0
|
|
|
18 |
|
19 |
This model is a fine-tuned version of [distilbert-base-uncased](https://huggingface.co/distilbert-base-uncased) on an unknown dataset.
|
20 |
It achieves the following results on the evaluation set:
|
21 |
+
- Loss: 0.8083
|
22 |
+
- Accuracy: 0.9161
|
23 |
|
24 |
## Model description
|
25 |
|
|
|
50 |
|
51 |
| Training Loss | Epoch | Step | Validation Loss | Accuracy |
|
52 |
|:-------------:|:-----:|:----:|:---------------:|:--------:|
|
53 |
+
| 4.3211 | 1.0 | 318 | 3.3248 | 0.7210 |
|
54 |
+
| 2.6732 | 2.0 | 636 | 1.9187 | 0.8406 |
|
55 |
+
| 1.5914 | 3.0 | 954 | 1.1971 | 0.8877 |
|
56 |
+
| 1.0497 | 4.0 | 1272 | 0.8947 | 0.9090 |
|
57 |
+
| 0.8283 | 5.0 | 1590 | 0.8083 | 0.9161 |
|
58 |
|
59 |
|
60 |
### Framework versions
|
61 |
|
62 |
+
- Transformers 4.47.1
|
63 |
+
- Pytorch 2.5.1+cu124
|
64 |
- Datasets 3.2.0
|
65 |
- Tokenizers 0.21.0
|
config.json
CHANGED
@@ -326,6 +326,6 @@
|
|
326 |
"sinusoidal_pos_embds": false,
|
327 |
"tie_weights_": true,
|
328 |
"torch_dtype": "float32",
|
329 |
-
"transformers_version": "4.47.
|
330 |
"vocab_size": 30522
|
331 |
}
|
|
|
326 |
"sinusoidal_pos_embds": false,
|
327 |
"tie_weights_": true,
|
328 |
"torch_dtype": "float32",
|
329 |
+
"transformers_version": "4.47.1",
|
330 |
"vocab_size": 30522
|
331 |
}
|
model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 268290900
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0ae88eff2997cf8baa4f19bc76eed6ee56ba9dca3d564bfe316860bf65c80beb
|
3 |
size 268290900
|
run-1/checkpoint-2862/config.json
CHANGED
@@ -326,6 +326,6 @@
|
|
326 |
"sinusoidal_pos_embds": false,
|
327 |
"tie_weights_": true,
|
328 |
"torch_dtype": "float32",
|
329 |
-
"transformers_version": "4.47.
|
330 |
"vocab_size": 30522
|
331 |
}
|
|
|
326 |
"sinusoidal_pos_embds": false,
|
327 |
"tie_weights_": true,
|
328 |
"torch_dtype": "float32",
|
329 |
+
"transformers_version": "4.47.1",
|
330 |
"vocab_size": 30522
|
331 |
}
|
run-1/checkpoint-2862/model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 268290900
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bd49d9fed4d63ff70ccf669cdd06027bd63c93903a136a53cb3b5045948a815c
|
3 |
size 268290900
|
run-1/checkpoint-2862/optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9070031d06060d90440e64a4923632ac3124825b3b707313668acd353160cdbd
|
3 |
+
size 536643898
|
run-1/checkpoint-2862/rng_state.pth
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:41b5282f1cde964bf5a382eb03f1cdd87f1c8e2f60e43277b3453f63947f5933
|
3 |
+
size 14244
|
run-1/checkpoint-2862/scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1064
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2a738824491ff8b261b6db2a842e15bf80355d04a07892a16df2c4715b2ee556
|
3 |
size 1064
|
run-1/checkpoint-2862/trainer_state.json
CHANGED
@@ -10,137 +10,137 @@
|
|
10 |
"log_history": [
|
11 |
{
|
12 |
"epoch": 0.9968553459119497,
|
13 |
-
"grad_norm": 0.
|
14 |
"learning_rate": 1.7784765897973445e-05,
|
15 |
-
"loss": 0.
|
16 |
"step": 317
|
17 |
},
|
18 |
{
|
19 |
"epoch": 1.0,
|
20 |
-
"eval_accuracy": 0.
|
21 |
-
"eval_loss": 0.
|
22 |
-
"eval_runtime":
|
23 |
-
"eval_samples_per_second":
|
24 |
-
"eval_steps_per_second":
|
25 |
"step": 318
|
26 |
},
|
27 |
{
|
28 |
"epoch": 1.9937106918238994,
|
29 |
-
"grad_norm": 0.
|
30 |
"learning_rate": 1.556953179594689e-05,
|
31 |
-
"loss": 0.
|
32 |
"step": 634
|
33 |
},
|
34 |
{
|
35 |
"epoch": 2.0,
|
36 |
-
"eval_accuracy": 0.
|
37 |
-
"eval_loss": 0.
|
38 |
-
"eval_runtime":
|
39 |
-
"eval_samples_per_second":
|
40 |
-
"eval_steps_per_second":
|
41 |
"step": 636
|
42 |
},
|
43 |
{
|
44 |
"epoch": 2.990566037735849,
|
45 |
-
"grad_norm": 0.
|
46 |
"learning_rate": 1.3354297693920338e-05,
|
47 |
-
"loss": 0.
|
48 |
"step": 951
|
49 |
},
|
50 |
{
|
51 |
"epoch": 3.0,
|
52 |
-
"eval_accuracy": 0.
|
53 |
-
"eval_loss": 0.
|
54 |
-
"eval_runtime":
|
55 |
-
"eval_samples_per_second":
|
56 |
-
"eval_steps_per_second":
|
57 |
"step": 954
|
58 |
},
|
59 |
{
|
60 |
"epoch": 3.9874213836477987,
|
61 |
-
"grad_norm": 0.
|
62 |
"learning_rate": 1.1139063591893781e-05,
|
63 |
-
"loss": 0.
|
64 |
"step": 1268
|
65 |
},
|
66 |
{
|
67 |
"epoch": 4.0,
|
68 |
-
"eval_accuracy": 0.
|
69 |
-
"eval_loss": 0.
|
70 |
-
"eval_runtime":
|
71 |
-
"eval_samples_per_second":
|
72 |
-
"eval_steps_per_second":
|
73 |
"step": 1272
|
74 |
},
|
75 |
{
|
76 |
"epoch": 4.984276729559748,
|
77 |
-
"grad_norm": 0.
|
78 |
"learning_rate": 8.923829489867226e-06,
|
79 |
-
"loss": 0.
|
80 |
"step": 1585
|
81 |
},
|
82 |
{
|
83 |
"epoch": 5.0,
|
84 |
-
"eval_accuracy": 0.
|
85 |
-
"eval_loss": 0.
|
86 |
-
"eval_runtime":
|
87 |
-
"eval_samples_per_second":
|
88 |
-
"eval_steps_per_second":
|
89 |
"step": 1590
|
90 |
},
|
91 |
{
|
92 |
"epoch": 5.981132075471698,
|
93 |
-
"grad_norm": 0.
|
94 |
"learning_rate": 6.708595387840672e-06,
|
95 |
-
"loss": 0.
|
96 |
"step": 1902
|
97 |
},
|
98 |
{
|
99 |
"epoch": 6.0,
|
100 |
-
"eval_accuracy": 0.
|
101 |
-
"eval_loss": 0.
|
102 |
-
"eval_runtime":
|
103 |
-
"eval_samples_per_second":
|
104 |
-
"eval_steps_per_second":
|
105 |
"step": 1908
|
106 |
},
|
107 |
{
|
108 |
"epoch": 6.977987421383648,
|
109 |
-
"grad_norm": 0.
|
110 |
"learning_rate": 4.4933612858141165e-06,
|
111 |
-
"loss": 0.
|
112 |
"step": 2219
|
113 |
},
|
114 |
{
|
115 |
"epoch": 7.0,
|
116 |
-
"eval_accuracy": 0.
|
117 |
-
"eval_loss": 0.
|
118 |
-
"eval_runtime":
|
119 |
-
"eval_samples_per_second":
|
120 |
-
"eval_steps_per_second":
|
121 |
"step": 2226
|
122 |
},
|
123 |
{
|
124 |
"epoch": 7.9748427672955975,
|
125 |
-
"grad_norm": 0.
|
126 |
"learning_rate": 2.2781271837875614e-06,
|
127 |
-
"loss": 0.
|
128 |
"step": 2536
|
129 |
},
|
130 |
{
|
131 |
"epoch": 8.0,
|
132 |
-
"eval_accuracy": 0.
|
133 |
-
"eval_loss": 0.
|
134 |
-
"eval_runtime":
|
135 |
-
"eval_samples_per_second":
|
136 |
-
"eval_steps_per_second":
|
137 |
"step": 2544
|
138 |
},
|
139 |
{
|
140 |
"epoch": 8.971698113207546,
|
141 |
-
"grad_norm": 0.
|
142 |
"learning_rate": 6.289308176100629e-08,
|
143 |
-
"loss": 0.
|
144 |
"step": 2853
|
145 |
}
|
146 |
],
|
@@ -165,8 +165,8 @@
|
|
165 |
"train_batch_size": 48,
|
166 |
"trial_name": null,
|
167 |
"trial_params": {
|
168 |
-
"alpha": 0.
|
169 |
"num_train_epochs": 9,
|
170 |
-
"temperature":
|
171 |
}
|
172 |
}
|
|
|
10 |
"log_history": [
|
11 |
{
|
12 |
"epoch": 0.9968553459119497,
|
13 |
+
"grad_norm": 0.6337465047836304,
|
14 |
"learning_rate": 1.7784765897973445e-05,
|
15 |
+
"loss": 0.3867,
|
16 |
"step": 317
|
17 |
},
|
18 |
{
|
19 |
"epoch": 1.0,
|
20 |
+
"eval_accuracy": 0.5703225806451613,
|
21 |
+
"eval_loss": 0.19408121705055237,
|
22 |
+
"eval_runtime": 1.3792,
|
23 |
+
"eval_samples_per_second": 2247.653,
|
24 |
+
"eval_steps_per_second": 47.128,
|
25 |
"step": 318
|
26 |
},
|
27 |
{
|
28 |
"epoch": 1.9937106918238994,
|
29 |
+
"grad_norm": 0.5281593799591064,
|
30 |
"learning_rate": 1.556953179594689e-05,
|
31 |
+
"loss": 0.1621,
|
32 |
"step": 634
|
33 |
},
|
34 |
{
|
35 |
"epoch": 2.0,
|
36 |
+
"eval_accuracy": 0.8274193548387097,
|
37 |
+
"eval_loss": 0.0953042134642601,
|
38 |
+
"eval_runtime": 1.5805,
|
39 |
+
"eval_samples_per_second": 1961.446,
|
40 |
+
"eval_steps_per_second": 41.127,
|
41 |
"step": 636
|
42 |
},
|
43 |
{
|
44 |
"epoch": 2.990566037735849,
|
45 |
+
"grad_norm": 0.4498799741268158,
|
46 |
"learning_rate": 1.3354297693920338e-05,
|
47 |
+
"loss": 0.1018,
|
48 |
"step": 951
|
49 |
},
|
50 |
{
|
51 |
"epoch": 3.0,
|
52 |
+
"eval_accuracy": 0.8806451612903226,
|
53 |
+
"eval_loss": 0.06378939002752304,
|
54 |
+
"eval_runtime": 1.5977,
|
55 |
+
"eval_samples_per_second": 1940.328,
|
56 |
+
"eval_steps_per_second": 40.684,
|
57 |
"step": 954
|
58 |
},
|
59 |
{
|
60 |
"epoch": 3.9874213836477987,
|
61 |
+
"grad_norm": 0.3946130573749542,
|
62 |
"learning_rate": 1.1139063591893781e-05,
|
63 |
+
"loss": 0.0769,
|
64 |
"step": 1268
|
65 |
},
|
66 |
{
|
67 |
"epoch": 4.0,
|
68 |
+
"eval_accuracy": 0.8980645161290323,
|
69 |
+
"eval_loss": 0.04853447526693344,
|
70 |
+
"eval_runtime": 1.6017,
|
71 |
+
"eval_samples_per_second": 1935.5,
|
72 |
+
"eval_steps_per_second": 40.583,
|
73 |
"step": 1272
|
74 |
},
|
75 |
{
|
76 |
"epoch": 4.984276729559748,
|
77 |
+
"grad_norm": 0.2938636541366577,
|
78 |
"learning_rate": 8.923829489867226e-06,
|
79 |
+
"loss": 0.0634,
|
80 |
"step": 1585
|
81 |
},
|
82 |
{
|
83 |
"epoch": 5.0,
|
84 |
+
"eval_accuracy": 0.9067741935483871,
|
85 |
+
"eval_loss": 0.04064928740262985,
|
86 |
+
"eval_runtime": 1.4207,
|
87 |
+
"eval_samples_per_second": 2182.039,
|
88 |
+
"eval_steps_per_second": 45.752,
|
89 |
"step": 1590
|
90 |
},
|
91 |
{
|
92 |
"epoch": 5.981132075471698,
|
93 |
+
"grad_norm": 0.3579629063606262,
|
94 |
"learning_rate": 6.708595387840672e-06,
|
95 |
+
"loss": 0.0555,
|
96 |
"step": 1902
|
97 |
},
|
98 |
{
|
99 |
"epoch": 6.0,
|
100 |
+
"eval_accuracy": 0.9135483870967742,
|
101 |
+
"eval_loss": 0.03552273288369179,
|
102 |
+
"eval_runtime": 1.617,
|
103 |
+
"eval_samples_per_second": 1917.16,
|
104 |
+
"eval_steps_per_second": 40.199,
|
105 |
"step": 1908
|
106 |
},
|
107 |
{
|
108 |
"epoch": 6.977987421383648,
|
109 |
+
"grad_norm": 0.2372261881828308,
|
110 |
"learning_rate": 4.4933612858141165e-06,
|
111 |
+
"loss": 0.0508,
|
112 |
"step": 2219
|
113 |
},
|
114 |
{
|
115 |
"epoch": 7.0,
|
116 |
+
"eval_accuracy": 0.9193548387096774,
|
117 |
+
"eval_loss": 0.033018212765455246,
|
118 |
+
"eval_runtime": 1.8153,
|
119 |
+
"eval_samples_per_second": 1707.728,
|
120 |
+
"eval_steps_per_second": 35.807,
|
121 |
"step": 2226
|
122 |
},
|
123 |
{
|
124 |
"epoch": 7.9748427672955975,
|
125 |
+
"grad_norm": 0.26516783237457275,
|
126 |
"learning_rate": 2.2781271837875614e-06,
|
127 |
+
"loss": 0.0479,
|
128 |
"step": 2536
|
129 |
},
|
130 |
{
|
131 |
"epoch": 8.0,
|
132 |
+
"eval_accuracy": 0.9219354838709677,
|
133 |
+
"eval_loss": 0.03124028816819191,
|
134 |
+
"eval_runtime": 1.622,
|
135 |
+
"eval_samples_per_second": 1911.172,
|
136 |
+
"eval_steps_per_second": 40.073,
|
137 |
"step": 2544
|
138 |
},
|
139 |
{
|
140 |
"epoch": 8.971698113207546,
|
141 |
+
"grad_norm": 0.26064929366111755,
|
142 |
"learning_rate": 6.289308176100629e-08,
|
143 |
+
"loss": 0.0462,
|
144 |
"step": 2853
|
145 |
}
|
146 |
],
|
|
|
165 |
"train_batch_size": 48,
|
166 |
"trial_name": null,
|
167 |
"trial_params": {
|
168 |
+
"alpha": 0.26432685644138476,
|
169 |
"num_train_epochs": 9,
|
170 |
+
"temperature": 15
|
171 |
}
|
172 |
}
|
run-1/checkpoint-2862/training_args.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4a4cd3bd3d7b199838f7cc7ce3902cc5b5197f609f99417e50920c79fb92f710
|
3 |
+
size 5368
|
run-10/checkpoint-3180/config.json
CHANGED
@@ -326,6 +326,6 @@
|
|
326 |
"sinusoidal_pos_embds": false,
|
327 |
"tie_weights_": true,
|
328 |
"torch_dtype": "float32",
|
329 |
-
"transformers_version": "4.47.
|
330 |
"vocab_size": 30522
|
331 |
}
|
|
|
326 |
"sinusoidal_pos_embds": false,
|
327 |
"tie_weights_": true,
|
328 |
"torch_dtype": "float32",
|
329 |
+
"transformers_version": "4.47.1",
|
330 |
"vocab_size": 30522
|
331 |
}
|
run-10/checkpoint-3180/model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 268290900
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5c11c8d8fd5dab07dc0da9a774d4ac9892b28391145c860a4321a6d78b2468e5
|
3 |
size 268290900
|
run-10/checkpoint-3180/optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bf56a6a19086ea10089ac80d4b475b67fd4477ea849dc7ec47b9c3add6cf41d2
|
3 |
+
size 536643898
|
run-10/checkpoint-3180/rng_state.pth
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4929abc25ddcb4d4986245c01bee45e03155019fd32282d1467b43fbdcdaed02
|
3 |
+
size 14244
|
run-10/checkpoint-3180/scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1064
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:33fadc11bb5c6bd0a5ea603f5e48cedcd72384fa2714656ecf6f8da629f7ae05
|
3 |
size 1064
|
run-10/checkpoint-3180/trainer_state.json
CHANGED
@@ -10,153 +10,153 @@
|
|
10 |
"log_history": [
|
11 |
{
|
12 |
"epoch": 0.9968553459119497,
|
13 |
-
"grad_norm": 0.
|
14 |
"learning_rate": 1.8006289308176103e-05,
|
15 |
-
"loss": 0.
|
16 |
"step": 317
|
17 |
},
|
18 |
{
|
19 |
"epoch": 1.0,
|
20 |
-
"eval_accuracy": 0.
|
21 |
-
"eval_loss": 0.
|
22 |
-
"eval_runtime":
|
23 |
-
"eval_samples_per_second":
|
24 |
-
"eval_steps_per_second":
|
25 |
"step": 318
|
26 |
},
|
27 |
{
|
28 |
"epoch": 1.9937106918238994,
|
29 |
-
"grad_norm": 0.
|
30 |
"learning_rate": 1.6012578616352204e-05,
|
31 |
-
"loss": 0.
|
32 |
"step": 634
|
33 |
},
|
34 |
{
|
35 |
"epoch": 2.0,
|
36 |
-
"eval_accuracy": 0.
|
37 |
-
"eval_loss": 0.
|
38 |
-
"eval_runtime":
|
39 |
-
"eval_samples_per_second":
|
40 |
-
"eval_steps_per_second":
|
41 |
"step": 636
|
42 |
},
|
43 |
{
|
44 |
"epoch": 2.990566037735849,
|
45 |
-
"grad_norm": 0.
|
46 |
"learning_rate": 1.4018867924528304e-05,
|
47 |
-
"loss": 0.
|
48 |
"step": 951
|
49 |
},
|
50 |
{
|
51 |
"epoch": 3.0,
|
52 |
-
"eval_accuracy": 0.
|
53 |
-
"eval_loss": 0.
|
54 |
-
"eval_runtime":
|
55 |
-
"eval_samples_per_second":
|
56 |
-
"eval_steps_per_second":
|
57 |
"step": 954
|
58 |
},
|
59 |
{
|
60 |
"epoch": 3.9874213836477987,
|
61 |
-
"grad_norm": 0.
|
62 |
"learning_rate": 1.2025157232704403e-05,
|
63 |
-
"loss": 0.
|
64 |
"step": 1268
|
65 |
},
|
66 |
{
|
67 |
"epoch": 4.0,
|
68 |
-
"eval_accuracy": 0.
|
69 |
-
"eval_loss": 0.
|
70 |
-
"eval_runtime":
|
71 |
-
"eval_samples_per_second":
|
72 |
-
"eval_steps_per_second":
|
73 |
"step": 1272
|
74 |
},
|
75 |
{
|
76 |
"epoch": 4.984276729559748,
|
77 |
-
"grad_norm": 0.
|
78 |
"learning_rate": 1.0031446540880504e-05,
|
79 |
"loss": 0.0692,
|
80 |
"step": 1585
|
81 |
},
|
82 |
{
|
83 |
"epoch": 5.0,
|
84 |
-
"eval_accuracy": 0.
|
85 |
-
"eval_loss": 0.
|
86 |
-
"eval_runtime":
|
87 |
-
"eval_samples_per_second":
|
88 |
-
"eval_steps_per_second":
|
89 |
"step": 1590
|
90 |
},
|
91 |
{
|
92 |
"epoch": 5.981132075471698,
|
93 |
-
"grad_norm": 0.
|
94 |
"learning_rate": 8.037735849056606e-06,
|
95 |
-
"loss": 0.
|
96 |
"step": 1902
|
97 |
},
|
98 |
{
|
99 |
"epoch": 6.0,
|
100 |
-
"eval_accuracy": 0.
|
101 |
-
"eval_loss": 0.
|
102 |
-
"eval_runtime":
|
103 |
-
"eval_samples_per_second":
|
104 |
-
"eval_steps_per_second":
|
105 |
"step": 1908
|
106 |
},
|
107 |
{
|
108 |
"epoch": 6.977987421383648,
|
109 |
-
"grad_norm": 0.
|
110 |
"learning_rate": 6.044025157232704e-06,
|
111 |
-
"loss": 0.
|
112 |
"step": 2219
|
113 |
},
|
114 |
{
|
115 |
"epoch": 7.0,
|
116 |
-
"eval_accuracy": 0.
|
117 |
-
"eval_loss": 0.
|
118 |
-
"eval_runtime":
|
119 |
-
"eval_samples_per_second":
|
120 |
-
"eval_steps_per_second":
|
121 |
"step": 2226
|
122 |
},
|
123 |
{
|
124 |
"epoch": 7.9748427672955975,
|
125 |
-
"grad_norm": 0.
|
126 |
"learning_rate": 4.0503144654088055e-06,
|
127 |
-
"loss": 0.
|
128 |
"step": 2536
|
129 |
},
|
130 |
{
|
131 |
"epoch": 8.0,
|
132 |
-
"eval_accuracy": 0.
|
133 |
-
"eval_loss": 0.
|
134 |
-
"eval_runtime":
|
135 |
-
"eval_samples_per_second":
|
136 |
-
"eval_steps_per_second":
|
137 |
"step": 2544
|
138 |
},
|
139 |
{
|
140 |
"epoch": 8.971698113207546,
|
141 |
-
"grad_norm": 0.
|
142 |
"learning_rate": 2.056603773584906e-06,
|
143 |
"loss": 0.0479,
|
144 |
"step": 2853
|
145 |
},
|
146 |
{
|
147 |
"epoch": 9.0,
|
148 |
-
"eval_accuracy": 0.
|
149 |
-
"eval_loss": 0.
|
150 |
-
"eval_runtime":
|
151 |
-
"eval_samples_per_second":
|
152 |
-
"eval_steps_per_second":
|
153 |
"step": 2862
|
154 |
},
|
155 |
{
|
156 |
"epoch": 9.968553459119496,
|
157 |
-
"grad_norm": 0.
|
158 |
"learning_rate": 6.289308176100629e-08,
|
159 |
-
"loss": 0.
|
160 |
"step": 3170
|
161 |
}
|
162 |
],
|
@@ -177,11 +177,11 @@
|
|
177 |
"attributes": {}
|
178 |
}
|
179 |
},
|
180 |
-
"total_flos":
|
181 |
"train_batch_size": 48,
|
182 |
"trial_name": null,
|
183 |
"trial_params": {
|
184 |
-
"alpha": 0.
|
185 |
"num_train_epochs": 10,
|
186 |
"temperature": 2
|
187 |
}
|
|
|
10 |
"log_history": [
|
11 |
{
|
12 |
"epoch": 0.9968553459119497,
|
13 |
+
"grad_norm": 0.9062672853469849,
|
14 |
"learning_rate": 1.8006289308176103e-05,
|
15 |
+
"loss": 0.8042,
|
16 |
"step": 317
|
17 |
},
|
18 |
{
|
19 |
"epoch": 1.0,
|
20 |
+
"eval_accuracy": 0.6709677419354839,
|
21 |
+
"eval_loss": 0.4064599573612213,
|
22 |
+
"eval_runtime": 1.4275,
|
23 |
+
"eval_samples_per_second": 2171.572,
|
24 |
+
"eval_steps_per_second": 45.533,
|
25 |
"step": 318
|
26 |
},
|
27 |
{
|
28 |
"epoch": 1.9937106918238994,
|
29 |
+
"grad_norm": 0.7205497026443481,
|
30 |
"learning_rate": 1.6012578616352204e-05,
|
31 |
+
"loss": 0.3038,
|
32 |
"step": 634
|
33 |
},
|
34 |
{
|
35 |
"epoch": 2.0,
|
36 |
+
"eval_accuracy": 0.847741935483871,
|
37 |
+
"eval_loss": 0.1362968385219574,
|
38 |
+
"eval_runtime": 1.6105,
|
39 |
+
"eval_samples_per_second": 1924.868,
|
40 |
+
"eval_steps_per_second": 40.36,
|
41 |
"step": 636
|
42 |
},
|
43 |
{
|
44 |
"epoch": 2.990566037735849,
|
45 |
+
"grad_norm": 0.5987477898597717,
|
46 |
"learning_rate": 1.4018867924528304e-05,
|
47 |
+
"loss": 0.1395,
|
48 |
"step": 951
|
49 |
},
|
50 |
{
|
51 |
"epoch": 3.0,
|
52 |
+
"eval_accuracy": 0.8990322580645161,
|
53 |
+
"eval_loss": 0.07024983316659927,
|
54 |
+
"eval_runtime": 1.6105,
|
55 |
+
"eval_samples_per_second": 1924.879,
|
56 |
+
"eval_steps_per_second": 40.36,
|
57 |
"step": 954
|
58 |
},
|
59 |
{
|
60 |
"epoch": 3.9874213836477987,
|
61 |
+
"grad_norm": 0.5540674924850464,
|
62 |
"learning_rate": 1.2025157232704403e-05,
|
63 |
+
"loss": 0.0891,
|
64 |
"step": 1268
|
65 |
},
|
66 |
{
|
67 |
"epoch": 4.0,
|
68 |
+
"eval_accuracy": 0.9187096774193548,
|
69 |
+
"eval_loss": 0.04933710768818855,
|
70 |
+
"eval_runtime": 1.7991,
|
71 |
+
"eval_samples_per_second": 1723.062,
|
72 |
+
"eval_steps_per_second": 36.129,
|
73 |
"step": 1272
|
74 |
},
|
75 |
{
|
76 |
"epoch": 4.984276729559748,
|
77 |
+
"grad_norm": 0.42864474654197693,
|
78 |
"learning_rate": 1.0031446540880504e-05,
|
79 |
"loss": 0.0692,
|
80 |
"step": 1585
|
81 |
},
|
82 |
{
|
83 |
"epoch": 5.0,
|
84 |
+
"eval_accuracy": 0.9241935483870968,
|
85 |
+
"eval_loss": 0.04158218950033188,
|
86 |
+
"eval_runtime": 1.6087,
|
87 |
+
"eval_samples_per_second": 1927.079,
|
88 |
+
"eval_steps_per_second": 40.406,
|
89 |
"step": 1590
|
90 |
},
|
91 |
{
|
92 |
"epoch": 5.981132075471698,
|
93 |
+
"grad_norm": 0.4545074999332428,
|
94 |
"learning_rate": 8.037735849056606e-06,
|
95 |
+
"loss": 0.0595,
|
96 |
"step": 1902
|
97 |
},
|
98 |
{
|
99 |
"epoch": 6.0,
|
100 |
+
"eval_accuracy": 0.9270967741935484,
|
101 |
+
"eval_loss": 0.03682653605937958,
|
102 |
+
"eval_runtime": 1.4287,
|
103 |
+
"eval_samples_per_second": 2169.74,
|
104 |
+
"eval_steps_per_second": 45.495,
|
105 |
"step": 1908
|
106 |
},
|
107 |
{
|
108 |
"epoch": 6.977987421383648,
|
109 |
+
"grad_norm": 0.34796932339668274,
|
110 |
"learning_rate": 6.044025157232704e-06,
|
111 |
+
"loss": 0.0538,
|
112 |
"step": 2219
|
113 |
},
|
114 |
{
|
115 |
"epoch": 7.0,
|
116 |
+
"eval_accuracy": 0.9316129032258065,
|
117 |
+
"eval_loss": 0.03404370695352554,
|
118 |
+
"eval_runtime": 1.6077,
|
119 |
+
"eval_samples_per_second": 1928.227,
|
120 |
+
"eval_steps_per_second": 40.431,
|
121 |
"step": 2226
|
122 |
},
|
123 |
{
|
124 |
"epoch": 7.9748427672955975,
|
125 |
+
"grad_norm": 0.3150351941585541,
|
126 |
"learning_rate": 4.0503144654088055e-06,
|
127 |
+
"loss": 0.0503,
|
128 |
"step": 2536
|
129 |
},
|
130 |
{
|
131 |
"epoch": 8.0,
|
132 |
+
"eval_accuracy": 0.9341935483870968,
|
133 |
+
"eval_loss": 0.03234480321407318,
|
134 |
+
"eval_runtime": 1.4217,
|
135 |
+
"eval_samples_per_second": 2180.525,
|
136 |
+
"eval_steps_per_second": 45.721,
|
137 |
"step": 2544
|
138 |
},
|
139 |
{
|
140 |
"epoch": 8.971698113207546,
|
141 |
+
"grad_norm": 0.3166097104549408,
|
142 |
"learning_rate": 2.056603773584906e-06,
|
143 |
"loss": 0.0479,
|
144 |
"step": 2853
|
145 |
},
|
146 |
{
|
147 |
"epoch": 9.0,
|
148 |
+
"eval_accuracy": 0.9341935483870968,
|
149 |
+
"eval_loss": 0.03134315088391304,
|
150 |
+
"eval_runtime": 1.6077,
|
151 |
+
"eval_samples_per_second": 1928.261,
|
152 |
+
"eval_steps_per_second": 40.431,
|
153 |
"step": 2862
|
154 |
},
|
155 |
{
|
156 |
"epoch": 9.968553459119496,
|
157 |
+
"grad_norm": 0.27577438950538635,
|
158 |
"learning_rate": 6.289308176100629e-08,
|
159 |
+
"loss": 0.0467,
|
160 |
"step": 3170
|
161 |
}
|
162 |
],
|
|
|
177 |
"attributes": {}
|
178 |
}
|
179 |
},
|
180 |
+
"total_flos": 827333546055996.0,
|
181 |
"train_batch_size": 48,
|
182 |
"trial_name": null,
|
183 |
"trial_params": {
|
184 |
+
"alpha": 0.5781383032678951,
|
185 |
"num_train_epochs": 10,
|
186 |
"temperature": 2
|
187 |
}
|
run-10/checkpoint-3180/training_args.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d588f08c0e85333fa2a4adcf5ec378e9adea9df69c799ff80677ef0f82a3e48a
|
3 |
+
size 5368
|
run-11/checkpoint-3180/config.json
CHANGED
@@ -326,6 +326,6 @@
|
|
326 |
"sinusoidal_pos_embds": false,
|
327 |
"tie_weights_": true,
|
328 |
"torch_dtype": "float32",
|
329 |
-
"transformers_version": "4.47.
|
330 |
"vocab_size": 30522
|
331 |
}
|
|
|
326 |
"sinusoidal_pos_embds": false,
|
327 |
"tie_weights_": true,
|
328 |
"torch_dtype": "float32",
|
329 |
+
"transformers_version": "4.47.1",
|
330 |
"vocab_size": 30522
|
331 |
}
|
run-11/checkpoint-3180/model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 268290900
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5c11c8d8fd5dab07dc0da9a774d4ac9892b28391145c860a4321a6d78b2468e5
|
3 |
size 268290900
|
run-11/checkpoint-3180/optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bf56a6a19086ea10089ac80d4b475b67fd4477ea849dc7ec47b9c3add6cf41d2
|
3 |
+
size 536643898
|
run-11/checkpoint-3180/rng_state.pth
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4929abc25ddcb4d4986245c01bee45e03155019fd32282d1467b43fbdcdaed02
|
3 |
+
size 14244
|
run-11/checkpoint-3180/scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1064
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:33fadc11bb5c6bd0a5ea603f5e48cedcd72384fa2714656ecf6f8da629f7ae05
|
3 |
size 1064
|
run-11/checkpoint-3180/trainer_state.json
CHANGED
@@ -10,153 +10,153 @@
|
|
10 |
"log_history": [
|
11 |
{
|
12 |
"epoch": 0.9968553459119497,
|
13 |
-
"grad_norm": 0.
|
14 |
"learning_rate": 1.8006289308176103e-05,
|
15 |
-
"loss": 0.
|
16 |
"step": 317
|
17 |
},
|
18 |
{
|
19 |
"epoch": 1.0,
|
20 |
-
"eval_accuracy": 0.
|
21 |
-
"eval_loss": 0.
|
22 |
-
"eval_runtime":
|
23 |
-
"eval_samples_per_second":
|
24 |
-
"eval_steps_per_second":
|
25 |
"step": 318
|
26 |
},
|
27 |
{
|
28 |
"epoch": 1.9937106918238994,
|
29 |
-
"grad_norm": 0.
|
30 |
"learning_rate": 1.6012578616352204e-05,
|
31 |
-
"loss": 0.
|
32 |
"step": 634
|
33 |
},
|
34 |
{
|
35 |
"epoch": 2.0,
|
36 |
-
"eval_accuracy": 0.
|
37 |
-
"eval_loss": 0.
|
38 |
-
"eval_runtime":
|
39 |
-
"eval_samples_per_second":
|
40 |
-
"eval_steps_per_second":
|
41 |
"step": 636
|
42 |
},
|
43 |
{
|
44 |
"epoch": 2.990566037735849,
|
45 |
-
"grad_norm": 0.
|
46 |
"learning_rate": 1.4018867924528304e-05,
|
47 |
-
"loss": 0.
|
48 |
"step": 951
|
49 |
},
|
50 |
{
|
51 |
"epoch": 3.0,
|
52 |
-
"eval_accuracy": 0.
|
53 |
-
"eval_loss": 0.
|
54 |
-
"eval_runtime":
|
55 |
-
"eval_samples_per_second":
|
56 |
-
"eval_steps_per_second":
|
57 |
"step": 954
|
58 |
},
|
59 |
{
|
60 |
"epoch": 3.9874213836477987,
|
61 |
-
"grad_norm": 0.
|
62 |
"learning_rate": 1.2025157232704403e-05,
|
63 |
-
"loss": 0.
|
64 |
"step": 1268
|
65 |
},
|
66 |
{
|
67 |
"epoch": 4.0,
|
68 |
-
"eval_accuracy": 0.
|
69 |
-
"eval_loss": 0.
|
70 |
-
"eval_runtime":
|
71 |
-
"eval_samples_per_second":
|
72 |
-
"eval_steps_per_second":
|
73 |
"step": 1272
|
74 |
},
|
75 |
{
|
76 |
"epoch": 4.984276729559748,
|
77 |
-
"grad_norm": 0.
|
78 |
"learning_rate": 1.0031446540880504e-05,
|
79 |
"loss": 0.0692,
|
80 |
"step": 1585
|
81 |
},
|
82 |
{
|
83 |
"epoch": 5.0,
|
84 |
-
"eval_accuracy": 0.
|
85 |
-
"eval_loss": 0.
|
86 |
-
"eval_runtime":
|
87 |
-
"eval_samples_per_second":
|
88 |
-
"eval_steps_per_second":
|
89 |
"step": 1590
|
90 |
},
|
91 |
{
|
92 |
"epoch": 5.981132075471698,
|
93 |
-
"grad_norm": 0.
|
94 |
"learning_rate": 8.037735849056606e-06,
|
95 |
-
"loss": 0.
|
96 |
"step": 1902
|
97 |
},
|
98 |
{
|
99 |
"epoch": 6.0,
|
100 |
"eval_accuracy": 0.9270967741935484,
|
101 |
-
"eval_loss": 0.
|
102 |
-
"eval_runtime":
|
103 |
-
"eval_samples_per_second":
|
104 |
-
"eval_steps_per_second":
|
105 |
"step": 1908
|
106 |
},
|
107 |
{
|
108 |
"epoch": 6.977987421383648,
|
109 |
-
"grad_norm": 0.
|
110 |
"learning_rate": 6.044025157232704e-06,
|
111 |
-
"loss": 0.
|
112 |
"step": 2219
|
113 |
},
|
114 |
{
|
115 |
"epoch": 7.0,
|
116 |
-
"eval_accuracy": 0.
|
117 |
-
"eval_loss": 0.
|
118 |
-
"eval_runtime":
|
119 |
-
"eval_samples_per_second":
|
120 |
-
"eval_steps_per_second":
|
121 |
"step": 2226
|
122 |
},
|
123 |
{
|
124 |
"epoch": 7.9748427672955975,
|
125 |
-
"grad_norm": 0.
|
126 |
"learning_rate": 4.0503144654088055e-06,
|
127 |
-
"loss": 0.
|
128 |
"step": 2536
|
129 |
},
|
130 |
{
|
131 |
"epoch": 8.0,
|
132 |
-
"eval_accuracy": 0.
|
133 |
-
"eval_loss": 0.
|
134 |
-
"eval_runtime":
|
135 |
-
"eval_samples_per_second":
|
136 |
-
"eval_steps_per_second":
|
137 |
"step": 2544
|
138 |
},
|
139 |
{
|
140 |
"epoch": 8.971698113207546,
|
141 |
-
"grad_norm": 0.
|
142 |
"learning_rate": 2.056603773584906e-06,
|
143 |
"loss": 0.0479,
|
144 |
"step": 2853
|
145 |
},
|
146 |
{
|
147 |
"epoch": 9.0,
|
148 |
-
"eval_accuracy": 0.
|
149 |
-
"eval_loss": 0.
|
150 |
-
"eval_runtime":
|
151 |
-
"eval_samples_per_second":
|
152 |
-
"eval_steps_per_second":
|
153 |
"step": 2862
|
154 |
},
|
155 |
{
|
156 |
"epoch": 9.968553459119496,
|
157 |
-
"grad_norm": 0.
|
158 |
"learning_rate": 6.289308176100629e-08,
|
159 |
-
"loss": 0.
|
160 |
"step": 3170
|
161 |
}
|
162 |
],
|
@@ -181,7 +181,7 @@
|
|
181 |
"train_batch_size": 48,
|
182 |
"trial_name": null,
|
183 |
"trial_params": {
|
184 |
-
"alpha": 0.
|
185 |
"num_train_epochs": 10,
|
186 |
"temperature": 2
|
187 |
}
|
|
|
10 |
"log_history": [
|
11 |
{
|
12 |
"epoch": 0.9968553459119497,
|
13 |
+
"grad_norm": 0.9062672853469849,
|
14 |
"learning_rate": 1.8006289308176103e-05,
|
15 |
+
"loss": 0.8042,
|
16 |
"step": 317
|
17 |
},
|
18 |
{
|
19 |
"epoch": 1.0,
|
20 |
+
"eval_accuracy": 0.6709677419354839,
|
21 |
+
"eval_loss": 0.4064599573612213,
|
22 |
+
"eval_runtime": 1.3737,
|
23 |
+
"eval_samples_per_second": 2256.722,
|
24 |
+
"eval_steps_per_second": 47.318,
|
25 |
"step": 318
|
26 |
},
|
27 |
{
|
28 |
"epoch": 1.9937106918238994,
|
29 |
+
"grad_norm": 0.7205497026443481,
|
30 |
"learning_rate": 1.6012578616352204e-05,
|
31 |
+
"loss": 0.3038,
|
32 |
"step": 634
|
33 |
},
|
34 |
{
|
35 |
"epoch": 2.0,
|
36 |
+
"eval_accuracy": 0.847741935483871,
|
37 |
+
"eval_loss": 0.1362968385219574,
|
38 |
+
"eval_runtime": 1.392,
|
39 |
+
"eval_samples_per_second": 2227.027,
|
40 |
+
"eval_steps_per_second": 46.696,
|
41 |
"step": 636
|
42 |
},
|
43 |
{
|
44 |
"epoch": 2.990566037735849,
|
45 |
+
"grad_norm": 0.5987477898597717,
|
46 |
"learning_rate": 1.4018867924528304e-05,
|
47 |
+
"loss": 0.1395,
|
48 |
"step": 951
|
49 |
},
|
50 |
{
|
51 |
"epoch": 3.0,
|
52 |
+
"eval_accuracy": 0.8990322580645161,
|
53 |
+
"eval_loss": 0.07024983316659927,
|
54 |
+
"eval_runtime": 1.4088,
|
55 |
+
"eval_samples_per_second": 2200.388,
|
56 |
+
"eval_steps_per_second": 46.137,
|
57 |
"step": 954
|
58 |
},
|
59 |
{
|
60 |
"epoch": 3.9874213836477987,
|
61 |
+
"grad_norm": 0.5540674924850464,
|
62 |
"learning_rate": 1.2025157232704403e-05,
|
63 |
+
"loss": 0.0891,
|
64 |
"step": 1268
|
65 |
},
|
66 |
{
|
67 |
"epoch": 4.0,
|
68 |
+
"eval_accuracy": 0.9187096774193548,
|
69 |
+
"eval_loss": 0.04933710768818855,
|
70 |
+
"eval_runtime": 1.6037,
|
71 |
+
"eval_samples_per_second": 1932.992,
|
72 |
+
"eval_steps_per_second": 40.53,
|
73 |
"step": 1272
|
74 |
},
|
75 |
{
|
76 |
"epoch": 4.984276729559748,
|
77 |
+
"grad_norm": 0.42864474654197693,
|
78 |
"learning_rate": 1.0031446540880504e-05,
|
79 |
"loss": 0.0692,
|
80 |
"step": 1585
|
81 |
},
|
82 |
{
|
83 |
"epoch": 5.0,
|
84 |
+
"eval_accuracy": 0.9241935483870968,
|
85 |
+
"eval_loss": 0.04158218950033188,
|
86 |
+
"eval_runtime": 1.4224,
|
87 |
+
"eval_samples_per_second": 2179.482,
|
88 |
+
"eval_steps_per_second": 45.699,
|
89 |
"step": 1590
|
90 |
},
|
91 |
{
|
92 |
"epoch": 5.981132075471698,
|
93 |
+
"grad_norm": 0.4545074999332428,
|
94 |
"learning_rate": 8.037735849056606e-06,
|
95 |
+
"loss": 0.0595,
|
96 |
"step": 1902
|
97 |
},
|
98 |
{
|
99 |
"epoch": 6.0,
|
100 |
"eval_accuracy": 0.9270967741935484,
|
101 |
+
"eval_loss": 0.03682653605937958,
|
102 |
+
"eval_runtime": 1.4223,
|
103 |
+
"eval_samples_per_second": 2179.524,
|
104 |
+
"eval_steps_per_second": 45.7,
|
105 |
"step": 1908
|
106 |
},
|
107 |
{
|
108 |
"epoch": 6.977987421383648,
|
109 |
+
"grad_norm": 0.34796932339668274,
|
110 |
"learning_rate": 6.044025157232704e-06,
|
111 |
+
"loss": 0.0538,
|
112 |
"step": 2219
|
113 |
},
|
114 |
{
|
115 |
"epoch": 7.0,
|
116 |
+
"eval_accuracy": 0.9316129032258065,
|
117 |
+
"eval_loss": 0.03404370695352554,
|
118 |
+
"eval_runtime": 1.6095,
|
119 |
+
"eval_samples_per_second": 1926.117,
|
120 |
+
"eval_steps_per_second": 40.386,
|
121 |
"step": 2226
|
122 |
},
|
123 |
{
|
124 |
"epoch": 7.9748427672955975,
|
125 |
+
"grad_norm": 0.3150351941585541,
|
126 |
"learning_rate": 4.0503144654088055e-06,
|
127 |
+
"loss": 0.0503,
|
128 |
"step": 2536
|
129 |
},
|
130 |
{
|
131 |
"epoch": 8.0,
|
132 |
+
"eval_accuracy": 0.9341935483870968,
|
133 |
+
"eval_loss": 0.03234480321407318,
|
134 |
+
"eval_runtime": 1.4255,
|
135 |
+
"eval_samples_per_second": 2174.649,
|
136 |
+
"eval_steps_per_second": 45.597,
|
137 |
"step": 2544
|
138 |
},
|
139 |
{
|
140 |
"epoch": 8.971698113207546,
|
141 |
+
"grad_norm": 0.3166097104549408,
|
142 |
"learning_rate": 2.056603773584906e-06,
|
143 |
"loss": 0.0479,
|
144 |
"step": 2853
|
145 |
},
|
146 |
{
|
147 |
"epoch": 9.0,
|
148 |
+
"eval_accuracy": 0.9341935483870968,
|
149 |
+
"eval_loss": 0.03134315088391304,
|
150 |
+
"eval_runtime": 1.6252,
|
151 |
+
"eval_samples_per_second": 1907.451,
|
152 |
+
"eval_steps_per_second": 39.995,
|
153 |
"step": 2862
|
154 |
},
|
155 |
{
|
156 |
"epoch": 9.968553459119496,
|
157 |
+
"grad_norm": 0.27577438950538635,
|
158 |
"learning_rate": 6.289308176100629e-08,
|
159 |
+
"loss": 0.0467,
|
160 |
"step": 3170
|
161 |
}
|
162 |
],
|
|
|
181 |
"train_batch_size": 48,
|
182 |
"trial_name": null,
|
183 |
"trial_params": {
|
184 |
+
"alpha": 0.5555856325738124,
|
185 |
"num_train_epochs": 10,
|
186 |
"temperature": 2
|
187 |
}
|
run-11/checkpoint-3180/training_args.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d588f08c0e85333fa2a4adcf5ec378e9adea9df69c799ff80677ef0f82a3e48a
|
3 |
+
size 5368
|
run-12/checkpoint-2862/config.json
CHANGED
@@ -326,6 +326,6 @@
|
|
326 |
"sinusoidal_pos_embds": false,
|
327 |
"tie_weights_": true,
|
328 |
"torch_dtype": "float32",
|
329 |
-
"transformers_version": "4.47.
|
330 |
"vocab_size": 30522
|
331 |
}
|
|
|
326 |
"sinusoidal_pos_embds": false,
|
327 |
"tie_weights_": true,
|
328 |
"torch_dtype": "float32",
|
329 |
+
"transformers_version": "4.47.1",
|
330 |
"vocab_size": 30522
|
331 |
}
|
run-12/checkpoint-2862/model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 268290900
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c18c4a133656b004cd6c65fc4815e9ef62f1ec3522774d37fb11c5daa779d3e2
|
3 |
size 268290900
|
run-12/checkpoint-2862/optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a81143e1443ee75c0758f59ffc911f4898c4f4e07b40479ce205da7301c876f7
|
3 |
+
size 536643898
|
run-12/checkpoint-2862/rng_state.pth
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:41b5282f1cde964bf5a382eb03f1cdd87f1c8e2f60e43277b3453f63947f5933
|
3 |
+
size 14244
|
run-12/checkpoint-2862/scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1064
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2a738824491ff8b261b6db2a842e15bf80355d04a07892a16df2c4715b2ee556
|
3 |
size 1064
|
run-12/checkpoint-2862/trainer_state.json
CHANGED
@@ -10,135 +10,135 @@
|
|
10 |
"log_history": [
|
11 |
{
|
12 |
"epoch": 0.9968553459119497,
|
13 |
-
"grad_norm": 0.
|
14 |
"learning_rate": 1.7784765897973445e-05,
|
15 |
-
"loss": 0.
|
16 |
"step": 317
|
17 |
},
|
18 |
{
|
19 |
"epoch": 1.0,
|
20 |
-
"eval_accuracy": 0.
|
21 |
-
"eval_loss": 0.
|
22 |
-
"eval_runtime":
|
23 |
-
"eval_samples_per_second":
|
24 |
-
"eval_steps_per_second":
|
25 |
"step": 318
|
26 |
},
|
27 |
{
|
28 |
"epoch": 1.9937106918238994,
|
29 |
-
"grad_norm": 0.
|
30 |
"learning_rate": 1.556953179594689e-05,
|
31 |
-
"loss": 0.
|
32 |
"step": 634
|
33 |
},
|
34 |
{
|
35 |
"epoch": 2.0,
|
36 |
-
"eval_accuracy": 0.
|
37 |
-
"eval_loss": 0.
|
38 |
-
"eval_runtime":
|
39 |
-
"eval_samples_per_second":
|
40 |
-
"eval_steps_per_second":
|
41 |
"step": 636
|
42 |
},
|
43 |
{
|
44 |
"epoch": 2.990566037735849,
|
45 |
-
"grad_norm": 0.
|
46 |
"learning_rate": 1.3354297693920338e-05,
|
47 |
-
"loss": 0.
|
48 |
"step": 951
|
49 |
},
|
50 |
{
|
51 |
"epoch": 3.0,
|
52 |
-
"eval_accuracy": 0.
|
53 |
-
"eval_loss": 0.
|
54 |
-
"eval_runtime":
|
55 |
-
"eval_samples_per_second":
|
56 |
-
"eval_steps_per_second":
|
57 |
"step": 954
|
58 |
},
|
59 |
{
|
60 |
"epoch": 3.9874213836477987,
|
61 |
-
"grad_norm": 0.
|
62 |
"learning_rate": 1.1139063591893781e-05,
|
63 |
-
"loss": 0.
|
64 |
"step": 1268
|
65 |
},
|
66 |
{
|
67 |
"epoch": 4.0,
|
68 |
-
"eval_accuracy": 0.
|
69 |
-
"eval_loss": 0.
|
70 |
-
"eval_runtime":
|
71 |
-
"eval_samples_per_second":
|
72 |
-
"eval_steps_per_second":
|
73 |
"step": 1272
|
74 |
},
|
75 |
{
|
76 |
"epoch": 4.984276729559748,
|
77 |
-
"grad_norm": 0.
|
78 |
"learning_rate": 8.923829489867226e-06,
|
79 |
-
"loss": 0.
|
80 |
"step": 1585
|
81 |
},
|
82 |
{
|
83 |
"epoch": 5.0,
|
84 |
-
"eval_accuracy": 0.
|
85 |
-
"eval_loss": 0.
|
86 |
-
"eval_runtime":
|
87 |
-
"eval_samples_per_second":
|
88 |
-
"eval_steps_per_second":
|
89 |
"step": 1590
|
90 |
},
|
91 |
{
|
92 |
"epoch": 5.981132075471698,
|
93 |
-
"grad_norm": 0.
|
94 |
"learning_rate": 6.708595387840672e-06,
|
95 |
-
"loss": 0.
|
96 |
"step": 1902
|
97 |
},
|
98 |
{
|
99 |
"epoch": 6.0,
|
100 |
-
"eval_accuracy": 0.
|
101 |
-
"eval_loss": 0.
|
102 |
-
"eval_runtime":
|
103 |
-
"eval_samples_per_second":
|
104 |
-
"eval_steps_per_second":
|
105 |
"step": 1908
|
106 |
},
|
107 |
{
|
108 |
"epoch": 6.977987421383648,
|
109 |
-
"grad_norm": 0.
|
110 |
"learning_rate": 4.4933612858141165e-06,
|
111 |
-
"loss": 0.
|
112 |
"step": 2219
|
113 |
},
|
114 |
{
|
115 |
"epoch": 7.0,
|
116 |
-
"eval_accuracy": 0.
|
117 |
-
"eval_loss": 0.
|
118 |
-
"eval_runtime":
|
119 |
-
"eval_samples_per_second":
|
120 |
-
"eval_steps_per_second":
|
121 |
"step": 2226
|
122 |
},
|
123 |
{
|
124 |
"epoch": 7.9748427672955975,
|
125 |
-
"grad_norm": 0.
|
126 |
"learning_rate": 2.2781271837875614e-06,
|
127 |
-
"loss": 0.
|
128 |
"step": 2536
|
129 |
},
|
130 |
{
|
131 |
"epoch": 8.0,
|
132 |
-
"eval_accuracy": 0.
|
133 |
-
"eval_loss": 0.
|
134 |
-
"eval_runtime":
|
135 |
-
"eval_samples_per_second":
|
136 |
-
"eval_steps_per_second":
|
137 |
"step": 2544
|
138 |
},
|
139 |
{
|
140 |
"epoch": 8.971698113207546,
|
141 |
-
"grad_norm": 0.
|
142 |
"learning_rate": 6.289308176100629e-08,
|
143 |
"loss": 0.0511,
|
144 |
"step": 2853
|
@@ -165,7 +165,7 @@
|
|
165 |
"train_batch_size": 48,
|
166 |
"trial_name": null,
|
167 |
"trial_params": {
|
168 |
-
"alpha": 0.
|
169 |
"num_train_epochs": 9,
|
170 |
"temperature": 2
|
171 |
}
|
|
|
10 |
"log_history": [
|
11 |
{
|
12 |
"epoch": 0.9968553459119497,
|
13 |
+
"grad_norm": 0.9016032814979553,
|
14 |
"learning_rate": 1.7784765897973445e-05,
|
15 |
+
"loss": 0.8053,
|
16 |
"step": 317
|
17 |
},
|
18 |
{
|
19 |
"epoch": 1.0,
|
20 |
+
"eval_accuracy": 0.6696774193548387,
|
21 |
+
"eval_loss": 0.40919938683509827,
|
22 |
+
"eval_runtime": 1.401,
|
23 |
+
"eval_samples_per_second": 2212.634,
|
24 |
+
"eval_steps_per_second": 46.394,
|
25 |
"step": 318
|
26 |
},
|
27 |
{
|
28 |
"epoch": 1.9937106918238994,
|
29 |
+
"grad_norm": 0.7222766876220703,
|
30 |
"learning_rate": 1.556953179594689e-05,
|
31 |
+
"loss": 0.3073,
|
32 |
"step": 634
|
33 |
},
|
34 |
{
|
35 |
"epoch": 2.0,
|
36 |
+
"eval_accuracy": 0.844516129032258,
|
37 |
+
"eval_loss": 0.13953416049480438,
|
38 |
+
"eval_runtime": 1.4134,
|
39 |
+
"eval_samples_per_second": 2193.232,
|
40 |
+
"eval_steps_per_second": 45.987,
|
41 |
"step": 636
|
42 |
},
|
43 |
{
|
44 |
"epoch": 2.990566037735849,
|
45 |
+
"grad_norm": 0.6133838891983032,
|
46 |
"learning_rate": 1.3354297693920338e-05,
|
47 |
+
"loss": 0.1428,
|
48 |
"step": 951
|
49 |
},
|
50 |
{
|
51 |
"epoch": 3.0,
|
52 |
+
"eval_accuracy": 0.8964516129032258,
|
53 |
+
"eval_loss": 0.0722731500864029,
|
54 |
+
"eval_runtime": 1.4182,
|
55 |
+
"eval_samples_per_second": 2185.887,
|
56 |
+
"eval_steps_per_second": 45.833,
|
57 |
"step": 954
|
58 |
},
|
59 |
{
|
60 |
"epoch": 3.9874213836477987,
|
61 |
+
"grad_norm": 0.5777781009674072,
|
62 |
"learning_rate": 1.1139063591893781e-05,
|
63 |
+
"loss": 0.0917,
|
64 |
"step": 1268
|
65 |
},
|
66 |
{
|
67 |
"epoch": 4.0,
|
68 |
+
"eval_accuracy": 0.9180645161290323,
|
69 |
+
"eval_loss": 0.050948865711688995,
|
70 |
+
"eval_runtime": 1.447,
|
71 |
+
"eval_samples_per_second": 2142.316,
|
72 |
+
"eval_steps_per_second": 44.92,
|
73 |
"step": 1272
|
74 |
},
|
75 |
{
|
76 |
"epoch": 4.984276729559748,
|
77 |
+
"grad_norm": 0.4380759596824646,
|
78 |
"learning_rate": 8.923829489867226e-06,
|
79 |
+
"loss": 0.0715,
|
80 |
"step": 1585
|
81 |
},
|
82 |
{
|
83 |
"epoch": 5.0,
|
84 |
+
"eval_accuracy": 0.9232258064516129,
|
85 |
+
"eval_loss": 0.042863838374614716,
|
86 |
+
"eval_runtime": 1.4275,
|
87 |
+
"eval_samples_per_second": 2171.562,
|
88 |
+
"eval_steps_per_second": 45.533,
|
89 |
"step": 1590
|
90 |
},
|
91 |
{
|
92 |
"epoch": 5.981132075471698,
|
93 |
+
"grad_norm": 0.46454158425331116,
|
94 |
"learning_rate": 6.708595387840672e-06,
|
95 |
+
"loss": 0.0617,
|
96 |
"step": 1902
|
97 |
},
|
98 |
{
|
99 |
"epoch": 6.0,
|
100 |
+
"eval_accuracy": 0.9261290322580645,
|
101 |
+
"eval_loss": 0.03817891329526901,
|
102 |
+
"eval_runtime": 1.4469,
|
103 |
+
"eval_samples_per_second": 2142.441,
|
104 |
+
"eval_steps_per_second": 44.922,
|
105 |
"step": 1908
|
106 |
},
|
107 |
{
|
108 |
"epoch": 6.977987421383648,
|
109 |
+
"grad_norm": 0.34600403904914856,
|
110 |
"learning_rate": 4.4933612858141165e-06,
|
111 |
+
"loss": 0.0562,
|
112 |
"step": 2219
|
113 |
},
|
114 |
{
|
115 |
"epoch": 7.0,
|
116 |
+
"eval_accuracy": 0.9303225806451613,
|
117 |
+
"eval_loss": 0.035540465265512466,
|
118 |
+
"eval_runtime": 1.6133,
|
119 |
+
"eval_samples_per_second": 1921.568,
|
120 |
+
"eval_steps_per_second": 40.291,
|
121 |
"step": 2226
|
122 |
},
|
123 |
{
|
124 |
"epoch": 7.9748427672955975,
|
125 |
+
"grad_norm": 0.32331326603889465,
|
126 |
"learning_rate": 2.2781271837875614e-06,
|
127 |
+
"loss": 0.053,
|
128 |
"step": 2536
|
129 |
},
|
130 |
{
|
131 |
"epoch": 8.0,
|
132 |
+
"eval_accuracy": 0.932258064516129,
|
133 |
+
"eval_loss": 0.034126147627830505,
|
134 |
+
"eval_runtime": 1.6193,
|
135 |
+
"eval_samples_per_second": 1914.441,
|
136 |
+
"eval_steps_per_second": 40.142,
|
137 |
"step": 2544
|
138 |
},
|
139 |
{
|
140 |
"epoch": 8.971698113207546,
|
141 |
+
"grad_norm": 0.3256802558898926,
|
142 |
"learning_rate": 6.289308176100629e-08,
|
143 |
"loss": 0.0511,
|
144 |
"step": 2853
|
|
|
165 |
"train_batch_size": 48,
|
166 |
"trial_name": null,
|
167 |
"trial_params": {
|
168 |
+
"alpha": 0.5906402738898417,
|
169 |
"num_train_epochs": 9,
|
170 |
"temperature": 2
|
171 |
}
|
run-12/checkpoint-2862/training_args.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3fc3aa49ba06431aba065e94edc972c0714cf8bbe8237b664cb1781900dce380
|
3 |
+
size 5368
|
run-12/checkpoint-3180/config.json
CHANGED
@@ -326,6 +326,6 @@
|
|
326 |
"sinusoidal_pos_embds": false,
|
327 |
"tie_weights_": true,
|
328 |
"torch_dtype": "float32",
|
329 |
-
"transformers_version": "4.47.
|
330 |
"vocab_size": 30522
|
331 |
}
|
|
|
326 |
"sinusoidal_pos_embds": false,
|
327 |
"tie_weights_": true,
|
328 |
"torch_dtype": "float32",
|
329 |
+
"transformers_version": "4.47.1",
|
330 |
"vocab_size": 30522
|
331 |
}
|
run-12/checkpoint-3180/model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 268290900
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:43a6cedd76d3a1b3e0bb77e4d00c341877299c5525fce1d5e4626930c66c2446
|
3 |
size 268290900
|
run-12/checkpoint-3180/optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0b65b32663111aef62cead559fa75318ca4c1b5ee0fefefc8732da15c1c2e734
|
3 |
+
size 536643898
|
run-12/checkpoint-3180/rng_state.pth
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4929abc25ddcb4d4986245c01bee45e03155019fd32282d1467b43fbdcdaed02
|
3 |
+
size 14244
|
run-12/checkpoint-3180/scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1064
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:33fadc11bb5c6bd0a5ea603f5e48cedcd72384fa2714656ecf6f8da629f7ae05
|
3 |
size 1064
|
run-12/checkpoint-3180/trainer_state.json
CHANGED
@@ -10,153 +10,153 @@
|
|
10 |
"log_history": [
|
11 |
{
|
12 |
"epoch": 0.9968553459119497,
|
13 |
-
"grad_norm": 0.
|
14 |
"learning_rate": 1.8006289308176103e-05,
|
15 |
-
"loss": 0.
|
16 |
"step": 317
|
17 |
},
|
18 |
{
|
19 |
"epoch": 1.0,
|
20 |
-
"eval_accuracy": 0.
|
21 |
-
"eval_loss": 0.
|
22 |
-
"eval_runtime":
|
23 |
-
"eval_samples_per_second":
|
24 |
-
"eval_steps_per_second":
|
25 |
"step": 318
|
26 |
},
|
27 |
{
|
28 |
"epoch": 1.9937106918238994,
|
29 |
-
"grad_norm": 0.
|
30 |
"learning_rate": 1.6012578616352204e-05,
|
31 |
-
"loss": 0.
|
32 |
"step": 634
|
33 |
},
|
34 |
{
|
35 |
"epoch": 2.0,
|
36 |
-
"eval_accuracy": 0.
|
37 |
-
"eval_loss": 0.
|
38 |
-
"eval_runtime":
|
39 |
-
"eval_samples_per_second":
|
40 |
-
"eval_steps_per_second":
|
41 |
"step": 636
|
42 |
},
|
43 |
{
|
44 |
"epoch": 2.990566037735849,
|
45 |
-
"grad_norm": 0.
|
46 |
"learning_rate": 1.4018867924528304e-05,
|
47 |
-
"loss": 0.
|
48 |
"step": 951
|
49 |
},
|
50 |
{
|
51 |
"epoch": 3.0,
|
52 |
"eval_accuracy": 0.8874193548387097,
|
53 |
-
"eval_loss": 0.
|
54 |
-
"eval_runtime":
|
55 |
-
"eval_samples_per_second":
|
56 |
-
"eval_steps_per_second":
|
57 |
"step": 954
|
58 |
},
|
59 |
{
|
60 |
"epoch": 3.9874213836477987,
|
61 |
-
"grad_norm": 0.
|
62 |
"learning_rate": 1.2025157232704403e-05,
|
63 |
-
"loss": 0.
|
64 |
"step": 1268
|
65 |
},
|
66 |
{
|
67 |
"epoch": 4.0,
|
68 |
-
"eval_accuracy": 0.
|
69 |
-
"eval_loss": 0.
|
70 |
-
"eval_runtime":
|
71 |
-
"eval_samples_per_second":
|
72 |
-
"eval_steps_per_second":
|
73 |
"step": 1272
|
74 |
},
|
75 |
{
|
76 |
"epoch": 4.984276729559748,
|
77 |
-
"grad_norm": 0.
|
78 |
"learning_rate": 1.0031446540880504e-05,
|
79 |
-
"loss": 0.
|
80 |
"step": 1585
|
81 |
},
|
82 |
{
|
83 |
"epoch": 5.0,
|
84 |
-
"eval_accuracy": 0.
|
85 |
-
"eval_loss": 0.
|
86 |
-
"eval_runtime":
|
87 |
-
"eval_samples_per_second":
|
88 |
-
"eval_steps_per_second":
|
89 |
"step": 1590
|
90 |
},
|
91 |
{
|
92 |
"epoch": 5.981132075471698,
|
93 |
-
"grad_norm": 0.
|
94 |
"learning_rate": 8.037735849056606e-06,
|
95 |
-
"loss": 0.
|
96 |
"step": 1902
|
97 |
},
|
98 |
{
|
99 |
"epoch": 6.0,
|
100 |
-
"eval_accuracy": 0.
|
101 |
-
"eval_loss": 0.
|
102 |
-
"eval_runtime":
|
103 |
-
"eval_samples_per_second":
|
104 |
-
"eval_steps_per_second":
|
105 |
"step": 1908
|
106 |
},
|
107 |
{
|
108 |
"epoch": 6.977987421383648,
|
109 |
-
"grad_norm": 0.
|
110 |
"learning_rate": 6.044025157232704e-06,
|
111 |
-
"loss": 0.
|
112 |
"step": 2219
|
113 |
},
|
114 |
{
|
115 |
"epoch": 7.0,
|
116 |
-
"eval_accuracy": 0.
|
117 |
-
"eval_loss": 0.
|
118 |
-
"eval_runtime":
|
119 |
-
"eval_samples_per_second":
|
120 |
-
"eval_steps_per_second":
|
121 |
"step": 2226
|
122 |
},
|
123 |
{
|
124 |
"epoch": 7.9748427672955975,
|
125 |
-
"grad_norm": 0.
|
126 |
"learning_rate": 4.0503144654088055e-06,
|
127 |
-
"loss": 0.
|
128 |
"step": 2536
|
129 |
},
|
130 |
{
|
131 |
"epoch": 8.0,
|
132 |
-
"eval_accuracy": 0.
|
133 |
-
"eval_loss": 0.
|
134 |
-
"eval_runtime":
|
135 |
-
"eval_samples_per_second":
|
136 |
-
"eval_steps_per_second":
|
137 |
"step": 2544
|
138 |
},
|
139 |
{
|
140 |
"epoch": 8.971698113207546,
|
141 |
-
"grad_norm": 0.
|
142 |
"learning_rate": 2.056603773584906e-06,
|
143 |
-
"loss": 0.
|
144 |
"step": 2853
|
145 |
},
|
146 |
{
|
147 |
"epoch": 9.0,
|
148 |
"eval_accuracy": 0.9303225806451613,
|
149 |
-
"eval_loss": 0.
|
150 |
-
"eval_runtime":
|
151 |
-
"eval_samples_per_second":
|
152 |
-
"eval_steps_per_second":
|
153 |
"step": 2862
|
154 |
},
|
155 |
{
|
156 |
"epoch": 9.968553459119496,
|
157 |
-
"grad_norm": 0.
|
158 |
"learning_rate": 6.289308176100629e-08,
|
159 |
-
"loss": 0.
|
160 |
"step": 3170
|
161 |
}
|
162 |
],
|
@@ -181,8 +181,8 @@
|
|
181 |
"train_batch_size": 48,
|
182 |
"trial_name": null,
|
183 |
"trial_params": {
|
184 |
-
"alpha": 0.
|
185 |
"num_train_epochs": 10,
|
186 |
-
"temperature":
|
187 |
}
|
188 |
}
|
|
|
10 |
"log_history": [
|
11 |
{
|
12 |
"epoch": 0.9968553459119497,
|
13 |
+
"grad_norm": 0.6776733994483948,
|
14 |
"learning_rate": 1.8006289308176103e-05,
|
15 |
+
"loss": 0.4584,
|
16 |
"step": 317
|
17 |
},
|
18 |
{
|
19 |
"epoch": 1.0,
|
20 |
+
"eval_accuracy": 0.6125806451612903,
|
21 |
+
"eval_loss": 0.229416161775589,
|
22 |
+
"eval_runtime": 1.4059,
|
23 |
+
"eval_samples_per_second": 2205.025,
|
24 |
+
"eval_steps_per_second": 46.234,
|
25 |
"step": 318
|
26 |
},
|
27 |
{
|
28 |
"epoch": 1.9937106918238994,
|
29 |
+
"grad_norm": 0.5552124977111816,
|
30 |
"learning_rate": 1.6012578616352204e-05,
|
31 |
+
"loss": 0.1874,
|
32 |
"step": 634
|
33 |
},
|
34 |
{
|
35 |
"epoch": 2.0,
|
36 |
+
"eval_accuracy": 0.8419354838709677,
|
37 |
+
"eval_loss": 0.10401267558336258,
|
38 |
+
"eval_runtime": 1.4225,
|
39 |
+
"eval_samples_per_second": 2179.306,
|
40 |
+
"eval_steps_per_second": 45.695,
|
41 |
"step": 636
|
42 |
},
|
43 |
{
|
44 |
"epoch": 2.990566037735849,
|
45 |
+
"grad_norm": 0.4675765037536621,
|
46 |
"learning_rate": 1.4018867924528304e-05,
|
47 |
+
"loss": 0.1109,
|
48 |
"step": 951
|
49 |
},
|
50 |
{
|
51 |
"epoch": 3.0,
|
52 |
"eval_accuracy": 0.8874193548387097,
|
53 |
+
"eval_loss": 0.0649406909942627,
|
54 |
+
"eval_runtime": 1.4308,
|
55 |
+
"eval_samples_per_second": 2166.649,
|
56 |
+
"eval_steps_per_second": 45.43,
|
57 |
"step": 954
|
58 |
},
|
59 |
{
|
60 |
"epoch": 3.9874213836477987,
|
61 |
+
"grad_norm": 0.41055992245674133,
|
62 |
"learning_rate": 1.2025157232704403e-05,
|
63 |
+
"loss": 0.0802,
|
64 |
"step": 1268
|
65 |
},
|
66 |
{
|
67 |
"epoch": 4.0,
|
68 |
+
"eval_accuracy": 0.9067741935483871,
|
69 |
+
"eval_loss": 0.04709634184837341,
|
70 |
+
"eval_runtime": 1.6361,
|
71 |
+
"eval_samples_per_second": 1894.729,
|
72 |
+
"eval_steps_per_second": 39.728,
|
73 |
"step": 1272
|
74 |
},
|
75 |
{
|
76 |
"epoch": 4.984276729559748,
|
77 |
+
"grad_norm": 0.31869634985923767,
|
78 |
"learning_rate": 1.0031446540880504e-05,
|
79 |
+
"loss": 0.0644,
|
80 |
"step": 1585
|
81 |
},
|
82 |
{
|
83 |
"epoch": 5.0,
|
84 |
+
"eval_accuracy": 0.915483870967742,
|
85 |
+
"eval_loss": 0.03851104900240898,
|
86 |
+
"eval_runtime": 1.4501,
|
87 |
+
"eval_samples_per_second": 2137.847,
|
88 |
+
"eval_steps_per_second": 44.826,
|
89 |
"step": 1590
|
90 |
},
|
91 |
{
|
92 |
"epoch": 5.981132075471698,
|
93 |
+
"grad_norm": 0.3909931480884552,
|
94 |
"learning_rate": 8.037735849056606e-06,
|
95 |
+
"loss": 0.0554,
|
96 |
"step": 1902
|
97 |
},
|
98 |
{
|
99 |
"epoch": 6.0,
|
100 |
+
"eval_accuracy": 0.9212903225806451,
|
101 |
+
"eval_loss": 0.033167432993650436,
|
102 |
+
"eval_runtime": 1.8412,
|
103 |
+
"eval_samples_per_second": 1683.68,
|
104 |
+
"eval_steps_per_second": 35.303,
|
105 |
"step": 1908
|
106 |
},
|
107 |
{
|
108 |
"epoch": 6.977987421383648,
|
109 |
+
"grad_norm": 0.2565544545650482,
|
110 |
"learning_rate": 6.044025157232704e-06,
|
111 |
+
"loss": 0.05,
|
112 |
"step": 2219
|
113 |
},
|
114 |
{
|
115 |
"epoch": 7.0,
|
116 |
+
"eval_accuracy": 0.927741935483871,
|
117 |
+
"eval_loss": 0.03042704612016678,
|
118 |
+
"eval_runtime": 1.4492,
|
119 |
+
"eval_samples_per_second": 2139.078,
|
120 |
+
"eval_steps_per_second": 44.852,
|
121 |
"step": 2226
|
122 |
},
|
123 |
{
|
124 |
"epoch": 7.9748427672955975,
|
125 |
+
"grad_norm": 0.27845093607902527,
|
126 |
"learning_rate": 4.0503144654088055e-06,
|
127 |
+
"loss": 0.0467,
|
128 |
"step": 2536
|
129 |
},
|
130 |
{
|
131 |
"epoch": 8.0,
|
132 |
+
"eval_accuracy": 0.9293548387096774,
|
133 |
+
"eval_loss": 0.028363477438688278,
|
134 |
+
"eval_runtime": 1.6476,
|
135 |
+
"eval_samples_per_second": 1881.537,
|
136 |
+
"eval_steps_per_second": 39.452,
|
137 |
"step": 2544
|
138 |
},
|
139 |
{
|
140 |
"epoch": 8.971698113207546,
|
141 |
+
"grad_norm": 0.2760983407497406,
|
142 |
"learning_rate": 2.056603773584906e-06,
|
143 |
+
"loss": 0.0443,
|
144 |
"step": 2853
|
145 |
},
|
146 |
{
|
147 |
"epoch": 9.0,
|
148 |
"eval_accuracy": 0.9303225806451613,
|
149 |
+
"eval_loss": 0.027360040694475174,
|
150 |
+
"eval_runtime": 1.4532,
|
151 |
+
"eval_samples_per_second": 2133.234,
|
152 |
+
"eval_steps_per_second": 44.729,
|
153 |
"step": 2862
|
154 |
},
|
155 |
{
|
156 |
"epoch": 9.968553459119496,
|
157 |
+
"grad_norm": 0.22213682532310486,
|
158 |
"learning_rate": 6.289308176100629e-08,
|
159 |
+
"loss": 0.0432,
|
160 |
"step": 3170
|
161 |
}
|
162 |
],
|
|
|
181 |
"train_batch_size": 48,
|
182 |
"trial_name": null,
|
183 |
"trial_params": {
|
184 |
+
"alpha": 0.7838482405834947,
|
185 |
"num_train_epochs": 10,
|
186 |
+
"temperature": 5
|
187 |
}
|
188 |
}
|
run-12/checkpoint-3180/training_args.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a44a54d5eb36e142309417d7c1e2047bbc008d9022a6a64a3ca97fdac4d430f2
|
3 |
+
size 5368
|
run-17/checkpoint-2862/config.json
CHANGED
@@ -326,6 +326,6 @@
|
|
326 |
"sinusoidal_pos_embds": false,
|
327 |
"tie_weights_": true,
|
328 |
"torch_dtype": "float32",
|
329 |
-
"transformers_version": "4.47.
|
330 |
"vocab_size": 30522
|
331 |
}
|
|
|
326 |
"sinusoidal_pos_embds": false,
|
327 |
"tie_weights_": true,
|
328 |
"torch_dtype": "float32",
|
329 |
+
"transformers_version": "4.47.1",
|
330 |
"vocab_size": 30522
|
331 |
}
|
run-17/checkpoint-2862/model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 268290900
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c18c4a133656b004cd6c65fc4815e9ef62f1ec3522774d37fb11c5daa779d3e2
|
3 |
size 268290900
|
run-17/checkpoint-2862/optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a81143e1443ee75c0758f59ffc911f4898c4f4e07b40479ce205da7301c876f7
|
3 |
+
size 536643898
|
run-17/checkpoint-2862/rng_state.pth
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:41b5282f1cde964bf5a382eb03f1cdd87f1c8e2f60e43277b3453f63947f5933
|
3 |
+
size 14244
|
run-17/checkpoint-2862/scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1064
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2a738824491ff8b261b6db2a842e15bf80355d04a07892a16df2c4715b2ee556
|
3 |
size 1064
|
run-17/checkpoint-2862/trainer_state.json
CHANGED
@@ -10,137 +10,137 @@
|
|
10 |
"log_history": [
|
11 |
{
|
12 |
"epoch": 0.9968553459119497,
|
13 |
-
"grad_norm": 0.
|
14 |
"learning_rate": 1.7784765897973445e-05,
|
15 |
-
"loss": 0.
|
16 |
"step": 317
|
17 |
},
|
18 |
{
|
19 |
"epoch": 1.0,
|
20 |
-
"eval_accuracy": 0.
|
21 |
-
"eval_loss": 0.
|
22 |
-
"eval_runtime":
|
23 |
-
"eval_samples_per_second":
|
24 |
-
"eval_steps_per_second":
|
25 |
"step": 318
|
26 |
},
|
27 |
{
|
28 |
"epoch": 1.9937106918238994,
|
29 |
-
"grad_norm": 0.
|
30 |
"learning_rate": 1.556953179594689e-05,
|
31 |
-
"loss": 0.
|
32 |
"step": 634
|
33 |
},
|
34 |
{
|
35 |
"epoch": 2.0,
|
36 |
-
"eval_accuracy": 0.
|
37 |
-
"eval_loss": 0.
|
38 |
-
"eval_runtime":
|
39 |
-
"eval_samples_per_second":
|
40 |
-
"eval_steps_per_second":
|
41 |
"step": 636
|
42 |
},
|
43 |
{
|
44 |
"epoch": 2.990566037735849,
|
45 |
-
"grad_norm": 0.
|
46 |
"learning_rate": 1.3354297693920338e-05,
|
47 |
-
"loss": 0.
|
48 |
"step": 951
|
49 |
},
|
50 |
{
|
51 |
"epoch": 3.0,
|
52 |
-
"eval_accuracy": 0.
|
53 |
-
"eval_loss": 0.
|
54 |
-
"eval_runtime":
|
55 |
-
"eval_samples_per_second":
|
56 |
-
"eval_steps_per_second":
|
57 |
"step": 954
|
58 |
},
|
59 |
{
|
60 |
"epoch": 3.9874213836477987,
|
61 |
-
"grad_norm": 0.
|
62 |
"learning_rate": 1.1139063591893781e-05,
|
63 |
-
"loss": 0.
|
64 |
"step": 1268
|
65 |
},
|
66 |
{
|
67 |
"epoch": 4.0,
|
68 |
-
"eval_accuracy": 0.
|
69 |
-
"eval_loss": 0.
|
70 |
-
"eval_runtime":
|
71 |
-
"eval_samples_per_second":
|
72 |
-
"eval_steps_per_second":
|
73 |
"step": 1272
|
74 |
},
|
75 |
{
|
76 |
"epoch": 4.984276729559748,
|
77 |
-
"grad_norm": 0.
|
78 |
"learning_rate": 8.923829489867226e-06,
|
79 |
-
"loss": 0.
|
80 |
"step": 1585
|
81 |
},
|
82 |
{
|
83 |
"epoch": 5.0,
|
84 |
-
"eval_accuracy": 0.
|
85 |
-
"eval_loss": 0.
|
86 |
-
"eval_runtime":
|
87 |
-
"eval_samples_per_second":
|
88 |
-
"eval_steps_per_second":
|
89 |
"step": 1590
|
90 |
},
|
91 |
{
|
92 |
"epoch": 5.981132075471698,
|
93 |
-
"grad_norm": 0.
|
94 |
"learning_rate": 6.708595387840672e-06,
|
95 |
-
"loss": 0.
|
96 |
"step": 1902
|
97 |
},
|
98 |
{
|
99 |
"epoch": 6.0,
|
100 |
-
"eval_accuracy": 0.
|
101 |
-
"eval_loss": 0.
|
102 |
-
"eval_runtime":
|
103 |
-
"eval_samples_per_second":
|
104 |
-
"eval_steps_per_second":
|
105 |
"step": 1908
|
106 |
},
|
107 |
{
|
108 |
"epoch": 6.977987421383648,
|
109 |
-
"grad_norm": 0.
|
110 |
"learning_rate": 4.4933612858141165e-06,
|
111 |
-
"loss": 0.
|
112 |
"step": 2219
|
113 |
},
|
114 |
{
|
115 |
"epoch": 7.0,
|
116 |
-
"eval_accuracy": 0.
|
117 |
-
"eval_loss": 0.
|
118 |
-
"eval_runtime":
|
119 |
-
"eval_samples_per_second":
|
120 |
-
"eval_steps_per_second":
|
121 |
"step": 2226
|
122 |
},
|
123 |
{
|
124 |
"epoch": 7.9748427672955975,
|
125 |
-
"grad_norm": 0.
|
126 |
"learning_rate": 2.2781271837875614e-06,
|
127 |
-
"loss": 0.
|
128 |
"step": 2536
|
129 |
},
|
130 |
{
|
131 |
"epoch": 8.0,
|
132 |
-
"eval_accuracy": 0.
|
133 |
-
"eval_loss": 0.
|
134 |
-
"eval_runtime":
|
135 |
-
"eval_samples_per_second":
|
136 |
-
"eval_steps_per_second":
|
137 |
"step": 2544
|
138 |
},
|
139 |
{
|
140 |
"epoch": 8.971698113207546,
|
141 |
-
"grad_norm": 0.
|
142 |
"learning_rate": 6.289308176100629e-08,
|
143 |
-
"loss": 0.
|
144 |
"step": 2853
|
145 |
}
|
146 |
],
|
@@ -161,12 +161,12 @@
|
|
161 |
"attributes": {}
|
162 |
}
|
163 |
},
|
164 |
-
"total_flos":
|
165 |
"train_batch_size": 48,
|
166 |
"trial_name": null,
|
167 |
"trial_params": {
|
168 |
-
"alpha": 0.
|
169 |
"num_train_epochs": 9,
|
170 |
-
"temperature":
|
171 |
}
|
172 |
}
|
|
|
10 |
"log_history": [
|
11 |
{
|
12 |
"epoch": 0.9968553459119497,
|
13 |
+
"grad_norm": 0.9016032814979553,
|
14 |
"learning_rate": 1.7784765897973445e-05,
|
15 |
+
"loss": 0.8053,
|
16 |
"step": 317
|
17 |
},
|
18 |
{
|
19 |
"epoch": 1.0,
|
20 |
+
"eval_accuracy": 0.6696774193548387,
|
21 |
+
"eval_loss": 0.40919938683509827,
|
22 |
+
"eval_runtime": 1.6439,
|
23 |
+
"eval_samples_per_second": 1885.749,
|
24 |
+
"eval_steps_per_second": 39.54,
|
25 |
"step": 318
|
26 |
},
|
27 |
{
|
28 |
"epoch": 1.9937106918238994,
|
29 |
+
"grad_norm": 0.7222766876220703,
|
30 |
"learning_rate": 1.556953179594689e-05,
|
31 |
+
"loss": 0.3073,
|
32 |
"step": 634
|
33 |
},
|
34 |
{
|
35 |
"epoch": 2.0,
|
36 |
+
"eval_accuracy": 0.844516129032258,
|
37 |
+
"eval_loss": 0.13953416049480438,
|
38 |
+
"eval_runtime": 1.6434,
|
39 |
+
"eval_samples_per_second": 1886.366,
|
40 |
+
"eval_steps_per_second": 39.553,
|
41 |
"step": 636
|
42 |
},
|
43 |
{
|
44 |
"epoch": 2.990566037735849,
|
45 |
+
"grad_norm": 0.6133838891983032,
|
46 |
"learning_rate": 1.3354297693920338e-05,
|
47 |
+
"loss": 0.1428,
|
48 |
"step": 951
|
49 |
},
|
50 |
{
|
51 |
"epoch": 3.0,
|
52 |
+
"eval_accuracy": 0.8964516129032258,
|
53 |
+
"eval_loss": 0.0722731500864029,
|
54 |
+
"eval_runtime": 1.6496,
|
55 |
+
"eval_samples_per_second": 1879.251,
|
56 |
+
"eval_steps_per_second": 39.404,
|
57 |
"step": 954
|
58 |
},
|
59 |
{
|
60 |
"epoch": 3.9874213836477987,
|
61 |
+
"grad_norm": 0.5777781009674072,
|
62 |
"learning_rate": 1.1139063591893781e-05,
|
63 |
+
"loss": 0.0917,
|
64 |
"step": 1268
|
65 |
},
|
66 |
{
|
67 |
"epoch": 4.0,
|
68 |
+
"eval_accuracy": 0.9180645161290323,
|
69 |
+
"eval_loss": 0.050948865711688995,
|
70 |
+
"eval_runtime": 1.4494,
|
71 |
+
"eval_samples_per_second": 2138.806,
|
72 |
+
"eval_steps_per_second": 44.846,
|
73 |
"step": 1272
|
74 |
},
|
75 |
{
|
76 |
"epoch": 4.984276729559748,
|
77 |
+
"grad_norm": 0.4380759596824646,
|
78 |
"learning_rate": 8.923829489867226e-06,
|
79 |
+
"loss": 0.0715,
|
80 |
"step": 1585
|
81 |
},
|
82 |
{
|
83 |
"epoch": 5.0,
|
84 |
+
"eval_accuracy": 0.9232258064516129,
|
85 |
+
"eval_loss": 0.042863838374614716,
|
86 |
+
"eval_runtime": 1.6395,
|
87 |
+
"eval_samples_per_second": 1890.819,
|
88 |
+
"eval_steps_per_second": 39.646,
|
89 |
"step": 1590
|
90 |
},
|
91 |
{
|
92 |
"epoch": 5.981132075471698,
|
93 |
+
"grad_norm": 0.46454158425331116,
|
94 |
"learning_rate": 6.708595387840672e-06,
|
95 |
+
"loss": 0.0617,
|
96 |
"step": 1902
|
97 |
},
|
98 |
{
|
99 |
"epoch": 6.0,
|
100 |
+
"eval_accuracy": 0.9261290322580645,
|
101 |
+
"eval_loss": 0.03817891329526901,
|
102 |
+
"eval_runtime": 1.8355,
|
103 |
+
"eval_samples_per_second": 1688.925,
|
104 |
+
"eval_steps_per_second": 35.413,
|
105 |
"step": 1908
|
106 |
},
|
107 |
{
|
108 |
"epoch": 6.977987421383648,
|
109 |
+
"grad_norm": 0.34600403904914856,
|
110 |
"learning_rate": 4.4933612858141165e-06,
|
111 |
+
"loss": 0.0562,
|
112 |
"step": 2219
|
113 |
},
|
114 |
{
|
115 |
"epoch": 7.0,
|
116 |
+
"eval_accuracy": 0.9303225806451613,
|
117 |
+
"eval_loss": 0.035540465265512466,
|
118 |
+
"eval_runtime": 1.8369,
|
119 |
+
"eval_samples_per_second": 1687.632,
|
120 |
+
"eval_steps_per_second": 35.386,
|
121 |
"step": 2226
|
122 |
},
|
123 |
{
|
124 |
"epoch": 7.9748427672955975,
|
125 |
+
"grad_norm": 0.32331326603889465,
|
126 |
"learning_rate": 2.2781271837875614e-06,
|
127 |
+
"loss": 0.053,
|
128 |
"step": 2536
|
129 |
},
|
130 |
{
|
131 |
"epoch": 8.0,
|
132 |
+
"eval_accuracy": 0.932258064516129,
|
133 |
+
"eval_loss": 0.034126147627830505,
|
134 |
+
"eval_runtime": 1.8294,
|
135 |
+
"eval_samples_per_second": 1694.566,
|
136 |
+
"eval_steps_per_second": 35.531,
|
137 |
"step": 2544
|
138 |
},
|
139 |
{
|
140 |
"epoch": 8.971698113207546,
|
141 |
+
"grad_norm": 0.3256802558898926,
|
142 |
"learning_rate": 6.289308176100629e-08,
|
143 |
+
"loss": 0.0511,
|
144 |
"step": 2853
|
145 |
}
|
146 |
],
|
|
|
161 |
"attributes": {}
|
162 |
}
|
163 |
},
|
164 |
+
"total_flos": 744918075986196.0,
|
165 |
"train_batch_size": 48,
|
166 |
"trial_name": null,
|
167 |
"trial_params": {
|
168 |
+
"alpha": 0.8523684644932737,
|
169 |
"num_train_epochs": 9,
|
170 |
+
"temperature": 2
|
171 |
}
|
172 |
}
|
run-17/checkpoint-2862/training_args.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2604ed4b06bd9bd0b94d7de27f6549fd8ad42a818c36e531355fafadbef23c48
|
3 |
+
size 5368
|
run-2/checkpoint-2544/config.json
CHANGED
@@ -326,6 +326,6 @@
|
|
326 |
"sinusoidal_pos_embds": false,
|
327 |
"tie_weights_": true,
|
328 |
"torch_dtype": "float32",
|
329 |
-
"transformers_version": "4.47.
|
330 |
"vocab_size": 30522
|
331 |
}
|
|
|
326 |
"sinusoidal_pos_embds": false,
|
327 |
"tie_weights_": true,
|
328 |
"torch_dtype": "float32",
|
329 |
+
"transformers_version": "4.47.1",
|
330 |
"vocab_size": 30522
|
331 |
}
|
run-2/checkpoint-2544/model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 268290900
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:cd73bf9fd9b9d0345f573706c8fc13e5a050e48878125734642ca11863038952
|
3 |
size 268290900
|
run-2/checkpoint-2544/optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1e4afc7f349f20b3ff0fee9612c271bf763a89d659466a6339f2187f36aee8d7
|
3 |
+
size 536643898
|
run-2/checkpoint-2544/rng_state.pth
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:940bb167a15c246bdedc32635c80794df47530e35ccbba4b4e50737052f0263e
|
3 |
+
size 14244
|
run-2/checkpoint-2544/scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 1064
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:214eac02be0209c50c951afc980c6dcbdad93d966fbfca385bff547659925bf8
|
3 |
size 1064
|