jantrienes committed on
Commit
e8c0878
1 Parent(s): 484e5df

End of training

Browse files
README.md ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ base_model: roberta-large
4
+ tags:
5
+ - generated_from_trainer
6
+ datasets:
7
+ - open_question_type
8
+ metrics:
9
+ - f1
10
+ model-index:
11
+ - name: roberta-large-question-classifier
12
+ results:
13
+ - task:
14
+ name: Text Classification
15
+ type: text-classification
16
+ dataset:
17
+ name: open_question_type
18
+ type: open_question_type
19
+ config: default
20
+ split: validation
21
+ args: default
22
+ metrics:
23
+ - name: F1
24
+ type: f1
25
+ value: 0.7954091951908298
26
+ ---
27
+
28
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
29
+ should probably proofread and complete it, then remove this comment. -->
30
+
31
+ # roberta-large-question-classifier
32
+
33
+ This model is a fine-tuned version of [roberta-large](https://huggingface.co/roberta-large) on the open_question_type dataset.
34
+ It achieves the following results on the evaluation set (the validation split) after the final training epoch (epoch 30):
35
+ - Loss: 1.9002
36
+ - F1: 0.7954
37
+
38
+ ## Model description
39
+
40
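+ A RoBERTa-large model with a sequence-classification head (`RobertaForSequenceClassification`) that assigns a single label to each input from ten question types: extent, disjunction, concept, comparison, procedural, consequence, judgmental, example, verification, and cause.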
41
+
42
+ ## Intended uses & limitations
43
+
44
+ More information needed
45
+
46
+ ## Training and evaluation data
47
+
48
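+ The model was fine-tuned on the open_question_type dataset (default configuration); the reported metrics were computed on its validation split. More information needed on dataset composition and preprocessing.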
49
+
50
+ ## Training procedure
51
+
52
+ ### Training hyperparameters
53
+
54
+ The following hyperparameters were used during training:
55
+ - learning_rate: 2e-05
56
+ - train_batch_size: 16
57
+ - eval_batch_size: 512
58
+ - seed: 42
59
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
60
+ - lr_scheduler_type: linear
61
+ - lr_scheduler_warmup_ratio: 0.1
62
+ - num_epochs: 30
63
+
64
+ ### Training results
65
+
66
+ | Training Loss | Epoch | Step | Validation Loss | F1 |
67
+ |:-------------:|:-----:|:----:|:---------------:|:------:|
68
+ | 1.9467 | 1.0 | 233 | 1.3099 | 0.4050 |
69
+ | 0.6381 | 2.0 | 466 | 0.5586 | 0.7785 |
70
+ | 0.628 | 3.0 | 699 | 0.6419 | 0.7831 |
71
+ | 0.4487 | 4.0 | 932 | 0.5770 | 0.8094 |
72
+ | 0.3319 | 5.0 | 1165 | 0.7713 | 0.7953 |
73
+ | 0.2095 | 6.0 | 1398 | 0.8799 | 0.8018 |
74
+ | 0.1355 | 7.0 | 1631 | 1.0646 | 0.7961 |
75
+ | 0.0956 | 8.0 | 1864 | 1.2175 | 0.7999 |
76
+ | 0.0687 | 9.0 | 2097 | 1.3647 | 0.7892 |
77
+ | 0.0371 | 10.0 | 2330 | 1.3809 | 0.7987 |
78
+ | 0.0303 | 11.0 | 2563 | 1.3591 | 0.8123 |
79
+ | 0.0263 | 12.0 | 2796 | 1.5317 | 0.8100 |
80
+ | 0.0144 | 13.0 | 3029 | 1.5726 | 0.7959 |
81
+ | 0.0436 | 14.0 | 3262 | 1.6160 | 0.7988 |
82
+ | 0.0048 | 15.0 | 3495 | 1.6826 | 0.7957 |
83
+ | 0.0001 | 16.0 | 3728 | 1.6913 | 0.7957 |
84
+ | 0.0001 | 17.0 | 3961 | 1.7076 | 0.7995 |
85
+ | 0.0034 | 18.0 | 4194 | 1.8018 | 0.7960 |
86
+ | 0.0228 | 19.0 | 4427 | 1.7457 | 0.7916 |
87
+ | 0.0083 | 20.0 | 4660 | 1.9279 | 0.7869 |
88
+ | 0.0001 | 21.0 | 4893 | 1.8367 | 0.7915 |
89
+ | 0.0003 | 22.0 | 5126 | 1.8620 | 0.7842 |
90
+ | 0.0002 | 23.0 | 5359 | 1.9192 | 0.7828 |
91
+ | 0.0 | 24.0 | 5592 | 1.9081 | 0.7927 |
92
+ | 0.0003 | 25.0 | 5825 | 1.9822 | 0.7813 |
93
+ | 0.0059 | 26.0 | 6058 | 1.8737 | 0.7954 |
94
+ | 0.0 | 27.0 | 6291 | 1.8793 | 0.7929 |
95
+ | 0.0 | 28.0 | 6524 | 1.8905 | 0.7940 |
96
+ | 0.0 | 29.0 | 6757 | 1.8971 | 0.7940 |
97
+ | 0.0002 | 30.0 | 6990 | 1.9002 | 0.7954 |
98
+
99
+
100
+ ### Framework versions
101
+
102
+ - Transformers 4.33.2
103
+ - Pytorch 2.1.0+cu118
104
+ - Datasets 2.14.5
105
+ - Tokenizers 0.13.3
config.json ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "roberta-large",
3
+ "architectures": [
4
+ "RobertaForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "classifier_dropout": null,
9
+ "eos_token_id": 2,
10
+ "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.1,
12
+ "hidden_size": 1024,
13
+ "id2label": {
14
+ "0": "extent",
15
+ "1": "disjunction",
16
+ "2": "concept",
17
+ "3": "comparison",
18
+ "4": "procedural",
19
+ "5": "consequence",
20
+ "6": "judgmental",
21
+ "7": "example",
22
+ "8": "verification",
23
+ "9": "cause"
24
+ },
25
+ "initializer_range": 0.02,
26
+ "intermediate_size": 4096,
27
+ "label2id": {
28
+ "cause": 9,
29
+ "comparison": 3,
30
+ "concept": 2,
31
+ "consequence": 5,
32
+ "disjunction": 1,
33
+ "example": 7,
34
+ "extent": 0,
35
+ "judgmental": 6,
36
+ "procedural": 4,
37
+ "verification": 8
38
+ },
39
+ "layer_norm_eps": 1e-05,
40
+ "max_position_embeddings": 514,
41
+ "model_type": "roberta",
42
+ "num_attention_heads": 16,
43
+ "num_hidden_layers": 24,
44
+ "pad_token_id": 1,
45
+ "position_embedding_type": "absolute",
46
+ "problem_type": "single_label_classification",
47
+ "torch_dtype": "float32",
48
+ "transformers_version": "4.33.2",
49
+ "type_vocab_size": 1,
50
+ "use_cache": true,
51
+ "vocab_size": 50265
52
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:632e4998b864201d52dc9bb19b649075ce06ceab00b63399c5ab76abf1a8007b
3
+ size 1421615982
special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "cls_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "mask_token": {
6
+ "content": "<mask>",
7
+ "lstrip": true,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "pad_token": "<pad>",
13
+ "sep_token": "</s>",
14
+ "unk_token": "<unk>"
15
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "bos_token": "<s>",
4
+ "clean_up_tokenization_spaces": true,
5
+ "cls_token": "<s>",
6
+ "eos_token": "</s>",
7
+ "errors": "replace",
8
+ "mask_token": "<mask>",
9
+ "model_max_length": 512,
10
+ "pad_token": "<pad>",
11
+ "sep_token": "</s>",
12
+ "tokenizer_class": "RobertaTokenizer",
13
+ "trim_offsets": true,
14
+ "unk_token": "<unk>"
15
+ }
trainer_state.json ADDED
@@ -0,0 +1,1132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.8123190611646329,
3
+ "best_model_checkpoint": "output/roberta-large-question-classifier/checkpoint-2563",
4
+ "epoch": 30.0,
5
+ "eval_steps": 500,
6
+ "global_step": 6990,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.21,
13
+ "learning_rate": 1.430615164520744e-06,
14
+ "loss": 2.3372,
15
+ "step": 50
16
+ },
17
+ {
18
+ "epoch": 0.43,
19
+ "learning_rate": 2.861230329041488e-06,
20
+ "loss": 2.276,
21
+ "step": 100
22
+ },
23
+ {
24
+ "epoch": 0.64,
25
+ "learning_rate": 4.291845493562232e-06,
26
+ "loss": 2.1988,
27
+ "step": 150
28
+ },
29
+ {
30
+ "epoch": 0.86,
31
+ "learning_rate": 5.722460658082976e-06,
32
+ "loss": 1.9467,
33
+ "step": 200
34
+ },
35
+ {
36
+ "epoch": 1.0,
37
+ "eval_f1": 0.4050404697492347,
38
+ "eval_loss": 1.3099409341812134,
39
+ "eval_runtime": 1.3906,
40
+ "eval_samples_per_second": 417.1,
41
+ "eval_steps_per_second": 1.438,
42
+ "step": 233
43
+ },
44
+ {
45
+ "epoch": 1.07,
46
+ "learning_rate": 7.15307582260372e-06,
47
+ "loss": 1.5551,
48
+ "step": 250
49
+ },
50
+ {
51
+ "epoch": 1.29,
52
+ "learning_rate": 8.583690987124465e-06,
53
+ "loss": 1.0537,
54
+ "step": 300
55
+ },
56
+ {
57
+ "epoch": 1.5,
58
+ "learning_rate": 1.0014306151645208e-05,
59
+ "loss": 0.872,
60
+ "step": 350
61
+ },
62
+ {
63
+ "epoch": 1.72,
64
+ "learning_rate": 1.1444921316165953e-05,
65
+ "loss": 0.6619,
66
+ "step": 400
67
+ },
68
+ {
69
+ "epoch": 1.93,
70
+ "learning_rate": 1.2875536480686697e-05,
71
+ "loss": 0.6381,
72
+ "step": 450
73
+ },
74
+ {
75
+ "epoch": 2.0,
76
+ "eval_f1": 0.7785421184302428,
77
+ "eval_loss": 0.5586220622062683,
78
+ "eval_runtime": 1.4464,
79
+ "eval_samples_per_second": 400.997,
80
+ "eval_steps_per_second": 1.383,
81
+ "step": 466
82
+ },
83
+ {
84
+ "epoch": 2.15,
85
+ "learning_rate": 1.430615164520744e-05,
86
+ "loss": 0.509,
87
+ "step": 500
88
+ },
89
+ {
90
+ "epoch": 2.36,
91
+ "learning_rate": 1.5736766809728185e-05,
92
+ "loss": 0.5387,
93
+ "step": 550
94
+ },
95
+ {
96
+ "epoch": 2.58,
97
+ "learning_rate": 1.716738197424893e-05,
98
+ "loss": 0.5163,
99
+ "step": 600
100
+ },
101
+ {
102
+ "epoch": 2.79,
103
+ "learning_rate": 1.859799713876967e-05,
104
+ "loss": 0.628,
105
+ "step": 650
106
+ },
107
+ {
108
+ "epoch": 3.0,
109
+ "eval_f1": 0.7831151120797589,
110
+ "eval_loss": 0.6418800354003906,
111
+ "eval_runtime": 1.467,
112
+ "eval_samples_per_second": 395.356,
113
+ "eval_steps_per_second": 1.363,
114
+ "step": 699
115
+ },
116
+ {
117
+ "epoch": 3.0,
118
+ "learning_rate": 1.9996820855189955e-05,
119
+ "loss": 0.5632,
120
+ "step": 700
121
+ },
122
+ {
123
+ "epoch": 3.22,
124
+ "learning_rate": 1.983786361468765e-05,
125
+ "loss": 0.4046,
126
+ "step": 750
127
+ },
128
+ {
129
+ "epoch": 3.43,
130
+ "learning_rate": 1.9678906374185345e-05,
131
+ "loss": 0.3985,
132
+ "step": 800
133
+ },
134
+ {
135
+ "epoch": 3.65,
136
+ "learning_rate": 1.951994913368304e-05,
137
+ "loss": 0.4307,
138
+ "step": 850
139
+ },
140
+ {
141
+ "epoch": 3.86,
142
+ "learning_rate": 1.9360991893180737e-05,
143
+ "loss": 0.4487,
144
+ "step": 900
145
+ },
146
+ {
147
+ "epoch": 4.0,
148
+ "eval_f1": 0.8093842888236766,
149
+ "eval_loss": 0.5770355463027954,
150
+ "eval_runtime": 1.4647,
151
+ "eval_samples_per_second": 395.985,
152
+ "eval_steps_per_second": 1.365,
153
+ "step": 932
154
+ },
155
+ {
156
+ "epoch": 4.08,
157
+ "learning_rate": 1.9202034652678432e-05,
158
+ "loss": 0.3373,
159
+ "step": 950
160
+ },
161
+ {
162
+ "epoch": 4.29,
163
+ "learning_rate": 1.9043077412176127e-05,
164
+ "loss": 0.2578,
165
+ "step": 1000
166
+ },
167
+ {
168
+ "epoch": 4.51,
169
+ "learning_rate": 1.888412017167382e-05,
170
+ "loss": 0.2675,
171
+ "step": 1050
172
+ },
173
+ {
174
+ "epoch": 4.72,
175
+ "learning_rate": 1.8725162931171516e-05,
176
+ "loss": 0.2697,
177
+ "step": 1100
178
+ },
179
+ {
180
+ "epoch": 4.94,
181
+ "learning_rate": 1.8566205690669214e-05,
182
+ "loss": 0.3319,
183
+ "step": 1150
184
+ },
185
+ {
186
+ "epoch": 5.0,
187
+ "eval_f1": 0.7952503005676876,
188
+ "eval_loss": 0.7712982296943665,
189
+ "eval_runtime": 1.5475,
190
+ "eval_samples_per_second": 374.79,
191
+ "eval_steps_per_second": 1.292,
192
+ "step": 1165
193
+ },
194
+ {
195
+ "epoch": 5.15,
196
+ "learning_rate": 1.8407248450166905e-05,
197
+ "loss": 0.2049,
198
+ "step": 1200
199
+ },
200
+ {
201
+ "epoch": 5.36,
202
+ "learning_rate": 1.82482912096646e-05,
203
+ "loss": 0.2344,
204
+ "step": 1250
205
+ },
206
+ {
207
+ "epoch": 5.58,
208
+ "learning_rate": 1.8089333969162298e-05,
209
+ "loss": 0.1843,
210
+ "step": 1300
211
+ },
212
+ {
213
+ "epoch": 5.79,
214
+ "learning_rate": 1.7930376728659993e-05,
215
+ "loss": 0.2095,
216
+ "step": 1350
217
+ },
218
+ {
219
+ "epoch": 6.0,
220
+ "eval_f1": 0.8017807103839256,
221
+ "eval_loss": 0.8798965811729431,
222
+ "eval_runtime": 1.4572,
223
+ "eval_samples_per_second": 398.025,
224
+ "eval_steps_per_second": 1.372,
225
+ "step": 1398
226
+ },
227
+ {
228
+ "epoch": 6.01,
229
+ "learning_rate": 1.7771419488157687e-05,
230
+ "loss": 0.2039,
231
+ "step": 1400
232
+ },
233
+ {
234
+ "epoch": 6.22,
235
+ "learning_rate": 1.7612462247655382e-05,
236
+ "loss": 0.0876,
237
+ "step": 1450
238
+ },
239
+ {
240
+ "epoch": 6.44,
241
+ "learning_rate": 1.7453505007153077e-05,
242
+ "loss": 0.1054,
243
+ "step": 1500
244
+ },
245
+ {
246
+ "epoch": 6.65,
247
+ "learning_rate": 1.7294547766650775e-05,
248
+ "loss": 0.1629,
249
+ "step": 1550
250
+ },
251
+ {
252
+ "epoch": 6.87,
253
+ "learning_rate": 1.7135590526148466e-05,
254
+ "loss": 0.1355,
255
+ "step": 1600
256
+ },
257
+ {
258
+ "epoch": 7.0,
259
+ "eval_f1": 0.7961224122154954,
260
+ "eval_loss": 1.0646474361419678,
261
+ "eval_runtime": 1.4774,
262
+ "eval_samples_per_second": 392.581,
263
+ "eval_steps_per_second": 1.354,
264
+ "step": 1631
265
+ },
266
+ {
267
+ "epoch": 7.08,
268
+ "learning_rate": 1.6976633285646164e-05,
269
+ "loss": 0.1457,
270
+ "step": 1650
271
+ },
272
+ {
273
+ "epoch": 7.3,
274
+ "learning_rate": 1.681767604514386e-05,
275
+ "loss": 0.0861,
276
+ "step": 1700
277
+ },
278
+ {
279
+ "epoch": 7.51,
280
+ "learning_rate": 1.6658718804641553e-05,
281
+ "loss": 0.0852,
282
+ "step": 1750
283
+ },
284
+ {
285
+ "epoch": 7.73,
286
+ "learning_rate": 1.6499761564139248e-05,
287
+ "loss": 0.1283,
288
+ "step": 1800
289
+ },
290
+ {
291
+ "epoch": 7.94,
292
+ "learning_rate": 1.6340804323636943e-05,
293
+ "loss": 0.0956,
294
+ "step": 1850
295
+ },
296
+ {
297
+ "epoch": 8.0,
298
+ "eval_f1": 0.7998718228606326,
299
+ "eval_loss": 1.2174800634384155,
300
+ "eval_runtime": 1.5358,
301
+ "eval_samples_per_second": 377.658,
302
+ "eval_steps_per_second": 1.302,
303
+ "step": 1864
304
+ },
305
+ {
306
+ "epoch": 8.15,
307
+ "learning_rate": 1.6181847083134637e-05,
308
+ "loss": 0.0862,
309
+ "step": 1900
310
+ },
311
+ {
312
+ "epoch": 8.37,
313
+ "learning_rate": 1.6022889842632335e-05,
314
+ "loss": 0.0486,
315
+ "step": 1950
316
+ },
317
+ {
318
+ "epoch": 8.58,
319
+ "learning_rate": 1.5863932602130026e-05,
320
+ "loss": 0.0321,
321
+ "step": 2000
322
+ },
323
+ {
324
+ "epoch": 8.8,
325
+ "learning_rate": 1.5704975361627725e-05,
326
+ "loss": 0.0687,
327
+ "step": 2050
328
+ },
329
+ {
330
+ "epoch": 9.0,
331
+ "eval_f1": 0.789186529273271,
332
+ "eval_loss": 1.3646652698516846,
333
+ "eval_runtime": 1.5089,
334
+ "eval_samples_per_second": 384.398,
335
+ "eval_steps_per_second": 1.326,
336
+ "step": 2097
337
+ },
338
+ {
339
+ "epoch": 9.01,
340
+ "learning_rate": 1.554601812112542e-05,
341
+ "loss": 0.0522,
342
+ "step": 2100
343
+ },
344
+ {
345
+ "epoch": 9.23,
346
+ "learning_rate": 1.5387060880623114e-05,
347
+ "loss": 0.0349,
348
+ "step": 2150
349
+ },
350
+ {
351
+ "epoch": 9.44,
352
+ "learning_rate": 1.5228103640120809e-05,
353
+ "loss": 0.0529,
354
+ "step": 2200
355
+ },
356
+ {
357
+ "epoch": 9.66,
358
+ "learning_rate": 1.5069146399618503e-05,
359
+ "loss": 0.0284,
360
+ "step": 2250
361
+ },
362
+ {
363
+ "epoch": 9.87,
364
+ "learning_rate": 1.49101891591162e-05,
365
+ "loss": 0.0371,
366
+ "step": 2300
367
+ },
368
+ {
369
+ "epoch": 10.0,
370
+ "eval_f1": 0.7986917021269787,
371
+ "eval_loss": 1.3809223175048828,
372
+ "eval_runtime": 1.6909,
373
+ "eval_samples_per_second": 343.007,
374
+ "eval_steps_per_second": 1.183,
375
+ "step": 2330
376
+ },
377
+ {
378
+ "epoch": 10.09,
379
+ "learning_rate": 1.4751231918613892e-05,
380
+ "loss": 0.0143,
381
+ "step": 2350
382
+ },
383
+ {
384
+ "epoch": 10.3,
385
+ "learning_rate": 1.4592274678111589e-05,
386
+ "loss": 0.0012,
387
+ "step": 2400
388
+ },
389
+ {
390
+ "epoch": 10.52,
391
+ "learning_rate": 1.4433317437609285e-05,
392
+ "loss": 0.0117,
393
+ "step": 2450
394
+ },
395
+ {
396
+ "epoch": 10.73,
397
+ "learning_rate": 1.427436019710698e-05,
398
+ "loss": 0.0248,
399
+ "step": 2500
400
+ },
401
+ {
402
+ "epoch": 10.94,
403
+ "learning_rate": 1.4115402956604673e-05,
404
+ "loss": 0.0303,
405
+ "step": 2550
406
+ },
407
+ {
408
+ "epoch": 11.0,
409
+ "eval_f1": 0.8123190611646329,
410
+ "eval_loss": 1.3591104745864868,
411
+ "eval_runtime": 1.57,
412
+ "eval_samples_per_second": 369.434,
413
+ "eval_steps_per_second": 1.274,
414
+ "step": 2563
415
+ },
416
+ {
417
+ "epoch": 11.16,
418
+ "learning_rate": 1.395644571610237e-05,
419
+ "loss": 0.0142,
420
+ "step": 2600
421
+ },
422
+ {
423
+ "epoch": 11.37,
424
+ "learning_rate": 1.3797488475600066e-05,
425
+ "loss": 0.0136,
426
+ "step": 2650
427
+ },
428
+ {
429
+ "epoch": 11.59,
430
+ "learning_rate": 1.363853123509776e-05,
431
+ "loss": 0.0126,
432
+ "step": 2700
433
+ },
434
+ {
435
+ "epoch": 11.8,
436
+ "learning_rate": 1.3479573994595455e-05,
437
+ "loss": 0.0263,
438
+ "step": 2750
439
+ },
440
+ {
441
+ "epoch": 12.0,
442
+ "eval_f1": 0.8100291935535177,
443
+ "eval_loss": 1.5316766500473022,
444
+ "eval_runtime": 1.5184,
445
+ "eval_samples_per_second": 381.982,
446
+ "eval_steps_per_second": 1.317,
447
+ "step": 2796
448
+ },
449
+ {
450
+ "epoch": 12.02,
451
+ "learning_rate": 1.332061675409315e-05,
452
+ "loss": 0.011,
453
+ "step": 2800
454
+ },
455
+ {
456
+ "epoch": 12.23,
457
+ "learning_rate": 1.3161659513590846e-05,
458
+ "loss": 0.0002,
459
+ "step": 2850
460
+ },
461
+ {
462
+ "epoch": 12.45,
463
+ "learning_rate": 1.300270227308854e-05,
464
+ "loss": 0.0057,
465
+ "step": 2900
466
+ },
467
+ {
468
+ "epoch": 12.66,
469
+ "learning_rate": 1.2843745032586235e-05,
470
+ "loss": 0.0016,
471
+ "step": 2950
472
+ },
473
+ {
474
+ "epoch": 12.88,
475
+ "learning_rate": 1.268478779208393e-05,
476
+ "loss": 0.0144,
477
+ "step": 3000
478
+ },
479
+ {
480
+ "epoch": 13.0,
481
+ "eval_f1": 0.7959241618420011,
482
+ "eval_loss": 1.5725551843643188,
483
+ "eval_runtime": 1.4849,
484
+ "eval_samples_per_second": 390.601,
485
+ "eval_steps_per_second": 1.347,
486
+ "step": 3029
487
+ },
488
+ {
489
+ "epoch": 13.09,
490
+ "learning_rate": 1.2525830551581626e-05,
491
+ "loss": 0.006,
492
+ "step": 3050
493
+ },
494
+ {
495
+ "epoch": 13.3,
496
+ "learning_rate": 1.236687331107932e-05,
497
+ "loss": 0.0056,
498
+ "step": 3100
499
+ },
500
+ {
501
+ "epoch": 13.52,
502
+ "learning_rate": 1.2207916070577015e-05,
503
+ "loss": 0.0114,
504
+ "step": 3150
505
+ },
506
+ {
507
+ "epoch": 13.73,
508
+ "learning_rate": 1.204895883007471e-05,
509
+ "loss": 0.021,
510
+ "step": 3200
511
+ },
512
+ {
513
+ "epoch": 13.95,
514
+ "learning_rate": 1.1890001589572406e-05,
515
+ "loss": 0.0436,
516
+ "step": 3250
517
+ },
518
+ {
519
+ "epoch": 14.0,
520
+ "eval_f1": 0.7987626313618129,
521
+ "eval_loss": 1.6159876585006714,
522
+ "eval_runtime": 1.4555,
523
+ "eval_samples_per_second": 398.497,
524
+ "eval_steps_per_second": 1.374,
525
+ "step": 3262
526
+ },
527
+ {
528
+ "epoch": 14.16,
529
+ "learning_rate": 1.1731044349070103e-05,
530
+ "loss": 0.0002,
531
+ "step": 3300
532
+ },
533
+ {
534
+ "epoch": 14.38,
535
+ "learning_rate": 1.1572087108567796e-05,
536
+ "loss": 0.0062,
537
+ "step": 3350
538
+ },
539
+ {
540
+ "epoch": 14.59,
541
+ "learning_rate": 1.141312986806549e-05,
542
+ "loss": 0.0056,
543
+ "step": 3400
544
+ },
545
+ {
546
+ "epoch": 14.81,
547
+ "learning_rate": 1.1254172627563187e-05,
548
+ "loss": 0.0048,
549
+ "step": 3450
550
+ },
551
+ {
552
+ "epoch": 15.0,
553
+ "eval_f1": 0.7957479636902922,
554
+ "eval_loss": 1.6826026439666748,
555
+ "eval_runtime": 1.4617,
556
+ "eval_samples_per_second": 396.789,
557
+ "eval_steps_per_second": 1.368,
558
+ "step": 3495
559
+ },
560
+ {
561
+ "epoch": 15.02,
562
+ "learning_rate": 1.1095215387060883e-05,
563
+ "loss": 0.0039,
564
+ "step": 3500
565
+ },
566
+ {
567
+ "epoch": 15.24,
568
+ "learning_rate": 1.0936258146558576e-05,
569
+ "loss": 0.0001,
570
+ "step": 3550
571
+ },
572
+ {
573
+ "epoch": 15.45,
574
+ "learning_rate": 1.0777300906056272e-05,
575
+ "loss": 0.0236,
576
+ "step": 3600
577
+ },
578
+ {
579
+ "epoch": 15.67,
580
+ "learning_rate": 1.0618343665553967e-05,
581
+ "loss": 0.0004,
582
+ "step": 3650
583
+ },
584
+ {
585
+ "epoch": 15.88,
586
+ "learning_rate": 1.0459386425051663e-05,
587
+ "loss": 0.0001,
588
+ "step": 3700
589
+ },
590
+ {
591
+ "epoch": 16.0,
592
+ "eval_f1": 0.7956639409293647,
593
+ "eval_loss": 1.6912556886672974,
594
+ "eval_runtime": 1.4563,
595
+ "eval_samples_per_second": 398.278,
596
+ "eval_steps_per_second": 1.373,
597
+ "step": 3728
598
+ },
599
+ {
600
+ "epoch": 16.09,
601
+ "learning_rate": 1.0300429184549356e-05,
602
+ "loss": 0.0002,
603
+ "step": 3750
604
+ },
605
+ {
606
+ "epoch": 16.31,
607
+ "learning_rate": 1.0141471944047053e-05,
608
+ "loss": 0.0002,
609
+ "step": 3800
610
+ },
611
+ {
612
+ "epoch": 16.52,
613
+ "learning_rate": 9.982514703544747e-06,
614
+ "loss": 0.0006,
615
+ "step": 3850
616
+ },
617
+ {
618
+ "epoch": 16.74,
619
+ "learning_rate": 9.823557463042442e-06,
620
+ "loss": 0.0002,
621
+ "step": 3900
622
+ },
623
+ {
624
+ "epoch": 16.95,
625
+ "learning_rate": 9.664600222540137e-06,
626
+ "loss": 0.0001,
627
+ "step": 3950
628
+ },
629
+ {
630
+ "epoch": 17.0,
631
+ "eval_f1": 0.7994751240525658,
632
+ "eval_loss": 1.7075979709625244,
633
+ "eval_runtime": 1.4886,
634
+ "eval_samples_per_second": 389.634,
635
+ "eval_steps_per_second": 1.344,
636
+ "step": 3961
637
+ },
638
+ {
639
+ "epoch": 17.17,
640
+ "learning_rate": 9.505642982037833e-06,
641
+ "loss": 0.0002,
642
+ "step": 4000
643
+ },
644
+ {
645
+ "epoch": 17.38,
646
+ "learning_rate": 9.346685741535528e-06,
647
+ "loss": 0.0185,
648
+ "step": 4050
649
+ },
650
+ {
651
+ "epoch": 17.6,
652
+ "learning_rate": 9.187728501033222e-06,
653
+ "loss": 0.0001,
654
+ "step": 4100
655
+ },
656
+ {
657
+ "epoch": 17.81,
658
+ "learning_rate": 9.028771260530917e-06,
659
+ "loss": 0.0034,
660
+ "step": 4150
661
+ },
662
+ {
663
+ "epoch": 18.0,
664
+ "eval_f1": 0.7960354805040918,
665
+ "eval_loss": 1.8018221855163574,
666
+ "eval_runtime": 1.5408,
667
+ "eval_samples_per_second": 376.422,
668
+ "eval_steps_per_second": 1.298,
669
+ "step": 4194
670
+ },
671
+ {
672
+ "epoch": 18.03,
673
+ "learning_rate": 8.869814020028613e-06,
674
+ "loss": 0.013,
675
+ "step": 4200
676
+ },
677
+ {
678
+ "epoch": 18.24,
679
+ "learning_rate": 8.710856779526308e-06,
680
+ "loss": 0.0003,
681
+ "step": 4250
682
+ },
683
+ {
684
+ "epoch": 18.45,
685
+ "learning_rate": 8.551899539024003e-06,
686
+ "loss": 0.0001,
687
+ "step": 4300
688
+ },
689
+ {
690
+ "epoch": 18.67,
691
+ "learning_rate": 8.392942298521697e-06,
692
+ "loss": 0.0002,
693
+ "step": 4350
694
+ },
695
+ {
696
+ "epoch": 18.88,
697
+ "learning_rate": 8.233985058019394e-06,
698
+ "loss": 0.0228,
699
+ "step": 4400
700
+ },
701
+ {
702
+ "epoch": 19.0,
703
+ "eval_f1": 0.7915974698658704,
704
+ "eval_loss": 1.7456856966018677,
705
+ "eval_runtime": 1.4762,
706
+ "eval_samples_per_second": 392.912,
707
+ "eval_steps_per_second": 1.355,
708
+ "step": 4427
709
+ },
710
+ {
711
+ "epoch": 19.1,
712
+ "learning_rate": 8.075027817517088e-06,
713
+ "loss": 0.0006,
714
+ "step": 4450
715
+ },
716
+ {
717
+ "epoch": 19.31,
718
+ "learning_rate": 7.916070577014783e-06,
719
+ "loss": 0.0037,
720
+ "step": 4500
721
+ },
722
+ {
723
+ "epoch": 19.53,
724
+ "learning_rate": 7.757113336512478e-06,
725
+ "loss": 0.0314,
726
+ "step": 4550
727
+ },
728
+ {
729
+ "epoch": 19.74,
730
+ "learning_rate": 7.598156096010174e-06,
731
+ "loss": 0.0028,
732
+ "step": 4600
733
+ },
734
+ {
735
+ "epoch": 19.96,
736
+ "learning_rate": 7.439198855507869e-06,
737
+ "loss": 0.0083,
738
+ "step": 4650
739
+ },
740
+ {
741
+ "epoch": 20.0,
742
+ "eval_f1": 0.7868576028090374,
743
+ "eval_loss": 1.9279075860977173,
744
+ "eval_runtime": 1.4679,
745
+ "eval_samples_per_second": 395.119,
746
+ "eval_steps_per_second": 1.362,
747
+ "step": 4660
748
+ },
749
+ {
750
+ "epoch": 20.17,
751
+ "learning_rate": 7.280241615005564e-06,
752
+ "loss": 0.0009,
753
+ "step": 4700
754
+ },
755
+ {
756
+ "epoch": 20.39,
757
+ "learning_rate": 7.121284374503259e-06,
758
+ "loss": 0.0002,
759
+ "step": 4750
760
+ },
761
+ {
762
+ "epoch": 20.6,
763
+ "learning_rate": 6.962327134000954e-06,
764
+ "loss": 0.0082,
765
+ "step": 4800
766
+ },
767
+ {
768
+ "epoch": 20.82,
769
+ "learning_rate": 6.803369893498649e-06,
770
+ "loss": 0.0001,
771
+ "step": 4850
772
+ },
773
+ {
774
+ "epoch": 21.0,
775
+ "eval_f1": 0.7915377946685866,
776
+ "eval_loss": 1.8367053270339966,
777
+ "eval_runtime": 2.0999,
778
+ "eval_samples_per_second": 276.201,
779
+ "eval_steps_per_second": 0.952,
780
+ "step": 4893
781
+ },
782
+ {
783
+ "epoch": 21.03,
784
+ "learning_rate": 6.6444126529963445e-06,
785
+ "loss": 0.0001,
786
+ "step": 4900
787
+ },
788
+ {
789
+ "epoch": 21.24,
790
+ "learning_rate": 6.485455412494039e-06,
791
+ "loss": 0.0072,
792
+ "step": 4950
793
+ },
794
+ {
795
+ "epoch": 21.46,
796
+ "learning_rate": 6.326498171991735e-06,
797
+ "loss": 0.0,
798
+ "step": 5000
799
+ },
800
+ {
801
+ "epoch": 21.67,
802
+ "learning_rate": 6.167540931489429e-06,
803
+ "loss": 0.0,
804
+ "step": 5050
805
+ },
806
+ {
807
+ "epoch": 21.89,
808
+ "learning_rate": 6.008583690987126e-06,
809
+ "loss": 0.0003,
810
+ "step": 5100
811
+ },
812
+ {
813
+ "epoch": 22.0,
814
+ "eval_f1": 0.7842117575951872,
815
+ "eval_loss": 1.8620420694351196,
816
+ "eval_runtime": 1.8603,
817
+ "eval_samples_per_second": 311.785,
818
+ "eval_steps_per_second": 1.075,
819
+ "step": 5126
820
+ },
821
+ {
822
+ "epoch": 22.1,
823
+ "learning_rate": 5.8496264504848195e-06,
824
+ "loss": 0.0007,
825
+ "step": 5150
826
+ },
827
+ {
828
+ "epoch": 22.32,
829
+ "learning_rate": 5.690669209982516e-06,
830
+ "loss": 0.0,
831
+ "step": 5200
832
+ },
833
+ {
834
+ "epoch": 22.53,
835
+ "learning_rate": 5.5317119694802105e-06,
836
+ "loss": 0.0021,
837
+ "step": 5250
838
+ },
839
+ {
840
+ "epoch": 22.75,
841
+ "learning_rate": 5.372754728977906e-06,
842
+ "loss": 0.0077,
843
+ "step": 5300
844
+ },
845
+ {
846
+ "epoch": 22.96,
847
+ "learning_rate": 5.213797488475601e-06,
848
+ "loss": 0.0002,
849
+ "step": 5350
850
+ },
851
+ {
852
+ "epoch": 23.0,
853
+ "eval_f1": 0.7828476594276503,
854
+ "eval_loss": 1.919188141822815,
855
+ "eval_runtime": 1.4859,
856
+ "eval_samples_per_second": 390.344,
857
+ "eval_steps_per_second": 1.346,
858
+ "step": 5359
859
+ },
860
+ {
861
+ "epoch": 23.18,
862
+ "learning_rate": 5.054840247973296e-06,
863
+ "loss": 0.0194,
864
+ "step": 5400
865
+ },
866
+ {
867
+ "epoch": 23.39,
868
+ "learning_rate": 4.895883007470991e-06,
869
+ "loss": 0.0132,
870
+ "step": 5450
871
+ },
872
+ {
873
+ "epoch": 23.61,
874
+ "learning_rate": 4.7369257669686855e-06,
875
+ "loss": 0.0001,
876
+ "step": 5500
877
+ },
878
+ {
879
+ "epoch": 23.82,
880
+ "learning_rate": 4.577968526466381e-06,
881
+ "loss": 0.0,
882
+ "step": 5550
883
+ },
884
+ {
885
+ "epoch": 24.0,
886
+ "eval_f1": 0.7927310235612234,
887
+ "eval_loss": 1.9081404209136963,
888
+ "eval_runtime": 1.4831,
889
+ "eval_samples_per_second": 391.082,
890
+ "eval_steps_per_second": 1.349,
891
+ "step": 5592
892
+ },
893
+ {
894
+ "epoch": 24.03,
895
+ "learning_rate": 4.419011285964076e-06,
896
+ "loss": 0.0,
897
+ "step": 5600
898
+ },
899
+ {
900
+ "epoch": 24.25,
901
+ "learning_rate": 4.260054045461771e-06,
902
+ "loss": 0.0001,
903
+ "step": 5650
904
+ },
905
+ {
906
+ "epoch": 24.46,
907
+ "learning_rate": 4.101096804959467e-06,
908
+ "loss": 0.0122,
909
+ "step": 5700
910
+ },
911
+ {
912
+ "epoch": 24.68,
913
+ "learning_rate": 3.942139564457161e-06,
914
+ "loss": 0.0,
915
+ "step": 5750
916
+ },
917
+ {
918
+ "epoch": 24.89,
919
+ "learning_rate": 3.7831823239548564e-06,
920
+ "loss": 0.0003,
921
+ "step": 5800
922
+ },
923
+ {
924
+ "epoch": 25.0,
925
+ "eval_f1": 0.7812550199347442,
926
+ "eval_loss": 1.9822450876235962,
927
+ "eval_runtime": 1.5174,
928
+ "eval_samples_per_second": 382.228,
929
+ "eval_steps_per_second": 1.318,
930
+ "step": 5825
931
+ },
932
+ {
933
+ "epoch": 25.11,
934
+ "learning_rate": 3.6242250834525515e-06,
935
+ "loss": 0.0116,
936
+ "step": 5850
937
+ },
938
+ {
939
+ "epoch": 25.32,
940
+ "learning_rate": 3.4652678429502466e-06,
941
+ "loss": 0.0,
942
+ "step": 5900
943
+ },
944
+ {
945
+ "epoch": 25.54,
946
+ "learning_rate": 3.306310602447942e-06,
947
+ "loss": 0.0004,
948
+ "step": 5950
949
+ },
950
+ {
951
+ "epoch": 25.75,
952
+ "learning_rate": 3.147353361945637e-06,
953
+ "loss": 0.0027,
954
+ "step": 6000
955
+ },
956
+ {
957
+ "epoch": 25.97,
958
+ "learning_rate": 2.9883961214433322e-06,
959
+ "loss": 0.0059,
960
+ "step": 6050
961
+ },
962
+ {
963
+ "epoch": 26.0,
964
+ "eval_f1": 0.7953953204096383,
965
+ "eval_loss": 1.8736791610717773,
966
+ "eval_runtime": 1.4646,
967
+ "eval_samples_per_second": 396.003,
968
+ "eval_steps_per_second": 1.366,
969
+ "step": 6058
970
+ },
971
+ {
972
+ "epoch": 26.18,
973
+ "learning_rate": 2.8294388809410273e-06,
974
+ "loss": 0.0001,
975
+ "step": 6100
976
+ },
977
+ {
978
+ "epoch": 26.39,
979
+ "learning_rate": 2.6704816404387224e-06,
980
+ "loss": 0.0,
981
+ "step": 6150
982
+ },
983
+ {
984
+ "epoch": 26.61,
985
+ "learning_rate": 2.5115243999364175e-06,
986
+ "loss": 0.0,
987
+ "step": 6200
988
+ },
989
+ {
990
+ "epoch": 26.82,
991
+ "learning_rate": 2.3525671594341126e-06,
992
+ "loss": 0.0,
993
+ "step": 6250
994
+ },
995
+ {
996
+ "epoch": 27.0,
997
+ "eval_f1": 0.7929217495075929,
998
+ "eval_loss": 1.879309892654419,
999
+ "eval_runtime": 1.9514,
1000
+ "eval_samples_per_second": 297.228,
1001
+ "eval_steps_per_second": 1.025,
1002
+ "step": 6291
1003
+ },
1004
+ {
1005
+ "epoch": 27.04,
1006
+ "learning_rate": 2.1936099189318076e-06,
1007
+ "loss": 0.0,
1008
+ "step": 6300
1009
+ },
1010
+ {
1011
+ "epoch": 27.25,
1012
+ "learning_rate": 2.0346526784295027e-06,
1013
+ "loss": 0.0,
1014
+ "step": 6350
1015
+ },
1016
+ {
1017
+ "epoch": 27.47,
1018
+ "learning_rate": 1.8756954379271978e-06,
1019
+ "loss": 0.0111,
1020
+ "step": 6400
1021
+ },
1022
+ {
1023
+ "epoch": 27.68,
1024
+ "learning_rate": 1.7167381974248929e-06,
1025
+ "loss": 0.0,
1026
+ "step": 6450
1027
+ },
1028
+ {
1029
+ "epoch": 27.9,
1030
+ "learning_rate": 1.557780956922588e-06,
1031
+ "loss": 0.0,
1032
+ "step": 6500
1033
+ },
1034
+ {
1035
+ "epoch": 28.0,
1036
+ "eval_f1": 0.794029634093503,
1037
+ "eval_loss": 1.8904625177383423,
1038
+ "eval_runtime": 2.2478,
1039
+ "eval_samples_per_second": 258.035,
1040
+ "eval_steps_per_second": 0.89,
1041
+ "step": 6524
1042
+ },
1043
+ {
1044
+ "epoch": 28.11,
1045
+ "learning_rate": 1.398823716420283e-06,
1046
+ "loss": 0.0,
1047
+ "step": 6550
1048
+ },
1049
+ {
1050
+ "epoch": 28.33,
1051
+ "learning_rate": 1.2398664759179781e-06,
1052
+ "loss": 0.0,
1053
+ "step": 6600
1054
+ },
1055
+ {
1056
+ "epoch": 28.54,
1057
+ "learning_rate": 1.0809092354156734e-06,
1058
+ "loss": 0.0,
1059
+ "step": 6650
1060
+ },
1061
+ {
1062
+ "epoch": 28.76,
1063
+ "learning_rate": 9.219519949133683e-07,
1064
+ "loss": 0.0,
1065
+ "step": 6700
1066
+ },
1067
+ {
1068
+ "epoch": 28.97,
1069
+ "learning_rate": 7.629947544110635e-07,
1070
+ "loss": 0.0,
1071
+ "step": 6750
1072
+ },
1073
+ {
1074
+ "epoch": 29.0,
1075
+ "eval_f1": 0.794029634093503,
1076
+ "eval_loss": 1.8970826864242554,
1077
+ "eval_runtime": 1.9492,
1078
+ "eval_samples_per_second": 297.551,
1079
+ "eval_steps_per_second": 1.026,
1080
+ "step": 6757
1081
+ },
1082
+ {
1083
+ "epoch": 29.18,
1084
+ "learning_rate": 6.040375139087585e-07,
1085
+ "loss": 0.0,
1086
+ "step": 6800
1087
+ },
1088
+ {
1089
+ "epoch": 29.4,
1090
+ "learning_rate": 4.450802734064537e-07,
1091
+ "loss": 0.0,
1092
+ "step": 6850
1093
+ },
1094
+ {
1095
+ "epoch": 29.61,
1096
+ "learning_rate": 2.861230329041488e-07,
1097
+ "loss": 0.0,
1098
+ "step": 6900
1099
+ },
1100
+ {
1101
+ "epoch": 29.83,
1102
+ "learning_rate": 1.2716579240184392e-07,
1103
+ "loss": 0.0002,
1104
+ "step": 6950
1105
+ },
1106
+ {
1107
+ "epoch": 30.0,
1108
+ "eval_f1": 0.7954091951908298,
1109
+ "eval_loss": 1.9001948833465576,
1110
+ "eval_runtime": 1.8428,
1111
+ "eval_samples_per_second": 314.746,
1112
+ "eval_steps_per_second": 1.085,
1113
+ "step": 6990
1114
+ },
1115
+ {
1116
+ "epoch": 30.0,
1117
+ "step": 6990,
1118
+ "total_flos": 5566168764425088.0,
1119
+ "train_loss": 0.16078996370909257,
1120
+ "train_runtime": 2045.6711,
1121
+ "train_samples_per_second": 54.496,
1122
+ "train_steps_per_second": 3.417
1123
+ }
1124
+ ],
1125
+ "logging_steps": 50,
1126
+ "max_steps": 6990,
1127
+ "num_train_epochs": 30,
1128
+ "save_steps": 500,
1129
+ "total_flos": 5566168764425088.0,
1130
+ "trial_name": null,
1131
+ "trial_params": null
1132
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f7d007681c7fac4efe338744ae0ead6b261215bad92b4852a05a8e9b609f753
3
+ size 4536
vocab.json ADDED
The diff for this file is too large to render. See raw diff