Tristan committed on
Commit
711c575
•
1 Parent(s): d043716

Training in progress, epoch 0

Browse files
config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "EleutherAI/pythia-70m",
3
+ "architectures": [
4
+ "GPTNeoXForCausalLM"
5
+ ],
6
+ "attention_bias": true,
7
+ "attention_dropout": 0.0,
8
+ "bos_token_id": 0,
9
+ "classifier_dropout": 0.1,
10
+ "eos_token_id": 0,
11
+ "hidden_act": "gelu",
12
+ "hidden_dropout": 0.0,
13
+ "hidden_size": 512,
14
+ "initializer_range": 0.02,
15
+ "intermediate_size": 2048,
16
+ "layer_norm_eps": 1e-05,
17
+ "max_position_embeddings": 2048,
18
+ "model_type": "gpt_neox",
19
+ "num_attention_heads": 8,
20
+ "num_hidden_layers": 6,
21
+ "rope_scaling": null,
22
+ "rotary_emb_base": 10000,
23
+ "rotary_pct": 0.25,
24
+ "tie_word_embeddings": false,
25
+ "torch_dtype": "float32",
26
+ "transformers_version": "4.40.2",
27
+ "use_cache": true,
28
+ "use_parallel_residual": true,
29
+ "vocab_size": 50304
30
+ }
eval_job_output.txt ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ slurm submission log: 2024-05-25 22:01:15.907170
2
+ created following sbatch script:
3
+
4
+ ###############################
5
+
6
+ #!/bin/bash
7
+
8
+ #SBATCH --account=nlp
9
+ #SBATCH --cpus-per-task=16
10
+ #SBATCH --dependency=afterok:7651386
11
+ #SBATCH --gres=gpu:1
12
+ #SBATCH --job-name=tthrush-job-3177077
13
+ #SBATCH --mem=60G
14
+ #SBATCH --nodelist=sphinx1
15
+ #SBATCH --open-mode=append
16
+ #SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/llms/pythia-70m_xnli_es_1/eval_job_output.txt
17
+ #SBATCH --partition=sphinx
18
+ #SBATCH --time=14-0
19
+
20
+ # activate your desired anaconda environment
21
+ . /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection
22
+
23
+ # cd to working directory
24
+ cd .
25
+
26
+ # launch commands
27
+ srun --unbuffered run_as_child_processes 'lm_eval --model hf --model_args pretrained=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/llms/pythia-70m_xnli_es_1,revision=main,dtype=float16,trust_remote_code=True --tasks piqa,arc_easy,xnli_en,xnli_fr,xnli_de,xnli_es,sciq,lambada --device cuda --output_path /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/llms/pythia-70m_xnli_es_1/perf'
28
+
29
+ ###############################
30
+
31
+ submission to slurm complete!
32
+
33
+
34
+ ###############################
35
+ slurm submission output
36
+
37
+ Submitted batch job 7651387
38
+
39
+
40
+
41
+ ###############################
42
+
43
+ slurm submission log: 2024-05-25 22:02:25.155540
44
+ created following sbatch script:
45
+
46
+ ###############################
47
+
48
+ #!/bin/bash
49
+
50
+ #SBATCH --account=nlp
51
+ #SBATCH --cpus-per-task=16
52
+ #SBATCH --dependency=afterok:7651417
53
+ #SBATCH --gres=gpu:1
54
+ #SBATCH --job-name=tthrush-job-3723367
55
+ #SBATCH --mem=60G
56
+ #SBATCH --nodelist=sphinx1
57
+ #SBATCH --open-mode=append
58
+ #SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/llms/pythia-70m_xnli_es_1/eval_job_output.txt
59
+ #SBATCH --partition=sphinx
60
+ #SBATCH --time=14-0
61
+
62
+ # activate your desired anaconda environment
63
+ . /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection
64
+
65
+ # cd to working directory
66
+ cd .
67
+
68
+ # launch commands
69
+ srun --unbuffered run_as_child_processes 'lm_eval --model hf --model_args pretrained=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/llms/pythia-70m_xnli_es_1,revision=main,dtype=float16,trust_remote_code=True --tasks piqa,arc_easy,xnli_en,xnli_fr,xnli_de,xnli_es,sciq,lambada --device cuda --output_path /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/llms/pythia-70m_xnli_es_1/perf'
70
+
71
+ ###############################
72
+
73
+ submission to slurm complete!
74
+
75
+
76
+ ###############################
77
+ slurm submission output
78
+
79
+ Submitted batch job 7651418
80
+
81
+
82
+
83
+ ###############################
84
+
85
+ slurm submission log: 2024-05-25 22:12:50.394186
86
+ created following sbatch script:
87
+
88
+ ###############################
89
+
90
+ #!/bin/bash
91
+
92
+ #SBATCH --account=nlp
93
+ #SBATCH --cpus-per-task=16
94
+ #SBATCH --dependency=afterok:7651458
95
+ #SBATCH --gres=gpu:1
96
+ #SBATCH --job-name=tthrush-job-3832230
97
+ #SBATCH --mem=60G
98
+ #SBATCH --nodelist=sphinx1
99
+ #SBATCH --open-mode=append
100
+ #SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/llms/pythia-70m_xnli_es_1/eval_job_output.txt
101
+ #SBATCH --partition=sphinx
102
+ #SBATCH --time=14-0
103
+
104
+ # activate your desired anaconda environment
105
+ . /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection
106
+
107
+ # cd to working directory
108
+ cd .
109
+
110
+ # launch commands
111
+ srun --unbuffered run_as_child_processes 'lm_eval --model hf --model_args pretrained=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/llms/pythia-70m_xnli_es_1,revision=main,dtype=float16,trust_remote_code=True --tasks piqa,arc_easy,xnli_en,xnli_fr,xnli_de,xnli_es,sciq,lambada --device cuda --output_path /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/llms/pythia-70m_xnli_es_1/perf'
112
+
113
+ ###############################
114
+
115
+ submission to slurm complete!
116
+
117
+
118
+ ###############################
119
+ slurm submission output
120
+
121
+ Submitted batch job 7651459
122
+
123
+
124
+
125
+ ###############################
126
+
127
+ slurm submission log: 2024-05-25 22:15:55.997250
128
+ created following sbatch script:
129
+
130
+ ###############################
131
+
132
+ #!/bin/bash
133
+
134
+ #SBATCH --account=nlp
135
+ #SBATCH --cpus-per-task=16
136
+ #SBATCH --dependency=afterok:7651486
137
+ #SBATCH --gres=gpu:1
138
+ #SBATCH --job-name=tthrush-job-2732481
139
+ #SBATCH --mem=60G
140
+ #SBATCH --nodelist=sphinx1
141
+ #SBATCH --open-mode=append
142
+ #SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/llms/pythia-70m_xnli_es_1/eval_job_output.txt
143
+ #SBATCH --partition=sphinx
144
+ #SBATCH --time=14-0
145
+
146
+ # activate your desired anaconda environment
147
+ . /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection
148
+
149
+ # cd to working directory
150
+ cd .
151
+
152
+ # launch commands
153
+ srun --unbuffered run_as_child_processes 'lm_eval --model hf --model_args pretrained=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/llms/pythia-70m_xnli_es_1,revision=main,dtype=float16,trust_remote_code=True --tasks piqa,arc_easy,xnli_en,xnli_fr,xnli_de,xnli_es,sciq,lambada --device cuda --output_path /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/llms/pythia-70m_xnli_es_1/perf'
154
+
155
+ ###############################
156
+
157
+ submission to slurm complete!
158
+
159
+
160
+ ###############################
161
+ slurm submission output
162
+
163
+ Submitted batch job 7651487
164
+
165
+
166
+
167
+ ###############################
168
+
169
+ slurm submission log: 2024-05-25 22:18:14.995526
170
+ created following sbatch script:
171
+
172
+ ###############################
173
+
174
+ #!/bin/bash
175
+
176
+ #SBATCH --account=nlp
177
+ #SBATCH --cpus-per-task=16
178
+ #SBATCH --dependency=afterok:7651516
179
+ #SBATCH --gres=gpu:1
180
+ #SBATCH --job-name=tthrush-job-3308982
181
+ #SBATCH --mem=60G
182
+ #SBATCH --nodelist=sphinx1
183
+ #SBATCH --open-mode=append
184
+ #SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/llms/pythia-70m_xnli_es_1/eval_job_output.txt
185
+ #SBATCH --partition=sphinx
186
+ #SBATCH --time=14-0
187
+
188
+ # activate your desired anaconda environment
189
+ . /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection
190
+
191
+ # cd to working directory
192
+ cd .
193
+
194
+ # launch commands
195
+ srun --unbuffered run_as_child_processes 'lm_eval --model hf --model_args pretrained=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/llms/pythia-70m_xnli_es_1,revision=main,dtype=float16,trust_remote_code=True --tasks piqa,arc_easy,xnli_en,xnli_fr,xnli_de,xnli_es,sciq,lambada --device cuda --output_path /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/llms/pythia-70m_xnli_es_1/perf'
196
+
197
+ ###############################
198
+
199
+ submission to slurm complete!
200
+
201
+
202
+ ###############################
203
+ slurm submission output
204
+
205
+ Submitted batch job 7651517
206
+
207
+
208
+
209
+ ###############################
210
+
logs/events.out.tfevents.1716738249.sphinx2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11e15a6f6500f0062c4964dc38d6af748a94f95f7df5aeabc9657e2199256da5
3
+ size 11781
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae13490cbf98ca8da07c69d1d78ddec321a71452b204a7e4e6c442f454038800
3
+ size 281715176
train_job_output.txt ADDED
@@ -0,0 +1,287 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0
  0%| | 0/11704 [00:00<?, ?it/s][rank1]:[W reducer.cpp:1360] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
 
 
1
  0%| | 1/11704 [00:11<36:01:50, 11.08s/it]
2
  0%| | 2/11704 [00:15<22:29:09, 6.92s/it]
3
  0%| | 3/11704 [00:18<17:13:49, 5.30s/it]
4
  0%| | 4/11704 [00:21<13:50:30, 4.26s/it]
5
  0%| | 5/11704 [00:22<10:32:47, 3.25s/it]
6
  0%| | 6/11704 [00:24<9:19:43, 2.87s/it]
7
  0%| | 7/11704 [00:26<7:44:55, 2.38s/it]
8
  0%| | 8/11704 [00:27<6:35:35, 2.03s/it]
9
  0%| | 9/11704 [00:28<5:32:02, 1.70s/it]
10
  0%| | 10/11704 [00:29<5:12:33, 1.60s/it]
11
  0%| | 11/11704 [00:30<4:43:07, 1.45s/it]
12
  0%| | 12/11704 [00:31<4:09:08, 1.28s/it]
13
  0%| | 13/11704 [00:32<3:36:18, 1.11s/it]
14
  0%| | 14/11704 [00:33<3:17:27, 1.01s/it]
15
  0%| | 15/11704 [00:34<3:11:14, 1.02it/s]
16
  0%| | 16/11704 [00:34<3:01:30, 1.07it/s]
17
  0%| | 17/11704 [00:35<2:48:52, 1.15it/s]
18
  0%| | 18/11704 [00:36<2:33:34, 1.27it/s]
19
  0%| | 19/11704 [00:36<2:26:24, 1.33it/s]
20
  0%| | 20/11704 [00:37<2:19:52, 1.39it/s]
21
  0%| | 21/11704 [00:38<2:16:43, 1.42it/s]
22
  0%| | 22/11704 [00:38<2:16:47, 1.42it/s]
23
  0%| | 23/11704 [00:39<2:13:38, 1.46it/s]
24
  0%| | 24/11704 [00:40<2:20:53, 1.38it/s]
25
  0%| | 25/11704 [00:41<2:25:03, 1.34it/s]{'loss': 10.6948, 'grad_norm': 1.2803484201431274, 'learning_rate': 2.134927412467976e-05, 'epoch': 0.03}
26
 
 
27
  0%| | 25/11704 [00:41<2:25:03, 1.34it/s]
28
  0%| | 26/11704 [00:41<2:14:46, 1.44it/s]
29
  0%| | 27/11704 [00:42<2:08:18, 1.52it/s]
30
  0%| | 28/11704 [00:42<2:06:17, 1.54it/s]
31
  0%| | 29/11704 [00:43<2:03:23, 1.58it/s]
32
  0%| | 30/11704 [00:44<1:58:17, 1.64it/s]
33
  0%| | 31/11704 [00:44<1:53:07, 1.72it/s]
34
  0%| | 32/11704 [00:45<1:50:58, 1.75it/s]
35
  0%| | 33/11704 [00:45<1:50:04, 1.77it/s]
36
  0%| | 34/11704 [00:46<1:51:19, 1.75it/s]
37
  0%| | 35/11704 [00:46<1:47:30, 1.81it/s]
38
  0%| | 36/11704 [00:47<1:47:57, 1.80it/s]
39
  0%| | 37/11704 [00:47<1:46:41, 1.82it/s]
40
  0%| | 38/11704 [00:48<1:45:56, 1.84it/s]
41
  0%| | 39/11704 [00:49<1:47:28, 1.81it/s]
42
  0%| | 40/11704 [00:49<1:46:05, 1.83it/s]
43
  0%| | 41/11704 [00:50<1:45:06, 1.85it/s]
44
  0%| | 42/11704 [00:50<1:45:14, 1.85it/s]
45
  0%| | 43/11704 [00:51<1:46:16, 1.83it/s]
46
  0%| | 44/11704 [00:51<1:44:48, 1.85it/s]
47
  0%| | 45/11704 [00:52<1:42:40, 1.89it/s]
48
  0%| | 46/11704 [00:52<1:43:30, 1.88it/s]
49
  0%| | 47/11704 [00:53<1:43:45, 1.87it/s]
50
  0%| | 48/11704 [00:53<1:42:51, 1.89it/s]
51
  0%| | 49/11704 [00:54<1:42:14, 1.90it/s]
52
  0%| | 50/11704 [00:54<1:41:41, 1.91it/s]{'loss': 9.9789, 'grad_norm': 1.1821939945220947, 'learning_rate': 4.269854824935952e-05, 'epoch': 0.06}
 
53
 
54
  0%| | 50/11704 [00:54<1:41:41, 1.91it/s]
55
  0%| | 51/11704 [00:55<1:41:01, 1.92it/s]
56
  0%| | 52/11704 [00:55<1:40:15, 1.94it/s]
57
  0%| | 53/11704 [00:56<1:39:39, 1.95it/s]
58
  0%| | 54/11704 [00:56<1:40:20, 1.94it/s]
59
  0%| | 55/11704 [00:57<1:39:05, 1.96it/s]
60
  0%| | 56/11704 [00:57<1:39:10, 1.96it/s]
61
  0%| | 57/11704 [00:58<1:39:27, 1.95it/s]
62
  0%| | 58/11704 [00:58<1:39:40, 1.95it/s]
63
  1%| | 59/11704 [00:59<1:52:14, 1.73it/s]
64
  1%| | 60/11704 [01:00<1:48:03, 1.80it/s]
65
  1%| | 61/11704 [01:00<1:45:25, 1.84it/s]
66
  1%| | 62/11704 [01:01<1:43:05, 1.88it/s]
67
  1%| | 63/11704 [01:01<1:41:38, 1.91it/s]
68
  1%| | 64/11704 [01:02<1:40:08, 1.94it/s]
69
  1%| | 65/11704 [01:02<1:39:38, 1.95it/s]
70
  1%| | 66/11704 [01:03<1:38:25, 1.97it/s]
71
  1%| | 67/11704 [01:03<1:38:36, 1.97it/s]
72
  1%| | 68/11704 [01:04<1:38:36, 1.97it/s]
73
  1%| | 69/11704 [01:04<1:37:32, 1.99it/s]
74
  1%| | 70/11704 [01:05<1:37:04, 2.00it/s]
75
  1%| | 71/11704 [01:05<1:36:37, 2.01it/s]
76
  1%| | 72/11704 [01:06<1:36:46, 2.00it/s]
77
  1%| | 73/11704 [01:06<1:36:54, 2.00it/s]
78
  1%| | 74/11704 [01:07<1:37:03, 2.00it/s]
79
  1%| | 75/11704 [01:07<1:36:35, 2.01it/s]{'loss': 9.2071, 'grad_norm': 1.037238597869873, 'learning_rate': 6.404782237403927e-05, 'epoch': 0.09}
80
 
 
81
  1%| | 75/11704 [01:07<1:36:35, 2.01it/s]
82
  1%| | 76/11704 [01:08<1:36:49, 2.00it/s]
83
  1%| | 77/11704 [01:08<1:36:51, 2.00it/s]
84
  1%| | 78/11704 [01:09<1:36:22, 2.01it/s]
85
  1%| | 79/11704 [01:09<1:36:16, 2.01it/s]
86
  1%| | 80/11704 [01:10<1:36:33, 2.01it/s]
87
  1%| | 81/11704 [01:10<1:36:15, 2.01it/s]
88
  1%| | 82/11704 [01:11<1:36:11, 2.01it/s]
89
  1%| | 83/11704 [01:11<1:36:08, 2.01it/s]
90
  1%| | 84/11704 [01:12<1:35:57, 2.02it/s]
91
  1%| | 85/11704 [01:12<1:36:27, 2.01it/s]
92
  1%| | 86/11704 [01:13<1:36:25, 2.01it/s]
93
  1%| | 87/11704 [01:13<1:36:11, 2.01it/s]
94
  1%| | 88/11704 [01:14<1:36:08, 2.01it/s]
95
  1%| | 89/11704 [01:14<1:37:01, 2.00it/s]
96
  1%| | 90/11704 [01:15<1:39:29, 1.95it/s]
97
  1%| | 91/11704 [01:15<1:38:15, 1.97it/s]
98
  1%| | 92/11704 [01:16<1:38:03, 1.97it/s]
99
  1%| | 93/11704 [01:16<1:37:15, 1.99it/s]
100
  1%| | 94/11704 [01:17<1:36:57, 2.00it/s]
101
  1%| | 95/11704 [01:17<1:36:48, 2.00it/s]
102
  1%| | 96/11704 [01:18<1:36:31, 2.00it/s]
103
  1%| | 97/11704 [01:18<1:36:15, 2.01it/s]
104
  1%| | 98/11704 [01:19<1:36:02, 2.01it/s]
105
  1%| | 99/11704 [01:19<1:36:18, 2.01it/s]
106
  1%| | 100/11704 [01:20<1:36:11, 2.01it/s]{'loss': 8.3139, 'grad_norm': 0.7935464978218079, 'learning_rate': 8.539709649871905e-05, 'epoch': 0.12}
 
107
 
108
  1%| | 100/11704 [01:20<1:36:11, 2.01it/s]
109
  1%| | 101/11704 [01:20<1:36:06, 2.01it/s]
110
  1%| | 102/11704 [01:21<1:35:47, 2.02it/s]
111
  1%| | 103/11704 [01:21<1:36:08, 2.01it/s]
112
  1%| | 104/11704 [01:22<1:35:49, 2.02it/s]
113
  1%| | 105/11704 [01:22<1:35:43, 2.02it/s]
114
  1%| | 106/11704 [01:23<1:35:41, 2.02it/s]
115
  1%| | 107/11704 [01:23<1:35:33, 2.02it/s]
116
  1%| | 108/11704 [01:24<1:35:22, 2.03it/s]
117
  1%| | 109/11704 [01:24<1:35:39, 2.02it/s]
118
  1%| | 110/11704 [01:25<1:35:33, 2.02it/s]
119
  1%| | 111/11704 [01:25<1:35:49, 2.02it/s]
120
  1%| | 112/11704 [01:26<1:35:39, 2.02it/s]
121
  1%| | 113/11704 [01:26<1:35:34, 2.02it/s]
122
  1%| | 114/11704 [01:27<1:35:24, 2.02it/s]
123
  1%| | 115/11704 [01:27<1:35:20, 2.03it/s]
124
  1%| | 116/11704 [01:28<1:35:31, 2.02it/s]
125
  1%| | 117/11704 [01:28<1:35:27, 2.02it/s]
126
  1%| | 118/11704 [01:29<1:35:24, 2.02it/s]
127
  1%| | 119/11704 [01:29<1:35:14, 2.03it/s]
128
  1%| | 120/11704 [01:30<1:35:08, 2.03it/s]
129
  1%| | 121/11704 [01:30<1:35:10, 2.03it/s]
130
  1%| | 122/11704 [01:31<1:35:03, 2.03it/s]
131
  1%| | 123/11704 [01:31<1:34:57, 2.03it/s]
132
  1%| | 124/11704 [01:32<1:35:01, 2.03it/s]
133
  1%| | 125/11704 [01:32<1:35:08, 2.03it/s]{'loss': 7.526, 'grad_norm': 0.5917059183120728, 'learning_rate': 0.0001067463706233988, 'epoch': 0.15}
 
134
 
135
  1%| | 125/11704 [01:32<1:35:08, 2.03it/s]
136
  1%| | 126/11704 [01:33<1:35:09, 2.03it/s]
137
  1%| | 127/11704 [01:33<1:35:04, 2.03it/s]
138
  1%| | 128/11704 [01:34<1:34:59, 2.03it/s]
139
  1%| | 129/11704 [01:34<1:34:59, 2.03it/s]
140
  1%| | 130/11704 [01:35<1:35:09, 2.03it/s]
141
  1%| | 131/11704 [01:35<1:35:07, 2.03it/s]
142
  1%| | 132/11704 [01:35<1:35:02, 2.03it/s]
143
  1%| | 133/11704 [01:36<1:34:58, 2.03it/s]
144
  1%| | 134/11704 [01:36<1:34:58, 2.03it/s]
145
  1%| | 135/11704 [01:37<1:34:56, 2.03it/s]
146
  1%| | 136/11704 [01:37<1:35:01, 2.03it/s]
147
  1%| | 137/11704 [01:38<1:34:56, 2.03it/s]
148
  1%| | 138/11704 [01:38<1:35:30, 2.02it/s]
149
  1%| | 139/11704 [01:39<1:35:41, 2.01it/s]
150
  1%| | 140/11704 [01:39<1:35:29, 2.02it/s]
151
  1%| | 141/11704 [01:40<1:35:14, 2.02it/s]
152
  1%| | 142/11704 [01:40<1:35:11, 2.02it/s]
153
  1%| | 143/11704 [01:41<1:35:00, 2.03it/s]
154
  1%| | 144/11704 [01:41<1:34:55, 2.03it/s]
155
  1%| | 145/11704 [01:42<1:34:52, 2.03it/s]
156
  1%| | 146/11704 [01:42<1:34:48, 2.03it/s]
157
  1%|▏ | 147/11704 [01:43<1:34:51, 2.03it/s]
158
  1%|▏ | 148/11704 [01:43<1:34:45, 2.03it/s]
159
  1%|▏ | 149/11704 [01:44<1:34:43, 2.03it/s]
160
  1%|▏ | 150/11704 [01:44<1:34:48, 2.03it/s]{'loss': 6.8951, 'grad_norm': 0.37877357006073, 'learning_rate': 0.00012809564474807855, 'epoch': 0.18}
 
161
 
162
  1%|▏ | 150/11704 [01:44<1:34:48, 2.03it/s]
163
  1%|▏ | 151/11704 [01:45<1:34:56, 2.03it/s]
164
  1%|▏ | 152/11704 [01:45<1:34:58, 2.03it/s]
165
  1%|▏ | 153/11704 [01:46<1:34:45, 2.03it/s]
166
  1%|▏ | 154/11704 [01:46<1:34:38, 2.03it/s]
167
  1%|▏ | 155/11704 [01:47<1:34:42, 2.03it/s]
168
  1%|▏ | 156/11704 [01:47<1:34:49, 2.03it/s]
169
  1%|▏ | 157/11704 [01:48<1:34:49, 2.03it/s]
170
  1%|▏ | 158/11704 [01:48<1:34:47, 2.03it/s]
171
  1%|▏ | 159/11704 [01:49<1:34:42, 2.03it/s]
172
  1%|▏ | 160/11704 [01:49<1:34:40, 2.03it/s]
173
  1%|▏ | 161/11704 [01:50<1:34:38, 2.03it/s]
174
  1%|▏ | 162/11704 [01:50<1:34:39, 2.03it/s]
175
  1%|▏ | 163/11704 [01:51<1:34:47, 2.03it/s]
176
  1%|▏ | 164/11704 [01:51<1:34:47, 2.03it/s]
177
  1%|▏ | 165/11704 [01:52<1:34:54, 2.03it/s]
178
  1%|▏ | 166/11704 [01:52<1:34:47, 2.03it/s]
179
  1%|▏ | 167/11704 [01:53<1:34:47, 2.03it/s]
180
  1%|▏ | 168/11704 [01:53<1:34:38, 2.03it/s]
181
  1%|▏ | 169/11704 [01:54<1:34:42, 2.03it/s]
182
  1%|▏ | 170/11704 [01:54<1:34:41, 2.03it/s]
183
  1%|▏ | 171/11704 [01:55<1:34:32, 2.03it/s]
184
  1%|▏ | 172/11704 [01:55<1:34:38, 2.03it/s]
185
  1%|▏ | 173/11704 [01:56<1:34:32, 2.03it/s]
186
  1%|▏ | 174/11704 [01:56<1:34:27, 2.03it/s]
187
  1%|▏ | 175/11704 [01:57<1:34:29, 2.03it/s]{'loss': 6.3957, 'grad_norm': 0.3709251582622528, 'learning_rate': 0.00014944491887275833, 'epoch': 0.21}
 
188
 
189
  1%|▏ | 175/11704 [01:57<1:34:29, 2.03it/s]
190
  2%|▏ | 176/11704 [01:57<1:34:26, 2.03it/s]
191
  2%|▏ | 177/11704 [01:58<1:34:35, 2.03it/s]
192
  2%|▏ | 178/11704 [01:58<1:34:30, 2.03it/s]
193
  2%|▏ | 179/11704 [01:59<1:34:27, 2.03it/s]
194
  2%|▏ | 180/11704 [01:59<1:34:35, 2.03it/s]
195
  2%|▏ | 181/11704 [02:00<1:34:38, 2.03it/s]
196
  2%|▏ | 182/11704 [02:00<1:34:39, 2.03it/s]
197
  2%|▏ | 183/11704 [02:01<1:34:32, 2.03it/s]
198
  2%|▏ | 184/11704 [02:01<1:34:33, 2.03it/s]
199
  2%|▏ | 185/11704 [02:02<1:34:35, 2.03it/s]
200
  2%|▏ | 186/11704 [02:02<1:34:32, 2.03it/s]
201
  2%|▏ | 187/11704 [02:03<1:34:35, 2.03it/s]
202
  2%|▏ | 188/11704 [02:03<1:34:30, 2.03it/s]
203
  2%|▏ | 189/11704 [02:04<1:34:32, 2.03it/s]
204
  2%|▏ | 190/11704 [02:04<1:34:33, 2.03it/s]
205
  2%|▏ | 191/11704 [02:05<1:34:28, 2.03it/s]
206
  2%|▏ | 192/11704 [02:05<1:34:33, 2.03it/s]
207
  2%|▏ | 193/11704 [02:06<1:34:30, 2.03it/s]
208
  2%|▏ | 194/11704 [02:06<1:34:35, 2.03it/s]
209
  2%|▏ | 195/11704 [02:07<1:34:27, 2.03it/s]
210
  2%|▏ | 196/11704 [02:07<1:34:29, 2.03it/s]
211
  2%|▏ | 197/11704 [02:08<1:34:33, 2.03it/s]
212
  2%|▏ | 198/11704 [02:08<1:34:28, 2.03it/s]
213
  2%|▏ | 199/11704 [02:09<1:34:28, 2.03it/s]
214
  2%|▏ | 200/11704 [02:09<1:34:30, 2.03it/s]{'loss': 5.9826, 'grad_norm': 0.6036785840988159, 'learning_rate': 0.0001707941929974381, 'epoch': 0.24}
 
215
 
216
  2%|▏ | 200/11704 [02:09<1:34:30, 2.03it/s]
217
  2%|▏ | 201/11704 [02:09<1:34:24, 2.03it/s]
218
  2%|▏ | 202/11704 [02:10<1:34:31, 2.03it/s]
219
  2%|▏ | 203/11704 [02:10<1:34:23, 2.03it/s]
220
  2%|▏ | 204/11704 [02:11<1:34:21, 2.03it/s]
221
  2%|▏ | 205/11704 [02:11<1:34:27, 2.03it/s]
222
  2%|▏ | 206/11704 [02:12<1:34:31, 2.03it/s]
223
  2%|▏ | 207/11704 [02:12<1:34:36, 2.03it/s]
224
  2%|▏ | 208/11704 [02:13<1:34:28, 2.03it/s]
225
  2%|▏ | 209/11704 [02:13<1:34:31, 2.03it/s]
226
  2%|▏ | 210/11704 [02:14<1:34:28, 2.03it/s]
227
  2%|▏ | 211/11704 [02:14<1:34:26, 2.03it/s]
228
  2%|▏ | 212/11704 [02:15<1:34:28, 2.03it/s]
229
  2%|▏ | 213/11704 [02:15<1:34:23, 2.03it/s]
230
  2%|▏ | 214/11704 [02:16<1:34:28, 2.03it/s]
231
  2%|▏ | 215/11704 [02:16<1:34:30, 2.03it/s]
232
  2%|▏ | 216/11704 [02:17<1:34:29, 2.03it/s]
233
  2%|▏ | 217/11704 [02:17<1:34:34, 2.02it/s]
234
  2%|▏ | 218/11704 [02:18<1:34:38, 2.02it/s]
235
  2%|▏ | 219/11704 [02:18<1:34:37, 2.02it/s]
236
  2%|▏ | 220/11704 [02:19<1:34:45, 2.02it/s]
237
  2%|▏ | 221/11704 [02:19<1:34:34, 2.02it/s]
238
  2%|▏ | 222/11704 [02:20<1:34:43, 2.02it/s]
239
  2%|▏ | 223/11704 [02:20<1:34:35, 2.02it/s]
240
  2%|▏ | 224/11704 [02:21<1:34:33, 2.02it/s]
241
  2%|▏ | 225/11704 [02:21<1:34:27, 2.03it/s]{'loss': 5.659, 'grad_norm': 0.9925475716590881, 'learning_rate': 0.00019214346712211785, 'epoch': 0.27}
242
 
 
243
  2%|▏ | 225/11704 [02:21<1:34:27, 2.03it/s]
244
  2%|▏ | 226/11704 [02:22<1:34:29, 2.02it/s]
245
  2%|▏ | 227/11704 [02:22<1:34:27, 2.03it/s]
246
  2%|▏ | 228/11704 [02:23<1:34:41, 2.02it/s]
247
  2%|▏ | 229/11704 [02:23<1:34:33, 2.02it/s]
248
  2%|▏ | 230/11704 [02:24<1:34:35, 2.02it/s]
249
  2%|▏ | 231/11704 [02:24<1:34:27, 2.02it/s]
250
  2%|▏ | 232/11704 [02:25<1:34:25, 2.03it/s]
251
  2%|▏ | 233/11704 [02:25<1:34:22, 2.03it/s]
252
  2%|▏ | 234/11704 [02:26<1:34:20, 2.03it/s]
253
  2%|▏ | 235/11704 [02:26<1:34:18, 2.03it/s]
254
  2%|▏ | 236/11704 [02:27<1:34:21, 2.03it/s]
255
  2%|▏ | 237/11704 [02:27<1:34:25, 2.02it/s]
256
  2%|▏ | 238/11704 [02:28<1:34:20, 2.03it/s]
257
  2%|▏ | 239/11704 [02:28<1:34:29, 2.02it/s]
258
  2%|▏ | 240/11704 [02:29<1:34:21, 2.02it/s]
259
  2%|▏ | 241/11704 [02:29<1:34:26, 2.02it/s]
260
  2%|▏ | 242/11704 [02:30<1:34:19, 2.03it/s]
261
  2%|▏ | 243/11704 [02:30<1:34:16, 2.03it/s]
262
  2%|▏ | 244/11704 [02:31<1:34:14, 2.03it/s]
263
  2%|▏ | 245/11704 [02:31<1:34:16, 2.03it/s]
264
  2%|▏ | 246/11704 [02:32<1:34:15, 2.03it/s]
265
  2%|▏ | 247/11704 [02:32<1:34:18, 2.02it/s]
266
  2%|▏ | 248/11704 [02:33<1:34:35, 2.02it/s]
267
  2%|▏ | 249/11704 [02:33<1:34:34, 2.02it/s]
268
  2%|▏ | 250/11704 [02:34<1:34:36, 2.02it/s]{'loss': 5.4253, 'grad_norm': 0.6297341585159302, 'learning_rate': 0.0002134927412467976, 'epoch': 0.3}
 
269
 
270
  2%|▏ | 250/11704 [02:34<1:34:36, 2.02it/s]
271
  2%|▏ | 251/11704 [02:34<1:34:48, 2.01it/s]
272
  2%|▏ | 252/11704 [02:35<1:34:35, 2.02it/s]
273
  2%|▏ | 253/11704 [02:35<1:34:32, 2.02it/s]
274
  2%|▏ | 254/11704 [02:36<1:34:26, 2.02it/s]
275
  2%|▏ | 255/11704 [02:36<1:34:20, 2.02it/s]
276
  2%|▏ | 256/11704 [02:37<1:34:34, 2.02it/s]
277
  2%|▏ | 257/11704 [02:37<1:34:28, 2.02it/s]
278
  2%|▏ | 258/11704 [02:38<1:34:19, 2.02it/s]
279
  2%|▏ | 259/11704 [02:38<1:34:10, 2.03it/s]
280
  2%|▏ | 260/11704 [02:39<1:34:10, 2.03it/s]
281
  2%|▏ | 261/11704 [02:39<1:34:06, 2.03it/s]
282
  2%|▏ | 262/11704 [02:40<1:33:56, 2.03it/s]
283
  2%|▏ | 263/11704 [02:40<1:33:59, 2.03it/s]
284
  2%|▏ | 264/11704 [02:41<1:33:55, 2.03it/s]
285
  2%|▏ | 265/11704 [02:41<1:34:01, 2.03it/s]
286
  2%|▏ | 266/11704 [02:42<1:34:04, 2.03it/s]
287
  2%|▏ | 267/11704 [02:42<1:33:59, 2.03it/s]
288
  2%|▏ | 268/11704 [02:43<1:34:06, 2.03it/s]
289
  2%|▏ | 269/11704 [02:43<1:34:01, 2.03it/s]
290
  2%|▏ | 270/11704 [02:44<1:34:04, 2.03it/s]
291
  2%|▏ | 271/11704 [02:44<1:33:59, 2.03it/s]
292
  2%|▏ | 272/11704 [02:45<1:33:57, 2.03it/s]
293
  2%|▏ | 273/11704 [02:45<1:33:59, 2.03it/s]
294
  2%|▏ | 274/11704 [02:46<1:33:57, 2.03it/s]
295
  2%|▏ | 275/11704 [02:46<1:34:06, 2.02it/s]{'loss': 5.2293, 'grad_norm': 0.5968512892723083, 'learning_rate': 0.00023484201537147736, 'epoch': 0.33}
296
 
 
297
  2%|▏ | 275/11704 [02:46<1:34:06, 2.02it/s]
298
  2%|▏ | 276/11704 [02:47<1:34:02, 2.03it/s]
299
  2%|▏ | 277/11704 [02:47<1:34:05, 2.02it/s]
300
  2%|▏ | 278/11704 [02:48<1:33:59, 2.03it/s]
301
  2%|▏ | 279/11704 [02:48<1:34:01, 2.02it/s]
302
  2%|▏ | 280/11704 [02:49<1:33:54, 2.03it/s]
303
  2%|▏ | 281/11704 [02:49<1:34:00, 2.03it/s]
304
  2%|▏ | 282/11704 [02:49<1:33:55, 2.03it/s]
305
  2%|▏ | 283/11704 [02:50<1:33:55, 2.03it/s]
306
  2%|▏ | 284/11704 [02:50<1:33:52, 2.03it/s]
307
  2%|▏ | 285/11704 [02:51<1:33:47, 2.03it/s]
308
  2%|▏ | 286/11704 [02:51<1:33:49, 2.03it/s]
309
  2%|▏ | 287/11704 [02:52<1:33:49, 2.03it/s]
310
  2%|▏ | 288/11704 [02:52<1:33:50, 2.03it/s]
311
  2%|▏ | 289/11704 [02:53<1:33:43, 2.03it/s]
312
  2%|▏ | 290/11704 [02:53<1:33:47, 2.03it/s]
313
  2%|▏ | 291/11704 [02:54<1:33:42, 2.03it/s]
314
  2%|▏ | 292/11704 [02:54<1:33:40, 2.03it/s]
315
  3%|β–Ž | 293/11704 [02:55<1:33:43, 2.03it/s]
316
  3%|β–Ž | 294/11704 [02:55<1:33:38, 2.03it/s]
317
  3%|β–Ž | 295/11704 [02:56<1:33:41, 2.03it/s]
318
  3%|β–Ž | 296/11704 [02:56<1:33:37, 2.03it/s]
319
  3%|β–Ž | 297/11704 [02:57<1:33:29, 2.03it/s]
320
  3%|β–Ž | 298/11704 [02:57<1:33:33, 2.03it/s]
321
  3%|β–Ž | 299/11704 [02:58<1:33:34, 2.03it/s]
322
  3%|β–Ž | 300/11704 [02:58<1:33:27, 2.03it/s]{'loss': 5.0541, 'grad_norm': 0.9146194458007812, 'learning_rate': 0.0002561912894961571, 'epoch': 0.36}
 
323
 
324
  3%|β–Ž | 300/11704 [02:58<1:33:27, 2.03it/s]
325
  3%|β–Ž | 301/11704 [02:59<1:33:30, 2.03it/s]
326
  3%|β–Ž | 302/11704 [02:59<1:33:27, 2.03it/s]
327
  3%|β–Ž | 303/11704 [03:00<1:33:27, 2.03it/s]
328
  3%|β–Ž | 304/11704 [03:00<1:33:31, 2.03it/s]
329
  3%|β–Ž | 305/11704 [03:01<1:33:25, 2.03it/s]
330
  3%|β–Ž | 306/11704 [03:01<1:33:30, 2.03it/s]
331
  3%|β–Ž | 307/11704 [03:02<1:33:24, 2.03it/s]
332
  3%|β–Ž | 308/11704 [03:02<1:33:26, 2.03it/s]
333
  3%|β–Ž | 309/11704 [03:03<1:33:31, 2.03it/s]
334
  3%|β–Ž | 310/11704 [03:03<1:33:21, 2.03it/s]
335
  3%|β–Ž | 311/11704 [03:04<1:33:24, 2.03it/s]
336
  3%|β–Ž | 312/11704 [03:04<1:33:26, 2.03it/s]
337
  3%|β–Ž | 313/11704 [03:05<1:33:25, 2.03it/s]
338
  3%|β–Ž | 314/11704 [03:05<1:33:30, 2.03it/s]
339
  3%|β–Ž | 315/11704 [03:06<1:33:22, 2.03it/s]
340
  3%|β–Ž | 316/11704 [03:06<1:33:25, 2.03it/s]
341
  3%|β–Ž | 317/11704 [03:07<1:33:24, 2.03it/s]
342
  3%|β–Ž | 318/11704 [03:07<1:33:20, 2.03it/s]
343
  3%|β–Ž | 319/11704 [03:08<1:33:22, 2.03it/s]
344
  3%|β–Ž | 320/11704 [03:08<1:33:22, 2.03it/s]
345
  3%|β–Ž | 321/11704 [03:09<1:33:16, 2.03it/s]
346
  3%|β–Ž | 322/11704 [03:09<1:33:24, 2.03it/s]
347
  3%|β–Ž | 323/11704 [03:10<1:33:18, 2.03it/s]
348
  3%|β–Ž | 324/11704 [03:10<1:33:15, 2.03it/s]
349
  3%|β–Ž | 325/11704 [03:11<1:33:18, 2.03it/s]{'loss': 4.934, 'grad_norm': 0.9477291703224182, 'learning_rate': 0.0002775405636208369, 'epoch': 0.39}
 
350
 
351
  3%|β–Ž | 325/11704 [03:11<1:33:18, 2.03it/s]
352
  3%|β–Ž | 326/11704 [03:11<1:33:15, 2.03it/s]
353
  3%|β–Ž | 327/11704 [03:12<1:33:14, 2.03it/s]
354
  3%|β–Ž | 328/11704 [03:12<1:33:18, 2.03it/s]
355
  3%|β–Ž | 329/11704 [03:13<1:33:10, 2.03it/s]
356
  3%|β–Ž | 330/11704 [03:13<1:33:10, 2.03it/s]
357
  3%|β–Ž | 331/11704 [03:14<1:33:11, 2.03it/s]
358
  3%|β–Ž | 332/11704 [03:14<1:33:11, 2.03it/s]
359
  3%|β–Ž | 333/11704 [03:15<1:33:12, 2.03it/s]
360
  3%|β–Ž | 334/11704 [03:15<1:33:15, 2.03it/s]
361
  3%|β–Ž | 335/11704 [03:16<1:33:09, 2.03it/s]
362
  3%|β–Ž | 336/11704 [03:16<1:33:09, 2.03it/s]
363
  3%|β–Ž | 337/11704 [03:17<1:33:16, 2.03it/s]
364
  3%|β–Ž | 338/11704 [03:17<1:33:49, 2.02it/s]
365
  3%|β–Ž | 339/11704 [03:18<1:33:39, 2.02it/s]
366
  3%|β–Ž | 340/11704 [03:18<1:33:32, 2.02it/s]
367
  3%|β–Ž | 341/11704 [03:19<1:33:29, 2.03it/s]
368
  3%|β–Ž | 342/11704 [03:19<1:33:20, 2.03it/s]
369
  3%|β–Ž | 343/11704 [03:20<1:33:18, 2.03it/s]
370
  3%|β–Ž | 344/11704 [03:20<1:33:18, 2.03it/s]
371
  3%|β–Ž | 345/11704 [03:21<1:33:12, 2.03it/s]
372
  3%|β–Ž | 346/11704 [03:21<1:33:12, 2.03it/s]
373
  3%|β–Ž | 347/11704 [03:21<1:33:08, 2.03it/s]
374
  3%|β–Ž | 348/11704 [03:22<1:33:03, 2.03it/s]
375
  3%|β–Ž | 349/11704 [03:22<1:33:10, 2.03it/s]
376
  3%|β–Ž | 350/11704 [03:23<1:33:03, 2.03it/s]{'loss': 4.8046, 'grad_norm': 0.824380099773407, 'learning_rate': 0.00029888983774551667, 'epoch': 0.42}
 
377
 
378
  3%|β–Ž | 350/11704 [03:23<1:33:03, 2.03it/s]
379
  3%|β–Ž | 351/11704 [03:23<1:33:04, 2.03it/s]
380
  3%|β–Ž | 352/11704 [03:24<1:33:07, 2.03it/s]
381
  3%|β–Ž | 353/11704 [03:24<1:33:07, 2.03it/s]
382
  3%|β–Ž | 354/11704 [03:25<1:33:00, 2.03it/s]
383
  3%|β–Ž | 355/11704 [03:25<1:33:01, 2.03it/s]
384
  3%|β–Ž | 356/11704 [03:26<1:32:58, 2.03it/s]
385
  3%|β–Ž | 357/11704 [03:26<1:33:00, 2.03it/s]
386
  3%|β–Ž | 358/11704 [03:27<1:32:59, 2.03it/s]
387
  3%|β–Ž | 359/11704 [03:27<1:32:59, 2.03it/s]
388
  3%|β–Ž | 360/11704 [03:28<1:32:57, 2.03it/s]
389
  3%|β–Ž | 361/11704 [03:28<1:32:57, 2.03it/s]
390
  3%|β–Ž | 362/11704 [03:29<1:32:54, 2.03it/s]
391
  3%|β–Ž | 363/11704 [03:29<1:32:51, 2.04it/s]
392
  3%|β–Ž | 364/11704 [03:30<1:32:54, 2.03it/s]
393
  3%|β–Ž | 365/11704 [03:30<1:32:55, 2.03it/s]
394
  3%|β–Ž | 366/11704 [03:31<1:32:52, 2.03it/s]
395
  3%|β–Ž | 367/11704 [03:31<1:33:03, 2.03it/s]
396
  3%|β–Ž | 368/11704 [03:32<1:33:03, 2.03it/s]
397
  3%|β–Ž | 369/11704 [03:32<1:33:10, 2.03it/s]
398
  3%|β–Ž | 370/11704 [03:33<1:33:01, 2.03it/s]
399
  3%|β–Ž | 371/11704 [03:33<1:32:54, 2.03it/s]
400
  3%|β–Ž | 372/11704 [03:34<1:32:58, 2.03it/s]
401
  3%|β–Ž | 373/11704 [03:34<1:32:51, 2.03it/s]
402
  3%|β–Ž | 374/11704 [03:35<1:32:56, 2.03it/s]
403
  3%|β–Ž | 375/11704 [03:35<1:33:04, 2.03it/s]{'loss': 4.6929, 'grad_norm': 0.7358281016349792, 'learning_rate': 0.0003202391118701964, 'epoch': 0.45}
 
404
 
405
  3%|β–Ž | 375/11704 [03:35<1:33:04, 2.03it/s]
406
  3%|β–Ž | 376/11704 [03:36<1:33:02, 2.03it/s]
407
  3%|β–Ž | 377/11704 [03:36<1:33:04, 2.03it/s]
408
  3%|β–Ž | 378/11704 [03:37<1:33:01, 2.03it/s]
409
  3%|β–Ž | 379/11704 [03:37<1:33:04, 2.03it/s]
410
  3%|β–Ž | 380/11704 [03:38<1:33:01, 2.03it/s]
411
  3%|β–Ž | 381/11704 [03:38<1:32:53, 2.03it/s]
412
  3%|β–Ž | 382/11704 [03:39<1:33:07, 2.03it/s]
413
  3%|β–Ž | 383/11704 [03:39<1:33:02, 2.03it/s]
414
  3%|β–Ž | 384/11704 [03:40<1:33:01, 2.03it/s]
415
  3%|β–Ž | 385/11704 [03:40<1:32:59, 2.03it/s]
416
  3%|β–Ž | 386/11704 [03:41<1:32:51, 2.03it/s]
417
  3%|β–Ž | 387/11704 [03:41<1:32:44, 2.03it/s]
418
  3%|β–Ž | 388/11704 [03:42<1:32:52, 2.03it/s]
419
  3%|β–Ž | 389/11704 [03:42<1:32:47, 2.03it/s]
420
  3%|β–Ž | 390/11704 [03:43<1:32:51, 2.03it/s]
421
  3%|β–Ž | 391/11704 [03:43<1:32:56, 2.03it/s]
422
  3%|β–Ž | 392/11704 [03:44<1:32:50, 2.03it/s]
423
  3%|β–Ž | 393/11704 [03:44<1:32:50, 2.03it/s]
424
  3%|β–Ž | 394/11704 [03:45<1:32:44, 2.03it/s]
425
  3%|β–Ž | 395/11704 [03:45<1:32:41, 2.03it/s]
426
  3%|β–Ž | 396/11704 [03:46<1:32:36, 2.04it/s]
427
  3%|β–Ž | 397/11704 [03:46<1:32:45, 2.03it/s]
428
  3%|β–Ž | 398/11704 [03:47<1:32:39, 2.03it/s]
429
  3%|β–Ž | 399/11704 [03:47<1:32:35, 2.03it/s]
430
  3%|β–Ž | 400/11704 [03:48<1:32:43, 2.03it/s]
431
 
 
432
  3%|β–Ž | 400/11704 [03:48<1:32:43, 2.03it/s]
433
  3%|β–Ž | 401/11704 [03:48<1:32:46, 2.03it/s]
434
  3%|β–Ž | 402/11704 [03:49<1:32:45, 2.03it/s]
435
  3%|β–Ž | 403/11704 [03:49<1:32:48, 2.03it/s]
436
  3%|β–Ž | 404/11704 [03:50<1:32:43, 2.03it/s]
437
  3%|β–Ž | 405/11704 [03:50<1:32:41, 2.03it/s]
438
  3%|β–Ž | 406/11704 [03:51<1:32:39, 2.03it/s]
439
  3%|β–Ž | 407/11704 [03:51<1:32:34, 2.03it/s]
440
  3%|β–Ž | 408/11704 [03:52<1:32:39, 2.03it/s]
441
  3%|β–Ž | 409/11704 [03:52<1:32:39, 2.03it/s]
442
  4%|β–Ž | 410/11704 [03:53<1:32:35, 2.03it/s]
443
  4%|β–Ž | 411/11704 [03:53<1:32:35, 2.03it/s]
444
  4%|β–Ž | 412/11704 [03:53<1:32:32, 2.03it/s]
445
  4%|β–Ž | 413/11704 [03:54<1:32:25, 2.04it/s]
446
  4%|β–Ž | 414/11704 [03:54<1:32:29, 2.03it/s]
447
  4%|β–Ž | 415/11704 [03:55<1:32:26, 2.04it/s]
448
  4%|β–Ž | 416/11704 [03:55<1:32:17, 2.04it/s]
449
  4%|β–Ž | 417/11704 [03:56<1:32:23, 2.04it/s]
450
  4%|β–Ž | 418/11704 [03:56<1:32:26, 2.03it/s]
451
  4%|β–Ž | 419/11704 [03:57<1:32:22, 2.04it/s]
452
  4%|β–Ž | 420/11704 [03:57<1:32:24, 2.04it/s]
453
  4%|β–Ž | 421/11704 [03:58<1:32:22, 2.04it/s]
454
  4%|β–Ž | 422/11704 [03:58<1:32:19, 2.04it/s]
455
  4%|β–Ž | 423/11704 [03:59<1:32:20, 2.04it/s]
456
  4%|β–Ž | 424/11704 [03:59<1:32:19, 2.04it/s]
457
  4%|β–Ž | 425/11704 [04:00<1:32:14, 2.04it/s]
458
 
 
459
  4%|β–Ž | 425/11704 [04:00<1:32:14, 2.04it/s]
460
  4%|β–Ž | 426/11704 [04:00<1:32:21, 2.04it/s]
461
  4%|β–Ž | 427/11704 [04:01<1:32:20, 2.04it/s]
462
  4%|β–Ž | 428/11704 [04:01<1:32:16, 2.04it/s]
463
  4%|β–Ž | 429/11704 [04:02<1:32:26, 2.03it/s]
464
  4%|β–Ž | 430/11704 [04:02<1:32:20, 2.03it/s]
465
  4%|β–Ž | 431/11704 [04:03<1:32:17, 2.04it/s]
466
  4%|β–Ž | 432/11704 [04:03<1:32:23, 2.03it/s]
467
  4%|β–Ž | 433/11704 [04:04<1:32:20, 2.03it/s]
468
  4%|β–Ž | 434/11704 [04:04<1:32:17, 2.04it/s]
469
  4%|β–Ž | 435/11704 [04:05<1:32:23, 2.03it/s]
470
  4%|β–Ž | 436/11704 [04:05<1:32:19, 2.03it/s]
471
  4%|β–Ž | 437/11704 [04:06<1:32:21, 2.03it/s]
472
  4%|β–Ž | 438/11704 [04:06<1:32:25, 2.03it/s]
473
  4%|▍ | 439/11704 [04:07<1:32:13, 2.04it/s]
474
  4%|▍ | 440/11704 [04:07<1:32:22, 2.03it/s]
475
  4%|▍ | 441/11704 [04:08<1:32:18, 2.03it/s]
476
  4%|▍ | 442/11704 [04:08<1:32:09, 2.04it/s]
477
  4%|▍ | 443/11704 [04:09<1:32:17, 2.03it/s]
478
  4%|▍ | 444/11704 [04:09<1:32:16, 2.03it/s]
479
  4%|▍ | 445/11704 [04:10<1:32:08, 2.04it/s]
480
  4%|▍ | 446/11704 [04:10<1:32:13, 2.03it/s]
481
  4%|▍ | 447/11704 [04:11<1:32:10, 2.04it/s]
482
  4%|▍ | 448/11704 [04:11<1:32:08, 2.04it/s]
483
  4%|▍ | 449/11704 [04:12<1:32:12, 2.03it/s]
484
  4%|▍ | 450/11704 [04:12<1:32:11, 2.03it/s]{'loss': 4.4307, 'grad_norm': 0.6776960492134094, 'learning_rate': 0.0003842869342442357, 'epoch': 0.54}
485
 
 
486
  4%|▍ | 450/11704 [04:12<1:32:11, 2.03it/s]
487
  4%|▍ | 451/11704 [04:13<1:32:13, 2.03it/s]
488
  4%|▍ | 452/11704 [04:13<1:32:17, 2.03it/s]
489
  4%|▍ | 453/11704 [04:14<1:32:13, 2.03it/s]
490
  4%|▍ | 454/11704 [04:14<1:32:16, 2.03it/s]
491
  4%|▍ | 455/11704 [04:15<1:32:11, 2.03it/s]
492
  4%|▍ | 456/11704 [04:15<1:32:06, 2.04it/s]
493
  4%|▍ | 457/11704 [04:16<1:32:11, 2.03it/s]
494
  4%|▍ | 458/11704 [04:16<1:32:09, 2.03it/s]
495
  4%|▍ | 459/11704 [04:17<1:32:04, 2.04it/s]
496
  4%|▍ | 460/11704 [04:17<1:32:09, 2.03it/s]
497
  4%|▍ | 461/11704 [04:18<1:32:10, 2.03it/s]
498
  4%|▍ | 462/11704 [04:18<1:32:04, 2.03it/s]
499
  4%|▍ | 463/11704 [04:19<1:32:10, 2.03it/s]
500
  4%|▍ | 464/11704 [04:19<1:32:03, 2.04it/s]
501
  4%|▍ | 465/11704 [04:20<1:31:56, 2.04it/s]
502
  4%|▍ | 466/11704 [04:20<1:32:00, 2.04it/s]
503
  4%|▍ | 467/11704 [04:21<1:32:05, 2.03it/s]
504
  4%|▍ | 468/11704 [04:21<1:31:58, 2.04it/s]
505
  4%|▍ | 469/11704 [04:22<1:32:03, 2.03it/s]
506
  4%|▍ | 470/11704 [04:22<1:31:58, 2.04it/s]
507
  4%|▍ | 471/11704 [04:22<1:31:56, 2.04it/s]
508
  4%|▍ | 472/11704 [04:23<1:31:58, 2.04it/s]
509
  4%|▍ | 473/11704 [04:23<1:32:04, 2.03it/s]
510
  4%|▍ | 474/11704 [04:24<1:31:58, 2.04it/s]
511
  4%|▍ | 475/11704 [04:24<1:32:03, 2.03it/s]{'loss': 4.3608, 'grad_norm': 0.7181493043899536, 'learning_rate': 0.00040563620836891546, 'epoch': 0.57}
 
512
 
513
  4%|▍ | 475/11704 [04:24<1:32:03, 2.03it/s]
514
  4%|▍ | 476/11704 [04:25<1:32:02, 2.03it/s]
515
  4%|▍ | 477/11704 [04:25<1:32:02, 2.03it/s]
516
  4%|▍ | 478/11704 [04:26<1:32:07, 2.03it/s]
517
  4%|▍ | 479/11704 [04:26<1:31:59, 2.03it/s]
518
  4%|▍ | 480/11704 [04:27<1:32:01, 2.03it/s]
519
  4%|▍ | 481/11704 [04:27<1:32:03, 2.03it/s]
520
  4%|▍ | 482/11704 [04:28<1:31:58, 2.03it/s]
521
  4%|▍ | 483/11704 [04:28<1:32:07, 2.03it/s]
522
  4%|▍ | 484/11704 [04:29<1:32:02, 2.03it/s]
523
  4%|▍ | 485/11704 [04:29<1:31:57, 2.03it/s]
524
  4%|▍ | 486/11704 [04:30<1:31:53, 2.03it/s]
525
  4%|▍ | 487/11704 [04:30<1:31:57, 2.03it/s]
526
  4%|▍ | 488/11704 [04:31<1:31:56, 2.03it/s]
527
  4%|▍ | 489/11704 [04:31<1:31:56, 2.03it/s]
528
  4%|▍ | 490/11704 [04:32<1:31:57, 2.03it/s]
529
  4%|▍ | 491/11704 [04:32<1:31:54, 2.03it/s]
530
  4%|▍ | 492/11704 [04:33<1:31:57, 2.03it/s]
531
  4%|▍ | 493/11704 [04:33<1:31:54, 2.03it/s]
532
  4%|▍ | 494/11704 [04:34<1:31:55, 2.03it/s]
533
  4%|▍ | 495/11704 [04:34<1:31:55, 2.03it/s]
534
  4%|▍ | 496/11704 [04:35<1:31:53, 2.03it/s]
535
  4%|▍ | 497/11704 [04:35<1:31:52, 2.03it/s]
536
  4%|▍ | 498/11704 [04:36<1:31:53, 2.03it/s]
537
  4%|▍ | 499/11704 [04:36<1:31:47, 2.03it/s]
538
  4%|▍ | 500/11704 [04:37<1:31:42, 2.04it/s]{'loss': 4.2807, 'grad_norm': 0.6647967100143433, 'learning_rate': 0.0004269854824935952, 'epoch': 0.6}
 
539
 
540
  4%|▍ | 500/11704 [04:37<1:31:42, 2.04it/s]
541
  4%|▍ | 501/11704 [04:37<1:31:46, 2.03it/s]
542
  4%|▍ | 502/11704 [04:38<1:31:41, 2.04it/s]
543
  4%|▍ | 503/11704 [04:38<1:31:44, 2.03it/s]
544
  4%|▍ | 504/11704 [04:39<1:31:46, 2.03it/s]
545
  4%|▍ | 505/11704 [04:39<1:31:44, 2.03it/s]
546
  4%|▍ | 506/11704 [04:40<1:31:47, 2.03it/s]
547
  4%|▍ | 507/11704 [04:40<1:31:46, 2.03it/s]
548
  4%|▍ | 508/11704 [04:41<1:31:43, 2.03it/s]
549
  4%|▍ | 509/11704 [04:41<1:31:50, 2.03it/s]
550
  4%|▍ | 510/11704 [04:42<1:31:44, 2.03it/s]
551
  4%|▍ | 511/11704 [04:42<1:31:40, 2.04it/s]
552
  4%|▍ | 512/11704 [04:43<1:31:45, 2.03it/s]
553
  4%|▍ | 513/11704 [04:43<1:31:42, 2.03it/s]
554
  4%|▍ | 514/11704 [04:44<1:31:42, 2.03it/s]
555
  4%|▍ | 515/11704 [04:44<1:31:45, 2.03it/s]
556
  4%|▍ | 516/11704 [04:45<1:31:43, 2.03it/s]
557
  4%|▍ | 517/11704 [04:45<1:31:48, 2.03it/s]
558
  4%|▍ | 518/11704 [04:46<1:32:34, 2.01it/s]
559
  4%|▍ | 519/11704 [04:46<1:32:16, 2.02it/s]
560
  4%|▍ | 520/11704 [04:47<1:31:57, 2.03it/s]
561
  4%|▍ | 521/11704 [04:47<1:31:50, 2.03it/s]
562
  4%|▍ | 522/11704 [04:48<1:32:03, 2.02it/s]
563
  4%|▍ | 523/11704 [04:48<1:31:50, 2.03it/s]
564
  4%|▍ | 524/11704 [04:49<1:31:45, 2.03it/s]
565
  4%|▍ | 525/11704 [04:49<1:31:44, 2.03it/s]{'loss': 4.221, 'grad_norm': 0.5851206183433533, 'learning_rate': 0.00044833475661827497, 'epoch': 0.63}
 
566
 
567
  4%|▍ | 525/11704 [04:49<1:31:44, 2.03it/s]
568
  4%|▍ | 526/11704 [04:50<1:31:38, 2.03it/s]
569
  5%|▍ | 527/11704 [04:50<1:31:31, 2.04it/s]
570
  5%|▍ | 528/11704 [04:51<1:31:36, 2.03it/s]
571
  5%|▍ | 529/11704 [04:51<1:31:34, 2.03it/s]
572
  5%|▍ | 530/11704 [04:52<1:31:33, 2.03it/s]
573
  5%|▍ | 531/11704 [04:52<1:31:38, 2.03it/s]
574
  5%|▍ | 532/11704 [04:53<1:31:36, 2.03it/s]
575
  5%|▍ | 533/11704 [04:53<1:31:28, 2.04it/s]
576
  5%|▍ | 534/11704 [04:53<1:31:30, 2.03it/s]
577
  5%|▍ | 535/11704 [04:54<1:31:34, 2.03it/s]
578
  5%|▍ | 536/11704 [04:54<1:31:33, 2.03it/s]
579
  5%|▍ | 537/11704 [04:55<1:31:29, 2.03it/s]
580
  5%|▍ | 538/11704 [04:55<1:31:32, 2.03it/s]
581
  5%|▍ | 539/11704 [04:56<1:31:29, 2.03it/s]
582
  5%|▍ | 540/11704 [04:56<1:31:23, 2.04it/s]
583
  5%|▍ | 541/11704 [04:57<1:31:26, 2.03it/s]
584
  5%|▍ | 542/11704 [04:57<1:31:21, 2.04it/s]
585
  5%|▍ | 543/11704 [04:58<1:31:19, 2.04it/s]
586
  5%|▍ | 544/11704 [04:58<1:31:27, 2.03it/s]
587
  5%|▍ | 545/11704 [04:59<1:31:24, 2.03it/s]
588
  5%|▍ | 546/11704 [04:59<1:31:18, 2.04it/s]
589
  5%|▍ | 547/11704 [05:00<1:31:20, 2.04it/s]
590
  5%|▍ | 548/11704 [05:00<1:31:17, 2.04it/s]
591
  5%|▍ | 549/11704 [05:01<1:31:09, 2.04it/s]
592
  5%|▍ | 550/11704 [05:01<1:31:18, 2.04it/s]{'loss': 4.1699, 'grad_norm': 0.8667837381362915, 'learning_rate': 0.00046968403074295473, 'epoch': 0.66}
 
593
 
594
  5%|▍ | 550/11704 [05:01<1:31:18, 2.04it/s]
595
  5%|▍ | 551/11704 [05:02<1:31:19, 2.04it/s]
596
  5%|▍ | 552/11704 [05:02<1:31:17, 2.04it/s]
597
  5%|▍ | 553/11704 [05:03<1:31:25, 2.03it/s]
598
  5%|▍ | 554/11704 [05:03<1:31:22, 2.03it/s]
599
  5%|▍ | 555/11704 [05:04<1:31:22, 2.03it/s]
600
  5%|▍ | 556/11704 [05:04<1:31:25, 2.03it/s]
601
  5%|▍ | 557/11704 [05:05<1:31:21, 2.03it/s]
602
  5%|▍ | 558/11704 [05:05<1:31:20, 2.03it/s]
603
  5%|▍ | 559/11704 [05:06<1:31:21, 2.03it/s]
604
  5%|▍ | 560/11704 [05:06<1:31:22, 2.03it/s]
605
  5%|▍ | 561/11704 [05:07<1:31:22, 2.03it/s]
606
  5%|▍ | 562/11704 [05:07<1:31:17, 2.03it/s]
607
  5%|▍ | 563/11704 [05:08<1:31:13, 2.04it/s]
608
  5%|▍ | 564/11704 [05:08<1:31:18, 2.03it/s]
609
  5%|▍ | 565/11704 [05:09<1:31:13, 2.03it/s]
610
  5%|▍ | 566/11704 [05:09<1:31:07, 2.04it/s]
611
  5%|▍ | 567/11704 [05:10<1:31:11, 2.04it/s]
612
  5%|▍ | 568/11704 [05:10<1:31:09, 2.04it/s]
613
  5%|▍ | 569/11704 [05:11<1:31:03, 2.04it/s]
614
  5%|▍ | 570/11704 [05:11<1:31:11, 2.03it/s]
615
  5%|▍ | 571/11704 [05:12<1:31:07, 2.04it/s]
616
  5%|▍ | 572/11704 [05:12<1:31:04, 2.04it/s]
617
  5%|▍ | 573/11704 [05:13<1:31:09, 2.04it/s]
618
  5%|▍ | 574/11704 [05:13<1:31:05, 2.04it/s]
619
  5%|▍ | 575/11704 [05:14<1:31:05, 2.04it/s]{'loss': 4.1233, 'grad_norm': 0.720784604549408, 'learning_rate': 0.0004910333048676345, 'epoch': 0.69}
 
620
 
621
  5%|▍ | 575/11704 [05:14<1:31:05, 2.04it/s]
622
  5%|▍ | 576/11704 [05:14<1:31:14, 2.03it/s]
623
  5%|▍ | 577/11704 [05:15<1:31:06, 2.04it/s]
624
  5%|▍ | 578/11704 [05:15<1:31:06, 2.04it/s]
625
  5%|▍ | 579/11704 [05:16<1:31:10, 2.03it/s]
626
  5%|▍ | 580/11704 [05:16<1:31:03, 2.04it/s]
627
  5%|▍ | 581/11704 [05:17<1:31:00, 2.04it/s]
628
  5%|▍ | 582/11704 [05:17<1:31:08, 2.03it/s]
629
  5%|▍ | 583/11704 [05:18<1:31:06, 2.03it/s]
630
  5%|▍ | 584/11704 [05:18<1:31:09, 2.03it/s]
631
  5%|▍ | 585/11704 [05:19<1:31:07, 2.03it/s]
632
  5%|β–Œ | 586/11704 [05:19<1:31:03, 2.04it/s]
633
  5%|β–Œ | 587/11704 [05:20<1:31:06, 2.03it/s]
634
  5%|β–Œ | 588/11704 [05:20<1:31:03, 2.03it/s]
635
  5%|β–Œ | 589/11704 [05:21<1:30:56, 2.04it/s]
636
  5%|β–Œ | 590/11704 [05:21<1:31:02, 2.03it/s]
637
  5%|β–Œ | 591/11704 [05:21<1:31:02, 2.03it/s]
638
  5%|β–Œ | 592/11704 [05:22<1:30:59, 2.04it/s]
639
  5%|β–Œ | 593/11704 [05:22<1:31:05, 2.03it/s]
640
  5%|β–Œ | 594/11704 [05:23<1:31:00, 2.03it/s]
641
  5%|β–Œ | 595/11704 [05:23<1:30:57, 2.04it/s]
642
  5%|β–Œ | 596/11704 [05:24<1:30:54, 2.04it/s]
643
  5%|β–Œ | 597/11704 [05:24<1:30:55, 2.04it/s]
644
  5%|β–Œ | 598/11704 [05:25<1:30:59, 2.03it/s]
645
  5%|β–Œ | 599/11704 [05:25<1:31:07, 2.03it/s]
646
  5%|β–Œ | 600/11704 [05:26<1:31:00, 2.03it/s]{'loss': 4.0671, 'grad_norm': 0.5922385454177856, 'learning_rate': 0.0005123825789923142, 'epoch': 0.72}
 
647
 
648
  5%|β–Œ | 600/11704 [05:26<1:31:00, 2.03it/s]
649
  5%|β–Œ | 601/11704 [05:26<1:31:07, 2.03it/s]
650
  5%|β–Œ | 602/11704 [05:27<1:31:07, 2.03it/s]
651
  5%|β–Œ | 603/11704 [05:27<1:30:58, 2.03it/s]
652
  5%|β–Œ | 604/11704 [05:28<1:31:02, 2.03it/s]
653
  5%|β–Œ | 605/11704 [05:28<1:30:57, 2.03it/s]
654
  5%|β–Œ | 606/11704 [05:29<1:30:54, 2.03it/s]
655
  5%|β–Œ | 607/11704 [05:29<1:31:00, 2.03it/s]
656
  5%|β–Œ | 608/11704 [05:30<1:30:53, 2.03it/s]
657
  5%|β–Œ | 609/11704 [05:30<1:30:51, 2.04it/s]
658
  5%|β–Œ | 610/11704 [05:31<1:30:58, 2.03it/s]
659
  5%|β–Œ | 611/11704 [05:31<1:30:50, 2.04it/s]
660
  5%|β–Œ | 612/11704 [05:32<1:30:44, 2.04it/s]
661
  5%|β–Œ | 613/11704 [05:32<1:30:56, 2.03it/s]
662
  5%|β–Œ | 614/11704 [05:33<1:30:48, 2.04it/s]
663
  5%|β–Œ | 615/11704 [05:33<1:39:03, 1.87it/s]
664
  5%|β–Œ | 616/11704 [05:34<1:44:26, 1.77it/s]
665
  5%|β–Œ | 617/11704 [05:35<1:40:15, 1.84it/s]
666
  5%|β–Œ | 618/11704 [05:35<1:37:27, 1.90it/s]
667
  5%|β–Œ | 619/11704 [05:36<1:35:28, 1.93it/s]
668
  5%|β–Œ | 620/11704 [05:36<1:33:54, 1.97it/s]
669
  5%|β–Œ | 621/11704 [05:37<1:33:01, 1.99it/s]
670
  5%|β–Œ | 622/11704 [05:37<1:32:24, 2.00it/s]
671
  5%|β–Œ | 623/11704 [05:38<1:31:51, 2.01it/s]
672
  5%|β–Œ | 624/11704 [05:38<1:31:28, 2.02it/s]
673
  5%|β–Œ | 625/11704 [05:38<1:31:20, 2.02it/s]{'loss': 4.0331, 'grad_norm': 0.5989447236061096, 'learning_rate': 0.000533731853116994, 'epoch': 0.75}
 
674
 
675
  5%|β–Œ | 625/11704 [05:39<1:31:20, 2.02it/s]
676
  5%|β–Œ | 626/11704 [05:39<1:31:10, 2.03it/s]
677
  5%|β–Œ | 627/11704 [05:39<1:30:59, 2.03it/s]
678
  5%|β–Œ | 628/11704 [05:40<1:31:00, 2.03it/s]
679
  5%|β–Œ | 629/11704 [05:40<1:30:50, 2.03it/s]
680
  5%|β–Œ | 630/11704 [05:41<1:30:48, 2.03it/s]
681
  5%|β–Œ | 631/11704 [05:41<1:30:49, 2.03it/s]
682
  5%|β–Œ | 632/11704 [05:42<1:30:45, 2.03it/s]
683
  5%|β–Œ | 633/11704 [05:42<1:30:51, 2.03it/s]
684
  5%|β–Œ | 634/11704 [05:43<1:30:44, 2.03it/s]
685
  5%|β–Œ | 635/11704 [05:43<1:30:39, 2.04it/s]
686
  5%|β–Œ | 636/11704 [05:44<1:30:46, 2.03it/s]
687
  5%|β–Œ | 637/11704 [05:44<1:30:45, 2.03it/s]
688
  5%|β–Œ | 638/11704 [05:45<1:30:43, 2.03it/s]
689
  5%|β–Œ | 639/11704 [05:45<1:30:43, 2.03it/s]
690
  5%|β–Œ | 640/11704 [05:46<1:30:38, 2.03it/s]
691
  5%|β–Œ | 641/11704 [05:46<1:30:40, 2.03it/s]
692
  5%|β–Œ | 642/11704 [05:47<1:30:42, 2.03it/s]
693
  5%|β–Œ | 643/11704 [05:47<1:30:39, 2.03it/s]
694
  6%|β–Œ | 644/11704 [05:48<1:30:44, 2.03it/s]
695
  6%|β–Œ | 645/11704 [05:48<1:30:35, 2.03it/s]
696
  6%|β–Œ | 646/11704 [05:49<1:30:40, 2.03it/s]
697
  6%|β–Œ | 647/11704 [05:49<1:30:42, 2.03it/s]
698
  6%|β–Œ | 648/11704 [05:50<1:30:35, 2.03it/s]
699
  6%|β–Œ | 649/11704 [05:50<1:30:36, 2.03it/s]
700
  6%|β–Œ | 650/11704 [05:51<1:30:42, 2.03it/s]{'loss': 3.9926, 'grad_norm': 0.630885899066925, 'learning_rate': 0.0005550811272416738, 'epoch': 0.78}
 
701
 
702
  6%|β–Œ | 650/11704 [05:51<1:30:42, 2.03it/s]
703
  6%|β–Œ | 651/11704 [05:51<1:30:40, 2.03it/s]
704
  6%|β–Œ | 652/11704 [05:52<1:30:43, 2.03it/s]
705
  6%|β–Œ | 653/11704 [05:52<1:30:36, 2.03it/s]
706
  6%|β–Œ | 654/11704 [05:53<1:30:33, 2.03it/s]
707
  6%|β–Œ | 655/11704 [05:53<1:30:43, 2.03it/s]
708
  6%|β–Œ | 656/11704 [05:54<1:30:37, 2.03it/s]
709
  6%|β–Œ | 657/11704 [05:54<1:30:37, 2.03it/s]
710
  6%|β–Œ | 658/11704 [05:55<1:30:42, 2.03it/s]
711
  6%|β–Œ | 659/11704 [05:55<1:30:42, 2.03it/s]
712
  6%|β–Œ | 660/11704 [05:56<1:30:41, 2.03it/s]
713
  6%|β–Œ | 661/11704 [05:56<1:30:43, 2.03it/s]
714
  6%|β–Œ | 662/11704 [05:57<1:30:33, 2.03it/s]
715
  6%|β–Œ | 663/11704 [05:57<1:30:33, 2.03it/s]
716
  6%|β–Œ | 664/11704 [05:58<1:30:35, 2.03it/s]
717
  6%|β–Œ | 665/11704 [05:58<1:30:35, 2.03it/s]
718
  6%|β–Œ | 666/11704 [05:59<1:30:29, 2.03it/s]
719
  6%|β–Œ | 667/11704 [05:59<1:30:36, 2.03it/s]
720
  6%|β–Œ | 668/11704 [06:00<1:30:36, 2.03it/s]
721
  6%|β–Œ | 669/11704 [06:00<1:30:32, 2.03it/s]
722
  6%|β–Œ | 670/11704 [06:01<1:30:35, 2.03it/s]
723
  6%|β–Œ | 671/11704 [06:01<1:30:30, 2.03it/s]
724
  6%|β–Œ | 672/11704 [06:02<1:30:28, 2.03it/s]
725
  6%|β–Œ | 673/11704 [06:02<1:30:26, 2.03it/s]
726
  6%|β–Œ | 674/11704 [06:03<1:30:24, 2.03it/s]
727
  6%|β–Œ | 675/11704 [06:03<1:30:22, 2.03it/s]{'loss': 3.9532, 'grad_norm': 0.6419950723648071, 'learning_rate': 0.0005764304013663536, 'epoch': 0.81}
 
728
 
729
  6%|β–Œ | 675/11704 [06:03<1:30:22, 2.03it/s]
730
  6%|β–Œ | 676/11704 [06:04<1:30:31, 2.03it/s]
731
  6%|β–Œ | 677/11704 [06:04<1:30:28, 2.03it/s]
732
  6%|β–Œ | 678/11704 [06:05<1:30:33, 2.03it/s]
733
  6%|β–Œ | 679/11704 [06:05<1:30:34, 2.03it/s]
734
  6%|β–Œ | 680/11704 [06:06<1:30:24, 2.03it/s]
735
  6%|β–Œ | 681/11704 [06:06<1:30:30, 2.03it/s]
736
  6%|β–Œ | 682/11704 [06:07<1:30:29, 2.03it/s]
737
  6%|β–Œ | 683/11704 [06:07<1:30:31, 2.03it/s]
738
  6%|β–Œ | 684/11704 [06:08<1:30:31, 2.03it/s]
739
  6%|β–Œ | 685/11704 [06:08<1:30:25, 2.03it/s]
740
  6%|β–Œ | 686/11704 [06:09<1:30:28, 2.03it/s]
741
  6%|β–Œ | 687/11704 [06:09<1:30:25, 2.03it/s]
742
  6%|β–Œ | 688/11704 [06:10<1:30:24, 2.03it/s]
743
  6%|β–Œ | 689/11704 [06:10<1:30:25, 2.03it/s]
744
  6%|β–Œ | 690/11704 [06:10<1:30:24, 2.03it/s]
745
  6%|β–Œ | 691/11704 [06:11<1:30:24, 2.03it/s]
746
  6%|β–Œ | 692/11704 [06:11<1:30:19, 2.03it/s]
747
  6%|β–Œ | 693/11704 [06:12<1:30:15, 2.03it/s]
748
  6%|β–Œ | 694/11704 [06:12<1:30:20, 2.03it/s]
749
  6%|β–Œ | 695/11704 [06:13<1:30:17, 2.03it/s]
750
  6%|β–Œ | 696/11704 [06:13<1:30:23, 2.03it/s]
751
  6%|β–Œ | 697/11704 [06:14<1:30:18, 2.03it/s]
752
  6%|β–Œ | 698/11704 [06:14<1:30:13, 2.03it/s]
753
  6%|β–Œ | 699/11704 [06:15<1:30:15, 2.03it/s]
754
  6%|β–Œ | 700/11704 [06:15<1:30:13, 2.03it/s]{'loss': 3.9036, 'grad_norm': 0.5995571613311768, 'learning_rate': 0.0005977796754910333, 'epoch': 0.84}
 
755
 
756
  6%|β–Œ | 700/11704 [06:15<1:30:13, 2.03it/s]
757
  6%|β–Œ | 701/11704 [06:16<1:30:21, 2.03it/s]
758
  6%|β–Œ | 702/11704 [06:16<1:30:22, 2.03it/s]
759
  6%|β–Œ | 703/11704 [06:17<1:30:16, 2.03it/s]
760
  6%|β–Œ | 704/11704 [06:17<1:30:19, 2.03it/s]
761
  6%|β–Œ | 705/11704 [06:18<1:30:16, 2.03it/s]
762
  6%|β–Œ | 706/11704 [06:18<1:30:18, 2.03it/s]
763
  6%|β–Œ | 707/11704 [06:19<1:30:13, 2.03it/s]
764
  6%|β–Œ | 708/11704 [06:19<1:30:10, 2.03it/s]
765
  6%|β–Œ | 709/11704 [06:20<1:30:16, 2.03it/s]
766
  6%|β–Œ | 710/11704 [06:20<1:30:10, 2.03it/s]
767
  6%|β–Œ | 711/11704 [06:21<1:30:22, 2.03it/s]
768
  6%|β–Œ | 712/11704 [06:21<1:30:14, 2.03it/s]
769
  6%|β–Œ | 713/11704 [06:22<1:30:10, 2.03it/s]
770
  6%|β–Œ | 714/11704 [06:22<1:30:16, 2.03it/s]
771
  6%|β–Œ | 715/11704 [06:23<1:30:10, 2.03it/s]
772
  6%|β–Œ | 716/11704 [06:23<1:30:13, 2.03it/s]
773
  6%|β–Œ | 717/11704 [06:24<1:30:17, 2.03it/s]
774
  6%|β–Œ | 718/11704 [06:24<1:30:09, 2.03it/s]
775
  6%|β–Œ | 719/11704 [06:25<1:30:13, 2.03it/s]
776
  6%|β–Œ | 720/11704 [06:25<1:30:05, 2.03it/s]
777
  6%|β–Œ | 721/11704 [06:26<1:30:09, 2.03it/s]
778
  6%|β–Œ | 722/11704 [06:26<1:30:09, 2.03it/s]
779
  6%|β–Œ | 723/11704 [06:27<1:30:08, 2.03it/s]
780
  6%|β–Œ | 724/11704 [06:27<1:30:10, 2.03it/s]
781
  6%|β–Œ | 725/11704 [06:28<1:30:07, 2.03it/s]{'loss': 3.8761, 'grad_norm': 0.5167131423950195, 'learning_rate': 0.000619128949615713, 'epoch': 0.87}
 
782
 
783
  6%|β–Œ | 725/11704 [06:28<1:30:07, 2.03it/s]
784
  6%|β–Œ | 726/11704 [06:28<1:30:10, 2.03it/s]
785
  6%|β–Œ | 727/11704 [06:29<1:30:07, 2.03it/s]
786
  6%|β–Œ | 728/11704 [06:29<1:30:06, 2.03it/s]
787
  6%|β–Œ | 729/11704 [06:30<1:30:10, 2.03it/s]
788
  6%|β–Œ | 730/11704 [06:30<1:30:08, 2.03it/s]
789
  6%|β–Œ | 731/11704 [06:31<1:30:10, 2.03it/s]
790
  6%|β–‹ | 732/11704 [06:31<1:30:04, 2.03it/s]
791
  6%|β–‹ | 733/11704 [06:32<1:30:01, 2.03it/s]
792
  6%|β–‹ | 734/11704 [06:32<1:30:04, 2.03it/s]
793
  6%|β–‹ | 735/11704 [06:33<1:29:58, 2.03it/s]
794
  6%|β–‹ | 736/11704 [06:33<1:30:04, 2.03it/s]
795
  6%|β–‹ | 737/11704 [06:34<1:30:03, 2.03it/s]
796
  6%|β–‹ | 738/11704 [06:34<1:30:02, 2.03it/s]
797
  6%|β–‹ | 739/11704 [06:35<1:30:03, 2.03it/s]
798
  6%|β–‹ | 740/11704 [06:35<1:29:57, 2.03it/s]
799
  6%|β–‹ | 741/11704 [06:36<1:30:03, 2.03it/s]
800
  6%|β–‹ | 742/11704 [06:36<1:30:01, 2.03it/s]
801
  6%|β–‹ | 743/11704 [06:37<1:29:59, 2.03it/s]
802
  6%|β–‹ | 744/11704 [06:37<1:30:00, 2.03it/s]
803
  6%|β–‹ | 745/11704 [06:38<1:29:54, 2.03it/s]
804
  6%|β–‹ | 746/11704 [06:38<1:30:01, 2.03it/s]
805
  6%|β–‹ | 747/11704 [06:39<1:30:00, 2.03it/s]
806
  6%|β–‹ | 748/11704 [06:39<1:29:55, 2.03it/s]
807
  6%|β–‹ | 749/11704 [06:40<1:30:01, 2.03it/s]
808
  6%|β–‹ | 750/11704 [06:40<1:29:54, 2.03it/s]{'loss': 3.851, 'grad_norm': 0.5299991369247437, 'learning_rate': 0.0006404782237403928, 'epoch': 0.9}
 
809
 
810
  6%|β–‹ | 750/11704 [06:40<1:29:54, 2.03it/s]
811
  6%|β–‹ | 751/11704 [06:41<1:29:57, 2.03it/s]
812
  6%|β–‹ | 752/11704 [06:41<1:30:00, 2.03it/s]
813
  6%|β–‹ | 753/11704 [06:42<1:29:52, 2.03it/s]
814
  6%|β–‹ | 754/11704 [06:42<1:30:01, 2.03it/s]
815
  6%|β–‹ | 755/11704 [06:43<1:29:53, 2.03it/s]
816
  6%|β–‹ | 756/11704 [06:43<1:29:51, 2.03it/s]
817
  6%|β–‹ | 757/11704 [06:43<1:29:52, 2.03it/s]
818
  6%|β–‹ | 758/11704 [06:44<1:29:51, 2.03it/s]
819
  6%|β–‹ | 759/11704 [06:44<1:29:56, 2.03it/s]
820
  6%|β–‹ | 760/11704 [06:45<1:29:50, 2.03it/s]
821
  7%|β–‹ | 761/11704 [06:45<1:29:48, 2.03it/s]
822
  7%|β–‹ | 762/11704 [06:46<1:29:55, 2.03it/s]
823
  7%|β–‹ | 763/11704 [06:46<1:29:48, 2.03it/s]
824
  7%|β–‹ | 764/11704 [06:47<1:29:48, 2.03it/s]
825
  7%|β–‹ | 765/11704 [06:47<1:29:54, 2.03it/s]
826
  7%|β–‹ | 766/11704 [06:48<1:29:47, 2.03it/s]
827
  7%|β–‹ | 767/11704 [06:48<1:29:56, 2.03it/s]
828
  7%|β–‹ | 768/11704 [06:49<1:29:52, 2.03it/s]
829
  7%|β–‹ | 769/11704 [06:49<1:29:52, 2.03it/s]
830
  7%|β–‹ | 770/11704 [06:50<1:29:50, 2.03it/s]
831
  7%|β–‹ | 771/11704 [06:50<1:29:44, 2.03it/s]
832
  7%|β–‹ | 772/11704 [06:51<1:29:44, 2.03it/s]
833
  7%|β–‹ | 773/11704 [06:51<1:29:36, 2.03it/s]
834
  7%|β–‹ | 774/11704 [06:52<1:29:42, 2.03it/s]
835
  7%|β–‹ | 775/11704 [06:52<1:29:42, 2.03it/s]{'loss': 3.7964, 'grad_norm': 0.5229047536849976, 'learning_rate': 0.0006618274978650726, 'epoch': 0.93}
 
836
 
837
  7%|β–‹ | 775/11704 [06:52<1:29:42, 2.03it/s]
838
  7%|β–‹ | 776/11704 [06:53<1:29:43, 2.03it/s]
839
  7%|β–‹ | 777/11704 [06:53<1:29:50, 2.03it/s]
840
  7%|β–‹ | 778/11704 [06:54<1:29:41, 2.03it/s]
841
  7%|β–‹ | 779/11704 [06:54<1:29:43, 2.03it/s]
842
  7%|β–‹ | 780/11704 [06:55<1:29:41, 2.03it/s]
843
  7%|β–‹ | 781/11704 [06:55<1:29:40, 2.03it/s]
844
  7%|β–‹ | 782/11704 [06:56<1:29:39, 2.03it/s]
845
  7%|β–‹ | 783/11704 [06:56<1:29:35, 2.03it/s]
846
  7%|β–‹ | 784/11704 [06:57<1:29:34, 2.03it/s]
847
  7%|β–‹ | 785/11704 [06:57<1:29:40, 2.03it/s]
848
  7%|β–‹ | 786/11704 [06:58<1:29:40, 2.03it/s]
849
  7%|β–‹ | 787/11704 [06:58<1:29:44, 2.03it/s]
850
  7%|β–‹ | 788/11704 [06:59<1:29:40, 2.03it/s]
851
  7%|β–‹ | 789/11704 [06:59<1:29:42, 2.03it/s]
852
  7%|β–‹ | 790/11704 [07:00<1:29:42, 2.03it/s]
853
  7%|β–‹ | 791/11704 [07:00<1:29:41, 2.03it/s]
854
  7%|β–‹ | 792/11704 [07:01<1:29:39, 2.03it/s]
855
  7%|β–‹ | 793/11704 [07:01<1:29:34, 2.03it/s]
856
  7%|β–‹ | 794/11704 [07:02<1:29:26, 2.03it/s]
857
  7%|β–‹ | 795/11704 [07:02<1:29:29, 2.03it/s]
858
  7%|β–‹ | 796/11704 [07:03<1:29:29, 2.03it/s]
859
  7%|β–‹ | 797/11704 [07:03<1:29:29, 2.03it/s]
860
  7%|β–‹ | 798/11704 [07:04<1:29:32, 2.03it/s]
861
  7%|β–‹ | 799/11704 [07:04<1:29:33, 2.03it/s]
862
  7%|β–‹ | 800/11704 [07:05<1:29:32, 2.03it/s]{'loss': 3.7888, 'grad_norm': 0.48627805709838867, 'learning_rate': 0.0006831767719897524, 'epoch': 0.96}
 
863
 
864
  7%|β–‹ | 800/11704 [07:05<1:29:32, 2.03it/s]
865
  7%|β–‹ | 801/11704 [07:05<1:29:38, 2.03it/s]
866
  7%|β–‹ | 802/11704 [07:06<1:29:34, 2.03it/s]
867
  7%|β–‹ | 803/11704 [07:06<1:29:34, 2.03it/s]
868
  7%|β–‹ | 804/11704 [07:07<1:29:28, 2.03it/s]
869
  7%|β–‹ | 805/11704 [07:07<1:29:23, 2.03it/s]
870
  7%|β–‹ | 806/11704 [07:08<1:29:21, 2.03it/s]
871
  7%|β–‹ | 807/11704 [07:08<1:29:29, 2.03it/s]
872
  7%|β–‹ | 808/11704 [07:09<1:29:27, 2.03it/s]
873
  7%|β–‹ | 809/11704 [07:09<1:29:26, 2.03it/s]
874
  7%|β–‹ | 810/11704 [07:10<1:29:27, 2.03it/s]
875
  7%|β–‹ | 811/11704 [07:10<1:29:23, 2.03it/s]
876
  7%|β–‹ | 812/11704 [07:11<1:29:25, 2.03it/s]
877
  7%|β–‹ | 813/11704 [07:11<1:29:24, 2.03it/s]
878
  7%|β–‹ | 814/11704 [07:12<1:29:16, 2.03it/s]
879
  7%|β–‹ | 815/11704 [07:12<1:29:08, 2.04it/s]
880
  7%|β–‹ | 816/11704 [07:13<1:29:13, 2.03it/s]
881
  7%|β–‹ | 817/11704 [07:13<1:29:13, 2.03it/s]
882
  7%|β–‹ | 818/11704 [07:14<1:29:15, 2.03it/s]
883
  7%|β–‹ | 819/11704 [07:14<1:29:19, 2.03it/s]
884
  7%|β–‹ | 820/11704 [07:15<1:29:12, 2.03it/s]
885
  7%|β–‹ | 821/11704 [07:15<1:29:15, 2.03it/s]
886
  7%|β–‹ | 822/11704 [07:16<1:29:14, 2.03it/s]
887
  7%|β–‹ | 823/11704 [07:16<1:29:12, 2.03it/s]
888
  7%|β–‹ | 824/11704 [07:16<1:29:17, 2.03it/s]
889
  7%|β–‹ | 825/11704 [07:17<1:29:15, 2.03it/s]{'loss': 3.7506, 'grad_norm': 0.5719705820083618, 'learning_rate': 0.0007045260461144322, 'epoch': 0.99}
 
890
 
891
  7%|β–‹ | 825/11704 [07:17<1:29:15, 2.03it/s]
892
  7%|β–‹ | 826/11704 [07:17<1:29:22, 2.03it/s]
893
  7%|β–‹ | 827/11704 [07:18<1:29:19, 2.03it/s]
894
  7%|β–‹ | 828/11704 [07:18<1:29:13, 2.03it/s]
895
  7%|β–‹ | 829/11704 [07:19<1:29:19, 2.03it/s]
896
  7%|β–‹ | 830/11704 [07:19<1:29:18, 2.03it/s]
897
  7%|β–‹ | 831/11704 [07:20<1:29:20, 2.03it/s]
898
  7%|β–‹ | 832/11704 [07:20<1:29:17, 2.03it/s]
899
  7%|β–‹ | 833/11704 [07:21<1:29:13, 2.03it/s]
900
  7%|β–‹ | 834/11704 [07:21<1:29:20, 2.03it/s]
901
  7%|β–‹ | 835/11704 [07:22<1:29:11, 2.03it/s]
902
  7%|β–‹ | 836/11704 [07:22<1:31:45, 1.97it/s]
903
  7%|β–‹ | 837/11704 [07:34<11:50:27, 3.92s/it]
904
  7%|β–‹ | 838/11704 [07:35<8:44:46, 2.90s/it]
905
  7%|β–‹ | 839/11704 [07:35<6:33:59, 2.18s/it]
906
  7%|β–‹ | 840/11704 [07:36<5:02:24, 1.67s/it]
907
  7%|β–‹ | 841/11704 [07:36<3:58:30, 1.32s/it]
908
  7%|β–‹ | 842/11704 [07:37<3:13:48, 1.07s/it]
909
  7%|β–‹ | 843/11704 [07:37<2:42:26, 1.11it/s]
910
  7%|β–‹ | 844/11704 [07:38<2:20:22, 1.29it/s]
911
  7%|β–‹ | 845/11704 [07:38<2:05:46, 1.44it/s]
912
  7%|β–‹ | 846/11704 [07:39<1:54:47, 1.58it/s]
913
  7%|β–‹ | 847/11704 [07:39<1:47:02, 1.69it/s]
914
  7%|β–‹ | 848/11704 [07:40<1:41:37, 1.78it/s]
915
  7%|β–‹ | 849/11704 [07:40<1:37:49, 1.85it/s]
916
  7%|β–‹ | 850/11704 [07:41<1:35:06, 1.90it/s]{'loss': 3.7125, 'grad_norm': 0.48182985186576843, 'learning_rate': 0.0007258753202391119, 'epoch': 1.02}
 
917
 
918
  7%|β–‹ | 850/11704 [07:41<1:35:06, 1.90it/s]
919
  7%|β–‹ | 851/11704 [07:41<1:33:29, 1.93it/s]
920
  7%|β–‹ | 852/11704 [07:42<1:32:09, 1.96it/s]
921
  7%|β–‹ | 853/11704 [07:42<1:31:05, 1.99it/s]
922
  7%|β–‹ | 854/11704 [07:43<1:30:25, 2.00it/s]
923
  7%|β–‹ | 855/11704 [07:43<1:29:58, 2.01it/s]
924
  7%|β–‹ | 856/11704 [07:44<1:29:45, 2.01it/s]
925
  7%|β–‹ | 857/11704 [07:44<1:29:20, 2.02it/s]
926
  7%|β–‹ | 858/11704 [07:45<1:29:19, 2.02it/s]
927
  7%|β–‹ | 859/11704 [07:45<1:29:09, 2.03it/s]
928
  7%|β–‹ | 860/11704 [07:46<1:29:11, 2.03it/s]
929
  7%|β–‹ | 861/11704 [07:46<1:29:02, 2.03it/s]
930
  7%|β–‹ | 862/11704 [07:47<1:28:59, 2.03it/s]
931
  7%|β–‹ | 863/11704 [07:47<1:28:54, 2.03it/s]
932
  7%|β–‹ | 864/11704 [07:48<1:28:51, 2.03it/s]
933
  7%|β–‹ | 865/11704 [07:48<1:28:48, 2.03it/s]
934
  7%|β–‹ | 866/11704 [07:49<1:28:57, 2.03it/s]
935
  7%|β–‹ | 867/11704 [07:49<1:28:59, 2.03it/s]
936
  7%|β–‹ | 868/11704 [07:50<1:28:51, 2.03it/s]
937
  7%|β–‹ | 869/11704 [07:50<1:28:52, 2.03it/s]
938
  7%|β–‹ | 870/11704 [07:51<1:28:50, 2.03it/s]
939
  7%|β–‹ | 871/11704 [07:51<1:28:46, 2.03it/s]
940
  7%|β–‹ | 872/11704 [07:52<1:28:53, 2.03it/s]
 
1
+ slurm submission log: 2024-05-25 22:01:15.607519
2
+ created following sbatch script:
3
+
4
+ ###############################
5
+
6
+ #!/bin/bash
7
+
8
+ #SBATCH --account=nlp
9
+ #SBATCH --cpus-per-task=16
10
+ #SBATCH --dependency=afterok:7651385
11
+ #SBATCH --gres=gpu:2
12
+ #SBATCH --job-name=tthrush-job-339895
13
+ #SBATCH --mem=100G
14
+ #SBATCH --nodelist=sphinx2
15
+ #SBATCH --open-mode=append
16
+ #SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/llms/pythia-70m_xnli_es_1/train_job_output.txt
17
+ #SBATCH --partition=sphinx
18
+ #SBATCH --time=14-0
19
+
20
+ # activate your desired anaconda environment
21
+ . /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection
22
+
23
+ # cd to working directory
24
+ cd .
25
+
26
+ # launch commands
27
+ srun --unbuffered run_as_child_processes 'torchrun --master_port 29508 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/data/xnli_es --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/llms/pythia-70m_xnli_es_1 --output_hub_id pythia-70m_xnli_es --model_id EleutherAI/pythia-70m --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2 --per_device_train_batch_size 256 --seed 1 --num_train_epochs 14'
28
+
29
+ ###############################
30
+
31
+ submission to slurm complete!
32
+
33
+
34
+ ###############################
35
+ slurm submission output
36
+
37
+ Submitted batch job 7651386
38
+
39
+
40
+
41
+ ###############################
42
+
43
+ slurm submission log: 2024-05-25 22:02:24.879435
44
+ created following sbatch script:
45
+
46
+ ###############################
47
+
48
+ #!/bin/bash
49
+
50
+ #SBATCH --account=nlp
51
+ #SBATCH --cpus-per-task=16
52
+ #SBATCH --dependency=afterok:7651416
53
+ #SBATCH --gres=gpu:2
54
+ #SBATCH --job-name=tthrush-job-1467582
55
+ #SBATCH --mem=100G
56
+ #SBATCH --nodelist=sphinx2
57
+ #SBATCH --open-mode=append
58
+ #SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/llms/pythia-70m_xnli_es_1/train_job_output.txt
59
+ #SBATCH --partition=sphinx
60
+ #SBATCH --time=14-0
61
+
62
+ # activate your desired anaconda environment
63
+ . /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection
64
+
65
+ # cd to working directory
66
+ cd .
67
+
68
+ # launch commands
69
+ srun --unbuffered run_as_child_processes 'torchrun --master_port 29508 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/data/xnli_es --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/llms/pythia-70m_xnli_es_1 --output_hub_id pythia-70m_xnli_es --model_id EleutherAI/pythia-70m --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2 --per_device_train_batch_size 256 --seed 1 --num_train_epochs 14'
70
+
71
+ ###############################
72
+
73
+ submission to slurm complete!
74
+
75
+
76
+ ###############################
77
+ slurm submission output
78
+
79
+ Submitted batch job 7651417
80
+
81
+
82
+
83
+ ###############################
84
+
85
+ slurm submission log: 2024-05-25 22:12:50.153381
86
+ created following sbatch script:
87
+
88
+ ###############################
89
+
90
+ #!/bin/bash
91
+
92
+ #SBATCH --account=nlp
93
+ #SBATCH --cpus-per-task=16
94
+ #SBATCH --dependency=afterok:7651457
95
+ #SBATCH --gres=gpu:2
96
+ #SBATCH --job-name=tthrush-job-1098424
97
+ #SBATCH --mem=100G
98
+ #SBATCH --nodelist=sphinx2
99
+ #SBATCH --open-mode=append
100
+ #SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/llms/pythia-70m_xnli_es_1/train_job_output.txt
101
+ #SBATCH --partition=sphinx
102
+ #SBATCH --time=14-0
103
+
104
+ # activate your desired anaconda environment
105
+ . /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection
106
+
107
+ # cd to working directory
108
+ cd .
109
+
110
+ # launch commands
111
+ srun --unbuffered run_as_child_processes 'torchrun --master_port 29508 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/data/xnli_es --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/llms/pythia-70m_xnli_es_1 --output_hub_id pythia-70m_xnli_es --model_id EleutherAI/pythia-70m --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2 --per_device_train_batch_size 256 --seed 1 --num_train_epochs 14'
112
+
113
+ ###############################
114
+
115
+ submission to slurm complete!
116
+
117
+
118
+ ###############################
119
+ slurm submission output
120
+
121
+ Submitted batch job 7651458
122
+
123
+
124
+
125
+ ###############################
126
+
127
+ slurm submission log: 2024-05-25 22:15:55.753417
128
+ created following sbatch script:
129
+
130
+ ###############################
131
+
132
+ #!/bin/bash
133
+
134
+ #SBATCH --account=nlp
135
+ #SBATCH --cpus-per-task=16
136
+ #SBATCH --dependency=afterok:7651485
137
+ #SBATCH --gres=gpu:2
138
+ #SBATCH --job-name=tthrush-job-3438810
139
+ #SBATCH --mem=100G
140
+ #SBATCH --nodelist=sphinx2
141
+ #SBATCH --open-mode=append
142
+ #SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/llms/pythia-70m_xnli_es_1/train_job_output.txt
143
+ #SBATCH --partition=sphinx
144
+ #SBATCH --time=14-0
145
+
146
+ # activate your desired anaconda environment
147
+ . /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection
148
+
149
+ # cd to working directory
150
+ cd .
151
+
152
+ # launch commands
153
+ srun --unbuffered run_as_child_processes 'torchrun --master_port 29508 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/data/xnli_es --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/llms/pythia-70m_xnli_es_1 --output_hub_id pythia-70m_xnli_es --model_id EleutherAI/pythia-70m --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2 --per_device_train_batch_size 256 --seed 1 --num_train_epochs 14'
154
+
155
+ ###############################
156
+
157
+ submission to slurm complete!
158
+
159
+
160
+ ###############################
161
+ slurm submission output
162
+
163
+ Submitted batch job 7651486
164
+
165
+
166
+
167
+ ###############################
168
+
169
+ slurm submission log: 2024-05-25 22:18:14.763422
170
+ created following sbatch script:
171
+
172
+ ###############################
173
+
174
+ #!/bin/bash
175
+
176
+ #SBATCH --account=nlp
177
+ #SBATCH --cpus-per-task=16
178
+ #SBATCH --dependency=afterok:7651515
179
+ #SBATCH --gres=gpu:2
180
+ #SBATCH --job-name=tthrush-job-3850774
181
+ #SBATCH --mem=100G
182
+ #SBATCH --nodelist=sphinx2
183
+ #SBATCH --open-mode=append
184
+ #SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/llms/pythia-70m_xnli_es_1/train_job_output.txt
185
+ #SBATCH --partition=sphinx
186
+ #SBATCH --time=14-0
187
+
188
+ # activate your desired anaconda environment
189
+ . /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection
190
+
191
+ # cd to working directory
192
+ cd .
193
+
194
+ # launch commands
195
+ srun --unbuffered run_as_child_processes 'torchrun --master_port 29508 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/data/xnli_es --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/llms/pythia-70m_xnli_es_1 --output_hub_id pythia-70m_xnli_es --model_id EleutherAI/pythia-70m --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2 --per_device_train_batch_size 256 --seed 1 --num_train_epochs 14'
196
+
197
+ ###############################
198
+
199
+ submission to slurm complete!
200
+
201
+
202
+ ###############################
203
+ slurm submission output
204
+
205
+ Submitted batch job 7651516
206
+
207
+
208
+
209
+ ###############################
210
+
211
+ /var/lib/slurm/slurmd/job7651516/slurm_script: line 16: /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh: No such file or directory
212
+
213
+ CommandNotFoundError: Your shell has not been properly configured to use 'conda activate'.
214
+ To initialize your shell, run
215
+
216
+ $ conda init <SHELL_NAME>
217
+
218
+ Currently supported shells are:
219
+ - bash
220
+ - fish
221
+ - tcsh
222
+ - xonsh
223
+ - zsh
224
+ - powershell
225
+
226
+ See 'conda init --help' for more information and options.
227
+
228
+ IMPORTANT: You may need to close and restart your shell after running 'conda init'.
229
+
230
+
231
+ ###############################
232
+ start time: 2024-05-26 08:43:47.611801
233
+ machine: sphinx2
234
+ conda env: pretraining-coreset-selection
235
+ ###############################
236
+ running following processes
237
+
238
+ torchrun --master_port 29508 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/data/xnli_es --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/llms/pythia-70m_xnli_es_1 --output_hub_id pythia-70m_xnli_es --model_id EleutherAI/pythia-70m --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2 --per_device_train_batch_size 256 --seed 1 --num_train_epochs 14
239
+
240
+
241
+ ###############################
242
+ command outputs:
243
+
244
+
245
+ [2024-05-26 08:43:51,176] torch.distributed.run: [WARNING]
246
+ [2024-05-26 08:43:51,176] torch.distributed.run: [WARNING] *****************************************
247
+ [2024-05-26 08:43:51,176] torch.distributed.run: [WARNING] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
248
+ [2024-05-26 08:43:51,176] torch.distributed.run: [WARNING] *****************************************
249
+ 05/26/2024 08:44:01 - INFO - __main__ - Script parameters ScriptArguments(seed=1, dataset_id='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/data/xnli_es', output_dir='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/llms/pythia-70m_xnli_es_1', output_hub_id='pythia-70m_xnli_es', hf_hub_token=True, model_id='EleutherAI/pythia-70m', per_device_train_batch_size=256, num_train_epochs=14.0, learning_rate=0.001, gradient_accumulation_steps=2, from_scratch=True, warmup_ratio=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, weight_decay=0.01, lr_scheduler_type='cosine', local_rank=0, resume_from_checkpoint=False, deepspeed=None, peft=False)
250
+ 05/26/2024 08:44:06 - INFO - __main__ - Script parameters ScriptArguments(seed=1, dataset_id='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/data/xnli_es', output_dir='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/llms/pythia-70m_xnli_es_1', output_hub_id='pythia-70m_xnli_es', hf_hub_token=True, model_id='EleutherAI/pythia-70m', per_device_train_batch_size=256, num_train_epochs=14.0, learning_rate=0.001, gradient_accumulation_steps=2, from_scratch=True, warmup_ratio=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, weight_decay=0.01, lr_scheduler_type='cosine', local_rank=0, resume_from_checkpoint=False, deepspeed=None, peft=False)
251
+
252
  0%| | 0/11704 [00:00<?, ?it/s][rank1]:[W reducer.cpp:1360] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
253
+ [rank0]:[W reducer.cpp:1360] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
254
+
255
  0%| | 1/11704 [00:11<36:01:50, 11.08s/it]
256
  0%| | 2/11704 [00:15<22:29:09, 6.92s/it]
257
  0%| | 3/11704 [00:18<17:13:49, 5.30s/it]
258
  0%| | 4/11704 [00:21<13:50:30, 4.26s/it]
259
  0%| | 5/11704 [00:22<10:32:47, 3.25s/it]
260
  0%| | 6/11704 [00:24<9:19:43, 2.87s/it]
261
  0%| | 7/11704 [00:26<7:44:55, 2.38s/it]
262
  0%| | 8/11704 [00:27<6:35:35, 2.03s/it]
263
  0%| | 9/11704 [00:28<5:32:02, 1.70s/it]
264
  0%| | 10/11704 [00:29<5:12:33, 1.60s/it]
265
  0%| | 11/11704 [00:30<4:43:07, 1.45s/it]
266
  0%| | 12/11704 [00:31<4:09:08, 1.28s/it]
267
  0%| | 13/11704 [00:32<3:36:18, 1.11s/it]
268
  0%| | 14/11704 [00:33<3:17:27, 1.01s/it]
269
  0%| | 15/11704 [00:34<3:11:14, 1.02it/s]
270
  0%| | 16/11704 [00:34<3:01:30, 1.07it/s]
271
  0%| | 17/11704 [00:35<2:48:52, 1.15it/s]
272
  0%| | 18/11704 [00:36<2:33:34, 1.27it/s]
273
  0%| | 19/11704 [00:36<2:26:24, 1.33it/s]
274
  0%| | 20/11704 [00:37<2:19:52, 1.39it/s]
275
  0%| | 21/11704 [00:38<2:16:43, 1.42it/s]
276
  0%| | 22/11704 [00:38<2:16:47, 1.42it/s]
277
  0%| | 23/11704 [00:39<2:13:38, 1.46it/s]
278
  0%| | 24/11704 [00:40<2:20:53, 1.38it/s]
279
  0%| | 25/11704 [00:41<2:25:03, 1.34it/s]{'loss': 10.6948, 'grad_norm': 1.2803484201431274, 'learning_rate': 2.134927412467976e-05, 'epoch': 0.03}
280
 
281
+
282
  0%| | 25/11704 [00:41<2:25:03, 1.34it/s]
283
  0%| | 26/11704 [00:41<2:14:46, 1.44it/s]
284
  0%| | 27/11704 [00:42<2:08:18, 1.52it/s]
285
  0%| | 28/11704 [00:42<2:06:17, 1.54it/s]
286
  0%| | 29/11704 [00:43<2:03:23, 1.58it/s]
287
  0%| | 30/11704 [00:44<1:58:17, 1.64it/s]
288
  0%| | 31/11704 [00:44<1:53:07, 1.72it/s]
289
  0%| | 32/11704 [00:45<1:50:58, 1.75it/s]
290
  0%| | 33/11704 [00:45<1:50:04, 1.77it/s]
291
  0%| | 34/11704 [00:46<1:51:19, 1.75it/s]
292
  0%| | 35/11704 [00:46<1:47:30, 1.81it/s]
293
  0%| | 36/11704 [00:47<1:47:57, 1.80it/s]
294
  0%| | 37/11704 [00:47<1:46:41, 1.82it/s]
295
  0%| | 38/11704 [00:48<1:45:56, 1.84it/s]
296
  0%| | 39/11704 [00:49<1:47:28, 1.81it/s]
297
  0%| | 40/11704 [00:49<1:46:05, 1.83it/s]
298
  0%| | 41/11704 [00:50<1:45:06, 1.85it/s]
299
  0%| | 42/11704 [00:50<1:45:14, 1.85it/s]
300
  0%| | 43/11704 [00:51<1:46:16, 1.83it/s]
301
  0%| | 44/11704 [00:51<1:44:48, 1.85it/s]
302
  0%| | 45/11704 [00:52<1:42:40, 1.89it/s]
303
  0%| | 46/11704 [00:52<1:43:30, 1.88it/s]
304
  0%| | 47/11704 [00:53<1:43:45, 1.87it/s]
305
  0%| | 48/11704 [00:53<1:42:51, 1.89it/s]
306
  0%| | 49/11704 [00:54<1:42:14, 1.90it/s]
307
  0%| | 50/11704 [00:54<1:41:41, 1.91it/s]{'loss': 9.9789, 'grad_norm': 1.1821939945220947, 'learning_rate': 4.269854824935952e-05, 'epoch': 0.06}
308
+
309
 
310
  0%| | 50/11704 [00:54<1:41:41, 1.91it/s]
311
  0%| | 51/11704 [00:55<1:41:01, 1.92it/s]
312
  0%| | 52/11704 [00:55<1:40:15, 1.94it/s]
313
  0%| | 53/11704 [00:56<1:39:39, 1.95it/s]
314
  0%| | 54/11704 [00:56<1:40:20, 1.94it/s]
315
  0%| | 55/11704 [00:57<1:39:05, 1.96it/s]
316
  0%| | 56/11704 [00:57<1:39:10, 1.96it/s]
317
  0%| | 57/11704 [00:58<1:39:27, 1.95it/s]
318
  0%| | 58/11704 [00:58<1:39:40, 1.95it/s]
319
  1%| | 59/11704 [00:59<1:52:14, 1.73it/s]
320
  1%| | 60/11704 [01:00<1:48:03, 1.80it/s]
321
  1%| | 61/11704 [01:00<1:45:25, 1.84it/s]
322
  1%| | 62/11704 [01:01<1:43:05, 1.88it/s]
323
  1%| | 63/11704 [01:01<1:41:38, 1.91it/s]
324
  1%| | 64/11704 [01:02<1:40:08, 1.94it/s]
325
  1%| | 65/11704 [01:02<1:39:38, 1.95it/s]
326
  1%| | 66/11704 [01:03<1:38:25, 1.97it/s]
327
  1%| | 67/11704 [01:03<1:38:36, 1.97it/s]
328
  1%| | 68/11704 [01:04<1:38:36, 1.97it/s]
329
  1%| | 69/11704 [01:04<1:37:32, 1.99it/s]
330
  1%| | 70/11704 [01:05<1:37:04, 2.00it/s]
331
  1%| | 71/11704 [01:05<1:36:37, 2.01it/s]
332
  1%| | 72/11704 [01:06<1:36:46, 2.00it/s]
333
  1%| | 73/11704 [01:06<1:36:54, 2.00it/s]
334
  1%| | 74/11704 [01:07<1:37:03, 2.00it/s]
335
  1%| | 75/11704 [01:07<1:36:35, 2.01it/s]{'loss': 9.2071, 'grad_norm': 1.037238597869873, 'learning_rate': 6.404782237403927e-05, 'epoch': 0.09}
336
 
337
+
338
  1%| | 75/11704 [01:07<1:36:35, 2.01it/s]
339
  1%| | 76/11704 [01:08<1:36:49, 2.00it/s]
340
  1%| | 77/11704 [01:08<1:36:51, 2.00it/s]
341
  1%| | 78/11704 [01:09<1:36:22, 2.01it/s]
342
  1%| | 79/11704 [01:09<1:36:16, 2.01it/s]
343
  1%| | 80/11704 [01:10<1:36:33, 2.01it/s]
344
  1%| | 81/11704 [01:10<1:36:15, 2.01it/s]
345
  1%| | 82/11704 [01:11<1:36:11, 2.01it/s]
346
  1%| | 83/11704 [01:11<1:36:08, 2.01it/s]
347
  1%| | 84/11704 [01:12<1:35:57, 2.02it/s]
348
  1%| | 85/11704 [01:12<1:36:27, 2.01it/s]
349
  1%| | 86/11704 [01:13<1:36:25, 2.01it/s]
350
  1%| | 87/11704 [01:13<1:36:11, 2.01it/s]
351
  1%| | 88/11704 [01:14<1:36:08, 2.01it/s]
352
  1%| | 89/11704 [01:14<1:37:01, 2.00it/s]
353
  1%| | 90/11704 [01:15<1:39:29, 1.95it/s]
354
  1%| | 91/11704 [01:15<1:38:15, 1.97it/s]
355
  1%| | 92/11704 [01:16<1:38:03, 1.97it/s]
356
  1%| | 93/11704 [01:16<1:37:15, 1.99it/s]
357
  1%| | 94/11704 [01:17<1:36:57, 2.00it/s]
358
  1%| | 95/11704 [01:17<1:36:48, 2.00it/s]
359
  1%| | 96/11704 [01:18<1:36:31, 2.00it/s]
360
  1%| | 97/11704 [01:18<1:36:15, 2.01it/s]
361
  1%| | 98/11704 [01:19<1:36:02, 2.01it/s]
362
  1%| | 99/11704 [01:19<1:36:18, 2.01it/s]
363
  1%| | 100/11704 [01:20<1:36:11, 2.01it/s]{'loss': 8.3139, 'grad_norm': 0.7935464978218079, 'learning_rate': 8.539709649871905e-05, 'epoch': 0.12}
364
+
365
 
366
  1%| | 100/11704 [01:20<1:36:11, 2.01it/s]
367
  1%| | 101/11704 [01:20<1:36:06, 2.01it/s]
368
  1%| | 102/11704 [01:21<1:35:47, 2.02it/s]
369
  1%| | 103/11704 [01:21<1:36:08, 2.01it/s]
370
  1%| | 104/11704 [01:22<1:35:49, 2.02it/s]
371
  1%| | 105/11704 [01:22<1:35:43, 2.02it/s]
372
  1%| | 106/11704 [01:23<1:35:41, 2.02it/s]
373
  1%| | 107/11704 [01:23<1:35:33, 2.02it/s]
374
  1%| | 108/11704 [01:24<1:35:22, 2.03it/s]
375
  1%| | 109/11704 [01:24<1:35:39, 2.02it/s]
376
  1%| | 110/11704 [01:25<1:35:33, 2.02it/s]
377
  1%| | 111/11704 [01:25<1:35:49, 2.02it/s]
378
  1%| | 112/11704 [01:26<1:35:39, 2.02it/s]
379
  1%| | 113/11704 [01:26<1:35:34, 2.02it/s]
380
  1%| | 114/11704 [01:27<1:35:24, 2.02it/s]
381
  1%| | 115/11704 [01:27<1:35:20, 2.03it/s]
382
  1%| | 116/11704 [01:28<1:35:31, 2.02it/s]
383
  1%| | 117/11704 [01:28<1:35:27, 2.02it/s]
384
  1%| | 118/11704 [01:29<1:35:24, 2.02it/s]
385
  1%| | 119/11704 [01:29<1:35:14, 2.03it/s]
386
  1%| | 120/11704 [01:30<1:35:08, 2.03it/s]
387
  1%| | 121/11704 [01:30<1:35:10, 2.03it/s]
388
  1%| | 122/11704 [01:31<1:35:03, 2.03it/s]
389
  1%| | 123/11704 [01:31<1:34:57, 2.03it/s]
390
  1%| | 124/11704 [01:32<1:35:01, 2.03it/s]
391
  1%| | 125/11704 [01:32<1:35:08, 2.03it/s]{'loss': 7.526, 'grad_norm': 0.5917059183120728, 'learning_rate': 0.0001067463706233988, 'epoch': 0.15}
392
+
393
 
394
  1%| | 125/11704 [01:32<1:35:08, 2.03it/s]
395
  1%| | 126/11704 [01:33<1:35:09, 2.03it/s]
396
  1%| | 127/11704 [01:33<1:35:04, 2.03it/s]
397
  1%| | 128/11704 [01:34<1:34:59, 2.03it/s]
398
  1%| | 129/11704 [01:34<1:34:59, 2.03it/s]
399
  1%| | 130/11704 [01:35<1:35:09, 2.03it/s]
400
  1%| | 131/11704 [01:35<1:35:07, 2.03it/s]
401
  1%| | 132/11704 [01:35<1:35:02, 2.03it/s]
402
  1%| | 133/11704 [01:36<1:34:58, 2.03it/s]
403
  1%| | 134/11704 [01:36<1:34:58, 2.03it/s]
404
  1%| | 135/11704 [01:37<1:34:56, 2.03it/s]
405
  1%| | 136/11704 [01:37<1:35:01, 2.03it/s]
406
  1%| | 137/11704 [01:38<1:34:56, 2.03it/s]
407
  1%| | 138/11704 [01:38<1:35:30, 2.02it/s]
408
  1%| | 139/11704 [01:39<1:35:41, 2.01it/s]
409
  1%| | 140/11704 [01:39<1:35:29, 2.02it/s]
410
  1%| | 141/11704 [01:40<1:35:14, 2.02it/s]
411
  1%| | 142/11704 [01:40<1:35:11, 2.02it/s]
412
  1%| | 143/11704 [01:41<1:35:00, 2.03it/s]
413
  1%| | 144/11704 [01:41<1:34:55, 2.03it/s]
414
  1%| | 145/11704 [01:42<1:34:52, 2.03it/s]
415
  1%| | 146/11704 [01:42<1:34:48, 2.03it/s]
416
  1%|▏ | 147/11704 [01:43<1:34:51, 2.03it/s]
417
  1%|▏ | 148/11704 [01:43<1:34:45, 2.03it/s]
418
  1%|▏ | 149/11704 [01:44<1:34:43, 2.03it/s]
419
  1%|▏ | 150/11704 [01:44<1:34:48, 2.03it/s]{'loss': 6.8951, 'grad_norm': 0.37877357006073, 'learning_rate': 0.00012809564474807855, 'epoch': 0.18}
420
+
421
 
422
  1%|▏ | 150/11704 [01:44<1:34:48, 2.03it/s]
423
  1%|▏ | 151/11704 [01:45<1:34:56, 2.03it/s]
424
  1%|▏ | 152/11704 [01:45<1:34:58, 2.03it/s]
425
  1%|▏ | 153/11704 [01:46<1:34:45, 2.03it/s]
426
  1%|▏ | 154/11704 [01:46<1:34:38, 2.03it/s]
427
  1%|▏ | 155/11704 [01:47<1:34:42, 2.03it/s]
428
  1%|▏ | 156/11704 [01:47<1:34:49, 2.03it/s]
429
  1%|▏ | 157/11704 [01:48<1:34:49, 2.03it/s]
430
  1%|▏ | 158/11704 [01:48<1:34:47, 2.03it/s]
431
  1%|▏ | 159/11704 [01:49<1:34:42, 2.03it/s]
432
  1%|▏ | 160/11704 [01:49<1:34:40, 2.03it/s]
433
  1%|▏ | 161/11704 [01:50<1:34:38, 2.03it/s]
434
  1%|▏ | 162/11704 [01:50<1:34:39, 2.03it/s]
435
  1%|▏ | 163/11704 [01:51<1:34:47, 2.03it/s]
436
  1%|▏ | 164/11704 [01:51<1:34:47, 2.03it/s]
437
  1%|▏ | 165/11704 [01:52<1:34:54, 2.03it/s]
438
  1%|▏ | 166/11704 [01:52<1:34:47, 2.03it/s]
439
  1%|▏ | 167/11704 [01:53<1:34:47, 2.03it/s]
440
  1%|▏ | 168/11704 [01:53<1:34:38, 2.03it/s]
441
  1%|▏ | 169/11704 [01:54<1:34:42, 2.03it/s]
442
  1%|▏ | 170/11704 [01:54<1:34:41, 2.03it/s]
443
  1%|▏ | 171/11704 [01:55<1:34:32, 2.03it/s]
444
  1%|▏ | 172/11704 [01:55<1:34:38, 2.03it/s]
445
  1%|▏ | 173/11704 [01:56<1:34:32, 2.03it/s]
446
  1%|▏ | 174/11704 [01:56<1:34:27, 2.03it/s]
447
  1%|▏ | 175/11704 [01:57<1:34:29, 2.03it/s]{'loss': 6.3957, 'grad_norm': 0.3709251582622528, 'learning_rate': 0.00014944491887275833, 'epoch': 0.21}
448
+
449
 
450
  1%|▏ | 175/11704 [01:57<1:34:29, 2.03it/s]
451
  2%|▏ | 176/11704 [01:57<1:34:26, 2.03it/s]
452
  2%|▏ | 177/11704 [01:58<1:34:35, 2.03it/s]
453
  2%|▏ | 178/11704 [01:58<1:34:30, 2.03it/s]
454
  2%|▏ | 179/11704 [01:59<1:34:27, 2.03it/s]
455
  2%|▏ | 180/11704 [01:59<1:34:35, 2.03it/s]
456
  2%|▏ | 181/11704 [02:00<1:34:38, 2.03it/s]
457
  2%|▏ | 182/11704 [02:00<1:34:39, 2.03it/s]
458
  2%|▏ | 183/11704 [02:01<1:34:32, 2.03it/s]
459
  2%|▏ | 184/11704 [02:01<1:34:33, 2.03it/s]
460
  2%|▏ | 185/11704 [02:02<1:34:35, 2.03it/s]
461
  2%|▏ | 186/11704 [02:02<1:34:32, 2.03it/s]
462
  2%|▏ | 187/11704 [02:03<1:34:35, 2.03it/s]
463
  2%|▏ | 188/11704 [02:03<1:34:30, 2.03it/s]
464
  2%|▏ | 189/11704 [02:04<1:34:32, 2.03it/s]
465
  2%|▏ | 190/11704 [02:04<1:34:33, 2.03it/s]
466
  2%|▏ | 191/11704 [02:05<1:34:28, 2.03it/s]
467
  2%|▏ | 192/11704 [02:05<1:34:33, 2.03it/s]
468
  2%|▏ | 193/11704 [02:06<1:34:30, 2.03it/s]
469
  2%|▏ | 194/11704 [02:06<1:34:35, 2.03it/s]
470
  2%|▏ | 195/11704 [02:07<1:34:27, 2.03it/s]
471
  2%|▏ | 196/11704 [02:07<1:34:29, 2.03it/s]
472
  2%|▏ | 197/11704 [02:08<1:34:33, 2.03it/s]
473
  2%|▏ | 198/11704 [02:08<1:34:28, 2.03it/s]
474
  2%|▏ | 199/11704 [02:09<1:34:28, 2.03it/s]
475
  2%|▏ | 200/11704 [02:09<1:34:30, 2.03it/s]{'loss': 5.9826, 'grad_norm': 0.6036785840988159, 'learning_rate': 0.0001707941929974381, 'epoch': 0.24}
476
+
477
 
478
  2%|▏ | 200/11704 [02:09<1:34:30, 2.03it/s]
479
  2%|▏ | 201/11704 [02:09<1:34:24, 2.03it/s]
480
  2%|▏ | 202/11704 [02:10<1:34:31, 2.03it/s]
481
  2%|▏ | 203/11704 [02:10<1:34:23, 2.03it/s]
482
  2%|▏ | 204/11704 [02:11<1:34:21, 2.03it/s]
483
  2%|▏ | 205/11704 [02:11<1:34:27, 2.03it/s]
484
  2%|▏ | 206/11704 [02:12<1:34:31, 2.03it/s]
485
  2%|▏ | 207/11704 [02:12<1:34:36, 2.03it/s]
486
  2%|▏ | 208/11704 [02:13<1:34:28, 2.03it/s]
487
  2%|▏ | 209/11704 [02:13<1:34:31, 2.03it/s]
488
  2%|▏ | 210/11704 [02:14<1:34:28, 2.03it/s]
489
  2%|▏ | 211/11704 [02:14<1:34:26, 2.03it/s]
490
  2%|▏ | 212/11704 [02:15<1:34:28, 2.03it/s]
491
  2%|▏ | 213/11704 [02:15<1:34:23, 2.03it/s]
492
  2%|▏ | 214/11704 [02:16<1:34:28, 2.03it/s]
493
  2%|▏ | 215/11704 [02:16<1:34:30, 2.03it/s]
494
  2%|▏ | 216/11704 [02:17<1:34:29, 2.03it/s]
495
  2%|▏ | 217/11704 [02:17<1:34:34, 2.02it/s]
496
  2%|▏ | 218/11704 [02:18<1:34:38, 2.02it/s]
497
  2%|▏ | 219/11704 [02:18<1:34:37, 2.02it/s]
498
  2%|▏ | 220/11704 [02:19<1:34:45, 2.02it/s]
499
  2%|▏ | 221/11704 [02:19<1:34:34, 2.02it/s]
500
  2%|▏ | 222/11704 [02:20<1:34:43, 2.02it/s]
501
  2%|▏ | 223/11704 [02:20<1:34:35, 2.02it/s]
502
  2%|▏ | 224/11704 [02:21<1:34:33, 2.02it/s]
503
  2%|▏ | 225/11704 [02:21<1:34:27, 2.03it/s]{'loss': 5.659, 'grad_norm': 0.9925475716590881, 'learning_rate': 0.00019214346712211785, 'epoch': 0.27}
504
 
505
+
506
  2%|▏ | 225/11704 [02:21<1:34:27, 2.03it/s]
507
  2%|▏ | 226/11704 [02:22<1:34:29, 2.02it/s]
508
  2%|▏ | 227/11704 [02:22<1:34:27, 2.03it/s]
509
  2%|▏ | 228/11704 [02:23<1:34:41, 2.02it/s]
510
  2%|▏ | 229/11704 [02:23<1:34:33, 2.02it/s]
511
  2%|▏ | 230/11704 [02:24<1:34:35, 2.02it/s]
512
  2%|▏ | 231/11704 [02:24<1:34:27, 2.02it/s]
513
  2%|▏ | 232/11704 [02:25<1:34:25, 2.03it/s]
514
  2%|▏ | 233/11704 [02:25<1:34:22, 2.03it/s]
515
  2%|▏ | 234/11704 [02:26<1:34:20, 2.03it/s]
516
  2%|▏ | 235/11704 [02:26<1:34:18, 2.03it/s]
517
  2%|▏ | 236/11704 [02:27<1:34:21, 2.03it/s]
518
  2%|▏ | 237/11704 [02:27<1:34:25, 2.02it/s]
519
  2%|▏ | 238/11704 [02:28<1:34:20, 2.03it/s]
520
  2%|▏ | 239/11704 [02:28<1:34:29, 2.02it/s]
521
  2%|▏ | 240/11704 [02:29<1:34:21, 2.02it/s]
522
  2%|▏ | 241/11704 [02:29<1:34:26, 2.02it/s]
523
  2%|▏ | 242/11704 [02:30<1:34:19, 2.03it/s]
524
  2%|▏ | 243/11704 [02:30<1:34:16, 2.03it/s]
525
  2%|▏ | 244/11704 [02:31<1:34:14, 2.03it/s]
526
  2%|▏ | 245/11704 [02:31<1:34:16, 2.03it/s]
527
  2%|▏ | 246/11704 [02:32<1:34:15, 2.03it/s]
528
  2%|▏ | 247/11704 [02:32<1:34:18, 2.02it/s]
529
  2%|▏ | 248/11704 [02:33<1:34:35, 2.02it/s]
530
  2%|▏ | 249/11704 [02:33<1:34:34, 2.02it/s]
531
  2%|▏ | 250/11704 [02:34<1:34:36, 2.02it/s]{'loss': 5.4253, 'grad_norm': 0.6297341585159302, 'learning_rate': 0.0002134927412467976, 'epoch': 0.3}
532
+
533
 
534
  2%|▏ | 250/11704 [02:34<1:34:36, 2.02it/s]
535
  2%|▏ | 251/11704 [02:34<1:34:48, 2.01it/s]
536
  2%|▏ | 252/11704 [02:35<1:34:35, 2.02it/s]
537
  2%|▏ | 253/11704 [02:35<1:34:32, 2.02it/s]
538
  2%|▏ | 254/11704 [02:36<1:34:26, 2.02it/s]
539
  2%|▏ | 255/11704 [02:36<1:34:20, 2.02it/s]
540
  2%|▏ | 256/11704 [02:37<1:34:34, 2.02it/s]
541
  2%|▏ | 257/11704 [02:37<1:34:28, 2.02it/s]
542
  2%|▏ | 258/11704 [02:38<1:34:19, 2.02it/s]
543
  2%|▏ | 259/11704 [02:38<1:34:10, 2.03it/s]
544
  2%|▏ | 260/11704 [02:39<1:34:10, 2.03it/s]
545
  2%|▏ | 261/11704 [02:39<1:34:06, 2.03it/s]
546
  2%|▏ | 262/11704 [02:40<1:33:56, 2.03it/s]
547
  2%|▏ | 263/11704 [02:40<1:33:59, 2.03it/s]
548
  2%|▏ | 264/11704 [02:41<1:33:55, 2.03it/s]
549
  2%|▏ | 265/11704 [02:41<1:34:01, 2.03it/s]
550
  2%|▏ | 266/11704 [02:42<1:34:04, 2.03it/s]
551
  2%|▏ | 267/11704 [02:42<1:33:59, 2.03it/s]
552
  2%|▏ | 268/11704 [02:43<1:34:06, 2.03it/s]
553
  2%|▏ | 269/11704 [02:43<1:34:01, 2.03it/s]
554
  2%|▏ | 270/11704 [02:44<1:34:04, 2.03it/s]
555
  2%|▏ | 271/11704 [02:44<1:33:59, 2.03it/s]
556
  2%|▏ | 272/11704 [02:45<1:33:57, 2.03it/s]
557
  2%|▏ | 273/11704 [02:45<1:33:59, 2.03it/s]
558
  2%|▏ | 274/11704 [02:46<1:33:57, 2.03it/s]
559
  2%|▏ | 275/11704 [02:46<1:34:06, 2.02it/s]{'loss': 5.2293, 'grad_norm': 0.5968512892723083, 'learning_rate': 0.00023484201537147736, 'epoch': 0.33}
560
 
561
+
562
  2%|▏ | 275/11704 [02:46<1:34:06, 2.02it/s]
563
  2%|▏ | 276/11704 [02:47<1:34:02, 2.03it/s]
564
  2%|▏ | 277/11704 [02:47<1:34:05, 2.02it/s]
565
  2%|▏ | 278/11704 [02:48<1:33:59, 2.03it/s]
566
  2%|▏ | 279/11704 [02:48<1:34:01, 2.02it/s]
567
  2%|▏ | 280/11704 [02:49<1:33:54, 2.03it/s]
568
  2%|▏ | 281/11704 [02:49<1:34:00, 2.03it/s]
569
  2%|▏ | 282/11704 [02:49<1:33:55, 2.03it/s]
570
  2%|▏ | 283/11704 [02:50<1:33:55, 2.03it/s]
571
  2%|▏ | 284/11704 [02:50<1:33:52, 2.03it/s]
572
  2%|▏ | 285/11704 [02:51<1:33:47, 2.03it/s]
573
  2%|▏ | 286/11704 [02:51<1:33:49, 2.03it/s]
574
  2%|▏ | 287/11704 [02:52<1:33:49, 2.03it/s]
575
  2%|▏ | 288/11704 [02:52<1:33:50, 2.03it/s]
576
  2%|▏ | 289/11704 [02:53<1:33:43, 2.03it/s]
577
  2%|▏ | 290/11704 [02:53<1:33:47, 2.03it/s]
578
  2%|▏ | 291/11704 [02:54<1:33:42, 2.03it/s]
579
  2%|▏ | 292/11704 [02:54<1:33:40, 2.03it/s]
580
  3%|β–Ž | 293/11704 [02:55<1:33:43, 2.03it/s]
581
  3%|β–Ž | 294/11704 [02:55<1:33:38, 2.03it/s]
582
  3%|β–Ž | 295/11704 [02:56<1:33:41, 2.03it/s]
583
  3%|β–Ž | 296/11704 [02:56<1:33:37, 2.03it/s]
584
  3%|β–Ž | 297/11704 [02:57<1:33:29, 2.03it/s]
585
  3%|β–Ž | 298/11704 [02:57<1:33:33, 2.03it/s]
586
  3%|β–Ž | 299/11704 [02:58<1:33:34, 2.03it/s]
587
  3%|β–Ž | 300/11704 [02:58<1:33:27, 2.03it/s]{'loss': 5.0541, 'grad_norm': 0.9146194458007812, 'learning_rate': 0.0002561912894961571, 'epoch': 0.36}
588
+
589
 
590
  3%|β–Ž | 300/11704 [02:58<1:33:27, 2.03it/s]
591
  3%|β–Ž | 301/11704 [02:59<1:33:30, 2.03it/s]
592
  3%|β–Ž | 302/11704 [02:59<1:33:27, 2.03it/s]
593
  3%|β–Ž | 303/11704 [03:00<1:33:27, 2.03it/s]
594
  3%|β–Ž | 304/11704 [03:00<1:33:31, 2.03it/s]
595
  3%|β–Ž | 305/11704 [03:01<1:33:25, 2.03it/s]
596
  3%|β–Ž | 306/11704 [03:01<1:33:30, 2.03it/s]
597
  3%|β–Ž | 307/11704 [03:02<1:33:24, 2.03it/s]
598
  3%|β–Ž | 308/11704 [03:02<1:33:26, 2.03it/s]
599
  3%|β–Ž | 309/11704 [03:03<1:33:31, 2.03it/s]
600
  3%|β–Ž | 310/11704 [03:03<1:33:21, 2.03it/s]
601
  3%|β–Ž | 311/11704 [03:04<1:33:24, 2.03it/s]
602
  3%|β–Ž | 312/11704 [03:04<1:33:26, 2.03it/s]
603
  3%|β–Ž | 313/11704 [03:05<1:33:25, 2.03it/s]
604
  3%|β–Ž | 314/11704 [03:05<1:33:30, 2.03it/s]
605
  3%|β–Ž | 315/11704 [03:06<1:33:22, 2.03it/s]
606
  3%|β–Ž | 316/11704 [03:06<1:33:25, 2.03it/s]
607
  3%|β–Ž | 317/11704 [03:07<1:33:24, 2.03it/s]
608
  3%|β–Ž | 318/11704 [03:07<1:33:20, 2.03it/s]
609
  3%|β–Ž | 319/11704 [03:08<1:33:22, 2.03it/s]
610
  3%|β–Ž | 320/11704 [03:08<1:33:22, 2.03it/s]
611
  3%|β–Ž | 321/11704 [03:09<1:33:16, 2.03it/s]
612
  3%|β–Ž | 322/11704 [03:09<1:33:24, 2.03it/s]
613
  3%|β–Ž | 323/11704 [03:10<1:33:18, 2.03it/s]
614
  3%|β–Ž | 324/11704 [03:10<1:33:15, 2.03it/s]
615
  3%|β–Ž | 325/11704 [03:11<1:33:18, 2.03it/s]{'loss': 4.934, 'grad_norm': 0.9477291703224182, 'learning_rate': 0.0002775405636208369, 'epoch': 0.39}
616
+
617
 
618
  3%|β–Ž | 325/11704 [03:11<1:33:18, 2.03it/s]
619
  3%|β–Ž | 326/11704 [03:11<1:33:15, 2.03it/s]
620
  3%|β–Ž | 327/11704 [03:12<1:33:14, 2.03it/s]
621
  3%|β–Ž | 328/11704 [03:12<1:33:18, 2.03it/s]
622
  3%|β–Ž | 329/11704 [03:13<1:33:10, 2.03it/s]
623
  3%|β–Ž | 330/11704 [03:13<1:33:10, 2.03it/s]
624
  3%|β–Ž | 331/11704 [03:14<1:33:11, 2.03it/s]
625
  3%|β–Ž | 332/11704 [03:14<1:33:11, 2.03it/s]
626
  3%|β–Ž | 333/11704 [03:15<1:33:12, 2.03it/s]
627
  3%|β–Ž | 334/11704 [03:15<1:33:15, 2.03it/s]
628
  3%|β–Ž | 335/11704 [03:16<1:33:09, 2.03it/s]
629
  3%|β–Ž | 336/11704 [03:16<1:33:09, 2.03it/s]
630
  3%|β–Ž | 337/11704 [03:17<1:33:16, 2.03it/s]
631
  3%|β–Ž | 338/11704 [03:17<1:33:49, 2.02it/s]
632
  3%|β–Ž | 339/11704 [03:18<1:33:39, 2.02it/s]
633
  3%|β–Ž | 340/11704 [03:18<1:33:32, 2.02it/s]
634
  3%|β–Ž | 341/11704 [03:19<1:33:29, 2.03it/s]
635
  3%|β–Ž | 342/11704 [03:19<1:33:20, 2.03it/s]
636
  3%|β–Ž | 343/11704 [03:20<1:33:18, 2.03it/s]
637
  3%|β–Ž | 344/11704 [03:20<1:33:18, 2.03it/s]
638
  3%|β–Ž | 345/11704 [03:21<1:33:12, 2.03it/s]
639
  3%|β–Ž | 346/11704 [03:21<1:33:12, 2.03it/s]
640
  3%|β–Ž | 347/11704 [03:21<1:33:08, 2.03it/s]
641
  3%|β–Ž | 348/11704 [03:22<1:33:03, 2.03it/s]
642
  3%|β–Ž | 349/11704 [03:22<1:33:10, 2.03it/s]
643
  3%|β–Ž | 350/11704 [03:23<1:33:03, 2.03it/s]{'loss': 4.8046, 'grad_norm': 0.824380099773407, 'learning_rate': 0.00029888983774551667, 'epoch': 0.42}
644
+
645
 
646
  3%|β–Ž | 350/11704 [03:23<1:33:03, 2.03it/s]
647
  3%|β–Ž | 351/11704 [03:23<1:33:04, 2.03it/s]
648
  3%|β–Ž | 352/11704 [03:24<1:33:07, 2.03it/s]
649
  3%|β–Ž | 353/11704 [03:24<1:33:07, 2.03it/s]
650
  3%|β–Ž | 354/11704 [03:25<1:33:00, 2.03it/s]
651
  3%|β–Ž | 355/11704 [03:25<1:33:01, 2.03it/s]
652
  3%|β–Ž | 356/11704 [03:26<1:32:58, 2.03it/s]
653
  3%|β–Ž | 357/11704 [03:26<1:33:00, 2.03it/s]
654
  3%|β–Ž | 358/11704 [03:27<1:32:59, 2.03it/s]
655
  3%|β–Ž | 359/11704 [03:27<1:32:59, 2.03it/s]
656
  3%|β–Ž | 360/11704 [03:28<1:32:57, 2.03it/s]
657
  3%|β–Ž | 361/11704 [03:28<1:32:57, 2.03it/s]
658
  3%|β–Ž | 362/11704 [03:29<1:32:54, 2.03it/s]
659
  3%|β–Ž | 363/11704 [03:29<1:32:51, 2.04it/s]
660
  3%|β–Ž | 364/11704 [03:30<1:32:54, 2.03it/s]
661
  3%|β–Ž | 365/11704 [03:30<1:32:55, 2.03it/s]
662
  3%|β–Ž | 366/11704 [03:31<1:32:52, 2.03it/s]
663
  3%|β–Ž | 367/11704 [03:31<1:33:03, 2.03it/s]
664
  3%|β–Ž | 368/11704 [03:32<1:33:03, 2.03it/s]
665
  3%|β–Ž | 369/11704 [03:32<1:33:10, 2.03it/s]
666
  3%|β–Ž | 370/11704 [03:33<1:33:01, 2.03it/s]
667
  3%|β–Ž | 371/11704 [03:33<1:32:54, 2.03it/s]
668
  3%|β–Ž | 372/11704 [03:34<1:32:58, 2.03it/s]
669
  3%|β–Ž | 373/11704 [03:34<1:32:51, 2.03it/s]
670
  3%|β–Ž | 374/11704 [03:35<1:32:56, 2.03it/s]
671
  3%|β–Ž | 375/11704 [03:35<1:33:04, 2.03it/s]{'loss': 4.6929, 'grad_norm': 0.7358281016349792, 'learning_rate': 0.0003202391118701964, 'epoch': 0.45}
672
+
673
 
674
  3%|β–Ž | 375/11704 [03:35<1:33:04, 2.03it/s]
675
  3%|β–Ž | 376/11704 [03:36<1:33:02, 2.03it/s]
676
  3%|β–Ž | 377/11704 [03:36<1:33:04, 2.03it/s]
677
  3%|β–Ž | 378/11704 [03:37<1:33:01, 2.03it/s]
678
  3%|β–Ž | 379/11704 [03:37<1:33:04, 2.03it/s]
679
  3%|β–Ž | 380/11704 [03:38<1:33:01, 2.03it/s]
680
  3%|β–Ž | 381/11704 [03:38<1:32:53, 2.03it/s]
681
  3%|β–Ž | 382/11704 [03:39<1:33:07, 2.03it/s]
682
  3%|β–Ž | 383/11704 [03:39<1:33:02, 2.03it/s]
683
  3%|β–Ž | 384/11704 [03:40<1:33:01, 2.03it/s]
684
  3%|β–Ž | 385/11704 [03:40<1:32:59, 2.03it/s]
685
  3%|β–Ž | 386/11704 [03:41<1:32:51, 2.03it/s]
686
  3%|β–Ž | 387/11704 [03:41<1:32:44, 2.03it/s]
687
  3%|β–Ž | 388/11704 [03:42<1:32:52, 2.03it/s]
688
  3%|β–Ž | 389/11704 [03:42<1:32:47, 2.03it/s]
689
  3%|β–Ž | 390/11704 [03:43<1:32:51, 2.03it/s]
690
  3%|β–Ž | 391/11704 [03:43<1:32:56, 2.03it/s]
691
  3%|β–Ž | 392/11704 [03:44<1:32:50, 2.03it/s]
692
  3%|β–Ž | 393/11704 [03:44<1:32:50, 2.03it/s]
693
  3%|β–Ž | 394/11704 [03:45<1:32:44, 2.03it/s]
694
  3%|β–Ž | 395/11704 [03:45<1:32:41, 2.03it/s]
695
  3%|β–Ž | 396/11704 [03:46<1:32:36, 2.04it/s]
696
  3%|β–Ž | 397/11704 [03:46<1:32:45, 2.03it/s]
697
  3%|β–Ž | 398/11704 [03:47<1:32:39, 2.03it/s]
698
  3%|β–Ž | 399/11704 [03:47<1:32:35, 2.03it/s]
699
  3%|β–Ž | 400/11704 [03:48<1:32:43, 2.03it/s]
700
 
701
+
702
  3%|β–Ž | 400/11704 [03:48<1:32:43, 2.03it/s]
703
  3%|β–Ž | 401/11704 [03:48<1:32:46, 2.03it/s]
704
  3%|β–Ž | 402/11704 [03:49<1:32:45, 2.03it/s]
705
  3%|β–Ž | 403/11704 [03:49<1:32:48, 2.03it/s]
706
  3%|β–Ž | 404/11704 [03:50<1:32:43, 2.03it/s]
707
  3%|β–Ž | 405/11704 [03:50<1:32:41, 2.03it/s]
708
  3%|β–Ž | 406/11704 [03:51<1:32:39, 2.03it/s]
709
  3%|β–Ž | 407/11704 [03:51<1:32:34, 2.03it/s]
710
  3%|β–Ž | 408/11704 [03:52<1:32:39, 2.03it/s]
711
  3%|β–Ž | 409/11704 [03:52<1:32:39, 2.03it/s]
712
  4%|β–Ž | 410/11704 [03:53<1:32:35, 2.03it/s]
713
  4%|β–Ž | 411/11704 [03:53<1:32:35, 2.03it/s]
714
  4%|β–Ž | 412/11704 [03:53<1:32:32, 2.03it/s]
715
  4%|β–Ž | 413/11704 [03:54<1:32:25, 2.04it/s]
716
  4%|β–Ž | 414/11704 [03:54<1:32:29, 2.03it/s]
717
  4%|β–Ž | 415/11704 [03:55<1:32:26, 2.04it/s]
718
  4%|β–Ž | 416/11704 [03:55<1:32:17, 2.04it/s]
719
  4%|β–Ž | 417/11704 [03:56<1:32:23, 2.04it/s]
720
  4%|β–Ž | 418/11704 [03:56<1:32:26, 2.03it/s]
721
  4%|β–Ž | 419/11704 [03:57<1:32:22, 2.04it/s]
722
  4%|β–Ž | 420/11704 [03:57<1:32:24, 2.04it/s]
723
  4%|β–Ž | 421/11704 [03:58<1:32:22, 2.04it/s]
724
  4%|β–Ž | 422/11704 [03:58<1:32:19, 2.04it/s]
725
  4%|β–Ž | 423/11704 [03:59<1:32:20, 2.04it/s]
726
  4%|β–Ž | 424/11704 [03:59<1:32:19, 2.04it/s]
727
  4%|β–Ž | 425/11704 [04:00<1:32:14, 2.04it/s]
728
 
729
+
730
  4%|β–Ž | 425/11704 [04:00<1:32:14, 2.04it/s]
731
  4%|β–Ž | 426/11704 [04:00<1:32:21, 2.04it/s]
732
  4%|β–Ž | 427/11704 [04:01<1:32:20, 2.04it/s]
733
  4%|β–Ž | 428/11704 [04:01<1:32:16, 2.04it/s]
734
  4%|β–Ž | 429/11704 [04:02<1:32:26, 2.03it/s]
735
  4%|β–Ž | 430/11704 [04:02<1:32:20, 2.03it/s]
736
  4%|β–Ž | 431/11704 [04:03<1:32:17, 2.04it/s]
737
  4%|β–Ž | 432/11704 [04:03<1:32:23, 2.03it/s]
738
  4%|β–Ž | 433/11704 [04:04<1:32:20, 2.03it/s]
739
  4%|β–Ž | 434/11704 [04:04<1:32:17, 2.04it/s]
740
  4%|β–Ž | 435/11704 [04:05<1:32:23, 2.03it/s]
741
  4%|β–Ž | 436/11704 [04:05<1:32:19, 2.03it/s]
742
  4%|β–Ž | 437/11704 [04:06<1:32:21, 2.03it/s]
743
  4%|β–Ž | 438/11704 [04:06<1:32:25, 2.03it/s]
744
  4%|▍ | 439/11704 [04:07<1:32:13, 2.04it/s]
745
  4%|▍ | 440/11704 [04:07<1:32:22, 2.03it/s]
746
  4%|▍ | 441/11704 [04:08<1:32:18, 2.03it/s]
747
  4%|▍ | 442/11704 [04:08<1:32:09, 2.04it/s]
748
  4%|▍ | 443/11704 [04:09<1:32:17, 2.03it/s]
749
  4%|▍ | 444/11704 [04:09<1:32:16, 2.03it/s]
750
  4%|▍ | 445/11704 [04:10<1:32:08, 2.04it/s]
751
  4%|▍ | 446/11704 [04:10<1:32:13, 2.03it/s]
752
  4%|▍ | 447/11704 [04:11<1:32:10, 2.04it/s]
753
  4%|▍ | 448/11704 [04:11<1:32:08, 2.04it/s]
754
  4%|▍ | 449/11704 [04:12<1:32:12, 2.03it/s]
755
  4%|▍ | 450/11704 [04:12<1:32:11, 2.03it/s]{'loss': 4.4307, 'grad_norm': 0.6776960492134094, 'learning_rate': 0.0003842869342442357, 'epoch': 0.54}
756
 
757
+
758
  4%|▍ | 450/11704 [04:12<1:32:11, 2.03it/s]
759
  4%|▍ | 451/11704 [04:13<1:32:13, 2.03it/s]
760
  4%|▍ | 452/11704 [04:13<1:32:17, 2.03it/s]
761
  4%|▍ | 453/11704 [04:14<1:32:13, 2.03it/s]
762
  4%|▍ | 454/11704 [04:14<1:32:16, 2.03it/s]
763
  4%|▍ | 455/11704 [04:15<1:32:11, 2.03it/s]
764
  4%|▍ | 456/11704 [04:15<1:32:06, 2.04it/s]
765
  4%|▍ | 457/11704 [04:16<1:32:11, 2.03it/s]
766
  4%|▍ | 458/11704 [04:16<1:32:09, 2.03it/s]
767
  4%|▍ | 459/11704 [04:17<1:32:04, 2.04it/s]
768
  4%|▍ | 460/11704 [04:17<1:32:09, 2.03it/s]
769
  4%|▍ | 461/11704 [04:18<1:32:10, 2.03it/s]
770
  4%|▍ | 462/11704 [04:18<1:32:04, 2.03it/s]
771
  4%|▍ | 463/11704 [04:19<1:32:10, 2.03it/s]
772
  4%|▍ | 464/11704 [04:19<1:32:03, 2.04it/s]
773
  4%|▍ | 465/11704 [04:20<1:31:56, 2.04it/s]
774
  4%|▍ | 466/11704 [04:20<1:32:00, 2.04it/s]
775
  4%|▍ | 467/11704 [04:21<1:32:05, 2.03it/s]
776
  4%|▍ | 468/11704 [04:21<1:31:58, 2.04it/s]
777
  4%|▍ | 469/11704 [04:22<1:32:03, 2.03it/s]
778
  4%|▍ | 470/11704 [04:22<1:31:58, 2.04it/s]
779
  4%|▍ | 471/11704 [04:22<1:31:56, 2.04it/s]
780
  4%|▍ | 472/11704 [04:23<1:31:58, 2.04it/s]
781
  4%|▍ | 473/11704 [04:23<1:32:04, 2.03it/s]
782
  4%|▍ | 474/11704 [04:24<1:31:58, 2.04it/s]
783
  4%|▍ | 475/11704 [04:24<1:32:03, 2.03it/s]{'loss': 4.3608, 'grad_norm': 0.7181493043899536, 'learning_rate': 0.00040563620836891546, 'epoch': 0.57}
784
+
785
 
786
  4%|▍ | 475/11704 [04:24<1:32:03, 2.03it/s]
787
  4%|▍ | 476/11704 [04:25<1:32:02, 2.03it/s]
788
  4%|▍ | 477/11704 [04:25<1:32:02, 2.03it/s]
789
  4%|▍ | 478/11704 [04:26<1:32:07, 2.03it/s]
790
  4%|▍ | 479/11704 [04:26<1:31:59, 2.03it/s]
791
  4%|▍ | 480/11704 [04:27<1:32:01, 2.03it/s]
792
  4%|▍ | 481/11704 [04:27<1:32:03, 2.03it/s]
793
  4%|▍ | 482/11704 [04:28<1:31:58, 2.03it/s]
794
  4%|▍ | 483/11704 [04:28<1:32:07, 2.03it/s]
795
  4%|▍ | 484/11704 [04:29<1:32:02, 2.03it/s]
796
  4%|▍ | 485/11704 [04:29<1:31:57, 2.03it/s]
797
  4%|▍ | 486/11704 [04:30<1:31:53, 2.03it/s]
798
  4%|▍ | 487/11704 [04:30<1:31:57, 2.03it/s]
799
  4%|▍ | 488/11704 [04:31<1:31:56, 2.03it/s]
800
  4%|▍ | 489/11704 [04:31<1:31:56, 2.03it/s]
801
  4%|▍ | 490/11704 [04:32<1:31:57, 2.03it/s]
802
  4%|▍ | 491/11704 [04:32<1:31:54, 2.03it/s]
803
  4%|▍ | 492/11704 [04:33<1:31:57, 2.03it/s]
804
  4%|▍ | 493/11704 [04:33<1:31:54, 2.03it/s]
805
  4%|▍ | 494/11704 [04:34<1:31:55, 2.03it/s]
806
  4%|▍ | 495/11704 [04:34<1:31:55, 2.03it/s]
807
  4%|▍ | 496/11704 [04:35<1:31:53, 2.03it/s]
808
  4%|▍ | 497/11704 [04:35<1:31:52, 2.03it/s]
809
  4%|▍ | 498/11704 [04:36<1:31:53, 2.03it/s]
810
  4%|▍ | 499/11704 [04:36<1:31:47, 2.03it/s]
811
  4%|▍ | 500/11704 [04:37<1:31:42, 2.04it/s]{'loss': 4.2807, 'grad_norm': 0.6647967100143433, 'learning_rate': 0.0004269854824935952, 'epoch': 0.6}
812
+
813
 
814
  4%|▍ | 500/11704 [04:37<1:31:42, 2.04it/s]
815
  4%|▍ | 501/11704 [04:37<1:31:46, 2.03it/s]
816
  4%|▍ | 502/11704 [04:38<1:31:41, 2.04it/s]
817
  4%|▍ | 503/11704 [04:38<1:31:44, 2.03it/s]
818
  4%|▍ | 504/11704 [04:39<1:31:46, 2.03it/s]
819
  4%|▍ | 505/11704 [04:39<1:31:44, 2.03it/s]
820
  4%|▍ | 506/11704 [04:40<1:31:47, 2.03it/s]
821
  4%|▍ | 507/11704 [04:40<1:31:46, 2.03it/s]
822
  4%|▍ | 508/11704 [04:41<1:31:43, 2.03it/s]
823
  4%|▍ | 509/11704 [04:41<1:31:50, 2.03it/s]
824
  4%|▍ | 510/11704 [04:42<1:31:44, 2.03it/s]
825
  4%|▍ | 511/11704 [04:42<1:31:40, 2.04it/s]
826
  4%|▍ | 512/11704 [04:43<1:31:45, 2.03it/s]
827
  4%|▍ | 513/11704 [04:43<1:31:42, 2.03it/s]
828
  4%|▍ | 514/11704 [04:44<1:31:42, 2.03it/s]
829
  4%|▍ | 515/11704 [04:44<1:31:45, 2.03it/s]
830
  4%|▍ | 516/11704 [04:45<1:31:43, 2.03it/s]
831
  4%|▍ | 517/11704 [04:45<1:31:48, 2.03it/s]
832
  4%|▍ | 518/11704 [04:46<1:32:34, 2.01it/s]
833
  4%|▍ | 519/11704 [04:46<1:32:16, 2.02it/s]
834
  4%|▍ | 520/11704 [04:47<1:31:57, 2.03it/s]
835
  4%|▍ | 521/11704 [04:47<1:31:50, 2.03it/s]
836
  4%|▍ | 522/11704 [04:48<1:32:03, 2.02it/s]
837
  4%|▍ | 523/11704 [04:48<1:31:50, 2.03it/s]
838
  4%|▍ | 524/11704 [04:49<1:31:45, 2.03it/s]
839
  4%|▍ | 525/11704 [04:49<1:31:44, 2.03it/s]{'loss': 4.221, 'grad_norm': 0.5851206183433533, 'learning_rate': 0.00044833475661827497, 'epoch': 0.63}
840
+
841
 
842
  4%|▍ | 525/11704 [04:49<1:31:44, 2.03it/s]
843
  4%|▍ | 526/11704 [04:50<1:31:38, 2.03it/s]
844
  5%|▍ | 527/11704 [04:50<1:31:31, 2.04it/s]
845
  5%|▍ | 528/11704 [04:51<1:31:36, 2.03it/s]
846
  5%|▍ | 529/11704 [04:51<1:31:34, 2.03it/s]
847
  5%|▍ | 530/11704 [04:52<1:31:33, 2.03it/s]
848
  5%|▍ | 531/11704 [04:52<1:31:38, 2.03it/s]
849
  5%|▍ | 532/11704 [04:53<1:31:36, 2.03it/s]
850
  5%|▍ | 533/11704 [04:53<1:31:28, 2.04it/s]
851
  5%|▍ | 534/11704 [04:53<1:31:30, 2.03it/s]
852
  5%|▍ | 535/11704 [04:54<1:31:34, 2.03it/s]
853
  5%|▍ | 536/11704 [04:54<1:31:33, 2.03it/s]
854
  5%|▍ | 537/11704 [04:55<1:31:29, 2.03it/s]
855
  5%|▍ | 538/11704 [04:55<1:31:32, 2.03it/s]
856
  5%|▍ | 539/11704 [04:56<1:31:29, 2.03it/s]
857
  5%|▍ | 540/11704 [04:56<1:31:23, 2.04it/s]
858
  5%|▍ | 541/11704 [04:57<1:31:26, 2.03it/s]
859
  5%|▍ | 542/11704 [04:57<1:31:21, 2.04it/s]
860
  5%|▍ | 543/11704 [04:58<1:31:19, 2.04it/s]
861
  5%|▍ | 544/11704 [04:58<1:31:27, 2.03it/s]
862
  5%|▍ | 545/11704 [04:59<1:31:24, 2.03it/s]
863
  5%|▍ | 546/11704 [04:59<1:31:18, 2.04it/s]
864
  5%|▍ | 547/11704 [05:00<1:31:20, 2.04it/s]
865
  5%|▍ | 548/11704 [05:00<1:31:17, 2.04it/s]
866
  5%|▍ | 549/11704 [05:01<1:31:09, 2.04it/s]
867
  5%|▍ | 550/11704 [05:01<1:31:18, 2.04it/s]{'loss': 4.1699, 'grad_norm': 0.8667837381362915, 'learning_rate': 0.00046968403074295473, 'epoch': 0.66}
868
+
869
 
870
  5%|▍ | 550/11704 [05:01<1:31:18, 2.04it/s]
871
  5%|▍ | 551/11704 [05:02<1:31:19, 2.04it/s]
872
  5%|▍ | 552/11704 [05:02<1:31:17, 2.04it/s]
873
  5%|▍ | 553/11704 [05:03<1:31:25, 2.03it/s]
874
  5%|▍ | 554/11704 [05:03<1:31:22, 2.03it/s]
875
  5%|▍ | 555/11704 [05:04<1:31:22, 2.03it/s]
876
  5%|▍ | 556/11704 [05:04<1:31:25, 2.03it/s]
877
  5%|▍ | 557/11704 [05:05<1:31:21, 2.03it/s]
878
  5%|▍ | 558/11704 [05:05<1:31:20, 2.03it/s]
879
  5%|▍ | 559/11704 [05:06<1:31:21, 2.03it/s]
880
  5%|▍ | 560/11704 [05:06<1:31:22, 2.03it/s]
881
  5%|▍ | 561/11704 [05:07<1:31:22, 2.03it/s]
882
  5%|▍ | 562/11704 [05:07<1:31:17, 2.03it/s]
883
  5%|▍ | 563/11704 [05:08<1:31:13, 2.04it/s]
884
  5%|▍ | 564/11704 [05:08<1:31:18, 2.03it/s]
885
  5%|▍ | 565/11704 [05:09<1:31:13, 2.03it/s]
886
  5%|▍ | 566/11704 [05:09<1:31:07, 2.04it/s]
887
  5%|▍ | 567/11704 [05:10<1:31:11, 2.04it/s]
888
  5%|▍ | 568/11704 [05:10<1:31:09, 2.04it/s]
889
  5%|▍ | 569/11704 [05:11<1:31:03, 2.04it/s]
890
  5%|▍ | 570/11704 [05:11<1:31:11, 2.03it/s]
891
  5%|▍ | 571/11704 [05:12<1:31:07, 2.04it/s]
892
  5%|▍ | 572/11704 [05:12<1:31:04, 2.04it/s]
893
  5%|▍ | 573/11704 [05:13<1:31:09, 2.04it/s]
894
  5%|▍ | 574/11704 [05:13<1:31:05, 2.04it/s]
895
  5%|▍ | 575/11704 [05:14<1:31:05, 2.04it/s]{'loss': 4.1233, 'grad_norm': 0.720784604549408, 'learning_rate': 0.0004910333048676345, 'epoch': 0.69}
896
+
897
 
898
  5%|▍ | 575/11704 [05:14<1:31:05, 2.04it/s]
899
  5%|▍ | 576/11704 [05:14<1:31:14, 2.03it/s]
900
  5%|▍ | 577/11704 [05:15<1:31:06, 2.04it/s]
901
  5%|▍ | 578/11704 [05:15<1:31:06, 2.04it/s]
902
  5%|▍ | 579/11704 [05:16<1:31:10, 2.03it/s]
903
  5%|▍ | 580/11704 [05:16<1:31:03, 2.04it/s]
904
  5%|▍ | 581/11704 [05:17<1:31:00, 2.04it/s]
905
  5%|▍ | 582/11704 [05:17<1:31:08, 2.03it/s]
906
  5%|▍ | 583/11704 [05:18<1:31:06, 2.03it/s]
907
  5%|▍ | 584/11704 [05:18<1:31:09, 2.03it/s]
908
  5%|▍ | 585/11704 [05:19<1:31:07, 2.03it/s]
909
  5%|β–Œ | 586/11704 [05:19<1:31:03, 2.04it/s]
910
  5%|β–Œ | 587/11704 [05:20<1:31:06, 2.03it/s]
911
  5%|β–Œ | 588/11704 [05:20<1:31:03, 2.03it/s]
912
  5%|β–Œ | 589/11704 [05:21<1:30:56, 2.04it/s]
913
  5%|β–Œ | 590/11704 [05:21<1:31:02, 2.03it/s]
914
  5%|β–Œ | 591/11704 [05:21<1:31:02, 2.03it/s]
915
  5%|β–Œ | 592/11704 [05:22<1:30:59, 2.04it/s]
916
  5%|β–Œ | 593/11704 [05:22<1:31:05, 2.03it/s]
917
  5%|β–Œ | 594/11704 [05:23<1:31:00, 2.03it/s]
918
  5%|β–Œ | 595/11704 [05:23<1:30:57, 2.04it/s]
919
  5%|β–Œ | 596/11704 [05:24<1:30:54, 2.04it/s]
920
  5%|β–Œ | 597/11704 [05:24<1:30:55, 2.04it/s]
921
  5%|β–Œ | 598/11704 [05:25<1:30:59, 2.03it/s]
922
  5%|β–Œ | 599/11704 [05:25<1:31:07, 2.03it/s]
923
  5%|β–Œ | 600/11704 [05:26<1:31:00, 2.03it/s]{'loss': 4.0671, 'grad_norm': 0.5922385454177856, 'learning_rate': 0.0005123825789923142, 'epoch': 0.72}
924
+
925
 
926
  5%|β–Œ | 600/11704 [05:26<1:31:00, 2.03it/s]
927
  5%|β–Œ | 601/11704 [05:26<1:31:07, 2.03it/s]
928
  5%|β–Œ | 602/11704 [05:27<1:31:07, 2.03it/s]
929
  5%|β–Œ | 603/11704 [05:27<1:30:58, 2.03it/s]
930
  5%|β–Œ | 604/11704 [05:28<1:31:02, 2.03it/s]
931
  5%|β–Œ | 605/11704 [05:28<1:30:57, 2.03it/s]
932
  5%|β–Œ | 606/11704 [05:29<1:30:54, 2.03it/s]
933
  5%|β–Œ | 607/11704 [05:29<1:31:00, 2.03it/s]
934
  5%|β–Œ | 608/11704 [05:30<1:30:53, 2.03it/s]
935
  5%|β–Œ | 609/11704 [05:30<1:30:51, 2.04it/s]
936
  5%|β–Œ | 610/11704 [05:31<1:30:58, 2.03it/s]
937
  5%|β–Œ | 611/11704 [05:31<1:30:50, 2.04it/s]
938
  5%|β–Œ | 612/11704 [05:32<1:30:44, 2.04it/s]
939
  5%|β–Œ | 613/11704 [05:32<1:30:56, 2.03it/s]
940
  5%|β–Œ | 614/11704 [05:33<1:30:48, 2.04it/s]
941
  5%|β–Œ | 615/11704 [05:33<1:39:03, 1.87it/s]
942
  5%|β–Œ | 616/11704 [05:34<1:44:26, 1.77it/s]
943
  5%|β–Œ | 617/11704 [05:35<1:40:15, 1.84it/s]
944
  5%|β–Œ | 618/11704 [05:35<1:37:27, 1.90it/s]
945
  5%|β–Œ | 619/11704 [05:36<1:35:28, 1.93it/s]
946
  5%|β–Œ | 620/11704 [05:36<1:33:54, 1.97it/s]
947
  5%|β–Œ | 621/11704 [05:37<1:33:01, 1.99it/s]
948
  5%|β–Œ | 622/11704 [05:37<1:32:24, 2.00it/s]
949
  5%|β–Œ | 623/11704 [05:38<1:31:51, 2.01it/s]
950
  5%|β–Œ | 624/11704 [05:38<1:31:28, 2.02it/s]
951
  5%|β–Œ | 625/11704 [05:38<1:31:20, 2.02it/s]{'loss': 4.0331, 'grad_norm': 0.5989447236061096, 'learning_rate': 0.000533731853116994, 'epoch': 0.75}
952
+
953
 
954
  5%|β–Œ | 625/11704 [05:39<1:31:20, 2.02it/s]
955
  5%|β–Œ | 626/11704 [05:39<1:31:10, 2.03it/s]
956
  5%|β–Œ | 627/11704 [05:39<1:30:59, 2.03it/s]
957
  5%|β–Œ | 628/11704 [05:40<1:31:00, 2.03it/s]
958
  5%|β–Œ | 629/11704 [05:40<1:30:50, 2.03it/s]
959
  5%|β–Œ | 630/11704 [05:41<1:30:48, 2.03it/s]
960
  5%|β–Œ | 631/11704 [05:41<1:30:49, 2.03it/s]
961
  5%|β–Œ | 632/11704 [05:42<1:30:45, 2.03it/s]
962
  5%|β–Œ | 633/11704 [05:42<1:30:51, 2.03it/s]
963
  5%|β–Œ | 634/11704 [05:43<1:30:44, 2.03it/s]
964
  5%|β–Œ | 635/11704 [05:43<1:30:39, 2.04it/s]
965
  5%|β–Œ | 636/11704 [05:44<1:30:46, 2.03it/s]
966
  5%|β–Œ | 637/11704 [05:44<1:30:45, 2.03it/s]
967
  5%|β–Œ | 638/11704 [05:45<1:30:43, 2.03it/s]
968
  5%|β–Œ | 639/11704 [05:45<1:30:43, 2.03it/s]
969
  5%|β–Œ | 640/11704 [05:46<1:30:38, 2.03it/s]
970
  5%|β–Œ | 641/11704 [05:46<1:30:40, 2.03it/s]
971
  5%|β–Œ | 642/11704 [05:47<1:30:42, 2.03it/s]
972
  5%|β–Œ | 643/11704 [05:47<1:30:39, 2.03it/s]
973
  6%|β–Œ | 644/11704 [05:48<1:30:44, 2.03it/s]
974
  6%|β–Œ | 645/11704 [05:48<1:30:35, 2.03it/s]
975
  6%|β–Œ | 646/11704 [05:49<1:30:40, 2.03it/s]
976
  6%|β–Œ | 647/11704 [05:49<1:30:42, 2.03it/s]
977
  6%|β–Œ | 648/11704 [05:50<1:30:35, 2.03it/s]
978
  6%|β–Œ | 649/11704 [05:50<1:30:36, 2.03it/s]
979
  6%|β–Œ | 650/11704 [05:51<1:30:42, 2.03it/s]{'loss': 3.9926, 'grad_norm': 0.630885899066925, 'learning_rate': 0.0005550811272416738, 'epoch': 0.78}
980
+
981
 
982
  6%|β–Œ | 650/11704 [05:51<1:30:42, 2.03it/s]
983
  6%|β–Œ | 651/11704 [05:51<1:30:40, 2.03it/s]
984
  6%|β–Œ | 652/11704 [05:52<1:30:43, 2.03it/s]
985
  6%|β–Œ | 653/11704 [05:52<1:30:36, 2.03it/s]
986
  6%|β–Œ | 654/11704 [05:53<1:30:33, 2.03it/s]
987
  6%|β–Œ | 655/11704 [05:53<1:30:43, 2.03it/s]
988
  6%|β–Œ | 656/11704 [05:54<1:30:37, 2.03it/s]
989
  6%|β–Œ | 657/11704 [05:54<1:30:37, 2.03it/s]
990
  6%|β–Œ | 658/11704 [05:55<1:30:42, 2.03it/s]
991
  6%|β–Œ | 659/11704 [05:55<1:30:42, 2.03it/s]
992
  6%|β–Œ | 660/11704 [05:56<1:30:41, 2.03it/s]
993
  6%|β–Œ | 661/11704 [05:56<1:30:43, 2.03it/s]
994
  6%|β–Œ | 662/11704 [05:57<1:30:33, 2.03it/s]
995
  6%|β–Œ | 663/11704 [05:57<1:30:33, 2.03it/s]
996
  6%|β–Œ | 664/11704 [05:58<1:30:35, 2.03it/s]
997
  6%|β–Œ | 665/11704 [05:58<1:30:35, 2.03it/s]
998
  6%|β–Œ | 666/11704 [05:59<1:30:29, 2.03it/s]
999
  6%|β–Œ | 667/11704 [05:59<1:30:36, 2.03it/s]
1000
  6%|β–Œ | 668/11704 [06:00<1:30:36, 2.03it/s]
1001
  6%|β–Œ | 669/11704 [06:00<1:30:32, 2.03it/s]
1002
  6%|β–Œ | 670/11704 [06:01<1:30:35, 2.03it/s]
1003
  6%|β–Œ | 671/11704 [06:01<1:30:30, 2.03it/s]
1004
  6%|β–Œ | 672/11704 [06:02<1:30:28, 2.03it/s]
1005
  6%|β–Œ | 673/11704 [06:02<1:30:26, 2.03it/s]
1006
  6%|β–Œ | 674/11704 [06:03<1:30:24, 2.03it/s]
1007
  6%|β–Œ | 675/11704 [06:03<1:30:22, 2.03it/s]{'loss': 3.9532, 'grad_norm': 0.6419950723648071, 'learning_rate': 0.0005764304013663536, 'epoch': 0.81}
1008
+
1009
 
1010
  6%|β–Œ | 675/11704 [06:03<1:30:22, 2.03it/s]
1011
  6%|β–Œ | 676/11704 [06:04<1:30:31, 2.03it/s]
1012
  6%|β–Œ | 677/11704 [06:04<1:30:28, 2.03it/s]
1013
  6%|β–Œ | 678/11704 [06:05<1:30:33, 2.03it/s]
1014
  6%|β–Œ | 679/11704 [06:05<1:30:34, 2.03it/s]
1015
  6%|β–Œ | 680/11704 [06:06<1:30:24, 2.03it/s]
1016
  6%|β–Œ | 681/11704 [06:06<1:30:30, 2.03it/s]
1017
  6%|β–Œ | 682/11704 [06:07<1:30:29, 2.03it/s]
1018
  6%|β–Œ | 683/11704 [06:07<1:30:31, 2.03it/s]
1019
  6%|β–Œ | 684/11704 [06:08<1:30:31, 2.03it/s]
1020
  6%|β–Œ | 685/11704 [06:08<1:30:25, 2.03it/s]
1021
  6%|β–Œ | 686/11704 [06:09<1:30:28, 2.03it/s]
1022
  6%|β–Œ | 687/11704 [06:09<1:30:25, 2.03it/s]
1023
  6%|β–Œ | 688/11704 [06:10<1:30:24, 2.03it/s]
1024
  6%|β–Œ | 689/11704 [06:10<1:30:25, 2.03it/s]
1025
  6%|β–Œ | 690/11704 [06:10<1:30:24, 2.03it/s]
1026
  6%|β–Œ | 691/11704 [06:11<1:30:24, 2.03it/s]
1027
  6%|β–Œ | 692/11704 [06:11<1:30:19, 2.03it/s]
1028
  6%|β–Œ | 693/11704 [06:12<1:30:15, 2.03it/s]
1029
  6%|β–Œ | 694/11704 [06:12<1:30:20, 2.03it/s]
1030
  6%|β–Œ | 695/11704 [06:13<1:30:17, 2.03it/s]
1031
  6%|β–Œ | 696/11704 [06:13<1:30:23, 2.03it/s]
1032
  6%|β–Œ | 697/11704 [06:14<1:30:18, 2.03it/s]
1033
  6%|β–Œ | 698/11704 [06:14<1:30:13, 2.03it/s]
1034
  6%|β–Œ | 699/11704 [06:15<1:30:15, 2.03it/s]
1035
  6%|β–Œ | 700/11704 [06:15<1:30:13, 2.03it/s]{'loss': 3.9036, 'grad_norm': 0.5995571613311768, 'learning_rate': 0.0005977796754910333, 'epoch': 0.84}
1036
+
1037
 
1038
  6%|β–Œ | 700/11704 [06:15<1:30:13, 2.03it/s]
1039
  6%|β–Œ | 701/11704 [06:16<1:30:21, 2.03it/s]
1040
  6%|β–Œ | 702/11704 [06:16<1:30:22, 2.03it/s]
1041
  6%|β–Œ | 703/11704 [06:17<1:30:16, 2.03it/s]
1042
  6%|β–Œ | 704/11704 [06:17<1:30:19, 2.03it/s]
1043
  6%|β–Œ | 705/11704 [06:18<1:30:16, 2.03it/s]
1044
  6%|β–Œ | 706/11704 [06:18<1:30:18, 2.03it/s]
1045
  6%|β–Œ | 707/11704 [06:19<1:30:13, 2.03it/s]
1046
  6%|β–Œ | 708/11704 [06:19<1:30:10, 2.03it/s]
1047
  6%|β–Œ | 709/11704 [06:20<1:30:16, 2.03it/s]
1048
  6%|β–Œ | 710/11704 [06:20<1:30:10, 2.03it/s]
1049
  6%|β–Œ | 711/11704 [06:21<1:30:22, 2.03it/s]
1050
  6%|β–Œ | 712/11704 [06:21<1:30:14, 2.03it/s]
1051
  6%|β–Œ | 713/11704 [06:22<1:30:10, 2.03it/s]
1052
  6%|β–Œ | 714/11704 [06:22<1:30:16, 2.03it/s]
1053
  6%|β–Œ | 715/11704 [06:23<1:30:10, 2.03it/s]
1054
  6%|β–Œ | 716/11704 [06:23<1:30:13, 2.03it/s]
1055
  6%|β–Œ | 717/11704 [06:24<1:30:17, 2.03it/s]
1056
  6%|β–Œ | 718/11704 [06:24<1:30:09, 2.03it/s]
1057
  6%|β–Œ | 719/11704 [06:25<1:30:13, 2.03it/s]
1058
  6%|β–Œ | 720/11704 [06:25<1:30:05, 2.03it/s]
1059
  6%|β–Œ | 721/11704 [06:26<1:30:09, 2.03it/s]
1060
  6%|β–Œ | 722/11704 [06:26<1:30:09, 2.03it/s]
1061
  6%|β–Œ | 723/11704 [06:27<1:30:08, 2.03it/s]
1062
  6%|β–Œ | 724/11704 [06:27<1:30:10, 2.03it/s]
1063
  6%|β–Œ | 725/11704 [06:28<1:30:07, 2.03it/s]{'loss': 3.8761, 'grad_norm': 0.5167131423950195, 'learning_rate': 0.000619128949615713, 'epoch': 0.87}
1064
+
1065
 
1066
  6%|β–Œ | 725/11704 [06:28<1:30:07, 2.03it/s]
1067
  6%|β–Œ | 726/11704 [06:28<1:30:10, 2.03it/s]
1068
  6%|β–Œ | 727/11704 [06:29<1:30:07, 2.03it/s]
1069
  6%|β–Œ | 728/11704 [06:29<1:30:06, 2.03it/s]
1070
  6%|β–Œ | 729/11704 [06:30<1:30:10, 2.03it/s]
1071
  6%|β–Œ | 730/11704 [06:30<1:30:08, 2.03it/s]
1072
  6%|β–Œ | 731/11704 [06:31<1:30:10, 2.03it/s]
1073
  6%|β–‹ | 732/11704 [06:31<1:30:04, 2.03it/s]
1074
  6%|β–‹ | 733/11704 [06:32<1:30:01, 2.03it/s]
1075
  6%|β–‹ | 734/11704 [06:32<1:30:04, 2.03it/s]
1076
  6%|β–‹ | 735/11704 [06:33<1:29:58, 2.03it/s]
1077
  6%|β–‹ | 736/11704 [06:33<1:30:04, 2.03it/s]
1078
  6%|β–‹ | 737/11704 [06:34<1:30:03, 2.03it/s]
1079
  6%|β–‹ | 738/11704 [06:34<1:30:02, 2.03it/s]
1080
  6%|β–‹ | 739/11704 [06:35<1:30:03, 2.03it/s]
1081
  6%|β–‹ | 740/11704 [06:35<1:29:57, 2.03it/s]
1082
  6%|β–‹ | 741/11704 [06:36<1:30:03, 2.03it/s]
1083
  6%|β–‹ | 742/11704 [06:36<1:30:01, 2.03it/s]
1084
  6%|β–‹ | 743/11704 [06:37<1:29:59, 2.03it/s]
1085
  6%|β–‹ | 744/11704 [06:37<1:30:00, 2.03it/s]
1086
  6%|β–‹ | 745/11704 [06:38<1:29:54, 2.03it/s]
1087
  6%|β–‹ | 746/11704 [06:38<1:30:01, 2.03it/s]
1088
  6%|β–‹ | 747/11704 [06:39<1:30:00, 2.03it/s]
1089
  6%|β–‹ | 748/11704 [06:39<1:29:55, 2.03it/s]
1090
  6%|β–‹ | 749/11704 [06:40<1:30:01, 2.03it/s]
1091
  6%|β–‹ | 750/11704 [06:40<1:29:54, 2.03it/s]{'loss': 3.851, 'grad_norm': 0.5299991369247437, 'learning_rate': 0.0006404782237403928, 'epoch': 0.9}
1092
+
1093
 
1094
  6%|β–‹ | 750/11704 [06:40<1:29:54, 2.03it/s]
1095
  6%|β–‹ | 751/11704 [06:41<1:29:57, 2.03it/s]
1096
  6%|β–‹ | 752/11704 [06:41<1:30:00, 2.03it/s]
1097
  6%|β–‹ | 753/11704 [06:42<1:29:52, 2.03it/s]
1098
  6%|β–‹ | 754/11704 [06:42<1:30:01, 2.03it/s]
1099
  6%|β–‹ | 755/11704 [06:43<1:29:53, 2.03it/s]
1100
  6%|β–‹ | 756/11704 [06:43<1:29:51, 2.03it/s]
1101
  6%|β–‹ | 757/11704 [06:43<1:29:52, 2.03it/s]
1102
  6%|β–‹ | 758/11704 [06:44<1:29:51, 2.03it/s]
1103
  6%|β–‹ | 759/11704 [06:44<1:29:56, 2.03it/s]
1104
  6%|β–‹ | 760/11704 [06:45<1:29:50, 2.03it/s]
1105
  7%|β–‹ | 761/11704 [06:45<1:29:48, 2.03it/s]
1106
  7%|β–‹ | 762/11704 [06:46<1:29:55, 2.03it/s]
1107
  7%|β–‹ | 763/11704 [06:46<1:29:48, 2.03it/s]
1108
  7%|β–‹ | 764/11704 [06:47<1:29:48, 2.03it/s]
1109
  7%|β–‹ | 765/11704 [06:47<1:29:54, 2.03it/s]
1110
  7%|β–‹ | 766/11704 [06:48<1:29:47, 2.03it/s]
1111
  7%|β–‹ | 767/11704 [06:48<1:29:56, 2.03it/s]
1112
  7%|β–‹ | 768/11704 [06:49<1:29:52, 2.03it/s]
1113
  7%|β–‹ | 769/11704 [06:49<1:29:52, 2.03it/s]
1114
  7%|β–‹ | 770/11704 [06:50<1:29:50, 2.03it/s]
1115
  7%|β–‹ | 771/11704 [06:50<1:29:44, 2.03it/s]
1116
  7%|β–‹ | 772/11704 [06:51<1:29:44, 2.03it/s]
1117
  7%|β–‹ | 773/11704 [06:51<1:29:36, 2.03it/s]
1118
  7%|β–‹ | 774/11704 [06:52<1:29:42, 2.03it/s]
1119
  7%|β–‹ | 775/11704 [06:52<1:29:42, 2.03it/s]{'loss': 3.7964, 'grad_norm': 0.5229047536849976, 'learning_rate': 0.0006618274978650726, 'epoch': 0.93}
1120
+
1121
 
1122
  7%|β–‹ | 775/11704 [06:52<1:29:42, 2.03it/s]
1123
  7%|β–‹ | 776/11704 [06:53<1:29:43, 2.03it/s]
1124
  7%|β–‹ | 777/11704 [06:53<1:29:50, 2.03it/s]
1125
  7%|β–‹ | 778/11704 [06:54<1:29:41, 2.03it/s]
1126
  7%|β–‹ | 779/11704 [06:54<1:29:43, 2.03it/s]
1127
  7%|β–‹ | 780/11704 [06:55<1:29:41, 2.03it/s]
1128
  7%|β–‹ | 781/11704 [06:55<1:29:40, 2.03it/s]
1129
  7%|β–‹ | 782/11704 [06:56<1:29:39, 2.03it/s]
1130
  7%|β–‹ | 783/11704 [06:56<1:29:35, 2.03it/s]
1131
  7%|β–‹ | 784/11704 [06:57<1:29:34, 2.03it/s]
1132
  7%|β–‹ | 785/11704 [06:57<1:29:40, 2.03it/s]
1133
  7%|β–‹ | 786/11704 [06:58<1:29:40, 2.03it/s]
1134
  7%|β–‹ | 787/11704 [06:58<1:29:44, 2.03it/s]
1135
  7%|β–‹ | 788/11704 [06:59<1:29:40, 2.03it/s]
1136
  7%|β–‹ | 789/11704 [06:59<1:29:42, 2.03it/s]
1137
  7%|β–‹ | 790/11704 [07:00<1:29:42, 2.03it/s]
1138
  7%|β–‹ | 791/11704 [07:00<1:29:41, 2.03it/s]
1139
  7%|β–‹ | 792/11704 [07:01<1:29:39, 2.03it/s]
1140
  7%|β–‹ | 793/11704 [07:01<1:29:34, 2.03it/s]
1141
  7%|β–‹ | 794/11704 [07:02<1:29:26, 2.03it/s]
1142
  7%|β–‹ | 795/11704 [07:02<1:29:29, 2.03it/s]
1143
  7%|β–‹ | 796/11704 [07:03<1:29:29, 2.03it/s]
1144
  7%|β–‹ | 797/11704 [07:03<1:29:29, 2.03it/s]
1145
  7%|β–‹ | 798/11704 [07:04<1:29:32, 2.03it/s]
1146
  7%|β–‹ | 799/11704 [07:04<1:29:33, 2.03it/s]
1147
  7%|β–‹ | 800/11704 [07:05<1:29:32, 2.03it/s]{'loss': 3.7888, 'grad_norm': 0.48627805709838867, 'learning_rate': 0.0006831767719897524, 'epoch': 0.96}
1148
+
1149
 
1150
  7%|β–‹ | 800/11704 [07:05<1:29:32, 2.03it/s]
1151
  7%|β–‹ | 801/11704 [07:05<1:29:38, 2.03it/s]
1152
  7%|β–‹ | 802/11704 [07:06<1:29:34, 2.03it/s]
1153
  7%|β–‹ | 803/11704 [07:06<1:29:34, 2.03it/s]
1154
  7%|β–‹ | 804/11704 [07:07<1:29:28, 2.03it/s]
1155
  7%|β–‹ | 805/11704 [07:07<1:29:23, 2.03it/s]
1156
  7%|β–‹ | 806/11704 [07:08<1:29:21, 2.03it/s]
1157
  7%|β–‹ | 807/11704 [07:08<1:29:29, 2.03it/s]
1158
  7%|β–‹ | 808/11704 [07:09<1:29:27, 2.03it/s]
1159
  7%|β–‹ | 809/11704 [07:09<1:29:26, 2.03it/s]
1160
  7%|β–‹ | 810/11704 [07:10<1:29:27, 2.03it/s]
1161
  7%|β–‹ | 811/11704 [07:10<1:29:23, 2.03it/s]
1162
  7%|β–‹ | 812/11704 [07:11<1:29:25, 2.03it/s]
1163
  7%|β–‹ | 813/11704 [07:11<1:29:24, 2.03it/s]
1164
  7%|β–‹ | 814/11704 [07:12<1:29:16, 2.03it/s]
1165
  7%|β–‹ | 815/11704 [07:12<1:29:08, 2.04it/s]
1166
  7%|β–‹ | 816/11704 [07:13<1:29:13, 2.03it/s]
1167
  7%|β–‹ | 817/11704 [07:13<1:29:13, 2.03it/s]
1168
  7%|β–‹ | 818/11704 [07:14<1:29:15, 2.03it/s]
1169
  7%|β–‹ | 819/11704 [07:14<1:29:19, 2.03it/s]
1170
  7%|β–‹ | 820/11704 [07:15<1:29:12, 2.03it/s]
1171
  7%|β–‹ | 821/11704 [07:15<1:29:15, 2.03it/s]
1172
  7%|β–‹ | 822/11704 [07:16<1:29:14, 2.03it/s]
1173
  7%|β–‹ | 823/11704 [07:16<1:29:12, 2.03it/s]
1174
  7%|β–‹ | 824/11704 [07:16<1:29:17, 2.03it/s]
1175
  7%|β–‹ | 825/11704 [07:17<1:29:15, 2.03it/s]{'loss': 3.7506, 'grad_norm': 0.5719705820083618, 'learning_rate': 0.0007045260461144322, 'epoch': 0.99}
1176
+
1177
 
1178
  7%|β–‹ | 825/11704 [07:17<1:29:15, 2.03it/s]
1179
  7%|β–‹ | 826/11704 [07:17<1:29:22, 2.03it/s]
1180
  7%|β–‹ | 827/11704 [07:18<1:29:19, 2.03it/s]
1181
  7%|β–‹ | 828/11704 [07:18<1:29:13, 2.03it/s]
1182
  7%|β–‹ | 829/11704 [07:19<1:29:19, 2.03it/s]
1183
  7%|β–‹ | 830/11704 [07:19<1:29:18, 2.03it/s]
1184
  7%|β–‹ | 831/11704 [07:20<1:29:20, 2.03it/s]
1185
  7%|β–‹ | 832/11704 [07:20<1:29:17, 2.03it/s]
1186
  7%|β–‹ | 833/11704 [07:21<1:29:13, 2.03it/s]
1187
  7%|β–‹ | 834/11704 [07:21<1:29:20, 2.03it/s]
1188
  7%|β–‹ | 835/11704 [07:22<1:29:11, 2.03it/s]
1189
  7%|β–‹ | 836/11704 [07:22<1:31:45, 1.97it/s]
1190
  7%|β–‹ | 837/11704 [07:34<11:50:27, 3.92s/it]
1191
  7%|β–‹ | 838/11704 [07:35<8:44:46, 2.90s/it]
1192
  7%|β–‹ | 839/11704 [07:35<6:33:59, 2.18s/it]
1193
  7%|β–‹ | 840/11704 [07:36<5:02:24, 1.67s/it]
1194
  7%|β–‹ | 841/11704 [07:36<3:58:30, 1.32s/it]
1195
  7%|β–‹ | 842/11704 [07:37<3:13:48, 1.07s/it]
1196
  7%|β–‹ | 843/11704 [07:37<2:42:26, 1.11it/s]
1197
  7%|β–‹ | 844/11704 [07:38<2:20:22, 1.29it/s]
1198
  7%|β–‹ | 845/11704 [07:38<2:05:46, 1.44it/s]
1199
  7%|β–‹ | 846/11704 [07:39<1:54:47, 1.58it/s]
1200
  7%|β–‹ | 847/11704 [07:39<1:47:02, 1.69it/s]
1201
  7%|β–‹ | 848/11704 [07:40<1:41:37, 1.78it/s]
1202
  7%|β–‹ | 849/11704 [07:40<1:37:49, 1.85it/s]
1203
  7%|β–‹ | 850/11704 [07:41<1:35:06, 1.90it/s]{'loss': 3.7125, 'grad_norm': 0.48182985186576843, 'learning_rate': 0.0007258753202391119, 'epoch': 1.02}
1204
+
1205
 
1206
  7%|β–‹ | 850/11704 [07:41<1:35:06, 1.90it/s]
1207
  7%|β–‹ | 851/11704 [07:41<1:33:29, 1.93it/s]
1208
  7%|β–‹ | 852/11704 [07:42<1:32:09, 1.96it/s]
1209
  7%|β–‹ | 853/11704 [07:42<1:31:05, 1.99it/s]
1210
  7%|β–‹ | 854/11704 [07:43<1:30:25, 2.00it/s]
1211
  7%|β–‹ | 855/11704 [07:43<1:29:58, 2.01it/s]
1212
  7%|β–‹ | 856/11704 [07:44<1:29:45, 2.01it/s]
1213
  7%|β–‹ | 857/11704 [07:44<1:29:20, 2.02it/s]
1214
  7%|β–‹ | 858/11704 [07:45<1:29:19, 2.02it/s]
1215
  7%|β–‹ | 859/11704 [07:45<1:29:09, 2.03it/s]
1216
  7%|β–‹ | 860/11704 [07:46<1:29:11, 2.03it/s]
1217
  7%|β–‹ | 861/11704 [07:46<1:29:02, 2.03it/s]
1218
  7%|β–‹ | 862/11704 [07:47<1:28:59, 2.03it/s]
1219
  7%|β–‹ | 863/11704 [07:47<1:28:54, 2.03it/s]
1220
  7%|β–‹ | 864/11704 [07:48<1:28:51, 2.03it/s]
1221
  7%|β–‹ | 865/11704 [07:48<1:28:48, 2.03it/s]
1222
  7%|β–‹ | 866/11704 [07:49<1:28:57, 2.03it/s]
1223
  7%|β–‹ | 867/11704 [07:49<1:28:59, 2.03it/s]
1224
  7%|β–‹ | 868/11704 [07:50<1:28:51, 2.03it/s]
1225
  7%|β–‹ | 869/11704 [07:50<1:28:52, 2.03it/s]
1226
  7%|β–‹ | 870/11704 [07:51<1:28:50, 2.03it/s]
1227
  7%|β–‹ | 871/11704 [07:51<1:28:46, 2.03it/s]
1228
  7%|β–‹ | 872/11704 [07:52<1:28:53, 2.03it/s]
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9509fc878acd0defe5adfeb20bab4c79a7a8244d939dbf9aedc248e6079ebae0
3
+ size 5240