Training in progress, epoch 0
Browse files- config.json +30 -0
- eval_job_output.txt +210 -0
- logs/events.out.tfevents.1716738249.sphinx2 +3 -0
- model.safetensors +3 -0
- train_job_output.txt +287 -0
- training_args.bin +3 -0
config.json
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "EleutherAI/pythia-70m",
|
3 |
+
"architectures": [
|
4 |
+
"GPTNeoXForCausalLM"
|
5 |
+
],
|
6 |
+
"attention_bias": true,
|
7 |
+
"attention_dropout": 0.0,
|
8 |
+
"bos_token_id": 0,
|
9 |
+
"classifier_dropout": 0.1,
|
10 |
+
"eos_token_id": 0,
|
11 |
+
"hidden_act": "gelu",
|
12 |
+
"hidden_dropout": 0.0,
|
13 |
+
"hidden_size": 512,
|
14 |
+
"initializer_range": 0.02,
|
15 |
+
"intermediate_size": 2048,
|
16 |
+
"layer_norm_eps": 1e-05,
|
17 |
+
"max_position_embeddings": 2048,
|
18 |
+
"model_type": "gpt_neox",
|
19 |
+
"num_attention_heads": 8,
|
20 |
+
"num_hidden_layers": 6,
|
21 |
+
"rope_scaling": null,
|
22 |
+
"rotary_emb_base": 10000,
|
23 |
+
"rotary_pct": 0.25,
|
24 |
+
"tie_word_embeddings": false,
|
25 |
+
"torch_dtype": "float32",
|
26 |
+
"transformers_version": "4.40.2",
|
27 |
+
"use_cache": true,
|
28 |
+
"use_parallel_residual": true,
|
29 |
+
"vocab_size": 50304
|
30 |
+
}
|
eval_job_output.txt
ADDED
@@ -0,0 +1,210 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
slurm submission log: 2024-05-25 22:01:15.907170
|
2 |
+
created following sbatch script:
|
3 |
+
|
4 |
+
###############################
|
5 |
+
|
6 |
+
#!/bin/bash
|
7 |
+
|
8 |
+
#SBATCH --account=nlp
|
9 |
+
#SBATCH --cpus-per-task=16
|
10 |
+
#SBATCH --dependency=afterok:7651386
|
11 |
+
#SBATCH --gres=gpu:1
|
12 |
+
#SBATCH --job-name=tthrush-job-3177077
|
13 |
+
#SBATCH --mem=60G
|
14 |
+
#SBATCH --nodelist=sphinx1
|
15 |
+
#SBATCH --open-mode=append
|
16 |
+
#SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/llms/pythia-70m_xnli_es_1/eval_job_output.txt
|
17 |
+
#SBATCH --partition=sphinx
|
18 |
+
#SBATCH --time=14-0
|
19 |
+
|
20 |
+
# activate your desired anaconda environment
|
21 |
+
. /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection
|
22 |
+
|
23 |
+
# cd to working directory
|
24 |
+
cd .
|
25 |
+
|
26 |
+
# launch commands
|
27 |
+
srun --unbuffered run_as_child_processes 'lm_eval --model hf --model_args pretrained=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/llms/pythia-70m_xnli_es_1,revision=main,dtype=float16,trust_remote_code=True --tasks piqa,arc_easy,xnli_en,xnli_fr,xnli_de,xnli_es,sciq,lambada --device cuda --output_path /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/llms/pythia-70m_xnli_es_1/perf'
|
28 |
+
|
29 |
+
###############################
|
30 |
+
|
31 |
+
submission to slurm complete!
|
32 |
+
|
33 |
+
|
34 |
+
###############################
|
35 |
+
slurm submission output
|
36 |
+
|
37 |
+
Submitted batch job 7651387
|
38 |
+
|
39 |
+
|
40 |
+
|
41 |
+
###############################
|
42 |
+
|
43 |
+
slurm submission log: 2024-05-25 22:02:25.155540
|
44 |
+
created following sbatch script:
|
45 |
+
|
46 |
+
###############################
|
47 |
+
|
48 |
+
#!/bin/bash
|
49 |
+
|
50 |
+
#SBATCH --account=nlp
|
51 |
+
#SBATCH --cpus-per-task=16
|
52 |
+
#SBATCH --dependency=afterok:7651417
|
53 |
+
#SBATCH --gres=gpu:1
|
54 |
+
#SBATCH --job-name=tthrush-job-3723367
|
55 |
+
#SBATCH --mem=60G
|
56 |
+
#SBATCH --nodelist=sphinx1
|
57 |
+
#SBATCH --open-mode=append
|
58 |
+
#SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/llms/pythia-70m_xnli_es_1/eval_job_output.txt
|
59 |
+
#SBATCH --partition=sphinx
|
60 |
+
#SBATCH --time=14-0
|
61 |
+
|
62 |
+
# activate your desired anaconda environment
|
63 |
+
. /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection
|
64 |
+
|
65 |
+
# cd to working directory
|
66 |
+
cd .
|
67 |
+
|
68 |
+
# launch commands
|
69 |
+
srun --unbuffered run_as_child_processes 'lm_eval --model hf --model_args pretrained=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/llms/pythia-70m_xnli_es_1,revision=main,dtype=float16,trust_remote_code=True --tasks piqa,arc_easy,xnli_en,xnli_fr,xnli_de,xnli_es,sciq,lambada --device cuda --output_path /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/llms/pythia-70m_xnli_es_1/perf'
|
70 |
+
|
71 |
+
###############################
|
72 |
+
|
73 |
+
submission to slurm complete!
|
74 |
+
|
75 |
+
|
76 |
+
###############################
|
77 |
+
slurm submission output
|
78 |
+
|
79 |
+
Submitted batch job 7651418
|
80 |
+
|
81 |
+
|
82 |
+
|
83 |
+
###############################
|
84 |
+
|
85 |
+
slurm submission log: 2024-05-25 22:12:50.394186
|
86 |
+
created following sbatch script:
|
87 |
+
|
88 |
+
###############################
|
89 |
+
|
90 |
+
#!/bin/bash
|
91 |
+
|
92 |
+
#SBATCH --account=nlp
|
93 |
+
#SBATCH --cpus-per-task=16
|
94 |
+
#SBATCH --dependency=afterok:7651458
|
95 |
+
#SBATCH --gres=gpu:1
|
96 |
+
#SBATCH --job-name=tthrush-job-3832230
|
97 |
+
#SBATCH --mem=60G
|
98 |
+
#SBATCH --nodelist=sphinx1
|
99 |
+
#SBATCH --open-mode=append
|
100 |
+
#SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/llms/pythia-70m_xnli_es_1/eval_job_output.txt
|
101 |
+
#SBATCH --partition=sphinx
|
102 |
+
#SBATCH --time=14-0
|
103 |
+
|
104 |
+
# activate your desired anaconda environment
|
105 |
+
. /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection
|
106 |
+
|
107 |
+
# cd to working directory
|
108 |
+
cd .
|
109 |
+
|
110 |
+
# launch commands
|
111 |
+
srun --unbuffered run_as_child_processes 'lm_eval --model hf --model_args pretrained=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/llms/pythia-70m_xnli_es_1,revision=main,dtype=float16,trust_remote_code=True --tasks piqa,arc_easy,xnli_en,xnli_fr,xnli_de,xnli_es,sciq,lambada --device cuda --output_path /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/llms/pythia-70m_xnli_es_1/perf'
|
112 |
+
|
113 |
+
###############################
|
114 |
+
|
115 |
+
submission to slurm complete!
|
116 |
+
|
117 |
+
|
118 |
+
###############################
|
119 |
+
slurm submission output
|
120 |
+
|
121 |
+
Submitted batch job 7651459
|
122 |
+
|
123 |
+
|
124 |
+
|
125 |
+
###############################
|
126 |
+
|
127 |
+
slurm submission log: 2024-05-25 22:15:55.997250
|
128 |
+
created following sbatch script:
|
129 |
+
|
130 |
+
###############################
|
131 |
+
|
132 |
+
#!/bin/bash
|
133 |
+
|
134 |
+
#SBATCH --account=nlp
|
135 |
+
#SBATCH --cpus-per-task=16
|
136 |
+
#SBATCH --dependency=afterok:7651486
|
137 |
+
#SBATCH --gres=gpu:1
|
138 |
+
#SBATCH --job-name=tthrush-job-2732481
|
139 |
+
#SBATCH --mem=60G
|
140 |
+
#SBATCH --nodelist=sphinx1
|
141 |
+
#SBATCH --open-mode=append
|
142 |
+
#SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/llms/pythia-70m_xnli_es_1/eval_job_output.txt
|
143 |
+
#SBATCH --partition=sphinx
|
144 |
+
#SBATCH --time=14-0
|
145 |
+
|
146 |
+
# activate your desired anaconda environment
|
147 |
+
. /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection
|
148 |
+
|
149 |
+
# cd to working directory
|
150 |
+
cd .
|
151 |
+
|
152 |
+
# launch commands
|
153 |
+
srun --unbuffered run_as_child_processes 'lm_eval --model hf --model_args pretrained=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/llms/pythia-70m_xnli_es_1,revision=main,dtype=float16,trust_remote_code=True --tasks piqa,arc_easy,xnli_en,xnli_fr,xnli_de,xnli_es,sciq,lambada --device cuda --output_path /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/llms/pythia-70m_xnli_es_1/perf'
|
154 |
+
|
155 |
+
###############################
|
156 |
+
|
157 |
+
submission to slurm complete!
|
158 |
+
|
159 |
+
|
160 |
+
###############################
|
161 |
+
slurm submission output
|
162 |
+
|
163 |
+
Submitted batch job 7651487
|
164 |
+
|
165 |
+
|
166 |
+
|
167 |
+
###############################
|
168 |
+
|
169 |
+
slurm submission log: 2024-05-25 22:18:14.995526
|
170 |
+
created following sbatch script:
|
171 |
+
|
172 |
+
###############################
|
173 |
+
|
174 |
+
#!/bin/bash
|
175 |
+
|
176 |
+
#SBATCH --account=nlp
|
177 |
+
#SBATCH --cpus-per-task=16
|
178 |
+
#SBATCH --dependency=afterok:7651516
|
179 |
+
#SBATCH --gres=gpu:1
|
180 |
+
#SBATCH --job-name=tthrush-job-3308982
|
181 |
+
#SBATCH --mem=60G
|
182 |
+
#SBATCH --nodelist=sphinx1
|
183 |
+
#SBATCH --open-mode=append
|
184 |
+
#SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/llms/pythia-70m_xnli_es_1/eval_job_output.txt
|
185 |
+
#SBATCH --partition=sphinx
|
186 |
+
#SBATCH --time=14-0
|
187 |
+
|
188 |
+
# activate your desired anaconda environment
|
189 |
+
. /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection
|
190 |
+
|
191 |
+
# cd to working directory
|
192 |
+
cd .
|
193 |
+
|
194 |
+
# launch commands
|
195 |
+
srun --unbuffered run_as_child_processes 'lm_eval --model hf --model_args pretrained=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/llms/pythia-70m_xnli_es_1,revision=main,dtype=float16,trust_remote_code=True --tasks piqa,arc_easy,xnli_en,xnli_fr,xnli_de,xnli_es,sciq,lambada --device cuda --output_path /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/llms/pythia-70m_xnli_es_1/perf'
|
196 |
+
|
197 |
+
###############################
|
198 |
+
|
199 |
+
submission to slurm complete!
|
200 |
+
|
201 |
+
|
202 |
+
###############################
|
203 |
+
slurm submission output
|
204 |
+
|
205 |
+
Submitted batch job 7651517
|
206 |
+
|
207 |
+
|
208 |
+
|
209 |
+
###############################
|
210 |
+
|
logs/events.out.tfevents.1716738249.sphinx2
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:11e15a6f6500f0062c4964dc38d6af748a94f95f7df5aeabc9657e2199256da5
|
3 |
+
size 11781
|
model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ae13490cbf98ca8da07c69d1d78ddec321a71452b204a7e4e6c442f454038800
|
3 |
+
size 281715176
|
train_job_output.txt
ADDED
@@ -0,0 +1,287 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
0 |
0%| | 0/11704 [00:00<?, ?it/s][rank1]:[W reducer.cpp:1360] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
|
|
|
|
|
1 |
0%| | 1/11704 [00:11<36:01:50, 11.08s/it]
|
2 |
0%| | 2/11704 [00:15<22:29:09, 6.92s/it]
|
3 |
0%| | 3/11704 [00:18<17:13:49, 5.30s/it]
|
4 |
0%| | 4/11704 [00:21<13:50:30, 4.26s/it]
|
5 |
0%| | 5/11704 [00:22<10:32:47, 3.25s/it]
|
6 |
0%| | 6/11704 [00:24<9:19:43, 2.87s/it]
|
7 |
0%| | 7/11704 [00:26<7:44:55, 2.38s/it]
|
8 |
0%| | 8/11704 [00:27<6:35:35, 2.03s/it]
|
9 |
0%| | 9/11704 [00:28<5:32:02, 1.70s/it]
|
10 |
0%| | 10/11704 [00:29<5:12:33, 1.60s/it]
|
11 |
0%| | 11/11704 [00:30<4:43:07, 1.45s/it]
|
12 |
0%| | 12/11704 [00:31<4:09:08, 1.28s/it]
|
13 |
0%| | 13/11704 [00:32<3:36:18, 1.11s/it]
|
14 |
0%| | 14/11704 [00:33<3:17:27, 1.01s/it]
|
15 |
0%| | 15/11704 [00:34<3:11:14, 1.02it/s]
|
16 |
0%| | 16/11704 [00:34<3:01:30, 1.07it/s]
|
17 |
0%| | 17/11704 [00:35<2:48:52, 1.15it/s]
|
18 |
0%| | 18/11704 [00:36<2:33:34, 1.27it/s]
|
19 |
0%| | 19/11704 [00:36<2:26:24, 1.33it/s]
|
20 |
0%| | 20/11704 [00:37<2:19:52, 1.39it/s]
|
21 |
0%| | 21/11704 [00:38<2:16:43, 1.42it/s]
|
22 |
0%| | 22/11704 [00:38<2:16:47, 1.42it/s]
|
23 |
0%| | 23/11704 [00:39<2:13:38, 1.46it/s]
|
24 |
0%| | 24/11704 [00:40<2:20:53, 1.38it/s]
|
25 |
0%| | 25/11704 [00:41<2:25:03, 1.34it/s]{'loss': 10.6948, 'grad_norm': 1.2803484201431274, 'learning_rate': 2.134927412467976e-05, 'epoch': 0.03}
|
26 |
|
|
|
27 |
0%| | 25/11704 [00:41<2:25:03, 1.34it/s]
|
28 |
0%| | 26/11704 [00:41<2:14:46, 1.44it/s]
|
29 |
0%| | 27/11704 [00:42<2:08:18, 1.52it/s]
|
30 |
0%| | 28/11704 [00:42<2:06:17, 1.54it/s]
|
31 |
0%| | 29/11704 [00:43<2:03:23, 1.58it/s]
|
32 |
0%| | 30/11704 [00:44<1:58:17, 1.64it/s]
|
33 |
0%| | 31/11704 [00:44<1:53:07, 1.72it/s]
|
34 |
0%| | 32/11704 [00:45<1:50:58, 1.75it/s]
|
35 |
0%| | 33/11704 [00:45<1:50:04, 1.77it/s]
|
36 |
0%| | 34/11704 [00:46<1:51:19, 1.75it/s]
|
37 |
0%| | 35/11704 [00:46<1:47:30, 1.81it/s]
|
38 |
0%| | 36/11704 [00:47<1:47:57, 1.80it/s]
|
39 |
0%| | 37/11704 [00:47<1:46:41, 1.82it/s]
|
40 |
0%| | 38/11704 [00:48<1:45:56, 1.84it/s]
|
41 |
0%| | 39/11704 [00:49<1:47:28, 1.81it/s]
|
42 |
0%| | 40/11704 [00:49<1:46:05, 1.83it/s]
|
43 |
0%| | 41/11704 [00:50<1:45:06, 1.85it/s]
|
44 |
0%| | 42/11704 [00:50<1:45:14, 1.85it/s]
|
45 |
0%| | 43/11704 [00:51<1:46:16, 1.83it/s]
|
46 |
0%| | 44/11704 [00:51<1:44:48, 1.85it/s]
|
47 |
0%| | 45/11704 [00:52<1:42:40, 1.89it/s]
|
48 |
0%| | 46/11704 [00:52<1:43:30, 1.88it/s]
|
49 |
0%| | 47/11704 [00:53<1:43:45, 1.87it/s]
|
50 |
0%| | 48/11704 [00:53<1:42:51, 1.89it/s]
|
51 |
0%| | 49/11704 [00:54<1:42:14, 1.90it/s]
|
52 |
0%| | 50/11704 [00:54<1:41:41, 1.91it/s]{'loss': 9.9789, 'grad_norm': 1.1821939945220947, 'learning_rate': 4.269854824935952e-05, 'epoch': 0.06}
|
|
|
53 |
|
54 |
0%| | 50/11704 [00:54<1:41:41, 1.91it/s]
|
55 |
0%| | 51/11704 [00:55<1:41:01, 1.92it/s]
|
56 |
0%| | 52/11704 [00:55<1:40:15, 1.94it/s]
|
57 |
0%| | 53/11704 [00:56<1:39:39, 1.95it/s]
|
58 |
0%| | 54/11704 [00:56<1:40:20, 1.94it/s]
|
59 |
0%| | 55/11704 [00:57<1:39:05, 1.96it/s]
|
60 |
0%| | 56/11704 [00:57<1:39:10, 1.96it/s]
|
61 |
0%| | 57/11704 [00:58<1:39:27, 1.95it/s]
|
62 |
0%| | 58/11704 [00:58<1:39:40, 1.95it/s]
|
63 |
1%| | 59/11704 [00:59<1:52:14, 1.73it/s]
|
64 |
1%| | 60/11704 [01:00<1:48:03, 1.80it/s]
|
65 |
1%| | 61/11704 [01:00<1:45:25, 1.84it/s]
|
66 |
1%| | 62/11704 [01:01<1:43:05, 1.88it/s]
|
67 |
1%| | 63/11704 [01:01<1:41:38, 1.91it/s]
|
68 |
1%| | 64/11704 [01:02<1:40:08, 1.94it/s]
|
69 |
1%| | 65/11704 [01:02<1:39:38, 1.95it/s]
|
70 |
1%| | 66/11704 [01:03<1:38:25, 1.97it/s]
|
71 |
1%| | 67/11704 [01:03<1:38:36, 1.97it/s]
|
72 |
1%| | 68/11704 [01:04<1:38:36, 1.97it/s]
|
73 |
1%| | 69/11704 [01:04<1:37:32, 1.99it/s]
|
74 |
1%| | 70/11704 [01:05<1:37:04, 2.00it/s]
|
75 |
1%| | 71/11704 [01:05<1:36:37, 2.01it/s]
|
76 |
1%| | 72/11704 [01:06<1:36:46, 2.00it/s]
|
77 |
1%| | 73/11704 [01:06<1:36:54, 2.00it/s]
|
78 |
1%| | 74/11704 [01:07<1:37:03, 2.00it/s]
|
79 |
1%| | 75/11704 [01:07<1:36:35, 2.01it/s]{'loss': 9.2071, 'grad_norm': 1.037238597869873, 'learning_rate': 6.404782237403927e-05, 'epoch': 0.09}
|
80 |
|
|
|
81 |
1%| | 75/11704 [01:07<1:36:35, 2.01it/s]
|
82 |
1%| | 76/11704 [01:08<1:36:49, 2.00it/s]
|
83 |
1%| | 77/11704 [01:08<1:36:51, 2.00it/s]
|
84 |
1%| | 78/11704 [01:09<1:36:22, 2.01it/s]
|
85 |
1%| | 79/11704 [01:09<1:36:16, 2.01it/s]
|
86 |
1%| | 80/11704 [01:10<1:36:33, 2.01it/s]
|
87 |
1%| | 81/11704 [01:10<1:36:15, 2.01it/s]
|
88 |
1%| | 82/11704 [01:11<1:36:11, 2.01it/s]
|
89 |
1%| | 83/11704 [01:11<1:36:08, 2.01it/s]
|
90 |
1%| | 84/11704 [01:12<1:35:57, 2.02it/s]
|
91 |
1%| | 85/11704 [01:12<1:36:27, 2.01it/s]
|
92 |
1%| | 86/11704 [01:13<1:36:25, 2.01it/s]
|
93 |
1%| | 87/11704 [01:13<1:36:11, 2.01it/s]
|
94 |
1%| | 88/11704 [01:14<1:36:08, 2.01it/s]
|
95 |
1%| | 89/11704 [01:14<1:37:01, 2.00it/s]
|
96 |
1%| | 90/11704 [01:15<1:39:29, 1.95it/s]
|
97 |
1%| | 91/11704 [01:15<1:38:15, 1.97it/s]
|
98 |
1%| | 92/11704 [01:16<1:38:03, 1.97it/s]
|
99 |
1%| | 93/11704 [01:16<1:37:15, 1.99it/s]
|
100 |
1%| | 94/11704 [01:17<1:36:57, 2.00it/s]
|
101 |
1%| | 95/11704 [01:17<1:36:48, 2.00it/s]
|
102 |
1%| | 96/11704 [01:18<1:36:31, 2.00it/s]
|
103 |
1%| | 97/11704 [01:18<1:36:15, 2.01it/s]
|
104 |
1%| | 98/11704 [01:19<1:36:02, 2.01it/s]
|
105 |
1%| | 99/11704 [01:19<1:36:18, 2.01it/s]
|
106 |
1%| | 100/11704 [01:20<1:36:11, 2.01it/s]{'loss': 8.3139, 'grad_norm': 0.7935464978218079, 'learning_rate': 8.539709649871905e-05, 'epoch': 0.12}
|
|
|
107 |
|
108 |
1%| | 100/11704 [01:20<1:36:11, 2.01it/s]
|
109 |
1%| | 101/11704 [01:20<1:36:06, 2.01it/s]
|
110 |
1%| | 102/11704 [01:21<1:35:47, 2.02it/s]
|
111 |
1%| | 103/11704 [01:21<1:36:08, 2.01it/s]
|
112 |
1%| | 104/11704 [01:22<1:35:49, 2.02it/s]
|
113 |
1%| | 105/11704 [01:22<1:35:43, 2.02it/s]
|
114 |
1%| | 106/11704 [01:23<1:35:41, 2.02it/s]
|
115 |
1%| | 107/11704 [01:23<1:35:33, 2.02it/s]
|
116 |
1%| | 108/11704 [01:24<1:35:22, 2.03it/s]
|
117 |
1%| | 109/11704 [01:24<1:35:39, 2.02it/s]
|
118 |
1%| | 110/11704 [01:25<1:35:33, 2.02it/s]
|
119 |
1%| | 111/11704 [01:25<1:35:49, 2.02it/s]
|
120 |
1%| | 112/11704 [01:26<1:35:39, 2.02it/s]
|
121 |
1%| | 113/11704 [01:26<1:35:34, 2.02it/s]
|
122 |
1%| | 114/11704 [01:27<1:35:24, 2.02it/s]
|
123 |
1%| | 115/11704 [01:27<1:35:20, 2.03it/s]
|
124 |
1%| | 116/11704 [01:28<1:35:31, 2.02it/s]
|
125 |
1%| | 117/11704 [01:28<1:35:27, 2.02it/s]
|
126 |
1%| | 118/11704 [01:29<1:35:24, 2.02it/s]
|
127 |
1%| | 119/11704 [01:29<1:35:14, 2.03it/s]
|
128 |
1%| | 120/11704 [01:30<1:35:08, 2.03it/s]
|
129 |
1%| | 121/11704 [01:30<1:35:10, 2.03it/s]
|
130 |
1%| | 122/11704 [01:31<1:35:03, 2.03it/s]
|
131 |
1%| | 123/11704 [01:31<1:34:57, 2.03it/s]
|
132 |
1%| | 124/11704 [01:32<1:35:01, 2.03it/s]
|
133 |
1%| | 125/11704 [01:32<1:35:08, 2.03it/s]{'loss': 7.526, 'grad_norm': 0.5917059183120728, 'learning_rate': 0.0001067463706233988, 'epoch': 0.15}
|
|
|
134 |
|
135 |
1%| | 125/11704 [01:32<1:35:08, 2.03it/s]
|
136 |
1%| | 126/11704 [01:33<1:35:09, 2.03it/s]
|
137 |
1%| | 127/11704 [01:33<1:35:04, 2.03it/s]
|
138 |
1%| | 128/11704 [01:34<1:34:59, 2.03it/s]
|
139 |
1%| | 129/11704 [01:34<1:34:59, 2.03it/s]
|
140 |
1%| | 130/11704 [01:35<1:35:09, 2.03it/s]
|
141 |
1%| | 131/11704 [01:35<1:35:07, 2.03it/s]
|
142 |
1%| | 132/11704 [01:35<1:35:02, 2.03it/s]
|
143 |
1%| | 133/11704 [01:36<1:34:58, 2.03it/s]
|
144 |
1%| | 134/11704 [01:36<1:34:58, 2.03it/s]
|
145 |
1%| | 135/11704 [01:37<1:34:56, 2.03it/s]
|
146 |
1%| | 136/11704 [01:37<1:35:01, 2.03it/s]
|
147 |
1%| | 137/11704 [01:38<1:34:56, 2.03it/s]
|
148 |
1%| | 138/11704 [01:38<1:35:30, 2.02it/s]
|
149 |
1%| | 139/11704 [01:39<1:35:41, 2.01it/s]
|
150 |
1%| | 140/11704 [01:39<1:35:29, 2.02it/s]
|
151 |
1%| | 141/11704 [01:40<1:35:14, 2.02it/s]
|
152 |
1%| | 142/11704 [01:40<1:35:11, 2.02it/s]
|
153 |
1%| | 143/11704 [01:41<1:35:00, 2.03it/s]
|
154 |
1%| | 144/11704 [01:41<1:34:55, 2.03it/s]
|
155 |
1%| | 145/11704 [01:42<1:34:52, 2.03it/s]
|
156 |
1%| | 146/11704 [01:42<1:34:48, 2.03it/s]
|
157 |
1%|β | 147/11704 [01:43<1:34:51, 2.03it/s]
|
158 |
1%|β | 148/11704 [01:43<1:34:45, 2.03it/s]
|
159 |
1%|β | 149/11704 [01:44<1:34:43, 2.03it/s]
|
160 |
1%|β | 150/11704 [01:44<1:34:48, 2.03it/s]{'loss': 6.8951, 'grad_norm': 0.37877357006073, 'learning_rate': 0.00012809564474807855, 'epoch': 0.18}
|
|
|
161 |
|
162 |
1%|β | 150/11704 [01:44<1:34:48, 2.03it/s]
|
163 |
1%|β | 151/11704 [01:45<1:34:56, 2.03it/s]
|
164 |
1%|β | 152/11704 [01:45<1:34:58, 2.03it/s]
|
165 |
1%|β | 153/11704 [01:46<1:34:45, 2.03it/s]
|
166 |
1%|β | 154/11704 [01:46<1:34:38, 2.03it/s]
|
167 |
1%|β | 155/11704 [01:47<1:34:42, 2.03it/s]
|
168 |
1%|β | 156/11704 [01:47<1:34:49, 2.03it/s]
|
169 |
1%|β | 157/11704 [01:48<1:34:49, 2.03it/s]
|
170 |
1%|β | 158/11704 [01:48<1:34:47, 2.03it/s]
|
171 |
1%|β | 159/11704 [01:49<1:34:42, 2.03it/s]
|
172 |
1%|β | 160/11704 [01:49<1:34:40, 2.03it/s]
|
173 |
1%|β | 161/11704 [01:50<1:34:38, 2.03it/s]
|
174 |
1%|β | 162/11704 [01:50<1:34:39, 2.03it/s]
|
175 |
1%|β | 163/11704 [01:51<1:34:47, 2.03it/s]
|
176 |
1%|β | 164/11704 [01:51<1:34:47, 2.03it/s]
|
177 |
1%|β | 165/11704 [01:52<1:34:54, 2.03it/s]
|
178 |
1%|β | 166/11704 [01:52<1:34:47, 2.03it/s]
|
179 |
1%|β | 167/11704 [01:53<1:34:47, 2.03it/s]
|
180 |
1%|β | 168/11704 [01:53<1:34:38, 2.03it/s]
|
181 |
1%|β | 169/11704 [01:54<1:34:42, 2.03it/s]
|
182 |
1%|β | 170/11704 [01:54<1:34:41, 2.03it/s]
|
183 |
1%|β | 171/11704 [01:55<1:34:32, 2.03it/s]
|
184 |
1%|β | 172/11704 [01:55<1:34:38, 2.03it/s]
|
185 |
1%|β | 173/11704 [01:56<1:34:32, 2.03it/s]
|
186 |
1%|β | 174/11704 [01:56<1:34:27, 2.03it/s]
|
187 |
1%|β | 175/11704 [01:57<1:34:29, 2.03it/s]{'loss': 6.3957, 'grad_norm': 0.3709251582622528, 'learning_rate': 0.00014944491887275833, 'epoch': 0.21}
|
|
|
188 |
|
189 |
1%|β | 175/11704 [01:57<1:34:29, 2.03it/s]
|
190 |
2%|β | 176/11704 [01:57<1:34:26, 2.03it/s]
|
191 |
2%|β | 177/11704 [01:58<1:34:35, 2.03it/s]
|
192 |
2%|β | 178/11704 [01:58<1:34:30, 2.03it/s]
|
193 |
2%|β | 179/11704 [01:59<1:34:27, 2.03it/s]
|
194 |
2%|β | 180/11704 [01:59<1:34:35, 2.03it/s]
|
195 |
2%|β | 181/11704 [02:00<1:34:38, 2.03it/s]
|
196 |
2%|β | 182/11704 [02:00<1:34:39, 2.03it/s]
|
197 |
2%|β | 183/11704 [02:01<1:34:32, 2.03it/s]
|
198 |
2%|β | 184/11704 [02:01<1:34:33, 2.03it/s]
|
199 |
2%|β | 185/11704 [02:02<1:34:35, 2.03it/s]
|
200 |
2%|β | 186/11704 [02:02<1:34:32, 2.03it/s]
|
201 |
2%|β | 187/11704 [02:03<1:34:35, 2.03it/s]
|
202 |
2%|β | 188/11704 [02:03<1:34:30, 2.03it/s]
|
203 |
2%|β | 189/11704 [02:04<1:34:32, 2.03it/s]
|
204 |
2%|β | 190/11704 [02:04<1:34:33, 2.03it/s]
|
205 |
2%|β | 191/11704 [02:05<1:34:28, 2.03it/s]
|
206 |
2%|β | 192/11704 [02:05<1:34:33, 2.03it/s]
|
207 |
2%|β | 193/11704 [02:06<1:34:30, 2.03it/s]
|
208 |
2%|β | 194/11704 [02:06<1:34:35, 2.03it/s]
|
209 |
2%|β | 195/11704 [02:07<1:34:27, 2.03it/s]
|
210 |
2%|β | 196/11704 [02:07<1:34:29, 2.03it/s]
|
211 |
2%|β | 197/11704 [02:08<1:34:33, 2.03it/s]
|
212 |
2%|β | 198/11704 [02:08<1:34:28, 2.03it/s]
|
213 |
2%|β | 199/11704 [02:09<1:34:28, 2.03it/s]
|
214 |
2%|β | 200/11704 [02:09<1:34:30, 2.03it/s]{'loss': 5.9826, 'grad_norm': 0.6036785840988159, 'learning_rate': 0.0001707941929974381, 'epoch': 0.24}
|
|
|
215 |
|
216 |
2%|β | 200/11704 [02:09<1:34:30, 2.03it/s]
|
217 |
2%|β | 201/11704 [02:09<1:34:24, 2.03it/s]
|
218 |
2%|β | 202/11704 [02:10<1:34:31, 2.03it/s]
|
219 |
2%|β | 203/11704 [02:10<1:34:23, 2.03it/s]
|
220 |
2%|β | 204/11704 [02:11<1:34:21, 2.03it/s]
|
221 |
2%|β | 205/11704 [02:11<1:34:27, 2.03it/s]
|
222 |
2%|β | 206/11704 [02:12<1:34:31, 2.03it/s]
|
223 |
2%|β | 207/11704 [02:12<1:34:36, 2.03it/s]
|
224 |
2%|β | 208/11704 [02:13<1:34:28, 2.03it/s]
|
225 |
2%|β | 209/11704 [02:13<1:34:31, 2.03it/s]
|
226 |
2%|β | 210/11704 [02:14<1:34:28, 2.03it/s]
|
227 |
2%|β | 211/11704 [02:14<1:34:26, 2.03it/s]
|
228 |
2%|β | 212/11704 [02:15<1:34:28, 2.03it/s]
|
229 |
2%|β | 213/11704 [02:15<1:34:23, 2.03it/s]
|
230 |
2%|β | 214/11704 [02:16<1:34:28, 2.03it/s]
|
231 |
2%|β | 215/11704 [02:16<1:34:30, 2.03it/s]
|
232 |
2%|β | 216/11704 [02:17<1:34:29, 2.03it/s]
|
233 |
2%|β | 217/11704 [02:17<1:34:34, 2.02it/s]
|
234 |
2%|β | 218/11704 [02:18<1:34:38, 2.02it/s]
|
235 |
2%|β | 219/11704 [02:18<1:34:37, 2.02it/s]
|
236 |
2%|β | 220/11704 [02:19<1:34:45, 2.02it/s]
|
237 |
2%|β | 221/11704 [02:19<1:34:34, 2.02it/s]
|
238 |
2%|β | 222/11704 [02:20<1:34:43, 2.02it/s]
|
239 |
2%|β | 223/11704 [02:20<1:34:35, 2.02it/s]
|
240 |
2%|β | 224/11704 [02:21<1:34:33, 2.02it/s]
|
241 |
2%|β | 225/11704 [02:21<1:34:27, 2.03it/s]{'loss': 5.659, 'grad_norm': 0.9925475716590881, 'learning_rate': 0.00019214346712211785, 'epoch': 0.27}
|
242 |
|
|
|
243 |
2%|β | 225/11704 [02:21<1:34:27, 2.03it/s]
|
244 |
2%|β | 226/11704 [02:22<1:34:29, 2.02it/s]
|
245 |
2%|β | 227/11704 [02:22<1:34:27, 2.03it/s]
|
246 |
2%|β | 228/11704 [02:23<1:34:41, 2.02it/s]
|
247 |
2%|β | 229/11704 [02:23<1:34:33, 2.02it/s]
|
248 |
2%|β | 230/11704 [02:24<1:34:35, 2.02it/s]
|
249 |
2%|β | 231/11704 [02:24<1:34:27, 2.02it/s]
|
250 |
2%|β | 232/11704 [02:25<1:34:25, 2.03it/s]
|
251 |
2%|β | 233/11704 [02:25<1:34:22, 2.03it/s]
|
252 |
2%|β | 234/11704 [02:26<1:34:20, 2.03it/s]
|
253 |
2%|β | 235/11704 [02:26<1:34:18, 2.03it/s]
|
254 |
2%|β | 236/11704 [02:27<1:34:21, 2.03it/s]
|
255 |
2%|β | 237/11704 [02:27<1:34:25, 2.02it/s]
|
256 |
2%|β | 238/11704 [02:28<1:34:20, 2.03it/s]
|
257 |
2%|β | 239/11704 [02:28<1:34:29, 2.02it/s]
|
258 |
2%|β | 240/11704 [02:29<1:34:21, 2.02it/s]
|
259 |
2%|β | 241/11704 [02:29<1:34:26, 2.02it/s]
|
260 |
2%|β | 242/11704 [02:30<1:34:19, 2.03it/s]
|
261 |
2%|β | 243/11704 [02:30<1:34:16, 2.03it/s]
|
262 |
2%|β | 244/11704 [02:31<1:34:14, 2.03it/s]
|
263 |
2%|β | 245/11704 [02:31<1:34:16, 2.03it/s]
|
264 |
2%|β | 246/11704 [02:32<1:34:15, 2.03it/s]
|
265 |
2%|β | 247/11704 [02:32<1:34:18, 2.02it/s]
|
266 |
2%|β | 248/11704 [02:33<1:34:35, 2.02it/s]
|
267 |
2%|β | 249/11704 [02:33<1:34:34, 2.02it/s]
|
268 |
2%|β | 250/11704 [02:34<1:34:36, 2.02it/s]{'loss': 5.4253, 'grad_norm': 0.6297341585159302, 'learning_rate': 0.0002134927412467976, 'epoch': 0.3}
|
|
|
269 |
|
270 |
2%|β | 250/11704 [02:34<1:34:36, 2.02it/s]
|
271 |
2%|β | 251/11704 [02:34<1:34:48, 2.01it/s]
|
272 |
2%|β | 252/11704 [02:35<1:34:35, 2.02it/s]
|
273 |
2%|β | 253/11704 [02:35<1:34:32, 2.02it/s]
|
274 |
2%|β | 254/11704 [02:36<1:34:26, 2.02it/s]
|
275 |
2%|β | 255/11704 [02:36<1:34:20, 2.02it/s]
|
276 |
2%|β | 256/11704 [02:37<1:34:34, 2.02it/s]
|
277 |
2%|β | 257/11704 [02:37<1:34:28, 2.02it/s]
|
278 |
2%|β | 258/11704 [02:38<1:34:19, 2.02it/s]
|
279 |
2%|β | 259/11704 [02:38<1:34:10, 2.03it/s]
|
280 |
2%|β | 260/11704 [02:39<1:34:10, 2.03it/s]
|
281 |
2%|β | 261/11704 [02:39<1:34:06, 2.03it/s]
|
282 |
2%|β | 262/11704 [02:40<1:33:56, 2.03it/s]
|
283 |
2%|β | 263/11704 [02:40<1:33:59, 2.03it/s]
|
284 |
2%|β | 264/11704 [02:41<1:33:55, 2.03it/s]
|
285 |
2%|β | 265/11704 [02:41<1:34:01, 2.03it/s]
|
286 |
2%|β | 266/11704 [02:42<1:34:04, 2.03it/s]
|
287 |
2%|β | 267/11704 [02:42<1:33:59, 2.03it/s]
|
288 |
2%|β | 268/11704 [02:43<1:34:06, 2.03it/s]
|
289 |
2%|β | 269/11704 [02:43<1:34:01, 2.03it/s]
|
290 |
2%|β | 270/11704 [02:44<1:34:04, 2.03it/s]
|
291 |
2%|β | 271/11704 [02:44<1:33:59, 2.03it/s]
|
292 |
2%|β | 272/11704 [02:45<1:33:57, 2.03it/s]
|
293 |
2%|β | 273/11704 [02:45<1:33:59, 2.03it/s]
|
294 |
2%|β | 274/11704 [02:46<1:33:57, 2.03it/s]
|
295 |
2%|β | 275/11704 [02:46<1:34:06, 2.02it/s]{'loss': 5.2293, 'grad_norm': 0.5968512892723083, 'learning_rate': 0.00023484201537147736, 'epoch': 0.33}
|
296 |
|
|
|
297 |
2%|β | 275/11704 [02:46<1:34:06, 2.02it/s]
|
298 |
2%|β | 276/11704 [02:47<1:34:02, 2.03it/s]
|
299 |
2%|β | 277/11704 [02:47<1:34:05, 2.02it/s]
|
300 |
2%|β | 278/11704 [02:48<1:33:59, 2.03it/s]
|
301 |
2%|β | 279/11704 [02:48<1:34:01, 2.02it/s]
|
302 |
2%|β | 280/11704 [02:49<1:33:54, 2.03it/s]
|
303 |
2%|β | 281/11704 [02:49<1:34:00, 2.03it/s]
|
304 |
2%|β | 282/11704 [02:49<1:33:55, 2.03it/s]
|
305 |
2%|β | 283/11704 [02:50<1:33:55, 2.03it/s]
|
306 |
2%|β | 284/11704 [02:50<1:33:52, 2.03it/s]
|
307 |
2%|β | 285/11704 [02:51<1:33:47, 2.03it/s]
|
308 |
2%|β | 286/11704 [02:51<1:33:49, 2.03it/s]
|
309 |
2%|β | 287/11704 [02:52<1:33:49, 2.03it/s]
|
310 |
2%|β | 288/11704 [02:52<1:33:50, 2.03it/s]
|
311 |
2%|β | 289/11704 [02:53<1:33:43, 2.03it/s]
|
312 |
2%|β | 290/11704 [02:53<1:33:47, 2.03it/s]
|
313 |
2%|β | 291/11704 [02:54<1:33:42, 2.03it/s]
|
314 |
2%|β | 292/11704 [02:54<1:33:40, 2.03it/s]
|
315 |
3%|β | 293/11704 [02:55<1:33:43, 2.03it/s]
|
316 |
3%|β | 294/11704 [02:55<1:33:38, 2.03it/s]
|
317 |
3%|β | 295/11704 [02:56<1:33:41, 2.03it/s]
|
318 |
3%|β | 296/11704 [02:56<1:33:37, 2.03it/s]
|
319 |
3%|β | 297/11704 [02:57<1:33:29, 2.03it/s]
|
320 |
3%|β | 298/11704 [02:57<1:33:33, 2.03it/s]
|
321 |
3%|β | 299/11704 [02:58<1:33:34, 2.03it/s]
|
322 |
3%|β | 300/11704 [02:58<1:33:27, 2.03it/s]{'loss': 5.0541, 'grad_norm': 0.9146194458007812, 'learning_rate': 0.0002561912894961571, 'epoch': 0.36}
|
|
|
323 |
|
324 |
3%|β | 300/11704 [02:58<1:33:27, 2.03it/s]
|
325 |
3%|β | 301/11704 [02:59<1:33:30, 2.03it/s]
|
326 |
3%|β | 302/11704 [02:59<1:33:27, 2.03it/s]
|
327 |
3%|β | 303/11704 [03:00<1:33:27, 2.03it/s]
|
328 |
3%|β | 304/11704 [03:00<1:33:31, 2.03it/s]
|
329 |
3%|β | 305/11704 [03:01<1:33:25, 2.03it/s]
|
330 |
3%|β | 306/11704 [03:01<1:33:30, 2.03it/s]
|
331 |
3%|β | 307/11704 [03:02<1:33:24, 2.03it/s]
|
332 |
3%|β | 308/11704 [03:02<1:33:26, 2.03it/s]
|
333 |
3%|β | 309/11704 [03:03<1:33:31, 2.03it/s]
|
334 |
3%|β | 310/11704 [03:03<1:33:21, 2.03it/s]
|
335 |
3%|β | 311/11704 [03:04<1:33:24, 2.03it/s]
|
336 |
3%|β | 312/11704 [03:04<1:33:26, 2.03it/s]
|
337 |
3%|β | 313/11704 [03:05<1:33:25, 2.03it/s]
|
338 |
3%|β | 314/11704 [03:05<1:33:30, 2.03it/s]
|
339 |
3%|β | 315/11704 [03:06<1:33:22, 2.03it/s]
|
340 |
3%|β | 316/11704 [03:06<1:33:25, 2.03it/s]
|
341 |
3%|β | 317/11704 [03:07<1:33:24, 2.03it/s]
|
342 |
3%|β | 318/11704 [03:07<1:33:20, 2.03it/s]
|
343 |
3%|β | 319/11704 [03:08<1:33:22, 2.03it/s]
|
344 |
3%|β | 320/11704 [03:08<1:33:22, 2.03it/s]
|
345 |
3%|β | 321/11704 [03:09<1:33:16, 2.03it/s]
|
346 |
3%|β | 322/11704 [03:09<1:33:24, 2.03it/s]
|
347 |
3%|β | 323/11704 [03:10<1:33:18, 2.03it/s]
|
348 |
3%|β | 324/11704 [03:10<1:33:15, 2.03it/s]
|
349 |
3%|β | 325/11704 [03:11<1:33:18, 2.03it/s]{'loss': 4.934, 'grad_norm': 0.9477291703224182, 'learning_rate': 0.0002775405636208369, 'epoch': 0.39}
|
|
|
350 |
|
351 |
3%|β | 325/11704 [03:11<1:33:18, 2.03it/s]
|
352 |
3%|β | 326/11704 [03:11<1:33:15, 2.03it/s]
|
353 |
3%|β | 327/11704 [03:12<1:33:14, 2.03it/s]
|
354 |
3%|β | 328/11704 [03:12<1:33:18, 2.03it/s]
|
355 |
3%|β | 329/11704 [03:13<1:33:10, 2.03it/s]
|
356 |
3%|β | 330/11704 [03:13<1:33:10, 2.03it/s]
|
357 |
3%|β | 331/11704 [03:14<1:33:11, 2.03it/s]
|
358 |
3%|β | 332/11704 [03:14<1:33:11, 2.03it/s]
|
359 |
3%|β | 333/11704 [03:15<1:33:12, 2.03it/s]
|
360 |
3%|β | 334/11704 [03:15<1:33:15, 2.03it/s]
|
361 |
3%|β | 335/11704 [03:16<1:33:09, 2.03it/s]
|
362 |
3%|β | 336/11704 [03:16<1:33:09, 2.03it/s]
|
363 |
3%|β | 337/11704 [03:17<1:33:16, 2.03it/s]
|
364 |
3%|β | 338/11704 [03:17<1:33:49, 2.02it/s]
|
365 |
3%|β | 339/11704 [03:18<1:33:39, 2.02it/s]
|
366 |
3%|β | 340/11704 [03:18<1:33:32, 2.02it/s]
|
367 |
3%|β | 341/11704 [03:19<1:33:29, 2.03it/s]
|
368 |
3%|β | 342/11704 [03:19<1:33:20, 2.03it/s]
|
369 |
3%|β | 343/11704 [03:20<1:33:18, 2.03it/s]
|
370 |
3%|β | 344/11704 [03:20<1:33:18, 2.03it/s]
|
371 |
3%|β | 345/11704 [03:21<1:33:12, 2.03it/s]
|
372 |
3%|β | 346/11704 [03:21<1:33:12, 2.03it/s]
|
373 |
3%|β | 347/11704 [03:21<1:33:08, 2.03it/s]
|
374 |
3%|β | 348/11704 [03:22<1:33:03, 2.03it/s]
|
375 |
3%|β | 349/11704 [03:22<1:33:10, 2.03it/s]
|
376 |
3%|β | 350/11704 [03:23<1:33:03, 2.03it/s]{'loss': 4.8046, 'grad_norm': 0.824380099773407, 'learning_rate': 0.00029888983774551667, 'epoch': 0.42}
|
|
|
377 |
|
378 |
3%|β | 350/11704 [03:23<1:33:03, 2.03it/s]
|
379 |
3%|β | 351/11704 [03:23<1:33:04, 2.03it/s]
|
380 |
3%|β | 352/11704 [03:24<1:33:07, 2.03it/s]
|
381 |
3%|β | 353/11704 [03:24<1:33:07, 2.03it/s]
|
382 |
3%|β | 354/11704 [03:25<1:33:00, 2.03it/s]
|
383 |
3%|β | 355/11704 [03:25<1:33:01, 2.03it/s]
|
384 |
3%|β | 356/11704 [03:26<1:32:58, 2.03it/s]
|
385 |
3%|β | 357/11704 [03:26<1:33:00, 2.03it/s]
|
386 |
3%|β | 358/11704 [03:27<1:32:59, 2.03it/s]
|
387 |
3%|β | 359/11704 [03:27<1:32:59, 2.03it/s]
|
388 |
3%|β | 360/11704 [03:28<1:32:57, 2.03it/s]
|
389 |
3%|β | 361/11704 [03:28<1:32:57, 2.03it/s]
|
390 |
3%|β | 362/11704 [03:29<1:32:54, 2.03it/s]
|
391 |
3%|β | 363/11704 [03:29<1:32:51, 2.04it/s]
|
392 |
3%|β | 364/11704 [03:30<1:32:54, 2.03it/s]
|
393 |
3%|β | 365/11704 [03:30<1:32:55, 2.03it/s]
|
394 |
3%|β | 366/11704 [03:31<1:32:52, 2.03it/s]
|
395 |
3%|β | 367/11704 [03:31<1:33:03, 2.03it/s]
|
396 |
3%|β | 368/11704 [03:32<1:33:03, 2.03it/s]
|
397 |
3%|β | 369/11704 [03:32<1:33:10, 2.03it/s]
|
398 |
3%|β | 370/11704 [03:33<1:33:01, 2.03it/s]
|
399 |
3%|β | 371/11704 [03:33<1:32:54, 2.03it/s]
|
400 |
3%|β | 372/11704 [03:34<1:32:58, 2.03it/s]
|
401 |
3%|β | 373/11704 [03:34<1:32:51, 2.03it/s]
|
402 |
3%|β | 374/11704 [03:35<1:32:56, 2.03it/s]
|
403 |
3%|β | 375/11704 [03:35<1:33:04, 2.03it/s]{'loss': 4.6929, 'grad_norm': 0.7358281016349792, 'learning_rate': 0.0003202391118701964, 'epoch': 0.45}
|
|
|
404 |
|
405 |
3%|β | 375/11704 [03:35<1:33:04, 2.03it/s]
|
406 |
3%|β | 376/11704 [03:36<1:33:02, 2.03it/s]
|
407 |
3%|β | 377/11704 [03:36<1:33:04, 2.03it/s]
|
408 |
3%|β | 378/11704 [03:37<1:33:01, 2.03it/s]
|
409 |
3%|β | 379/11704 [03:37<1:33:04, 2.03it/s]
|
410 |
3%|β | 380/11704 [03:38<1:33:01, 2.03it/s]
|
411 |
3%|β | 381/11704 [03:38<1:32:53, 2.03it/s]
|
412 |
3%|β | 382/11704 [03:39<1:33:07, 2.03it/s]
|
413 |
3%|β | 383/11704 [03:39<1:33:02, 2.03it/s]
|
414 |
3%|β | 384/11704 [03:40<1:33:01, 2.03it/s]
|
415 |
3%|β | 385/11704 [03:40<1:32:59, 2.03it/s]
|
416 |
3%|β | 386/11704 [03:41<1:32:51, 2.03it/s]
|
417 |
3%|β | 387/11704 [03:41<1:32:44, 2.03it/s]
|
418 |
3%|β | 388/11704 [03:42<1:32:52, 2.03it/s]
|
419 |
3%|β | 389/11704 [03:42<1:32:47, 2.03it/s]
|
420 |
3%|β | 390/11704 [03:43<1:32:51, 2.03it/s]
|
421 |
3%|β | 391/11704 [03:43<1:32:56, 2.03it/s]
|
422 |
3%|β | 392/11704 [03:44<1:32:50, 2.03it/s]
|
423 |
3%|β | 393/11704 [03:44<1:32:50, 2.03it/s]
|
424 |
3%|β | 394/11704 [03:45<1:32:44, 2.03it/s]
|
425 |
3%|β | 395/11704 [03:45<1:32:41, 2.03it/s]
|
426 |
3%|β | 396/11704 [03:46<1:32:36, 2.04it/s]
|
427 |
3%|β | 397/11704 [03:46<1:32:45, 2.03it/s]
|
428 |
3%|β | 398/11704 [03:47<1:32:39, 2.03it/s]
|
429 |
3%|β | 399/11704 [03:47<1:32:35, 2.03it/s]
|
430 |
3%|β | 400/11704 [03:48<1:32:43, 2.03it/s]
|
431 |
|
|
|
432 |
3%|β | 400/11704 [03:48<1:32:43, 2.03it/s]
|
433 |
3%|β | 401/11704 [03:48<1:32:46, 2.03it/s]
|
434 |
3%|β | 402/11704 [03:49<1:32:45, 2.03it/s]
|
435 |
3%|β | 403/11704 [03:49<1:32:48, 2.03it/s]
|
436 |
3%|β | 404/11704 [03:50<1:32:43, 2.03it/s]
|
437 |
3%|β | 405/11704 [03:50<1:32:41, 2.03it/s]
|
438 |
3%|β | 406/11704 [03:51<1:32:39, 2.03it/s]
|
439 |
3%|β | 407/11704 [03:51<1:32:34, 2.03it/s]
|
440 |
3%|β | 408/11704 [03:52<1:32:39, 2.03it/s]
|
441 |
3%|β | 409/11704 [03:52<1:32:39, 2.03it/s]
|
442 |
4%|β | 410/11704 [03:53<1:32:35, 2.03it/s]
|
443 |
4%|β | 411/11704 [03:53<1:32:35, 2.03it/s]
|
444 |
4%|β | 412/11704 [03:53<1:32:32, 2.03it/s]
|
445 |
4%|β | 413/11704 [03:54<1:32:25, 2.04it/s]
|
446 |
4%|β | 414/11704 [03:54<1:32:29, 2.03it/s]
|
447 |
4%|β | 415/11704 [03:55<1:32:26, 2.04it/s]
|
448 |
4%|β | 416/11704 [03:55<1:32:17, 2.04it/s]
|
449 |
4%|β | 417/11704 [03:56<1:32:23, 2.04it/s]
|
450 |
4%|β | 418/11704 [03:56<1:32:26, 2.03it/s]
|
451 |
4%|β | 419/11704 [03:57<1:32:22, 2.04it/s]
|
452 |
4%|β | 420/11704 [03:57<1:32:24, 2.04it/s]
|
453 |
4%|β | 421/11704 [03:58<1:32:22, 2.04it/s]
|
454 |
4%|β | 422/11704 [03:58<1:32:19, 2.04it/s]
|
455 |
4%|β | 423/11704 [03:59<1:32:20, 2.04it/s]
|
456 |
4%|β | 424/11704 [03:59<1:32:19, 2.04it/s]
|
457 |
4%|β | 425/11704 [04:00<1:32:14, 2.04it/s]
|
458 |
|
|
|
459 |
4%|β | 425/11704 [04:00<1:32:14, 2.04it/s]
|
460 |
4%|β | 426/11704 [04:00<1:32:21, 2.04it/s]
|
461 |
4%|β | 427/11704 [04:01<1:32:20, 2.04it/s]
|
462 |
4%|β | 428/11704 [04:01<1:32:16, 2.04it/s]
|
463 |
4%|β | 429/11704 [04:02<1:32:26, 2.03it/s]
|
464 |
4%|β | 430/11704 [04:02<1:32:20, 2.03it/s]
|
465 |
4%|β | 431/11704 [04:03<1:32:17, 2.04it/s]
|
466 |
4%|β | 432/11704 [04:03<1:32:23, 2.03it/s]
|
467 |
4%|β | 433/11704 [04:04<1:32:20, 2.03it/s]
|
468 |
4%|β | 434/11704 [04:04<1:32:17, 2.04it/s]
|
469 |
4%|β | 435/11704 [04:05<1:32:23, 2.03it/s]
|
470 |
4%|β | 436/11704 [04:05<1:32:19, 2.03it/s]
|
471 |
4%|β | 437/11704 [04:06<1:32:21, 2.03it/s]
|
472 |
4%|β | 438/11704 [04:06<1:32:25, 2.03it/s]
|
473 |
4%|β | 439/11704 [04:07<1:32:13, 2.04it/s]
|
474 |
4%|β | 440/11704 [04:07<1:32:22, 2.03it/s]
|
475 |
4%|β | 441/11704 [04:08<1:32:18, 2.03it/s]
|
476 |
4%|β | 442/11704 [04:08<1:32:09, 2.04it/s]
|
477 |
4%|β | 443/11704 [04:09<1:32:17, 2.03it/s]
|
478 |
4%|β | 444/11704 [04:09<1:32:16, 2.03it/s]
|
479 |
4%|β | 445/11704 [04:10<1:32:08, 2.04it/s]
|
480 |
4%|β | 446/11704 [04:10<1:32:13, 2.03it/s]
|
481 |
4%|β | 447/11704 [04:11<1:32:10, 2.04it/s]
|
482 |
4%|β | 448/11704 [04:11<1:32:08, 2.04it/s]
|
483 |
4%|β | 449/11704 [04:12<1:32:12, 2.03it/s]
|
484 |
4%|β | 450/11704 [04:12<1:32:11, 2.03it/s]{'loss': 4.4307, 'grad_norm': 0.6776960492134094, 'learning_rate': 0.0003842869342442357, 'epoch': 0.54}
|
485 |
|
|
|
486 |
4%|β | 450/11704 [04:12<1:32:11, 2.03it/s]
|
487 |
4%|β | 451/11704 [04:13<1:32:13, 2.03it/s]
|
488 |
4%|β | 452/11704 [04:13<1:32:17, 2.03it/s]
|
489 |
4%|β | 453/11704 [04:14<1:32:13, 2.03it/s]
|
490 |
4%|β | 454/11704 [04:14<1:32:16, 2.03it/s]
|
491 |
4%|β | 455/11704 [04:15<1:32:11, 2.03it/s]
|
492 |
4%|β | 456/11704 [04:15<1:32:06, 2.04it/s]
|
493 |
4%|β | 457/11704 [04:16<1:32:11, 2.03it/s]
|
494 |
4%|β | 458/11704 [04:16<1:32:09, 2.03it/s]
|
495 |
4%|β | 459/11704 [04:17<1:32:04, 2.04it/s]
|
496 |
4%|β | 460/11704 [04:17<1:32:09, 2.03it/s]
|
497 |
4%|β | 461/11704 [04:18<1:32:10, 2.03it/s]
|
498 |
4%|β | 462/11704 [04:18<1:32:04, 2.03it/s]
|
499 |
4%|β | 463/11704 [04:19<1:32:10, 2.03it/s]
|
500 |
4%|β | 464/11704 [04:19<1:32:03, 2.04it/s]
|
501 |
4%|β | 465/11704 [04:20<1:31:56, 2.04it/s]
|
502 |
4%|β | 466/11704 [04:20<1:32:00, 2.04it/s]
|
503 |
4%|β | 467/11704 [04:21<1:32:05, 2.03it/s]
|
504 |
4%|β | 468/11704 [04:21<1:31:58, 2.04it/s]
|
505 |
4%|β | 469/11704 [04:22<1:32:03, 2.03it/s]
|
506 |
4%|β | 470/11704 [04:22<1:31:58, 2.04it/s]
|
507 |
4%|β | 471/11704 [04:22<1:31:56, 2.04it/s]
|
508 |
4%|β | 472/11704 [04:23<1:31:58, 2.04it/s]
|
509 |
4%|β | 473/11704 [04:23<1:32:04, 2.03it/s]
|
510 |
4%|β | 474/11704 [04:24<1:31:58, 2.04it/s]
|
511 |
4%|β | 475/11704 [04:24<1:32:03, 2.03it/s]{'loss': 4.3608, 'grad_norm': 0.7181493043899536, 'learning_rate': 0.00040563620836891546, 'epoch': 0.57}
|
|
|
512 |
|
513 |
4%|β | 475/11704 [04:24<1:32:03, 2.03it/s]
|
514 |
4%|β | 476/11704 [04:25<1:32:02, 2.03it/s]
|
515 |
4%|β | 477/11704 [04:25<1:32:02, 2.03it/s]
|
516 |
4%|β | 478/11704 [04:26<1:32:07, 2.03it/s]
|
517 |
4%|β | 479/11704 [04:26<1:31:59, 2.03it/s]
|
518 |
4%|β | 480/11704 [04:27<1:32:01, 2.03it/s]
|
519 |
4%|β | 481/11704 [04:27<1:32:03, 2.03it/s]
|
520 |
4%|β | 482/11704 [04:28<1:31:58, 2.03it/s]
|
521 |
4%|β | 483/11704 [04:28<1:32:07, 2.03it/s]
|
522 |
4%|β | 484/11704 [04:29<1:32:02, 2.03it/s]
|
523 |
4%|β | 485/11704 [04:29<1:31:57, 2.03it/s]
|
524 |
4%|β | 486/11704 [04:30<1:31:53, 2.03it/s]
|
525 |
4%|β | 487/11704 [04:30<1:31:57, 2.03it/s]
|
526 |
4%|β | 488/11704 [04:31<1:31:56, 2.03it/s]
|
527 |
4%|β | 489/11704 [04:31<1:31:56, 2.03it/s]
|
528 |
4%|β | 490/11704 [04:32<1:31:57, 2.03it/s]
|
529 |
4%|β | 491/11704 [04:32<1:31:54, 2.03it/s]
|
530 |
4%|β | 492/11704 [04:33<1:31:57, 2.03it/s]
|
531 |
4%|β | 493/11704 [04:33<1:31:54, 2.03it/s]
|
532 |
4%|β | 494/11704 [04:34<1:31:55, 2.03it/s]
|
533 |
4%|β | 495/11704 [04:34<1:31:55, 2.03it/s]
|
534 |
4%|β | 496/11704 [04:35<1:31:53, 2.03it/s]
|
535 |
4%|β | 497/11704 [04:35<1:31:52, 2.03it/s]
|
536 |
4%|β | 498/11704 [04:36<1:31:53, 2.03it/s]
|
537 |
4%|β | 499/11704 [04:36<1:31:47, 2.03it/s]
|
538 |
4%|β | 500/11704 [04:37<1:31:42, 2.04it/s]{'loss': 4.2807, 'grad_norm': 0.6647967100143433, 'learning_rate': 0.0004269854824935952, 'epoch': 0.6}
|
|
|
539 |
|
540 |
4%|β | 500/11704 [04:37<1:31:42, 2.04it/s]
|
541 |
4%|β | 501/11704 [04:37<1:31:46, 2.03it/s]
|
542 |
4%|β | 502/11704 [04:38<1:31:41, 2.04it/s]
|
543 |
4%|β | 503/11704 [04:38<1:31:44, 2.03it/s]
|
544 |
4%|β | 504/11704 [04:39<1:31:46, 2.03it/s]
|
545 |
4%|β | 505/11704 [04:39<1:31:44, 2.03it/s]
|
546 |
4%|β | 506/11704 [04:40<1:31:47, 2.03it/s]
|
547 |
4%|β | 507/11704 [04:40<1:31:46, 2.03it/s]
|
548 |
4%|β | 508/11704 [04:41<1:31:43, 2.03it/s]
|
549 |
4%|β | 509/11704 [04:41<1:31:50, 2.03it/s]
|
550 |
4%|β | 510/11704 [04:42<1:31:44, 2.03it/s]
|
551 |
4%|β | 511/11704 [04:42<1:31:40, 2.04it/s]
|
552 |
4%|β | 512/11704 [04:43<1:31:45, 2.03it/s]
|
553 |
4%|β | 513/11704 [04:43<1:31:42, 2.03it/s]
|
554 |
4%|β | 514/11704 [04:44<1:31:42, 2.03it/s]
|
555 |
4%|β | 515/11704 [04:44<1:31:45, 2.03it/s]
|
556 |
4%|β | 516/11704 [04:45<1:31:43, 2.03it/s]
|
557 |
4%|β | 517/11704 [04:45<1:31:48, 2.03it/s]
|
558 |
4%|β | 518/11704 [04:46<1:32:34, 2.01it/s]
|
559 |
4%|β | 519/11704 [04:46<1:32:16, 2.02it/s]
|
560 |
4%|β | 520/11704 [04:47<1:31:57, 2.03it/s]
|
561 |
4%|β | 521/11704 [04:47<1:31:50, 2.03it/s]
|
562 |
4%|β | 522/11704 [04:48<1:32:03, 2.02it/s]
|
563 |
4%|β | 523/11704 [04:48<1:31:50, 2.03it/s]
|
564 |
4%|β | 524/11704 [04:49<1:31:45, 2.03it/s]
|
565 |
4%|β | 525/11704 [04:49<1:31:44, 2.03it/s]{'loss': 4.221, 'grad_norm': 0.5851206183433533, 'learning_rate': 0.00044833475661827497, 'epoch': 0.63}
|
|
|
566 |
|
567 |
4%|β | 525/11704 [04:49<1:31:44, 2.03it/s]
|
568 |
4%|β | 526/11704 [04:50<1:31:38, 2.03it/s]
|
569 |
5%|β | 527/11704 [04:50<1:31:31, 2.04it/s]
|
570 |
5%|β | 528/11704 [04:51<1:31:36, 2.03it/s]
|
571 |
5%|β | 529/11704 [04:51<1:31:34, 2.03it/s]
|
572 |
5%|β | 530/11704 [04:52<1:31:33, 2.03it/s]
|
573 |
5%|β | 531/11704 [04:52<1:31:38, 2.03it/s]
|
574 |
5%|β | 532/11704 [04:53<1:31:36, 2.03it/s]
|
575 |
5%|β | 533/11704 [04:53<1:31:28, 2.04it/s]
|
576 |
5%|β | 534/11704 [04:53<1:31:30, 2.03it/s]
|
577 |
5%|β | 535/11704 [04:54<1:31:34, 2.03it/s]
|
578 |
5%|β | 536/11704 [04:54<1:31:33, 2.03it/s]
|
579 |
5%|β | 537/11704 [04:55<1:31:29, 2.03it/s]
|
580 |
5%|β | 538/11704 [04:55<1:31:32, 2.03it/s]
|
581 |
5%|β | 539/11704 [04:56<1:31:29, 2.03it/s]
|
582 |
5%|β | 540/11704 [04:56<1:31:23, 2.04it/s]
|
583 |
5%|β | 541/11704 [04:57<1:31:26, 2.03it/s]
|
584 |
5%|β | 542/11704 [04:57<1:31:21, 2.04it/s]
|
585 |
5%|β | 543/11704 [04:58<1:31:19, 2.04it/s]
|
586 |
5%|β | 544/11704 [04:58<1:31:27, 2.03it/s]
|
587 |
5%|β | 545/11704 [04:59<1:31:24, 2.03it/s]
|
588 |
5%|β | 546/11704 [04:59<1:31:18, 2.04it/s]
|
589 |
5%|β | 547/11704 [05:00<1:31:20, 2.04it/s]
|
590 |
5%|β | 548/11704 [05:00<1:31:17, 2.04it/s]
|
591 |
5%|β | 549/11704 [05:01<1:31:09, 2.04it/s]
|
592 |
5%|β | 550/11704 [05:01<1:31:18, 2.04it/s]{'loss': 4.1699, 'grad_norm': 0.8667837381362915, 'learning_rate': 0.00046968403074295473, 'epoch': 0.66}
|
|
|
593 |
|
594 |
5%|β | 550/11704 [05:01<1:31:18, 2.04it/s]
|
595 |
5%|β | 551/11704 [05:02<1:31:19, 2.04it/s]
|
596 |
5%|β | 552/11704 [05:02<1:31:17, 2.04it/s]
|
597 |
5%|β | 553/11704 [05:03<1:31:25, 2.03it/s]
|
598 |
5%|β | 554/11704 [05:03<1:31:22, 2.03it/s]
|
599 |
5%|β | 555/11704 [05:04<1:31:22, 2.03it/s]
|
600 |
5%|β | 556/11704 [05:04<1:31:25, 2.03it/s]
|
601 |
5%|β | 557/11704 [05:05<1:31:21, 2.03it/s]
|
602 |
5%|β | 558/11704 [05:05<1:31:20, 2.03it/s]
|
603 |
5%|β | 559/11704 [05:06<1:31:21, 2.03it/s]
|
604 |
5%|β | 560/11704 [05:06<1:31:22, 2.03it/s]
|
605 |
5%|β | 561/11704 [05:07<1:31:22, 2.03it/s]
|
606 |
5%|β | 562/11704 [05:07<1:31:17, 2.03it/s]
|
607 |
5%|β | 563/11704 [05:08<1:31:13, 2.04it/s]
|
608 |
5%|β | 564/11704 [05:08<1:31:18, 2.03it/s]
|
609 |
5%|β | 565/11704 [05:09<1:31:13, 2.03it/s]
|
610 |
5%|β | 566/11704 [05:09<1:31:07, 2.04it/s]
|
611 |
5%|β | 567/11704 [05:10<1:31:11, 2.04it/s]
|
612 |
5%|β | 568/11704 [05:10<1:31:09, 2.04it/s]
|
613 |
5%|β | 569/11704 [05:11<1:31:03, 2.04it/s]
|
614 |
5%|β | 570/11704 [05:11<1:31:11, 2.03it/s]
|
615 |
5%|β | 571/11704 [05:12<1:31:07, 2.04it/s]
|
616 |
5%|β | 572/11704 [05:12<1:31:04, 2.04it/s]
|
617 |
5%|β | 573/11704 [05:13<1:31:09, 2.04it/s]
|
618 |
5%|β | 574/11704 [05:13<1:31:05, 2.04it/s]
|
619 |
5%|β | 575/11704 [05:14<1:31:05, 2.04it/s]{'loss': 4.1233, 'grad_norm': 0.720784604549408, 'learning_rate': 0.0004910333048676345, 'epoch': 0.69}
|
|
|
620 |
|
621 |
5%|β | 575/11704 [05:14<1:31:05, 2.04it/s]
|
622 |
5%|β | 576/11704 [05:14<1:31:14, 2.03it/s]
|
623 |
5%|β | 577/11704 [05:15<1:31:06, 2.04it/s]
|
624 |
5%|β | 578/11704 [05:15<1:31:06, 2.04it/s]
|
625 |
5%|β | 579/11704 [05:16<1:31:10, 2.03it/s]
|
626 |
5%|β | 580/11704 [05:16<1:31:03, 2.04it/s]
|
627 |
5%|β | 581/11704 [05:17<1:31:00, 2.04it/s]
|
628 |
5%|β | 582/11704 [05:17<1:31:08, 2.03it/s]
|
629 |
5%|β | 583/11704 [05:18<1:31:06, 2.03it/s]
|
630 |
5%|β | 584/11704 [05:18<1:31:09, 2.03it/s]
|
631 |
5%|β | 585/11704 [05:19<1:31:07, 2.03it/s]
|
632 |
5%|β | 586/11704 [05:19<1:31:03, 2.04it/s]
|
633 |
5%|β | 587/11704 [05:20<1:31:06, 2.03it/s]
|
634 |
5%|β | 588/11704 [05:20<1:31:03, 2.03it/s]
|
635 |
5%|β | 589/11704 [05:21<1:30:56, 2.04it/s]
|
636 |
5%|β | 590/11704 [05:21<1:31:02, 2.03it/s]
|
637 |
5%|β | 591/11704 [05:21<1:31:02, 2.03it/s]
|
638 |
5%|β | 592/11704 [05:22<1:30:59, 2.04it/s]
|
639 |
5%|β | 593/11704 [05:22<1:31:05, 2.03it/s]
|
640 |
5%|β | 594/11704 [05:23<1:31:00, 2.03it/s]
|
641 |
5%|β | 595/11704 [05:23<1:30:57, 2.04it/s]
|
642 |
5%|β | 596/11704 [05:24<1:30:54, 2.04it/s]
|
643 |
5%|β | 597/11704 [05:24<1:30:55, 2.04it/s]
|
644 |
5%|β | 598/11704 [05:25<1:30:59, 2.03it/s]
|
645 |
5%|β | 599/11704 [05:25<1:31:07, 2.03it/s]
|
646 |
5%|β | 600/11704 [05:26<1:31:00, 2.03it/s]{'loss': 4.0671, 'grad_norm': 0.5922385454177856, 'learning_rate': 0.0005123825789923142, 'epoch': 0.72}
|
|
|
647 |
|
648 |
5%|β | 600/11704 [05:26<1:31:00, 2.03it/s]
|
649 |
5%|β | 601/11704 [05:26<1:31:07, 2.03it/s]
|
650 |
5%|β | 602/11704 [05:27<1:31:07, 2.03it/s]
|
651 |
5%|β | 603/11704 [05:27<1:30:58, 2.03it/s]
|
652 |
5%|β | 604/11704 [05:28<1:31:02, 2.03it/s]
|
653 |
5%|β | 605/11704 [05:28<1:30:57, 2.03it/s]
|
654 |
5%|β | 606/11704 [05:29<1:30:54, 2.03it/s]
|
655 |
5%|β | 607/11704 [05:29<1:31:00, 2.03it/s]
|
656 |
5%|β | 608/11704 [05:30<1:30:53, 2.03it/s]
|
657 |
5%|β | 609/11704 [05:30<1:30:51, 2.04it/s]
|
658 |
5%|β | 610/11704 [05:31<1:30:58, 2.03it/s]
|
659 |
5%|β | 611/11704 [05:31<1:30:50, 2.04it/s]
|
660 |
5%|β | 612/11704 [05:32<1:30:44, 2.04it/s]
|
661 |
5%|β | 613/11704 [05:32<1:30:56, 2.03it/s]
|
662 |
5%|β | 614/11704 [05:33<1:30:48, 2.04it/s]
|
663 |
5%|β | 615/11704 [05:33<1:39:03, 1.87it/s]
|
664 |
5%|β | 616/11704 [05:34<1:44:26, 1.77it/s]
|
665 |
5%|β | 617/11704 [05:35<1:40:15, 1.84it/s]
|
666 |
5%|β | 618/11704 [05:35<1:37:27, 1.90it/s]
|
667 |
5%|β | 619/11704 [05:36<1:35:28, 1.93it/s]
|
668 |
5%|β | 620/11704 [05:36<1:33:54, 1.97it/s]
|
669 |
5%|β | 621/11704 [05:37<1:33:01, 1.99it/s]
|
670 |
5%|β | 622/11704 [05:37<1:32:24, 2.00it/s]
|
671 |
5%|β | 623/11704 [05:38<1:31:51, 2.01it/s]
|
672 |
5%|β | 624/11704 [05:38<1:31:28, 2.02it/s]
|
673 |
5%|β | 625/11704 [05:38<1:31:20, 2.02it/s]{'loss': 4.0331, 'grad_norm': 0.5989447236061096, 'learning_rate': 0.000533731853116994, 'epoch': 0.75}
|
|
|
674 |
|
675 |
5%|β | 625/11704 [05:39<1:31:20, 2.02it/s]
|
676 |
5%|β | 626/11704 [05:39<1:31:10, 2.03it/s]
|
677 |
5%|β | 627/11704 [05:39<1:30:59, 2.03it/s]
|
678 |
5%|β | 628/11704 [05:40<1:31:00, 2.03it/s]
|
679 |
5%|β | 629/11704 [05:40<1:30:50, 2.03it/s]
|
680 |
5%|β | 630/11704 [05:41<1:30:48, 2.03it/s]
|
681 |
5%|β | 631/11704 [05:41<1:30:49, 2.03it/s]
|
682 |
5%|β | 632/11704 [05:42<1:30:45, 2.03it/s]
|
683 |
5%|β | 633/11704 [05:42<1:30:51, 2.03it/s]
|
684 |
5%|β | 634/11704 [05:43<1:30:44, 2.03it/s]
|
685 |
5%|β | 635/11704 [05:43<1:30:39, 2.04it/s]
|
686 |
5%|β | 636/11704 [05:44<1:30:46, 2.03it/s]
|
687 |
5%|β | 637/11704 [05:44<1:30:45, 2.03it/s]
|
688 |
5%|β | 638/11704 [05:45<1:30:43, 2.03it/s]
|
689 |
5%|β | 639/11704 [05:45<1:30:43, 2.03it/s]
|
690 |
5%|β | 640/11704 [05:46<1:30:38, 2.03it/s]
|
691 |
5%|β | 641/11704 [05:46<1:30:40, 2.03it/s]
|
692 |
5%|β | 642/11704 [05:47<1:30:42, 2.03it/s]
|
693 |
5%|β | 643/11704 [05:47<1:30:39, 2.03it/s]
|
694 |
6%|β | 644/11704 [05:48<1:30:44, 2.03it/s]
|
695 |
6%|β | 645/11704 [05:48<1:30:35, 2.03it/s]
|
696 |
6%|β | 646/11704 [05:49<1:30:40, 2.03it/s]
|
697 |
6%|β | 647/11704 [05:49<1:30:42, 2.03it/s]
|
698 |
6%|β | 648/11704 [05:50<1:30:35, 2.03it/s]
|
699 |
6%|β | 649/11704 [05:50<1:30:36, 2.03it/s]
|
700 |
6%|β | 650/11704 [05:51<1:30:42, 2.03it/s]{'loss': 3.9926, 'grad_norm': 0.630885899066925, 'learning_rate': 0.0005550811272416738, 'epoch': 0.78}
|
|
|
701 |
|
702 |
6%|β | 650/11704 [05:51<1:30:42, 2.03it/s]
|
703 |
6%|β | 651/11704 [05:51<1:30:40, 2.03it/s]
|
704 |
6%|β | 652/11704 [05:52<1:30:43, 2.03it/s]
|
705 |
6%|β | 653/11704 [05:52<1:30:36, 2.03it/s]
|
706 |
6%|β | 654/11704 [05:53<1:30:33, 2.03it/s]
|
707 |
6%|β | 655/11704 [05:53<1:30:43, 2.03it/s]
|
708 |
6%|β | 656/11704 [05:54<1:30:37, 2.03it/s]
|
709 |
6%|β | 657/11704 [05:54<1:30:37, 2.03it/s]
|
710 |
6%|β | 658/11704 [05:55<1:30:42, 2.03it/s]
|
711 |
6%|β | 659/11704 [05:55<1:30:42, 2.03it/s]
|
712 |
6%|β | 660/11704 [05:56<1:30:41, 2.03it/s]
|
713 |
6%|β | 661/11704 [05:56<1:30:43, 2.03it/s]
|
714 |
6%|β | 662/11704 [05:57<1:30:33, 2.03it/s]
|
715 |
6%|β | 663/11704 [05:57<1:30:33, 2.03it/s]
|
716 |
6%|β | 664/11704 [05:58<1:30:35, 2.03it/s]
|
717 |
6%|β | 665/11704 [05:58<1:30:35, 2.03it/s]
|
718 |
6%|β | 666/11704 [05:59<1:30:29, 2.03it/s]
|
719 |
6%|β | 667/11704 [05:59<1:30:36, 2.03it/s]
|
720 |
6%|β | 668/11704 [06:00<1:30:36, 2.03it/s]
|
721 |
6%|β | 669/11704 [06:00<1:30:32, 2.03it/s]
|
722 |
6%|β | 670/11704 [06:01<1:30:35, 2.03it/s]
|
723 |
6%|β | 671/11704 [06:01<1:30:30, 2.03it/s]
|
724 |
6%|β | 672/11704 [06:02<1:30:28, 2.03it/s]
|
725 |
6%|β | 673/11704 [06:02<1:30:26, 2.03it/s]
|
726 |
6%|β | 674/11704 [06:03<1:30:24, 2.03it/s]
|
727 |
6%|β | 675/11704 [06:03<1:30:22, 2.03it/s]{'loss': 3.9532, 'grad_norm': 0.6419950723648071, 'learning_rate': 0.0005764304013663536, 'epoch': 0.81}
|
|
|
728 |
|
729 |
6%|β | 675/11704 [06:03<1:30:22, 2.03it/s]
|
730 |
6%|β | 676/11704 [06:04<1:30:31, 2.03it/s]
|
731 |
6%|β | 677/11704 [06:04<1:30:28, 2.03it/s]
|
732 |
6%|β | 678/11704 [06:05<1:30:33, 2.03it/s]
|
733 |
6%|β | 679/11704 [06:05<1:30:34, 2.03it/s]
|
734 |
6%|β | 680/11704 [06:06<1:30:24, 2.03it/s]
|
735 |
6%|β | 681/11704 [06:06<1:30:30, 2.03it/s]
|
736 |
6%|β | 682/11704 [06:07<1:30:29, 2.03it/s]
|
737 |
6%|β | 683/11704 [06:07<1:30:31, 2.03it/s]
|
738 |
6%|β | 684/11704 [06:08<1:30:31, 2.03it/s]
|
739 |
6%|β | 685/11704 [06:08<1:30:25, 2.03it/s]
|
740 |
6%|β | 686/11704 [06:09<1:30:28, 2.03it/s]
|
741 |
6%|β | 687/11704 [06:09<1:30:25, 2.03it/s]
|
742 |
6%|β | 688/11704 [06:10<1:30:24, 2.03it/s]
|
743 |
6%|β | 689/11704 [06:10<1:30:25, 2.03it/s]
|
744 |
6%|β | 690/11704 [06:10<1:30:24, 2.03it/s]
|
745 |
6%|β | 691/11704 [06:11<1:30:24, 2.03it/s]
|
746 |
6%|β | 692/11704 [06:11<1:30:19, 2.03it/s]
|
747 |
6%|β | 693/11704 [06:12<1:30:15, 2.03it/s]
|
748 |
6%|β | 694/11704 [06:12<1:30:20, 2.03it/s]
|
749 |
6%|β | 695/11704 [06:13<1:30:17, 2.03it/s]
|
750 |
6%|β | 696/11704 [06:13<1:30:23, 2.03it/s]
|
751 |
6%|β | 697/11704 [06:14<1:30:18, 2.03it/s]
|
752 |
6%|β | 698/11704 [06:14<1:30:13, 2.03it/s]
|
753 |
6%|β | 699/11704 [06:15<1:30:15, 2.03it/s]
|
754 |
6%|β | 700/11704 [06:15<1:30:13, 2.03it/s]{'loss': 3.9036, 'grad_norm': 0.5995571613311768, 'learning_rate': 0.0005977796754910333, 'epoch': 0.84}
|
|
|
755 |
|
756 |
6%|β | 700/11704 [06:15<1:30:13, 2.03it/s]
|
757 |
6%|β | 701/11704 [06:16<1:30:21, 2.03it/s]
|
758 |
6%|β | 702/11704 [06:16<1:30:22, 2.03it/s]
|
759 |
6%|β | 703/11704 [06:17<1:30:16, 2.03it/s]
|
760 |
6%|β | 704/11704 [06:17<1:30:19, 2.03it/s]
|
761 |
6%|β | 705/11704 [06:18<1:30:16, 2.03it/s]
|
762 |
6%|β | 706/11704 [06:18<1:30:18, 2.03it/s]
|
763 |
6%|β | 707/11704 [06:19<1:30:13, 2.03it/s]
|
764 |
6%|β | 708/11704 [06:19<1:30:10, 2.03it/s]
|
765 |
6%|β | 709/11704 [06:20<1:30:16, 2.03it/s]
|
766 |
6%|β | 710/11704 [06:20<1:30:10, 2.03it/s]
|
767 |
6%|β | 711/11704 [06:21<1:30:22, 2.03it/s]
|
768 |
6%|β | 712/11704 [06:21<1:30:14, 2.03it/s]
|
769 |
6%|β | 713/11704 [06:22<1:30:10, 2.03it/s]
|
770 |
6%|β | 714/11704 [06:22<1:30:16, 2.03it/s]
|
771 |
6%|β | 715/11704 [06:23<1:30:10, 2.03it/s]
|
772 |
6%|β | 716/11704 [06:23<1:30:13, 2.03it/s]
|
773 |
6%|β | 717/11704 [06:24<1:30:17, 2.03it/s]
|
774 |
6%|β | 718/11704 [06:24<1:30:09, 2.03it/s]
|
775 |
6%|β | 719/11704 [06:25<1:30:13, 2.03it/s]
|
776 |
6%|β | 720/11704 [06:25<1:30:05, 2.03it/s]
|
777 |
6%|β | 721/11704 [06:26<1:30:09, 2.03it/s]
|
778 |
6%|β | 722/11704 [06:26<1:30:09, 2.03it/s]
|
779 |
6%|β | 723/11704 [06:27<1:30:08, 2.03it/s]
|
780 |
6%|β | 724/11704 [06:27<1:30:10, 2.03it/s]
|
781 |
6%|β | 725/11704 [06:28<1:30:07, 2.03it/s]{'loss': 3.8761, 'grad_norm': 0.5167131423950195, 'learning_rate': 0.000619128949615713, 'epoch': 0.87}
|
|
|
782 |
|
783 |
6%|β | 725/11704 [06:28<1:30:07, 2.03it/s]
|
784 |
6%|β | 726/11704 [06:28<1:30:10, 2.03it/s]
|
785 |
6%|β | 727/11704 [06:29<1:30:07, 2.03it/s]
|
786 |
6%|β | 728/11704 [06:29<1:30:06, 2.03it/s]
|
787 |
6%|β | 729/11704 [06:30<1:30:10, 2.03it/s]
|
788 |
6%|β | 730/11704 [06:30<1:30:08, 2.03it/s]
|
789 |
6%|β | 731/11704 [06:31<1:30:10, 2.03it/s]
|
790 |
6%|β | 732/11704 [06:31<1:30:04, 2.03it/s]
|
791 |
6%|β | 733/11704 [06:32<1:30:01, 2.03it/s]
|
792 |
6%|β | 734/11704 [06:32<1:30:04, 2.03it/s]
|
793 |
6%|β | 735/11704 [06:33<1:29:58, 2.03it/s]
|
794 |
6%|β | 736/11704 [06:33<1:30:04, 2.03it/s]
|
795 |
6%|β | 737/11704 [06:34<1:30:03, 2.03it/s]
|
796 |
6%|β | 738/11704 [06:34<1:30:02, 2.03it/s]
|
797 |
6%|β | 739/11704 [06:35<1:30:03, 2.03it/s]
|
798 |
6%|β | 740/11704 [06:35<1:29:57, 2.03it/s]
|
799 |
6%|β | 741/11704 [06:36<1:30:03, 2.03it/s]
|
800 |
6%|β | 742/11704 [06:36<1:30:01, 2.03it/s]
|
801 |
6%|β | 743/11704 [06:37<1:29:59, 2.03it/s]
|
802 |
6%|β | 744/11704 [06:37<1:30:00, 2.03it/s]
|
803 |
6%|β | 745/11704 [06:38<1:29:54, 2.03it/s]
|
804 |
6%|β | 746/11704 [06:38<1:30:01, 2.03it/s]
|
805 |
6%|β | 747/11704 [06:39<1:30:00, 2.03it/s]
|
806 |
6%|β | 748/11704 [06:39<1:29:55, 2.03it/s]
|
807 |
6%|β | 749/11704 [06:40<1:30:01, 2.03it/s]
|
808 |
6%|β | 750/11704 [06:40<1:29:54, 2.03it/s]{'loss': 3.851, 'grad_norm': 0.5299991369247437, 'learning_rate': 0.0006404782237403928, 'epoch': 0.9}
|
|
|
809 |
|
810 |
6%|β | 750/11704 [06:40<1:29:54, 2.03it/s]
|
811 |
6%|β | 751/11704 [06:41<1:29:57, 2.03it/s]
|
812 |
6%|β | 752/11704 [06:41<1:30:00, 2.03it/s]
|
813 |
6%|β | 753/11704 [06:42<1:29:52, 2.03it/s]
|
814 |
6%|β | 754/11704 [06:42<1:30:01, 2.03it/s]
|
815 |
6%|β | 755/11704 [06:43<1:29:53, 2.03it/s]
|
816 |
6%|β | 756/11704 [06:43<1:29:51, 2.03it/s]
|
817 |
6%|β | 757/11704 [06:43<1:29:52, 2.03it/s]
|
818 |
6%|β | 758/11704 [06:44<1:29:51, 2.03it/s]
|
819 |
6%|β | 759/11704 [06:44<1:29:56, 2.03it/s]
|
820 |
6%|β | 760/11704 [06:45<1:29:50, 2.03it/s]
|
821 |
7%|β | 761/11704 [06:45<1:29:48, 2.03it/s]
|
822 |
7%|β | 762/11704 [06:46<1:29:55, 2.03it/s]
|
823 |
7%|β | 763/11704 [06:46<1:29:48, 2.03it/s]
|
824 |
7%|β | 764/11704 [06:47<1:29:48, 2.03it/s]
|
825 |
7%|β | 765/11704 [06:47<1:29:54, 2.03it/s]
|
826 |
7%|β | 766/11704 [06:48<1:29:47, 2.03it/s]
|
827 |
7%|β | 767/11704 [06:48<1:29:56, 2.03it/s]
|
828 |
7%|β | 768/11704 [06:49<1:29:52, 2.03it/s]
|
829 |
7%|β | 769/11704 [06:49<1:29:52, 2.03it/s]
|
830 |
7%|β | 770/11704 [06:50<1:29:50, 2.03it/s]
|
831 |
7%|β | 771/11704 [06:50<1:29:44, 2.03it/s]
|
832 |
7%|β | 772/11704 [06:51<1:29:44, 2.03it/s]
|
833 |
7%|β | 773/11704 [06:51<1:29:36, 2.03it/s]
|
834 |
7%|β | 774/11704 [06:52<1:29:42, 2.03it/s]
|
835 |
7%|β | 775/11704 [06:52<1:29:42, 2.03it/s]{'loss': 3.7964, 'grad_norm': 0.5229047536849976, 'learning_rate': 0.0006618274978650726, 'epoch': 0.93}
|
|
|
836 |
|
837 |
7%|β | 775/11704 [06:52<1:29:42, 2.03it/s]
|
838 |
7%|β | 776/11704 [06:53<1:29:43, 2.03it/s]
|
839 |
7%|β | 777/11704 [06:53<1:29:50, 2.03it/s]
|
840 |
7%|β | 778/11704 [06:54<1:29:41, 2.03it/s]
|
841 |
7%|β | 779/11704 [06:54<1:29:43, 2.03it/s]
|
842 |
7%|β | 780/11704 [06:55<1:29:41, 2.03it/s]
|
843 |
7%|β | 781/11704 [06:55<1:29:40, 2.03it/s]
|
844 |
7%|β | 782/11704 [06:56<1:29:39, 2.03it/s]
|
845 |
7%|β | 783/11704 [06:56<1:29:35, 2.03it/s]
|
846 |
7%|β | 784/11704 [06:57<1:29:34, 2.03it/s]
|
847 |
7%|β | 785/11704 [06:57<1:29:40, 2.03it/s]
|
848 |
7%|β | 786/11704 [06:58<1:29:40, 2.03it/s]
|
849 |
7%|β | 787/11704 [06:58<1:29:44, 2.03it/s]
|
850 |
7%|β | 788/11704 [06:59<1:29:40, 2.03it/s]
|
851 |
7%|β | 789/11704 [06:59<1:29:42, 2.03it/s]
|
852 |
7%|β | 790/11704 [07:00<1:29:42, 2.03it/s]
|
853 |
7%|β | 791/11704 [07:00<1:29:41, 2.03it/s]
|
854 |
7%|β | 792/11704 [07:01<1:29:39, 2.03it/s]
|
855 |
7%|β | 793/11704 [07:01<1:29:34, 2.03it/s]
|
856 |
7%|β | 794/11704 [07:02<1:29:26, 2.03it/s]
|
857 |
7%|β | 795/11704 [07:02<1:29:29, 2.03it/s]
|
858 |
7%|β | 796/11704 [07:03<1:29:29, 2.03it/s]
|
859 |
7%|β | 797/11704 [07:03<1:29:29, 2.03it/s]
|
860 |
7%|β | 798/11704 [07:04<1:29:32, 2.03it/s]
|
861 |
7%|β | 799/11704 [07:04<1:29:33, 2.03it/s]
|
862 |
7%|β | 800/11704 [07:05<1:29:32, 2.03it/s]{'loss': 3.7888, 'grad_norm': 0.48627805709838867, 'learning_rate': 0.0006831767719897524, 'epoch': 0.96}
|
|
|
863 |
|
864 |
7%|β | 800/11704 [07:05<1:29:32, 2.03it/s]
|
865 |
7%|β | 801/11704 [07:05<1:29:38, 2.03it/s]
|
866 |
7%|β | 802/11704 [07:06<1:29:34, 2.03it/s]
|
867 |
7%|β | 803/11704 [07:06<1:29:34, 2.03it/s]
|
868 |
7%|β | 804/11704 [07:07<1:29:28, 2.03it/s]
|
869 |
7%|β | 805/11704 [07:07<1:29:23, 2.03it/s]
|
870 |
7%|β | 806/11704 [07:08<1:29:21, 2.03it/s]
|
871 |
7%|β | 807/11704 [07:08<1:29:29, 2.03it/s]
|
872 |
7%|β | 808/11704 [07:09<1:29:27, 2.03it/s]
|
873 |
7%|β | 809/11704 [07:09<1:29:26, 2.03it/s]
|
874 |
7%|β | 810/11704 [07:10<1:29:27, 2.03it/s]
|
875 |
7%|β | 811/11704 [07:10<1:29:23, 2.03it/s]
|
876 |
7%|β | 812/11704 [07:11<1:29:25, 2.03it/s]
|
877 |
7%|β | 813/11704 [07:11<1:29:24, 2.03it/s]
|
878 |
7%|β | 814/11704 [07:12<1:29:16, 2.03it/s]
|
879 |
7%|β | 815/11704 [07:12<1:29:08, 2.04it/s]
|
880 |
7%|β | 816/11704 [07:13<1:29:13, 2.03it/s]
|
881 |
7%|β | 817/11704 [07:13<1:29:13, 2.03it/s]
|
882 |
7%|β | 818/11704 [07:14<1:29:15, 2.03it/s]
|
883 |
7%|β | 819/11704 [07:14<1:29:19, 2.03it/s]
|
884 |
7%|β | 820/11704 [07:15<1:29:12, 2.03it/s]
|
885 |
7%|β | 821/11704 [07:15<1:29:15, 2.03it/s]
|
886 |
7%|β | 822/11704 [07:16<1:29:14, 2.03it/s]
|
887 |
7%|β | 823/11704 [07:16<1:29:12, 2.03it/s]
|
888 |
7%|β | 824/11704 [07:16<1:29:17, 2.03it/s]
|
889 |
7%|β | 825/11704 [07:17<1:29:15, 2.03it/s]{'loss': 3.7506, 'grad_norm': 0.5719705820083618, 'learning_rate': 0.0007045260461144322, 'epoch': 0.99}
|
|
|
890 |
|
891 |
7%|β | 825/11704 [07:17<1:29:15, 2.03it/s]
|
892 |
7%|β | 826/11704 [07:17<1:29:22, 2.03it/s]
|
893 |
7%|β | 827/11704 [07:18<1:29:19, 2.03it/s]
|
894 |
7%|β | 828/11704 [07:18<1:29:13, 2.03it/s]
|
895 |
7%|β | 829/11704 [07:19<1:29:19, 2.03it/s]
|
896 |
7%|β | 830/11704 [07:19<1:29:18, 2.03it/s]
|
897 |
7%|β | 831/11704 [07:20<1:29:20, 2.03it/s]
|
898 |
7%|β | 832/11704 [07:20<1:29:17, 2.03it/s]
|
899 |
7%|β | 833/11704 [07:21<1:29:13, 2.03it/s]
|
900 |
7%|β | 834/11704 [07:21<1:29:20, 2.03it/s]
|
901 |
7%|β | 835/11704 [07:22<1:29:11, 2.03it/s]
|
902 |
7%|β | 836/11704 [07:22<1:31:45, 1.97it/s]
|
903 |
7%|β | 837/11704 [07:34<11:50:27, 3.92s/it]
|
904 |
7%|β | 838/11704 [07:35<8:44:46, 2.90s/it]
|
905 |
7%|β | 839/11704 [07:35<6:33:59, 2.18s/it]
|
906 |
7%|β | 840/11704 [07:36<5:02:24, 1.67s/it]
|
907 |
7%|β | 841/11704 [07:36<3:58:30, 1.32s/it]
|
908 |
7%|β | 842/11704 [07:37<3:13:48, 1.07s/it]
|
909 |
7%|β | 843/11704 [07:37<2:42:26, 1.11it/s]
|
910 |
7%|β | 844/11704 [07:38<2:20:22, 1.29it/s]
|
911 |
7%|β | 845/11704 [07:38<2:05:46, 1.44it/s]
|
912 |
7%|β | 846/11704 [07:39<1:54:47, 1.58it/s]
|
913 |
7%|β | 847/11704 [07:39<1:47:02, 1.69it/s]
|
914 |
7%|β | 848/11704 [07:40<1:41:37, 1.78it/s]
|
915 |
7%|β | 849/11704 [07:40<1:37:49, 1.85it/s]
|
916 |
7%|β | 850/11704 [07:41<1:35:06, 1.90it/s]{'loss': 3.7125, 'grad_norm': 0.48182985186576843, 'learning_rate': 0.0007258753202391119, 'epoch': 1.02}
|
|
|
917 |
|
918 |
7%|β | 850/11704 [07:41<1:35:06, 1.90it/s]
|
919 |
7%|β | 851/11704 [07:41<1:33:29, 1.93it/s]
|
920 |
7%|β | 852/11704 [07:42<1:32:09, 1.96it/s]
|
921 |
7%|β | 853/11704 [07:42<1:31:05, 1.99it/s]
|
922 |
7%|β | 854/11704 [07:43<1:30:25, 2.00it/s]
|
923 |
7%|β | 855/11704 [07:43<1:29:58, 2.01it/s]
|
924 |
7%|β | 856/11704 [07:44<1:29:45, 2.01it/s]
|
925 |
7%|β | 857/11704 [07:44<1:29:20, 2.02it/s]
|
926 |
7%|β | 858/11704 [07:45<1:29:19, 2.02it/s]
|
927 |
7%|β | 859/11704 [07:45<1:29:09, 2.03it/s]
|
928 |
7%|β | 860/11704 [07:46<1:29:11, 2.03it/s]
|
929 |
7%|β | 861/11704 [07:46<1:29:02, 2.03it/s]
|
930 |
7%|β | 862/11704 [07:47<1:28:59, 2.03it/s]
|
931 |
7%|β | 863/11704 [07:47<1:28:54, 2.03it/s]
|
932 |
7%|β | 864/11704 [07:48<1:28:51, 2.03it/s]
|
933 |
7%|β | 865/11704 [07:48<1:28:48, 2.03it/s]
|
934 |
7%|β | 866/11704 [07:49<1:28:57, 2.03it/s]
|
935 |
7%|β | 867/11704 [07:49<1:28:59, 2.03it/s]
|
936 |
7%|β | 868/11704 [07:50<1:28:51, 2.03it/s]
|
937 |
7%|β | 869/11704 [07:50<1:28:52, 2.03it/s]
|
938 |
7%|β | 870/11704 [07:51<1:28:50, 2.03it/s]
|
939 |
7%|β | 871/11704 [07:51<1:28:46, 2.03it/s]
|
940 |
7%|β | 872/11704 [07:52<1:28:53, 2.03it/s]
|
|
|
1 |
+
slurm submission log: 2024-05-25 22:01:15.607519
|
2 |
+
created following sbatch script:
|
3 |
+
|
4 |
+
###############################
|
5 |
+
|
6 |
+
#!/bin/bash
|
7 |
+
|
8 |
+
#SBATCH --account=nlp
|
9 |
+
#SBATCH --cpus-per-task=16
|
10 |
+
#SBATCH --dependency=afterok:7651385
|
11 |
+
#SBATCH --gres=gpu:2
|
12 |
+
#SBATCH --job-name=tthrush-job-339895
|
13 |
+
#SBATCH --mem=100G
|
14 |
+
#SBATCH --nodelist=sphinx2
|
15 |
+
#SBATCH --open-mode=append
|
16 |
+
#SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/llms/pythia-70m_xnli_es_1/train_job_output.txt
|
17 |
+
#SBATCH --partition=sphinx
|
18 |
+
#SBATCH --time=14-0
|
19 |
+
|
20 |
+
# activate your desired anaconda environment
|
21 |
+
. /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection
|
22 |
+
|
23 |
+
# cd to working directory
|
24 |
+
cd .
|
25 |
+
|
26 |
+
# launch commands
|
27 |
+
srun --unbuffered run_as_child_processes 'torchrun --master_port 29508 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/data/xnli_es --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/llms/pythia-70m_xnli_es_1 --output_hub_id pythia-70m_xnli_es --model_id EleutherAI/pythia-70m --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2 --per_device_train_batch_size 256 --seed 1 --num_train_epochs 14'
|
28 |
+
|
29 |
+
###############################
|
30 |
+
|
31 |
+
submission to slurm complete!
|
32 |
+
|
33 |
+
|
34 |
+
###############################
|
35 |
+
slurm submission output
|
36 |
+
|
37 |
+
Submitted batch job 7651386
|
38 |
+
|
39 |
+
|
40 |
+
|
41 |
+
###############################
|
42 |
+
|
43 |
+
slurm submission log: 2024-05-25 22:02:24.879435
|
44 |
+
created following sbatch script:
|
45 |
+
|
46 |
+
###############################
|
47 |
+
|
48 |
+
#!/bin/bash
|
49 |
+
|
50 |
+
#SBATCH --account=nlp
|
51 |
+
#SBATCH --cpus-per-task=16
|
52 |
+
#SBATCH --dependency=afterok:7651416
|
53 |
+
#SBATCH --gres=gpu:2
|
54 |
+
#SBATCH --job-name=tthrush-job-1467582
|
55 |
+
#SBATCH --mem=100G
|
56 |
+
#SBATCH --nodelist=sphinx2
|
57 |
+
#SBATCH --open-mode=append
|
58 |
+
#SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/llms/pythia-70m_xnli_es_1/train_job_output.txt
|
59 |
+
#SBATCH --partition=sphinx
|
60 |
+
#SBATCH --time=14-0
|
61 |
+
|
62 |
+
# activate your desired anaconda environment
|
63 |
+
. /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection
|
64 |
+
|
65 |
+
# cd to working directory
|
66 |
+
cd .
|
67 |
+
|
68 |
+
# launch commands
|
69 |
+
srun --unbuffered run_as_child_processes 'torchrun --master_port 29508 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/data/xnli_es --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/llms/pythia-70m_xnli_es_1 --output_hub_id pythia-70m_xnli_es --model_id EleutherAI/pythia-70m --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2 --per_device_train_batch_size 256 --seed 1 --num_train_epochs 14'
|
70 |
+
|
71 |
+
###############################
|
72 |
+
|
73 |
+
submission to slurm complete!
|
74 |
+
|
75 |
+
|
76 |
+
###############################
|
77 |
+
slurm submission output
|
78 |
+
|
79 |
+
Submitted batch job 7651417
|
80 |
+
|
81 |
+
|
82 |
+
|
83 |
+
###############################
|
84 |
+
|
85 |
+
slurm submission log: 2024-05-25 22:12:50.153381
|
86 |
+
created following sbatch script:
|
87 |
+
|
88 |
+
###############################
|
89 |
+
|
90 |
+
#!/bin/bash
|
91 |
+
|
92 |
+
#SBATCH --account=nlp
|
93 |
+
#SBATCH --cpus-per-task=16
|
94 |
+
#SBATCH --dependency=afterok:7651457
|
95 |
+
#SBATCH --gres=gpu:2
|
96 |
+
#SBATCH --job-name=tthrush-job-1098424
|
97 |
+
#SBATCH --mem=100G
|
98 |
+
#SBATCH --nodelist=sphinx2
|
99 |
+
#SBATCH --open-mode=append
|
100 |
+
#SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/llms/pythia-70m_xnli_es_1/train_job_output.txt
|
101 |
+
#SBATCH --partition=sphinx
|
102 |
+
#SBATCH --time=14-0
|
103 |
+
|
104 |
+
# activate your desired anaconda environment
|
105 |
+
. /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection
|
106 |
+
|
107 |
+
# cd to working directory
|
108 |
+
cd .
|
109 |
+
|
110 |
+
# launch commands
|
111 |
+
srun --unbuffered run_as_child_processes 'torchrun --master_port 29508 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/data/xnli_es --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/llms/pythia-70m_xnli_es_1 --output_hub_id pythia-70m_xnli_es --model_id EleutherAI/pythia-70m --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2 --per_device_train_batch_size 256 --seed 1 --num_train_epochs 14'
|
112 |
+
|
113 |
+
###############################
|
114 |
+
|
115 |
+
submission to slurm complete!
|
116 |
+
|
117 |
+
|
118 |
+
###############################
|
119 |
+
slurm submission output
|
120 |
+
|
121 |
+
Submitted batch job 7651458
|
122 |
+
|
123 |
+
|
124 |
+
|
125 |
+
###############################
|
126 |
+
|
127 |
+
slurm submission log: 2024-05-25 22:15:55.753417
|
128 |
+
created following sbatch script:
|
129 |
+
|
130 |
+
###############################
|
131 |
+
|
132 |
+
#!/bin/bash
|
133 |
+
|
134 |
+
#SBATCH --account=nlp
|
135 |
+
#SBATCH --cpus-per-task=16
|
136 |
+
#SBATCH --dependency=afterok:7651485
|
137 |
+
#SBATCH --gres=gpu:2
|
138 |
+
#SBATCH --job-name=tthrush-job-3438810
|
139 |
+
#SBATCH --mem=100G
|
140 |
+
#SBATCH --nodelist=sphinx2
|
141 |
+
#SBATCH --open-mode=append
|
142 |
+
#SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/llms/pythia-70m_xnli_es_1/train_job_output.txt
|
143 |
+
#SBATCH --partition=sphinx
|
144 |
+
#SBATCH --time=14-0
|
145 |
+
|
146 |
+
# activate your desired anaconda environment
|
147 |
+
. /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection
|
148 |
+
|
149 |
+
# cd to working directory
|
150 |
+
cd .
|
151 |
+
|
152 |
+
# launch commands
|
153 |
+
srun --unbuffered run_as_child_processes 'torchrun --master_port 29508 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/data/xnli_es --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/llms/pythia-70m_xnli_es_1 --output_hub_id pythia-70m_xnli_es --model_id EleutherAI/pythia-70m --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2 --per_device_train_batch_size 256 --seed 1 --num_train_epochs 14'
|
154 |
+
|
155 |
+
###############################
|
156 |
+
|
157 |
+
submission to slurm complete!
|
158 |
+
|
159 |
+
|
160 |
+
###############################
|
161 |
+
slurm submission output
|
162 |
+
|
163 |
+
Submitted batch job 7651486
|
164 |
+
|
165 |
+
|
166 |
+
|
167 |
+
###############################
|
168 |
+
|
169 |
+
slurm submission log: 2024-05-25 22:18:14.763422
|
170 |
+
created following sbatch script:
|
171 |
+
|
172 |
+
###############################
|
173 |
+
|
174 |
+
#!/bin/bash
|
175 |
+
|
176 |
+
#SBATCH --account=nlp
|
177 |
+
#SBATCH --cpus-per-task=16
|
178 |
+
#SBATCH --dependency=afterok:7651515
|
179 |
+
#SBATCH --gres=gpu:2
|
180 |
+
#SBATCH --job-name=tthrush-job-3850774
|
181 |
+
#SBATCH --mem=100G
|
182 |
+
#SBATCH --nodelist=sphinx2
|
183 |
+
#SBATCH --open-mode=append
|
184 |
+
#SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/llms/pythia-70m_xnli_es_1/train_job_output.txt
|
185 |
+
#SBATCH --partition=sphinx
|
186 |
+
#SBATCH --time=14-0
|
187 |
+
|
188 |
+
# activate your desired anaconda environment
|
189 |
+
. /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection
|
190 |
+
|
191 |
+
# cd to working directory
|
192 |
+
cd .
|
193 |
+
|
194 |
+
# launch commands
|
195 |
+
srun --unbuffered run_as_child_processes 'torchrun --master_port 29508 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/data/xnli_es --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/llms/pythia-70m_xnli_es_1 --output_hub_id pythia-70m_xnli_es --model_id EleutherAI/pythia-70m --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2 --per_device_train_batch_size 256 --seed 1 --num_train_epochs 14'
|
196 |
+
|
197 |
+
###############################
|
198 |
+
|
199 |
+
submission to slurm complete!
|
200 |
+
|
201 |
+
|
202 |
+
###############################
|
203 |
+
slurm submission output
|
204 |
+
|
205 |
+
Submitted batch job 7651516
|
206 |
+
|
207 |
+
|
208 |
+
|
209 |
+
###############################
|
210 |
+
|
211 |
+
/var/lib/slurm/slurmd/job7651516/slurm_script: line 16: /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh: No such file or directory
|
212 |
+
|
213 |
+
CommandNotFoundError: Your shell has not been properly configured to use 'conda activate'.
|
214 |
+
To initialize your shell, run
|
215 |
+
|
216 |
+
$ conda init <SHELL_NAME>
|
217 |
+
|
218 |
+
Currently supported shells are:
|
219 |
+
- bash
|
220 |
+
- fish
|
221 |
+
- tcsh
|
222 |
+
- xonsh
|
223 |
+
- zsh
|
224 |
+
- powershell
|
225 |
+
|
226 |
+
See 'conda init --help' for more information and options.
|
227 |
+
|
228 |
+
IMPORTANT: You may need to close and restart your shell after running 'conda init'.
|
229 |
+
|
230 |
+
|
231 |
+
###############################
|
232 |
+
start time: 2024-05-26 08:43:47.611801
|
233 |
+
machine: sphinx2
|
234 |
+
conda env: pretraining-coreset-selection
|
235 |
+
###############################
|
236 |
+
running following processes
|
237 |
+
|
238 |
+
torchrun --master_port 29508 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/data/xnli_es --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/llms/pythia-70m_xnli_es_1 --output_hub_id pythia-70m_xnli_es --model_id EleutherAI/pythia-70m --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2 --per_device_train_batch_size 256 --seed 1 --num_train_epochs 14
|
239 |
+
|
240 |
+
|
241 |
+
###############################
|
242 |
+
command outputs:
|
243 |
+
|
244 |
+
|
245 |
+
[2024-05-26 08:43:51,176] torch.distributed.run: [WARNING]
|
246 |
+
[2024-05-26 08:43:51,176] torch.distributed.run: [WARNING] *****************************************
|
247 |
+
[2024-05-26 08:43:51,176] torch.distributed.run: [WARNING] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
|
248 |
+
[2024-05-26 08:43:51,176] torch.distributed.run: [WARNING] *****************************************
|
249 |
+
05/26/2024 08:44:01 - INFO - __main__ - Script parameters ScriptArguments(seed=1, dataset_id='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/data/xnli_es', output_dir='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/llms/pythia-70m_xnli_es_1', output_hub_id='pythia-70m_xnli_es', hf_hub_token=True, model_id='EleutherAI/pythia-70m', per_device_train_batch_size=256, num_train_epochs=14.0, learning_rate=0.001, gradient_accumulation_steps=2, from_scratch=True, warmup_ratio=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, weight_decay=0.01, lr_scheduler_type='cosine', local_rank=0, resume_from_checkpoint=False, deepspeed=None, peft=False)
|
250 |
+
05/26/2024 08:44:06 - INFO - __main__ - Script parameters ScriptArguments(seed=1, dataset_id='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/data/xnli_es', output_dir='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_upsample/llms/pythia-70m_xnli_es_1', output_hub_id='pythia-70m_xnli_es', hf_hub_token=True, model_id='EleutherAI/pythia-70m', per_device_train_batch_size=256, num_train_epochs=14.0, learning_rate=0.001, gradient_accumulation_steps=2, from_scratch=True, warmup_ratio=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, weight_decay=0.01, lr_scheduler_type='cosine', local_rank=0, resume_from_checkpoint=False, deepspeed=None, peft=False)
|
251 |
+
|
252 |
0%| | 0/11704 [00:00<?, ?it/s][rank1]:[W reducer.cpp:1360] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
|
253 |
+
[rank0]:[W reducer.cpp:1360] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
|
254 |
+
|
255 |
0%| | 1/11704 [00:11<36:01:50, 11.08s/it]
|
256 |
0%| | 2/11704 [00:15<22:29:09, 6.92s/it]
|
257 |
0%| | 3/11704 [00:18<17:13:49, 5.30s/it]
|
258 |
0%| | 4/11704 [00:21<13:50:30, 4.26s/it]
|
259 |
0%| | 5/11704 [00:22<10:32:47, 3.25s/it]
|
260 |
0%| | 6/11704 [00:24<9:19:43, 2.87s/it]
|
261 |
0%| | 7/11704 [00:26<7:44:55, 2.38s/it]
|
262 |
0%| | 8/11704 [00:27<6:35:35, 2.03s/it]
|
263 |
0%| | 9/11704 [00:28<5:32:02, 1.70s/it]
|
264 |
0%| | 10/11704 [00:29<5:12:33, 1.60s/it]
|
265 |
0%| | 11/11704 [00:30<4:43:07, 1.45s/it]
|
266 |
0%| | 12/11704 [00:31<4:09:08, 1.28s/it]
|
267 |
0%| | 13/11704 [00:32<3:36:18, 1.11s/it]
|
268 |
0%| | 14/11704 [00:33<3:17:27, 1.01s/it]
|
269 |
0%| | 15/11704 [00:34<3:11:14, 1.02it/s]
|
270 |
0%| | 16/11704 [00:34<3:01:30, 1.07it/s]
|
271 |
0%| | 17/11704 [00:35<2:48:52, 1.15it/s]
|
272 |
0%| | 18/11704 [00:36<2:33:34, 1.27it/s]
|
273 |
0%| | 19/11704 [00:36<2:26:24, 1.33it/s]
|
274 |
0%| | 20/11704 [00:37<2:19:52, 1.39it/s]
|
275 |
0%| | 21/11704 [00:38<2:16:43, 1.42it/s]
|
276 |
0%| | 22/11704 [00:38<2:16:47, 1.42it/s]
|
277 |
0%| | 23/11704 [00:39<2:13:38, 1.46it/s]
|
278 |
0%| | 24/11704 [00:40<2:20:53, 1.38it/s]
|
279 |
0%| | 25/11704 [00:41<2:25:03, 1.34it/s]{'loss': 10.6948, 'grad_norm': 1.2803484201431274, 'learning_rate': 2.134927412467976e-05, 'epoch': 0.03}
|
280 |
|
281 |
+
|
282 |
0%| | 25/11704 [00:41<2:25:03, 1.34it/s]
|
283 |
0%| | 26/11704 [00:41<2:14:46, 1.44it/s]
|
284 |
0%| | 27/11704 [00:42<2:08:18, 1.52it/s]
|
285 |
0%| | 28/11704 [00:42<2:06:17, 1.54it/s]
|
286 |
0%| | 29/11704 [00:43<2:03:23, 1.58it/s]
|
287 |
0%| | 30/11704 [00:44<1:58:17, 1.64it/s]
|
288 |
0%| | 31/11704 [00:44<1:53:07, 1.72it/s]
|
289 |
0%| | 32/11704 [00:45<1:50:58, 1.75it/s]
|
290 |
0%| | 33/11704 [00:45<1:50:04, 1.77it/s]
|
291 |
0%| | 34/11704 [00:46<1:51:19, 1.75it/s]
|
292 |
0%| | 35/11704 [00:46<1:47:30, 1.81it/s]
|
293 |
0%| | 36/11704 [00:47<1:47:57, 1.80it/s]
|
294 |
0%| | 37/11704 [00:47<1:46:41, 1.82it/s]
|
295 |
0%| | 38/11704 [00:48<1:45:56, 1.84it/s]
|
296 |
0%| | 39/11704 [00:49<1:47:28, 1.81it/s]
|
297 |
0%| | 40/11704 [00:49<1:46:05, 1.83it/s]
|
298 |
0%| | 41/11704 [00:50<1:45:06, 1.85it/s]
|
299 |
0%| | 42/11704 [00:50<1:45:14, 1.85it/s]
|
300 |
0%| | 43/11704 [00:51<1:46:16, 1.83it/s]
|
301 |
0%| | 44/11704 [00:51<1:44:48, 1.85it/s]
|
302 |
0%| | 45/11704 [00:52<1:42:40, 1.89it/s]
|
303 |
0%| | 46/11704 [00:52<1:43:30, 1.88it/s]
|
304 |
0%| | 47/11704 [00:53<1:43:45, 1.87it/s]
|
305 |
0%| | 48/11704 [00:53<1:42:51, 1.89it/s]
|
306 |
0%| | 49/11704 [00:54<1:42:14, 1.90it/s]
|
307 |
0%| | 50/11704 [00:54<1:41:41, 1.91it/s]{'loss': 9.9789, 'grad_norm': 1.1821939945220947, 'learning_rate': 4.269854824935952e-05, 'epoch': 0.06}
|
308 |
+
|
309 |
|
310 |
0%| | 50/11704 [00:54<1:41:41, 1.91it/s]
|
311 |
0%| | 51/11704 [00:55<1:41:01, 1.92it/s]
|
312 |
0%| | 52/11704 [00:55<1:40:15, 1.94it/s]
|
313 |
0%| | 53/11704 [00:56<1:39:39, 1.95it/s]
|
314 |
0%| | 54/11704 [00:56<1:40:20, 1.94it/s]
|
315 |
0%| | 55/11704 [00:57<1:39:05, 1.96it/s]
|
316 |
0%| | 56/11704 [00:57<1:39:10, 1.96it/s]
|
317 |
0%| | 57/11704 [00:58<1:39:27, 1.95it/s]
|
318 |
0%| | 58/11704 [00:58<1:39:40, 1.95it/s]
|
319 |
1%| | 59/11704 [00:59<1:52:14, 1.73it/s]
|
320 |
1%| | 60/11704 [01:00<1:48:03, 1.80it/s]
|
321 |
1%| | 61/11704 [01:00<1:45:25, 1.84it/s]
|
322 |
1%| | 62/11704 [01:01<1:43:05, 1.88it/s]
|
323 |
1%| | 63/11704 [01:01<1:41:38, 1.91it/s]
|
324 |
1%| | 64/11704 [01:02<1:40:08, 1.94it/s]
|
325 |
1%| | 65/11704 [01:02<1:39:38, 1.95it/s]
|
326 |
1%| | 66/11704 [01:03<1:38:25, 1.97it/s]
|
327 |
1%| | 67/11704 [01:03<1:38:36, 1.97it/s]
|
328 |
1%| | 68/11704 [01:04<1:38:36, 1.97it/s]
|
329 |
1%| | 69/11704 [01:04<1:37:32, 1.99it/s]
|
330 |
1%| | 70/11704 [01:05<1:37:04, 2.00it/s]
|
331 |
1%| | 71/11704 [01:05<1:36:37, 2.01it/s]
|
332 |
1%| | 72/11704 [01:06<1:36:46, 2.00it/s]
|
333 |
1%| | 73/11704 [01:06<1:36:54, 2.00it/s]
|
334 |
1%| | 74/11704 [01:07<1:37:03, 2.00it/s]
|
335 |
1%| | 75/11704 [01:07<1:36:35, 2.01it/s]{'loss': 9.2071, 'grad_norm': 1.037238597869873, 'learning_rate': 6.404782237403927e-05, 'epoch': 0.09}
|
336 |
|
337 |
+
|
338 |
1%| | 75/11704 [01:07<1:36:35, 2.01it/s]
|
339 |
1%| | 76/11704 [01:08<1:36:49, 2.00it/s]
|
340 |
1%| | 77/11704 [01:08<1:36:51, 2.00it/s]
|
341 |
1%| | 78/11704 [01:09<1:36:22, 2.01it/s]
|
342 |
1%| | 79/11704 [01:09<1:36:16, 2.01it/s]
|
343 |
1%| | 80/11704 [01:10<1:36:33, 2.01it/s]
|
344 |
1%| | 81/11704 [01:10<1:36:15, 2.01it/s]
|
345 |
1%| | 82/11704 [01:11<1:36:11, 2.01it/s]
|
346 |
1%| | 83/11704 [01:11<1:36:08, 2.01it/s]
|
347 |
1%| | 84/11704 [01:12<1:35:57, 2.02it/s]
|
348 |
1%| | 85/11704 [01:12<1:36:27, 2.01it/s]
|
349 |
1%| | 86/11704 [01:13<1:36:25, 2.01it/s]
|
350 |
1%| | 87/11704 [01:13<1:36:11, 2.01it/s]
|
351 |
1%| | 88/11704 [01:14<1:36:08, 2.01it/s]
|
352 |
1%| | 89/11704 [01:14<1:37:01, 2.00it/s]
|
353 |
1%| | 90/11704 [01:15<1:39:29, 1.95it/s]
|
354 |
1%| | 91/11704 [01:15<1:38:15, 1.97it/s]
|
355 |
1%| | 92/11704 [01:16<1:38:03, 1.97it/s]
|
356 |
1%| | 93/11704 [01:16<1:37:15, 1.99it/s]
|
357 |
1%| | 94/11704 [01:17<1:36:57, 2.00it/s]
|
358 |
1%| | 95/11704 [01:17<1:36:48, 2.00it/s]
|
359 |
1%| | 96/11704 [01:18<1:36:31, 2.00it/s]
|
360 |
1%| | 97/11704 [01:18<1:36:15, 2.01it/s]
|
361 |
1%| | 98/11704 [01:19<1:36:02, 2.01it/s]
|
362 |
1%| | 99/11704 [01:19<1:36:18, 2.01it/s]
|
363 |
1%| | 100/11704 [01:20<1:36:11, 2.01it/s]{'loss': 8.3139, 'grad_norm': 0.7935464978218079, 'learning_rate': 8.539709649871905e-05, 'epoch': 0.12}
|
364 |
+
|
365 |
|
366 |
1%| | 100/11704 [01:20<1:36:11, 2.01it/s]
|
367 |
1%| | 101/11704 [01:20<1:36:06, 2.01it/s]
|
368 |
1%| | 102/11704 [01:21<1:35:47, 2.02it/s]
|
369 |
1%| | 103/11704 [01:21<1:36:08, 2.01it/s]
|
370 |
1%| | 104/11704 [01:22<1:35:49, 2.02it/s]
|
371 |
1%| | 105/11704 [01:22<1:35:43, 2.02it/s]
|
372 |
1%| | 106/11704 [01:23<1:35:41, 2.02it/s]
|
373 |
1%| | 107/11704 [01:23<1:35:33, 2.02it/s]
|
374 |
1%| | 108/11704 [01:24<1:35:22, 2.03it/s]
|
375 |
1%| | 109/11704 [01:24<1:35:39, 2.02it/s]
|
376 |
1%| | 110/11704 [01:25<1:35:33, 2.02it/s]
|
377 |
1%| | 111/11704 [01:25<1:35:49, 2.02it/s]
|
378 |
1%| | 112/11704 [01:26<1:35:39, 2.02it/s]
|
379 |
1%| | 113/11704 [01:26<1:35:34, 2.02it/s]
|
380 |
1%| | 114/11704 [01:27<1:35:24, 2.02it/s]
|
381 |
1%| | 115/11704 [01:27<1:35:20, 2.03it/s]
|
382 |
1%| | 116/11704 [01:28<1:35:31, 2.02it/s]
|
383 |
1%| | 117/11704 [01:28<1:35:27, 2.02it/s]
|
384 |
1%| | 118/11704 [01:29<1:35:24, 2.02it/s]
|
385 |
1%| | 119/11704 [01:29<1:35:14, 2.03it/s]
|
386 |
1%| | 120/11704 [01:30<1:35:08, 2.03it/s]
|
387 |
1%| | 121/11704 [01:30<1:35:10, 2.03it/s]
|
388 |
1%| | 122/11704 [01:31<1:35:03, 2.03it/s]
|
389 |
1%| | 123/11704 [01:31<1:34:57, 2.03it/s]
|
390 |
1%| | 124/11704 [01:32<1:35:01, 2.03it/s]
|
391 |
1%| | 125/11704 [01:32<1:35:08, 2.03it/s]{'loss': 7.526, 'grad_norm': 0.5917059183120728, 'learning_rate': 0.0001067463706233988, 'epoch': 0.15}
|
392 |
+
|
393 |
|
394 |
1%| | 125/11704 [01:32<1:35:08, 2.03it/s]
|
395 |
1%| | 126/11704 [01:33<1:35:09, 2.03it/s]
|
396 |
1%| | 127/11704 [01:33<1:35:04, 2.03it/s]
|
397 |
1%| | 128/11704 [01:34<1:34:59, 2.03it/s]
|
398 |
1%| | 129/11704 [01:34<1:34:59, 2.03it/s]
|
399 |
1%| | 130/11704 [01:35<1:35:09, 2.03it/s]
|
400 |
1%| | 131/11704 [01:35<1:35:07, 2.03it/s]
|
401 |
1%| | 132/11704 [01:35<1:35:02, 2.03it/s]
|
402 |
1%| | 133/11704 [01:36<1:34:58, 2.03it/s]
|
403 |
1%| | 134/11704 [01:36<1:34:58, 2.03it/s]
|
404 |
1%| | 135/11704 [01:37<1:34:56, 2.03it/s]
|
405 |
1%| | 136/11704 [01:37<1:35:01, 2.03it/s]
|
406 |
1%| | 137/11704 [01:38<1:34:56, 2.03it/s]
|
407 |
1%| | 138/11704 [01:38<1:35:30, 2.02it/s]
|
408 |
1%| | 139/11704 [01:39<1:35:41, 2.01it/s]
|
409 |
1%| | 140/11704 [01:39<1:35:29, 2.02it/s]
|
410 |
1%| | 141/11704 [01:40<1:35:14, 2.02it/s]
|
411 |
1%| | 142/11704 [01:40<1:35:11, 2.02it/s]
|
412 |
1%| | 143/11704 [01:41<1:35:00, 2.03it/s]
|
413 |
1%| | 144/11704 [01:41<1:34:55, 2.03it/s]
|
414 |
1%| | 145/11704 [01:42<1:34:52, 2.03it/s]
|
415 |
1%| | 146/11704 [01:42<1:34:48, 2.03it/s]
|
416 |
1%|β | 147/11704 [01:43<1:34:51, 2.03it/s]
|
417 |
1%|β | 148/11704 [01:43<1:34:45, 2.03it/s]
|
418 |
1%|β | 149/11704 [01:44<1:34:43, 2.03it/s]
|
419 |
1%|β | 150/11704 [01:44<1:34:48, 2.03it/s]{'loss': 6.8951, 'grad_norm': 0.37877357006073, 'learning_rate': 0.00012809564474807855, 'epoch': 0.18}
|
420 |
+
|
421 |
|
422 |
1%|β | 150/11704 [01:44<1:34:48, 2.03it/s]
|
423 |
1%|β | 151/11704 [01:45<1:34:56, 2.03it/s]
|
424 |
1%|β | 152/11704 [01:45<1:34:58, 2.03it/s]
|
425 |
1%|β | 153/11704 [01:46<1:34:45, 2.03it/s]
|
426 |
1%|β | 154/11704 [01:46<1:34:38, 2.03it/s]
|
427 |
1%|β | 155/11704 [01:47<1:34:42, 2.03it/s]
|
428 |
1%|β | 156/11704 [01:47<1:34:49, 2.03it/s]
|
429 |
1%|β | 157/11704 [01:48<1:34:49, 2.03it/s]
|
430 |
1%|β | 158/11704 [01:48<1:34:47, 2.03it/s]
|
431 |
1%|β | 159/11704 [01:49<1:34:42, 2.03it/s]
|
432 |
1%|β | 160/11704 [01:49<1:34:40, 2.03it/s]
|
433 |
1%|β | 161/11704 [01:50<1:34:38, 2.03it/s]
|
434 |
1%|β | 162/11704 [01:50<1:34:39, 2.03it/s]
|
435 |
1%|β | 163/11704 [01:51<1:34:47, 2.03it/s]
|
436 |
1%|β | 164/11704 [01:51<1:34:47, 2.03it/s]
|
437 |
1%|β | 165/11704 [01:52<1:34:54, 2.03it/s]
|
438 |
1%|β | 166/11704 [01:52<1:34:47, 2.03it/s]
|
439 |
1%|β | 167/11704 [01:53<1:34:47, 2.03it/s]
|
440 |
1%|β | 168/11704 [01:53<1:34:38, 2.03it/s]
|
441 |
1%|β | 169/11704 [01:54<1:34:42, 2.03it/s]
|
442 |
1%|β | 170/11704 [01:54<1:34:41, 2.03it/s]
|
443 |
1%|β | 171/11704 [01:55<1:34:32, 2.03it/s]
|
444 |
1%|β | 172/11704 [01:55<1:34:38, 2.03it/s]
|
445 |
1%|β | 173/11704 [01:56<1:34:32, 2.03it/s]
|
446 |
1%|β | 174/11704 [01:56<1:34:27, 2.03it/s]
|
447 |
1%|β | 175/11704 [01:57<1:34:29, 2.03it/s]{'loss': 6.3957, 'grad_norm': 0.3709251582622528, 'learning_rate': 0.00014944491887275833, 'epoch': 0.21}
|
448 |
+
|
449 |
|
450 |
1%|β | 175/11704 [01:57<1:34:29, 2.03it/s]
|
451 |
2%|β | 176/11704 [01:57<1:34:26, 2.03it/s]
|
452 |
2%|β | 177/11704 [01:58<1:34:35, 2.03it/s]
|
453 |
2%|β | 178/11704 [01:58<1:34:30, 2.03it/s]
|
454 |
2%|β | 179/11704 [01:59<1:34:27, 2.03it/s]
|
455 |
2%|β | 180/11704 [01:59<1:34:35, 2.03it/s]
|
456 |
2%|β | 181/11704 [02:00<1:34:38, 2.03it/s]
|
457 |
2%|β | 182/11704 [02:00<1:34:39, 2.03it/s]
|
458 |
2%|β | 183/11704 [02:01<1:34:32, 2.03it/s]
|
459 |
2%|β | 184/11704 [02:01<1:34:33, 2.03it/s]
|
460 |
2%|β | 185/11704 [02:02<1:34:35, 2.03it/s]
|
461 |
2%|β | 186/11704 [02:02<1:34:32, 2.03it/s]
|
462 |
2%|β | 187/11704 [02:03<1:34:35, 2.03it/s]
|
463 |
2%|β | 188/11704 [02:03<1:34:30, 2.03it/s]
|
464 |
2%|β | 189/11704 [02:04<1:34:32, 2.03it/s]
|
465 |
2%|β | 190/11704 [02:04<1:34:33, 2.03it/s]
|
466 |
2%|β | 191/11704 [02:05<1:34:28, 2.03it/s]
|
467 |
2%|β | 192/11704 [02:05<1:34:33, 2.03it/s]
|
468 |
2%|β | 193/11704 [02:06<1:34:30, 2.03it/s]
|
469 |
2%|β | 194/11704 [02:06<1:34:35, 2.03it/s]
|
470 |
2%|β | 195/11704 [02:07<1:34:27, 2.03it/s]
|
471 |
2%|β | 196/11704 [02:07<1:34:29, 2.03it/s]
|
472 |
2%|β | 197/11704 [02:08<1:34:33, 2.03it/s]
|
473 |
2%|β | 198/11704 [02:08<1:34:28, 2.03it/s]
|
474 |
2%|β | 199/11704 [02:09<1:34:28, 2.03it/s]
|
475 |
2%|β | 200/11704 [02:09<1:34:30, 2.03it/s]{'loss': 5.9826, 'grad_norm': 0.6036785840988159, 'learning_rate': 0.0001707941929974381, 'epoch': 0.24}
|
476 |
+
|
477 |
|
478 |
2%|β | 200/11704 [02:09<1:34:30, 2.03it/s]
|
479 |
2%|β | 201/11704 [02:09<1:34:24, 2.03it/s]
|
480 |
2%|β | 202/11704 [02:10<1:34:31, 2.03it/s]
|
481 |
2%|β | 203/11704 [02:10<1:34:23, 2.03it/s]
|
482 |
2%|β | 204/11704 [02:11<1:34:21, 2.03it/s]
|
483 |
2%|β | 205/11704 [02:11<1:34:27, 2.03it/s]
|
484 |
2%|β | 206/11704 [02:12<1:34:31, 2.03it/s]
|
485 |
2%|β | 207/11704 [02:12<1:34:36, 2.03it/s]
|
486 |
2%|β | 208/11704 [02:13<1:34:28, 2.03it/s]
|
487 |
2%|β | 209/11704 [02:13<1:34:31, 2.03it/s]
|
488 |
2%|β | 210/11704 [02:14<1:34:28, 2.03it/s]
|
489 |
2%|β | 211/11704 [02:14<1:34:26, 2.03it/s]
|
490 |
2%|β | 212/11704 [02:15<1:34:28, 2.03it/s]
|
491 |
2%|β | 213/11704 [02:15<1:34:23, 2.03it/s]
|
492 |
2%|β | 214/11704 [02:16<1:34:28, 2.03it/s]
|
493 |
2%|β | 215/11704 [02:16<1:34:30, 2.03it/s]
|
494 |
2%|β | 216/11704 [02:17<1:34:29, 2.03it/s]
|
495 |
2%|β | 217/11704 [02:17<1:34:34, 2.02it/s]
|
496 |
2%|β | 218/11704 [02:18<1:34:38, 2.02it/s]
|
497 |
2%|β | 219/11704 [02:18<1:34:37, 2.02it/s]
|
498 |
2%|β | 220/11704 [02:19<1:34:45, 2.02it/s]
|
499 |
2%|β | 221/11704 [02:19<1:34:34, 2.02it/s]
|
500 |
2%|β | 222/11704 [02:20<1:34:43, 2.02it/s]
|
501 |
2%|β | 223/11704 [02:20<1:34:35, 2.02it/s]
|
502 |
2%|β | 224/11704 [02:21<1:34:33, 2.02it/s]
|
503 |
2%|β | 225/11704 [02:21<1:34:27, 2.03it/s]{'loss': 5.659, 'grad_norm': 0.9925475716590881, 'learning_rate': 0.00019214346712211785, 'epoch': 0.27}
|
504 |
|
505 |
+
|
506 |
2%|β | 225/11704 [02:21<1:34:27, 2.03it/s]
|
507 |
2%|β | 226/11704 [02:22<1:34:29, 2.02it/s]
|
508 |
2%|β | 227/11704 [02:22<1:34:27, 2.03it/s]
|
509 |
2%|β | 228/11704 [02:23<1:34:41, 2.02it/s]
|
510 |
2%|β | 229/11704 [02:23<1:34:33, 2.02it/s]
|
511 |
2%|β | 230/11704 [02:24<1:34:35, 2.02it/s]
|
512 |
2%|β | 231/11704 [02:24<1:34:27, 2.02it/s]
|
513 |
2%|β | 232/11704 [02:25<1:34:25, 2.03it/s]
|
514 |
2%|β | 233/11704 [02:25<1:34:22, 2.03it/s]
|
515 |
2%|β | 234/11704 [02:26<1:34:20, 2.03it/s]
|
516 |
2%|β | 235/11704 [02:26<1:34:18, 2.03it/s]
|
517 |
2%|β | 236/11704 [02:27<1:34:21, 2.03it/s]
|
518 |
2%|β | 237/11704 [02:27<1:34:25, 2.02it/s]
|
519 |
2%|β | 238/11704 [02:28<1:34:20, 2.03it/s]
|
520 |
2%|β | 239/11704 [02:28<1:34:29, 2.02it/s]
|
521 |
2%|β | 240/11704 [02:29<1:34:21, 2.02it/s]
|
522 |
2%|β | 241/11704 [02:29<1:34:26, 2.02it/s]
|
523 |
2%|β | 242/11704 [02:30<1:34:19, 2.03it/s]
|
524 |
2%|β | 243/11704 [02:30<1:34:16, 2.03it/s]
|
525 |
2%|β | 244/11704 [02:31<1:34:14, 2.03it/s]
|
526 |
2%|β | 245/11704 [02:31<1:34:16, 2.03it/s]
|
527 |
2%|β | 246/11704 [02:32<1:34:15, 2.03it/s]
|
528 |
2%|β | 247/11704 [02:32<1:34:18, 2.02it/s]
|
529 |
2%|β | 248/11704 [02:33<1:34:35, 2.02it/s]
|
530 |
2%|β | 249/11704 [02:33<1:34:34, 2.02it/s]
|
531 |
2%|β | 250/11704 [02:34<1:34:36, 2.02it/s]{'loss': 5.4253, 'grad_norm': 0.6297341585159302, 'learning_rate': 0.0002134927412467976, 'epoch': 0.3}
|
532 |
+
|
533 |
|
534 |
2%|β | 250/11704 [02:34<1:34:36, 2.02it/s]
|
535 |
2%|β | 251/11704 [02:34<1:34:48, 2.01it/s]
|
536 |
2%|β | 252/11704 [02:35<1:34:35, 2.02it/s]
|
537 |
2%|β | 253/11704 [02:35<1:34:32, 2.02it/s]
|
538 |
2%|β | 254/11704 [02:36<1:34:26, 2.02it/s]
|
539 |
2%|β | 255/11704 [02:36<1:34:20, 2.02it/s]
|
540 |
2%|β | 256/11704 [02:37<1:34:34, 2.02it/s]
|
541 |
2%|β | 257/11704 [02:37<1:34:28, 2.02it/s]
|
542 |
2%|β | 258/11704 [02:38<1:34:19, 2.02it/s]
|
543 |
2%|β | 259/11704 [02:38<1:34:10, 2.03it/s]
|
544 |
2%|β | 260/11704 [02:39<1:34:10, 2.03it/s]
|
545 |
2%|β | 261/11704 [02:39<1:34:06, 2.03it/s]
|
546 |
2%|β | 262/11704 [02:40<1:33:56, 2.03it/s]
|
547 |
2%|β | 263/11704 [02:40<1:33:59, 2.03it/s]
|
548 |
2%|β | 264/11704 [02:41<1:33:55, 2.03it/s]
|
549 |
2%|β | 265/11704 [02:41<1:34:01, 2.03it/s]
|
550 |
2%|β | 266/11704 [02:42<1:34:04, 2.03it/s]
|
551 |
2%|β | 267/11704 [02:42<1:33:59, 2.03it/s]
|
552 |
2%|β | 268/11704 [02:43<1:34:06, 2.03it/s]
|
553 |
2%|β | 269/11704 [02:43<1:34:01, 2.03it/s]
|
554 |
2%|β | 270/11704 [02:44<1:34:04, 2.03it/s]
|
555 |
2%|β | 271/11704 [02:44<1:33:59, 2.03it/s]
|
556 |
2%|β | 272/11704 [02:45<1:33:57, 2.03it/s]
|
557 |
2%|β | 273/11704 [02:45<1:33:59, 2.03it/s]
|
558 |
2%|β | 274/11704 [02:46<1:33:57, 2.03it/s]
|
559 |
2%|β | 275/11704 [02:46<1:34:06, 2.02it/s]{'loss': 5.2293, 'grad_norm': 0.5968512892723083, 'learning_rate': 0.00023484201537147736, 'epoch': 0.33}
|
560 |
|
561 |
+
|
562 |
2%|β | 275/11704 [02:46<1:34:06, 2.02it/s]
|
563 |
2%|β | 276/11704 [02:47<1:34:02, 2.03it/s]
|
564 |
2%|β | 277/11704 [02:47<1:34:05, 2.02it/s]
|
565 |
2%|β | 278/11704 [02:48<1:33:59, 2.03it/s]
|
566 |
2%|β | 279/11704 [02:48<1:34:01, 2.02it/s]
|
567 |
2%|β | 280/11704 [02:49<1:33:54, 2.03it/s]
|
568 |
2%|β | 281/11704 [02:49<1:34:00, 2.03it/s]
|
569 |
2%|β | 282/11704 [02:49<1:33:55, 2.03it/s]
|
570 |
2%|β | 283/11704 [02:50<1:33:55, 2.03it/s]
|
571 |
2%|β | 284/11704 [02:50<1:33:52, 2.03it/s]
|
572 |
2%|β | 285/11704 [02:51<1:33:47, 2.03it/s]
|
573 |
2%|β | 286/11704 [02:51<1:33:49, 2.03it/s]
|
574 |
2%|β | 287/11704 [02:52<1:33:49, 2.03it/s]
|
575 |
2%|β | 288/11704 [02:52<1:33:50, 2.03it/s]
|
576 |
2%|β | 289/11704 [02:53<1:33:43, 2.03it/s]
|
577 |
2%|β | 290/11704 [02:53<1:33:47, 2.03it/s]
|
578 |
2%|β | 291/11704 [02:54<1:33:42, 2.03it/s]
|
579 |
2%|β | 292/11704 [02:54<1:33:40, 2.03it/s]
|
580 |
3%|β | 293/11704 [02:55<1:33:43, 2.03it/s]
|
581 |
3%|β | 294/11704 [02:55<1:33:38, 2.03it/s]
|
582 |
3%|β | 295/11704 [02:56<1:33:41, 2.03it/s]
|
583 |
3%|β | 296/11704 [02:56<1:33:37, 2.03it/s]
|
584 |
3%|β | 297/11704 [02:57<1:33:29, 2.03it/s]
|
585 |
3%|β | 298/11704 [02:57<1:33:33, 2.03it/s]
|
586 |
3%|β | 299/11704 [02:58<1:33:34, 2.03it/s]
|
587 |
3%|β | 300/11704 [02:58<1:33:27, 2.03it/s]{'loss': 5.0541, 'grad_norm': 0.9146194458007812, 'learning_rate': 0.0002561912894961571, 'epoch': 0.36}
|
588 |
+
|
589 |
|
590 |
3%|β | 300/11704 [02:58<1:33:27, 2.03it/s]
|
591 |
3%|β | 301/11704 [02:59<1:33:30, 2.03it/s]
|
592 |
3%|β | 302/11704 [02:59<1:33:27, 2.03it/s]
|
593 |
3%|β | 303/11704 [03:00<1:33:27, 2.03it/s]
|
594 |
3%|β | 304/11704 [03:00<1:33:31, 2.03it/s]
|
595 |
3%|β | 305/11704 [03:01<1:33:25, 2.03it/s]
|
596 |
3%|β | 306/11704 [03:01<1:33:30, 2.03it/s]
|
597 |
3%|β | 307/11704 [03:02<1:33:24, 2.03it/s]
|
598 |
3%|β | 308/11704 [03:02<1:33:26, 2.03it/s]
|
599 |
3%|β | 309/11704 [03:03<1:33:31, 2.03it/s]
|
600 |
3%|β | 310/11704 [03:03<1:33:21, 2.03it/s]
|
601 |
3%|β | 311/11704 [03:04<1:33:24, 2.03it/s]
|
602 |
3%|β | 312/11704 [03:04<1:33:26, 2.03it/s]
|
603 |
3%|β | 313/11704 [03:05<1:33:25, 2.03it/s]
|
604 |
3%|β | 314/11704 [03:05<1:33:30, 2.03it/s]
|
605 |
3%|β | 315/11704 [03:06<1:33:22, 2.03it/s]
|
606 |
3%|β | 316/11704 [03:06<1:33:25, 2.03it/s]
|
607 |
3%|β | 317/11704 [03:07<1:33:24, 2.03it/s]
|
608 |
3%|β | 318/11704 [03:07<1:33:20, 2.03it/s]
|
609 |
3%|β | 319/11704 [03:08<1:33:22, 2.03it/s]
|
610 |
3%|β | 320/11704 [03:08<1:33:22, 2.03it/s]
|
611 |
3%|β | 321/11704 [03:09<1:33:16, 2.03it/s]
|
612 |
3%|β | 322/11704 [03:09<1:33:24, 2.03it/s]
|
613 |
3%|β | 323/11704 [03:10<1:33:18, 2.03it/s]
|
614 |
3%|β | 324/11704 [03:10<1:33:15, 2.03it/s]
|
615 |
3%|β | 325/11704 [03:11<1:33:18, 2.03it/s]{'loss': 4.934, 'grad_norm': 0.9477291703224182, 'learning_rate': 0.0002775405636208369, 'epoch': 0.39}
|
616 |
+
|
617 |
|
618 |
3%|β | 325/11704 [03:11<1:33:18, 2.03it/s]
|
619 |
3%|β | 326/11704 [03:11<1:33:15, 2.03it/s]
|
620 |
3%|β | 327/11704 [03:12<1:33:14, 2.03it/s]
|
621 |
3%|β | 328/11704 [03:12<1:33:18, 2.03it/s]
|
622 |
3%|β | 329/11704 [03:13<1:33:10, 2.03it/s]
|
623 |
3%|β | 330/11704 [03:13<1:33:10, 2.03it/s]
|
624 |
3%|β | 331/11704 [03:14<1:33:11, 2.03it/s]
|
625 |
3%|β | 332/11704 [03:14<1:33:11, 2.03it/s]
|
626 |
3%|β | 333/11704 [03:15<1:33:12, 2.03it/s]
|
627 |
3%|β | 334/11704 [03:15<1:33:15, 2.03it/s]
|
628 |
3%|β | 335/11704 [03:16<1:33:09, 2.03it/s]
|
629 |
3%|β | 336/11704 [03:16<1:33:09, 2.03it/s]
|
630 |
3%|β | 337/11704 [03:17<1:33:16, 2.03it/s]
|
631 |
3%|β | 338/11704 [03:17<1:33:49, 2.02it/s]
|
632 |
3%|β | 339/11704 [03:18<1:33:39, 2.02it/s]
|
633 |
3%|β | 340/11704 [03:18<1:33:32, 2.02it/s]
|
634 |
3%|β | 341/11704 [03:19<1:33:29, 2.03it/s]
|
635 |
3%|β | 342/11704 [03:19<1:33:20, 2.03it/s]
|
636 |
3%|β | 343/11704 [03:20<1:33:18, 2.03it/s]
|
637 |
3%|β | 344/11704 [03:20<1:33:18, 2.03it/s]
|
638 |
3%|β | 345/11704 [03:21<1:33:12, 2.03it/s]
|
639 |
3%|β | 346/11704 [03:21<1:33:12, 2.03it/s]
|
640 |
3%|β | 347/11704 [03:21<1:33:08, 2.03it/s]
|
641 |
3%|β | 348/11704 [03:22<1:33:03, 2.03it/s]
|
642 |
3%|β | 349/11704 [03:22<1:33:10, 2.03it/s]
|
643 |
3%|β | 350/11704 [03:23<1:33:03, 2.03it/s]{'loss': 4.8046, 'grad_norm': 0.824380099773407, 'learning_rate': 0.00029888983774551667, 'epoch': 0.42}
|
644 |
+
|
645 |
|
646 |
3%|β | 350/11704 [03:23<1:33:03, 2.03it/s]
|
647 |
3%|β | 351/11704 [03:23<1:33:04, 2.03it/s]
|
648 |
3%|β | 352/11704 [03:24<1:33:07, 2.03it/s]
|
649 |
3%|β | 353/11704 [03:24<1:33:07, 2.03it/s]
|
650 |
3%|β | 354/11704 [03:25<1:33:00, 2.03it/s]
|
651 |
3%|β | 355/11704 [03:25<1:33:01, 2.03it/s]
|
652 |
3%|β | 356/11704 [03:26<1:32:58, 2.03it/s]
|
653 |
3%|β | 357/11704 [03:26<1:33:00, 2.03it/s]
|
654 |
3%|β | 358/11704 [03:27<1:32:59, 2.03it/s]
|
655 |
3%|β | 359/11704 [03:27<1:32:59, 2.03it/s]
|
656 |
3%|β | 360/11704 [03:28<1:32:57, 2.03it/s]
|
657 |
3%|β | 361/11704 [03:28<1:32:57, 2.03it/s]
|
658 |
3%|β | 362/11704 [03:29<1:32:54, 2.03it/s]
|
659 |
3%|β | 363/11704 [03:29<1:32:51, 2.04it/s]
|
660 |
3%|β | 364/11704 [03:30<1:32:54, 2.03it/s]
|
661 |
3%|β | 365/11704 [03:30<1:32:55, 2.03it/s]
|
662 |
3%|β | 366/11704 [03:31<1:32:52, 2.03it/s]
|
663 |
3%|β | 367/11704 [03:31<1:33:03, 2.03it/s]
|
664 |
3%|β | 368/11704 [03:32<1:33:03, 2.03it/s]
|
665 |
3%|β | 369/11704 [03:32<1:33:10, 2.03it/s]
|
666 |
3%|β | 370/11704 [03:33<1:33:01, 2.03it/s]
|
667 |
3%|β | 371/11704 [03:33<1:32:54, 2.03it/s]
|
668 |
3%|β | 372/11704 [03:34<1:32:58, 2.03it/s]
|
669 |
3%|β | 373/11704 [03:34<1:32:51, 2.03it/s]
|
670 |
3%|β | 374/11704 [03:35<1:32:56, 2.03it/s]
|
671 |
3%|β | 375/11704 [03:35<1:33:04, 2.03it/s]{'loss': 4.6929, 'grad_norm': 0.7358281016349792, 'learning_rate': 0.0003202391118701964, 'epoch': 0.45}
|
672 |
+
|
673 |
|
674 |
3%|β | 375/11704 [03:35<1:33:04, 2.03it/s]
|
675 |
3%|β | 376/11704 [03:36<1:33:02, 2.03it/s]
|
676 |
3%|β | 377/11704 [03:36<1:33:04, 2.03it/s]
|
677 |
3%|β | 378/11704 [03:37<1:33:01, 2.03it/s]
|
678 |
3%|β | 379/11704 [03:37<1:33:04, 2.03it/s]
|
679 |
3%|β | 380/11704 [03:38<1:33:01, 2.03it/s]
|
680 |
3%|β | 381/11704 [03:38<1:32:53, 2.03it/s]
|
681 |
3%|β | 382/11704 [03:39<1:33:07, 2.03it/s]
|
682 |
3%|β | 383/11704 [03:39<1:33:02, 2.03it/s]
|
683 |
3%|β | 384/11704 [03:40<1:33:01, 2.03it/s]
|
684 |
3%|β | 385/11704 [03:40<1:32:59, 2.03it/s]
|
685 |
3%|β | 386/11704 [03:41<1:32:51, 2.03it/s]
|
686 |
3%|β | 387/11704 [03:41<1:32:44, 2.03it/s]
|
687 |
3%|β | 388/11704 [03:42<1:32:52, 2.03it/s]
|
688 |
3%|β | 389/11704 [03:42<1:32:47, 2.03it/s]
|
689 |
3%|β | 390/11704 [03:43<1:32:51, 2.03it/s]
|
690 |
3%|β | 391/11704 [03:43<1:32:56, 2.03it/s]
|
691 |
3%|β | 392/11704 [03:44<1:32:50, 2.03it/s]
|
692 |
3%|β | 393/11704 [03:44<1:32:50, 2.03it/s]
|
693 |
3%|β | 394/11704 [03:45<1:32:44, 2.03it/s]
|
694 |
3%|β | 395/11704 [03:45<1:32:41, 2.03it/s]
|
695 |
3%|β | 396/11704 [03:46<1:32:36, 2.04it/s]
|
696 |
3%|β | 397/11704 [03:46<1:32:45, 2.03it/s]
|
697 |
3%|β | 398/11704 [03:47<1:32:39, 2.03it/s]
|
698 |
3%|β | 399/11704 [03:47<1:32:35, 2.03it/s]
|
699 |
3%|β | 400/11704 [03:48<1:32:43, 2.03it/s]
|
700 |
|
701 |
+
|
702 |
3%|β | 400/11704 [03:48<1:32:43, 2.03it/s]
|
703 |
3%|β | 401/11704 [03:48<1:32:46, 2.03it/s]
|
704 |
3%|β | 402/11704 [03:49<1:32:45, 2.03it/s]
|
705 |
3%|β | 403/11704 [03:49<1:32:48, 2.03it/s]
|
706 |
3%|β | 404/11704 [03:50<1:32:43, 2.03it/s]
|
707 |
3%|β | 405/11704 [03:50<1:32:41, 2.03it/s]
|
708 |
3%|β | 406/11704 [03:51<1:32:39, 2.03it/s]
|
709 |
3%|β | 407/11704 [03:51<1:32:34, 2.03it/s]
|
710 |
3%|β | 408/11704 [03:52<1:32:39, 2.03it/s]
|
711 |
3%|β | 409/11704 [03:52<1:32:39, 2.03it/s]
|
712 |
4%|β | 410/11704 [03:53<1:32:35, 2.03it/s]
|
713 |
4%|β | 411/11704 [03:53<1:32:35, 2.03it/s]
|
714 |
4%|β | 412/11704 [03:53<1:32:32, 2.03it/s]
|
715 |
4%|β | 413/11704 [03:54<1:32:25, 2.04it/s]
|
716 |
4%|β | 414/11704 [03:54<1:32:29, 2.03it/s]
|
717 |
4%|β | 415/11704 [03:55<1:32:26, 2.04it/s]
|
718 |
4%|β | 416/11704 [03:55<1:32:17, 2.04it/s]
|
719 |
4%|β | 417/11704 [03:56<1:32:23, 2.04it/s]
|
720 |
4%|β | 418/11704 [03:56<1:32:26, 2.03it/s]
|
721 |
4%|β | 419/11704 [03:57<1:32:22, 2.04it/s]
|
722 |
4%|β | 420/11704 [03:57<1:32:24, 2.04it/s]
|
723 |
4%|β | 421/11704 [03:58<1:32:22, 2.04it/s]
|
724 |
4%|β | 422/11704 [03:58<1:32:19, 2.04it/s]
|
725 |
4%|β | 423/11704 [03:59<1:32:20, 2.04it/s]
|
726 |
4%|β | 424/11704 [03:59<1:32:19, 2.04it/s]
|
727 |
4%|β | 425/11704 [04:00<1:32:14, 2.04it/s]
|
728 |
|
729 |
+
|
730 |
4%|β | 425/11704 [04:00<1:32:14, 2.04it/s]
|
731 |
4%|β | 426/11704 [04:00<1:32:21, 2.04it/s]
|
732 |
4%|β | 427/11704 [04:01<1:32:20, 2.04it/s]
|
733 |
4%|β | 428/11704 [04:01<1:32:16, 2.04it/s]
|
734 |
4%|β | 429/11704 [04:02<1:32:26, 2.03it/s]
|
735 |
4%|β | 430/11704 [04:02<1:32:20, 2.03it/s]
|
736 |
4%|β | 431/11704 [04:03<1:32:17, 2.04it/s]
|
737 |
4%|β | 432/11704 [04:03<1:32:23, 2.03it/s]
|
738 |
4%|β | 433/11704 [04:04<1:32:20, 2.03it/s]
|
739 |
4%|β | 434/11704 [04:04<1:32:17, 2.04it/s]
|
740 |
4%|β | 435/11704 [04:05<1:32:23, 2.03it/s]
|
741 |
4%|β | 436/11704 [04:05<1:32:19, 2.03it/s]
|
742 |
4%|β | 437/11704 [04:06<1:32:21, 2.03it/s]
|
743 |
4%|β | 438/11704 [04:06<1:32:25, 2.03it/s]
|
744 |
4%|β | 439/11704 [04:07<1:32:13, 2.04it/s]
|
745 |
4%|β | 440/11704 [04:07<1:32:22, 2.03it/s]
|
746 |
4%|β | 441/11704 [04:08<1:32:18, 2.03it/s]
|
747 |
4%|β | 442/11704 [04:08<1:32:09, 2.04it/s]
|
748 |
4%|β | 443/11704 [04:09<1:32:17, 2.03it/s]
|
749 |
4%|β | 444/11704 [04:09<1:32:16, 2.03it/s]
|
750 |
4%|β | 445/11704 [04:10<1:32:08, 2.04it/s]
|
751 |
4%|β | 446/11704 [04:10<1:32:13, 2.03it/s]
|
752 |
4%|β | 447/11704 [04:11<1:32:10, 2.04it/s]
|
753 |
4%|β | 448/11704 [04:11<1:32:08, 2.04it/s]
|
754 |
4%|β | 449/11704 [04:12<1:32:12, 2.03it/s]
|
755 |
4%|β | 450/11704 [04:12<1:32:11, 2.03it/s]{'loss': 4.4307, 'grad_norm': 0.6776960492134094, 'learning_rate': 0.0003842869342442357, 'epoch': 0.54}
|
756 |
|
757 |
+
|
758 |
4%|β | 450/11704 [04:12<1:32:11, 2.03it/s]
|
759 |
4%|β | 451/11704 [04:13<1:32:13, 2.03it/s]
|
760 |
4%|β | 452/11704 [04:13<1:32:17, 2.03it/s]
|
761 |
4%|β | 453/11704 [04:14<1:32:13, 2.03it/s]
|
762 |
4%|β | 454/11704 [04:14<1:32:16, 2.03it/s]
|
763 |
4%|β | 455/11704 [04:15<1:32:11, 2.03it/s]
|
764 |
4%|β | 456/11704 [04:15<1:32:06, 2.04it/s]
|
765 |
4%|β | 457/11704 [04:16<1:32:11, 2.03it/s]
|
766 |
4%|β | 458/11704 [04:16<1:32:09, 2.03it/s]
|
767 |
4%|β | 459/11704 [04:17<1:32:04, 2.04it/s]
|
768 |
4%|β | 460/11704 [04:17<1:32:09, 2.03it/s]
|
769 |
4%|β | 461/11704 [04:18<1:32:10, 2.03it/s]
|
770 |
4%|β | 462/11704 [04:18<1:32:04, 2.03it/s]
|
771 |
4%|β | 463/11704 [04:19<1:32:10, 2.03it/s]
|
772 |
4%|β | 464/11704 [04:19<1:32:03, 2.04it/s]
|
773 |
4%|β | 465/11704 [04:20<1:31:56, 2.04it/s]
|
774 |
4%|β | 466/11704 [04:20<1:32:00, 2.04it/s]
|
775 |
4%|β | 467/11704 [04:21<1:32:05, 2.03it/s]
|
776 |
4%|β | 468/11704 [04:21<1:31:58, 2.04it/s]
|
777 |
4%|β | 469/11704 [04:22<1:32:03, 2.03it/s]
|
778 |
4%|β | 470/11704 [04:22<1:31:58, 2.04it/s]
|
779 |
4%|β | 471/11704 [04:22<1:31:56, 2.04it/s]
|
780 |
4%|β | 472/11704 [04:23<1:31:58, 2.04it/s]
|
781 |
4%|β | 473/11704 [04:23<1:32:04, 2.03it/s]
|
782 |
4%|β | 474/11704 [04:24<1:31:58, 2.04it/s]
|
783 |
4%|β | 475/11704 [04:24<1:32:03, 2.03it/s]{'loss': 4.3608, 'grad_norm': 0.7181493043899536, 'learning_rate': 0.00040563620836891546, 'epoch': 0.57}
|
784 |
+
|
785 |
|
786 |
4%|β | 475/11704 [04:24<1:32:03, 2.03it/s]
|
787 |
4%|β | 476/11704 [04:25<1:32:02, 2.03it/s]
|
788 |
4%|β | 477/11704 [04:25<1:32:02, 2.03it/s]
|
789 |
4%|β | 478/11704 [04:26<1:32:07, 2.03it/s]
|
790 |
4%|β | 479/11704 [04:26<1:31:59, 2.03it/s]
|
791 |
4%|β | 480/11704 [04:27<1:32:01, 2.03it/s]
|
792 |
4%|β | 481/11704 [04:27<1:32:03, 2.03it/s]
|
793 |
4%|β | 482/11704 [04:28<1:31:58, 2.03it/s]
|
794 |
4%|β | 483/11704 [04:28<1:32:07, 2.03it/s]
|
795 |
4%|β | 484/11704 [04:29<1:32:02, 2.03it/s]
|
796 |
4%|β | 485/11704 [04:29<1:31:57, 2.03it/s]
|
797 |
4%|β | 486/11704 [04:30<1:31:53, 2.03it/s]
|
798 |
4%|β | 487/11704 [04:30<1:31:57, 2.03it/s]
|
799 |
4%|β | 488/11704 [04:31<1:31:56, 2.03it/s]
|
800 |
4%|β | 489/11704 [04:31<1:31:56, 2.03it/s]
|
801 |
4%|β | 490/11704 [04:32<1:31:57, 2.03it/s]
|
802 |
4%|β | 491/11704 [04:32<1:31:54, 2.03it/s]
|
803 |
4%|β | 492/11704 [04:33<1:31:57, 2.03it/s]
|
804 |
4%|β | 493/11704 [04:33<1:31:54, 2.03it/s]
|
805 |
4%|β | 494/11704 [04:34<1:31:55, 2.03it/s]
|
806 |
4%|β | 495/11704 [04:34<1:31:55, 2.03it/s]
|
807 |
4%|β | 496/11704 [04:35<1:31:53, 2.03it/s]
|
808 |
4%|β | 497/11704 [04:35<1:31:52, 2.03it/s]
|
809 |
4%|β | 498/11704 [04:36<1:31:53, 2.03it/s]
|
810 |
4%|β | 499/11704 [04:36<1:31:47, 2.03it/s]
|
811 |
4%|β | 500/11704 [04:37<1:31:42, 2.04it/s]{'loss': 4.2807, 'grad_norm': 0.6647967100143433, 'learning_rate': 0.0004269854824935952, 'epoch': 0.6}
|
812 |
+
|
813 |
|
814 |
4%|β | 500/11704 [04:37<1:31:42, 2.04it/s]
|
815 |
4%|β | 501/11704 [04:37<1:31:46, 2.03it/s]
|
816 |
4%|β | 502/11704 [04:38<1:31:41, 2.04it/s]
|
817 |
4%|β | 503/11704 [04:38<1:31:44, 2.03it/s]
|
818 |
4%|β | 504/11704 [04:39<1:31:46, 2.03it/s]
|
819 |
4%|β | 505/11704 [04:39<1:31:44, 2.03it/s]
|
820 |
4%|β | 506/11704 [04:40<1:31:47, 2.03it/s]
|
821 |
4%|β | 507/11704 [04:40<1:31:46, 2.03it/s]
|
822 |
4%|β | 508/11704 [04:41<1:31:43, 2.03it/s]
|
823 |
4%|β | 509/11704 [04:41<1:31:50, 2.03it/s]
|
824 |
4%|β | 510/11704 [04:42<1:31:44, 2.03it/s]
|
825 |
4%|β | 511/11704 [04:42<1:31:40, 2.04it/s]
|
826 |
4%|β | 512/11704 [04:43<1:31:45, 2.03it/s]
|
827 |
4%|β | 513/11704 [04:43<1:31:42, 2.03it/s]
|
828 |
4%|β | 514/11704 [04:44<1:31:42, 2.03it/s]
|
829 |
4%|β | 515/11704 [04:44<1:31:45, 2.03it/s]
|
830 |
4%|β | 516/11704 [04:45<1:31:43, 2.03it/s]
|
831 |
4%|β | 517/11704 [04:45<1:31:48, 2.03it/s]
|
832 |
4%|β | 518/11704 [04:46<1:32:34, 2.01it/s]
|
833 |
4%|β | 519/11704 [04:46<1:32:16, 2.02it/s]
|
834 |
4%|β | 520/11704 [04:47<1:31:57, 2.03it/s]
|
835 |
4%|β | 521/11704 [04:47<1:31:50, 2.03it/s]
|
836 |
4%|β | 522/11704 [04:48<1:32:03, 2.02it/s]
|
837 |
4%|β | 523/11704 [04:48<1:31:50, 2.03it/s]
|
838 |
4%|β | 524/11704 [04:49<1:31:45, 2.03it/s]
|
839 |
4%|β | 525/11704 [04:49<1:31:44, 2.03it/s]{'loss': 4.221, 'grad_norm': 0.5851206183433533, 'learning_rate': 0.00044833475661827497, 'epoch': 0.63}
|
840 |
+
|
841 |
|
842 |
4%|β | 525/11704 [04:49<1:31:44, 2.03it/s]
|
843 |
4%|β | 526/11704 [04:50<1:31:38, 2.03it/s]
|
844 |
5%|β | 527/11704 [04:50<1:31:31, 2.04it/s]
|
845 |
5%|β | 528/11704 [04:51<1:31:36, 2.03it/s]
|
846 |
5%|β | 529/11704 [04:51<1:31:34, 2.03it/s]
|
847 |
5%|β | 530/11704 [04:52<1:31:33, 2.03it/s]
|
848 |
5%|β | 531/11704 [04:52<1:31:38, 2.03it/s]
|
849 |
5%|β | 532/11704 [04:53<1:31:36, 2.03it/s]
|
850 |
5%|β | 533/11704 [04:53<1:31:28, 2.04it/s]
|
851 |
5%|β | 534/11704 [04:53<1:31:30, 2.03it/s]
|
852 |
5%|β | 535/11704 [04:54<1:31:34, 2.03it/s]
|
853 |
5%|β | 536/11704 [04:54<1:31:33, 2.03it/s]
|
854 |
5%|β | 537/11704 [04:55<1:31:29, 2.03it/s]
|
855 |
5%|β | 538/11704 [04:55<1:31:32, 2.03it/s]
|
856 |
5%|β | 539/11704 [04:56<1:31:29, 2.03it/s]
|
857 |
5%|β | 540/11704 [04:56<1:31:23, 2.04it/s]
|
858 |
5%|β | 541/11704 [04:57<1:31:26, 2.03it/s]
|
859 |
5%|β | 542/11704 [04:57<1:31:21, 2.04it/s]
|
860 |
5%|β | 543/11704 [04:58<1:31:19, 2.04it/s]
|
861 |
5%|β | 544/11704 [04:58<1:31:27, 2.03it/s]
|
862 |
5%|β | 545/11704 [04:59<1:31:24, 2.03it/s]
|
863 |
5%|β | 546/11704 [04:59<1:31:18, 2.04it/s]
|
864 |
5%|β | 547/11704 [05:00<1:31:20, 2.04it/s]
|
865 |
5%|β | 548/11704 [05:00<1:31:17, 2.04it/s]
|
866 |
5%|β | 549/11704 [05:01<1:31:09, 2.04it/s]
|
867 |
5%|β | 550/11704 [05:01<1:31:18, 2.04it/s]{'loss': 4.1699, 'grad_norm': 0.8667837381362915, 'learning_rate': 0.00046968403074295473, 'epoch': 0.66}
|
868 |
+
|
869 |
|
870 |
5%|β | 550/11704 [05:01<1:31:18, 2.04it/s]
|
871 |
5%|β | 551/11704 [05:02<1:31:19, 2.04it/s]
|
872 |
5%|β | 552/11704 [05:02<1:31:17, 2.04it/s]
|
873 |
5%|β | 553/11704 [05:03<1:31:25, 2.03it/s]
|
874 |
5%|β | 554/11704 [05:03<1:31:22, 2.03it/s]
|
875 |
5%|β | 555/11704 [05:04<1:31:22, 2.03it/s]
|
876 |
5%|β | 556/11704 [05:04<1:31:25, 2.03it/s]
|
877 |
5%|β | 557/11704 [05:05<1:31:21, 2.03it/s]
|
878 |
5%|β | 558/11704 [05:05<1:31:20, 2.03it/s]
|
879 |
5%|β | 559/11704 [05:06<1:31:21, 2.03it/s]
|
880 |
5%|β | 560/11704 [05:06<1:31:22, 2.03it/s]
|
881 |
5%|β | 561/11704 [05:07<1:31:22, 2.03it/s]
|
882 |
5%|β | 562/11704 [05:07<1:31:17, 2.03it/s]
|
883 |
5%|β | 563/11704 [05:08<1:31:13, 2.04it/s]
|
884 |
5%|β | 564/11704 [05:08<1:31:18, 2.03it/s]
|
885 |
5%|β | 565/11704 [05:09<1:31:13, 2.03it/s]
|
886 |
5%|β | 566/11704 [05:09<1:31:07, 2.04it/s]
|
887 |
5%|β | 567/11704 [05:10<1:31:11, 2.04it/s]
|
888 |
5%|β | 568/11704 [05:10<1:31:09, 2.04it/s]
|
889 |
5%|β | 569/11704 [05:11<1:31:03, 2.04it/s]
|
890 |
5%|β | 570/11704 [05:11<1:31:11, 2.03it/s]
|
891 |
5%|β | 571/11704 [05:12<1:31:07, 2.04it/s]
|
892 |
5%|β | 572/11704 [05:12<1:31:04, 2.04it/s]
|
893 |
5%|β | 573/11704 [05:13<1:31:09, 2.04it/s]
|
894 |
5%|β | 574/11704 [05:13<1:31:05, 2.04it/s]
|
895 |
5%|β | 575/11704 [05:14<1:31:05, 2.04it/s]{'loss': 4.1233, 'grad_norm': 0.720784604549408, 'learning_rate': 0.0004910333048676345, 'epoch': 0.69}
|
896 |
+
|
897 |
|
898 |
5%|β | 575/11704 [05:14<1:31:05, 2.04it/s]
|
899 |
5%|β | 576/11704 [05:14<1:31:14, 2.03it/s]
|
900 |
5%|β | 577/11704 [05:15<1:31:06, 2.04it/s]
|
901 |
5%|β | 578/11704 [05:15<1:31:06, 2.04it/s]
|
902 |
5%|β | 579/11704 [05:16<1:31:10, 2.03it/s]
|
903 |
5%|β | 580/11704 [05:16<1:31:03, 2.04it/s]
|
904 |
5%|β | 581/11704 [05:17<1:31:00, 2.04it/s]
|
905 |
5%|β | 582/11704 [05:17<1:31:08, 2.03it/s]
|
906 |
5%|β | 583/11704 [05:18<1:31:06, 2.03it/s]
|
907 |
5%|β | 584/11704 [05:18<1:31:09, 2.03it/s]
|
908 |
5%|β | 585/11704 [05:19<1:31:07, 2.03it/s]
|
909 |
5%|β | 586/11704 [05:19<1:31:03, 2.04it/s]
|
910 |
5%|β | 587/11704 [05:20<1:31:06, 2.03it/s]
|
911 |
5%|β | 588/11704 [05:20<1:31:03, 2.03it/s]
|
912 |
5%|β | 589/11704 [05:21<1:30:56, 2.04it/s]
|
913 |
5%|β | 590/11704 [05:21<1:31:02, 2.03it/s]
|
914 |
5%|β | 591/11704 [05:21<1:31:02, 2.03it/s]
|
915 |
5%|β | 592/11704 [05:22<1:30:59, 2.04it/s]
|
916 |
5%|β | 593/11704 [05:22<1:31:05, 2.03it/s]
|
917 |
5%|β | 594/11704 [05:23<1:31:00, 2.03it/s]
|
918 |
5%|β | 595/11704 [05:23<1:30:57, 2.04it/s]
|
919 |
5%|β | 596/11704 [05:24<1:30:54, 2.04it/s]
|
920 |
5%|β | 597/11704 [05:24<1:30:55, 2.04it/s]
|
921 |
5%|β | 598/11704 [05:25<1:30:59, 2.03it/s]
|
922 |
5%|β | 599/11704 [05:25<1:31:07, 2.03it/s]
|
923 |
5%|β | 600/11704 [05:26<1:31:00, 2.03it/s]{'loss': 4.0671, 'grad_norm': 0.5922385454177856, 'learning_rate': 0.0005123825789923142, 'epoch': 0.72}
|
924 |
+
|
925 |
|
926 |
5%|β | 600/11704 [05:26<1:31:00, 2.03it/s]
|
927 |
5%|β | 601/11704 [05:26<1:31:07, 2.03it/s]
|
928 |
5%|β | 602/11704 [05:27<1:31:07, 2.03it/s]
|
929 |
5%|β | 603/11704 [05:27<1:30:58, 2.03it/s]
|
930 |
5%|β | 604/11704 [05:28<1:31:02, 2.03it/s]
|
931 |
5%|β | 605/11704 [05:28<1:30:57, 2.03it/s]
|
932 |
5%|β | 606/11704 [05:29<1:30:54, 2.03it/s]
|
933 |
5%|β | 607/11704 [05:29<1:31:00, 2.03it/s]
|
934 |
5%|β | 608/11704 [05:30<1:30:53, 2.03it/s]
|
935 |
5%|β | 609/11704 [05:30<1:30:51, 2.04it/s]
|
936 |
5%|β | 610/11704 [05:31<1:30:58, 2.03it/s]
|
937 |
5%|β | 611/11704 [05:31<1:30:50, 2.04it/s]
|
938 |
5%|β | 612/11704 [05:32<1:30:44, 2.04it/s]
|
939 |
5%|β | 613/11704 [05:32<1:30:56, 2.03it/s]
|
940 |
5%|β | 614/11704 [05:33<1:30:48, 2.04it/s]
|
941 |
5%|β | 615/11704 [05:33<1:39:03, 1.87it/s]
|
942 |
5%|β | 616/11704 [05:34<1:44:26, 1.77it/s]
|
943 |
5%|β | 617/11704 [05:35<1:40:15, 1.84it/s]
|
944 |
5%|β | 618/11704 [05:35<1:37:27, 1.90it/s]
|
945 |
5%|β | 619/11704 [05:36<1:35:28, 1.93it/s]
|
946 |
5%|β | 620/11704 [05:36<1:33:54, 1.97it/s]
|
947 |
5%|β | 621/11704 [05:37<1:33:01, 1.99it/s]
|
948 |
5%|β | 622/11704 [05:37<1:32:24, 2.00it/s]
|
949 |
5%|β | 623/11704 [05:38<1:31:51, 2.01it/s]
|
950 |
5%|β | 624/11704 [05:38<1:31:28, 2.02it/s]
|
951 |
5%|β | 625/11704 [05:38<1:31:20, 2.02it/s]{'loss': 4.0331, 'grad_norm': 0.5989447236061096, 'learning_rate': 0.000533731853116994, 'epoch': 0.75}
|
952 |
+
|
953 |
|
954 |
5%|β | 625/11704 [05:39<1:31:20, 2.02it/s]
|
955 |
5%|β | 626/11704 [05:39<1:31:10, 2.03it/s]
|
956 |
5%|β | 627/11704 [05:39<1:30:59, 2.03it/s]
|
957 |
5%|β | 628/11704 [05:40<1:31:00, 2.03it/s]
|
958 |
5%|β | 629/11704 [05:40<1:30:50, 2.03it/s]
|
959 |
5%|β | 630/11704 [05:41<1:30:48, 2.03it/s]
|
960 |
5%|β | 631/11704 [05:41<1:30:49, 2.03it/s]
|
961 |
5%|β | 632/11704 [05:42<1:30:45, 2.03it/s]
|
962 |
5%|β | 633/11704 [05:42<1:30:51, 2.03it/s]
|
963 |
5%|β | 634/11704 [05:43<1:30:44, 2.03it/s]
|
964 |
5%|β | 635/11704 [05:43<1:30:39, 2.04it/s]
|
965 |
5%|β | 636/11704 [05:44<1:30:46, 2.03it/s]
|
966 |
5%|β | 637/11704 [05:44<1:30:45, 2.03it/s]
|
967 |
5%|β | 638/11704 [05:45<1:30:43, 2.03it/s]
|
968 |
5%|β | 639/11704 [05:45<1:30:43, 2.03it/s]
|
969 |
5%|β | 640/11704 [05:46<1:30:38, 2.03it/s]
|
970 |
5%|β | 641/11704 [05:46<1:30:40, 2.03it/s]
|
971 |
5%|β | 642/11704 [05:47<1:30:42, 2.03it/s]
|
972 |
5%|β | 643/11704 [05:47<1:30:39, 2.03it/s]
|
973 |
6%|β | 644/11704 [05:48<1:30:44, 2.03it/s]
|
974 |
6%|β | 645/11704 [05:48<1:30:35, 2.03it/s]
|
975 |
6%|β | 646/11704 [05:49<1:30:40, 2.03it/s]
|
976 |
6%|β | 647/11704 [05:49<1:30:42, 2.03it/s]
|
977 |
6%|β | 648/11704 [05:50<1:30:35, 2.03it/s]
|
978 |
6%|β | 649/11704 [05:50<1:30:36, 2.03it/s]
|
979 |
6%|β | 650/11704 [05:51<1:30:42, 2.03it/s]{'loss': 3.9926, 'grad_norm': 0.630885899066925, 'learning_rate': 0.0005550811272416738, 'epoch': 0.78}
|
980 |
+
|
981 |
|
982 |
6%|β | 650/11704 [05:51<1:30:42, 2.03it/s]
|
983 |
6%|β | 651/11704 [05:51<1:30:40, 2.03it/s]
|
984 |
6%|β | 652/11704 [05:52<1:30:43, 2.03it/s]
|
985 |
6%|β | 653/11704 [05:52<1:30:36, 2.03it/s]
|
986 |
6%|β | 654/11704 [05:53<1:30:33, 2.03it/s]
|
987 |
6%|β | 655/11704 [05:53<1:30:43, 2.03it/s]
|
988 |
6%|β | 656/11704 [05:54<1:30:37, 2.03it/s]
|
989 |
6%|β | 657/11704 [05:54<1:30:37, 2.03it/s]
|
990 |
6%|β | 658/11704 [05:55<1:30:42, 2.03it/s]
|
991 |
6%|β | 659/11704 [05:55<1:30:42, 2.03it/s]
|
992 |
6%|β | 660/11704 [05:56<1:30:41, 2.03it/s]
|
993 |
6%|β | 661/11704 [05:56<1:30:43, 2.03it/s]
|
994 |
6%|β | 662/11704 [05:57<1:30:33, 2.03it/s]
|
995 |
6%|β | 663/11704 [05:57<1:30:33, 2.03it/s]
|
996 |
6%|β | 664/11704 [05:58<1:30:35, 2.03it/s]
|
997 |
6%|β | 665/11704 [05:58<1:30:35, 2.03it/s]
|
998 |
6%|β | 666/11704 [05:59<1:30:29, 2.03it/s]
|
999 |
6%|β | 667/11704 [05:59<1:30:36, 2.03it/s]
|
1000 |
6%|β | 668/11704 [06:00<1:30:36, 2.03it/s]
|
1001 |
6%|β | 669/11704 [06:00<1:30:32, 2.03it/s]
|
1002 |
6%|β | 670/11704 [06:01<1:30:35, 2.03it/s]
|
1003 |
6%|β | 671/11704 [06:01<1:30:30, 2.03it/s]
|
1004 |
6%|β | 672/11704 [06:02<1:30:28, 2.03it/s]
|
1005 |
6%|β | 673/11704 [06:02<1:30:26, 2.03it/s]
|
1006 |
6%|β | 674/11704 [06:03<1:30:24, 2.03it/s]
|
1007 |
6%|β | 675/11704 [06:03<1:30:22, 2.03it/s]{'loss': 3.9532, 'grad_norm': 0.6419950723648071, 'learning_rate': 0.0005764304013663536, 'epoch': 0.81}
|
1008 |
+
|
1009 |
|
1010 |
6%|β | 675/11704 [06:03<1:30:22, 2.03it/s]
|
1011 |
6%|β | 676/11704 [06:04<1:30:31, 2.03it/s]
|
1012 |
6%|β | 677/11704 [06:04<1:30:28, 2.03it/s]
|
1013 |
6%|β | 678/11704 [06:05<1:30:33, 2.03it/s]
|
1014 |
6%|β | 679/11704 [06:05<1:30:34, 2.03it/s]
|
1015 |
6%|β | 680/11704 [06:06<1:30:24, 2.03it/s]
|
1016 |
6%|β | 681/11704 [06:06<1:30:30, 2.03it/s]
|
1017 |
6%|β | 682/11704 [06:07<1:30:29, 2.03it/s]
|
1018 |
6%|β | 683/11704 [06:07<1:30:31, 2.03it/s]
|
1019 |
6%|β | 684/11704 [06:08<1:30:31, 2.03it/s]
|
1020 |
6%|β | 685/11704 [06:08<1:30:25, 2.03it/s]
|
1021 |
6%|β | 686/11704 [06:09<1:30:28, 2.03it/s]
|
1022 |
6%|β | 687/11704 [06:09<1:30:25, 2.03it/s]
|
1023 |
6%|β | 688/11704 [06:10<1:30:24, 2.03it/s]
|
1024 |
6%|β | 689/11704 [06:10<1:30:25, 2.03it/s]
|
1025 |
6%|β | 690/11704 [06:10<1:30:24, 2.03it/s]
|
1026 |
6%|β | 691/11704 [06:11<1:30:24, 2.03it/s]
|
1027 |
6%|β | 692/11704 [06:11<1:30:19, 2.03it/s]
|
1028 |
6%|β | 693/11704 [06:12<1:30:15, 2.03it/s]
|
1029 |
6%|β | 694/11704 [06:12<1:30:20, 2.03it/s]
|
1030 |
6%|β | 695/11704 [06:13<1:30:17, 2.03it/s]
|
1031 |
6%|β | 696/11704 [06:13<1:30:23, 2.03it/s]
|
1032 |
6%|β | 697/11704 [06:14<1:30:18, 2.03it/s]
|
1033 |
6%|β | 698/11704 [06:14<1:30:13, 2.03it/s]
|
1034 |
6%|β | 699/11704 [06:15<1:30:15, 2.03it/s]
|
1035 |
6%|β | 700/11704 [06:15<1:30:13, 2.03it/s]{'loss': 3.9036, 'grad_norm': 0.5995571613311768, 'learning_rate': 0.0005977796754910333, 'epoch': 0.84}
|
1036 |
+
|
1037 |
|
1038 |
6%|β | 700/11704 [06:15<1:30:13, 2.03it/s]
|
1039 |
6%|β | 701/11704 [06:16<1:30:21, 2.03it/s]
|
1040 |
6%|β | 702/11704 [06:16<1:30:22, 2.03it/s]
|
1041 |
6%|β | 703/11704 [06:17<1:30:16, 2.03it/s]
|
1042 |
6%|β | 704/11704 [06:17<1:30:19, 2.03it/s]
|
1043 |
6%|β | 705/11704 [06:18<1:30:16, 2.03it/s]
|
1044 |
6%|β | 706/11704 [06:18<1:30:18, 2.03it/s]
|
1045 |
6%|β | 707/11704 [06:19<1:30:13, 2.03it/s]
|
1046 |
6%|β | 708/11704 [06:19<1:30:10, 2.03it/s]
|
1047 |
6%|β | 709/11704 [06:20<1:30:16, 2.03it/s]
|
1048 |
6%|β | 710/11704 [06:20<1:30:10, 2.03it/s]
|
1049 |
6%|β | 711/11704 [06:21<1:30:22, 2.03it/s]
|
1050 |
6%|β | 712/11704 [06:21<1:30:14, 2.03it/s]
|
1051 |
6%|β | 713/11704 [06:22<1:30:10, 2.03it/s]
|
1052 |
6%|β | 714/11704 [06:22<1:30:16, 2.03it/s]
|
1053 |
6%|β | 715/11704 [06:23<1:30:10, 2.03it/s]
|
1054 |
6%|β | 716/11704 [06:23<1:30:13, 2.03it/s]
|
1055 |
6%|β | 717/11704 [06:24<1:30:17, 2.03it/s]
|
1056 |
6%|β | 718/11704 [06:24<1:30:09, 2.03it/s]
|
1057 |
6%|β | 719/11704 [06:25<1:30:13, 2.03it/s]
|
1058 |
6%|β | 720/11704 [06:25<1:30:05, 2.03it/s]
|
1059 |
6%|β | 721/11704 [06:26<1:30:09, 2.03it/s]
|
1060 |
6%|β | 722/11704 [06:26<1:30:09, 2.03it/s]
|
1061 |
6%|β | 723/11704 [06:27<1:30:08, 2.03it/s]
|
1062 |
6%|β | 724/11704 [06:27<1:30:10, 2.03it/s]
|
1063 |
6%|β | 725/11704 [06:28<1:30:07, 2.03it/s]{'loss': 3.8761, 'grad_norm': 0.5167131423950195, 'learning_rate': 0.000619128949615713, 'epoch': 0.87}
|
1064 |
+
|
1065 |
|
1066 |
6%|β | 725/11704 [06:28<1:30:07, 2.03it/s]
|
1067 |
6%|β | 726/11704 [06:28<1:30:10, 2.03it/s]
|
1068 |
6%|β | 727/11704 [06:29<1:30:07, 2.03it/s]
|
1069 |
6%|β | 728/11704 [06:29<1:30:06, 2.03it/s]
|
1070 |
6%|β | 729/11704 [06:30<1:30:10, 2.03it/s]
|
1071 |
6%|β | 730/11704 [06:30<1:30:08, 2.03it/s]
|
1072 |
6%|β | 731/11704 [06:31<1:30:10, 2.03it/s]
|
1073 |
6%|β | 732/11704 [06:31<1:30:04, 2.03it/s]
|
1074 |
6%|β | 733/11704 [06:32<1:30:01, 2.03it/s]
|
1075 |
6%|β | 734/11704 [06:32<1:30:04, 2.03it/s]
|
1076 |
6%|β | 735/11704 [06:33<1:29:58, 2.03it/s]
|
1077 |
6%|β | 736/11704 [06:33<1:30:04, 2.03it/s]
|
1078 |
6%|β | 737/11704 [06:34<1:30:03, 2.03it/s]
|
1079 |
6%|β | 738/11704 [06:34<1:30:02, 2.03it/s]
|
1080 |
6%|β | 739/11704 [06:35<1:30:03, 2.03it/s]
|
1081 |
6%|β | 740/11704 [06:35<1:29:57, 2.03it/s]
|
1082 |
6%|β | 741/11704 [06:36<1:30:03, 2.03it/s]
|
1083 |
6%|β | 742/11704 [06:36<1:30:01, 2.03it/s]
|
1084 |
6%|β | 743/11704 [06:37<1:29:59, 2.03it/s]
|
1085 |
6%|β | 744/11704 [06:37<1:30:00, 2.03it/s]
|
1086 |
6%|β | 745/11704 [06:38<1:29:54, 2.03it/s]
|
1087 |
6%|β | 746/11704 [06:38<1:30:01, 2.03it/s]
|
1088 |
6%|β | 747/11704 [06:39<1:30:00, 2.03it/s]
|
1089 |
6%|β | 748/11704 [06:39<1:29:55, 2.03it/s]
|
1090 |
6%|β | 749/11704 [06:40<1:30:01, 2.03it/s]
|
1091 |
6%|β | 750/11704 [06:40<1:29:54, 2.03it/s]{'loss': 3.851, 'grad_norm': 0.5299991369247437, 'learning_rate': 0.0006404782237403928, 'epoch': 0.9}
|
1092 |
+
|
1093 |
|
1094 |
6%|β | 750/11704 [06:40<1:29:54, 2.03it/s]
|
1095 |
6%|β | 751/11704 [06:41<1:29:57, 2.03it/s]
|
1096 |
6%|β | 752/11704 [06:41<1:30:00, 2.03it/s]
|
1097 |
6%|β | 753/11704 [06:42<1:29:52, 2.03it/s]
|
1098 |
6%|β | 754/11704 [06:42<1:30:01, 2.03it/s]
|
1099 |
6%|β | 755/11704 [06:43<1:29:53, 2.03it/s]
|
1100 |
6%|β | 756/11704 [06:43<1:29:51, 2.03it/s]
|
1101 |
6%|β | 757/11704 [06:43<1:29:52, 2.03it/s]
|
1102 |
6%|β | 758/11704 [06:44<1:29:51, 2.03it/s]
|
1103 |
6%|β | 759/11704 [06:44<1:29:56, 2.03it/s]
|
1104 |
6%|β | 760/11704 [06:45<1:29:50, 2.03it/s]
|
1105 |
7%|β | 761/11704 [06:45<1:29:48, 2.03it/s]
|
1106 |
7%|β | 762/11704 [06:46<1:29:55, 2.03it/s]
|
1107 |
7%|β | 763/11704 [06:46<1:29:48, 2.03it/s]
|
1108 |
7%|β | 764/11704 [06:47<1:29:48, 2.03it/s]
|
1109 |
7%|β | 765/11704 [06:47<1:29:54, 2.03it/s]
|
1110 |
7%|β | 766/11704 [06:48<1:29:47, 2.03it/s]
|
1111 |
7%|β | 767/11704 [06:48<1:29:56, 2.03it/s]
|
1112 |
7%|β | 768/11704 [06:49<1:29:52, 2.03it/s]
|
1113 |
7%|β | 769/11704 [06:49<1:29:52, 2.03it/s]
|
1114 |
7%|β | 770/11704 [06:50<1:29:50, 2.03it/s]
|
1115 |
7%|β | 771/11704 [06:50<1:29:44, 2.03it/s]
|
1116 |
7%|β | 772/11704 [06:51<1:29:44, 2.03it/s]
|
1117 |
7%|β | 773/11704 [06:51<1:29:36, 2.03it/s]
|
1118 |
7%|β | 774/11704 [06:52<1:29:42, 2.03it/s]
|
1119 |
7%|β | 775/11704 [06:52<1:29:42, 2.03it/s]{'loss': 3.7964, 'grad_norm': 0.5229047536849976, 'learning_rate': 0.0006618274978650726, 'epoch': 0.93}
|
1120 |
+
|
1121 |
|
1122 |
7%|β | 775/11704 [06:52<1:29:42, 2.03it/s]
|
1123 |
7%|β | 776/11704 [06:53<1:29:43, 2.03it/s]
|
1124 |
7%|β | 777/11704 [06:53<1:29:50, 2.03it/s]
|
1125 |
7%|β | 778/11704 [06:54<1:29:41, 2.03it/s]
|
1126 |
7%|β | 779/11704 [06:54<1:29:43, 2.03it/s]
|
1127 |
7%|β | 780/11704 [06:55<1:29:41, 2.03it/s]
|
1128 |
7%|β | 781/11704 [06:55<1:29:40, 2.03it/s]
|
1129 |
7%|β | 782/11704 [06:56<1:29:39, 2.03it/s]
|
1130 |
7%|β | 783/11704 [06:56<1:29:35, 2.03it/s]
|
1131 |
7%|β | 784/11704 [06:57<1:29:34, 2.03it/s]
|
1132 |
7%|β | 785/11704 [06:57<1:29:40, 2.03it/s]
|
1133 |
7%|β | 786/11704 [06:58<1:29:40, 2.03it/s]
|
1134 |
7%|β | 787/11704 [06:58<1:29:44, 2.03it/s]
|
1135 |
7%|β | 788/11704 [06:59<1:29:40, 2.03it/s]
|
1136 |
7%|β | 789/11704 [06:59<1:29:42, 2.03it/s]
|
1137 |
7%|β | 790/11704 [07:00<1:29:42, 2.03it/s]
|
1138 |
7%|β | 791/11704 [07:00<1:29:41, 2.03it/s]
|
1139 |
7%|β | 792/11704 [07:01<1:29:39, 2.03it/s]
|
1140 |
7%|β | 793/11704 [07:01<1:29:34, 2.03it/s]
|
1141 |
7%|β | 794/11704 [07:02<1:29:26, 2.03it/s]
|
1142 |
7%|β | 795/11704 [07:02<1:29:29, 2.03it/s]
|
1143 |
7%|β | 796/11704 [07:03<1:29:29, 2.03it/s]
|
1144 |
7%|β | 797/11704 [07:03<1:29:29, 2.03it/s]
|
1145 |
7%|β | 798/11704 [07:04<1:29:32, 2.03it/s]
|
1146 |
7%|β | 799/11704 [07:04<1:29:33, 2.03it/s]
|
1147 |
7%|β | 800/11704 [07:05<1:29:32, 2.03it/s]{'loss': 3.7888, 'grad_norm': 0.48627805709838867, 'learning_rate': 0.0006831767719897524, 'epoch': 0.96}
|
1148 |
+
|
1149 |
|
1150 |
7%|β | 800/11704 [07:05<1:29:32, 2.03it/s]
|
1151 |
7%|β | 801/11704 [07:05<1:29:38, 2.03it/s]
|
1152 |
7%|β | 802/11704 [07:06<1:29:34, 2.03it/s]
|
1153 |
7%|β | 803/11704 [07:06<1:29:34, 2.03it/s]
|
1154 |
7%|β | 804/11704 [07:07<1:29:28, 2.03it/s]
|
1155 |
7%|β | 805/11704 [07:07<1:29:23, 2.03it/s]
|
1156 |
7%|β | 806/11704 [07:08<1:29:21, 2.03it/s]
|
1157 |
7%|β | 807/11704 [07:08<1:29:29, 2.03it/s]
|
1158 |
7%|β | 808/11704 [07:09<1:29:27, 2.03it/s]
|
1159 |
7%|β | 809/11704 [07:09<1:29:26, 2.03it/s]
|
1160 |
7%|β | 810/11704 [07:10<1:29:27, 2.03it/s]
|
1161 |
7%|β | 811/11704 [07:10<1:29:23, 2.03it/s]
|
1162 |
7%|β | 812/11704 [07:11<1:29:25, 2.03it/s]
|
1163 |
7%|β | 813/11704 [07:11<1:29:24, 2.03it/s]
|
1164 |
7%|β | 814/11704 [07:12<1:29:16, 2.03it/s]
|
1165 |
7%|β | 815/11704 [07:12<1:29:08, 2.04it/s]
|
1166 |
7%|β | 816/11704 [07:13<1:29:13, 2.03it/s]
|
1167 |
7%|β | 817/11704 [07:13<1:29:13, 2.03it/s]
|
1168 |
7%|β | 818/11704 [07:14<1:29:15, 2.03it/s]
|
1169 |
7%|β | 819/11704 [07:14<1:29:19, 2.03it/s]
|
1170 |
7%|β | 820/11704 [07:15<1:29:12, 2.03it/s]
|
1171 |
7%|β | 821/11704 [07:15<1:29:15, 2.03it/s]
|
1172 |
7%|β | 822/11704 [07:16<1:29:14, 2.03it/s]
|
1173 |
7%|β | 823/11704 [07:16<1:29:12, 2.03it/s]
|
1174 |
7%|β | 824/11704 [07:16<1:29:17, 2.03it/s]
|
1175 |
7%|β | 825/11704 [07:17<1:29:15, 2.03it/s]{'loss': 3.7506, 'grad_norm': 0.5719705820083618, 'learning_rate': 0.0007045260461144322, 'epoch': 0.99}
|
1176 |
+
|
1177 |
|
1178 |
7%|β | 825/11704 [07:17<1:29:15, 2.03it/s]
|
1179 |
7%|β | 826/11704 [07:17<1:29:22, 2.03it/s]
|
1180 |
7%|β | 827/11704 [07:18<1:29:19, 2.03it/s]
|
1181 |
7%|β | 828/11704 [07:18<1:29:13, 2.03it/s]
|
1182 |
7%|β | 829/11704 [07:19<1:29:19, 2.03it/s]
|
1183 |
7%|β | 830/11704 [07:19<1:29:18, 2.03it/s]
|
1184 |
7%|β | 831/11704 [07:20<1:29:20, 2.03it/s]
|
1185 |
7%|β | 832/11704 [07:20<1:29:17, 2.03it/s]
|
1186 |
7%|β | 833/11704 [07:21<1:29:13, 2.03it/s]
|
1187 |
7%|β | 834/11704 [07:21<1:29:20, 2.03it/s]
|
1188 |
7%|β | 835/11704 [07:22<1:29:11, 2.03it/s]
|
1189 |
7%|β | 836/11704 [07:22<1:31:45, 1.97it/s]
|
1190 |
7%|β | 837/11704 [07:34<11:50:27, 3.92s/it]
|
1191 |
7%|β | 838/11704 [07:35<8:44:46, 2.90s/it]
|
1192 |
7%|β | 839/11704 [07:35<6:33:59, 2.18s/it]
|
1193 |
7%|β | 840/11704 [07:36<5:02:24, 1.67s/it]
|
1194 |
7%|β | 841/11704 [07:36<3:58:30, 1.32s/it]
|
1195 |
7%|β | 842/11704 [07:37<3:13:48, 1.07s/it]
|
1196 |
7%|β | 843/11704 [07:37<2:42:26, 1.11it/s]
|
1197 |
7%|β | 844/11704 [07:38<2:20:22, 1.29it/s]
|
1198 |
7%|β | 845/11704 [07:38<2:05:46, 1.44it/s]
|
1199 |
7%|β | 846/11704 [07:39<1:54:47, 1.58it/s]
|
1200 |
7%|β | 847/11704 [07:39<1:47:02, 1.69it/s]
|
1201 |
7%|β | 848/11704 [07:40<1:41:37, 1.78it/s]
|
1202 |
7%|β | 849/11704 [07:40<1:37:49, 1.85it/s]
|
1203 |
7%|β | 850/11704 [07:41<1:35:06, 1.90it/s]{'loss': 3.7125, 'grad_norm': 0.48182985186576843, 'learning_rate': 0.0007258753202391119, 'epoch': 1.02}
|
1204 |
+
|
1205 |
|
1206 |
7%|β | 850/11704 [07:41<1:35:06, 1.90it/s]
|
1207 |
7%|β | 851/11704 [07:41<1:33:29, 1.93it/s]
|
1208 |
7%|β | 852/11704 [07:42<1:32:09, 1.96it/s]
|
1209 |
7%|β | 853/11704 [07:42<1:31:05, 1.99it/s]
|
1210 |
7%|β | 854/11704 [07:43<1:30:25, 2.00it/s]
|
1211 |
7%|β | 855/11704 [07:43<1:29:58, 2.01it/s]
|
1212 |
7%|β | 856/11704 [07:44<1:29:45, 2.01it/s]
|
1213 |
7%|β | 857/11704 [07:44<1:29:20, 2.02it/s]
|
1214 |
7%|β | 858/11704 [07:45<1:29:19, 2.02it/s]
|
1215 |
7%|β | 859/11704 [07:45<1:29:09, 2.03it/s]
|
1216 |
7%|β | 860/11704 [07:46<1:29:11, 2.03it/s]
|
1217 |
7%|β | 861/11704 [07:46<1:29:02, 2.03it/s]
|
1218 |
7%|β | 862/11704 [07:47<1:28:59, 2.03it/s]
|
1219 |
7%|β | 863/11704 [07:47<1:28:54, 2.03it/s]
|
1220 |
7%|β | 864/11704 [07:48<1:28:51, 2.03it/s]
|
1221 |
7%|β | 865/11704 [07:48<1:28:48, 2.03it/s]
|
1222 |
7%|β | 866/11704 [07:49<1:28:57, 2.03it/s]
|
1223 |
7%|β | 867/11704 [07:49<1:28:59, 2.03it/s]
|
1224 |
7%|β | 868/11704 [07:50<1:28:51, 2.03it/s]
|
1225 |
7%|β | 869/11704 [07:50<1:28:52, 2.03it/s]
|
1226 |
7%|β | 870/11704 [07:51<1:28:50, 2.03it/s]
|
1227 |
7%|β | 871/11704 [07:51<1:28:46, 2.03it/s]
|
1228 |
7%|β | 872/11704 [07:52<1:28:53, 2.03it/s]
|
training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9509fc878acd0defe5adfeb20bab4c79a7a8244d939dbf9aedc248e6079ebae0
|
3 |
+
size 5240
|