gsarti committed
Commit 21ef05d
1 Parent(s): c428a01

Initial commit

README.md CHANGED
@@ -1,3 +1,83 @@
  ---
- license: apache-2.0
+ license: mit
+ tags:
+ - generated_from_trainer
+ datasets:
+ - it5/datasets
+ metrics:
+ - rouge
+ model-index:
+ - name: it5-efficient-small-el32-qg-0.0003
+   results:
+   - task:
+       name: Summarization
+       type: summarization
+     dataset:
+       name: it5/datasets qg
+       type: it5/datasets
+       args: qg
+     metrics:
+     - name: Rouge1
+       type: rouge
+       value: 40.5452
  ---
+
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->
+
+ # it5-efficient-small-el32-qg-0.0003
+
+ This model is a fine-tuned version of [stefan-it/it5-efficient-small-el32](https://huggingface.co/stefan-it/it5-efficient-small-el32) on the it5/datasets qg dataset.
+ It achieves the following results on the evaluation set:
+ - Loss: 1.8460
+ - Rouge1: 40.5452
+ - Rouge2: 21.7821
+ - RougeL: 37.9644
+ - RougeLsum: 37.9407
+ - Gen Len: 14.059
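+
+ The snippet below is a minimal usage sketch, not an official example: it assumes this model is published under the `gsarti/it5-efficient-small-el32-qg-0.0003` repository id and that a plain Italian passage is an acceptable input; the exact input format expected by the qg task (for instance, whether an answer span must be prepended to the passage) should be checked against the it5/datasets preprocessing code.
+
+ ```python
+ # Minimal sketch: load the model with the standard text2text-generation pipeline.
+ # The repository id and the raw-passage input format are assumptions.
+ from transformers import pipeline
+
+ qg = pipeline(
+     "text2text-generation",
+     model="gsarti/it5-efficient-small-el32-qg-0.0003",  # hypothetical repo id, adjust as needed
+ )
+
+ passage = (
+     "Le Alpi sono la catena montuosa più importante d'Europa e si estendono "
+     "per circa 1200 chilometri attraverso otto paesi."
+ )
+ print(qg(passage, max_length=64)[0]["generated_text"])
+ ```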
+
+ ## Model description
+
+ More information needed
+
+ ## Intended uses & limitations
+
+ More information needed
+
+ ## Training and evaluation data
+
+ More information needed
+
+ ## Training procedure
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training:
+ - learning_rate: 0.0003
+ - train_batch_size: 8
+ - eval_batch_size: 8
+ - seed: 42
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+ - lr_scheduler_type: linear
+ - num_epochs: 7.0
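+
+ As a rough cross-reference, the list above maps onto `Seq2SeqTrainingArguments` along the lines of the sketch below. This is a reconstruction, not the actual training script: it assumes single-GPU training (so per-device and total batch sizes coincide), leaves every unlisted option at the library defaults, and uses an illustrative `output_dir` name.
+
+ ```python
+ # Hedged reconstruction of the reported hyperparameters (Transformers 4.15-era API).
+ from transformers import Seq2SeqTrainingArguments
+
+ training_args = Seq2SeqTrainingArguments(
+     output_dir="it5-efficient-small-el32-qg-0.0003",  # illustrative
+     learning_rate=3e-4,
+     per_device_train_batch_size=8,   # assumes a single device
+     per_device_eval_batch_size=8,
+     seed=42,
+     adam_beta1=0.9,                  # the Adam settings listed above are the defaults
+     adam_beta2=0.999,
+     adam_epsilon=1e-8,
+     lr_scheduler_type="linear",
+     num_train_epochs=7.0,
+     predict_with_generate=True,      # needed to compute ROUGE / Gen Len at eval time
+ )
+ ```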
+
+ ### Training results
+
+ | Training Loss | Epoch | Step | Validation Loss | Rouge1 | Rouge2 | RougeL | RougeLsum | Gen Len |
+ |:-------------:|:-----:|:-----:|:---------------:|:-------:|:-------:|:-------:|:---------:|:-------:|
+ | 2.3227 | 0.78 | 5000 | 2.0119 | 35.4228 | 16.8454 | 33.0039 | 33.0042 | 13.4213 |
+ | 2.0517 | 1.56 | 10000 | 1.9002 | 36.7771 | 18.1217 | 34.4954 | 34.4605 | 12.7787 |
+ | 1.8388 | 2.35 | 15000 | 1.8676 | 38.3396 | 19.4592 | 35.8451 | 35.8358 | 13.2803 |
+ | 1.6942 | 3.13 | 20000 | 1.8758 | 39.0889 | 20.3841 | 36.655 | 36.6291 | 13.0213 |
+ | 1.7123 | 3.91 | 25000 | 1.8253 | 39.6282 | 20.9321 | 37.1541 | 37.1195 | 13.1837 |
+ | 1.5719 | 4.69 | 30000 | 1.8311 | 39.7541 | 21.1663 | 37.3503 | 37.3096 | 13.3723 |
+ | 1.4763 | 5.47 | 35000 | 1.8474 | 39.8798 | 21.3044 | 37.4297 | 37.4135 | 13.2783 |
+ | 1.3963 | 6.25 | 40000 | 1.8533 | 40.1839 | 21.4959 | 37.5371 | 37.5414 | 13.4713 |
+
+
+ ### Framework versions
+
+ - Transformers 4.15.0
+ - PyTorch 1.10.0+cu102
+ - Datasets 1.17.0
+ - Tokenizers 0.10.3
all_results.json ADDED
@@ -0,0 +1,18 @@
+ {
+   "epoch": 7.0,
+   "eval_gen_len": 14.059,
+   "eval_loss": 1.8460021018981934,
+   "eval_rouge1": 40.5452,
+   "eval_rouge2": 21.7821,
+   "eval_rougeL": 37.9644,
+   "eval_rougeLsum": 37.9407,
+   "eval_runtime": 116.4742,
+   "eval_samples": 3000,
+   "eval_samples_per_second": 25.757,
+   "eval_steps_per_second": 3.22,
+   "train_loss": 1.7918549016882,
+   "train_runtime": 13775.2907,
+   "train_samples": 51159,
+   "train_samples_per_second": 25.997,
+   "train_steps_per_second": 3.25
+ }
config.json ADDED
@@ -0,0 +1,29 @@
+ {
+   "_name_or_path": ".",
+   "architectures": [
+     "T5ForConditionalGeneration"
+   ],
+   "d_ff": 2048,
+   "d_kv": 64,
+   "d_model": 512,
+   "decoder_start_token_id": 0,
+   "dropout_rate": 0.1,
+   "eos_token_id": 1,
+   "feed_forward_proj": "relu",
+   "initializer_factor": 1.0,
+   "is_encoder_decoder": true,
+   "layer_norm_epsilon": 1e-06,
+   "model_type": "t5",
+   "n_positions": 512,
+   "num_decoder_layers": 6,
+   "num_heads": 8,
+   "num_layers": 32,
+   "output_past": true,
+   "pad_token_id": 0,
+   "relative_attention_max_distance": 128,
+   "relative_attention_num_buckets": 32,
+   "torch_dtype": "float32",
+   "transformers_version": "4.15.0",
+   "use_cache": true,
+   "vocab_size": 32100
+ }
eval_results.json ADDED
@@ -0,0 +1,13 @@
+ {
+   "epoch": 7.0,
+   "eval_gen_len": 14.059,
+   "eval_loss": 1.8460021018981934,
+   "eval_rouge1": 40.5452,
+   "eval_rouge2": 21.7821,
+   "eval_rougeL": 37.9644,
+   "eval_rougeLsum": 37.9407,
+   "eval_runtime": 116.4742,
+   "eval_samples": 3000,
+   "eval_samples_per_second": 25.757,
+   "eval_steps_per_second": 3.22
+ }
flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:03a0d79302924998f7960b4871ab8efc1b703fd7ff0ae1012e6a208c62247b61
+ size 569246164
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:28faf5fd43b2c586e2a67158b6d5248841d9faafea93dfe02742c5191c2143c2
+ size 569387035
runs/Apr26_13-05-50_pg-gpu34/1650971209.8765502/events.out.tfevents.1650971209.pg-gpu34.16405.1 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:07671b828afb84fc6d64a01eda7ceece9e386708562f276f253b30e143bb6abe
+ size 5127
runs/Apr26_13-05-50_pg-gpu34/events.out.tfevents.1650971209.pg-gpu34.16405.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ae22815369d8fea3aff4004552e66fafdb9b3096b1e71e4ce9a1f3dadc65fd87
+ size 22379
runs/Apr26_13-05-50_pg-gpu34/events.out.tfevents.1650985106.pg-gpu34.16405.2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1eb473412f58c767a22de089d729682e380e5c1a4d2fc857ccf3363ef3135f5c
+ size 575
special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>", "additional_special_tokens": ["<extra_id_0>", "<extra_id_1>", "<extra_id_2>", "<extra_id_3>", "<extra_id_4>", "<extra_id_5>", "<extra_id_6>", "<extra_id_7>", "<extra_id_8>", "<extra_id_9>", "<extra_id_10>", "<extra_id_11>", "<extra_id_12>", "<extra_id_13>", "<extra_id_14>", "<extra_id_15>", "<extra_id_16>", "<extra_id_17>", "<extra_id_18>", "<extra_id_19>", "<extra_id_20>", "<extra_id_21>", "<extra_id_22>", "<extra_id_23>", "<extra_id_24>", "<extra_id_25>", "<extra_id_26>", "<extra_id_27>", "<extra_id_28>", "<extra_id_29>", "<extra_id_30>", "<extra_id_31>", "<extra_id_32>", "<extra_id_33>", "<extra_id_34>", "<extra_id_35>", "<extra_id_36>", "<extra_id_37>", "<extra_id_38>", "<extra_id_39>", "<extra_id_40>", "<extra_id_41>", "<extra_id_42>", "<extra_id_43>", "<extra_id_44>", "<extra_id_45>", "<extra_id_46>", "<extra_id_47>", "<extra_id_48>", "<extra_id_49>", "<extra_id_50>", "<extra_id_51>", "<extra_id_52>", "<extra_id_53>", "<extra_id_54>", "<extra_id_55>", "<extra_id_56>", "<extra_id_57>", "<extra_id_58>", "<extra_id_59>", "<extra_id_60>", "<extra_id_61>", "<extra_id_62>", "<extra_id_63>", "<extra_id_64>", "<extra_id_65>", "<extra_id_66>", "<extra_id_67>", "<extra_id_68>", "<extra_id_69>", "<extra_id_70>", "<extra_id_71>", "<extra_id_72>", "<extra_id_73>", "<extra_id_74>", "<extra_id_75>", "<extra_id_76>", "<extra_id_77>", "<extra_id_78>", "<extra_id_79>", "<extra_id_80>", "<extra_id_81>", "<extra_id_82>", "<extra_id_83>", "<extra_id_84>", "<extra_id_85>", "<extra_id_86>", "<extra_id_87>", "<extra_id_88>", "<extra_id_89>", "<extra_id_90>", "<extra_id_91>", "<extra_id_92>", "<extra_id_93>", "<extra_id_94>", "<extra_id_95>", "<extra_id_96>", "<extra_id_97>", "<extra_id_98>", "<extra_id_99>"]}
spiece.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2dffd01fc009b7e92d98eddff8853983e271b41302ed0d363000e8581df12000
+ size 817200
tf_model.h5 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f96c5805470ff7e8022bb180014da85e36574097e631c206f91b95d8bb676d3e
+ size 569947488
tokenizer.json ADDED
The diff for this file is too large to render.
 
tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>", "extra_ids": 100, "additional_special_tokens": ["<extra_id_0>", "<extra_id_1>", "<extra_id_2>", "<extra_id_3>", "<extra_id_4>", "<extra_id_5>", "<extra_id_6>", "<extra_id_7>", "<extra_id_8>", "<extra_id_9>", "<extra_id_10>", "<extra_id_11>", "<extra_id_12>", "<extra_id_13>", "<extra_id_14>", "<extra_id_15>", "<extra_id_16>", "<extra_id_17>", "<extra_id_18>", "<extra_id_19>", "<extra_id_20>", "<extra_id_21>", "<extra_id_22>", "<extra_id_23>", "<extra_id_24>", "<extra_id_25>", "<extra_id_26>", "<extra_id_27>", "<extra_id_28>", "<extra_id_29>", "<extra_id_30>", "<extra_id_31>", "<extra_id_32>", "<extra_id_33>", "<extra_id_34>", "<extra_id_35>", "<extra_id_36>", "<extra_id_37>", "<extra_id_38>", "<extra_id_39>", "<extra_id_40>", "<extra_id_41>", "<extra_id_42>", "<extra_id_43>", "<extra_id_44>", "<extra_id_45>", "<extra_id_46>", "<extra_id_47>", "<extra_id_48>", "<extra_id_49>", "<extra_id_50>", "<extra_id_51>", "<extra_id_52>", "<extra_id_53>", "<extra_id_54>", "<extra_id_55>", "<extra_id_56>", "<extra_id_57>", "<extra_id_58>", "<extra_id_59>", "<extra_id_60>", "<extra_id_61>", "<extra_id_62>", "<extra_id_63>", "<extra_id_64>", "<extra_id_65>", "<extra_id_66>", "<extra_id_67>", "<extra_id_68>", "<extra_id_69>", "<extra_id_70>", "<extra_id_71>", "<extra_id_72>", "<extra_id_73>", "<extra_id_74>", "<extra_id_75>", "<extra_id_76>", "<extra_id_77>", "<extra_id_78>", "<extra_id_79>", "<extra_id_80>", "<extra_id_81>", "<extra_id_82>", "<extra_id_83>", "<extra_id_84>", "<extra_id_85>", "<extra_id_86>", "<extra_id_87>", "<extra_id_88>", "<extra_id_89>", "<extra_id_90>", "<extra_id_91>", "<extra_id_92>", "<extra_id_93>", "<extra_id_94>", "<extra_id_95>", "<extra_id_96>", "<extra_id_97>", "<extra_id_98>", "<extra_id_99>"], "special_tokens_map_file": null, "name_or_path": "stefan-it/it5-efficient-small-el32", "sp_model_kwargs": {}, "tokenizer_class": "T5Tokenizer"}
train_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+   "epoch": 7.0,
+   "train_loss": 1.7918549016882,
+   "train_runtime": 13775.2907,
+   "train_samples": 51159,
+   "train_samples_per_second": 25.997,
+   "train_steps_per_second": 3.25
+ }
trainer_state.json ADDED
@@ -0,0 +1,663 @@
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 7.0,
5
+ "global_step": 44765,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.08,
12
+ "learning_rate": 0.00029664916787668934,
13
+ "loss": 3.2895,
14
+ "step": 500
15
+ },
16
+ {
17
+ "epoch": 0.16,
18
+ "learning_rate": 0.0002932983357533787,
19
+ "loss": 2.7517,
20
+ "step": 1000
21
+ },
22
+ {
23
+ "epoch": 0.23,
24
+ "learning_rate": 0.00028994750363006813,
25
+ "loss": 2.6498,
26
+ "step": 1500
27
+ },
28
+ {
29
+ "epoch": 0.31,
30
+ "learning_rate": 0.0002865966715067575,
31
+ "loss": 2.5331,
32
+ "step": 2000
33
+ },
34
+ {
35
+ "epoch": 0.39,
36
+ "learning_rate": 0.00028324583938344687,
37
+ "loss": 2.4775,
38
+ "step": 2500
39
+ },
40
+ {
41
+ "epoch": 0.47,
42
+ "learning_rate": 0.00027989500726013623,
43
+ "loss": 2.4187,
44
+ "step": 3000
45
+ },
46
+ {
47
+ "epoch": 0.55,
48
+ "learning_rate": 0.00027654417513682566,
49
+ "loss": 2.402,
50
+ "step": 3500
51
+ },
52
+ {
53
+ "epoch": 0.63,
54
+ "learning_rate": 0.000273193343013515,
55
+ "loss": 2.3717,
56
+ "step": 4000
57
+ },
58
+ {
59
+ "epoch": 0.7,
60
+ "learning_rate": 0.0002698425108902044,
61
+ "loss": 2.3738,
62
+ "step": 4500
63
+ },
64
+ {
65
+ "epoch": 0.78,
66
+ "learning_rate": 0.00026649167876689376,
67
+ "loss": 2.3227,
68
+ "step": 5000
69
+ },
70
+ {
71
+ "epoch": 0.78,
72
+ "eval_gen_len": 13.4213,
73
+ "eval_loss": 2.011871576309204,
74
+ "eval_rouge1": 35.4228,
75
+ "eval_rouge2": 16.8454,
76
+ "eval_rougeL": 33.0039,
77
+ "eval_rougeLsum": 33.0042,
78
+ "eval_runtime": 109.3685,
79
+ "eval_samples_per_second": 27.43,
80
+ "eval_steps_per_second": 3.429,
81
+ "step": 5000
82
+ },
83
+ {
84
+ "epoch": 0.86,
85
+ "learning_rate": 0.0002631408466435831,
86
+ "loss": 2.2981,
87
+ "step": 5500
88
+ },
89
+ {
90
+ "epoch": 0.94,
91
+ "learning_rate": 0.00025979001452027255,
92
+ "loss": 2.2863,
93
+ "step": 6000
94
+ },
95
+ {
96
+ "epoch": 1.02,
97
+ "learning_rate": 0.0002564391823969619,
98
+ "loss": 2.2597,
99
+ "step": 6500
100
+ },
101
+ {
102
+ "epoch": 1.09,
103
+ "learning_rate": 0.00025308835027365123,
104
+ "loss": 2.0983,
105
+ "step": 7000
106
+ },
107
+ {
108
+ "epoch": 1.17,
109
+ "learning_rate": 0.00024973751815034065,
110
+ "loss": 2.0933,
111
+ "step": 7500
112
+ },
113
+ {
114
+ "epoch": 1.25,
115
+ "learning_rate": 0.00024638668602703,
116
+ "loss": 2.1024,
117
+ "step": 8000
118
+ },
119
+ {
120
+ "epoch": 1.33,
121
+ "learning_rate": 0.0002430358539037194,
122
+ "loss": 2.0818,
123
+ "step": 8500
124
+ },
125
+ {
126
+ "epoch": 1.41,
127
+ "learning_rate": 0.00023968502178040878,
128
+ "loss": 2.0917,
129
+ "step": 9000
130
+ },
131
+ {
132
+ "epoch": 1.49,
133
+ "learning_rate": 0.00023633418965709815,
134
+ "loss": 2.0658,
135
+ "step": 9500
136
+ },
137
+ {
138
+ "epoch": 1.56,
139
+ "learning_rate": 0.00023298335753378752,
140
+ "loss": 2.0517,
141
+ "step": 10000
142
+ },
143
+ {
144
+ "epoch": 1.56,
145
+ "eval_gen_len": 12.7787,
146
+ "eval_loss": 1.900165319442749,
147
+ "eval_rouge1": 36.7771,
148
+ "eval_rouge2": 18.1217,
149
+ "eval_rougeL": 34.4954,
150
+ "eval_rougeLsum": 34.4605,
151
+ "eval_runtime": 106.6009,
152
+ "eval_samples_per_second": 28.142,
153
+ "eval_steps_per_second": 3.518,
154
+ "step": 10000
155
+ },
156
+ {
157
+ "epoch": 1.64,
158
+ "learning_rate": 0.0002296325254104769,
159
+ "loss": 2.0494,
160
+ "step": 10500
161
+ },
162
+ {
163
+ "epoch": 1.72,
164
+ "learning_rate": 0.00022628169328716628,
165
+ "loss": 2.0672,
166
+ "step": 11000
167
+ },
168
+ {
169
+ "epoch": 1.8,
170
+ "learning_rate": 0.00022293086116385567,
171
+ "loss": 2.0591,
172
+ "step": 11500
173
+ },
174
+ {
175
+ "epoch": 1.88,
176
+ "learning_rate": 0.00021958002904054504,
177
+ "loss": 2.0515,
178
+ "step": 12000
179
+ },
180
+ {
181
+ "epoch": 1.95,
182
+ "learning_rate": 0.00021622919691723444,
183
+ "loss": 2.0628,
184
+ "step": 12500
185
+ },
186
+ {
187
+ "epoch": 2.03,
188
+ "learning_rate": 0.0002128783647939238,
189
+ "loss": 1.959,
190
+ "step": 13000
191
+ },
192
+ {
193
+ "epoch": 2.11,
194
+ "learning_rate": 0.00020952753267061317,
195
+ "loss": 1.8223,
196
+ "step": 13500
197
+ },
198
+ {
199
+ "epoch": 2.19,
200
+ "learning_rate": 0.00020617670054730257,
201
+ "loss": 1.8548,
202
+ "step": 14000
203
+ },
204
+ {
205
+ "epoch": 2.27,
206
+ "learning_rate": 0.00020282586842399193,
207
+ "loss": 1.8462,
208
+ "step": 14500
209
+ },
210
+ {
211
+ "epoch": 2.35,
212
+ "learning_rate": 0.00019947503630068133,
213
+ "loss": 1.8388,
214
+ "step": 15000
215
+ },
216
+ {
217
+ "epoch": 2.35,
218
+ "eval_gen_len": 13.2803,
219
+ "eval_loss": 1.8676202297210693,
220
+ "eval_rouge1": 38.3396,
221
+ "eval_rouge2": 19.4592,
222
+ "eval_rougeL": 35.8451,
223
+ "eval_rougeLsum": 35.8358,
224
+ "eval_runtime": 111.5416,
225
+ "eval_samples_per_second": 26.896,
226
+ "eval_steps_per_second": 3.362,
227
+ "step": 15000
228
+ },
229
+ {
230
+ "epoch": 2.42,
231
+ "learning_rate": 0.0001961242041773707,
232
+ "loss": 1.8675,
233
+ "step": 15500
234
+ },
235
+ {
236
+ "epoch": 2.5,
237
+ "learning_rate": 0.0001927733720540601,
238
+ "loss": 1.8656,
239
+ "step": 16000
240
+ },
241
+ {
242
+ "epoch": 2.58,
243
+ "learning_rate": 0.00018942253993074946,
244
+ "loss": 1.8836,
245
+ "step": 16500
246
+ },
247
+ {
248
+ "epoch": 2.66,
249
+ "learning_rate": 0.00018607170780743885,
250
+ "loss": 1.8646,
251
+ "step": 17000
252
+ },
253
+ {
254
+ "epoch": 2.74,
255
+ "learning_rate": 0.00018272087568412822,
256
+ "loss": 1.8718,
257
+ "step": 17500
258
+ },
259
+ {
260
+ "epoch": 2.81,
261
+ "learning_rate": 0.0001793700435608176,
262
+ "loss": 1.8641,
263
+ "step": 18000
264
+ },
265
+ {
266
+ "epoch": 2.89,
267
+ "learning_rate": 0.00017601921143750698,
268
+ "loss": 1.8633,
269
+ "step": 18500
270
+ },
271
+ {
272
+ "epoch": 2.97,
273
+ "learning_rate": 0.00017266837931419635,
274
+ "loss": 1.8435,
275
+ "step": 19000
276
+ },
277
+ {
278
+ "epoch": 3.05,
279
+ "learning_rate": 0.00016931754719088575,
280
+ "loss": 1.7436,
281
+ "step": 19500
282
+ },
283
+ {
284
+ "epoch": 3.13,
285
+ "learning_rate": 0.0001659667150675751,
286
+ "loss": 1.6942,
287
+ "step": 20000
288
+ },
289
+ {
290
+ "epoch": 3.13,
291
+ "eval_gen_len": 13.0213,
292
+ "eval_loss": 1.8757922649383545,
293
+ "eval_rouge1": 39.0889,
294
+ "eval_rouge2": 20.3841,
295
+ "eval_rougeL": 36.655,
296
+ "eval_rougeLsum": 36.6291,
297
+ "eval_runtime": 103.7035,
298
+ "eval_samples_per_second": 28.929,
299
+ "eval_steps_per_second": 3.616,
300
+ "step": 20000
301
+ },
302
+ {
303
+ "epoch": 3.21,
304
+ "learning_rate": 0.00016261588294426446,
305
+ "loss": 1.691,
306
+ "step": 20500
307
+ },
308
+ {
309
+ "epoch": 3.28,
310
+ "learning_rate": 0.00015926505082095385,
311
+ "loss": 1.6786,
312
+ "step": 21000
313
+ },
314
+ {
315
+ "epoch": 3.36,
316
+ "learning_rate": 0.00015591421869764322,
317
+ "loss": 1.7014,
318
+ "step": 21500
319
+ },
320
+ {
321
+ "epoch": 3.44,
322
+ "learning_rate": 0.0001525633865743326,
323
+ "loss": 1.72,
324
+ "step": 22000
325
+ },
326
+ {
327
+ "epoch": 3.52,
328
+ "learning_rate": 0.000149212554451022,
329
+ "loss": 1.7038,
330
+ "step": 22500
331
+ },
332
+ {
333
+ "epoch": 3.6,
334
+ "learning_rate": 0.00014586172232771135,
335
+ "loss": 1.7212,
336
+ "step": 23000
337
+ },
338
+ {
339
+ "epoch": 3.67,
340
+ "learning_rate": 0.00014251089020440074,
341
+ "loss": 1.6839,
342
+ "step": 23500
343
+ },
344
+ {
345
+ "epoch": 3.75,
346
+ "learning_rate": 0.0001391600580810901,
347
+ "loss": 1.6908,
348
+ "step": 24000
349
+ },
350
+ {
351
+ "epoch": 3.83,
352
+ "learning_rate": 0.0001358092259577795,
353
+ "loss": 1.6973,
354
+ "step": 24500
355
+ },
356
+ {
357
+ "epoch": 3.91,
358
+ "learning_rate": 0.00013245839383446887,
359
+ "loss": 1.7123,
360
+ "step": 25000
361
+ },
362
+ {
363
+ "epoch": 3.91,
364
+ "eval_gen_len": 13.1837,
365
+ "eval_loss": 1.8253004550933838,
366
+ "eval_rouge1": 39.6282,
367
+ "eval_rouge2": 20.9321,
368
+ "eval_rougeL": 37.1541,
369
+ "eval_rougeLsum": 37.1195,
370
+ "eval_runtime": 103.6467,
371
+ "eval_samples_per_second": 28.944,
372
+ "eval_steps_per_second": 3.618,
373
+ "step": 25000
374
+ },
375
+ {
376
+ "epoch": 3.99,
377
+ "learning_rate": 0.00012910756171115827,
378
+ "loss": 1.7076,
379
+ "step": 25500
380
+ },
381
+ {
382
+ "epoch": 4.07,
383
+ "learning_rate": 0.00012575672958784764,
384
+ "loss": 1.5729,
385
+ "step": 26000
386
+ },
387
+ {
388
+ "epoch": 4.14,
389
+ "learning_rate": 0.00012240589746453703,
390
+ "loss": 1.5568,
391
+ "step": 26500
392
+ },
393
+ {
394
+ "epoch": 4.22,
395
+ "learning_rate": 0.0001190550653412264,
396
+ "loss": 1.5458,
397
+ "step": 27000
398
+ },
399
+ {
400
+ "epoch": 4.3,
401
+ "learning_rate": 0.00011570423321791578,
402
+ "loss": 1.5678,
403
+ "step": 27500
404
+ },
405
+ {
406
+ "epoch": 4.38,
407
+ "learning_rate": 0.00011235340109460516,
408
+ "loss": 1.5979,
409
+ "step": 28000
410
+ },
411
+ {
412
+ "epoch": 4.46,
413
+ "learning_rate": 0.00010900256897129453,
414
+ "loss": 1.5604,
415
+ "step": 28500
416
+ },
417
+ {
418
+ "epoch": 4.53,
419
+ "learning_rate": 0.00010565173684798391,
420
+ "loss": 1.5539,
421
+ "step": 29000
422
+ },
423
+ {
424
+ "epoch": 4.61,
425
+ "learning_rate": 0.00010230090472467328,
426
+ "loss": 1.5681,
427
+ "step": 29500
428
+ },
429
+ {
430
+ "epoch": 4.69,
431
+ "learning_rate": 9.895007260136266e-05,
432
+ "loss": 1.5719,
433
+ "step": 30000
434
+ },
435
+ {
436
+ "epoch": 4.69,
437
+ "eval_gen_len": 13.3723,
438
+ "eval_loss": 1.8311357498168945,
439
+ "eval_rouge1": 39.7541,
440
+ "eval_rouge2": 21.1663,
441
+ "eval_rougeL": 37.3503,
442
+ "eval_rougeLsum": 37.3096,
443
+ "eval_runtime": 106.3723,
444
+ "eval_samples_per_second": 28.203,
445
+ "eval_steps_per_second": 3.525,
446
+ "step": 30000
447
+ },
448
+ {
449
+ "epoch": 4.77,
450
+ "learning_rate": 9.559924047805204e-05,
451
+ "loss": 1.5787,
452
+ "step": 30500
453
+ },
454
+ {
455
+ "epoch": 4.85,
456
+ "learning_rate": 9.224840835474142e-05,
457
+ "loss": 1.5805,
458
+ "step": 31000
459
+ },
460
+ {
461
+ "epoch": 4.93,
462
+ "learning_rate": 8.889757623143079e-05,
463
+ "loss": 1.5646,
464
+ "step": 31500
465
+ },
466
+ {
467
+ "epoch": 5.0,
468
+ "learning_rate": 8.554674410812017e-05,
469
+ "loss": 1.5593,
470
+ "step": 32000
471
+ },
472
+ {
473
+ "epoch": 5.08,
474
+ "learning_rate": 8.219591198480955e-05,
475
+ "loss": 1.4606,
476
+ "step": 32500
477
+ },
478
+ {
479
+ "epoch": 5.16,
480
+ "learning_rate": 7.884507986149893e-05,
481
+ "loss": 1.4393,
482
+ "step": 33000
483
+ },
484
+ {
485
+ "epoch": 5.24,
486
+ "learning_rate": 7.549424773818831e-05,
487
+ "loss": 1.4826,
488
+ "step": 33500
489
+ },
490
+ {
491
+ "epoch": 5.32,
492
+ "learning_rate": 7.21434156148777e-05,
493
+ "loss": 1.4668,
494
+ "step": 34000
495
+ },
496
+ {
497
+ "epoch": 5.39,
498
+ "learning_rate": 6.879258349156706e-05,
499
+ "loss": 1.4599,
500
+ "step": 34500
501
+ },
502
+ {
503
+ "epoch": 5.47,
504
+ "learning_rate": 6.544175136825644e-05,
505
+ "loss": 1.4763,
506
+ "step": 35000
507
+ },
508
+ {
509
+ "epoch": 5.47,
510
+ "eval_gen_len": 13.2783,
511
+ "eval_loss": 1.8474199771881104,
512
+ "eval_rouge1": 39.8798,
513
+ "eval_rouge2": 21.3044,
514
+ "eval_rougeL": 37.4297,
515
+ "eval_rougeLsum": 37.4135,
516
+ "eval_runtime": 104.818,
517
+ "eval_samples_per_second": 28.621,
518
+ "eval_steps_per_second": 3.578,
519
+ "step": 35000
520
+ },
521
+ {
522
+ "epoch": 5.55,
523
+ "learning_rate": 6.209091924494583e-05,
524
+ "loss": 1.4651,
525
+ "step": 35500
526
+ },
527
+ {
528
+ "epoch": 5.63,
529
+ "learning_rate": 5.87400871216352e-05,
530
+ "loss": 1.475,
531
+ "step": 36000
532
+ },
533
+ {
534
+ "epoch": 5.71,
535
+ "learning_rate": 5.538925499832458e-05,
536
+ "loss": 1.4734,
537
+ "step": 36500
538
+ },
539
+ {
540
+ "epoch": 5.79,
541
+ "learning_rate": 5.203842287501396e-05,
542
+ "loss": 1.4487,
543
+ "step": 37000
544
+ },
545
+ {
546
+ "epoch": 5.86,
547
+ "learning_rate": 4.868759075170333e-05,
548
+ "loss": 1.4408,
549
+ "step": 37500
550
+ },
551
+ {
552
+ "epoch": 5.94,
553
+ "learning_rate": 4.533675862839271e-05,
554
+ "loss": 1.4643,
555
+ "step": 38000
556
+ },
557
+ {
558
+ "epoch": 6.02,
559
+ "learning_rate": 4.198592650508209e-05,
560
+ "loss": 1.4441,
561
+ "step": 38500
562
+ },
563
+ {
564
+ "epoch": 6.1,
565
+ "learning_rate": 3.863509438177147e-05,
566
+ "loss": 1.3854,
567
+ "step": 39000
568
+ },
569
+ {
570
+ "epoch": 6.18,
571
+ "learning_rate": 3.528426225846085e-05,
572
+ "loss": 1.3919,
573
+ "step": 39500
574
+ },
575
+ {
576
+ "epoch": 6.25,
577
+ "learning_rate": 3.193343013515022e-05,
578
+ "loss": 1.3963,
579
+ "step": 40000
580
+ },
581
+ {
582
+ "epoch": 6.25,
583
+ "eval_gen_len": 13.4713,
584
+ "eval_loss": 1.8532978296279907,
585
+ "eval_rouge1": 40.1839,
586
+ "eval_rouge2": 21.4959,
587
+ "eval_rougeL": 37.5371,
588
+ "eval_rougeLsum": 37.5414,
589
+ "eval_runtime": 113.6133,
590
+ "eval_samples_per_second": 26.405,
591
+ "eval_steps_per_second": 3.301,
592
+ "step": 40000
593
+ },
594
+ {
595
+ "epoch": 6.33,
596
+ "learning_rate": 2.8582598011839604e-05,
597
+ "loss": 1.379,
598
+ "step": 40500
599
+ },
600
+ {
601
+ "epoch": 6.41,
602
+ "learning_rate": 2.5231765888528982e-05,
603
+ "loss": 1.3968,
604
+ "step": 41000
605
+ },
606
+ {
607
+ "epoch": 6.49,
608
+ "learning_rate": 2.188093376521836e-05,
609
+ "loss": 1.3814,
610
+ "step": 41500
611
+ },
612
+ {
613
+ "epoch": 6.57,
614
+ "learning_rate": 1.8530101641907737e-05,
615
+ "loss": 1.3891,
616
+ "step": 42000
617
+ },
618
+ {
619
+ "epoch": 6.65,
620
+ "learning_rate": 1.5179269518597117e-05,
621
+ "loss": 1.3787,
622
+ "step": 42500
623
+ },
624
+ {
625
+ "epoch": 6.72,
626
+ "learning_rate": 1.1828437395286495e-05,
627
+ "loss": 1.386,
628
+ "step": 43000
629
+ },
630
+ {
631
+ "epoch": 6.8,
632
+ "learning_rate": 8.477605271975873e-06,
633
+ "loss": 1.3927,
634
+ "step": 43500
635
+ },
636
+ {
637
+ "epoch": 6.88,
638
+ "learning_rate": 5.126773148665251e-06,
639
+ "loss": 1.368,
640
+ "step": 44000
641
+ },
642
+ {
643
+ "epoch": 6.96,
644
+ "learning_rate": 1.7759410253546295e-06,
645
+ "loss": 1.375,
646
+ "step": 44500
647
+ },
648
+ {
649
+ "epoch": 7.0,
650
+ "step": 44765,
651
+ "total_flos": 7.978118776281907e+16,
652
+ "train_loss": 1.7918549016882,
653
+ "train_runtime": 13775.2907,
654
+ "train_samples_per_second": 25.997,
655
+ "train_steps_per_second": 3.25
656
+ }
657
+ ],
658
+ "max_steps": 44765,
659
+ "num_train_epochs": 7,
660
+ "total_flos": 7.978118776281907e+16,
661
+ "trial_name": null,
662
+ "trial_params": null
663
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a646a42a43e6a11c8a7a3ca672b0cf8e999346bdec9924c88ff6088473efb0a0
+ size 3183