mirco committed on
Commit
cf035d0
1 Parent(s): ebfbe15

cleaned inference hyperparam file

Browse files
Files changed (2) hide show
  1. hyperparams.yaml +16 -129
  2. hyperparams_train.yaml +177 -0
hyperparams.yaml CHANGED
@@ -1,108 +1,21 @@
1
- # Generated 2021-03-09 from:
2
- # /home/mila/s/subakany/speechbrain_new/recipes/WSJ0Mix/separation/train/hparams/sepformer-wham.yaml
3
- # yamllint disable
4
  # ################################
5
- # Model: SepFormer for source separation
6
  # https://arxiv.org/abs/2010.13154
7
- #
8
- # Dataset : WSJ0-2mix and WSJ0-3mix
9
  # ################################
10
- # Basic parameters
11
- # Seed needs to be set at top of yaml, before objects with parameters are made
12
- #
13
- seed: 1234
14
- __set_seed: !apply:torch.manual_seed [1234]
15
 
16
- # Data params
17
 
18
- # the data folder for the wham dataset
19
- # needs to end with wham_original for the wham dataset
20
- # needs to end with wham_reverb for the whamr dataset
21
- data_folder: /network/tmp1/subakany/wham_original
22
-
23
- # the path for wsj0/si_tr_s/ folder -- only needed if dynamic mixing is used
24
- # e.g. /yourpath/wsj0-processed/si_tr_s/
25
- wsj0_tr: /yourpath/wsj0-processed/si_tr_s/
26
-
27
- experiment_name: sepformer-wham
28
- output_folder: results/sepformer-wham/1234
29
- train_log: results/sepformer-wham/1234/train_log.txt
30
- save_folder: results/sepformer-wham/1234/save
31
-
32
- # the file names should start with whamr instead of whamorg
33
- train_data: results/sepformer-wham/1234/save/whamorg_tr.csv
34
- valid_data: results/sepformer-wham/1234/save/whamorg_cv.csv
35
- test_data: results/sepformer-wham/1234/save/whamorg_tt.csv
36
- skip_prep: false
37
-
38
-
39
- # Experiment params
40
- auto_mix_prec: false # Set it to True for mixed precision
41
- test_only: false
42
- num_spks: 2 # set to 3 for wsj0-3mix
43
- progressbar: true
44
- save_audio: false # Save estimated sources on disk
45
  sample_rate: 8000
46
-
47
- # Training parameters
48
- N_epochs: 200
49
- batch_size: 1
50
- lr: 0.00015
51
- clip_grad_norm: 5
52
- loss_upper_lim: 999999 # this is the upper limit for an acceptable loss
53
- # if True, the training sequences are cut to a specified length
54
- limit_training_signal_len: false
55
- # this is the length of sequences if we choose to limit
56
- # the signal length of training sequences
57
- training_signal_len: 32000000
58
-
59
- # Set it to True to dynamically create mixtures at training time
60
- dynamic_mixing: false
61
-
62
- # Parameters for data augmentation
63
- use_wavedrop: false
64
- use_speedperturb: true
65
- use_speedperturb_sameforeachsource: false
66
- use_rand_shift: false
67
- min_shift: -8000
68
- max_shift: 8000
69
-
70
- speedperturb: !new:speechbrain.lobes.augment.TimeDomainSpecAugment
71
- perturb_prob: 1.0
72
- drop_freq_prob: 0.0
73
- drop_chunk_prob: 0.0
74
- sample_rate: 8000
75
- speeds: [95, 100, 105]
76
-
77
- wavedrop: !new:speechbrain.lobes.augment.TimeDomainSpecAugment
78
- perturb_prob: 0.0
79
- drop_freq_prob: 1.0
80
- drop_chunk_prob: 1.0
81
- sample_rate: 8000
82
-
83
- # loss thresholding -- this thresholds the training loss
84
- threshold_byloss: true
85
- threshold: -30
86
-
87
- # Encoder parameters
88
- N_encoder_out: 256
89
- out_channels: 256
90
- kernel_size: 16
91
- kernel_stride: 8
92
-
93
- # Dataloader options
94
- dataloader_opts:
95
- batch_size: 1
96
- num_workers: 3
97
-
98
 
99
  # Specifying the network
100
- Encoder: &id003 !new:speechbrain.lobes.models.dual_path.Encoder
101
  kernel_size: 16
102
  out_channels: 256
103
 
104
-
105
- SBtfintra: &id001 !new:speechbrain.lobes.models.dual_path.SBTransformerBlock
106
  num_layers: 8
107
  d_model: 256
108
  nhead: 8
@@ -111,7 +24,7 @@ SBtfintra: &id001 !new:speechbrain.lobes.models.dual_path.SBTransformerBlock
111
  use_positional_encoding: true
112
  norm_before: true
113
 
114
- SBtfinter: &id002 !new:speechbrain.lobes.models.dual_path.SBTransformerBlock
115
  num_layers: 8
116
  d_model: 256
117
  nhead: 8
@@ -120,55 +33,29 @@ SBtfinter: &id002 !new:speechbrain.lobes.models.dual_path.SBTransformerBlock
120
  use_positional_encoding: true
121
  norm_before: true
122
 
123
- MaskNet: &id005 !new:speechbrain.lobes.models.dual_path.Dual_Path_Model
124
-
125
- num_spks: 2
126
  in_channels: 256
127
  out_channels: 256
128
  num_layers: 2
129
  K: 250
130
- intra_model: *id001
131
- inter_model: *id002
132
  norm: ln
133
  linear_layer_after_inter_intra: false
134
  skip_around_intra: true
135
 
136
- Decoder: &id004 !new:speechbrain.lobes.models.dual_path.Decoder
137
  in_channels: 256
138
  out_channels: 1
139
  kernel_size: 16
140
  stride: 8
141
  bias: false
142
 
143
- optimizer: !name:torch.optim.Adam
144
- lr: 0.00015
145
- weight_decay: 0
146
-
147
- loss: !name:speechbrain.nnet.losses.get_si_snr_with_pitwrapper
148
-
149
- lr_scheduler: &id007 !new:speechbrain.nnet.schedulers.ReduceLROnPlateau
150
-
151
- factor: 0.5
152
- patience: 2
153
- dont_halve_until_epoch: 65
154
-
155
- epoch_counter: &id006 !new:speechbrain.utils.epoch_loop.EpochCounter
156
- limit: 200
157
-
158
  modules:
159
- encoder: *id003
160
- decoder: *id004
161
- masknet: *id005
162
- checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
163
- checkpoints_dir: results/sepformer-wham/1234/save
164
- recoverables:
165
- encoder: *id003
166
- decoder: *id004
167
- masknet: *id005
168
- counter: *id006
169
- lr_scheduler: *id007
170
- train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
171
- save_file: results/sepformer-wham/1234/train_log.txt
172
 
173
  pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
174
  loadables:
 
 
 
 
1
  # ################################
2
+ # Model: Inference for source separation with SepFormer
3
  # https://arxiv.org/abs/2010.13154
4
+ # Generated from speechbrain/recipes/WSJ0Mix/separation/train/hparams/sepformer-wham.yaml
5
+ # Dataset : Wham
6
  # ################################
 
 
 
 
 
7
 
 
8
 
9
+ # Parameters
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  sample_rate: 8000
11
+ num_spks: 2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
  # Specifying the network
14
+ Encoder: !new:speechbrain.lobes.models.dual_path.Encoder
15
  kernel_size: 16
16
  out_channels: 256
17
 
18
+ SBtfintra: !new:speechbrain.lobes.models.dual_path.SBTransformerBlock
 
19
  num_layers: 8
20
  d_model: 256
21
  nhead: 8
 
24
  use_positional_encoding: true
25
  norm_before: true
26
 
27
+ SBtfinter: !new:speechbrain.lobes.models.dual_path.SBTransformerBlock
28
  num_layers: 8
29
  d_model: 256
30
  nhead: 8
 
33
  use_positional_encoding: true
34
  norm_before: true
35
 
36
+ MaskNet: !new:speechbrain.lobes.models.dual_path.Dual_Path_Model
37
+ num_spks: !ref <num_spks>
 
38
  in_channels: 256
39
  out_channels: 256
40
  num_layers: 2
41
  K: 250
42
+ intra_model: !ref <SBtfintra>
43
+ inter_model: !ref <SBtfinter>
44
  norm: ln
45
  linear_layer_after_inter_intra: false
46
  skip_around_intra: true
47
 
48
+ Decoder: !new:speechbrain.lobes.models.dual_path.Decoder
49
  in_channels: 256
50
  out_channels: 1
51
  kernel_size: 16
52
  stride: 8
53
  bias: false
54
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  modules:
56
+ encoder: !ref <Encoder>
57
+ decoder: !ref <Decoder>
58
+ masknet: !ref <MaskNet>
 
 
 
 
 
 
 
 
 
 
59
 
60
  pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
61
  loadables:
hyperparams_train.yaml ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Generated 2021-03-09 from:
2
+ # /home/mila/s/subakany/speechbrain_new/recipes/WSJ0Mix/separation/train/hparams/sepformer-wham.yaml
3
+ # yamllint disable
4
+ # ################################
5
+ # Model: SepFormer for source separation
6
+ # https://arxiv.org/abs/2010.13154
7
+ #
8
+ # Dataset : WSJ0-2mix and WSJ0-3mix
9
+ # ################################
10
+ # Basic parameters
11
+ # Seed needs to be set at top of yaml, before objects with parameters are made
12
+ #
13
+ seed: 1234
14
+ __set_seed: !apply:torch.manual_seed [1234]
15
+
16
+ # Data params
17
+
18
+ # the data folder for the wham dataset
19
+ # needs to end with wham_original for the wham dataset
20
+ # needs to end with wham_reverb for the whamr dataset
21
+ data_folder: /network/tmp1/subakany/wham_original
22
+
23
+ # the path for wsj0/si_tr_s/ folder -- only needed if dynamic mixing is used
24
+ # e.g. /yourpath/wsj0-processed/si_tr_s/
25
+ wsj0_tr: /yourpath/wsj0-processed/si_tr_s/
26
+
27
+ experiment_name: sepformer-wham
28
+ output_folder: results/sepformer-wham/1234
29
+ train_log: results/sepformer-wham/1234/train_log.txt
30
+ save_folder: results/sepformer-wham/1234/save
31
+
32
+ # the file names should start with whamr instead of whamorg
33
+ train_data: results/sepformer-wham/1234/save/whamorg_tr.csv
34
+ valid_data: results/sepformer-wham/1234/save/whamorg_cv.csv
35
+ test_data: results/sepformer-wham/1234/save/whamorg_tt.csv
36
+ skip_prep: false
37
+
38
+
39
+ # Experiment params
40
+ auto_mix_prec: false # Set it to True for mixed precision
41
+ test_only: false
42
+ num_spks: 2 # set to 3 for wsj0-3mix
43
+ progressbar: true
44
+ save_audio: false # Save estimated sources on disk
45
+ sample_rate: 8000
46
+
47
+ # Training parameters
48
+ N_epochs: 200
49
+ batch_size: 1
50
+ lr: 0.00015
51
+ clip_grad_norm: 5
52
+ loss_upper_lim: 999999 # this is the upper limit for an acceptable loss
53
+ # if True, the training sequences are cut to a specified length
54
+ limit_training_signal_len: false
55
+ # this is the length of sequences if we choose to limit
56
+ # the signal length of training sequences
57
+ training_signal_len: 32000000
58
+
59
+ # Set it to True to dynamically create mixtures at training time
60
+ dynamic_mixing: false
61
+
62
+ # Parameters for data augmentation
63
+ use_wavedrop: false
64
+ use_speedperturb: true
65
+ use_speedperturb_sameforeachsource: false
66
+ use_rand_shift: false
67
+ min_shift: -8000
68
+ max_shift: 8000
69
+
70
+ speedperturb: !new:speechbrain.lobes.augment.TimeDomainSpecAugment
71
+ perturb_prob: 1.0
72
+ drop_freq_prob: 0.0
73
+ drop_chunk_prob: 0.0
74
+ sample_rate: 8000
75
+ speeds: [95, 100, 105]
76
+
77
+ wavedrop: !new:speechbrain.lobes.augment.TimeDomainSpecAugment
78
+ perturb_prob: 0.0
79
+ drop_freq_prob: 1.0
80
+ drop_chunk_prob: 1.0
81
+ sample_rate: 8000
82
+
83
+ # loss thresholding -- this thresholds the training loss
84
+ threshold_byloss: true
85
+ threshold: -30
86
+
87
+ # Encoder parameters
88
+ N_encoder_out: 256
89
+ out_channels: 256
90
+ kernel_size: 16
91
+ kernel_stride: 8
92
+
93
+ # Dataloader options
94
+ dataloader_opts:
95
+ batch_size: 1
96
+ num_workers: 3
97
+
98
+
99
+ # Specifying the network
100
+ Encoder: &id003 !new:speechbrain.lobes.models.dual_path.Encoder
101
+ kernel_size: 16
102
+ out_channels: 256
103
+
104
+
105
+ SBtfintra: &id001 !new:speechbrain.lobes.models.dual_path.SBTransformerBlock
106
+ num_layers: 8
107
+ d_model: 256
108
+ nhead: 8
109
+ d_ffn: 1024
110
+ dropout: 0
111
+ use_positional_encoding: true
112
+ norm_before: true
113
+
114
+ SBtfinter: &id002 !new:speechbrain.lobes.models.dual_path.SBTransformerBlock
115
+ num_layers: 8
116
+ d_model: 256
117
+ nhead: 8
118
+ d_ffn: 1024
119
+ dropout: 0
120
+ use_positional_encoding: true
121
+ norm_before: true
122
+
123
+ MaskNet: &id005 !new:speechbrain.lobes.models.dual_path.Dual_Path_Model
124
+
125
+ num_spks: 2
126
+ in_channels: 256
127
+ out_channels: 256
128
+ num_layers: 2
129
+ K: 250
130
+ intra_model: *id001
131
+ inter_model: *id002
132
+ norm: ln
133
+ linear_layer_after_inter_intra: false
134
+ skip_around_intra: true
135
+
136
+ Decoder: &id004 !new:speechbrain.lobes.models.dual_path.Decoder
137
+ in_channels: 256
138
+ out_channels: 1
139
+ kernel_size: 16
140
+ stride: 8
141
+ bias: false
142
+
143
+ optimizer: !name:torch.optim.Adam
144
+ lr: 0.00015
145
+ weight_decay: 0
146
+
147
+ loss: !name:speechbrain.nnet.losses.get_si_snr_with_pitwrapper
148
+
149
+ lr_scheduler: &id007 !new:speechbrain.nnet.schedulers.ReduceLROnPlateau
150
+
151
+ factor: 0.5
152
+ patience: 2
153
+ dont_halve_until_epoch: 65
154
+
155
+ epoch_counter: &id006 !new:speechbrain.utils.epoch_loop.EpochCounter
156
+ limit: 200
157
+
158
+ modules:
159
+ encoder: *id003
160
+ decoder: *id004
161
+ masknet: *id005
162
+ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
163
+ checkpoints_dir: results/sepformer-wham/1234/save
164
+ recoverables:
165
+ encoder: *id003
166
+ decoder: *id004
167
+ masknet: *id005
168
+ counter: *id006
169
+ lr_scheduler: *id007
170
+ train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
171
+ save_file: results/sepformer-wham/1234/train_log.txt
172
+
173
+ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
174
+ loadables:
175
+ masknet: !ref <MaskNet>
176
+ encoder: !ref <Encoder>
177
+ decoder: !ref <Decoder>