Wangyou Zhang committed on
Commit
e97826a
1 Parent(s): 9230748

Initial commit

Browse files
Files changed (31) hide show
  1. README.md +314 -0
  2. exp/enh_train_enh_tfgridnet_raw/21epoch.pth +3 -0
  3. exp/enh_train_enh_tfgridnet_raw/RESULTS.md +1 -0
  4. exp/enh_train_enh_tfgridnet_raw/config.yaml +232 -0
  5. exp/enh_train_enh_tfgridnet_raw/images/backward_time.png +0 -0
  6. exp/enh_train_enh_tfgridnet_raw/images/clip.png +0 -0
  7. exp/enh_train_enh_tfgridnet_raw/images/forward_time.png +0 -0
  8. exp/enh_train_enh_tfgridnet_raw/images/gpu_max_cached_mem_GB.png +0 -0
  9. exp/enh_train_enh_tfgridnet_raw/images/grad_norm.png +0 -0
  10. exp/enh_train_enh_tfgridnet_raw/images/iter_time.png +0 -0
  11. exp/enh_train_enh_tfgridnet_raw/images/l1_timedomain+magspec_loss_1ch_16000Hz.png +0 -0
  12. exp/enh_train_enh_tfgridnet_raw/images/l1_timedomain+magspec_loss_1ch_22050Hz.png +0 -0
  13. exp/enh_train_enh_tfgridnet_raw/images/l1_timedomain+magspec_loss_1ch_24000Hz.png +0 -0
  14. exp/enh_train_enh_tfgridnet_raw/images/l1_timedomain+magspec_loss_1ch_32000Hz.png +0 -0
  15. exp/enh_train_enh_tfgridnet_raw/images/l1_timedomain+magspec_loss_1ch_44100Hz.png +0 -0
  16. exp/enh_train_enh_tfgridnet_raw/images/l1_timedomain+magspec_loss_1ch_48000Hz.png +0 -0
  17. exp/enh_train_enh_tfgridnet_raw/images/l1_timedomain+magspec_loss_1ch_8000Hz.png +0 -0
  18. exp/enh_train_enh_tfgridnet_raw/images/loss.png +0 -0
  19. exp/enh_train_enh_tfgridnet_raw/images/loss_scale.png +0 -0
  20. exp/enh_train_enh_tfgridnet_raw/images/optim0_lr0.png +0 -0
  21. exp/enh_train_enh_tfgridnet_raw/images/optim_step_time.png +0 -0
  22. exp/enh_train_enh_tfgridnet_raw/images/si_snr_loss_1ch_16000Hz.png +0 -0
  23. exp/enh_train_enh_tfgridnet_raw/images/si_snr_loss_1ch_22050Hz.png +0 -0
  24. exp/enh_train_enh_tfgridnet_raw/images/si_snr_loss_1ch_24000Hz.png +0 -0
  25. exp/enh_train_enh_tfgridnet_raw/images/si_snr_loss_1ch_32000Hz.png +0 -0
  26. exp/enh_train_enh_tfgridnet_raw/images/si_snr_loss_1ch_44100Hz.png +0 -0
  27. exp/enh_train_enh_tfgridnet_raw/images/si_snr_loss_1ch_48000Hz.png +0 -0
  28. exp/enh_train_enh_tfgridnet_raw/images/si_snr_loss_1ch_8000Hz.png +0 -0
  29. exp/enh_train_enh_tfgridnet_raw/images/train_time.png +0 -0
  30. exp/enh_train_enh_tfgridnet_raw/valid.loss.best.pth +1 -0
  31. meta.yaml +8 -0
README.md CHANGED
@@ -1,3 +1,317 @@
1
  ---
 
 
 
 
 
 
2
  license: cc-by-4.0
 
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ tags:
3
+ - audio
4
+ - audio-to-audio
5
+ language: en
6
+ datasets:
7
+ - urgent24
8
  license: cc-by-4.0
9
+ library_name: espnet
10
  ---
11
+
12
+ ## ESPnet2 ENH model
13
+
14
+ ### `wyz/tfgridnet_for_urgent24`
15
+
16
+ This model was trained by Wangyou Zhang using the urgent24 recipe in [espnet](https://github.com/espnet/espnet/).
17
+
18
+ ### Demo: How to use in ESPnet2
19
+
20
+ Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html)
21
+ if you haven't done that already.
22
+
23
+ ```bash
24
+ cd espnet
25
+
26
+ pip install -e .
27
+ cd egs2/urgent24/enh1
28
+ ./run.sh --skip_data_prep false --skip_train true --is_tse_task true --download_model wyz/tfgridnet_for_urgent24
29
+ ```
30
+
31
+ ## ENH config
32
+
33
+ <details><summary>expand</summary>
34
+
35
+ ```
36
+ config: conf/tuning/train_enh_tfgridnet.yaml
37
+ print_config: false
38
+ log_level: INFO
39
+ drop_last_iter: false
40
+ dry_run: false
41
+ iterator_type: chunk
42
+ valid_iterator_type: null
43
+ output_dir: exp/enh_train_enh_tfgridnet_raw
44
+ ngpu: 1
45
+ seed: 0
46
+ num_workers: 4
47
+ num_att_plot: 3
48
+ dist_backend: nccl
49
+ dist_init_method: env://
50
+ dist_world_size: 4
51
+ dist_rank: 0
52
+ local_rank: 0
53
+ dist_master_addr: localhost
54
+ dist_master_port: 54825
55
+ dist_launcher: null
56
+ multiprocessing_distributed: true
57
+ unused_parameters: true
58
+ sharded_ddp: false
59
+ cudnn_enabled: true
60
+ cudnn_benchmark: false
61
+ cudnn_deterministic: true
62
+ collect_stats: false
63
+ write_collected_feats: false
64
+ max_epoch: 100
65
+ patience: 40
66
+ val_scheduler_criterion:
67
+ - valid
68
+ - loss
69
+ early_stopping_criterion:
70
+ - valid
71
+ - loss
72
+ - min
73
+ best_model_criterion:
74
+ - - valid
75
+ - loss
76
+ - min
77
+ keep_nbest_models: 1
78
+ nbest_averaging_interval: 0
79
+ grad_clip: 1.0
80
+ grad_clip_type: 2.0
81
+ grad_noise: false
82
+ accum_grad: 1
83
+ no_forward_run: false
84
+ resume: true
85
+ train_dtype: float32
86
+ use_amp: false
87
+ log_interval: null
88
+ use_matplotlib: true
89
+ use_tensorboard: true
90
+ create_graph_in_tensorboard: false
91
+ use_wandb: false
92
+ wandb_project: null
93
+ wandb_id: null
94
+ wandb_entity: null
95
+ wandb_name: null
96
+ wandb_model_log_interval: -1
97
+ detect_anomaly: false
98
+ use_adapter: false
99
+ adapter: lora
100
+ save_strategy: all
101
+ adapter_conf: {}
102
+ pretrain_path: null
103
+ init_param: []
104
+ ignore_init_mismatch: false
105
+ freeze_param: []
106
+ num_iters_per_epoch: 8000
107
+ batch_size: 4
108
+ valid_batch_size: null
109
+ batch_bins: 1000000
110
+ valid_batch_bins: null
111
+ train_shape_file:
112
+ - exp/enh_stats_16k/train/speech_mix_shape
113
+ - exp/enh_stats_16k/train/speech_ref1_shape
114
+ valid_shape_file:
115
+ - exp/enh_stats_16k/valid/speech_mix_shape
116
+ - exp/enh_stats_16k/valid/speech_ref1_shape
117
+ batch_type: folded
118
+ valid_batch_type: null
119
+ fold_length:
120
+ - 80000
121
+ - 80000
122
+ sort_in_batch: descending
123
+ shuffle_within_batch: false
124
+ sort_batch: descending
125
+ multiple_iterator: false
126
+ chunk_length: 200
127
+ chunk_shift_ratio: 0.5
128
+ num_cache_chunks: 1024
129
+ chunk_excluded_key_prefixes: []
130
+ chunk_default_fs: 50
131
+ chunk_max_abs_length: 100000
132
+ chunk_discard_short_samples: true
133
+ train_data_path_and_name_and_type:
134
+ - - dump/raw/train/wav.scp
135
+ - speech_mix
136
+ - sound
137
+ - - dump/raw/train/spk1.scp
138
+ - speech_ref1
139
+ - sound
140
+ - - dump/raw/train/utt2category
141
+ - category
142
+ - text
143
+ - - dump/raw/train/utt2fs
144
+ - fs
145
+ - text_int
146
+ valid_data_path_and_name_and_type:
147
+ - - dump/raw/validation/wav.scp
148
+ - speech_mix
149
+ - sound
150
+ - - dump/raw/validation/spk1.scp
151
+ - speech_ref1
152
+ - sound
153
+ - - dump/raw/validation/utt2category
154
+ - category
155
+ - text
156
+ - - dump/raw/validation/utt2fs
157
+ - fs
158
+ - text_int
159
+ allow_variable_data_keys: false
160
+ max_cache_size: 0.0
161
+ max_cache_fd: 32
162
+ allow_multi_rates: true
163
+ valid_max_cache_size: null
164
+ exclude_weight_decay: false
165
+ exclude_weight_decay_conf: {}
166
+ optim: adam
167
+ optim_conf:
168
+ lr: 0.001
169
+ eps: 1.0e-08
170
+ weight_decay: 1.0e-05
171
+ scheduler: steplr
172
+ scheduler_conf:
173
+ step_size: 2
174
+ gamma: 0.99
175
+ init: null
176
+ model_conf:
177
+ normalize_variance_per_ch: true
178
+ categories:
179
+ - 1ch_8000Hz
180
+ - 1ch_16000Hz
181
+ - 1ch_22050Hz
182
+ - 1ch_24000Hz
183
+ - 1ch_32000Hz
184
+ - 1ch_44100Hz
185
+ - 1ch_48000Hz
186
+ criterions:
187
+ - name: mr_l1_tfd
188
+ conf:
189
+ window_sz:
190
+ - 256
191
+ - 512
192
+ - 768
193
+ - 1024
194
+ hop_sz: null
195
+ eps: 1.0e-08
196
+ time_domain_weight: 0.5
197
+ normalize_variance: true
198
+ wrapper: fixed_order
199
+ wrapper_conf:
200
+ weight: 1.0
201
+ - name: si_snr
202
+ conf:
203
+ eps: 1.0e-07
204
+ wrapper: fixed_order
205
+ wrapper_conf:
206
+ weight: 0.0
207
+ speech_volume_normalize: null
208
+ rir_scp: null
209
+ rir_apply_prob: 1.0
210
+ noise_scp: null
211
+ noise_apply_prob: 1.0
212
+ noise_db_range: '13_15'
213
+ short_noise_thres: 0.5
214
+ use_reverberant_ref: false
215
+ num_spk: 1
216
+ num_noise_type: 1
217
+ sample_rate: 8000
218
+ force_single_channel: true
219
+ channel_reordering: true
220
+ categories:
221
+ - 1ch_8000Hz
222
+ - 1ch_16000Hz
223
+ - 1ch_22050Hz
224
+ - 1ch_24000Hz
225
+ - 1ch_32000Hz
226
+ - 1ch_44100Hz
227
+ - 1ch_48000Hz
228
+ speech_segment: null
229
+ avoid_allzero_segment: true
230
+ flexible_numspk: false
231
+ dynamic_mixing: false
232
+ utt2spk: null
233
+ dynamic_mixing_gain_db: 0.0
234
+ encoder: stft
235
+ encoder_conf:
236
+ n_fft: 256
237
+ hop_length: 128
238
+ use_builtin_complex: true
239
+ default_fs: 8000
240
+ separator: tfgridnetv3
241
+ separator_conf:
242
+ n_srcs: 1
243
+ n_imics: 1
244
+ n_layers: 6
245
+ lstm_hidden_units: 200
246
+ attn_n_head: 4
247
+ attn_qk_output_channel: 2
248
+ emb_dim: 48
249
+ emb_ks: 4
250
+ emb_hs: 1
251
+ activation: prelu
252
+ eps: 1.0e-05
253
+ decoder: stft
254
+ decoder_conf:
255
+ n_fft: 256
256
+ hop_length: 128
257
+ default_fs: 8000
258
+ mask_module: multi_mask
259
+ mask_module_conf: {}
260
+ preprocessor: enh
261
+ preprocessor_conf: {}
262
+ diffusion_model: null
263
+ diffusion_model_conf: {}
264
+ required:
265
+ - output_dir
266
+ version: '202402'
267
+ distributed: true
268
+ ```
269
+
270
+ </details>
271
+
272
+
273
+
274
+ ### Citing ESPnet
275
+
276
+ ```bibtex
277
+ @inproceedings{watanabe2018espnet,
278
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
279
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
280
+ year={2018},
281
+ booktitle={Proceedings of Interspeech},
282
+ pages={2207--2211},
283
+ doi={10.21437/Interspeech.2018-1456},
284
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
285
+ }
286
+
287
+
288
+ @inproceedings{ESPnet-SE,
289
+ author = {Chenda Li and Jing Shi and Wangyou Zhang and Aswin Shanmugam Subramanian and Xuankai Chang and
290
+ Naoyuki Kamo and Moto Hira and Tomoki Hayashi and Christoph B{\"{o}}ddeker and Zhuo Chen and Shinji Watanabe},
291
+ title = {ESPnet-SE: End-To-End Speech Enhancement and Separation Toolkit Designed for {ASR} Integration},
292
+ booktitle = {{IEEE} Spoken Language Technology Workshop, {SLT} 2021, Shenzhen, China, January 19-22, 2021},
293
+ pages = {785--792},
294
+ publisher = {{IEEE}},
295
+ year = {2021},
296
+ url = {https://doi.org/10.1109/SLT48900.2021.9383615},
297
+ doi = {10.1109/SLT48900.2021.9383615},
298
+ timestamp = {Mon, 12 Apr 2021 17:08:59 +0200},
299
+ biburl = {https://dblp.org/rec/conf/slt/Li0ZSCKHHBC021.bib},
300
+ bibsource = {dblp computer science bibliography, https://dblp.org}
301
+ }
302
+
303
+
304
+ ```
305
+
306
+ or arXiv:
307
+
308
+ ```bibtex
309
+ @misc{watanabe2018espnet,
310
+ title={ESPnet: End-to-End Speech Processing Toolkit},
311
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
312
+ year={2018},
313
+ eprint={1804.00015},
314
+ archivePrefix={arXiv},
315
+ primaryClass={cs.CL}
316
+ }
317
+ ```
exp/enh_train_enh_tfgridnet_raw/21epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be416f78659998e8196fd6565adb50b35e2645cea23d15d28c081fda40504751
3
+ size 34175373
exp/enh_train_enh_tfgridnet_raw/RESULTS.md ADDED
@@ -0,0 +1 @@
 
 
1
+ This model was trained on simulated data (based on DNS5 English speech, CommonVoice 11.0 English portion, LibriTTS, VCTK, and WSJ) for 21 epochs.
exp/enh_train_enh_tfgridnet_raw/config.yaml ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/train_enh_tfgridnet.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ drop_last_iter: false
5
+ dry_run: false
6
+ iterator_type: chunk
7
+ valid_iterator_type: null
8
+ output_dir: exp/enh_train_enh_tfgridnet_raw
9
+ ngpu: 1
10
+ seed: 0
11
+ num_workers: 4
12
+ num_att_plot: 3
13
+ dist_backend: nccl
14
+ dist_init_method: env://
15
+ dist_world_size: 4
16
+ dist_rank: 0
17
+ local_rank: 0
18
+ dist_master_addr: localhost
19
+ dist_master_port: 54825
20
+ dist_launcher: null
21
+ multiprocessing_distributed: true
22
+ unused_parameters: true
23
+ sharded_ddp: false
24
+ cudnn_enabled: true
25
+ cudnn_benchmark: false
26
+ cudnn_deterministic: true
27
+ collect_stats: false
28
+ write_collected_feats: false
29
+ max_epoch: 100
30
+ patience: 40
31
+ val_scheduler_criterion:
32
+ - valid
33
+ - loss
34
+ early_stopping_criterion:
35
+ - valid
36
+ - loss
37
+ - min
38
+ best_model_criterion:
39
+ - - valid
40
+ - loss
41
+ - min
42
+ keep_nbest_models: 1
43
+ nbest_averaging_interval: 0
44
+ grad_clip: 1.0
45
+ grad_clip_type: 2.0
46
+ grad_noise: false
47
+ accum_grad: 1
48
+ no_forward_run: false
49
+ resume: true
50
+ train_dtype: float32
51
+ use_amp: false
52
+ log_interval: null
53
+ use_matplotlib: true
54
+ use_tensorboard: true
55
+ create_graph_in_tensorboard: false
56
+ use_wandb: false
57
+ wandb_project: null
58
+ wandb_id: null
59
+ wandb_entity: null
60
+ wandb_name: null
61
+ wandb_model_log_interval: -1
62
+ detect_anomaly: false
63
+ use_adapter: false
64
+ adapter: lora
65
+ save_strategy: all
66
+ adapter_conf: {}
67
+ pretrain_path: null
68
+ init_param: []
69
+ ignore_init_mismatch: false
70
+ freeze_param: []
71
+ num_iters_per_epoch: 8000
72
+ batch_size: 4
73
+ valid_batch_size: null
74
+ batch_bins: 1000000
75
+ valid_batch_bins: null
76
+ train_shape_file:
77
+ - exp/enh_stats_16k/train/speech_mix_shape
78
+ - exp/enh_stats_16k/train/speech_ref1_shape
79
+ valid_shape_file:
80
+ - exp/enh_stats_16k/valid/speech_mix_shape
81
+ - exp/enh_stats_16k/valid/speech_ref1_shape
82
+ batch_type: folded
83
+ valid_batch_type: null
84
+ fold_length:
85
+ - 80000
86
+ - 80000
87
+ sort_in_batch: descending
88
+ shuffle_within_batch: false
89
+ sort_batch: descending
90
+ multiple_iterator: false
91
+ chunk_length: 200
92
+ chunk_shift_ratio: 0.5
93
+ num_cache_chunks: 1024
94
+ chunk_excluded_key_prefixes: []
95
+ chunk_default_fs: 50
96
+ chunk_max_abs_length: 100000
97
+ chunk_discard_short_samples: true
98
+ train_data_path_and_name_and_type:
99
+ - - dump/raw/train/wav.scp
100
+ - speech_mix
101
+ - sound
102
+ - - dump/raw/train/spk1.scp
103
+ - speech_ref1
104
+ - sound
105
+ - - dump/raw/train/utt2category
106
+ - category
107
+ - text
108
+ - - dump/raw/train/utt2fs
109
+ - fs
110
+ - text_int
111
+ valid_data_path_and_name_and_type:
112
+ - - dump/raw/validation/wav.scp
113
+ - speech_mix
114
+ - sound
115
+ - - dump/raw/validation/spk1.scp
116
+ - speech_ref1
117
+ - sound
118
+ - - dump/raw/validation/utt2category
119
+ - category
120
+ - text
121
+ - - dump/raw/validation/utt2fs
122
+ - fs
123
+ - text_int
124
+ allow_variable_data_keys: false
125
+ max_cache_size: 0.0
126
+ max_cache_fd: 32
127
+ allow_multi_rates: true
128
+ valid_max_cache_size: null
129
+ exclude_weight_decay: false
130
+ exclude_weight_decay_conf: {}
131
+ optim: adam
132
+ optim_conf:
133
+ lr: 0.001
134
+ eps: 1.0e-08
135
+ weight_decay: 1.0e-05
136
+ scheduler: steplr
137
+ scheduler_conf:
138
+ step_size: 2
139
+ gamma: 0.99
140
+ init: null
141
+ model_conf:
142
+ normalize_variance_per_ch: true
143
+ categories:
144
+ - 1ch_8000Hz
145
+ - 1ch_16000Hz
146
+ - 1ch_22050Hz
147
+ - 1ch_24000Hz
148
+ - 1ch_32000Hz
149
+ - 1ch_44100Hz
150
+ - 1ch_48000Hz
151
+ criterions:
152
+ - name: mr_l1_tfd
153
+ conf:
154
+ window_sz:
155
+ - 256
156
+ - 512
157
+ - 768
158
+ - 1024
159
+ hop_sz: null
160
+ eps: 1.0e-08
161
+ time_domain_weight: 0.5
162
+ normalize_variance: true
163
+ wrapper: fixed_order
164
+ wrapper_conf:
165
+ weight: 1.0
166
+ - name: si_snr
167
+ conf:
168
+ eps: 1.0e-07
169
+ wrapper: fixed_order
170
+ wrapper_conf:
171
+ weight: 0.0
172
+ speech_volume_normalize: null
173
+ rir_scp: null
174
+ rir_apply_prob: 1.0
175
+ noise_scp: null
176
+ noise_apply_prob: 1.0
177
+ noise_db_range: '13_15'
178
+ short_noise_thres: 0.5
179
+ use_reverberant_ref: false
180
+ num_spk: 1
181
+ num_noise_type: 1
182
+ sample_rate: 8000
183
+ force_single_channel: true
184
+ channel_reordering: true
185
+ categories:
186
+ - 1ch_8000Hz
187
+ - 1ch_16000Hz
188
+ - 1ch_22050Hz
189
+ - 1ch_24000Hz
190
+ - 1ch_32000Hz
191
+ - 1ch_44100Hz
192
+ - 1ch_48000Hz
193
+ speech_segment: null
194
+ avoid_allzero_segment: true
195
+ flexible_numspk: false
196
+ dynamic_mixing: false
197
+ utt2spk: null
198
+ dynamic_mixing_gain_db: 0.0
199
+ encoder: stft
200
+ encoder_conf:
201
+ n_fft: 256
202
+ hop_length: 128
203
+ use_builtin_complex: true
204
+ default_fs: 8000
205
+ separator: tfgridnetv3
206
+ separator_conf:
207
+ n_srcs: 1
208
+ n_imics: 1
209
+ n_layers: 6
210
+ lstm_hidden_units: 200
211
+ attn_n_head: 4
212
+ attn_qk_output_channel: 2
213
+ emb_dim: 48
214
+ emb_ks: 4
215
+ emb_hs: 1
216
+ activation: prelu
217
+ eps: 1.0e-05
218
+ decoder: stft
219
+ decoder_conf:
220
+ n_fft: 256
221
+ hop_length: 128
222
+ default_fs: 8000
223
+ mask_module: multi_mask
224
+ mask_module_conf: {}
225
+ preprocessor: enh
226
+ preprocessor_conf: {}
227
+ diffusion_model: null
228
+ diffusion_model_conf: {}
229
+ required:
230
+ - output_dir
231
+ version: '202402'
232
+ distributed: true
exp/enh_train_enh_tfgridnet_raw/images/backward_time.png ADDED
exp/enh_train_enh_tfgridnet_raw/images/clip.png ADDED
exp/enh_train_enh_tfgridnet_raw/images/forward_time.png ADDED
exp/enh_train_enh_tfgridnet_raw/images/gpu_max_cached_mem_GB.png ADDED
exp/enh_train_enh_tfgridnet_raw/images/grad_norm.png ADDED
exp/enh_train_enh_tfgridnet_raw/images/iter_time.png ADDED
exp/enh_train_enh_tfgridnet_raw/images/l1_timedomain+magspec_loss_1ch_16000Hz.png ADDED
exp/enh_train_enh_tfgridnet_raw/images/l1_timedomain+magspec_loss_1ch_22050Hz.png ADDED
exp/enh_train_enh_tfgridnet_raw/images/l1_timedomain+magspec_loss_1ch_24000Hz.png ADDED
exp/enh_train_enh_tfgridnet_raw/images/l1_timedomain+magspec_loss_1ch_32000Hz.png ADDED
exp/enh_train_enh_tfgridnet_raw/images/l1_timedomain+magspec_loss_1ch_44100Hz.png ADDED
exp/enh_train_enh_tfgridnet_raw/images/l1_timedomain+magspec_loss_1ch_48000Hz.png ADDED
exp/enh_train_enh_tfgridnet_raw/images/l1_timedomain+magspec_loss_1ch_8000Hz.png ADDED
exp/enh_train_enh_tfgridnet_raw/images/loss.png ADDED
exp/enh_train_enh_tfgridnet_raw/images/loss_scale.png ADDED
exp/enh_train_enh_tfgridnet_raw/images/optim0_lr0.png ADDED
exp/enh_train_enh_tfgridnet_raw/images/optim_step_time.png ADDED
exp/enh_train_enh_tfgridnet_raw/images/si_snr_loss_1ch_16000Hz.png ADDED
exp/enh_train_enh_tfgridnet_raw/images/si_snr_loss_1ch_22050Hz.png ADDED
exp/enh_train_enh_tfgridnet_raw/images/si_snr_loss_1ch_24000Hz.png ADDED
exp/enh_train_enh_tfgridnet_raw/images/si_snr_loss_1ch_32000Hz.png ADDED
exp/enh_train_enh_tfgridnet_raw/images/si_snr_loss_1ch_44100Hz.png ADDED
exp/enh_train_enh_tfgridnet_raw/images/si_snr_loss_1ch_48000Hz.png ADDED
exp/enh_train_enh_tfgridnet_raw/images/si_snr_loss_1ch_8000Hz.png ADDED
exp/enh_train_enh_tfgridnet_raw/images/train_time.png ADDED
exp/enh_train_enh_tfgridnet_raw/valid.loss.best.pth ADDED
@@ -0,0 +1 @@
 
 
1
+ 21epoch.pth
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: '202402'
2
+ files:
3
+ model_file: exp/enh_train_enh_tfgridnet_raw/21epoch.pth
4
+ python: "3.8.19 (default, Mar 20 2024, 19:58:24) \n[GCC 11.2.0]"
5
+ timestamp: 1719552816.976782
6
+ torch: 2.0.1
7
+ yaml_files:
8
+ train_config: exp/enh_train_enh_tfgridnet_raw/config.yaml