jiaqili3 commited on
Commit
addb7e5
1 Parent(s): e1cca71
This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitignore +63 -0
  2. Dockerfile +64 -0
  3. LICENSE +21 -0
  4. README.md +163 -0
  5. bins/calc_metrics.py +268 -0
  6. bins/svc/inference.py +265 -0
  7. bins/svc/preprocess.py +183 -0
  8. bins/svc/train.py +111 -0
  9. bins/tta/inference.py +94 -0
  10. bins/tta/preprocess.py +195 -0
  11. bins/tta/train_tta.py +77 -0
  12. bins/tts/inference.py +169 -0
  13. bins/tts/preprocess.py +244 -0
  14. bins/tts/train.py +152 -0
  15. bins/vocoder/inference.py +115 -0
  16. bins/vocoder/preprocess.py +151 -0
  17. bins/vocoder/train.py +93 -0
  18. config/audioldm.json +92 -0
  19. config/autoencoderkl.json +69 -0
  20. config/base.json +185 -0
  21. config/comosvc.json +215 -0
  22. config/fs2.json +120 -0
  23. config/jets.json +120 -0
  24. config/ns2.json +88 -0
  25. config/svc/base.json +119 -0
  26. config/svc/diffusion.json +142 -0
  27. config/transformer.json +179 -0
  28. config/tts.json +25 -0
  29. config/valle.json +55 -0
  30. config/vits.json +101 -0
  31. config/vitssvc.json +306 -0
  32. config/vocoder.json +84 -0
  33. egs/datasets/README.md +458 -0
  34. egs/datasets/docker.md +19 -0
  35. egs/metrics/README.md +174 -0
  36. egs/metrics/run.sh +132 -0
  37. egs/svc/DiffComoSVC/README.md +234 -0
  38. egs/svc/DiffComoSVC/exp_config.json +143 -0
  39. egs/svc/DiffComoSVC/run.sh +1 -0
  40. egs/svc/MultipleContentsSVC/README.md +248 -0
  41. egs/svc/MultipleContentsSVC/exp_config.json +127 -0
  42. egs/svc/MultipleContentsSVC/run.sh +1 -0
  43. egs/svc/README.md +34 -0
  44. egs/svc/TransformerSVC/README.md +164 -0
  45. egs/svc/TransformerSVC/exp_config.json +108 -0
  46. egs/svc/TransformerSVC/run.sh +1 -0
  47. egs/svc/VitsSVC/README.md +125 -0
  48. egs/svc/VitsSVC/exp_config.json +106 -0
  49. egs/svc/VitsSVC/run.sh +1 -0
  50. egs/svc/_template/run.sh +160 -0
.gitignore ADDED
@@ -0,0 +1,63 @@
+ # Mac OS files
+ .DS_Store
+ ckpt
+ # IDEs
+ .idea
+ .vs
+ .vscode
+ .cache
+ *.png
+ # GitHub files
+ .github
+
+ # Byte-compiled / optimized / DLL / cached files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.pyc
+ .temp
+ *.c
+ *.so
+ *.o
+
+ # Developing mode
+ _*.sh
+ _*.json
+ *.lst
+ yard*
+ *.out
+ evaluation/evalset_selection
+ mfa
+ egs/svc/*wavmark
+ egs/svc/custom
+ egs/svc/*/dev*
+ egs/svc/dev_exp_config.json
+ bins/svc/demo*
+ bins/svc/preprocess_custom.py
+ data
+ ckpts
+
+ # Data and ckpt
+ *.pkl
+ *.pt
+ *.npy
+ *.npz
+ *.tar.gz
+ *.ckpt
+ *.wav
+ *.flac
+ pretrained/wenet/*conformer_exp
+ !egs/tts/VALLE/prompt_examples/*.wav
+ *.bin
+
+ # Runtime data dirs
+ processed_data
+ data
+ model_ckpt
+ logs
+ *.ipynb
+ *.lst
+ source_audio
+ result
+ conversion_results
+ get_available_gpu.py
Dockerfile ADDED
@@ -0,0 +1,64 @@
+ # Copyright (c) 2023 Amphion.
+ #
+ # This source code is licensed under the MIT license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # Other version: https://hub.docker.com/r/nvidia/cuda/tags
+ FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu18.04
+
+ ARG DEBIAN_FRONTEND=noninteractive
+ ARG PYTORCH='2.0.0'
+ ARG CUDA='cu118'
+ ARG SHELL='/bin/bash'
+ ARG MINICONDA='Miniconda3-py39_23.3.1-0-Linux-x86_64.sh'
+
+ ENV LANG=en_US.UTF-8 PYTHONIOENCODING=utf-8 PYTHONDONTWRITEBYTECODE=1 CUDA_HOME=/usr/local/cuda CONDA_HOME=/opt/conda SHELL=${SHELL}
+ ENV PATH=$CONDA_HOME/bin:$CUDA_HOME/bin:$PATH \
+     LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH \
+     LIBRARY_PATH=$CUDA_HOME/lib64:$LIBRARY_PATH \
+     CONDA_PREFIX=$CONDA_HOME \
+     NCCL_HOME=$CUDA_HOME
+
+ # Install ubuntu packages
+ RUN sed -i 's/archive.ubuntu.com/mirrors.cloud.tencent.com/g' /etc/apt/sources.list \
+     && sed -i 's/security.ubuntu.com/mirrors.cloud.tencent.com/g' /etc/apt/sources.list \
+     && rm /etc/apt/sources.list.d/cuda.list \
+     && apt-get update \
+     && apt-get -y install \
+     python3-pip ffmpeg git less wget libsm6 libxext6 libxrender-dev \
+     build-essential cmake pkg-config libx11-dev libatlas-base-dev \
+     libgtk-3-dev libboost-python-dev vim libgl1-mesa-glx \
+     libaio-dev software-properties-common tmux \
+     espeak-ng
+
+ # Install miniconda with python 3.9
+ USER root
+ # COPY Miniconda3-py39_23.3.1-0-Linux-x86_64.sh /root/anaconda.sh
+ RUN wget -t 0 -c -O /tmp/anaconda.sh https://repo.anaconda.com/miniconda/${MINICONDA} \
+     && mv /tmp/anaconda.sh /root/anaconda.sh \
+     && ${SHELL} /root/anaconda.sh -b -p $CONDA_HOME \
+     && rm /root/anaconda.sh
+
+ RUN conda create -y --name amphion python=3.9.15
+
+ WORKDIR /app
+ COPY env.sh env.sh
+ RUN chmod +x ./env.sh
+
+ RUN ["conda", "run", "-n", "amphion", "-vvv", "--no-capture-output", "./env.sh"]
+
+ RUN conda init \
+     && echo "\nconda activate amphion\n" >> ~/.bashrc
+
+ CMD ["/bin/bash"]
+
+ # *** Build ***
+ # docker build -t realamphion/amphion .
+
+ # *** Run ***
+ # cd Amphion
+ # docker run --runtime=nvidia --gpus all -it -v .:/app -v /mnt:/mnt_host realamphion/amphion
+
+ # *** Push and release ***
+ # docker login
+ # docker push realamphion/amphion
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2023 Amphion
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,163 @@
+ # Amphion: An Open-Source Audio, Music, and Speech Generation Toolkit
+
+ <div>
+ <a href="https://arxiv.org/abs/2312.09911"><img src="https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg"></a>
+ <a href="https://huggingface.co/amphion"><img src="https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Amphion-pink"></a>
+ <a href="https://openxlab.org.cn/usercenter/Amphion"><img src="https://cdn-static.openxlab.org.cn/app-center/openxlab_app.svg"></a>
+ <a href="egs/tts/README.md"><img src="https://img.shields.io/badge/README-TTS-blue"></a>
+ <a href="egs/svc/README.md"><img src="https://img.shields.io/badge/README-SVC-blue"></a>
+ <a href="egs/tta/README.md"><img src="https://img.shields.io/badge/README-TTA-blue"></a>
+ <a href="egs/vocoder/README.md"><img src="https://img.shields.io/badge/README-Vocoder-purple"></a>
+ <a href="egs/metrics/README.md"><img src="https://img.shields.io/badge/README-Evaluation-yellow"></a>
+ <a href="LICENSE"><img src="https://img.shields.io/badge/LICENSE-MIT-red"></a>
+ </div>
+ <br>
+
+ **Amphion (/æmˈfaɪən/) is a toolkit for Audio, Music, and Speech Generation.** Its purpose is to support reproducible research and help junior researchers and engineers get started in the field of audio, music, and speech generation research and development. Amphion offers a unique feature: **visualizations** of classic models or architectures. We believe that these visualizations are beneficial for junior researchers and engineers who wish to gain a better understanding of the models.
+
+ **The North-Star objective of Amphion is to offer a platform for studying the conversion of any inputs into audio.** Amphion is designed to support individual generation tasks, including but not limited to:
+
+ - **TTS**: Text to Speech (⛳ supported)
+ - **SVS**: Singing Voice Synthesis (👨‍💻 developing)
+ - **VC**: Voice Conversion (👨‍💻 developing)
+ - **SVC**: Singing Voice Conversion (⛳ supported)
+ - **TTA**: Text to Audio (⛳ supported)
+ - **TTM**: Text to Music (👨‍💻 developing)
+ - more…
+
+ In addition to the specific generation tasks, Amphion includes several **vocoders** and **evaluation metrics**. A vocoder is an important module for producing high-quality audio signals, while evaluation metrics are critical for ensuring consistent measurement across generation tasks. Moreover, Amphion is dedicated to advancing audio generation in real-world applications, such as building **large-scale datasets** for speech synthesis.
+
+ ## 🚀 News
+ - **2024/07/01**: Amphion now releases **Emilia**, the first open-source multilingual in-the-wild dataset for speech generation with over 101k hours of speech data, and the **Emilia-Pipe**, the first open-source preprocessing pipeline designed to transform in-the-wild speech data into high-quality training data with annotations for speech generation! [![arXiv](https://img.shields.io/badge/arXiv-Paper-COLOR.svg)](https://arxiv.org/abs/2407.05361) [![hf](https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Dataset-yellow)](https://huggingface.co/datasets/amphion/Emilia) [![demo](https://img.shields.io/badge/WebPage-Demo-red)](https://emilia-dataset.github.io/Emilia-Demo-Page/) [![readme](https://img.shields.io/badge/README-Key%20Features-blue)](preprocessors/Emilia/README.md)
+ - **2024/06/17**: Amphion has a new release for its **VALL-E** model! It uses Llama as its underlying architecture and has better model performance, faster training speed, and more readable code compared to our first version. [![readme](https://img.shields.io/badge/README-Key%20Features-blue)](egs/tts/VALLE_V2/README.md)
+ - **2024/03/12**: Amphion now supports **NaturalSpeech3 FACodec** and releases pretrained checkpoints. [![arXiv](https://img.shields.io/badge/arXiv-Paper-COLOR.svg)](https://arxiv.org/abs/2403.03100) [![hf](https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-model-yellow)](https://huggingface.co/amphion/naturalspeech3_facodec) [![hf](https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-demo-pink)](https://huggingface.co/spaces/amphion/naturalspeech3_facodec) [![readme](https://img.shields.io/badge/README-Key%20Features-blue)](models/codec/ns3_codec/README.md)
+ - **2024/02/22**: The first Amphion visualization tool, **SingVisio**, is released. [![arXiv](https://img.shields.io/badge/arXiv-Paper-COLOR.svg)](https://arxiv.org/abs/2402.12660) [![openxlab](https://cdn-static.openxlab.org.cn/app-center/openxlab_app.svg)](https://openxlab.org.cn/apps/detail/Amphion/SingVisio) [![Video](https://img.shields.io/badge/Video-Demo-orange)](https://github.com/open-mmlab/Amphion/assets/33707885/0a6e39e8-d5f1-4288-b0f8-32da5a2d6e96) [![readme](https://img.shields.io/badge/README-Key%20Features-blue)](egs/visualization/SingVisio/README.md)
+ - **2023/12/18**: Amphion v0.1 release. [![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2312.09911) [![hf](https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Amphion-pink)](https://huggingface.co/amphion) [![youtube](https://img.shields.io/badge/YouTube-Demo-red)](https://www.youtube.com/watch?v=1aw0HhcggvQ) [![readme](https://img.shields.io/badge/README-Key%20Features-blue)](https://github.com/open-mmlab/Amphion/pull/39)
+ - **2023/11/28**: Amphion alpha release. [![readme](https://img.shields.io/badge/README-Key%20Features-blue)](https://github.com/open-mmlab/Amphion/pull/2)
+
+ ## ⭐ Key Features
+
+ ### TTS: Text to Speech
+
+ - Amphion achieves state-of-the-art performance when compared to existing open-source text-to-speech (TTS) repositories. It supports the following models or architectures:
+   - [FastSpeech2](https://arxiv.org/abs/2006.04558): A non-autoregressive TTS architecture that utilizes feed-forward Transformer blocks.
+   - [VITS](https://arxiv.org/abs/2106.06103): An end-to-end TTS architecture that utilizes a conditional variational autoencoder with adversarial learning.
+   - [VALL-E](https://arxiv.org/abs/2301.02111): A zero-shot TTS architecture that uses a neural codec language model with discrete codes.
+   - [NaturalSpeech2](https://arxiv.org/abs/2304.09116): An architecture for TTS that utilizes a latent diffusion model to generate natural-sounding voices.
+   - [Jets](Jets): An end-to-end TTS model that jointly trains FastSpeech2 and HiFi-GAN with an alignment module.
+
+ ### SVC: Singing Voice Conversion
+
+ - Amphion supports multiple content-based features from various pretrained models, including [WeNet](https://github.com/wenet-e2e/wenet), [Whisper](https://github.com/openai/whisper), and [ContentVec](https://github.com/auspicious3000/contentvec). Their specific roles in SVC have been investigated in our NeurIPS 2023 workshop paper. [![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2310.11160) [![code](https://img.shields.io/badge/README-Code-red)](egs/svc/MultipleContentsSVC)
+ - Amphion implements several state-of-the-art model architectures, including diffusion-, transformer-, VAE- and flow-based models. The diffusion-based architecture uses [Bidirectional dilated CNN](https://openreview.net/pdf?id=a-xFK8Ymz5J) as a backend and supports several sampling algorithms such as [DDPM](https://arxiv.org/pdf/2006.11239.pdf), [DDIM](https://arxiv.org/pdf/2010.02502.pdf), and [PNDM](https://arxiv.org/pdf/2202.09778.pdf). Additionally, it supports single-step inference based on the [Consistency Model](https://openreview.net/pdf?id=FmqFfMTNnv).
+
+ ### TTA: Text to Audio
+
+ - Amphion supports TTA with a latent diffusion model, designed in a similar way to [AudioLDM](https://arxiv.org/abs/2301.12503), [Make-an-Audio](https://arxiv.org/abs/2301.12661), and [AUDIT](https://arxiv.org/abs/2304.00830). It is also the official implementation of the text-to-audio generation part of our NeurIPS 2023 paper. [![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2304.00830) [![code](https://img.shields.io/badge/README-Code-red)](egs/tta/RECIPE.md)
+
+ ### Vocoder
+
+ - Amphion supports various widely-used neural vocoders, including:
+   - GAN-based vocoders: [MelGAN](https://arxiv.org/abs/1910.06711), [HiFi-GAN](https://arxiv.org/abs/2010.05646), [NSF-HiFiGAN](https://github.com/nii-yamagishilab/project-NN-Pytorch-scripts), [BigVGAN](https://arxiv.org/abs/2206.04658), [APNet](https://arxiv.org/abs/2305.07952).
+   - Flow-based vocoders: [WaveGlow](https://arxiv.org/abs/1811.00002).
+   - Diffusion-based vocoders: [Diffwave](https://arxiv.org/abs/2009.09761).
+   - Autoregressive vocoders: [WaveNet](https://arxiv.org/abs/1609.03499), [WaveRNN](https://arxiv.org/abs/1802.08435v1).
+ - Amphion provides the official implementation of the [Multi-Scale Constant-Q Transform Discriminator](https://arxiv.org/abs/2311.14957) (our ICASSP 2024 paper). It can be used to enhance any GAN-based vocoder during training while leaving the inference stage (e.g., memory footprint or speed) unchanged. [![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2311.14957) [![code](https://img.shields.io/badge/README-Code-red)](egs/vocoder/gan/tfr_enhanced_hifigan)
+
+ ### Evaluation
+
+ Amphion provides a comprehensive objective evaluation of the generated audio. The evaluation metrics include:
+
+ - **F0 Modeling**: F0 Pearson Coefficients, F0 Periodicity Root Mean Square Error, F0 Root Mean Square Error, Voiced/Unvoiced F1 Score, etc.
+ - **Energy Modeling**: Energy Root Mean Square Error, Energy Pearson Coefficients, etc.
+ - **Intelligibility**: Character/Word Error Rate, which can be calculated based on [Whisper](https://github.com/openai/whisper) and more.
+ - **Spectrogram Distortion**: Frechet Audio Distance (FAD), Mel Cepstral Distortion (MCD), Multi-Resolution STFT Distance (MSTFT), Perceptual Evaluation of Speech Quality (PESQ), Short Time Objective Intelligibility (STOI), etc.
+ - **Speaker Similarity**: Cosine similarity, which can be calculated based on [RawNet3](https://github.com/Jungjee/RawNet), [Resemblyzer](https://github.com/resemble-ai/Resemblyzer), [WeSpeaker](https://github.com/wenet-e2e/wespeaker), [WavLM](https://github.com/microsoft/unilm/tree/master/wavlm) and more.
+
+ ### Datasets
+
+ - Amphion unifies the data preprocessing of open-source datasets, including [AudioCaps](https://audiocaps.github.io/), [LibriTTS](https://www.openslr.org/60/), [LJSpeech](https://keithito.com/LJ-Speech-Dataset/), [M4Singer](https://github.com/M4Singer/M4Singer), [Opencpop](https://wenet.org.cn/opencpop/), [OpenSinger](https://github.com/Multi-Singer/Multi-Singer.github.io), [SVCC](http://vc-challenge.org/), [VCTK](https://datashare.ed.ac.uk/handle/10283/3443), and more. The supported dataset list can be seen [here](egs/datasets/README.md) (updating).
+ - Amphion (exclusively) supports the [**Emilia**](preprocessors/Emilia/README.md) dataset and its preprocessing pipeline **Emilia-Pipe** for in-the-wild speech data!
+
+ ### Visualization
+
+ Amphion provides visualization tools to interactively illustrate the internal processing mechanism of classic models. This provides an invaluable resource for educational purposes and for facilitating understandable research.
+
+ Currently, Amphion supports [SingVisio](egs/visualization/SingVisio/README.md), a visualization tool of the diffusion model for singing voice conversion. [![arXiv](https://img.shields.io/badge/arXiv-Paper-COLOR.svg)](https://arxiv.org/abs/2402.12660) [![openxlab](https://cdn-static.openxlab.org.cn/app-center/openxlab_app.svg)](https://openxlab.org.cn/apps/detail/Amphion/SingVisio) [![Video](https://img.shields.io/badge/Video-Demo-orange)](https://github.com/open-mmlab/Amphion/assets/33707885/0a6e39e8-d5f1-4288-b0f8-32da5a2d6e96)
+
+
+ ## 📀 Installation
+
+ Amphion can be installed through either the Setup Installer or the Docker image.
+
+ ### Setup Installer
+
+ ```bash
+ git clone https://github.com/open-mmlab/Amphion.git
+ cd Amphion
+
+ # Install Python Environment
+ conda create --name amphion python=3.9.15
+ conda activate amphion
+
+ # Install Python Packages Dependencies
+ sh env.sh
+ ```
+
+ ### Docker Image
+
+ 1. Install [Docker](https://docs.docker.com/get-docker/), [NVIDIA Driver](https://www.nvidia.com/download/index.aspx), [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html), and [CUDA](https://developer.nvidia.com/cuda-downloads).
+
+ 2. Run the following commands:
+ ```bash
+ git clone https://github.com/open-mmlab/Amphion.git
+ cd Amphion
+
+ docker pull realamphion/amphion
+ docker run --runtime=nvidia --gpus all -it -v .:/app realamphion/amphion
+ ```
+ Mounting the dataset with the `-v` argument is necessary when using Docker. Please refer to [Mount dataset in Docker container](egs/datasets/docker.md) and [Docker Docs](https://docs.docker.com/engine/reference/commandline/container_run/#volume) for more details.
+
+
+ ## 🐍 Usage in Python
+
+ We detail the instructions for different tasks in the following recipes:
+
+ - [Text to Speech (TTS)](egs/tts/README.md)
+ - [Singing Voice Conversion (SVC)](egs/svc/README.md)
+ - [Text to Audio (TTA)](egs/tta/README.md)
+ - [Vocoder](egs/vocoder/README.md)
+ - [Evaluation](egs/metrics/README.md)
+ - [Visualization](egs/visualization/README.md)
+
+ ## 👨‍💻 Contributing
+ We appreciate all contributions to improve Amphion. Please refer to [CONTRIBUTING.md](.github/CONTRIBUTING.md) for the contributing guidelines.
+
+ ## 🙏 Acknowledgement
+
+
+ - [ming024's FastSpeech2](https://github.com/ming024/FastSpeech2) and [jaywalnut310's VITS](https://github.com/jaywalnut310/vits) for model architecture code.
+ - [lifeiteng's VALL-E](https://github.com/lifeiteng/vall-e) for training pipeline and model architecture design.
+ - [SpeechTokenizer](https://github.com/ZhangXInFD/SpeechTokenizer) for semantic-distilled tokenizer design.
+ - [WeNet](https://github.com/wenet-e2e/wenet), [Whisper](https://github.com/openai/whisper), [ContentVec](https://github.com/auspicious3000/contentvec), and [RawNet3](https://github.com/Jungjee/RawNet) for pretrained models and inference code.
+ - [HiFi-GAN](https://github.com/jik876/hifi-gan) for GAN-based Vocoder's architecture design and training strategy.
+ - [Encodec](https://github.com/facebookresearch/encodec) for well-organized GAN Discriminator's architecture and basic blocks.
+ - [Latent Diffusion](https://github.com/CompVis/latent-diffusion) for model architecture design.
+ - [TensorFlowTTS](https://github.com/TensorSpeech/TensorFlowTTS) for preparing the MFA tools.
+
+
+ ## ©️ License
+
+ Amphion is under the [MIT License](LICENSE). It is free for both research and commercial use cases.
+
+ ## 📚 Citations
+
+ ```bibtex
+ @article{zhang2023amphion,
+   title={Amphion: An Open-Source Audio, Music and Speech Generation Toolkit},
+   author={Xueyao Zhang and Liumeng Xue and Yicheng Gu and Yuancheng Wang and Haorui He and Chaoren Wang and Xi Chen and Zihao Fang and Haopeng Chen and Junan Zhang and Tze Ying Tang and Lexiao Zou and Mingxuan Wang and Jun Han and Kai Chen and Haizhou Li and Zhizheng Wu},
+   journal={arXiv},
+   year={2024},
+   volume={abs/2312.09911}
+ }
+ ```
bins/calc_metrics.py ADDED
@@ -0,0 +1,268 @@
+ # Copyright (c) 2023 Amphion.
+ #
+ # This source code is licensed under the MIT license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import os
+ import sys
+ import numpy as np
+ import json
+ import argparse
+ import whisper
+ import torch
+
+ from glob import glob
+ from tqdm import tqdm
+ from collections import defaultdict
+
+
+ from evaluation.metrics.energy.energy_rmse import extract_energy_rmse
+ from evaluation.metrics.energy.energy_pearson_coefficients import (
+     extract_energy_pearson_coeffcients,
+ )
+ from evaluation.metrics.f0.f0_pearson_coefficients import extract_fpc
+ from evaluation.metrics.f0.f0_periodicity_rmse import extract_f0_periodicity_rmse
+ from evaluation.metrics.f0.f0_rmse import extract_f0rmse
+ from evaluation.metrics.f0.v_uv_f1 import extract_f1_v_uv
+ from evaluation.metrics.intelligibility.character_error_rate import extract_cer
+ from evaluation.metrics.intelligibility.word_error_rate import extract_wer
+ from evaluation.metrics.similarity.speaker_similarity import extract_similarity
+ from evaluation.metrics.spectrogram.frechet_distance import extract_fad
+ from evaluation.metrics.spectrogram.mel_cepstral_distortion import extract_mcd
+ from evaluation.metrics.spectrogram.multi_resolution_stft_distance import extract_mstft
+ from evaluation.metrics.spectrogram.pesq import extract_pesq
+ from evaluation.metrics.spectrogram.scale_invariant_signal_to_distortion_ratio import (
+     extract_si_sdr,
+ )
+ from evaluation.metrics.spectrogram.scale_invariant_signal_to_noise_ratio import (
+     extract_si_snr,
+ )
+ from evaluation.metrics.spectrogram.short_time_objective_intelligibility import (
+     extract_stoi,
+ )
+
+ METRIC_FUNC = {
+     "energy_rmse": extract_energy_rmse,
+     "energy_pc": extract_energy_pearson_coeffcients,
+     "fpc": extract_fpc,
+     "f0_periodicity_rmse": extract_f0_periodicity_rmse,
+     "f0rmse": extract_f0rmse,
+     "v_uv_f1": extract_f1_v_uv,
+     "cer": extract_cer,
+     "wer": extract_wer,
+     "similarity": extract_similarity,
+     "fad": extract_fad,
+     "mcd": extract_mcd,
+     "mstft": extract_mstft,
+     "pesq": extract_pesq,
+     "si_sdr": extract_si_sdr,
+     "si_snr": extract_si_snr,
+     "stoi": extract_stoi,
+ }
+
+
+ def calc_metric(
+     ref_dir,
+     deg_dir,
+     dump_dir,
+     metrics,
+     **kwargs,
+ ):
+     result = defaultdict()
+
+     for metric in tqdm(metrics):
+         if metric in ["fad", "similarity"]:
+             result[metric] = str(METRIC_FUNC[metric](ref_dir, deg_dir, kwargs=kwargs))
+             continue
+
+         audios_ref = []
+         audios_deg = []
+
+         files = glob(deg_dir + "/*.wav")
+
+         for file in files:
+             audios_deg.append(file)
+             uid = file.split("/")[-1].split(".wav")[0]
+             file_gt = ref_dir + "/{}.wav".format(uid)
+             audios_ref.append(file_gt)
+
+         if metric in ["wer", "cer"] and kwargs["intelligibility_mode"] == "gt_content":
+             ltr_path = kwargs["ltr_path"]
+             tmpltrs = {}
+             with open(ltr_path, "r") as f:
+                 for line in f:
+                     paras = line.replace("\n", "").split("|")
+                     paras[1] = paras[1].replace(" ", "")
+                     paras[1] = paras[1].replace(".", "")
+                     paras[1] = paras[1].replace("'", "")
+                     paras[1] = paras[1].replace("-", "")
+                     paras[1] = paras[1].replace(",", "")
+                     paras[1] = paras[1].replace("!", "")
+                     paras[1] = paras[1].lower()
+                     tmpltrs[paras[0]] = paras[1]
+             ltrs = []
+             files = glob(ref_dir + "/*.wav")
+             for file in files:
+                 ltrs.append(tmpltrs[os.path.basename(file)])
+
+         if metric in ["v_uv_f1"]:
+             tp_total = 0
+             fp_total = 0
+             fn_total = 0
+
+             for i in tqdm(range(len(audios_ref))):
+                 audio_ref = audios_ref[i]
+                 audio_deg = audios_deg[i]
+                 tp, fp, fn = METRIC_FUNC[metric](audio_ref, audio_deg, kwargs=kwargs)
+                 tp_total += tp
+                 fp_total += fp
+                 fn_total += fn
+
+             result[metric] = str(tp_total / (tp_total + (fp_total + fn_total) / 2))
+         else:
+             scores = []
+             for i in tqdm(range(len(audios_ref))):
+                 audio_ref = audios_ref[i]
+                 audio_deg = audios_deg[i]
+
+                 if metric in ["wer", "cer"]:
+                     model = whisper.load_model("large")
+                     mode = kwargs["intelligibility_mode"]
+                     if torch.cuda.is_available():
+                         device = torch.device("cuda")
+                         model = model.to(device)
+
+                     if mode == "gt_audio":
+                         kwargs["audio_ref"] = audio_ref
+                         kwargs["audio_deg"] = audio_deg
+                         score = METRIC_FUNC[metric](
+                             model,
+                             kwargs=kwargs,
+                         )
+                     elif mode == "gt_content":
+                         kwargs["content_gt"] = ltrs[i]
+                         kwargs["audio_deg"] = audio_deg
+                         score = METRIC_FUNC[metric](
+                             model,
+                             kwargs=kwargs,
+                         )
+                 else:
+                     score = METRIC_FUNC[metric](
+                         audio_ref,
+                         audio_deg,
+                         kwargs=kwargs,
+                     )
+                 if not np.isnan(score):
+                     scores.append(score)
+
+             scores = np.array(scores)
+             result["{}".format(metric)] = str(np.mean(scores))
+
+     data = json.dumps(result, indent=4)
+
+     with open(os.path.join(dump_dir, "result.json"), "w", newline="\n") as f:
+         f.write(data)
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument(
+         "--ref_dir",
+         type=str,
+         help="Path to the reference audio folder.",
+     )
+     parser.add_argument(
+         "--deg_dir",
+         type=str,
+         help="Path to the test audio folder.",
+     )
+     parser.add_argument(
+         "--dump_dir",
+         type=str,
+         help="Path to dump the results.",
+     )
+     parser.add_argument(
+         "--metrics",
+         nargs="+",
+         help="Metrics used to evaluate.",
+     )
+     parser.add_argument(
+         "--fs",
+         type=str,
+         default="None",
+         help="(Optional) Sampling rate",
+     )
+     parser.add_argument(
+         "--align_method",
+         type=str,
+         default="dtw",
+         help="(Optional) Method for aligning feature length. ['cut', 'dtw']",
+     )
+
+     parser.add_argument(
+         "--db_scale",
+         type=str,
+         default="True",
+         help="(Optional) Whether or not to compute energy-related metrics in dB scale.",
+     )
+     parser.add_argument(
+         "--f0_subtract_mean",
+         type=str,
+         default="True",
+         help="(Optional) Whether or not to compute F0-related metrics with the mean value subtracted.",
+     )
+
+     parser.add_argument(
+         "--similarity_model",
+         type=str,
+         default="wavlm",
+         help="(Optional) The model for computing speaker similarity. ['rawnet', 'wavlm', 'resemblyzer']",
+     )
+     parser.add_argument(
+         "--similarity_mode",
+         type=str,
+         default="pairwith",
+         help="(Optional) The method of calculating similarity. 'overall' computes the speaker \
+ similarity between two folders of audio freely, while 'pairwith' computes the speaker \
+ similarity between a series of paired gt/pred audios.",
+     )
+
+     parser.add_argument(
+         "--ltr_path",
+         type=str,
+         default="None",
+         help="(Optional) Path to the transcription file. Note that the format in the transcription \
+ file is 'file name|transcription'.",
+     )
+     parser.add_argument(
+         "--intelligibility_mode",
+         type=str,
+         default="gt_audio",
+         help="(Optional) The method of calculating WER and CER. 'gt_audio' uses the recognition \
+ result of the reference audio as the target, while 'gt_content' uses the provided \
+ transcription as the target.",
+     )
+     parser.add_argument(
+         "--language",
+         type=str,
+         default="english",
+         help="(Optional) ['english', 'chinese']",
+     )
+
+     args = parser.parse_args()
+
+     calc_metric(
+         args.ref_dir,
+         args.deg_dir,
+         args.dump_dir,
+         args.metrics,
+         fs=int(args.fs) if args.fs != "None" else None,
+         method=args.align_method,
+         db_scale=True if args.db_scale == "True" else False,
+         need_mean=True if args.f0_subtract_mean == "True" else False,
+         model_name=args.similarity_model,
+         similarity_mode=args.similarity_mode,
+         ltr_path=args.ltr_path,
+         intelligibility_mode=args.intelligibility_mode,
+         language=args.language,
+     )
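A minimal usage sketch for this script, assuming it is run from the repository root with the root on `PYTHONPATH` so the `evaluation.*` imports resolve; the folder paths below are placeholders, and the metric names must be keys of `METRIC_FUNC` above:

```bash
# Illustrative only: compare generated audio against paired reference audio
# and write result.json into --dump_dir.
export PYTHONPATH=$(pwd)
python bins/calc_metrics.py \
    --ref_dir ./ref_wavs \
    --deg_dir ./pred_wavs \
    --dump_dir ./evaluation_results \
    --metrics fpc mcd pesq stoi \
    --align_method dtw
```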
bins/svc/inference.py ADDED
@@ -0,0 +1,265 @@
+ # Copyright (c) 2023 Amphion.
+ #
+ # This source code is licensed under the MIT license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import argparse
+ import os
+ import glob
+ from tqdm import tqdm
+ import json
+ import torch
+ import time
+
+ from models.svc.diffusion.diffusion_inference import DiffusionInference
+ from models.svc.comosvc.comosvc_inference import ComoSVCInference
+ from models.svc.transformer.transformer_inference import TransformerInference
+ from models.svc.vits.vits_inference import VitsInference
+ from utils.util import load_config
+ from utils.audio_slicer import split_audio, merge_segments_encodec
+ from processors import acoustic_extractor, content_extractor
+
+
+ def build_inference(args, cfg, infer_type="from_dataset"):
+     supported_inference = {
+         "DiffWaveNetSVC": DiffusionInference,
+         "DiffComoSVC": ComoSVCInference,
+         "TransformerSVC": TransformerInference,
+         "VitsSVC": VitsInference,
+     }
+
+     inference_class = supported_inference[cfg.model_type]
+     return inference_class(args, cfg, infer_type)
+
+
+ def prepare_for_audio_file(args, cfg, num_workers=1):
+     preprocess_path = cfg.preprocess.processed_dir
+     audio_name = cfg.inference.source_audio_name
+     temp_audio_dir = os.path.join(preprocess_path, audio_name)
+
+     ### eval file
+     t = time.time()
+     eval_file = prepare_source_eval_file(cfg, temp_audio_dir, audio_name)
+     args.source = eval_file
+     with open(eval_file, "r") as f:
+         metadata = json.load(f)
+     print("Prepare for meta eval data: {:.1f}s".format(time.time() - t))
+
+     ### acoustic features
+     t = time.time()
+     acoustic_extractor.extract_utt_acoustic_features_serial(
+         metadata, temp_audio_dir, cfg
+     )
+     if cfg.preprocess.use_min_max_norm_mel == True:
+         acoustic_extractor.cal_mel_min_max(
+             dataset=audio_name, output_path=preprocess_path, cfg=cfg, metadata=metadata
+         )
+     acoustic_extractor.cal_pitch_statistics_svc(
+         dataset=audio_name, output_path=preprocess_path, cfg=cfg, metadata=metadata
+     )
+     print("Prepare for acoustic features: {:.1f}s".format(time.time() - t))
+
+     ### content features
+     t = time.time()
+     content_extractor.extract_utt_content_features_dataloader(
+         cfg, metadata, num_workers
+     )
+     print("Prepare for content features: {:.1f}s".format(time.time() - t))
+     return args, cfg, temp_audio_dir
+
+
+ def merge_for_audio_segments(audio_files, args, cfg):
+     audio_name = cfg.inference.source_audio_name
+     target_singer_name = args.target_singer
+
+     merge_segments_encodec(
+         wav_files=audio_files,
+         fs=cfg.preprocess.sample_rate,
+         output_path=os.path.join(
+             args.output_dir, "{}_{}.wav".format(audio_name, target_singer_name)
+         ),
+         overlap_duration=cfg.inference.segments_overlap_duration,
+     )
+
+     for tmp_file in audio_files:
+         os.remove(tmp_file)
+
+
+ def prepare_source_eval_file(cfg, temp_audio_dir, audio_name):
+     """
+     Prepare the eval file (json) for an audio
+     """
+
+     audio_chunks_results = split_audio(
+         wav_file=cfg.inference.source_audio_path,
+         target_sr=cfg.preprocess.sample_rate,
+         output_dir=os.path.join(temp_audio_dir, "wavs"),
+         max_duration_of_segment=cfg.inference.segments_max_duration,
+         overlap_duration=cfg.inference.segments_overlap_duration,
+     )
+
+     metadata = []
+     for i, res in enumerate(audio_chunks_results):
+         res["index"] = i
+         res["Dataset"] = audio_name
+         res["Singer"] = audio_name
+         res["Uid"] = "{}_{}".format(audio_name, res["Uid"])
+         metadata.append(res)
+
+     eval_file = os.path.join(temp_audio_dir, "eval.json")
+     with open(eval_file, "w") as f:
+         json.dump(metadata, f, indent=4, ensure_ascii=False, sort_keys=True)
+
+     return eval_file
+
+
+ def cuda_relevant(deterministic=False):
+     torch.cuda.empty_cache()
+     # TF32 on Ampere and above
+     torch.backends.cuda.matmul.allow_tf32 = True
+     torch.backends.cudnn.enabled = True
+     torch.backends.cudnn.allow_tf32 = True
+     # Deterministic
+     torch.backends.cudnn.deterministic = deterministic
+     torch.backends.cudnn.benchmark = not deterministic
+     torch.use_deterministic_algorithms(deterministic)
+
+
+ def infer(args, cfg, infer_type):
+     # Build inference
+     t = time.time()
+     trainer = build_inference(args, cfg, infer_type)
+     print("Model Init: {:.1f}s".format(time.time() - t))
+
+     # Run inference
+     t = time.time()
+     output_audio_files = trainer.inference()
+     print("Model inference: {:.1f}s".format(time.time() - t))
+     return output_audio_files
+
+
+ def build_parser():
+     r"""Build argument parser for inference.py.
+     Anything else should be put in an extra config YAML file.
+     """
+
+     parser = argparse.ArgumentParser()
+     parser.add_argument(
+         "--config",
+         type=str,
+         required=True,
+         help="JSON/YAML file for configurations.",
+     )
+     parser.add_argument(
+         "--acoustics_dir",
+         type=str,
+         help="Acoustics model checkpoint directory. If a directory is given, "
+         "search for the latest checkpoint dir in the directory. If a specific "
+         "checkpoint dir is given, directly load the checkpoint.",
+     )
+     parser.add_argument(
+         "--vocoder_dir",
+         type=str,
+         required=True,
+         help="Vocoder checkpoint directory. Searching behavior is the same as "
+         "the acoustics one.",
+     )
+     parser.add_argument(
+         "--target_singer",
+         type=str,
+         required=True,
+         help="Convert to a specific singer (e.g. --target_singer singer_id).",
+     )
+     parser.add_argument(
+         "--trans_key",
+         default=0,
+         help="0: no pitch shift; autoshift: pitch shift; int: key shift.",
+     )
+     parser.add_argument(
+         "--source",
+         type=str,
+         default="source_audio",
+         help="Source audio file or directory. If a JSON file is given, "
+         "inference from dataset is applied. If a directory is given, "
+         "inference from all wav/flac/mp3 audio files in the directory is applied. "
+         "Default: inference from all wav/flac/mp3 audio files in ./source_audio",
+     )
+     parser.add_argument(
+         "--output_dir",
+         type=str,
+         default="conversion_results",
+         help="Output directory. Default: ./conversion_results",
+     )
+     parser.add_argument(
+         "--log_level",
+         type=str,
+         default="warning",
+         help="Logging level. Default: warning",
+     )
+     parser.add_argument(
+         "--keep_cache",
+         action="store_true",
+         default=True,
+         help="Keep cache files. Only applicable to inference from files.",
+     )
+     parser.add_argument(
+         "--diffusion_inference_steps",
+         type=int,
+         default=1000,
+         help="Number of inference steps. Only applicable to diffusion inference.",
+     )
+     return parser
+
+
+ def main():
+     ### Parse arguments and config
+     args = build_parser().parse_args()
+     cfg = load_config(args.config)
+
+     # CUDA settings
+     cuda_relevant()
+
+     if os.path.isdir(args.source):
+         ### Infer from file
+
+         # Get all the source audio files (.wav, .flac, .mp3)
+         source_audio_dir = args.source
+         audio_list = []
+         for suffix in ["wav", "flac", "mp3"]:
+             audio_list += glob.glob(
+                 os.path.join(source_audio_dir, "**/*.{}".format(suffix)), recursive=True
+             )
+         print("There are {} source audios: ".format(len(audio_list)))
+
+         # Infer for every file as dataset
+         output_root_path = args.output_dir
+         for audio_path in tqdm(audio_list):
+             audio_name = audio_path.split("/")[-1].split(".")[0]
+             args.output_dir = os.path.join(output_root_path, audio_name)
+             print("\n{}\nConversion for {}...\n".format("*" * 10, audio_name))
+
+             cfg.inference.source_audio_path = audio_path
+             cfg.inference.source_audio_name = audio_name
+             cfg.inference.segments_max_duration = 10.0
+             cfg.inference.segments_overlap_duration = 1.0
+
+             # Prepare metadata and features
+             args, cfg, cache_dir = prepare_for_audio_file(args, cfg)
+
+             # Infer from file
+             output_audio_files = infer(args, cfg, infer_type="from_file")
+
+             # Merge the split segments
+             merge_for_audio_segments(output_audio_files, args, cfg)
+
+             # Keep or remove caches
+             if not args.keep_cache:
+                 os.removedirs(cache_dir)
+
+     else:
+         ### Infer from dataset
+         infer(args, cfg, infer_type="from_dataset")
+
+
+ if __name__ == "__main__":
+     main()
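A hedged usage sketch for this entry point, following the parser defined above; the checkpoint directories, config path, and singer id are placeholders, not values from the commit:

```bash
# Illustrative only: convert every wav/flac/mp3 under ./source_audio to a target singer.
python bins/svc/inference.py \
    --config ckpts/svc/my_exp/args.json \
    --acoustics_dir ckpts/svc/my_exp \
    --vocoder_dir pretrained/vocoder \
    --target_singer my_target_singer_id \
    --trans_key autoshift \
    --source ./source_audio \
    --output_dir ./conversion_results
```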
bins/svc/preprocess.py ADDED
@@ -0,0 +1,183 @@
+ # Copyright (c) 2023 Amphion.
+ #
+ # This source code is licensed under the MIT license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import faulthandler
+
+ faulthandler.enable()
+
+ import os
+ import argparse
+ import json
+ from multiprocessing import cpu_count
+
+
+ from utils.util import load_config
+ from preprocessors.processor import preprocess_dataset
+ from preprocessors.metadata import cal_metadata
+ from processors import acoustic_extractor, content_extractor, data_augment
+
+
+ def extract_acoustic_features(dataset, output_path, cfg, n_workers=1):
+     """Extract acoustic features of utterances in the dataset
+
+     Args:
+         dataset (str): name of dataset, e.g. opencpop
+         output_path (str): directory that stores train, test and feature files of datasets
+         cfg (dict): dictionary that stores configurations
+         n_workers (int, optional): num of processes to extract features in parallel. Defaults to 1.
+     """
+     types = ["train", "test"] if "eval" not in dataset else ["test"]
+     metadata = []
+     dataset_output = os.path.join(output_path, dataset)
+
+     for dataset_type in types:
+         dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type))
+         with open(dataset_file, "r") as f:
+             metadata.extend(json.load(f))
+
+     # acoustic_extractor.extract_utt_acoustic_features_parallel(
+     #     metadata, dataset_output, cfg, n_workers=n_workers
+     # )
+     acoustic_extractor.extract_utt_acoustic_features_serial(
+         metadata, dataset_output, cfg
+     )
+
+
+ def extract_content_features(dataset, output_path, cfg, num_workers=1):
+     """Extract content features of utterances in the dataset
+
+     Args:
+         dataset (str): name of dataset, e.g. opencpop
+         output_path (str): directory that stores train, test and feature files of datasets
+         cfg (dict): dictionary that stores configurations
+     """
+     types = ["train", "test"] if "eval" not in dataset else ["test"]
+     metadata = []
+     for dataset_type in types:
+         dataset_output = os.path.join(output_path, dataset)
+         dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type))
+         with open(dataset_file, "r") as f:
+             metadata.extend(json.load(f))
+
+     content_extractor.extract_utt_content_features_dataloader(
+         cfg, metadata, num_workers
+     )
+
+
+ def preprocess(cfg, args):
+     """Preprocess raw data of single or multiple datasets (in cfg.dataset)
+
+     Args:
+         cfg (dict): dictionary that stores configurations
+         args (ArgumentParser): specify the configuration file and num_workers
+     """
+     # Specify the output root path to save the processed data
+     output_path = cfg.preprocess.processed_dir
+     os.makedirs(output_path, exist_ok=True)
+
+     ## Split train and test sets
+     for dataset in cfg.dataset:
+         print("Preprocess {}...".format(dataset))
+         preprocess_dataset(
+             dataset,
+             cfg.dataset_path[dataset],
+             output_path,
+             cfg.preprocess,
+             cfg.task_type,
+             is_custom_dataset=dataset in cfg.use_custom_dataset,
+         )
+
+     # Data augmentation: create new wav files with pitch shift, formant shift, equalizer, time stretch
+     try:
+         assert isinstance(
+             cfg.preprocess.data_augment, list
+         ), "Please provide a list of datasets that need to be augmented."
+         if len(cfg.preprocess.data_augment) > 0:
+             new_datasets_list = []
+             for dataset in cfg.preprocess.data_augment:
+                 new_datasets = data_augment.augment_dataset(cfg, dataset)
+                 new_datasets_list.extend(new_datasets)
+             cfg.dataset.extend(new_datasets_list)
+             print("Augmentation datasets: ", cfg.dataset)
+     except:
+         print("No Data Augmentation.")
+
+     # Dump metadata of datasets (singers, train/test durations, etc.)
+     cal_metadata(cfg)
+
+     ## Prepare the acoustic features
+     for dataset in cfg.dataset:
+         # Skip augmented datasets which do not need to extract acoustic features
+         # We will copy acoustic features from the original dataset later
+         if (
+             "pitch_shift" in dataset
+             or "formant_shift" in dataset
+             or "equalizer" in dataset
+         ):
+             continue
+         print(
+             "Extracting acoustic features for {} using {} workers ...".format(
+                 dataset, args.num_workers
+             )
+         )
+         extract_acoustic_features(dataset, output_path, cfg, args.num_workers)
+         # Calculate the statistics of acoustic features
+         if cfg.preprocess.mel_min_max_norm:
+             acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg)
+
+         if cfg.preprocess.extract_pitch:
+             acoustic_extractor.cal_pitch_statistics_svc(dataset, output_path, cfg)
+
+     # Copy acoustic features for augmented datasets by creating soft-links
+     for dataset in cfg.dataset:
+         if "pitch_shift" in dataset:
+             src_dataset = dataset.replace("_pitch_shift", "")
+             src_dataset_dir = os.path.join(output_path, src_dataset)
+         elif "formant_shift" in dataset:
+             src_dataset = dataset.replace("_formant_shift", "")
+             src_dataset_dir = os.path.join(output_path, src_dataset)
+         elif "equalizer" in dataset:
+             src_dataset = dataset.replace("_equalizer", "")
+             src_dataset_dir = os.path.join(output_path, src_dataset)
+         else:
+             continue
+         dataset_dir = os.path.join(output_path, dataset)
+         metadata = []
+         for split in ["train", "test"] if not "eval" in dataset else ["test"]:
+             metadata_file_path = os.path.join(src_dataset_dir, "{}.json".format(split))
+             with open(metadata_file_path, "r") as f:
+                 metadata.extend(json.load(f))
+         print("Copying acoustic features for {}...".format(dataset))
+         acoustic_extractor.copy_acoustic_features(
+             metadata, dataset_dir, src_dataset_dir, cfg
+         )
+         if cfg.preprocess.mel_min_max_norm:
+             acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg)
+
+         if cfg.preprocess.extract_pitch:
+             acoustic_extractor.cal_pitch_statistics(dataset, output_path, cfg)
+
+     # Prepare the content features
+     for dataset in cfg.dataset:
+         print("Extracting content features for {}...".format(dataset))
+         extract_content_features(dataset, output_path, cfg, args.num_workers)
+
+
+ def main():
+     parser = argparse.ArgumentParser()
+     parser.add_argument(
+         "--config", default="config.json", help="json files for configurations."
+     )
+     parser.add_argument("--num_workers", type=int, default=int(cpu_count()))
+     parser.add_argument("--prepare_alignment", type=bool, default=False)
+
+     args = parser.parse_args()
+     cfg = load_config(args.config)
+
+     preprocess(cfg, args)
+
+
+ if __name__ == "__main__":
+     main()
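A minimal sketch of invoking this preprocessing script from the repository root; the config path points at one of the recipe configs included in this commit and is otherwise illustrative:

```bash
# Illustrative only: split train/test, extract acoustic and content features.
# --num_workers defaults to the CPU count when omitted.
python bins/svc/preprocess.py \
    --config egs/svc/MultipleContentsSVC/exp_config.json \
    --num_workers 4
```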
bins/svc/train.py ADDED
@@ -0,0 +1,111 @@
+ # Copyright (c) 2023 Amphion.
+ #
+ # This source code is licensed under the MIT license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import argparse
+
+ import torch
+
+ from models.svc.diffusion.diffusion_trainer import DiffusionTrainer
+ from models.svc.comosvc.comosvc_trainer import ComoSVCTrainer
+ from models.svc.transformer.transformer_trainer import TransformerTrainer
+ from models.svc.vits.vits_trainer import VitsSVCTrainer
+ from utils.util import load_config
+
+
+ def build_trainer(args, cfg):
+     supported_trainer = {
+         "DiffWaveNetSVC": DiffusionTrainer,
+         "DiffComoSVC": ComoSVCTrainer,
+         "TransformerSVC": TransformerTrainer,
+         "VitsSVC": VitsSVCTrainer,
+     }
+
+     trainer_class = supported_trainer[cfg.model_type]
+     trainer = trainer_class(args, cfg)
+     return trainer
+
+
+ def cuda_relevant(deterministic=False):
+     torch.cuda.empty_cache()
+     # TF32 on Ampere and above
+     torch.backends.cuda.matmul.allow_tf32 = True
+     torch.backends.cudnn.enabled = True
+     torch.backends.cudnn.allow_tf32 = True
+     # Deterministic
+     torch.backends.cudnn.deterministic = deterministic
+     torch.backends.cudnn.benchmark = not deterministic
+     torch.use_deterministic_algorithms(deterministic)
+
+
+ def main():
+     parser = argparse.ArgumentParser()
+     parser.add_argument(
+         "--config",
+         default="config.json",
+         help="json files for configurations.",
+         required=True,
+     )
+     parser.add_argument(
+         "--exp_name",
+         type=str,
+         default="exp_name",
+         help="A specific name to note the experiment",
+         required=True,
+     )
+     parser.add_argument(
+         "--resume",
+         action="store_true",
+         help="If specified, to resume from the existing checkpoint.",
+     )
+     parser.add_argument(
+         "--resume_from_ckpt_path",
+         type=str,
+         default="",
+         help="The specific checkpoint path that you want to resume from.",
+     )
+     parser.add_argument(
+         "--resume_type",
+         type=str,
+         default="",
+         help="`resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights",
+     )
+
+     parser.add_argument(
+         "--log_level", default="warning", help="logging level (debug, info, warning)"
+     )
+     args = parser.parse_args()
+     cfg = load_config(args.config)
+
+     # Data Augmentation
+     if (
+         type(cfg.preprocess.data_augment) == list
+         and len(cfg.preprocess.data_augment) > 0
+     ):
+         new_datasets_list = []
+         for dataset in cfg.preprocess.data_augment:
+             new_datasets = [
+                 f"{dataset}_pitch_shift" if cfg.preprocess.use_pitch_shift else None,
+                 (
+                     f"{dataset}_formant_shift"
+                     if cfg.preprocess.use_formant_shift
+                     else None
+                 ),
+                 f"{dataset}_equalizer" if cfg.preprocess.use_equalizer else None,
+                 f"{dataset}_time_stretch" if cfg.preprocess.use_time_stretch else None,
+             ]
+             new_datasets_list.extend(filter(None, new_datasets))
+         cfg.dataset.extend(new_datasets_list)
+
+     # CUDA settings
+     cuda_relevant()
+
+     # Build trainer
+     trainer = build_trainer(args, cfg)
+
+     trainer.train_loop()
+
+
+ if __name__ == "__main__":
+     main()
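A hedged launch sketch for this trainer; `--config` and `--exp_name` are the required flags, and the experiment name below is a placeholder:

```bash
# Illustrative only: start SVC training with a recipe config.
python bins/svc/train.py \
    --config egs/svc/MultipleContentsSVC/exp_config.json \
    --exp_name my_svc_exp \
    --log_level info

# Resuming the same experiment from its existing checkpoint:
python bins/svc/train.py \
    --config egs/svc/MultipleContentsSVC/exp_config.json \
    --exp_name my_svc_exp \
    --resume
```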
bins/tta/inference.py ADDED
@@ -0,0 +1,94 @@
+ # Copyright (c) 2023 Amphion.
+ #
+ # This source code is licensed under the MIT license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import argparse
+ from argparse import ArgumentParser
+ import os
+
+ from models.tta.ldm.audioldm_inference import AudioLDMInference
+ from utils.util import save_config, load_model_config, load_config
+ import numpy as np
+ import torch
+
+
+ def build_inference(args, cfg):
+     supported_inference = {
+         "AudioLDM": AudioLDMInference,
+     }
+
+     inference_class = supported_inference[cfg.model_type]
+     inference = inference_class(args, cfg)
+     return inference
+
+
+ def build_parser():
+     parser = argparse.ArgumentParser()
+
+     parser.add_argument(
+         "--config",
+         type=str,
+         required=True,
+         help="JSON/YAML file for configurations.",
+     )
+     parser.add_argument(
+         "--text",
+         help="Text to be synthesized",
+         type=str,
+         default="Text to be synthesized.",
+     )
+     parser.add_argument(
+         "--checkpoint_path",
+         type=str,
+     )
+     parser.add_argument(
+         "--vocoder_path", type=str, help="Checkpoint path of the vocoder"
+     )
+     parser.add_argument(
+         "--vocoder_config_path", type=str, help="Config path of the vocoder"
+     )
+     parser.add_argument(
+         "--output_dir",
+         type=str,
+         default=None,
+         help="Output dir for saving generated results",
+     )
+     parser.add_argument(
+         "--num_steps",
+         type=int,
+         default=200,
+         help="The total number of denoising steps",
+     )
+     parser.add_argument(
+         "--guidance_scale",
+         type=float,
+         default=4.0,
+         help="The scale of classifier-free guidance",
+     )
+     parser.add_argument("--local_rank", default=-1, type=int)
+     return parser
+
+
+ def main():
+     # Parse arguments
+     args = build_parser().parse_args()
+     # args, infer_type = formulate_parser(args)
+
+     # Parse config
+     cfg = load_config(args.config)
+     if torch.cuda.is_available():
+         args.local_rank = torch.device("cuda")
+     else:
+         args.local_rank = torch.device("cpu")
+     print("args: ", args)
+
+     # Build inference
+     inferencer = build_inference(args, cfg)
+
+     # Run inference
+     inferencer.inference()
+
+
+ if __name__ == "__main__":
+     main()
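A minimal sketch of a TTA generation call with this script; the checkpoint, vocoder, and output paths are placeholders and should point at a trained AudioLDM experiment and its autoencoder/vocoder:

```bash
# Illustrative only: synthesize audio from a text prompt with a trained AudioLDM model.
python bins/tta/inference.py \
    --config ckpts/tta/audioldm_exp/args.json \
    --checkpoint_path ckpts/tta/audioldm_exp/checkpoint \
    --vocoder_path ckpts/tta/autoencoder_exp/checkpoint \
    --vocoder_config_path ckpts/tta/autoencoder_exp/args.json \
    --text "A dog barking in the distance" \
    --num_steps 200 \
    --guidance_scale 4.0 \
    --output_dir ./tta_results
```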
bins/tta/preprocess.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import faulthandler
7
+
8
+ faulthandler.enable()
9
+
10
+ import os
11
+ import argparse
12
+ import json
13
+ import pyworld as pw
14
+ from multiprocessing import cpu_count
15
+
16
+
17
+ from utils.util import load_config
18
+ from preprocessors.processor import preprocess_dataset, prepare_align
19
+ from preprocessors.metadata import cal_metadata
20
+ from processors import acoustic_extractor, content_extractor, data_augment
21
+
22
+
23
+ def extract_acoustic_features(dataset, output_path, cfg, n_workers=1):
24
+ """Extract acoustic features of utterances in the dataset
25
+
26
+ Args:
27
+ dataset (str): name of dataset, e.g. opencpop
28
+ output_path (str): directory that stores train, test and feature files of datasets
29
+ cfg (dict): dictionary that stores configurations
30
+ n_workers (int, optional): num of processes to extract features in parallel. Defaults to 1.
31
+ """
32
+ types = ["train", "test"] if "eval" not in dataset else ["test"]
33
+ metadata = []
34
+ for dataset_type in types:
35
+ dataset_output = os.path.join(output_path, dataset)
36
+ dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type))
37
+ with open(dataset_file, "r") as f:
38
+ metadata.extend(json.load(f))
39
+
40
+ # acoustic_extractor.extract_utt_acoustic_features_parallel(
41
+ # metadata, dataset_output, cfg, n_workers=n_workers
42
+ # )
43
+ acoustic_extractor.extract_utt_acoustic_features_serial(
44
+ metadata, dataset_output, cfg
45
+ )
46
+
47
+
48
+ def extract_content_features(dataset, output_path, cfg, num_workers=1):
49
+ """Extract content features of utterances in the dataset
50
+
51
+ Args:
52
+ dataset (str): name of dataset, e.g. opencpop
53
+ output_path (str): directory that stores train, test and feature files of datasets
54
+ cfg (dict): dictionary that stores configurations
55
+ """
56
+ types = ["train", "test"] if "eval" not in dataset else ["test"]
57
+ metadata = []
58
+ for dataset_type in types:
59
+ dataset_output = os.path.join(output_path, dataset)
60
+ dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type))
61
+ with open(dataset_file, "r") as f:
62
+ metadata.extend(json.load(f))
63
+
64
+ content_extractor.extract_utt_content_features_dataloader(
65
+ cfg, metadata, num_workers
66
+ )
67
+
68
+
69
+ def preprocess(cfg, args):
70
+ """Proprocess raw data of single or multiple datasets (in cfg.dataset)
71
+
72
+ Args:
73
+ cfg (dict): dictionary that stores configurations
74
+ args (ArgumentParser): specify the configuration file and num_workers
75
+ """
76
+ # Specify the output root path to save the processed data
77
+ output_path = cfg.preprocess.processed_dir
78
+ os.makedirs(output_path, exist_ok=True)
79
+
80
+ ## Split train and test sets
81
+ for dataset in cfg.dataset:
82
+ print("Preprocess {}...".format(dataset))
83
+
84
+ if args.prepare_alignment:
85
+ ## Prepare alignment with MFA
86
+ print("Prepare alignment {}...".format(dataset))
87
+ prepare_align(
88
+ dataset, cfg.dataset_path[dataset], cfg.preprocess, output_path
89
+ )
90
+ preprocess_dataset(
91
+ dataset,
92
+ cfg.dataset_path[dataset],
93
+ output_path,
94
+ cfg.preprocess,
95
+ cfg.task_type,
96
+ is_custom_dataset=dataset in cfg.use_custom_dataset,
97
+ )
98
+
99
+ # Data augmentation: create new wav files with pitch shift, formant shift, equalizer, time stretch
100
+ try:
101
+ assert isinstance(
102
+ cfg.preprocess.data_augment, list
103
+ ), "Please provide a list of datasets need to be augmented."
104
+ if len(cfg.preprocess.data_augment) > 0:
105
+ new_datasets_list = []
106
+ for dataset in cfg.preprocess.data_augment:
107
+ new_datasets = data_augment.augment_dataset(cfg, dataset)
108
+ new_datasets_list.extend(new_datasets)
109
+ cfg.dataset.extend(new_datasets_list)
110
+ print("Augmentation datasets: ", cfg.dataset)
111
+ except:
112
+ print("No Data Augmentation.")
113
+
114
+ # Dump metadata of datasets (singers, train/test durations, etc.)
115
+ cal_metadata(cfg)
116
+
117
+ ## Prepare the acoustic features
118
+ for dataset in cfg.dataset:
119
+ # Skip augmented datasets which do not need to extract acoustic features
120
+ # We will copy acoustic features from the original dataset later
121
+ if (
122
+ "pitch_shift" in dataset
123
+ or "formant_shift" in dataset
124
+ or "equalizer" in dataset in dataset
125
+ ):
126
+ continue
127
+ print(
128
+ "Extracting acoustic features for {} using {} workers ...".format(
129
+ dataset, args.num_workers
130
+ )
131
+ )
132
+ extract_acoustic_features(dataset, output_path, cfg, args.num_workers)
133
+ # Calculate the statistics of acoustic features
134
+ if cfg.preprocess.mel_min_max_norm:
135
+ acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg)
136
+
137
+ if cfg.preprocess.extract_pitch:
138
+ acoustic_extractor.cal_pitch_statistics(dataset, output_path, cfg)
139
+ if cfg.preprocess.extract_energy:
140
+ acoustic_extractor.cal_energy_statistics(dataset, output_path, cfg)
141
+
142
+ if cfg.preprocess.align_mel_duration:
143
+ acoustic_extractor.align_duration_mel(dataset, output_path, cfg)
144
+
145
+ # Copy acoustic features for augmented datasets by creating soft-links
146
+ for dataset in cfg.dataset:
147
+ if "pitch_shift" in dataset:
148
+ src_dataset = dataset.replace("_pitch_shift", "")
149
+ src_dataset_dir = os.path.join(output_path, src_dataset)
150
+ elif "formant_shift" in dataset:
151
+ src_dataset = dataset.replace("_formant_shift", "")
152
+ src_dataset_dir = os.path.join(output_path, src_dataset)
153
+ elif "equalizer" in dataset:
154
+ src_dataset = dataset.replace("_equalizer", "")
155
+ src_dataset_dir = os.path.join(output_path, src_dataset)
156
+ else:
157
+ continue
158
+ dataset_dir = os.path.join(output_path, dataset)
159
+ metadata = []
160
+ for split in ["train", "test"] if not "eval" in dataset else ["test"]:
161
+ metadata_file_path = os.path.join(src_dataset_dir, "{}.json".format(split))
162
+ with open(metadata_file_path, "r") as f:
163
+ metadata.extend(json.load(f))
164
+ print("Copying acoustic features for {}...".format(dataset))
165
+ acoustic_extractor.copy_acoustic_features(
166
+ metadata, dataset_dir, src_dataset_dir, cfg
167
+ )
168
+ if cfg.preprocess.mel_min_max_norm:
169
+ acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg)
170
+
171
+ if cfg.preprocess.extract_pitch:
172
+ acoustic_extractor.cal_pitch_statistics(dataset, output_path, cfg)
173
+
174
+ # Prepare the content features
175
+ for dataset in cfg.dataset:
176
+ print("Extracting content features for {}...".format(dataset))
177
+ extract_content_features(dataset, output_path, cfg, args.num_workers)
178
+
179
+
180
+ def main():
181
+ parser = argparse.ArgumentParser()
182
+ parser.add_argument(
183
+ "--config", default="config.json", help="json files for configurations."
184
+ )
185
+ parser.add_argument("--num_workers", type=int, default=int(cpu_count()))
186
+ parser.add_argument("--prepare_alignment", type=bool, default=False)
187
+
188
+ args = parser.parse_args()
189
+ cfg = load_config(args.config)
190
+
191
+ preprocess(cfg, args)
192
+
193
+
194
+ if __name__ == "__main__":
195
+ main()
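Note on the mel statistics used above: when `mel_min_max_norm` is enabled, `cal_mel_min_max` stores per-dataset min/max values that downstream code can use to rescale mel features. The snippet below is only a generic min-max scaling sketch (names and the `(frames, n_mel)` layout are assumptions, not Amphion's exact routine):

import numpy as np

def min_max_normalize(mel, mel_min, mel_max, eps=1e-8):
    # Scale each mel bin into roughly [0, 1] using dataset-level statistics.
    return (mel - mel_min) / (mel_max - mel_min + eps)

mel = np.random.randn(100, 80)  # hypothetical (frames, n_mel) feature
norm = min_max_normalize(mel, mel.min(axis=0), mel.max(axis=0))
print(norm.min(), norm.max())   # ~0.0, ~1.0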
bins/tta/train_tta.py ADDED
@@ -0,0 +1,77 @@
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import argparse
7
+ import os
8
+ import torch
9
+
10
+ from models.tta.autoencoder.autoencoder_trainer import AutoencoderKLTrainer
11
+ from models.tta.ldm.audioldm_trainer import AudioLDMTrainer
12
+ from utils.util import load_config
13
+
14
+
15
+ def build_trainer(args, cfg):
16
+ supported_trainer = {
17
+ "AutoencoderKL": AutoencoderKLTrainer,
18
+ "AudioLDM": AudioLDMTrainer,
19
+ }
20
+
21
+ trainer_class = supported_trainer[cfg.model_type]
22
+ trainer = trainer_class(args, cfg)
23
+ return trainer
24
+
25
+
26
+ def main():
27
+ parser = argparse.ArgumentParser()
28
+ parser.add_argument(
29
+ "--config",
30
+ default="config.json",
31
+ help="json files for configurations.",
32
+ required=True,
33
+ )
34
+ parser.add_argument(
35
+ "--num_workers", type=int, default=6, help="Number of dataloader workers."
36
+ )
37
+ parser.add_argument(
38
+ "--exp_name",
39
+ type=str,
40
+ default="exp_name",
41
+ help="A specific name to note the experiment",
42
+ required=True,
43
+ )
44
+ parser.add_argument(
45
+ "--resume",
46
+ type=str,
47
+ default=None,
48
+ # action="store_true",
49
+ help="The model name to restore",
50
+ )
51
+ parser.add_argument(
52
+ "--log_level", default="info", help="logging level (info, debug, warning)"
53
+ )
54
+ parser.add_argument("--stdout_interval", default=5, type=int)
55
+ parser.add_argument("--local_rank", default=-1, type=int)
56
+ args = parser.parse_args()
57
+ cfg = load_config(args.config)
58
+ cfg.exp_name = args.exp_name
59
+
60
+ # Model saving dir
61
+ args.log_dir = os.path.join(cfg.log_dir, args.exp_name)
62
+ os.makedirs(args.log_dir, exist_ok=True)
63
+
64
+ if not cfg.train.ddp:
65
+ args.local_rank = torch.device("cuda")
66
+
67
+ # Build trainer
68
+ trainer = build_trainer(args, cfg)
69
+
70
+ # Restore models
71
+ if args.resume:
72
+ trainer.restore()
73
+ trainer.train()
74
+
75
+
76
+ if __name__ == "__main__":
77
+ main()
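The `build_trainer` function above dispatches on `cfg.model_type` through a plain dictionary of trainer classes. A minimal, self-contained sketch of that registry pattern is shown below; `MyNewTrainer` and the `"MyNewModel"` key are hypothetical placeholders, not classes that exist in this codebase:

# Hypothetical sketch of the trainer-registry pattern used above.
class MyNewTrainer:
    def __init__(self, args, cfg):
        self.args, self.cfg = args, cfg

    def train(self):
        print("training", self.cfg.model_type)

SUPPORTED_TRAINER = {"MyNewModel": MyNewTrainer}

def build_trainer_sketch(args, cfg):
    # A KeyError here means cfg.model_type has not been registered.
    return SUPPORTED_TRAINER[cfg.model_type](args, cfg)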
bins/tts/inference.py ADDED
@@ -0,0 +1,169 @@
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import argparse
7
+ from argparse import ArgumentParser
8
+ import os
9
+
10
+ from models.tts.fastspeech2.fs2_inference import FastSpeech2Inference
11
+ from models.tts.vits.vits_inference import VitsInference
12
+ from models.tts.valle.valle_inference import VALLEInference
13
+ from models.tts.naturalspeech2.ns2_inference import NS2Inference
14
+ from models.tts.jets.jets_inference import JetsInference
15
+ from utils.util import load_config
16
+ import torch
17
+
18
+
19
+ def build_inference(args, cfg):
20
+ supported_inference = {
21
+ "FastSpeech2": FastSpeech2Inference,
22
+ "VITS": VitsInference,
23
+ "VALLE": VALLEInference,
24
+ "NaturalSpeech2": NS2Inference,
25
+ "Jets": JetsInference,
26
+ }
27
+
28
+ inference_class = supported_inference[cfg.model_type]
29
+ inference = inference_class(args, cfg)
30
+ return inference
31
+
32
+
33
+ def cuda_relevant(deterministic=False):
34
+ torch.cuda.empty_cache()
35
+ # TF32 on Ampere and above
36
+ torch.backends.cuda.matmul.allow_tf32 = True
37
+ torch.backends.cudnn.enabled = True
38
+ torch.backends.cudnn.allow_tf32 = True
39
+ # Deterministic
40
+ torch.backends.cudnn.deterministic = deterministic
41
+ torch.backends.cudnn.benchmark = not deterministic
42
+ torch.use_deterministic_algorithms(deterministic)
43
+
44
+
45
+ def build_parser():
46
+ parser = argparse.ArgumentParser()
47
+
48
+ parser.add_argument(
49
+ "--config",
50
+ type=str,
51
+ required=True,
52
+ help="JSON/YAML file for configurations.",
53
+ )
54
+ parser.add_argument(
55
+ "--dataset",
56
+ type=str,
57
+ help="convert from the source data",
58
+ default=None,
59
+ )
60
+ parser.add_argument(
61
+ "--testing_set",
62
+ type=str,
63
+ help="train, test, golden_test",
64
+ default="test",
65
+ )
66
+ parser.add_argument(
67
+ "--test_list_file",
68
+ type=str,
69
+ help="convert from the test list file",
70
+ default=None,
71
+ )
72
+ parser.add_argument(
73
+ "--speaker_name",
74
+ type=str,
75
+ default=None,
76
+ help="speaker name for multi-speaker synthesis, for single-sentence mode only",
77
+ )
78
+ parser.add_argument(
79
+ "--text",
80
+ help="Text to be synthesized.",
81
+ type=str,
82
+ default="",
83
+ )
84
+ parser.add_argument(
85
+ "--vocoder_dir",
86
+ type=str,
87
+ default=None,
88
+ help="Vocoder checkpoint directory. Searching behavior is the same as "
89
+ "the acoustics one.",
90
+ )
91
+ parser.add_argument(
92
+ "--acoustics_dir",
93
+ type=str,
94
+ default=None,
95
+ help="Acoustic model checkpoint directory. If a directory is given, "
96
+ "search for the latest checkpoint dir in the directory. If a specific "
97
+ "checkpoint dir is given, directly load the checkpoint.",
98
+ )
99
+ parser.add_argument(
100
+ "--checkpoint_path",
101
+ type=str,
102
+ default=None,
103
+ help="Acoustic model checkpoint directory. If a directory is given, "
104
+ "search for the latest checkpoint dir in the directory. If a specific "
105
+ "checkpoint dir is given, directly load the checkpoint.",
106
+ )
107
+ parser.add_argument(
108
+ "--mode",
109
+ type=str,
110
+ choices=["batch", "single"],
111
+ required=True,
112
+ help="Synthesize a whole dataset or a single sentence",
113
+ )
114
+ parser.add_argument(
115
+ "--log_level",
116
+ type=str,
117
+ default="warning",
118
+ help="Logging level. Default: warning",
119
+ )
120
+ parser.add_argument(
121
+ "--pitch_control",
122
+ type=float,
123
+ default=1.0,
124
+ help="control the pitch of the whole utterance, larger value for higher pitch",
125
+ )
126
+ parser.add_argument(
127
+ "--energy_control",
128
+ type=float,
129
+ default=1.0,
130
+ help="control the energy of the whole utterance, larger value for larger volume",
131
+ )
132
+ parser.add_argument(
133
+ "--duration_control",
134
+ type=float,
135
+ default=1.0,
136
+ help="control the speed of the whole utterance, larger value for slower speaking rate",
137
+ )
138
+ parser.add_argument(
139
+ "--output_dir",
140
+ type=str,
141
+ default=None,
142
+ help="Output dir for saving generated results",
143
+ )
144
+ return parser
145
+
146
+
147
+ def main():
148
+ # Parse arguments
149
+ parser = build_parser()
150
+ VALLEInference.add_arguments(parser)
151
+ NS2Inference.add_arguments(parser)
152
+ args = parser.parse_args()
153
+ print(args)
154
+
155
+ # Parse config
156
+ cfg = load_config(args.config)
157
+
158
+ # CUDA settings
159
+ cuda_relevant()
160
+
161
+ # Build inference
162
+ inferencer = build_inference(args, cfg)
163
+
164
+ # Run inference
165
+ inferencer.inference()
166
+
167
+
168
+ if __name__ == "__main__":
169
+ main()
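In `main()` above, model-specific flags are attached by calling `add_arguments` on the inference classes before `parse_args`. The following self-contained sketch illustrates that composition pattern only; `DemoInference` and `--demo_option` are invented placeholders, not part of Amphion:

import argparse

class DemoInference:
    # Hypothetical stand-in for VALLEInference / NS2Inference above.
    @staticmethod
    def add_arguments(parser):
        parser.add_argument("--demo_option", type=int, default=1)

parser = argparse.ArgumentParser()
parser.add_argument("--text", type=str, default="")
DemoInference.add_arguments(parser)  # model-specific flags appended here
args = parser.parse_args(["--text", "hello", "--demo_option", "3"])
print(args.demo_option)  # 3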
bins/tts/preprocess.py ADDED
@@ -0,0 +1,244 @@
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import faulthandler
7
+
8
+ faulthandler.enable()
9
+
10
+ import os
11
+ import argparse
12
+ import json
13
+ import pyworld as pw
14
+ from multiprocessing import cpu_count
15
+
16
+
17
+ from utils.util import load_config
18
+ from preprocessors.processor import preprocess_dataset, prepare_align
19
+ from preprocessors.metadata import cal_metadata
20
+ from processors import (
21
+ acoustic_extractor,
22
+ content_extractor,
23
+ data_augment,
24
+ phone_extractor,
25
+ )
26
+
27
+
28
+ def extract_acoustic_features(dataset, output_path, cfg, dataset_types, n_workers=1):
29
+ """Extract acoustic features of utterances in the dataset
30
+
31
+ Args:
32
+ dataset (str): name of dataset, e.g. opencpop
33
+ output_path (str): directory that stores train, test and feature files of datasets
34
+ cfg (dict): dictionary that stores configurations
35
+ n_workers (int, optional): num of processes to extract features in parallel. Defaults to 1.
36
+ """
37
+
38
+ metadata = []
39
+ for dataset_type in dataset_types:
40
+ dataset_output = os.path.join(output_path, dataset)
41
+ dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type))
42
+ with open(dataset_file, "r") as f:
43
+ metadata.extend(json.load(f))
44
+
45
+ # acoustic_extractor.extract_utt_acoustic_features_parallel(
46
+ # metadata, dataset_output, cfg, n_workers=n_workers
47
+ # )
48
+ acoustic_extractor.extract_utt_acoustic_features_serial(
49
+ metadata, dataset_output, cfg
50
+ )
51
+
52
+
53
+ def extract_content_features(dataset, output_path, cfg, dataset_types, num_workers=1):
54
+ """Extract content features of utterances in the dataset
55
+
56
+ Args:
57
+ dataset (str): name of dataset, e.g. opencpop
58
+ output_path (str): directory that stores train, test and feature files of datasets
59
+ cfg (dict): dictionary that stores configurations
60
+ """
61
+
62
+ metadata = []
63
+ for dataset_type in dataset_types:
64
+ dataset_output = os.path.join(output_path, dataset)
65
+ # dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type))
66
+ dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type))
67
+ with open(dataset_file, "r") as f:
68
+ metadata.extend(json.load(f))
69
+
70
+ content_extractor.extract_utt_content_features_dataloader(
71
+ cfg, metadata, num_workers
72
+ )
73
+
74
+
75
+ def extract_phonme_sequences(dataset, output_path, cfg, dataset_types):
76
+ """Extract phoneme features of utterances in the dataset
77
+
78
+ Args:
79
+ dataset (str): name of dataset, e.g. opencpop
80
+ output_path (str): directory that stores train, test and feature files of datasets
81
+ cfg (dict): dictionary that stores configurations
82
+
83
+ """
84
+
85
+ metadata = []
86
+ for dataset_type in dataset_types:
87
+ dataset_output = os.path.join(output_path, dataset)
88
+ dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type))
89
+ with open(dataset_file, "r") as f:
90
+ metadata.extend(json.load(f))
91
+ phone_extractor.extract_utt_phone_sequence(dataset, cfg, metadata)
92
+
93
+
94
+ def preprocess(cfg, args):
95
+ """Preprocess raw data of single or multiple datasets (in cfg.dataset)
96
+
97
+ Args:
98
+ cfg (dict): dictionary that stores configurations
99
+ args (ArgumentParser): specify the configuration file and num_workers
100
+ """
101
+ # Specify the output root path to save the processed data
102
+ output_path = cfg.preprocess.processed_dir
103
+ os.makedirs(output_path, exist_ok=True)
104
+
105
+ # Split train and test sets
106
+ for dataset in cfg.dataset:
107
+ print("Preprocess {}...".format(dataset))
108
+
109
+ if args.prepare_alignment:
110
+ # Prepare alignment with MFA
111
+ print("Prepare alignment {}...".format(dataset))
112
+ prepare_align(
113
+ dataset, cfg.dataset_path[dataset], cfg.preprocess, output_path
114
+ )
115
+
116
+ preprocess_dataset(
117
+ dataset,
118
+ cfg.dataset_path[dataset],
119
+ output_path,
120
+ cfg.preprocess,
121
+ cfg.task_type,
122
+ is_custom_dataset=dataset in cfg.use_custom_dataset,
123
+ )
124
+
125
+ # Data augmentation: create new wav files with pitch shift, formant shift, equalizer, time stretch
126
+ try:
127
+ assert isinstance(
128
+ cfg.preprocess.data_augment, list
129
+ ), "Please provide a list of datasets need to be augmented."
130
+ if len(cfg.preprocess.data_augment) > 0:
131
+ new_datasets_list = []
132
+ for dataset in cfg.preprocess.data_augment:
133
+ new_datasets = data_augment.augment_dataset(cfg, dataset)
134
+ new_datasets_list.extend(new_datasets)
135
+ cfg.dataset.extend(new_datasets_list)
136
+ print("Augmentation datasets: ", cfg.dataset)
137
+ except:
138
+ print("No Data Augmentation.")
139
+
140
+ # json files
141
+ dataset_types = list()
142
+ dataset_types.append((cfg.preprocess.train_file).split(".")[0])
143
+ dataset_types.append((cfg.preprocess.valid_file).split(".")[0])
144
+ if "test" not in dataset_types:
145
+ dataset_types.append("test")
146
+ if "eval" in dataset:
147
+ dataset_types = ["test"]
148
+
149
+ # Dump metadata of datasets (singers, train/test durations, etc.)
150
+ cal_metadata(cfg, dataset_types)
151
+
152
+ # Prepare the acoustic features
153
+ for dataset in cfg.dataset:
154
+ # Skip augmented datasets which do not need to extract acoustic features
155
+ # We will copy acoustic features from the original dataset later
156
+ if (
157
+ "pitch_shift" in dataset
158
+ or "formant_shift" in dataset
159
+ or "equalizer" in dataset in dataset
160
+ ):
161
+ continue
162
+ print(
163
+ "Extracting acoustic features for {} using {} workers ...".format(
164
+ dataset, args.num_workers
165
+ )
166
+ )
167
+ extract_acoustic_features(
168
+ dataset, output_path, cfg, dataset_types, args.num_workers
169
+ )
170
+ # Calculate the statistics of acoustic features
171
+ if cfg.preprocess.mel_min_max_norm:
172
+ acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg)
173
+
174
+ if cfg.preprocess.extract_pitch:
175
+ acoustic_extractor.cal_pitch_statistics(dataset, output_path, cfg)
176
+
177
+ if cfg.preprocess.extract_energy:
178
+ acoustic_extractor.cal_energy_statistics(dataset, output_path, cfg)
179
+
180
+ if cfg.preprocess.pitch_norm:
181
+ acoustic_extractor.normalize(dataset, cfg.preprocess.pitch_dir, cfg)
182
+
183
+ if cfg.preprocess.energy_norm:
184
+ acoustic_extractor.normalize(dataset, cfg.preprocess.energy_dir, cfg)
185
+
186
+ # Copy acoustic features for augmented datasets by creating soft-links
187
+ for dataset in cfg.dataset:
188
+ if "pitch_shift" in dataset:
189
+ src_dataset = dataset.replace("_pitch_shift", "")
190
+ src_dataset_dir = os.path.join(output_path, src_dataset)
191
+ elif "formant_shift" in dataset:
192
+ src_dataset = dataset.replace("_formant_shift", "")
193
+ src_dataset_dir = os.path.join(output_path, src_dataset)
194
+ elif "equalizer" in dataset:
195
+ src_dataset = dataset.replace("_equalizer", "")
196
+ src_dataset_dir = os.path.join(output_path, src_dataset)
197
+ else:
198
+ continue
199
+ dataset_dir = os.path.join(output_path, dataset)
200
+ metadata = []
201
+ for split in ["train", "test"] if not "eval" in dataset else ["test"]:
202
+ metadata_file_path = os.path.join(src_dataset_dir, "{}.json".format(split))
203
+ with open(metadata_file_path, "r") as f:
204
+ metadata.extend(json.load(f))
205
+ print("Copying acoustic features for {}...".format(dataset))
206
+ acoustic_extractor.copy_acoustic_features(
207
+ metadata, dataset_dir, src_dataset_dir, cfg
208
+ )
209
+ if cfg.preprocess.mel_min_max_norm:
210
+ acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg)
211
+
212
+ if cfg.preprocess.extract_pitch:
213
+ acoustic_extractor.cal_pitch_statistics(dataset, output_path, cfg)
214
+
215
+ # Prepare the content features
216
+ for dataset in cfg.dataset:
217
+ print("Extracting content features for {}...".format(dataset))
218
+ extract_content_features(
219
+ dataset, output_path, cfg, dataset_types, args.num_workers
220
+ )
221
+
222
+ # Prepare the phenome squences
223
+ if cfg.preprocess.extract_phone:
224
+ for dataset in cfg.dataset:
225
+ print("Extracting phoneme sequence for {}...".format(dataset))
226
+ extract_phonme_sequences(dataset, output_path, cfg, dataset_types)
227
+
228
+
229
+ def main():
230
+ parser = argparse.ArgumentParser()
231
+ parser.add_argument(
232
+ "--config", default="config.json", help="json files for configurations."
233
+ )
234
+ parser.add_argument("--num_workers", type=int, default=int(cpu_count()))
235
+ parser.add_argument("--prepare_alignment", type=bool, default=False)
236
+
237
+ args = parser.parse_args()
238
+ cfg = load_config(args.config)
239
+
240
+ preprocess(cfg, args)
241
+
242
+
243
+ if __name__ == "__main__":
244
+ main()
bins/tts/train.py ADDED
@@ -0,0 +1,152 @@
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import argparse
7
+
8
+ import torch
9
+
10
+ from models.tts.fastspeech2.fs2_trainer import FastSpeech2Trainer
11
+ from models.tts.vits.vits_trainer import VITSTrainer
12
+ from models.tts.valle.valle_trainer import VALLETrainer
13
+ from models.tts.naturalspeech2.ns2_trainer import NS2Trainer
14
+ from models.tts.valle_v2.valle_ar_trainer import ValleARTrainer as VALLE_V2_AR
15
+ from models.tts.valle_v2.valle_nar_trainer import ValleNARTrainer as VALLE_V2_NAR
16
+ from models.tts.jets.jets_trainer import JetsTrainer
17
+
18
+ from utils.util import load_config
19
+
20
+
21
+ def build_trainer(args, cfg):
22
+ supported_trainer = {
23
+ "FastSpeech2": FastSpeech2Trainer,
24
+ "VITS": VITSTrainer,
25
+ "VALLE": VALLETrainer,
26
+ "NaturalSpeech2": NS2Trainer,
27
+ "VALLE_V2_AR": VALLE_V2_AR,
28
+ "VALLE_V2_NAR": VALLE_V2_NAR,
29
+ "Jets": JetsTrainer,
30
+ }
31
+
32
+ trainer_class = supported_trainer[cfg.model_type]
33
+ trainer = trainer_class(args, cfg)
34
+ return trainer
35
+
36
+
37
+ def cuda_relevant(deterministic=False):
38
+ torch.cuda.empty_cache()
39
+ # TF32 on Ampere and above
40
+ torch.backends.cuda.matmul.allow_tf32 = True
41
+ torch.backends.cudnn.enabled = True
42
+ torch.backends.cudnn.benchmark = False
43
+ torch.backends.cudnn.allow_tf32 = True
44
+ # Deterministic
45
+ torch.backends.cudnn.deterministic = deterministic
46
+ torch.backends.cudnn.benchmark = not deterministic
47
+ torch.use_deterministic_algorithms(deterministic)
48
+
49
+
50
+ def main():
51
+ parser = argparse.ArgumentParser()
52
+ parser.add_argument(
53
+ "--config",
54
+ default="config.json",
55
+ help="json files for configurations.",
56
+ required=True,
57
+ )
58
+ parser.add_argument(
59
+ "--seed",
60
+ type=int,
61
+ default=1234,
62
+ help="random seed",
63
+ required=False,
64
+ )
65
+ parser.add_argument(
66
+ "--exp_name",
67
+ type=str,
68
+ default="exp_name",
69
+ help="A specific name to note the experiment",
70
+ required=True,
71
+ )
72
+ parser.add_argument(
73
+ "--resume", action="store_true", help="The model name to restore"
74
+ )
75
+ parser.add_argument(
76
+ "--test", action="store_true", default=False, help="Test the model"
77
+ )
78
+ parser.add_argument(
79
+ "--log_level", default="warning", help="logging level (debug, info, warning)"
80
+ )
81
+ parser.add_argument(
82
+ "--resume_type",
83
+ type=str,
84
+ default="resume",
85
+ help="Resume training or finetuning.",
86
+ )
87
+ parser.add_argument(
88
+ "--checkpoint_path",
89
+ type=str,
90
+ default=None,
91
+ help="Checkpoint for resume training or finetuning.",
92
+ )
93
+ parser.add_argument(
94
+ "--resume_from_ckpt_path",
95
+ type=str,
96
+ default="",
97
+ help="Checkpoint for resume training or finetuning.",
98
+ )
99
+ # VALLETrainer.add_arguments(parser)
100
+ args = parser.parse_args()
101
+ cfg = load_config(args.config)
102
+
103
+ # Data Augmentation
104
+ if hasattr(cfg, "preprocess"):
105
+ if hasattr(cfg.preprocess, "data_augment"):
106
+ if (
107
+ type(cfg.preprocess.data_augment) == list
108
+ and len(cfg.preprocess.data_augment) > 0
109
+ ):
110
+ new_datasets_list = []
111
+ for dataset in cfg.preprocess.data_augment:
112
+ new_datasets = [
113
+ (
114
+ f"{dataset}_pitch_shift"
115
+ if cfg.preprocess.use_pitch_shift
116
+ else None
117
+ ),
118
+ (
119
+ f"{dataset}_formant_shift"
120
+ if cfg.preprocess.use_formant_shift
121
+ else None
122
+ ),
123
+ (
124
+ f"{dataset}_equalizer"
125
+ if cfg.preprocess.use_equalizer
126
+ else None
127
+ ),
128
+ (
129
+ f"{dataset}_time_stretch"
130
+ if cfg.preprocess.use_time_stretch
131
+ else None
132
+ ),
133
+ ]
134
+ new_datasets_list.extend(filter(None, new_datasets))
135
+ cfg.dataset.extend(new_datasets_list)
136
+
137
+ print("experiment name: ", args.exp_name)
138
+ # # CUDA settings
139
+ cuda_relevant()
140
+
141
+ # Build trainer
142
+ print(f"Building {cfg.model_type} trainer")
143
+ trainer = build_trainer(args, cfg)
144
+ print(f"Start training {cfg.model_type} model")
145
+ if args.test:
146
+ trainer.test_loop()
147
+ else:
148
+ trainer.train_loop()
149
+
150
+
151
+ if __name__ == "__main__":
152
+ main()
bins/vocoder/inference.py ADDED
@@ -0,0 +1,115 @@
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import argparse
7
+ import os
8
+
9
+ import torch
10
+
11
+ from models.vocoders.vocoder_inference import VocoderInference
12
+ from utils.util import load_config
13
+
14
+
15
+ def build_inference(args, cfg, infer_type="infer_from_dataset"):
16
+ supported_inference = {
17
+ "GANVocoder": VocoderInference,
18
+ "DiffusionVocoder": VocoderInference,
19
+ }
20
+
21
+ inference_class = supported_inference[cfg.model_type]
22
+ return inference_class(args, cfg, infer_type)
23
+
24
+
25
+ def cuda_relevant(deterministic=False):
26
+ torch.cuda.empty_cache()
27
+ # TF32 on Ampere and above
28
+ torch.backends.cuda.matmul.allow_tf32 = True
29
+ torch.backends.cudnn.enabled = True
30
+ torch.backends.cudnn.allow_tf32 = True
31
+ # Deterministic
32
+ torch.backends.cudnn.deterministic = deterministic
33
+ torch.backends.cudnn.benchmark = not deterministic
34
+ torch.use_deterministic_algorithms(deterministic)
35
+
36
+
37
+ def build_parser():
38
+ r"""Build argument parser for inference.py.
39
+ Anything else should be put in an extra config YAML file.
40
+ """
41
+
42
+ parser = argparse.ArgumentParser()
43
+ parser.add_argument(
44
+ "--config",
45
+ type=str,
46
+ required=True,
47
+ help="JSON/YAML file for configurations.",
48
+ )
49
+ parser.add_argument(
50
+ "--infer_mode",
51
+ type=str,
52
+ required=False,
53
+ )
54
+ parser.add_argument(
55
+ "--infer_datasets",
56
+ nargs="+",
57
+ default=None,
58
+ )
59
+ parser.add_argument(
60
+ "--feature_folder",
61
+ type=str,
62
+ default=None,
63
+ )
64
+ parser.add_argument(
65
+ "--audio_folder",
66
+ type=str,
67
+ default=None,
68
+ )
69
+ parser.add_argument(
70
+ "--vocoder_dir",
71
+ type=str,
72
+ required=True,
73
+ help="Vocoder checkpoint directory. Searching behavior is the same as "
74
+ "the acoustics one.",
75
+ )
76
+ parser.add_argument(
77
+ "--output_dir",
78
+ type=str,
79
+ default="result",
80
+ help="Output directory. Default: ./result",
81
+ )
82
+ parser.add_argument(
83
+ "--log_level",
84
+ type=str,
85
+ default="warning",
86
+ help="Logging level. Default: warning",
87
+ )
88
+ parser.add_argument(
89
+ "--keep_cache",
90
+ action="store_true",
91
+ default=False,
92
+ help="Keep cache files. Only applicable to inference from files.",
93
+ )
94
+ return parser
95
+
96
+
97
+ def main():
98
+ # Parse arguments
99
+ args = build_parser().parse_args()
100
+
101
+ # Parse config
102
+ cfg = load_config(args.config)
103
+
104
+ # CUDA settings
105
+ cuda_relevant()
106
+
107
+ # Build inference
108
+ trainer = build_inference(args, cfg, args.infer_mode)
109
+
110
+ # Run inference
111
+ trainer.inference()
112
+
113
+
114
+ if __name__ == "__main__":
115
+ main()
bins/vocoder/preprocess.py ADDED
@@ -0,0 +1,151 @@
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import faulthandler
7
+
8
+ faulthandler.enable()
9
+
10
+ import os
11
+ import argparse
12
+ import json
13
+ import pyworld as pw
14
+ from multiprocessing import cpu_count
15
+
16
+
17
+ from utils.util import load_config
18
+ from preprocessors.processor import preprocess_dataset, prepare_align
19
+ from preprocessors.metadata import cal_metadata
20
+ from processors import acoustic_extractor, content_extractor, data_augment
21
+
22
+
23
+ def extract_acoustic_features(dataset, output_path, cfg, n_workers=1):
24
+ """Extract acoustic features of utterances in the dataset
25
+
26
+ Args:
27
+ dataset (str): name of dataset, e.g. opencpop
28
+ output_path (str): directory that stores train, test and feature files of datasets
29
+ cfg (dict): dictionary that stores configurations
30
+ n_workers (int, optional): num of processes to extract features in parallel. Defaults to 1.
31
+ """
32
+ types = ["train", "test"] if "eval" not in dataset else ["test"]
33
+ metadata = []
34
+ for dataset_type in types:
35
+ dataset_output = os.path.join(output_path, dataset)
36
+ dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type))
37
+ with open(dataset_file, "r") as f:
38
+ metadata.extend(json.load(f))
39
+
40
+ acoustic_extractor.extract_utt_acoustic_features_serial(
41
+ metadata, dataset_output, cfg
42
+ )
43
+
44
+
45
+ def preprocess(cfg, args):
46
+ """Proprocess raw data of single or multiple datasets (in cfg.dataset)
47
+
48
+ Args:
49
+ cfg (dict): dictionary that stores configurations
50
+ args (ArgumentParser): specify the configuration file and num_workers
51
+ """
52
+ # Specify the output root path to save the processed data
53
+ output_path = cfg.preprocess.processed_dir
54
+ os.makedirs(output_path, exist_ok=True)
55
+
56
+ ## Split train and test sets
57
+ for dataset in cfg.dataset:
58
+ print("Preprocess {}...".format(dataset))
59
+
60
+ preprocess_dataset(
61
+ dataset,
62
+ cfg.dataset_path[dataset],
63
+ output_path,
64
+ cfg.preprocess,
65
+ cfg.task_type,
66
+ is_custom_dataset=dataset in cfg.use_custom_dataset,
67
+ )
68
+
69
+ # Data augmentation: create new wav files with pitch shift, formant shift, equalizer, time stretch
70
+ try:
71
+ assert isinstance(
72
+ cfg.preprocess.data_augment, list
73
+ ), "Please provide a list of datasets need to be augmented."
74
+ if len(cfg.preprocess.data_augment) > 0:
75
+ new_datasets_list = []
76
+ for dataset in cfg.preprocess.data_augment:
77
+ new_datasets = data_augment.augment_dataset(cfg, dataset)
78
+ new_datasets_list.extend(new_datasets)
79
+ cfg.dataset.extend(new_datasets_list)
80
+ print("Augmentation datasets: ", cfg.dataset)
81
+ except:
82
+ print("No Data Augmentation.")
83
+
84
+ # Dump metadata of datasets (singers, train/test durations, etc.)
85
+ cal_metadata(cfg)
86
+
87
+ ## Prepare the acoustic features
88
+ for dataset in cfg.dataset:
89
+ # Skip augmented datasets which do not need to extract acoustic features
90
+ # We will copy acoustic features from the original dataset later
91
+ if (
92
+ "pitch_shift" in dataset
93
+ or "formant_shift" in dataset
94
+ or "equalizer" in dataset in dataset
95
+ ):
96
+ continue
97
+ print(
98
+ "Extracting acoustic features for {} using {} workers ...".format(
99
+ dataset, args.num_workers
100
+ )
101
+ )
102
+ extract_acoustic_features(dataset, output_path, cfg, args.num_workers)
103
+ # Calculate the statistics of acoustic features
104
+ if cfg.preprocess.mel_min_max_norm:
105
+ acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg)
106
+
107
+ # Copy acoustic features for augmented datasets by creating soft-links
108
+ for dataset in cfg.dataset:
109
+ if "pitch_shift" in dataset:
110
+ src_dataset = dataset.replace("_pitch_shift", "")
111
+ src_dataset_dir = os.path.join(output_path, src_dataset)
112
+ elif "formant_shift" in dataset:
113
+ src_dataset = dataset.replace("_formant_shift", "")
114
+ src_dataset_dir = os.path.join(output_path, src_dataset)
115
+ elif "equalizer" in dataset:
116
+ src_dataset = dataset.replace("_equalizer", "")
117
+ src_dataset_dir = os.path.join(output_path, src_dataset)
118
+ else:
119
+ continue
120
+ dataset_dir = os.path.join(output_path, dataset)
121
+ metadata = []
122
+ for split in ["train", "test"] if not "eval" in dataset else ["test"]:
123
+ metadata_file_path = os.path.join(src_dataset_dir, "{}.json".format(split))
124
+ with open(metadata_file_path, "r") as f:
125
+ metadata.extend(json.load(f))
126
+ print("Copying acoustic features for {}...".format(dataset))
127
+ acoustic_extractor.copy_acoustic_features(
128
+ metadata, dataset_dir, src_dataset_dir, cfg
129
+ )
130
+ if cfg.preprocess.mel_min_max_norm:
131
+ acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg)
132
+
133
+ if cfg.preprocess.extract_pitch:
134
+ acoustic_extractor.cal_pitch_statistics(dataset, output_path, cfg)
135
+
136
+
137
+ def main():
138
+ parser = argparse.ArgumentParser()
139
+ parser.add_argument(
140
+ "--config", default="config.json", help="json files for configurations."
141
+ )
142
+ parser.add_argument("--num_workers", type=int, default=int(cpu_count()))
143
+
144
+ args = parser.parse_args()
145
+ cfg = load_config(args.config)
146
+
147
+ preprocess(cfg, args)
148
+
149
+
150
+ if __name__ == "__main__":
151
+ main()
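The comment above describes `copy_acoustic_features` as copying features for augmented datasets "by creating soft-links" back to the source dataset. The helper below is only an illustrative sketch of that idea (the `Uid` metadata field, the `.npy` suffix, and the feature directory name are assumptions, not Amphion's actual implementation):

import os

def link_features(metadata, dst_dir, src_dir, feature_dir="mels"):
    """Illustrative only: symlink per-utterance feature files instead of copying."""
    os.makedirs(os.path.join(dst_dir, feature_dir), exist_ok=True)
    for utt in metadata:
        name = "{}.npy".format(utt["Uid"])  # assumes each metadata entry carries a "Uid"
        src = os.path.join(src_dir, feature_dir, name)
        dst = os.path.join(dst_dir, feature_dir, name)
        if not os.path.exists(dst):
            os.symlink(src, dst)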
bins/vocoder/train.py ADDED
@@ -0,0 +1,93 @@
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import argparse
7
+
8
+ import torch
9
+
10
+ from models.vocoders.gan.gan_vocoder_trainer import GANVocoderTrainer
11
+ from models.vocoders.diffusion.diffusion_vocoder_trainer import DiffusionVocoderTrainer
12
+
13
+ from utils.util import load_config
14
+
15
+
16
+ def build_trainer(args, cfg):
17
+ supported_trainer = {
18
+ "GANVocoder": GANVocoderTrainer,
19
+ "DiffusionVocoder": DiffusionVocoderTrainer,
20
+ }
21
+
22
+ trainer_class = supported_trainer[cfg.model_type]
23
+ trainer = trainer_class(args, cfg)
24
+ return trainer
25
+
26
+
27
+ def cuda_relevant(deterministic=False):
28
+ torch.cuda.empty_cache()
29
+ # TF32 on Ampere and above
30
+ torch.backends.cuda.matmul.allow_tf32 = True
31
+ torch.backends.cudnn.enabled = True
32
+ torch.backends.cudnn.allow_tf32 = True
33
+ # Deterministic
34
+ torch.backends.cudnn.deterministic = deterministic
35
+ torch.backends.cudnn.benchmark = not deterministic
36
+ torch.use_deterministic_algorithms(deterministic)
37
+
38
+
39
+ def main():
40
+ parser = argparse.ArgumentParser()
41
+ parser.add_argument(
42
+ "--config",
43
+ default="config.json",
44
+ help="json files for configurations.",
45
+ required=True,
46
+ )
47
+ parser.add_argument(
48
+ "--exp_name",
49
+ type=str,
50
+ default="exp_name",
51
+ help="A specific name to note the experiment",
52
+ required=True,
53
+ )
54
+ parser.add_argument(
55
+ "--resume_type",
56
+ type=str,
57
+ help="resume for continue to train, finetune for finetuning",
58
+ )
59
+ parser.add_argument(
60
+ "--checkpoint",
61
+ type=str,
62
+ help="checkpoint to resume",
63
+ )
64
+ parser.add_argument(
65
+ "--log_level", default="warning", help="logging level (debug, info, warning)"
66
+ )
67
+ args = parser.parse_args()
68
+ cfg = load_config(args.config)
69
+
70
+ # Data Augmentation
71
+ if cfg.preprocess.data_augment:
72
+ new_datasets_list = []
73
+ for dataset in cfg.preprocess.data_augment:
74
+ new_datasets = [
75
+ # f"{dataset}_pitch_shift",
76
+ # f"{dataset}_formant_shift",
77
+ f"{dataset}_equalizer",
78
+ f"{dataset}_time_stretch",
79
+ ]
80
+ new_datasets_list.extend(new_datasets)
81
+ cfg.dataset.extend(new_datasets_list)
82
+
83
+ # CUDA settings
84
+ cuda_relevant()
85
+
86
+ # Build trainer
87
+ trainer = build_trainer(args, cfg)
88
+
89
+ trainer.train_loop()
90
+
91
+
92
+ if __name__ == "__main__":
93
+ main()
config/audioldm.json ADDED
@@ -0,0 +1,92 @@
 
 
1
+ {
2
+ "base_config": "config/base.json",
3
+ "model_type": "AudioLDM",
4
+ "task_type": "tta",
5
+ "dataset": [
6
+ "AudioCaps"
7
+ ],
8
+ "preprocess": {
9
+ // feature used for model training
10
+ "use_spkid": false,
11
+ "use_uv": false,
12
+ "use_frame_pitch": false,
13
+ "use_phone_pitch": false,
14
+ "use_frame_energy": false,
15
+ "use_phone_energy": false,
16
+ "use_mel": false,
17
+ "use_audio": false,
18
+ "use_label": false,
19
+ "use_one_hot": false,
20
+ "cond_mask_prob": 0.1
21
+ },
22
+ // model
23
+ "model": {
24
+ "audioldm": {
25
+ "image_size": 32,
26
+ "in_channels": 4,
27
+ "out_channels": 4,
28
+ "model_channels": 256,
29
+ "attention_resolutions": [
30
+ 4,
31
+ 2,
32
+ 1
33
+ ],
34
+ "num_res_blocks": 2,
35
+ "channel_mult": [
36
+ 1,
37
+ 2,
38
+ 4
39
+ ],
40
+ "num_heads": 8,
41
+ "use_spatial_transformer": true,
42
+ "transformer_depth": 1,
43
+ "context_dim": 768,
44
+ "use_checkpoint": true,
45
+ "legacy": false
46
+ },
47
+ "autoencoderkl": {
48
+ "ch": 128,
49
+ "ch_mult": [
50
+ 1,
51
+ 1,
52
+ 2,
53
+ 2,
54
+ 4
55
+ ],
56
+ "num_res_blocks": 2,
57
+ "in_channels": 1,
58
+ "z_channels": 4,
59
+ "out_ch": 1,
60
+ "double_z": true
61
+ },
62
+ "noise_scheduler": {
63
+ "num_train_timesteps": 1000,
64
+ "beta_start": 0.00085,
65
+ "beta_end": 0.012,
66
+ "beta_schedule": "scaled_linear",
67
+ "clip_sample": false,
68
+ "steps_offset": 1,
69
+ "set_alpha_to_one": false,
70
+ "skip_prk_steps": true,
71
+ "prediction_type": "epsilon"
72
+ }
73
+ },
74
+ // train
75
+ "train": {
76
+ "lronPlateau": {
77
+ "factor": 0.9,
78
+ "patience": 100,
79
+ "min_lr": 4.0e-5,
80
+ "verbose": true
81
+ },
82
+ "adam": {
83
+ "lr": 5.0e-5,
84
+ "betas": [
85
+ 0.9,
86
+ 0.999
87
+ ],
88
+ "weight_decay": 1.0e-2,
89
+ "eps": 1.0e-8
90
+ }
91
+ }
92
+ }
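The `cond_mask_prob` setting above drops the condition during training so that classifier-free guidance can be applied at inference with `--guidance_scale`. As a reminder of what that scale controls, here is the standard classifier-free guidance mix (a generic formulation, not Amphion-specific code):

import torch

def cfg_combine(noise_uncond, noise_cond, guidance_scale=4.0):
    # Standard classifier-free guidance: scale > 1 pushes the prediction
    # further toward the conditional branch.
    return noise_uncond + guidance_scale * (noise_cond - noise_uncond)

eps_u, eps_c = torch.zeros(2, 4), torch.ones(2, 4)  # toy tensors
print(cfg_combine(eps_u, eps_c).mean())  # tensor(4.)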
config/autoencoderkl.json ADDED
@@ -0,0 +1,69 @@
 
 
1
+ {
2
+ "base_config": "config/base.json",
3
+ "model_type": "AutoencoderKL",
4
+ "task_type": "tta",
5
+ "dataset": [
6
+ "AudioCaps"
7
+ ],
8
+ "preprocess": {
9
+ // feature used for model training
10
+ "use_spkid": false,
11
+ "use_uv": false,
12
+ "use_frame_pitch": false,
13
+ "use_phone_pitch": false,
14
+ "use_frame_energy": false,
15
+ "use_phone_energy": false,
16
+ "use_mel": false,
17
+ "use_audio": false,
18
+ "use_label": false,
19
+ "use_one_hot": false
20
+ },
21
+ // model
22
+ "model": {
23
+ "autoencoderkl": {
24
+ "ch": 128,
25
+ "ch_mult": [
26
+ 1,
27
+ 1,
28
+ 2,
29
+ 2,
30
+ 4
31
+ ],
32
+ "num_res_blocks": 2,
33
+ "in_channels": 1,
34
+ "z_channels": 4,
35
+ "out_ch": 1,
36
+ "double_z": true
37
+ },
38
+ "loss": {
39
+ "kl_weight": 1e-8,
40
+ "disc_weight": 0.5,
41
+ "disc_factor": 1.0,
42
+ "logvar_init": 0.0,
43
+ "min_adapt_d_weight": 0.0,
44
+ "max_adapt_d_weight": 10.0,
45
+ "disc_start": 50001,
46
+ "disc_in_channels": 1,
47
+ "disc_num_layers": 3,
48
+ "use_actnorm": false
49
+ }
50
+ },
51
+ // train
52
+ "train": {
53
+ "lronPlateau": {
54
+ "factor": 0.9,
55
+ "patience": 100,
56
+ "min_lr": 4.0e-5,
57
+ "verbose": true
58
+ },
59
+ "adam": {
60
+ "lr": 4.0e-4,
61
+ "betas": [
62
+ 0.9,
63
+ 0.999
64
+ ],
65
+ "weight_decay": 1.0e-2,
66
+ "eps": 1.0e-8
67
+ }
68
+ }
69
+ }
config/base.json ADDED
@@ -0,0 +1,185 @@
 
 
1
+ {
2
+ "supported_model_type": [
3
+ "GANVocoder",
4
+ "Fastspeech2",
5
+ "DiffSVC",
6
+ "Transformer",
7
+ "EDM",
8
+ "CD"
9
+ ],
10
+ "task_type": "",
11
+ "dataset": [],
12
+ "use_custom_dataset": [],
13
+ "preprocess": {
14
+ "phone_extractor": "espeak", // "espeak, pypinyin, pypinyin_initials_finals, lexicon"
15
+ // trim audio silence
16
+ "data_augment": false,
17
+ "trim_silence": false,
18
+ "num_silent_frames": 8,
19
+ "trim_fft_size": 512, // fft size used in trimming
20
+ "trim_hop_size": 128, // hop size used in trimming
21
+ "trim_top_db": 30, // top db used in trimming sensitive to each dataset
22
+ // acoustic features
23
+ "extract_mel": false,
24
+ "mel_extract_mode": "",
25
+ "extract_linear_spec": false,
26
+ "extract_mcep": false,
27
+ "extract_pitch": false,
28
+ "extract_acoustic_token": false,
29
+ "pitch_remove_outlier": false,
30
+ "extract_uv": false,
31
+ "pitch_norm": false,
32
+ "extract_audio": false,
33
+ "extract_label": false,
34
+ "pitch_extractor": "parselmouth", // pyin, dio, pyworld, pyreaper, parselmouth, CWT (Continuous Wavelet Transform)
35
+ "extract_energy": false,
36
+ "energy_remove_outlier": false,
37
+ "energy_norm": false,
38
+ "energy_extract_mode": "from_mel",
39
+ "extract_duration": false,
40
+ "extract_amplitude_phase": false,
41
+ "mel_min_max_norm": false,
42
+ // lingusitic features
43
+ "extract_phone": false,
44
+ "lexicon_path": "./text/lexicon/librispeech-lexicon.txt",
45
+ // content features
46
+ "extract_whisper_feature": false,
47
+ "extract_contentvec_feature": false,
48
+ "extract_mert_feature": false,
49
+ "extract_wenet_feature": false,
50
+ // Settings for data preprocessing
51
+ "n_mel": 80,
52
+ "win_size": 480,
53
+ "hop_size": 120,
54
+ "sample_rate": 24000,
55
+ "n_fft": 1024,
56
+ "fmin": 0,
57
+ "fmax": 12000,
58
+ "min_level_db": -115,
59
+ "ref_level_db": 20,
60
+ "bits": 8,
61
+ // Directory names of processed data or extracted features
62
+ "processed_dir": "processed_data",
63
+ "trimmed_wav_dir": "trimmed_wavs", // directory name of silence trimed wav
64
+ "raw_data": "raw_data",
65
+ "phone_dir": "phones",
66
+ "wav_dir": "wavs", // directory name of processed wav (such as downsampled waveform)
67
+ "audio_dir": "audios",
68
+ "log_amplitude_dir": "log_amplitudes",
69
+ "phase_dir": "phases",
70
+ "real_dir": "reals",
71
+ "imaginary_dir": "imaginarys",
72
+ "label_dir": "labels",
73
+ "linear_dir": "linears",
74
+ "mel_dir": "mels", // directory name of extraced mel features
75
+ "mcep_dir": "mcep", // directory name of extraced mcep features
76
+ "dur_dir": "durs",
77
+ "symbols_dict": "symbols.dict",
78
+ "lab_dir": "labs", // directory name of extraced label features
79
+ "wenet_dir": "wenet", // directory name of extraced wenet features
80
+ "contentvec_dir": "contentvec", // directory name of extraced wenet features
81
+ "pitch_dir": "pitches", // directory name of extraced pitch features
82
+ "energy_dir": "energys", // directory name of extracted energy features
83
+ "phone_pitch_dir": "phone_pitches", // directory name of extraced pitch features
84
+ "phone_energy_dir": "phone_energys", // directory name of extracted energy features
85
+ "uv_dir": "uvs", // directory name of extracted unvoiced features
86
+ "duration_dir": "duration", // ground-truth duration file
87
+ "phone_seq_file": "phone_seq_file", // phoneme sequence file
88
+ "file_lst": "file.lst",
89
+ "train_file": "train.json", // training set, the json file contains detailed information about the dataset, including dataset name, utterance id, duration of the utterance
90
+ "valid_file": "valid.json", // validattion set
91
+ "spk2id": "spk2id.json", // used for multi-speaker dataset
92
+ "utt2spk": "utt2spk", // used for multi-speaker dataset
93
+ "emo2id": "emo2id.json", // used for multi-emotion dataset
94
+ "utt2emo": "utt2emo", // used for multi-emotion dataset
95
+ // Features used for model training
96
+ "use_text": false,
97
+ "use_phone": false,
98
+ "use_phn_seq": false,
99
+ "use_lab": false,
100
+ "use_linear": false,
101
+ "use_mel": false,
102
+ "use_min_max_norm_mel": false,
103
+ "use_wav": false,
104
+ "use_phone_pitch": false,
105
+ "use_log_scale_pitch": false,
106
+ "use_phone_energy": false,
107
+ "use_phone_duration": false,
108
+ "use_log_scale_energy": false,
109
+ "use_wenet": false,
110
+ "use_dur": false,
111
+ "use_spkid": false, // True: use speaker id for multi-speaker dataset
112
+ "use_emoid": false, // True: use emotion id for multi-emotion dataset
113
+ "use_frame_pitch": false,
114
+ "use_uv": false,
115
+ "use_frame_energy": false,
116
+ "use_frame_duration": false,
117
+ "use_audio": false,
118
+ "use_label": false,
119
+ "use_one_hot": false,
120
+ "use_amplitude_phase": false,
121
+ "align_mel_duration": false
122
+ },
123
+ "train": {
124
+ "ddp": true,
125
+ "batch_size": 16,
126
+ "max_steps": 1000000,
127
+ // Trackers
128
+ "tracker": [
129
+ "tensorboard"
130
+ // "wandb",
131
+ // "cometml",
132
+ // "mlflow",
133
+ ],
134
+ "max_epoch": -1,
135
+ // -1 means no limit
136
+ "save_checkpoint_stride": [
137
+ 5,
138
+ 20
139
+ ],
140
+ // unit is epoch
141
+ "keep_last": [
142
+ 3,
143
+ -1
144
+ ],
145
+ // -1 means infinite, if one number will broadcast
146
+ "run_eval": [
147
+ false,
148
+ true
149
+ ],
150
+ // if one number will broadcast
151
+ // Fix the random seed
152
+ "random_seed": 10086,
153
+ // Optimizer
154
+ "optimizer": "AdamW",
155
+ "adamw": {
156
+ "lr": 4.0e-4
157
+ // nn model lr
158
+ },
159
+ // LR Scheduler
160
+ "scheduler": "ReduceLROnPlateau",
161
+ "reducelronplateau": {
162
+ "factor": 0.8,
163
+ "patience": 10,
164
+ // unit is epoch
165
+ "min_lr": 1.0e-4
166
+ },
167
+ // Batchsampler
168
+ "sampler": {
169
+ "holistic_shuffle": true,
170
+ "drop_last": true
171
+ },
172
+ // Dataloader
173
+ "dataloader": {
174
+ "num_worker": 32,
175
+ "pin_memory": true
176
+ },
177
+ "gradient_accumulation_step": 1,
178
+ "total_training_steps": 50000,
179
+ "save_summary_steps": 500,
180
+ "save_checkpoints_steps": 10000,
181
+ "valid_interval": 10000,
182
+ "keep_checkpoint_max": 5,
183
+ "multi_speaker_training": false // True: train multi-speaker model; False: training single-speaker model;
184
+ }
185
+ }
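The comments in the `train` block above say that `save_checkpoint_stride`, `keep_last`, and `run_eval` are parallel lists (a single number broadcasts). Read together, the defaults mean: keep the last 3 checkpoints saved every 5 epochs, keep every checkpoint saved every 20 epochs, and run evaluation only for the latter. A small illustrative pairing of those settings (not the trainer's actual code):

# Illustrative pairing of the parallel checkpoint settings from base.json.
save_checkpoint_stride = [5, 20]   # unit: epoch
keep_last = [3, -1]                # -1 means keep all
run_eval = [False, True]

for stride, keep, do_eval in zip(save_checkpoint_stride, keep_last, run_eval):
    print(f"every {stride} epochs: keep {'all' if keep == -1 else keep}, eval={do_eval}")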
config/comosvc.json ADDED
@@ -0,0 +1,215 @@
 
 
1
+ {
2
+ "base_config": "config/svc/base.json",
3
+ "model_type": "DiffComoSVC",
4
+ "task_type": "svc",
5
+ "preprocess": {
6
+ // data augmentations
7
+ "use_pitch_shift": false,
8
+ "use_formant_shift": false,
9
+ "use_time_stretch": false,
10
+ "use_equalizer": false,
11
+ // acoustic features
12
+ "extract_mel": true,
13
+ "mel_min_max_norm": true,
14
+ "extract_pitch": true,
15
+ "pitch_extractor": "parselmouth",
16
+ "extract_uv": true,
17
+ "extract_energy": true,
18
+ // content features
19
+ "extract_whisper_feature": false,
20
+ "whisper_sample_rate": 16000,
21
+ "extract_contentvec_feature": false,
22
+ "contentvec_sample_rate": 16000,
23
+ "extract_wenet_feature": false,
24
+ "wenet_sample_rate": 16000,
25
+ "extract_mert_feature": false,
26
+ "mert_sample_rate": 16000,
27
+ // Default config for whisper
28
+ "whisper_frameshift": 0.01,
29
+ "whisper_downsample_rate": 2,
30
+ // Default config for content vector
31
+ "contentvec_frameshift": 0.02,
32
+ // Default config for mert
33
+ "mert_model": "m-a-p/MERT-v1-330M",
34
+ "mert_feature_layer": -1,
35
+ "mert_hop_size": 320,
36
+ // 24k
37
+ "mert_frameshit": 0.01333,
38
+ // 10ms
39
+ "wenet_frameshift": 0.01,
40
+ // wenetspeech is 4, gigaspeech is 6
41
+ "wenet_downsample_rate": 4,
42
+ // Default config
43
+ "n_mel": 100,
44
+ "win_size": 1024,
45
+ // todo
46
+ "hop_size": 256,
47
+ "sample_rate": 24000,
48
+ "n_fft": 1024,
49
+ // todo
50
+ "fmin": 0,
51
+ "fmax": 12000,
52
+ // todo
53
+ "f0_min": 50,
54
+ // ~C2
55
+ "f0_max": 1100,
56
+ //1100, // ~C6(1100), ~G5(800)
57
+ "pitch_bin": 256,
58
+ "pitch_max": 1100.0,
59
+ "pitch_min": 50.0,
60
+ "is_label": true,
61
+ "is_mu_law": true,
62
+ "bits": 8,
63
+ "mel_min_max_stats_dir": "mel_min_max_stats",
64
+ "whisper_dir": "whisper",
65
+ "contentvec_dir": "contentvec",
66
+ "wenet_dir": "wenet",
67
+ "mert_dir": "mert",
68
+ // Extract content features using dataloader
69
+ "pin_memory": true,
70
+ "num_workers": 8,
71
+ "content_feature_batch_size": 16,
72
+ // Features used for model training
73
+ "use_mel": true,
74
+ "use_min_max_norm_mel": true,
75
+ "use_frame_pitch": true,
76
+ "use_uv": true,
77
+ "use_frame_energy": true,
78
+ "use_log_scale_pitch": false,
79
+ "use_log_scale_energy": false,
80
+ "use_spkid": true,
81
+ // Meta file
82
+ "train_file": "train.json",
83
+ "valid_file": "test.json",
84
+ "spk2id": "singers.json",
85
+ "utt2spk": "utt2singer"
86
+ },
87
+ "model": {
88
+ "teacher_model_path": "[Your Teacher Model Path].bin",
89
+ "condition_encoder": {
90
+ "merge_mode": "add",
91
+ "input_melody_dim": 1,
92
+ "use_log_f0": true,
93
+ "n_bins_melody": 256,
94
+ //# Quantization (0 for not quantization)
95
+ "output_melody_dim": 384,
96
+ "input_loudness_dim": 1,
97
+ "use_log_loudness": true,
98
+ "n_bins_loudness": 256,
99
+ "output_loudness_dim": 384,
100
+ "use_whisper": false,
101
+ "use_contentvec": false,
102
+ "use_wenet": false,
103
+ "use_mert": false,
104
+ "whisper_dim": 1024,
105
+ "contentvec_dim": 256,
106
+ "mert_dim": 256,
107
+ "wenet_dim": 512,
108
+ "content_encoder_dim": 384,
109
+ "output_singer_dim": 384,
110
+ "singer_table_size": 512,
111
+ "output_content_dim": 384,
112
+ "use_spkid": true
113
+ },
114
+ "comosvc": {
115
+ "distill": false,
116
+ // conformer encoder
117
+ "input_dim": 384,
118
+ "output_dim": 100,
119
+ "n_heads": 2,
120
+ "n_layers": 6,
121
+ "filter_channels": 512,
122
+ "dropout": 0.1,
123
+ // karras diffusion
124
+ "P_mean": -1.2,
125
+ "P_std": 1.2,
126
+ "sigma_data": 0.5,
127
+ "sigma_min": 0.002,
128
+ "sigma_max": 80,
129
+ "rho": 7,
130
+ "n_timesteps": 18,
131
+ },
132
+ "diffusion": {
133
+ // Diffusion steps encoder
134
+ "step_encoder": {
135
+ "dim_raw_embedding": 128,
136
+ "dim_hidden_layer": 512,
137
+ "activation": "SiLU",
138
+ "num_layer": 2,
139
+ "max_period": 10000
140
+ },
141
+ // Diffusion decoder
142
+ "model_type": "bidilconv",
143
+ // bidilconv, unet2d, TODO: unet1d
144
+ "bidilconv": {
145
+ "base_channel": 384,
146
+ "n_res_block": 20,
147
+ "conv_kernel_size": 3,
148
+ "dilation_cycle_length": 4,
149
+ // specially, 1 means no dilation
150
+ "conditioner_size": 100
151
+ }
152
+ },
153
+ },
154
+ "train": {
155
+ // Basic settings
156
+ "fast_steps": 0,
157
+ "batch_size": 64,
158
+ "gradient_accumulation_step": 1,
159
+ "max_epoch": -1,
160
+ // -1 means no limit
161
+ "save_checkpoint_stride": [
162
+ 10,
163
+ 100
164
+ ],
165
+ // unit is epoch
166
+ "keep_last": [
167
+ 3,
168
+ -1
169
+ ],
170
+ // -1 means infinite, if one number will broadcast
171
+ "run_eval": [
172
+ false,
173
+ true
174
+ ],
175
+ // if one number will broadcast
176
+ // Fix the random seed
177
+ "random_seed": 10086,
178
+ // Batchsampler
179
+ "sampler": {
180
+ "holistic_shuffle": true,
181
+ "drop_last": true
182
+ },
183
+ // Dataloader
184
+ "dataloader": {
185
+ "num_worker": 32,
186
+ "pin_memory": true
187
+ },
188
+ // Trackers
189
+ "tracker": [
190
+ "tensorboard"
191
+ // "wandb",
192
+ // "cometml",
193
+ // "mlflow",
194
+ ],
195
+ // Optimizer
196
+ "optimizer": "AdamW",
197
+ "adamw": {
198
+ "lr": 5.0e-5
199
+ // nn model lr
200
+ },
201
+ // LR Scheduler
202
+ "scheduler": "ReduceLROnPlateau",
203
+ "reducelronplateau": {
204
+ "factor": 0.8,
205
+ "patience": 10,
206
+ // unit is epoch
207
+ "min_lr": 5.0e-6
208
+ }
209
+ },
210
+ "inference": {
211
+ "comosvc": {
212
+ "inference_steps": 40
213
+ }
214
+ }
215
+ }
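The `comosvc` block above exposes Karras-style diffusion parameters (`sigma_min`, `sigma_max`, `rho`, `n_timesteps`). For reference, the standard Karras et al. (2022) noise schedule those parameters define is sketched below (a generic formula, not Amphion's exact sampler code):

import numpy as np

def karras_sigmas(n_timesteps=18, sigma_min=0.002, sigma_max=80.0, rho=7.0):
    """Standard Karras noise levels, ordered from largest to smallest."""
    ramp = np.linspace(0, 1, n_timesteps)
    min_inv, max_inv = sigma_min ** (1 / rho), sigma_max ** (1 / rho)
    return (max_inv + ramp * (min_inv - max_inv)) ** rho

print(karras_sigmas()[:3])  # starts near sigma_max = 80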
config/fs2.json ADDED
@@ -0,0 +1,120 @@
 
 
1
+ {
2
+ "base_config": "config/tts.json",
3
+ "model_type": "FastSpeech2",
4
+ "task_type": "tts",
5
+ "dataset": ["LJSpeech"],
6
+ "preprocess": {
7
+ // acoustic features
8
+ "extract_audio": true,
9
+ "extract_mel": true,
10
+ "mel_extract_mode": "taco",
11
+ "mel_min_max_norm": false,
12
+ "extract_pitch": true,
13
+ "extract_uv": false,
14
+ "pitch_extractor": "dio",
15
+ "extract_energy": true,
16
+ "energy_extract_mode": "from_tacotron_stft",
17
+ "extract_duration": true,
18
+ "use_phone": false,
19
+ "pitch_norm": true,
20
+ "energy_norm": true,
21
+ "pitch_remove_outlier": true,
22
+ "energy_remove_outlier": true,
23
+
24
+ // Default config
25
+ "n_mel": 80,
26
+ "win_size": 1024, // todo
27
+ "hop_size": 256,
28
+ "sample_rate": 22050,
29
+ "n_fft": 1024, // todo
30
+ "fmin": 0,
31
+ "fmax": 8000, // todo
32
+ "raw_data": "raw_data",
33
+ "text_cleaners": ["english_cleaners"],
34
+ "f0_min": 71, // ~C2
35
+ "f0_max": 800, //1100, // ~C6(1100), ~G5(800)
36
+ "pitch_bin": 256,
37
+ "pitch_max": 1100.0,
38
+ "pitch_min": 50.0,
39
+ "is_label": true,
40
+ "is_mu_law": true,
41
+ "bits": 8,
42
+
43
+ "mel_min_max_stats_dir": "mel_min_max_stats",
44
+ "whisper_dir": "whisper",
45
+ "content_vector_dir": "content_vector",
46
+ "wenet_dir": "wenet",
47
+ "mert_dir": "mert",
48
+ "spk2id":"spk2id.json",
49
+ "utt2spk":"utt2spk",
50
+ "valid_file": "test.json",
51
+
52
+ // Features used for model training
53
+ "use_mel": true,
54
+ "use_min_max_norm_mel": false,
55
+ "use_frame_pitch": false,
56
+ "use_frame_energy": false,
57
+ "use_phone_pitch": true,
58
+ "use_phone_energy": true,
59
+ "use_log_scale_pitch": false,
60
+ "use_log_scale_energy": false,
61
+ "use_spkid": false,
62
+ "align_mel_duration": true,
63
+ "text_cleaners": ["english_cleaners"],
64
+ "phone_extractor": "lexicon", // "espeak, pypinyin, pypinyin_initials_finals, lexicon (only for language=en-us right now)"
65
+ },
66
+ "model": {
67
+ // Settings for transformer
68
+ "transformer": {
69
+ "encoder_layer": 4,
70
+ "encoder_head": 2,
71
+ "encoder_hidden": 256,
72
+ "decoder_layer": 6,
73
+ "decoder_head": 2,
74
+ "decoder_hidden": 256,
75
+ "conv_filter_size": 1024,
76
+ "conv_kernel_size": [9, 1],
77
+ "encoder_dropout": 0.2,
78
+ "decoder_dropout": 0.2
79
+ },
80
+
81
+ // Settings for variance_predictor
82
+ "variance_predictor":{
83
+ "filter_size": 256,
84
+ "kernel_size": 3,
85
+ "dropout": 0.5
86
+ },
87
+ "variance_embedding":{
88
+ "pitch_quantization": "linear", // support 'linear' or 'log', 'log' is allowed only if the pitch values are not normalized during preprocessing
89
+ "energy_quantization": "linear", // support 'linear' or 'log', 'log' is allowed only if the energy values are not normalized during preprocessing
90
+ "n_bins": 256
91
+ },
92
+ "max_seq_len": 1000
93
+ },
94
+ "train":{
95
+ "batch_size": 16,
96
+ "max_epoch": 100,
97
+ "sort_sample": true,
98
+ "drop_last": true,
99
+ "group_size": 4,
100
+ "grad_clip_thresh": 1.0,
101
+ "dataloader": {
102
+ "num_worker": 8,
103
+ "pin_memory": true
104
+ },
105
+ "lr_scheduler":{
106
+ "num_warmup": 4000
107
+ },
108
+ // LR Scheduler
109
+ "scheduler": "NoamLR",
110
+ // Optimizer
111
+ "optimizer": "Adam",
112
+ "adam": {
113
+ "lr": 0.0625,
114
+ "betas": [0.9, 0.98],
115
+ "eps": 0.000000001,
116
+ "weight_decay": 0.0
117
+ },
118
+ }
119
+
120
+ }
config/jets.json ADDED
@@ -0,0 +1,120 @@
1
+ {
2
+ "base_config": "config/tts.json",
3
+ "model_type": "Jets",
4
+ "task_type": "tts",
5
+ "dataset": ["LJSpeech"],
6
+ "preprocess": {
7
+ // acoustic features
8
+ "extract_audio": true,
9
+ "extract_mel": true,
10
+ "mel_extract_mode": "taco",
11
+ "mel_min_max_norm": false,
12
+ "extract_pitch": true,
13
+ "extract_uv": false,
14
+ "pitch_extractor": "dio",
15
+ "extract_energy": true,
16
+ "energy_extract_mode": "from_tacotron_stft",
17
+ "extract_duration": true,
18
+ "use_phone": false,
19
+ "pitch_norm": true,
20
+ "energy_norm": true,
21
+ "pitch_remove_outlier": true,
22
+ "energy_remove_outlier": true,
23
+
24
+ // Default config
25
+ "n_mel": 80,
26
+ "win_size": 1024, // todo
27
+ "hop_size": 256,
28
+ "sample_rate": 22050,
29
+ "n_fft": 1024, // todo
30
+ "fmin": 0,
31
+ "fmax": 8000, // todo
32
+ "raw_data": "raw_data",
33
+ "text_cleaners": ["english_cleaners"],
34
+ "f0_min": 71, // ~C2
35
+ "f0_max": 800, //1100, // ~C6(1100), ~G5(800)
36
+ "pitch_bin": 256,
37
+ "pitch_max": 1100.0,
38
+ "pitch_min": 50.0,
39
+ "is_label": true,
40
+ "is_mu_law": true,
41
+ "bits": 8,
42
+
43
+ "mel_min_max_stats_dir": "mel_min_max_stats",
44
+ "whisper_dir": "whisper",
45
+ "content_vector_dir": "content_vector",
46
+ "wenet_dir": "wenet",
47
+ "mert_dir": "mert",
48
+ "spk2id":"spk2id.json",
49
+ "utt2spk":"utt2spk",
50
+ "valid_file": "test.json",
51
+
52
+ // Features used for model training
53
+ "use_mel": true,
54
+ "use_min_max_norm_mel": false,
55
+ "use_frame_pitch": true,
56
+ "use_frame_energy": true,
57
+ "use_phone_pitch": false,
58
+ "use_phone_energy": false,
59
+ "use_log_scale_pitch": false,
60
+ "use_log_scale_energy": false,
61
+ "use_spkid": false,
62
+ "align_mel_duration": true,
63
+ "text_cleaners": ["english_cleaners"],
64
+ "phone_extractor": "lexicon", // "espeak, pypinyin, pypinyin_initials_finals, lexicon (only for language=en-us right now)"
65
+ },
66
+ "model": {
67
+ // Settings for transformer
68
+ "transformer": {
69
+ "encoder_layer": 4,
70
+ "encoder_head": 2,
71
+ "encoder_hidden": 256,
72
+ "decoder_layer": 6,
73
+ "decoder_head": 2,
74
+ "decoder_hidden": 256,
75
+ "conv_filter_size": 1024,
76
+ "conv_kernel_size": [9, 1],
77
+ "encoder_dropout": 0.2,
78
+ "decoder_dropout": 0.2
79
+ },
80
+
81
+ // Settings for variance_predictor
82
+ "variance_predictor":{
83
+ "filter_size": 256,
84
+ "kernel_size": 3,
85
+ "dropout": 0.5
86
+ },
87
+ "variance_embedding":{
88
+ "pitch_quantization": "linear", // support 'linear' or 'log', 'log' is allowed only if the pitch values are not normalized during preprocessing
89
+ "energy_quantization": "linear", // support 'linear' or 'log', 'log' is allowed only if the energy values are not normalized during preprocessing
90
+ "n_bins": 256
91
+ },
92
+ "max_seq_len": 1000
93
+ },
94
+ "train":{
95
+ "batch_size": 16,
96
+ "max_epoch": 100,
97
+ "sort_sample": true,
98
+ "drop_last": true,
99
+ "group_size": 4,
100
+ "grad_clip_thresh": 1.0,
101
+ "dataloader": {
102
+ "num_worker": 8,
103
+ "pin_memory": true
104
+ },
105
+ "lr_scheduler":{
106
+ "num_warmup": 4000
107
+ },
108
+ // LR Scheduler
109
+ "scheduler": "NoamLR",
110
+ // Optimizer
111
+ "optimizer": "Adam",
112
+ "adam": {
113
+ "lr": 0.0625,
114
+ "betas": [0.9, 0.98],
115
+ "eps": 0.000000001,
116
+ "weight_decay": 0.0
117
+ },
118
+ }
119
+
120
+ }
config/ns2.json ADDED
@@ -0,0 +1,88 @@
1
+ {
2
+ "base_config": "config/base.json",
3
+ "model_type": "NaturalSpeech2",
4
+ "dataset": ["libritts"],
5
+ "preprocess": {
6
+ "use_mel": false,
7
+ "use_code": true,
8
+ "use_spkid": true,
9
+ "use_pitch": true,
10
+ "use_duration": true,
11
+ "use_phone": true,
12
+ "use_len": true,
13
+ "use_cross_reference": true,
14
+ "train_file": "train.json",
15
+ "melspec_dir": "mel",
16
+ "code_dir": "code",
17
+ "pitch_dir": "pitch",
18
+ "duration_dir": "duration",
19
+ "clip_mode": "start"
20
+ },
21
+ "model": {
22
+ "latent_dim": 128,
23
+ "prior_encoder": {
24
+ "vocab_size": 100,
25
+ "pitch_min": 50,
26
+ "pitch_max": 1100,
27
+ "pitch_bins_num": 512,
28
+ "encoder": {
29
+ "encoder_layer": 6,
30
+ "encoder_hidden": 512,
31
+ "encoder_head": 8,
32
+ "conv_filter_size": 2048,
33
+ "conv_kernel_size": 9,
34
+ "encoder_dropout": 0.2,
35
+ "use_cln": true
36
+ },
37
+ "duration_predictor": {
38
+ "input_size": 512,
39
+ "filter_size": 512,
40
+ "kernel_size": 3,
41
+ "conv_layers": 30,
42
+ "cross_attn_per_layer": 3,
43
+ "attn_head": 8,
44
+ "drop_out": 0.5
45
+ },
46
+ "pitch_predictor": {
47
+ "input_size": 512,
48
+ "filter_size": 512,
49
+ "kernel_size": 5,
50
+ "conv_layers": 30,
51
+ "cross_attn_per_layer": 3,
52
+ "attn_head": 8,
53
+ "drop_out": 0.5
54
+ }
55
+ },
56
+ "diffusion": {
57
+ "wavenet": {
58
+ "input_size": 128,
59
+ "hidden_size": 512,
60
+ "out_size": 128,
61
+ "num_layers": 40,
62
+ "cross_attn_per_layer": 3,
63
+ "dilation_cycle": 2,
64
+ "attn_head": 8,
65
+ "drop_out": 0.2
66
+ },
67
+ "beta_min": 0.05,
68
+ "beta_max": 20,
69
+ "sigma": 1.0,
70
+ "noise_factor": 1.0,
71
+ "ode_solver": "euler"
72
+ },
73
+ "prompt_encoder": {
74
+ "encoder_layer": 6,
75
+ "encoder_hidden": 512,
76
+ "encoder_head": 8,
77
+ "conv_filter_size": 2048,
78
+ "conv_kernel_size": 9,
79
+ "encoder_dropout": 0.2,
80
+ "use_cln": false
81
+ },
82
+ "query_emb": {
83
+ "query_token_num": 32,
84
+ "hidden_size": 512,
85
+ "head_num": 8
86
+ }
87
+ }
88
+ }
config/svc/base.json ADDED
@@ -0,0 +1,119 @@
1
+ {
2
+ "base_config": "config/base.json",
3
+ "task_type": "svc",
4
+ "preprocess": {
5
+ // data augmentations
6
+ "use_pitch_shift": false,
7
+ "use_formant_shift": false,
8
+ "use_time_stretch": false,
9
+ "use_equalizer": false,
10
+ // Online or offline features extraction ("offline" or "online")
11
+ "features_extraction_mode": "offline",
12
+ // acoustic features
13
+ "extract_mel": true,
14
+ "mel_min_max_norm": true,
15
+ "extract_pitch": true,
16
+ "pitch_extractor": "parselmouth",
17
+ "extract_uv": true,
18
+ "extract_energy": true,
19
+ // content features
20
+ "extract_whisper_feature": false,
21
+ "whisper_sample_rate": 16000,
22
+ "extract_contentvec_feature": false,
23
+ "contentvec_sample_rate": 16000,
24
+ "extract_wenet_feature": false,
25
+ "wenet_sample_rate": 16000,
26
+ "extract_mert_feature": false,
27
+ "mert_sample_rate": 16000,
28
+ // Default config for whisper
29
+ "whisper_frameshift": 0.01,
30
+ "whisper_downsample_rate": 2,
31
+ // Default config for content vector
32
+ "contentvec_frameshift": 0.02,
33
+ // Default config for mert
34
+ "mert_model": "m-a-p/MERT-v1-330M",
35
+ "mert_feature_layer": -1,
36
+ "mert_hop_size": 320,
37
+ // 24k
38
+ "mert_frameshit": 0.01333,
39
+ // 10ms
40
+ "wenet_frameshift": 0.01,
41
+ // wenetspeech is 4, gigaspeech is 6
42
+ "wenet_downsample_rate": 4,
43
+ // Default config
44
+ "n_mel": 100,
45
+ "win_size": 1024,
46
+ // todo
47
+ "hop_size": 256,
48
+ "sample_rate": 24000,
49
+ "n_fft": 1024,
50
+ // todo
51
+ "fmin": 0,
52
+ "fmax": 12000,
53
+ // todo
54
+ "f0_min": 50,
55
+ // ~C2
56
+ "f0_max": 1100,
57
+ //1100, // ~C6(1100), ~G5(800)
58
+ "pitch_bin": 256,
59
+ "pitch_max": 1100.0,
60
+ "pitch_min": 50.0,
61
+ "is_label": true,
62
+ "is_mu_law": true,
63
+ "bits": 8,
64
+ "mel_min_max_stats_dir": "mel_min_max_stats",
65
+ "whisper_dir": "whisper",
66
+ "contentvec_dir": "contentvec",
67
+ "wenet_dir": "wenet",
68
+ "mert_dir": "mert",
69
+ // Extract content features using dataloader
70
+ "pin_memory": true,
71
+ "num_workers": 8,
72
+ "content_feature_batch_size": 16,
73
+ // Features used for model training
74
+ "use_mel": true,
75
+ "use_min_max_norm_mel": true,
76
+ "use_frame_pitch": true,
77
+ "use_uv": true,
78
+ "use_interpolation_for_uv": false,
79
+ "use_frame_energy": true,
80
+ "use_log_scale_pitch": false,
81
+ "use_log_scale_energy": false,
82
+ "use_spkid": true,
83
+ // Meta file
84
+ "train_file": "train.json",
85
+ "valid_file": "test.json",
86
+ "spk2id": "singers.json",
87
+ "utt2spk": "utt2singer"
88
+ },
89
+ "model": {
90
+ "condition_encoder": {
91
+ "merge_mode": "add",
92
+ // Prosody Features
93
+ "use_f0": true,
94
+ "use_uv": true,
95
+ "use_energy": true,
96
+ // Quantization (0 for not quantization)
97
+ "input_melody_dim": 1,
98
+ "n_bins_melody": 256,
99
+ "output_melody_dim": 384,
100
+ "input_loudness_dim": 1,
101
+ "n_bins_loudness": 256,
102
+ "output_loudness_dim": 384,
103
+ // Semantic Features
104
+ "use_whisper": false,
105
+ "use_contentvec": false,
106
+ "use_wenet": false,
107
+ "use_mert": false,
108
+ "whisper_dim": 1024,
109
+ "contentvec_dim": 256,
110
+ "mert_dim": 256,
111
+ "wenet_dim": 512,
112
+ "content_encoder_dim": 384,
113
+ // Speaker Features
114
+ "output_singer_dim": 384,
115
+ "singer_table_size": 512,
116
+ "use_spkid": true
117
+ }
118
+ },
119
+ }
config/svc/diffusion.json ADDED
@@ -0,0 +1,142 @@
1
+ {
2
+ "base_config": "config/svc/base.json",
3
+ "model": {
4
+ "condition_encoder": {
5
+ "merge_mode": "add",
6
+ // Prosody Features
7
+ "use_f0": true,
8
+ "use_uv": true,
9
+ "use_energy": true,
10
+ // Quantization (0 for not quantization)
11
+ "input_melody_dim": 1,
12
+ "n_bins_melody": 256,
13
+ "output_melody_dim": 384,
14
+ "input_loudness_dim": 1,
15
+ "n_bins_loudness": 256,
16
+ "output_loudness_dim": 384,
17
+ // Semantic Features
18
+ "use_whisper": false,
19
+ "use_contentvec": false,
20
+ "use_wenet": false,
21
+ "use_mert": false,
22
+ "whisper_dim": 1024,
23
+ "contentvec_dim": 256,
24
+ "mert_dim": 256,
25
+ "wenet_dim": 512,
26
+ "content_encoder_dim": 384,
27
+ // Speaker Features
28
+ "output_singer_dim": 384,
29
+ "singer_table_size": 512,
30
+ "use_spkid": true
31
+ },
32
+ "diffusion": {
33
+ "scheduler": "ddpm",
34
+ "scheduler_settings": {
35
+ "num_train_timesteps": 1000,
36
+ "beta_start": 1.0e-4,
37
+ "beta_end": 0.02,
38
+ "beta_schedule": "linear"
39
+ },
40
+ // Diffusion steps encoder
41
+ "step_encoder": {
42
+ "dim_raw_embedding": 128,
43
+ "dim_hidden_layer": 512,
44
+ "activation": "SiLU",
45
+ "num_layer": 2,
46
+ "max_period": 10000
47
+ },
48
+ // Diffusion decoder
49
+ "model_type": "bidilconv",
50
+ // bidilconv, unet2d, TODO: unet1d
51
+ "bidilconv": {
52
+ "base_channel": 384,
53
+ "n_res_block": 20,
54
+ "conv_kernel_size": 3,
55
+ "dilation_cycle_length": 4,
56
+ // specially, 1 means no dilation
57
+ "conditioner_size": 384
58
+ },
59
+ "unet2d": {
60
+ "in_channels": 1,
61
+ "out_channels": 1,
62
+ "down_block_types": [
63
+ "CrossAttnDownBlock2D",
64
+ "CrossAttnDownBlock2D",
65
+ "CrossAttnDownBlock2D",
66
+ "DownBlock2D"
67
+ ],
68
+ "mid_block_type": "UNetMidBlock2DCrossAttn",
69
+ "up_block_types": [
70
+ "UpBlock2D",
71
+ "CrossAttnUpBlock2D",
72
+ "CrossAttnUpBlock2D",
73
+ "CrossAttnUpBlock2D"
74
+ ],
75
+ "only_cross_attention": false
76
+ }
77
+ }
78
+ },
79
+ "train": {
80
+ // Basic settings
81
+ "batch_size": 64,
82
+ "gradient_accumulation_step": 1,
83
+ "max_epoch": -1,
84
+ // -1 means no limit
85
+ "save_checkpoint_stride": [
86
+ 5,
87
+ 20
88
+ ],
89
+ // unit is epoch
90
+ "keep_last": [
91
+ 3,
92
+ -1
93
+ ],
94
+ // -1 means infinite, if one number will broadcast
95
+ "run_eval": [
96
+ false,
97
+ true
98
+ ],
99
+ // if one number will broadcast
100
+ // Fix the random seed
101
+ "random_seed": 10086,
102
+ // Batchsampler
103
+ "sampler": {
104
+ "holistic_shuffle": true,
105
+ "drop_last": true
106
+ },
107
+ // Dataloader
108
+ "dataloader": {
109
+ "num_worker": 32,
110
+ "pin_memory": true
111
+ },
112
+ // Trackers
113
+ "tracker": [
114
+ "tensorboard"
115
+ // "wandb",
116
+ // "cometml",
117
+ // "mlflow",
118
+ ],
119
+ // Optimizer
120
+ "optimizer": "AdamW",
121
+ "adamw": {
122
+ "lr": 4.0e-4
123
+ // nn model lr
124
+ },
125
+ // LR Scheduler
126
+ "scheduler": "ReduceLROnPlateau",
127
+ "reducelronplateau": {
128
+ "factor": 0.8,
129
+ "patience": 10,
130
+ // unit is epoch
131
+ "min_lr": 1.0e-4
132
+ }
133
+ },
134
+ "inference": {
135
+ "diffusion": {
136
+ "scheduler": "pndm",
137
+ "scheduler_settings": {
138
+ "num_inference_timesteps": 1000
139
+ }
140
+ }
141
+ }
142
+ }
config/transformer.json ADDED
@@ -0,0 +1,179 @@
1
+ {
2
+ "base_config": "config/svc/base.json",
3
+ "model_type": "Transformer",
4
+ "task_type": "svc",
5
+ "preprocess": {
6
+ // data augmentations
7
+ "use_pitch_shift": false,
8
+ "use_formant_shift": false,
9
+ "use_time_stretch": false,
10
+ "use_equalizer": false,
11
+ // acoustic features
12
+ "extract_mel": true,
13
+ "mel_min_max_norm": true,
14
+ "extract_pitch": true,
15
+ "pitch_extractor": "parselmouth",
16
+ "extract_uv": true,
17
+ "extract_energy": true,
18
+ // content features
19
+ "extract_whisper_feature": false,
20
+ "whisper_sample_rate": 16000,
21
+ "extract_contentvec_feature": false,
22
+ "contentvec_sample_rate": 16000,
23
+ "extract_wenet_feature": false,
24
+ "wenet_sample_rate": 16000,
25
+ "extract_mert_feature": false,
26
+ "mert_sample_rate": 16000,
27
+ // Default config for whisper
28
+ "whisper_frameshift": 0.01,
29
+ "whisper_downsample_rate": 2,
30
+ // Default config for content vector
31
+ "contentvec_frameshift": 0.02,
32
+ // Default config for mert
33
+ "mert_model": "m-a-p/MERT-v1-330M",
34
+ "mert_feature_layer": -1,
35
+ "mert_hop_size": 320,
36
+ // 24k
37
+ "mert_frameshit": 0.01333,
38
+ // 10ms
39
+ "wenet_frameshift": 0.01,
40
+ // wenetspeech is 4, gigaspeech is 6
41
+ "wenet_downsample_rate": 4,
42
+ // Default config
43
+ "n_mel": 100,
44
+ "win_size": 1024,
45
+ // todo
46
+ "hop_size": 256,
47
+ "sample_rate": 24000,
48
+ "n_fft": 1024,
49
+ // todo
50
+ "fmin": 0,
51
+ "fmax": 12000,
52
+ // todo
53
+ "f0_min": 50,
54
+ // ~C2
55
+ "f0_max": 1100,
56
+ //1100, // ~C6(1100), ~G5(800)
57
+ "pitch_bin": 256,
58
+ "pitch_max": 1100.0,
59
+ "pitch_min": 50.0,
60
+ "is_label": true,
61
+ "is_mu_law": true,
62
+ "bits": 8,
63
+ "mel_min_max_stats_dir": "mel_min_max_stats",
64
+ "whisper_dir": "whisper",
65
+ "contentvec_dir": "contentvec",
66
+ "wenet_dir": "wenet",
67
+ "mert_dir": "mert",
68
+ // Extract content features using dataloader
69
+ "pin_memory": true,
70
+ "num_workers": 8,
71
+ "content_feature_batch_size": 16,
72
+ // Features used for model training
73
+ "use_mel": true,
74
+ "use_min_max_norm_mel": true,
75
+ "use_frame_pitch": true,
76
+ "use_uv": true,
77
+ "use_frame_energy": true,
78
+ "use_log_scale_pitch": false,
79
+ "use_log_scale_energy": false,
80
+ "use_spkid": true,
81
+ // Meta file
82
+ "train_file": "train.json",
83
+ "valid_file": "test.json",
84
+ "spk2id": "singers.json",
85
+ "utt2spk": "utt2singer"
86
+ },
87
+ "model": {
88
+ "condition_encoder": {
89
+ "merge_mode": "add",
90
+ "input_melody_dim": 1,
91
+ "use_log_f0": true,
92
+ "n_bins_melody": 256,
93
+ //# Quantization (0 for not quantization)
94
+ "output_melody_dim": 384,
95
+ "input_loudness_dim": 1,
96
+ "use_log_loudness": true,
97
+ "n_bins_loudness": 256,
98
+ "output_loudness_dim": 384,
99
+ "use_whisper": false,
100
+ "use_contentvec": true,
101
+ "use_wenet": false,
102
+ "use_mert": false,
103
+ "whisper_dim": 1024,
104
+ "contentvec_dim": 256,
105
+ "mert_dim": 256,
106
+ "wenet_dim": 512,
107
+ "content_encoder_dim": 384,
108
+ "output_singer_dim": 384,
109
+ "singer_table_size": 512,
110
+ "output_content_dim": 384,
111
+ "use_spkid": true
112
+ },
113
+ "transformer": {
114
+ "type": "conformer",
115
+ // 'conformer' or 'transformer'
116
+ "input_dim": 384,
117
+ "output_dim": 100,
118
+ "n_heads": 2,
119
+ "n_layers": 6,
120
+ "filter_channels": 512,
121
+ "dropout": 0.1,
122
+ }
123
+ },
124
+ "train": {
125
+ // Basic settings
126
+ "batch_size": 64,
127
+ "gradient_accumulation_step": 1,
128
+ "max_epoch": -1,
129
+ // -1 means no limit
130
+ "save_checkpoint_stride": [
131
+ 10,
132
+ 100
133
+ ],
134
+ // unit is epoch
135
+ "keep_last": [
136
+ 3,
137
+ -1
138
+ ],
139
+ // -1 means infinite, if one number will broadcast
140
+ "run_eval": [
141
+ false,
142
+ true
143
+ ],
144
+ // if one number will broadcast
145
+ // Fix the random seed
146
+ "random_seed": 10086,
147
+ // Batchsampler
148
+ "sampler": {
149
+ "holistic_shuffle": true,
150
+ "drop_last": true
151
+ },
152
+ // Dataloader
153
+ "dataloader": {
154
+ "num_worker": 32,
155
+ "pin_memory": true
156
+ },
157
+ // Trackers
158
+ "tracker": [
159
+ "tensorboard"
160
+ // "wandb",
161
+ // "cometml",
162
+ // "mlflow",
163
+ ],
164
+ // Optimizer
165
+ "optimizer": "AdamW",
166
+ "adamw": {
167
+ "lr": 4.0e-4
168
+ // nn model lr
169
+ },
170
+ // LR Scheduler
171
+ "scheduler": "ReduceLROnPlateau",
172
+ "reducelronplateau": {
173
+ "factor": 0.8,
174
+ "patience": 10,
175
+ // unit is epoch
176
+ "min_lr": 1.0e-4
177
+ }
178
+ }
179
+ }
config/tts.json ADDED
@@ -0,0 +1,25 @@
1
+ {
2
+ "base_config": "config/base.json",
3
+ "supported_model_type": [
4
+ "Fastspeech2",
5
+ "VITS",
6
+ "VALLE",
7
+ "NaturalSpeech2"
8
+ ],
9
+ "task_type": "tts",
10
+ "preprocess": {
11
+ "language": "en-us", // espeak supports 100 languages https://github.com/espeak-ng/espeak-ng/blob/master/docs/languages.md
12
+ // linguistic features
13
+ "extract_phone": true,
14
+ "phone_extractor": "espeak", // "espeak, pypinyin, pypinyin_initials_finals, lexicon (only for language=en-us right now)"
15
+ "lexicon_path": "./text/lexicon/librispeech-lexicon.txt",
16
+ // Directory names of processed data or extracted features
17
+ "phone_dir": "phones",
18
+ "use_phone": true,
19
+ "add_blank": true
20
+ },
21
+ "model": {
22
+ "text_token_num": 512,
23
+ }
24
+
25
+ }
config/valle.json ADDED
@@ -0,0 +1,55 @@
1
+ {
2
+ "base_config": "config/tts.json",
3
+ "model_type": "VALLE",
4
+ "task_type": "tts",
5
+ "dataset": [
6
+ "libritts"
7
+ ],
8
+ "preprocess": {
9
+ "extract_phone": true,
10
+ "phone_extractor": "espeak", // phoneme extractor: espeak, pypinyin, pypinyin_initials_finals or lexicon
11
+ "extract_acoustic_token": true,
12
+ "acoustic_token_extractor": "Encodec", // acoustic token extractor: encodec, dac(todo)
13
+ "acoustic_token_dir": "acoutic_tokens",
14
+ "use_text": false,
15
+ "use_phone": true,
16
+ "use_acoustic_token": true,
17
+ "symbols_dict": "symbols.dict",
18
+ "min_duration": 0.5, // the duration lowerbound to filter the audio with duration < min_duration
19
+ "max_duration": 14, // the duration uperbound to filter the audio with duration > max_duration.
20
+ "sample_rate": 24000,
21
+ "codec_hop_size": 320
22
+ },
23
+ "model": {
24
+ "text_token_num": 512,
25
+ "audio_token_num": 1024,
26
+ "decoder_dim": 1024, // embedding dimension of the decoder model
27
+ "nhead": 16, // number of attention heads in the decoder layers
28
+ "num_decoder_layers": 12, // number of decoder layers
29
+ "norm_first": true, // pre or post Normalization.
30
+ "add_prenet": false, // whether add PreNet after Inputs
31
+ "prefix_mode": 0, // mode for how to prefix VALL-E NAR Decoder, 0: no prefix, 1: 0 to random, 2: random to random, 4: chunk of pre or post utterance
32
+ "share_embedding": true, // share the parameters of the output projection layer with the parameters of the acoustic embedding
33
+ "nar_scale_factor": 1, // model scale factor which will be assigned different meanings in different models
34
+ "prepend_bos": false, // whether prepend <BOS> to the acoustic tokens -> AR Decoder inputs
35
+ "num_quantizers": 8, // numbert of the audio quantization layers
36
+ // "scaling_xformers": false, // Apply Reworked Conformer scaling on Transformers
37
+ },
38
+ "train": {
39
+ "use_dynamic_batchsize": false, // If use dynamic batch size
40
+ "ddp": false,
41
+ "train_stage": 1, // 0: train all modules, For VALL_E, support 1: AR Decoder 2: NAR Decoder(s)
42
+ "max_epoch": 20,
43
+ "optimizer": "AdamW",
44
+ "scheduler": "cosine",
45
+ "warmup_steps": 16000, // number of steps that affects how rapidly the learning rate decreases
46
+ "total_training_steps": 800000,
47
+ "base_lr": 1e-4, // base learning rate."
48
+ "valid_interval": 1000,
49
+ "log_epoch_step": 1000,
50
+ "save_checkpoint_stride": [
51
+ 1,
52
+ 1
53
+ ]
54
+ }
55
+ }
config/vits.json ADDED
@@ -0,0 +1,101 @@
1
+ {
2
+ "base_config": "config/tts.json",
3
+ "model_type": "VITS",
4
+ "task_type": "tts",
5
+ "preprocess": {
6
+ "extract_phone": true,
7
+ "extract_mel": true,
8
+ "n_mel": 80,
9
+ "fmin": 0,
10
+ "fmax": null,
11
+ "extract_linear_spec": true,
12
+ "extract_audio": true,
13
+ "use_linear": true,
14
+ "use_mel": true,
15
+ "use_audio": true,
16
+ "use_text": false,
17
+ "use_phone": true,
18
+ "lexicon_path": "./text/lexicon/librispeech-lexicon.txt",
19
+ "n_fft": 1024,
20
+ "win_size": 1024,
21
+ "hop_size": 256,
22
+ "segment_size": 8192,
23
+ "text_cleaners": [
24
+ "english_cleaners"
25
+ ]
26
+ },
27
+ "model": {
28
+ "text_token_num": 512,
29
+ "inter_channels": 192,
30
+ "hidden_channels": 192,
31
+ "filter_channels": 768,
32
+ "n_heads": 2,
33
+ "n_layers": 6,
34
+ "kernel_size": 3,
35
+ "p_dropout": 0.1,
36
+ "resblock": "1",
37
+ "resblock_kernel_sizes": [
38
+ 3,
39
+ 7,
40
+ 11
41
+ ],
42
+ "resblock_dilation_sizes": [
43
+ [
44
+ 1,
45
+ 3,
46
+ 5
47
+ ],
48
+ [
49
+ 1,
50
+ 3,
51
+ 5
52
+ ],
53
+ [
54
+ 1,
55
+ 3,
56
+ 5
57
+ ]
58
+ ],
59
+ "upsample_rates": [
60
+ 8,
61
+ 8,
62
+ 2,
63
+ 2
64
+ ],
65
+ "upsample_initial_channel": 512,
66
+ "upsample_kernel_sizes": [
67
+ 16,
68
+ 16,
69
+ 4,
70
+ 4
71
+ ],
72
+ "n_layers_q": 3,
73
+ "use_spectral_norm": false,
74
+ "n_speakers": 0, // number of speakers, while be automatically set if n_speakers is 0 and multi_speaker_training is true
75
+ "gin_channels": 256,
76
+ "use_sdp": true
77
+ },
78
+ "train": {
79
+ "fp16_run": true,
80
+ "learning_rate": 2e-4,
81
+ "betas": [
82
+ 0.8,
83
+ 0.99
84
+ ],
85
+ "eps": 1e-9,
86
+ "batch_size": 16,
87
+ "lr_decay": 0.999875,
88
+ // "segment_size": 8192,
89
+ "init_lr_ratio": 1,
90
+ "warmup_epochs": 0,
91
+ "c_mel": 45,
92
+ "c_kl": 1.0,
93
+ "AdamW": {
94
+ "betas": [
95
+ 0.8,
96
+ 0.99
97
+ ],
98
+ "eps": 1e-9,
99
+ }
100
+ }
101
+ }
config/vitssvc.json ADDED
@@ -0,0 +1,306 @@
1
+ {
2
+ "base_config": "config/svc/base.json",
3
+ "model_type": "VITS",
4
+ "task_type": "svc",
5
+ "preprocess": {
6
+ // Config for features extraction
7
+ "extract_mel": true,
8
+ "extract_pitch": true,
9
+ "pitch_extractor": "parselmouth",
10
+ "extract_energy": true,
11
+ "extract_uv": true,
12
+ "extract_linear_spec": true,
13
+ "extract_audio": true,
14
+ "mel_min_max_norm": true,
15
+ // Config for features usage
16
+ "use_linear": true,
17
+ "use_mel": true,
18
+ "use_min_max_norm_mel": false,
19
+ "use_audio": true,
20
+ "use_frame_pitch": true,
21
+ "use_uv": true,
22
+ "use_spkid": true,
23
+ "use_contentvec": false,
24
+ "use_whisper": false,
25
+ "use_wenet": false,
26
+ "use_text": false,
27
+ "use_phone": false,
28
+ "fmin": 0,
29
+ "fmax": 12000,
30
+ "f0_min": 50,
31
+ "f0_max": 1100,
32
+ // f0_bin in sovits
33
+ "pitch_bin": 256,
34
+ // filter_length in sovits
35
+ "n_fft": 1024,
36
+ // hop_length in sovits
37
+ "hop_size": 256,
38
+ // win_length in sovits
39
+ "win_size": 1024,
40
+ "segment_size": 8192,
41
+ "n_mel": 100,
42
+ "sample_rate": 24000,
43
+ "mel_min_max_stats_dir": "mel_min_max_stats",
44
+ "whisper_dir": "whisper",
45
+ "contentvec_dir": "contentvec",
46
+ "wenet_dir": "wenet",
47
+ "mert_dir": "mert",
48
+ // Meta file
49
+ "train_file": "train.json",
50
+ "valid_file": "test.json",
51
+ "spk2id": "singers.json",
52
+ "utt2spk": "utt2singer"
53
+ },
54
+ "model": {
55
+ "condition_encoder": {
56
+ "merge_mode": "add",
57
+ "input_melody_dim": 1,
58
+ "use_log_f0": true,
59
+ "n_bins_melody": 256,
60
+ "output_melody_dim": 384,
61
+ "input_loudness_dim": 1,
62
+ "use_log_loudness": true,
63
+ "n_bins_loudness": 256,
64
+ "output_loudness_dim": 384,
65
+ "use_whisper": false,
66
+ "use_contentvec": false,
67
+ "use_wenet": false,
68
+ "use_mert": false,
69
+ "whisper_dim": 1024,
70
+ "contentvec_dim": 256,
71
+ "mert_dim": 256,
72
+ "wenet_dim": 512,
73
+ "content_encoder_dim": 384,
74
+ "singer_table_size": 512,
75
+ "output_singer_dim": 384,
76
+ "output_content_dim": 384,
77
+ "use_spkid": true,
78
+ "pitch_max": 1100.0,
79
+ "pitch_min": 50.0,
80
+ },
81
+ "vits": {
82
+ "filter_channels": 256,
83
+ "gin_channels": 256,
84
+ "hidden_channels": 384,
85
+ "inter_channels": 384,
86
+ "kernel_size": 3,
87
+ "n_flow_layer": 4,
88
+ "n_heads": 2,
89
+ "n_layers": 6,
90
+ "n_layers_q": 3,
91
+ "n_speakers": 512,
92
+ "p_dropout": 0.1,
93
+ "use_spectral_norm": false,
94
+ },
95
+ "generator": "hifigan",
96
+ "generator_config": {
97
+ "hifigan": {
98
+ "resblock": "1",
99
+ "resblock_kernel_sizes": [
100
+ 3,
101
+ 7,
102
+ 11
103
+ ],
104
+ "upsample_rates": [
105
+ 8,
106
+ 8,
107
+ 2,
108
+ 2
109
+ ],
110
+ "upsample_kernel_sizes": [
111
+ 16,
112
+ 16,
113
+ 4,
114
+ 4
115
+ ],
116
+ "upsample_initial_channel": 512,
117
+ "resblock_dilation_sizes": [
118
+ [
119
+ 1,
120
+ 3,
121
+ 5
122
+ ],
123
+ [
124
+ 1,
125
+ 3,
126
+ 5
127
+ ],
128
+ [
129
+ 1,
130
+ 3,
131
+ 5
132
+ ]
133
+ ]
134
+ },
135
+ "melgan": {
136
+ "ratios": [
137
+ 8,
138
+ 8,
139
+ 2,
140
+ 2
141
+ ],
142
+ "ngf": 32,
143
+ "n_residual_layers": 3,
144
+ "num_D": 3,
145
+ "ndf": 16,
146
+ "n_layers": 4,
147
+ "downsampling_factor": 4
148
+ },
149
+ "bigvgan": {
150
+ "resblock": "1",
151
+ "activation": "snakebeta",
152
+ "snake_logscale": true,
153
+ "upsample_rates": [
154
+ 8,
155
+ 8,
156
+ 2,
157
+ 2
158
+ ],
159
+ "upsample_kernel_sizes": [
160
+ 16,
161
+ 16,
162
+ 4,
163
+ 4
164
+ ],
165
+ "upsample_initial_channel": 512,
166
+ "resblock_kernel_sizes": [
167
+ 3,
168
+ 7,
169
+ 11
170
+ ],
171
+ "resblock_dilation_sizes": [
172
+ [
173
+ 1,
174
+ 3,
175
+ 5
176
+ ],
177
+ [
178
+ 1,
179
+ 3,
180
+ 5
181
+ ],
182
+ [
183
+ 1,
184
+ 3,
185
+ 5
186
+ ]
187
+ ]
188
+ },
189
+ "nsfhifigan": {
190
+ "resblock": "1",
191
+ "harmonic_num": 8,
192
+ "upsample_rates": [
193
+ 8,
194
+ 8,
195
+ 2,
196
+ 2
197
+ ],
198
+ "upsample_kernel_sizes": [
199
+ 16,
200
+ 16,
201
+ 4,
202
+ 4
203
+ ],
204
+ "upsample_initial_channel": 768,
205
+ "resblock_kernel_sizes": [
206
+ 3,
207
+ 7,
208
+ 11
209
+ ],
210
+ "resblock_dilation_sizes": [
211
+ [
212
+ 1,
213
+ 3,
214
+ 5
215
+ ],
216
+ [
217
+ 1,
218
+ 3,
219
+ 5
220
+ ],
221
+ [
222
+ 1,
223
+ 3,
224
+ 5
225
+ ]
226
+ ]
227
+ },
228
+ "apnet": {
229
+ "ASP_channel": 512,
230
+ "ASP_resblock_kernel_sizes": [
231
+ 3,
232
+ 7,
233
+ 11
234
+ ],
235
+ "ASP_resblock_dilation_sizes": [
236
+ [
237
+ 1,
238
+ 3,
239
+ 5
240
+ ],
241
+ [
242
+ 1,
243
+ 3,
244
+ 5
245
+ ],
246
+ [
247
+ 1,
248
+ 3,
249
+ 5
250
+ ]
251
+ ],
252
+ "ASP_input_conv_kernel_size": 7,
253
+ "ASP_output_conv_kernel_size": 7,
254
+ "PSP_channel": 512,
255
+ "PSP_resblock_kernel_sizes": [
256
+ 3,
257
+ 7,
258
+ 11
259
+ ],
260
+ "PSP_resblock_dilation_sizes": [
261
+ [
262
+ 1,
263
+ 3,
264
+ 5
265
+ ],
266
+ [
267
+ 1,
268
+ 3,
269
+ 5
270
+ ],
271
+ [
272
+ 1,
273
+ 3,
274
+ 5
275
+ ]
276
+ ],
277
+ "PSP_input_conv_kernel_size": 7,
278
+ "PSP_output_R_conv_kernel_size": 7,
279
+ "PSP_output_I_conv_kernel_size": 7,
280
+ }
281
+ },
282
+ },
283
+ "train": {
284
+ "fp16_run": true,
285
+ "learning_rate": 2e-4,
286
+ "betas": [
287
+ 0.8,
288
+ 0.99
289
+ ],
290
+ "eps": 1e-9,
291
+ "batch_size": 16,
292
+ "lr_decay": 0.999875,
293
+ // "segment_size": 8192,
294
+ "init_lr_ratio": 1,
295
+ "warmup_epochs": 0,
296
+ "c_mel": 45,
297
+ "c_kl": 1.0,
298
+ "AdamW": {
299
+ "betas": [
300
+ 0.8,
301
+ 0.99
302
+ ],
303
+ "eps": 1e-9,
304
+ }
305
+ }
306
+ }
config/vocoder.json ADDED
@@ -0,0 +1,84 @@
1
+ {
2
+ "base_config": "config/base.json",
3
+ "dataset": [
4
+ "LJSpeech",
5
+ "LibriTTS",
6
+ "opencpop",
7
+ "m4singer",
8
+ "svcc",
9
+ "svcceval",
10
+ "pjs",
11
+ "opensinger",
12
+ "popbutfy",
13
+ "nus48e",
14
+ "popcs",
15
+ "kising",
16
+ "csd",
17
+ "opera",
18
+ "vctk",
19
+ "lijian",
20
+ "cdmusiceval"
21
+ ],
22
+ "task_type": "vocoder",
23
+ "preprocess": {
24
+ // acoustic features
25
+ "extract_mel": true,
26
+ "extract_pitch": false,
27
+ "extract_uv": false,
28
+ "extract_audio": true,
29
+ "extract_label": false,
30
+ "extract_one_hot": false,
31
+ "extract_amplitude_phase": false,
32
+ "pitch_extractor": "parselmouth",
33
+ // Settings for data preprocessing
34
+ "n_mel": 100,
35
+ "win_size": 1024,
36
+ "hop_size": 256,
37
+ "sample_rate": 24000,
38
+ "n_fft": 1024,
39
+ "fmin": 0,
40
+ "fmax": 12000,
41
+ "f0_min": 50,
42
+ "f0_max": 1100,
43
+ "pitch_bin": 256,
44
+ "pitch_max": 1100.0,
45
+ "pitch_min": 50.0,
46
+ "is_mu_law": false,
47
+ "bits": 8,
48
+ "cut_mel_frame": 32,
49
+ // Directory names of processed data or extracted features
50
+ "spk2id": "singers.json",
51
+ // Features used for model training
52
+ "use_mel": true,
53
+ "use_frame_pitch": false,
54
+ "use_uv": false,
55
+ "use_audio": true,
56
+ "use_label": false,
57
+ "use_one_hot": false,
58
+ "train_file": "train.json",
59
+ "valid_file": "test.json"
60
+ },
61
+ "train": {
62
+ "random_seed": 114514,
63
+ "batch_size": 64,
64
+ "gradient_accumulation_step": 1,
65
+ "max_epoch": 1000000,
66
+ "save_checkpoint_stride": [
67
+ 20
68
+ ],
69
+ "run_eval": [
70
+ true
71
+ ],
72
+ "sampler": {
73
+ "holistic_shuffle": true,
74
+ "drop_last": true
75
+ },
76
+ "dataloader": {
77
+ "num_worker": 16,
78
+ "pin_memory": true
79
+ },
80
+ "tracker": [
81
+ "tensorboard"
82
+ ],
83
+ }
84
+ }
egs/datasets/README.md ADDED
@@ -0,0 +1,458 @@
1
+ # Datasets Format
2
+
3
+ Amphion supports the following academic datasets (sorted alphabetically):
4
+
5
+ - [Datasets Format](#datasets-format)
6
+ - [AudioCaps](#audiocaps)
7
+ - [CSD](#csd)
8
+ - [CustomSVCDataset](#customsvcdataset)
9
+ - [Hi-Fi TTS](#hi-fi-tts)
10
+ - [KiSing](#kising)
11
+ - [LibriLight](#librilight)
12
+ - [LibriTTS](#libritts)
13
+ - [LJSpeech](#ljspeech)
14
+ - [M4Singer](#m4singer)
15
+ - [NUS-48E](#nus-48e)
16
+ - [Opencpop](#opencpop)
17
+ - [OpenSinger](#opensinger)
18
+ - [Opera](#opera)
19
+ - [PopBuTFy](#popbutfy)
20
+ - [PopCS](#popcs)
21
+ - [PJS](#pjs)
22
+ - [SVCC](#svcc)
23
+ - [VCTK](#vctk)
24
+
25
+ The download link and the file structure tree of each dataset are displayed as follows.
26
+
27
+ > **Note:** When using Docker to run Amphion, mounting the dataset into the container is necessary after downloading. Check [Mount dataset in Docker container](./docker.md) for more details.
28
+
29
+ ## AudioCaps
30
+
31
+ AudioCaps is a dataset of around 44K audio-caption pairs, where each audio clip corresponds to a caption with rich semantic information.
32
+
33
+ Download the AudioCaps dataset [here](https://github.com/cdjkim/audiocaps). The file structure looks like below:
34
+
35
+ ```plaintext
36
+ [AudioCaps dataset path]
37
+ ┣ AudioCaps
38
+ ┃ ┣ wav
39
+ ┃ ┃ ┣ ---1_cCGK4M_0_10000.wav
40
+ ┃ ┃ ┣ ---lTs1dxhU_30000_40000.wav
41
+ ┃ ┃ ┣ ...
42
+ ```
43
+
44
+ ## CSD
45
+
46
+ Download the official CSD dataset [here](https://zenodo.org/records/4785016). The file structure looks like below:
47
+
48
+ ```plaintext
49
+ [CSD dataset path]
50
+ ┣ english
51
+ ┣ korean
52
+ ┣ utterances
53
+ ┃ ┣ en001a
54
+ ┃ ┃ ┣ {UtteranceID}.wav
55
+ ┃ ┣ en001b
56
+ ┃ ┣ en002a
57
+ ┃ ┣ en002b
58
+ ┃ ┣ ...
59
+ ┣ README
60
+ ```
61
+
62
+ ## CustomSVCDataset
63
+
64
+ We support custom datasets for Singing Voice Conversion. Organize your data in the following structure to construct your own dataset:
65
+
66
+ ```plaintext
67
+ [Your Custom Dataset Path]
68
+ ┣ singer1
69
+ ┃ ┣ song1
70
+ ┃ ┃ ┣ utterance1.wav
71
+ ┃ ┃ ┣ utterance2.wav
72
+ ┃ ┃ ┣ ...
73
+ ┃ ┣ song2
74
+ ┃ ┣ ...
75
+ ┣ singer2
76
+ ┣ ...
77
+ ```
78
+
79
+
80
+ ## Hi-Fi TTS
81
+
82
+ Download the official Hi-Fi TTS dataset [here](https://www.openslr.org/109/). The file structure looks like below:
83
+
84
+ ```plaintext
85
+ [Hi-Fi TTS dataset path]
86
+ ┣ audio
87
+ ┃ ┣ 11614_other {Speaker_ID}_{SNR_subset}
88
+ ┃ ┃ ┣ 10547 {Book_ID}
89
+ ┃ ┃ ┃ ┣ thousandnights8_04_anonymous_0001.flac
90
+ ┃ ┃ ┃ ┣ thousandnights8_04_anonymous_0003.flac
91
+ ┃ ┃ ┃ ┣ thousandnights8_04_anonymous_0004.flac
92
+ ┃ ┃ ┃ ┣ ...
93
+ ┃ ┃ ┣ ...
94
+ ┃ ┣ ...
95
+ ┣ 92_manifest_clean_dev.json
96
+ ┣ 92_manifest_clean_test.json
97
+ ┣ 92_manifest_clean_train.json
98
+ ┣ ...
99
+ ┣ {Speaker_ID}_manifest_{SNR_subset}_{dataset_split}.json
100
+ ┣ ...
101
+ ┣ books_bandwidth.tsv
102
+ ┣ LICENSE.txt
103
+ ┣ readers_books_clean.txt
104
+ ┣ readers_books_other.txt
105
+ ┣ README.txt
106
+
107
+ ```
108
+
109
+ ## KiSing
110
+
111
+ Download the official KiSing dataset [here](http://shijt.site/index.php/2021/05/16/kising-the-first-open-source-mandarin-singing-voice-synthesis-corpus/). The file structure looks like below:
112
+
113
+ ```plaintext
114
+ [KiSing dataset path]
115
+ ┣ clean
116
+ ┃ ┣ 421
117
+ ┃ ┣ 422
118
+ ┃ ┣ ...
119
+ ```
120
+
121
+ ## LibriLight
122
+
123
+ Download the official LibriLight dataset [here](https://github.com/facebookresearch/libri-light). The file structure looks like below:
124
+
125
+ ```plaintext
126
+ [LibriLight dataset path]
127
+ ┣ small (Subset)
128
+ ┃ ┣ 100 {Speaker_ID}
129
+ ┃ ┃ ┣ sea_fairies_0812_librivox_64kb_mp3 {Chapter_ID}
130
+ ┃ ┃ ┃ ┣ 01_baum_sea_fairies_64kb.flac
131
+ ┃ ┃ ┃ ┣ 02_baum_sea_fairies_64kb.flac
132
+ ┃ ┃ ┃ ┣ 03_baum_sea_fairies_64kb.flac
133
+ ┃ ┃ ┃ ┣ 22_baum_sea_fairies_64kb.flac
134
+ ┃ ┃ ┃ ┣ 01_baum_sea_fairies_64kb.json
135
+ ┃ ┃ ┃ ┣ 02_baum_sea_fairies_64kb.json
136
+ ┃ ┃ ┃ ┣ 03_baum_sea_fairies_64kb.json
137
+ ┃ ┃ ┃ ┣ 22_baum_sea_fairies_64kb.json
138
+ ┃ ┃ ┃ ┣ ...
139
+ ┃ ┃ ┣ ...
140
+ ┃ ┣ ...
141
+ ┣ medium (Subset)
142
+ ┣ ...
143
+ ```
144
+
145
+ ## LibriTTS
146
+
147
+ Download the official LibriTTS dataset [here](https://www.openslr.org/60/). The file structure looks like below:
148
+
149
+ ```plaintext
150
+ [LibriTTS dataset path]
151
+ ┣ BOOKS.txt
152
+ ┣ CHAPTERS.txt
153
+ ┣ eval_sentences10.tsv
154
+ ┣ LICENSE.txt
155
+ ┣ NOTE.txt
156
+ ┣ reader_book.tsv
157
+ ┣ README_librispeech.txt
158
+ ┣ README_libritts.txt
159
+ ┣ speakers.tsv
160
+ ┣ SPEAKERS.txt
161
+ ┣ dev-clean (Subset)
162
+ ┃ ┣ 1272 {Speaker_ID}
163
+ ┃ ┃ ┣ 128104 {Chapter_ID}
164
+ ┃ ┃ ┃ ┣ 1272_128104_000001_000000.normalized.txt
165
+ ┃ ┃ ┃ ┣ 1272_128104_000001_000000.original.txt
166
+ ┃ ┃ ┃ ┣ 1272_128104_000001_000000.wav
167
+ ┃ ┃ ┃ ┣ ...
168
+ ┃ ┃ ┃ ┣ 1272_128104.book.tsv
169
+ ┃ ┃ ┃ ┣ 1272_128104.trans.tsv
170
+ ┃ ┃ ┣ ...
171
+ ┃ ┣ ...
172
+ ┣ dev-other (Subset)
173
+ ┃ ┣ 116 {Speaker_ID}
174
+ ┃ ┃ ┣ 288045 {Chapter_ID}
175
+ ┃ ┃ ┃ ┣ 116_288045_000003_000000.normalized.txt
176
+ ┃ ┃ ┃ ┣ 116_288045_000003_000000.original.txt
177
+ ┃ ┃ ┃ ┣ 116_288045_000003_000000.wav
178
+ ┃ ┃ ┃ ┣ ...
179
+ ┃ ┃ ┃ ┣ 116_288045.book.tsv
180
+ ┃ ┃ ┃ ┣ 116_288045.trans.tsv
181
+ ┃ ┃ ┣ ...
182
+ ┃ ┣ ...
183
+ ┃ ┣ ...
184
+ ┣ test-clean (Subset)
185
+ ┃ ┣ {Speaker_ID}
186
+ ┃ ┃ ┣ {Chapter_ID}
187
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.normalized.txt
188
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.original.txt
189
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.wav
190
+ ┃ ┃ ┃ ┣ ...
191
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.book.tsv
192
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.trans.tsv
193
+ ┃ ┃ ┣ ...
194
+ ┃ ┣ ...
195
+ ┣ test-other
196
+ ┃ ┣ {Speaker_ID}
197
+ ┃ ┃ ┣ {Chapter_ID}
198
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.normalized.txt
199
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.original.txt
200
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.wav
201
+ ┃ ┃ ┃ ┣ ...
202
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.book.tsv
203
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.trans.tsv
204
+ ┃ ┃ ┣ ...
205
+ ┃ ┣ ...
206
+ ┣ train-clean-100
207
+ ┃ ┣ {Speaker_ID}
208
+ ┃ ┃ ┣ {Chapter_ID}
209
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.normalized.txt
210
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.original.txt
211
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.wav
212
+ ┃ ┃ ┃ ┣ ...
213
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.book.tsv
214
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.trans.tsv
215
+ ┃ ┃ ┣ ...
216
+ ┃ ┣ ...
217
+ ┣ train-clean-360
218
+ ┃ ┣ {Speaker_ID}
219
+ ┃ ┃ ┣ {Chapter_ID}
220
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.normalized.txt
221
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.original.txt
222
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.wav
223
+ ┃ ┃ ┃ ┣ ...
224
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.book.tsv
225
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.trans.tsv
226
+ ┃ ┃ ┣ ...
227
+ ┃ ┣ ...
228
+ ┣ train-other-500
229
+ ┃ ┣ {Speaker_ID}
230
+ ┃ ┃ ┣ {Chapter_ID}
231
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.normalized.txt
232
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.original.txt
233
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.wav
234
+ ┃ ┃ ┃ ┣ ...
235
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.book.tsv
236
+ ┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.trans.tsv
237
+ ┃ ┃ ┣ ...
238
+ ┃ ┣ ...
239
+ ```
240
+
241
+ ## LJSpeech
242
+
243
+ Download the official LJSpeech dataset [here](https://keithito.com/LJ-Speech-Dataset/). The file structure looks like below:
244
+
245
+ ```plaintext
246
+ [LJSpeech dataset path]
247
+ ┣ metadata.csv
248
+ ┣ wavs
249
+ ┃ ┣ LJ001-0001.wav
250
+ ┃ ┣ LJ001-0002.wav
251
+ ┃ ┣ ...
252
+ ┣ README
253
+ ```
254
+
255
+ ## M4Singer
256
+
257
+ Download the official M4Singer dataset [here](https://drive.google.com/file/d/1xC37E59EWRRFFLdG3aJkVqwtLDgtFNqW/view). The file structure looks like below:
258
+
259
+ ```plaintext
260
+ [M4Singer dataset path]
261
+ ┣ {Singer_1}#{Song_1}
262
+ ┃ ┣ 0000.mid
263
+ ┃ ┣ 0000.TextGrid
264
+ ┃ ┣ 0000.wav
265
+ ┃ ┣ ...
266
+ ┣ {Singer_1}#{Song_2}
267
+ ┣ ...
268
+ ┣ {Singer_2}#{Song_1}
269
+ ┣ {Singer_2}#{Song_2}
270
+ ┣ ...
271
+ ┗ meta.json
272
+ ```
273
+
274
+ ## NUS-48E
275
+
276
+ Download the official NUS-48E dataset [here](https://drive.google.com/drive/folders/12pP9uUl0HTVANU3IPLnumTJiRjPtVUMx). The file structure looks like below:
277
+
278
+ ```plaintext
279
+ [NUS-48E dataset path]
280
+ ┣ {SpeakerID}
281
+ ┃ ┣ read
282
+ ┃ ┃ ┣ {SongID}.txt
283
+ ┃ ┃ ┣ {SongID}.wav
284
+ ┃ ┃ ┣ ...
285
+ ┃ ┣ sing
286
+ ┃ ┃ ┣ {SongID}.txt
287
+ ┃ ┃ ┣ {SongID}.wav
288
+ ┃ ┃ ┣ ...
289
+ ┣ ...
290
+ ┣ README.txt
291
+
292
+ ```
293
+
294
+ ## Opencpop
295
+
296
+ Download the official Opencpop dataset [here](https://wenet.org.cn/opencpop/). The file structure looks like below:
297
+
298
+ ```plaintext
299
+ [Opencpop dataset path]
300
+ ┣ midis
301
+ ┃ ┣ 2001.midi
302
+ ┃ ┣ 2002.midi
303
+ ┃ ┣ 2003.midi
304
+ ┃ ┣ ...
305
+ ┣ segments
306
+ ┃ ┣ wavs
307
+ ┃ ┃ ┣ 2001000001.wav
308
+ ┃ ┃ ┣ 2001000002.wav
309
+ ┃ ┃ ┣ 2001000003.wav
310
+ ┃ ┃ ┣ ...
311
+ ┃ ┣ test.txt
312
+ ┃ ┣ train.txt
313
+ ┃ ┗ transcriptions.txt
314
+ ┣ textgrids
315
+ ┃ ┣ 2001.TextGrid
316
+ ┃ ┣ 2002.TextGrid
317
+ ┃ ┣ 2003.TextGrid
318
+ ┃ ┣ ...
319
+ ┣ wavs
320
+ ┃ ┣ 2001.wav
321
+ ┃ ┣ 2002.wav
322
+ ┃ ┣ 2003.wav
323
+ ┃ ┣ ...
324
+ ┣ TERMS_OF_ACCESS
325
+ ┗ readme.md
326
+ ```
327
+
328
+ ## OpenSinger
329
+
330
+ Download the official OpenSinger dataset [here](https://drive.google.com/file/d/1EofoZxvalgMjZqzUEuEdleHIZ6SHtNuK/view). The file structure looks like below:
331
+
332
+ ```plaintext
333
+ [OpenSinger dataset path]
334
+ ┣ ManRaw
335
+ ┃ ┣ {Singer_1}_{Song_1}
336
+ ┃ ┃ ┣ {Singer_1}_{Song_1}_0.lab
337
+ ┃ ┃ ┣ {Singer_1}_{Song_1}_0.txt
338
+ ┃ ┃ ┣ {Singer_1}_{Song_1}_0.wav
339
+ ┃ ┃ ┣ ...
340
+ ┃ ┣ {Singer_1}_{Song_2}
341
+ ┃ ┣ ...
342
+ ┣ WomanRaw
343
+ ┣ LICENSE
344
+ ┗ README.md
345
+ ```
346
+
347
+ ## Opera
348
+
349
+ Download the official Opera dataset [here](http://isophonics.net/SingingVoiceDataset). The file structure looks like below:
350
+
351
+ ```plaintext
352
+ [Opera dataset path]
353
+ ┣ monophonic
354
+ ┃ ┣ chinese
355
+ ┃ ┃ ┣ {Gender}_{SingerID}
356
+ ┃ ┃ ┃ ┣ {Emotion}_{SongID}.wav
357
+ ┃ ┃ ┃ ┣ ...
358
+ ┃ ┃ ┣ ...
359
+ ┃ ┣ western
360
+ ┣ polyphonic
361
+ ┃ ┣ chinese
362
+ ┃ ┣ western
363
+ ┣ CrossculturalDataSet.xlsx
364
+ ```
365
+
366
+ ## PopBuTFy
367
+
368
+ Download the official PopBuTFy dataset [here](https://github.com/MoonInTheRiver/NeuralSVB). The file structure looks like below:
369
+
370
+ ```plaintext
371
+ [PopBuTFy dataset path]
372
+ ┣ data
373
+ ┃ ┣ {SingerID}#singing#{SongName}_Amateur
374
+ ┃ ┃ ┣ {SingerID}#singing#{SongName}_Amateur_{UtteranceID}.mp3
375
+ ┃ ┃ ┣ ...
376
+ ┃ ┣ {SingerID}#singing#{SongName}_Professional
377
+ ┃ ┃ ┣ {SingerID}#singing#{SongName}_Professional_{UtteranceID}.mp3
378
+ ┃ ┃ ┣ ...
379
+ ┣ text_labels
380
+ ┗ TERMS_OF_ACCESS
381
+ ```
382
+
383
+ ## PopCS
384
+
385
+ Download the official PopCS dataset [here](https://github.com/MoonInTheRiver/DiffSinger/blob/master/resources/apply_form.md). The file structure looks like below:
386
+
387
+ ```plaintext
388
+ [PopCS dataset path]
389
+ ┣ popcs
390
+ ┃ ┣ popcs-{SongName}
391
+ ┃ ┃ ┣ {UtteranceID}_ph.txt
392
+ ┃ ┃ ┣ {UtteranceID}_wf0.wav
393
+ ┃ ┃ ┣ {UtteranceID}.TextGrid
394
+ ┃ ┃ ┣ {UtteranceID}.txt
395
+ ┃ ┃ ┣ ...
396
+ ┃ ┣ ...
397
+ ┗ TERMS_OF_ACCESS
398
+ ```
399
+
400
+ ## PJS
401
+
402
+ Download the official PJS dataset [here](https://sites.google.com/site/shinnosuketakamichi/research-topics/pjs_corpus). The file structure looks like below:
403
+
404
+ ```plaintext
405
+ [PJS dataset path]
406
+ ┣ PJS_corpus_ver1.1
407
+ ┃ ┣ background_noise
408
+ ┃ ┣ pjs{SongID}
409
+ ┃ ┃ ┣ pjs{SongID}_song.wav
410
+ ┃ ┃ ┣ pjs{SongID}_speech.wav
411
+ ┃ ┃ ┣ pjs{SongID}.lab
412
+ ┃ ┃ ┣ pjs{SongID}.mid
413
+ ┃ ┃ ┣ pjs{SongID}.musicxml
414
+ ┃ ┃ ┣ pjs{SongID}.txt
415
+ ┃ ┣ ...
416
+ ```
417
+
418
+ ## SVCC
419
+
420
+ Download the official SVCC dataset [here](https://github.com/lesterphillip/SVCC23_FastSVC/tree/main/egs/generate_dataset). The file structure looks like below:
421
+
422
+ ```plaintext
423
+ [SVCC dataset path]
424
+ ┣ Data
425
+ ┃ ┣ CDF1
426
+ ┃ ┃ ┣ 10001.wav
427
+ ┃ ┃ ┣ 10002.wav
428
+ ┃ ┃ ┣ ...
429
+ ┃ ┣ CDM1
430
+ ┃ ┣ IDF1
431
+ ┃ ┣ IDM1
432
+ ┗ README.md
433
+ ```
434
+
435
+ ## VCTK
436
+
437
+ Download the official VCTK dataset [here](https://datashare.ed.ac.uk/handle/10283/3443). The file structure looks like below:
438
+
439
+ ```plaintext
440
+ [VCTK dataset path]
441
+ ┣ txt
442
+ ┃ ┣ {Speaker_1}
443
+ ┃ ┃ ┣ {Speaker_1}_001.txt
444
+ ┃ ┃ ┣ {Speaker_1}_002.txt
445
+ ┃ ┃ ┣ ...
446
+ ┃ ┣ {Speaker_2}
447
+ ┃ ┣ ...
448
+ ┣ wav48_silence_trimmed
449
+ ┃ ┣ {Speaker_1}
450
+ ┃ ┃ ┣ {Speaker_1}_001_mic1.flac
451
+ ┃ ┃ ┣ {Speaker_1}_001_mic2.flac
452
+ ┃ ┃ ┣ {Speaker_1}_002_mic1.flac
453
+ ┃ ┃ ┣ ...
454
+ ┃ ┣ {Speaker_2}
455
+ ┃ ┣ ...
456
+ ┣ speaker-info.txt
457
+ ┗ update.txt
458
+ ```
egs/datasets/docker.md ADDED
@@ -0,0 +1,19 @@
1
+ # Mount dataset in Docker container
2
+
3
+ When using Docker to run Amphion, you need to mount the dataset into the container first. It is recommended to mount the dataset to `/mnt/<dataset_name>` in the container, where `<dataset_name>` is the name of the dataset.
4
+
5
+ When configuring the dataset in `exp_config.json`, you should use the path `/mnt/<dataset_name>` as the dataset path instead of the actual path on your host machine. Otherwise, the dataset will not be found in the container.
6
+
7
+ ## Mount Example
8
+
9
+ ```bash
10
+ docker run --runtime=nvidia --gpus all -it -v .:/app -v <dataset_path1>:/mnt/<dataset_name1> -v <dataset_path2>:/mnt/<dataset_name2> amphion
11
+ ```
12
+
13
+ For example, if you want to use the `LJSpeech` dataset, you can mount the dataset to `/mnt/LJSpeech` in the container.
14
+
15
+ ```bash
16
+ docker run --runtime=nvidia --gpus all -it -v .:/app -v /home/username/datasets/LJSpeech:/mnt/LJSpeech amphion
17
+ ```
18
+
19
+ If you want to use multiple datasets, you can mount them to different directories in the container by adding more `-v` options.
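+ 
+ As a rough sketch, mounting two datasets at once could look like the following (this assumes LJSpeech and LibriTTS are stored under `/home/username/datasets` on the host; substitute your own dataset paths and names):
+ 
+ ```bash
+ # Mount the project directory plus two datasets into the container.
+ docker run --runtime=nvidia --gpus all -it \
+     -v .:/app \
+     -v /home/username/datasets/LJSpeech:/mnt/LJSpeech \
+     -v /home/username/datasets/LibriTTS:/mnt/LibriTTS \
+     amphion
+ ```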
egs/metrics/README.md ADDED
@@ -0,0 +1,174 @@
1
+ # Amphion Evaluation Recipe
2
+
3
+ ## Supported Evaluation Metrics
4
+
5
+ Amphion Evaluation currently supports the following objective metrics:
6
+
7
+ - **F0 Modeling**:
8
+ - F0 Pearson Coefficients (FPC)
9
+ - F0 Periodicity Root Mean Square Error (PeriodicityRMSE)
10
+ - F0 Root Mean Square Error (F0RMSE)
11
+ - Voiced/Unvoiced F1 Score (V/UV F1)
12
+ - **Energy Modeling**:
13
+ - Energy Root Mean Square Error (EnergyRMSE)
14
+ - Energy Pearson Coefficients (EnergyPC)
15
+ - **Intelligibility**:
16
+ - Character Error Rate (CER) based on [Whisper](https://github.com/openai/whisper)
17
+ - Word Error Rate (WER) based on [Whisper](https://github.com/openai/whisper)
18
+ - **Spectrogram Distortion**:
19
+ - Frechet Audio Distance (FAD)
20
+ - Mel Cepstral Distortion (MCD)
21
+ - Multi-Resolution STFT Distance (MSTFT)
22
+ - Perceptual Evaluation of Speech Quality (PESQ)
23
+ - Short Time Objective Intelligibility (STOI)
24
+ - Scale Invariant Signal to Distortion Ratio (SISDR)
25
+ - Scale Invariant Signal to Noise Ratio (SISNR)
26
+ - **Speaker Similarity**:
27
+ - Cosine similarity based on:
28
+ - [Rawnet3](https://github.com/Jungjee/RawNet)
29
+ - [Resemblyzer](https://github.com/resemble-ai/Resemblyzer)
30
+ - [WavLM](https://huggingface.co/microsoft/wavlm-base-plus-sv)
31
+
32
+ We provide a recipe to demonstrate how to objectively evaluate your generated audios. There are three steps in total:
33
+
34
+ 1. Pretrained Models Preparation
35
+ 2. Audio Data Preparation
36
+ 3. Evaluation
37
+
38
+ ## 1. Pretrained Models Preparation
39
+
40
+ If you want to calculate `RawNet3` based speaker similarity, you need to download the pretrained model first, as illustrated [here](../../pretrained/README.md).
41
+
42
+ ## 2. Audio Data Preparation
43
+
44
+ Prepare the reference audios and the generated audios in two folders: the `ref_dir` contains the reference audios and the `gen_dir` contains the generated audios. Here is an example.
45
+
46
+ ```plaintext
47
+ ┣ {ref_dir}
48
+ ┃ ┣ sample1.wav
49
+ ┃ ┣ sample2.wav
50
+ ┣ {gen_dir}
51
+ ┃ ┣ sample1.wav
52
+ ┃ ┣ sample2.wav
53
+ ```
54
+
55
+ You have to make sure that the pairwise **reference audio and generated audio are named the same**, as illustrated above (sample1 to sample1, sample2 to sample2).
56
+
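+ As a quick sanity check (an optional sketch; the two variables below are placeholders for your own folders), you can compare the file names of the two folders before running the evaluation:
+ 
+ ```bash
+ ref_dir=[Your path to the reference audios]
+ gen_dir=[Your path to the generated audios]
+ # Print any file names that appear in one folder but not the other.
+ diff <(ls "$ref_dir") <(ls "$gen_dir")
+ ```
+ 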
57
+ ## 3. Evaluation
58
+
59
+ Run `run.sh` with the specified reference folder, generated folder, dump folder, and metrics.
60
+
61
+ ```bash
62
+ cd Amphion
63
+ sh egs/metrics/run.sh \
64
+ --reference_folder [Your path to the reference audios] \
65
+ --generated_folder [Your path to the generated audios] \
66
+ --dump_folder [Your path to dump the objective results] \
67
+ --metrics [The metrics you need] \
68
+ --fs [Optional. To calculate all metrics in the specified sampling rate] \
69
+ --similarity_model [Optional. To choose the model for calculating the speaker similarity. Currently "rawnet", "wavlm" and "resemblyzer" are available. Default to "wavlm"] \
70
+ --similarity_mode [Optional. To choose the mode for calculating the speaker similarity. "pairwith" for calculating a series of ground truth / prediction audio pairs to obtain the speaker similarity, and "overall" for computing the average score with all possible pairs between the reference folder and generated folder. Default to "pairwith"] \
71
+ --intelligibility_mode [Optional. To choose the mode for computing CER and WER. "gt_audio" means selecting the recognition content of the reference audio as the target, "gt_content" means using transcription as the target. Default to "gt_audio"] \
72
+ --ltr_path [Optional. Path to the transcription file] \
73
+ --language [Optional. Language for computing CER and WER. Default to "english"]
74
+ ```
75
+
76
+ As for the metrics, an example is provided below:
77
+
78
+ ```bash
79
+ --metrics "mcd pesq fad"
80
+ ```
81
+
82
+ All currently available metrics keywords are listed below:
83
+
84
+ | Keys | Description |
85
+ | ------------------------- | ------------------------------------------ |
86
+ | `fpc` | F0 Pearson Coefficients |
87
+ | `f0_periodicity_rmse` | F0 Periodicity Root Mean Square Error |
88
+ | `f0rmse` | F0 Root Mean Square Error |
89
+ | `v_uv_f1` | Voiced/Unvoiced F1 Score |
90
+ | `energy_rmse` | Energy Root Mean Square Error |
91
+ | `energy_pc` | Energy Pearson Coefficients |
92
+ | `cer` | Character Error Rate |
93
+ | `wer` | Word Error Rate |
94
+ | `similarity` | Speaker Similarity |
95
+ | `fad` | Frechet Audio Distance |
96
+ | `mcd` | Mel Cepstral Distortion |
97
+ | `mstft` | Multi-Resolution STFT Distance |
98
+ | `pesq` | Perceptual Evaluation of Speech Quality |
99
+ | `si_sdr` | Scale Invariant Signal to Distortion Ratio |
100
+ | `si_snr` | Scale Invariant Signal to Noise Ratio |
101
+ | `stoi` | Short Time Objective Intelligibility |
102
+
103
+ For example, if you want to calculate the speaker similarity between the synthesized audio and the reference audio with the same content, run:
104
+
105
+ ```bash
106
+ sh egs/metrics/run.sh \
107
+ --reference_folder [Your path to the reference audios] \
108
+ --generated_folder [Your path to the generated audios] \
109
+ --dump_folder [Your path to dump the objective results] \
110
+ --metrics "similarity" \
111
+ --similarity_model [Optional. To choose the model for calculating the speaker similarity. Currently "rawnet", "wavlm" and "resemblyzer" are available. Default to "wavlm"] \
112
+ --similarity_mode "pairwith" \
113
+ ```
114
+
115
+ If you don't have reference audio with the same content, run the following to get the content-free similarity score:
116
+
117
+ ```bash
118
+ sh egs/metrics/run.sh \
119
+ --reference_folder [Your path to the reference audios] \
120
+ --generated_folder [Your path to the generated audios] \
121
+ --dump_folder [Your path to dump the objective results] \
122
+ --metrics "similarity" \
123
+ --similarity_model [Optional. To choose the model for calculating the speaker similarity. Currently "rawnet", "wavlm" and "resemblyzer" are available. Default to "wavlm"] \
124
+ --similarity_mode "overall" \
125
+ ```
126
+
127
+ ## Troubleshooting
128
+ ### FAD (Using Offline Models)
129
+ If your system is unable to access huggingface.co from the terminal, you might run into an error like "OSError: Can't load tokenizer for ...". To work around this, follow these steps to use local models:
130
+
131
+ 1. Download the [bert-base-uncased](https://huggingface.co/bert-base-uncased), [roberta-base](https://huggingface.co/roberta-base), and [facebook/bart-base](https://huggingface.co/facebook/bart-base) models from `huggingface.co`. Ensure that the models are complete and uncorrupted. Place these directories within `Amphion/pretrained`. For a detailed file structure reference, see [This README](../../pretrained/README.md#optional-model-dependencies-for-evaluation) under `Amphion/pretrained`.
132
+ 2. Inside the `Amphion/pretrained` directory, create a bash script with the content outlined below. This script will automatically update the tokenizer paths used by your system:
133
+ ```bash
134
+ #!/bin/bash
135
+
136
+ BERT_DIR="bert-base-uncased"
137
+ ROBERTA_DIR="roberta-base"
138
+ BART_DIR="facebook/bart-base"
139
+ PYTHON_SCRIPT="[YOUR ENV PATH]/lib/python3.9/site-packages/laion_clap/training/data.py"
140
+
141
+ update_tokenizer_path() {
142
+ local dir_name=$1
143
+ local tokenizer_variable=$2
144
+ local full_path
145
+
146
+ if [ -d "$dir_name" ]; then
147
+ full_path=$(realpath "$dir_name")
148
+ if [ -f "$PYTHON_SCRIPT" ]; then
149
+ sed -i "s|${tokenizer_variable}.from_pretrained(\".*\")|${tokenizer_variable}.from_pretrained(\"$full_path\")|" "$PYTHON_SCRIPT"
150
+ echo "Updated ${tokenizer_variable} path to $full_path."
151
+ else
152
+ echo "Error: The specified Python script does not exist."
153
+ exit 1
154
+ fi
155
+ else
156
+ echo "Error: The directory $dir_name does not exist in the current directory."
157
+ exit 1
158
+ fi
159
+ }
160
+
161
+ update_tokenizer_path "$BERT_DIR" "BertTokenizer"
162
+ update_tokenizer_path "$ROBERTA_DIR" "RobertaTokenizer"
163
+ update_tokenizer_path "$BART_DIR" "BartTokenizer"
164
+
165
+ echo "BERT, BART and RoBERTa Python script paths have been updated."
166
+
167
+ ```
168
+
169
+ 3. The script provided is intended to adjust the tokenizer paths in the `data.py` file, found under `/lib/python3.9/site-packages/laion_clap/training/`, within your specific environment. For those utilizing conda, you can determine your environment path by running `conda info --envs`. Then, substitute `[YOUR ENV PATH]` in the script with this path. If your environment is configured differently, you'll need to update the `PYTHON_SCRIPT` variable to correctly point to the `data.py` file.
170
+ 4. Run the script. If it executes successfully, the tokenizer paths will be updated, allowing them to be loaded locally.
171
+
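+ For reference, the whole workaround might look like the sketch below. This is only illustrative: the script name `update_tokenizer_paths.sh` is a placeholder for wherever you saved the script from step 2, and the environment path printed by `conda info --envs` will differ on your machine. If your terminal cannot reach huggingface.co at all, fetch the models on a machine that can and copy them into `Amphion/pretrained`.
+
+ ```bash
+ # On a machine that can reach huggingface.co (requires git-lfs), fetch the three models:
+ git lfs install
+ git clone https://huggingface.co/bert-base-uncased Amphion/pretrained/bert-base-uncased
+ git clone https://huggingface.co/roberta-base Amphion/pretrained/roberta-base
+ git clone https://huggingface.co/facebook/bart-base Amphion/pretrained/facebook/bart-base
+
+ # Find your Python environment path and fill it into PYTHON_SCRIPT (for conda users):
+ conda info --envs
+
+ # Save the script from step 2 as update_tokenizer_paths.sh (the name is arbitrary) and run it:
+ cd Amphion/pretrained
+ bash update_tokenizer_paths.sh
+ ```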
172
+ ### WavLM-based Speaker Similarity (Using Offline Models)
173
+
174
+ If your system is unable to access huggingface.co from the terminal and you want to calculate `WavLM` based speaker similarity, you need to download the pretrained model first, as illustrated [here](../../pretrained/README.md).
egs/metrics/run.sh ADDED
@@ -0,0 +1,132 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ ######## Build Experiment Environment ###########
7
+ exp_dir=$(cd `dirname $0`; pwd)
8
+ work_dir=$(dirname $(dirname $exp_dir))
9
+
10
+ export WORK_DIR=$work_dir
11
+ export PYTHONPATH=$work_dir
12
+ export PYTHONIOENCODING=UTF-8
13
+
14
+ ######## Parse the Given Parameters from the Command ###########
15
+ options=$(getopt -o c:n:s --long gpu:,reference_folder:,generated_folder:,dump_folder:,metrics:,fs:,align_method:,energy_db_scale:,f0_subtract_mean:,similarity_model:,similarity_mode:,ltr_path:,intelligibility_mode:,language: -- "$@")
16
+ eval set -- "$options"
17
+
18
+ while true; do
19
+ case $1 in
20
+ # Visible GPU machines. The default value is "0".
21
+ --gpu) shift; gpu=$1 ; shift ;;
22
+ # Reference Audio Folder
23
+ --reference_folder) shift; ref_dir=$1 ; shift ;;
24
+ # Generated Audio Folder
25
+ --generated_folder) shift; deg_dir=$1 ; shift ;;
26
+ # Result Dumping Folder
27
+ --dump_folder) shift; dump_dir=$1 ; shift ;;
28
+ # Metrics to Compute
29
+ --metrics) shift; metrics=$1 ; shift ;;
30
+ # Sampling Rate
31
+ --fs) shift; fs=$1 ; shift ;;
32
+
33
+ # Method for aligning F0. The default value is "dtw"
34
+ --align_method) shift; align_method=$1 ; shift ;;
35
+ # Method for normalizing F0. The default value is "True"
36
+ --f0_subtract_mean) shift; f0_subtract_mean=$1 ; shift ;;
37
+ # Method for normalizing Energy. The default value is "True"
38
+ --energy_db_scale) shift; energy_db_scale=$1 ; shift ;;
39
+
40
+ # Model for computing speaker similarity. The default value is "wavlm"
41
+ --similarity_model) shift; similarity_model=$1 ; shift ;;
42
+ # Mode for computing speaker similarity. The default value is "pairwith"
43
+ --similarity_mode) shift; similarity_mode=$1 ; shift ;;
44
+
45
+ # Path for the transcript.
46
+ --ltr_path) shift; ltr_path=$1 ; shift ;;
47
+ # Mode for computing CER and WER. The default value is "gt_audio"
48
+ --intelligibility_mode) shift; intelligibility_mode=$1 ; shift ;;
49
+ # Language for computing CER and WER. The default value is "english"
50
+ --language) shift; language=$1 ; shift ;;
51
+
52
+ --) shift ; break ;;
53
+ *) echo "Invalid option: $1"; exit 1 ;;
54
+ esac
55
+ done
56
+
57
+ ### Value check ###
58
+ if [ -z "$ref_dir" ]; then
59
+ echo "[Error] Please specify the reference_folder"
60
+ exit 1
61
+ fi
62
+
63
+ if [ -z "$deg_dir" ]; then
64
+ echo "[Error] Please specify the generated_folder"
65
+ exit 1
66
+ fi
67
+
68
+ if [ -z "$dump_dir" ]; then
69
+ echo "[Error] Please specify the dump_folder"
70
+ exit 1
71
+ fi
72
+
73
+ if [ -z "$metrics" ]; then
74
+ echo "[Error] Please specify the metrics"
75
+ exit 1
76
+ fi
77
+
78
+ if [ -z "$gpu" ]; then
79
+ gpu="0"
80
+ fi
81
+
82
+ if [ -z "$fs" ]; then
83
+ fs="None"
84
+ fi
85
+
86
+ if [ -z "$align_method" ]; then
87
+ align_method="dtw"
88
+ fi
89
+
90
+ if [ -z "$energy_db_scale" ]; then
91
+ energy_db_scale="True"
92
+ fi
93
+
94
+ if [ -z "$f0_subtract_mean" ]; then
95
+ f0_subtract_mean="True"
96
+ fi
97
+
98
+ if [ -z "$similarity_model" ]; then
99
+ similarity_model="wavlm"
100
+ fi
101
+
102
+ if [ -z "$similarity_mode" ]; then
103
+ similarity_mode="pairwith"
104
+ fi
105
+
106
+ if [ -z "$ltr_path" ]; then
107
+ ltr_path="None"
108
+ fi
109
+
110
+ if [ -z "$intelligibility_mode" ]; then
111
+ intelligibility_mode="gt_audio"
112
+ fi
113
+
114
+ if [ -z "$language" ]; then
115
+ language="english"
116
+ fi
117
+
118
+ ######## Calculate Objective Metrics ###########
119
+ CUDA_VISIBLE_DEVICES=$gpu python "$work_dir"/bins/calc_metrics.py \
120
+ --ref_dir $ref_dir \
121
+ --deg_dir $deg_dir \
122
+ --dump_dir $dump_dir \
123
+ --metrics $metrics \
124
+ --fs $fs \
125
+ --align_method $align_method \
126
+ --db_scale $energy_db_scale \
127
+ --f0_subtract_mean $f0_subtract_mean \
128
+ --similarity_model $similarity_model \
129
+ --similarity_mode $similarity_mode \
130
+ --ltr_path $ltr_path \
131
+ --intelligibility_mode $intelligibility_mode \
132
+ --language $language
egs/svc/DiffComoSVC/README.md ADDED
@@ -0,0 +1,234 @@
1
+ # Accelerating Diffusion-based Singing Voice Conversion through Consistency Distillation
2
+ <br>
3
+ <div align="center">
4
+ <img src="../../../imgs/svc/DiffComoSVC.png" width="90%">
5
+ </div>
6
+ <br>
7
+
8
+ This is an implementation of [Consistency Models](https://arxiv.org/abs/2303.01469) for accelerating diffusion-based singing voice conversion. The overall architecture follows "[Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion](https://arxiv.org/abs/2310.11160)" (NeurIPS 2023 Workshop on Machine Learning for Audio), with only a slight modification applied to the acoustic model. Specifically,
9
+
10
+ * The acoustic model consists of a conformer, which generates a coarse spectrogram, and a diffusion decoder based on a Bidirectional Non-Causal Dilated CNN, which refines that coarse spectrogram. This is similar to [CoMoSpeech: One-Step Speech and Singing Voice Synthesis via Consistency Model](https://comospeech.github.io/)
11
+ * To accelerate the diffusion model, we apply consistency distillation from [Consistency Models](https://arxiv.org/abs/2303.01469). For the teacher model, the diffusion schedule of the diffusion decoder follows [Karras diffusion](https://arxiv.org/abs/2206.00364). When distilling the teacher model, the condition encoder and the conformer part of the acoustic model are frozen, while the diffusion decoder is updated via exponential moving average. See the figure above for details.
12
+
13
+ There are five stages in total:
14
+
15
+ 1. Data preparation
16
+ 2. Features extraction
17
+ 3. Teacher Model Training
18
+ 4. Consistency Distillation
19
+ 5. Inference/conversion
20
+
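+ The commands for each stage are detailed in the sections below. As a condensed, illustrative sketch of the whole pipeline (all bracketed values are placeholders, and `[TeacherExptName]`/`[StudentExptName]` are hypothetical experiment names; the training stage is run twice, first with `"distill": false` for the teacher and then again after setting `"distill": true` and `teacher_model_path` in the config):
+
+ ```bash
+ cd Amphion
+ # Stages 1-2: data preparation and features extraction
+ sh egs/svc/DiffComoSVC/run.sh --stage 1
+ # Stage 3: teacher model training ("distill": false in the config)
+ sh egs/svc/DiffComoSVC/run.sh --stage 2 --name [TeacherExptName]
+ # Stage 4: consistency distillation ("distill": true, "teacher_model_path" set)
+ sh egs/svc/DiffComoSVC/run.sh --stage 2 --name [StudentExptName]
+ # Stage 5: inference/conversion
+ sh egs/svc/DiffComoSVC/run.sh --stage 3 --gpu "0" \
+     --infer_expt_dir [Your path to save logs and checkpoints]/[StudentExptName] \
+     --infer_output_dir [Your path to save logs and checkpoints]/[StudentExptName]/result \
+     --infer_source_audio_dir [Your Audios Folder] \
+     --infer_target_speaker "opencpop_female1" \
+     --infer_key_shift "autoshift"
+ ```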
21
+ ## 1. Data Preparation
22
+
23
+ ### Dataset Download
24
+
25
+ By default, we utilize the five datasets for training: M4Singer, Opencpop, OpenSinger, SVCC, and VCTK. How to download them is detailed [here](../../datasets/README.md).
26
+
27
+ ### Configuration
28
+
29
+ Specify the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets.
30
+
31
+ ```json
32
+ "dataset": [
33
+ "m4singer",
34
+ "opencpop",
35
+ "opensinger",
36
+ "svcc",
37
+ "vctk"
38
+ ],
39
+ "dataset_path": {
40
+ // TODO: Fill in your dataset path
41
+ "m4singer": "[M4Singer dataset path]",
42
+ "opencpop": "[Opencpop dataset path]",
43
+ "opensinger": "[OpenSinger dataset path]",
44
+ "svcc": "[SVCC dataset path]",
45
+ "vctk": "[VCTK dataset path]"
46
+ },
47
+ ```
48
+
49
+ ## 2. Features Extraction
50
+
51
+ ### Content-based Pretrained Models Download
52
+
53
+ By default, we utilize the Whisper and ContentVec to extract content features. How to download them is detailed [here](../../../pretrained/README.md).
54
+
55
+ ### Configuration
56
+
57
+ Specify the dataset path and the output path for saving the processed data and the training model in `exp_config.json`:
58
+
59
+ ```json
60
+ // TODO: Fill in the output log path
61
+ "log_dir": "[Your path to save logs and checkpoints]",
62
+ "preprocess": {
63
+ // TODO: Fill in the output data path
64
+ "processed_dir": "[Your path to save processed data]",
65
+ ...
66
+ },
67
+ ```
68
+
69
+ ### Run
70
+
71
+ Run the `run.sh` as the preprocessing stage (set `--stage 1`).
72
+
73
+ ```bash
74
+ cd Amphion
75
+ sh egs/svc/DiffComoSVC/run.sh --stage 1
76
+ ```
77
+
78
+ Note: `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh`, e.g., by specifying `--gpu "1"`.
79
+
80
+ ## 3. Teacher Model Training
81
+
82
+ ### Configuration
83
+
84
+ Set the `distill` in `config/comosvc.json` to `false` for teacher model training. You can also specify the detailed configuration for the conformer encoder and the diffusion process here:
85
+
86
+ ```JSON
87
+ "comosvc":{
88
+ "distill": false,
89
+ // conformer encoder
90
+ "input_dim": 384,
91
+ "output_dim": 100,
92
+ "n_heads": 2,
93
+ "n_layers": 6,
94
+ "filter_channels":512,
95
+ // karras diffusion
96
+ "P_mean": -1.2,
97
+ "P_std": 1.2,
98
+ "sigma_data": 0.5,
99
+ "sigma_min": 0.002,
100
+ "sigma_max": 80,
101
+ "rho": 7,
102
+ "n_timesteps": 40,
103
+ },
104
+ ```
105
+
106
+ We provide the default hyperparameters in `exp_config.json`. They can work on a single 24 GB NVIDIA GPU. You can adjust them based on your GPU machines.
107
+
108
+ ```json
109
+ "train": {
110
+ "batch_size": 32,
111
+ ...
112
+ "adamw": {
113
+ "lr": 2.0e-4
114
+ },
115
+ ...
116
+ }
117
+ ```
118
+
119
+ ### Run
120
+
121
+ Run the `run.sh` as the training stage (set `--stage 2`). Specify an experiment name to run the following command. The TensorBoard logs and checkpoints will be saved in `[Your path to save logs and checkpoints]/[YourExptName]`.
122
+
123
+ ```bash
124
+ cd Amphion
125
+ sh egs/svc/DiffComoSVC/run.sh --stage 2 --name [YourExptName]
126
+ ```
127
+
128
+ Note: `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can specify it when running `run.sh`, for example:
129
+
130
+ ```bash
131
+ cd Amphion
132
+ sh egs/svc/DiffComoSVC/run.sh --stage 2 --name [YourExptName] --gpu "0,1,2,3"
133
+ ```
134
+
135
+ ## 4. Consistency Distillation
136
+
137
+ ### Configuration
138
+
139
+ Set the `distill` in `config/comosvc.json` to `true` for consistency distillation, and specify the `teacher_model_path` pointing to the trained teacher checkpoint. You can also specify the detailed configuration for the conformer encoder and the diffusion process here:
140
+
141
+ ```JSON
142
+ "model": {
143
+ "teacher_model_path":"[Your_teacher_model_checkpoint].bin",
144
+ ...
145
+ "comosvc":{
146
+ "distill": true,
147
+ // conformer encoder
148
+ "input_dim": 384,
149
+ "output_dim": 100,
150
+ "n_heads": 2,
151
+ "n_layers": 6,
152
+ "filter_channels":512,
153
+ // karras diffusion
154
+ "P_mean": -1.2,
155
+ "P_std": 1.2,
156
+ "sigma_data": 0.5,
157
+ "sigma_min": 0.002,
158
+ "sigma_max": 80,
159
+ "rho": 7,
160
+ "n_timesteps": 40,
161
+ },
162
+ ```
163
+
164
+ We provide the default hyperparameters in `exp_config.json`. They can work on a single 24 GB NVIDIA GPU. You can adjust them based on your GPU machines.
165
+
166
+ ```json
167
+ "train": {
168
+ "batch_size": 32,
169
+ ...
170
+ "adamw": {
171
+ "lr": 2.0e-4
172
+ },
173
+ ...
174
+ }
175
+ ```
176
+
177
+ ### Run
178
+
179
+ Run the `run.sh` as the training stage (set `--stage 2`). Specify an experiment name to run the following command. The TensorBoard logs and checkpoints will be saved in `[Your path to save logs and checkpoints]/[YourExptName]`.
180
+
181
+ ```bash
182
+ cd Amphion
183
+ sh egs/svc/DiffComoSVC/run.sh --stage 2 --name [YourExptName]
184
+ ```
185
+
186
+ Note: `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can specify it when running `run.sh`, for example:
187
+
188
+ ```bash
189
+ cd Amphion
190
+ sh egs/svc/DiffComoSVC/run.sh --stage 2 --name [YourExptName] --gpu "0,1,2,3"
191
+ ```
192
+
193
+ ## 5. Inference/Conversion
194
+
195
+ ### Pretrained Vocoder Download
196
+
197
+ We fine-tune the official BigVGAN pretrained model with over 120 hours of singing voice data. The benefits of fine-tuning have been investigated in our paper (see this [demo page](https://www.zhangxueyao.com/data/MultipleContentsSVC/vocoder.html)). The final pretrained singing voice vocoder is released [here](../../../pretrained/README.md#amphion-singing-bigvgan) (called `Amphion Singing BigVGAN`).
198
+
199
+ ### Run
200
+
201
+ For inference/conversion, you need to specify the following configurations when running `run.sh`:
202
+
203
+ | Parameters | Description | Example |
204
+ | --------------------------------------------------- | ------------------------------------------------------------ | ------------------------------------------------------------ |
205
+ | `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `[Your path to save logs and checkpoints]/[YourExptName]` |
206
+ | `--infer_output_dir` | The output directory to save inferred audios. | `[Your path to save logs and checkpoints]/[YourExptName]/result` |
207
+ | `--infer_source_file` or `--infer_source_audio_dir` | The inference source (can be a json file or a dir). | The `infer_source_file` could be `[Your path to save processed data]/[YourDataset]/test.json`, and the `infer_source_audio_dir` is a folder which includes several audio files (*.wav, *.mp3 or *.flac). |
208
+ | `--infer_target_speaker` | The target speaker you want to convert into. You can refer to `[Your path to save logs and checkpoints]/[YourExptName]/singers.json` to choose a trained speaker. | For opencpop dataset, the speaker name would be `opencpop_female1`. |
209
+ | `--infer_key_shift` | How many semitones you want to transpose. | `"autoshift"` (by default), `3`, `-3`, etc. |
210
+
211
+ For example, if you want to make `opencpop_female1` sing the songs in the `[Your Audios Folder]`, just run:
212
+
213
+ ```bash
214
+ cd Amphion
215
+ sh egs/svc/DiffComoSVC/run.sh --stage 3 --gpu "0" \
216
+ --infer_expt_dir [Your path to save logs and checkpoints]/[YourExptName] \
217
+ --infer_output_dir [Your path to save logs and checkpoints]/[YourExptName]/result \
218
+ --infer_source_audio_dir [Your Audios Folder] \
219
+ --infer_target_speaker "opencpop_female1" \
220
+ --infer_key_shift "autoshift"
221
+ ```
222
+ In particular, you can configure the inference steps for the teacher model by setting `inference` in `exp_config` (the student model always uses one-step sampling):
223
+ ```json
224
+ "inference": {
225
+ "comosvc": {
226
+ "inference_steps": 40
227
+ }
228
+ }
229
+ ```
230
+
231
+ # Reference
232
+ https://github.com/zhenye234/CoMoSpeech
233
+
234
+ https://github.com/openai/consistency_models
egs/svc/DiffComoSVC/exp_config.json ADDED
@@ -0,0 +1,143 @@
1
+ {
2
+ "base_config": "config/comosvc.json",
3
+ "model_type": "DiffComoSVC",
4
+ "dataset": [
5
+ "m4singer",
6
+ "opencpop",
7
+ "opensinger",
8
+ "svcc",
9
+ "vctk"
10
+ ],
11
+ "dataset_path": {
12
+ // TODO: Fill in your dataset path
13
+ "m4singer": "[M4Singer dataset path]",
14
+ "opencpop": "[Opencpop dataset path]",
15
+ "opensinger": "[OpenSinger dataset path]",
16
+ "svcc": "[SVCC dataset path]",
17
+ "vctk": "[VCTK dataset path]"
18
+ },
19
+ // TODO: Fill in the output log path
20
+ "log_dir": "[Your path to save logs and checkpoints]",
21
+ "preprocess": {
22
+ // TODO: Fill in the output data path
23
+ "processed_dir": "[Your path to save processed data]",
24
+ // Config for features extraction
25
+ "extract_mel": true,
26
+ "extract_pitch": true,
27
+ "extract_energy": true,
28
+ "extract_whisper_feature": true,
29
+ "extract_contentvec_feature": true,
30
+ "extract_wenet_feature": false,
31
+ "whisper_batch_size": 30, // decrease it if your GPU is out of memory
32
+ "contentvec_batch_size": 1,
33
+ // Fill in the content-based pretrained model's path
34
+ "contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt",
35
+ "wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt",
36
+ "wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml",
37
+ "whisper_model": "medium",
38
+ "whisper_model_path": "pretrained/whisper/medium.pt",
39
+ // Config for features usage
40
+ "use_mel": true,
41
+ "use_min_max_norm_mel": true,
42
+ "use_frame_pitch": true,
43
+ "use_frame_energy": true,
44
+ "use_spkid": true,
45
+ "use_whisper": true,
46
+ "use_contentvec": true,
47
+ "use_wenet": false,
48
+ "n_mel": 100,
49
+ "sample_rate": 24000
50
+ },
51
+ "model": {
52
+ "teacher_model_path":"[Your_teacher_model_checkpoint].bin",
53
+ "condition_encoder": {
54
+ // Config for features usage
55
+ "use_whisper": true,
56
+ "use_contentvec": true,
57
+ "use_wenet": false,
58
+ "whisper_dim": 1024,
59
+ "contentvec_dim": 256,
60
+ "wenet_dim": 512,
61
+ "use_singer_encoder": false,
62
+ "pitch_min": 50,
63
+ "pitch_max": 1100
64
+ },
65
+ "comosvc":{
66
+ "distill": false,
67
+ // conformer encoder
68
+ "input_dim": 384,
69
+ "output_dim": 100,
70
+ "n_heads": 2,
71
+ "n_layers": 6,
72
+ "filter_channels":512,
73
+ "dropout":0.1,
74
+ // karras diffusion
75
+ "P_mean": -1.2,
76
+ "P_std": 1.2,
77
+ "sigma_data": 0.5,
78
+ "sigma_min": 0.002,
79
+ "sigma_max": 80,
80
+ "rho": 7,
81
+ "n_timesteps": 40,
82
+ },
83
+ "diffusion": {
84
+ // Diffusion steps encoder
85
+ "step_encoder": {
86
+ "dim_raw_embedding": 128,
87
+ "dim_hidden_layer": 512,
88
+ "activation": "SiLU",
89
+ "num_layer": 2,
90
+ "max_period": 10000
91
+ },
92
+ // Diffusion decoder
93
+ "model_type": "bidilconv",
94
+ // bidilconv, unet2d, TODO: unet1d
95
+ "bidilconv": {
96
+ "base_channel": 384,
97
+ "n_res_block": 20,
98
+ "conv_kernel_size": 3,
99
+ "dilation_cycle_length": 4,
100
+ // specially, 1 means no dilation
101
+ "conditioner_size": 100
102
+ }
103
+ }
104
+ },
105
+ "train": {
106
+ "batch_size": 64,
107
+ "gradient_accumulation_step": 1,
108
+ "max_epoch": -1, // -1 means no limit
109
+ "save_checkpoint_stride": [
110
+ 50,
111
+ 50
112
+ ],
113
+ "keep_last": [
114
+ 5,
115
+ -1
116
+ ],
117
+ "run_eval": [
118
+ false,
119
+ true
120
+ ],
121
+ "adamw": {
122
+ "lr": 4.0e-4
123
+ },
124
+ "reducelronplateau": {
125
+ "factor": 0.8,
126
+ "patience": 10,
127
+ "min_lr": 1.0e-4
128
+ },
129
+ "dataloader": {
130
+ "num_worker": 8,
131
+ "pin_memory": true
132
+ },
133
+ "sampler": {
134
+ "holistic_shuffle": false,
135
+ "drop_last": true
136
+ }
137
+ },
138
+ "inference": {
139
+ "comosvc": {
140
+ "inference_steps": 40
141
+ }
142
+ }
143
+ }
egs/svc/DiffComoSVC/run.sh ADDED
@@ -0,0 +1 @@
1
+ ../_template/run.sh
egs/svc/MultipleContentsSVC/README.md ADDED
@@ -0,0 +1,248 @@
1
+ # Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion
2
+
3
+ [![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2310.11160)
4
+ [![demo](https://img.shields.io/badge/SVC-Demo-red)](https://www.zhangxueyao.com/data/MultipleContentsSVC/index.html)
5
+ [![hf](https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Models-pink)](https://huggingface.co/amphion/singing_voice_conversion)
6
+ [![hf](https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Spaces-yellow)](https://huggingface.co/spaces/amphion/singing_voice_conversion)
7
+ [![openxlab](https://cdn-static.openxlab.org.cn/app-center/openxlab_app.svg)](https://openxlab.org.cn/apps/detail/Amphion/singing_voice_conversion)
8
+
9
+ <br>
10
+ <div align="center">
11
+ <img src="../../../imgs/svc/MultipleContentsSVC.png" width="85%">
12
+ </div>
13
+ <br>
14
+
15
+ This is the official implementation of the paper "[Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion](https://arxiv.org/abs/2310.11160)" (NeurIPS 2023 Workshop on Machine Learning for Audio). Specially,
16
+
17
+ - The muptile content features are from [Whipser](https://github.com/wenet-e2e/wenet) and [ContentVec](https://github.com/auspicious3000/contentvec).
18
+ - The acoustic model is based on Bidirectional Non-Causal Dilated CNN (called `DiffWaveNetSVC` in Amphion), which is similar to [WaveNet](https://arxiv.org/pdf/1609.03499.pdf), [DiffWave](https://openreview.net/forum?id=a-xFK8Ymz5J), and [DiffSVC](https://ieeexplore.ieee.org/document/9688219).
19
+ - The vocoder is [BigVGAN](https://github.com/NVIDIA/BigVGAN) architecture and we fine-tuned it in over 120 hours singing voice data.
20
+
21
+ ## A Little Taste Before Getting Started
22
+
23
+ Before you delve into the code, we suggest exploring the interactive DEMO we've provided for a comprehensive overview. There are several ways you can engage with it:
24
+
25
+ 1. **Online DEMO**
26
+
27
+ | HuggingFace | OpenXLab |
28
+ | :----------------------------------------------------------: | :----------------------------------------------------------: |
29
+ | [![hf](https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Spaces-yellow)](https://huggingface.co/spaces/amphion/singing_voice_conversion)<br />(Worldwide) | [![openxlab](https://cdn-static.openxlab.org.cn/app-center/openxlab_app.svg)](https://openxlab.org.cn/apps/detail/Amphion/singing_voice_conversion)<br />(Suitable for Mainland China Users) |
30
+
31
+ 2. **Run Local Gradio DEMO**
32
+
33
+ | Run with Docker | Duplicate Space with Private GPU |
34
+ | :----------------------------------------------------------: | :----------------------------------------------------------: |
35
+ | [![hf](https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Spaces-yellow)](https://huggingface.co/spaces/amphion/singing_voice_conversion?docker=true) | [![hf](https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Spaces-yellow)](https://huggingface.co/spaces/amphion/singing_voice_conversion?duplicate=true) |
36
+
37
+ 3. **Run with the Extended Colab**
38
+
39
+ You can check out [this repo](https://github.com/camenduru/singing-voice-conversion-colab) to run it with Colab. Thanks to [@camenduru](https://x.com/camenduru?s=20) and the community for their support!
40
+
41
+ ## Usage Overview
42
+
43
+ To train a `DiffWaveNetSVC` model, there are four stages in total:
44
+
45
+ 1. Data preparation
46
+ 2. Features extraction
47
+ 3. Training
48
+ 4. Inference/conversion
49
+
50
+ > **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
51
+ > ```bash
52
+ > cd Amphion
53
+ > ```
54
+
55
+ ## 1. Data Preparation
56
+
57
+ ### Dataset Download
58
+
59
+ By default, we utilize the five datasets for training: M4Singer, Opencpop, OpenSinger, SVCC, and VCTK. How to download them is detailed [here](../../datasets/README.md).
60
+
61
+ ### Configuration
62
+
63
+ Specify the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets.
64
+
65
+ ```json
66
+ "dataset": [
67
+ "m4singer",
68
+ "opencpop",
69
+ "opensinger",
70
+ "svcc",
71
+ "vctk"
72
+ ],
73
+ "dataset_path": {
74
+ // TODO: Fill in your dataset path
75
+ "m4singer": "[M4Singer dataset path]",
76
+ "opencpop": "[Opencpop dataset path]",
77
+ "opensinger": "[OpenSinger dataset path]",
78
+ "svcc": "[SVCC dataset path]",
79
+ "vctk": "[VCTK dataset path]"
80
+ },
81
+ ```
82
+
83
+ ### Custom Dataset
84
+
85
+ We support custom dataset, see [here](../../datasets/README.md#customsvcdataset) for the file structure to follow.
86
+
87
+ After constructing the proper file structure, specify your dataset name in `dataset` and its path in `dataset_path`, and also add its name to `use_custom_dataset`:
88
+
89
+ ```json
90
+ "dataset": [
91
+ "[Exisiting Dataset Name]",
92
+ //...
93
+ "[Your Custom Dataset Name]"
94
+ ],
95
+ "dataset_path": {
96
+ "[Exisiting Dataset Name]": "[Exisiting Dataset Path]",
97
+ //...
98
+ "[Your Custom Dataset Name]": "[Your Custom Dataset Path]"
99
+ },
100
+ "use_custom_dataset": [
101
+ "[Your Custom Dataset Name]"
102
+ ],
103
+ ```
104
+
105
+ > **NOTE:** Custom dataset name does not have to be the same as the folder name. But it needs to satisfy these rules:
106
+ > 1. It can not be the same as the exisiting dataset name.
107
+ > 2. It can not contain any space or underline(`_`).
108
+ > 3. It must be a valid folder name for operating system.
109
+ >
110
+ > Some examples of valid custom dataset names are `mydataset`, `myDataset`, `my-dataset`, `mydataset1`, `my-dataset-1`, etc.
111
+
112
+ ## 2. Features Extraction
113
+
114
+ ### Content-based Pretrained Models Download
115
+
116
+ By default, we utilize the Whisper and ContentVec to extract content features. How to download them is detailed [here](../../../pretrained/README.md).
117
+
118
+ ### Configuration
119
+
120
+ Specify the dataset path and the output path for saving the processed data and the training model in `exp_config.json`:
121
+
122
+ ```json
123
+ // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc"
124
+ "log_dir": "ckpts/svc",
125
+ "preprocess": {
126
+ // TODO: Fill in the output data path. The default value is "Amphion/data"
127
+ "processed_dir": "data",
128
+ ...
129
+ },
130
+ ```
131
+
132
+ ### Run
133
+
134
+ Run the `run.sh` as the preprocessing stage (set `--stage 1`).
135
+
136
+ ```bash
137
+ sh egs/svc/MultipleContentsSVC/run.sh --stage 1
138
+ ```
139
+
140
+ > **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh`, e.g., by specifying `--gpu "1"`.
141
+
142
+ ## 3. Training
143
+
144
+ ### Configuration
145
+
146
+ We provide the default hyperparameters in `exp_config.json`. They can work on a single 24 GB NVIDIA GPU. You can adjust them based on your GPU machines.
147
+
148
+ ```json
149
+ "train": {
150
+ "batch_size": 32,
151
+ ...
152
+ "adamw": {
153
+ "lr": 2.0e-4
154
+ },
155
+ ...
156
+ }
157
+ ```
158
+
159
+ ### Train From Scratch
160
+
161
+ Run the `run.sh` as the training stage (set `--stage 2`). Specify an experiment name to run the following command. The TensorBoard logs and checkpoints will be saved in `Amphion/ckpts/svc/[YourExptName]`.
162
+
163
+ ```bash
164
+ sh egs/svc/MultipleContentsSVC/run.sh --stage 2 --name [YourExptName]
165
+ ```
166
+
167
+ ### Train From Existing Source
168
+
169
+ We support training from an existing source for various purposes. You can resume training the model from a checkpoint or fine-tune a model from another checkpoint.
170
+
171
+ With `--resume true`, training will resume from the **latest checkpoint** by default. For example, if you want to resume training from the latest checkpoint in `Amphion/ckpts/svc/[YourExptName]/checkpoint`, run:
172
+
173
+ ```bash
174
+ sh egs/svc/MultipleContentsSVC/run.sh --stage 2 --name [YourExptName] \
175
+ --resume true
176
+ ```
177
+
178
+ You can choose a **specific checkpoint** for retraining with the `--resume_from_ckpt_path` argument. For example, if you want to fine-tune from the checkpoint `Amphion/ckpts/svc/[YourExptName]/checkpoint/[SpecificCheckpoint]`, run:
179
+
180
+ ```bash
181
+ sh egs/svc/MultipleContentsSVC/run.sh --stage 2 --name [YourExptName] \
182
+ --resume true \
183
+ --resume_from_ckpt_path "Amphion/ckpts/svc/[YourExptName]/checkpoint/[SpecificCheckpoint]"
184
+ ```
185
+
186
+ If you want to **fine-tune from another checkpoint**, just use `--resume_type` and set it to `"finetune"`. For example, If you want to fine-tune from the checkpoint `Amphion/ckpts/svc/[AnotherExperiment]/checkpoint/[SpecificCheckpoint]`, run:
187
+
188
+ ```bash
189
+ sh egs/svc/MultipleContentsSVC/run.sh --stage 2 --name [YourExptName] \
190
+ --resume true
191
+ --resume_from_ckpt_path "Amphion/ckpts/svc/[AnotherExperiment]/checkpoint/[SpecificCheckpoint]" \
192
+ --resume_type "finetune"
193
+ ```
194
+
195
+ > **NOTE:** The `--resume_type` is set to `"resume"` by default. It's not necessary to specify it when resuming training.
196
+ >
197
+ > The difference between `"resume"` and `"finetune"` is that the `"finetune"` will **only** load the pretrained model weights from the checkpoint, while the `"resume"` will load all the training states (including optimizer, scheduler, etc.) from the checkpoint.
198
+
199
+ Here are some example scenarios to better understand how to use these arguments:
200
+ | Scenario | `--resume` | `--resume_from_ckpt_path` | `--resume_type` |
201
+ | ------ | -------- | ----------------------- | ------------- |
202
+ | You want to train from scratch | no | no | no |
203
+ | The machine breaks down during training and you want to resume training from the latest checkpoint | `true` | no | no |
204
+ | You find the latest model is overfitting and you want to re-train from the checkpoint before | `true` | `SpecificCheckpoint Path` | no |
205
+ | You want to fine-tune a model from another checkpoint | `true` | `SpecificCheckpoint Path` | `"finetune"` |
206
+
207
+
208
+ > **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh`, e.g., by specifying `--gpu "0,1,2,3"`.
209
+
210
+ ## 4. Inference/Conversion
211
+
212
+ ### Pretrained Vocoder Download
213
+
214
+ We fine-tune the official BigVGAN pretrained model with over 120 hours of singing voice data. The benefits of fine-tuning have been investigated in our paper (see this [demo page](https://www.zhangxueyao.com/data/MultipleContentsSVC/vocoder.html)). The final pretrained singing voice vocoder is released [here](../../../pretrained/README.md#amphion-singing-bigvgan) (called `Amphion Singing BigVGAN`).
215
+
216
+ ### Run
217
+
218
+ For inference/conversion, you need to specify the following configurations when running `run.sh`:
219
+
220
+ | Parameters | Description | Example |
221
+ | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
222
+ | `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `Amphion/ckpts/svc/[YourExptName]` |
223
+ | `--infer_output_dir` | The output directory to save inferred audios. | `Amphion/ckpts/svc/[YourExptName]/result` |
224
+ | `--infer_source_file` or `--infer_source_audio_dir` | The inference source (can be a json file or a dir). | The `infer_source_file` could be `Amphion/data/[YourDataset]/test.json`, and the `infer_source_audio_dir` is a folder which includes several audio files (*.wav, *.mp3 or *.flac). |
225
+ | `--infer_target_speaker` | The target speaker you want to convert into. You can refer to `Amphion/ckpts/svc/[YourExptName]/singers.json` to choose a trained speaker. | For opencpop dataset, the speaker name would be `opencpop_female1`. |
226
+ | `--infer_key_shift` | How many semitones you want to transpose. | `"autoshift"` (by default), `3`, `-3`, etc. |
227
+
228
+ For example, if you want to make `opencpop_female1` sing the songs in the `[Your Audios Folder]`, just run:
229
+
230
+ ```bash
231
+ sh egs/svc/MultipleContentsSVC/run.sh --stage 3 --gpu "0" \
232
+ --infer_expt_dir ckpts/svc/[YourExptName] \
233
+ --infer_output_dir ckpts/svc/[YourExptName]/result \
234
+ --infer_source_audio_dir [Your Audios Folder] \
235
+ --infer_target_speaker "opencpop_female1" \
236
+ --infer_key_shift "autoshift"
237
+ ```
238
+
239
+ ## Citations
240
+
241
+ ```bibtex
242
+ @article{zhang2023leveraging,
243
+ title={Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion},
244
+ author={Zhang, Xueyao and Gu, Yicheng and Chen, Haopeng and Fang, Zihao and Zou, Lexiao and Xue, Liumeng and Wu, Zhizheng},
245
+ journal={Machine Learning for Audio Workshop, NeurIPS 2023},
246
+ year={2023}
247
+ }
248
+ ```
egs/svc/MultipleContentsSVC/exp_config.json ADDED
@@ -0,0 +1,127 @@
1
+ {
2
+ "base_config": "config/svc/diffusion.json",
3
+ "model_type": "DiffWaveNetSVC",
4
+ "dataset": [
5
+ "m4singer",
6
+ "opencpop",
7
+ "opensinger",
8
+ "svcc",
9
+ "vctk"
10
+ ],
11
+ "dataset_path": {
12
+ // TODO: Fill in your dataset path
13
+ "m4singer": "[M4Singer dataset path]",
14
+ "opencpop": "[Opencpop dataset path]",
15
+ "opensinger": "[OpenSinger dataset path]",
16
+ "svcc": "[SVCC dataset path]",
17
+ "vctk": "[VCTK dataset path]"
18
+ },
19
+ // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc"
20
+ "log_dir": "ckpts/svc",
21
+ "preprocess": {
22
+ // TODO: Fill in the output data path. The default value is "Amphion/data"
23
+ "processed_dir": "data",
24
+ // Config for features extraction
25
+ "features_extraction_mode": "offline", // Online or offline features extraction ("offline" or "online")
26
+ "extract_mel": true,
27
+ "extract_pitch": true,
28
+ "extract_energy": true,
29
+ "extract_whisper_feature": true,
30
+ "extract_contentvec_feature": true,
31
+ "extract_wenet_feature": false,
32
+ "whisper_batch_size": 30, // decrease it if your GPU is out of memory
33
+ "contentvec_batch_size": 1,
34
+ // Fill in the content-based pretrained model's path
35
+ "contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt",
36
+ "wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt",
37
+ "wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml",
38
+ "whisper_model": "medium",
39
+ "whisper_model_path": "pretrained/whisper/medium.pt",
40
+ // Config for features usage
41
+ "use_mel": true,
42
+ "use_min_max_norm_mel": true,
43
+ "use_frame_pitch": true,
44
+ "use_frame_energy": true,
45
+ "use_spkid": true,
46
+ "use_whisper": true,
47
+ "use_contentvec": true,
48
+ "use_wenet": false,
49
+ "n_mel": 100,
50
+ "sample_rate": 24000
51
+ },
52
+ "model": {
53
+ "condition_encoder": {
54
+ // Config for features usage
55
+ "use_whisper": true,
56
+ "use_contentvec": true,
57
+ "use_wenet": false,
58
+ "whisper_dim": 1024,
59
+ "contentvec_dim": 256,
60
+ "wenet_dim": 512,
61
+ "use_singer_encoder": false,
62
+ "pitch_min": 50,
63
+ "pitch_max": 1100
64
+ },
65
+ "diffusion": {
66
+ "scheduler": "ddpm",
67
+ "scheduler_settings": {
68
+ "num_train_timesteps": 1000,
69
+ "beta_start": 1.0e-4,
70
+ "beta_end": 0.02,
71
+ "beta_schedule": "linear"
72
+ },
73
+ // Diffusion steps encoder
74
+ "step_encoder": {
75
+ "dim_raw_embedding": 128,
76
+ "dim_hidden_layer": 512,
77
+ "activation": "SiLU",
78
+ "num_layer": 2,
79
+ "max_period": 10000
80
+ },
81
+ // Diffusion decoder
82
+ "model_type": "bidilconv",
83
+ // bidilconv, unet2d, TODO: unet1d
84
+ "bidilconv": {
85
+ "base_channel": 512,
86
+ "n_res_block": 40,
87
+ "conv_kernel_size": 3,
88
+ "dilation_cycle_length": 4,
89
+ // specially, 1 means no dilation
90
+ "conditioner_size": 384
91
+ }
92
+ }
93
+ },
94
+ "train": {
95
+ "batch_size": 32,
96
+ "gradient_accumulation_step": 1,
97
+ "max_epoch": -1, // -1 means no limit
98
+ "save_checkpoint_stride": [
99
+ 3,
100
+ 50
101
+ ],
102
+ "keep_last": [
103
+ 3,
104
+ 2
105
+ ],
106
+ "run_eval": [
107
+ true,
108
+ true
109
+ ],
110
+ "adamw": {
111
+ "lr": 2.0e-4
112
+ },
113
+ "reducelronplateau": {
114
+ "factor": 0.8,
115
+ "patience": 30,
116
+ "min_lr": 1.0e-4
117
+ },
118
+ "dataloader": {
119
+ "num_worker": 8,
120
+ "pin_memory": true
121
+ },
122
+ "sampler": {
123
+ "holistic_shuffle": false,
124
+ "drop_last": true
125
+ }
126
+ }
127
+ }
egs/svc/MultipleContentsSVC/run.sh ADDED
@@ -0,0 +1 @@
1
+ ../_template/run.sh
egs/svc/README.md ADDED
@@ -0,0 +1,34 @@
1
+ # Amphion Singing Voice Conversion (SVC) Recipe
2
+
3
+ ## Quick Start
4
+
5
+ We provide a **[beginner recipe](MultipleContentsSVC)** to demonstrate how to train a cutting-edge SVC model. Specifically, it is also an official implementation of the paper "[Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion](https://arxiv.org/abs/2310.11160)" (NeurIPS 2023 Workshop on Machine Learning for Audio). Some demos can be seen [here](https://www.zhangxueyao.com/data/MultipleContentsSVC/index.html).
6
+
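+ As a quick, illustrative sketch of what running that recipe looks like end to end (see the recipe's own README for details; `[YourExptName]` and `[Your Audios Folder]` are placeholders):
+
+ ```bash
+ cd Amphion
+ # 1. Data preparation and features extraction
+ sh egs/svc/MultipleContentsSVC/run.sh --stage 1
+ # 2. Training (logs and checkpoints go to ckpts/svc/[YourExptName])
+ sh egs/svc/MultipleContentsSVC/run.sh --stage 2 --name [YourExptName]
+ # 3. Inference/conversion into a trained target speaker
+ sh egs/svc/MultipleContentsSVC/run.sh --stage 3 --gpu "0" \
+     --infer_expt_dir ckpts/svc/[YourExptName] \
+     --infer_output_dir ckpts/svc/[YourExptName]/result \
+     --infer_source_audio_dir [Your Audios Folder] \
+     --infer_target_speaker "opencpop_female1" \
+     --infer_key_shift "autoshift"
+ ```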
7
+ ## Supported Model Architectures
8
+
9
+ The main idea of SVC is to first disentangle speaker-agnostic representations from the source audio, and then inject the desired speaker information to synthesize the target audio, usually with an acoustic decoder and a subsequent waveform synthesizer (vocoder):
10
+
11
+ <br>
12
+ <div align="center">
13
+ <img src="../../imgs/svc/pipeline.png" width="70%">
14
+ </div>
15
+ <br>
16
+
17
+ Until now, Amphion SVC has supported the following features and models:
18
+
19
+ - **Speaker-agnostic Representations**:
20
+ - Content Features: Sourcing from [WeNet](https://github.com/wenet-e2e/wenet), [Whisper](https://github.com/openai/whisper), and [ContentVec](https://github.com/auspicious3000/contentvec).
21
+ - Prosody Features: F0 and energy.
22
+ - **Speaker Embeddings**:
23
+ - Speaker Look-Up Table.
24
+ - Reference Encoder (👨‍💻 developing): It can be used for zero-shot SVC.
25
+ - **Acoustic Decoders**:
26
+ - Diffusion-based models:
27
+ - **[DiffWaveNetSVC](MultipleContentsSVC)**: The encoder is based on Bidirectional Non-Causal Dilated CNN, which is similar to [WaveNet](https://arxiv.org/pdf/1609.03499.pdf), [DiffWave](https://openreview.net/forum?id=a-xFK8Ymz5J), and [DiffSVC](https://ieeexplore.ieee.org/document/9688219).
28
+ - **[DiffComoSVC](DiffComoSVC)** (👨‍💻 developing): The diffusion framework is based on [Consistency Model](https://proceedings.mlr.press/v202/song23a.html). It can significantly accelerate the inference process of the diffusion model.
29
+ - Transformer-based models:
30
+ - **[TransformerSVC](TransformerSVC)**: Encoder-only and Non-autoregressive Transformer Architecture.
31
+ - VAE- and Flow-based models:
32
+ - **[VitsSVC](VitsSVC)**: It is designed as a [VITS](https://arxiv.org/abs/2106.06103)-like model whose textual input is replaced by the content features, which is similar to [so-vits-svc](https://github.com/svc-develop-team/so-vits-svc).
33
+ - **Waveform Synthesizers (Vocoders)**:
34
+ - The supported vocoders can be seen in [Amphion Vocoder Recipe](../vocoder/README.md).
egs/svc/TransformerSVC/README.md ADDED
@@ -0,0 +1,164 @@
1
+ # Transformer for Singing Voice Conversion
2
+
3
+ This is an implementation of a **vanilla transformer encoder**/**conformer** as the acoustic model for singing voice conversion.
4
+
5
+ There are four stages in total:
6
+
7
+ 1. Data preparation
8
+ 2. Features extraction
9
+ 3. Training
10
+ 4. Inference/conversion
11
+
12
+ > **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
13
+ > ```bash
14
+ > cd Amphion
15
+ > ```
16
+
17
+ ## 1. Data Preparation
18
+
19
+ ### Dataset Download
20
+
21
+ By default, we utilize the five datasets for training: M4Singer, Opencpop, OpenSinger, SVCC, and VCTK. How to download them is detailed [here](../../datasets/README.md).
22
+
23
+ ### Configuration
24
+
25
+ Specify the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets.
26
+
27
+ ```json
28
+ "dataset": [
29
+ "m4singer",
30
+ "opencpop",
31
+ "opensinger",
32
+ "svcc",
33
+ "vctk"
34
+ ],
35
+ "dataset_path": {
36
+ // TODO: Fill in your dataset path
37
+ "m4singer": "[M4Singer dataset path]",
38
+ "opencpop": "[Opencpop dataset path]",
39
+ "opensinger": "[OpenSinger dataset path]",
40
+ "svcc": "[SVCC dataset path]",
41
+ "vctk": "[VCTK dataset path]"
42
+ },
43
+ ```
44
+
45
+ ## 2. Features Extraction
46
+
47
+ ### Content-based Pretrained Models Download
48
+
49
+ By default, we utilize the Whisper and ContentVec to extract content features. How to download them is detailed [here](../../../pretrained/README.md).
50
+
51
+ ### Configuration
52
+
53
+ Specify the dataset path and the output path for saving the processed data and the training model in `exp_config.json`:
54
+
55
+ ```json
56
+ // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc"
57
+ "log_dir": "ckpts/svc",
58
+ "preprocess": {
59
+ // TODO: Fill in the output data path. The default value is "Amphion/data"
60
+ "processed_dir": "data",
61
+ ...
62
+ },
63
+ ```
64
+
65
+ ### Run
66
+
67
+ Run the `run.sh` as the preproces stage (set `--stage 1`).
68
+
69
+ ```bash
70
+ sh egs/svc/TransformerSVC/run.sh --stage 1
71
+ ```
72
+
73
+ > **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh`, e.g., by specifying `--gpu "1"`.
74
+
75
+ ## 3. Training
76
+
77
+ ### Configuration
78
+ Specify the detailed configuration for the transformer block in `exp_config.json`. For the key `type`, both `conformer` and `transformer` are supported:
79
+ ```json
80
+ "model": {
81
+ ...
82
+ "transformer":{
83
+ // 'conformer' or 'transformer'
84
+ "type": "conformer",
85
+ "input_dim": 384,
86
+ "output_dim": 100,
87
+ "n_heads": 2,
88
+ "n_layers": 6,
89
+ "filter_channels":512,
90
+ "dropout":0.1,
91
+ }
92
+ }
93
+ ```
94
+ We provide the default hyperparameters in `exp_config.json`. They can work on a single 24 GB NVIDIA GPU. You can adjust them based on your GPU machines.
95
+
96
+ ```json
97
+ "train": {
98
+ "batch_size": 32,
99
+ ...
100
+ "adamw": {
101
+ "lr": 2.0e-4
102
+ },
103
+ ...
104
+ }
105
+ ```
106
+
107
+ ### Run
108
+
109
+ Run the `run.sh` as the training stage (set `--stage 2`). Specify an experiment name to run the following command. The TensorBoard logs and checkpoints will be saved in `Amphion/ckpts/svc/[YourExptName]`.
110
+
111
+ ```bash
112
+ sh egs/svc/TransformerSVC/run.sh --stage 2 --name [YourExptName]
113
+ ```
114
+
115
+ > **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh`, e.g., by specifying `--gpu "0,1,2,3"`.
116
+
117
+ ## 4. Inference/Conversion
118
+
119
+ ### Pretrained Vocoder Download
120
+
121
+ We fine-tune the official BigVGAN pretrained model with over 120 hours of singing voice data. The benefits of fine-tuning have been investigated in our paper (see this [demo page](https://www.zhangxueyao.com/data/MultipleContentsSVC/vocoder.html)). The final pretrained singing voice vocoder is released [here](../../../pretrained/README.md#amphion-singing-bigvgan) (called `Amphion Singing BigVGAN`).
122
+
123
+ ### Run
124
+
125
+ For inference/conversion, you need to specify the following configurations when running `run.sh`:
126
+
127
+ | Parameters | Description | Example |
128
+ | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
129
+ | `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `Amphion/ckpts/svc/[YourExptName]` |
130
+ | `--infer_output_dir` | The output directory to save inferred audios. | `Amphion/ckpts/svc/[YourExptName]/result` |
131
+ | `--infer_source_file` or `--infer_source_audio_dir` | The inference source (can be a json file or a dir). | The `infer_source_file` could be `Amphion/data/[YourDataset]/test.json`, and the `infer_source_audio_dir` is a folder which includes several audio files (*.wav, *.mp3 or *.flac). |
132
+ | `--infer_target_speaker` | The target speaker you want to convert into. You can refer to `Amphion/ckpts/svc/[YourExptName]/singers.json` to choose a trained speaker. | For opencpop dataset, the speaker name would be `opencpop_female1`. |
133
+ | `--infer_key_shift` | How many semitones you want to transpose. | `"autoshift"` (by default), `3`, `-3`, etc. |
134
+
135
+ For example, if you want to make `opencpop_female1` sing the songs in the `[Your Audios Folder]`, just run:
136
+
137
+ ```bash
138
+ cd Amphion
139
+ sh egs/svc/TransformerSVC/run.sh --stage 3 --gpu "0" \
140
+ --infer_expt_dir Amphion/ckpts/svc/[YourExptName] \
141
+ --infer_output_dir Amphion/ckpts/svc/[YourExptName]/result \
142
+ --infer_source_audio_dir [Your Audios Folder] \
143
+ --infer_target_speaker "opencpop_female1" \
144
+ --infer_key_shift "autoshift"
145
+ ```
146
+
147
+ ## Citations
148
+
149
+ ```bibtex
150
+ @inproceedings{transformer,
151
+ author = {Ashish Vaswani and
152
+ Noam Shazeer and
153
+ Niki Parmar and
154
+ Jakob Uszkoreit and
155
+ Llion Jones and
156
+ Aidan N. Gomez and
157
+ Lukasz Kaiser and
158
+ Illia Polosukhin},
159
+ title = {Attention is All you Need},
160
+ booktitle = {{NIPS}},
161
+ pages = {5998--6008},
162
+ year = {2017}
163
+ }
164
+ ```
egs/svc/TransformerSVC/exp_config.json ADDED
@@ -0,0 +1,108 @@
1
+ {
2
+ "base_config": "config/transformer.json",
3
+ "model_type": "TransformerSVC",
4
+ "dataset": [
5
+ "m4singer",
6
+ "opencpop",
7
+ "opensinger",
8
+ "svcc",
9
+ "vctk"
10
+ ],
11
+ "dataset_path": {
12
+ // TODO: Fill in your dataset path
13
+ "m4singer": "[M4Singer dataset path]",
14
+ "opencpop": "[Opencpop dataset path]",
15
+ "opensinger": "[OpenSinger dataset path]",
16
+ "svcc": "[SVCC dataset path]",
17
+ "vctk": "[VCTK dataset path]"
18
+ },
19
+ // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc"
20
+ "log_dir": "ckpts/svc",
21
+ "preprocess": {
22
+ // TODO: Fill in the output data path. The default value is "Amphion/data"
23
+ "processed_dir": "data",
24
+ // Config for features extraction
25
+ "extract_mel": true,
26
+ "extract_pitch": true,
27
+ "extract_energy": true,
28
+ "extract_whisper_feature": true,
29
+ "extract_contentvec_feature": true,
30
+ "extract_wenet_feature": false,
31
+ "whisper_batch_size": 30, // decrease it if your GPU is out of memory
32
+ "contentvec_batch_size": 1,
33
+ // Fill in the content-based pretrained model's path
34
+ "contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt",
35
+ "wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt",
36
+ "wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml",
37
+ "whisper_model": "medium",
38
+ "whisper_model_path": "pretrained/whisper/medium.pt",
39
+ // Config for features usage
40
+ "use_mel": true,
41
+ "use_min_max_norm_mel": true,
42
+ "use_frame_pitch": true,
43
+ "use_frame_energy": true,
44
+ "use_spkid": true,
45
+ "use_whisper": true,
46
+ "use_contentvec": true,
47
+ "use_wenet": false,
48
+ "n_mel": 100,
49
+ "sample_rate": 24000
50
+ },
51
+ "model": {
52
+ "condition_encoder": {
53
+ // Config for features usage
54
+ "use_whisper": true,
55
+ "use_contentvec": true,
56
+ "use_wenet": false,
57
+ "whisper_dim": 1024,
58
+ "contentvec_dim": 256,
59
+ "wenet_dim": 512,
60
+ "use_singer_encoder": false,
61
+ "pitch_min": 50,
62
+ "pitch_max": 1100
63
+ },
64
+ "transformer": {
65
+ // 'conformer' or 'transformer'
66
+ "type": "conformer",
67
+ "input_dim": 384,
68
+ "output_dim": 100,
69
+ "n_heads": 2,
70
+ "n_layers": 6,
71
+ "filter_channels": 512,
72
+ "dropout": 0.1,
73
+ }
74
+ },
75
+ "train": {
76
+ "batch_size": 64,
77
+ "gradient_accumulation_step": 1,
78
+ "max_epoch": -1, // -1 means no limit
79
+ "save_checkpoint_stride": [
80
+ 50,
81
+ 50
82
+ ],
83
+ "keep_last": [
84
+ 5,
85
+ -1
86
+ ],
87
+ "run_eval": [
88
+ false,
89
+ true
90
+ ],
91
+ "adamw": {
92
+ "lr": 4.0e-4
93
+ },
94
+ "reducelronplateau": {
95
+ "factor": 0.8,
96
+ "patience": 10,
97
+ "min_lr": 1.0e-4
98
+ },
99
+ "dataloader": {
100
+ "num_worker": 8,
101
+ "pin_memory": true
102
+ },
103
+ "sampler": {
104
+ "holistic_shuffle": false,
105
+ "drop_last": true
106
+ }
107
+ }
108
+ }
egs/svc/TransformerSVC/run.sh ADDED
@@ -0,0 +1 @@
1
+ ../_template/run.sh
egs/svc/VitsSVC/README.md ADDED
@@ -0,0 +1,125 @@
1
+ # VITS for Singing Voice Conversion
2
+
3
+ This is an implementation of VITS as the acoustic model for end-to-end singing voice conversion. Adapted from [so-vits-svc](https://github.com/svc-develop-team/so-vits-svc), the SoftVC content encoder is used to extract content features from the source audio. These feature vectors are directly fed into VITS without the need for conversion to a text-based intermediate representation.
4
+
5
+ There are four stages in total:
6
+
7
+ 1. Data preparation
8
+ 2. Features extraction
9
+ 3. Training
10
+ 4. Inference/conversion
11
+
12
+ > **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
13
+ > ```bash
14
+ > cd Amphion
15
+ > ```
16
+
17
+ ## 1. Data Preparation
18
+
19
+ ### Dataset Download
20
+
21
+ By default, we utilize the five datasets for training: M4Singer, Opencpop, OpenSinger, SVCC, and VCTK. How to download them is detailed [here](../../datasets/README.md).
22
+
23
+ ### Configuration
24
+
25
+ Specify the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets.
26
+
27
+ ```json
28
+ "dataset": [
29
+ "m4singer",
30
+ "opencpop",
31
+ "opensinger",
32
+ "svcc",
33
+ "vctk"
34
+ ],
35
+ "dataset_path": {
36
+ // TODO: Fill in your dataset path
37
+ "m4singer": "[M4Singer dataset path]",
38
+ "opencpop": "[Opencpop dataset path]",
39
+ "opensinger": "[OpenSinger dataset path]",
40
+ "svcc": "[SVCC dataset path]",
41
+ "vctk": "[VCTK dataset path]"
42
+ },
43
+ ```
44
+
45
+ ## 2. Features Extraction
46
+
47
+ ### Content-based Pretrained Models Download
48
+
49
+ By default, we utilize ContentVec and Whisper to extract content features. How to download them is detailed [here](../../../pretrained/README.md).
50
+
51
+ ### Configuration
52
+
53
+ Specify the dataset path and the output path for saving the processed data and the training model in `exp_config.json`:
54
+
55
+ ```json
56
+ // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc"
57
+ "log_dir": "ckpts/svc",
58
+ "preprocess": {
59
+ // TODO: Fill in the output data path. The default value is "Amphion/data"
60
+ "processed_dir": "data",
61
+ ...
62
+ },
63
+ ```
64
+
65
+ ### Run
66
+
67
+ Run the `run.sh` as the preprocessing stage (set `--stage 1`).
68
+
69
+ ```bash
70
+ sh egs/svc/VitsSVC/run.sh --stage 1
71
+ ```
72
+
73
+ > **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh`, e.g., by specifying `--gpu "1"`.
74
+
75
+ ## 3. Training
76
+
77
+ ### Configuration
78
+
79
+ We provide the default hyperparameters in `exp_config.json`. They can work on a single 24 GB NVIDIA GPU. You can adjust them based on your GPU machines.
80
+
81
+ ```json
82
+ "train": {
83
+ "batch_size": 32,
84
+ ...
85
+ "adamw": {
86
+ "lr": 2.0e-4
87
+ },
88
+ ...
89
+ }
90
+ ```
91
+
92
+ ### Run
93
+
94
+ Run the `run.sh` as the training stage (set `--stage 2`). Specify an experiment name to run the following command. The TensorBoard logs and checkpoints will be saved in `Amphion/ckpts/svc/[YourExptName]`.
95
+
96
+ ```bash
97
+ sh egs/svc/VitsSVC/run.sh --stage 2 --name [YourExptName]
98
+ ```
99
+
100
+ > **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh`, e.g., by specifying `--gpu "0,1,2,3"`.
101
+
102
+ ## 4. Inference/Conversion
103
+
104
+ ### Run
105
+
106
+ For inference/conversion, you need to specify the following configurations when running `run.sh`:
107
+
108
+ | Parameters | Description | Example |
109
+ | --------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
110
+ | `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `[Your path to save logs and checkpoints]/[YourExptName]` |
111
+ | `--infer_output_dir` | The output directory to save inferred audios. | `[Your path to save logs and checkpoints]/[YourExptName]/result` |
112
+ | `--infer_source_file` or `--infer_source_audio_dir` | The inference source (can be a json file or a dir). | The `infer_source_file` could be `[Your path to save processed data]/[YourDataset]/test.json`, and the `infer_source_audio_dir` is a folder which includes several audio files (*.wav, *.mp3 or *.flac). |
113
+ | `--infer_target_speaker` | The target speaker you want to convert into. You can refer to `[Your path to save logs and checkpoints]/[YourExptName]/singers.json` to choose a trained speaker. | For opencpop dataset, the speaker name would be `opencpop_female1`. |
114
+ | `--infer_key_shift` | How many semitones you want to transpose. | `"autoshfit"` (by default), `3`, `-3`, etc. |
115
+
116
+ For example, if you want to make `opencpop_female1` sing the songs in the `[Your Audios Folder]`, just run:
117
+
118
+ ```bash
119
+ sh egs/svc/VitsSVC/run.sh --stage 3 --gpu "0" \
120
+ --infer_expt_dir ckpts/svc/[YourExptName] \
121
+ --infer_output_dir ckpts/svc/[YourExptName]/result \
122
+ --infer_source_audio_dir [Your Audios Folder] \
123
+ --infer_target_speaker "opencpop_female1" \
124
+ --infer_key_shift "autoshift"
125
+ ```
egs/svc/VitsSVC/exp_config.json ADDED
@@ -0,0 +1,106 @@
1
+ {
2
+ "base_config": "config/vitssvc.json",
3
+ "model_type": "VitsSVC",
4
+ "dataset": [
5
+ "m4singer",
6
+ "opencpop",
7
+ "opensinger",
8
+ "svcc",
9
+ "vctk"
10
+ ],
11
+ "dataset_path": {
12
+ // TODO: Fill in your dataset path
13
+ "m4singer": "[M4Singer dataset path]",
14
+ "opencpop": "[Opencpop dataset path]",
15
+ "opensinger": "[OpenSinger dataset path]",
16
+ "svcc": "[SVCC dataset path]",
17
+ "vctk": "[VCTK dataset path]"
18
+ },
19
+ "use_custom_dataset": [],
20
+ // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc"
21
+ "log_dir": "ckpts/svc",
22
+ "preprocess": {
23
+ // TODO: Fill in the output data path. The default value is "Amphion/data"
24
+ "processed_dir": "data",
25
+
26
+ "n_mel": 100,
27
+ "sample_rate": 24000,
28
+
29
+ // contentvec
30
+ "extract_contentvec_feature": true,
31
+ "contentvec_sample_rate": 16000,
32
+ "contentvec_batch_size": 1,
33
+ "contentvec_frameshift": 0.02,
34
+ // whisper
35
+ "extract_whisper_feature": true,
36
+ "whisper_sample_rate": 16000,
37
+ "whisper_frameshift": 0.01,
38
+ "whisper_downsample_rate": 2,
39
+ // wenet
40
+ "extract_wenet_feature": true,
41
+ "wenet_downsample_rate": 4,
42
+ "wenet_frameshift": 0.01,
43
+ "wenet_sample_rate": 16000,
44
+ // Fill in the content-based pretrained model's path
45
+ "contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt",
46
+ "wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt",
47
+ "wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml",
48
+ "whisper_model": "medium",
49
+ "whisper_model_path": "pretrained/whisper/medium.pt",
50
+
51
+ "use_contentvec": true,
52
+ "use_whisper": true,
53
+ "use_wenet": false,
54
+
55
+ // Extract content features using dataloader
56
+ "pin_memory": true,
57
+ "num_workers": 8,
58
+ "content_feature_batch_size": 16,
59
+
60
+ },
61
+ "model": {
62
+ "condition_encoder": {
63
+ // Config for features usage
64
+ "merge_mode": "add",
65
+ "use_log_loudness": true,
66
+ "use_contentvec": true,
67
+ "use_whisper": true,
68
+ "use_wenet": false,
69
+ "whisper_dim": 1024,
70
+ "contentvec_dim": 256,
71
+ "wenet_dim": 512,
72
+ },
73
+ "vits": {
74
+ "inter_channels": 384,
75
+ "hidden_channels": 384,
76
+ "filter_channels": 256,
77
+ "n_heads": 2,
78
+ "n_layers": 6,
79
+ "kernel_size": 3,
80
+ "p_dropout": 0.1,
81
+ "n_flow_layer": 4,
82
+ "n_layers_q": 3,
83
+ "gin_channels": 256,
84
+ "n_speakers": 512,
85
+ "use_spectral_norm": false,
86
+ },
87
+ "generator": "nsfhifigan",
88
+ },
89
+ "train": {
90
+ "batch_size": 32,
91
+ "learning_rate": 2e-4,
92
+ "gradient_accumulation_step": 1,
93
+ "max_epoch": -1, // -1 means no limit
94
+ "save_checkpoint_stride": [
95
+ 3,
96
+ 50
97
+ ],
98
+ "keep_last": [
99
+ 3,
100
+ 2
101
+ ],
102
+ },
103
+ "inference": {
104
+ "batch_size": 1,
105
+ }
106
+ }
egs/svc/VitsSVC/run.sh ADDED
@@ -0,0 +1 @@
1
+ ../_template/run.sh
egs/svc/_template/run.sh ADDED
@@ -0,0 +1,160 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ ######## Build Experiment Environment ###########
7
+ exp_dir=$(cd `dirname $0`; pwd)
8
+ work_dir=$(dirname $(dirname $(dirname $exp_dir)))
9
+
10
+ export WORK_DIR=$work_dir
11
+ export PYTHONPATH=$work_dir
12
+ export PYTHONIOENCODING=UTF-8
13
+
14
+ ######## Parse the Given Parameters from the Command ###########
15
+ options=$(getopt -o c:n:s: --long gpu:,config:,name:,stage:,resume:,resume_from_ckpt_path:,resume_type:,infer_expt_dir:,infer_output_dir:,infer_source_file:,infer_source_audio_dir:,infer_target_speaker:,infer_key_shift:,infer_vocoder_dir: -- "$@")
16
+ eval set -- "$options"
17
+
18
+ while true; do
19
+ case $1 in
20
+ # Experimental Configuration File
21
+ -c | --config) shift; exp_config=$1 ; shift ;;
22
+ # Experimental Name
23
+ -n | --name) shift; exp_name=$1 ; shift ;;
24
+ # Running Stage
25
+ -s | --stage) shift; running_stage=$1 ; shift ;;
26
+ # Visible GPU machines. The default value is "0".
27
+ --gpu) shift; gpu=$1 ; shift ;;
28
+
29
+ # [Only for Training] Resume configuration
30
+ --resume) shift; resume=$1 ; shift ;;
31
+ # [Only for Training] The specific checkpoint path that you want to resume from.
32
+ --resume_from_ckpt_path) shift; resume_from_ckpt_path=$1 ; shift ;;
33
+ # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
34
+ --resume_type) shift; resume_type=$1 ; shift ;;
35
+
36
+ # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
37
+ --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
38
+ # [Only for Inference] The output dir to save inferred audios. Its default value is "$infer_expt_dir/result"
39
+ --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
40
+ # [Only for Inference] The inference source (can be a json file or a dir). For example, the source_file can be "[Your path to save processed data]/[YourDataset]/test.json", and the source_audio_dir can be "$work_dir/source_audio" which includes several audio files (*.wav, *.mp3 or *.flac).
41
+ --infer_source_file) shift; infer_source_file=$1 ; shift ;;
42
+ --infer_source_audio_dir) shift; infer_source_audio_dir=$1 ; shift ;;
43
+ # [Only for Inference] Specify the target speaker you want to convert into. You can refer to "[Your path to save logs and checkpoints]/[Your Expt Name]/singers.json". In this singer look-up table, you can see the usable speaker names (all the keys of the dictionary). For example, for opencpop dataset, the speaker name would be "opencpop_female1".
44
+ --infer_target_speaker) shift; infer_target_speaker=$1 ; shift ;;
45
+ # [Only for Inference] For advanced users, you can set this to an integer, i.e., the number of semitones you want to transpose (passed to inference as trans_key). Its default value is "autoshift".
46
+ --infer_key_shift) shift; infer_key_shift=$1 ; shift ;;
47
+ # [Only for Inference] The vocoder dir. Its default value is Amphion/pretrained/bigvgan. See Amphion/pretrained/README.md to download the pretrained BigVGAN vocoders.
48
+ --infer_vocoder_dir) shift; infer_vocoder_dir=$1 ; shift ;;
49
+
50
+ --) shift ; break ;;
51
+ *) echo "Invalid option: $1"; exit 1 ;;
52
+ esac
53
+ done
54
+
55
+
56
+ ### Value check ###
57
+ if [ -z "$running_stage" ]; then
58
+ echo "[Error] Please specify the running stage"
59
+ exit 1
60
+ fi
61
+
62
+ if [ -z "$exp_config" ]; then
63
+ exp_config="${exp_dir}"/exp_config.json
64
+ fi
65
+ echo "Exprimental Configuration File: $exp_config"
66
+
67
+ if [ -z "$gpu" ]; then
68
+ gpu="0"
69
+ fi
70
+
71
+ ######## Features Extraction ###########
72
+ if [ $running_stage -eq 1 ]; then
73
+ CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/svc/preprocess.py \
74
+ --config $exp_config \
75
+ --num_workers 4
76
+ fi
77
+
78
+ ######## Training ###########
79
+ if [ $running_stage -eq 2 ]; then
80
+ if [ -z "$exp_name" ]; then
81
+ echo "[Error] Please specify the experiments name"
82
+ exit 1
83
+ fi
84
+ echo "Exprimental Name: $exp_name"
85
+
86
+ # add default value
87
+ if [ -z "$resume_from_ckpt_path" ]; then
88
+ resume_from_ckpt_path=""
89
+ fi
90
+
91
+ if [ -z "$resume_type" ]; then
92
+ resume_type="resume"
93
+ fi
94
+
95
+ if [ "$resume" = true ]; then
96
+ echo "Resume from the existing experiment..."
97
+ CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/svc/train.py \
98
+ --config "$exp_config" \
99
+ --exp_name "$exp_name" \
100
+ --log_level info \
101
+ --resume \
102
+ --resume_from_ckpt_path "$resume_from_ckpt_path" \
103
+ --resume_type "$resume_type"
104
+ else
105
+ echo "Start a new experiment..."
106
+ CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/svc/train.py \
107
+ --config "$exp_config" \
108
+ --exp_name "$exp_name" \
109
+ --log_level info
110
+ fi
111
+ fi
112
+
113
+ ######## Inference/Conversion ###########
114
+ if [ $running_stage -eq 3 ]; then
115
+ if [ -z "$infer_expt_dir" ]; then
116
+ echo "[Error] Please specify the experimental directionary. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
117
+ exit 1
118
+ fi
119
+
120
+ if [ -z "$infer_output_dir" ]; then
121
+ infer_output_dir="$infer_expt_dir/result"
122
+ fi
123
+
124
+ if [ -z "$infer_source_file" ] && [ -z "$infer_source_audio_dir" ]; then
125
+ echo "[Error] Please specify the source file/dir. The inference source (can be a json file or a dir). For example, the source_file can be "[Your path to save processed data]/[YourDataset]/test.json", and the source_audio_dir should include several audio files (*.wav, *.mp3 or *.flac)."
126
+ exit 1
127
+ fi
128
+
129
+ if [ -z "$infer_source_file" ]; then
130
+ infer_source=$infer_source_audio_dir
131
+ fi
132
+
133
+ if [ -z "$infer_source_audio_dir" ]; then
134
+ infer_source=$infer_source_file
135
+ fi
136
+
137
+ if [ -z "$infer_target_speaker" ]; then
138
+ echo "[Error] Please specify the target speaker. You can refer to "[Your path to save logs and checkpoints]/[Your Expt Name]/singers.json". In this singer look-up table, you can see the usable speaker names (all the keys of the dictionary). For example, for opencpop dataset, the speaker name would be "opencpop_female1""
139
+ exit 1
140
+ fi
141
+
142
+ if [ -z "$infer_key_shift" ]; then
143
+ infer_key_shift="autoshift"
144
+ fi
145
+
146
+ if [ -z "$infer_vocoder_dir" ]; then
147
+ infer_vocoder_dir="$work_dir"/pretrained/bigvgan
148
+ echo "[Warning] You don't specify the infer_vocoder_dir. It is set $infer_vocoder_dir by default. Make sure that you have followed Amphoion/pretrained/README.md to download the pretrained BigVGAN vocoder checkpoint."
149
+ fi
150
+
151
+ CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/svc/inference.py \
152
+ --config $exp_config \
153
+ --acoustics_dir $infer_expt_dir \
154
+ --vocoder_dir $infer_vocoder_dir \
155
+ --target_singer $infer_target_speaker \
156
+ --trans_key $infer_key_shift \
157
+ --source $infer_source \
158
+ --output_dir $infer_output_dir \
159
+ --log_level debug
160
+ fi