Skip to content

Commit 38606c1

Browse files
committed
[TTS] add Chinese TTS recipe based on IPA.
* add new pinyin and ipa dictionaries with 36 finals. * add yaml configs for 24-final pinyin and ipa. * add copyright header * add a directory level 24finals to discriminate from 36 finals. Signed-off-by: Xuesong Yang <[email protected]>
1 parent 9e616e9 commit 38606c1

File tree

11 files changed

+2080
-4
lines changed

11 files changed

+2080
-4
lines changed
Lines changed: 253 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,253 @@
1+
# This config contains the default values for training a FastPitch model with an aligner using a 22.05 kHz sampling
2+
# rate. To train the model on a different dataset, change the config values to match that dataset.
3+
# Most dataset-specific arguments are in the head of the config file, see below.
4+
5+
# Experiment name; exp_manager reads it back through ${name}.
name: FastPitch
6+
7+
# `???` marks a value that has no default and must be supplied by the user
# (OmegaConf's mandatory-value marker).
train_dataset: ???
8+
validation_datasets: ???
9+
sup_data_path: ???
10+
sup_data_types: [ "align_prior_matrix", "pitch" ]
11+
12+
# Default values from librosa.pyin
13+
pitch_fmin: 65.40639132514966
14+
pitch_fmax: 1986.977294921875
15+
16+
# These frame-wise values depend on pitch_fmin and pitch_fmax; you can obtain them
17+
# by running `scripts/dataset_processing/tts/extract_sup_data.py`
18+
pitch_mean: ??? # e.g. 221.4948272705078 for SFbilingual dataset.
19+
pitch_std: ??? # e.g. 64.6528930664063 for SFbilingual dataset.
20+
21+
# Default values for a dataset with sample_rate=22050
22+
sample_rate: 22050
23+
n_mel_channels: 80
24+
n_window_size: 1024
25+
n_window_stride: 256
26+
n_fft: 1024
27+
lowfreq: 0
28+
highfreq: null
29+
window: hann
30+
31+
# Phoneme dictionary file; referenced as ${phoneme_dict_path} by the g2p `phoneme_dict` key.
phoneme_dict_path: "scripts/tts_dataset_files/zh/24finals/ipa_dict_nv23.05.txt"
32+
33+
# Model section: scalar hyperparameters first, then values mirrored from the top of the file.
model:
34+
learn_alignment: true
35+
bin_loss_warmup_epochs: 100
36+
37+
n_speakers: 1
38+
max_token_duration: 75
# Shared width: referenced as ${model.symbols_embedding_dim} by input_fft, output_fft,
# alignment_module, duration_predictor and pitch_predictor.
39+
symbols_embedding_dim: 384
40+
pitch_embedding_kernel_size: 3
41+
42+
# The keys below copy the top-level values through ${...} interpolation; the
# dataset and preprocessor sections then read them back as ${model.*}.
pitch_fmin: ${pitch_fmin}
43+
pitch_fmax: ${pitch_fmax}
44+
45+
pitch_mean: ${pitch_mean}
46+
pitch_std: ${pitch_std}
47+
48+
sample_rate: ${sample_rate}
49+
n_mel_channels: ${n_mel_channels}
50+
n_window_size: ${n_window_size}
51+
n_window_stride: ${n_window_stride}
52+
n_fft: ${n_fft}
53+
lowfreq: ${lowfreq}
54+
highfreq: ${highfreq}
55+
window: ${window}
56+
57+
# Text normalizer settings: the Normalizer class named by `_target_`, configured for Chinese (lang: zh).
text_normalizer:
58+
_target_: nemo_text_processing.text_normalization.normalize.Normalizer
59+
lang: zh
60+
input_case: cased
61+
62+
# Keyword arguments for the normalizer call — presumably forwarded when text is
# normalized; verify against the model code.
text_normalizer_call_kwargs:
63+
verbose: false
64+
punct_pre_process: true
65+
punct_post_process: true
66+
67+
# Chinese phoneme tokenizer; its g2p sub-config reads the dictionary at ${phoneme_dict_path}.
text_tokenizer:
68+
_target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.ChinesePhonemesTokenizer
69+
punct: true
70+
apostrophe: true
71+
pad_with_space: true
72+
g2p:
73+
_target_: nemo.collections.tts.g2p.models.zh_cn_pinyin.ChineseG2p
74+
phoneme_dict: ${phoneme_dict_path}
75+
word_segmenter: jieba # Only jieba is supported now.
76+
phoneme_prefix: ""
77+
phoneme_case: lower
78+
tone_prefix: "#"
79+
ascii_letter_prefix: ""
80+
ascii_letter_case: upper
81+
82+
# Training data: TTSDataset built from ${train_dataset}, with audio and pitch settings taken from ${model.*}.
train_ds:
83+
dataset:
84+
_target_: nemo.collections.tts.data.dataset.TTSDataset
85+
manifest_filepath: ${train_dataset}
86+
sample_rate: ${model.sample_rate}
87+
sup_data_path: ${sup_data_path}
88+
sup_data_types: ${sup_data_types}
89+
n_fft: ${model.n_fft}
90+
win_length: ${model.n_window_size}
91+
hop_length: ${model.n_window_stride}
92+
window: ${model.window}
93+
n_mels: ${model.n_mel_channels}
94+
lowfreq: ${model.lowfreq}
95+
highfreq: ${model.highfreq}
96+
max_duration: null # null includes longer audios; set a value to cap duration (units not shown here — confirm).
97+
min_duration: 0.1
98+
ignore_file: null
99+
trim: true
100+
trim_top_db: 50
101+
trim_frame_length: ${model.n_window_size}
102+
trim_hop_length: ${model.n_window_stride}
103+
pitch_fmin: ${model.pitch_fmin}
104+
pitch_fmax: ${model.pitch_fmax}
105+
pitch_norm: true
106+
pitch_mean: ${model.pitch_mean}
107+
pitch_std: ${model.pitch_std}
108+
109+
dataloader_params:
110+
drop_last: false
111+
shuffle: true
112+
batch_size: 32
113+
num_workers: 12
114+
pin_memory: true
115+
116+
# Validation data: same TTSDataset settings as train_ds, but built from ${validation_datasets},
# with shuffle off and fewer workers.
validation_ds:
117+
dataset:
118+
_target_: nemo.collections.tts.data.dataset.TTSDataset
119+
manifest_filepath: ${validation_datasets}
120+
sample_rate: ${model.sample_rate}
121+
sup_data_path: ${sup_data_path}
122+
sup_data_types: ${sup_data_types}
123+
n_fft: ${model.n_fft}
124+
win_length: ${model.n_window_size}
125+
hop_length: ${model.n_window_stride}
126+
window: ${model.window}
127+
n_mels: ${model.n_mel_channels}
128+
lowfreq: ${model.lowfreq}
129+
highfreq: ${model.highfreq}
130+
max_duration: null # null includes longer audios; set a value to cap duration (units not shown here — confirm).
131+
min_duration: 0.1
132+
ignore_file: null
133+
trim: true
134+
trim_top_db: 50
135+
trim_frame_length: ${model.n_window_size}
136+
trim_hop_length: ${model.n_window_stride}
137+
pitch_fmin: ${model.pitch_fmin}
138+
pitch_fmax: ${model.pitch_fmax}
139+
pitch_norm: true
140+
pitch_mean: ${model.pitch_mean}
141+
pitch_std: ${model.pitch_std}
142+
143+
dataloader_params:
144+
drop_last: false
145+
shuffle: false
146+
batch_size: 32
147+
num_workers: 2
148+
pin_memory: true
149+
150+
# Mel-spectrogram preprocessor; its audio parameters all come from ${model.*}.
# NOTE(review): window_size / window_stride are set to false while n_window_size /
# n_window_stride carry the values — presumably the two forms are mutually
# exclusive; confirm in AudioToMelSpectrogramPreprocessor.
preprocessor:
151+
_target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
152+
features: ${model.n_mel_channels}
153+
lowfreq: ${model.lowfreq}
154+
highfreq: ${model.highfreq}
155+
n_fft: ${model.n_fft}
156+
n_window_size: ${model.n_window_size}
157+
window_size: false
158+
n_window_stride: ${model.n_window_stride}
159+
window_stride: false
160+
pad_to: 1
161+
pad_value: 0
162+
sample_rate: ${model.sample_rate}
163+
window: ${model.window}
164+
normalize: null
165+
preemph: null
166+
dither: 0.0
167+
frame_splicing: 1
168+
log: true
169+
log_zero_guard_type: add
170+
log_zero_guard_value: 1e-05
171+
mag_power: 1.0
172+
173+
input_fft: # n_embed and padding_idx are added by the model
174+
_target_: nemo.collections.tts.modules.transformer.FFTransformerEncoder
175+
n_layer: 6
176+
n_head: 1
177+
d_model: ${model.symbols_embedding_dim}
178+
d_head: 64
179+
d_inner: 1536
180+
kernel_size: 3
181+
dropout: 0.1
182+
dropatt: 0.1
183+
dropemb: 0.0
184+
# d_embed is tied to the same value as d_model above.
d_embed: ${model.symbols_embedding_dim}
185+
186+
# Decoder stack; uses the same layer, head and width values as input_fft but has no d_embed key.
output_fft:
187+
_target_: nemo.collections.tts.modules.transformer.FFTransformerDecoder
188+
n_layer: 6
189+
n_head: 1
190+
d_model: ${model.symbols_embedding_dim}
191+
d_head: 64
192+
d_inner: 1536
193+
kernel_size: 3
194+
dropout: 0.1
195+
dropatt: 0.1
196+
dropemb: 0.0
197+
198+
# Alignment encoder; its text-channel count follows the symbol embedding width.
# Presumably used because model.learn_alignment is true — verify against the model code.
alignment_module:
199+
_target_: nemo.collections.tts.modules.aligner.AlignmentEncoder
200+
n_text_channels: ${model.symbols_embedding_dim}
201+
202+
# Duration predictor: a TemporalPredictor whose input size follows the symbol embedding width.
# Its hyperparameters are identical to pitch_predictor's.
duration_predictor:
203+
_target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor
204+
input_size: ${model.symbols_embedding_dim}
205+
kernel_size: 3
206+
filter_size: 256
207+
dropout: 0.1
208+
n_layers: 2
209+
210+
# Pitch predictor: a TemporalPredictor whose input size follows the symbol embedding width.
# Its hyperparameters are identical to duration_predictor's.
pitch_predictor:
211+
_target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor
212+
input_size: ${model.symbols_embedding_dim}
213+
kernel_size: 3
214+
filter_size: 256
215+
dropout: 0.1
216+
n_layers: 2
217+
218+
# Optimizer (AdamW) and its learning-rate schedule (NoamAnnealing with warmup).
# NOTE(review): `1e-3` and `1e-6` have no decimal point. OmegaConf's YAML loader
# resolves them as floats, but a plain YAML 1.1 loader (e.g. PyYAML) reads them
# as strings — confirm the loader before reusing this file elsewhere.
optim:
219+
name: adamw
220+
lr: 1e-3
221+
betas: [0.9, 0.999]
222+
weight_decay: 1e-6
223+
224+
sched:
225+
name: NoamAnnealing
226+
warmup_steps: 1000
227+
last_epoch: -1
228+
d_model: 1 # Disable scaling based on model dim
229+
230+
# Trainer settings; checkpointing and logging are switched off here because exp_manager provides them.
trainer:
231+
num_nodes: 1
232+
devices: -1 # number of GPUs; -1 presumably selects all available devices (PyTorch Lightning convention) — confirm
233+
accelerator: gpu
234+
strategy: ddp
235+
precision: 16
236+
max_epochs: 5000
237+
accumulate_grad_batches: 1
238+
gradient_clip_val: 1000.0
239+
enable_checkpointing: false # Provided by exp_manager
240+
logger: false # Provided by exp_manager
241+
log_every_n_steps: 100
242+
check_val_every_n_epoch: 5
243+
benchmark: false
244+
245+
# Experiment manager: supplies the TensorBoard logger and checkpoint callback that the trainer disables.
exp_manager:
246+
exp_dir: null
247+
# Resolves to the top-level `name` value.
name: ${name}
248+
create_tensorboard_logger: true
249+
create_checkpoint_callback: true
250+
checkpoint_callback_params:
251+
monitor: val_loss
252+
resume_if_exists: false
253+
resume_ignore_no_checkpoint: false

0 commit comments

Comments
 (0)