Skip to content

Commit e930514

Browse files
authored
Merge pull request #1053 from yqzhishen/mel-checks
[DiffSinger] Add more mel checks between vocoder and acoustic model
2 parents 78cab65 + 93950e5 commit e930514

File tree

3 files changed

+97
-8
lines changed

3 files changed

+97
-8
lines changed

OpenUtau.Core/DiffSinger/DiffSingerConfig.cs

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,15 +33,22 @@ public class DsConfig {
3333
public string linguistic;
3434
public string pitch;
3535
public string variance;
36-
public int hop_size = 512;
37-
public int sample_rate = 44100;
3836
public bool predict_dur = true;
3937
public bool predict_energy = true;
4038
public bool predict_breathiness = true;
4139
public bool predict_voicing = false;
4240
public bool predict_tension = false;
4341
public bool use_expr = false;
4442
public bool use_note_rest = false;
43+
public int sample_rate = 44100;
44+
public int hop_size = 512;
45+
public int win_size = 2048;
46+
public int fft_size = 2048;
47+
public int num_mel_bins = 128;
48+
public double mel_fmin = 40;
49+
public double mel_fmax = 16000;
50+
public string mel_base = "10"; // or "e"
51+
public string mel_scale = "slaney"; // or "htk"
4552
public float frameMs(){
4653
return 1000f * hop_size / sample_rate;
4754
}

OpenUtau.Core/DiffSinger/DiffSingerRenderer.cs

Lines changed: 74 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -134,12 +134,62 @@ float[] InvokeDiffsinger(RenderPhrase phrase, int depth, int speedup, Cancellati
134134
}
135135

136136
var vocoder = singer.getVocoder();
137-
//Vocoder and singer should have the same hop sizes and sample rates.
137+
//mel specification validity checks
138+
//mel base must be 10 or e
139+
if (vocoder.mel_base != "10" && vocoder.mel_base != "e") {
140+
throw new Exception(
141+
$"Mel base must be \"10\" or \"e\", but got \"{vocoder.mel_base}\" from vocoder");
142+
}
143+
if (singer.dsConfig.mel_base != "10" && singer.dsConfig.mel_base != "e") {
144+
throw new Exception(
145+
$"Mel base must be \"10\" or \"e\", but got \"{singer.dsConfig.mel_base}\" from acoustic model");
146+
}
147+
//mel scale must be slaney or htk
148+
if (vocoder.mel_scale != "slaney" && vocoder.mel_scale != "htk") {
149+
throw new Exception(
150+
$"Mel scale must be \"slaney\" or \"htk\", but got \"{vocoder.mel_scale}\" from vocoder");
151+
}
152+
if (singer.dsConfig.mel_scale != "slaney" && singer.dsConfig.mel_scale != "htk") {
153+
throw new Exception(
154+
$"Mel scale must be \"slaney\" or \"htk\", but got \"{singer.dsConfig.mel_scale}\" from acoustic model");
155+
}
156+
//mel specification matching checks
157+
if(vocoder.sample_rate != singer.dsConfig.sample_rate) {
158+
throw new Exception(
159+
$"Vocoder and acoustic model has mismatching sample rate ({vocoder.sample_rate} != {singer.dsConfig.sample_rate})");
160+
}
138161
if(vocoder.hop_size != singer.dsConfig.hop_size){
139-
throw new Exception($"Vocoder's hop size is {vocoder.hop_size}, but acoustic's hop size is {singer.dsConfig.hop_size}.");
162+
throw new Exception(
163+
$"Vocoder and acoustic model has mismatching hop size ({vocoder.hop_size} != {singer.dsConfig.hop_size})");
164+
}
165+
if(vocoder.win_size != singer.dsConfig.win_size){
166+
throw new Exception(
167+
$"Vocoder and acoustic model has mismatching win size ({vocoder.win_size} != {singer.dsConfig.win_size})");
168+
}
169+
if(vocoder.fft_size != singer.dsConfig.fft_size){
170+
throw new Exception(
171+
$"Vocoder and acoustic model has mismatching FFT size ({vocoder.fft_size} != {singer.dsConfig.fft_size})");
172+
}
173+
if (vocoder.num_mel_bins != singer.dsConfig.num_mel_bins) {
174+
throw new Exception(
175+
$"Vocoder and acoustic model has mismatching mel bins ({vocoder.num_mel_bins} != {singer.dsConfig.num_mel_bins})");
176+
}
177+
if (Math.Abs(vocoder.mel_fmin - singer.dsConfig.mel_fmin) > 1e-5) {
178+
throw new Exception(
179+
$"Vocoder and acoustic model has mismatching fmin ({vocoder.mel_fmin} != {singer.dsConfig.mel_fmin})");
140180
}
141-
if(vocoder.sample_rate != singer.dsConfig.sample_rate){
142-
throw new Exception($"Vocoder's sample rate is {vocoder.sample_rate}, but acoustic's sample rate is {singer.dsConfig.sample_rate}.");
181+
if (Math.Abs(vocoder.mel_fmax - singer.dsConfig.mel_fmax) > 1e-5) {
182+
throw new Exception(
183+
$"Vocoder and acoustic model has mismatching fmax ({vocoder.mel_fmax} != {singer.dsConfig.mel_fmax})");
184+
}
185+
// a mismatching mel base is not an error: the mel is rescaled by a constant factor before it is passed to the vocoder
186+
// if (vocoder.mel_base != singer.dsConfig.mel_base) {
187+
// throw new Exception(
188+
// $"Vocoder and acoustic model has mismatching mel base ({vocoder.mel_base} != {singer.dsConfig.mel_base})");
189+
// }
190+
if (vocoder.mel_scale != singer.dsConfig.mel_scale) {
191+
throw new Exception(
192+
$"Vocoder and acoustic model has mismatching mel scale ({vocoder.mel_scale} != {singer.dsConfig.mel_scale})");
143193
}
144194

145195
var acousticModel = singer.getAcousticSession();
@@ -301,6 +351,26 @@ float[] InvokeDiffsinger(RenderPhrase phrase, int depth, int speedup, Cancellati
301351
var acousticOutputs = acousticModel.Run(acousticInputs);
302352
mel = acousticOutputs.First().AsTensor<float>().Clone();
303353
}
354+
//mel transforms for different mel base
355+
if (vocoder.mel_base != singer.dsConfig.mel_base) {
356+
float k;
357+
if (vocoder.mel_base == "e" && singer.dsConfig.mel_base == "10") {
358+
k = 2.30259f;
359+
}
360+
else if (vocoder.mel_base == "10" && singer.dsConfig.mel_base == "e") {
361+
k = 0.434294f;
362+
} else {
363+
// unreachable: both mel bases were already validated to be "10" or "e", and they differ here
364+
throw new Exception("This should never happen");
365+
}
366+
for (int b = 0; b < mel.Dimensions[0]; ++b) {
367+
for (int t = 0; t < mel.Dimensions[1]; ++t) {
368+
for (int c = 0; c < mel.Dimensions[2]; ++c) {
369+
mel[b, t, c] *= k;
370+
}
371+
}
372+
}
373+
}
304374
//vocoder
305375
//waveform = session.run(['waveform'], {'mel': mel, 'f0': f0})[0]
306376
var vocoderInputs = new List<NamedOnnxValue>();

OpenUtau.Core/DiffSinger/DiffSingerVocoder.cs

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,13 @@ public class DsVocoder : IDisposable {
1010

1111
public int num_mel_bins => config.num_mel_bins;
1212
public int hop_size => config.hop_size;
13+
public int win_size => config.win_size;
14+
public int fft_size => config.fft_size;
1315
public int sample_rate => config.sample_rate;
16+
public double mel_fmin => config.mel_fmin;
17+
public double mel_fmax => config.mel_fmax;
18+
public string mel_base => config.mel_base;
19+
public string mel_scale => config.mel_scale;
1420

1521
//Get vocoder by package name
1622
public DsVocoder(string name) {
@@ -54,8 +60,14 @@ public void Dispose() {
5460
public class DsVocoderConfig {
5561
public string name = "vocoder";
5662
public string model = "model.onnx";
57-
public int num_mel_bins = 128;
58-
public int hop_size = 512;
5963
public int sample_rate = 44100;
64+
public int hop_size = 512;
65+
public int win_size = 2048;
66+
public int fft_size = 2048;
67+
public int num_mel_bins = 128;
68+
public double mel_fmin = 40;
69+
public double mel_fmax = 16000;
70+
public string mel_base = "10"; // or "e"
71+
public string mel_scale = "slaney"; // or "htk"
6072
}
6173
}

0 commit comments

Comments
 (0)