@@ -134,12 +134,62 @@ float[] InvokeDiffsinger(RenderPhrase phrase, int depth, int speedup, Cancellati
134134 }
135135
136136 var vocoder = singer . getVocoder ( ) ;
137- //Vocoder and singer should have the same hop sizes and sample rates.
// Mel specification validity checks.
// Both the vocoder and the acoustic model must declare a mel base of "10" or "e";
// any other value is rejected with an error naming the offending side.
if (vocoder.mel_base != "10" && vocoder.mel_base != "e") {
    throw new Exception(
        $"Mel base must be \"10\" or \"e\", but got \"{vocoder.mel_base}\" from vocoder");
}
if (singer.dsConfig.mel_base != "10" && singer.dsConfig.mel_base != "e") {
    throw new Exception(
        $"Mel base must be \"10\" or \"e\", but got \"{singer.dsConfig.mel_base}\" from acoustic model");
}
// Likewise, the mel scale on both sides must be "slaney" or "htk".
if (vocoder.mel_scale != "slaney" && vocoder.mel_scale != "htk") {
    throw new Exception(
        $"Mel scale must be \"slaney\" or \"htk\", but got \"{vocoder.mel_scale}\" from vocoder");
}
if (singer.dsConfig.mel_scale != "slaney" && singer.dsConfig.mel_scale != "htk") {
    // Report the acoustic model's own value: this branch is taken because
    // singer.dsConfig.mel_scale is invalid, not the vocoder's.
    throw new Exception(
        $"Mel scale must be \"slaney\" or \"htk\", but got \"{singer.dsConfig.mel_scale}\" from acoustic model");
}
156+ //mel specification matching checks
157+ if ( vocoder . sample_rate != singer . dsConfig . sample_rate ) {
158+ throw new Exception (
159+ $ "Vocoder and acoustic model has mismatching sample rate ({ vocoder . sample_rate } != { singer . dsConfig . sample_rate } )") ;
160+ }
138161 if ( vocoder . hop_size != singer . dsConfig . hop_size ) {
139- throw new Exception ( $ "Vocoder's hop size is { vocoder . hop_size } , but acoustic's hop size is { singer . dsConfig . hop_size } .") ;
162+ throw new Exception (
163+ $ "Vocoder and acoustic model has mismatching hop size ({ vocoder . hop_size } != { singer . dsConfig . hop_size } )") ;
164+ }
165+ if ( vocoder . win_size != singer . dsConfig . win_size ) {
166+ throw new Exception (
167+ $ "Vocoder and acoustic model has mismatching win size ({ vocoder . win_size } != { singer . dsConfig . win_size } )") ;
168+ }
169+ if ( vocoder . fft_size != singer . dsConfig . fft_size ) {
170+ throw new Exception (
171+ $ "Vocoder and acoustic model has mismatching FFT size ({ vocoder . fft_size } != { singer . dsConfig . fft_size } )") ;
172+ }
173+ if ( vocoder . num_mel_bins != singer . dsConfig . num_mel_bins ) {
174+ throw new Exception (
175+ $ "Vocoder and acoustic model has mismatching mel bins ({ vocoder . num_mel_bins } != { singer . dsConfig . num_mel_bins } )") ;
176+ }
177+ if ( Math . Abs ( vocoder . mel_fmin - singer . dsConfig . mel_fmin ) > 1e-5 ) {
178+ throw new Exception (
179+ $ "Vocoder and acoustic model has mismatching fmin ({ vocoder . mel_fmin } != { singer . dsConfig . mel_fmin } )") ;
140180 }
141- if ( vocoder . sample_rate != singer . dsConfig . sample_rate ) {
142- throw new Exception ( $ "Vocoder's sample rate is { vocoder . sample_rate } , but acoustic's sample rate is { singer . dsConfig . sample_rate } .") ;
181+ if ( Math . Abs ( vocoder . mel_fmax - singer . dsConfig . mel_fmax ) > 1e-5 ) {
182+ throw new Exception (
183+ $ "Vocoder and acoustic model has mismatching fmax ({ vocoder . mel_fmax } != { singer . dsConfig . mel_fmax } )") ;
184+ }
185+ // mismatching mel base can be transformed
186+ // if (vocoder.mel_base != singer.dsConfig.mel_base) {
187+ // throw new Exception(
188+ // $"Vocoder and acoustic model has mismatching mel base ({vocoder.mel_base} != {singer.dsConfig.mel_base})");
189+ // }
190+ if ( vocoder . mel_scale != singer . dsConfig . mel_scale ) {
191+ throw new Exception (
192+ $ "Vocoder and acoustic model has mismatching mel scale ({ vocoder . mel_scale } != { singer . dsConfig . mel_scale } )") ;
143193 }
144194
145195 var acousticModel = singer . getAcousticSession ( ) ;
@@ -301,6 +351,26 @@ float[] InvokeDiffsinger(RenderPhrase phrase, int depth, int speedup, Cancellati
301351 var acousticOutputs = acousticModel . Run ( acousticInputs ) ;
302352 mel = acousticOutputs . First ( ) . AsTensor < float > ( ) . Clone ( ) ;
303353 }
// Mel transform for differing mel bases.
// When the acoustic model and the vocoder declare different mel bases, every
// element of the acoustic output is rescaled by a constant factor before it is
// handed to the vocoder. Assuming the mel values are logarithms in the declared
// base, this is a change of base: ln(x) = log10(x) * ln(10) and
// log10(x) = ln(x) / ln(10).
if (vocoder.mel_base != singer.dsConfig.mel_base) {
    // Derive the factors from Math.Log instead of hand-typed, truncated decimals
    // so they are accurate to full float precision.
    double ln10 = Math.Log(10.0);
    float k;
    if (vocoder.mel_base == "e" && singer.dsConfig.mel_base == "10") {
        // base-10 model output -> natural-log vocoder input
        k = (float)ln10;
    } else if (vocoder.mel_base == "10" && singer.dsConfig.mel_base == "e") {
        // natural-log model output -> base-10 vocoder input
        k = (float)(1.0 / ln10);
    } else {
        // Unreachable as long as both bases were validated to be "10" or "e".
        throw new Exception("This should never happen");
    }
    // Scale each element in place; mel is indexed with three indices
    // (named b, t, c here).
    for (int b = 0; b < mel.Dimensions[0]; ++b) {
        for (int t = 0; t < mel.Dimensions[1]; ++t) {
            for (int c = 0; c < mel.Dimensions[2]; ++c) {
                mel[b, t, c] *= k;
            }
        }
    }
}
304374 //vocoder
305375 //waveform = session.run(['waveform'], {'mel': mel, 'f0': f0})[0]
306376 var vocoderInputs = new List < NamedOnnxValue > ( ) ;
0 commit comments