using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using Microsoft.ML.OnnxRuntime;
using Microsoft.ML.OnnxRuntime.Tensors;
using OpenUtau.Core.Ustx;

namespace OpenUtau.Core.Analysis.Some{
    public static class AudioSlicer{
        static int sample_rate = 44100;
        static float threshold = 0.02f;
        static int hop_size = 441;
        static int win_size = 1764;
        static int min_length = 500;
        static int min_interval = 30;
        static int max_sil_kept = 50;
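        //Frame-based units: with hop_size = 441 samples at 44100 Hz, one frame is 10 ms.
        //So min_length = 500 frames is roughly a 5 s minimum clip, min_interval = 30 frames
        //is roughly a 300 ms minimum silence run, and max_sil_kept = 50 frames is roughly
        //500 ms of silence kept on each side of a cut.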

        public struct Chunk{
            public double offsetMs;//position of this slice in the audio file in milliseconds
            public float[] samples;

            public Chunk(double offsetMs, float[] samples){
                this.offsetMs = offsetMs;
                this.samples = samples;
            }

            public Chunk(float[] originalSamples, int startIndex, int endIndex){
                samples = originalSamples[startIndex..endIndex];
                offsetMs = (double)startIndex * (1000.0 / sample_rate);
            }
        }
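        //Note: originalSamples[startIndex..endIndex] allocates a copy of that sample
        //range, and startIndex is converted from a sample index to milliseconds at
        //sample_rate.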

        static double[] get_rms(
            float[] samples,
            int frame_length = 2048,
            int hop_length = 512
        ){
            //reference: https://github.com/openvpi/audio-slicer/blob/main/slicer2.py#L5
            //Center the frames: pad frame_length/2 zeros on each side,
            //like y = np.pad(samples, padding, mode="constant")
            float[] y = new float[samples.Length + frame_length];
            Array.Copy(samples, 0, y, frame_length / 2, samples.Length);
            //square in place, so each window's average is the mean power
            for(int i=0; i<y.Length; i++){
                y[i] = y[i] * y[i];
            }
            int output_size = samples.Length / hop_length;
            //RMS per frame: sqrt of the mean of the squared samples in each window
            return Enumerable.Range(0, output_size)
                .Select(i => Math.Sqrt(y[(i*hop_length)..(i*hop_length+frame_length)].Average()))
                .ToArray();
        }

        static int argmin(this double[] array){
            //numpy-style argmin: index of the minimum value, first occurrence on ties
            return Array.IndexOf(array, array.Min());
        }

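        //Slices audio at silences, mirroring the openvpi audio-slicer algorithm:
        //scan the per-frame RMS curve for runs of frames below threshold, record
        //(start, end) cut tags in frame units around each qualifying silence, then
        //return the audio between consecutive tags as chunks.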
        public static List<Chunk> Slice(float[] samples){
            //reference: https://github.com/openvpi/audio-slicer/blob/main/slicer2.py#L68
            //If the audio is no longer than min_length frames, return it whole.
            if((samples.Length + hop_size - 1) / hop_size <= min_length){
                return new List<Chunk>{new Chunk(0, samples)};
            }
            var rms_list = get_rms(
                samples,
                frame_length: win_size,
                hop_length: hop_size
            );
            var sil_tags = new List<Tuple<int,int>>();
            int silence_start = -1;//here -1 means none
            int clip_start = 0;
            foreach(int i in Enumerable.Range(0, rms_list.Length)){
                var rms = rms_list[i];
                //Keep looping while frame is silent.
                if(rms < threshold){
                    //Record start of silent frames.
                    if(silence_start < 0){
                        silence_start = i;
                    }
                    continue;
                }
                //Keep looping while frame is not silent and silence start has not been recorded.
                if(silence_start < 0){
                    continue;
                }
                //Clear recorded silence start if interval is not enough or clip is too short
                var is_leading_silence = silence_start == 0 && i > max_sil_kept;
                var need_slice_middle = i - silence_start >= min_interval && i - clip_start >= min_length;
                if(!is_leading_silence && !need_slice_middle){
                    silence_start = -1;
                    continue;
                }
                //Need slicing. Record the range of silent frames to be removed.
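                //Three cases by silence length: a short run gets a single cut at its
                //quietest frame; a medium run (at most max_sil_kept * 2 frames) keeps
                //up to max_sil_kept frames on each side and removes the middle; a
                //longer run is cut independently near each edge.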
                if(i - silence_start <= max_sil_kept){
                    var pos = rms_list[silence_start..(i+1)].argmin() + silence_start;
                    if(silence_start == 0){
                        sil_tags.Add(Tuple.Create(0, pos));
                    } else {
                        sil_tags.Add(Tuple.Create(pos, pos));
                    }
                    clip_start = pos;
                } else if(i - silence_start <= max_sil_kept * 2){
                    var pos = rms_list[(i - max_sil_kept)..(silence_start + max_sil_kept + 1)].argmin();
                    pos += i - max_sil_kept;
                    var pos_l = rms_list[silence_start..(silence_start + max_sil_kept + 1)].argmin() + silence_start;
                    var pos_r = rms_list[(i - max_sil_kept)..(i+1)].argmin() + i - max_sil_kept;
                    if(silence_start == 0){
                        sil_tags.Add(Tuple.Create(0, pos_r));
                        clip_start = pos_r;
                    } else {
                        sil_tags.Add(Tuple.Create(Math.Min(pos_l, pos), Math.Max(pos_r, pos)));
                        clip_start = Math.Max(pos_r, pos);
                    }
                } else {
                    var pos_l = rms_list[silence_start..(silence_start + max_sil_kept + 1)].argmin() + silence_start;
                    var pos_r = rms_list[(i - max_sil_kept)..(i+1)].argmin() + i - max_sil_kept;
                    if(silence_start == 0){
                        sil_tags.Add(Tuple.Create(0, pos_r));
                    } else {
                        sil_tags.Add(Tuple.Create(pos_l, pos_r));
                    }
                    clip_start = pos_r;
                }
                silence_start = -1;
            }
            //Deal with trailing silence.
            var total_frames = rms_list.Length;
            if(silence_start >= 0 && total_frames - silence_start >= min_interval){
                //Clamp the search window to the end of rms_list: numpy slices clamp an
                //out-of-range stop, but C# ranges throw, so cap at total_frames - 1.
                var silence_end = Math.Min(total_frames - 1, silence_start + max_sil_kept);
                var pos = rms_list[silence_start..(silence_end + 1)].argmin() + silence_start;
                sil_tags.Add(Tuple.Create(pos, total_frames + 1));
            }
            //Apply and return slices.
            if(sil_tags.Count == 0){
                return new List<Chunk>{new Chunk(0, samples)};
            } else {
                var chunks = new List<Chunk>();
                //audio before the first silence tag
                if(sil_tags[0].Item1 > 0){
                    chunks.Add(new Chunk(
                        samples,
                        0,
                        sil_tags[0].Item1 * hop_size
                    ));
                }
                //audio between consecutive silence tags
                foreach(var i in Enumerable.Range(0, sil_tags.Count - 1)){
                    chunks.Add(new Chunk(
                        samples,
                        sil_tags[i].Item2 * hop_size,
                        sil_tags[i+1].Item1 * hop_size
                    ));
                }
                //audio after the last silence tag
                if(sil_tags[^1].Item2 < total_frames){
                    chunks.Add(new Chunk(
                        samples,
                        sil_tags[^1].Item2 * hop_size,
                        total_frames * hop_size
                    ));
                }
                return chunks;
            }
        }
    }

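    //Deserialized from some.yaml in the dependency folder; "model" names the ONNX file to load.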
    class SomeConfig{
        public string model = "model.onnx";
        public int sample_rate = 44100;
    }

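    //Wraps the SOME (Singing-Oriented MIDI Extractor) ONNX model.
    //Usage sketch (hypothetical call site, not part of this file):
    //  using(var some = new Some()){
    //      var voicePart = some.Transcribe(project, wavePart, pos => ReportProgress(pos));
    //  }
    //where ReportProgress is a placeholder for the caller's progress handler.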
    public class Some: IDisposable {
        InferenceSession session;
        string Location;
        private bool disposedValue;

        struct SomeResult{
            //midi number of each note
            public float[] note_midi;
            //whether each note is a rest
            public bool[] note_rest;
            //duration of each note in seconds
            public float[] note_dur;
        }

        public Some() {
            try {
                Location = Path.Combine(PathManager.Inst.DependencyPath, "some");
                var config = Yaml.DefaultDeserializer.Deserialize<SomeConfig>(
                    File.ReadAllText(Path.Combine(Location, "some.yaml"),
                        System.Text.Encoding.UTF8));
                session = Onnx.getInferenceSession(Path.Combine(Location, config.model));
            }
            catch (Exception ex) {
                //TODO: onnx download site
                throw new Exception($"Error loading SOME. Please download SOME from https://github.com/xunmengshe/OpenUtau/releases/0.0.0.0\n{ex.Message}");
            }
        }

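        //Model I/O, as read from the calls below: input "waveform" is a float32
        //tensor of shape [1, n_samples]; outputs "note_midi", "note_rest" and
        //"note_dur" are parallel arrays with one entry per detected note.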
        SomeResult Analyze(float[] samples) {
            //Analyze a slice of audio samples and return the result
            var inputs = new List<NamedOnnxValue>();
            inputs.Add(NamedOnnxValue.CreateFromTensor("waveform",
                new DenseTensor<float>(samples, new int[] { samples.Length }, false)
                .Reshape(new int[] { 1, samples.Length })));
            //dispose the output collection once the tensors have been copied out
            using var outputs = session.Run(inputs);
            float[] note_midi = outputs
                .First(o => o.Name == "note_midi")
                .AsTensor<float>()
                .ToArray();
            bool[] note_rest = outputs
                .First(o => o.Name == "note_rest")
                .AsTensor<bool>()
                .ToArray();
            float[] note_dur = outputs
                .First(o => o.Name == "note_dur")
                .AsTensor<float>()
                .ToArray();
            return new SomeResult{
                note_midi = note_midi,
                note_rest = note_rest,
                note_dur = note_dur
            };
        }

        private float[] ToMono(float[] stereoSamples, int channels){
            if(channels == 1){
                return stereoSamples;
            }
            float[] monoSamples = new float[stereoSamples.Length / channels];
            for(int i = 0; i < monoSamples.Length; i++){
                //average all channels of sample frame i; the range end is exclusive,
                //so (i+1)*channels covers every channel of the frame
                monoSamples[i] = stereoSamples[(i*channels)..((i+1)*channels)].Average();
            }
            return monoSamples;
        }

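        //Pipeline: downmix to mono, slice at silences, run SOME on each chunk, then
        //convert each note's millisecond offset and duration to ticks on the project's
        //time axis and add it to a new voice part aligned with the wave part.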
        public UVoicePart Transcribe(UProject project, UWavePart wavePart, Action<int> progress){
            //Run the SOME model on the audio part the user selected to extract note information.
            //Convert samples to mono and slice at silences.
            var monoSamples = ToMono(wavePart.Samples, wavePart.channels);
            var chunks = AudioSlicer.Slice(monoSamples);
            var part = new UVoicePart();
            part.position = wavePart.position;
            part.Duration = wavePart.Duration;
            var timeAxis = project.timeAxis;
            double partOffsetMs = timeAxis.TickPosToMsPos(wavePart.position);
            double currMs = partOffsetMs;

            int wavPosS = 0;//position of the current slice in the audio file in seconds
            foreach(var chunk in chunks){
                wavPosS = (int)(chunk.offsetMs / 1000);
                progress.Invoke(wavPosS);
                var someResult = Analyze(chunk.samples);
                var note_midi = someResult.note_midi;
                var note_rest = someResult.note_rest;
                var note_dur = someResult.note_dur;
                //Put the notes into a new voice part
                double chunkOffsetMs = chunk.offsetMs + partOffsetMs;
                currMs = chunkOffsetMs;
                foreach(int index in Enumerable.Range(0, note_midi.Length)){
                    var noteDurMs = note_dur[index] * 1000;
                    //Rests advance the time cursor but produce no note.
                    if(!note_rest[index]){
                        var posTick = timeAxis.MsPosToTickPos(currMs);
                        var durTick = timeAxis.MsPosToTickPos(currMs + noteDurMs) - posTick;
                        var note = project.CreateNote(
                            (int)Math.Round(note_midi[index]),
                            posTick - wavePart.position,//note positions are relative to the part
                            durTick
                        );
                        part.notes.Add(note);
                    }
                    currMs += noteDurMs;
                }
            }
            //Extend the part if transcribed notes run past the wave part's end.
            var endTick = timeAxis.MsPosToTickPos(currMs);
            if(endTick > part.End){
                part.Duration = endTick - part.position;
            }
            return part;
        }

        protected virtual void Dispose(bool disposing) {
            if (!disposedValue) {
                if (disposing) {
                    session.Dispose();
                }
                disposedValue = true;
            }
        }

        public void Dispose() {
            Dispose(disposing: true);
            GC.SuppressFinalize(this);
        }
    }
}