Skip to content

Commit 6912603

Browse files
authored
Merge pull request #911 from oxygen-dioxide/some
Audio transcription with SOME
2 parents 4bb930f + 802d186 commit 6912603

File tree

5 files changed

+370
-0
lines changed

5 files changed

+370
-0
lines changed

OpenUtau.Core/Analysis/Some.cs

Lines changed: 300 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,300 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using System.IO;
4+
using System.Linq;
5+
using System.Threading;
6+
using System.Threading.Tasks;
7+
using Microsoft.ML.OnnxRuntime;
8+
using Microsoft.ML.OnnxRuntime.Tensors;
9+
using NWaves.Signals;
10+
using OpenUtau.Core.Ustx;
11+
12+
namespace OpenUtau.Core.Analysis.Some{
13+
/// <summary>
/// Splits a mono waveform into chunks by cutting inside long silent regions.
/// Port of https://github.com/openvpi/audio-slicer/blob/main/slicer2.py.
/// All frame-based settings below are expressed in RMS frames of
/// <c>hop_size</c> samples each.
/// </summary>
public static class AudioSlicer{
    const int sample_rate = 44100;
    //RMS level below which a frame counts as silent
    const float threshold = 0.02f;
    //distance in samples between two consecutive RMS frames
    const int hop_size = 441;
    //number of samples in one RMS analysis window
    const int win_size = 1764;
    //minimum clip length (in frames) required before a cut in the middle is allowed
    const int min_length = 500;
    //minimum length (in frames) of a silent run that may be cut
    const int min_interval = 30;
    //maximum number of silent frames searched/kept around a cut
    const int max_sil_kept = 50;

    /// <summary>A slice of the original audio together with its start time.</summary>
    public struct Chunk{
        public double offsetMs;//position of this slice in the audio file in milliseconds
        public float[] samples;

        /// <summary>Wraps an existing sample buffer that starts at <paramref name="offsetMs"/>.</summary>
        public Chunk(double offsetMs, float[] samples){
            this.offsetMs = offsetMs;
            this.samples = samples;
        }

        /// <summary>
        /// Copies <c>originalSamples[startIndex..endIndex]</c> (end exclusive) and derives
        /// the offset in milliseconds from <paramref name="startIndex"/> and the sample rate.
        /// </summary>
        public Chunk(float[] originalSamples, int startIndex, int endIndex){
            samples = originalSamples[startIndex..endIndex];
            offsetMs = (double)startIndex * (1000.0 / sample_rate);
        }
    }

    /// <summary>
    /// Computes the root-mean-square level of overlapping frames.
    /// </summary>
    /// <param name="samples">Mono samples.</param>
    /// <param name="frame_length">Number of samples per analysis window.</param>
    /// <param name="hop_length">Number of samples between window starts.</param>
    /// <returns>One RMS value per frame; <c>samples.Length / hop_length</c> frames in total.</returns>
    static double[] get_rms(
        float[] samples,
        int frame_length = 2048,
        int hop_length = 512
    ){
        //reference: https://github.com/openvpi/audio-slicer/blob/main/slicer2.py#L5
        //y = np.pad(samples, padding, mode="constant")
        //The buffer is zero-padded by frame_length / 2 in front so that frame i is
        //roughly centered on sample i * hop_length.
        float[] squared = new float[samples.Length + frame_length];
        Array.Copy(samples, 0, squared, frame_length / 2, samples.Length);
        for(int i = 0; i < squared.Length; i++){
            squared[i] = squared[i] * squared[i];
        }
        //NOTE(review): the Python reference appears to yield one more frame
        //(1 + length / hop). The count is kept as is because Slice() multiplies
        //the frame count by hop_size to index into the samples.
        int output_size = samples.Length / hop_length;
        var rms = new double[output_size];
        for(int frame = 0; frame < output_size; frame++){
            int start = frame * hop_length;
            rms[frame] = Math.Sqrt(squared[start..(start + frame_length)].Average());
        }
        return rms;
    }

    /// <summary>Index of the first minimum element, like numpy's argmin.</summary>
    static int argmin(this double[] array){
        return Array.IndexOf(array, array.Min());
    }

    /// <summary>
    /// Index (relative to the whole array) of the first minimum of
    /// <c>values[start..end]</c>, end exclusive.
    /// </summary>
    static int argmin_range(double[] values, int start, int end){
        return values[start..end].argmin() + start;
    }

    /// <summary>
    /// Slices <paramref name="samples"/> at silent regions.
    /// </summary>
    /// <param name="samples">Mono samples; offsets are computed assuming 44100 Hz.</param>
    /// <returns>
    /// The non-silent chunks in chronological order. When the audio is too short or
    /// no cut point is found, a single chunk holding the whole input is returned.
    /// </returns>
    public static List<Chunk> Slice(float[] samples){
        //reference: https://github.com/openvpi/audio-slicer/blob/main/slicer2.py#L68
        if((samples.Length + hop_size - 1) / hop_size <= min_length){
            return new List<Chunk>{new Chunk(0, samples)};
        }
        var rms_list = get_rms(
            samples,
            frame_length: win_size,
            hop_length: hop_size
        );
        //each tag is a [start, end] range of silent frames to be removed
        var sil_tags = new List<(int start, int end)>();
        int silence_start = -1;//here -1 means none
        int clip_start = 0;
        for(int i = 0; i < rms_list.Length; i++){
            //Keep looping while frame is silent.
            if(rms_list[i] < threshold){
                //Record start of silent frames.
                if(silence_start < 0){
                    silence_start = i;
                }
                continue;
            }
            //Keep looping while frame is not silent and silence start has not been recorded.
            if(silence_start < 0){
                continue;
            }
            //Clear recorded silence start if interval is not enough or clip is too short
            bool is_leading_silence = silence_start == 0 && i > max_sil_kept;
            bool need_slice_middle = i - silence_start >= min_interval && i - clip_start >= min_length;
            if(!is_leading_silence && !need_slice_middle){
                silence_start = -1;
                continue;
            }
            //Need slicing. Record the range of silent frames to be removed.
            int silence_length = i - silence_start;
            if(silence_length <= max_sil_kept){
                //Short silence: cut at the single quietest frame.
                int pos = argmin_range(rms_list, silence_start, i + 1);
                sil_tags.Add(silence_start == 0 ? (0, pos) : (pos, pos));
                clip_start = pos;
            } else if(silence_length <= max_sil_kept * 2){
                //Medium silence: the left and right search windows overlap.
                int pos = argmin_range(rms_list, i - max_sil_kept, silence_start + max_sil_kept + 1);
                int pos_l = argmin_range(rms_list, silence_start, silence_start + max_sil_kept + 1);
                int pos_r = argmin_range(rms_list, i - max_sil_kept, i + 1);
                if(silence_start == 0){
                    sil_tags.Add((0, pos_r));
                    clip_start = pos_r;
                } else {
                    sil_tags.Add((Math.Min(pos_l, pos), Math.Max(pos_r, pos)));
                    clip_start = Math.Max(pos_r, pos);
                }
            } else {
                //Long silence: keep at most max_sil_kept frames on each side.
                int pos_l = argmin_range(rms_list, silence_start, silence_start + max_sil_kept + 1);
                int pos_r = argmin_range(rms_list, i - max_sil_kept, i + 1);
                sil_tags.Add(silence_start == 0 ? (0, pos_r) : (pos_l, pos_r));
                clip_start = pos_r;
            }
            silence_start = -1;
        }
        //Deal with trailing silence.
        int total_frames = rms_list.Length;
        if(silence_start >= 0 && total_frames - silence_start >= min_interval){
            int silence_end = Math.Min(total_frames, silence_start + max_sil_kept);
            //C# ranges throw when the end exceeds the array length (numpy slices
            //clamp silently), so the exclusive end must be limited to total_frames.
            int search_end = Math.Min(silence_end + 1, total_frames);
            int pos = argmin_range(rms_list, silence_start, search_end);
            sil_tags.Add((pos, total_frames + 1));
        }
        //Apply and return slices.
        if(sil_tags.Count == 0){
            return new List<Chunk>{new Chunk(0, samples)};
        }
        var chunks = new List<Chunk>();
        //audio before the first removed range
        if(sil_tags[0].start > 0){
            chunks.Add(new Chunk(
                samples,
                0,
                sil_tags[0].start * hop_size
            ));
        }
        //audio between two consecutive removed ranges
        for(int i = 0; i < sil_tags.Count - 1; i++){
            chunks.Add(new Chunk(
                samples,
                sil_tags[i].end * hop_size,
                sil_tags[i + 1].start * hop_size
            ));
        }
        //audio after the last removed range
        if(sil_tags[^1].end < total_frames){
            chunks.Add(new Chunk(
                samples,
                sil_tags[^1].end * hop_size,
                total_frames * hop_size
            ));
        }
        return chunks;
    }
}
164+
165+
/// <summary>
/// Settings deserialized from the "some.yaml" file by the <c>Some</c> constructor.
/// Field names are part of the YAML contract and must not be renamed.
/// </summary>
internal class SomeConfig {
    //File name of the ONNX model, combined with the SOME dependency folder when loading.
    public string model = "model.onnx";

    //Sample rate declared by the model package; not read anywhere in this file.
    public int sample_rate = 44100;
}
169+
170+
/// <summary>
/// Runs the SOME ONNX model on a wave part and turns the detected notes into
/// a voice part. Owns an <see cref="InferenceSession"/> and must be disposed.
/// </summary>
public class Some: IDisposable {
    InferenceSession session;
    string Location;
    private bool disposedValue;

    struct SomeResult{
        //midi number of each note
        public float[] note_midi;
        //whether each note is a rest
        public bool[] note_rest;
        //duration of each note in seconds
        public float[] note_dur;
    }

    /// <summary>
    /// Reads "some.yaml" from the "some" folder under the dependency path and
    /// opens the ONNX model it names.
    /// </summary>
    /// <exception cref="Exception">
    /// Thrown when the configuration or the model cannot be loaded; the original
    /// failure is available as <see cref="Exception.InnerException"/>.
    /// </exception>
    public Some() {
        try {
            Location = Path.Combine(PathManager.Inst.DependencyPath, "some");
            var config = Yaml.DefaultDeserializer.Deserialize<SomeConfig>(
                File.ReadAllText(Path.Combine(Location, "some.yaml"),
                    System.Text.Encoding.UTF8));
            session = Onnx.getInferenceSession(Path.Combine(Location, config.model));
        }
        catch (Exception ex) {
            //TODO: onnx download site
            //Pass the cause along as inner exception so its stack trace is not lost.
            throw new Exception($"Error loading SOME. Please download SOME from https://github.com/xunmengshe/OpenUtau/releases/0.0.0.0\n{ex.Message}", ex);
        }
    }

    /// <summary>
    /// Analyzes one slice of mono audio samples and returns the notes the model found.
    /// </summary>
    /// <param name="samples">Mono samples, fed to the model as a [1, length] "waveform" tensor.</param>
    SomeResult Analyze(float[] samples) {
        var waveform = new DenseTensor<float>(samples, new int[] { samples.Length }, false)
            .Reshape(new int[] { 1, samples.Length });
        var inputs = new List<NamedOnnxValue> {
            NamedOnnxValue.CreateFromTensor("waveform", waveform)
        };
        //The result collection is disposable; the values are copied into managed
        //arrays below before it is released at the end of this method.
        using var outputs = session.Run(inputs);
        float[] note_midi = outputs
            .First(o => o.Name == "note_midi")
            .AsTensor<float>()
            .ToArray();
        bool[] note_rest = outputs
            .First(o => o.Name == "note_rest")
            .AsTensor<bool>()
            .ToArray();
        float[] note_dur = outputs
            .First(o => o.Name == "note_dur")
            .AsTensor<float>()
            .ToArray();
        return new SomeResult{
            note_midi = note_midi,
            note_rest = note_rest,
            note_dur = note_dur
        };
    }

    /// <summary>
    /// Downmixes interleaved multi-channel samples to mono by averaging all
    /// channels of each sample frame.
    /// </summary>
    /// <param name="stereoSamples">Interleaved samples.</param>
    /// <param name="channels">Number of interleaved channels.</param>
    /// <returns>The input array itself when it is already mono, otherwise a new array.</returns>
    private float[] ToMono(float[] stereoSamples, int channels){
        if(channels == 1){
            return stereoSamples;
        }
        float[] monoSamples = new float[stereoSamples.Length / channels];
        for(int i = 0; i < monoSamples.Length; i++){
            //Sum every channel of frame i. The previous range expression ended at
            //(i+1)*channels-1, which is exclusive and therefore dropped the last channel.
            int frameStart = i * channels;
            double sum = 0;
            for(int c = 0; c < channels; c++){
                sum += stereoSamples[frameStart + c];
            }
            monoSamples[i] = (float)(sum / channels);
        }
        return monoSamples;
    }

    /// <summary>
    /// Runs the SOME model on the audio part the user selected and builds a voice
    /// part holding the extracted notes.
    /// </summary>
    /// <param name="project">Project providing the time axis and the note factory.</param>
    /// <param name="wavePart">Audio part to transcribe.</param>
    /// <param name="progress">
    /// Called before each slice is analyzed with the slice position in whole seconds.
    /// May be null.
    /// </param>
    /// <returns>A new voice part at the same position as <paramref name="wavePart"/>.</returns>
    public UVoicePart Transcribe(UProject project, UWavePart wavePart, Action<int> progress){
        //convert samples to mono and slice
        var monoSamples = ToMono(wavePart.Samples, wavePart.channels);
        var chunks = AudioSlicer.Slice(monoSamples);
        var part = new UVoicePart();
        part.position = wavePart.position;
        part.Duration = wavePart.Duration;
        var timeAxis = project.timeAxis;
        double partOffsetMs = timeAxis.TickPosToMsPos(wavePart.position);
        double currMs = partOffsetMs;

        foreach(var chunk in chunks){
            //report position of current slice in seconds
            progress?.Invoke((int)(chunk.offsetMs / 1000));
            //Nothing to analyze in an empty slice (e.g. an empty wave part).
            if(chunk.samples.Length == 0){
                continue;
            }
            var someResult = Analyze(chunk.samples);
            var note_midi = someResult.note_midi;
            var note_rest = someResult.note_rest;
            var note_dur = someResult.note_dur;
            //Put the notes into a new voice part
            //Notes of a slice are laid out back to back starting at the slice offset.
            currMs = chunk.offsetMs + partOffsetMs;
            for(int index = 0; index < note_midi.Length; index++){
                var noteDurMs = note_dur[index] * 1000;
                if(!note_rest[index]){
                    var posTick = timeAxis.MsPosToTickPos(currMs);
                    var durTick = timeAxis.MsPosToTickPos(currMs + noteDurMs) - posTick;
                    var note = project.CreateNote(
                        (int)Math.Round(note_midi[index]),
                        posTick - wavePart.position,
                        durTick
                    );
                    part.notes.Add(note);
                }
                //rests advance the time cursor too, they just produce no note
                currMs += noteDurMs;
            }
        }
        //Extend the part when the last note ends after the original part end.
        var endTick = timeAxis.MsPosToTickPos(currMs);
        if(endTick > part.End){
            part.Duration = endTick - part.position;
        }
        return part;
    }

    /// <summary>Releases the inference session once; later calls do nothing.</summary>
    protected virtual void Dispose(bool disposing) {
        if (disposedValue) {
            return;
        }
        if (disposing) {
            session?.Dispose();
        }
        disposedValue = true;
    }

    public void Dispose() {
        Dispose(disposing: true);
        GC.SuppressFinalize(this);
    }
}
300+
}

OpenUtau/Strings/Strings.axaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,11 @@
66
<system:String x:Key="context.note.delete">Delete note</system:String>
77
<system:String x:Key="context.note.pasteparameters">Select and paste parameters</system:String>
88
<system:String x:Key="context.part.delete">Delete part</system:String>
9+
<system:String x:Key="context.part.gotofile">View file location</system:String>
910
<system:String x:Key="context.part.rename">Rename part</system:String>
1011
<system:String x:Key="context.part.replaceaudio">Reselect audio file</system:String>
12+
<system:String x:Key="context.part.transcribe">Transcribe audio to create a note part</system:String>
13+
<system:String x:Key="context.part.transcribing">Transcribing</system:String>
1114
<system:String x:Key="context.pitch.easein">Ease in</system:String>
1215
<system:String x:Key="context.pitch.easeinout">Ease in/out</system:String>
1316
<system:String x:Key="context.pitch.easeout">Ease out</system:String>

OpenUtau/ViewModels/MainWindowViewModel.cs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,9 @@ public class PartsContextMenuArgs {
1818
public bool IsWavePart => Part is UWavePart;
1919
public ReactiveCommand<UPart, Unit>? PartDeleteCommand { get; set; }
2020
public ReactiveCommand<UPart, Unit>? PartRenameCommand { get; set; }
21+
public ReactiveCommand<UPart, Unit>? PartGotoFileCommand { get; set; }
2122
public ReactiveCommand<UPart, Unit>? PartReplaceAudioCommand { get; set; }
23+
public ReactiveCommand<UPart, Unit>? PartTranscribeCommand { get; set; }
2224
}
2325

2426
public class MainWindowViewModel : ViewModelBase, ICmdSubscriber {

OpenUtau/Views/MainWindow.axaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,10 @@
229229
<MenuItem Header="{DynamicResource context.part.delete}"
230230
Command="{Binding PartDeleteCommand}"
231231
CommandParameter="{Binding Part}"/>
232+
<MenuItem Header="{DynamicResource context.part.gotofile}"
233+
IsVisible="{Binding IsWavePart}"
234+
Command="{Binding PartGotoFileCommand}"
235+
CommandParameter="{Binding Part}"/>
232236
<MenuItem Header="{DynamicResource context.part.replaceaudio}"
233237
IsVisible="{Binding IsWavePart}"
234238
Command="{Binding PartReplaceAudioCommand}"
@@ -237,6 +241,10 @@
237241
IsVisible="{Binding IsVoicePart}"
238242
Command="{Binding PartRenameCommand}"
239243
CommandParameter="{Binding Part}"/>
244+
<MenuItem Header="{DynamicResource context.part.transcribe}"
245+
IsVisible="{Binding IsWavePart}"
246+
Command="{Binding PartTranscribeCommand}"
247+
CommandParameter="{Binding Part}"/>
240248
</ContextMenu>
241249
</c:PartsCanvas.ContextMenu>
242250
</c:PartsCanvas>

0 commit comments

Comments
 (0)