Skip to content

Commit 3ceafc3

Browse files
committed
Integrate speech gate processor into azure speech detection
1 parent 4bd34a1 commit 3ceafc3

File tree

4 files changed

+91
-87
lines changed

4 files changed

+91
-87
lines changed

core/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ mod duration;
66
mod protocol;
77
mod registry;
88
pub mod service;
9+
pub mod speech_gate;
910

1011
use std::time;
1112

src/speech_gate.rs renamed to core/src/speech_gate.rs

Lines changed: 86 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
use crate::AudioFrame;
2-
use fundsp::{hacker::*, numeric_array::NumericArray};
32

43
pub fn make_speech_gate_processor(
54
threshold: f32,
@@ -11,102 +10,100 @@ pub fn make_speech_gate_processor(
1110
// 0.05 / 0.025
1211
// 0.5 / 0.030 (not very good)
1312
// 0.1 / 0.075 (but not too good.)
14-
// soft_rms: (knee_width / threshold)
15-
// 0.01 / 0.0025 (echo example barely audible)
1613
make_speech_gate_processor_soft_rms(threshold, attack_ms, release_ms, 0.01)
1714
}
1815

19-
/// Returns a processing function that can be called for each AudioFrame (mono, 16kHz, i16)
20-
#[allow(unused)]
21-
pub fn make_speech_gate_processor_(
22-
threshold: f32,
23-
attack_ms: f32,
24-
release_ms: f32,
25-
) -> Box<dyn FnMut(&AudioFrame) -> AudioFrame> {
26-
let mut node = simple_speech_gate(threshold, 6.0, attack_ms, release_ms);
27-
let mut sample_rate = None;
16+
// Returns a processing function that can be called for each AudioFrame (mono, 16kHz, i16)
17+
// #[allow(unused)]
18+
// pub fn make_speech_gate_processor_(
19+
// threshold: f32,
20+
// attack_ms: f32,
21+
// release_ms: f32,
22+
// ) -> Box<dyn FnMut(&AudioFrame) -> AudioFrame> {
23+
// let mut node = simple_speech_gate(threshold, 6.0, attack_ms, release_ms);
24+
// let mut sample_rate = None;
2825

29-
Box::new(move |frame: &AudioFrame| {
30-
let frame_sample_rate = frame.format.sample_rate as f64;
31-
match sample_rate {
32-
None => {
33-
node.set_sample_rate(frame_sample_rate);
34-
sample_rate = Some(frame_sample_rate);
35-
}
36-
Some(rate) if rate != frame_sample_rate => {
37-
panic!("Changing frame sample rate is not supported in the speech gate processor");
38-
}
39-
Some(_) => {
40-
// same rate, all good
41-
}
42-
}
26+
// Box::new(move |frame: &AudioFrame| {
27+
// let frame_sample_rate = frame.format.sample_rate as f64;
28+
// match sample_rate {
29+
// None => {
30+
// node.set_sample_rate(frame_sample_rate);
31+
// sample_rate = Some(frame_sample_rate);
32+
// }
33+
// Some(rate) if rate != frame_sample_rate => {
34+
// panic!("Changing frame sample rate is not supported in the speech gate processor");
35+
// }
36+
// Some(_) => {
37+
// // same rate, all good
38+
// }
39+
// }
4340

44-
let samples_f32: Vec<f32> = frame.samples.iter().map(|&s| s as f32 / 32768.0).collect();
45-
let processed: Vec<f32> = samples_f32
46-
.iter()
47-
.map(|&sample| node.tick(&NumericArray::from([sample]))[0])
48-
.collect();
49-
let processed_i16: Vec<i16> = processed
50-
.iter()
51-
.map(|&s| (s.clamp(-1.0, 1.0) * 32767.0) as i16)
52-
.collect();
53-
AudioFrame {
54-
format: frame.format,
55-
samples: processed_i16,
56-
}
57-
})
58-
}
41+
// let samples_f32: Vec<f32> = frame.samples.iter().map(|&s| s as f32 / 32768.0).collect();
42+
// let processed: Vec<f32> = samples_f32
43+
// .iter()
44+
// .map(|&sample| node.tick(&NumericArray::from([sample]))[0])
45+
// .collect();
46+
// let processed_i16: Vec<i16> = processed
47+
// .iter()
48+
// .map(|&s| (s.clamp(-1.0, 1.0) * 32767.0) as i16)
49+
// .collect();
50+
// AudioFrame {
51+
// format: frame.format,
52+
// samples: processed_i16,
53+
// }
54+
// })
55+
// }
5956

60-
fn simple_speech_gate(
61-
threshold: f32,
62-
softness: f32, // e.g., 6.0 dB for soft knee
63-
attack_ms: f32,
64-
release_ms: f32,
65-
) -> An<impl AudioNode<Inputs = U1, Outputs = U1>> {
66-
let envelope_follower = envelope(|x| x * x)
67-
>> lowpass_hz(10.0, 1.0) // smoother RMS, ~100 ms
68-
>> map(|x| x[0].sqrt());
57+
// fn simple_speech_gate(
58+
// threshold: f32,
59+
// softness: f32, // e.g., 6.0 dB for soft knee
60+
// attack_ms: f32,
61+
// release_ms: f32,
62+
// ) -> An<impl AudioNode<Inputs = U1, Outputs = U1>> {
63+
// let envelope_follower = envelope(|x| x * x)
64+
// >> lowpass_hz(10.0, 1.0) // smoother RMS, ~100 ms
65+
// >> map(|x| x[0].sqrt());
6966

70-
let soft_gate = envelope_follower
71-
>> map(move |level| {
72-
let db = 20.0 * level[0].log10().max(-120.0);
73-
let gain_db = if db < threshold - softness {
74-
-60.0 // silence
75-
} else if db > threshold + softness {
76-
0.0 // full gain
77-
} else {
78-
// Linear ramp over 2 * softness dB
79-
-60.0 * (1.0 - (db - (threshold - softness)) / (2.0 * softness))
80-
};
81-
db_to_gain(gain_db)
82-
})
83-
>> afollow(attack_ms / 1000.0, release_ms / 1000.0);
67+
// let soft_gate = envelope_follower
68+
// >> map(move |level| {
69+
// let db = 20.0 * level[0].log10().max(-120.0);
70+
// let gain_db = if db < threshold - softness {
71+
// -60.0 // silence
72+
// } else if db > threshold + softness {
73+
// 0.0 // full gain
74+
// } else {
75+
// // Linear ramp over 2 * softness dB
76+
// -60.0 * (1.0 - (db - (threshold - softness)) / (2.0 * softness))
77+
// };
78+
// db_to_gain(gain_db)
79+
// })
80+
// >> afollow(attack_ms / 1000.0, release_ms / 1000.0);
8481

85-
pass() * soft_gate
86-
}
82+
// pass() * soft_gate
83+
// }
8784

8885
// Convert dB to linear gain
89-
fn db_to_gain(db: f32) -> f32 {
90-
10.0_f32.powf(db / 20.0)
91-
}
86+
// fn db_to_gain(db: f32) -> f32 {
87+
// 10.0_f32.powf(db / 20.0)
88+
// }
9289

93-
#[allow(unused)]
94-
fn simple_speech_gate_v1(
95-
threshold: f32,
96-
attack_ms: f32,
97-
release_ms: f32,
98-
) -> An<impl AudioNode<Inputs = U1, Outputs = U1>> {
99-
// Proper RMS envelope follower with 10ms window
100-
let rms = envelope(|x| x * x) >> lowpass_hz(100.0, 1.0) >> map(|x| x[0].sqrt());
90+
// #[allow(unused)]
91+
// fn simple_speech_gate_v1(
92+
// threshold: f32,
93+
// attack_ms: f32,
94+
// release_ms: f32,
95+
// ) -> An<impl AudioNode<Inputs = U1, Outputs = U1>> {
96+
// // Proper RMS envelope follower with 10ms window
97+
// let rms = envelope(|x| x * x) >> lowpass_hz(100.0, 1.0) >> map(|x| x[0].sqrt());
10198

102-
// Gate control with smoothing using follow
103-
let gate_control = rms
104-
>> map(move |level| if level[0] > threshold { 1.0 } else { 0.0 })
105-
>> afollow(attack_ms / 1000.0, release_ms / 1000.0);
99+
// // Gate control with smoothing using follow
100+
// let gate_control = rms
101+
// >> map(move |level| if level[0] > threshold { 1.0 } else { 0.0 })
102+
// >> afollow(attack_ms / 1000.0, release_ms / 1000.0);
106103

107-
// Apply gating
108-
pass() * gate_control
109-
}
104+
// // Apply gating
105+
// pass() * gate_control
106+
// }
110107

111108
/// Returns a processing function that applies an attack/release envelope-based speech gate (no fundsp), with lazy sample rate initialization and a hard threshold (no knee).
112109
/// Works well with 0.0025 threshold for the examples.
@@ -217,12 +214,16 @@ pub fn make_speech_gate_processor_soft(
217214
/// for smoother, more natural speech gating and better echo suppression.
218215
///
219216
/// Claude 3.7
217+
///
218+
/// Tuning notes for soft_rms (knee_width / threshold):
219+
/// 0.01 / 0.0025 (echo example barely audible)
220+
/// Possible optimization (not implemented yet): take the `AudioFrame` by value and reuse its sample buffer for the returned frame, since the caller does not use the input frame afterwards.
220221
pub fn make_speech_gate_processor_soft_rms(
221222
threshold: f32, // normalized, 0.0 to 1.0
222223
attack_ms: f32,
223224
release_ms: f32,
224225
knee_width: f32,
225-
) -> Box<dyn FnMut(&AudioFrame) -> AudioFrame> {
226+
) -> Box<dyn FnMut(&AudioFrame) -> AudioFrame + Send + Sync> {
226227
let mut envelope = 0.0f32;
227228
let mut sample_rate: Option<f32> = None;
228229
let mut attack_coeff = 0.0f32;

services/azure/src/transcribe.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ use crate::Host;
1010
use context_switch_core::{
1111
BillingRecord, Service,
1212
conversation::{BillingSchedule, Conversation, Input},
13+
speech_gate::make_speech_gate_processor_soft_rms,
1314
};
1415

1516
#[derive(Debug, Deserialize)]
@@ -66,7 +67,9 @@ impl Service for AzureTranscribe {
6667
.into_header_for_infinite_file();
6768
stream! {
6869
yield wav_header;
69-
while let Some(Input::Audio{frame}) = input.recv().await {
70+
let mut speech_gate = make_speech_gate_processor_soft_rms(0.0025, 10., 300., 0.01);
71+
while let Some(Input::Audio{ frame }) = input.recv().await {
72+
let frame = speech_gate(&frame);
7073
yield frame.to_le_bytes();
7174
// <https://azure.microsoft.com/en-us/pricing/details/cognitive-services/speech-services/>
7275
// Speech to text hours are measured as the hours of audio _sent to the service_, billed in second increments.

src/lib.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
mod audio_tracer;
22
mod context_switch;
33
mod protocol;
4-
mod speech_gate;
54

65
#[cfg(test)]
76
mod tests;

0 commit comments

Comments
 (0)