Integrate speech gate processor into Azure speech detection

pragmatrix · pragmatrix · commit 3ceafc30ed71 · 2025-08-01T16:44:39.000+02:00
diff --git a/core/src/lib.rs b/core/src/lib.rs
@@ -6,6 +6,7 @@ mod duration;
 mod protocol;
 mod registry;
 pub mod service;
+pub mod speech_gate;
 
 use std::time;
 
diff --git a/core/src/speech_gate.rs b/core/src/speech_gate.rs
@@ -1,5 +1,4 @@
 use crate::AudioFrame;
-use fundsp::{hacker::*, numeric_array::NumericArray};
 
 pub fn make_speech_gate_processor(
     threshold: f32,
@@ -11,102 +10,100 @@ pub fn make_speech_gate_processor(
     // 0.05 / 0.025
     // 0.5 / 0.030 (not very good)
     // 0.1 / 0.075 (but not too good.)
-    // soft_rms: (knee_width / threshold)
-    // 0.01 / 0.0025 (echo example barely audible)
     make_speech_gate_processor_soft_rms(threshold, attack_ms, release_ms, 0.01)
 }
 
-/// Returns a processing function that can be called for each AudioFrame (mono, 16kHz, i16)
-#[allow(unused)]
-pub fn make_speech_gate_processor_(
-    threshold: f32,
-    attack_ms: f32,
-    release_ms: f32,
-) -> Box<dyn FnMut(&AudioFrame) -> AudioFrame> {
-    let mut node = simple_speech_gate(threshold, 6.0, attack_ms, release_ms);
-    let mut sample_rate = None;
+// Disabled fundsp-based variant: returned a processing function that can be called for each AudioFrame (mono, 16 kHz, i16).
+// #[allow(unused)]
+// pub fn make_speech_gate_processor_(
+//     threshold: f32,
+//     attack_ms: f32,
+//     release_ms: f32,
+// ) -> Box<dyn FnMut(&AudioFrame) -> AudioFrame> {
+//     let mut node = simple_speech_gate(threshold, 6.0, attack_ms, release_ms);
+//     let mut sample_rate = None;
 
-    Box::new(move |frame: &AudioFrame| {
-        let frame_sample_rate = frame.format.sample_rate as f64;
-        match sample_rate {
-            None => {
-                node.set_sample_rate(frame_sample_rate);
-                sample_rate = Some(frame_sample_rate);
-            }
-            Some(rate) if rate != frame_sample_rate => {
-                panic!("Changing frame sample rate is not supported in the speech gate processor");
-            }
-            Some(_) => {
-                // same rate, all good
-            }
-        }
+//     Box::new(move |frame: &AudioFrame| {
+//         let frame_sample_rate = frame.format.sample_rate as f64;
+//         match sample_rate {
+//             None => {
+//                 node.set_sample_rate(frame_sample_rate);
+//                 sample_rate = Some(frame_sample_rate);
+//             }
+//             Some(rate) if rate != frame_sample_rate => {
+//                 panic!("Changing frame sample rate is not supported in the speech gate processor");
+//             }
+//             Some(_) => {
+//                 // same rate, all good
+//             }
+//         }
 
-        let samples_f32: Vec<f32> = frame.samples.iter().map(|&s| s as f32 / 32768.0).collect();
-        let processed: Vec<f32> = samples_f32
-            .iter()
-            .map(|&sample| node.tick(&NumericArray::from([sample]))[0])
-            .collect();
-        let processed_i16: Vec<i16> = processed
-            .iter()
-            .map(|&s| (s.clamp(-1.0, 1.0) * 32767.0) as i16)
-            .collect();
-        AudioFrame {
-            format: frame.format,
-            samples: processed_i16,
-        }
-    })
-}
+//         let samples_f32: Vec<f32> = frame.samples.iter().map(|&s| s as f32 / 32768.0).collect();
+//         let processed: Vec<f32> = samples_f32
+//             .iter()
+//             .map(|&sample| node.tick(&NumericArray::from([sample]))[0])
+//             .collect();
+//         let processed_i16: Vec<i16> = processed
+//             .iter()
+//             .map(|&s| (s.clamp(-1.0, 1.0) * 32767.0) as i16)
+//             .collect();
+//         AudioFrame {
+//             format: frame.format,
+//             samples: processed_i16,
+//         }
+//     })
+// }
 
-fn simple_speech_gate(
-    threshold: f32,
-    softness: f32, // e.g., 6.0 dB for soft knee
-    attack_ms: f32,
-    release_ms: f32,
-) -> An<impl AudioNode<Inputs = U1, Outputs = U1>> {
-    let envelope_follower = envelope(|x| x * x)
-        >> lowpass_hz(10.0, 1.0) // smoother RMS, ~100 ms
-        >> map(|x| x[0].sqrt());
+// fn simple_speech_gate(
+//     threshold: f32,
+//     softness: f32, // e.g., 6.0 dB for soft knee
+//     attack_ms: f32,
+//     release_ms: f32,
+// ) -> An<impl AudioNode<Inputs = U1, Outputs = U1>> {
+//     let envelope_follower = envelope(|x| x * x)
+//         >> lowpass_hz(10.0, 1.0) // smoother RMS, ~100 ms
+//         >> map(|x| x[0].sqrt());
 
-    let soft_gate = envelope_follower
-        >> map(move |level| {
-            let db = 20.0 * level[0].log10().max(-120.0);
-            let gain_db = if db < threshold - softness {
-                -60.0 // silence
-            } else if db > threshold + softness {
-                0.0 // full gain
-            } else {
-                // Linear ramp over 2 * softness dB
-                -60.0 * (1.0 - (db - (threshold - softness)) / (2.0 * softness))
-            };
-            db_to_gain(gain_db)
-        })
-        >> afollow(attack_ms / 1000.0, release_ms / 1000.0);
+//     let soft_gate = envelope_follower
+//         >> map(move |level| {
+//             let db = 20.0 * level[0].log10().max(-120.0);
+//             let gain_db = if db < threshold - softness {
+//                 -60.0 // silence
+//             } else if db > threshold + softness {
+//                 0.0 // full gain
+//             } else {
+//                 // Linear ramp over 2 * softness dB
+//                 -60.0 * (1.0 - (db - (threshold - softness)) / (2.0 * softness))
+//             };
+//             db_to_gain(gain_db)
+//         })
+//         >> afollow(attack_ms / 1000.0, release_ms / 1000.0);
 
-    pass() * soft_gate
-}
+//     pass() * soft_gate
+// }
 
 // Convert dB to linear gain
-fn db_to_gain(db: f32) -> f32 {
-    10.0_f32.powf(db / 20.0)
-}
+// fn db_to_gain(db: f32) -> f32 {
+//     10.0_f32.powf(db / 20.0)
+// }
 
-#[allow(unused)]
-fn simple_speech_gate_v1(
-    threshold: f32,
-    attack_ms: f32,
-    release_ms: f32,
-) -> An<impl AudioNode<Inputs = U1, Outputs = U1>> {
-    // Proper RMS envelope follower with 10ms window
-    let rms = envelope(|x| x * x) >> lowpass_hz(100.0, 1.0) >> map(|x| x[0].sqrt());
+// #[allow(unused)]
+// fn simple_speech_gate_v1(
+//     threshold: f32,
+//     attack_ms: f32,
+//     release_ms: f32,
+// ) -> An<impl AudioNode<Inputs = U1, Outputs = U1>> {
+//     // Proper RMS envelope follower with 10ms window
+//     let rms = envelope(|x| x * x) >> lowpass_hz(100.0, 1.0) >> map(|x| x[0].sqrt());
 
-    // Gate control with smoothing using follow
-    let gate_control = rms
-        >> map(move |level| if level[0] > threshold { 1.0 } else { 0.0 })
-        >> afollow(attack_ms / 1000.0, release_ms / 1000.0);
+//     // Gate control with smoothing using follow
+//     let gate_control = rms
+//         >> map(move |level| if level[0] > threshold { 1.0 } else { 0.0 })
+//         >> afollow(attack_ms / 1000.0, release_ms / 1000.0);
 
-    // Apply gating
-    pass() * gate_control
-}
+//     // Apply gating
+//     pass() * gate_control
+// }
 
 /// Returns a processing function that applies an attack/release envelope-based speech gate (no fundsp), with lazy sample rate initialization and a hard threshold (no knee).
 /// Works well with 0.0025 threshold for the examples.
@@ -217,12 +214,16 @@ pub fn make_speech_gate_processor_soft(
 /// for smoother, more natural speech gating and better echo suppression.
 ///
 /// Claude 3.7
+///
+/// Tuning notes for `soft_rms`, given as knee_width / threshold:
+/// 0.01 / 0.0025 (echo example is barely audible)
+/// Possible optimization: take ownership of the AudioFrame's buffer and return it, since the input is not used afterwards.
 pub fn make_speech_gate_processor_soft_rms(
     threshold: f32, // normalized, 0.0 to 1.0
     attack_ms: f32,
     release_ms: f32,
     knee_width: f32,
-) -> Box<dyn FnMut(&AudioFrame) -> AudioFrame> {
+) -> Box<dyn FnMut(&AudioFrame) -> AudioFrame + Send + Sync> {
     let mut envelope = 0.0f32;
     let mut sample_rate: Option<f32> = None;
     let mut attack_coeff = 0.0f32;
diff --git a/services/azure/src/transcribe.rs b/services/azure/src/transcribe.rs
@@ -10,6 +10,7 @@ use crate::Host;
 use context_switch_core::{
     BillingRecord, Service,
     conversation::{BillingSchedule, Conversation, Input},
+    speech_gate::make_speech_gate_processor_soft_rms,
 };
 
 #[derive(Debug, Deserialize)]
@@ -66,7 +67,9 @@ impl Service for AzureTranscribe {
             .into_header_for_infinite_file();
             stream! {
                 yield wav_header;
-                while let Some(Input::Audio{frame}) = input.recv().await {
+                let mut speech_gate = make_speech_gate_processor_soft_rms(0.0025, 10., 300., 0.01);
+                while let Some(Input::Audio{ frame }) = input.recv().await {
+                    let frame = speech_gate(&frame);
                     yield frame.to_le_bytes();
                     // <https://azure.microsoft.com/en-us/pricing/details/cognitive-services/speech-services/>
                     // Speech to text hours are measured as the hours of audio _sent to the service_, billed in second increments.
diff --git a/src/lib.rs b/src/lib.rs
@@ -1,7 +1,6 @@
 mod audio_tracer;
 mod context_switch;
 mod protocol;
-mod speech_gate;
 
 #[cfg(test)]
 mod tests;