fix(audio): off vocal extraction

NextFire · NextFire · commit 20e09b9b5be2 · 2025-03-16T11:18:59.000+01:00
sometimes the subtraction failed because the two tensors didn't have
the same length (???)
diff --git a/yohane-cli/yohane_cli/audio.py b/yohane-cli/yohane_cli/audio.py
@@ -66,8 +66,9 @@ def save_separated_tracks(yohane: Yohane, song_path: Path):
         filename = song_path.with_suffix(".vocals.wav")
         logger.info(f"Saving vocals track to {filename}")
         torchaudio.save(filename.as_posix(), waveform.to("cpu"), sample_rate)
-    if yohane.off_vocal is not None:
-        waveform, sample_rate = yohane.off_vocal
+    off_vocal = yohane.extract_off_vocal()
+    if off_vocal is not None:
+        waveform, sample_rate = off_vocal
         filename = song_path.with_suffix(".off_vocal.wav")
         logger.info(f"Saving off vocal track to {filename}")
         torchaudio.save(filename.as_posix(), waveform.to("cpu"), sample_rate)
diff --git a/yohane/pipeline.py b/yohane/pipeline.py
@@ -24,25 +24,29 @@ def __init__(self, separator: Separator | None):
     def forced_aligned_audio(self):
         return self.vocals if self.vocals is not None else self.song
 
-    @property
-    def off_vocal(self):
-        if self.song is not None and self.vocals is not None:
-            song_waveform, song_sample_rate = self.song
-            vocals_waveform, vocals_sample_rate = self.vocals
-            vocals_waveform_resampled = torchaudio.functional.resample(
-                vocals_waveform, vocals_sample_rate, song_sample_rate
-            )
-            return song_waveform - vocals_waveform_resampled, song_sample_rate
-
     def load_song(self, song_file: Path):
         logger.info("Loading song")
         self.song = torchaudio.load(song_file.as_posix())
 
     def extract_vocals(self):
-        if self.separator is not None:
-            logger.info(f"Extracting vocals with {self.separator=}")
-            assert self.song
-            self.vocals = self.separator(*self.song)
+        if self.separator is None:
+            return
+        logger.info(f"Extracting vocals with {self.separator=}")
+        assert self.song
+        self.vocals = self.separator(*self.song)
+
+    def extract_off_vocal(self):
+        if self.song is None or self.vocals is None:
+            return
+        song_waveform, song_sample_rate = self.song
+        vocals_waveform, vocals_sample_rate = self.vocals
+        vocals_waveform_resampled = torchaudio.functional.resample(
+            vocals_waveform, vocals_sample_rate, song_sample_rate
+        )
+        min_length = min(song_waveform.size(1), vocals_waveform_resampled.size(1))
+        song_waveform = song_waveform[:, :min_length]
+        vocals_waveform_resampled = vocals_waveform_resampled[:, :min_length]
+        return song_waveform - vocals_waveform_resampled, song_sample_rate
 
     def load_lyrics(self, lyrics_str: str):
         logger.info("Loading lyrics")