add simulate_rir_ism method

nateanl · nateanl · commit 50de1e9a6945 · 2022-08-29T12:30:49.000-04:00
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -58,6 +58,7 @@ endif()
 # Options
 option(BUILD_SOX "Build libsox statically" ON)
 option(BUILD_KALDI "Build kaldi statically" ON)
+option(BUILD_RIR "Enable RIR simulation" ON)  # gates build_rir.cpp in torchaudio/csrc/CMakeLists.txt
 option(BUILD_RNNT "Enable RNN transducer" ON)
 option(BUILD_CTC_DECODER "Build Flashlight CTC decoder" ON)
 option(BUILD_TORCHAUDIO_PYTHON_EXTENSION "Build Python extension" OFF)
diff --git a/test/torchaudio_unittest/prototype/functional/autograd_test_impl.py b/test/torchaudio_unittest/prototype/functional/autograd_test_impl.py
@@ -31,3 +31,12 @@ def test_add_noise(self):
 
         self.assertTrue(gradcheck(F.add_noise, (waveform, noise, lengths, snr)))
         self.assertTrue(gradgradcheck(F.add_noise, (waveform, noise, lengths, snr)))
+
+    def test_simulate_rir_ism(self):
+        """Run gradcheck and gradgradcheck on ``F.simulate_rir_ism`` for a 3D room with six identical microphones."""
+        opts = {"dtype": self.dtype, "device": self.device, "requires_grad": True}
+        room, source = torch.tensor([9.0, 7.0, 3.0], **opts), torch.tensor([8.8, 3.5, 1.5], **opts)
+        mic_array = torch.tensor([0.1, 3.5, 1.5], **opts).reshape(1, -1).repeat(6, 1)
+        inputs = (room, source, mic_array, 3, torch.rand(7, 6, **opts))  # max_order=3, random (7, 6) absorption
+        self.assertTrue(gradcheck(F.simulate_rir_ism, inputs, eps=1e-2, atol=1e-2))
+        self.assertTrue(gradgradcheck(F.simulate_rir_ism, inputs, eps=1e-2, atol=1e-2))
diff --git a/test/torchaudio_unittest/prototype/functional/functional_test_impl.py b/test/torchaudio_unittest/prototype/functional/functional_test_impl.py
@@ -1,4 +1,5 @@
 import numpy as np
+import pyroomacoustics as pra
 import torch
 import torchaudio.prototype.functional as F
 from parameterized import parameterized
@@ -107,3 +108,37 @@ def test_add_noise_length_check(self):
 
         with self.assertRaisesRegex(ValueError, "Length dimensions"):
             F.add_noise(waveform, noise, lengths, snr)
+
+    def test_simulate_rir_ism(self):
+        """Check ``F.simulate_rir_ism`` against the RIR pyroomacoustics computes for the same shoebox room."""
+        room_dim = torch.tensor([9.0, 9.0, 9.0], dtype=self.dtype, device=self.device, requires_grad=True)
+        mic_array = torch.tensor([1, 1, 1], dtype=self.dtype, device=self.device, requires_grad=True).reshape(1, -1).repeat(6, 1)
+        source = torch.tensor([7, 7, 7], dtype=self.dtype, device=self.device, requires_grad=True)
+        max_order = 3
+        e_absorption = torch.rand(7, 6, dtype=self.dtype, device=self.device, requires_grad=True)
+        walls = ["west", "east", "south", "north", "floor", "ceiling"]
+        room2 = pra.ShoeBox(
+            room_dim.detach().cpu().numpy(),  # .cpu() so the NumPy conversion also works for non-CPU devices
+            fs=16000,
+            materials={
+                wall: pra.Material(
+                    {
+                        "coeffs": e_absorption[:, i].reshape(-1,).detach().cpu().numpy(),
+                        "center_freqs": [125.0, 250.0, 500.0, 1000.0, 2000.0, 4000.0, 8000.0],
+                    }
+                ) for i, wall in enumerate(walls)
+            },
+            max_order=max_order,
+            ray_tracing=False,
+            air_absorption=False,
+        )
+        mic_locs = np.asarray([[1.0, 1.0, 1.0] for _ in range(6)]).swapaxes(0, 1)  # (3, num_mics)
+        room2.add_microphone_array(mic_locs)
+        room2.add_source([7.0, 7.0, 7.0])
+        room2.compute_rir()
+        # ``room2.rir[i]`` holds the responses recorded by microphone ``i``, so index per microphone rather than reusing 0.
+        expected = torch.concat([torch.tensor(room2.rir[i]) for i in range(6)]).to(self.dtype)
+        actual = F.simulate_rir_ism(room_dim, source, mic_array, max_order, e_absorption)
+        self.assertEqual(actual, expected)
+
+
diff --git a/torchaudio/csrc/CMakeLists.txt b/torchaudio/csrc/CMakeLists.txt
@@ -53,6 +53,14 @@ if(BUILD_RNNT)
   endif()
 endif()
 
+if(BUILD_RIR)  # append build_rir.cpp to LIBTORCHAUDIO_SOURCES only when the BUILD_RIR option is ON
+  list(
+    APPEND
+    LIBTORCHAUDIO_SOURCES
+    build_rir.cpp
+    )
+endif()
+
 if(USE_CUDA)
   list(
     APPEND
diff --git a/torchaudio/csrc/build_rir.cpp b/torchaudio/csrc/build_rir.cpp
@@ -0,0 +1,120 @@
+#include <math.h>
+#include <torch/script.h>
+#include <torch/torch.h>
+using namespace torch::indexing;
+
+namespace torchaudio {
+namespace rir {
+
+/// Adds every image-source response `irs[band, image, mic, :]`, shifted by the int32
+/// `delay[image, mic]` samples, into `rirs[band, mic, :]`; all buffers are read as dense.
+template <typename scalar_t>
+void build_rir_impl(
+    const torch::Tensor& irs,
+    const torch::Tensor& delay,
+    torch::Tensor& rirs,
+    const int64_t rir_length,
+    const int64_t num_band,
+    const int64_t num_image,
+    const int64_t num_mic,
+    const int64_t ir_length) {
+  const scalar_t* input_data = irs.data_ptr<scalar_t>();
+  const int* delay_data = delay.data_ptr<int>();
+  scalar_t* output_data = rirs.data_ptr<scalar_t>();
+  // One task per output row (band, mic): the original split work per image, letting threads `+=` into one row.
+  at::parallel_for(0, num_band * num_mic, 0, [&](int64_t start, int64_t end) {
+    for (int64_t row = start; row < end; ++row) {
+      const int64_t band = row / num_mic;
+      const int64_t mic = row % num_mic;
+      for (int64_t image = 0; image < num_image; ++image) {
+        const int64_t pair = image * num_mic + mic; // index into `delay`
+        const scalar_t* src = input_data + (band * num_image * num_mic + pair) * ir_length;
+        scalar_t* dst = output_data + row * rir_length + delay_data[pair];
+        for (int64_t j = 0; j < ir_length; ++j) dst[j] += src[j];
+      }
+    }
+  });
+}
+
+/// Overlap-adds `irs` (band, image, mic, tap), shifted by int32 `delay` (image, mic), into a new (band, mic, rir_length) tensor.
+torch::Tensor build_rir(
+    const torch::Tensor irs,
+    const torch::Tensor delay,
+    const int64_t rir_length) {
+  TORCH_CHECK(irs.dim() == 4 && irs.device().is_cpu() && delay.device().is_cpu() && delay.scalar_type() == torch::kInt, "irs must be a 4-D CPU tensor and delay an int32 CPU tensor");
+  TORCH_CHECK(delay.numel() == irs.size(1) * irs.size(2), "delay must hold one entry per (image, mic) pair");
+  TORCH_CHECK(delay.numel() == 0 || (delay.min().item<int64_t>() >= 0 && delay.max().item<int64_t>() + irs.size(3) <= rir_length), "every delayed response must fit inside rir_length");
+  const torch::Tensor irs_c = irs.contiguous(), delay_c = delay.contiguous(); // the kernel indexes raw dense buffers
+  const int64_t num_band = irs.size(0), num_image = irs.size(1), num_mic = irs.size(2), ir_length = irs.size(3);
+  torch::Tensor rirs = torch::zeros({num_band, num_mic, rir_length}, irs.dtype());
+  rirs.requires_grad_(true); // NOTE(review): fresh leaf tensor; the raw-pointer kernel records no autograd graph back to `irs`.
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(irs.scalar_type(), "build_rir", [&] {
+    build_rir_impl<scalar_t>(irs_c, delay_c, rirs, rir_length, num_band, num_image, num_mic, ir_length);
+  });
+  return rirs;
+}
+
+/// Designs one band-pass FIR filter per entry of `centers`.
+///
+/// Band `i` gets a raised-cosine magnitude response: it follows
+/// `0.5 * (1 + cos(2*pi*f / centers[i]))` from the previous centre (half of the
+/// first centre for band 0) up to `centers[i]`, and `0.5 * (1 - cos(2*pi*f / centers[i+1]))`
+/// from `centers[i]` up to the next centre. The last band is 1 at and above its centre.
+/// The responses are converted with irfft + fftshift and the first tap is dropped.
+///
+/// @param centers 1-D, non-empty tensor of band centre frequencies; converted to float32 on CPU.
+/// @param sample_rate Sampling rate, in the same unit as `centers`.
+/// @param n_fft FFT size used to sample the frequency responses.
+/// @return Tensor of shape (centers.size(0), n_fft - 1).
+torch::Tensor make_filter(
+    torch::Tensor centers,
+    double sample_rate,
+    int64_t n_fft) {
+  TORCH_CHECK(centers.dim() == 1 && centers.numel() > 0, "centers must be a non-empty 1-D tensor");
+  TORCH_CHECK(n_fft > 0, "n_fft must be positive");
+  // The loops below read raw float pointers, so normalise to dense float32 CPU storage first.
+  centers = centers.to(torch::kCPU, torch::kFloat).contiguous();
+  constexpr double kPi = 3.14159265358979323846; // avoids M_PI, which <math.h> need not define
+  const int64_t n = centers.size(0);
+  const int64_t n_freq = n_fft / 2 + 1;
+  torch::Tensor freq_resp = torch::zeros({n_freq, n}, torch::kFloat);
+  // Frequency of every rFFT bin.
+  const torch::Tensor freq = torch::arange(n_freq, torch::kFloat) / n_fft * sample_rate;
+  const float* centers_data = centers.data_ptr<float>();
+  const float* freq_data = freq.data_ptr<float>();
+  float* resp_data = freq_resp.data_ptr<float>();
+
+  // Each task owns whole rows (frequency bins) of `freq_resp`, so writes never overlap.
+  at::parallel_for(0, n_freq, 0, [&](int64_t start, int64_t end) {
+    for (int64_t j = start; j < end; ++j) {
+      const float f = freq_data[j];
+      for (int64_t i = 0; i < n; ++i) {
+        const float center = centers_data[i];
+        // Neighbours are looked up on demand; the original precomputed them and read
+        // centers[1] even when only one centre was given.
+        const float lower = (i == 0) ? center / 2 : centers_data[i - 1];
+        if (f >= lower && f < center) {
+          resp_data[j * n + i] = 0.5 * (1 + cos(2 * kPi * f / center));
+        } else if (f >= center && i == n - 1) {
+          resp_data[j * n + i] = 1.0;
+        } else if (f >= center && f < centers_data[i + 1]) {
+          const float upper = centers_data[i + 1];
+          resp_data[j * n + i] = 0.5 * (1 - cos(2 * kPi * f / upper));
+        }
+      }
+    }
+  });
+  torch::Tensor filters =
+      torch::fft::fftshift(torch::fft::irfft(freq_resp, n_fft, 0), 0);
+  return filters.index({Slice(1)}).transpose(0, 1);
+}
+
+TORCH_LIBRARY(rir, m) { // Operators defined here are invoked from Python as torch.ops.rir.<name>.
+  m.def(
+      "rir::build_rir(Tensor irs, Tensor delay_i, int rir_length) -> Tensor",
+      &torchaudio::rir::build_rir);
+  m.def("rir::make_filter", &torchaudio::rir::make_filter); // No schema string is spelled out for this one.
+}
+
+} // namespace rir
+} // namespace torchaudio
diff --git a/torchaudio/prototype/functional/__init__.py b/torchaudio/prototype/functional/__init__.py
@@ -1,3 +1,4 @@
 from .functional import add_noise, convolve, fftconvolve
+from .rir import simulate_rir_ism
 
-__all__ = ["add_noise", "convolve", "fftconvolve"]
+__all__ = ["add_noise", "convolve", "fftconvolve", "simulate_rir_ism"]
diff --git a/torchaudio/prototype/functional/rir.py b/torchaudio/prototype/functional/rir.py
@@ -0,0 +1,117 @@
+import math
+from random import sample
+from typing import Union
+
+import torch
+import torchaudio
+from torch import Tensor
+
+_CENTER_FREQUENCY = torch.tensor([125, 250, 500, 1000, 2000, 4000, 8000], dtype=torch.float)  # band centres passed to torch.ops.rir.make_filter; presumably Hz, matching sample_rate - confirm
+
+
+def _compute_image_sources(room, source, max_order, e_abs, e_scatter=None):
+    """Return image-source locations `(num_image, D)` and attenuations `(num_band, num_image)` of a `D`-dim shoebox.
+
+    ``e_abs`` and the optional ``e_scatter`` are `(num_band, 2 * D)`; each axis lists its lower wall before its upper one.
+    """
+    if e_scatter is None:
+        e_scatter = torch.zeros_like(e_abs)
+    # reflection coefficients
+    tr = torch.sqrt(1 - e_abs) * torch.sqrt(1 - e_scatter)
+
+    # one reflection index per room axis (instead of a hard-coded three), keeping only total order <= max_order
+    ind = torch.arange(-max_order, max_order + 1, device=source.device)
+    XYZ = torch.stack([c.reshape((-1,)) for c in torch.meshgrid(*([ind] * room.shape[0]), indexing="ij")], dim=-1)
+    XYZ = XYZ[XYZ.abs().sum(dim=-1) <= max_order]
+
+    # location of image sources: odd indices mirror the source, even indices only translate it
+    img_loc = torch.where(XYZ % 2 == 1, room[None, :] * (XYZ + 1) - source[None, :], room[None, :] * XYZ + source[None, :])
+
+    # attenuation: index k on an axis uses the lower wall |floor(k / 2)| times and the upper wall |floor((k + 1) / 2)| times
+    exp_lo = torch.abs(torch.floor(XYZ / 2))
+    exp_hi = torch.abs(torch.floor((XYZ + 1) / 2))
+    att = torch.prod((tr[:, None, ::2] ** exp_lo) * (tr[:, None, 1::2] ** exp_hi), dim=-1)  # num_band, num_image_source
+    return img_loc, att
+
+
+def _hann(x, T):
+    """Evaluate a Hann window of total width ``T`` centred at zero on the points ``x``.
+
+    Points with ``|x| > T / 2`` map to zero.
+    """
+    inside = torch.abs(x) <= T / 2
+    window = 0.5 * (1 + torch.cos(2 * math.pi * x / T))
+    return torch.where(inside, window, x.new_zeros(1))
+
+
+def _frac_delay(tau, filter_len=41):
+    """Return Hann-windowed sinc filters of odd length ``filter_len``, one per fractional delay in ``tau`` (last dim)."""
+    if filter_len % 2 != 1:
+        raise ValueError("The filter length must be odd")
+    half = filter_len // 2
+    taps = torch.arange(-half, half + 1, device=tau.device)
+    # Offset of every tap from the requested delay; broadcasting appends a `filter_len` axis to `tau`.
+    offset = taps - tau.unsqueeze(-1)
+    return torch.special.sinc(offset) * _hann(offset, 2 * half)
+
+
+def simulate_rir_ism(
+    room: Tensor,
+    source: Tensor,
+    mic_array: Tensor,
+    max_order: int,
+    e_absorption: Union[float, Tensor],
+    sound_speed: float = 343.0,
+    sample_rate: float = 16000.0,
+) -> Tensor:
+    """Compute Room Impulse Response (RIR) based on image source method.
+
+    Args:
+        room (torch.Tensor): The 1D Tensor to determine the room size. The shape is
+            `(D,)`, where D is 2 if room is a 2D room, or 3 if room is a 3D room.
+        source (torch.Tensor): The coordinate of the sound source. Tensor with dimensions
+            `(D)`.
+        mic_array (torch.Tensor): The coordinate of microphone array. Tensor with dimensions
+            `(channel, D)`.
+        max_order (int): The maximum order of reflections of image sources.
+        e_absorption (float or torch.Tensor): The absorption coefficients of wall materials.
+            If it is a ``float``, the absorption coefficient is identical for all walls and
+            all frequencies.
+            If ``e_absorption`` is a 1D Tensor, the shape must be `(4)` if the room is a 2D room,
+            or `(6)` if the room is a 3D room, where 4 represents 4 walls, 6 represents 4 walls,
+            ceiling, and floor. Walls are ordered per axis, lower wall first (e.g. west, east, south, north, floor, ceiling).
+            If ``e_absorption`` is a 2D Tensor, the shape must be `(7, 4)` if the room is a 2D room,
+            or `(7, 6)` if the room is a 3D room, where 7 represents the number of frequency bands.
+        sound_speed (float): The speed of sound. (Default: ``343.0``)
+        sample_rate (float): The sample rate of the generated room impulse response signal.
+            (Default: ``16000.0``)
+
+    Returns:
+        (torch.Tensor): The simulated room impulse response waveform. Tensor with dimensions
+            `(channel, rir_length)`.
+    """
+    if isinstance(e_absorption, (int, float)):
+        # one coefficient shared by all 2 * D walls and by every frequency band
+        e_absorption = torch.full((1, 2 * room.shape[0]), float(e_absorption), dtype=room.dtype, device=room.device)
+    elif e_absorption.ndim == 1:
+        e_absorption = e_absorption[None, :]  # per-wall coefficients: add the (single) band axis
+    img_location, att = _compute_image_sources(room, source, max_order, e_absorption)
+    vec = img_location[:, None, :] - mic_array[None, :, :]
+    dist = torch.linalg.norm(vec, dim=-1)  # (n_img_src, n_mics)
+    img_src_att = att[..., None] / dist[None, ...]  # (n_band, n_img_src, n_mics)
+
+    # separate delays in integer / frac part
+    delay = dist / sound_speed * sample_rate  # distance to delay in samples
+    delay_i = torch.round(delay)  # integer part
+    delay_f = delay - delay_i  # frac part, in [-0.5, 0.5]
+
+    # compute the short IRs corresponding to each image source
+    irs = img_src_att[..., None] * _frac_delay(delay_f, filter_len=81)[None, ...]
+
+    rir_length = int(delay_i.max() + irs.shape[-1])
+    rir = torch.ops.rir.build_rir(irs, delay_i.type(torch.int32), rir_length)
+    if rir.shape[0] > 1:
+        filters = torch.ops.rir.make_filter(_CENTER_FREQUENCY.to(room.device), sample_rate, 512)
+        rir = torchaudio.prototype.functional.fftconvolve(rir, filters.unsqueeze(1).repeat(1, rir.shape[1], 1))
+        rir = rir[..., (filters.shape[-1] - 1) // 2 : -(filters.shape[-1] - 1) // 2]
+    return rir.sum(0)