@@ -174,6 +174,148 @@ __global__ void ChannelizePoly1D(OutType output, InType input, FilterType filter
  }
}

// This kernel works in cases where the full filter (with potentially some zero padding) and
// the inputs required to compute elems_per_channel_per_cta outputs all fit into shared memory.
template <typename OutType, typename InType, typename FilterType>
__global__ void ChannelizePoly1D_Smem(OutType output, InType input, FilterType filter, index_t elems_per_channel_per_cta)
{
  using output_t = typename OutType::scalar_type;
  using input_t = typename InType::scalar_type;
  using filter_t = typename FilterType::scalar_type;

  extern __shared__ uint8_t __attribute__((aligned(16))) smem_dyn_align16[];

  constexpr int InRank = InType::Rank();
  constexpr int OutRank = OutType::Rank();
  constexpr int ChannelRank = OutRank - 1;
  constexpr int OutElemRank = OutRank - 2;

  const index_t input_len = input.Size(InRank-1);
  const index_t output_len_per_channel = output.Size(OutElemRank);
  // If the filter fits into shared memory, then a 32-bit index is sufficient. The one
  // edge case is num_channels > 2^32-1 paired with a small filter that is implicitly
  // zero-padded. We assume that the kernel selection logic considers the size of the
  // zero-padded filter, since that is what we actually store in shared memory.
  const uint32_t num_channels = static_cast<uint32_t>(output.Size(ChannelRank));
  const uint32_t filter_full_len = static_cast<uint32_t>(filter.Size(0));
  const uint32_t filter_phase_len = static_cast<uint32_t>((filter_full_len + num_channels - 1) / num_channels);

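  // Carve the dynamic shared memory into two regions: the zero-padded filter first,
  // then the input samples. Rounding the offset up to the next multiple of
  // sizeof(input_t) keeps smem_input suitably aligned for input_t accesses, since
  // smem_dyn_align16 itself is 16-byte aligned.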
  filter_t *smem_h = reinterpret_cast<filter_t *>(smem_dyn_align16);
  size_t smem_input_offset = sizeof(filter_t) * filter_phase_len * num_channels;
  if (smem_input_offset % sizeof(input_t)) {
    smem_input_offset += sizeof(input_t) - smem_input_offset % sizeof(input_t);
  }
  input_t *smem_input = reinterpret_cast<input_t *>(smem_dyn_align16 + smem_input_offset);

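  // Thread mapping (assumed from this kernel's usage, not checked here): blockDim.x
  // equals num_channels, so threadIdx.x selects the output channel while threadIdx.y
  // strides across this CTA's output samples.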
  const uint32_t tid = threadIdx.y * blockDim.x + threadIdx.x;
  const uint32_t nthreads = blockDim.x * blockDim.y;
  const uint32_t chan = threadIdx.x;
  const uint32_t ty = threadIdx.y;
  const uint32_t by = blockDim.y;

  for (uint32_t t = tid; t < filter_full_len; t += nthreads) {
    smem_h[t] = filter.operator()(t);
  }

  for (uint32_t t = filter_full_len + tid; t < filter_phase_len * num_channels; t += nthreads) {
    smem_h[t] = static_cast<filter_t>(0);
  }

  // The input stored in shared memory is logically [smem_input_height, num_channels] where
  // smem_input_height is the number of samples at the output sample rate stored in smem.
  const uint32_t smem_input_height = filter_phase_len + by - 1;
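  // smem_input is treated as a ring buffer over these rows; cached_input_ind_tail
  // below tracks the row holding the oldest resident sample.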

  const index_t start_elem = blockIdx.x * elems_per_channel_per_cta;
  const index_t last_elem = std::min(output_len_per_channel - 1, (blockIdx.x + 1) * elems_per_channel_per_cta - 1);
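  // Map blockIdx.z onto the batch (non-time, non-channel) dimensions of the input
  // and output tensors; the innermost one (input) or two (output) indices are filled
  // in per element below.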
  auto indims = BlockToIdx(input, blockIdx.z, 1);
  auto outdims = BlockToIdx(output, blockIdx.z, 2);
  outdims[ChannelRank] = chan;

  for (uint32_t t = ty; t < filter_phase_len - 1; t += by) {
    const index_t out_sample_ind = start_elem - (filter_phase_len - 1) + t;
    const uint32_t smem_ind = t * num_channels + chan;
    const index_t input_ind = out_sample_ind * num_channels + chan;
    if (input_ind >= 0 && input_ind < input_len) {
      indims[InRank-1] = input_ind;
      detail::mapply([smem_input, smem_ind, &input](auto &&...args) {
        smem_input[smem_ind] = input.operator()(args...);
      }, indims);
    } else {
      smem_input[smem_ind] = static_cast<input_t>(0);
    }
  }

  index_t next_start_elem = start_elem;
  const index_t num_elem_iters = (last_elem - start_elem + 1 + by - 1) / by;

  uint32_t cached_input_ind_tail = filter_phase_len - 1 + ty;
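  // h_start points at element (filter_phase_len-1)*num_channels + chan, i.e. the last
  // coefficient in this channel's column of the zero-padded filter. The inner loops
  // below step it backward by num_channels to walk that column in reverse.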
  const filter_t *h_start = smem_h + num_channels * filter_phase_len - (num_channels - chan);
  for (index_t iter = 0; iter < num_elem_iters; iter++) {

    __syncthreads();

    // Load the next batch of up to blockDim.y output samples' worth of input for each channel
    const index_t next_last_elem = std::min(next_start_elem + by - 1, last_elem);
    const uint32_t out_samples_this_iter = static_cast<uint32_t>(next_last_elem - next_start_elem + 1);
    if (ty < out_samples_this_iter) {
      indims[InRank-1] = (next_start_elem + ty) * num_channels + chan;
      const uint32_t smem_ind = cached_input_ind_tail * num_channels + chan;
      if (indims[InRank-1] < input_len) {
        detail::mapply([smem_input, smem_ind, &input](auto &&...args) {
          smem_input[smem_ind] = input.operator()(args...);
        }, indims);
      } else {
        smem_input[smem_ind] = static_cast<input_t>(0);
      }
    }

    cached_input_ind_tail += by;
    // The below effectively mods cached_input_ind_tail by smem_input_height. Since
    // smem_input_height is >= by, adding by means that we will need to subtract
    // smem_input_height at most once for cached_input_ind_tail to be in the range
    // [0, smem_input_height-1]. The conditional is cheaper than the mod, unless
    // smem_input_height is known at compile time.
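    // For example, with by = 4 and smem_input_height = 6, a tail of 5 advances to 9,
    // which a single subtraction wraps back to 3; a second subtraction is never
    // needed because 9 < 2 * smem_input_height.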
    if (cached_input_ind_tail >= smem_input_height) {
      cached_input_ind_tail -= smem_input_height;
    }

    __syncthreads();

    outdims[OutElemRank] = next_start_elem + ty;
    if (outdims[OutElemRank] <= last_elem) {
      const filter_t *h = h_start;
      output_t accum { 0 };
      const int first_end = std::min(cached_input_ind_tail + filter_phase_len - 1, smem_input_height - 1);
      // The footprint of samples involved in the convolution may wrap from the end
      // to the beginning of smem_input. The prologue below handles the samples from
      // the current tail to the end of smem_input and the epilogue starts back at the
      // beginning of smem_input.
      const int prologue_count = (first_end - cached_input_ind_tail + 1);
      const int epilogue_count = (prologue_count < filter_phase_len) ? filter_phase_len - prologue_count : 0;
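      // For example, with filter_phase_len = 4, smem_input_height = 7, and a tail of 5:
      // first_end = min(5 + 3, 6) = 6, so the prologue covers rows 5..6 (2 taps) and
      // the epilogue covers rows 0..1 (the remaining 2 taps).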
      const input_t *sample = smem_input + cached_input_ind_tail * num_channels + (num_channels - 1 - chan);
      // Apply the filter h in reverse order below to flip the filter for convolution
      for (int k = 0; k < prologue_count; k++) {
        accum += (*h) * (*sample);
        sample += num_channels;
        h -= num_channels;
      }
      sample = smem_input + (num_channels - 1 - chan);
      for (int k = 0; k < epilogue_count; k++) {
        accum += (*h) * (*sample);
        sample += num_channels;
        h -= num_channels;
      }

      detail::mapply([accum, &output](auto &&...args) {
        output.operator()(args...) = accum;
      }, outdims);
    }

    next_start_elem += out_samples_this_iter;
  }
}
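// The sketch below is illustrative and not part of this change: one way a host-side
// selection heuristic could size the dynamic shared memory for ChannelizePoly1D_Smem,
// matching the layout above (zero-padded filter, alignment slack, then the input ring
// buffer of filter_phase_len + blockDim.y - 1 rows). The function name and the
// block_dim_y parameter are hypothetical.
template <typename filter_t, typename input_t>
inline size_t ChannelizePoly1DSmemBytes(index_t filter_full_len, index_t num_channels, index_t block_dim_y)
{
  const index_t filter_phase_len = (filter_full_len + num_channels - 1) / num_channels;
  // Zero-padded filter region, as staged by the kernel
  size_t bytes = sizeof(filter_t) * filter_phase_len * num_channels;
  // Worst-case padding so that smem_input starts on an input_t boundary
  bytes += sizeof(input_t) - 1;
  // Input ring buffer: (filter_phase_len + block_dim_y - 1) rows of num_channels samples
  bytes += sizeof(input_t) * (filter_phase_len + block_dim_y - 1) * num_channels;
  return bytes;
}
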
template <int THREADS, int NUM_CHAN, typename OutType, typename InType, typename FilterType>
__launch_bounds__(THREADS)
__global__ void ChannelizePoly1D_FusedChan(OutType output, InType input, FilterType filter)