Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs_input/api/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ API Reference
tensorgenerators.rst
stats.rst
random.rst
resample_poly.rst
utilities.rst
type_traits.rst
einsum.rst
Expand Down
6 changes: 6 additions & 0 deletions docs_input/api/resample_poly.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
Polyphase Resampling
####################

The polyphase resampler API supports resampling an input signal by ratio ``up``/``down`` using a polyphase filter.

.. doxygenfunction:: matx::resample_poly(OutType &out, const InType &in, const FilterType &f, index_t up, index_t down, cudaStream_t stream = 0)
211 changes: 211 additions & 0 deletions include/matx/kernels/resample_poly.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,211 @@
////////////////////////////////////////////////////////////////////////////////
// BSD 3-Clause License
//
// Copyright (c) 2021, NVIDIA Corporation
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice, this
// list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// 3. Neither the name of the copyright holder nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/////////////////////////////////////////////////////////////////////////////////

#pragma once

#include <complex>
#include <cuda.h>
#include <iomanip>
#include <memory>
#include <stdint.h>
#include <stdio.h>
#include <vector>

#include "cuComplex.h"
#include "matx/core/utils.h"
#include "matx/core/type_utils.h"
#include "matx/core/tensor_utils.h"

namespace matx {

#ifdef __CUDACC__
template <int THREADS, typename OutType, typename InType, typename FilterType>
__launch_bounds__(THREADS)
__global__ void ResamplePoly1D(OutType output, InType input, FilterType filter,
index_t up, index_t down)
{
using output_t = typename OutType::scalar_type;
using input_t = typename InType::scalar_type;
using filter_t = typename FilterType::scalar_type;

extern __shared__ __align__(alignof(double4)) uint8_t smem_filter[];
filter_t *s_filter = reinterpret_cast<filter_t *>(smem_filter);

constexpr int Rank = OutType::Rank();
const index_t output_len = output.Size(Rank-1);
index_t filter_len = filter.Size(0);
const index_t input_len = input.Size(Rank-1);

// We assume odd-length filters below. In the case of an even-length filter,
// logically prepend the filter with a single zero to make its length odd.
const bool is_even = filter_len % 2 == 0;
if (filter_len % 2 == 0) {
filter_len++;
}

const int phase_ind = blockIdx.y;
const int tid = threadIdx.x;
const index_t filter_len_half = filter_len/2;

// All but the last dim are batch indices
const int batch_idx = blockIdx.x;
auto bdims = BlockToIdx(output, batch_idx, 1);

// We assume odd-length filters in the below index calculations. If the filter
// is even-length, then it has had a single zero logically prepended to it to
// make it odd-length.
// Consider resampling an input x by up=3, down=2. The upsampled and then
// downsampled sequence -- ignoring filtering for now -- will look as folows.
// | x0 | 0 | 0 | x1 | 0 | 0 | x2 | ... ]
//
// d0 d1 d2 d3 ...
// The dX are samples that will be kept after filtering and downsampling.
// The filter is centered on the output sample. We want to avoid storing the
// filter coefficents that will map to 0 in the upsampled sequence. Thus,
// we see in the above that for filter h with length K and output d0, we will
// need filter elements h[K/2], h[K/2 - 3], h[K/2 - 6], etc., depending upon
// the length of the filter. The filter is flipped for convolution, which is
// why the index is decreasing in the above. We call this set of coefficients
// phase 0 because there is an offset of 0 between the xN points in the
// upsampled sequence and the central tap of the filter.
// Similarly, output d1 will use phase 1 coefficients because x1 is offset by
// 1 from the central tap, and output d1 will use phase 2 coefficients because
// x2 is offset by 2 from the central tap. Thus, there are up total phases and
// therefore either filter_len / up or filter_len / up - 1 coefficients per phase.
//
// This kernel is launched with the phase index provided as a block index
// and it will calculate the output points corresponding to this phase. We first
// need to determine which filter samples we need for this phase and load them
// into shared memory.
// left_filter_ind is the filter index that corresponds to the input sample
// in the upsampled sequence equal to or to the left of the output sample.
// Thus, for phase 0, left_filter_ind is the central filter tap, h[k/2].
// For phase 1, with our up=3, down=2 example, left_filter_ind is
// h[K/2+2]. Once we have a single tap index and the corresponding x index,
// we will stride in both sequences (input and filter) by +/- up to cover
// the full filter phase.
const index_t left_filter_ind = filter_len_half + (phase_ind*down) % up;

// If left_filter_ind >= filter_len, then the filter is not long enough to reach
// a potentially non-zero sample value on the left. In that case, set the
// last_filter_ind to the next non-zero sample to the right of the output
// index.
const index_t last_filter_ind = (left_filter_ind < filter_len) ?
left_filter_ind + up * ((filter_len - 1 - left_filter_ind) / up) :
left_filter_ind - up;

// If last_filter_ind is now < 0, that means that the filter does not have
// any corresponding non-zero samples (i.e. samples from the input signal).
// Thus, all output values for this phase are zero because the filter only
// overlaps with zero-filled samples from the upsampling operation.
if (last_filter_ind < 0) {
for (index_t out_ind = phase_ind + tid * up; out_ind < output_len; out_ind += THREADS * up) {
bdims[Rank - 1] = out_ind;
output.operator()(bdims) = 0;
}
return;
}

const index_t first_filter_ind = left_filter_ind - up * (left_filter_ind / up);
const index_t this_phase_len = (last_filter_ind - first_filter_ind)/up + 1;

// Scale the filter coefficients by up to match scipy's convention
const filter_t scale = static_cast<filter_t>(up);

// Flip the filter when writing to smem. The filtering is a convolution.
if (is_even) {
for (index_t t = tid; t < this_phase_len; t += THREADS) {
const index_t ind = t * up + first_filter_ind;
const index_t smem_ind = this_phase_len - 1 - t;
s_filter[smem_ind] = (ind > 0) ? scale * filter.operator()(ind-1) : static_cast<filter_t>(0);
}
} else {
for (index_t t = tid; t < this_phase_len; t += THREADS) {
const index_t ind = t * up + first_filter_ind;
const index_t smem_ind = this_phase_len - 1 - t;
s_filter[smem_ind] = scale * filter.operator()(ind);
}
}


__syncthreads();

// left_h_ind is the index in s_filter that contains the filter tap that will be applied to the
// last input signal value not to the right of the output index in the virtual upsampled array.
// If the filter has odd length and a given output value aligns with an input value, then
// left_h_ind would reference the central tap. If the output value corresponds to a zero-padded
// value (i.e. a 0 inserted during upsampling), then left_h_ind is the filter tap applied
// to the nearest input value to the left of this output value.
const index_t left_h_ind = (last_filter_ind - left_filter_ind)/up;

const index_t max_h_epilogue = this_phase_len - left_h_ind - 1;
const index_t max_input_ind = static_cast<int>(input_len) - 1;

for (index_t out_ind = phase_ind + tid * up; out_ind < output_len; out_ind += THREADS * up) {
// out_ind is the index in the output array and up_ind is the corresponding
// index in the upsampled array
const index_t up_ind = out_ind * down;

// input_ind is the largest index in the input array that is not greater than
// (to the right of, in the previous figure earlier) up_ind.
const index_t input_ind = up_ind / up;

// We want x_ind and h_ind to be the first aligned input and filter samples
// of the convolution and n to be the number of taps. prologue is the number
// of valid samples before input_ind. In the case that the filter is not
// long enough to include input_ind, last_filter_ind is left_filter_ind - up
// and thus left_h_ind and prologue are both -1.
const index_t prologue = std::min(input_ind, left_h_ind);
// epilogue is the number of valid samples after input_ind.
const index_t epilogue = std::min(max_input_ind - input_ind, max_h_epilogue);
// n is the number of valid samples. If input_ind is not valid because it
// precedes the reach of the filter, then prologue = -1 and n is just the
// epilogue.
const index_t n = prologue + 1 + epilogue;

// Finally, convolve the filter and input samples
index_t x_ind = input_ind - prologue;
index_t h_ind = left_h_ind - prologue;
output_t accum {};
for (index_t j = 0; j < n; j++) {
bdims[Rank - 1] = x_ind++;
accum += s_filter[h_ind++] * input.operator()(bdims);
}

bdims[Rank - 1] = out_ind;
output.operator()(bdims) = accum;
}
}

#endif // __CUDACC__

}; // namespace matx
152 changes: 152 additions & 0 deletions include/matx/transforms/resample_poly.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
////////////////////////////////////////////////////////////////////////////////
// BSD 3-Clause License
//
// Copyright (c) 2021, NVIDIA Corporation
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice, this
// list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// 3. Neither the name of the copyright holder nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/////////////////////////////////////////////////////////////////////////////////

#pragma once

#include <cstdint>
#include <cstdio>
#include <type_traits>

#include "matx/core/error.h"
#include "matx/core/nvtx.h"
#include "matx/core/tensor.h"
#include "matx/operators/clone.h"
#include "matx/kernels/resample_poly.cuh"

namespace matx {
namespace detail {

template <typename OutType, typename InType, typename FilterType>
inline void matxResamplePoly1DInternal(OutType &o, const InType &i,
const FilterType &filter, index_t up, index_t down,
cudaStream_t stream)
{
#ifdef __CUDACC__
MATX_NVTX_START("", matx::MATX_NVTX_LOG_INTERNAL)

using input_t = typename InType::scalar_type;
using filter_t = typename FilterType::scalar_type;
using shape_type = typename OutType::shape_type;

shape_type filter_len = filter.Size(FilterType::Rank()-1);

// Even-length filters will be prepended with a single 0 to make them odd-length
const int max_phase_len = (filter_len % 2 == 0) ?
static_cast<int>((filter_len + 1 + up - 1) / up) :
static_cast<int>((filter_len + up - 1) / up);
const size_t filter_shm = sizeof(filter_t) * max_phase_len;

const int num_phases = static_cast<int>(up);
const int num_batches = static_cast<int>(TotalSize(i)/i.Size(i.Rank() - 1));
dim3 grid(num_batches, num_phases);

constexpr int THREADS = 128;
ResamplePoly1D<THREADS, OutType, InType, FilterType><<<grid, THREADS, filter_shm, stream>>>(
o, i, filter, up, down);

#endif
}

} // end namespace detail

// Simple gcd implementation using the Euclidean algorithm.
// If large number support is needed, or if this function becomes performance
// sensitive, then this implementation may be insufficient. Typically, up/down
// factors for resampling will be known in a signal processing pipeline and
// thus the user would already supply co-prime up/down factors. In that case,
// b will be 0 below after one iteration and this implementation quickly identifies
// the factors as co-prime.
static index_t gcd(index_t a, index_t b) {
while (b != 0) {
const index_t t = b;
b = a % b;
a = t;
}
return a;
};

/**
* @brief 1D polyphase resampler
*
* @tparam OutType Type of output
* @tparam InType Type of input
* @tparam FilterType Type of filter
* @param out Output tensor
* @param in Input operator
* @param f Filter operator
* @param up Factor by which to upsample
* @param down Factor by which to downsample
* @param stream CUDA stream on which to run the kernel(s)
*/
template <typename OutType, typename InType, typename FilterType>
inline void resample_poly(OutType &out, const InType &in, const FilterType &f,
index_t up, index_t down, cudaStream_t stream = 0) {
MATX_NVTX_START("", matx::MATX_NVTX_LOG_API)

constexpr int RANK = InType::Rank();

MATX_STATIC_ASSERT(OutType::Rank() == InType::Rank(), matxInvalidDim);
// Currently only support 1D filters.
MATX_STATIC_ASSERT(FilterType::Rank() == 1, matxInvalidDim);

MATX_ASSERT_STR(up > 0, matxInvalidParameter, "up must be positive");
MATX_ASSERT_STR(down > 0, matxInvalidParameter, "down must be positive");

for(int i = 0 ; i < RANK-1; i++) {
MATX_ASSERT_STR(out.Size(i) == in.Size(i), matxInvalidDim, "resample_poly: input/output must have matched batch sizes");
}

const index_t up_size = in.Size(RANK-1) * up;
const index_t outlen = up_size / down + ((up_size % down) ? 1 : 0);

MATX_ASSERT_STR(out.Size(RANK-1) == outlen, matxInvalidDim, "resample_poly: output size mismatch");

const index_t g = gcd(up, down);
up /= g;
down /= g;

// There are two ways to interpret resampling when up == down == 1. One is
// that it is a no-op and thus we should just return a copy of the input
// tensor. Another is that polyphase resampling is logically equivalent to
// upsampling, convolving with a filter kernel, and then downsampling, in
// which case up == down == 1 is equivalent to convolution. We apply the
// first interpretation and return a copy of the input tensor. This matches
// the behavior of scipy.
if (up == 1 && down == 1) {
(out = in).run(stream);
return;
}

matxResamplePoly1DInternal(out, in, f, up, down, stream);
}

} // end namespace matx
1 change: 1 addition & 0 deletions include/matx/transforms/transforms.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
#include "matx/transforms/permute.h"
#include "matx/transforms/qr.h"
#include "matx/transforms/reduce.h"
#include "matx/transforms/resample_poly.h"
#include "matx/transforms/solver.h"
#include "matx/transforms/svd.h"
#include "matx/transforms/transpose.h"
Loading