Skip to content

Commit 02c43ea

Browse files
tbensonatlcliffburdick
authored andcommitted
Adding initial polyphase resampler transform
1 parent 5df4811 commit 02c43ea

File tree

8 files changed

+927
-0
lines changed

8 files changed

+927
-0
lines changed

docs_input/api/index.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ API Reference
1212
tensorgenerators.rst
1313
stats.rst
1414
random.rst
15+
resample_poly.rst
1516
utilities.rst
1617
type_traits.rst
1718
einsum.rst

docs_input/api/resample_poly.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
Polyphase Resampling
2+
####################
3+
4+
The polyphase resampler API supports resampling an input signal by ratio ``up``/``down`` using a polyphase filter.
5+
6+
.. doxygenfunction:: matx::resample_poly(OutType &out, const InType &in, const FilterType &f, index_t up, index_t down, cudaStream_t stream = 0)
Lines changed: 211 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,211 @@
1+
////////////////////////////////////////////////////////////////////////////////
2+
// BSD 3-Clause License
3+
//
4+
// Copyright (c) 2021, NVIDIA Corporation
5+
// All rights reserved.
6+
//
7+
// Redistribution and use in source and binary forms, with or without
8+
// modification, are permitted provided that the following conditions are met:
9+
//
10+
// 1. Redistributions of source code must retain the above copyright notice, this
11+
// list of conditions and the following disclaimer.
12+
//
13+
// 2. Redistributions in binary form must reproduce the above copyright notice,
14+
// this list of conditions and the following disclaimer in the documentation
15+
// and/or other materials provided with the distribution.
16+
//
17+
// 3. Neither the name of the copyright holder nor the names of its
18+
// contributors may be used to endorse or promote products derived from
19+
// this software without specific prior written permission.
20+
//
21+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22+
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23+
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24+
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
25+
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26+
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27+
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28+
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29+
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30+
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31+
/////////////////////////////////////////////////////////////////////////////////
32+
33+
#pragma once
34+
35+
#include <complex>
36+
#include <cuda.h>
37+
#include <iomanip>
38+
#include <memory>
39+
#include <stdint.h>
40+
#include <stdio.h>
41+
#include <vector>
42+
43+
#include "cuComplex.h"
44+
#include "matx/core/utils.h"
45+
#include "matx/core/type_utils.h"
46+
#include "matx/core/tensor_utils.h"
47+
48+
namespace matx {
49+
50+
#ifdef __CUDACC__
51+
template <int THREADS, typename OutType, typename InType, typename FilterType>
52+
__launch_bounds__(THREADS)
53+
__global__ void ResamplePoly1D(OutType output, InType input, FilterType filter,
54+
index_t up, index_t down)
55+
{
56+
using output_t = typename OutType::scalar_type;
57+
using input_t = typename InType::scalar_type;
58+
using filter_t = typename FilterType::scalar_type;
59+
60+
extern __shared__ __align__(alignof(double4)) uint8_t smem_filter[];
61+
filter_t *s_filter = reinterpret_cast<filter_t *>(smem_filter);
62+
63+
constexpr int Rank = OutType::Rank();
64+
const index_t output_len = output.Size(Rank-1);
65+
index_t filter_len = filter.Size(0);
66+
const index_t input_len = input.Size(Rank-1);
67+
68+
// We assume odd-length filters below. In the case of an even-length filter,
69+
// logically prepend the filter with a single zero to make its length odd.
70+
const bool is_even = filter_len % 2 == 0;
71+
if (filter_len % 2 == 0) {
72+
filter_len++;
73+
}
74+
75+
const int phase_ind = blockIdx.y;
76+
const int tid = threadIdx.x;
77+
const index_t filter_len_half = filter_len/2;
78+
79+
// All but the last dim are batch indices
80+
const int batch_idx = blockIdx.x;
81+
auto bdims = BlockToIdx(output, batch_idx, 1);
82+
83+
// We assume odd-length filters in the below index calculations. If the filter
84+
// is even-length, then it has had a single zero logically prepended to it to
85+
// make it odd-length.
86+
// Consider resampling an input x by up=3, down=2. The upsampled and then
87+
// downsampled sequence -- ignoring filtering for now -- will look as folows.
88+
// | x0 | 0 | 0 | x1 | 0 | 0 | x2 | ... ]
89+
//
90+
// d0 d1 d2 d3 ...
91+
// The dX are samples that will be kept after filtering and downsampling.
92+
// The filter is centered on the output sample. We want to avoid storing the
93+
// filter coefficents that will map to 0 in the upsampled sequence. Thus,
94+
// we see in the above that for filter h with length K and output d0, we will
95+
// need filter elements h[K/2], h[K/2 - 3], h[K/2 - 6], etc., depending upon
96+
// the length of the filter. The filter is flipped for convolution, which is
97+
// why the index is decreasing in the above. We call this set of coefficients
98+
// phase 0 because there is an offset of 0 between the xN points in the
99+
// upsampled sequence and the central tap of the filter.
100+
// Similarly, output d1 will use phase 1 coefficients because x1 is offset by
101+
// 1 from the central tap, and output d1 will use phase 2 coefficients because
102+
// x2 is offset by 2 from the central tap. Thus, there are up total phases and
103+
// therefore either filter_len / up or filter_len / up - 1 coefficients per phase.
104+
//
105+
// This kernel is launched with the phase index provided as a block index
106+
// and it will calculate the output points corresponding to this phase. We first
107+
// need to determine which filter samples we need for this phase and load them
108+
// into shared memory.
109+
// left_filter_ind is the filter index that corresponds to the input sample
110+
// in the upsampled sequence equal to or to the left of the output sample.
111+
// Thus, for phase 0, left_filter_ind is the central filter tap, h[k/2].
112+
// For phase 1, with our up=3, down=2 example, left_filter_ind is
113+
// h[K/2+2]. Once we have a single tap index and the corresponding x index,
114+
// we will stride in both sequences (input and filter) by +/- up to cover
115+
// the full filter phase.
116+
const index_t left_filter_ind = filter_len_half + (phase_ind*down) % up;
117+
118+
// If left_filter_ind >= filter_len, then the filter is not long enough to reach
119+
// a potentially non-zero sample value on the left. In that case, set the
120+
// last_filter_ind to the next non-zero sample to the right of the output
121+
// index.
122+
const index_t last_filter_ind = (left_filter_ind < filter_len) ?
123+
left_filter_ind + up * ((filter_len - 1 - left_filter_ind) / up) :
124+
left_filter_ind - up;
125+
126+
// If last_filter_ind is now < 0, that means that the filter does not have
127+
// any corresponding non-zero samples (i.e. samples from the input signal).
128+
// Thus, all output values for this phase are zero because the filter only
129+
// overlaps with zero-filled samples from the upsampling operation.
130+
if (last_filter_ind < 0) {
131+
for (index_t out_ind = phase_ind + tid * up; out_ind < output_len; out_ind += THREADS * up) {
132+
bdims[Rank - 1] = out_ind;
133+
output.operator()(bdims) = 0;
134+
}
135+
return;
136+
}
137+
138+
const index_t first_filter_ind = left_filter_ind - up * (left_filter_ind / up);
139+
const index_t this_phase_len = (last_filter_ind - first_filter_ind)/up + 1;
140+
141+
// Scale the filter coefficients by up to match scipy's convention
142+
const filter_t scale = static_cast<filter_t>(up);
143+
144+
// Flip the filter when writing to smem. The filtering is a convolution.
145+
if (is_even) {
146+
for (index_t t = tid; t < this_phase_len; t += THREADS) {
147+
const index_t ind = t * up + first_filter_ind;
148+
const index_t smem_ind = this_phase_len - 1 - t;
149+
s_filter[smem_ind] = (ind > 0) ? scale * filter.operator()(ind-1) : static_cast<filter_t>(0);
150+
}
151+
} else {
152+
for (index_t t = tid; t < this_phase_len; t += THREADS) {
153+
const index_t ind = t * up + first_filter_ind;
154+
const index_t smem_ind = this_phase_len - 1 - t;
155+
s_filter[smem_ind] = scale * filter.operator()(ind);
156+
}
157+
}
158+
159+
160+
__syncthreads();
161+
162+
// left_h_ind is the index in s_filter that contains the filter tap that will be applied to the
163+
// last input signal value not to the right of the output index in the virtual upsampled array.
164+
// If the filter has odd length and a given output value aligns with an input value, then
165+
// left_h_ind would reference the central tap. If the output value corresponds to a zero-padded
166+
// value (i.e. a 0 inserted during upsampling), then left_h_ind is the filter tap applied
167+
// to the nearest input value to the left of this output value.
168+
const index_t left_h_ind = (last_filter_ind - left_filter_ind)/up;
169+
170+
const index_t max_h_epilogue = this_phase_len - left_h_ind - 1;
171+
const index_t max_input_ind = static_cast<int>(input_len) - 1;
172+
173+
for (index_t out_ind = phase_ind + tid * up; out_ind < output_len; out_ind += THREADS * up) {
174+
// out_ind is the index in the output array and up_ind is the corresponding
175+
// index in the upsampled array
176+
const index_t up_ind = out_ind * down;
177+
178+
// input_ind is the largest index in the input array that is not greater than
179+
// (to the right of, in the previous figure earlier) up_ind.
180+
const index_t input_ind = up_ind / up;
181+
182+
// We want x_ind and h_ind to be the first aligned input and filter samples
183+
// of the convolution and n to be the number of taps. prologue is the number
184+
// of valid samples before input_ind. In the case that the filter is not
185+
// long enough to include input_ind, last_filter_ind is left_filter_ind - up
186+
// and thus left_h_ind and prologue are both -1.
187+
const index_t prologue = std::min(input_ind, left_h_ind);
188+
// epilogue is the number of valid samples after input_ind.
189+
const index_t epilogue = std::min(max_input_ind - input_ind, max_h_epilogue);
190+
// n is the number of valid samples. If input_ind is not valid because it
191+
// precedes the reach of the filter, then prologue = -1 and n is just the
192+
// epilogue.
193+
const index_t n = prologue + 1 + epilogue;
194+
195+
// Finally, convolve the filter and input samples
196+
index_t x_ind = input_ind - prologue;
197+
index_t h_ind = left_h_ind - prologue;
198+
output_t accum {};
199+
for (index_t j = 0; j < n; j++) {
200+
bdims[Rank - 1] = x_ind++;
201+
accum += s_filter[h_ind++] * input.operator()(bdims);
202+
}
203+
204+
bdims[Rank - 1] = out_ind;
205+
output.operator()(bdims) = accum;
206+
}
207+
}
208+
209+
#endif // __CUDACC__
210+
211+
}; // namespace matx
Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
////////////////////////////////////////////////////////////////////////////////
2+
// BSD 3-Clause License
3+
//
4+
// Copyright (c) 2021, NVIDIA Corporation
5+
// All rights reserved.
6+
//
7+
// Redistribution and use in source and binary forms, with or without
8+
// modification, are permitted provided that the following conditions are met:
9+
//
10+
// 1. Redistributions of source code must retain the above copyright notice, this
11+
// list of conditions and the following disclaimer.
12+
//
13+
// 2. Redistributions in binary form must reproduce the above copyright notice,
14+
// this list of conditions and the following disclaimer in the documentation
15+
// and/or other materials provided with the distribution.
16+
//
17+
// 3. Neither the name of the copyright holder nor the names of its
18+
// contributors may be used to endorse or promote products derived from
19+
// this software without specific prior written permission.
20+
//
21+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22+
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23+
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24+
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
25+
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26+
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27+
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28+
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29+
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30+
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31+
/////////////////////////////////////////////////////////////////////////////////
32+
33+
#pragma once
34+
35+
#include <cstdint>
36+
#include <cstdio>
37+
#include <type_traits>
38+
39+
#include "matx/core/error.h"
40+
#include "matx/core/nvtx.h"
41+
#include "matx/core/tensor.h"
42+
#include "matx/operators/clone.h"
43+
#include "matx/kernels/resample_poly.cuh"
44+
45+
namespace matx {
46+
namespace detail {
47+
48+
template <typename OutType, typename InType, typename FilterType>
49+
inline void matxResamplePoly1DInternal(OutType &o, const InType &i,
50+
const FilterType &filter, index_t up, index_t down,
51+
cudaStream_t stream)
52+
{
53+
#ifdef __CUDACC__
54+
MATX_NVTX_START("", matx::MATX_NVTX_LOG_INTERNAL)
55+
56+
using input_t = typename InType::scalar_type;
57+
using filter_t = typename FilterType::scalar_type;
58+
using shape_type = typename OutType::shape_type;
59+
60+
shape_type filter_len = filter.Size(FilterType::Rank()-1);
61+
62+
// Even-length filters will be prepended with a single 0 to make them odd-length
63+
const int max_phase_len = (filter_len % 2 == 0) ?
64+
static_cast<int>((filter_len + 1 + up - 1) / up) :
65+
static_cast<int>((filter_len + up - 1) / up);
66+
const size_t filter_shm = sizeof(filter_t) * max_phase_len;
67+
68+
const int num_phases = static_cast<int>(up);
69+
const int num_batches = static_cast<int>(TotalSize(i)/i.Size(i.Rank() - 1));
70+
dim3 grid(num_batches, num_phases);
71+
72+
constexpr int THREADS = 128;
73+
ResamplePoly1D<THREADS, OutType, InType, FilterType><<<grid, THREADS, filter_shm, stream>>>(
74+
o, i, filter, up, down);
75+
76+
#endif
77+
}
78+
79+
} // end namespace detail
80+
81+
// Simple gcd implementation using the Euclidean algorithm.
82+
// If large number support is needed, or if this function becomes performance
83+
// sensitive, then this implementation may be insufficient. Typically, up/down
84+
// factors for resampling will be known in a signal processing pipeline and
85+
// thus the user would already supply co-prime up/down factors. In that case,
86+
// b will be 0 below after one iteration and this implementation quickly identifies
87+
// the factors as co-prime.
88+
static index_t gcd(index_t a, index_t b) {
89+
while (b != 0) {
90+
const index_t t = b;
91+
b = a % b;
92+
a = t;
93+
}
94+
return a;
95+
};
96+
97+
/**
98+
* @brief 1D polyphase resampler
99+
*
100+
* @tparam OutType Type of output
101+
* @tparam InType Type of input
102+
* @tparam FilterType Type of filter
103+
* @param out Output tensor
104+
* @param in Input operator
105+
* @param f Filter operator
106+
* @param up Factor by which to upsample
107+
* @param down Factor by which to downsample
108+
* @param stream CUDA stream on which to run the kernel(s)
109+
*/
110+
template <typename OutType, typename InType, typename FilterType>
111+
inline void resample_poly(OutType &out, const InType &in, const FilterType &f,
112+
index_t up, index_t down, cudaStream_t stream = 0) {
113+
MATX_NVTX_START("", matx::MATX_NVTX_LOG_API)
114+
115+
constexpr int RANK = InType::Rank();
116+
117+
MATX_STATIC_ASSERT(OutType::Rank() == InType::Rank(), matxInvalidDim);
118+
// Currently only support 1D filters.
119+
MATX_STATIC_ASSERT(FilterType::Rank() == 1, matxInvalidDim);
120+
121+
MATX_ASSERT_STR(up > 0, matxInvalidParameter, "up must be positive");
122+
MATX_ASSERT_STR(down > 0, matxInvalidParameter, "down must be positive");
123+
124+
for(int i = 0 ; i < RANK-1; i++) {
125+
MATX_ASSERT_STR(out.Size(i) == in.Size(i), matxInvalidDim, "resample_poly: input/output must have matched batch sizes");
126+
}
127+
128+
const index_t up_size = in.Size(RANK-1) * up;
129+
const index_t outlen = up_size / down + ((up_size % down) ? 1 : 0);
130+
131+
MATX_ASSERT_STR(out.Size(RANK-1) == outlen, matxInvalidDim, "resample_poly: output size mismatch");
132+
133+
const index_t g = gcd(up, down);
134+
up /= g;
135+
down /= g;
136+
137+
// There are two ways to interpret resampling when up == down == 1. One is
138+
// that it is a no-op and thus we should just return a copy of the input
139+
// tensor. Another is that polyphase resampling is logically equivalent to
140+
// upsampling, convolving with a filter kernel, and then downsampling, in
141+
// which case up == down == 1 is equivalent to convolution. We apply the
142+
// first interpretation and return a copy of the input tensor. This matches
143+
// the behavior of scipy.
144+
if (up == 1 && down == 1) {
145+
(out = in).run(stream);
146+
return;
147+
}
148+
149+
matxResamplePoly1DInternal(out, in, f, up, down, stream);
150+
}
151+
152+
} // end namespace matx

include/matx/transforms/transforms.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@
4747
#include "matx/transforms/permute.h"
4848
#include "matx/transforms/qr.h"
4949
#include "matx/transforms/reduce.h"
50+
#include "matx/transforms/resample_poly.h"
5051
#include "matx/transforms/solver.h"
5152
#include "matx/transforms/svd.h"
5253
#include "matx/transforms/transpose.h"

0 commit comments

Comments
 (0)