1+ // //////////////////////////////////////////////////////////////////////////////
2+ // BSD 3-Clause License
3+ //
4+ // Copyright (c) 2021, NVIDIA Corporation
5+ // All rights reserved.
6+ //
7+ // Redistribution and use in source and binary forms, with or without
8+ // modification, are permitted provided that the following conditions are met:
9+ //
10+ // 1. Redistributions of source code must retain the above copyright notice, this
11+ // list of conditions and the following disclaimer.
12+ //
13+ // 2. Redistributions in binary form must reproduce the above copyright notice,
14+ // this list of conditions and the following disclaimer in the documentation
15+ // and/or other materials provided with the distribution.
16+ //
17+ // 3. Neither the name of the copyright holder nor the names of its
18+ // contributors may be used to endorse or promote products derived from
19+ // this software without specific prior written permission.
20+ //
21+ // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22+ // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23+ // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24+ // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
25+ // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26+ // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27+ // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28+ // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29+ // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30+ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31+ // ///////////////////////////////////////////////////////////////////////////////
32+
33+ #pragma once
34+
35+ #include < complex>
36+ #include < cuda.h>
37+ #include < iomanip>
38+ #include < memory>
39+ #include < stdint.h>
40+ #include < stdio.h>
41+ #include < vector>
42+
43+ #include " cuComplex.h"
44+ #include " matx/core/utils.h"
45+ #include " matx/core/type_utils.h"
46+ #include " matx/core/tensor_utils.h"
47+
48+ namespace matx {
49+
50+ #ifdef __CUDACC__
51+ template <int THREADS, typename OutType, typename InType, typename FilterType>
52+ __launch_bounds__ (THREADS)
53+ __global__ void ResamplePoly1D(OutType output, InType input, FilterType filter,
54+ index_t up, index_t down)
55+ {
56+ using output_t = typename OutType::scalar_type;
57+ using input_t = typename InType::scalar_type;
58+ using filter_t = typename FilterType::scalar_type;
59+
60+ extern __shared__ __align__ (alignof (double4 )) uint8_t smem_filter[];
61+ filter_t *s_filter = reinterpret_cast <filter_t *>(smem_filter);
62+
63+ constexpr int Rank = OutType::Rank ();
64+ const index_t output_len = output.Size (Rank-1 );
65+ index_t filter_len = filter.Size (0 );
66+ const index_t input_len = input.Size (Rank-1 );
67+
68+ // We assume odd-length filters below. In the case of an even-length filter,
69+ // logically prepend the filter with a single zero to make its length odd.
70+ const bool is_even = filter_len % 2 == 0 ;
71+ if (filter_len % 2 == 0 ) {
72+ filter_len++;
73+ }
74+
75+ const int phase_ind = blockIdx .y ;
76+ const int tid = threadIdx .x ;
77+ const index_t filter_len_half = filter_len/2 ;
78+
79+ // All but the last dim are batch indices
80+ const int batch_idx = blockIdx .x ;
81+ auto bdims = BlockToIdx (output, batch_idx, 1 );
82+
83+ // We assume odd-length filters in the below index calculations. If the filter
84+ // is even-length, then it has had a single zero logically prepended to it to
85+ // make it odd-length.
86+ // Consider resampling an input x by up=3, down=2. The upsampled and then
87+ // downsampled sequence -- ignoring filtering for now -- will look as folows.
88+ // | x0 | 0 | 0 | x1 | 0 | 0 | x2 | ... ]
89+ //
90+ // d0 d1 d2 d3 ...
91+ // The dX are samples that will be kept after filtering and downsampling.
92+ // The filter is centered on the output sample. We want to avoid storing the
93+ // filter coefficents that will map to 0 in the upsampled sequence. Thus,
94+ // we see in the above that for filter h with length K and output d0, we will
95+ // need filter elements h[K/2], h[K/2 - 3], h[K/2 - 6], etc., depending upon
96+ // the length of the filter. The filter is flipped for convolution, which is
97+ // why the index is decreasing in the above. We call this set of coefficients
98+ // phase 0 because there is an offset of 0 between the xN points in the
99+ // upsampled sequence and the central tap of the filter.
100+ // Similarly, output d1 will use phase 1 coefficients because x1 is offset by
101+ // 1 from the central tap, and output d1 will use phase 2 coefficients because
102+ // x2 is offset by 2 from the central tap. Thus, there are up total phases and
103+ // therefore either filter_len / up or filter_len / up - 1 coefficients per phase.
104+ //
105+ // This kernel is launched with the phase index provided as a block index
106+ // and it will calculate the output points corresponding to this phase. We first
107+ // need to determine which filter samples we need for this phase and load them
108+ // into shared memory.
109+ // left_filter_ind is the filter index that corresponds to the input sample
110+ // in the upsampled sequence equal to or to the left of the output sample.
111+ // Thus, for phase 0, left_filter_ind is the central filter tap, h[k/2].
112+ // For phase 1, with our up=3, down=2 example, left_filter_ind is
113+ // h[K/2+2]. Once we have a single tap index and the corresponding x index,
114+ // we will stride in both sequences (input and filter) by +/- up to cover
115+ // the full filter phase.
116+ const index_t left_filter_ind = filter_len_half + (phase_ind*down) % up;
117+
118+ // If left_filter_ind >= filter_len, then the filter is not long enough to reach
119+ // a potentially non-zero sample value on the left. In that case, set the
120+ // last_filter_ind to the next non-zero sample to the right of the output
121+ // index.
122+ const index_t last_filter_ind = (left_filter_ind < filter_len) ?
123+ left_filter_ind + up * ((filter_len - 1 - left_filter_ind) / up) :
124+ left_filter_ind - up;
125+
126+ // If last_filter_ind is now < 0, that means that the filter does not have
127+ // any corresponding non-zero samples (i.e. samples from the input signal).
128+ // Thus, all output values for this phase are zero because the filter only
129+ // overlaps with zero-filled samples from the upsampling operation.
130+ if (last_filter_ind < 0 ) {
131+ for (index_t out_ind = phase_ind + tid * up; out_ind < output_len; out_ind += THREADS * up) {
132+ bdims[Rank - 1 ] = out_ind;
133+ output.operator ()(bdims) = 0 ;
134+ }
135+ return ;
136+ }
137+
138+ const index_t first_filter_ind = left_filter_ind - up * (left_filter_ind / up);
139+ const index_t this_phase_len = (last_filter_ind - first_filter_ind)/up + 1 ;
140+
141+ // Scale the filter coefficients by up to match scipy's convention
142+ const filter_t scale = static_cast <filter_t >(up);
143+
144+ // Flip the filter when writing to smem. The filtering is a convolution.
145+ if (is_even) {
146+ for (index_t t = tid; t < this_phase_len; t += THREADS) {
147+ const index_t ind = t * up + first_filter_ind;
148+ const index_t smem_ind = this_phase_len - 1 - t;
149+ s_filter[smem_ind] = (ind > 0 ) ? scale * filter.operator ()(ind-1 ) : static_cast <filter_t >(0 );
150+ }
151+ } else {
152+ for (index_t t = tid; t < this_phase_len; t += THREADS) {
153+ const index_t ind = t * up + first_filter_ind;
154+ const index_t smem_ind = this_phase_len - 1 - t;
155+ s_filter[smem_ind] = scale * filter.operator ()(ind);
156+ }
157+ }
158+
159+
160+ __syncthreads ();
161+
162+ // left_h_ind is the index in s_filter that contains the filter tap that will be applied to the
163+ // last input signal value not to the right of the output index in the virtual upsampled array.
164+ // If the filter has odd length and a given output value aligns with an input value, then
165+ // left_h_ind would reference the central tap. If the output value corresponds to a zero-padded
166+ // value (i.e. a 0 inserted during upsampling), then left_h_ind is the filter tap applied
167+ // to the nearest input value to the left of this output value.
168+ const index_t left_h_ind = (last_filter_ind - left_filter_ind)/up;
169+
170+ const index_t max_h_epilogue = this_phase_len - left_h_ind - 1 ;
171+ const index_t max_input_ind = static_cast <int >(input_len) - 1 ;
172+
173+ for (index_t out_ind = phase_ind + tid * up; out_ind < output_len; out_ind += THREADS * up) {
174+ // out_ind is the index in the output array and up_ind is the corresponding
175+ // index in the upsampled array
176+ const index_t up_ind = out_ind * down;
177+
178+ // input_ind is the largest index in the input array that is not greater than
179+ // (to the right of, in the previous figure earlier) up_ind.
180+ const index_t input_ind = up_ind / up;
181+
182+ // We want x_ind and h_ind to be the first aligned input and filter samples
183+ // of the convolution and n to be the number of taps. prologue is the number
184+ // of valid samples before input_ind. In the case that the filter is not
185+ // long enough to include input_ind, last_filter_ind is left_filter_ind - up
186+ // and thus left_h_ind and prologue are both -1.
187+ const index_t prologue = std::min (input_ind, left_h_ind);
188+ // epilogue is the number of valid samples after input_ind.
189+ const index_t epilogue = std::min (max_input_ind - input_ind, max_h_epilogue);
190+ // n is the number of valid samples. If input_ind is not valid because it
191+ // precedes the reach of the filter, then prologue = -1 and n is just the
192+ // epilogue.
193+ const index_t n = prologue + 1 + epilogue;
194+
195+ // Finally, convolve the filter and input samples
196+ index_t x_ind = input_ind - prologue;
197+ index_t h_ind = left_h_ind - prologue;
198+ output_t accum {};
199+ for (index_t j = 0 ; j < n; j++) {
200+ bdims[Rank - 1 ] = x_ind++;
201+ accum += s_filter[h_ind++] * input.operator ()(bdims);
202+ }
203+
204+ bdims[Rank - 1 ] = out_ind;
205+ output.operator ()(bdims) = accum;
206+ }
207+ }
208+
209+ #endif // __CUDACC__
210+
211+ }; // namespace matx
0 commit comments