Skip to content

Commit e4d427d

Browse files
author
root
committed
Update AlphabetHistogram
1 parent b3f2acd commit e4d427d

File tree

13 files changed

+310
-2
lines changed

13 files changed

+310
-2
lines changed

.clangd

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ If:
2121
PathMatch: [.*\.cpp, .*\.cu, .*\.hpp, .*\.cuh]
2222
CompileFlags:
2323
Add:
24+
- -std=c++20
2425
- --no-cuda-version-check
2526
Remove:
2627
- -ccbin

configs/lib-tests.yml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,9 @@ OpTest:
22
Conv2D:
33
- inputHeight: 32
44
inputWidth: 32
5-
kernelSize: 3
5+
kernelSize: 3
6+
AlphabetHistogram:
7+
- nInputs: 256
8+
divider: 4
9+
- nInputs: 2048
10+
divider: 4

csrc/include/pmpp/utils/address.hpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,16 @@ namespace pmpp
1414
{
1515
/**
1616
* @brief Compute the offset of a multi-dimensional array.
17+
* A typical use case is that if you have rowIdx, colIdx, nRows and nCols,
18+
* to calculate the linear index of the element at (rowIdx, colIdx), you can
19+
* use this function as follows:
20+
* > offset(rowIdx, colIdx, nRows, nCols)
1721
*
1822
* @param args First half is the indices, second half is the size of each
1923
* dimension.
2024
* @return std::uint32_t The offset of the multi-dimensional array.
2125
*
22-
* @example
26+
* @example
2327
* 1. To calculate the offset of idx (2, 1) in a 2D array of dim (4, 3):
2428
* > offset(2, 1, 4, 3) -> 1*1 + 2*3 = 7
2529
* 2. To calculate the offset of idx (1, 2, 3) in a 3D array of dim

csrc/include/pmpp/utils/common.cuh

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,3 +42,14 @@
4242
#define PMPP_DEBUG_CUDA_ERR_CHECK(err) PMPP_CUDA_ERR_CHECK(err)
4343
#endif
4444
#endif
45+
46+
namespace pmpp::cuda
47+
{
48+
template <typename T>
49+
__host__ __device__ void initMemory(T* ptr, size_t n, const T& val)
50+
{
51+
for (size_t i = 0; i < n; ++i) {
52+
ptr[i] = val;
53+
}
54+
}
55+
} // namespace pmpp::cuda

csrc/include/pmpp/utils/common.hpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,4 +21,12 @@ auto arr2str(const ArrT& arr) -> std::string
2121
return str;
2222
}
2323

24+
template <typename T>
25+
void initMemory(T* ptr, size_t n, const T& val)
26+
{
27+
for (size_t i = 0; i < n; ++i) {
28+
ptr[i] = val;
29+
}
30+
}
31+
2432
} // namespace pmpp

csrc/lib/ops/alphabetHistogram/op.cpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
#include "../ops.hpp"
2+
3+
namespace pmpp::ops::cpu
4+
{
5+
template <>
6+
void launchAlphabetHistogram<int32_t>(const int32_t* input, int32_t* histo,
7+
int32_t nInputs, int32_t divider)
8+
{
9+
// O(N)
10+
for (int32_t i = 0; i < nInputs; ++i) {
11+
int32_t pos = input[i] - 'a';
12+
if (pos >= 0 && pos < 26) {
13+
++histo[pos / divider];
14+
}
15+
}
16+
}
17+
} // namespace pmpp::ops::cpu

csrc/lib/ops/alphabetHistogram/op.cu

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
#include <algorithm>
2+
#include <cuda_runtime.h>
3+
4+
#include "../ops.hpp"
5+
#include "pmpp/utils/common.cuh"
6+
#include "pmpp/utils/math.hpp"
7+
8+
namespace pmpp::ops::cuda
9+
{
10+
__global__ void alphabetHistogramKernel(const int32_t* input, int32_t* histo,
11+
int32_t nInputs, int32_t divider)
12+
{
13+
constexpr auto MAX_N_BINS = 26;
14+
int32_t nBins = ceilDiv(26, divider);
15+
__shared__ int32_t histo_s[MAX_N_BINS];
16+
::pmpp::cuda::initMemory(histo_s, nBins, 0);
17+
__syncthreads();
18+
19+
// Global thread index
20+
int32_t gTid = blockIdx.x * blockDim.x + threadIdx.x;
21+
int32_t accumulator = 0;
22+
int32_t prevBinIdx = -1;
23+
24+
// Map concecutive threads to all elements of the input
25+
for (int32_t i = gTid; i < nInputs; i += blockDim.x * gridDim.x) {
26+
int32_t alphabetPos = input[i] - 'a';
27+
if (alphabetPos >= 0 && alphabetPos < 26) {
28+
int32_t bin = alphabetPos / divider;
29+
if (bin == prevBinIdx) {
30+
++accumulator;
31+
} else {
32+
if (accumulator >= 0) {
33+
atomicAdd(&(histo_s[prevBinIdx]), accumulator);
34+
}
35+
accumulator = 1;
36+
prevBinIdx = bin;
37+
}
38+
}
39+
}
40+
if (accumulator > 0) {
41+
atomicAdd(&(histo_s[prevBinIdx]), accumulator);
42+
}
43+
44+
if (blockIdx.x > 0) {
45+
__syncthreads();
46+
// This loop is for the case when nBins > blockDim.x (nThreads per
47+
// block)
48+
for (int32_t bin = threadIdx.x; bin < nBins; bin += blockDim.x) {
49+
int32_t binVal = histo_s[bin];
50+
if (binVal > 0) {
51+
atomicAdd(&(histo[bin]), binVal);
52+
}
53+
}
54+
}
55+
}
56+
57+
template <>
58+
void launchAlphabetHistogram<int32_t>(const int32_t* d_input, int32_t* d_histo,
59+
int32_t nInputs, int32_t divider)
60+
{
61+
constexpr dim3 blockDim = {1024, 1, 1};
62+
dim3 gridDim = {uint32_t(ceilDiv(nInputs, blockDim.x)), 1, 1};
63+
alphabetHistogramKernel<<<gridDim, blockDim>>>(d_input, d_histo, nInputs,
64+
divider);
65+
}
66+
} // namespace pmpp::ops::cuda
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
#include <torch/torch.h>
2+
3+
#include "../ops.hpp"
4+
#include "../torch_impl.hpp"
5+
6+
namespace pmpp::ops::cpu::torch_impl
7+
{
8+
auto alphabetHistogram(const torch::Tensor& input, int64_t divider)
9+
-> torch::Tensor
10+
{
11+
auto nInputs = input.numel();
12+
auto histo = torch::zeros({26 / divider}, torch::kInt32);
13+
14+
switch (input.scalar_type()) {
15+
case torch::kInt32: {
16+
pmpp::ops::cpu::launchAlphabetHistogram<int32_t>(
17+
input.data_ptr<int32_t>(), histo.data_ptr<int32_t>(), nInputs,
18+
int32_t(divider));
19+
break;
20+
}
21+
default: {
22+
AT_ERROR("Unsupported dtype: ", input.dtype());
23+
}
24+
}
25+
26+
return histo;
27+
}
28+
} // namespace pmpp::ops::cpu::torch_impl
29+
30+
namespace pmpp::ops::cuda::torch_impl
31+
{
32+
auto alphabetHistogram(const torch::Tensor& input, int64_t divider)
33+
-> torch::Tensor
34+
{
35+
auto nInputs = input.numel();
36+
auto histo = torch::zeros({26 / divider}, torch::kInt32);
37+
38+
switch (input.scalar_type()) {
39+
case torch::kInt32: {
40+
pmpp::ops::cuda::launchAlphabetHistogram<int32_t>(
41+
input.data_ptr<int32_t>(), histo.data_ptr<int32_t>(), nInputs,
42+
int32_t(divider));
43+
break;
44+
}
45+
default: {
46+
AT_ERROR("Unsupported dtype: ", input.dtype());
47+
}
48+
}
49+
50+
return histo;
51+
}
52+
} // namespace pmpp::ops::cuda::torch_impl

csrc/lib/ops/ops.hpp

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
11
#pragma once
22

3+
#include "pmpp/types/cu_types.cuh"
34
#include "pmpp/types/cxx_types.hpp"
45

6+
#include <array>
7+
58
namespace pmpp::ops::cpu
69
{
710

@@ -16,6 +19,14 @@ template <typename ScalarT>
1619
void launchConv2d(const ScalarT* input, const ScalarT* kernel, ScalarT* output,
1720
int32_t inHeight, int32_t inWidth, int32_t kernelSize);
1821

22+
template <typename ScalarT>
23+
void launchStencil3d(const ScalarT* input, ScalarT* output, dim3 shape,
24+
const std::array<ScalarT, 7>& coeffs);
25+
26+
template <typename ScalarT>
27+
void launchAlphabetHistogram(const ScalarT* input, ScalarT* histo,
28+
int32_t nInputs, int32_t divider);
29+
1930
} // namespace pmpp::ops::cpu
2031

2132
namespace pmpp::ops::cuda
@@ -34,4 +45,12 @@ void launchConv2d(const ScalarT* d_input, const ScalarT* d_kernel,
3445
ScalarT* d_output, int32_t inputHeight, int32_t inputWidth,
3546
int32_t kernelSize);
3647

48+
template <typename ScalarT>
49+
void launchStencil3d(const ScalarT* input, ScalarT* output, dim3 shape,
50+
const std::array<ScalarT, 7>& coeffs);
51+
52+
template <typename ScalarT>
53+
void launchAlphabetHistogram(const ScalarT* d_input, ScalarT* d_histo,
54+
int32_t nInputs, int32_t divider);
55+
3756
} // namespace pmpp::ops::cuda

csrc/lib/ops/stencil/op.cu

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
#include <cuda_runtime.h>
2+
3+
#include "../ops.hpp"
4+
#include "pmpp/utils/address.hpp"
5+
#include "pmpp/utils/common.cuh"
6+
#include "pmpp/utils/math.hpp"
7+
8+
namespace pmpp::ops::cuda
9+
{
10+
11+
template <typename ScalarT, dim3 TILE_DIM>
12+
__global__ void stencilKernel(const ScalarT* input, ScalarT* output,
13+
dim3 shape, const std::array<ScalarT, 7>& coeffs)
14+
{
15+
int32_t iStart = blockIdx.z * TILE_DIM.z;
16+
int32_t j = blockIdx.y * TILE_DIM.y + threadIdx.y - 1;
17+
int32_t k = blockIdx.x * TILE_DIM.x + threadIdx.x - 1;
18+
19+
__shared__ ScalarT inPrev_s[TILE_DIM.x][TILE_DIM.y];
20+
__shared__ ScalarT inCurr_s[TILE_DIM.x][TILE_DIM.y];
21+
__shared__ ScalarT inNext_s[TILE_DIM.x][TILE_DIM.y];
22+
23+
if (iStart - 1 >= 0 && iStart - 1 < shape.z && j >= 0 && j < shape.y &&
24+
k >= 0 && k < shape.x) {
25+
inPrev_s[threadIdx.y][threadIdx.x] = input[offset<uint32_t>(
26+
iStart - 1, j, k, shape.z, shape.y, shape.x)];
27+
}
28+
29+
if (iStart >= 0 && iStart < shape.z && j >= 0 && j < shape.y && k >= 0 &&
30+
k < shape.x) {
31+
inCurr_s[threadIdx.y][threadIdx.x] =
32+
input[offset<uint32_t>(iStart, j, k, shape.z, shape.y, shape.x)];
33+
}
34+
35+
for (int32_t i = iStart; i < iStart + TILE_DIM.z; ++i) {
36+
if (i + 1 >= 0 && i + 1 < shape.z && j >= 0 && j < shape.y && k >= 0 &&
37+
k < shape.x) {
38+
inNext_s[threadIdx.y][threadIdx.x] = input[offset<uint32_t>(
39+
i + 1, j, k, shape.z, shape.y, shape.x)];
40+
}
41+
__syncthreads();
42+
if (i >= 1 && i < shape.z - 1 && j >= 1 && j < shape.y - 1 && k >= 1 &&
43+
k < shape.x - 1) {
44+
if (threadIdx.y >= 1 && threadIdx.y < TILE_DIM.y - 1 &&
45+
threadIdx.x >= 1 && threadIdx.x < TILE_DIM.x - 1) {
46+
output[offset<uint32_t>(i, j, k, shape.z, shape.y, shape.x)] =
47+
coeffs[0] * inCurr_s[threadIdx.y][threadIdx.x] +
48+
coeffs[1] * inCurr_s[threadIdx.y][threadIdx.x - 1] +
49+
coeffs[2] * inCurr_s[threadIdx.y][threadIdx.x + 1] +
50+
coeffs[3] * inCurr_s[threadIdx.y - 1][threadIdx.x] +
51+
coeffs[4] * inCurr_s[threadIdx.y + 1][threadIdx.x] +
52+
coeffs[5] * inPrev_s[threadIdx.y][threadIdx.x] +
53+
coeffs[6] * inNext_s[threadIdx.y][threadIdx.x];
54+
}
55+
}
56+
__syncthreads();
57+
inPrev_s[threadIdx.y][threadIdx.x] =
58+
inCurr_s[threadIdx.y][threadIdx.x];
59+
inCurr_s[threadIdx.y][threadIdx.x] =
60+
inNext_s[threadIdx.y][threadIdx.x];
61+
}
62+
}
63+
64+
template <>
65+
void launchStencil3d(const fp32_t* input, fp32_t* output, dim3 shape,
66+
const std::array<fp32_t, 7>& coeffs)
67+
{
68+
constexpr dim3 BLOCK_DIM = {8, 8, 8};
69+
dim3 gridDim = {ceilDiv(shape.x, BLOCK_DIM.x),
70+
ceilDiv(shape.y, BLOCK_DIM.y),
71+
ceilDiv(shape.z, BLOCK_DIM.z)};
72+
73+
74+
75+
stencilKernel<fp32_t, BLOCK_DIM>
76+
<<<gridDim, BLOCK_DIM>>>(input, output, shape, coeffs);
77+
78+
PMPP_DEBUG_CUDA_ERR_CHECK(cudaGetLastError());
79+
}
80+
81+
} // namespace pmpp::ops::cuda

0 commit comments

Comments
 (0)