Update AlphabetHistogram

root · root · commit e4d427dc5b7b · 2025-02-06T08:09:06.000Z
diff --git a/.clangd b/.clangd
@@ -21,6 +21,7 @@ If:
   PathMatch: [.*\.cpp, .*\.cu, .*\.hpp, .*\.cuh]
 CompileFlags:
   Add: 
+    - -std=c++20
     - --no-cuda-version-check
   Remove:
     - -ccbin
diff --git a/configs/lib-tests.yml b/configs/lib-tests.yml
@@ -2,4 +2,9 @@ OpTest:
   Conv2D:
     - inputHeight: 32
       inputWidth: 32
-      kernelSize: 3
+      kernelSize: 3
+  AlphabetHistogram:
+    - nInputs: 256
+      divider: 4
+    - nInputs: 2048
+      divider: 4
diff --git a/csrc/include/pmpp/utils/address.hpp b/csrc/include/pmpp/utils/address.hpp
@@ -14,12 +14,16 @@ namespace pmpp
 {
 /**
  * @brief Compute the offset of a multi-dimensional array.
+ *   A typical use case is that if you have rowIdx, colIdx, nRows and nCols, 
+ *   to calculate the linear index of the element at (rowIdx, colIdx), you can
+ *   use this function as follows:
+ *   > offset(rowIdx, colIdx, nRows, nCols)
  *
  * @param args First half is the indices, second half is the size of each
  *             dimension.
  * @return std::uint32_t The offset of the multi-dimensional array.
  *
- * @example
+ * @example 
  *   1. To calculate the offset of idx (2, 1) in a 2D array of dim (4, 3):
  *      > offset(2, 1, 4, 3) -> 1*1 + 2*3 = 7
  *   2. To calculate the offset of idx (1, 2, 3) in a 3D array of dim
diff --git a/csrc/include/pmpp/utils/common.cuh b/csrc/include/pmpp/utils/common.cuh
@@ -42,3 +42,14 @@
         #define PMPP_DEBUG_CUDA_ERR_CHECK(err) PMPP_CUDA_ERR_CHECK(err)
     #endif
 #endif
+
+namespace pmpp::cuda
+{
+/**
+ * @brief Assign `val` to each of the first `n` elements starting at `ptr`.
+ *
+ * Callable from both host and device code. The elements are written one
+ * after another by the calling thread.
+ *
+ * @param ptr First element to overwrite.
+ * @param n   Number of elements to overwrite.
+ * @param val Value copied into every element.
+ */
+template <typename T>
+__host__ __device__ void initMemory(T* ptr, size_t n, const T& val)
+{
+    T* const last = ptr + n;
+    for (T* cursor = ptr; cursor != last; ++cursor) {
+        *cursor = val;
+    }
+}
+}  // namespace pmpp::cuda
diff --git a/csrc/include/pmpp/utils/common.hpp b/csrc/include/pmpp/utils/common.hpp
@@ -21,4 +21,12 @@ auto arr2str(const ArrT& arr) -> std::string
     return str;
 }
 
+/**
+ * @brief Assign `val` to each of the first `n` elements starting at `ptr`.
+ *
+ * @param ptr First element to overwrite.
+ * @param n   Number of elements to overwrite.
+ * @param val Value copied into every element.
+ */
+template <typename T>
+void initMemory(T* ptr, size_t n, const T& val)
+{
+    // `ptr` and `n` are local copies, so they can serve as the cursor and
+    // the remaining-element counter directly.
+    while (n > 0) {
+        *ptr = val;
+        ++ptr;
+        --n;
+    }
+}
+
 }  // namespace pmpp
diff --git a/csrc/lib/ops/alphabetHistogram/op.cpp b/csrc/lib/ops/alphabetHistogram/op.cpp
@@ -0,0 +1,17 @@
+#include "../ops.hpp"
+
+namespace pmpp::ops::cpu
+{
+/**
+ * @brief CPU reference: count lowercase ASCII letters into grouped bins.
+ *
+ * Every element whose value lies in 'a'..'z' increments the bin
+ * `(value - 'a') / divider`; all other values are ignored. The histogram is
+ * only incremented, never cleared, by this function.
+ *
+ * @param input   Character codes to classify.
+ * @param histo   Bin counters, incremented in place.
+ * @param nInputs Number of elements read from `input`.
+ * @param divider Number of consecutive letters sharing one bin.
+ */
+template <>
+void launchAlphabetHistogram<int32_t>(const int32_t* input, int32_t* histo,
+                                      int32_t nInputs, int32_t divider)
+{
+    // Single linear pass over the input
+    int32_t idx = 0;
+    while (idx < nInputs) {
+        const int32_t letter = input[idx] - 'a';
+        ++idx;
+        if (letter < 0 || letter >= 26) {
+            continue;  // not a lowercase letter
+        }
+        histo[letter / divider] += 1;
+    }
+}
+}  // namespace pmpp::ops::cpu
diff --git a/csrc/lib/ops/alphabetHistogram/op.cu b/csrc/lib/ops/alphabetHistogram/op.cu
@@ -0,0 +1,66 @@
+#include <algorithm>
+#include <cuda_runtime.h>
+
+#include "../ops.hpp"
+#include "pmpp/utils/common.cuh"
+#include "pmpp/utils/math.hpp"
+
+namespace pmpp::ops::cuda
+{
+/**
+ * @brief Histogram of lowercase ASCII letters, privatized per block in
+ *        shared memory and then merged into global memory.
+ *
+ * Launch layout: 1D grid of 1D blocks; any grid/block size is valid because
+ * the input is traversed with a stride of `blockDim.x * gridDim.x`.
+ * Shared memory: 26 * sizeof(int32_t) bytes, statically allocated.
+ *
+ * @param input   Character codes to classify (read on the device).
+ * @param histo   Global bin counters with at least ceilDiv(26, divider)
+ *                entries. The kernel only adds to them, so the caller must
+ *                zero them beforehand.
+ * @param nInputs Number of elements in `input`.
+ * @param divider Number of consecutive letters sharing one bin. Values
+ *                <= 0 make the kernel return without touching `histo`.
+ */
+__global__ void alphabetHistogramKernel(const int32_t* input, int32_t* histo,
+                                        int32_t nInputs, int32_t divider)
+{
+    constexpr auto MAX_N_BINS = 26;
+    // `divider` is identical for every thread, so this early exit is taken
+    // by the whole grid and cannot strand threads at a barrier.
+    if (divider <= 0) {
+        return;
+    }
+    int32_t nBins = ceilDiv(26, divider);
+    __shared__ int32_t histo_s[MAX_N_BINS];
+    // Zero the private histogram cooperatively: each thread clears a
+    // disjoint subset of the bins.
+    for (int32_t bin = threadIdx.x; bin < nBins; bin += blockDim.x) {
+        histo_s[bin] = 0;
+    }
+    __syncthreads();
+
+    // 64-bit indices: `i + stride` must not wrap when nInputs is close to
+    // INT32_MAX.
+    const int64_t stride = int64_t(blockDim.x) * gridDim.x;
+    int32_t accumulator = 0;
+    int32_t prevBinIdx = -1;
+
+    // Map consecutive threads to consecutive elements of the input
+    for (int64_t i = int64_t(blockIdx.x) * blockDim.x + threadIdx.x;
+         i < nInputs; i += stride) {
+        int32_t alphabetPos = input[i] - 'a';
+        if (alphabetPos >= 0 && alphabetPos < 26) {
+            int32_t bin = alphabetPos / divider;
+            if (bin == prevBinIdx) {
+                // Same bin as the previous hit: aggregate locally to save
+                // an atomic operation.
+                ++accumulator;
+            } else {
+                // Flush the finished run. The `> 0` test also covers the
+                // very first hit, where prevBinIdx is still -1 and must not
+                // be used as an index.
+                if (accumulator > 0) {
+                    atomicAdd(&(histo_s[prevBinIdx]), accumulator);
+                }
+                accumulator = 1;
+                prevBinIdx = bin;
+            }
+        }
+    }
+    if (accumulator > 0) {
+        atomicAdd(&(histo_s[prevBinIdx]), accumulator);
+    }
+
+    // Every block, including block 0, owns a private histogram that has to
+    // be merged into the global one once all its threads have finished.
+    __syncthreads();
+    // This loop is for the case when nBins > blockDim.x (nThreads per
+    // block)
+    for (int32_t bin = threadIdx.x; bin < nBins; bin += blockDim.x) {
+        int32_t binVal = histo_s[bin];
+        if (binVal > 0) {
+            atomicAdd(&(histo[bin]), binVal);
+        }
+    }
+}
+
+/**
+ * @brief Launch alphabetHistogramKernel over `nInputs` device elements.
+ *
+ * Uses 1024 threads per block and ceilDiv(nInputs, 1024) blocks. The launch
+ * is asynchronous; only launch-configuration errors are checked here.
+ *
+ * @param d_input Device pointer to the character codes.
+ * @param d_histo Device pointer to the bin counters; must be zeroed by the
+ *                caller and hold at least ceilDiv(26, divider) entries.
+ * @param nInputs Number of elements in `d_input`.
+ * @param divider Number of consecutive letters sharing one bin.
+ */
+template <>
+void launchAlphabetHistogram<int32_t>(const int32_t* d_input, int32_t* d_histo,
+                                      int32_t nInputs, int32_t divider)
+{
+    // Nothing to count (a zero-sized grid is an invalid launch
+    // configuration), or no valid binning: leave d_histo untouched.
+    if (nInputs <= 0 || divider <= 0) {
+        return;
+    }
+    constexpr dim3 blockDim = {1024, 1, 1};
+    dim3 gridDim = {uint32_t(ceilDiv(nInputs, blockDim.x)), 1, 1};
+    alphabetHistogramKernel<<<gridDim, blockDim>>>(d_input, d_histo, nInputs,
+                                                   divider);
+    PMPP_DEBUG_CUDA_ERR_CHECK(cudaGetLastError());
+}
+}  // namespace pmpp::ops::cuda
diff --git a/csrc/lib/ops/alphabetHistogram/torch-impl.cpp b/csrc/lib/ops/alphabetHistogram/torch-impl.cpp
@@ -0,0 +1,52 @@
+#include <torch/torch.h>
+
+#include "../ops.hpp"
+#include "../torch_impl.hpp"
+
+namespace pmpp::ops::cpu::torch_impl
+{
+/**
+ * @brief CPU implementation of `pmpp::alphabet_histogram`.
+ *
+ * @param input   Tensor of character codes; only int32 is supported.
+ * @param divider Number of consecutive letters sharing one bin (> 0).
+ * @return int32 tensor with ceil(26 / divider) bin counts.
+ */
+auto alphabetHistogram(const torch::Tensor& input, int64_t divider)
+    -> torch::Tensor
+{
+    TORCH_CHECK(divider > 0, "divider must be positive, got ", divider);
+    // Any divider >= 26 puts all letters into bin 0; clamping keeps the
+    // later narrowing to int32_t lossless.
+    const int64_t clampedDivider = divider > 26 ? 26 : divider;
+    // Round up: with divider 4 the letters y, z fall into a 7th bin.
+    const int64_t nBins = (26 + clampedDivider - 1) / clampedDivider;
+
+    auto nInputs = input.numel();
+    auto histo = torch::zeros({nBins}, torch::kInt32);
+
+    switch (input.scalar_type()) {
+    case torch::kInt32: {
+        // The launcher walks a raw pointer, so the data must be dense.
+        const auto denseInput = input.contiguous();
+        pmpp::ops::cpu::launchAlphabetHistogram<int32_t>(
+            denseInput.data_ptr<int32_t>(), histo.data_ptr<int32_t>(),
+            static_cast<int32_t>(nInputs),
+            static_cast<int32_t>(clampedDivider));
+        break;
+    }
+    default: {
+        AT_ERROR("Unsupported dtype: ", input.dtype());
+    }
+    }
+
+    return histo;
+}
+}  // namespace pmpp::ops::cpu::torch_impl
+
+namespace pmpp::ops::cuda::torch_impl
+{
+/**
+ * @brief CUDA implementation of `pmpp::alphabet_histogram`.
+ *
+ * @param input   Tensor of character codes; only int32 is supported.
+ * @param divider Number of consecutive letters sharing one bin (> 0).
+ * @return int32 tensor with ceil(26 / divider) bin counts, allocated on the
+ *         same device as `input`.
+ */
+auto alphabetHistogram(const torch::Tensor& input, int64_t divider)
+    -> torch::Tensor
+{
+    TORCH_CHECK(divider > 0, "divider must be positive, got ", divider);
+    // Any divider >= 26 puts all letters into bin 0; clamping keeps the
+    // later narrowing to int32_t lossless.
+    const int64_t clampedDivider = divider > 26 ? 26 : divider;
+    // Round up: with divider 4 the letters y, z fall into a 7th bin.
+    const int64_t nBins = (26 + clampedDivider - 1) / clampedDivider;
+
+    auto nInputs = input.numel();
+    // The kernel writes through this pointer on the device, so the
+    // histogram has to live on the input's device rather than on the host.
+    auto histo =
+        torch::zeros({nBins}, input.options().dtype(torch::kInt32));
+
+    switch (input.scalar_type()) {
+    case torch::kInt32: {
+        // The kernel walks a raw pointer, so the data must be dense.
+        const auto denseInput = input.contiguous();
+        pmpp::ops::cuda::launchAlphabetHistogram<int32_t>(
+            denseInput.data_ptr<int32_t>(), histo.data_ptr<int32_t>(),
+            static_cast<int32_t>(nInputs),
+            static_cast<int32_t>(clampedDivider));
+        break;
+    }
+    default: {
+        AT_ERROR("Unsupported dtype: ", input.dtype());
+    }
+    }
+
+    return histo;
+}
+}  // namespace pmpp::ops::cuda::torch_impl
diff --git a/csrc/lib/ops/ops.hpp b/csrc/lib/ops/ops.hpp
@@ -1,7 +1,10 @@
 #pragma once
 
+#include "pmpp/types/cu_types.cuh"
 #include "pmpp/types/cxx_types.hpp"
 
+#include <array>
+
 namespace pmpp::ops::cpu
 {
 
@@ -16,6 +19,14 @@ template <typename ScalarT>
 void launchConv2d(const ScalarT* input, const ScalarT* kernel, ScalarT* output,
                   int32_t inHeight, int32_t inWidth, int32_t kernelSize);
 
+/**
+ * @brief Host-side 3D seven-coefficient stencil entry point.
+ *
+ * NOTE(review): no CPU definition is visible in this change; the parameter
+ * descriptions below mirror the CUDA overload — confirm once implemented.
+ *
+ * @param input  Source volume.
+ * @param output Destination volume.
+ * @param shape  Extent of the volume along x, y and z.
+ * @param coeffs The seven stencil coefficients.
+ */
+template <typename ScalarT>
+void launchStencil3d(const ScalarT* input, ScalarT* output, dim3 shape,
+                     const std::array<ScalarT, 7>& coeffs);
+
+/**
+ * @brief Count lowercase ASCII letters of `input` into grouped bins on the
+ *        CPU.
+ *
+ * @param input   Character codes to classify.
+ * @param histo   Bin counters, incremented in place.
+ * @param nInputs Number of elements in `input`.
+ * @param divider Number of consecutive letters sharing one bin.
+ */
+template <typename ScalarT>
+void launchAlphabetHistogram(const ScalarT* input, ScalarT* histo,
+                             int32_t nInputs, int32_t divider);
+
 }  // namespace pmpp::ops::cpu
 
 namespace pmpp::ops::cuda
@@ -34,4 +45,12 @@ void launchConv2d(const ScalarT* d_input, const ScalarT* d_kernel,
                   ScalarT* d_output, int32_t inputHeight, int32_t inputWidth,
                   int32_t kernelSize);
 
+/**
+ * @brief Launch the 3D seven-coefficient stencil kernel.
+ *
+ * @param input  Source volume, read by the kernel.
+ * @param output Destination volume, written by the kernel.
+ * @param shape  Extent of the volume along x, y and z.
+ * @param coeffs Coefficients applied, in order, to the centre, x-1, x+1,
+ *               y-1, y+1, z-1 and z+1 neighbours.
+ */
+template <typename ScalarT>
+void launchStencil3d(const ScalarT* input, ScalarT* output, dim3 shape,
+                     const std::array<ScalarT, 7>& coeffs);
+
+/**
+ * @brief Launch the CUDA kernel that counts lowercase ASCII letters of
+ *        `d_input` into grouped bins.
+ *
+ * @param d_input Character codes to classify; dereferenced on the device.
+ * @param d_histo Bin counters; atomically incremented on the device.
+ * @param nInputs Number of elements in `d_input`.
+ * @param divider Number of consecutive letters sharing one bin.
+ */
+template <typename ScalarT>
+void launchAlphabetHistogram(const ScalarT* d_input, ScalarT* d_histo,
+                             int32_t nInputs, int32_t divider);
+
 }  // namespace pmpp::ops::cuda
diff --git a/csrc/lib/ops/stencil/op.cu b/csrc/lib/ops/stencil/op.cu
@@ -0,0 +1,81 @@
+#include <cuda_runtime.h>
+
+#include "../ops.hpp"
+#include "pmpp/utils/address.hpp"
+#include "pmpp/utils/common.cuh"
+#include "pmpp/utils/math.hpp"
+
+namespace pmpp::ops::cuda
+{
+
+/**
+ * @brief Seven-coefficient 3D stencil that keeps three z-planes (previous,
+ *        current, next) of the input in shared memory and walks TILE_DIM.z
+ *        planes along z per block.
+ *
+ * Each thread maps to one (j, k) position, shifted by -1 relative to the
+ * block origin. Output is written only for interior points of the volume
+ * and only by threads that are not on the border of the tile.
+ *
+ * NOTE(review): `coeffs` is a reference parameter of a __global__ function
+ * and the launcher passes a host-side std::array — presumably this makes the
+ * device dereference host memory; confirm and consider passing by value.
+ * NOTE(review): the shared arrays are declared [TILE_DIM.x][TILE_DIM.y] but
+ * indexed [threadIdx.y][threadIdx.x]; this only matches when x == y.
+ * NOTE(review): threadIdx.z is never read, so with a block that has
+ * blockDim.z > 1 every z-layer of threads repeats the same work — confirm
+ * the intended block shape.
+ * NOTE(review): tile origins advance by TILE_DIM.x / TILE_DIM.y while only
+ * non-border threads write, so points on tile seams look uncovered —
+ * confirm whether the origin stride should be TILE_DIM - 2.
+ *
+ * @param input  Source volume, indexed as (z, y, x).
+ * @param output Destination volume, same indexing.
+ * @param shape  Extent of the volume along x, y and z.
+ * @param coeffs Coefficients for centre, x-1, x+1, y-1, y+1, z-1, z+1.
+ */
+template <typename ScalarT, dim3 TILE_DIM>
+__global__ void stencilKernel(const ScalarT* input, ScalarT* output,
+                              dim3 shape, const std::array<ScalarT, 7>& coeffs)
+{
+    // First z-plane handled by this block, and this thread's y / x
+    // coordinates (shifted by one).
+    int32_t iStart = blockIdx.z * TILE_DIM.z;
+    int32_t j = blockIdx.y * TILE_DIM.y + threadIdx.y - 1;
+    int32_t k = blockIdx.x * TILE_DIM.x + threadIdx.x - 1;
+
+    __shared__ ScalarT inPrev_s[TILE_DIM.x][TILE_DIM.y];
+    __shared__ ScalarT inCurr_s[TILE_DIM.x][TILE_DIM.y];
+    __shared__ ScalarT inNext_s[TILE_DIM.x][TILE_DIM.y];
+
+    // Preload plane iStart - 1 when it lies inside the volume
+    if (iStart - 1 >= 0 && iStart - 1 < shape.z && j >= 0 && j < shape.y &&
+        k >= 0 && k < shape.x) {
+        inPrev_s[threadIdx.y][threadIdx.x] = input[offset<uint32_t>(
+            iStart - 1, j, k, shape.z, shape.y, shape.x)];
+    }
+
+    // Preload plane iStart
+    if (iStart >= 0 && iStart < shape.z && j >= 0 && j < shape.y && k >= 0 &&
+        k < shape.x) {
+        inCurr_s[threadIdx.y][threadIdx.x] =
+            input[offset<uint32_t>(iStart, j, k, shape.z, shape.y, shape.x)];
+    }
+
+    for (int32_t i = iStart; i < iStart + TILE_DIM.z; ++i) {
+        // Fetch plane i + 1 before computing plane i
+        if (i + 1 >= 0 && i + 1 < shape.z && j >= 0 && j < shape.y && k >= 0 &&
+            k < shape.x) {
+            inNext_s[threadIdx.y][threadIdx.x] = input[offset<uint32_t>(
+                i + 1, j, k, shape.z, shape.y, shape.x)];
+        }
+        // All three planes must be in shared memory before neighbours are
+        // read.
+        __syncthreads();
+        if (i >= 1 && i < shape.z - 1 && j >= 1 && j < shape.y - 1 && k >= 1 &&
+            k < shape.x - 1) {
+            if (threadIdx.y >= 1 && threadIdx.y < TILE_DIM.y - 1 &&
+                threadIdx.x >= 1 && threadIdx.x < TILE_DIM.x - 1) {
+                output[offset<uint32_t>(i, j, k, shape.z, shape.y, shape.x)] =
+                    coeffs[0] * inCurr_s[threadIdx.y][threadIdx.x] +
+                    coeffs[1] * inCurr_s[threadIdx.y][threadIdx.x - 1] +
+                    coeffs[2] * inCurr_s[threadIdx.y][threadIdx.x + 1] +
+                    coeffs[3] * inCurr_s[threadIdx.y - 1][threadIdx.x] +
+                    coeffs[4] * inCurr_s[threadIdx.y + 1][threadIdx.x] +
+                    coeffs[5] * inPrev_s[threadIdx.y][threadIdx.x] +
+                    coeffs[6] * inNext_s[threadIdx.y][threadIdx.x];
+            }
+        }
+        // Neighbour reads must finish before the planes are rotated
+        __syncthreads();
+        // Rotate: current becomes previous, next becomes current
+        inPrev_s[threadIdx.y][threadIdx.x] =
+            inCurr_s[threadIdx.y][threadIdx.x];
+        inCurr_s[threadIdx.y][threadIdx.x] =
+            inNext_s[threadIdx.y][threadIdx.x];
+    }
+}
+
+/**
+ * @brief Launch stencilKernel for fp32 data with an 8x8x8 block and a grid
+ *        of ceilDiv(shape, 8) blocks per axis.
+ *
+ * NOTE(review): `coeffs` is forwarded to a kernel parameter declared as a
+ * reference; presumably the device then reads host memory — confirm.
+ * NOTE(review): the kernel does not read threadIdx.z, so the z extent of the
+ * block looks redundant — confirm the intended block shape.
+ *
+ * @param input  Source volume passed to the kernel.
+ * @param output Destination volume passed to the kernel.
+ * @param shape  Extent of the volume along x, y and z.
+ * @param coeffs The seven stencil coefficients.
+ */
+template <>
+void launchStencil3d(const fp32_t* input, fp32_t* output, dim3 shape,
+                     const std::array<fp32_t, 7>& coeffs)
+{
+    constexpr dim3 BLOCK_DIM = {8, 8, 8};
+    dim3 gridDim = {ceilDiv(shape.x, BLOCK_DIM.x),
+                    ceilDiv(shape.y, BLOCK_DIM.y),
+                    ceilDiv(shape.z, BLOCK_DIM.z)};
+    
+    
+
+    stencilKernel<fp32_t, BLOCK_DIM>
+        <<<gridDim, BLOCK_DIM>>>(input, output, shape, coeffs);
+
+    // Surfaces launch-configuration errors (debug builds only, judging by
+    // the macro name)
+    PMPP_DEBUG_CUDA_ERR_CHECK(cudaGetLastError());
+}
+
+}  // namespace pmpp::ops::cuda
diff --git a/csrc/lib/ops/torch_bind.cpp b/csrc/lib/ops/torch_bind.cpp
@@ -11,6 +11,7 @@ TORCH_LIBRARY(pmpp, m)
     m.def("cvt_rgb_to_gray(Tensor img) -> Tensor");
     m.def("matmul(Tensor A, Tensor B) -> Tensor");
     m.def("conv2d(Tensor input, Tensor kernel) -> Tensor");
+    m.def("alphabet_histogram(Tensor input, int divider) -> Tensor");
 }
 
 // Register the implementations.
@@ -22,6 +23,8 @@ TORCH_LIBRARY_IMPL(pmpp, CPU, m)
     m.impl("cvt_rgb_to_gray", &pmpp::ops::cpu::torch_impl::cvtRGBtoGray);
     m.impl("matmul", &pmpp::ops::cpu::torch_impl::matmul);
     m.impl("conv2d", &pmpp::ops::cpu::torch_impl::conv2d);
+    m.impl("alphabet_histogram",
+           &pmpp::ops::cpu::torch_impl::alphabetHistogram);
 }
 
 TORCH_LIBRARY_IMPL(pmpp, CUDA, m)
@@ -30,4 +33,6 @@ TORCH_LIBRARY_IMPL(pmpp, CUDA, m)
     m.impl("cvt_rgb_to_gray", &pmpp::ops::cuda::torch_impl::cvtRGBtoGray);
     m.impl("matmul", &pmpp::ops::cuda::torch_impl::matmul);
     m.impl("conv2d", &pmpp::ops::cuda::torch_impl::conv2d);
+    m.impl("alphabet_histogram",
+           &pmpp::ops::cuda::torch_impl::alphabetHistogram);
 }
diff --git a/csrc/lib/ops/torch_impl.hpp b/csrc/lib/ops/torch_impl.hpp
@@ -15,6 +15,9 @@ auto matmul(const torch::Tensor& A, const torch::Tensor& B) -> torch::Tensor;
 auto conv2d(const torch::Tensor& input, const torch::Tensor& kernel)
     -> torch::Tensor;
 
+/// CPU backend of `pmpp::alphabet_histogram`: counts lowercase ASCII letters
+/// of an int32 `input` into bins of `divider` consecutive letters and
+/// returns the counts as an int32 tensor.
+auto alphabetHistogram(const torch::Tensor& input, int64_t divider)
+    -> torch::Tensor;
+
 }  // namespace pmpp::ops::cpu::torch_impl
 
 namespace pmpp::ops::cuda::torch_impl
@@ -30,4 +33,7 @@ auto matmul(const torch::Tensor& A, const torch::Tensor& B) -> torch::Tensor;
 auto conv2d(const torch::Tensor& input, const torch::Tensor& kernel)
     -> torch::Tensor;
 
+/// CUDA backend of `pmpp::alphabet_histogram`: counts lowercase ASCII letters
+/// of an int32 `input` into bins of `divider` consecutive letters and
+/// returns the counts as an int32 tensor.
+auto alphabetHistogram(const torch::Tensor& input, int64_t divider)
+    -> torch::Tensor;
+
 }  // namespace pmpp::ops::cuda::torch_impl
diff --git a/csrc/test/OpTest/AlphabetHistogram.cpp b/csrc/test/OpTest/AlphabetHistogram.cpp

Original file line number	Diff line number	Diff line change
`@@ -14,12 +14,16 @@ namespace pmpp`
`14`	`14`	`{`
`15`	`15`	`/**`
`16`	`16`	`* @brief Compute the offset of a multi-dimensional array.`
	`17`	`+ * A typical use case is that if you have rowIdx, colIdx, nRows and nCols,`
	`18`	`+ * to calculate the linear index of the element at (rowIdx, colIdx), you can`
	`19`	`+ * use this function as follows:`
	`20`	`+ * > offset(rowIdx, colIdx, nRows, nCols)`
`17`	`21`	`*`
`18`	`22`	`* @param args First half is the indices, second half is the size of each`
`19`	`23`	`* dimension.`
`20`	`24`	`* @return std::uint32_t The offset of the multi-dimensional array.`
`21`	`25`	`*`
`22`		`- * @example`
	`26`	`+ * @example`
`23`	`27`	`* 1. To calculate the offset of idx (2, 1) in a 2D array of dim (4, 3):`
`24`	`28`	`* > offset(2, 1, 4, 3) -> 1*1 + 2*3 = 7`
`25`	`29`	`* 2. To calculate the offset of idx (1, 2, 3) in a 3D array of dim`