CUDA cumsum: chunked block-level scan optimizations + big case performance tests

pwilkin · pwilkin · commit df917ccf2487 · 2025-12-02T14:29:56.000+01:00
diff --git a/ggml/src/ggml-cuda/cumsum.cu b/ggml/src/ggml-cuda/cumsum.cu
@@ -17,53 +17,72 @@ static __global__ void cumsum_kernel(
     const int64_t nb00, const int64_t nb01, const int64_t nb02, const int64_t nb03,
     const int64_t nb0,  const int64_t nb1,  const int64_t nb2,  const int64_t nb3) {
 
-    // Shared memory to store warp sums (always use float for accumulation)
-    extern __shared__ float shmem[];
+    const int tid = threadIdx.x;
+    const int lane = tid & (WARP_SIZE - 1);
+    const int warp = tid / WARP_SIZE;
+    const int warps_per_block = blockDim.x / WARP_SIZE;
+
+    extern __shared__ float smem[];
+    float* s_vals = smem;
+    float* s_warp_sums = smem + blockDim.x;
+    float* s_carry = smem + blockDim.x + warps_per_block;
+    float* s_chunk_total = s_carry + 1;
+
+    // Initialize carry
+    if (tid == 0) {
+        *s_carry = 0.0f;
+    }
+    __syncthreads();
 
     const int64_t i3 = blockIdx.z;
     const int64_t i2 = blockIdx.y;
     const int64_t i1 = blockIdx.x;
-
     if (i3 >= ne03 || i2 >= ne02 || i1 >= ne01) {
         return;
     }
 
-    const T * src_row = src + i1 * nb01 + i2*nb02 + i3*nb03;
-    T       * dst_row = dst + i1 * nb1  + i2*nb2  + i3*nb3;
+    const T * src_row = src + i1 * nb01 + i2 * nb02 + i3 * nb03;
+    T       * dst_row = dst + i1 * nb1  + i2 * nb2  + i3 * nb3;
 
-    const int tid = threadIdx.x;
-    const int lane_id = tid % WARP_SIZE;
-
-    if (tid >= ne00) {
-        return;
-    }
+    for (int64_t start = 0; start < ne00; start += blockDim.x) {
+        int64_t idx = start + tid;
+        float val = (idx < ne00) ? static_cast<float>(src_row[idx]) : 0.0f;
 
-    // Phase 1: Each thread processes elements at stride blockDim.x
-    // Compute warp-level prefix sums
-    for (int64_t i0 = tid; i0 < ne00; i0 += blockDim.x) {
-        // Load value and compute prefix sum within warp
-        float val = static_cast<float>(src_row[i0]);
+        // 1. Warp inclusive scan
         val = warp_prefix_inclusive_sum(val);
-        dst_row[i0] = static_cast<T>(val);
+        s_vals[tid] = val;
 
-        // Last thread of warp stores its sum to shared memory at position based on data index
-        if (lane_id == WARP_SIZE - 1 || i0 == ne00 - 1) {
-            const int shmem_idx = i0 / WARP_SIZE;
-            shmem[shmem_idx] = val;
+        // Store warp total
+        if (lane == WARP_SIZE - 1) {
+            s_warp_sums[warp] = val;
         }
-    }
+        __syncthreads();
+
+        // 2. Exclusive scan of warp sums (warp 0 only)
+        if (warp == 0) {
+            float w = (tid < warps_per_block) ? s_warp_sums[tid] : 0.0f;
+            float inc = warp_prefix_inclusive_sum(w);
+            if (tid < warps_per_block) {
+                s_warp_sums[tid] = inc - w;   // exclusive sum
+            }
+            if (tid == warps_per_block - 1) {
+                *s_chunk_total = inc;          // total sum of this chunk
+            }
+        }
+        __syncthreads();
 
-    // Sync once after all warp prefix sums are computed
-    __syncthreads();
+        float carry = *s_carry;
+        float final_val = s_vals[tid] + s_warp_sums[warp] + carry;
+        if (idx < ne00) {
+            dst_row[idx] = static_cast<T>(final_val);
+        }
+        __syncthreads();
 
-    // Phase 2: Add the sum of all preceding warp groups to each element
-    for (int64_t i0 = tid; i0 < ne00; i0 += blockDim.x) {
-        const int shmem_idx = i0 / WARP_SIZE;
-        float sum = 0.0f;
-        for (int j = 0; j < shmem_idx; ++j) {
-            sum += shmem[j];
+        // Update carry for next chunk
+        if (tid == 0) {
+            *s_carry += *s_chunk_total;
         }
-        dst_row[i0] = static_cast<T>(static_cast<float>(dst_row[i0]) + sum);
+        __syncthreads();
     }
 }
 
@@ -76,15 +95,13 @@ static void cumsum_cuda(
     cudaStream_t stream) {
 
     dim3 grid_dims(ne01, ne02, ne03);
-
-    // Shared memory size: one float per warp
     const int num_warps = (ne00 + WARP_SIZE - 1) / WARP_SIZE;
-    const size_t shmem_size = num_warps * sizeof(float);
-    const size_t type_size = sizeof(T);
-
     int block_size = num_warps * WARP_SIZE;
     block_size = std::min(block_size, CUDA_CUMSUM_BLOCK_SIZE);
     dim3 block_dims(block_size, 1, 1);
+    const int warps_per_block = block_size / WARP_SIZE;
+    const size_t shmem_size = (block_size + warps_per_block + 2) * sizeof(float);
+    const size_t type_size = sizeof(T);
 
     cumsum_kernel<<<grid_dims, block_dims, shmem_size, stream>>>(
         src, dst,
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
@@ -7942,7 +7942,8 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
     test_cases.emplace_back(new test_tri(GGML_TRI_TYPE_UPPER_DIAG, GGML_TYPE_F32, { 1024, 1024, 8, 4 }));
 
     test_cases.emplace_back(new test_cumsum(GGML_TYPE_F32, { 128, 128, 4, 1 }));
-    test_cases.emplace_back(new test_cumsum(GGML_TYPE_F32, { 128, 128, 4, 1 }));
+    test_cases.emplace_back(new test_cumsum(GGML_TYPE_F32, { 2048, 16, 5, 4 }));
+    test_cases.emplace_back(new test_cumsum(GGML_TYPE_F32, { 20000, 10, 4, 1 }));
 
     for (int bs : {1, 2, 3, 4, 5, 8, 512}) {
         for (ggml_type type_a : all_types) {