
Commit a66eeab

bobboli and Tom-Zheng authored
[TRTLLM-9805][feat] Skip Softmax Attention. (NVIDIA#9821)
Signed-off-by: Bo Li <[email protected]>
Signed-off-by: Tian Zheng <[email protected]>
Co-authored-by: Tian Zheng <[email protected]>
1 parent dcd3f7b commit a66eeab

2,967 files changed (+10,152, -5,178 lines)


cpp/CMakeLists.txt

Lines changed: 6 additions & 0 deletions
@@ -68,6 +68,7 @@ option(USING_OSS_CUTLASS_MOE_GEMM "Using open sourced Cutlass moe gemm kernel"
        ON)
 option(USING_OSS_CUTLASS_ALLREDUCE_GEMM
        "Using open sourced Cutlass AR gemm kernel" ON)
+option(SKIP_SOFTMAX_STAT "Enable Statistics of Skip-Softmax" OFF)

 message(STATUS "ENABLE_NVSHMEM is ${ENABLE_NVSHMEM}")

@@ -360,6 +361,11 @@ else()
                       $<$<COMPILE_LANGUAGE:CUDA>:ENABLE_NVSHMEM=0>)
 endif()

+if(SKIP_SOFTMAX_STAT)
+  add_compile_definitions("SKIP_SOFTMAX_STAT")
+  message(STATUS "SKIP_SOFTMAX_STAT is enabled")
+endif()
+
 # Fix linking issue with TRT 10, the detailed description about `--mcmodel` can
 # be found in
 # https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html#index-mcmodel_003dmedium-1

cpp/kernels/fmha_v2/Makefile

Lines changed: 5 additions & 0 deletions
@@ -69,6 +69,11 @@ PREPROCESSOR_FLAGS += -DUSE_SAME_SUM_ORDER_IN_SOFTMAX_AS_REF_CODE
 # Do we want to use half accumulation for flash attention
 PREPROCESSOR_FLAGS += -DHALF_ACCUMULATION_FOR_FLASH_ATTENTION

+# Print the resulting sparsity for a given threshold in Skip-Softmax attention.
+# Note: inside TRT-LLM you only need "python scripts/build_wheel.py -D SKIP_SOFTMAX_STAT=ON ..." to use it.
+# Turn this on manually only if you want to build and run the unit test (bin/fmha.exe) with SKIP_SOFTMAX_STAT.
+# PREPROCESSOR_FLAGS += -DSKIP_SOFTMAX_STAT
+
 # Add FLAGS when generating cubins.
 ifdef GENERATE_CUBIN
 PREPROCESSOR_FLAGS += -DGENERATE_CUBIN
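The SKIP_SOFTMAX_STAT path only accumulates two counters per launch, the total number of softmax tiles and the number of skipped tiles (the compute.h hunk below atomicAdds them into params.skip_softmax_total_blocks and params.skip_softmax_skipped_blocks). A minimal host-side sketch of turning those counters into the reported sparsity follows; the function name, the device-pointer arguments, and the copy-back plumbing are illustrative assumptions, not the actual TRT-LLM reporting code.

// Illustrative sketch only: reads back two device counters that the kernel
// filled via atomicAdd(), then prints the skipped-tile ratio ("sparsity").
// The pointers are hypothetical stand-ins for the buffers behind
// params.skip_softmax_total_blocks / params.skip_softmax_skipped_blocks.
#include <cstdint>
#include <cstdio>
#include <cuda_runtime.h>

void report_skip_softmax_sparsity(uint32_t const* d_total_blocks, uint32_t const* d_skipped_blocks)
{
    uint32_t total = 0, skipped = 0;
    cudaMemcpy(&total, d_total_blocks, sizeof(uint32_t), cudaMemcpyDeviceToHost);
    cudaMemcpy(&skipped, d_skipped_blocks, sizeof(uint32_t), cudaMemcpyDeviceToHost);
    if (total > 0)
    {
        printf("skip-softmax sparsity: %.2f%% (%u / %u tiles skipped)\n", 100.0 * skipped / total, skipped, total);
    }
}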

cpp/kernels/fmha_v2/setup.py

Lines changed: 85 additions & 33 deletions
Large diffs are not rendered by default.

cpp/kernels/fmha_v2/src/fmha/warpspec/compute.h

Lines changed: 43 additions & 4 deletions
@@ -256,7 +256,8 @@ struct Compute
             actual_kv_seqlen, alibi_head_scale, \
             USE_CUSTOM_MASK ? (head_info.mask_sum_s + q_step_idx * STEP_Q + local_q_tile_offset) \
                             : (q_step_idx * STEP_Q + head_info.q_tile_offset), \
-            kv_step_idx * STEP_KV, sage_scale_row, cbr, cbr_v, mutex_accessor, kv_step_idx == kv_idx_end - 1);
+            kv_step_idx * STEP_KV, sage_scale_row, cbr, cbr_v, mutex_accessor, \
+            &shared->skip_softmax_votes[kv_step_idx & 1][warpgroup_id], kv_step_idx == kv_idx_end - 1);

     ////////////////////////////////////////////////////////////////////////////////////////////////

@@ -360,6 +361,12 @@ struct Compute
         // Contiguous QKV FMHA assumes q, and kv have the same sequence length.
         int const actual_kv_seqlen = SEPARATE_Q_KV_BUFFER ? head_info.actual_kv_seqlen : actual_q_seqlen;

+        // Update the threshold of Skip-Softmax.
+        if constexpr (Kernel_traits::ENABLE_SKIP_SOFTMAX)
+        {
+            softmax.skip_softmax_threshold = params.skip_softmax_threshold_scale_factor / actual_kv_seqlen;
+        }
+
         // Calculate the alibi head_scaling_factor.
         float alibi_head_scale
             = APPLY_ALIBI ? get_alibi_head_scaling_factor<AlibiParams>(head_info.bidh, params.alibi_params) : 0.f;
@@ -513,6 +520,13 @@ struct Compute
                 }
             }
         }
+#ifdef SKIP_SOFTMAX_STAT
+        if (tidx == 0)
+        {
+            atomicAdd(params.skip_softmax_total_blocks, softmax.total_blocks);
+            atomicAdd(params.skip_softmax_skipped_blocks, softmax.skipped_blocks);
+        }
+#endif
     }

     ////////////////////////////////////////////////////////////////////////////////////////////////
@@ -522,8 +536,15 @@ struct Compute
         Compute_tile_o& ctile_o, float (&p_max)[Mma_tile_p::CORES_M], float (&p_sum)[Mma_tile_p::CORES_M],
         int const tidx, int const actual_kv_seqlen, float const alibi_head_scale, int const row_offset,
         int const col_offset, int const sage_scale_row, Circular_buffer_q_reader& cbr, Circular_buffer_kv_reader& cbr_v,
-        OrderedMutexAccessor& mutex, bool complete = false)
+        OrderedMutexAccessor& mutex, uint32_t* skip_softmax_vote, bool complete = false)
     {
+
+        // Skip-softmax vote initialization.
+        if (tidx == 0)
+        {
+            // Note that we need a named_barrier_wait in compute_single_tile to make sure the init happens before voting.
+            *skip_softmax_vote = 1;
+        }
 // load the scales of K/V from global memory
 #define LOAD_SCALES_KV(dst, which, blocks_per_step, block_size) \
     if constexpr (block_size > 0) \
@@ -557,6 +578,10 @@ struct Compute
         // Ctile_p is only used once by each n step.
         ctile_p.clear();

+        // If skip_softmax is enabled, make sure there is no racing between the initialization and writing of
+        // skip_softmax_vote.
+        named_barrier_wait(Kernel_traits::SKIP_SOFTMAX_BARRIER_ID + threadIdx.x / 128, 128);
+
         // BMM1 (Q x K').
         warpgroup_arrive();

@@ -626,8 +651,22 @@ struct Compute
         softmax.apply_alibi_and_mask<APPLY_MASK>(
             ctile_p, params.alibi_params, alibi_head_scale, actual_kv_seqlen, row_offset, col_offset);

-        // Softmax Exp, max/sum, and update scales.
-        softmax.compute_and_update_scale<IS_FIRST_COL>(p_max, p_sum);
+        // Softmax Exp, max/sum, and update scales. If it returns false, we skip the rest.
+        if (!softmax.compute_and_update_scale<IS_FIRST_COL>(p_max, p_sum, skip_softmax_vote))
+        {
+            if constexpr (ENABLE_MUTEX && Kernel_traits::ELEMENT_BYTES == 1)
+            {
+                // Notify the other warpgroup to execute QGMMA.
+                mutex.named_bar_arrive();
+            }
+            // Need to wait for V, otherwise compute-sanitizer synccheck will fail.
+            int ready2 = cbr_v.peek();
+            if (!ready2)
+            {
+                cbr_v.wait();
+            }
+            return;
+        }

         // experiments show that here is the best place to load scales of V
         float scales_v[SAGE_BLOCKS_PER_STEP_V];
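The hunks above wire the skip decision through compute_single_tile: thread 0 resets the shared vote word to 1, a named barrier orders that reset against the voting, and a false return from compute_and_update_scale lets the warpgroup drop the softmax and BMM2 work for the tile (while still releasing the mutex and waiting for the V buffer so the pipeline stays consistent). The host-side sketch below models only the unanimous-vote part of that flow, assuming a 4-warp warpgroup; the kernel uses atomicAnd on a shared-memory word plus named_barrier_wait rather than a loop.

// Illustrative host-side model of the per-tile skip vote.
#include <cstdint>
#include <cstdio>
#include <vector>

// One entry per warp in the warpgroup: true if every row handled by that warp
// satisfies exp(local_max - global_max) < skip_softmax_threshold.
bool warpgroup_skips_tile(std::vector<bool> const& per_warp_skip)
{
    uint32_t vote = 1; // models "*skip_softmax_vote = 1;" done by thread 0 before voting
    for (bool warp_skip : per_warp_skip)
    {
        vote &= uint32_t(warp_skip); // models atomicAnd(skip_softmax_vote, uint32_t(skip));
    }
    // After the named barrier, every thread reads the same value back.
    return vote != 0;
}

int main()
{
    // The tile is skipped only if all warps agree; one dissenting warp keeps it.
    printf("%d\n", warpgroup_skips_tile({true, true, true, true}));  // 1: skip softmax + BMM2
    printf("%d\n", warpgroup_skips_tile({true, false, true, true})); // 0: fall through
    return 0;
}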

cpp/kernels/fmha_v2/src/fmha/warpspec/epilogue.h

Lines changed: 138 additions & 20 deletions
@@ -17,6 +17,8 @@

 #pragma once

+#include "fmha/hopper/arrive_wait.h"
+
 #include <fmha/softmax.h>
 #include <fmha/traits.h>
 #include <fmha/utils.h>
@@ -104,6 +106,12 @@ struct Softmax_base
         CHECK_IF_NEG_INF_EXISTS = SLIDING_OR_CHUNKED_ATTENTION || USE_CUSTOM_MASK
     };

+    // There are 2 warpgroups, so barriers 0x3 and 0x4 are used.
+    enum
+    {
+        SKIP_SOFTMAX_BARRIER = Kernel_traits::SKIP_SOFTMAX_BARRIER_ID
+    };
+
     // Ctor.
     template <typename Params>
     inline __device__ Softmax_base(Params params, int tidx)
@@ -114,6 +122,11 @@ struct Softmax_base
         , log2_chunked_attention_size_(params.log2_chunked_attention_size)
         , packed_mask_ptr_{reinterpret_cast<uint32_t*>(params.packed_mask_ptr)}
         , params_packed_mask_stride_in_bytes_{params.packed_mask_stride_in_bytes}
+#ifdef SKIP_SOFTMAX_STAT
+        , total_blocks(0)
+        , skipped_blocks(0)
+#endif
+        , skip_softmax_threshold(0)
     {

         int warp = tidx / 32;
@@ -330,31 +343,79 @@ struct Softmax_base
     }

     // Calculate max/sum, and update flash-attention scales.
+    // Returns false if the tile is skipped by the skip-softmax attention feature.
     template <bool IS_FIRST_COL>
-    inline __device__ void compute_and_update_scale(
-        float (&global_max)[Mma_tile_p::CORES_M], float (&global_sum)[Mma_tile_p::CORES_M])
+    inline __device__ bool compute_and_update_scale(
+        float (&global_max)[Mma_tile_p::CORES_M], float (&global_sum)[Mma_tile_p::CORES_M], uint32_t* skip_softmax_vote)
     {
         float const scale = reinterpret_cast<float const&>(scale_bmm1_);

+        // Whether this warpgroup skips the softmax.
+        constexpr bool may_skip = Kernel_traits::ENABLE_SKIP_SOFTMAX && !IS_FIRST_COL;
+        bool skip = may_skip;
+
         // Row-wise max of current tile.
 #pragma unroll
         for (int mi = 0; mi < Mma_tile_p::CORES_M; mi++)
         {
-            if (IS_FIRST_COL)
-            {
-                local_max_[mi] = elt_[mi][0];
-            }
-            else
-            {
-                local_max_[mi] = fmaxf(global_max[mi], elt_[mi][0]);
-            }
+            local_max_[mi] = elt_[mi][0];
 #pragma unroll
             for (int ni = 1; ni < Mma_tile_p::CORES_N * 2; ni++)
             {
                 local_max_[mi] = fmaxf(local_max_[mi], elt_[mi][ni]);
             }
             local_max_[mi] = fmaxf(__shfl_xor_sync(uint32_t(-1), local_max_[mi], 1), local_max_[mi]);
             local_max_[mi] = fmaxf(__shfl_xor_sync(uint32_t(-1), local_max_[mi], 2), local_max_[mi]);
+
+            if constexpr (may_skip)
+            {
+                // AND(&) the CORES_M results, then `skip` means whether to skip
+                // the CORES_M(=2) rows.
+                if constexpr (!EXP2F_OPTIMIZATION)
+                {
+                    skip &= expf(local_max_[mi] - global_max[mi]) < skip_softmax_threshold;
+                }
+                else
+                {
+                    skip &= exp2f((local_max_[mi] - global_max[mi]) * scale) < skip_softmax_threshold;
+                }
+            }
+
+            if (!IS_FIRST_COL)
+            {
+                local_max_[mi] = fmaxf(local_max_[mi], global_max[mi]);
+            }
+        }
+
+        if constexpr (Kernel_traits::ENABLE_SKIP_SOFTMAX)
+        {
+#ifdef SKIP_SOFTMAX_STAT
+            total_blocks++;
+#endif
+            if constexpr (may_skip)
+            {
+                // AND(&) the results together in a warp, then `skip` means whether to skip
+                // all the 16 rows managed by this warp.
+                // Each group of 4 threads (e.g. T0~T3) has the same `skip`, so 0x11111111 would be enough
+                // instead of 0xffffffff, but the perf is the same.
+                skip = __all_sync(0xffffffff, skip);
+                if (threadIdx.x % 32 == 0)
+                {
+                    // The leader of each warp votes.
+                    atomicAnd(skip_softmax_vote, uint32_t(skip));
+                }
+                // WG0 uses the 0x3 barrier, WG1 uses the 0x4 barrier.
+                named_barrier_wait(SKIP_SOFTMAX_BARRIER + threadIdx.x / 128, 128);
+                skip = *((uint32_t volatile*) skip_softmax_vote);
+                if (skip)
+                {
+#ifdef SKIP_SOFTMAX_STAT
+                    skipped_blocks++;
+#endif
+                    return false;
+                }
+            }
         }

         // Softmax Exp.
@@ -436,6 +497,7 @@ struct Softmax_base
                 global_max[mi] = max_new;
             }
         }
+        return true;
     }

     // Update flash attention scales and pack elements for BMM2.
@@ -513,6 +575,13 @@ struct Softmax_base
     float correction_[Mma_tile_p::CORES_M];
     // The packed mask.
     uint4 packed_mask_;
+    // Skip softmax when exp(local_max - global_max) < skip_softmax_threshold.
+    float skip_softmax_threshold;
+#ifdef SKIP_SOFTMAX_STAT
+    // Statistics of skip-softmax.
+    uint32_t total_blocks;
+    uint32_t skipped_blocks;
+#endif
 };

 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -868,35 +937,83 @@ struct Softmax<Hopper_qgmma_e4m3_fp32_traits, Kernel_traits>
     }

     // Calculate max/sum, and update flash-attention scales.
+    // Returns false if the tile is skipped by the skip-softmax attention feature.
     template <bool IS_FIRST_COL>
-    inline __device__ void compute_and_update_scale(
-        float (&global_max)[Mma_tile_p::CORES_M], float (&global_sum)[Mma_tile_p::CORES_M])
+    inline __device__ bool compute_and_update_scale(
+        float (&global_max)[Mma_tile_p::CORES_M], float (&global_sum)[Mma_tile_p::CORES_M], uint32_t* skip_softmax_vote)
     {
         float const scale = reinterpret_cast<float const&>(this->scale_bmm1_);
         float(&local_max_)[Mma_tile_p::CORES_M] = this->local_max_;
         float(&local_sum_)[Mma_tile_p::CORES_M] = this->local_sum_;
         float(&correction_)[Mma_tile_p::CORES_M] = this->correction_;
         float(&elt_)[Mma_tile_p::CORES_M][Mma_tile_p::CORES_N * 2] = this->elt_;

+        // Whether this warpgroup skips the softmax.
+        constexpr bool may_skip = Kernel_traits::ENABLE_SKIP_SOFTMAX && !IS_FIRST_COL;
+        bool skip = may_skip;
+
         // Row-wise max of current tile.
 #pragma unroll
         for (int mi = 0; mi < Mma_tile_p::CORES_M; mi++)
         {
-            if (IS_FIRST_COL)
-            {
-                local_max_[mi] = elt_[mi][0];
-            }
-            else
-            {
-                local_max_[mi] = fmaxf(global_max[mi], elt_[mi][0]);
-            }
+            local_max_[mi] = elt_[mi][0];
 #pragma unroll
             for (int ni = 1; ni < Mma_tile_p::CORES_N * 2; ni++)
             {
                 local_max_[mi] = fmaxf(local_max_[mi], elt_[mi][ni]);
             }
             local_max_[mi] = fmaxf(__shfl_xor_sync(uint32_t(-1), local_max_[mi], 1), local_max_[mi]);
             local_max_[mi] = fmaxf(__shfl_xor_sync(uint32_t(-1), local_max_[mi], 2), local_max_[mi]);
+            if constexpr (may_skip)
+            {
+                // AND(&) the CORES_M results, then `skip` means whether to skip
+                // the CORES_M(=2) rows.
+                if constexpr (!EXP2F_OPTIMIZATION)
+                {
+                    skip &= expf(local_max_[mi] - global_max[mi]) < this->skip_softmax_threshold;
+                }
+                else
+                {
+                    skip &= exp2f((local_max_[mi] - global_max[mi]) * scale) < this->skip_softmax_threshold;
+                }
+            }
+            if (!IS_FIRST_COL)
+            {
+                local_max_[mi] = fmaxf(local_max_[mi], global_max[mi]);
+            }
+        }
+
+        if constexpr (Kernel_traits::ENABLE_SKIP_SOFTMAX)
+        {
+#ifdef SKIP_SOFTMAX_STAT
+            this->total_blocks++;
+#endif
+
+            if constexpr (may_skip)
+            {
+                // AND(&) the results together in a warp, then `skip` means whether to skip
+                // all the 16 rows managed by this warp.
+                // Each group of 4 threads (e.g. T0~T3) has the same `skip`, so 0x11111111 would be enough
+                // instead of 0xffffffff, but the perf is the same.
+                skip = __all_sync(0xffffffff, skip);
+                if (threadIdx.x % 32 == 0)
+                {
+                    // The leader of each warp votes.
+                    atomicAnd(skip_softmax_vote, uint32_t(skip));
+                }
+                // WG0 uses the 0x3 barrier, WG1 uses the 0x4 barrier.
+                named_barrier_wait(Base::SKIP_SOFTMAX_BARRIER + threadIdx.x / 128, 128);
+                skip = *((uint32_t volatile*) skip_softmax_vote);
+                if (skip)
+                {
+#ifdef SKIP_SOFTMAX_STAT
+                    this->skipped_blocks++;
+#endif
+                    return false;
+                }
+            }
         }

         // Softmax Exp.
@@ -987,6 +1104,7 @@ struct Softmax<Hopper_qgmma_e4m3_fp32_traits, Kernel_traits>
                 global_max[mi] = max_new;
             }
         }
+        return true;
     }

     // Update flash attention scales and pack elements for BMM2.
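The criterion that feeds the vote is purely row-local: a KV tile can be dropped for a row only when exp(local_max - global_max) is already below skip_softmax_threshold, i.e. when every score in the tile is so far under the running max that its exponentials cannot materially change the flash-attention running sum. A small numerical sketch of that check follows; the scale-factor value is an arbitrary example, whereas in the kernel it comes from params.skip_softmax_threshold_scale_factor and is divided by the actual KV sequence length, as in the compute.h hunk above.

// Minimal numerical sketch of the skip criterion used in compute_and_update_scale().
// The scale factor below is a made-up example value, not a recommended setting.
#include <cmath>
#include <cstdio>

// A KV tile is a skip candidate for one row when the largest new score is so far
// below the running max that its exponentials are negligible in the running sum.
bool row_may_skip(float local_max, float global_max, float threshold)
{
    return std::exp(local_max - global_max) < threshold;
}

int main()
{
    int const actual_kv_seqlen = 8192;
    float const scale_factor = 1e-3f; // hypothetical params.skip_softmax_threshold_scale_factor
    float const threshold = scale_factor / actual_kv_seqlen; // as set in compute.h

    printf("%d\n", row_may_skip(/*local_max=*/-10.0f, /*global_max=*/10.0f, threshold)); // 1: negligible tile
    printf("%d\n", row_may_skip(/*local_max=*/9.5f, /*global_max=*/10.0f, threshold));   // 0: must be processed
    return 0;
}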
