Revert "[CUDA] Use GEMM with epilogue instead of AddMM (#2569)"

zcbenz · web-flow · commit 5e5fe4ac3b76 · 2025-09-09T20:29:00.000+09:00
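This reverts commit dde3682.

Summary of the revert: `CublasGemm::set_bias` and the `gemm_and_bias` helper
are removed, so `AddMM::eval_gpu` no longer dispatches to a GEMM with a bias
epilogue and `Matmul::eval_gpu` contains the GEMM setup inline again. The
cublasLt matrix layouts go back to setting `CUBLASLT_MATRIX_LAYOUT_ORDER`
(row- or column-major) instead of swapping the A and B operands.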
diff --git a/mlx/backend/cuda/gemms/cublas_gemm.cpp b/mlx/backend/cuda/gemms/cublas_gemm.cpp
@@ -85,10 +85,10 @@ cublasLtMatrixLayout_t create_matrix_layout(
     int32_t batch_count,
     int64_t batch_stride) {
   cublasLtMatrixLayout_t desc;
-  if (transposed) {
-    std::swap(rows, cols);
-  }
   CHECK_CUBLAS_ERROR(cublasLtMatrixLayoutCreate(&desc, type, rows, cols, ld));
+  cublasLtOrder_t order = transposed ? CUBLASLT_ORDER_COL : CUBLASLT_ORDER_ROW;
+  CHECK_CUBLAS_ERROR(cublasLtMatrixLayoutSetAttribute(
+      desc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order, sizeof(cublasLtOrder_t)));
   if (batch_count > 1) {
     CHECK_CUBLAS_ERROR(cublasLtMatrixLayoutSetAttribute(
         desc,
@@ -138,34 +138,25 @@ CublasGemm::CublasGemm(
       CUBLASLT_MATMUL_DESC_POINTER_MODE,
       &pointer_mode,
       sizeof(int32_t)));
-
-  // In cublasLt matrices use column-major layout, while it is possible to use
-  // the CUBLASLT_ORDER_ROW option to switch to row-major layout, the bias
-  // epilogue does not work with the option. So instead we swap A and B to make
-  // cublasLt return the row-major result, which works because:
-  // - the data of a matrix in row-major layout is identical to its transpose in
-  //   column-major layout
-  // - C^T = (A @ B)^T = B^T @ A^T
-  cublasOperation_t a_op = b_transposed ? CUBLAS_OP_T : CUBLAS_OP_N;
+  cublasOperation_t op = CUBLAS_OP_N;
   CHECK_CUBLAS_ERROR(cublasLtMatmulDescSetAttribute(
       matmul_desc_,
       CUBLASLT_MATMUL_DESC_TRANSA,
-      &a_op,
+      &op,
       sizeof(cublasOperation_t)));
-  cublasOperation_t b_op = a_transposed ? CUBLAS_OP_T : CUBLAS_OP_N;
   CHECK_CUBLAS_ERROR(cublasLtMatmulDescSetAttribute(
       matmul_desc_,
       CUBLASLT_MATMUL_DESC_TRANSB,
-      &b_op,
+      &op,
       sizeof(cublasOperation_t)));
 
   auto type = dtype_to_cublas_type(dtype);
   a_desc_ = create_matrix_layout(
-      type, b_cols, b_rows, b_transposed, ldb, batch_count, b_batch_stride);
+      type, a_rows, a_cols, a_transposed, lda, batch_count, a_batch_stride);
   b_desc_ = create_matrix_layout(
-      type, a_cols, a_rows, a_transposed, lda, batch_count, a_batch_stride);
+      type, b_rows, b_cols, b_transposed, ldb, batch_count, b_batch_stride);
   out_desc_ = create_matrix_layout(
-      type, b_cols, a_rows, false, b_cols, batch_count, a_rows * b_cols);
+      type, a_rows, b_cols, false, b_cols, batch_count, a_rows * b_cols);
 }
 
 CublasGemm::CublasGemm(
@@ -200,7 +191,7 @@ CublasGemm::CublasGemm(
           b_batch_stride) {
   auto type = dtype_to_cublas_type(dtype);
   c_desc_ = create_matrix_layout(
-      type, b_cols, a_rows, false, ldc, batch_count, c_batch_stride);
+      type, a_rows, b_cols, false, ldc, batch_count, c_batch_stride);
 }
 
 CublasGemm::~CublasGemm() {
@@ -222,25 +213,14 @@ void CublasGemm::set_out(
   CHECK_CUBLAS_ERROR(cublasLtMatrixLayoutDestroy(out_desc_));
   out_desc_ = create_matrix_layout(
       dtype_to_cublas_type(dtype),
-      cols,
       rows,
+      cols,
       transposed,
       ld,
       batch_count,
       batch_stride);
 }
 
-void CublasGemm::set_bias(void* bias) {
-  cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_BIAS;
-  CHECK_CUBLAS_ERROR(cublasLtMatmulDescSetAttribute(
-      matmul_desc_,
-      CUBLASLT_MATMUL_DESC_EPILOGUE,
-      &epilogue,
-      sizeof(epilogue)));
-  CHECK_CUBLAS_ERROR(cublasLtMatmulDescSetAttribute(
-      matmul_desc_, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias, sizeof(bias)));
-}
-
 void CublasGemm::run(
     cu::CommandEncoder& encoder,
     array& out,
@@ -350,9 +330,9 @@ void CublasGemm::execute(
       handle_,
       matmul_desc_,
       &alpha,
-      b, // a and b are swapped
-      a_desc_,
       a,
+      a_desc_,
+      b,
       b_desc_,
       &beta,
       c ? c : out,
diff --git a/mlx/backend/cuda/gemms/cublas_gemm.h b/mlx/backend/cuda/gemms/cublas_gemm.h
@@ -55,8 +55,6 @@ class CublasGemm {
       int32_t batch_count,
       int64_t batch_stride);
 
-  void set_bias(void* bias);
-
   void run(
       cu::CommandEncoder& encoder,
       array& out,
diff --git a/mlx/backend/cuda/matmul.cpp b/mlx/backend/cuda/matmul.cpp
@@ -11,7 +11,6 @@
 #include <numeric>
 
 namespace mlx::core {
-
 namespace {
 
 std::tuple<bool, int64_t, array>
@@ -29,20 +28,41 @@ check_transpose(cu::CommandEncoder& enc, const Stream& s, const array& arr) {
   }
 }
 
-void gemm_and_bias(
-    cu::CommandEncoder& encoder,
-    int M,
-    int N,
-    int K,
-    bool a_transposed,
-    int64_t lda,
-    bool b_transposed,
-    int64_t ldb,
-    array& out,
-    const array& a,
-    const array& b,
-    void* bias = nullptr) {
+} // namespace
+
+void Matmul::eval_gpu(const std::vector<array>& inputs, array& out) {
+  nvtx3::scoped_range r("Matmul::eval_gpu");
+  auto& s = stream();
+  auto& encoder = cu::get_command_encoder(s);
+
+  assert(inputs.size() == 2);
+  auto& a_pre = inputs[0];
+  auto& b_pre = inputs[1];
+  // Return 0s if either input is empty.
+  if (a_pre.size() == 0 || b_pre.size() == 0) {
+    array zero(0, a_pre.dtype());
+    encoder.add_temporary(zero);
+    fill_gpu(zero, out, s);
+    return;
+  }
+
+  out.set_data(allocator::malloc(out.nbytes()));
+
+  /////////////////////////////////////////////////////////////////////////////
+  // Init checks and prep
+
+  int M = a_pre.shape(-2);
+  int N = b_pre.shape(-1);
+  int K = a_pre.shape(-1);
+
+  // Keep a vector with copies to be cleared in the completed buffer to release
+  // the arrays
+  auto [a_transposed, lda, a] = check_transpose(encoder, s, a_pre);
+  auto [b_transposed, ldb, b] = check_transpose(encoder, s, b_pre);
+
+  /////////////////////////////////////////////////////////////////////////////
   // Check and collapse batch dimensions
+
   auto [batch_shape, a_batch_strides, b_batch_strides] = collapse_batches(a, b);
 
   auto batch_count = out.size() / (M * N);
@@ -59,8 +79,7 @@ void gemm_and_bias(
     batch_shape = {1};
   }
 
-  // Use gemmv when possible
-  if (!bias && cu::can_use_gemv(M, N, K, a_transposed, b_transposed)) {
+  if (cu::can_use_gemv(M, N, K, a_transposed, b_transposed)) {
     cu::gemv(
         a,
         b,
@@ -76,9 +95,10 @@ void gemm_and_bias(
     return;
   }
 
+  /////////////////////////////////////////////////////////////////////////////
   // Invoke cublasLt
   CublasGemm gemm(
-      encoder.device(),
+      cu::device(s.device),
       a.dtype(),
       a_transposed,
       M,
@@ -91,45 +111,9 @@ void gemm_and_bias(
       batch_shape.back(),
       a_batch_strides.back(),
       b_batch_strides.back());
-  if (bias) {
-    gemm.set_bias(bias);
-  }
   gemm.run(encoder, out, a, b, batch_shape, a_batch_strides, b_batch_strides);
 }
 
-} // namespace
-
-void Matmul::eval_gpu(const std::vector<array>& inputs, array& out) {
-  nvtx3::scoped_range r("Matmul::eval_gpu");
-  auto& s = stream();
-  auto& encoder = cu::get_command_encoder(s);
-
-  assert(inputs.size() == 2);
-  auto& a_pre = inputs[0];
-  auto& b_pre = inputs[1];
-  // Return 0s if either input is empty.
-  if (a_pre.size() == 0 || b_pre.size() == 0) {
-    array zero(0, a_pre.dtype());
-    encoder.add_temporary(zero);
-    fill_gpu(zero, out, s);
-    return;
-  }
-
-  out.set_data(allocator::malloc(out.nbytes()));
-
-  int M = a_pre.shape(-2);
-  int N = b_pre.shape(-1);
-  int K = a_pre.shape(-1);
-
-  // Keep a vector with copies to be cleared in the completed buffer to release
-  // the arrays
-  auto [a_transposed, lda, a] = check_transpose(encoder, s, a_pre);
-  auto [b_transposed, ldb, b] = check_transpose(encoder, s, b_pre);
-
-  gemm_and_bias(
-      encoder, M, N, K, a_transposed, lda, b_transposed, ldb, out, a, b);
-}
-
 void AddMM::eval_gpu(const std::vector<array>& inputs, array& out) {
   nvtx3::scoped_range r("AddMM::eval_gpu");
   auto& s = stream();
@@ -152,27 +136,6 @@ void AddMM::eval_gpu(const std::vector<array>& inputs, array& out) {
   auto [a_transposed, lda, a] = check_transpose(encoder, s, a_pre);
   auto [b_transposed, ldb, b] = check_transpose(encoder, s, b_pre);
 
-  /////////////////////////////////////////////////////////////////////////////
-  // Dispatch to GEMM with epilogue or AddMM
-
-  if (beta_ == 1 && c.strides(-1) == 1 && c.data_size() == out.shape(-1)) {
-    out.set_data(allocator::malloc(out.nbytes()));
-    gemm_and_bias(
-        encoder,
-        M,
-        N,
-        K,
-        a_transposed,
-        lda,
-        b_transposed,
-        ldb,
-        out,
-        a,
-        b,
-        c.data<void>());
-    return;
-  }
-
   int64_t ldc;
   {
     auto stx = c.strides()[c.ndim() - 2];
@@ -214,7 +177,7 @@ void AddMM::eval_gpu(const std::vector<array>& inputs, array& out) {
   }
 
   /////////////////////////////////////////////////////////////////////////////
-  // Invoke cublasLt with AddMM settings
+  // Invoke cublasLt
 
   CublasGemm gemm(
       cu::device(s.device),