From f7f87a40817502389f225acdf81a29cc37e7cc92 Mon Sep 17 00:00:00 2001
From: Daniel Hernandez-Juarez <dhernandez0@gmail.com>
Date: Thu, 27 Feb 2025 09:32:05 +0000
Subject: [PATCH 1/4] [mlir][AMDGPU] Make sure gfx942 and previous archs
 emulate bf16 atomics

---
 .../AMDGPU/Transforms/EmulateAtomics.cpp      |  8 +++
 .../AMDGPU/amdgpu-emulate-atomics.mlir        | 49 +++++++++++++++++++
 2 files changed, 57 insertions(+)
diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp
index 7459a6503cddf..04dc871941b98 100644
--- a/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp
+++ b/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp
@@ -189,6 +189,14 @@ void mlir::amdgpu::populateAmdgpuEmulateAtomicsPatterns(
     } else {
       target.addIllegalOp<RawBufferAtomicFmaxOp>();
     }
+    // gfx950 has bf16 atomics
+    if (chipset < Chipset(9, 5, 0)) {
+      target.addDynamicallyLegalOp<RawBufferAtomicFaddOp>(
+          [](RawBufferAtomicFaddOp op) -> bool {
+            Type elemType = getElementTypeOrSelf(op.getValue().getType());
+            return !isa<BFloat16Type>(elemType);
+          });
+    }
   }
   patterns.add<
       RawBufferAtomicByCasPattern<RawBufferAtomicFaddOp, arith::AddFOp>,
diff --git a/mlir/test/Dialect/AMDGPU/amdgpu-emulate-atomics.mlir b/mlir/test/Dialect/AMDGPU/amdgpu-emulate-atomics.mlir
index 104af58a0d277..7368f853ca293 100644
--- a/mlir/test/Dialect/AMDGPU/amdgpu-emulate-atomics.mlir
+++ b/mlir/test/Dialect/AMDGPU/amdgpu-emulate-atomics.mlir
@@ -1,6 +1,9 @@
 // RUN: mlir-opt -split-input-file -amdgpu-emulate-atomics=chipset=gfx90a %s | FileCheck %s --check-prefixes=CHECK,GFX9
 // RUN: mlir-opt -split-input-file -amdgpu-emulate-atomics=chipset=gfx1030 %s | FileCheck %s --check-prefixes=CHECK,GFX10
 // RUN: mlir-opt -split-input-file -amdgpu-emulate-atomics=chipset=gfx1100 %s | FileCheck %s --check-prefixes=CHECK,GFX11
+// RUN: mlir-opt -split-input-file -amdgpu-emulate-atomics=chipset=gfx1200 %s | FileCheck %s --check-prefixes=CHECK,GFX12
+// RUN: mlir-opt -split-input-file -amdgpu-emulate-atomics=chipset=gfx942 %s | FileCheck %s --check-prefixes=CHECK,GFX942
+// RUN: mlir-opt -split-input-file -amdgpu-emulate-atomics=chipset=gfx950 %s | FileCheck %s --check-prefixes=CHECK,GFX950
 
 // -----
 
@@ -10,6 +13,7 @@ func.func @atomic_fmax(%val: f32, %buffer: memref<?xf32>, %idx: i32) {
 // CHECK: gpu.printf "Begin\0A"
 // GFX10: amdgpu.raw_buffer_atomic_fmax {foo, indexOffset = 4 : i32} [[val]] -> [[buffer]][[[idx]]]
 // GFX11: amdgpu.raw_buffer_atomic_fmax {foo, indexOffset = 4 : i32} [[val]] -> [[buffer]][[[idx]]]
+// GFX12: amdgpu.raw_buffer_atomic_fmax {foo, indexOffset = 4 : i32} [[val]] -> [[buffer]][[[idx]]]
 // GFX9:  [[ld:%.+]] = amdgpu.raw_buffer_load {foo, indexOffset = 4 : i32} [[buffer]][[[idx]]]
 // GFX9:  cf.br [[loop:\^.+]]([[ld]] : f32)
 // GFX9:  [[loop]]([[arg:%.+]]: f32):
@@ -20,6 +24,26 @@ func.func @atomic_fmax(%val: f32, %buffer: memref<?xf32>, %idx: i32) {
 // GFX9:  [[test:%.+]] = arith.cmpi eq, [[resCast]], [[argCast]]
 // GFX9:  cf.cond_br [[test]], [[post:\^.+]], [[loop]]([[atomicRes]] : f32)
 // GFX9:  [[post]]:
+// GFX942:  [[ld:%.+]] = amdgpu.raw_buffer_load {foo, indexOffset = 4 : i32} [[buffer]][[[idx]]]
+// GFX942:  cf.br [[loop:\^.+]]([[ld]] : f32)
+// GFX942:  [[loop]]([[arg:%.+]]: f32):
+// GFX942:  [[operated:%.+]] = arith.maximumf [[val]], [[arg]]
+// GFX942:  [[atomicRes:%.+]] = amdgpu.raw_buffer_atomic_cmpswap {foo, indexOffset = 4 : i32} [[operated]], [[arg]] -> [[buffer]][[[idx]]]
+// GFX942:  [[argCast:%.+]] = arith.bitcast [[arg]] : f32 to i32
+// GFX942:  [[resCast:%.+]] = arith.bitcast [[atomicRes]] : f32 to i32
+// GFX942:  [[test:%.+]] = arith.cmpi eq, [[resCast]], [[argCast]]
+// GFX942:  cf.cond_br [[test]], [[post:\^.+]], [[loop]]([[atomicRes]] : f32)
+// GFX942:  [[post]]:
+// GFX950:  [[ld:%.+]] = amdgpu.raw_buffer_load {foo, indexOffset = 4 : i32} [[buffer]][[[idx]]]
+// GFX950:  cf.br [[loop:\^.+]]([[ld]] : f32)
+// GFX950:  [[loop]]([[arg:%.+]]: f32):
+// GFX950:  [[operated:%.+]] = arith.maximumf [[val]], [[arg]]
+// GFX950:  [[atomicRes:%.+]] = amdgpu.raw_buffer_atomic_cmpswap {foo, indexOffset = 4 : i32} [[operated]], [[arg]] -> [[buffer]][[[idx]]]
+// GFX950:  [[argCast:%.+]] = arith.bitcast [[arg]] : f32 to i32
+// GFX950:  [[resCast:%.+]] = arith.bitcast [[atomicRes]] : f32 to i32
+// GFX950:  [[test:%.+]] = arith.cmpi eq, [[resCast]], [[argCast]]
+// GFX950:  cf.cond_br [[test]], [[post:\^.+]], [[loop]]([[atomicRes]] : f32)
+// GFX950:  [[post]]:
 // CHECK-NEXT: gpu.printf "End\0A"
   gpu.printf "Begin\n"
   amdgpu.raw_buffer_atomic_fmax {foo, indexOffset = 4 : i32} %val -> %buffer[%idx] : f32 -> memref<?xf32>, i32
@@ -36,6 +60,9 @@ func.func @atomic_fmax_f64(%val: f64, %buffer: memref<?xf64>, %idx: i32) {
 // GFX9:  amdgpu.raw_buffer_atomic_fmax [[val]] -> [[buffer]][[[idx]]]
 // GFX10: amdgpu.raw_buffer_atomic_fmax [[val]] -> [[buffer]][[[idx]]]
 // GFX11: amdgpu.raw_buffer_atomic_fmax [[val]] -> [[buffer]][[[idx]]]
+// GFX12: amdgpu.raw_buffer_atomic_fmax [[val]] -> [[buffer]][[[idx]]]
+// GFX942: amdgpu.raw_buffer_atomic_fmax [[val]] -> [[buffer]][[[idx]]]
+// GFX950: amdgpu.raw_buffer_atomic_fmax [[val]] -> [[buffer]][[[idx]]]
 // CHECK-NEXT: gpu.printf "End\0A"
   gpu.printf "Begin\n"
   amdgpu.raw_buffer_atomic_fmax %val -> %buffer[%idx] : f64 -> memref<?xf64>, i32
@@ -51,6 +78,9 @@ func.func @atomic_fadd(%val: f32, %buffer: memref<?xf32>, %idx: i32) {
 // GFX10: amdgpu.raw_buffer_load
 // GFX10: amdgpu.raw_buffer_atomic_cmpswap
 // GFX11: amdgpu.raw_buffer_atomic_fadd
+// GFX12: amdgpu.raw_buffer_atomic_fadd
+// GFX942: amdgpu.raw_buffer_atomic_fadd
+// GFX950: amdgpu.raw_buffer_atomic_fadd
   amdgpu.raw_buffer_atomic_fadd %val -> %buffer[%idx] : f32 -> memref<?xf32>, i32
   func.return
 }
@@ -69,6 +99,25 @@ func.func @atomic_fadd_v2f16(%val: vector<2xf16>, %buffer: memref<?xf16>, %idx:
 // GFX11: %[[vecCastOld:.+]] = vector.bitcast %[[old]] : vector<2xf16> to vector<1xi32>
 // GFX11: %[[scalarOld:.+]] = vector.extract %[[vecCastOld]][0]
 // GFX11: arith.cmpi eq, %[[scalarOld]], %[[scalarExpected]]
+// GFX942: amdgpu.raw_buffer_atomic_fadd
+// GFX12:  amdgpu.raw_buffer_atomic_fadd
+// GFX950:  amdgpu.raw_buffer_atomic_fadd
   amdgpu.raw_buffer_atomic_fadd %val -> %buffer[%idx] : vector<2xf16> -> memref<?xf16>, i32
   func.return
 }
+
+// CHECK: func @atomic_fadd_v2bf16
+func.func @atomic_fadd_v2bf16(%val: vector<2xbf16>, %buffer: memref<?xbf16>, %idx: i32) {
+// GFX9: amdgpu.raw_buffer_load
+// GFX9: amdgpu.raw_buffer_atomic_cmpswap
+// GFX10: amdgpu.raw_buffer_load
+// GFX10: amdgpu.raw_buffer_atomic_cmpswap
+// GFX11: amdgpu.raw_buffer_load
+// GFX11: amdgpu.raw_buffer_atomic_cmpswap
+// GFX942: amdgpu.raw_buffer_load
+// GFX942: amdgpu.raw_buffer_atomic_cmpswap
+// GFX12:  amdgpu.raw_buffer_atomic_fadd
+// GFX950:  amdgpu.raw_buffer_atomic_fadd
+  amdgpu.raw_buffer_atomic_fadd %val -> %buffer[%idx] : vector<2xbf16> -> memref<?xbf16>, i32
+  func.return
+}

From ef93b18526aebc48b9ece3cd26a846a4df39ff17 Mon Sep 17 00:00:00 2001
From: Daniel Hernandez-Juarez <dhernandez0@gmail.com>
Date: Thu, 27 Feb 2025 16:26:41 +0000
Subject: [PATCH 2/4] Address PR comments: rename GFX9 check prefix to GFX90A

---
 .../AMDGPU/amdgpu-emulate-atomics.mlir        | 32 +++++++++----------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/mlir/test/Dialect/AMDGPU/amdgpu-emulate-atomics.mlir b/mlir/test/Dialect/AMDGPU/amdgpu-emulate-atomics.mlir
index 7368f853ca293..37d7faa0bc004 100644
--- a/mlir/test/Dialect/AMDGPU/amdgpu-emulate-atomics.mlir
+++ b/mlir/test/Dialect/AMDGPU/amdgpu-emulate-atomics.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt -split-input-file -amdgpu-emulate-atomics=chipset=gfx90a %s | FileCheck %s --check-prefixes=CHECK,GFX9
+// RUN: mlir-opt -split-input-file -amdgpu-emulate-atomics=chipset=gfx90a %s | FileCheck %s --check-prefixes=CHECK,GFX90A
 // RUN: mlir-opt -split-input-file -amdgpu-emulate-atomics=chipset=gfx1030 %s | FileCheck %s --check-prefixes=CHECK,GFX10
 // RUN: mlir-opt -split-input-file -amdgpu-emulate-atomics=chipset=gfx1100 %s | FileCheck %s --check-prefixes=CHECK,GFX11
 // RUN: mlir-opt -split-input-file -amdgpu-emulate-atomics=chipset=gfx1200 %s | FileCheck %s --check-prefixes=CHECK,GFX12
@@ -14,16 +14,16 @@ func.func @atomic_fmax(%val: f32, %buffer: memref<?xf32>, %idx: i32) {
 // GFX10: amdgpu.raw_buffer_atomic_fmax {foo, indexOffset = 4 : i32} [[val]] -> [[buffer]][[[idx]]]
 // GFX11: amdgpu.raw_buffer_atomic_fmax {foo, indexOffset = 4 : i32} [[val]] -> [[buffer]][[[idx]]]
 // GFX12: amdgpu.raw_buffer_atomic_fmax {foo, indexOffset = 4 : i32} [[val]] -> [[buffer]][[[idx]]]
-// GFX9:  [[ld:%.+]] = amdgpu.raw_buffer_load {foo, indexOffset = 4 : i32} [[buffer]][[[idx]]]
-// GFX9:  cf.br [[loop:\^.+]]([[ld]] : f32)
-// GFX9:  [[loop]]([[arg:%.+]]: f32):
-// GFX9:  [[operated:%.+]] = arith.maximumf [[val]], [[arg]]
-// GFX9:  [[atomicRes:%.+]] = amdgpu.raw_buffer_atomic_cmpswap {foo, indexOffset = 4 : i32} [[operated]], [[arg]] -> [[buffer]][[[idx]]]
-// GFX9:  [[argCast:%.+]] = arith.bitcast [[arg]] : f32 to i32
-// GFX9:  [[resCast:%.+]] = arith.bitcast [[atomicRes]] : f32 to i32
-// GFX9:  [[test:%.+]] = arith.cmpi eq, [[resCast]], [[argCast]]
-// GFX9:  cf.cond_br [[test]], [[post:\^.+]], [[loop]]([[atomicRes]] : f32)
-// GFX9:  [[post]]:
+// GFX90A:  [[ld:%.+]] = amdgpu.raw_buffer_load {foo, indexOffset = 4 : i32} [[buffer]][[[idx]]]
+// GFX90A:  cf.br [[loop:\^.+]]([[ld]] : f32)
+// GFX90A:  [[loop]]([[arg:%.+]]: f32):
+// GFX90A:  [[operated:%.+]] = arith.maximumf [[val]], [[arg]]
+// GFX90A:  [[atomicRes:%.+]] = amdgpu.raw_buffer_atomic_cmpswap {foo, indexOffset = 4 : i32} [[operated]], [[arg]] -> [[buffer]][[[idx]]]
+// GFX90A:  [[argCast:%.+]] = arith.bitcast [[arg]] : f32 to i32
+// GFX90A:  [[resCast:%.+]] = arith.bitcast [[atomicRes]] : f32 to i32
+// GFX90A:  [[test:%.+]] = arith.cmpi eq, [[resCast]], [[argCast]]
+// GFX90A:  cf.cond_br [[test]], [[post:\^.+]], [[loop]]([[atomicRes]] : f32)
+// GFX90A:  [[post]]:
 // GFX942:  [[ld:%.+]] = amdgpu.raw_buffer_load {foo, indexOffset = 4 : i32} [[buffer]][[[idx]]]
 // GFX942:  cf.br [[loop:\^.+]]([[ld]] : f32)
 // GFX942:  [[loop]]([[arg:%.+]]: f32):
@@ -57,7 +57,7 @@ func.func @atomic_fmax_f64(%val: f64, %buffer: memref<?xf64>, %idx: i32) {
 // CHECK: func @atomic_fmax_f64
 // CHECK-SAME: ([[val:%.+]]: f64, [[buffer:%.+]]: memref<?xf64>, [[idx:%.+]]: i32)
 // CHECK: gpu.printf "Begin\0A"
-// GFX9:  amdgpu.raw_buffer_atomic_fmax [[val]] -> [[buffer]][[[idx]]]
+// GFX90A:  amdgpu.raw_buffer_atomic_fmax [[val]] -> [[buffer]][[[idx]]]
 // GFX10: amdgpu.raw_buffer_atomic_fmax [[val]] -> [[buffer]][[[idx]]]
 // GFX11: amdgpu.raw_buffer_atomic_fmax [[val]] -> [[buffer]][[[idx]]]
 // GFX12: amdgpu.raw_buffer_atomic_fmax [[val]] -> [[buffer]][[[idx]]]
@@ -74,7 +74,7 @@ func.func @atomic_fmax_f64(%val: f64, %buffer: memref<?xf64>, %idx: i32) {
 
 func.func @atomic_fadd(%val: f32, %buffer: memref<?xf32>, %idx: i32) {
 // CHECK: func @atomic_fadd
-// GFX9:  amdgpu.raw_buffer_atomic_fadd
+// GFX90A:  amdgpu.raw_buffer_atomic_fadd
 // GFX10: amdgpu.raw_buffer_load
 // GFX10: amdgpu.raw_buffer_atomic_cmpswap
 // GFX11: amdgpu.raw_buffer_atomic_fadd
@@ -87,7 +87,7 @@ func.func @atomic_fadd(%val: f32, %buffer: memref<?xf32>, %idx: i32) {
 
 // CHECK: func @atomic_fadd_v2f16
 func.func @atomic_fadd_v2f16(%val: vector<2xf16>, %buffer: memref<?xf16>, %idx: i32) {
-// GFX9:  amdgpu.raw_buffer_atomic_fadd
+// GFX90A:  amdgpu.raw_buffer_atomic_fadd
 // GFX10: amdgpu.raw_buffer_load
 // GFX10: amdgpu.raw_buffer_atomic_cmpswap
 // Note: the atomic operation itself will be done over i32, and then we use bitcasts
@@ -108,8 +108,8 @@ func.func @atomic_fadd_v2f16(%val: vector<2xf16>, %buffer: memref<?xf16>, %idx:
 
 // CHECK: func @atomic_fadd_v2bf16
 func.func @atomic_fadd_v2bf16(%val: vector<2xbf16>, %buffer: memref<?xbf16>, %idx: i32) {
-// GFX9: amdgpu.raw_buffer_load
-// GFX9: amdgpu.raw_buffer_atomic_cmpswap
+// GFX90A: amdgpu.raw_buffer_load
+// GFX90A: amdgpu.raw_buffer_atomic_cmpswap
 // GFX10: amdgpu.raw_buffer_load
 // GFX10: amdgpu.raw_buffer_atomic_cmpswap
 // GFX11: amdgpu.raw_buffer_load

From 739274b7d330d38dd850627e3b5173dbafe16c10 Mon Sep 17 00:00:00 2001
From: Daniel Hernandez-Juarez <dhernandez0@gmail.com>
Date: Fri, 28 Feb 2025 08:33:21 +0000
Subject: [PATCH 3/4] Address PR comments: add TODO on hardcoded ISA version

---
 mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp
index 04dc871941b98..5f24902d07b98 100644
--- a/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp
+++ b/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp
@@ -189,6 +189,8 @@ void mlir::amdgpu::populateAmdgpuEmulateAtomicsPatterns(
     } else {
       target.addIllegalOp<RawBufferAtomicFmaxOp>();
     }
+    // TODO: refactor this to avoid hardcoding ISA version:
+    // https://github.com/llvm/llvm-project/issues/129206
     // gfx950 has bf16 atomics
     if (chipset < Chipset(9, 5, 0)) {
       target.addDynamicallyLegalOp<RawBufferAtomicFaddOp>(

From c63496acbdcd88aa7f338ddf04c57642c81fb5dc Mon Sep 17 00:00:00 2001
From: Daniel Hernandez-Juarez <dhernandez0@gmail.com>
Date: Wed, 12 Mar 2025 12:48:25 +0100
Subject: [PATCH 4/4] Address PR comments: merge TODO and gfx950 note

Co-authored-by: Jakub Kuderski <kubakuderski@gmail.com>
---
 mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp
index 5f24902d07b98..7dd4be66d2bd6 100644
--- a/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp
+++ b/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp
@@ -189,9 +189,8 @@ void mlir::amdgpu::populateAmdgpuEmulateAtomicsPatterns(
     } else {
       target.addIllegalOp<RawBufferAtomicFmaxOp>();
     }
-    // TODO: refactor this to avoid hardcoding ISA version:
-    // https://github.com/llvm/llvm-project/issues/129206
-    // gfx950 has bf16 atomics
+    // TODO(https://github.com/llvm/llvm-project/issues/129206): Refactor
+    // this to avoid hardcoding ISA version: gfx950 has bf16 atomics.
     if (chipset < Chipset(9, 5, 0)) {
       target.addDynamicallyLegalOp<RawBufferAtomicFaddOp>(
           [](RawBufferAtomicFaddOp op) -> bool {