From f7f87a40817502389f225acdf81a29cc37e7cc92 Mon Sep 17 00:00:00 2001 From: Daniel Hernandez-Juarez Date: Thu, 27 Feb 2025 09:32:05 +0000 Subject: [PATCH 1/4] [mlir][AMDGPU] Make sure gfx942 and previous archs emulate bf16 atomics --- .../AMDGPU/Transforms/EmulateAtomics.cpp | 8 +++ .../AMDGPU/amdgpu-emulate-atomics.mlir | 49 +++++++++++++++++++ 2 files changed, 57 insertions(+) diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp index 7459a6503cddf..04dc871941b98 100644 --- a/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp +++ b/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp @@ -189,6 +189,14 @@ void mlir::amdgpu::populateAmdgpuEmulateAtomicsPatterns( } else { target.addIllegalOp<RawBufferAtomicFmaxOp>(); } + // gfx950 has bf16 atomics + if (chipset < Chipset(9, 5, 0)) { + target.addDynamicallyLegalOp<RawBufferAtomicFaddOp>( + [](RawBufferAtomicFaddOp op) -> bool { + Type elemType = getElementTypeOrSelf(op.getValue().getType()); + return !isa<BFloat16Type>(elemType); + }); + } } patterns.add< RawBufferAtomicByCasPattern<RawBufferAtomicFaddOp, arith::AddFOp>, diff --git a/mlir/test/Dialect/AMDGPU/amdgpu-emulate-atomics.mlir b/mlir/test/Dialect/AMDGPU/amdgpu-emulate-atomics.mlir index 104af58a0d277..7368f853ca293 100644 --- a/mlir/test/Dialect/AMDGPU/amdgpu-emulate-atomics.mlir +++ b/mlir/test/Dialect/AMDGPU/amdgpu-emulate-atomics.mlir @@ -1,6 +1,9 @@ // RUN: mlir-opt -split-input-file -amdgpu-emulate-atomics=chipset=gfx90a %s | FileCheck %s --check-prefixes=CHECK,GFX9 // RUN: mlir-opt -split-input-file -amdgpu-emulate-atomics=chipset=gfx1030 %s | FileCheck %s --check-prefixes=CHECK,GFX10 // RUN: mlir-opt -split-input-file -amdgpu-emulate-atomics=chipset=gfx1100 %s | FileCheck %s --check-prefixes=CHECK,GFX11 +// RUN: mlir-opt -split-input-file -amdgpu-emulate-atomics=chipset=gfx1200 %s | FileCheck %s --check-prefixes=CHECK,GFX12 +// RUN: mlir-opt -split-input-file -amdgpu-emulate-atomics=chipset=gfx942 %s | FileCheck %s --check-prefixes=CHECK,GFX942 +// RUN: mlir-opt 
-split-input-file -amdgpu-emulate-atomics=chipset=gfx950 %s | FileCheck %s --check-prefixes=CHECK,GFX950 // ----- @@ -10,6 +13,7 @@ func.func @atomic_fmax(%val: f32, %buffer: memref<?xf32>, %idx: i32) { // CHECK: gpu.printf "Begin\0A" // GFX10: amdgpu.raw_buffer_atomic_fmax {foo, indexOffset = 4 : i32} [[val]] -> [[buffer]][[[idx]]] // GFX11: amdgpu.raw_buffer_atomic_fmax {foo, indexOffset = 4 : i32} [[val]] -> [[buffer]][[[idx]]] +// GFX12: amdgpu.raw_buffer_atomic_fmax {foo, indexOffset = 4 : i32} [[val]] -> [[buffer]][[[idx]]] // GFX9: [[ld:%.+]] = amdgpu.raw_buffer_load {foo, indexOffset = 4 : i32} [[buffer]][[[idx]]] // GFX9: cf.br [[loop:\^.+]]([[ld]] : f32) // GFX9: [[loop]]([[arg:%.+]]: f32): @@ -20,6 +24,26 @@ func.func @atomic_fmax(%val: f32, %buffer: memref<?xf32>, %idx: i32) { // GFX9: [[test:%.+]] = arith.cmpi eq, [[resCast]], [[argCast]] // GFX9: cf.cond_br [[test]], [[post:\^.+]], [[loop]]([[atomicRes]] : f32) // GFX9: [[post]]: +// GFX942: [[ld:%.+]] = amdgpu.raw_buffer_load {foo, indexOffset = 4 : i32} [[buffer]][[[idx]]] +// GFX942: cf.br [[loop:\^.+]]([[ld]] : f32) +// GFX942: [[loop]]([[arg:%.+]]: f32): +// GFX942: [[operated:%.+]] = arith.maximumf [[val]], [[arg]] +// GFX942: [[atomicRes:%.+]] = amdgpu.raw_buffer_atomic_cmpswap {foo, indexOffset = 4 : i32} [[operated]], [[arg]] -> [[buffer]][[[idx]]] +// GFX942: [[argCast:%.+]] = arith.bitcast [[arg]] : f32 to i32 +// GFX942: [[resCast:%.+]] = arith.bitcast [[atomicRes]] : f32 to i32 +// GFX942: [[test:%.+]] = arith.cmpi eq, [[resCast]], [[argCast]] +// GFX942: cf.cond_br [[test]], [[post:\^.+]], [[loop]]([[atomicRes]] : f32) +// GFX942: [[post]]: +// GFX950: [[ld:%.+]] = amdgpu.raw_buffer_load {foo, indexOffset = 4 : i32} [[buffer]][[[idx]]] +// GFX950: cf.br [[loop:\^.+]]([[ld]] : f32) +// GFX950: [[loop]]([[arg:%.+]]: f32): +// GFX950: [[operated:%.+]] = arith.maximumf [[val]], [[arg]] +// GFX950: [[atomicRes:%.+]] = amdgpu.raw_buffer_atomic_cmpswap {foo, indexOffset = 4 : i32} [[operated]], [[arg]] -> 
[[buffer]][[[idx]]] +// GFX950: [[argCast:%.+]] = arith.bitcast [[arg]] : f32 to i32 +// GFX950: [[resCast:%.+]] = arith.bitcast [[atomicRes]] : f32 to i32 +// GFX950: [[test:%.+]] = arith.cmpi eq, [[resCast]], [[argCast]] +// GFX950: cf.cond_br [[test]], [[post:\^.+]], [[loop]]([[atomicRes]] : f32) +// GFX950: [[post]]: // CHECK-NEXT: gpu.printf "End\0A" gpu.printf "Begin\n" amdgpu.raw_buffer_atomic_fmax {foo, indexOffset = 4 : i32} %val -> %buffer[%idx] : f32 -> memref<?xf32>, i32 @@ -36,6 +60,9 @@ func.func @atomic_fmax_f64(%val: f64, %buffer: memref<?xf64>, %idx: i32) { // GFX9: amdgpu.raw_buffer_atomic_fmax [[val]] -> [[buffer]][[[idx]]] // GFX10: amdgpu.raw_buffer_atomic_fmax [[val]] -> [[buffer]][[[idx]]] // GFX11: amdgpu.raw_buffer_atomic_fmax [[val]] -> [[buffer]][[[idx]]] +// GFX12: amdgpu.raw_buffer_atomic_fmax [[val]] -> [[buffer]][[[idx]]] +// GFX942: amdgpu.raw_buffer_atomic_fmax [[val]] -> [[buffer]][[[idx]]] +// GFX950: amdgpu.raw_buffer_atomic_fmax [[val]] -> [[buffer]][[[idx]]] // CHECK-NEXT: gpu.printf "End\0A" gpu.printf "Begin\n" amdgpu.raw_buffer_atomic_fmax %val -> %buffer[%idx] : f64 -> memref<?xf64>, i32 @@ -51,6 +78,9 @@ func.func @atomic_fadd(%val: f32, %buffer: memref<?xf32>, %idx: i32) { // GFX10: amdgpu.raw_buffer_load // GFX10: amdgpu.raw_buffer_atomic_cmpswap // GFX11: amdgpu.raw_buffer_atomic_fadd +// GFX12: amdgpu.raw_buffer_atomic_fadd +// GFX942: amdgpu.raw_buffer_atomic_fadd +// GFX950: amdgpu.raw_buffer_atomic_fadd amdgpu.raw_buffer_atomic_fadd %val -> %buffer[%idx] : f32 -> memref<?xf32>, i32 func.return } @@ -69,6 +99,25 @@ func.func @atomic_fadd_v2f16(%val: vector<2xf16>, %buffer: memref<?xf16>, %idx: // GFX11: %[[vecCastOld:.+]] = vector.bitcast %[[old]] : vector<2xf16> to vector<1xi32> // GFX11: %[[scalarOld:.+]] = vector.extract %[[vecCastOld]][0] // GFX11: arith.cmpi eq, %[[scalarOld]], %[[scalarExpected]] +// GFX942: amdgpu.raw_buffer_atomic_fadd +// GFX12: amdgpu.raw_buffer_atomic_fadd +// GFX950: amdgpu.raw_buffer_atomic_fadd amdgpu.raw_buffer_atomic_fadd 
%val -> %buffer[%idx] : vector<2xf16> -> memref<?xf16>, i32 func.return } + +// CHECK: func @atomic_fadd_v2bf16 +func.func @atomic_fadd_v2bf16(%val: vector<2xbf16>, %buffer: memref<?xbf16>, %idx: i32) { +// GFX9: amdgpu.raw_buffer_load +// GFX9: amdgpu.raw_buffer_atomic_cmpswap +// GFX10: amdgpu.raw_buffer_load +// GFX10: amdgpu.raw_buffer_atomic_cmpswap +// GFX11: amdgpu.raw_buffer_load +// GFX11: amdgpu.raw_buffer_atomic_cmpswap +// GFX942: amdgpu.raw_buffer_load +// GFX942: amdgpu.raw_buffer_atomic_cmpswap +// GFX12: amdgpu.raw_buffer_atomic_fadd +// GFX950: amdgpu.raw_buffer_atomic_fadd + amdgpu.raw_buffer_atomic_fadd %val -> %buffer[%idx] : vector<2xbf16> -> memref<?xbf16>, i32 + func.return +} From ef93b18526aebc48b9ece3cd26a846a4df39ff17 Mon Sep 17 00:00:00 2001 From: Daniel Hernandez-Juarez Date: Thu, 27 Feb 2025 16:26:41 +0000 Subject: [PATCH 2/4] Addressing PR comments --- .../AMDGPU/amdgpu-emulate-atomics.mlir | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/mlir/test/Dialect/AMDGPU/amdgpu-emulate-atomics.mlir b/mlir/test/Dialect/AMDGPU/amdgpu-emulate-atomics.mlir index 7368f853ca293..37d7faa0bc004 100644 --- a/mlir/test/Dialect/AMDGPU/amdgpu-emulate-atomics.mlir +++ b/mlir/test/Dialect/AMDGPU/amdgpu-emulate-atomics.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt -split-input-file -amdgpu-emulate-atomics=chipset=gfx90a %s | FileCheck %s --check-prefixes=CHECK,GFX9 +// RUN: mlir-opt -split-input-file -amdgpu-emulate-atomics=chipset=gfx90a %s | FileCheck %s --check-prefixes=CHECK,GFX90A // RUN: mlir-opt -split-input-file -amdgpu-emulate-atomics=chipset=gfx1030 %s | FileCheck %s --check-prefixes=CHECK,GFX10 // RUN: mlir-opt -split-input-file -amdgpu-emulate-atomics=chipset=gfx1100 %s | FileCheck %s --check-prefixes=CHECK,GFX11 // RUN: mlir-opt -split-input-file -amdgpu-emulate-atomics=chipset=gfx1200 %s | FileCheck %s --check-prefixes=CHECK,GFX12 @@ -14,16 +14,16 @@ func.func @atomic_fmax(%val: f32, %buffer: memref<?xf32>, %idx: i32) { // GFX10: 
amdgpu.raw_buffer_atomic_fmax {foo, indexOffset = 4 : i32} [[val]] -> [[buffer]][[[idx]]] // GFX11: amdgpu.raw_buffer_atomic_fmax {foo, indexOffset = 4 : i32} [[val]] -> [[buffer]][[[idx]]] // GFX12: amdgpu.raw_buffer_atomic_fmax {foo, indexOffset = 4 : i32} [[val]] -> [[buffer]][[[idx]]] -// GFX9: [[ld:%.+]] = amdgpu.raw_buffer_load {foo, indexOffset = 4 : i32} [[buffer]][[[idx]]] -// GFX9: cf.br [[loop:\^.+]]([[ld]] : f32) -// GFX9: [[loop]]([[arg:%.+]]: f32): -// GFX9: [[operated:%.+]] = arith.maximumf [[val]], [[arg]] -// GFX9: [[atomicRes:%.+]] = amdgpu.raw_buffer_atomic_cmpswap {foo, indexOffset = 4 : i32} [[operated]], [[arg]] -> [[buffer]][[[idx]]] -// GFX9: [[argCast:%.+]] = arith.bitcast [[arg]] : f32 to i32 -// GFX9: [[resCast:%.+]] = arith.bitcast [[atomicRes]] : f32 to i32 -// GFX9: [[test:%.+]] = arith.cmpi eq, [[resCast]], [[argCast]] -// GFX9: cf.cond_br [[test]], [[post:\^.+]], [[loop]]([[atomicRes]] : f32) -// GFX9: [[post]]: +// GFX90A: [[ld:%.+]] = amdgpu.raw_buffer_load {foo, indexOffset = 4 : i32} [[buffer]][[[idx]]] +// GFX90A: cf.br [[loop:\^.+]]([[ld]] : f32) +// GFX90A: [[loop]]([[arg:%.+]]: f32): +// GFX90A: [[operated:%.+]] = arith.maximumf [[val]], [[arg]] +// GFX90A: [[atomicRes:%.+]] = amdgpu.raw_buffer_atomic_cmpswap {foo, indexOffset = 4 : i32} [[operated]], [[arg]] -> [[buffer]][[[idx]]] +// GFX90A: [[argCast:%.+]] = arith.bitcast [[arg]] : f32 to i32 +// GFX90A: [[resCast:%.+]] = arith.bitcast [[atomicRes]] : f32 to i32 +// GFX90A: [[test:%.+]] = arith.cmpi eq, [[resCast]], [[argCast]] +// GFX90A: cf.cond_br [[test]], [[post:\^.+]], [[loop]]([[atomicRes]] : f32) +// GFX90A: [[post]]: // GFX942: [[ld:%.+]] = amdgpu.raw_buffer_load {foo, indexOffset = 4 : i32} [[buffer]][[[idx]]] // GFX942: cf.br [[loop:\^.+]]([[ld]] : f32) // GFX942: [[loop]]([[arg:%.+]]: f32): @@ -57,7 +57,7 @@ func.func @atomic_fmax_f64(%val: f64, %buffer: memref<?xf64>, %idx: i32) { // CHECK: func @atomic_fmax_f64 // CHECK-SAME: ([[val:%.+]]: f64, [[buffer:%.+]]: 
memref<?xf64>, [[idx:%.+]]: i32) // CHECK: gpu.printf "Begin\0A" -// GFX9: amdgpu.raw_buffer_atomic_fmax [[val]] -> [[buffer]][[[idx]]] +// GFX90A: amdgpu.raw_buffer_atomic_fmax [[val]] -> [[buffer]][[[idx]]] // GFX10: amdgpu.raw_buffer_atomic_fmax [[val]] -> [[buffer]][[[idx]]] // GFX11: amdgpu.raw_buffer_atomic_fmax [[val]] -> [[buffer]][[[idx]]] // GFX12: amdgpu.raw_buffer_atomic_fmax [[val]] -> [[buffer]][[[idx]]] @@ -74,7 +74,7 @@ func.func @atomic_fmax_f64(%val: f64, %buffer: memref<?xf64>, %idx: i32) { func.func @atomic_fadd(%val: f32, %buffer: memref<?xf32>, %idx: i32) { // CHECK: func @atomic_fadd -// GFX9: amdgpu.raw_buffer_atomic_fadd +// GFX90A: amdgpu.raw_buffer_atomic_fadd // GFX10: amdgpu.raw_buffer_load // GFX10: amdgpu.raw_buffer_atomic_cmpswap // GFX11: amdgpu.raw_buffer_atomic_fadd @@ -87,7 +87,7 @@ func.func @atomic_fadd(%val: f32, %buffer: memref<?xf32>, %idx: i32) { // CHECK: func @atomic_fadd_v2f16 func.func @atomic_fadd_v2f16(%val: vector<2xf16>, %buffer: memref<?xf16>, %idx: i32) { -// GFX9: amdgpu.raw_buffer_atomic_fadd +// GFX90A: amdgpu.raw_buffer_atomic_fadd // GFX10: amdgpu.raw_buffer_load // GFX10: amdgpu.raw_buffer_atomic_cmpswap // Note: the atomic operation itself will be done over i32, and then we use bitcasts @@ -108,8 +108,8 @@ func.func @atomic_fadd_v2f16(%val: vector<2xf16>, %buffer: memref<?xf16>, %idx: // CHECK: func @atomic_fadd_v2bf16 func.func @atomic_fadd_v2bf16(%val: vector<2xbf16>, %buffer: memref<?xbf16>, %idx: i32) { -// GFX9: amdgpu.raw_buffer_load -// GFX9: amdgpu.raw_buffer_atomic_cmpswap +// GFX90A: amdgpu.raw_buffer_load +// GFX90A: amdgpu.raw_buffer_atomic_cmpswap // GFX10: amdgpu.raw_buffer_load // GFX10: amdgpu.raw_buffer_atomic_cmpswap // GFX11: amdgpu.raw_buffer_load From 739274b7d330d38dd850627e3b5173dbafe16c10 Mon Sep 17 00:00:00 2001 From: Daniel Hernandez-Juarez Date: Fri, 28 Feb 2025 08:33:21 +0000 Subject: [PATCH 3/4] Addressing PR comments --- mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git 
a/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp index 04dc871941b98..5f24902d07b98 100644 --- a/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp +++ b/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp @@ -189,6 +189,8 @@ void mlir::amdgpu::populateAmdgpuEmulateAtomicsPatterns( } else { target.addIllegalOp<RawBufferAtomicFmaxOp>(); } + // TODO: refactor this to avoid hardcoding ISA version: + // https://github.com/llvm/llvm-project/issues/129206 // gfx950 has bf16 atomics if (chipset < Chipset(9, 5, 0)) { target.addDynamicallyLegalOp<RawBufferAtomicFaddOp>( From c63496acbdcd88aa7f338ddf04c57642c81fb5dc Mon Sep 17 00:00:00 2001 From: Daniel Hernandez-Juarez Date: Wed, 12 Mar 2025 12:48:25 +0100 Subject: [PATCH 4/4] Addressing PR comments Co-authored-by: Jakub Kuderski --- mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp index 5f24902d07b98..7dd4be66d2bd6 100644 --- a/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp +++ b/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp @@ -189,9 +189,8 @@ void mlir::amdgpu::populateAmdgpuEmulateAtomicsPatterns( } else { target.addIllegalOp<RawBufferAtomicFmaxOp>(); } - // TODO: refactor this to avoid hardcoding ISA version: - // https://github.com/llvm/llvm-project/issues/129206 - // gfx950 has bf16 atomics + // TODO(https://github.com/llvm/llvm-project/issues/129206): Refactor + // this to avoid hardcoding ISA version: gfx950 has bf16 atomics. if (chipset < Chipset(9, 5, 0)) { target.addDynamicallyLegalOp<RawBufferAtomicFaddOp>( [](RawBufferAtomicFaddOp op) -> bool {