diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index ba7e11a853347..3c564c76e0f9f 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -17577,3 +17577,13 @@ SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { AI->eraseFromParent(); return LI; } + +bool SITargetLowering::hasAndNot(SDValue Op) const { + // AND-NOT is only valid on uniform (SGPR) values; divergent values live in + // VGPRs. + if (Op->isDivergent()) + return false; + + EVT VT = Op.getValueType(); + return VT == MVT::i32 || VT == MVT::i64; +} diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index c42366a1c04c8..b348702a5bd8d 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -611,6 +611,7 @@ class SITargetLowering final : public AMDGPUTargetLowering { MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override; + bool hasAndNot(SDValue Op) const override; }; // Returns true if argument is a boolean value which is not serialized into diff --git a/llvm/test/CodeGen/AMDGPU/bfi_int.ll b/llvm/test/CodeGen/AMDGPU/bfi_int.ll index 201b97d479c68..6e9cd8807b379 100644 --- a/llvm/test/CodeGen/AMDGPU/bfi_int.ll +++ b/llvm/test/CodeGen/AMDGPU/bfi_int.ll @@ -135,9 +135,9 @@ define amdgpu_kernel void @s_bfi_sha256_ch(ptr addrspace(1) %out, i32 %x, i32 %y ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_xor_b32 s1, s1, s2 -; GFX7-NEXT: s_and_b32 s0, s0, s1 -; GFX7-NEXT: s_xor_b32 s0, s2, s0 +; GFX7-NEXT: s_andn2_b32 s2, s2, s0 +; GFX7-NEXT: s_and_b32 s0, s1, s0 +; GFX7-NEXT: s_or_b32 s0, s0, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm @@ -147,9 +147,9 @@ define amdgpu_kernel void @s_bfi_sha256_ch(ptr addrspace(1) %out, i32 %x, i32 %y ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_xor_b32 s1, s1, s2 -; GFX8-NEXT: s_and_b32 s0, s0, s1 -; GFX8-NEXT: s_xor_b32 s0, s2, s0 +; GFX8-NEXT: s_andn2_b32 s2, s2, s0 +; GFX8-NEXT: s_and_b32 s0, s1, s0 +; GFX8-NEXT: s_or_b32 s0, s0, s2 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 @@ -163,9 +163,9 @@ define amdgpu_kernel void @s_bfi_sha256_ch(ptr addrspace(1) %out, i32 %x, i32 %y ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_xor_b32 s1, s1, s2 -; GFX10-NEXT: s_and_b32 s0, s0, s1 -; GFX10-NEXT: s_xor_b32 s0, s2, s0 +; GFX10-NEXT: s_andn2_b32 s2, s2, s0 +; GFX10-NEXT: s_and_b32 s0, s1, s0 +; GFX10-NEXT: s_or_b32 s0, s0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm @@ -317,19 +317,26 @@ entry: define amdgpu_ps float @s_s_v_bfi_sha256_ch(i32 inreg %x, i32 inreg %y, i32 %z) { ; GFX7-LABEL: s_s_v_bfi_sha256_ch: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: v_bfi_b32 v0, v1, s1, v0 +; GFX7-NEXT: s_not_b32 s1, s1 +; GFX7-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX7-NEXT: s_nand_b32 s0, s1, s0 +; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_s_v_bfi_sha256_ch: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mov_b32_e32 v1, s0 -; GFX8-NEXT: v_bfi_b32 v0, v1, s1, v0 +; 
GFX8-NEXT: s_not_b32 s1, s1 +; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX8-NEXT: s_nand_b32 s0, s1, s0 +; GFX8-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_s_v_bfi_sha256_ch: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: v_bfi_b32 v0, s0, s1, v0 +; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX10-NEXT: s_not_b32 s1, s1 +; GFX10-NEXT: s_nand_b32 s0, s1, s0 +; GFX10-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX8-GISEL-LABEL: s_s_v_bfi_sha256_ch: @@ -350,30 +357,40 @@ entry: ret float %cast } -define amdgpu_ps float @s_v_v_bfi_sha256_ch(i32 inreg %x, i32 %y, i32 %z) { +define amdgpu_ps float @s_v_v_bfi_sha256_ch(i32 inreg %x, i32 inreg %y, i32 %z) { ; GFX7-LABEL: s_v_v_bfi_sha256_ch: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: v_bfi_b32 v0, s0, v0, v1 +; GFX7-NEXT: s_not_b32 s1, s1 +; GFX7-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX7-NEXT: s_nand_b32 s0, s1, s0 +; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_v_v_bfi_sha256_ch: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_bfi_b32 v0, s0, v0, v1 +; GFX8-NEXT: s_not_b32 s1, s1 +; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX8-NEXT: s_nand_b32 s0, s1, s0 +; GFX8-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_v_v_bfi_sha256_ch: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: v_bfi_b32 v0, s0, v0, v1 +; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX10-NEXT: s_not_b32 s1, s1 +; GFX10-NEXT: s_nand_b32 s0, s1, s0 +; GFX10-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX8-GISEL-LABEL: s_v_v_bfi_sha256_ch: ; GFX8-GISEL: ; %bb.0: ; %entry -; GFX8-GISEL-NEXT: v_bfi_b32 v0, s0, v0, v1 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-GISEL-NEXT: v_bfi_b32 v0, v1, s1, v0 ; GFX8-GISEL-NEXT: ; return to shader part epilog ; ; GFX10-GISEL-LABEL: s_v_v_bfi_sha256_ch: ; GFX10-GISEL: ; %bb.0: ; %entry -; GFX10-GISEL-NEXT: v_bfi_b32 v0, s0, v0, v1 +; GFX10-GISEL-NEXT: v_bfi_b32 v0, s0, s1, v0 ; GFX10-GISEL-NEXT: ; return to shader part epilog entry: %xor0 = xor i32 %y, %z @@ -1008,24 +1025,32 @@ define amdgpu_ps <2 x float> @v_s_s_bitselect_i64_pat_1(i64 %a, i64 inreg %b, i6 define amdgpu_ps <2 x float> @s_s_v_bitselect_i64_pat_1(i64 inreg %a, i64 inreg %b, i64 %mask) { ; GFX7-LABEL: s_s_v_bitselect_i64_pat_1: ; GFX7: ; %bb.0: -; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: v_bfi_b32 v1, s3, v2, v1 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-NEXT: v_bfi_b32 v0, s2, v2, v0 +; GFX7-NEXT: s_not_b64 s[0:1], s[0:1] +; GFX7-NEXT: v_or_b32_e32 v1, s3, v1 +; GFX7-NEXT: v_or_b32_e32 v0, s2, v0 +; GFX7-NEXT: s_nand_b64 s[0:1], s[0:1], s[2:3] +; GFX7-NEXT: v_and_b32_e32 v1, s1, v1 +; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_s_v_bitselect_i64_pat_1: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: v_bfi_b32 v1, s3, v2, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: v_bfi_b32 v0, s2, v2, v0 +; GFX8-NEXT: s_not_b64 s[0:1], s[0:1] +; GFX8-NEXT: v_or_b32_e32 v1, s3, v1 +; GFX8-NEXT: v_or_b32_e32 v0, s2, v0 +; GFX8-NEXT: s_nand_b64 s[0:1], s[0:1], s[2:3] +; GFX8-NEXT: v_and_b32_e32 v1, s1, v1 +; GFX8-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_s_v_bitselect_i64_pat_1: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_bfi_b32 v0, s2, s0, v0 -; GFX10-NEXT: v_bfi_b32 v1, s3, s1, v1 +; GFX10-NEXT: v_or_b32_e32 v1, s3, v1 +; GFX10-NEXT: v_or_b32_e32 v0, s2, v0 +; GFX10-NEXT: s_not_b64 s[0:1], 
s[0:1] +; GFX10-NEXT: s_nand_b64 s[0:1], s[0:1], s[2:3] +; GFX10-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX10-NEXT: v_and_b32_e32 v1, s1, v1 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX8-GISEL-LABEL: s_s_v_bitselect_i64_pat_1: @@ -1495,9 +1520,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) { ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] ; GFX7-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] -; GFX7-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] +; GFX7-NEXT: s_andn2_b64 s[4:5], s[4:5], s[2:3] +; GFX7-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] ; GFX7-NEXT: s_add_u32 s0, s0, 10 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -1510,9 +1535,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) { ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] ; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] -; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] +; GFX8-NEXT: s_andn2_b64 s[4:5], s[4:5], s[2:3] +; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] ; GFX8-NEXT: s_add_u32 s0, s0, 10 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -1526,9 +1551,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) { ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] ; GFX10-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] -; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] +; GFX10-NEXT: s_andn2_b64 s[4:5], s[4:5], s[2:3] +; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] ; GFX10-NEXT: s_add_u32 s0, s0, 10 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 @@ -1583,9 +1608,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) { ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] ; GFX7-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] -; GFX7-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] +; GFX7-NEXT: s_andn2_b64 s[4:5], s[4:5], s[2:3] +; GFX7-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] ; GFX7-NEXT: s_add_u32 s0, s0, 10 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -1598,9 +1623,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) { ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] ; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] -; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] +; GFX8-NEXT: s_andn2_b64 s[4:5], s[4:5], s[2:3] +; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] ; GFX8-NEXT: s_add_u32 s0, s0, 10 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -1614,9 +1639,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) { ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] ; GFX10-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] -; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] +; GFX10-NEXT: s_andn2_b64 s[4:5], s[4:5], s[2:3] +; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] ; GFX10-NEXT: s_add_u32 s0, s0, 10 ; GFX10-NEXT: s_addc_u32 
s1, s1, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/commute-compares.ll b/llvm/test/CodeGen/AMDGPU/commute-compares.ll index ae8080cf9f06a..f6deb6c6df422 100644 --- a/llvm/test/CodeGen/AMDGPU/commute-compares.ll +++ b/llvm/test/CodeGen/AMDGPU/commute-compares.ll @@ -541,19 +541,20 @@ define amdgpu_kernel void @commute_sgt_neg1_i64(ptr addrspace(1) %out, ptr addrs ; GCN-LABEL: commute_sgt_neg1_i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: s_mov_b64 s[10:11], s[6:7] ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] -; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: s_mov_b64 s[8:9], s[2:3] +; GCN-NEXT: buffer_load_dword v3, v[1:2], s[8:11], 0 addr64 offset:4 +; GCN-NEXT: s_mov_b64 s[4:5], s[0:1] ; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[3:4] -; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; GCN-NEXT: v_ashrrev_i32_e32 v0, 31, v3 +; GCN-NEXT: v_not_b32_e32 v0, v0 +; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 ; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll b/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll index 69724aa75af4f..56990a7b73310 100644 --- a/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll +++ b/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll @@ -39,11 +39,10 @@ define i32 @s_in32(i32 inreg %x, i32 inreg %y, i32 inreg %mask) { ; GCN-LABEL: s_in32: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_xor_b32 s0, s0, s1 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GCN-NEXT: s_and_not1_b32 s1, s1, s2 ; GCN-NEXT: s_and_b32 s0, s0, s2 -; GCN-NEXT: s_xor_b32 s0, s0, s1 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_setpc_b64 s[30:31] %n0 = xor i32 %x, %y @@ -56,11 +55,10 @@ define i64 @s_in64(i64 inreg %x, i64 inreg %y, i64 inreg %mask) { ; GCN-LABEL: s_in64: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GCN-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[16:17] ; GCN-NEXT: s_and_b64 s[0:1], s[0:1], s[16:17] -; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] ; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GCN-NEXT: s_setpc_b64 s[30:31] %n0 = xor i64 %x, %y @@ -75,11 +73,10 @@ define i32 @s_in_commutativity_0_0_1(i32 inreg %x, i32 inreg %y, i32 inreg %mask ; GCN-LABEL: s_in_commutativity_0_0_1: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_xor_b32 s0, s0, s1 +; GCN-NEXT: 
s_and_not1_b32 s1, s1, s2 +; GCN-NEXT: s_and_b32 s0, s0, s2 ; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GCN-NEXT: s_and_b32 s0, s2, s0 -; GCN-NEXT: s_xor_b32 s0, s0, s1 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_setpc_b64 s[30:31] %n0 = xor i32 %x, %y @@ -92,11 +89,10 @@ define i32 @s_in_commutativity_0_1_0(i32 inreg %x, i32 inreg %y, i32 inreg %mask ; GCN-LABEL: s_in_commutativity_0_1_0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_xor_b32 s0, s0, s1 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GCN-NEXT: s_and_not1_b32 s1, s1, s2 ; GCN-NEXT: s_and_b32 s0, s0, s2 -; GCN-NEXT: s_xor_b32 s0, s1, s0 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_setpc_b64 s[30:31] %n0 = xor i32 %x, %y @@ -109,11 +105,10 @@ define i32 @in_commutativity_0_1_1(i32 inreg %x, i32 inreg %y, i32 inreg %mask) ; GCN-LABEL: in_commutativity_0_1_1: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_xor_b32 s0, s0, s1 +; GCN-NEXT: s_and_not1_b32 s1, s1, s2 +; GCN-NEXT: s_and_b32 s0, s0, s2 ; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GCN-NEXT: s_and_b32 s0, s2, s0 -; GCN-NEXT: s_xor_b32 s0, s1, s0 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_setpc_b64 s[30:31] %n0 = xor i32 %x, %y @@ -126,11 +121,10 @@ define i32 @s_in_commutativity_1_0_0(i32 inreg %x, i32 inreg %y, i32 inreg %mask ; GCN-LABEL: s_in_commutativity_1_0_0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_xor_b32 s1, s0, s1 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GCN-NEXT: s_and_not1_b32 s0, s0, s2 ; GCN-NEXT: s_and_b32 s1, s1, s2 -; GCN-NEXT: s_xor_b32 s0, s1, s0 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GCN-NEXT: s_or_b32 s0, s1, s0 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_setpc_b64 s[30:31] %n0 = xor i32 %x, %y @@ -143,11 +137,10 @@ define i32 @s_in_commutativity_1_0_1(i32 inreg %x, i32 inreg %y, i32 inreg %mask ; GCN-LABEL: s_in_commutativity_1_0_1: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_xor_b32 s1, s0, s1 +; GCN-NEXT: s_and_not1_b32 s0, s0, s2 +; GCN-NEXT: s_and_b32 s1, s1, s2 ; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GCN-NEXT: s_and_b32 s1, s2, s1 -; GCN-NEXT: s_xor_b32 s0, s1, s0 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: s_or_b32 s0, s1, s0 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_setpc_b64 s[30:31] %n0 = xor i32 %x, %y @@ -160,11 +153,10 @@ define i32 @s_in_commutativity_1_1_0(i32 inreg %x, i32 inreg %y, i32 inreg %mask ; GCN-LABEL: s_in_commutativity_1_1_0: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_xor_b32 s1, s0, s1 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GCN-NEXT: s_and_not1_b32 s0, s0, s2 ; GCN-NEXT: s_and_b32 s1, s1, s2 -; GCN-NEXT: s_xor_b32 s0, s0, s1 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GCN-NEXT: s_or_b32 s0, s1, s0 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_setpc_b64 s[30:31] %n0 = xor i32 %x, %y @@ -177,11 +169,10 @@ define i32 @s_in_commutativity_1_1_1(i32 inreg %x, i32 inreg %y, i32 inreg %mask ; GCN-LABEL: s_in_commutativity_1_1_1: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_xor_b32 s1, s0, s1 +; GCN-NEXT: s_and_not1_b32 s0, s0, s2 +; GCN-NEXT: s_and_b32 s1, s1, s2 ; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GCN-NEXT: s_and_b32 s1, s2, s1 -; GCN-NEXT: s_xor_b32 s0, s0, s1 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: s_or_b32 s0, s1, s0 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_setpc_b64 s[30:31] %n0 = xor i32 %x, %y @@ -197,11 +188,10 @@ define i32 @s_in_complex_y0(i32 inreg %x, i32 inreg %y_hi, i32 inreg %y_low, i32 ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_and_b32 s1, s1, s2 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GCN-NEXT: s_xor_b32 s0, s0, s1 ; GCN-NEXT: s_and_b32 s0, s0, s3 +; GCN-NEXT: s_and_not1_b32 s1, s1, s3 ; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GCN-NEXT: s_xor_b32 s0, s0, s1 +; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_setpc_b64 s[30:31] %y = and i32 %y_hi, %y_low @@ -216,11 +206,10 @@ define i32 @s_in_complex_y1(i32 inreg %x, i32 inreg %y_hi, i32 inreg %y_low, i32 ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_and_b32 s1, s1, s2 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GCN-NEXT: s_xor_b32 s0, s0, s1 ; GCN-NEXT: s_and_b32 s0, s0, s3 +; GCN-NEXT: s_and_not1_b32 s1, s1, s3 ; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GCN-NEXT: s_xor_b32 s0, s1, s0 +; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_setpc_b64 s[30:31] %y = and i32 %y_hi, %y_low @@ -237,10 +226,10 @@ define i32 @s_in_complex_m0(i32 inreg %x, i32 inreg %y, i32 inreg %m_a, i32 inre ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_xor_b32 s2, s2, s3 -; GCN-NEXT: s_xor_b32 s0, s0, s1 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GCN-NEXT: s_and_not1_b32 s1, s1, s2 ; GCN-NEXT: s_and_b32 s0, s0, s2 -; GCN-NEXT: s_xor_b32 s0, s0, s1 +; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -256,10 +245,10 @@ define i32 @s_in_complex_m1(i32 inreg %x, i32 inreg %y, i32 inreg %m_a, i32 inre ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_xor_b32 s2, s2, s3 -; GCN-NEXT: s_xor_b32 s0, s0, s1 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GCN-NEXT: s_and_b32 s0, s2, s0 -; GCN-NEXT: s_xor_b32 s0, s0, s1 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GCN-NEXT: s_and_not1_b32 s1, s1, s2 +; GCN-NEXT: s_and_b32 s0, s0, s2 +; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -278,10 +267,10 @@ define i32 @s_in_complex_y0_m0(i32 inreg %x, i32 
inreg %y_hi, i32 inreg %y_low, ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_and_b32 s1, s1, s2 ; GCN-NEXT: s_xor_b32 s2, s3, s16 -; GCN-NEXT: s_xor_b32 s0, s0, s1 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GCN-NEXT: s_and_not1_b32 s1, s1, s2 ; GCN-NEXT: s_and_b32 s0, s0, s2 -; GCN-NEXT: s_xor_b32 s0, s0, s1 +; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -299,10 +288,10 @@ define i32 @s_in_complex_y1_m0(i32 inreg %x, i32 inreg %y_hi, i32 inreg %y_low, ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_and_b32 s1, s1, s2 ; GCN-NEXT: s_xor_b32 s2, s3, s16 -; GCN-NEXT: s_xor_b32 s0, s0, s1 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GCN-NEXT: s_and_not1_b32 s1, s1, s2 ; GCN-NEXT: s_and_b32 s0, s0, s2 -; GCN-NEXT: s_xor_b32 s0, s1, s0 +; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -320,10 +309,10 @@ define i32 @s_in_complex_y0_m1(i32 inreg %x, i32 inreg %y_hi, i32 inreg %y_low, ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_and_b32 s1, s1, s2 ; GCN-NEXT: s_xor_b32 s2, s3, s16 -; GCN-NEXT: s_xor_b32 s0, s0, s1 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GCN-NEXT: s_and_b32 s0, s2, s0 -; GCN-NEXT: s_xor_b32 s0, s0, s1 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GCN-NEXT: s_and_not1_b32 s1, s1, s2 +; GCN-NEXT: s_and_b32 s0, s0, s2 +; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -341,10 +330,10 @@ define i32 @s_in_complex_y1_m1(i32 inreg %x, i32 inreg %y_hi, i32 inreg %y_low, ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_and_b32 s1, s1, s2 ; GCN-NEXT: s_xor_b32 s2, s3, s16 -; GCN-NEXT: s_xor_b32 s0, s0, s1 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GCN-NEXT: s_and_b32 s0, s2, s0 -; GCN-NEXT: s_xor_b32 s0, s1, s0 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GCN-NEXT: s_and_not1_b32 s1, s1, s2 +; GCN-NEXT: s_and_b32 s0, s0, s2 +; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -444,11 +433,10 @@ define i32 @in_constant_varx_42(i32 inreg %x, i32 inreg %y, i32 inreg %mask) { ; GCN-LABEL: in_constant_varx_42: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_xor_b32 s0, s0, 42 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GCN-NEXT: s_and_not1_b32 s1, 42, s2 ; GCN-NEXT: s_and_b32 s0, s0, s2 -; GCN-NEXT: s_xor_b32 s0, s0, 42 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_setpc_b64 s[30:31] %n0 = xor i32 %x, 42 @@ -480,11 +468,10 @@ define i32 @s_in_constant_varx_42_invmask(i32 inreg %x, i32 inreg %y, i32 inreg ; GCN-LABEL: 
s_in_constant_varx_42_invmask: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_xor_b32 s0, s0, 42 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GCN-NEXT: s_and_b32 s1, s2, 42 ; GCN-NEXT: s_and_not1_b32 s0, s0, s2 -; GCN-NEXT: s_xor_b32 s0, s0, 42 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_setpc_b64 s[30:31] %notmask = xor i32 %mask, -1 @@ -577,11 +564,10 @@ define i32 @s_in_constant_42_vary(i32 inreg %x, i32 inreg %y, i32 inreg %mask) { ; GCN-LABEL: s_in_constant_42_vary: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_xor_b32 s0, s1, 42 +; GCN-NEXT: s_and_not1_b32 s0, s1, s2 +; GCN-NEXT: s_and_b32 s1, s2, 42 ; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GCN-NEXT: s_and_b32 s0, s0, s2 -; GCN-NEXT: s_xor_b32 s0, s0, s1 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: s_or_b32 s0, s1, s0 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_setpc_b64 s[30:31] %n0 = xor i32 42, %y @@ -613,11 +599,10 @@ define i32 @s_in_constant_42_vary_invmask(i32 inreg %x, i32 inreg %y, i32 inreg ; GCN-LABEL: s_in_constant_42_vary_invmask: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_xor_b32 s0, s1, 42 +; GCN-NEXT: s_and_b32 s0, s1, s2 +; GCN-NEXT: s_and_not1_b32 s1, 42, s2 ; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GCN-NEXT: s_and_not1_b32 s0, s0, s2 -; GCN-NEXT: s_xor_b32 s0, s0, s1 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: s_or_b32 s0, s1, s0 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_setpc_b64 s[30:31] %notmask = xor i32 %mask, -1
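
hasAndNot is the TargetLowering hook consulted by DAGCombiner's masked-merge
unfold (DAGCombiner::unfoldMaskedMerge), which rewrites ((x ^ y) & m) ^ y into
(x & m) | (y & ~m) when the target reports a native AND-NOT. A minimal IR
sketch of the pattern these checks exercise, mirroring @s_in32 from
unfold-masked-merge-scalar-variablemask.ll above:

  define i32 @s_in32(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
    %n0 = xor i32 %x, %y      ; x ^ y
    %n1 = and i32 %n0, %mask  ; (x ^ y) & m
    %r = xor i32 %n1, %y      ; ((x ^ y) & m) ^ y  ==  (x & m) | (y & ~m)
    ret i32 %r
  }

With the hook returning true for uniform i32/i64 values, the scalar path now
selects s_and_b32 + s_andn2_b32 (s_and_not1_b32 on newer targets) + s_or_b32
in place of the previous xor/and/xor chain, as the updated checks show; fully
divergent values are rejected by the Op->isDivergent() test, since AND-NOT
exists only on the SALU.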