diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 084b788d51828..993b75256fa23 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -749,7 +749,7 @@ class LLVM_ABI TargetLoweringBase {
 
   /// Return true if the heuristic to prefer icmp eq zero should be used in code
   /// gen prepare.
-  virtual bool preferZeroCompareBranch() const { return false; }
+  virtual bool preferZeroCompareBranch(BranchInst *) const { return false; }
 
   /// Return true if it is cheaper to split the store of a merged int val
   /// from a pair of smaller values into multiple stores.
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index d9d41f1d72e35..90ad9949fb772 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -8630,7 +8630,7 @@ static bool optimizeBranch(BranchInst *Branch, const TargetLowering &TLI,
   //  br %c, bla, blb
   // Creating the cmp to zero can be better for the backend, especially if the
   // lshr produces flags that can be used automatically.
-  if (!TLI.preferZeroCompareBranch() || !Branch->isConditional())
+  if (!TLI.preferZeroCompareBranch(Branch) || !Branch->isConditional())
     return false;
 
   ICmpInst *Cmp = dyn_cast<ICmpInst>(Branch->getCondition());
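For context, the rewrite this hook now gates per branch is the one described in the comment above: a bounded unsigned compare becomes a shift plus a compare against zero, so the backend can branch directly on the flags produced by the shift. A minimal illustrative IR sketch of the before/after shape, reconstructed from that comment (not part of the patch):

    ; before
    %c = icmp ult i32 %x, 8
    br i1 %c, label %bla, label %blb

    ; after: (x >> 3) == 0 iff x < 8
    %t = lshr i32 %x, 3
    %c = icmp eq i32 %t, 0
    br i1 %c, label %bla, label %blb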
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index ff23f76fadccd..46f982fe21d5a 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -28508,6 +28508,16 @@ Register AArch64TargetLowering::getExceptionSelectorRegister(
   return AArch64::X1;
 }
 
+bool AArch64TargetLowering::preferZeroCompareBranch(BranchInst *Branch) const {
+  // If we can use Armv9.6 CB instructions, prefer that over zero compare
+  // branches.
+
+  // If we have speculative load hardening enabled, we cannot use
+  // zero compare branches.
+  return !Subtarget->hasCMPBR() && !Branch->getFunction()->hasFnAttribute(
+                                       Attribute::SpeculativeLoadHardening);
+}
+
 bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(
     const Instruction &AndI) const {
   // Only sink 'and' mask to cmp use block if it is masking a single bit, since
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 7b1de3d3254f2..26fa599655a48 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -366,6 +366,8 @@ class AArch64TargetLowering : public TargetLowering {
     return true;
   }
 
+  bool preferZeroCompareBranch(BranchInst *) const override;
+
   bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override;
 
   bool hasAndNotCompare(SDValue V) const override {
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 5f4aef55b22c9..ff55dd8d1b06d 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -605,7 +605,7 @@ class VectorType;
 
     Sched::Preference getSchedulingPreference(SDNode *N) const override;
 
-    bool preferZeroCompareBranch() const override { return true; }
+    bool preferZeroCompareBranch(BranchInst *) const override { return true; }
 
     bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index a5d735c407e5c..71dd861b17759 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -225,7 +225,7 @@ class RISCVTargetLowering : public TargetLowering {
 
   unsigned getCustomCtpopCost(EVT VT, ISD::CondCode Cond) const override;
 
-  bool preferZeroCompareBranch() const override { return true; }
+  bool preferZeroCompareBranch(BranchInst *) const override { return true; }
 
   // Note that one specific case requires fence insertion for an
   // AtomicCmpXchgInst but is handled via the RISCVZacasABIFix pass rather
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
index 1866962e17587..b5e497d773bd9 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -471,7 +471,7 @@ class SystemZTargetLowering : public TargetLowering {
   }
   bool isCheapToSpeculateCtlz(Type *) const override { return true; }
   bool isCheapToSpeculateCttz(Type *) const override { return true; }
-  bool preferZeroCompareBranch() const override { return true; }
+  bool preferZeroCompareBranch(BranchInst *) const override { return true; }
   bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override {
     ConstantInt* Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
     return Mask && Mask->getValue().isIntN(16);
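Passing the BranchInst lets a target decide per branch rather than globally; the AArch64 override above keys off the parent function's attributes. As a hedged sketch (hypothetical function name, not from this patch), this is the kind of IR for which the new override returns false, since speculative_load_hardening is the IR spelling of Attribute::SpeculativeLoadHardening:

    define i32 @slh(i32 %u) speculative_load_hardening {
    entry:
      %c = icmp ult i32 %u, 16
      br i1 %c, label %a, label %b
    a:
      ret i32 0
    b:
      ret i32 1
    }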
diff --git a/llvm/test/CodeGen/AArch64/branch-on-zero.ll b/llvm/test/CodeGen/AArch64/branch-on-zero.ll
new file mode 100644
index 0000000000000..efd4d2b319c55
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/branch-on-zero.ll
@@ -0,0 +1,156 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64 %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple=aarch64 -global-isel %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+
+define i32 @test_lshr(ptr nocapture %x, ptr nocapture readonly %y, i32 %n) {
+; CHECK-SD-LABEL: test_lshr:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    lsr w8, w2, #2
+; CHECK-SD-NEXT:    cbz w8, .LBB0_2
+; CHECK-SD-NEXT:  .LBB0_1: // %while.body
+; CHECK-SD-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-SD-NEXT:    ldr w9, [x1], #4
+; CHECK-SD-NEXT:    subs w8, w8, #1
+; CHECK-SD-NEXT:    lsl w9, w9, #1
+; CHECK-SD-NEXT:    str w9, [x0], #4
+; CHECK-SD-NEXT:    b.ne .LBB0_1
+; CHECK-SD-NEXT:  .LBB0_2: // %while.end
+; CHECK-SD-NEXT:    mov w0, wzr
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_lshr:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    lsr w8, w2, #2
+; CHECK-GI-NEXT:    cbz w8, .LBB0_2
+; CHECK-GI-NEXT:  .LBB0_1: // %while.body
+; CHECK-GI-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-GI-NEXT:    ldr w9, [x1], #4
+; CHECK-GI-NEXT:    add x10, x0, #4
+; CHECK-GI-NEXT:    subs w8, w8, #1
+; CHECK-GI-NEXT:    lsl w9, w9, #1
+; CHECK-GI-NEXT:    str w9, [x0]
+; CHECK-GI-NEXT:    mov x0, x10
+; CHECK-GI-NEXT:    b.ne .LBB0_1
+; CHECK-GI-NEXT:  .LBB0_2: // %while.end
+; CHECK-GI-NEXT:    mov w0, wzr
+; CHECK-GI-NEXT:    ret
+entry:
+  %shr = lshr i32 %n, 2
+  %tobool.not4 = icmp eq i32 %shr, 0
+  br i1 %tobool.not4, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %c.07 = phi i32 [ %dec, %while.body ], [ %shr, %entry ]
+  %x.addr.06 = phi ptr [ %incdec.ptr1, %while.body ], [ %x, %entry ]
+  %y.addr.05 = phi ptr [ %incdec.ptr, %while.body ], [ %y, %entry ]
+  %incdec.ptr = getelementptr inbounds i32, ptr %y.addr.05, i32 1
+  %0 = load i32, ptr %y.addr.05, align 4
+  %mul = shl nsw i32 %0, 1
+  %incdec.ptr1 = getelementptr inbounds i32, ptr %x.addr.06, i32 1
+  store i32 %mul, ptr %x.addr.06, align 4
+  %dec = add nsw i32 %c.07, -1
+  %tobool.not = icmp eq i32 %dec, 0
+  br i1 %tobool.not, label %while.end, label %while.body
+
+while.end:                                        ; preds = %while.body, %entry
+  ret i32 0
+}
+
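+; test_lshr2 below starts from an icmp ult guard rather than an explicit
+; lshr; with preferZeroCompareBranch returning true, optimizeBranch is
+; expected to rewrite the guard into a zero compare of the shifted value,
+; so the generated code should match test_lshr above.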
+define i32 @test_lshr2(ptr nocapture %x, ptr nocapture readonly %y, i32 %n) {
+; CHECK-SD-LABEL: test_lshr2:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    lsr w8, w2, #2
+; CHECK-SD-NEXT:    cbz w8, .LBB1_2
+; CHECK-SD-NEXT:  .LBB1_1: // %while.body
+; CHECK-SD-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-SD-NEXT:    ldr w9, [x1], #4
+; CHECK-SD-NEXT:    subs w8, w8, #1
+; CHECK-SD-NEXT:    lsl w9, w9, #1
+; CHECK-SD-NEXT:    str w9, [x0], #4
+; CHECK-SD-NEXT:    b.ne .LBB1_1
+; CHECK-SD-NEXT:  .LBB1_2: // %while.end
+; CHECK-SD-NEXT:    mov w0, wzr
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_lshr2:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    lsr w8, w2, #2
+; CHECK-GI-NEXT:    cbz w8, .LBB1_2
+; CHECK-GI-NEXT:  .LBB1_1: // %while.body
+; CHECK-GI-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-GI-NEXT:    ldr w9, [x1], #4
+; CHECK-GI-NEXT:    add x10, x0, #4
+; CHECK-GI-NEXT:    subs w8, w8, #1
+; CHECK-GI-NEXT:    lsl w9, w9, #1
+; CHECK-GI-NEXT:    str w9, [x0]
+; CHECK-GI-NEXT:    mov x0, x10
+; CHECK-GI-NEXT:    b.ne .LBB1_1
+; CHECK-GI-NEXT:  .LBB1_2: // %while.end
+; CHECK-GI-NEXT:    mov w0, wzr
+; CHECK-GI-NEXT:    ret
+entry:
+  %tobool.not4 = icmp ult i32 %n, 4
+  br i1 %tobool.not4, label %while.end, label %while.body.preheader
+
+while.body.preheader:                             ; preds = %entry
+  %shr = lshr i32 %n, 2
+  br label %while.body
+
+while.body:                                       ; preds = %while.body.preheader, %while.body
+  %c.07 = phi i32 [ %dec, %while.body ], [ %shr, %while.body.preheader ]
+  %x.addr.06 = phi ptr [ %incdec.ptr1, %while.body ], [ %x, %while.body.preheader ]
+  %y.addr.05 = phi ptr [ %incdec.ptr, %while.body ], [ %y, %while.body.preheader ]
+  %incdec.ptr = getelementptr inbounds i32, ptr %y.addr.05, i32 1
+  %0 = load i32, ptr %y.addr.05, align 4
+  %mul = shl nsw i32 %0, 1
+  %incdec.ptr1 = getelementptr inbounds i32, ptr %x.addr.06, i32 1
+  store i32 %mul, ptr %x.addr.06, align 4
+  %dec = add nsw i32 %c.07, -1
+  %tobool.not = icmp eq i32 %dec, 0
+  br i1 %tobool.not, label %while.end, label %while.body
+
+while.end:                                        ; preds = %while.body, %entry
+  ret i32 0
+}
+
+
+define i32 @lshr(i32 %u) {
+; CHECK-LABEL: lshr:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp x30, x19, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w19, -8
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    mov w19, w0
+; CHECK-NEXT:    lsr w0, w0, #4
+; CHECK-NEXT:    mov w8, w19
+; CHECK-NEXT:    cbz w0, .LBB2_2
+; CHECK-NEXT:  // %bb.1: // %if.then
+; CHECK-NEXT:    bl use
+; CHECK-NEXT:    add w8, w19, w19, lsl #1
+; CHECK-NEXT:  .LBB2_2: // %if.end
+; CHECK-NEXT:    sub w9, w19, #7
+; CHECK-NEXT:    cmp w8, w9
+; CHECK-NEXT:    cset w0, hi
+; CHECK-NEXT:    ldp x30, x19, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %cmp.not = icmp ult i32 %u, 16
+  br i1 %cmp.not, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+  %shr = lshr i32 %u, 4
+  tail call void @use(i32 noundef %shr)
+  %mul = mul i32 %u, 3
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  %u.addr.0 = phi i32 [ %mul, %if.then ], [ %u, %entry ]
+  %sub = add i32 %u, -7
+  %cmp1 = icmp ugt i32 %u.addr.0, %sub
+  %conv = zext i1 %cmp1 to i32
+  ret i32 %conv
+}
+
+declare void @use(i32)
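Since the AArch64 override also declines the rewrite whenever the subtarget has the Armv9.6 CB instructions, a natural follow-up would pin that path down as well. A hedged sketch of an additional RUN line (assuming the feature is exposed as +cmpbr, matching the hasCMPBR() predicate; the check prefix is hypothetical):

    ; RUN: llc -mtriple=aarch64 -mattr=+cmpbr %s -o - | FileCheck %s --check-prefix=CHECK-CMPBR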