From 8fa8f130e68a6c3bfdc7890e86a4c62653321884 Mon Sep 17 00:00:00 2001
From: Guy David
Date: Wed, 28 May 2025 19:55:44 +0300
Subject: [PATCH] [AArch64] Spare N2I roundtrip when splatting float comparison

Transform `select_cc t1, t2, -1, 0` for floats into a vector comparison
that generates a mask, which can later be combined with vectorized DUPs.
---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 225 ++++++++---
 llvm/lib/Target/AArch64/AArch64ISelLowering.h |   4 +-
 .../CodeGen/AArch64/arm64-neon-v1i1-setcc.ll  |   6 +-
 .../CodeGen/AArch64/build-vector-dup-simd.ll  | 378 ++++++++++++++++++
 4 files changed, 557 insertions(+), 56 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/build-vector-dup-simd.ll

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 9f51caef6d228..58e77ea0e6dcc 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -11002,10 +11002,126 @@ SDValue AArch64TargetLowering::LowerSETCCCARRY(SDValue Op,
                      Cmp.getValue(1));
 }
 
-SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
-                                              SDValue RHS, SDValue TVal,
-                                              SDValue FVal, const SDLoc &dl,
-                                              SelectionDAG &DAG) const {
+/// Emit vector comparison for floating-point values, producing a mask.
+static SDValue emitVectorComparison(SDValue LHS, SDValue RHS,
+                                    AArch64CC::CondCode CC, bool NoNans, EVT VT,
+                                    const SDLoc &DL, SelectionDAG &DAG) {
+  EVT SrcVT = LHS.getValueType();
+  assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
+         "function only supposed to emit natural comparisons");
+
+  switch (CC) {
+  default:
+    return SDValue();
+  case AArch64CC::NE: {
+    SDValue Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, DL, VT, LHS, RHS);
+    // Use vector semantics for the inversion to potentially save a copy between
+    // SIMD and regular registers.
+    if (!LHS.getValueType().isVector()) {
+      EVT VecVT =
+          EVT::getVectorVT(*DAG.getContext(), VT, 128 / VT.getSizeInBits());
+      SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
+      SDValue MaskVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT,
+                                    DAG.getUNDEF(VecVT), Fcmeq, Zero);
+      SDValue InvertedMask = DAG.getNOT(DL, MaskVec, VecVT);
+      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, InvertedMask, Zero);
+    }
+    return DAG.getNOT(DL, Fcmeq, VT);
+  }
+  case AArch64CC::EQ:
+    return DAG.getNode(AArch64ISD::FCMEQ, DL, VT, LHS, RHS);
+  case AArch64CC::GE:
+    return DAG.getNode(AArch64ISD::FCMGE, DL, VT, LHS, RHS);
+  case AArch64CC::GT:
+    return DAG.getNode(AArch64ISD::FCMGT, DL, VT, LHS, RHS);
+  case AArch64CC::LE:
+    if (!NoNans)
+      return SDValue();
+    // If we ignore NaNs then we can use the LS implementation.
+    [[fallthrough]];
+  case AArch64CC::LS:
+    return DAG.getNode(AArch64ISD::FCMGE, DL, VT, RHS, LHS);
+  case AArch64CC::LT:
+    if (!NoNans)
+      return SDValue();
+    // If we ignore NaNs then we can use the MI implementation.
+    [[fallthrough]];
+  case AArch64CC::MI:
+    return DAG.getNode(AArch64ISD::FCMGT, DL, VT, RHS, LHS);
+  }
+}
+
+/// For SELECT_CC, when the true/false values are (-1, 0) and the compared
+/// values are scalars, try to emit a mask-generating vector instruction.
+static SDValue emitFloatCompareMask(SDValue LHS, SDValue RHS, SDValue TVal,
+                                    SDValue FVal, ISD::CondCode CC, bool NoNaNs,
+                                    const SDLoc &DL, SelectionDAG &DAG) {
+  assert(!LHS.getValueType().isVector());
+  assert(!RHS.getValueType().isVector());
+
+  auto *CTVal = dyn_cast<ConstantSDNode>(TVal);
+  auto *CFVal = dyn_cast<ConstantSDNode>(FVal);
+  if (!CTVal || !CFVal)
+    return {};
+  if (!(CTVal->isAllOnes() && CFVal->isZero()) &&
+      !(CTVal->isZero() && CFVal->isAllOnes()))
+    return {};
+
+  if (CTVal->isZero())
+    CC = ISD::getSetCCInverse(CC, LHS.getValueType());
+
+  EVT VT = TVal.getValueType();
+  if (VT.getSizeInBits() != LHS.getValueType().getSizeInBits())
+    return {};
+
+  if (!NoNaNs && (CC == ISD::SETUO || CC == ISD::SETO)) {
+    bool OneNaN = false;
+    if (LHS == RHS) {
+      OneNaN = true;
+    } else if (DAG.isKnownNeverNaN(RHS)) {
+      OneNaN = true;
+      RHS = LHS;
+    } else if (DAG.isKnownNeverNaN(LHS)) {
+      OneNaN = true;
+      LHS = RHS;
+    }
+    if (OneNaN)
+      CC = (CC == ISD::SETUO) ? ISD::SETUNE : ISD::SETOEQ;
+  }
+
+  AArch64CC::CondCode CC1;
+  AArch64CC::CondCode CC2;
+  bool ShouldInvert = false;
+  changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
+  SDValue Cmp = emitVectorComparison(LHS, RHS, CC1, NoNaNs, VT, DL, DAG);
+  SDValue Cmp2;
+  if (CC2 != AArch64CC::AL) {
+    Cmp2 = emitVectorComparison(LHS, RHS, CC2, NoNaNs, VT, DL, DAG);
+    if (!Cmp2)
+      return {};
+  }
+  if (!Cmp2 && !ShouldInvert)
+    return Cmp;
+
+  EVT VecVT = EVT::getVectorVT(*DAG.getContext(), VT, 128 / VT.getSizeInBits());
+  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
+  Cmp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT, DAG.getUNDEF(VecVT), Cmp,
+                    Zero);
+  if (Cmp2) {
+    Cmp2 = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT, DAG.getUNDEF(VecVT),
+                       Cmp2, Zero);
+    Cmp = DAG.getNode(ISD::OR, DL, VecVT, Cmp, Cmp2);
+  }
+  if (ShouldInvert)
+    Cmp = DAG.getNOT(DL, Cmp, VecVT);
+  Cmp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Cmp, Zero);
+  return Cmp;
+}
+
+SDValue AArch64TargetLowering::LowerSELECT_CC(
+    ISD::CondCode CC, SDValue LHS, SDValue RHS, SDValue TVal, SDValue FVal,
+    iterator_range<SDNode::user_iterator> Users, bool HasNoNaNs,
+    const SDLoc &dl, SelectionDAG &DAG) const {
   // Handle f128 first, because it will result in a comparison of some RTLIB
   // call result against zero.
   if (LHS.getValueType() == MVT::f128) {
@@ -11188,6 +11304,27 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
          LHS.getValueType() == MVT::f64);
   assert(LHS.getValueType() == RHS.getValueType());
   EVT VT = TVal.getValueType();
+
+  // If the purpose of the comparison is to select between all ones
+  // or all zeros, try to use a vector comparison because the operands are
+  // already stored in SIMD registers.
+ if (Subtarget->isNeonAvailable() && all_of(Users, [](const SDNode *U) { + switch (U->getOpcode()) { + default: + return false; + case ISD::INSERT_VECTOR_ELT: + case ISD::SCALAR_TO_VECTOR: + case AArch64ISD::DUP: + return true; + } + })) { + bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || HasNoNaNs; + SDValue VectorCmp = + emitFloatCompareMask(LHS, RHS, TVal, FVal, CC, NoNaNs, dl, DAG); + if (VectorCmp) + return VectorCmp; + } + SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally @@ -11274,8 +11411,10 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op, SDValue RHS = Op.getOperand(1); SDValue TVal = Op.getOperand(2); SDValue FVal = Op.getOperand(3); + bool HasNoNans = Op->getFlags().hasNoNaNs(); SDLoc DL(Op); - return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG); + return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(), HasNoNans, DL, + DAG); } SDValue AArch64TargetLowering::LowerSELECT(SDValue Op, @@ -11283,6 +11422,7 @@ SDValue AArch64TargetLowering::LowerSELECT(SDValue Op, SDValue CCVal = Op->getOperand(0); SDValue TVal = Op->getOperand(1); SDValue FVal = Op->getOperand(2); + bool HasNoNans = Op->getFlags().hasNoNaNs(); SDLoc DL(Op); EVT Ty = Op.getValueType(); @@ -11349,7 +11489,8 @@ SDValue AArch64TargetLowering::LowerSELECT(SDValue Op, DAG.getUNDEF(MVT::f32), FVal); } - SDValue Res = LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG); + SDValue Res = + LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(), HasNoNans, DL, DAG); if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) { return DAG.getTargetExtractSubreg(AArch64::hsub, DL, Ty, Res); @@ -15602,47 +15743,6 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op, llvm_unreachable("unexpected shift opcode"); } -static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS, - AArch64CC::CondCode CC, bool NoNans, EVT VT, - const SDLoc &dl, SelectionDAG &DAG) { - EVT SrcVT = LHS.getValueType(); - assert(VT.getSizeInBits() == SrcVT.getSizeInBits() && - "function only supposed to emit natural comparisons"); - - if (SrcVT.getVectorElementType().isFloatingPoint()) { - switch (CC) { - default: - return SDValue(); - case AArch64CC::NE: { - SDValue Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS); - return DAG.getNOT(dl, Fcmeq, VT); - } - case AArch64CC::EQ: - return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS); - case AArch64CC::GE: - return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS); - case AArch64CC::GT: - return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS); - case AArch64CC::LE: - if (!NoNans) - return SDValue(); - // If we ignore NaNs then we can use to the LS implementation. - [[fallthrough]]; - case AArch64CC::LS: - return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS); - case AArch64CC::LT: - if (!NoNans) - return SDValue(); - // If we ignore NaNs then we can use to the MI implementation. 
- [[fallthrough]]; - case AArch64CC::MI: - return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS); - } - } - - return SDValue(); -} - SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { if (Op.getValueType().isScalableVector()) @@ -15691,15 +15791,14 @@ SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op, bool ShouldInvert; changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert); - bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs(); - SDValue Cmp = - EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG); + bool NoNaNs = + getTargetMachine().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs(); + SDValue Cmp = emitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG); if (!Cmp.getNode()) return SDValue(); if (CC2 != AArch64CC::AL) { - SDValue Cmp2 = - EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG); + SDValue Cmp2 = emitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG); if (!Cmp2.getNode()) return SDValue(); @@ -25456,6 +25555,28 @@ static SDValue performDUPCombine(SDNode *N, } if (N->getOpcode() == AArch64ISD::DUP) { + // If the instruction is known to produce a scalar in SIMD registers, we can + // duplicate it across the vector lanes using DUPLANE instead of moving it + // to a GPR first. For example, this allows us to handle: + // v4i32 = DUP (i32 (FCMGT (f32, f32))) + SDValue Op = N->getOperand(0); + // FIXME: Ideally, we should be able to handle all instructions that + // produce a scalar value in FPRs. + if (Op.getOpcode() == AArch64ISD::FCMEQ || + Op.getOpcode() == AArch64ISD::FCMGE || + Op.getOpcode() == AArch64ISD::FCMGT) { + EVT ElemVT = VT.getVectorElementType(); + EVT ExpandedVT = VT; + // Insert into a 128-bit vector to match DUPLANE's pattern. + if (VT.getSizeInBits() != 128) + ExpandedVT = EVT::getVectorVT(*DCI.DAG.getContext(), ElemVT, + 128 / ElemVT.getSizeInBits()); + SDValue Zero = DCI.DAG.getConstant(0, DL, MVT::i64); + SDValue Vec = DCI.DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ExpandedVT, + DCI.DAG.getUNDEF(ExpandedVT), Op, Zero); + return DCI.DAG.getNode(getDUPLANEOp(ElemVT), DL, VT, Vec, Zero); + } + if (DCI.isAfterLegalizeDAG()) { // If scalar dup's operand is extract_vector_elt, try to combine them into // duplane. 
For example,
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index b2174487c2fe8..8048cca203b66 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -643,7 +643,9 @@ class AArch64TargetLowering : public TargetLowering {
   SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, SDValue RHS,
-                         SDValue TVal, SDValue FVal, const SDLoc &dl,
+                         SDValue TVal, SDValue FVal,
+                         iterator_range<SDNode::user_iterator> Users,
+                         bool HasNoNans, const SDLoc &dl,
                          SelectionDAG &DAG) const;
   SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-v1i1-setcc.ll b/llvm/test/CodeGen/AArch64/arm64-neon-v1i1-setcc.ll
index 6c70d19a977a5..1f2b1f8dd28ff 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-v1i1-setcc.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-v1i1-setcc.ll
@@ -174,9 +174,9 @@ define <1 x i16> @test_select_f16_i16(half %i105, half %in, <1 x i16> %x, <1 x i
 ; CHECK-LABEL: test_select_f16_i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fcvt s0, h0
-; CHECK-NEXT:    fcmp s0, s0
-; CHECK-NEXT:    csetm w8, vs
-; CHECK-NEXT:    dup v0.4h, w8
+; CHECK-NEXT:    fcmeq s0, s0, s0
+; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    dup v0.4h, v0.h[0]
 ; CHECK-NEXT:    bsl v0.8b, v2.8b, v3.8b
 ; CHECK-NEXT:    ret
   %i179 = fcmp uno half %i105, zeroinitializer
diff --git a/llvm/test/CodeGen/AArch64/build-vector-dup-simd.ll b/llvm/test/CodeGen/AArch64/build-vector-dup-simd.ll
new file mode 100644
index 0000000000000..ac0b8e89519dd
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/build-vector-dup-simd.ll
@@ -0,0 +1,378 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64 | FileCheck %s --check-prefixes=CHECK,CHECK-NOFULLFP16
+; RUN: llc < %s -mtriple=aarch64 --enable-no-nans-fp-math | FileCheck %s --check-prefixes=CHECK,CHECK-NONANS
+; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK,CHECK-FULLFP16
+
+define <1 x float> @dup_v1i32_oeq(float %a, float %b) {
+; CHECK-LABEL: dup_v1i32_oeq:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcmeq s0, s0, s1
+; CHECK-NEXT:    ret
+entry:
+  %0 = fcmp oeq float %a, %b
+  %vcmpd.i = sext i1 %0 to i32
+  %vecinit.i = insertelement <1 x i32> poison, i32 %vcmpd.i, i64 0
+  %1 = bitcast <1 x i32> %vecinit.i to <1 x float>
+  ret <1 x float> %1
+}
+
+define <1 x float> @dup_v1i32_ogt(float %a, float %b) {
+; CHECK-LABEL: dup_v1i32_ogt:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcmgt s0, s0, s1
+; CHECK-NEXT:    ret
+entry:
+  %0 = fcmp ogt float %a, %b
+  %vcmpd.i = sext i1 %0 to i32
+  %vecinit.i = insertelement <1 x i32> poison, i32 %vcmpd.i, i64 0
+  %1 = bitcast <1 x i32> %vecinit.i to <1 x float>
+  ret <1 x float> %1
+}
+
+define <1 x float> @dup_v1i32_oge(float %a, float %b) {
+; CHECK-LABEL: dup_v1i32_oge:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcmge s0, s0, s1
+; CHECK-NEXT:    ret
+entry:
+  %0 = fcmp oge float %a, %b
+  %vcmpd.i = sext i1 %0 to i32
+  %vecinit.i = insertelement <1 x i32> poison, i32 %vcmpd.i, i64 0
+  %1 = bitcast <1 x i32> %vecinit.i to <1 x float>
+  ret <1 x float> %1
+}
+
+define <1 x float> @dup_v1i32_olt(float %a, float %b) {
+; CHECK-LABEL: dup_v1i32_olt:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcmgt s0, s1, s0
+; CHECK-NEXT:    ret
+entry:
+  %0 = fcmp olt float
%a, %b + %vcmpd.i = sext i1 %0 to i32 + %vecinit.i = insertelement <1 x i32> poison, i32 %vcmpd.i, i64 0 + %1 = bitcast <1 x i32> %vecinit.i to <1 x float> + ret <1 x float> %1 +} + +define <1 x float> @dup_v1i32_ole(float %a, float %b) { +; CHECK-LABEL: dup_v1i32_ole: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcmge s0, s1, s0 +; CHECK-NEXT: ret +entry: + %0 = fcmp ole float %a, %b + %vcmpd.i = sext i1 %0 to i32 + %vecinit.i = insertelement <1 x i32> poison, i32 %vcmpd.i, i64 0 + %1 = bitcast <1 x i32> %vecinit.i to <1 x float> + ret <1 x float> %1 +} + +define <1 x float> @dup_v1i32_one(float %a, float %b) { +; CHECK-NOFULLFP16-LABEL: dup_v1i32_one: +; CHECK-NOFULLFP16: // %bb.0: // %entry +; CHECK-NOFULLFP16-NEXT: fcmgt s2, s0, s1 +; CHECK-NOFULLFP16-NEXT: fcmgt s0, s1, s0 +; CHECK-NOFULLFP16-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-NOFULLFP16-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NOFULLFP16-NEXT: ret +; +; CHECK-NONANS-LABEL: dup_v1i32_one: +; CHECK-NONANS: // %bb.0: // %entry +; CHECK-NONANS-NEXT: fcmeq s0, s0, s1 +; CHECK-NONANS-NEXT: mvn v0.8b, v0.8b +; CHECK-NONANS-NEXT: ret +; +; CHECK-FULLFP16-LABEL: dup_v1i32_one: +; CHECK-FULLFP16: // %bb.0: // %entry +; CHECK-FULLFP16-NEXT: fcmgt s2, s0, s1 +; CHECK-FULLFP16-NEXT: fcmgt s0, s1, s0 +; CHECK-FULLFP16-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-FULLFP16-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-FULLFP16-NEXT: ret +entry: + %0 = fcmp one float %a, %b + %vcmpd.i = sext i1 %0 to i32 + %vecinit.i = insertelement <1 x i32> poison, i32 %vcmpd.i, i64 0 + %1 = bitcast <1 x i32> %vecinit.i to <1 x float> + ret <1 x float> %1 +} + +define <1 x float> @dup_v1i32_ord(float %a, float %b) { +; CHECK-LABEL: dup_v1i32_ord: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcmge s2, s0, s1 +; CHECK-NEXT: fcmgt s0, s1, s0 +; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret +entry: + %0 = fcmp ord float %a, %b + %vcmpd.i = sext i1 %0 to i32 + %vecinit.i = insertelement <1 x i32> poison, i32 %vcmpd.i, i64 0 + %1 = bitcast <1 x i32> %vecinit.i to <1 x float> + ret <1 x float> %1 +} + +define <1 x float> @dup_v1i32_ueq(float %a, float %b) { +; CHECK-NOFULLFP16-LABEL: dup_v1i32_ueq: +; CHECK-NOFULLFP16: // %bb.0: // %entry +; CHECK-NOFULLFP16-NEXT: fcmgt s2, s0, s1 +; CHECK-NOFULLFP16-NEXT: fcmgt s0, s1, s0 +; CHECK-NOFULLFP16-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-NOFULLFP16-NEXT: mvn v0.8b, v0.8b +; CHECK-NOFULLFP16-NEXT: ret +; +; CHECK-NONANS-LABEL: dup_v1i32_ueq: +; CHECK-NONANS: // %bb.0: // %entry +; CHECK-NONANS-NEXT: fcmeq s0, s0, s1 +; CHECK-NONANS-NEXT: ret +; +; CHECK-FULLFP16-LABEL: dup_v1i32_ueq: +; CHECK-FULLFP16: // %bb.0: // %entry +; CHECK-FULLFP16-NEXT: fcmgt s2, s0, s1 +; CHECK-FULLFP16-NEXT: fcmgt s0, s1, s0 +; CHECK-FULLFP16-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-FULLFP16-NEXT: mvn v0.8b, v0.8b +; CHECK-FULLFP16-NEXT: ret +entry: + %0 = fcmp ueq float %a, %b + %vcmpd.i = sext i1 %0 to i32 + %vecinit.i = insertelement <1 x i32> poison, i32 %vcmpd.i, i64 0 + %1 = bitcast <1 x i32> %vecinit.i to <1 x float> + ret <1 x float> %1 +} + +define <1 x float> @dup_v1i32_ugt(float %a, float %b) { +; CHECK-NOFULLFP16-LABEL: dup_v1i32_ugt: +; CHECK-NOFULLFP16: // %bb.0: // %entry +; CHECK-NOFULLFP16-NEXT: fcmge s0, s1, s0 +; CHECK-NOFULLFP16-NEXT: mvn v0.8b, v0.8b +; CHECK-NOFULLFP16-NEXT: ret +; +; CHECK-NONANS-LABEL: dup_v1i32_ugt: +; CHECK-NONANS: // %bb.0: // %entry +; CHECK-NONANS-NEXT: fcmgt s0, s0, s1 +; CHECK-NONANS-NEXT: ret +; +; 
CHECK-FULLFP16-LABEL: dup_v1i32_ugt: +; CHECK-FULLFP16: // %bb.0: // %entry +; CHECK-FULLFP16-NEXT: fcmge s0, s1, s0 +; CHECK-FULLFP16-NEXT: mvn v0.8b, v0.8b +; CHECK-FULLFP16-NEXT: ret +entry: + %0 = fcmp ugt float %a, %b + %vcmpd.i = sext i1 %0 to i32 + %vecinit.i = insertelement <1 x i32> poison, i32 %vcmpd.i, i64 0 + %1 = bitcast <1 x i32> %vecinit.i to <1 x float> + ret <1 x float> %1 +} + +define <1 x float> @dup_v1i32_uge(float %a, float %b) { +; CHECK-NOFULLFP16-LABEL: dup_v1i32_uge: +; CHECK-NOFULLFP16: // %bb.0: // %entry +; CHECK-NOFULLFP16-NEXT: fcmgt s0, s1, s0 +; CHECK-NOFULLFP16-NEXT: mvn v0.8b, v0.8b +; CHECK-NOFULLFP16-NEXT: ret +; +; CHECK-NONANS-LABEL: dup_v1i32_uge: +; CHECK-NONANS: // %bb.0: // %entry +; CHECK-NONANS-NEXT: fcmge s0, s0, s1 +; CHECK-NONANS-NEXT: ret +; +; CHECK-FULLFP16-LABEL: dup_v1i32_uge: +; CHECK-FULLFP16: // %bb.0: // %entry +; CHECK-FULLFP16-NEXT: fcmgt s0, s1, s0 +; CHECK-FULLFP16-NEXT: mvn v0.8b, v0.8b +; CHECK-FULLFP16-NEXT: ret +entry: + %0 = fcmp uge float %a, %b + %vcmpd.i = sext i1 %0 to i32 + %vecinit.i = insertelement <1 x i32> poison, i32 %vcmpd.i, i64 0 + %1 = bitcast <1 x i32> %vecinit.i to <1 x float> + ret <1 x float> %1 +} + +define <1 x float> @dup_v1i32_ult(float %a, float %b) { +; CHECK-NOFULLFP16-LABEL: dup_v1i32_ult: +; CHECK-NOFULLFP16: // %bb.0: // %entry +; CHECK-NOFULLFP16-NEXT: fcmge s0, s0, s1 +; CHECK-NOFULLFP16-NEXT: mvn v0.8b, v0.8b +; CHECK-NOFULLFP16-NEXT: ret +; +; CHECK-NONANS-LABEL: dup_v1i32_ult: +; CHECK-NONANS: // %bb.0: // %entry +; CHECK-NONANS-NEXT: fcmgt s0, s1, s0 +; CHECK-NONANS-NEXT: ret +; +; CHECK-FULLFP16-LABEL: dup_v1i32_ult: +; CHECK-FULLFP16: // %bb.0: // %entry +; CHECK-FULLFP16-NEXT: fcmge s0, s0, s1 +; CHECK-FULLFP16-NEXT: mvn v0.8b, v0.8b +; CHECK-FULLFP16-NEXT: ret +entry: + %0 = fcmp ult float %a, %b + %vcmpd.i = sext i1 %0 to i32 + %vecinit.i = insertelement <1 x i32> poison, i32 %vcmpd.i, i64 0 + %1 = bitcast <1 x i32> %vecinit.i to <1 x float> + ret <1 x float> %1 +} + +define <1 x float> @dup_v1i32_ule(float %a, float %b) { +; CHECK-NOFULLFP16-LABEL: dup_v1i32_ule: +; CHECK-NOFULLFP16: // %bb.0: // %entry +; CHECK-NOFULLFP16-NEXT: fcmgt s0, s0, s1 +; CHECK-NOFULLFP16-NEXT: mvn v0.8b, v0.8b +; CHECK-NOFULLFP16-NEXT: ret +; +; CHECK-NONANS-LABEL: dup_v1i32_ule: +; CHECK-NONANS: // %bb.0: // %entry +; CHECK-NONANS-NEXT: fcmge s0, s1, s0 +; CHECK-NONANS-NEXT: ret +; +; CHECK-FULLFP16-LABEL: dup_v1i32_ule: +; CHECK-FULLFP16: // %bb.0: // %entry +; CHECK-FULLFP16-NEXT: fcmgt s0, s0, s1 +; CHECK-FULLFP16-NEXT: mvn v0.8b, v0.8b +; CHECK-FULLFP16-NEXT: ret +entry: + %0 = fcmp ule float %a, %b + %vcmpd.i = sext i1 %0 to i32 + %vecinit.i = insertelement <1 x i32> poison, i32 %vcmpd.i, i64 0 + %1 = bitcast <1 x i32> %vecinit.i to <1 x float> + ret <1 x float> %1 +} + +define <1 x float> @dup_v1i32_une(float %a, float %b) { +; CHECK-LABEL: dup_v1i32_une: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcmeq s0, s0, s1 +; CHECK-NEXT: mvn v0.8b, v0.8b +; CHECK-NEXT: ret +entry: + %0 = fcmp une float %a, %b + %vcmpd.i = sext i1 %0 to i32 + %vecinit.i = insertelement <1 x i32> poison, i32 %vcmpd.i, i64 0 + %1 = bitcast <1 x i32> %vecinit.i to <1 x float> + ret <1 x float> %1 +} + +define <1 x float> @dup_v1i32_uno(float %a, float %b) { +; CHECK-LABEL: dup_v1i32_uno: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcmge s2, s0, s1 +; CHECK-NEXT: fcmgt s0, s1, s0 +; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-NEXT: mvn v0.8b, v0.8b +; CHECK-NEXT: ret +entry: + %0 = fcmp uno float %a, %b + %vcmpd.i = sext i1 
%0 to i32 + %vecinit.i = insertelement <1 x i32> poison, i32 %vcmpd.i, i64 0 + %1 = bitcast <1 x i32> %vecinit.i to <1 x float> + ret <1 x float> %1 +} + +define <4 x float> @dup_v4i32(float %a, float %b) { +; CHECK-LABEL: dup_v4i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcmge s0, s0, s1 +; CHECK-NEXT: dup v0.4s, v0.s[0] +; CHECK-NEXT: ret +entry: + %0 = fcmp oge float %a, %b + %vcmpd.i = sext i1 %0 to i32 + %vecinit.i = insertelement <4 x i32> poison, i32 %vcmpd.i, i64 0 + %1 = bitcast <4 x i32> %vecinit.i to <4 x float> + %2 = shufflevector <4 x float> %1, <4 x float> poison, <4 x i32> zeroinitializer + ret <4 x float> %2 +} + +define <4 x float> @dup_v4i32_reversed(float %a, float %b) { +; CHECK-LABEL: dup_v4i32_reversed: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcmgt s0, s1, s0 +; CHECK-NEXT: dup v0.4s, v0.s[0] +; CHECK-NEXT: ret +entry: + %0 = fcmp ogt float %b, %a + %vcmpd.i = sext i1 %0 to i32 + %vecinit.i = insertelement <4 x i32> poison, i32 %vcmpd.i, i64 0 + %1 = bitcast <4 x i32> %vecinit.i to <4 x float> + %2 = shufflevector <4 x float> %1, <4 x float> poison, <4 x i32> zeroinitializer + ret <4 x float> %2 +} + +define <2 x double> @dup_v2i64(double %a, double %b) { +; CHECK-LABEL: dup_v2i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcmgt d0, d0, d1 +; CHECK-NEXT: dup v0.2d, v0.d[0] +; CHECK-NEXT: ret +entry: + %0 = fcmp ogt double %a, %b + %vcmpd.i = sext i1 %0 to i64 + %vecinit.i = insertelement <2 x i64> poison, i64 %vcmpd.i, i64 0 + %1 = bitcast <2 x i64> %vecinit.i to <2 x double> + %2 = shufflevector <2 x double> %1, <2 x double> poison, <2 x i32> zeroinitializer + ret <2 x double> %2 +} + +define <8 x half> @dup_v8i16(half %a, half %b) { +; CHECK-NOFULLFP16-LABEL: dup_v8i16: +; CHECK-NOFULLFP16: // %bb.0: // %entry +; CHECK-NOFULLFP16-NEXT: fcvt s1, h1 +; CHECK-NOFULLFP16-NEXT: fcvt s0, h0 +; CHECK-NOFULLFP16-NEXT: fcmeq s0, s0, s1 +; CHECK-NOFULLFP16-NEXT: ret +; +; CHECK-NONANS-LABEL: dup_v8i16: +; CHECK-NONANS: // %bb.0: // %entry +; CHECK-NONANS-NEXT: fcvt s1, h1 +; CHECK-NONANS-NEXT: fcvt s0, h0 +; CHECK-NONANS-NEXT: fcmeq s0, s0, s1 +; CHECK-NONANS-NEXT: ret +; +; CHECK-FULLFP16-LABEL: dup_v8i16: +; CHECK-FULLFP16: // %bb.0: // %entry +; CHECK-FULLFP16-NEXT: fcmp h0, h1 +; CHECK-FULLFP16-NEXT: csetm w8, eq +; CHECK-FULLFP16-NEXT: fmov s0, w8 +; CHECK-FULLFP16-NEXT: ret +; FIXME: Could be replaced with fcmeq + dup but the type of the former is +; promoted to i32 during selection and then the optimization does not apply. + + entry: + %0 = fcmp oeq half %a, %b + %vcmpd.i = sext i1 %0 to i16 + %vecinit.i = insertelement <8 x i16> poison, i16 %vcmpd.i, i64 0 + %1 = bitcast <8 x i16> %vecinit.i to <8 x half> + ret <8 x half> %1 +} + +; Check that a mask is not generated for non-vectorized users. +define i32 @mask_i32(float %a, float %b) { +; CHECK-LABEL: mask_i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: csetm w0, eq +; CHECK-NEXT: ret +entry: + %0 = fcmp oeq float %a, %b + %vcmpd.i = sext i1 %0 to i32 + ret i32 %vcmpd.i +} + +; Verify that a mask is not emitted when (allOnes, allZeros) are not the +; operands for the SELECT_CC. +define i32 @bool_i32(float %a, float %b) { +; CHECK-LABEL: bool_i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret +entry: + %0 = fcmp oeq float %a, %b + %vcmpd.i = zext i1 %0 to i32 + ret i32 %vcmpd.i +}