diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index cc775e4b260dc..253933a2438cd 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -599,6 +599,28 @@ static std::optional<unsigned> getElementIndex(const Value *Inst,
   return Index;
 }
 
+/// \returns true if all of the values in \p VL use the same opcode.
+/// For comparison instructions, also checks if predicates match.
+/// PoisonValues are considered matching.
+/// Interchangeable instructions are not considered.
+static bool allSameOpcode(ArrayRef<Value *> VL) {
+  auto *It = find_if(VL, IsaPred<Instruction>);
+  if (It == VL.end())
+    return true;
+  Instruction *MainOp = cast<Instruction>(*It);
+  unsigned Opcode = MainOp->getOpcode();
+  bool IsCmpOp = isa<CmpInst>(MainOp);
+  CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
+                                        : CmpInst::BAD_ICMP_PREDICATE;
+  return std::all_of(It, VL.end(), [&](Value *V) {
+    if (auto *CI = dyn_cast<CmpInst>(V))
+      return BasePred == CI->getPredicate();
+    if (auto *I = dyn_cast<Instruction>(V))
+      return I->getOpcode() == Opcode;
+    return isa<PoisonValue>(V);
+  });
+}
+
 namespace {
 /// Specifies the way the mask should be analyzed for undefs/poisonous elements
 /// in the shuffle mask.
@@ -814,6 +836,272 @@ static std::optional<unsigned> getExtractIndex(const Instruction *E) {
 }
 
 namespace {
+/// \returns true if \p Opcode is allowed as part of the main/alternate
+/// instruction for SLP vectorization.
+///
+/// Example of unsupported opcode is SDIV that can potentially cause UB if the
+/// "shuffled out" lane would result in division by zero.
+bool isValidForAlternation(unsigned Opcode) {
+  return !Instruction::isIntDivRem(Opcode);
+}
+
+/// Helper class that determines whether VL can use the same opcode.
+/// Alternate instructions are supported. In addition, it supports
+/// interchangeable instructions. An interchangeable instruction is an
+/// instruction that can be converted to another instruction with the same
+/// semantics. For example, x << 1 is equal to x * 2, and x * 1 is equal to
+/// x | 0.
+class BinOpSameOpcodeHelper {
+  using MaskType = std::uint_fast16_t;
+  /// Keep SupportedOp sorted (by opcode value); it is searched with
+  /// binary_search.
+  constexpr static std::initializer_list<unsigned> SupportedOp = {
+      Instruction::Add, Instruction::Sub, Instruction::Mul, Instruction::Shl,
+      Instruction::AShr, Instruction::And, Instruction::Or, Instruction::Xor};
+  enum : MaskType {
+    ShlBIT = 0b1,
+    AShrBIT = 0b10,
+    MulBIT = 0b100,
+    AddBIT = 0b1000,
+    SubBIT = 0b10000,
+    AndBIT = 0b100000,
+    OrBIT = 0b1000000,
+    XorBIT = 0b10000000,
+    MainOpBIT = 0b100000000,
+    LLVM_MARK_AS_BITMASK_ENUM(MainOpBIT)
+  };
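+  // Worked example of the mask algebra: for VL = [x + 0, y * 1], adding
+  // x + 0 narrows Mask to the "can be any supported opcode" set and records
+  // AddBIT in SeenBefore; adding y * 1 keeps Mask unchanged and records
+  // MulBIT. getOpcode() then returns the first opcode in its preference
+  // order that is both permitted by Mask and actually seen (here Mul), so
+  // the bundle is emitted as [x * 1, y * 1].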
+  /// Return a non-nullptr if either operand of I is a ConstantInt.
+  /// The second return value represents the operand position: we check the
+  /// right-hand side first (1). If the right-hand side is not a ConstantInt
+  /// and the instruction is neither Sub, Shl, nor AShr, we then check the
+  /// left-hand side (0).
+  static std::pair<ConstantInt *, unsigned>
+  isBinOpWithConstantInt(const Instruction *I) {
+    unsigned Opcode = I->getOpcode();
+    assert(binary_search(SupportedOp, Opcode) && "Unsupported opcode.");
+    (void)SupportedOp;
+    auto *BinOp = cast<BinaryOperator>(I);
+    if (auto *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1)))
+      return {CI, 1};
+    if (Opcode == Instruction::Sub || Opcode == Instruction::Shl ||
+        Opcode == Instruction::AShr)
+      return {nullptr, 0};
+    if (auto *CI = dyn_cast<ConstantInt>(BinOp->getOperand(0)))
+      return {CI, 0};
+    return {nullptr, 0};
+  }
+  struct InterchangeableInfo {
+    const Instruction *I = nullptr;
+    /// Each set bit represents an opcode that MainOp can still be converted
+    /// to.
+    MaskType Mask = MainOpBIT | XorBIT | OrBIT | AndBIT | SubBIT | AddBIT |
+                    MulBIT | AShrBIT | ShlBIT;
+    /// We cannot create an interchangeable instruction that does not exist in
+    /// VL. For example, VL [x + 0, y * 1] can be converted to [x << 0, y << 0],
+    /// but << does not exist in VL. In the end, we convert VL to [x * 1, y *
+    /// 1]. SeenBefore is used to know what operations have been seen before.
+    MaskType SeenBefore = 0;
+    InterchangeableInfo(const Instruction *I) : I(I) {}
+    /// Returning false allows BinOpSameOpcodeHelper to look for an alternate
+    /// instruction instead. Directly setting the mask would destroy the mask
+    /// state, preventing us from determining which instruction to convert to.
+    bool trySet(MaskType OpcodeInMaskForm, MaskType InterchangeableMask) {
+      if (Mask & InterchangeableMask) {
+        SeenBefore |= OpcodeInMaskForm;
+        Mask &= InterchangeableMask;
+        return true;
+      }
+      return false;
+    }
+    bool equal(unsigned Opcode) {
+      if (Opcode == I->getOpcode())
+        return trySet(MainOpBIT, MainOpBIT);
+      return false;
+    }
+    unsigned getOpcode() const {
+      MaskType Candidate = Mask & SeenBefore;
+      if (Candidate & MainOpBIT)
+        return I->getOpcode();
+      if (Candidate & ShlBIT)
+        return Instruction::Shl;
+      if (Candidate & AShrBIT)
+        return Instruction::AShr;
+      if (Candidate & MulBIT)
+        return Instruction::Mul;
+      if (Candidate & AddBIT)
+        return Instruction::Add;
+      if (Candidate & SubBIT)
+        return Instruction::Sub;
+      if (Candidate & AndBIT)
+        return Instruction::And;
+      if (Candidate & OrBIT)
+        return Instruction::Or;
+      if (Candidate & XorBIT)
+        return Instruction::Xor;
+      llvm_unreachable("Cannot find interchangeable instruction.");
+    }
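+    /// Returns the operand list \p I should use once it is rewritten with
+    /// the opcode of \p To; only the ConstantInt operand changes. For
+    /// example, rewriting `x << 2` as a Mul yields [x, 4], and rewriting
+    /// `x + 0` as a Xor yields [x, 0]. The assertions below document which
+    /// conversions are expected to be reachable.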
+    SmallVector<Value *> getOperand(const Instruction *To) const {
+      unsigned ToOpcode = To->getOpcode();
+      unsigned FromOpcode = I->getOpcode();
+      if (FromOpcode == ToOpcode)
+        return SmallVector<Value *>(I->operands());
+      assert(binary_search(SupportedOp, ToOpcode) && "Unsupported opcode.");
+      auto [CI, Pos] = isBinOpWithConstantInt(I);
+      const APInt &FromCIValue = CI->getValue();
+      unsigned FromCIValueBitWidth = FromCIValue.getBitWidth();
+      APInt ToCIValue;
+      switch (FromOpcode) {
+      case Instruction::Shl:
+        if (ToOpcode == Instruction::Mul) {
+          ToCIValue = APInt::getOneBitSet(FromCIValueBitWidth,
+                                          FromCIValue.getZExtValue());
+        } else {
+          assert(FromCIValue.isZero() && "Cannot convert the instruction.");
+          ToCIValue = ToOpcode == Instruction::And
+                          ? APInt::getAllOnes(FromCIValueBitWidth)
+                          : APInt::getZero(FromCIValueBitWidth);
+        }
+        break;
+      case Instruction::Mul:
+        assert(FromCIValue.isPowerOf2() && "Cannot convert the instruction.");
+        if (ToOpcode == Instruction::Shl) {
+          ToCIValue = APInt(FromCIValueBitWidth, FromCIValue.logBase2());
+        } else {
+          assert(FromCIValue.isOne() && "Cannot convert the instruction.");
+          ToCIValue = ToOpcode == Instruction::And
+                          ? APInt::getAllOnes(FromCIValueBitWidth)
+                          : APInt::getZero(FromCIValueBitWidth);
+        }
+        break;
+      case Instruction::Add:
+      case Instruction::Sub:
+        if (FromCIValue.isZero()) {
+          ToCIValue = APInt::getZero(FromCIValueBitWidth);
+        } else {
+          assert(is_contained({Instruction::Add, Instruction::Sub}, ToOpcode) &&
+                 "Cannot convert the instruction.");
+          ToCIValue = FromCIValue;
+          ToCIValue.negate();
+        }
+        break;
+      case Instruction::And:
+        assert(FromCIValue.isAllOnes() && "Cannot convert the instruction.");
+        ToCIValue = ToOpcode == Instruction::Mul
+                        ? APInt::getOneBitSet(FromCIValueBitWidth, 0)
+                        : APInt::getZero(FromCIValueBitWidth);
+        break;
+      default:
+        assert(FromCIValue.isZero() && "Cannot convert the instruction.");
+        ToCIValue = APInt::getZero(FromCIValueBitWidth);
+        break;
+      }
+      Value *LHS = I->getOperand(1 - Pos);
+      Constant *RHS =
+          ConstantInt::get(I->getOperand(Pos)->getType(), ToCIValue);
+      if (Pos == 1)
+        return SmallVector<Value *>({LHS, RHS});
+      return SmallVector<Value *>({RHS, LHS});
+    }
+  };
+  InterchangeableInfo MainOp;
+  InterchangeableInfo AltOp;
+  bool isValidForAlternation(const Instruction *I) const {
+    return ::isValidForAlternation(MainOp.I->getOpcode()) &&
+           ::isValidForAlternation(I->getOpcode());
+  }
+  bool initializeAltOp(const Instruction *I) {
+    if (AltOp.I)
+      return true;
+    if (!isValidForAlternation(I))
+      return false;
+    AltOp.I = I;
+    return true;
+  }
+
+public:
+  BinOpSameOpcodeHelper(const Instruction *MainOp,
+                        const Instruction *AltOp = nullptr)
+      : MainOp(MainOp), AltOp(AltOp) {
+    assert(is_sorted(SupportedOp) && "SupportedOp is not sorted.");
+  }
+  bool add(const Instruction *I) {
+    assert(isa<BinaryOperator>(I) &&
+           "BinOpSameOpcodeHelper only accepts BinaryOperator.");
+    unsigned Opcode = I->getOpcode();
+    MaskType OpcodeInMaskForm;
+    // Prefer Shl, AShr, Mul, Add, Sub, And, Or and Xor over MainOp.
+    switch (Opcode) {
+    case Instruction::Shl:
+      OpcodeInMaskForm = ShlBIT;
+      break;
+    case Instruction::AShr:
+      OpcodeInMaskForm = AShrBIT;
+      break;
+    case Instruction::Mul:
+      OpcodeInMaskForm = MulBIT;
+      break;
+    case Instruction::Add:
+      OpcodeInMaskForm = AddBIT;
+      break;
+    case Instruction::Sub:
+      OpcodeInMaskForm = SubBIT;
+      break;
+    case Instruction::And:
+      OpcodeInMaskForm = AndBIT;
+      break;
+    case Instruction::Or:
+      OpcodeInMaskForm = OrBIT;
+      break;
+    case Instruction::Xor:
+      OpcodeInMaskForm = XorBIT;
+      break;
+    default:
+      return MainOp.equal(Opcode) ||
+             (initializeAltOp(I) && AltOp.equal(Opcode));
+    }
+    MaskType InterchangeableMask = OpcodeInMaskForm;
+    ConstantInt *CI = isBinOpWithConstantInt(I).first;
+    if (CI) {
+      constexpr MaskType CanBeAll =
+          XorBIT | OrBIT | AndBIT | SubBIT | AddBIT | MulBIT | AShrBIT | ShlBIT;
+      const APInt &CIValue = CI->getValue();
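+      // How far a lane can be rewritten depends on its constant operand,
+      // using the usual identities, e.g.:
+      //   x << 0 == x * 1 == x + 0 == x - 0 == x & -1 == x | 0 == x ^ 0 == x
+      //   x << c == x * (1 << c) for an in-range shift amount c
+      //   x * 2^k == x << k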
+      switch (Opcode) {
+      case Instruction::Shl:
+        if (CIValue.ult(CIValue.getBitWidth()))
+          InterchangeableMask = CIValue.isZero() ? CanBeAll : MulBIT | ShlBIT;
+        break;
+      case Instruction::Mul:
+        if (CIValue.isOne()) {
+          InterchangeableMask = CanBeAll;
+          break;
+        }
+        if (CIValue.isPowerOf2())
+          InterchangeableMask = MulBIT | ShlBIT;
+        break;
+      case Instruction::Add:
+      case Instruction::Sub:
+        InterchangeableMask = CIValue.isZero() ? CanBeAll : SubBIT | AddBIT;
+        break;
+      case Instruction::And:
+        if (CIValue.isAllOnes())
+          InterchangeableMask = CanBeAll;
+        break;
+      default:
+        if (CIValue.isZero())
+          InterchangeableMask = CanBeAll;
+        break;
+      }
+    }
+    return MainOp.trySet(OpcodeInMaskForm, InterchangeableMask) ||
+           (initializeAltOp(I) &&
+            AltOp.trySet(OpcodeInMaskForm, InterchangeableMask));
+  }
+  unsigned getMainOpcode() const { return MainOp.getOpcode(); }
+  bool hasAltOp() const { return AltOp.I; }
+  unsigned getAltOpcode() const {
+    return hasAltOp() ? AltOp.getOpcode() : getMainOpcode();
+  }
+  SmallVector<Value *> getOperand(const Instruction *I) const {
+    return MainOp.getOperand(I);
+  }
+};
 
 /// Main data required for vectorization of instructions.
 class InstructionsState {
@@ -861,9 +1149,27 @@ class InstructionsState {
   /// Some of the instructions in the list have alternate opcodes.
   bool isAltShuffle() const { return getMainOp() != getAltOp(); }
 
-  bool isOpcodeOrAlt(Instruction *I) const {
-    unsigned CheckedOpcode = I->getOpcode();
-    return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
+  /// Checks if the instruction matches either the main or alternate opcode.
+  /// \returns
+  /// - MainOp if \param I matches MainOp's opcode directly or can be converted
+  /// to it
+  /// - AltOp if \param I matches AltOp's opcode directly or can be converted to
+  /// it
+  /// - nullptr if \param I cannot be matched or converted to either opcode
+  Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
+    assert(MainOp && "MainOp cannot be nullptr.");
+    if (I->getOpcode() == MainOp->getOpcode())
+      return MainOp;
+    // Prefer AltOp instead of an interchangeable instruction of MainOp.
+    assert(AltOp && "AltOp cannot be nullptr.");
+    if (I->getOpcode() == AltOp->getOpcode())
+      return AltOp;
+    if (!I->isBinaryOp())
+      return nullptr;
+    BinOpSameOpcodeHelper Converter(MainOp);
+    if (Converter.add(I) && Converter.add(MainOp) && !Converter.hasAltOp())
+      return MainOp;
+    return AltOp;
   }
 
   /// Checks if main/alt instructions are shift operations.
@@ -913,23 +1219,41 @@ class InstructionsState {
   static InstructionsState invalid() { return {nullptr, nullptr}; }
 };
 
-} // end anonymous namespace
-
-/// \returns true if \p Opcode is allowed as part of the main/alternate
-/// instruction for SLP vectorization.
-///
-/// Example of unsupported opcode is SDIV that can potentially cause UB if the
-/// "shuffled out" lane would result in division by zero.
-static bool isValidForAlternation(unsigned Opcode) {
-  if (Instruction::isIntDivRem(Opcode))
-    return false;
-
-  return true;
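+/// Converts \p I to the opcode selected by \p S (MainOp or AltOp). For
+/// example, if S's MainOp is a `mul` and \p I is `x << 1`, the result is
+/// {MainOp, [x, 2]}: the lane is re-expressed as `x * 2` so that every lane
+/// of the bundle shares one opcode. Non-binary instructions are returned
+/// unchanged, together with their original operands.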
+std::pair<Instruction *, SmallVector<Value *>>
+convertTo(Instruction *I, const InstructionsState &S) {
+  Instruction *SelectedOp = S.getMatchingMainOpOrAltOp(I);
+  assert(SelectedOp && "Cannot convert the instruction.");
+  if (I->isBinaryOp()) {
+    BinOpSameOpcodeHelper Converter(I);
+    return std::make_pair(SelectedOp, Converter.getOperand(SelectedOp));
+  }
+  return std::make_pair(SelectedOp, SmallVector<Value *>(I->operands()));
 }
 
+} // end anonymous namespace
+
 static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
                                        const TargetLibraryInfo &TLI);
 
+/// Find an instruction with a specific opcode in \p VL.
+/// \param VL Array of values to search through. Must contain only Instructions
+/// and PoisonValues.
+/// \param Opcode The instruction opcode to search for.
+/// \returns
+/// - The first instruction found with a matching opcode
+/// - nullptr if no matching instruction is found
+Instruction *findInstructionWithOpcode(ArrayRef<Value *> VL, unsigned Opcode) {
+  for (Value *V : VL) {
+    if (isa<PoisonValue>(V))
+      continue;
+    assert(isa<Instruction>(V) && "Only accepts PoisonValue and Instruction.");
+    auto *Inst = cast<Instruction>(V);
+    if (Inst->getOpcode() == Opcode)
+      return Inst;
+  }
+  return nullptr;
+}
+
 /// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
 /// compatible instructions or constants, or just some other regular values.
 static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
@@ -993,6 +1317,7 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
 
   unsigned Opcode = MainOp->getOpcode();
   unsigned AltOpcode = Opcode;
+  BinOpSameOpcodeHelper BinOpHelper(MainOp);
 
   bool SwappedPredsCompatible = IsCmpOp && [&]() {
     SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
     UniquePreds.insert(BasePred);
@@ -1039,14 +1364,8 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
       return InstructionsState::invalid();
     unsigned InstOpcode = I->getOpcode();
     if (IsBinOp && isa<BinaryOperator>(I)) {
-      if (InstOpcode == Opcode || InstOpcode == AltOpcode)
+      if (BinOpHelper.add(I))
         continue;
-      if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
-          isValidForAlternation(Opcode)) {
-        AltOpcode = InstOpcode;
-        AltOp = I;
-        continue;
-      }
     } else if (IsCastOp && isa<CastInst>(I)) {
       Value *Op0 = MainOp->getOperand(0);
       Type *Ty0 = Op0->getType();
@@ -1147,7 +1466,22 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
     return InstructionsState::invalid();
   }
 
-  return InstructionsState(MainOp, AltOp);
+  if (IsBinOp) {
+    MainOp = findInstructionWithOpcode(VL, BinOpHelper.getMainOpcode());
+    assert(MainOp && "Cannot find MainOp with Opcode from BinOpHelper.");
+    AltOp = findInstructionWithOpcode(VL, BinOpHelper.getAltOpcode());
+    assert(AltOp && "Cannot find AltOp with Opcode from BinOpHelper.");
+  }
+  assert((MainOp == AltOp || !allSameOpcode(VL)) &&
+         "Incorrect implementation of allSameOpcode.");
+  InstructionsState S(MainOp, AltOp);
+  assert(all_of(VL,
+                [&](Value *V) {
+                  return isa<PoisonValue>(V) ||
+                         S.getMatchingMainOpOrAltOp(cast<Instruction>(V));
+                }) &&
+         "Invalid InstructionsState.");
+  return S;
 }
 
 /// \returns true if all of the values in \p VL have the same type or false
@@ -2560,11 +2894,11 @@ class BoUpSLP {
       // Since operand reordering is performed on groups of commutative
       // operations or alternating sequences (e.g., +, -), we can safely tell
      // the inverse operations by checking commutativity.
-      bool IsInverseOperation = !isCommutative(cast<Instruction>(V));
+      auto [SelectedOp, Ops] = convertTo(cast<Instruction>(VL[Lane]), S);
+      bool IsInverseOperation = !isCommutative(SelectedOp);
       for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
         bool APO = (OpIdx == 0) ? false : IsInverseOperation;
-        OpsVec[OpIdx][Lane] = {cast<Instruction>(V)->getOperand(OpIdx), APO,
-                               false};
+        OpsVec[OpIdx][Lane] = {Ops[OpIdx], APO, false};
       }
     }
   }
@@ -3542,14 +3876,16 @@ class BoUpSLP {
     /// Some of the instructions in the list have alternate opcodes.
     bool isAltShuffle() const { return S.isAltShuffle(); }
 
-    bool isOpcodeOrAlt(Instruction *I) const { return S.isOpcodeOrAlt(I); }
+    Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
+      return S.getMatchingMainOpOrAltOp(I);
+    }
 
     /// Chooses the correct key for scheduling data. If \p Op has the same (or
     /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is
     /// \p OpValue.
     Value *isOneOf(Value *Op) const {
       auto *I = dyn_cast<Instruction>(Op);
-      if (I && isOpcodeOrAlt(I))
+      if (I && getMatchingMainOpOrAltOp(I))
         return Op;
       return S.getMainOp();
     }
@@ -8428,11 +8764,15 @@ static std::pair<size_t, size_t> generateKeySubkey(
   return std::make_pair(Key, SubKey);
 }
 
+/// Checks if the specified instruction \p I is a main operation for the given
+/// \p MainOp and \p AltOp instructions.
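+/// Note that with interchangeable instructions a lane such as `x << 1` can be
+/// classified as a main `mul` operation even though its opcode differs;
+/// because of this, the predicate consults InstructionsState instead of
+/// comparing opcodes directly (see the definition below).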
+static bool isMainInstruction(Instruction *I, Instruction *MainOp,
+                              Instruction *AltOp, const TargetLibraryInfo &TLI);
+
 /// Checks if the specified instruction \p I is an alternate operation for
 /// the given \p MainOp and \p AltOp instructions.
-static bool isAlternateInstruction(const Instruction *I,
-                                   const Instruction *MainOp,
-                                   const Instruction *AltOp,
+static bool isAlternateInstruction(Instruction *I, Instruction *MainOp,
+                                   Instruction *AltOp,
                                    const TargetLibraryInfo &TLI);
 
 bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
@@ -9245,7 +9585,8 @@ bool BoUpSLP::canBuildSplitNode(ArrayRef<Value *> VL,
       continue;
     }
     if ((LocalState.getAltOpcode() != LocalState.getOpcode() &&
-         I->getOpcode() == LocalState.getOpcode()) ||
+         isMainInstruction(I, LocalState.getMainOp(), LocalState.getAltOp(),
+                           *TLI)) ||
         (LocalState.getAltOpcode() == LocalState.getOpcode() &&
         !isAlternateInstruction(I, LocalState.getMainOp(),
                                 LocalState.getAltOp(), *TLI))) {
@@ -10344,9 +10685,14 @@ void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
   }
 }
 
-static bool isAlternateInstruction(const Instruction *I,
-                                   const Instruction *MainOp,
-                                   const Instruction *AltOp,
+static bool isMainInstruction(Instruction *I, Instruction *MainOp,
+                              Instruction *AltOp,
+                              const TargetLibraryInfo &TLI) {
+  return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == MainOp;
+}
+
+static bool isAlternateInstruction(Instruction *I, Instruction *MainOp,
+                                   Instruction *AltOp,
                                    const TargetLibraryInfo &TLI) {
   if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
     auto *AltCI = cast<CmpInst>(AltOp);
@@ -10366,7 +10712,7 @@ static bool isAlternateInstruction(const Instruction *I,
            "their swap.");
     return MainP != P && MainP != SwappedP;
   }
-  return I->getOpcode() == AltOp->getOpcode();
+  return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == AltOp;
 }
 
 TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
@@ -11129,7 +11475,9 @@ void BoUpSLP::transformNodes() {
     // same opcode and same parent block or all constants.
     if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
         !(!E.hasState() || E.getOpcode() == Instruction::Load ||
-          E.isAltShuffle() || !allSameBlock(VL)) ||
+          // We use allSameOpcode instead of isAltShuffle because we don't
+          // want to use interchangeable instructions here.
+          !allSameOpcode(VL) || !allSameBlock(VL)) ||
         allConstant(VL) || isSplat(VL))
       continue;
     if (ForceLoadGather && E.hasState() && E.getOpcode() == Instruction::Load)
@@ -11174,7 +11522,7 @@ void BoUpSLP::transformNodes() {
         if (IsSplat)
           continue;
         InstructionsState S = getSameOpcode(Slice, *TLI);
-        if (!S || S.isAltShuffle() || !allSameBlock(Slice) ||
+        if (!S || !allSameOpcode(Slice) || !allSameBlock(Slice) ||
            (S.getOpcode() == Instruction::Load &&
             areKnownNonVectorizableLoads(Slice)) ||
            (S.getOpcode() != Instruction::Load &&
@@ -12974,14 +13322,22 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
       if (isa<PoisonValue>(UniqueValues[Idx]))
        return InstructionCost(TTI::TCC_Free);
 
-      auto *VI = cast<Instruction>(UniqueValues[Idx]);
-      unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
-      TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(VI->getOperand(0));
-      TTI::OperandValueInfo Op2Info =
-          TTI::getOperandInfo(VI->getOperand(OpIdx));
-      SmallVector<const Value *> Operands(VI->operand_values());
+      // We cannot retrieve the operands from UniqueValues[Idx] because an
+      // interchangeable instruction may be used. The order and the actual
+      // operands might differ from what is retrieved from UniqueValues[Idx].
+      Value *Op1 = E->getOperand(0)[Idx];
+      Value *Op2;
+      SmallVector<Value *> Operands(1, Op1);
+      if (isa<UnaryOperator>(UniqueValues[Idx])) {
+        Op2 = Op1;
+      } else {
+        Op2 = E->getOperand(1)[Idx];
+        Operands.push_back(Op2);
+      }
+      TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(Op1);
+      TTI::OperandValueInfo Op2Info = TTI::getOperandInfo(Op2);
       return TTI->getArithmeticInstrCost(ShuffleOrOp, OrigScalarTy, CostKind,
-                                         Op1Info, Op2Info, Operands, VI);
+                                         Op1Info, Op2Info, Operands);
     };
     auto GetVectorCost = [=](InstructionCost CommonCost) {
       if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
@@ -13211,7 +13567,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
         return InstructionCost(TTI::TCC_Free);
 
       auto *VI = cast<Instruction>(UniqueValues[Idx]);
-      assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode");
+      assert(E->getMatchingMainOpOrAltOp(VI) &&
+             "Unexpected main/alternate opcode");
       (void)E;
       return TTI->getInstructionCost(VI, CostKind);
     };
@@ -13279,7 +13636,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
     SmallVector<int> Mask;
     E->buildAltOpShuffleMask(
         [&](Instruction *I) {
-          assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
+          assert(E->getMatchingMainOpOrAltOp(I) &&
+                 "Unexpected main/alternate opcode");
          return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
                                        *TLI);
        },
@@ -15441,7 +15799,8 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
                  !isa<GetElementPtrInst>(V))
                return true;
              auto *I = dyn_cast<Instruction>(V);
-             return !I || !E->isOpcodeOrAlt(I) || I->getParent() == BB ||
+             return !I || !E->getMatchingMainOpOrAltOp(I) ||
+                    I->getParent() == BB ||
                     isVectorLikeInstWithConstOps(I);
            })) &&
           "Expected gathered loads or GEPs or instructions from same basic "
@@ -17585,7 +17944,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
       Value *V = Builder.CreateBinOp(
          static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
-      propagateIRFlags(V, E->Scalars, VL0, It == MinBWs.end());
+      propagateIRFlags(V, E->Scalars, nullptr, It == MinBWs.end());
       if (auto *I = dyn_cast<Instruction>(V)) {
         V = ::propagateMetadata(I, E->Scalars);
         // Drop nuw flags for abs(sub(commutative), true).
@@ -18005,7 +18364,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
     SmallVector<int> Mask;
     E->buildAltOpShuffleMask(
         [E, this](Instruction *I) {
-          assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
+          assert(E->getMatchingMainOpOrAltOp(I) &&
+                 "Unexpected main/alternate opcode");
          return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
                                        *TLI);
        },
@@ -21796,7 +22156,7 @@ class HorizontalReduction {
        // Also check if the instruction was folded to constant/other value.
        auto *Inst = dyn_cast<Instruction>(RdxVal);
        if ((Inst && isVectorLikeInstWithConstOps(Inst) &&
-            (!S || !S.isOpcodeOrAlt(Inst))) ||
+            (!S || !S.getMatchingMainOpOrAltOp(Inst))) ||
            (S && !Inst))
          continue;
        Candidates.push_back(RdxVal);
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-base.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-base.ll
index feb4ad865f314..d527d38adbee3 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-base.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-base.ll
@@ -314,10 +314,10 @@ define void @store_try_reorder(ptr %dst) {
 ;
 ; POW2-ONLY-LABEL: @store_try_reorder(
 ; POW2-ONLY-NEXT:  entry:
-; POW2-ONLY-NEXT:    [[ADD:%.*]] = add i32 0, 0
-; POW2-ONLY-NEXT:    store i32 [[ADD]], ptr [[DST:%.*]], align 4
-; POW2-ONLY-NEXT:    [[ARRAYIDX_I1887:%.*]] = getelementptr i32, ptr [[DST]], i64 1
-; POW2-ONLY-NEXT:    store <2 x i32> zeroinitializer, ptr [[ARRAYIDX_I1887]], align 4
+; POW2-ONLY-NEXT:    store <2 x i32> zeroinitializer, ptr [[ARRAYIDX_I1887:%.*]], align 4
+; POW2-ONLY-NEXT:    [[ADD216:%.*]] = sub i32 0, 0
+; POW2-ONLY-NEXT:    [[ARRAYIDX_I1891:%.*]] = getelementptr i32, ptr [[ARRAYIDX_I1887]], i64 2
+; POW2-ONLY-NEXT:    store i32 [[ADD216]], ptr [[ARRAYIDX_I1891]], align 4
 ; POW2-ONLY-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll
index fd3d4ab80b29c..ff897180cc9b7 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll
@@ -7,13 +7,12 @@ define void @test(ptr %a, i64 %0) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[A]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x ptr> [[TMP1]], <2 x ptr> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> , i64 [[TMP0]], i32 0
 ; CHECK-NEXT:    br label %[[BB:.*]]
 ; CHECK:       [[BB]]:
-; CHECK-NEXT:    [[TMP3:%.*]] = or disjoint i64 [[TMP0]], 1
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i64> [[TMP4]], i64 0, i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = or disjoint <2 x i64> [[TMP3]],
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr double, <2 x ptr> [[TMP2]], <2 x i64> [[TMP5]]
-; CHECK-NEXT:    [[ARRAYIDX17_I28_1:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP3]]
+; CHECK-NEXT:    [[ARRAYIDX17_I28_1:%.*]] = extractelement <2 x ptr> [[TMP6]], i32 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> [[TMP6]], i32 8, <2 x i1> splat (i1 true), <2 x double> poison)
 ; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x double>, ptr [[A]], align 8
 ; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x double>, ptr [[A]], align 8
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll
index 7ab5e4d6cb787..481d586e6658a 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll
@@ -324,10 +324,10 @@ define void @store_try_reorder(ptr %dst) {
 ;
 ; POW2-ONLY-LABEL: @store_try_reorder(
 ; POW2-ONLY-NEXT:  entry:
-; POW2-ONLY-NEXT:    [[ADD:%.*]] = add i32 0, 0
-; POW2-ONLY-NEXT:    store i32 [[ADD]], ptr [[DST:%.*]], align 4
-; POW2-ONLY-NEXT:    [[ARRAYIDX_I1887:%.*]] = getelementptr i32, ptr [[DST]], i64 1
-; POW2-ONLY-NEXT:    store <2 x i32> zeroinitializer, ptr [[ARRAYIDX_I1887]], align 4
+; POW2-ONLY-NEXT:    store <2 x i32> zeroinitializer, ptr [[ARRAYIDX_I1887:%.*]], align 4
+; POW2-ONLY-NEXT:    [[ADD216:%.*]] = sub i32 0, 0
+; POW2-ONLY-NEXT:    [[ARRAYIDX_I1891:%.*]] = getelementptr i32, ptr [[ARRAYIDX_I1887]], i64 2
+; POW2-ONLY-NEXT:    store i32 [[ADD216]], ptr [[ARRAYIDX_I1891]], align 4
 ; POW2-ONLY-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/BinOpSameOpcodeHelper.ll b/llvm/test/Transforms/SLPVectorizer/X86/BinOpSameOpcodeHelper.ll
new file mode 100644
index 0000000000000..6f27555aeb3f1
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/BinOpSameOpcodeHelper.ll
@@ -0,0 +1,36 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -passes=slp-vectorizer -S %s | FileCheck %s
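+;
+; Regression test for BinOpSameOpcodeHelper: a mul/sub/add bundle feeding
+; lshr instructions must be handled without miscompiling or asserting.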
+
+define void @test() {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 0, 0
+; CHECK-NEXT:    [[TMP1:%.*]] = sub i64 0, 1
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP1]], 0
+; CHECK-NEXT:    [[UMIN120:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP0]], i64 [[TMP2]])
+; CHECK-NEXT:    [[TMP3:%.*]] = sub i64 0, 0
+; CHECK-NEXT:    [[TMP4:%.*]] = lshr i64 [[TMP3]], 0
+; CHECK-NEXT:    [[UMIN122:%.*]] = call i64 @llvm.umin.i64(i64 [[UMIN120]], i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP5:%.*]] = add i64 0, 1
+; CHECK-NEXT:    [[TMP6:%.*]] = lshr i64 [[TMP5]], 0
+; CHECK-NEXT:    [[UMIN123:%.*]] = call i64 @llvm.umin.i64(i64 [[UMIN122]], i64 [[TMP6]])
+; CHECK-NEXT:    [[UMIN124:%.*]] = call i64 @llvm.umin.i64(i64 [[UMIN123]], i64 0)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = mul i64 0, 0
+  %1 = lshr i64 %0, 0
+  %2 = sub i64 0, 1
+  %3 = lshr i64 %2, 0
+  %umin120 = call i64 @llvm.umin.i64(i64 %1, i64 %3)
+  %4 = sub i64 0, 0
+  %5 = lshr i64 %4, 0
+  %umin122 = call i64 @llvm.umin.i64(i64 %umin120, i64 %5)
+  %6 = add i64 0, 1
+  %7 = lshr i64 %6, 0
+  %umin123 = call i64 @llvm.umin.i64(i64 %umin122, i64 %7)
+  %umin124 = call i64 @llvm.umin.i64(i64 %umin123, i64 0)
+  ret void
+}
+
+declare i64 @llvm.umin.i64(i64, i64)
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/barriercall.ll b/llvm/test/Transforms/SLPVectorizer/X86/barriercall.ll
index f46a5d84a86cc..a39e602e2da71 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/barriercall.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/barriercall.ll
@@ -10,9 +10,7 @@ define i32 @foo(ptr nocapture %A, i32 %n) {
 ; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 (...) @bar()
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i32 0
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP1:%.*]] = mul nsw <4 x i32> [[SHUFFLE]],
-; CHECK-NEXT:    [[TMP2:%.*]] = shl <4 x i32> [[SHUFFLE]],
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = mul <4 x i32> [[SHUFFLE]],
 ; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], splat (i32 9)
 ; CHECK-NEXT:    store <4 x i32> [[TMP4]], ptr [[A:%.*]], align 4
 ; CHECK-NEXT:    ret i32 undef
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bottom-to-top-reorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/bottom-to-top-reorder.ll
index 889f5a95c81d6..299677ca80b34 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/bottom-to-top-reorder.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/bottom-to-top-reorder.ll
@@ -10,15 +10,10 @@ define void @test(ptr %0, ptr %1, ptr %2) {
 ; CHECK-NEXT:    [[TMP11:%.*]] = sub <4 x i32> , [[TMP8]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = sub <4 x i32> [[TMP11]], [[TMP10]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = add <4 x i32> [[TMP12]], [[TMP6]]
-; CHECK-NEXT:    [[TMP14:%.*]] = add <4 x i32> [[TMP13]],
-; CHECK-NEXT:    [[TMP15:%.*]] = sub <4 x i32> [[TMP13]],
-; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], <4 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = add <4 x i32> , [[TMP13]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = add <4 x i32> [[TMP16]], zeroinitializer
-; CHECK-NEXT:    [[TMP18:%.*]] = sub <4 x i32> [[TMP16]], zeroinitializer
-; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <4 x i32> [[TMP17]], <4 x i32> [[TMP18]], <4 x i32>
-; CHECK-NEXT:    [[TMP20:%.*]] = add <4 x i32> [[TMP19]], zeroinitializer
-; CHECK-NEXT:    [[TMP21:%.*]] = sub <4 x i32> [[TMP19]], zeroinitializer
-; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <4 x i32> [[TMP20]], <4 x i32> [[TMP21]], <4 x i32>
+; CHECK-NEXT:    [[TMP14:%.*]] = add <4 x i32> [[TMP17]], zeroinitializer
+; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <4 x i32> [[TMP14]], <4 x i32> poison, <4 x i32>
 ; CHECK-NEXT:    store <4 x i32> [[TMP22]], ptr [[TMP2:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-postpone-for-dependency.ll b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-postpone-for-dependency.ll
index 43c42c1ea2bfb..03a89e54e4212 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-postpone-for-dependency.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-postpone-for-dependency.ll
@@ -8,15 +8,13 @@ define void @test() {
 ; CHECK:       [[BB1:.*]]:
 ; CHECK-NEXT:    br label %[[BB2:.*]]
 ; CHECK:       [[BB2]]:
-; CHECK-NEXT:    [[TMP0:%.*]] = phi <4 x i32> [ poison, %[[BB1]] ], [ [[TMP5:%.*]], %[[BB6]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = phi <4 x i32> [ poison, %[[BB1]] ], [ [[TMP4:%.*]], %[[BB6]] ]
 ; CHECK-NEXT:    ret void
 ; CHECK:       [[BB6]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x i32> [ zeroinitializer, %[[BB]] ], [ [[TMP8:%.*]], %[[BB6]] ]
 ; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> , <4 x i32> [[TMP6]], <4 x i32>
-; CHECK-NEXT:    [[TMP3:%.*]] = ashr <4 x i32> zeroinitializer, [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = mul <4 x i32> zeroinitializer, [[TMP2]]
-; CHECK-NEXT:    [[TMP5]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> , <4 x i32> [[TMP6]], <4 x i32>
+; CHECK-NEXT:    [[TMP4]] = mul <4 x i32> [[TMP3]], zeroinitializer
 ; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> , <2 x i32>
 ; CHECK-NEXT:    [[TMP8]] = mul <2 x i32> zeroinitializer, [[TMP7]]
 ; CHECK-NEXT:    br i1 false, label %[[BB2]], label %[[BB6]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bv-shuffle-mask.ll b/llvm/test/Transforms/SLPVectorizer/X86/bv-shuffle-mask.ll
index 766916fe71f35..c4ddc5d63cc04 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/bv-shuffle-mask.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/bv-shuffle-mask.ll
@@ -7,14 +7,12 @@ define i16 @test(i16 %v1, i16 %v2) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i16> , i16 [[V2]], i32 3
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i16> , i16 [[V1]], i32 3
-; CHECK-NEXT:    [[TMP2:%.*]] = or <4 x i16> [[TMP0]], [[TMP1]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = and <4 x i16> [[TMP0]], [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32>
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <2 x i32>
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i16> [[TMP5]], i16 [[V1]], i32 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i16> [[TMP6]], <2 x i16> poison, <4 x i32>
 ; CHECK-NEXT:    [[TMP8:%.*]] = or <4 x i16> [[TMP7]], zeroinitializer
-; CHECK-NEXT:    [[TMP9:%.*]] = and <4 x i16> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = and <4 x i16> [[TMP3]], zeroinitializer
 ; CHECK-NEXT:    [[TMP10:%.*]] = and <4 x i16> [[TMP9]], zeroinitializer
 ; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne <4 x i16> [[TMP10]], zeroinitializer
 ; CHECK-NEXT:    [[TMP12:%.*]] = or <4 x i1> [[TMP11]], zeroinitializer
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll
index 1c62e57edfc46..514d5f974cb16 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll
@@ -4,17 +4,13 @@ define i64 @foo(i32 %tmp7) {
 ; CHECK-LABEL: @foo(
 ; CHECK-NEXT:  bb:
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> , i32 [[TMP5:%.*]], i32 2
-; CHECK-NEXT:    [[TMP3:%.*]] = sub <4 x i32> [[TMP2]], zeroinitializer
-; CHECK-NEXT:    [[TMP24:%.*]] = sub i32 undef, 0
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <8 x i32> , i32 [[TMP24]], i32 4
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> [[TMP0]], i32 0, i32 5
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <8 x i32> , i32 [[TMP24]], i32 6
-; CHECK-NEXT:    [[TMP12:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> poison, <4 x i32> [[TMP3]], i64 0)
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP12]], <8 x i32> [[TMP11]], <8 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <8 x i32> [[TMP1]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = sub nsw <8 x i32> [[TMP1]], [[TMP4]]
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> [[TMP6]], <8 x i32>
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <8 x i32> , i32 [[TMP8:%.*]], i32 3
+; CHECK-NEXT:    [[TMP4:%.*]] = sub <8 x i32> [[TMP0]],
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> , <8 x i32> [[TMP4]], <8 x i32>
+; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <8 x i32> [[TMP2]], i32 0, i32 5
+; CHECK-NEXT:    [[TMP5:%.*]] = sub nsw <8 x i32> [[TMP13]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = add nsw <8 x i32> [[TMP13]], [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> [[TMP6]], <8 x i32>
 ; CHECK-NEXT:    [[TMP8:%.*]] = add <8 x i32> zeroinitializer, [[TMP7]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = xor <8 x i32> [[TMP8]], zeroinitializer
 ; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP9]])
@@ -29,7 +25,7 @@ bb:
   %tmp4 = xor i32 %tmp3, 0
   %tmp6 = sub i32 0, 0
   %tmp8 = sub i32 %tmp7, 0
-  %tmp9 = sub nsw i32 0, undef
+  %tmp9 = sub nsw i32 0, poison
   %tmp10 = add nsw i32 0, %tmp6
   %tmp11 = sub nsw i32 0, %tmp8
   %tmp12 = add i32 0, %tmp10
@@ -44,10 +40,10 @@ bb:
   %tmp21 = add i32 %tmp20, %tmp17
   %tmp22 = sub i32 0, 0
   %tmp23 = add i32 0, 0
-  %tmp24 = sub i32 undef, 0
-  %tmp25 = add nsw i32 %tmp23, undef
+  %tmp24 = sub i32 poison, 0
+  %tmp25 = add nsw i32 %tmp23, poison
   %tmp26 = add nsw i32 %tmp24, %tmp22
-  %tmp27 = sub nsw i32 undef, %tmp24
+  %tmp27 = sub nsw i32 poison, %tmp24
   %tmp28 = add i32 0, %tmp25
   %tmp29 = xor i32 %tmp28, 0
   %tmp30 = add i32 0, %tmp26
@@ -58,7 +54,7 @@ bb:
   %tmp35 = add i32 %tmp34, %tmp29
   %tmp36 = add i32 %tmp35, 0
   %tmp37 = add i32 %tmp36, %tmp33
-  %tmp38 = sub nsw i32 0, undef
+  %tmp38 = sub nsw i32 0, poison
   %tmp39 = add i32 0, %tmp38
   %tmp40 = xor i32 %tmp39, 0
   %tmp41 = add i32 0, %tmp37
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractcost.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractcost.ll
index 02c3173adc654..c6f5308cf54aa 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/extractcost.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extractcost.ll
@@ -9,9 +9,7 @@ define i32 @foo(ptr nocapture %A, i32 %n, i32 %m) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i32 0
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP1:%.*]] = mul nsw <4 x i32> [[SHUFFLE]],
-; CHECK-NEXT:    [[TMP2:%.*]] = shl <4 x i32> [[SHUFFLE]],
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = mul <4 x i32> [[SHUFFLE]],
 ; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], splat (i32 9)
 ; CHECK-NEXT:    store <4 x i32> [[TMP4]], ptr [[A:%.*]], align 4
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gathered-delayed-nodes-with-reused-user.ll b/llvm/test/Transforms/SLPVectorizer/X86/gathered-delayed-nodes-with-reused-user.ll
index b39480b12496b..5a9ea0d292fa0 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/gathered-delayed-nodes-with-reused-user.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/gathered-delayed-nodes-with-reused-user.ll
@@ -6,26 +6,26 @@ define i64 @foo() {
 ; CHECK-LABEL: define i64 @foo() {
 ; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[ADD7:%.*]] = add i64 0, 0
 ; CHECK-NEXT:    br label [[BB3:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[PHI:%.*]] = phi i64 [ [[ADD:%.*]], [[BB3]] ]
-; CHECK-NEXT:    [[PHI2:%.*]] = phi i64 [ [[TMP9:%.*]], [[BB3]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x i64> [ [[TMP5:%.*]], [[BB3]] ]
 ; CHECK-NEXT:    ret i64 0
 ; CHECK:       bb3:
-; CHECK-NEXT:    [[PHI5:%.*]] = phi i64 [ 0, [[BB:%.*]] ], [ 0, [[BB3]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x i64> [ zeroinitializer, [[BB]] ], [ [[TMP7:%.*]], [[BB3]] ]
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
-; CHECK-NEXT:    [[ADD]] = add i64 [[TMP3]], [[TMP2]]
-; CHECK-NEXT:    [[GETELEMENTPTR:%.*]] = getelementptr i64, ptr addrspace(1) null, i64 0
-; CHECK-NEXT:    [[TMP9]] = or i64 [[PHI5]], 0
-; CHECK-NEXT:    [[ICMP:%.*]] = icmp ult i64 [[TMP9]], 0
-; CHECK-NEXT:    [[TMP7]] = insertelement <2 x i64> , i64 [[ADD]], i32 0
+; CHECK-NEXT:    [[PHI4:%.*]] = phi i64 [ 0, [[BB:%.*]] ], [ 0, [[BB3]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = phi <2 x i64> [ zeroinitializer, [[BB]] ], [ [[TMP3:%.*]], [[BB3]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> , <2 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i64> , i64 [[PHI4]], i32 0
+; CHECK-NEXT:    [[TMP3]] = add <2 x i64> [[TMP4]], [[TMP2]]
+; CHECK-NEXT:    [[TMP5]] = add <2 x i64> [[TMP0]], [[TMP2]]
+; CHECK-NEXT:    [[GETELEMENTPTR:%.*]] = getelementptr i64, ptr addrspace(1) null, i64 [[ADD7]]
+; CHECK-NEXT:    [[OR:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1
+; CHECK-NEXT:    [[ICMP:%.*]] = icmp ult i64 [[OR]], 0
 ; CHECK-NEXT:    br i1 false, label [[BB3]], label [[BB1:%.*]]
 ;
 ; FORCED-LABEL: define i64 @foo() {
 ; FORCED-NEXT:  bb:
-; FORCED-NEXT:    [[TMP8:%.*]] = add i64 0, 0
+; FORCED-NEXT:    [[ADD7:%.*]] = add i64 0, 0
 ; FORCED-NEXT:    br label [[BB3:%.*]]
 ; FORCED:       bb1:
 ; FORCED-NEXT:    [[TMP0:%.*]] = phi <2 x i64> [ [[TMP5:%.*]], [[BB3]] ]
@@ -36,12 +36,10 @@ define i64 @foo() {
 ; FORCED-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> , <2 x i32>
 ; FORCED-NEXT:    [[TMP2:%.*]] = insertelement <2 x i64> , i64 [[PHI5]], i32 0
 ; FORCED-NEXT:    [[TMP7]] = add <2 x i64> [[TMP6]], [[TMP2]]
-; FORCED-NEXT:    [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]]
-; FORCED-NEXT:    [[TMP4:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]]
-; FORCED-NEXT:    [[TMP5]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i32>
-; FORCED-NEXT:    [[GETELEMENTPTR:%.*]] = getelementptr i64, ptr addrspace(1) null, i64 [[TMP8]]
-; FORCED-NEXT:    [[TMP9:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1
-; FORCED-NEXT:    [[ICMP:%.*]] = icmp ult i64 [[TMP9]], 0
+; FORCED-NEXT:    [[TMP5]] = add <2 x i64> [[TMP1]], [[TMP2]]
+; FORCED-NEXT:    [[GETELEMENTPTR:%.*]] = getelementptr i64, ptr addrspace(1) null, i64 [[ADD7]]
+; FORCED-NEXT:    [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1
+; FORCED-NEXT:    [[ICMP:%.*]] = icmp ult i64 [[TMP8]], 0
 ; FORCED-NEXT:    br i1 false, label [[BB3]], label [[BB1:%.*]]
 ;
 bb:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-drop-wrapping-flags.ll b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-drop-wrapping-flags.ll
index 2a5bfa7390770..0198b1c5cb846 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-drop-wrapping-flags.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/minbitwidth-drop-wrapping-flags.ll
@@ -9,9 +9,7 @@ define i32 @test() {
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i8> poison, i8 [[A_PROMOTED]], i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i8> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = or <4 x i8> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP3]], <4 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i16>
 ; CHECK-NEXT:    [[TMP6:%.*]] = add <4 x i16> [[TMP5]],
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP6]])
 ; CHECK-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP7]] to i32
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/multi-extracts-bv-combined.ll b/llvm/test/Transforms/SLPVectorizer/X86/multi-extracts-bv-combined.ll
index e6a166c27ac49..230e165e43edc 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/multi-extracts-bv-combined.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/multi-extracts-bv-combined.ll
@@ -8,10 +8,8 @@ define i32 @foo() {
 ; CHECK-NEXT:    [[D:%.*]] = load i32, ptr null, align 4
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> , i32 [[D]], i32 1
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <8 x i32>
-; CHECK-NEXT:    [[TMP2:%.*]] = or <8 x i32> zeroinitializer, [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = add <8 x i32> zeroinitializer, [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP3]], <8 x i32>
-; CHECK-NEXT:    store <8 x i32> [[TMP4]], ptr getelementptr inbounds ([64 x i32], ptr null, i64 0, i64 15), align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = add <8 x i32> zeroinitializer, [[TMP1]]
+; CHECK-NEXT:    store <8 x i32> [[TMP2]], ptr getelementptr inbounds ([64 x i32], ptr null, i64 0, i64 15), align 4
 ; CHECK-NEXT:    ret i32 0
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/non-scheduled-inst-reused-as-last-inst.ll b/llvm/test/Transforms/SLPVectorizer/X86/non-scheduled-inst-reused-as-last-inst.ll
index 1163c8219dabe..034fe82862950 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/non-scheduled-inst-reused-as-last-inst.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/non-scheduled-inst-reused-as-last-inst.ll
@@ -4,6 +4,24 @@
 ; RUN:   -slp-skip-early-profitability-check < %s | FileCheck %s --check-prefixes=FORCED
 
 define void @foo() {
+; CHECK-LABEL: define void @foo() {
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> , i32 0, i32 0
+; CHECK-NEXT:    br label [[BB1:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x i32> [ zeroinitializer, [[BB:%.*]] ], [ [[TMP6:%.*]], [[BB4:%.*]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = shl <2 x i32> [[TMP1]], [[TMP0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP1]], <2 x i32>
+; CHECK-NEXT:    [[TMP6]] = or <2 x i32> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i32> [[TMP6]], i32 0
+; CHECK-NEXT:    [[CALL:%.*]] = call i64 null(i32 [[TMP7]])
+; CHECK-NEXT:    br label [[BB4]]
+; CHECK:       bb4:
+; CHECK-NEXT:    br i1 false, label [[BB5:%.*]], label [[BB1]]
+; CHECK:       bb5:
+; CHECK-NEXT:    [[TMP8:%.*]] = phi <2 x i32> [ [[TMP2]], [[BB4]] ]
+; CHECK-NEXT:    ret void
+;
 ; FORCED-LABEL: define void @foo() {
 ; FORCED-NEXT:  bb:
 ; FORCED-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> , i32 0, i32 0
@@ -11,9 +29,7 @@ define void @foo() {
 ; FORCED:       bb1:
 ; FORCED-NEXT:    [[TMP1:%.*]] = phi <2 x i32> [ zeroinitializer, [[BB:%.*]] ], [ [[TMP6:%.*]], [[BB4:%.*]] ]
 ; FORCED-NEXT:    [[TMP2:%.*]] = shl <2 x i32> [[TMP1]], [[TMP0]]
-; FORCED-NEXT:    [[TMP3:%.*]] = or <2 x i32> [[TMP1]], [[TMP0]]
-; FORCED-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32>
-; FORCED-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP1]], <2 x i32>
+; FORCED-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP1]], <2 x i32>
 ; FORCED-NEXT:    [[TMP6]] = or <2 x i32> [[TMP5]], zeroinitializer
 ; FORCED-NEXT:    [[TMP7:%.*]] = extractelement <2 x i32> [[TMP6]], i32 0
 ; FORCED-NEXT:    [[CALL:%.*]] = call i64 null(i32 [[TMP7]])
@@ -21,29 +37,9 @@ define void @foo() {
 ; FORCED:       bb4:
 ; FORCED-NEXT:    br i1 false, label [[BB5:%.*]], label [[BB1]]
 ; FORCED:       bb5:
-; FORCED-NEXT:    [[TMP8:%.*]] = phi <2 x i32> [ [[TMP4]], [[BB4]] ]
+; FORCED-NEXT:    [[TMP8:%.*]] = phi <2 x i32> [ [[TMP2]], [[BB4]] ]
 ; FORCED-NEXT:    ret void
 ;
-; CHECK-LABEL: define void @foo() {
-; CHECK-NEXT:  bb:
-; CHECK-NEXT:    br label [[BB1:%.*]]
-; CHECK:       bb1:
-; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x i32> [ zeroinitializer, [[BB:%.*]] ], [ [[TMP6:%.*]], [[BB4:%.*]] ]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
-; CHECK-NEXT:    [[SHL:%.*]] = shl i32 [[TMP2]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[SHL]], i32 0
-; CHECK-NEXT:    [[TMP6]] = or <2 x i32> [[TMP5]], zeroinitializer
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i32> [[TMP6]], i32 0
-; CHECK-NEXT:    [[CALL:%.*]] = call i64 null(i32 [[TMP7]])
-; CHECK-NEXT:    br label [[BB4]]
-; CHECK:       bb4:
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i32> [[TMP6]], i32 1
-; CHECK-NEXT:    br i1 false, label [[BB5:%.*]], label [[BB1]]
-; CHECK:       bb5:
-; CHECK-NEXT:    [[PHI6:%.*]] = phi i32 [ [[SHL]], [[BB4]] ]
-; CHECK-NEXT:    [[PHI7:%.*]] = phi i32 [ [[TMP8]], [[BB4]] ]
-; CHECK-NEXT:    ret void
-;
 bb:
   br label %bb1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/propagate_ir_flags.ll b/llvm/test/Transforms/SLPVectorizer/X86/propagate_ir_flags.ll
index cb02f4d10923c..ad8e905a8ca02 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/propagate_ir_flags.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/propagate_ir_flags.ll
@@ -330,9 +330,7 @@ define void @only_arcp(ptr %x) {
 define void @addsub_all_nsw(ptr %x) {
 ; CHECK-LABEL: @addsub_all_nsw(
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr [[X:%.*]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> [[TMP2]], splat (i32 1)
-; CHECK-NEXT:    [[TMP4:%.*]] = sub nsw <4 x i32> [[TMP2]], splat (i32 1)
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[TMP2]],
 ; CHECK-NEXT:    store <4 x i32> [[TMP5]], ptr [[X]], align 4
 ; CHECK-NEXT:    ret void
 ;
@@ -361,9 +359,7 @@ define void @addsub_all_nsw(ptr %x) {
 define void @addsub_some_nsw(ptr %x) {
 ; CHECK-LABEL: @addsub_some_nsw(
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr [[X:%.*]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> [[TMP2]], splat (i32 1)
-; CHECK-NEXT:    [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], splat (i32 1)
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP2]],
 ; CHECK-NEXT:    store <4 x i32> [[TMP5]], ptr [[X]], align 4
 ; CHECK-NEXT:    ret void
 ;
@@ -392,9 +388,7 @@ define void @addsub_some_nsw(ptr %x) {
 define void @addsub_no_nsw(ptr %x) {
 ; CHECK-LABEL: @addsub_no_nsw(
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr [[X:%.*]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[TMP2]], splat (i32 1)
-; CHECK-NEXT:    [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], splat (i32 1)
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP2]],
 ; CHECK-NEXT:    store <4 x i32> [[TMP5]], ptr [[X]], align 4
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-vectorized-in-transform.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-vectorized-in-transform.ll
index 81f3bf99f3fd8..7fe6941d52da7 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-vectorized-in-transform.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-vectorized-in-transform.ll
@@ -9,16 +9,16 @@ define i32 @test(i1 %cond) {
 ; CHECK:       [[BB]]:
 ; CHECK-NEXT:    [[P1:%.*]] = phi i32 [ [[OR92:%.*]], %[[BB]] ], [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = phi <2 x i32> [ [[TMP8:%.*]], %[[BB]] ], [ zeroinitializer, %[[ENTRY]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = or i32 1, 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <4 x i32>
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> , <4 x i32>
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[P1]], i32 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i32> zeroinitializer, [[TMP4]]
 ; CHECK-NEXT:    [[OR92]] = or i32 1, 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> [[TMP5]])
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x i32> , i32 [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> , i32 [[OR92]], i32 0
+; CHECK-NEXT:    [[TMP8]] = xor <2 x i32> [[TMP9]], [[TMP7]]
 ; CHECK-NEXT:    [[OP_RDX:%.*]] = xor i32 [[TMP6]], [[OR92]]
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> poison, i32 [[OP_RDX]], i32 0
-; CHECK-NEXT:    [[TMP8]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP1]], i32 1
 ; CHECK-NEXT:    br i1 [[COND]], label %[[EXIT:.*]], label %[[BB]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret i32 [[OP_RDX]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder_diamond_match.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder_diamond_match.ll
index cda88620ab88a..fff2b72df613e 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reorder_diamond_match.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_diamond_match.ll
@@ -14,10 +14,8 @@ define void @test() {
 ; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i16> [[TMP8]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = sub <4 x i16> [[TMP8]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i32>
-; CHECK-NEXT:    [[TMP12:%.*]] = add <4 x i16> zeroinitializer, [[TMP11]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = sub <4 x i16> zeroinitializer, [[TMP11]]
-; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i32>
-; CHECK-NEXT:    [[TMP15:%.*]] = sext <4 x i16> [[TMP14]] to <4 x i32>
+; CHECK-NEXT:    [[TMP15:%.*]] = sext <4 x i16> [[TMP13]] to <4 x i32>
 ; CHECK-NEXT:    store <4 x i32> [[TMP15]], ptr [[TMP2]], align 16
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shuffle-mask-emission.ll b/llvm/test/Transforms/SLPVectorizer/X86/shuffle-mask-emission.ll
index fcc295de62adf..a17ccb4b46ef9 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/shuffle-mask-emission.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/shuffle-mask-emission.ll
@@ -6,11 +6,9 @@ define i1 @test() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[H_PROMOTED118_I_FR:%.*]] = freeze i32 1
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> , i32 [[H_PROMOTED118_I_FR]], i32 2
-; CHECK-NEXT:    [[TMP1:%.*]] = xor <4 x i32> zeroinitializer, [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i32> zeroinitializer, [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[TMP3]], <4 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> zeroinitializer, [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = and <4 x i32> [[TMP5]],
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq <4 x i32> [[TMP6]],
 ; CHECK-NEXT:    [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP7]])
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec3-base.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec3-base.ll
index 6e2a43ac5f9f1..15dd6756cd7db 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vec3-base.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vec3-base.ll
@@ -242,13 +242,18 @@ exit:
 }
 
 define void @store_try_reorder(ptr %dst) {
-; CHECK-LABEL: @store_try_reorder(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ADD:%.*]] = add i32 0, 0
-; CHECK-NEXT:    store i32 [[ADD]], ptr [[DST:%.*]], align 4
-; CHECK-NEXT:    [[ARRAYIDX_I1887:%.*]] = getelementptr i32, ptr [[DST]], i64 1
-; CHECK-NEXT:    store <2 x i32> zeroinitializer, ptr [[ARRAYIDX_I1887]], align 4
-; CHECK-NEXT:    ret void
+; NON-POW2-LABEL: @store_try_reorder(
+; NON-POW2-NEXT:  entry:
+; NON-POW2-NEXT:    store <3 x i32> zeroinitializer, ptr [[DST:%.*]], align 4
+; NON-POW2-NEXT:    ret void
+;
+; POW2-ONLY-LABEL: @store_try_reorder(
+; POW2-ONLY-NEXT:  entry:
+; POW2-ONLY-NEXT:    store <2 x i32> zeroinitializer, ptr [[DST:%.*]], align 4
+; POW2-ONLY-NEXT:    [[ADD216:%.*]] = sub i32 0, 0
+; POW2-ONLY-NEXT:    [[ARRAYIDX_I1891:%.*]] = getelementptr i32, ptr [[DST]], i64 2
+; POW2-ONLY-NEXT:    store i32 [[ADD216]], ptr [[ARRAYIDX_I1891]], align 4
+; POW2-ONLY-NEXT:    ret void
 ;
 entry:
   %add = add i32 0, 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll b/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
index 869a9d1aee80e..4f3d551e21122 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
@@ -192,9 +192,7 @@ define void @addsub0(ptr noalias %dst, ptr noalias %src) {
 ; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 2
 ; CHECK-NEXT:    store i32 [[TMP1]], ptr [[INCDEC_PTR1]], align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]],
-; CHECK-NEXT:    [[TMP4:%.*]] = sub nsw <2 x i32> [[TMP2]],
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <2 x i32> [[TMP2]],
 ; CHECK-NEXT:    store <2 x i32> [[TMP5]], ptr [[INCDEC_PTR3]], align 4
 ; CHECK-NEXT:    ret void
 ;
@@ -225,9 +223,7 @@ define void @addsub1(ptr noalias %dst, ptr noalias %src) {
 ; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 2
 ; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 2
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[SRC]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = add nsw <2 x i32> [[TMP0]], splat (i32 -1)
-; CHECK-NEXT:    [[TMP2:%.*]] = sub nsw <2 x i32> [[TMP0]], splat (i32 -1)
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <2 x i32> [[TMP0]],
 ; CHECK-NEXT:    store <2 x i32> [[TMP3]], ptr [[DST]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[INCDEC_PTR2]], align 4
diff --git a/llvm/test/Transforms/SLPVectorizer/alternate-opcode-sindle-bv.ll b/llvm/test/Transforms/SLPVectorizer/alternate-opcode-sindle-bv.ll
index c250029519590..9b6511d0d8284 100644
--- a/llvm/test/Transforms/SLPVectorizer/alternate-opcode-sindle-bv.ll
+++ b/llvm/test/Transforms/SLPVectorizer/alternate-opcode-sindle-bv.ll
@@ -1,18 +1,29 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s %}
-; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s %}
diff --git a/llvm/test/Transforms/SLPVectorizer/alternate-opcode-sindle-bv.ll b/llvm/test/Transforms/SLPVectorizer/alternate-opcode-sindle-bv.ll
index c250029519590..9b6511d0d8284 100644
--- a/llvm/test/Transforms/SLPVectorizer/alternate-opcode-sindle-bv.ll
+++ b/llvm/test/Transforms/SLPVectorizer/alternate-opcode-sindle-bv.ll
@@ -1,18 +1,29 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s %}
-; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s %}
+; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s --check-prefix=X86 %}
+; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s --check-prefix=AARCH64 %}
 
 define <2 x i32> @test(i32 %arg) {
-; CHECK-LABEL: define <2 x i32> @test(
-; CHECK-SAME: i32 [[ARG:%.*]]) {
-; CHECK-NEXT: bb:
-; CHECK-NEXT: [[OR:%.*]] = or i32 [[ARG]], 0
-; CHECK-NEXT: [[MUL:%.*]] = mul i32 0, 1
-; CHECK-NEXT: [[MUL1:%.*]] = mul i32 [[OR]], [[MUL]]
-; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 0, [[MUL1]]
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[OR]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[MUL]], i32 1
-; CHECK-NEXT: ret <2 x i32> [[TMP1]]
+; X86-LABEL: define <2 x i32> @test(
+; X86-SAME: i32 [[ARG:%.*]]) {
+; X86-NEXT: bb:
+; X86-NEXT: [[OR:%.*]] = or i32 [[ARG]], 0
+; X86-NEXT: [[MUL:%.*]] = mul i32 0, 1
+; X86-NEXT: [[MUL1:%.*]] = mul i32 [[OR]], [[MUL]]
+; X86-NEXT: [[CMP:%.*]] = icmp ugt i32 0, [[MUL1]]
+; X86-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[OR]], i32 0
+; X86-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[MUL]], i32 1
+; X86-NEXT: ret <2 x i32> [[TMP1]]
+;
+; AARCH64-LABEL: define <2 x i32> @test(
+; AARCH64-SAME: i32 [[ARG:%.*]]) {
+; AARCH64-NEXT: bb:
+; AARCH64-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> , i32 [[ARG]], i32 0
+; AARCH64-NEXT: [[TMP1:%.*]] = mul <2 x i32> [[TMP0]], zeroinitializer
+; AARCH64-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; AARCH64-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; AARCH64-NEXT: [[MUL1:%.*]] = mul i32 [[TMP2]], [[TMP3]]
+; AARCH64-NEXT: [[CMP:%.*]] = icmp ugt i32 0, [[MUL1]]
+; AARCH64-NEXT: ret <2 x i32> [[TMP1]]
 ;
 bb:
   %or = or i32 %arg, 0
diff --git a/llvm/test/Transforms/SLPVectorizer/bbi-106161.ll b/llvm/test/Transforms/SLPVectorizer/bbi-106161.ll
new file mode 100644
index 0000000000000..29339c8a8aac8
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/bbi-106161.ll
@@ -0,0 +1,19 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=slp-vectorizer -S %s | FileCheck %s
+
+define i128 @f_768_3162(i16 %0) {
+; CHECK-LABEL: @f_768_3162(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[SHL22:%.*]] = shl i16 0, -1
+; CHECK-NEXT: [[MUL23:%.*]] = mul i16 0, [[TMP0:%.*]]
+; CHECK-NEXT: [[MUL24:%.*]] = mul i16 [[SHL22]], [[MUL23]]
+; CHECK-NEXT: store i16 [[MUL24]], ptr null, align 1
+; CHECK-NEXT: ret i128 0
+;
+entry:
+  %shl22 = shl i16 0, -1
+  %mul23 = mul i16 0, %0
+  %mul24 = mul i16 %shl22, %mul23
+  store i16 %mul24, ptr null, align 1
+  ret i128 0
+}
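bbi-106161.ll is a regression test for the shl-to-mul rewrite: `shl i16 %x, C` only equals
`mul i16 %x, 2^C` when C is a valid shift amount. Here the shift amount is -1 (65535 as i16), which is
greater than or equal to the bit width, so the shl yields poison and no i16 power-of-two multiplier
exists; the group must stay scalar. Roughly:

define i16 @sketch_ok(i16 %x) {
  %s = shl i16 %x, 3        ; valid: interchangeable with mul i16 %x, 8
  ret i16 %s
}

define i16 @sketch_bad(i16 %x) {
  %s = shl i16 %x, -1       ; shift amount 65535 >= 16: poison, and 2^65535
  ret i16 %s                ; does not fit in i16, so no mul form exists
}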
diff --git a/llvm/test/Transforms/SLPVectorizer/isOpcodeOrAlt.ll b/llvm/test/Transforms/SLPVectorizer/isOpcodeOrAlt.ll
new file mode 100644
index 0000000000000..414997fe8e6f7
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/isOpcodeOrAlt.ll
@@ -0,0 +1,61 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=slp-vectorizer -S -slp-max-reg-size=1024 %s | FileCheck %s
+
+define void @test1(ptr %a, ptr %b) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 0
+; CHECK-NEXT: [[GEP4:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 0
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[GEP0]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = shl <4 x i32> [[TMP0]], 
+; CHECK-NEXT: store <4 x i32> [[TMP1]], ptr [[GEP4]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+  %gep0 = getelementptr inbounds i32, ptr %a, i64 0
+  %gep1 = getelementptr inbounds i32, ptr %a, i64 1
+  %gep2 = getelementptr inbounds i32, ptr %a, i64 2
+  %gep3 = getelementptr inbounds i32, ptr %a, i64 3
+  %0 = load i32, ptr %gep0, align 4
+  %1 = load i32, ptr %gep1, align 4
+  %2 = load i32, ptr %gep2, align 4
+  %3 = load i32, ptr %gep3, align 4
+  %op0 = shl i32 %0, 1
+  %op1 = add i32 %1, zeroinitializer
+  %op2 = mul i32 %2, 2
+  %op3 = shl i32 %3, zeroinitializer
+  %gep4 = getelementptr inbounds i32, ptr %b, i64 0
+  %gep5 = getelementptr inbounds i32, ptr %b, i64 1
+  %gep6 = getelementptr inbounds i32, ptr %b, i64 2
+  %gep7 = getelementptr inbounds i32, ptr %b, i64 3
+  store i32 %op0, ptr %gep4, align 4
+  store i32 %op1, ptr %gep5, align 4
+  store i32 %op2, ptr %gep6, align 4
+  store i32 %op3, ptr %gep7, align 4
+  ret void
+}
+
+define void @test2(i64 %_xstride) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[MUL3:%.*]] = mul i64 [[_XSTRIDE:%.*]], 1
+; CHECK-NEXT: [[MUL5:%.*]] = mul i64 0, 0
+; CHECK-NEXT: [[MUL9:%.*]] = sub i64 0, [[_XSTRIDE]]
+; CHECK-NEXT: [[MUL12:%.*]] = shl i64 [[_XSTRIDE]], 1
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr i8, ptr null, i64 [[MUL3]]
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr i8, ptr null, i64 [[MUL5]]
+; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr i8, ptr null, i64 [[MUL9]]
+; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i8, ptr null, i64 [[MUL12]]
+; CHECK-NEXT: ret void
+;
+entry:
+  %mul3 = mul i64 %_xstride, 1
+  %mul5 = mul i64 0, 0
+  %mul9 = sub i64 0, %_xstride
+  %mul12 = shl i64 %_xstride, 1
+  %arrayidx = getelementptr i8, ptr null, i64 %mul3
+  %arrayidx6 = getelementptr i8, ptr null, i64 %mul5
+  %arrayidx10 = getelementptr i8, ptr null, i64 %mul9
+  %arrayidx13 = getelementptr i8, ptr null, i64 %mul12
+  ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/resized-alt-shuffle-after-minbw.ll b/llvm/test/Transforms/SLPVectorizer/resized-alt-shuffle-after-minbw.ll
index 056b6222cae72..caca410f056c1 100644
--- a/llvm/test/Transforms/SLPVectorizer/resized-alt-shuffle-after-minbw.ll
+++ b/llvm/test/Transforms/SLPVectorizer/resized-alt-shuffle-after-minbw.ll
@@ -6,11 +6,9 @@ define void @func(i32 %0) {
 ; CHECK-SAME: i32 [[TMP0:%.*]]) {
 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> , i32 [[TMP0]], i32 1
 ; CHECK-NEXT: [[TMP3:%.*]] = shl <4 x i32> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = or <4 x i32> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> 
 ; CHECK-NEXT: [[TMP6:%.*]] = shl i32 [[TMP0]], 0
 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 0
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <32 x i32> 
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <32 x i32> 
 ; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[TMP6]] to i64
 ; CHECK-NEXT: [[TMP10:%.*]] = or i64 [[TMP9]], 0
 ; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[TMP9]] to i32
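The isOpcodeOrAlt.ll tests and the resized-alt-shuffle change above exercise the same conversion table:
in @test1 the four lanes `x << 1`, `y + 0`, `z * 2`, `w << 0` can all be expressed as shifts (`y + 0` as
`y << 0`, `z * 2` as `z << 1`), and in @func `or x, 0` joins the `shl x, 0` group because both equal x.
A standalone sketch of the four-lane rewrite (the per-lane shift amounts are inferred; the CHECK line's
vector constant is elided above):

define <4 x i32> @sketch(<4 x i32> %v) {
  ; lanes were: shl 1, add 0, mul 2, shl 0 -- all representable as shl
  %r = shl <4 x i32> %v, <i32 1, i32 0, i32 1, i32 0>
  ret <4 x i32> %r
}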
diff --git a/llvm/test/Transforms/SLPVectorizer/shuffle-mask-resized.ll b/llvm/test/Transforms/SLPVectorizer/shuffle-mask-resized.ll
index 732b50396a460..cf5927bf58327 100644
--- a/llvm/test/Transforms/SLPVectorizer/shuffle-mask-resized.ll
+++ b/llvm/test/Transforms/SLPVectorizer/shuffle-mask-resized.ll
@@ -12,9 +12,7 @@ define i32 @test() {
 ; CHECK-NEXT: br i1 false, label [[BB4:%.*]], label [[BB3]]
 ; CHECK: bb3:
 ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> , <2 x i32> 
-; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i32> zeroinitializer, [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = or <2 x i32> zeroinitializer, [[TMP2]]
-; CHECK-NEXT: [[TMP5]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> 
+; CHECK-NEXT: [[TMP5]] = add <2 x i32> zeroinitializer, [[TMP2]]
 ; CHECK-NEXT: br label [[BB1]]
 ; CHECK: bb4:
 ; CHECK-NEXT: [[TMP6:%.*]] = phi <8 x i32> [ [[TMP1]], [[BB1]] ]
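The shuffle-mask-resized change is the add/or flavor of the same fold: `or x, 0` equals `add x, 0`, so
the add/or pair and its blending shuffle collapse into the single vector add kept above. A sketch:

define <2 x i32> @sketch(<2 x i32> %v) {
  ; x | 0 == x + 0 == x, so both lanes use add
  %r = add <2 x i32> zeroinitializer, %v
  ret <2 x i32> %r
}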