diff --git a/llvm/lib/Target/RISCV/CMakeLists.txt b/llvm/lib/Target/RISCV/CMakeLists.txt index 8715403f3839a..8d9cb65940097 100644 --- a/llvm/lib/Target/RISCV/CMakeLists.txt +++ b/llvm/lib/Target/RISCV/CMakeLists.txt @@ -43,6 +43,7 @@ add_llvm_target(RISCVCodeGen RISCVInstrInfo.cpp RISCVISelDAGToDAG.cpp RISCVISelLowering.cpp + RISCVLoopIdiomRecognize.cpp RISCVMachineFunctionInfo.cpp RISCVMergeBaseOffset.cpp RISCVOptWInstrs.cpp diff --git a/llvm/lib/Target/RISCV/RISCVLoopIdiomRecognize.cpp b/llvm/lib/Target/RISCV/RISCVLoopIdiomRecognize.cpp new file mode 100644 index 0000000000000..424a53ed81663 --- /dev/null +++ b/llvm/lib/Target/RISCV/RISCVLoopIdiomRecognize.cpp @@ -0,0 +1,732 @@ +//===-------- RISCVLoopIdiomRecognize.cpp - Loop idiom recognition --------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "RISCVLoopIdiomRecognize.h" +#include "llvm/ADT/ScopeExit.h" +#include "llvm/Analysis/DomTreeUpdater.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicsRISCV.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" +#include "llvm/TargetParser/RISCVTargetParser.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "riscv-loop-idiom" + +static cl::opt + DisableAll("riscv-disable-all-loop-idiom", cl::Hidden, cl::init(true), + cl::desc("Disable RISCV Loop Idiom Recognize Pass.")); + +static cl::opt DisableByteCmp( + "disable-riscv-loop-idiom-bytecmp", cl::Hidden, cl::init(false), + cl::desc("Proceed with RISCV 
Loop Idiom Recognize Pass, but do " + "not convert byte-compare loop(s).")); + +// CustomLoopIdiomLMUL can be used to customize LMUL for vectorizing loops. +// It uses the exponent value to represent LMUL i.e. 0 -> LMUL 1, 1 -> LMUL 2, 2 +// -> LMUL 4, 3 -> LMUL 8, etc. +static cl::opt + CustomLoopIdiomLMUL("riscv-loop-idiom-lmul", cl::Hidden, cl::init(1), + cl::desc("Customize LMUL for vector loop.")); + +namespace { + +class RISCVLoopIdiomRecognize { + Loop *CurLoop = nullptr; + DominatorTree &DT; + LoopInfo &LI; + TargetLibraryInfo &TLI; + const TargetTransformInfo &TTI; + const DataLayout &DL; + +public: + explicit RISCVLoopIdiomRecognize(DominatorTree &DT, LoopInfo &LI, + TargetLibraryInfo &TLI, + const TargetTransformInfo &TTI, + const DataLayout &DL) + : DT(DT), LI(LI), TLI(TLI), TTI(TTI), DL(DL) {} + + bool run(Loop *L); + +private: + /// \name Countable Loop Idiom Handling + /// @{ + + bool runOnCountableLoop(); + bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, + SmallVectorImpl &ExitBlocks); + + bool recognizeAndTransformByteCompare(); + Value *expandFindMismatch(IRBuilder<> &Builder, GetElementPtrInst *GEPA, + GetElementPtrInst *GEPB, Instruction *Index, + Value *Start, Value *MaxLen); + void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB, + PHINode *IndPhi, Value *MaxLen, Instruction *Index, + Value *Start, bool IncIdx, BasicBlock *FoundBB, + BasicBlock *EndBB); + + /// @} +}; +} // end anonymous namespace + +static VectorType *getBestVectorTypeForLoopIdiom(LLVMContext &Ctx) { + unsigned LMULExp = std::min(3U, CustomLoopIdiomLMUL.getValue()); + unsigned VF = (RISCV::RVVBitsPerBlock / 8) << LMULExp; + ElementCount EC = ElementCount::getScalable(VF); + return VectorType::get(Type::getInt8Ty(Ctx), EC); +} + +PreservedAnalyses +RISCVLoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, + LPMUpdater &) { + if (DisableAll) + return PreservedAnalyses::all(); + + Function &F = 
*L.getHeader()->getParent(); + if (F.hasFnAttribute(Attribute::NoImplicitFloat)) { + LLVM_DEBUG(dbgs() << DEBUG_TYPE << " is disabled on " << F.getName() + << " due to its NoImplicitFloat attribute"); + return PreservedAnalyses::all(); + } + + // Only enabled on RV64 for now. + if (L.getHeader()->getModule()->getDataLayout().getPointerSizeInBits() != 64) + return PreservedAnalyses::all(); + + // Only enabled when vector extension is present. + if (!AR.TTI.supportsScalableVectors()) + return PreservedAnalyses::all(); + + const auto DL = L.getHeader()->getModule()->getDataLayout(); + + RISCVLoopIdiomRecognize LIR(AR.DT, AR.LI, AR.TLI, AR.TTI, DL); + if (!LIR.run(&L)) + return PreservedAnalyses::all(); + + auto PA = PreservedAnalyses::none(); + PA.preserve(); + return PA; +} + +//===----------------------------------------------------------------------===// +// +// Implementation of RISCVLoopIdiomRecognize +// +//===----------------------------------------------------------------------===// + +bool RISCVLoopIdiomRecognize::run(Loop *L) { + CurLoop = L; + + if (DisableAll) + return false; + + // If the loop could not be converted to canonical form, it must have an + // indirectbr in it, just give up. + if (!L->getLoopPreheader()) + return false; + + LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F[" + << CurLoop->getHeader()->getParent()->getName() + << "] Loop %" << CurLoop->getHeader()->getName() << "\n"); + + return recognizeAndTransformByteCompare(); +} + +bool RISCVLoopIdiomRecognize::recognizeAndTransformByteCompare() { + if (DisableByteCmp) + return false; + + BasicBlock *PH = CurLoop->getLoopPreheader(); + + // The preheader should only contain an unconditional branch. 
+ if (!PH || &PH->front() != PH->getTerminator()) + return false; + + using namespace PatternMatch; + + BasicBlock *Header; + if (!match(PH->getTerminator(), m_UnconditionalBr(Header))) + return false; + + if (Header != CurLoop->getHeader()) + return false; + + if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 2) + return false; + + auto *PN = dyn_cast(&Header->front()); + if (!PN || PN->getNumIncomingValues() != 2) + return false; + + auto LoopBlocks = CurLoop->getBlocks(); + // The first block in the loop should contain only 4 instructions, e.g. + // + // while.cond: + // %res.phi = phi i32 [ %start, %ph ], [ %inc, %while.body ] + // %inc = add i32 %res.phi, 1 + // %cmp.not = icmp eq i32 %inc, %n + // br i1 %cmp.not, label %while.end, label %while.body + // + auto CondBBInsts = LoopBlocks[0]->instructionsWithoutDebug(); + if (std::distance(CondBBInsts.begin(), CondBBInsts.end()) != 4) + return false; + + // The second block should contain 7 instructions, e.g. + // + // while.body: + // %idx = zext i32 %inc to i64 + // %idx.a = getelementptr inbounds i8, ptr %a, i64 %idx + // %load.a = load i8, ptr %idx.a + // %idx.b = getelementptr inbounds i8, ptr %b, i64 %idx + // %load.b = load i8, ptr %idx.b + // %cmp.not.ld = icmp eq i8 %load.a, %load.b + // br i1 %cmp.not.ld, label %while.cond, label %while.end + // + auto LoopBBInsts = LoopBlocks[1]->instructionsWithoutDebug(); + if (std::distance(LoopBBInsts.begin(), LoopBBInsts.end()) != 7) + return false; + + // The incoming value to the PHI node from the loop should be an add of 1. 
+ Instruction *Index = nullptr; + Value *StartIdx = nullptr; + for (BasicBlock *BB : PN->blocks()) { + if (!CurLoop->contains(BB)) { + StartIdx = PN->getIncomingValueForBlock(BB); + continue; + } + Index = dyn_cast<Instruction>(PN->getIncomingValueForBlock(BB)); + // Limit to 32-bit types for now + if (!Index || !Index->getType()->isIntegerTy(32) || + !match(Index, m_c_Add(m_Specific(PN), m_One()))) + return false; + } + + for (BasicBlock *BB : LoopBlocks) + for (Instruction &I : *BB) + if (&I != PN && &I != Index) + for (User *U : I.users()) { + auto *UI = cast<Instruction>(U); // asserts instead of yielding null + if (!CurLoop->contains(UI)) + return false; + } + + // Match the branch instruction for the header + ICmpInst::Predicate Pred; + Value *MaxLen; + BasicBlock *EndBB, *WhileBB; + if (!match(Header->getTerminator(), + m_Br(m_ICmp(Pred, m_Specific(Index), m_Value(MaxLen)), + m_BasicBlock(EndBB), m_BasicBlock(WhileBB)))) + return false; + + // Make sure Pred is comparing for equal + if (Pred != ICmpInst::ICMP_EQ) + return false; + + // Make sure EndBB is outside the loop and WhileBB is inside the loop. + if (CurLoop->contains(EndBB) || !CurLoop->contains(WhileBB)) + return false; + + // WhileBB should contain the pattern of load & compare instructions. Match + // the pattern and find the GEP instructions used by the loads. + ICmpInst::Predicate WhilePred; + BasicBlock *FoundBB; + BasicBlock *TrueBB; + Value *A, *B; + if (!match(WhileBB->getTerminator(), + m_Br(m_ICmp(WhilePred, m_Load(m_Value(A)), m_Load(m_Value(B))), + m_BasicBlock(TrueBB), m_BasicBlock(FoundBB)))) + return false; + + // Make sure WhilePred is comparing for equal + if (WhilePred != ICmpInst::ICMP_EQ) + return false; + + // Make sure TrueBB is the loop header and FoundBB is outside the loop. 
+ if (CurLoop->getHeader() != TrueBB || CurLoop->contains(FoundBB)) + return false; + + auto *GEPA = dyn_cast(A); + auto *GEPB = dyn_cast(B); + if (!GEPA || !GEPB) + return false; + + Value *PtrA = GEPA->getPointerOperand(); + Value *PtrB = GEPB->getPointerOperand(); + + // Check PtrA and PtrB stride at i8. + if (!CurLoop->isLoopInvariant(PtrA) || !CurLoop->isLoopInvariant(PtrB) || + !GEPA->getResultElementType()->isIntegerTy(8) || + !GEPB->getResultElementType()->isIntegerTy(8) || PtrA == PtrB) + return false; + + // Check loads from GEPA and GEPB are i8. + auto *LoadA = dyn_cast(GEPA->getNextNode()); + if (!LoadA || !LoadA->getType()->isIntegerTy(8)) + return false; + auto *LoadB = dyn_cast(GEPB->getNextNode()); + if (!LoadB || !LoadB->getType()->isIntegerTy(8)) + return false; + + // Check that the index to the GEPs is the index we found earlier + if (GEPA->getNumIndices() > 1 || GEPB->getNumIndices() > 1) + return false; + + Value *IdxA = GEPA->getOperand(GEPA->getNumIndices()); + Value *IdxB = GEPB->getOperand(GEPB->getNumIndices()); + + if (IdxA != IdxB || !match(IdxA, m_ZExt(m_Specific(Index)))) + return false; + + // We only ever expect the pre-incremented index value to be used inside the + // loop. + if (!PN->hasOneUse()) + return false; + + // Ensure that when the Found and End blocks are identical the PHIs have the + // supported format. We don't currently allow cases like this: + // while.cond: + // ... + // br i1 %cmp.not, label %while.end, label %while.body + // + // while.body: + // ... + // br i1 %cmp.not2, label %while.cond, label %while.end + // + // while.end: + // %final_ptr = phi ptr [ %c, %while.body ], [ %d, %while.cond ] + // + // Where the incoming values for %final_ptr are unique and from each of the + // loop blocks, but not actually defined in the loop. This requires extra + // work setting up the byte.compare block, i.e. by introducing a select to + // choose the correct value. + // TODO: We could add support for this in future. 
+ if (FoundBB == EndBB) { + for (PHINode &EndPN : EndBB->phis()) { + Value *WhileCondVal = EndPN.getIncomingValueForBlock(Header); + Value *WhileBodyVal = EndPN.getIncomingValueForBlock(WhileBB); + + // The value of the index when leaving the while.cond block is always the + // same as the end value (MaxLen) so we permit either. Otherwise for any + // other value defined outside the loop we only allow values that are the + // same as the exit value for while.body. + if (WhileCondVal != WhileBodyVal && + ((WhileCondVal != Index && WhileCondVal != MaxLen) || + (WhileBodyVal != Index && WhileBodyVal != MaxLen))) + return false; + } + } + + LLVM_DEBUG(dbgs() << "FOUND IDIOM IN LOOP: \n" + << *(EndBB->getParent()) << "\n\n"); + transformByteCompare(GEPA, GEPB, PN, MaxLen, Index, StartIdx, true, FoundBB, + EndBB); + LLVM_DEBUG(dbgs() << "AFTER IDIOM TRANSFORMATION: \n" + << *(EndBB->getParent()) << "\n\n"); + return true; +} + +Value *RISCVLoopIdiomRecognize::expandFindMismatch( + IRBuilder<> &Builder, GetElementPtrInst *GEPA, GetElementPtrInst *GEPB, + Instruction *Index, Value *Start, Value *MaxLen) { + Value *PtrA = GEPA->getPointerOperand(); + Value *PtrB = GEPB->getPointerOperand(); + + // Get the arguments and types for the intrinsic. + BasicBlock *Preheader = CurLoop->getLoopPreheader(); + auto *PHBranch = cast(Preheader->getTerminator()); + LLVMContext &Ctx = PHBranch->getContext(); + Type *LoadType = Type::getInt8Ty(Ctx); + Type *ResType = Builder.getInt32Ty(); + + // Split block at the original callsite, where the EndBlock continues from + // where the original call ended. + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); + BasicBlock *EndBlock = + SplitBlock(Preheader, PHBranch, &DT, &LI, nullptr, "mismatch_end"); + + // Safeguard to check if we build the correct DomTree with DTU. 
+ auto CheckDTU = llvm::make_scope_exit([&]() { + assert(DTU.getDomTree().verify() && "Ill-formed DomTree built by DTU"); + }); + + // Create the blocks that we're going to need: + // 1. A block for checking the zero-extended length exceeds 0 + // 2. A block to check that the start and end addresses of a given array + // lie on the same page. + // 3. The RVV loop preheader i.e. vector_loop_preheader + // 4. The first RVV loop block i.e. vector_loop + // 5. The RVV loop increment block i.e. vector_loop_inc + // 6. A block we can jump to from the RVV loop when a mismatch is found i.e. + // vector_loop_found + // 7. The first block of the scalar loop itself, containing PHIs, loads + // and cmp. + // 8. A scalar loop increment block to increment the PHIs and go back + // around the loop. + + BasicBlock *MinItCheckBlock = BasicBlock::Create( + Ctx, "mismatch_min_it_check", EndBlock->getParent(), EndBlock); + + // This DTU update is actually the only one we need to cover all control flow + // changes made in this function, because the current DTU algorithm + // recalculates the whole subtree between a deleted edge, and the edge between + // Preheader and EndBlock happens to enclose all the blocks we inserted + // in this function. 
+ DTU.applyUpdates({{DominatorTree::Insert, Preheader, MinItCheckBlock}, + {DominatorTree::Delete, Preheader, EndBlock}}); + + // Update the terminator added by SplitBlock to branch to the first block + Preheader->getTerminator()->setSuccessor(0, MinItCheckBlock); + + BasicBlock *MemCheckBlock = BasicBlock::Create( + Ctx, "mismatch_mem_check", EndBlock->getParent(), EndBlock); + + BasicBlock *RVVLoopPreheaderBlock = BasicBlock::Create( + Ctx, "mismatch_vector_loop_preheader", EndBlock->getParent(), EndBlock); + + BasicBlock *RVVLoopStartBlock = BasicBlock::Create( + Ctx, "mismatch_vector_loop", EndBlock->getParent(), EndBlock); + + BasicBlock *RVVLoopIncBlock = BasicBlock::Create( + Ctx, "mismatch_vector_loop_inc", EndBlock->getParent(), EndBlock); + + BasicBlock *RVVLoopMismatchBlock = BasicBlock::Create( + Ctx, "mismatch_vector_loop_found", EndBlock->getParent(), EndBlock); + + BasicBlock *LoopPreHeaderBlock = BasicBlock::Create( + Ctx, "mismatch_loop_pre", EndBlock->getParent(), EndBlock); + + BasicBlock *LoopStartBlock = + BasicBlock::Create(Ctx, "mismatch_loop", EndBlock->getParent(), EndBlock); + + BasicBlock *LoopIncBlock = BasicBlock::Create( + Ctx, "mismatch_loop_inc", EndBlock->getParent(), EndBlock); + + // Update LoopInfo with the new RVV & scalar loops. 
+ auto RVVLoop = LI.AllocateLoop(); + auto ScalarLoop = LI.AllocateLoop(); + if (CurLoop->getParentLoop()) { + CurLoop->getParentLoop()->addChildLoop(RVVLoop); + CurLoop->getParentLoop()->addChildLoop(ScalarLoop); + + CurLoop->getParentLoop()->addBasicBlockToLoop(MinItCheckBlock, LI); + CurLoop->getParentLoop()->addBasicBlockToLoop(MemCheckBlock, LI); + CurLoop->getParentLoop()->addBasicBlockToLoop(RVVLoopPreheaderBlock, LI); + CurLoop->getParentLoop()->addBasicBlockToLoop(RVVLoopMismatchBlock, LI); + CurLoop->getParentLoop()->addBasicBlockToLoop(LoopPreHeaderBlock, LI); + } else { + LI.addTopLevelLoop(RVVLoop); + LI.addTopLevelLoop(ScalarLoop); + } + + // Add the new basic blocks to their associated loops. + RVVLoop->addBasicBlockToLoop(RVVLoopStartBlock, LI); + RVVLoop->addBasicBlockToLoop(RVVLoopIncBlock, LI); + + ScalarLoop->addBasicBlockToLoop(LoopStartBlock, LI); + ScalarLoop->addBasicBlockToLoop(LoopIncBlock, LI); + + // Set up some types and constants that we intend to reuse. + Type *I64Type = Builder.getInt64Ty(); + Type *I32Type = Builder.getInt32Ty(); + + // Check the zero-extended iteration count > 0 + Builder.SetInsertPoint(MinItCheckBlock); + Value *ExtStart = Builder.CreateZExt(Start, I64Type); + Value *ExtEnd = Builder.CreateZExt(MaxLen, I64Type); + // This check doesn't really cost us very much. + + Value *LimitCheck = Builder.CreateICmpULE(Start, MaxLen); + BranchInst *MinItCheckBr = + BranchInst::Create(MemCheckBlock, LoopPreHeaderBlock, LimitCheck); + MinItCheckBr->setMetadata( + LLVMContext::MD_prof, + MDBuilder(MinItCheckBr->getContext()).createBranchWeights(99, 1)); + Builder.Insert(MinItCheckBr); + + // For each of the arrays, check the start/end addresses are on the same + // page. + Builder.SetInsertPoint(MemCheckBlock); + + // For each start address calculate the offset into the min architecturally + // allowed page size (4096). Then determine how many bytes there are left on + // the page and see if this is >= MaxLen. 
+ Value *LhsStartPage = Builder.CreateLShr( + Builder.CreatePtrToInt(Builder.CreateGEP(LoadType, PtrA, ExtStart), + I64Type), + 12U); + Value *LhsEndPage = Builder.CreateLShr( + Builder.CreatePtrToInt(Builder.CreateGEP(LoadType, PtrA, ExtEnd), + I64Type), + 12U); + Value *RhsStartPage = Builder.CreateLShr( + Builder.CreatePtrToInt(Builder.CreateGEP(LoadType, PtrB, ExtStart), + I64Type), + 12U); + Value *RhsEndPage = Builder.CreateLShr( + Builder.CreatePtrToInt(Builder.CreateGEP(LoadType, PtrB, ExtEnd), + I64Type), + 12U); + Value *LhsPageCmp = Builder.CreateICmpNE(LhsStartPage, LhsEndPage); + Value *RhsPageCmp = Builder.CreateICmpNE(RhsStartPage, RhsEndPage); + + BranchInst *CombinedPageCmpCmpBr = + BranchInst::Create(LoopPreHeaderBlock, RVVLoopPreheaderBlock, + Builder.CreateOr(LhsPageCmp, RhsPageCmp)); + CombinedPageCmpCmpBr->setMetadata( + LLVMContext::MD_prof, MDBuilder(CombinedPageCmpCmpBr->getContext()) + .createBranchWeights(10, 90)); + Builder.Insert(CombinedPageCmpCmpBr); + + // Set up the RVV loop preheader, i.e. calculate initial loop predicate, + // zero-extend MaxLen to 64-bits, determine the number of vector elements + // processed in each iteration, etc. + Builder.SetInsertPoint(RVVLoopPreheaderBlock); + + // At this point we know two things must be true: + // 1. Start <= End + // 2. ExtMaxLen <= 4096 due to the page checks. + // Therefore, we know that we can use a 64-bit induction variable that + // starts from 0 -> ExtMaxLen and it will not overflow. + auto *JumpToRVVLoop = BranchInst::Create(RVVLoopStartBlock); + Builder.Insert(JumpToRVVLoop); + + // Set up the first RVV loop block by creating the PHIs, doing the vector + // loads and comparing the vectors. 
+ Builder.SetInsertPoint(RVVLoopStartBlock); + auto *RVVIndexPhi = Builder.CreatePHI(I64Type, 2, "mismatch_vector_index"); + RVVIndexPhi->addIncoming(ExtStart, RVVLoopPreheaderBlock); + + // Calculate AVL by subtracting the vector loop index from the trip count + Value *AVL = Builder.CreateSub(ExtEnd, RVVIndexPhi, "avl", /*HasNUW=*/true, + /*HasNSW=*/true); + + VectorType *RVVLoadType = getBestVectorTypeForLoopIdiom(Builder.getContext()); + auto *VF = ConstantInt::get( + I32Type, RVVLoadType->getElementCount().getKnownMinValue()); + auto *IsScalable = ConstantInt::getBool( + Builder.getContext(), RVVLoadType->getElementCount().isScalable()); + + Value *RVL = + Builder.CreateIntrinsic(Intrinsic::experimental_get_vector_length, + {I64Type}, {AVL, VF, IsScalable}); + Value *GepOffset = RVVIndexPhi; + + Value *RVVLhsGep = Builder.CreateGEP(LoadType, PtrA, GepOffset); + if (GEPA->isInBounds()) + cast(RVVLhsGep)->setIsInBounds(true); + VectorType *TrueMaskTy = + VectorType::get(Builder.getInt1Ty(), RVVLoadType->getElementCount()); + Value *AllTrueMask = Constant::getAllOnesValue(TrueMaskTy); + Value *RVVLhsLoad = Builder.CreateIntrinsic( + Intrinsic::vp_load, {RVVLoadType, RVVLhsGep->getType()}, + {RVVLhsGep, AllTrueMask, RVL}, nullptr, "lhs.load"); + + Value *RVVRhsGep = Builder.CreateGEP(LoadType, PtrB, GepOffset); + if (GEPB->isInBounds()) + cast(RVVRhsGep)->setIsInBounds(true); + Value *RVVRhsLoad = Builder.CreateIntrinsic( + Intrinsic::vp_load, {RVVLoadType, RVVLhsGep->getType()}, + {RVVRhsGep, AllTrueMask, RVL}, nullptr, "rhs.load"); + + StringRef PredicateStr = CmpInst::getPredicateName(CmpInst::ICMP_NE); + auto *PredicateMDS = MDString::get(RVVLhsLoad->getContext(), PredicateStr); + Value *Pred = MetadataAsValue::get(RVVLhsLoad->getContext(), PredicateMDS); + Value *RVVMatchCmp = + Builder.CreateIntrinsic(Intrinsic::vp_icmp, {RVVLhsLoad->getType()}, + {RVVLhsLoad, RVVRhsLoad, Pred, AllTrueMask, RVL}, + nullptr, "mismatch.cmp"); + Value *CTZ = 
Builder.CreateIntrinsic( + Intrinsic::vp_cttz_elts, {ResType, RVVMatchCmp->getType()}, + {RVVMatchCmp, /*ZeroIsPoison=*/Builder.getInt1(true), AllTrueMask, RVL}); + // RISC-V refines/lowers the poison returned by vp.cttz.elts to -1. + Value *MismatchFound = + Builder.CreateICmpSGE(CTZ, ConstantInt::get(ResType, 0)); + auto *RVVEarlyExit = + BranchInst::Create(RVVLoopMismatchBlock, RVVLoopIncBlock, MismatchFound); + Builder.Insert(RVVEarlyExit); + + // Increment the index counter and calculate the predicate for the next + // iteration of the loop. We branch back to the start of the loop if there + // is at least one active lane. + Builder.SetInsertPoint(RVVLoopIncBlock); + Value *RVL64 = Builder.CreateZExt(RVL, I64Type); + Value *NewRVVIndexPhi = Builder.CreateAdd(RVVIndexPhi, RVL64, "", + /*HasNUW=*/true, /*HasNSW=*/true); + RVVIndexPhi->addIncoming(NewRVVIndexPhi, RVVLoopIncBlock); + Value *ExitCond = Builder.CreateICmpNE(NewRVVIndexPhi, ExtEnd); + auto *RVVLoopBranchBack = + BranchInst::Create(RVVLoopStartBlock, EndBlock, ExitCond); + Builder.Insert(RVVLoopBranchBack); + + // If we found a mismatch then we need to calculate which lane in the vector + // had a mismatch and add that on to the current loop index. + Builder.SetInsertPoint(RVVLoopMismatchBlock); + + // Add LCSSA phis for CTZ and RVVIndexPhi. + auto *CTZLCSSAPhi = Builder.CreatePHI(CTZ->getType(), 1, "ctz"); + CTZLCSSAPhi->addIncoming(CTZ, RVVLoopStartBlock); + auto *RVVIndexLCSSAPhi = + Builder.CreatePHI(RVVIndexPhi->getType(), 1, "mismatch_vector_index"); + RVVIndexLCSSAPhi->addIncoming(RVVIndexPhi, RVVLoopStartBlock); + + Value *CTZI64 = Builder.CreateZExt(CTZLCSSAPhi, I64Type); + Value *RVVLoopRes64 = Builder.CreateAdd(RVVIndexLCSSAPhi, CTZI64, "", + /*HasNUW=*/true, /*HasNSW=*/true); + Value *RVVLoopRes = Builder.CreateTrunc(RVVLoopRes64, ResType); + + Builder.Insert(BranchInst::Create(EndBlock)); + + // Generate code for scalar loop. 
+ Builder.SetInsertPoint(LoopPreHeaderBlock); + auto *StartIndexPhi = Builder.CreatePHI(ResType, 2, "mismatch_start_index"); + StartIndexPhi->addIncoming(Start, MemCheckBlock); + StartIndexPhi->addIncoming(Start, MinItCheckBlock); + Builder.Insert(BranchInst::Create(LoopStartBlock)); + + Builder.SetInsertPoint(LoopStartBlock); + auto *IndexPhi = Builder.CreatePHI(ResType, 2, "mismatch_index"); + IndexPhi->addIncoming(StartIndexPhi, LoopPreHeaderBlock); + + // Otherwise compare the values + // Load bytes from each array and compare them. + GepOffset = Builder.CreateZExt(IndexPhi, I64Type); + + Value *LhsGep = Builder.CreateGEP(LoadType, PtrA, GepOffset); + if (GEPA->isInBounds()) + cast<GetElementPtrInst>(LhsGep)->setIsInBounds(true); + Value *LhsLoad = Builder.CreateLoad(LoadType, LhsGep); + + Value *RhsGep = Builder.CreateGEP(LoadType, PtrB, GepOffset); + if (GEPB->isInBounds()) + cast<GetElementPtrInst>(RhsGep)->setIsInBounds(true); + Value *RhsLoad = Builder.CreateLoad(LoadType, RhsGep); + + Value *MatchCmp = Builder.CreateICmpEQ(LhsLoad, RhsLoad); + // If we have a mismatch then exit the loop ... + auto *MatchCmpBr = BranchInst::Create(LoopIncBlock, EndBlock, MatchCmp); + Builder.Insert(MatchCmpBr); + // Exit once the incremented index reaches MaxLen (index MaxLen is not read). + Builder.SetInsertPoint(LoopIncBlock); + Value *PhiInc = Builder.CreateAdd(IndexPhi, ConstantInt::get(ResType, 1), "", + /*HasNUW=*/Index->hasNoUnsignedWrap(), + /*HasNSW=*/Index->hasNoSignedWrap()); + IndexPhi->addIncoming(PhiInc, LoopIncBlock); + Value *IVCmp = Builder.CreateICmpEQ(PhiInc, MaxLen); + auto *IVCmpBr = BranchInst::Create(EndBlock, LoopStartBlock, IVCmp); + Builder.Insert(IVCmpBr); + + // In the end block we need to insert a PHI node to deal with these cases: + // 1. The length of the loop was zero, hence we jumped straight from + // MinItCheckBlock. + // 2. We didn't find a mismatch in the scalar loop, so we should return + // MaxLen. + // 3. 
We exited the scalar loop early due to a mismatch and need to return + // the index that we found. + // 4. We didn't find a mismatch in the RVV loop, so we should return + // MaxLen. + // 5. We exited the RVV loop early due to a mismatch and need to return + // the index that we found. + Builder.SetInsertPoint(EndBlock, EndBlock->getFirstInsertionPt()); + auto *ResPhi = Builder.CreatePHI(ResType, 4, "mismatch_result"); + ResPhi->addIncoming(MaxLen, LoopIncBlock); + ResPhi->addIncoming(IndexPhi, LoopStartBlock); + ResPhi->addIncoming(MaxLen, RVVLoopIncBlock); + ResPhi->addIncoming(RVVLoopRes, RVVLoopMismatchBlock); + + return Builder.CreateTrunc(ResPhi, ResType); +} + +void RISCVLoopIdiomRecognize::transformByteCompare( + GetElementPtrInst *GEPA, GetElementPtrInst *GEPB, PHINode *IndPhi, + Value *MaxLen, Instruction *Index, Value *Start, bool IncIdx, + BasicBlock *FoundBB, BasicBlock *EndBB) { + + // Insert the byte-compare expansion at the end of the preheader block + BasicBlock *Preheader = CurLoop->getLoopPreheader(); + BasicBlock *Header = CurLoop->getHeader(); + auto *PHBranch = cast<BranchInst>(Preheader->getTerminator()); + IRBuilder<> Builder(PHBranch); + Builder.SetCurrentDebugLocation(PHBranch->getDebugLoc()); + + // Increment the index if this was done before the loads in the loop. + if (IncIdx) + Start = Builder.CreateAdd(Start, ConstantInt::get(Start->getType(), 1)); + + Value *ByteCmpRes = + expandFindMismatch(Builder, GEPA, GEPB, Index, Start, MaxLen); + + // Replace uses of the index with the mismatch result. + assert(IndPhi->hasOneUse() && "Index phi node has more than one use!"); + Index->replaceAllUsesWith(ByteCmpRes); + + // If no mismatch was found, we can jump to the end block. Create a + // new basic block for the compare instruction. + auto *CmpBB = BasicBlock::Create(Preheader->getContext(), "byte.compare", + Preheader->getParent()); + CmpBB->moveBefore(EndBB); + + // Replace the branch in the preheader with an always-true conditional branch. 
+ // This ensures there is still a reference to the original loop. + Value *BrCnd = Builder.CreateICmpEQ(ConstantInt::get(Start->getType(), 1), + ConstantInt::get(Start->getType(), 1)); + Builder.CreateCondBr(BrCnd, CmpBB, Header); + PHBranch->eraseFromParent(); + + // Create the branch to either the end or found block depending on the value + // returned by the intrinsic. + Builder.SetInsertPoint(CmpBB); + Value *FoundCmp = Builder.CreateICmpEQ(ByteCmpRes, MaxLen); + Builder.CreateCondBr(FoundCmp, EndBB, FoundBB); + + auto FixSuccessorPhis = [&](BasicBlock *SuccBB) { + for (PHINode &PN : SuccBB->phis()) { + // At this point we've already replaced all uses of the result from the + // loop with ByteCmp. Look through the incoming values to find ByteCmp, + // meaning this is a Phi collecting the results of the byte compare. + bool ResPhi = any_of(PN.incoming_values(), + [=](Value *Op) { return Op == ByteCmpRes; }); + + // If any of the incoming values were ByteCmp, we need to also add + // it as an incoming value from CmpBB. + if (ResPhi) { + PN.addIncoming(ByteCmpRes, CmpBB); + } else { + // Otherwise, this is a Phi for different values. We should create + // a new incoming value from CmpBB matching the same value as from + // the old loop. + for (BasicBlock *BB : PN.blocks()) + if (CurLoop->contains(BB)) { + PN.addIncoming(PN.getIncomingValueForBlock(BB), CmpBB); + break; + } + } + } + }; + + // Ensure all Phis in the successors of CmpBB have an incoming value from it. + FixSuccessorPhis(EndBB); + FixSuccessorPhis(FoundBB); + + // The new CmpBB block isn't part of the loop, but will need to be added to + // the outer loop if there is one. + if (!CurLoop->isOutermost()) + CurLoop->getParentLoop()->addBasicBlockToLoop(CmpBB, LI); + + // Update the dominator tree with the new block. 
+ DT.addNewBlock(CmpBB, Preheader); +} diff --git a/llvm/lib/Target/RISCV/RISCVLoopIdiomRecognize.h b/llvm/lib/Target/RISCV/RISCVLoopIdiomRecognize.h new file mode 100644 index 0000000000000..b31f16817a8b9 --- /dev/null +++ b/llvm/lib/Target/RISCV/RISCVLoopIdiomRecognize.h @@ -0,0 +1,25 @@ +//===-------- RISCVLoopIdiomRecognize.h -------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_RISCV_RISCVLOOPIDIOMRECOGNIZE_H +#define LLVM_LIB_TARGET_RISCV_RISCVLOOPIDIOMRECOGNIZE_H + +#include "llvm/IR/PassManager.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" + +namespace llvm { + +struct RISCVLoopIdiomRecognizePass + : public PassInfoMixin { + RISCVLoopIdiomRecognizePass() = default; + + PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, LPMUpdater &U); +}; +} // namespace llvm +#endif // LLVM_LIB_TARGET_RISCV_RISCVLOOPIDIOMRECOGNIZE_H diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp index 5d598a275a008..3c06e62093bb7 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -13,6 +13,7 @@ #include "RISCVTargetMachine.h" #include "MCTargetDesc/RISCVBaseInfo.h" #include "RISCV.h" +#include "RISCVLoopIdiomRecognize.h" #include "RISCVMachineFunctionInfo.h" #include "RISCVTargetObjectFile.h" #include "RISCVTargetTransformInfo.h" @@ -33,6 +34,7 @@ #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/InitializePasses.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Passes/PassBuilder.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Transforms/IPO.h" @@ -584,3 
+586,21 @@ bool RISCVTargetMachine::parseMachineFunctionInfo( PFS.MF.getInfo()->initializeBaseYamlFields(YamlMFI); return false; } + +void RISCVTargetMachine::registerPassBuilderCallbacks( + PassBuilder &PB, bool PopulateClassToPassNames) { + PB.registerPipelineParsingCallback( + [](StringRef PassName, LoopPassManager &PM, + ArrayRef) { + if (PassName == "riscv-loop-idiom") { + PM.addPass(RISCVLoopIdiomRecognizePass()); + return true; + } + return false; + }); + + PB.registerLateLoopOptimizationsEPCallback( + [=](LoopPassManager &LPM, OptimizationLevel Level) { + LPM.addPass(RISCVLoopIdiomRecognizePass()); + }); +} diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.h b/llvm/lib/Target/RISCV/RISCVTargetMachine.h index 68dfb3c81f2fe..1f8ccc76987c7 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetMachine.h +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.h @@ -59,6 +59,9 @@ class RISCVTargetMachine : public LLVMTargetMachine { PerFunctionMIParsingState &PFS, SMDiagnostic &Error, SMRange &SourceRange) const override; + + void registerPassBuilderCallbacks(PassBuilder &PB, + bool PopulateClassToPassNames) override; }; } // namespace llvm diff --git a/llvm/test/Transforms/LoopIdiom/RISCV/byte-compare-index.ll b/llvm/test/Transforms/LoopIdiom/RISCV/byte-compare-index.ll new file mode 100644 index 0000000000000..047ed61119111 --- /dev/null +++ b/llvm/test/Transforms/LoopIdiom/RISCV/byte-compare-index.ll @@ -0,0 +1,1771 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 +; RUN: opt -riscv-disable-all-loop-idiom=false -passes=riscv-loop-idiom -mtriple=riscv64-unknown-linux-gnu -mattr=+v -S < %s | FileCheck %s +; RUN: opt -riscv-disable-all-loop-idiom=false -passes=riscv-loop-idiom -mtriple=riscv64-unknown-linux-gnu -riscv-loop-idiom-lmul=3 -mattr=+v -S < %s | FileCheck %s --check-prefix=LMUL8 +; RUN: opt -riscv-disable-all-loop-idiom=false -passes='loop(riscv-loop-idiom),simplifycfg' -mtriple=riscv64-unknown-linux-gnu -mattr=+v 
-S < %s | FileCheck %s --check-prefix=LOOP-DEL + +define i32 @compare_bytes_simple(ptr %a, ptr %b, i32 %len, i32 %n) { +; CHECK-LABEL: define i32 @compare_bytes_simple( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], 1 +; CHECK-NEXT: br label [[MISMATCH_MIN_IT_CHECK:%.*]] +; CHECK: mismatch_min_it_check: +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]] +; CHECK-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0:![0-9]+]] +; CHECK: mismatch_mem_check: +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[TMP4]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP5]], 12 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP7]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = lshr i64 [[TMP8]], 12 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP11]], 12 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 12 +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]] +; CHECK-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]] +; CHECK-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1:![0-9]+]] +; CHECK: mismatch_vector_loop_preheader: +; CHECK-NEXT: br label [[MISMATCH_VECTOR_LOOP:%.*]] +; CHECK: mismatch_vector_loop: +; CHECK-NEXT: [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP1]], 
[[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ] +; CHECK-NEXT: [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]] +; CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true) +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]] +; CHECK-NEXT: [[LHS_LOAD:%.*]] = call @llvm.vp.load.nxv16i8.p0(ptr [[TMP20]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP19]]) +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]] +; CHECK-NEXT: [[RHS_LOAD:%.*]] = call @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP19]]) +; CHECK-NEXT: [[MISMATCH_CMP:%.*]] = call @llvm.vp.icmp.nxv16i8( [[LHS_LOAD]], [[RHS_LOAD]], metadata !"ne", shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP19]]) +; CHECK-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1( [[MISMATCH_CMP]], i1 true, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP19]]) +; CHECK-NEXT: [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0 +; CHECK-NEXT: br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]] +; CHECK: mismatch_vector_loop_inc: +; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP19]] to i64 +; CHECK-NEXT: [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]] +; CHECK-NEXT: [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]] +; CHECK-NEXT: br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]] +; CHECK: mismatch_vector_loop_found: +; CHECK-NEXT: [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ] +; CHECK-NEXT: [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ] +; CHECK-NEXT: [[TMP26:%.*]] = zext 
i32 [[FIRST1]] to i64 +; CHECK-NEXT: [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]] +; CHECK-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32 +; CHECK-NEXT: br label [[MISMATCH_END]] +; CHECK: mismatch_loop_pre: +; CHECK-NEXT: [[MISMATCH_START_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_MEM_CHECK]] ], [ [[TMP0]], [[MISMATCH_MIN_IT_CHECK]] ] +; CHECK-NEXT: br label [[MISMATCH_LOOP:%.*]] +; CHECK: mismatch_loop: +; CHECK-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[MISMATCH_START_INDEX]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ] +; CHECK-NEXT: [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64 +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]] +; CHECK-NEXT: [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1 +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]] +; CHECK-NEXT: [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1 +; CHECK-NEXT: [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]] +; CHECK-NEXT: br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]] +; CHECK: mismatch_loop_inc: +; CHECK-NEXT: [[TMP35]] = add i32 [[MISMATCH_INDEX]], 1 +; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i32 [[MISMATCH_INDEX]], [[N]] +; CHECK-NEXT: br i1 [[TMP36]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]] +; CHECK: mismatch_end: +; CHECK-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ] +; CHECK-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]] +; CHECK: while.cond: +; CHECK-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ] +; CHECK-NEXT: [[INC:%.*]] = add i32 [[LEN_ADDR]], 1 +; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]] +; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; CHECK: 
while.body: +; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; CHECK-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]] +; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; CHECK: byte.compare: +; CHECK-NEXT: [[TMP39:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]] +; CHECK-NEXT: br i1 [[TMP39]], label [[WHILE_END]], label [[WHILE_END]] +; CHECK: while.end: +; CHECK-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[WHILE_COND]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ] +; CHECK-NEXT: ret i32 [[INC_LCSSA]] +; +; LMUL8-LABEL: define i32 @compare_bytes_simple( +; LMUL8-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; LMUL8-NEXT: entry: +; LMUL8-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], 1 +; LMUL8-NEXT: br label [[MISMATCH_MIN_IT_CHECK:%.*]] +; LMUL8: mismatch_min_it_check: +; LMUL8-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; LMUL8-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64 +; LMUL8-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]] +; LMUL8-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0:![0-9]+]] +; LMUL8: mismatch_mem_check: +; LMUL8-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]] +; LMUL8-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[TMP4]] to i64 +; LMUL8-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP5]], 12 +; LMUL8-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]] +; LMUL8-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP7]] to i64 +; LMUL8-NEXT: [[TMP9:%.*]] = lshr i64 [[TMP8]], 12 +; LMUL8-NEXT: [[TMP10:%.*]] = 
getelementptr i8, ptr [[B]], i64 [[TMP1]] +; LMUL8-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64 +; LMUL8-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP11]], 12 +; LMUL8-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]] +; LMUL8-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64 +; LMUL8-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 12 +; LMUL8-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]] +; LMUL8-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]] +; LMUL8-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]] +; LMUL8-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1:![0-9]+]] +; LMUL8: mismatch_vector_loop_preheader: +; LMUL8-NEXT: br label [[MISMATCH_VECTOR_LOOP:%.*]] +; LMUL8: mismatch_vector_loop: +; LMUL8-NEXT: [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ] +; LMUL8-NEXT: [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]] +; LMUL8-NEXT: [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 64, i1 true) +; LMUL8-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]] +; LMUL8-NEXT: [[LHS_LOAD:%.*]] = call @llvm.vp.load.nxv64i8.p0(ptr [[TMP20]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP19]]) +; LMUL8-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]] +; LMUL8-NEXT: [[RHS_LOAD:%.*]] = call @llvm.vp.load.nxv64i8.p0(ptr [[TMP21]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP19]]) +; LMUL8-NEXT: [[MISMATCH_CMP:%.*]] = call @llvm.vp.icmp.nxv64i8( [[LHS_LOAD]], [[RHS_LOAD]], metadata !"ne", shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP19]]) +; LMUL8-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv64i1( [[MISMATCH_CMP]], i1 true, 
shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP19]]) +; LMUL8-NEXT: [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0 +; LMUL8-NEXT: br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]] +; LMUL8: mismatch_vector_loop_inc: +; LMUL8-NEXT: [[TMP23:%.*]] = zext i32 [[TMP19]] to i64 +; LMUL8-NEXT: [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]] +; LMUL8-NEXT: [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]] +; LMUL8-NEXT: br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]] +; LMUL8: mismatch_vector_loop_found: +; LMUL8-NEXT: [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ] +; LMUL8-NEXT: [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ] +; LMUL8-NEXT: [[TMP26:%.*]] = zext i32 [[FIRST1]] to i64 +; LMUL8-NEXT: [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]] +; LMUL8-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32 +; LMUL8-NEXT: br label [[MISMATCH_END]] +; LMUL8: mismatch_loop_pre: +; LMUL8-NEXT: [[MISMATCH_START_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_MEM_CHECK]] ], [ [[TMP0]], [[MISMATCH_MIN_IT_CHECK]] ] +; LMUL8-NEXT: br label [[MISMATCH_LOOP:%.*]] +; LMUL8: mismatch_loop: +; LMUL8-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[MISMATCH_START_INDEX]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ] +; LMUL8-NEXT: [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64 +; LMUL8-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]] +; LMUL8-NEXT: [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1 +; LMUL8-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]] +; LMUL8-NEXT: [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1 +; LMUL8-NEXT: [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]] +; LMUL8-NEXT: br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]] +; LMUL8: mismatch_loop_inc: +; LMUL8-NEXT: [[TMP35]] 
= add i32 [[MISMATCH_INDEX]], 1 +; LMUL8-NEXT: [[TMP36:%.*]] = icmp eq i32 [[MISMATCH_INDEX]], [[N]] +; LMUL8-NEXT: br i1 [[TMP36]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]] +; LMUL8: mismatch_end: +; LMUL8-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ] +; LMUL8-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]] +; LMUL8: while.cond: +; LMUL8-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ] +; LMUL8-NEXT: [[INC:%.*]] = add i32 [[LEN_ADDR]], 1 +; LMUL8-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]] +; LMUL8-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; LMUL8: while.body: +; LMUL8-NEXT: [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64 +; LMUL8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]] +; LMUL8-NEXT: [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; LMUL8-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]] +; LMUL8-NEXT: [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; LMUL8-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]] +; LMUL8-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; LMUL8: byte.compare: +; LMUL8-NEXT: [[TMP39:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]] +; LMUL8-NEXT: br i1 [[TMP39]], label [[WHILE_END]], label [[WHILE_END]] +; LMUL8: while.end: +; LMUL8-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[WHILE_COND]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ] +; LMUL8-NEXT: ret i32 [[INC_LCSSA]] +; +; LOOP-DEL-LABEL: define i32 @compare_bytes_simple( +; LOOP-DEL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; LOOP-DEL-NEXT: 
entry: +; LOOP-DEL-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], 1 +; LOOP-DEL-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; LOOP-DEL-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64 +; LOOP-DEL-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]] +; LOOP-DEL-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0:![0-9]+]] +; LOOP-DEL: mismatch_mem_check: +; LOOP-DEL-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]] +; LOOP-DEL-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[TMP4]] to i64 +; LOOP-DEL-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP5]], 12 +; LOOP-DEL-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]] +; LOOP-DEL-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP7]] to i64 +; LOOP-DEL-NEXT: [[TMP9:%.*]] = lshr i64 [[TMP8]], 12 +; LOOP-DEL-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]] +; LOOP-DEL-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64 +; LOOP-DEL-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP11]], 12 +; LOOP-DEL-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]] +; LOOP-DEL-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64 +; LOOP-DEL-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 12 +; LOOP-DEL-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]] +; LOOP-DEL-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]] +; LOOP-DEL-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]] +; LOOP-DEL-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP:%.*]], !prof [[PROF1:![0-9]+]] +; LOOP-DEL: mismatch_vector_loop: +; LOOP-DEL-NEXT: [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ], [ [[TMP1]], [[MISMATCH_MEM_CHECK]] ] +; LOOP-DEL-NEXT: [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]] +; LOOP-DEL-NEXT: [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true) +; LOOP-DEL-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]] +; LOOP-DEL-NEXT: 
[[LHS_LOAD:%.*]] = call @llvm.vp.load.nxv16i8.p0(ptr [[TMP20]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP19]]) +; LOOP-DEL-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]] +; LOOP-DEL-NEXT: [[RHS_LOAD:%.*]] = call @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP19]]) +; LOOP-DEL-NEXT: [[MISMATCH_CMP:%.*]] = call @llvm.vp.icmp.nxv16i8( [[LHS_LOAD]], [[RHS_LOAD]], metadata !"ne", shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP19]]) +; LOOP-DEL-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1( [[MISMATCH_CMP]], i1 true, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP19]]) +; LOOP-DEL-NEXT: [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0 +; LOOP-DEL-NEXT: br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]] +; LOOP-DEL: mismatch_vector_loop_inc: +; LOOP-DEL-NEXT: [[TMP23:%.*]] = zext i32 [[TMP19]] to i64 +; LOOP-DEL-NEXT: [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]] +; LOOP-DEL-NEXT: [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]] +; LOOP-DEL-NEXT: br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[WHILE_END:%.*]] +; LOOP-DEL: mismatch_vector_loop_found: +; LOOP-DEL-NEXT: [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ] +; LOOP-DEL-NEXT: [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ] +; LOOP-DEL-NEXT: [[TMP26:%.*]] = zext i32 [[FIRST1]] to i64 +; LOOP-DEL-NEXT: [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]] +; LOOP-DEL-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32 +; LOOP-DEL-NEXT: br label [[WHILE_END]] +; LOOP-DEL: mismatch_loop_pre: +; LOOP-DEL-NEXT: [[MISMATCH_START_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_MEM_CHECK]] ], [ [[TMP0]], 
[[ENTRY:%.*]] ] +; LOOP-DEL-NEXT: br label [[MISMATCH_LOOP:%.*]] +; LOOP-DEL: mismatch_loop: +; LOOP-DEL-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[MISMATCH_START_INDEX]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ] +; LOOP-DEL-NEXT: [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64 +; LOOP-DEL-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]] +; LOOP-DEL-NEXT: [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1 +; LOOP-DEL-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]] +; LOOP-DEL-NEXT: [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1 +; LOOP-DEL-NEXT: [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]] +; LOOP-DEL-NEXT: br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[WHILE_END]] +; LOOP-DEL: mismatch_loop_inc: +; LOOP-DEL-NEXT: [[TMP35]] = add i32 [[MISMATCH_INDEX]], 1 +; LOOP-DEL-NEXT: [[TMP36:%.*]] = icmp eq i32 [[MISMATCH_INDEX]], [[N]] +; LOOP-DEL-NEXT: br i1 [[TMP36]], label [[WHILE_END]], label [[MISMATCH_LOOP]] +; LOOP-DEL: while.end: +; LOOP-DEL-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ] +; LOOP-DEL-NEXT: ret i32 [[MISMATCH_RESULT]] +; +entry: + br label %while.cond + +while.cond: + %len.addr = phi i32 [ %len, %entry ], [ %inc, %while.body ] + %inc = add i32 %len.addr, 1 + %cmp.not = icmp eq i32 %inc, %n + br i1 %cmp.not, label %while.end, label %while.body + +while.body: + %idxprom = zext i32 %inc to i64 + %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom + %0 = load i8, ptr %arrayidx + %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom + %1 = load i8, ptr %arrayidx2 + %cmp.not2 = icmp eq i8 %0, %1 + br i1 %cmp.not2, label %while.cond, label %while.end + +while.end: + %inc.lcssa = phi i32 [ %inc, %while.body ], [ %inc, %while.cond ] + ret i32 %inc.lcssa +} + +define i32 @compare_bytes_signed_wrap(ptr 
%a, ptr %b, i32 %len, i32 %n) { +; CHECK-LABEL: define i32 @compare_bytes_signed_wrap( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], 1 +; CHECK-NEXT: br label [[MISMATCH_MIN_IT_CHECK:%.*]] +; CHECK: mismatch_min_it_check: +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]] +; CHECK-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]] +; CHECK: mismatch_mem_check: +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[TMP4]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP5]], 12 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP7]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = lshr i64 [[TMP8]], 12 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP11]], 12 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 12 +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]] +; CHECK-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]] +; CHECK-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1]] +; CHECK: mismatch_vector_loop_preheader: +; CHECK-NEXT: br label [[MISMATCH_VECTOR_LOOP:%.*]] +; CHECK: mismatch_vector_loop: +; CHECK-NEXT: [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ] +; CHECK-NEXT: 
[[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]] +; CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true) +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]] +; CHECK-NEXT: [[LHS_LOAD:%.*]] = call @llvm.vp.load.nxv16i8.p0(ptr [[TMP20]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP19]]) +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]] +; CHECK-NEXT: [[RHS_LOAD:%.*]] = call @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP19]]) +; CHECK-NEXT: [[MISMATCH_CMP:%.*]] = call @llvm.vp.icmp.nxv16i8( [[LHS_LOAD]], [[RHS_LOAD]], metadata !"ne", shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP19]]) +; CHECK-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1( [[MISMATCH_CMP]], i1 true, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP19]]) +; CHECK-NEXT: [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0 +; CHECK-NEXT: br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]] +; CHECK: mismatch_vector_loop_inc: +; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP19]] to i64 +; CHECK-NEXT: [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]] +; CHECK-NEXT: [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]] +; CHECK-NEXT: br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]] +; CHECK: mismatch_vector_loop_found: +; CHECK-NEXT: [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ] +; CHECK-NEXT: [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ] +; CHECK-NEXT: [[TMP26:%.*]] = zext i32 [[FIRST1]] to i64 +; CHECK-NEXT: [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]] 
+; CHECK-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32 +; CHECK-NEXT: br label [[MISMATCH_END]] +; CHECK: mismatch_loop_pre: +; CHECK-NEXT: [[MISMATCH_START_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_MEM_CHECK]] ], [ [[TMP0]], [[MISMATCH_MIN_IT_CHECK]] ] +; CHECK-NEXT: br label [[MISMATCH_LOOP:%.*]] +; CHECK: mismatch_loop: +; CHECK-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[MISMATCH_START_INDEX]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ] +; CHECK-NEXT: [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64 +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]] +; CHECK-NEXT: [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1 +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]] +; CHECK-NEXT: [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1 +; CHECK-NEXT: [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]] +; CHECK-NEXT: br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]] +; CHECK: mismatch_loop_inc: +; CHECK-NEXT: [[TMP35]] = add nsw i32 [[MISMATCH_INDEX]], 1 +; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i32 [[MISMATCH_INDEX]], [[N]] +; CHECK-NEXT: br i1 [[TMP36]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]] +; CHECK: mismatch_end: +; CHECK-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ] +; CHECK-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]] +; CHECK: while.cond: +; CHECK-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ] +; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[LEN_ADDR]], 1 +; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]] +; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; CHECK: while.body: +; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64 +; CHECK-NEXT: 
[[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; CHECK-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]] +; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; CHECK: byte.compare: +; CHECK-NEXT: [[TMP39:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]] +; CHECK-NEXT: br i1 [[TMP39]], label [[WHILE_END]], label [[WHILE_END]] +; CHECK: while.end: +; CHECK-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[WHILE_COND]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ] +; CHECK-NEXT: ret i32 [[INC_LCSSA]] +; +; LMUL8-LABEL: define i32 @compare_bytes_signed_wrap( +; LMUL8-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; LMUL8-NEXT: entry: +; LMUL8-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], 1 +; LMUL8-NEXT: br label [[MISMATCH_MIN_IT_CHECK:%.*]] +; LMUL8: mismatch_min_it_check: +; LMUL8-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; LMUL8-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64 +; LMUL8-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]] +; LMUL8-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]] +; LMUL8: mismatch_mem_check: +; LMUL8-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]] +; LMUL8-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[TMP4]] to i64 +; LMUL8-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP5]], 12 +; LMUL8-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]] +; LMUL8-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP7]] to i64 +; LMUL8-NEXT: [[TMP9:%.*]] = lshr i64 [[TMP8]], 12 +; LMUL8-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]] +; LMUL8-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64 +; 
LMUL8-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP11]], 12 +; LMUL8-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]] +; LMUL8-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64 +; LMUL8-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 12 +; LMUL8-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]] +; LMUL8-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]] +; LMUL8-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]] +; LMUL8-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1]] +; LMUL8: mismatch_vector_loop_preheader: +; LMUL8-NEXT: br label [[MISMATCH_VECTOR_LOOP:%.*]] +; LMUL8: mismatch_vector_loop: +; LMUL8-NEXT: [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ] +; LMUL8-NEXT: [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]] +; LMUL8-NEXT: [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 64, i1 true) +; LMUL8-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]] +; LMUL8-NEXT: [[LHS_LOAD:%.*]] = call @llvm.vp.load.nxv64i8.p0(ptr [[TMP20]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP19]]) +; LMUL8-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]] +; LMUL8-NEXT: [[RHS_LOAD:%.*]] = call @llvm.vp.load.nxv64i8.p0(ptr [[TMP21]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP19]]) +; LMUL8-NEXT: [[MISMATCH_CMP:%.*]] = call @llvm.vp.icmp.nxv64i8( [[LHS_LOAD]], [[RHS_LOAD]], metadata !"ne", shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP19]]) +; LMUL8-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv64i1( [[MISMATCH_CMP]], i1 true, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP19]]) +; LMUL8-NEXT: 
[[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0 +; LMUL8-NEXT: br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]] +; LMUL8: mismatch_vector_loop_inc: +; LMUL8-NEXT: [[TMP23:%.*]] = zext i32 [[TMP19]] to i64 +; LMUL8-NEXT: [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]] +; LMUL8-NEXT: [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]] +; LMUL8-NEXT: br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]] +; LMUL8: mismatch_vector_loop_found: +; LMUL8-NEXT: [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ] +; LMUL8-NEXT: [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ] +; LMUL8-NEXT: [[TMP26:%.*]] = zext i32 [[FIRST1]] to i64 +; LMUL8-NEXT: [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]] +; LMUL8-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32 +; LMUL8-NEXT: br label [[MISMATCH_END]] +; LMUL8: mismatch_loop_pre: +; LMUL8-NEXT: [[MISMATCH_START_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_MEM_CHECK]] ], [ [[TMP0]], [[MISMATCH_MIN_IT_CHECK]] ] +; LMUL8-NEXT: br label [[MISMATCH_LOOP:%.*]] +; LMUL8: mismatch_loop: +; LMUL8-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[MISMATCH_START_INDEX]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ] +; LMUL8-NEXT: [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64 +; LMUL8-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]] +; LMUL8-NEXT: [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1 +; LMUL8-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]] +; LMUL8-NEXT: [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1 +; LMUL8-NEXT: [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]] +; LMUL8-NEXT: br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]] +; LMUL8: mismatch_loop_inc: +; LMUL8-NEXT: [[TMP35]] = add nsw i32 [[MISMATCH_INDEX]], 1 +; LMUL8-NEXT: [[TMP36:%.*]] = icmp eq i32 [[MISMATCH_INDEX]], [[N]] +; 
LMUL8-NEXT: br i1 [[TMP36]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]] +; LMUL8: mismatch_end: +; LMUL8-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ] +; LMUL8-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]] +; LMUL8: while.cond: +; LMUL8-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ] +; LMUL8-NEXT: [[INC:%.*]] = add nsw i32 [[LEN_ADDR]], 1 +; LMUL8-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]] +; LMUL8-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; LMUL8: while.body: +; LMUL8-NEXT: [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64 +; LMUL8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]] +; LMUL8-NEXT: [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; LMUL8-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]] +; LMUL8-NEXT: [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; LMUL8-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]] +; LMUL8-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; LMUL8: byte.compare: +; LMUL8-NEXT: [[TMP39:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]] +; LMUL8-NEXT: br i1 [[TMP39]], label [[WHILE_END]], label [[WHILE_END]] +; LMUL8: while.end: +; LMUL8-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[WHILE_COND]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ] +; LMUL8-NEXT: ret i32 [[INC_LCSSA]] +; +; LOOP-DEL-LABEL: define i32 @compare_bytes_signed_wrap( +; LOOP-DEL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; LOOP-DEL-NEXT: entry: +; LOOP-DEL-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], 1 +; LOOP-DEL-NEXT: [[TMP1:%.*]] = zext i32 
[[TMP0]] to i64 +; LOOP-DEL-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64 +; LOOP-DEL-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]] +; LOOP-DEL-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]] +; LOOP-DEL: mismatch_mem_check: +; LOOP-DEL-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]] +; LOOP-DEL-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[TMP4]] to i64 +; LOOP-DEL-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP5]], 12 +; LOOP-DEL-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]] +; LOOP-DEL-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP7]] to i64 +; LOOP-DEL-NEXT: [[TMP9:%.*]] = lshr i64 [[TMP8]], 12 +; LOOP-DEL-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]] +; LOOP-DEL-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64 +; LOOP-DEL-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP11]], 12 +; LOOP-DEL-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]] +; LOOP-DEL-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64 +; LOOP-DEL-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 12 +; LOOP-DEL-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]] +; LOOP-DEL-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]] +; LOOP-DEL-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]] +; LOOP-DEL-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP:%.*]], !prof [[PROF1]] +; LOOP-DEL: mismatch_vector_loop: +; LOOP-DEL-NEXT: [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ], [ [[TMP1]], [[MISMATCH_MEM_CHECK]] ] +; LOOP-DEL-NEXT: [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]] +; LOOP-DEL-NEXT: [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true) +; LOOP-DEL-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]] +; LOOP-DEL-NEXT: [[LHS_LOAD:%.*]] = call @llvm.vp.load.nxv16i8.p0(ptr [[TMP20]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, 
zeroinitializer), i32 [[TMP19]]) +; LOOP-DEL-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]] +; LOOP-DEL-NEXT: [[RHS_LOAD:%.*]] = call @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP19]]) +; LOOP-DEL-NEXT: [[MISMATCH_CMP:%.*]] = call @llvm.vp.icmp.nxv16i8( [[LHS_LOAD]], [[RHS_LOAD]], metadata !"ne", shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP19]]) +; LOOP-DEL-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1( [[MISMATCH_CMP]], i1 true, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP19]]) +; LOOP-DEL-NEXT: [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0 +; LOOP-DEL-NEXT: br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]] +; LOOP-DEL: mismatch_vector_loop_inc: +; LOOP-DEL-NEXT: [[TMP23:%.*]] = zext i32 [[TMP19]] to i64 +; LOOP-DEL-NEXT: [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]] +; LOOP-DEL-NEXT: [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]] +; LOOP-DEL-NEXT: br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[WHILE_END:%.*]] +; LOOP-DEL: mismatch_vector_loop_found: +; LOOP-DEL-NEXT: [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ] +; LOOP-DEL-NEXT: [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ] +; LOOP-DEL-NEXT: [[TMP26:%.*]] = zext i32 [[FIRST1]] to i64 +; LOOP-DEL-NEXT: [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]] +; LOOP-DEL-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32 +; LOOP-DEL-NEXT: br label [[WHILE_END]] +; LOOP-DEL: mismatch_loop_pre: +; LOOP-DEL-NEXT: [[MISMATCH_START_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_MEM_CHECK]] ], [ [[TMP0]], [[ENTRY:%.*]] ] +; LOOP-DEL-NEXT: br label [[MISMATCH_LOOP:%.*]] +; LOOP-DEL: mismatch_loop: +; LOOP-DEL-NEXT: [[MISMATCH_INDEX:%.*]] 
= phi i32 [ [[MISMATCH_START_INDEX]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ] +; LOOP-DEL-NEXT: [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64 +; LOOP-DEL-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]] +; LOOP-DEL-NEXT: [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1 +; LOOP-DEL-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]] +; LOOP-DEL-NEXT: [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1 +; LOOP-DEL-NEXT: [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]] +; LOOP-DEL-NEXT: br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[WHILE_END]] +; LOOP-DEL: mismatch_loop_inc: +; LOOP-DEL-NEXT: [[TMP35]] = add nsw i32 [[MISMATCH_INDEX]], 1 +; LOOP-DEL-NEXT: [[TMP36:%.*]] = icmp eq i32 [[MISMATCH_INDEX]], [[N]] +; LOOP-DEL-NEXT: br i1 [[TMP36]], label [[WHILE_END]], label [[MISMATCH_LOOP]] +; LOOP-DEL: while.end: +; LOOP-DEL-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ] +; LOOP-DEL-NEXT: ret i32 [[MISMATCH_RESULT]] +; +; NO-TRANSFORM-LABEL: define i32 @compare_bytes_signed_wrap( +; NO-TRANSFORM-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) { +; NO-TRANSFORM-NEXT: entry: +; NO-TRANSFORM-NEXT: br label [[WHILE_COND:%.*]] +; NO-TRANSFORM: while.cond: +; NO-TRANSFORM-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; NO-TRANSFORM-NEXT: [[INC]] = add nsw i32 [[LEN_ADDR]], 1 +; NO-TRANSFORM-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]] +; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; NO-TRANSFORM: while.body: +; NO-TRANSFORM-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; NO-TRANSFORM-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]] +; NO-TRANSFORM-NEXT: [[TMP0:%.*]] = load i8, ptr 
[[ARRAYIDX]], align 1 +; NO-TRANSFORM-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]] +; NO-TRANSFORM-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; NO-TRANSFORM-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; NO-TRANSFORM: while.end: +; NO-TRANSFORM-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ] +; NO-TRANSFORM-NEXT: ret i32 [[INC_LCSSA]] +entry: + br label %while.cond + +while.cond: + %len.addr = phi i32 [ %len, %entry ], [ %inc, %while.body ] + %inc = add nsw i32 %len.addr, 1 + %cmp.not = icmp eq i32 %inc, %n + br i1 %cmp.not, label %while.end, label %while.body + +while.body: + %idxprom = zext i32 %inc to i64 + %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom + %0 = load i8, ptr %arrayidx + %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom + %1 = load i8, ptr %arrayidx2 + %cmp.not2 = icmp eq i8 %0, %1 + br i1 %cmp.not2, label %while.cond, label %while.end + +while.end: + %inc.lcssa = phi i32 [ %inc, %while.body ], [ %inc, %while.cond ] + ret i32 %inc.lcssa +} + + +define i32 @compare_bytes_simple_end_ne_found(ptr %a, ptr %b, ptr %c, ptr %d, i32 %len, i32 %n) { +; CHECK-LABEL: define i32 @compare_bytes_simple_end_ne_found( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], 1 +; CHECK-NEXT: br label [[MISMATCH_MIN_IT_CHECK:%.*]] +; CHECK: mismatch_min_it_check: +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]] +; CHECK-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]] +; CHECK: mismatch_mem_check: +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 
[[TMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[TMP4]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP5]], 12 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP7]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = lshr i64 [[TMP8]], 12 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP11]], 12 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 12 +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]] +; CHECK-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]] +; CHECK-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1]] +; CHECK: mismatch_vector_loop_preheader: +; CHECK-NEXT: br label [[MISMATCH_VECTOR_LOOP:%.*]] +; CHECK: mismatch_vector_loop: +; CHECK-NEXT: [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ] +; CHECK-NEXT: [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]] +; CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true) +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]] +; CHECK-NEXT: [[LHS_LOAD:%.*]] = call @llvm.vp.load.nxv16i8.p0(ptr [[TMP20]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP19]]) +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]] +; CHECK-NEXT: [[RHS_LOAD:%.*]] = call @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), 
i32 [[TMP19]]) +; CHECK-NEXT: [[MISMATCH_CMP:%.*]] = call @llvm.vp.icmp.nxv16i8( [[LHS_LOAD]], [[RHS_LOAD]], metadata !"ne", shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP19]]) +; CHECK-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1( [[MISMATCH_CMP]], i1 true, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP19]]) +; CHECK-NEXT: [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0 +; CHECK-NEXT: br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]] +; CHECK: mismatch_vector_loop_inc: +; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP19]] to i64 +; CHECK-NEXT: [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]] +; CHECK-NEXT: [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]] +; CHECK-NEXT: br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]] +; CHECK: mismatch_vector_loop_found: +; CHECK-NEXT: [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ] +; CHECK-NEXT: [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ] +; CHECK-NEXT: [[TMP26:%.*]] = zext i32 [[FIRST1]] to i64 +; CHECK-NEXT: [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]] +; CHECK-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32 +; CHECK-NEXT: br label [[MISMATCH_END]] +; CHECK: mismatch_loop_pre: +; CHECK-NEXT: [[MISMATCH_START_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_MEM_CHECK]] ], [ [[TMP0]], [[MISMATCH_MIN_IT_CHECK]] ] +; CHECK-NEXT: br label [[MISMATCH_LOOP:%.*]] +; CHECK: mismatch_loop: +; CHECK-NEXT: [[MISMATCH_INDEX3:%.*]] = phi i32 [ [[MISMATCH_START_INDEX]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ] +; CHECK-NEXT: [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX3]] to i64 +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]] +; CHECK-NEXT: [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1 +; CHECK-NEXT: 
[[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]] +; CHECK-NEXT: [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1 +; CHECK-NEXT: [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]] +; CHECK-NEXT: br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]] +; CHECK: mismatch_loop_inc: +; CHECK-NEXT: [[TMP35]] = add i32 [[MISMATCH_INDEX3]], 1 +; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i32 [[MISMATCH_INDEX3]], [[N]] +; CHECK-NEXT: br i1 [[TMP36]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]] +; CHECK: mismatch_end: +; CHECK-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX3]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ] +; CHECK-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]] +; CHECK: while.cond: +; CHECK-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ] +; CHECK-NEXT: [[INC:%.*]] = add i32 [[LEN_ADDR]], 1 +; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]] +; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; CHECK: while.body: +; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; CHECK-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]] +; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_FOUND:%.*]] +; CHECK: while.found: +; CHECK-NEXT: [[MISMATCH_INDEX1:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ] +; CHECK-NEXT: [[FOUND_PTR:%.*]] = phi ptr [ [[C]], [[WHILE_BODY]] ], [ [[C]], [[BYTE_COMPARE]] ] +; CHECK-NEXT: br 
label [[END:%.*]] +; CHECK: byte.compare: +; CHECK-NEXT: [[TMP39:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]] +; CHECK-NEXT: br i1 [[TMP39]], label [[WHILE_END]], label [[WHILE_FOUND]] +; CHECK: while.end: +; CHECK-NEXT: [[MISMATCH_INDEX2:%.*]] = phi i32 [ [[N]], [[WHILE_COND]] ], [ [[N]], [[BYTE_COMPARE]] ] +; CHECK-NEXT: [[END_PTR:%.*]] = phi ptr [ [[D]], [[WHILE_COND]] ], [ [[D]], [[BYTE_COMPARE]] ] +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[MISMATCH_INDEX1]], [[WHILE_FOUND]] ], [ [[MISMATCH_INDEX2]], [[WHILE_END]] ] +; CHECK-NEXT: [[STORE_PTR:%.*]] = phi ptr [ [[END_PTR]], [[WHILE_END]] ], [ [[FOUND_PTR]], [[WHILE_FOUND]] ] +; CHECK-NEXT: store i32 [[MISMATCH_INDEX]], ptr [[STORE_PTR]], align 4 +; CHECK-NEXT: ret i32 [[MISMATCH_INDEX]] +; +; LMUL8-LABEL: define i32 @compare_bytes_simple_end_ne_found( +; LMUL8-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; LMUL8-NEXT: entry: +; LMUL8-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], 1 +; LMUL8-NEXT: br label [[MISMATCH_MIN_IT_CHECK:%.*]] +; LMUL8: mismatch_min_it_check: +; LMUL8-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; LMUL8-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64 +; LMUL8-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]] +; LMUL8-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]] +; LMUL8: mismatch_mem_check: +; LMUL8-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]] +; LMUL8-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[TMP4]] to i64 +; LMUL8-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP5]], 12 +; LMUL8-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]] +; LMUL8-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP7]] to i64 +; LMUL8-NEXT: [[TMP9:%.*]] = lshr i64 [[TMP8]], 12 +; LMUL8-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]] +; LMUL8-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64 +; LMUL8-NEXT: [[TMP12:%.*]] = lshr 
i64 [[TMP11]], 12 +; LMUL8-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]] +; LMUL8-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64 +; LMUL8-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 12 +; LMUL8-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]] +; LMUL8-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]] +; LMUL8-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]] +; LMUL8-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1]] +; LMUL8: mismatch_vector_loop_preheader: +; LMUL8-NEXT: br label [[MISMATCH_VECTOR_LOOP:%.*]] +; LMUL8: mismatch_vector_loop: +; LMUL8-NEXT: [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ] +; LMUL8-NEXT: [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]] +; LMUL8-NEXT: [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 64, i1 true) +; LMUL8-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]] +; LMUL8-NEXT: [[LHS_LOAD:%.*]] = call @llvm.vp.load.nxv64i8.p0(ptr [[TMP20]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP19]]) +; LMUL8-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]] +; LMUL8-NEXT: [[RHS_LOAD:%.*]] = call @llvm.vp.load.nxv64i8.p0(ptr [[TMP21]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP19]]) +; LMUL8-NEXT: [[MISMATCH_CMP:%.*]] = call @llvm.vp.icmp.nxv64i8( [[LHS_LOAD]], [[RHS_LOAD]], metadata !"ne", shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP19]]) +; LMUL8-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv64i1( [[MISMATCH_CMP]], i1 true, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP19]]) +; LMUL8-NEXT: [[TMP22:%.*]] = icmp sge i32 
[[FIRST]], 0 +; LMUL8-NEXT: br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]] +; LMUL8: mismatch_vector_loop_inc: +; LMUL8-NEXT: [[TMP23:%.*]] = zext i32 [[TMP19]] to i64 +; LMUL8-NEXT: [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]] +; LMUL8-NEXT: [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]] +; LMUL8-NEXT: br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]] +; LMUL8: mismatch_vector_loop_found: +; LMUL8-NEXT: [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ] +; LMUL8-NEXT: [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ] +; LMUL8-NEXT: [[TMP26:%.*]] = zext i32 [[FIRST1]] to i64 +; LMUL8-NEXT: [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]] +; LMUL8-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32 +; LMUL8-NEXT: br label [[MISMATCH_END]] +; LMUL8: mismatch_loop_pre: +; LMUL8-NEXT: [[MISMATCH_START_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_MEM_CHECK]] ], [ [[TMP0]], [[MISMATCH_MIN_IT_CHECK]] ] +; LMUL8-NEXT: br label [[MISMATCH_LOOP:%.*]] +; LMUL8: mismatch_loop: +; LMUL8-NEXT: [[MISMATCH_INDEX3:%.*]] = phi i32 [ [[MISMATCH_START_INDEX]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ] +; LMUL8-NEXT: [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX3]] to i64 +; LMUL8-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]] +; LMUL8-NEXT: [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1 +; LMUL8-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]] +; LMUL8-NEXT: [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1 +; LMUL8-NEXT: [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]] +; LMUL8-NEXT: br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]] +; LMUL8: mismatch_loop_inc: +; LMUL8-NEXT: [[TMP35]] = add i32 [[MISMATCH_INDEX3]], 1 +; LMUL8-NEXT: [[TMP36:%.*]] = icmp eq i32 [[MISMATCH_INDEX3]], [[N]] +; LMUL8-NEXT: br i1 [[TMP36]], 
label [[MISMATCH_END]], label [[MISMATCH_LOOP]] +; LMUL8: mismatch_end: +; LMUL8-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX3]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ] +; LMUL8-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]] +; LMUL8: while.cond: +; LMUL8-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ] +; LMUL8-NEXT: [[INC:%.*]] = add i32 [[LEN_ADDR]], 1 +; LMUL8-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]] +; LMUL8-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; LMUL8: while.body: +; LMUL8-NEXT: [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64 +; LMUL8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]] +; LMUL8-NEXT: [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; LMUL8-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]] +; LMUL8-NEXT: [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; LMUL8-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]] +; LMUL8-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_FOUND:%.*]] +; LMUL8: while.found: +; LMUL8-NEXT: [[MISMATCH_INDEX1:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ] +; LMUL8-NEXT: [[FOUND_PTR:%.*]] = phi ptr [ [[C]], [[WHILE_BODY]] ], [ [[C]], [[BYTE_COMPARE]] ] +; LMUL8-NEXT: br label [[END:%.*]] +; LMUL8: byte.compare: +; LMUL8-NEXT: [[TMP39:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]] +; LMUL8-NEXT: br i1 [[TMP39]], label [[WHILE_END]], label [[WHILE_FOUND]] +; LMUL8: while.end: +; LMUL8-NEXT: [[MISMATCH_INDEX2:%.*]] = phi i32 [ [[N]], [[WHILE_COND]] ], [ [[N]], [[BYTE_COMPARE]] ] +; LMUL8-NEXT: [[END_PTR:%.*]] = phi ptr [ [[D]], [[WHILE_COND]] ], [ [[D]], [[BYTE_COMPARE]] ] +; LMUL8-NEXT: br label [[END]] +; LMUL8: end: +; 
LMUL8-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[MISMATCH_INDEX1]], [[WHILE_FOUND]] ], [ [[MISMATCH_INDEX2]], [[WHILE_END]] ] +; LMUL8-NEXT: [[STORE_PTR:%.*]] = phi ptr [ [[END_PTR]], [[WHILE_END]] ], [ [[FOUND_PTR]], [[WHILE_FOUND]] ] +; LMUL8-NEXT: store i32 [[MISMATCH_INDEX]], ptr [[STORE_PTR]], align 4 +; LMUL8-NEXT: ret i32 [[MISMATCH_INDEX]] +; +; LOOP-DEL-LABEL: define i32 @compare_bytes_simple_end_ne_found( +; LOOP-DEL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; LOOP-DEL-NEXT: entry: +; LOOP-DEL-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], 1 +; LOOP-DEL-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; LOOP-DEL-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64 +; LOOP-DEL-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]] +; LOOP-DEL-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]] +; LOOP-DEL: mismatch_mem_check: +; LOOP-DEL-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]] +; LOOP-DEL-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[TMP4]] to i64 +; LOOP-DEL-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP5]], 12 +; LOOP-DEL-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]] +; LOOP-DEL-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP7]] to i64 +; LOOP-DEL-NEXT: [[TMP9:%.*]] = lshr i64 [[TMP8]], 12 +; LOOP-DEL-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]] +; LOOP-DEL-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64 +; LOOP-DEL-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP11]], 12 +; LOOP-DEL-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]] +; LOOP-DEL-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64 +; LOOP-DEL-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 12 +; LOOP-DEL-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]] +; LOOP-DEL-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]] +; LOOP-DEL-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]] +; LOOP-DEL-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label 
[[MISMATCH_VECTOR_LOOP:%.*]], !prof [[PROF1]] +; LOOP-DEL: mismatch_vector_loop: +; LOOP-DEL-NEXT: [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ], [ [[TMP1]], [[MISMATCH_MEM_CHECK]] ] +; LOOP-DEL-NEXT: [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]] +; LOOP-DEL-NEXT: [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true) +; LOOP-DEL-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]] +; LOOP-DEL-NEXT: [[LHS_LOAD:%.*]] = call @llvm.vp.load.nxv16i8.p0(ptr [[TMP20]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP19]]) +; LOOP-DEL-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]] +; LOOP-DEL-NEXT: [[RHS_LOAD:%.*]] = call @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP19]]) +; LOOP-DEL-NEXT: [[MISMATCH_CMP:%.*]] = call @llvm.vp.icmp.nxv16i8( [[LHS_LOAD]], [[RHS_LOAD]], metadata !"ne", shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP19]]) +; LOOP-DEL-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1( [[MISMATCH_CMP]], i1 true, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP19]]) +; LOOP-DEL-NEXT: [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0 +; LOOP-DEL-NEXT: br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]] +; LOOP-DEL: mismatch_vector_loop_inc: +; LOOP-DEL-NEXT: [[TMP23:%.*]] = zext i32 [[TMP19]] to i64 +; LOOP-DEL-NEXT: [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]] +; LOOP-DEL-NEXT: [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]] +; LOOP-DEL-NEXT: br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[BYTE_COMPARE:%.*]] +; LOOP-DEL: mismatch_vector_loop_found: +; LOOP-DEL-NEXT: [[FIRST1:%.*]] = phi i32 
[ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ] +; LOOP-DEL-NEXT: [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ] +; LOOP-DEL-NEXT: [[TMP26:%.*]] = zext i32 [[FIRST1]] to i64 +; LOOP-DEL-NEXT: [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP26]] +; LOOP-DEL-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32 +; LOOP-DEL-NEXT: br label [[BYTE_COMPARE]] +; LOOP-DEL: mismatch_loop_pre: +; LOOP-DEL-NEXT: [[MISMATCH_START_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_MEM_CHECK]] ], [ [[TMP0]], [[ENTRY:%.*]] ] +; LOOP-DEL-NEXT: br label [[MISMATCH_LOOP:%.*]] +; LOOP-DEL: mismatch_loop: +; LOOP-DEL-NEXT: [[MISMATCH_INDEX3:%.*]] = phi i32 [ [[MISMATCH_START_INDEX]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ] +; LOOP-DEL-NEXT: [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX3]] to i64 +; LOOP-DEL-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]] +; LOOP-DEL-NEXT: [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1 +; LOOP-DEL-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]] +; LOOP-DEL-NEXT: [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1 +; LOOP-DEL-NEXT: [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]] +; LOOP-DEL-NEXT: br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[BYTE_COMPARE]] +; LOOP-DEL: mismatch_loop_inc: +; LOOP-DEL-NEXT: [[TMP35]] = add i32 [[MISMATCH_INDEX3]], 1 +; LOOP-DEL-NEXT: [[TMP36:%.*]] = icmp eq i32 [[MISMATCH_INDEX3]], [[N]] +; LOOP-DEL-NEXT: br i1 [[TMP36]], label [[BYTE_COMPARE]], label [[MISMATCH_LOOP]] +; LOOP-DEL: byte.compare: +; LOOP-DEL-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX3]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ] +; LOOP-DEL-NEXT: [[TMP37:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]] +; LOOP-DEL-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[TMP37]], i32 [[N]], i32 [[MISMATCH_RESULT]] +; LOOP-DEL-NEXT: 
[[SPEC_SELECT4:%.*]] = select i1 [[TMP37]], ptr [[D]], ptr [[C]] +; LOOP-DEL-NEXT: store i32 [[SPEC_SELECT]], ptr [[SPEC_SELECT4]], align 4 +; LOOP-DEL-NEXT: ret i32 [[SPEC_SELECT]] +; +; NO-TRANSFORM-LABEL: define i32 @compare_bytes_simple_end_ne_found( +; NO-TRANSFORM-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) { +; NO-TRANSFORM-NEXT: entry: +; NO-TRANSFORM-NEXT: br label [[WHILE_COND:%.*]] +; NO-TRANSFORM: while.cond: +; NO-TRANSFORM-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; NO-TRANSFORM-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1 +; NO-TRANSFORM-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]] +; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; NO-TRANSFORM: while.body: +; NO-TRANSFORM-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; NO-TRANSFORM-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]] +; NO-TRANSFORM-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; NO-TRANSFORM-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]] +; NO-TRANSFORM-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; NO-TRANSFORM-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_FOUND:%.*]] +; NO-TRANSFORM: while.found: +; NO-TRANSFORM-NEXT: [[MISMATCH_INDEX1:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ] +; NO-TRANSFORM-NEXT: [[FOUND_PTR:%.*]] = phi ptr [ [[C]], [[WHILE_BODY]] ] +; NO-TRANSFORM-NEXT: br label [[END:%.*]] +; NO-TRANSFORM: while.end: +; NO-TRANSFORM-NEXT: [[MISMATCH_INDEX2:%.*]] = phi i32 [ [[N]], [[WHILE_COND]] ] +; NO-TRANSFORM-NEXT: [[END_PTR:%.*]] = phi ptr [ [[D]], [[WHILE_COND]] ] +; NO-TRANSFORM-NEXT: br label [[END]] +; NO-TRANSFORM: end: +; NO-TRANSFORM-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[MISMATCH_INDEX1]], [[WHILE_FOUND]] ], [ [[MISMATCH_INDEX2]], 
[[WHILE_END]] ] +; NO-TRANSFORM-NEXT: [[STORE_PTR:%.*]] = phi ptr [ [[END_PTR]], [[WHILE_END]] ], [ [[FOUND_PTR]], [[WHILE_FOUND]] ] +; NO-TRANSFORM-NEXT: store i32 [[MISMATCH_INDEX]], ptr [[STORE_PTR]], align 4 +; NO-TRANSFORM-NEXT: ret i32 [[MISMATCH_INDEX]] +entry: + br label %while.cond + +while.cond: + %len.addr = phi i32 [ %len, %entry ], [ %inc, %while.body ] + %inc = add i32 %len.addr, 1 + %cmp.not = icmp eq i32 %inc, %n + br i1 %cmp.not, label %while.end, label %while.body + +while.body: + %idxprom = zext i32 %inc to i64 + %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom + %0 = load i8, ptr %arrayidx + %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom + %1 = load i8, ptr %arrayidx2 + %cmp.not2 = icmp eq i8 %0, %1 + br i1 %cmp.not2, label %while.cond, label %while.found + +while.found: + %mismatch_index1 = phi i32 [ %inc, %while.body ] + %found_ptr = phi ptr [ %c, %while.body ] + br label %end + +while.end: + %mismatch_index2 = phi i32 [ %n, %while.cond ] + %end_ptr = phi ptr [ %d, %while.cond ] + br label %end + +end: + %mismatch_index = phi i32 [ %mismatch_index1, %while.found ], [ %mismatch_index2, %while.end ] + %store_ptr = phi ptr [ %end_ptr, %while.end ], [ %found_ptr, %while.found ] + store i32 %mismatch_index, ptr %store_ptr + ret i32 %mismatch_index +} + + + +define i32 @compare_bytes_extra_cmp(ptr %a, ptr %b, i32 %len, i32 %n, i32 %x) { +; CHECK-LABEL: define i32 @compare_bytes_extra_cmp( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]], i32 [[X:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP_X:%.*]] = icmp ult i32 [[N]], [[X]] +; CHECK-NEXT: br i1 [[CMP_X]], label [[PH:%.*]], label [[WHILE_END:%.*]] +; CHECK: ph: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], 1 +; CHECK-NEXT: br label [[MISMATCH_MIN_IT_CHECK:%.*]] +; CHECK: mismatch_min_it_check: +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = 
icmp ule i32 [[TMP0]], [[N]] +; CHECK-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]] +; CHECK: mismatch_mem_check: +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[TMP4]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP5]], 12 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP7]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = lshr i64 [[TMP8]], 12 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP11]], 12 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 12 +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]] +; CHECK-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]] +; CHECK-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1]] +; CHECK: mismatch_vector_loop_preheader: +; CHECK-NEXT: br label [[MISMATCH_VECTOR_LOOP:%.*]] +; CHECK: mismatch_vector_loop: +; CHECK-NEXT: [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ] +; CHECK-NEXT: [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]] +; CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true) +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]] +; CHECK-NEXT: [[LHS_LOAD:%.*]] = call @llvm.vp.load.nxv16i8.p0(ptr [[TMP20]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP19]]) +; CHECK-NEXT: 
[[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]] +; CHECK-NEXT: [[RHS_LOAD:%.*]] = call @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP19]]) +; CHECK-NEXT: [[MISMATCH_CMP:%.*]] = call @llvm.vp.icmp.nxv16i8( [[LHS_LOAD]], [[RHS_LOAD]], metadata !"ne", shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP19]]) +; CHECK-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1( [[MISMATCH_CMP]], i1 true, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP19]]) +; CHECK-NEXT: [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0 +; CHECK-NEXT: br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]] +; CHECK: mismatch_vector_loop_inc: +; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP19]] to i64 +; CHECK-NEXT: [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]] +; CHECK-NEXT: [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]] +; CHECK-NEXT: br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]] +; CHECK: mismatch_vector_loop_found: +; CHECK-NEXT: [[FIRST2:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ] +; CHECK-NEXT: [[MISMATCH_VECTOR_INDEX3:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ] +; CHECK-NEXT: [[TMP26:%.*]] = zext i32 [[FIRST2]] to i64 +; CHECK-NEXT: [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX3]], [[TMP26]] +; CHECK-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32 +; CHECK-NEXT: br label [[MISMATCH_END]] +; CHECK: mismatch_loop_pre: +; CHECK-NEXT: [[MISMATCH_START_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_MEM_CHECK]] ], [ [[TMP0]], [[MISMATCH_MIN_IT_CHECK]] ] +; CHECK-NEXT: br label [[MISMATCH_LOOP:%.*]] +; CHECK: mismatch_loop: +; CHECK-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[MISMATCH_START_INDEX]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], 
[[MISMATCH_LOOP_INC:%.*]] ] +; CHECK-NEXT: [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64 +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]] +; CHECK-NEXT: [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1 +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]] +; CHECK-NEXT: [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1 +; CHECK-NEXT: [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]] +; CHECK-NEXT: br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]] +; CHECK: mismatch_loop_inc: +; CHECK-NEXT: [[TMP35]] = add i32 [[MISMATCH_INDEX]], 1 +; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i32 [[MISMATCH_INDEX]], [[N]] +; CHECK-NEXT: br i1 [[TMP36]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]] +; CHECK: mismatch_end: +; CHECK-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ] +; CHECK-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]] +; CHECK: while.cond: +; CHECK-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ] +; CHECK-NEXT: [[INC:%.*]] = add i32 [[LEN_ADDR]], 1 +; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]] +; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY]] +; CHECK: while.body: +; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; CHECK-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]] +; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END_LOOPEXIT]] +; 
CHECK: byte.compare: +; CHECK-NEXT: [[TMP39:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]] +; CHECK-NEXT: br i1 [[TMP39]], label [[WHILE_END_LOOPEXIT]], label [[WHILE_END_LOOPEXIT]] +; CHECK: while.end.loopexit: +; CHECK-NEXT: [[INC_LCSSA1:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_COND]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ] +; CHECK-NEXT: br label [[WHILE_END]] +; CHECK: while.end: +; CHECK-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[X]], [[ENTRY:%.*]] ], [ [[INC_LCSSA1]], [[WHILE_END_LOOPEXIT]] ] +; CHECK-NEXT: ret i32 [[INC_LCSSA]] +; +; LMUL8-LABEL: define i32 @compare_bytes_extra_cmp( +; LMUL8-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]], i32 [[X:%.*]]) #[[ATTR0]] { +; LMUL8-NEXT: entry: +; LMUL8-NEXT: [[CMP_X:%.*]] = icmp ult i32 [[N]], [[X]] +; LMUL8-NEXT: br i1 [[CMP_X]], label [[PH:%.*]], label [[WHILE_END:%.*]] +; LMUL8: ph: +; LMUL8-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], 1 +; LMUL8-NEXT: br label [[MISMATCH_MIN_IT_CHECK:%.*]] +; LMUL8: mismatch_min_it_check: +; LMUL8-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; LMUL8-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64 +; LMUL8-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]] +; LMUL8-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]] +; LMUL8: mismatch_mem_check: +; LMUL8-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]] +; LMUL8-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[TMP4]] to i64 +; LMUL8-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP5]], 12 +; LMUL8-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]] +; LMUL8-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP7]] to i64 +; LMUL8-NEXT: [[TMP9:%.*]] = lshr i64 [[TMP8]], 12 +; LMUL8-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]] +; LMUL8-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64 +; LMUL8-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP11]], 12 +; LMUL8-NEXT: [[TMP13:%.*]] = 
getelementptr i8, ptr [[B]], i64 [[TMP2]] +; LMUL8-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64 +; LMUL8-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 12 +; LMUL8-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]] +; LMUL8-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]] +; LMUL8-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]] +; LMUL8-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1]] +; LMUL8: mismatch_vector_loop_preheader: +; LMUL8-NEXT: br label [[MISMATCH_VECTOR_LOOP:%.*]] +; LMUL8: mismatch_vector_loop: +; LMUL8-NEXT: [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP1]], [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ] +; LMUL8-NEXT: [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]] +; LMUL8-NEXT: [[TMP19:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 64, i1 true) +; LMUL8-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]] +; LMUL8-NEXT: [[LHS_LOAD:%.*]] = call @llvm.vp.load.nxv64i8.p0(ptr [[TMP20]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP19]]) +; LMUL8-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]] +; LMUL8-NEXT: [[RHS_LOAD:%.*]] = call @llvm.vp.load.nxv64i8.p0(ptr [[TMP21]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP19]]) +; LMUL8-NEXT: [[MISMATCH_CMP:%.*]] = call @llvm.vp.icmp.nxv64i8( [[LHS_LOAD]], [[RHS_LOAD]], metadata !"ne", shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP19]]) +; LMUL8-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv64i1( [[MISMATCH_CMP]], i1 true, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP19]]) +; LMUL8-NEXT: [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0 +; LMUL8-NEXT: br i1 [[TMP22]], label 
[[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]] +; LMUL8: mismatch_vector_loop_inc: +; LMUL8-NEXT: [[TMP23:%.*]] = zext i32 [[TMP19]] to i64 +; LMUL8-NEXT: [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]] +; LMUL8-NEXT: [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]] +; LMUL8-NEXT: br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]] +; LMUL8: mismatch_vector_loop_found: +; LMUL8-NEXT: [[FIRST2:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ] +; LMUL8-NEXT: [[MISMATCH_VECTOR_INDEX3:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ] +; LMUL8-NEXT: [[TMP26:%.*]] = zext i32 [[FIRST2]] to i64 +; LMUL8-NEXT: [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX3]], [[TMP26]] +; LMUL8-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32 +; LMUL8-NEXT: br label [[MISMATCH_END]] +; LMUL8: mismatch_loop_pre: +; LMUL8-NEXT: [[MISMATCH_START_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_MEM_CHECK]] ], [ [[TMP0]], [[MISMATCH_MIN_IT_CHECK]] ] +; LMUL8-NEXT: br label [[MISMATCH_LOOP:%.*]] +; LMUL8: mismatch_loop: +; LMUL8-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[MISMATCH_START_INDEX]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ] +; LMUL8-NEXT: [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64 +; LMUL8-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]] +; LMUL8-NEXT: [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1 +; LMUL8-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]] +; LMUL8-NEXT: [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1 +; LMUL8-NEXT: [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]] +; LMUL8-NEXT: br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]] +; LMUL8: mismatch_loop_inc: +; LMUL8-NEXT: [[TMP35]] = add i32 [[MISMATCH_INDEX]], 1 +; LMUL8-NEXT: [[TMP36:%.*]] = icmp eq i32 [[MISMATCH_INDEX]], [[N]] +; LMUL8-NEXT: br i1 [[TMP36]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]] +; LMUL8: 
mismatch_end: +; LMUL8-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ] +; LMUL8-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]] +; LMUL8: while.cond: +; LMUL8-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ] +; LMUL8-NEXT: [[INC:%.*]] = add i32 [[LEN_ADDR]], 1 +; LMUL8-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]] +; LMUL8-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY]] +; LMUL8: while.body: +; LMUL8-NEXT: [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64 +; LMUL8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]] +; LMUL8-NEXT: [[TMP37:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; LMUL8-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]] +; LMUL8-NEXT: [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; LMUL8-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP37]], [[TMP38]] +; LMUL8-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END_LOOPEXIT]] +; LMUL8: byte.compare: +; LMUL8-NEXT: [[TMP39:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]] +; LMUL8-NEXT: br i1 [[TMP39]], label [[WHILE_END_LOOPEXIT]], label [[WHILE_END_LOOPEXIT]] +; LMUL8: while.end.loopexit: +; LMUL8-NEXT: [[INC_LCSSA1:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_COND]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ] +; LMUL8-NEXT: br label [[WHILE_END]] +; LMUL8: while.end: +; LMUL8-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[X]], [[ENTRY:%.*]] ], [ [[INC_LCSSA1]], [[WHILE_END_LOOPEXIT]] ] +; LMUL8-NEXT: ret i32 [[INC_LCSSA]] +; +; LOOP-DEL-LABEL: define i32 @compare_bytes_extra_cmp( +; LOOP-DEL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]], i32 
[[X:%.*]]) #[[ATTR0]] { +; LOOP-DEL-NEXT: entry: +; LOOP-DEL-NEXT: [[CMP_X:%.*]] = icmp ult i32 [[N]], [[X]] +; LOOP-DEL-NEXT: br i1 [[CMP_X]], label [[PH:%.*]], label [[WHILE_END:%.*]] +; LOOP-DEL: ph: +; LOOP-DEL-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], 1 +; LOOP-DEL-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; LOOP-DEL-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64 +; LOOP-DEL-NEXT: [[TMP3:%.*]] = icmp ule i32 [[TMP0]], [[N]] +; LOOP-DEL-NEXT: br i1 [[TMP3]], label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]] +; LOOP-DEL: mismatch_mem_check: +; LOOP-DEL-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]] +; LOOP-DEL-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[TMP4]] to i64 +; LOOP-DEL-NEXT: [[TMP6:%.*]] = lshr i64 [[TMP5]], 12 +; LOOP-DEL-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]] +; LOOP-DEL-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP7]] to i64 +; LOOP-DEL-NEXT: [[TMP9:%.*]] = lshr i64 [[TMP8]], 12 +; LOOP-DEL-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP1]] +; LOOP-DEL-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64 +; LOOP-DEL-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP11]], 12 +; LOOP-DEL-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]] +; LOOP-DEL-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64 +; LOOP-DEL-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 12 +; LOOP-DEL-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP6]], [[TMP9]] +; LOOP-DEL-NEXT: [[TMP17:%.*]] = icmp ne i64 [[TMP12]], [[TMP15]] +; LOOP-DEL-NEXT: [[TMP18:%.*]] = or i1 [[TMP16]], [[TMP17]] +; LOOP-DEL-NEXT: br i1 [[TMP18]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP:%.*]], !prof [[PROF1]] +; LOOP-DEL: mismatch_vector_loop: +; LOOP-DEL-NEXT: [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ [[TMP24:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ], [ [[TMP1]], [[MISMATCH_MEM_CHECK]] ] +; LOOP-DEL-NEXT: [[AVL:%.*]] = sub nuw nsw i64 [[TMP2]], [[MISMATCH_VECTOR_INDEX]] +; LOOP-DEL-NEXT: [[TMP19:%.*]] = call i32 
@llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true) +; LOOP-DEL-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[MISMATCH_VECTOR_INDEX]] +; LOOP-DEL-NEXT: [[LHS_LOAD:%.*]] = call @llvm.vp.load.nxv16i8.p0(ptr [[TMP20]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP19]]) +; LOOP-DEL-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[MISMATCH_VECTOR_INDEX]] +; LOOP-DEL-NEXT: [[RHS_LOAD:%.*]] = call @llvm.vp.load.nxv16i8.p0(ptr [[TMP21]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP19]]) +; LOOP-DEL-NEXT: [[MISMATCH_CMP:%.*]] = call @llvm.vp.icmp.nxv16i8( [[LHS_LOAD]], [[RHS_LOAD]], metadata !"ne", shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP19]]) +; LOOP-DEL-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1( [[MISMATCH_CMP]], i1 true, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP19]]) +; LOOP-DEL-NEXT: [[TMP22:%.*]] = icmp sge i32 [[FIRST]], 0 +; LOOP-DEL-NEXT: br i1 [[TMP22]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]] +; LOOP-DEL: mismatch_vector_loop_inc: +; LOOP-DEL-NEXT: [[TMP23:%.*]] = zext i32 [[TMP19]] to i64 +; LOOP-DEL-NEXT: [[TMP24]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP23]] +; LOOP-DEL-NEXT: [[TMP25:%.*]] = icmp ne i64 [[TMP24]], [[TMP2]] +; LOOP-DEL-NEXT: br i1 [[TMP25]], label [[MISMATCH_VECTOR_LOOP]], label [[WHILE_END]] +; LOOP-DEL: mismatch_vector_loop_found: +; LOOP-DEL-NEXT: [[FIRST2:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ] +; LOOP-DEL-NEXT: [[MISMATCH_VECTOR_INDEX3:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ] +; LOOP-DEL-NEXT: [[TMP26:%.*]] = zext i32 [[FIRST2]] to i64 +; LOOP-DEL-NEXT: [[TMP27:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX3]], [[TMP26]] +; LOOP-DEL-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to 
i32 +; LOOP-DEL-NEXT: br label [[WHILE_END]] +; LOOP-DEL: mismatch_loop_pre: +; LOOP-DEL-NEXT: [[MISMATCH_START_INDEX:%.*]] = phi i32 [ [[TMP0]], [[MISMATCH_MEM_CHECK]] ], [ [[TMP0]], [[PH]] ] +; LOOP-DEL-NEXT: br label [[MISMATCH_LOOP:%.*]] +; LOOP-DEL: mismatch_loop: +; LOOP-DEL-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[MISMATCH_START_INDEX]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP35:%.*]], [[MISMATCH_LOOP_INC:%.*]] ] +; LOOP-DEL-NEXT: [[TMP29:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64 +; LOOP-DEL-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP29]] +; LOOP-DEL-NEXT: [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1 +; LOOP-DEL-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP29]] +; LOOP-DEL-NEXT: [[TMP33:%.*]] = load i8, ptr [[TMP32]], align 1 +; LOOP-DEL-NEXT: [[TMP34:%.*]] = icmp eq i8 [[TMP31]], [[TMP33]] +; LOOP-DEL-NEXT: br i1 [[TMP34]], label [[MISMATCH_LOOP_INC]], label [[WHILE_END]] +; LOOP-DEL: mismatch_loop_inc: +; LOOP-DEL-NEXT: [[TMP35]] = add i32 [[MISMATCH_INDEX]], 1 +; LOOP-DEL-NEXT: [[TMP36:%.*]] = icmp eq i32 [[MISMATCH_INDEX]], [[N]] +; LOOP-DEL-NEXT: br i1 [[TMP36]], label [[WHILE_END]], label [[MISMATCH_LOOP]] +; LOOP-DEL: while.end: +; LOOP-DEL-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[X]], [[ENTRY:%.*]] ], [ [[N]], [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ [[N]], [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP28]], [[MISMATCH_VECTOR_LOOP_FOUND]] ] +; LOOP-DEL-NEXT: ret i32 [[INC_LCSSA]] +; +; NO-TRANSFORM-LABEL: define i32 @compare_bytes_extra_cmp( +; NO-TRANSFORM-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]], i32 [[X:%.*]]) { +; NO-TRANSFORM-NEXT: entry: +; NO-TRANSFORM-NEXT: [[CMP_X:%.*]] = icmp ult i32 [[N]], [[X]] +; NO-TRANSFORM-NEXT: br i1 [[CMP_X]], label [[PH:%.*]], label [[WHILE_END:%.*]] +; NO-TRANSFORM: ph: +; NO-TRANSFORM-NEXT: br label [[WHILE_COND:%.*]] +; NO-TRANSFORM: while.cond: +; NO-TRANSFORM-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[PH]] ], [ 
[[INC:%.*]], [[WHILE_BODY:%.*]] ] +; NO-TRANSFORM-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1 +; NO-TRANSFORM-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]] +; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]] +; NO-TRANSFORM: while.body: +; NO-TRANSFORM-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; NO-TRANSFORM-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]] +; NO-TRANSFORM-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; NO-TRANSFORM-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]] +; NO-TRANSFORM-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; NO-TRANSFORM-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; NO-TRANSFORM: while.end: +; NO-TRANSFORM-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ], [ [[X]], [[ENTRY:%.*]] ] +; NO-TRANSFORM-NEXT: ret i32 [[INC_LCSSA]] +entry: + %cmp.x = icmp ult i32 %n, %x + br i1 %cmp.x, label %ph, label %while.end + +ph: + br label %while.cond + +while.cond: + %len.addr = phi i32 [ %len, %ph ], [ %inc, %while.body ] + %inc = add i32 %len.addr, 1 + %cmp.not = icmp eq i32 %inc, %n + br i1 %cmp.not, label %while.end, label %while.body + +while.body: + %idxprom = zext i32 %inc to i64 + %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom + %0 = load i8, ptr %arrayidx + %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom + %1 = load i8, ptr %arrayidx2 + %cmp.not2 = icmp eq i8 %0, %1 + br i1 %cmp.not2, label %while.cond, label %while.end + +while.end: + %inc.lcssa = phi i32 [ %inc, %while.body ], [ %inc, %while.cond ], [ %x, %entry ] + ret i32 %inc.lcssa +} + +define void @compare_bytes_cleanup_block(ptr %src1, ptr %src2) { +; CHECK-LABEL: define void @compare_bytes_cleanup_block( +; CHECK-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; 
CHECK-NEXT: br label [[MISMATCH_MIN_IT_CHECK:%.*]] +; CHECK: mismatch_min_it_check: +; CHECK-NEXT: br i1 false, label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]] +; CHECK: mismatch_mem_check: +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[SRC1]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 12 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[SRC1]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[TMP3]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP4]], 12 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[SRC2]], i64 1 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP6]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = lshr i64 [[TMP7]], 12 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[SRC2]], i64 0 +; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[TMP9]] to i64 +; CHECK-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 12 +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne i64 [[TMP2]], [[TMP5]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[TMP8]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = or i1 [[TMP12]], [[TMP13]] +; CHECK-NEXT: br i1 [[TMP14]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1]] +; CHECK: mismatch_vector_loop_preheader: +; CHECK-NEXT: br label [[MISMATCH_VECTOR_LOOP:%.*]] +; CHECK: mismatch_vector_loop: +; CHECK-NEXT: [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ 1, [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP20:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ] +; CHECK-NEXT: [[AVL:%.*]] = sub nuw nsw i64 0, [[MISMATCH_VECTOR_INDEX]] +; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true) +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[MISMATCH_VECTOR_INDEX]] +; CHECK-NEXT: [[LHS_LOAD:%.*]] = call @llvm.vp.load.nxv16i8.p0(ptr [[TMP16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP15]]) +; CHECK-NEXT: 
[[TMP17:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[MISMATCH_VECTOR_INDEX]] +; CHECK-NEXT: [[RHS_LOAD:%.*]] = call @llvm.vp.load.nxv16i8.p0(ptr [[TMP17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP15]]) +; CHECK-NEXT: [[MISMATCH_CMP:%.*]] = call @llvm.vp.icmp.nxv16i8( [[LHS_LOAD]], [[RHS_LOAD]], metadata !"ne", shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP15]]) +; CHECK-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv16i1( [[MISMATCH_CMP]], i1 true, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP15]]) +; CHECK-NEXT: [[TMP18:%.*]] = icmp sge i32 [[FIRST]], 0 +; CHECK-NEXT: br i1 [[TMP18]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]] +; CHECK: mismatch_vector_loop_inc: +; CHECK-NEXT: [[TMP19:%.*]] = zext i32 [[TMP15]] to i64 +; CHECK-NEXT: [[TMP20]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne i64 [[TMP20]], 0 +; CHECK-NEXT: br i1 [[TMP21]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]] +; CHECK: mismatch_vector_loop_found: +; CHECK-NEXT: [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ] +; CHECK-NEXT: [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ] +; CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[FIRST1]] to i64 +; CHECK-NEXT: [[TMP23:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP22]] +; CHECK-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32 +; CHECK-NEXT: br label [[MISMATCH_END]] +; CHECK: mismatch_loop_pre: +; CHECK-NEXT: [[MISMATCH_START_INDEX:%.*]] = phi i32 [ 1, [[MISMATCH_MEM_CHECK]] ], [ 1, [[MISMATCH_MIN_IT_CHECK]] ] +; CHECK-NEXT: br label [[MISMATCH_LOOP:%.*]] +; CHECK: mismatch_loop: +; CHECK-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[MISMATCH_START_INDEX]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP31:%.*]], [[MISMATCH_LOOP_INC:%.*]] ] +; CHECK-NEXT: 
[[TMP25:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64 +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = load i8, ptr [[TMP26]], align 1 +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[TMP25]] +; CHECK-NEXT: [[TMP29:%.*]] = load i8, ptr [[TMP28]], align 1 +; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i8 [[TMP27]], [[TMP29]] +; CHECK-NEXT: br i1 [[TMP30]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]] +; CHECK: mismatch_loop_inc: +; CHECK-NEXT: [[TMP31]] = add i32 [[MISMATCH_INDEX]], 1 +; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i32 [[MISMATCH_INDEX]], 0 +; CHECK-NEXT: br i1 [[TMP32]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]] +; CHECK: mismatch_end: +; CHECK-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ 0, [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ 0, [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP24]], [[MISMATCH_VECTOR_LOOP_FOUND]] ] +; CHECK-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]] +; CHECK: while.cond: +; CHECK-NEXT: [[LEN:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ], [ 0, [[MISMATCH_END]] ] +; CHECK-NEXT: [[INC:%.*]] = add i32 [[LEN]], 1 +; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], 0 +; CHECK-NEXT: br i1 [[CMP_NOT]], label [[CLEANUP_THREAD:%.*]], label [[WHILE_BODY]] +; CHECK: while.body: +; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP33:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP34:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; CHECK-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP33]], [[TMP34]] +; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[IF_END:%.*]] +; CHECK: byte.compare: +; CHECK-NEXT: [[TMP35:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], 0 +; CHECK-NEXT: br i1 
[[TMP35]], label [[CLEANUP_THREAD]], label [[IF_END]] +; CHECK: cleanup.thread: +; CHECK-NEXT: ret void +; CHECK: if.end: +; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ] +; CHECK-NEXT: ret void +; +; LMUL8-LABEL: define void @compare_bytes_cleanup_block( +; LMUL8-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]]) #[[ATTR0]] { +; LMUL8-NEXT: entry: +; LMUL8-NEXT: br label [[MISMATCH_MIN_IT_CHECK:%.*]] +; LMUL8: mismatch_min_it_check: +; LMUL8-NEXT: br i1 false, label [[MISMATCH_MEM_CHECK:%.*]], label [[MISMATCH_LOOP_PRE:%.*]], !prof [[PROF0]] +; LMUL8: mismatch_mem_check: +; LMUL8-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[SRC1]], i64 1 +; LMUL8-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[TMP0]] to i64 +; LMUL8-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 12 +; LMUL8-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[SRC1]], i64 0 +; LMUL8-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[TMP3]] to i64 +; LMUL8-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP4]], 12 +; LMUL8-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[SRC2]], i64 1 +; LMUL8-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP6]] to i64 +; LMUL8-NEXT: [[TMP8:%.*]] = lshr i64 [[TMP7]], 12 +; LMUL8-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[SRC2]], i64 0 +; LMUL8-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[TMP9]] to i64 +; LMUL8-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP10]], 12 +; LMUL8-NEXT: [[TMP12:%.*]] = icmp ne i64 [[TMP2]], [[TMP5]] +; LMUL8-NEXT: [[TMP13:%.*]] = icmp ne i64 [[TMP8]], [[TMP11]] +; LMUL8-NEXT: [[TMP14:%.*]] = or i1 [[TMP12]], [[TMP13]] +; LMUL8-NEXT: br i1 [[TMP14]], label [[MISMATCH_LOOP_PRE]], label [[MISMATCH_VECTOR_LOOP_PREHEADER:%.*]], !prof [[PROF1]] +; LMUL8: mismatch_vector_loop_preheader: +; LMUL8-NEXT: br label [[MISMATCH_VECTOR_LOOP:%.*]] +; LMUL8: mismatch_vector_loop: +; LMUL8-NEXT: [[MISMATCH_VECTOR_INDEX:%.*]] = phi i64 [ 1, [[MISMATCH_VECTOR_LOOP_PREHEADER]] ], [ [[TMP20:%.*]], [[MISMATCH_VECTOR_LOOP_INC:%.*]] ] +; LMUL8-NEXT: [[AVL:%.*]] = sub nuw nsw i64 0, 
[[MISMATCH_VECTOR_INDEX]] +; LMUL8-NEXT: [[TMP15:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 64, i1 true) +; LMUL8-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[MISMATCH_VECTOR_INDEX]] +; LMUL8-NEXT: [[LHS_LOAD:%.*]] = call @llvm.vp.load.nxv64i8.p0(ptr [[TMP16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP15]]) +; LMUL8-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[MISMATCH_VECTOR_INDEX]] +; LMUL8-NEXT: [[RHS_LOAD:%.*]] = call @llvm.vp.load.nxv64i8.p0(ptr [[TMP17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP15]]) +; LMUL8-NEXT: [[MISMATCH_CMP:%.*]] = call @llvm.vp.icmp.nxv64i8( [[LHS_LOAD]], [[RHS_LOAD]], metadata !"ne", shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP15]]) +; LMUL8-NEXT: [[FIRST:%.*]] = call i32 @llvm.vp.cttz.elts.i32.nxv64i1( [[MISMATCH_CMP]], i1 true, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP15]]) +; LMUL8-NEXT: [[TMP18:%.*]] = icmp sge i32 [[FIRST]], 0 +; LMUL8-NEXT: br i1 [[TMP18]], label [[MISMATCH_VECTOR_LOOP_FOUND:%.*]], label [[MISMATCH_VECTOR_LOOP_INC]] +; LMUL8: mismatch_vector_loop_inc: +; LMUL8-NEXT: [[TMP19:%.*]] = zext i32 [[TMP15]] to i64 +; LMUL8-NEXT: [[TMP20]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX]], [[TMP19]] +; LMUL8-NEXT: [[TMP21:%.*]] = icmp ne i64 [[TMP20]], 0 +; LMUL8-NEXT: br i1 [[TMP21]], label [[MISMATCH_VECTOR_LOOP]], label [[MISMATCH_END:%.*]] +; LMUL8: mismatch_vector_loop_found: +; LMUL8-NEXT: [[FIRST1:%.*]] = phi i32 [ [[FIRST]], [[MISMATCH_VECTOR_LOOP]] ] +; LMUL8-NEXT: [[MISMATCH_VECTOR_INDEX2:%.*]] = phi i64 [ [[MISMATCH_VECTOR_INDEX]], [[MISMATCH_VECTOR_LOOP]] ] +; LMUL8-NEXT: [[TMP22:%.*]] = zext i32 [[FIRST1]] to i64 +; LMUL8-NEXT: [[TMP23:%.*]] = add nuw nsw i64 [[MISMATCH_VECTOR_INDEX2]], [[TMP22]] +; LMUL8-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32 
+; LMUL8-NEXT: br label [[MISMATCH_END]] +; LMUL8: mismatch_loop_pre: +; LMUL8-NEXT: [[MISMATCH_START_INDEX:%.*]] = phi i32 [ 1, [[MISMATCH_MEM_CHECK]] ], [ 1, [[MISMATCH_MIN_IT_CHECK]] ] +; LMUL8-NEXT: br label [[MISMATCH_LOOP:%.*]] +; LMUL8: mismatch_loop: +; LMUL8-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ [[MISMATCH_START_INDEX]], [[MISMATCH_LOOP_PRE]] ], [ [[TMP31:%.*]], [[MISMATCH_LOOP_INC:%.*]] ] +; LMUL8-NEXT: [[TMP25:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64 +; LMUL8-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[TMP25]] +; LMUL8-NEXT: [[TMP27:%.*]] = load i8, ptr [[TMP26]], align 1 +; LMUL8-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[TMP25]] +; LMUL8-NEXT: [[TMP29:%.*]] = load i8, ptr [[TMP28]], align 1 +; LMUL8-NEXT: [[TMP30:%.*]] = icmp eq i8 [[TMP27]], [[TMP29]] +; LMUL8-NEXT: br i1 [[TMP30]], label [[MISMATCH_LOOP_INC]], label [[MISMATCH_END]] +; LMUL8: mismatch_loop_inc: +; LMUL8-NEXT: [[TMP31]] = add i32 [[MISMATCH_INDEX]], 1 +; LMUL8-NEXT: [[TMP32:%.*]] = icmp eq i32 [[MISMATCH_INDEX]], 0 +; LMUL8-NEXT: br i1 [[TMP32]], label [[MISMATCH_END]], label [[MISMATCH_LOOP]] +; LMUL8: mismatch_end: +; LMUL8-NEXT: [[MISMATCH_RESULT:%.*]] = phi i32 [ 0, [[MISMATCH_LOOP_INC]] ], [ [[MISMATCH_INDEX]], [[MISMATCH_LOOP]] ], [ 0, [[MISMATCH_VECTOR_LOOP_INC]] ], [ [[TMP24]], [[MISMATCH_VECTOR_LOOP_FOUND]] ] +; LMUL8-NEXT: br i1 true, label [[BYTE_COMPARE:%.*]], label [[WHILE_COND:%.*]] +; LMUL8: while.cond: +; LMUL8-NEXT: [[LEN:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ], [ 0, [[MISMATCH_END]] ] +; LMUL8-NEXT: [[INC:%.*]] = add i32 [[LEN]], 1 +; LMUL8-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], 0 +; LMUL8-NEXT: br i1 [[CMP_NOT]], label [[CLEANUP_THREAD:%.*]], label [[WHILE_BODY]] +; LMUL8: while.body: +; LMUL8-NEXT: [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64 +; LMUL8-NEXT: [[ARRAYIDX:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[IDXPROM]] +; LMUL8-NEXT: [[TMP33:%.*]] = load i8, ptr [[ARRAYIDX]], 
align 1 +; LMUL8-NEXT: [[ARRAYIDX2:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[IDXPROM]] +; LMUL8-NEXT: [[TMP34:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; LMUL8-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP33]], [[TMP34]] +; LMUL8-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[IF_END:%.*]] +; LMUL8: byte.compare: +; LMUL8-NEXT: [[TMP35:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], 0 +; LMUL8-NEXT: br i1 [[TMP35]], label [[CLEANUP_THREAD]], label [[IF_END]] +; LMUL8: cleanup.thread: +; LMUL8-NEXT: ret void +; LMUL8: if.end: +; LMUL8-NEXT: [[RES:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ] +; LMUL8-NEXT: ret void +; +; LOOP-DEL-LABEL: define void @compare_bytes_cleanup_block( +; LOOP-DEL-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]]) #[[ATTR0]] { +; LOOP-DEL-NEXT: entry: +; LOOP-DEL-NEXT: br label [[MISMATCH_LOOP:%.*]] +; LOOP-DEL: mismatch_loop: +; LOOP-DEL-NEXT: [[MISMATCH_INDEX:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[TMP6:%.*]], [[MISMATCH_LOOP]] ] +; LOOP-DEL-NEXT: [[TMP0:%.*]] = zext i32 [[MISMATCH_INDEX]] to i64 +; LOOP-DEL-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[TMP0]] +; LOOP-DEL-NEXT: [[TMP2:%.*]] = load i8, ptr [[TMP1]], align 1 +; LOOP-DEL-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[TMP0]] +; LOOP-DEL-NEXT: [[TMP4:%.*]] = load i8, ptr [[TMP3]], align 1 +; LOOP-DEL-NEXT: [[TMP5:%.*]] = icmp ne i8 [[TMP2]], [[TMP4]] +; LOOP-DEL-NEXT: [[TMP6]] = add i32 [[MISMATCH_INDEX]], 1 +; LOOP-DEL-NEXT: [[TMP7:%.*]] = icmp eq i32 [[MISMATCH_INDEX]], 0 +; LOOP-DEL-NEXT: [[OR_COND:%.*]] = or i1 [[TMP5]], [[TMP7]] +; LOOP-DEL-NEXT: br i1 [[OR_COND]], label [[COMMON_RET:%.*]], label [[MISMATCH_LOOP]] +; LOOP-DEL: common.ret: +; LOOP-DEL-NEXT: ret void +; +; NO-TRANSFORM-LABEL: define void @compare_bytes_cleanup_block( +; NO-TRANSFORM-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]]) { +; NO-TRANSFORM-NEXT: entry: +; NO-TRANSFORM-NEXT: br label [[WHILE_COND:%.*]] +; NO-TRANSFORM: while.cond: 
+; NO-TRANSFORM-NEXT: [[LEN:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY:%.*]] ], [ 0, [[ENTRY:%.*]] ] +; NO-TRANSFORM-NEXT: [[INC]] = add i32 [[LEN]], 1 +; NO-TRANSFORM-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], 0 +; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT]], label [[CLEANUP_THREAD:%.*]], label [[WHILE_BODY]] +; NO-TRANSFORM: while.body: +; NO-TRANSFORM-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; NO-TRANSFORM-NEXT: [[ARRAYIDX:%.*]] = getelementptr i8, ptr [[SRC1]], i64 [[IDXPROM]] +; NO-TRANSFORM-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; NO-TRANSFORM-NEXT: [[ARRAYIDX2:%.*]] = getelementptr i8, ptr [[SRC2]], i64 [[IDXPROM]] +; NO-TRANSFORM-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; NO-TRANSFORM-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[IF_END:%.*]] +; NO-TRANSFORM: cleanup.thread: +; NO-TRANSFORM-NEXT: ret void +; NO-TRANSFORM: if.end: +; NO-TRANSFORM-NEXT: [[RES:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ] +; NO-TRANSFORM-NEXT: ret void +entry: + br label %while.cond + +while.cond: + %len = phi i32 [ %inc, %while.body ], [ 0, %entry ] + %inc = add i32 %len, 1 + %cmp.not = icmp eq i32 %inc, 0 + br i1 %cmp.not, label %cleanup.thread, label %while.body + +while.body: + %idxprom = zext i32 %inc to i64 + %arrayidx = getelementptr i8, ptr %src1, i64 %idxprom + %0 = load i8, ptr %arrayidx, align 1 + %arrayidx2 = getelementptr i8, ptr %src2, i64 %idxprom + %1 = load i8, ptr %arrayidx2, align 1 + %cmp.not2 = icmp eq i8 %0, %1 + br i1 %cmp.not2, label %while.cond, label %if.end + +cleanup.thread: + ret void + +if.end: + %res = phi i32 [ %inc, %while.body ] + ret void +} + +; +; NEGATIVE TESTS +; + +; Similar to @compare_bytes_simple, except in the while.end block we have an extra PHI +; with unique values for each incoming block from the loop. 
+define i32 @compare_bytes_simple2(ptr %a, ptr %b, ptr %c, ptr %d, i32 %len, i32 %n) { +; CHECK-LABEL: define i32 @compare_bytes_simple2( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[WHILE_COND:%.*]] +; CHECK: while.cond: +; CHECK-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; CHECK-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1 +; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; CHECK: while.body: +; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; CHECK-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; CHECK: while.end: +; CHECK-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ] +; CHECK-NEXT: [[FINAL_PTR:%.*]] = phi ptr [ [[C]], [[WHILE_BODY]] ], [ [[D]], [[WHILE_COND]] ] +; CHECK-NEXT: store i32 [[INC_LCSSA]], ptr [[FINAL_PTR]], align 4 +; CHECK-NEXT: ret i32 [[INC_LCSSA]] +; +; LMUL8-LABEL: define i32 @compare_bytes_simple2( +; LMUL8-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; LMUL8-NEXT: entry: +; LMUL8-NEXT: br label [[WHILE_COND:%.*]] +; LMUL8: while.cond: +; LMUL8-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; LMUL8-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1 +; LMUL8-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]] +; LMUL8-NEXT: br i1 [[CMP_NOT]], 
label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; LMUL8: while.body: +; LMUL8-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; LMUL8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]] +; LMUL8-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; LMUL8-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]] +; LMUL8-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; LMUL8-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; LMUL8-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; LMUL8: while.end: +; LMUL8-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ] +; LMUL8-NEXT: [[FINAL_PTR:%.*]] = phi ptr [ [[C]], [[WHILE_BODY]] ], [ [[D]], [[WHILE_COND]] ] +; LMUL8-NEXT: store i32 [[INC_LCSSA]], ptr [[FINAL_PTR]], align 4 +; LMUL8-NEXT: ret i32 [[INC_LCSSA]] +; +; LOOP-DEL-LABEL: define i32 @compare_bytes_simple2( +; LOOP-DEL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; LOOP-DEL-NEXT: entry: +; LOOP-DEL-NEXT: br label [[WHILE_COND:%.*]] +; LOOP-DEL: while.cond: +; LOOP-DEL-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; LOOP-DEL-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1 +; LOOP-DEL-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]] +; LOOP-DEL-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; LOOP-DEL: while.body: +; LOOP-DEL-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; LOOP-DEL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]] +; LOOP-DEL-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; LOOP-DEL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]] +; LOOP-DEL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; LOOP-DEL-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; LOOP-DEL-NEXT: br i1 [[CMP_NOT2]], label 
[[WHILE_COND]], label [[WHILE_END]] +; LOOP-DEL: while.end: +; LOOP-DEL-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ] +; LOOP-DEL-NEXT: [[FINAL_PTR:%.*]] = phi ptr [ [[C]], [[WHILE_BODY]] ], [ [[D]], [[WHILE_COND]] ] +; LOOP-DEL-NEXT: store i32 [[INC_LCSSA]], ptr [[FINAL_PTR]], align 4 +; LOOP-DEL-NEXT: ret i32 [[INC_LCSSA]] +; +entry: + br label %while.cond + +while.cond: + %len.addr = phi i32 [ %len, %entry ], [ %inc, %while.body ] + %inc = add i32 %len.addr, 1 + %cmp.not = icmp eq i32 %inc, %n + br i1 %cmp.not, label %while.end, label %while.body + +while.body: + %idxprom = zext i32 %inc to i64 + %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom + %0 = load i8, ptr %arrayidx + %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom + %1 = load i8, ptr %arrayidx2 + %cmp.not2 = icmp eq i8 %0, %1 + br i1 %cmp.not2, label %while.cond, label %while.end + +while.end: + %inc.lcssa = phi i32 [ %inc, %while.body ], [ %inc, %while.cond ] + %final_ptr = phi ptr [ %c, %while.body ], [ %d, %while.cond ] ; the extra PHI: a different pointer (%c vs. %d) for each loop exit + store i32 %inc.lcssa, ptr %final_ptr + ret i32 %inc.lcssa +} + +define i32 @compare_bytes_simple3(ptr %a, ptr %b, ptr %c, i32 %d, i32 %len, i32 %n) { ; negative test: every check prefix below expects the loop to be left as written +; CHECK-LABEL: define i32 @compare_bytes_simple3( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[WHILE_COND:%.*]] +; CHECK: while.cond: +; CHECK-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; CHECK-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1 +; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; CHECK: while.body: +; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], 
align 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; CHECK-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; CHECK: while.end: +; CHECK-NEXT: [[FINAL_VAL:%.*]] = phi i32 [ [[D]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ] +; CHECK-NEXT: store i32 [[FINAL_VAL]], ptr [[C]], align 4 +; CHECK-NEXT: ret i32 [[FINAL_VAL]] +; +; LMUL8-LABEL: define i32 @compare_bytes_simple3( +; LMUL8-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; LMUL8-NEXT: entry: +; LMUL8-NEXT: br label [[WHILE_COND:%.*]] +; LMUL8: while.cond: +; LMUL8-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; LMUL8-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1 +; LMUL8-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]] +; LMUL8-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; LMUL8: while.body: +; LMUL8-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; LMUL8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]] +; LMUL8-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; LMUL8-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]] +; LMUL8-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; LMUL8-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; LMUL8-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; LMUL8: while.end: +; LMUL8-NEXT: [[FINAL_VAL:%.*]] = phi i32 [ [[D]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ] +; LMUL8-NEXT: store i32 [[FINAL_VAL]], ptr [[C]], align 4 +; LMUL8-NEXT: ret i32 [[FINAL_VAL]] +; +; LOOP-DEL-LABEL: define i32 @compare_bytes_simple3( +; LOOP-DEL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[D:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { 
+; LOOP-DEL-NEXT: entry: +; LOOP-DEL-NEXT: br label [[WHILE_COND:%.*]] +; LOOP-DEL: while.cond: +; LOOP-DEL-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; LOOP-DEL-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1 +; LOOP-DEL-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]] +; LOOP-DEL-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; LOOP-DEL: while.body: +; LOOP-DEL-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; LOOP-DEL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]] +; LOOP-DEL-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; LOOP-DEL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]] +; LOOP-DEL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; LOOP-DEL-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; LOOP-DEL-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; LOOP-DEL: while.end: +; LOOP-DEL-NEXT: [[FINAL_VAL:%.*]] = phi i32 [ [[D]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ] +; LOOP-DEL-NEXT: store i32 [[FINAL_VAL]], ptr [[C]], align 4 +; LOOP-DEL-NEXT: ret i32 [[FINAL_VAL]] +; + entry: + br label %while.cond + + while.cond: + %len.addr = phi i32 [ %len, %entry ], [ %inc, %while.body ] + %inc = add i32 %len.addr, 1 + %cmp.not = icmp eq i32 %inc, %n + br i1 %cmp.not, label %while.end, label %while.body + + while.body: + %idxprom = zext i32 %inc to i64 + %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom + %0 = load i8, ptr %arrayidx + %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom + %1 = load i8, ptr %arrayidx2 + %cmp.not2 = icmp eq i8 %0, %1 + br i1 %cmp.not2, label %while.cond, label %while.end + + while.end: + %final_val = phi i32 [ %d, %while.body ], [ %inc, %while.cond ] ; the only PHI: argument %d on the while.body exit, %inc on the while.cond exit + store i32 %final_val, ptr %c + ret i32 %final_val +} + +; Disable the optimization when noimplicitfloat is present. 
+define i32 @no_implicit_float(ptr %a, ptr %b, i32 %len, i32 %n) noimplicitfloat { ; noimplicitfloat is the attribute under test; every check prefix below expects the loop to be left as written +; CHECK-LABEL: define i32 @no_implicit_float( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[WHILE_COND:%.*]] +; CHECK: while.cond: +; CHECK-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; CHECK-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1 +; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; CHECK: while.body: +; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; CHECK-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; CHECK: while.end: +; CHECK-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ] +; CHECK-NEXT: ret i32 [[INC_LCSSA]] +; +; LMUL8-LABEL: define i32 @no_implicit_float( +; LMUL8-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR1:[0-9]+]] { +; LMUL8-NEXT: entry: +; LMUL8-NEXT: br label [[WHILE_COND:%.*]] +; LMUL8: while.cond: +; LMUL8-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; LMUL8-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1 +; LMUL8-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]] +; LMUL8-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; LMUL8: while.body: +; LMUL8-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; LMUL8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]] +; 
LMUL8-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; LMUL8-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]] +; LMUL8-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; LMUL8-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; LMUL8-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; LMUL8: while.end: +; LMUL8-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ] +; LMUL8-NEXT: ret i32 [[INC_LCSSA]] +; +; LOOP-DEL-LABEL: define i32 @no_implicit_float( +; LOOP-DEL-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[LEN:%.*]], i32 [[N:%.*]]) #[[ATTR1:[0-9]+]] { +; LOOP-DEL-NEXT: entry: +; LOOP-DEL-NEXT: br label [[WHILE_COND:%.*]] +; LOOP-DEL: while.cond: +; LOOP-DEL-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ] +; LOOP-DEL-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1 +; LOOP-DEL-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]] +; LOOP-DEL-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END:%.*]], label [[WHILE_BODY]] +; LOOP-DEL: while.body: +; LOOP-DEL-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64 +; LOOP-DEL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]] +; LOOP-DEL-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; LOOP-DEL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]] +; LOOP-DEL-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; LOOP-DEL-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]] +; LOOP-DEL-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]] +; LOOP-DEL: while.end: +; LOOP-DEL-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ] +; LOOP-DEL-NEXT: ret i32 [[INC_LCSSA]] +; +entry: + br label %while.cond + +while.cond: + %len.addr = phi i32 [ %len, %entry ], [ %inc, %while.body ] + %inc = add i32 %len.addr, 1 + %cmp.not = icmp eq i32 %inc, %n + br i1 %cmp.not, label 
%while.end, label %while.body + +while.body: + %idxprom = zext i32 %inc to i64 + %arrayidx = getelementptr inbounds i8, ptr %a, i64 %idxprom + %0 = load i8, ptr %arrayidx + %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom + %1 = load i8, ptr %arrayidx2 + %cmp.not2 = icmp eq i8 %0, %1 + br i1 %cmp.not2, label %while.cond, label %while.end + +while.end: + %inc.lcssa = phi i32 [ %inc, %while.body ], [ %inc, %while.cond ] ; both loop exits yield %inc + ret i32 %inc.lcssa +}