Commit f0e9bba

[LoopVectorize] Generate wide active lane masks (llvm#147535)
This patch adds a new flag (-enable-wide-lane-mask) which allows LoopVectorize to generate active lane masks wider than VF when it is safe to do so (i.e. when the mask is used for both data and control flow).

The transform in extractFromWideActiveLaneMask creates vector extracts from the first active lane mask in the header and loop body, modifying the active lane mask phi operands to use the extracts. An additional operand is passed to the ActiveLaneMask instruction, whose value is used as a multiplier of VF when generating the mask. By default this is 1, and it is updated to UF by extractFromWideActiveLaneMask.

The motivation for this change is to improve interleaved loops when SVE2.1 is available, where we can make use of the whilelo instruction which returns a predicate pair.

This is based on a PR created by @momchil-velikov (llvm#81140) and contains tests which were added there.
1 parent 5f41241 commit f0e9bba
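To make the transform concrete, here is a minimal scalar model of the rewrite (an illustration only, not code from the patch; activeLaneMask and extractPart are invented names for this sketch). Instead of computing UF separate VF-lane masks per iteration, the transformed plan computes one (VF * UF)-lane mask and slices it into UF parts:

#include <cstdint>
#include <vector>

// Models llvm.get.active.lane.mask: lane I is true while Base + I < TC,
// so the true lanes always form a prefix of the mask.
std::vector<bool> activeLaneMask(uint64_t Base, uint64_t TC, unsigned Lanes) {
  std::vector<bool> Mask(Lanes);
  for (unsigned I = 0; I < Lanes; ++I)
    Mask[I] = Base + I < TC;
  return Mask;
}

// Models llvm.vector.extract: take the VF-lane slice starting at VF * Part.
std::vector<bool> extractPart(const std::vector<bool> &Wide, unsigned VF,
                              unsigned Part) {
  return {Wide.begin() + VF * Part, Wide.begin() + VF * (Part + 1)};
}

With VF = 4 and UF = 2, one call to activeLaneMask(Index, TC, 8) stands in for the single wide get.active.lane.mask, which on SVE2.1 can be lowered to one whilelo producing a predicate pair; the two extractPart slices feed the per-part masked operations.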

File tree: 10 files changed, +648 −22 lines changed


llvm/lib/Analysis/VectorUtils.cpp

Lines changed: 2 additions & 0 deletions
@@ -166,6 +166,7 @@ bool llvm::isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID,
   case Intrinsic::is_fpclass:
   case Intrinsic::vp_is_fpclass:
   case Intrinsic::powi:
+  case Intrinsic::vector_extract:
     return (ScalarOpdIdx == 1);
   case Intrinsic::smul_fix:
   case Intrinsic::smul_fix_sat:
@@ -200,6 +201,7 @@ bool llvm::isVectorIntrinsicWithOverloadTypeAtArg(
   case Intrinsic::vp_llrint:
   case Intrinsic::ucmp:
   case Intrinsic::scmp:
+  case Intrinsic::vector_extract:
     return OpdIdx == -1 || OpdIdx == 0;
   case Intrinsic::modf:
   case Intrinsic::sincos:
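These two additions teach the vectorizer's intrinsic utilities about llvm.vector.extract, which the new transform emits: the extract index (operand 1) must stay scalar, and the intrinsic is overloaded on its return type and on its source-vector operand, matching the declaration @llvm.vector.extract.<retty>.<srcty>(<src> %vec, i64 %idx). A sketch of what the predicates now report, assuming the mainline signatures in which both helpers also take a TargetTransformInfo pointer:

#include "llvm/Analysis/VectorUtils.h"
using namespace llvm;

void vectorExtractProperties() {
  // Operand 1, the extract index, remains a scalar operand when widening.
  bool IdxIsScalar = isVectorIntrinsicWithScalarOpAtArg(
      Intrinsic::vector_extract, /*ScalarOpdIdx=*/1, /*TTI=*/nullptr);
  // Overloaded on the return type (OpdIdx == -1) and source vector (0).
  bool OverloadsRet = isVectorIntrinsicWithOverloadTypeAtArg(
      Intrinsic::vector_extract, /*OpdIdx=*/-1, /*TTI=*/nullptr);
  (void)IdxIsScalar;
  (void)OverloadsRet;
}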

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 9 additions & 2 deletions
@@ -4220,9 +4220,16 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
         }
       }
     }
-      [[fallthrough]];
+      C += VPI->cost(VF, CostCtx);
+      break;
+    }
+    case VPInstruction::ActiveLaneMask: {
+      unsigned Multiplier =
+          cast<ConstantInt>(VPI->getOperand(2)->getLiveInIRValue())
+              ->getZExtValue();
+      C += VPI->cost(VF * Multiplier, CostCtx);
+      break;
     }
-    case VPInstruction::ActiveLaneMask:
     case VPInstruction::ExplicitVectorLength:
       C += VPI->cost(VF, CostCtx);
       break;

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 7 additions & 0 deletions
@@ -979,6 +979,10 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
     Not,
     SLPLoad,
     SLPStore,
+    // Creates a mask where each lane is active (true) whilst the current
+    // counter (first operand + index) is less than the second operand, i.e.
+    //   mask[i] = icmp ult (op0 + i), op1
+    // The size of the mask returned is VF * Multiplier (UF, third op).
     ActiveLaneMask,
     ExplicitVectorLength,
     CalculateTripCountMinusVF,
@@ -2003,6 +2007,9 @@ class LLVM_ABI_FOR_TEST VPHeaderPHIRecipe : public VPSingleDefRecipe,
     return getOperand(1);
   }

+  /// Update the incoming value from the loop backedge.
+  void setBackedgeValue(VPValue *V) { setOperand(1, V); }
+
   /// Returns the backedge value as a recipe. The backedge value is guaranteed
   /// to be a recipe.
   virtual VPRecipeBase &getBackedgeRecipe() {
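A worked instance of the semantics documented in the new ActiveLaneMask comment, using made-up values:

#include <cassert>

void activeLaneMaskExample() {
  // With op0 = 6, op1 = 10, VF = 4 and Multiplier = 2, the 8-lane result is
  // {1,1,1,1,0,0,0,0}: lanes 0..3 test 6..9 < 10 (true) and lanes 4..7 test
  // 10..13 < 10 (false).
  for (unsigned I = 0; I < 8; ++I)
    assert(((6 + I) < 10) == (I < 4) && "mask[i] = icmp ult (op0 + i), op1");
}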

llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h

Lines changed: 4 additions & 4 deletions
@@ -326,10 +326,10 @@ m_ExtractLastElement(const Op0_t &Op0) {
   return m_VPInstruction<VPInstruction::ExtractLastElement>(Op0);
 }

-template <typename Op0_t, typename Op1_t>
-inline VPInstruction_match<VPInstruction::ActiveLaneMask, Op0_t, Op1_t>
-m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1) {
-  return m_VPInstruction<VPInstruction::ActiveLaneMask>(Op0, Op1);
+template <typename Op0_t, typename Op1_t, typename Op2_t>
+inline VPInstruction_match<VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t>
+m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2) {
+  return m_VPInstruction<VPInstruction::ActiveLaneMask>(Op0, Op1, Op2);
 }

 template <typename Op0_t, typename Op1_t>
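With the matcher now taking three operand patterns, every caller spells out the multiplier even when it does not care about its value. A sketch of the two typical uses (V stands for some VPValue * under inspection):

VPValue *IV = nullptr, *TC = nullptr, *Mul = nullptr;
// Capture all three operands, accepting any multiplier.
if (match(V, m_ActiveLaneMask(m_VPValue(IV), m_VPValue(TC), m_VPValue(Mul)))) {
  // IV, TC and Mul are bound to the three operands here.
}
// Or insist on the non-wide form, as isHeaderMask does further down.
if (match(V, m_ActiveLaneMask(m_VPValue(IV), m_VPValue(TC), m_SpecificInt(1)))) {
  // Only matches when the multiplier is a live-in constant 1.
}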

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 7 additions & 3 deletions
@@ -471,7 +471,6 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
   case Instruction::ICmp:
   case Instruction::FCmp:
   case Instruction::Store:
-  case VPInstruction::ActiveLaneMask:
   case VPInstruction::BranchOnCount:
   case VPInstruction::ComputeReductionResult:
   case VPInstruction::FirstOrderRecurrenceSplice:
@@ -481,6 +480,7 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
   case VPInstruction::WideIVStep:
     return 2;
   case Instruction::Select:
+  case VPInstruction::ActiveLaneMask:
   case VPInstruction::ComputeAnyOfResult:
   case VPInstruction::ReductionStartVector:
     return 3;
@@ -620,7 +620,9 @@ Value *VPInstruction::generate(VPTransformState &State) {
                                  Name);

   auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
-  auto *PredTy = VectorType::get(Int1Ty, State.VF);
+  auto *PredTy = VectorType::get(
+      Int1Ty, State.VF * cast<ConstantInt>(getOperand(2)->getLiveInIRValue())
+                  ->getZExtValue());
   return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
                                  {PredTy, ScalarTC->getType()},
                                  {VIVElem0, ScalarTC}, nullptr, Name);
@@ -1091,7 +1093,9 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
   }
   case VPInstruction::ActiveLaneMask: {
     Type *ArgTy = Ctx.Types.inferScalarType(getOperand(0));
-    Type *RetTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF);
+    unsigned Multiplier =
+        cast<ConstantInt>(getOperand(2)->getLiveInIRValue())->getZExtValue();
+    Type *RetTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF * Multiplier);
     IntrinsicCostAttributes Attrs(Intrinsic::get_active_lane_mask, RetTy,
                                   {ArgTy, ArgTy});
     return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
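Both code generation and costing now derive the predicate type from VF scaled by the multiplier rather than from VF alone. A short sketch (Ctx stands for some LLVMContext; the arithmetic mirrors the State.VF * Multiplier expression above):

// With VF = <vscale x 4> and Multiplier = 2, the mask type widens from
// <vscale x 4 x i1> to <vscale x 8 x i1>.
ElementCount VF = ElementCount::getScalable(4);
unsigned Multiplier = 2;
auto *PredTy = VectorType::get(Type::getInt1Ty(Ctx), VF * Multiplier);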

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 117 additions & 12 deletions
@@ -40,6 +40,10 @@
 using namespace llvm;
 using namespace VPlanPatternMatch;

+cl::opt<bool> EnableWideActiveLaneMask(
+    "enable-wide-lane-mask", cl::init(false), cl::Hidden,
+    cl::desc("Enable use of wide get active lane mask instructions"));
+
 bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
     VPlanPtr &Plan,
     function_ref<const InductionDescriptor *(PHINode *)>
@@ -1475,6 +1479,102 @@ static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan,
   return SE.isKnownPredicate(CmpInst::ICMP_EQ, VectorTripCount, C);
 }

+/// Try to replace multiple active lane masks used for control flow with
+/// a single, wide active lane mask instruction followed by multiple
+/// extract subvector intrinsics. This applies to the active lane mask
+/// instructions both in the loop and in the preheader.
+/// Incoming values of all ActiveLaneMaskPHIs are updated to use the
+/// new extracts from the first active lane mask, which has its last
+/// operand (multiplier) set to UF.
+static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF,
+                                       unsigned UF) {
+  if (!EnableWideActiveLaneMask || !VF.isVector() || UF == 1)
+    return false;
+
+  VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
+  VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
+  auto *Term = &ExitingVPBB->back();
+
+  using namespace llvm::VPlanPatternMatch;
+  if (!match(Term, m_BranchOnCond(m_Not(m_ActiveLaneMask(
+                       m_VPValue(), m_VPValue(), m_VPValue())))))
+    return false;
+
+  auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
+  LLVMContext &Ctx = Plan.getContext();
+
+  auto ExtractFromALM = [&](VPInstruction *ALM,
+                            SmallVectorImpl<VPValue *> &Extracts) {
+    DebugLoc DL = ALM->getDebugLoc();
+    for (unsigned Part = 0; Part < UF; ++Part) {
+      SmallVector<VPValue *> Ops;
+      Ops.append({ALM, Plan.getOrAddLiveIn(
+                           ConstantInt::get(IntegerType::getInt64Ty(Ctx),
+                                            VF.getKnownMinValue() * Part))});
+      auto *Ext = new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops,
+                                             IntegerType::getInt1Ty(Ctx), DL);
+      Extracts[Part] = Ext;
+      Ext->insertAfter(ALM);
+    }
+  };
+
+  // Create a list of each active lane mask phi, ordered by unroll part.
+  SmallVector<VPActiveLaneMaskPHIRecipe *> Phis(UF, nullptr);
+  for (VPRecipeBase &R : Header->phis()) {
+    auto *Phi = dyn_cast<VPActiveLaneMaskPHIRecipe>(&R);
+    if (!Phi)
+      continue;
+    VPValue *Index = nullptr;
+    match(Phi->getBackedgeValue(),
+          m_ActiveLaneMask(m_VPValue(Index), m_VPValue(), m_VPValue()));
+    assert(Index && "Expected index from ActiveLaneMask instruction");
+
+    auto *II = dyn_cast<VPInstruction>(Index);
+    if (II && II->getOpcode() == VPInstruction::CanonicalIVIncrementForPart) {
+      auto Part = cast<ConstantInt>(II->getOperand(1)->getLiveInIRValue());
+      Phis[Part->getZExtValue()] = Phi;
+    } else
+      // Anything other than a CanonicalIVIncrementForPart is part 0.
+      Phis[0] = Phi;
+  }
+
+  assert(all_of(Phis, [](VPActiveLaneMaskPHIRecipe *Phi) { return Phi; }) &&
+         "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");
+
+  auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue());
+  auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue());
+
+  assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask &&
+          LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) &&
+         "Expected incoming values of Phi to be ActiveLaneMasks");
+
+  // When using wide lane masks, the return type of the get.active.lane.mask
+  // intrinsic is VF x UF (last operand).
+  VPValue *ALMMultiplier =
+      Plan.getOrAddLiveIn(ConstantInt::get(IntegerType::getInt64Ty(Ctx), UF));
+  EntryALM->setOperand(2, ALMMultiplier);
+  LoopALM->setOperand(2, ALMMultiplier);
+
+  // Create UF x extract vectors and insert into preheader.
+  SmallVector<VPValue *> EntryExtracts(UF);
+  ExtractFromALM(EntryALM, EntryExtracts);
+
+  // Create UF x extract vectors and insert before the loop compare & branch,
+  // updating the compare to use the first extract.
+  SmallVector<VPValue *> LoopExtracts(UF);
+  ExtractFromALM(LoopALM, LoopExtracts);
+  VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0));
+  Not->setOperand(0, LoopExtracts[0]);
+
+  // Update the incoming values of active lane mask phis.
+  for (unsigned Part = 0; Part < UF; ++Part) {
+    Phis[Part]->setStartValue(EntryExtracts[Part]);
+    Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
+  }
+
+  return true;
+}
+
 /// Try to simplify the branch condition of \p Plan. This may restrict the
 /// resulting plan to \p BestVF and \p BestUF.
 static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
@@ -1486,8 +1586,8 @@ static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
   VPValue *Cond;
   ScalarEvolution &SE = *PSE.getSE();
   if (match(Term, m_BranchOnCount(m_VPValue(), m_VPValue())) ||
-      match(Term, m_BranchOnCond(
-                      m_Not(m_ActiveLaneMask(m_VPValue(), m_VPValue()))))) {
+      match(Term, m_BranchOnCond(m_Not(m_ActiveLaneMask(
+                      m_VPValue(), m_VPValue(), m_VPValue()))))) {
     // Try to simplify the branch condition if TC <= VF * UF when the latch
     // terminator is BranchOnCount or BranchOnCond where the input is
     // Not(ActiveLaneMask).
@@ -1566,8 +1666,8 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
   assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
   assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");

-  bool MadeChange =
-      simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
+  bool MadeChange = tryToReplaceALMWithWideALM(Plan, BestVF, BestUF);
+  MadeChange |= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
   MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);

   if (MadeChange) {
@@ -2050,9 +2150,11 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
                                  "index.part.next");

   // Create the active lane mask instruction in the VPlan preheader.
-  auto *EntryALM =
-      Builder.createNaryOp(VPInstruction::ActiveLaneMask, {EntryIncrement, TC},
-                           DL, "active.lane.mask.entry");
+  VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
+      ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
+  auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
+                                        {EntryIncrement, TC, ALMMultiplier}, DL,
+                                        "active.lane.mask.entry");

   // Now create the ActiveLaneMaskPhi recipe in the main loop using the
   // preheader ActiveLaneMask instruction.
@@ -2067,8 +2169,8 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
       Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart,
                                   {IncrementValue}, {false, false}, DL);
   auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
-                                   {InLoopIncrement, TripCount}, DL,
-                                   "active.lane.mask.next");
+                                   {InLoopIncrement, TripCount, ALMMultiplier},
+                                   DL, "active.lane.mask.next");
   LaneMaskPhi->addOperand(ALM);

   // Replace the original terminator with BranchOnCond. We have to invert the
@@ -2144,9 +2246,12 @@ void VPlanTransforms::addActiveLaneMask(
         Plan, DataAndControlFlowWithoutRuntimeCheck);
   } else {
     VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
-    LaneMask = B.createNaryOp(VPInstruction::ActiveLaneMask,
-                              {WideCanonicalIV, Plan.getTripCount()}, nullptr,
-                              "active.lane.mask");
+    VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
+        ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1));
+    LaneMask =
+        B.createNaryOp(VPInstruction::ActiveLaneMask,
+                       {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
+                       nullptr, "active.lane.mask");
   }

   // Walk users of WideCanonicalIV and replace the header mask of the form
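Tying the pieces together with the scalar model sketched under the commit message (again illustrative only, reusing activeLaneMask and extractPart from there): with VF = 4 and UF = 2, each transformed iteration computes one wide mask plus two slices instead of two independent masks, and the exit branch tests the first slice:

void loopIterationModel(uint64_t Index, uint64_t TripCount) {
  const unsigned VF = 4, UF = 2;
  // One wide (VF * UF)-lane mask for the next iteration...
  std::vector<bool> Wide = activeLaneMask(Index, TripCount, VF * UF);
  // ...sliced into the per-part masks that feed the UF lane-mask phis.
  std::vector<bool> Part0 = extractPart(Wide, VF, /*Part=*/0);
  std::vector<bool> Part1 = extractPart(Wide, VF, /*Part=*/1);
  // BranchOnCond(Not(mask)) exits once the first slice is all-false; lane 0
  // suffices because active lanes always form a prefix.
  bool ContinueLoop = Part0[0];
  (void)Part1;
  (void)ContinueLoop;
}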

llvm/lib/Transforms/Vectorize/VPlanUtils.cpp

Lines changed: 1 addition & 1 deletion
@@ -65,7 +65,7 @@ bool vputils::isHeaderMask(const VPValue *V, VPlan &Plan) {
   VPValue *A, *B;
   using namespace VPlanPatternMatch;

-  if (match(V, m_ActiveLaneMask(m_VPValue(A), m_VPValue(B))))
+  if (match(V, m_ActiveLaneMask(m_VPValue(A), m_VPValue(B), m_SpecificInt(1))))
     return B == Plan.getTripCount() &&
            (match(A, m_ScalarIVSteps(m_Specific(Plan.getCanonicalIV()),
                                      m_SpecificInt(1),
