40
40
using namespace llvm ;
41
41
using namespace VPlanPatternMatch ;
42
42
43
// Hidden command-line flag, off by default, that gates the use of wide
// get-active-lane-mask instructions. tryToReplaceALMWithWideALM returns
// without changing the plan unless this flag is set.
+ cl::opt<bool > EnableWideActiveLaneMask (
44
+ " enable-wide-lane-mask" , cl::init(false ), cl::Hidden,
45
+ cl::desc(" Enable use of wide get active lane mask instructions" ));
46
+
43
47
bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes (
44
48
VPlanPtr &Plan,
45
49
function_ref<const InductionDescriptor *(PHINode *)>
@@ -1475,6 +1479,102 @@ static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan,
1475
1479
return SE.isKnownPredicate (CmpInst::ICMP_EQ, VectorTripCount, C);
1476
1480
}
1477
1481
1482
+ // / Try to replace multiple active lane masks used for control flow with
1483
+ // / a single, wide active lane mask instruction followed by multiple
1484
+ // / extract subvector intrinsics. This applies to the active lane mask
1485
+ // / instructions both in the loop and in the preheader.
1486
+ // / Incoming values of all ActiveLaneMaskPHIs are updated to use the
1487
+ // / new extracts from the first active lane mask, which has it's last
1488
+ // / operand (multiplier) set to UF.
1489
+ static bool tryToReplaceALMWithWideALM (VPlan &Plan, ElementCount VF,
1490
+ unsigned UF) {
1491
+ if (!EnableWideActiveLaneMask || !VF.isVector () || UF == 1 )
1492
+ return false ;
1493
+
1494
+ VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion ();
1495
+ VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock ();
1496
+ auto *Term = &ExitingVPBB->back ();
1497
+
1498
+ using namespace llvm ::VPlanPatternMatch;
1499
+ if (!match (Term, m_BranchOnCond (m_Not (m_ActiveLaneMask (
1500
+ m_VPValue (), m_VPValue (), m_VPValue ())))))
1501
+ return false ;
1502
+
1503
+ auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry ());
1504
+ LLVMContext &Ctx = Plan.getContext ();
1505
+
1506
+ auto ExtractFromALM = [&](VPInstruction *ALM,
1507
+ SmallVectorImpl<VPValue *> &Extracts) {
1508
+ DebugLoc DL = ALM->getDebugLoc ();
1509
+ for (unsigned Part = 0 ; Part < UF; ++Part) {
1510
+ SmallVector<VPValue *> Ops;
1511
+ Ops.append ({ALM, Plan.getOrAddLiveIn (
1512
+ ConstantInt::get (IntegerType::getInt64Ty (Ctx),
1513
+ VF.getKnownMinValue () * Part))});
1514
+ auto *Ext = new VPWidenIntrinsicRecipe (Intrinsic::vector_extract, Ops,
1515
+ IntegerType::getInt1Ty (Ctx), DL);
1516
+ Extracts[Part] = Ext;
1517
+ Ext->insertAfter (ALM);
1518
+ }
1519
+ };
1520
+
1521
+ // Create a list of each active lane mask phi, ordered by unroll part.
1522
+ SmallVector<VPActiveLaneMaskPHIRecipe *> Phis (UF, nullptr );
1523
+ for (VPRecipeBase &R : Header->phis ()) {
1524
+ auto *Phi = dyn_cast<VPActiveLaneMaskPHIRecipe>(&R);
1525
+ if (!Phi)
1526
+ continue ;
1527
+ VPValue *Index = nullptr ;
1528
+ match (Phi->getBackedgeValue (),
1529
+ m_ActiveLaneMask (m_VPValue (Index), m_VPValue (), m_VPValue ()));
1530
+ assert (Index && " Expected index from ActiveLaneMask instruction" );
1531
+
1532
+ auto *II = dyn_cast<VPInstruction>(Index);
1533
+ if (II && II->getOpcode () == VPInstruction::CanonicalIVIncrementForPart) {
1534
+ auto Part = cast<ConstantInt>(II->getOperand (1 )->getLiveInIRValue ());
1535
+ Phis[Part->getZExtValue ()] = Phi;
1536
+ } else
1537
+ // Anything other than a CanonicalIVIncrementForPart is part 0
1538
+ Phis[0 ] = Phi;
1539
+ }
1540
+
1541
+ assert (all_of (Phis, [](VPActiveLaneMaskPHIRecipe *Phi) { return Phi; }) &&
1542
+ " Expected one VPActiveLaneMaskPHIRecipe for each unroll part" );
1543
+
1544
+ auto *EntryALM = cast<VPInstruction>(Phis[0 ]->getStartValue ());
1545
+ auto *LoopALM = cast<VPInstruction>(Phis[0 ]->getBackedgeValue ());
1546
+
1547
+ assert ((EntryALM->getOpcode () == VPInstruction::ActiveLaneMask &&
1548
+ LoopALM->getOpcode () == VPInstruction::ActiveLaneMask) &&
1549
+ " Expected incoming values of Phi to be ActiveLaneMasks" );
1550
+
1551
+ // When using wide lane masks, the return type of the get.active.lane.mask
1552
+ // intrinsic is VF x UF (last operand).
1553
+ VPValue *ALMMultiplier =
1554
+ Plan.getOrAddLiveIn (ConstantInt::get (IntegerType::getInt64Ty (Ctx), UF));
1555
+ EntryALM->setOperand (2 , ALMMultiplier);
1556
+ LoopALM->setOperand (2 , ALMMultiplier);
1557
+
1558
+ // Create UF x extract vectors and insert into preheader.
1559
+ SmallVector<VPValue *> EntryExtracts (UF);
1560
+ ExtractFromALM (EntryALM, EntryExtracts);
1561
+
1562
+ // Create UF x extract vectors and insert before the loop compare & branch,
1563
+ // updating the compare to use the first extract.
1564
+ SmallVector<VPValue *> LoopExtracts (UF);
1565
+ ExtractFromALM (LoopALM, LoopExtracts);
1566
+ VPInstruction *Not = cast<VPInstruction>(Term->getOperand (0 ));
1567
+ Not->setOperand (0 , LoopExtracts[0 ]);
1568
+
1569
+ // Update the incoming values of active lane mask phis.
1570
+ for (unsigned Part = 0 ; Part < UF; ++Part) {
1571
+ Phis[Part]->setStartValue (EntryExtracts[Part]);
1572
+ Phis[Part]->setBackedgeValue (LoopExtracts[Part]);
1573
+ }
1574
+
1575
+ return true ;
1576
+ }
1577
+
1478
1578
/// Try to simplify the branch condition of \p Plan. This may restrict the
1479
1579
/// resulting plan to \p BestVF and \p BestUF.
1480
1580
static bool simplifyBranchConditionForVFAndUF (VPlan &Plan, ElementCount BestVF,
@@ -1486,8 +1586,8 @@ static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
1486
1586
VPValue *Cond;
1487
1587
ScalarEvolution &SE = *PSE.getSE ();
1488
1588
if (match (Term, m_BranchOnCount (m_VPValue (), m_VPValue ())) ||
1489
- match (Term, m_BranchOnCond (
1490
- m_Not ( m_ActiveLaneMask ( m_VPValue (), m_VPValue ()))))) {
1589
+ match (Term, m_BranchOnCond (m_Not ( m_ActiveLaneMask (
1590
+ m_VPValue (), m_VPValue (), m_VPValue ()))))) {
1491
1591
// Try to simplify the branch condition if TC <= VF * UF when the latch
1492
1592
// terminator is BranchOnCount or BranchOnCond where the input is
1493
1593
// Not(ActiveLaneMask).
@@ -1566,8 +1666,8 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
1566
1666
assert (Plan.hasVF (BestVF) && " BestVF is not available in Plan" );
1567
1667
assert (Plan.hasUF (BestUF) && " BestUF is not available in Plan" );
1568
1668
1569
- bool MadeChange =
1570
- simplifyBranchConditionForVFAndUF (Plan, BestVF, BestUF, PSE);
1669
+ bool MadeChange = tryToReplaceALMWithWideALM (Plan, BestVF, BestUF);
1670
+ MadeChange |= simplifyBranchConditionForVFAndUF (Plan, BestVF, BestUF, PSE);
1571
1671
MadeChange |= optimizeVectorInductionWidthForTCAndVFUF (Plan, BestVF, BestUF);
1572
1672
1573
1673
if (MadeChange) {
@@ -2050,9 +2150,11 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
2050
2150
" index.part.next" );
2051
2151
2052
2152
// Create the active lane mask instruction in the VPlan preheader.
2053
- auto *EntryALM =
2054
- Builder.createNaryOp (VPInstruction::ActiveLaneMask, {EntryIncrement, TC},
2055
- DL, " active.lane.mask.entry" );
2153
+ VPValue *ALMMultiplier = Plan.getOrAddLiveIn (
2154
+ ConstantInt::get (Plan.getCanonicalIV ()->getScalarType (), 1 ));
2155
+ auto *EntryALM = Builder.createNaryOp (VPInstruction::ActiveLaneMask,
2156
+ {EntryIncrement, TC, ALMMultiplier}, DL,
2157
+ " active.lane.mask.entry" );
2056
2158
2057
2159
// Now create the ActiveLaneMaskPhi recipe in the main loop using the
2058
2160
// preheader ActiveLaneMask instruction.
@@ -2067,8 +2169,8 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
2067
2169
Builder.createOverflowingOp (VPInstruction::CanonicalIVIncrementForPart,
2068
2170
{IncrementValue}, {false , false }, DL);
2069
2171
auto *ALM = Builder.createNaryOp (VPInstruction::ActiveLaneMask,
2070
- {InLoopIncrement, TripCount}, DL ,
2071
- " active.lane.mask.next" );
2172
+ {InLoopIncrement, TripCount, ALMMultiplier} ,
2173
+ DL, " active.lane.mask.next" );
2072
2174
LaneMaskPhi->addOperand (ALM);
2073
2175
2074
2176
// Replace the original terminator with BranchOnCond. We have to invert the
@@ -2144,9 +2246,12 @@ void VPlanTransforms::addActiveLaneMask(
2144
2246
Plan, DataAndControlFlowWithoutRuntimeCheck);
2145
2247
} else {
2146
2248
VPBuilder B = VPBuilder::getToInsertAfter (WideCanonicalIV);
2147
- LaneMask = B.createNaryOp (VPInstruction::ActiveLaneMask,
2148
- {WideCanonicalIV, Plan.getTripCount ()}, nullptr ,
2149
- " active.lane.mask" );
2249
+ VPValue *ALMMultiplier = Plan.getOrAddLiveIn (
2250
+ ConstantInt::get (Plan.getCanonicalIV ()->getScalarType (), 1 ));
2251
+ LaneMask =
2252
+ B.createNaryOp (VPInstruction::ActiveLaneMask,
2253
+ {WideCanonicalIV, Plan.getTripCount (), ALMMultiplier},
2254
+ nullptr , " active.lane.mask" );
2150
2255
}
2151
2256
2152
2257
// Walk users of WideCanonicalIV and replace the header mask of the form
0 commit comments