@@ -264,11 +264,6 @@ static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
264
264
" enable-masked-interleaved-mem-accesses" , cl::init(false ), cl::Hidden,
265
265
cl::desc(" Enable vectorization on masked interleaved memory accesses in a loop" ));
266
266
267
- static cl::opt<unsigned > TinyTripCountInterleaveThreshold (
268
- " tiny-trip-count-interleave-threshold" , cl::init(128 ), cl::Hidden,
269
- cl::desc(" We don't interleave loops with a estimated constant trip count "
270
- " below this number" ));
271
-
272
267
static cl::opt<unsigned > ForceTargetNumScalarRegs (
273
268
" force-target-num-scalar-regs" , cl::init(0 ), cl::Hidden,
274
269
cl::desc(" A flag that overrides the target's number of scalar registers." ));
@@ -316,12 +311,6 @@ static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
316
311
cl::desc(
317
312
" Enable runtime interleaving until load/store ports are saturated" ));
318
313
319
- /// Interleave small loops with scalar reductions.
320
- static cl::opt<bool > InterleaveSmallLoopScalarReduction (
321
- " interleave-small-loop-scalar-reduction" , cl::init(false ), cl::Hidden,
322
- cl::desc(" Enable interleaving for loops with small iteration counts that "
323
- " contain scalar reductions to expose ILP." ));
324
-
325
314
/// The number of stores in a loop that are allowed to need predication.
326
315
static cl::opt<unsigned > NumberOfStoresToPredicate (
327
316
" vectorize-num-stores-pred" , cl::init(1 ), cl::Hidden,
@@ -5823,14 +5812,6 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5823
5812
5824
5813
auto BestKnownTC = getSmallBestKnownTC (*PSE.getSE (), TheLoop);
5825
5814
const bool HasReductions = !Legal->getReductionVars ().empty ();
5826
- // Do not interleave loops with a relatively small known or estimated trip
5827
- // count. But we will interleave when InterleaveSmallLoopScalarReduction is
5828
- // enabled, and the code has scalar reductions(HasReductions && VF = 1),
5829
- // because with the above conditions interleaving can expose ILP and break
5830
- // cross iteration dependences for reductions.
5831
- if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
5832
- !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar ()))
5833
- return 1 ;
5834
5815
5835
5816
// If we did not calculate the cost for VF (because the user selected the VF)
5836
5817
// then we calculate the cost of VF here.
@@ -5903,21 +5884,58 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5903
5884
MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5904
5885
}
5905
5886
5906
- // If trip count is known or estimated compile time constant, limit the
5907
- // interleave count to be less than the trip count divided by VF, provided it
5908
- // is at least 1.
5909
- //
5910
- // For scalable vectors we can't know if interleaving is beneficial. It may
5911
- // not be beneficial for small loops if none of the lanes in the second vector
5912
- // iterations is enabled. However, for larger loops, there is likely to be a
5913
- // similar benefit as for fixed-width vectors. For now, we choose to leave
5914
- // the InterleaveCount as if vscale is '1', although if some information about
5915
- // the vector is known (e.g. min vector size), we can make a better decision.
5916
- if (BestKnownTC) {
5917
- MaxInterleaveCount =
5918
- std::min (*BestKnownTC / VF.getKnownMinValue (), MaxInterleaveCount);
5919
- // Make sure MaxInterleaveCount is greater than 0.
5920
- MaxInterleaveCount = std::max (1u , MaxInterleaveCount);
5887
+ unsigned EstimatedVF = VF.getKnownMinValue ();
5888
+ if (VF.isScalable ()) {
5889
+ if (std::optional<unsigned > VScale = getVScaleForTuning (TheLoop, TTI))
5890
+ EstimatedVF *= *VScale;
5891
+ }
5892
+ assert (EstimatedVF >= 1 && " Estimated VF shouldn't be less than 1" );
5893
+
5894
+ unsigned KnownTC = PSE.getSE ()->getSmallConstantTripCount (TheLoop);
5895
+ if (KnownTC > 0 ) {
5896
+ // At least one iteration must be scalar when this constraint holds. So the
5897
+ // maximum number of iterations available for interleaving is one less.
5898
+ unsigned AvailableTC =
5899
+ requiresScalarEpilogue (VF.isVector ()) ? KnownTC - 1 : KnownTC;
5900
+
5901
+ // If the trip count is known, we select between two prospective ICs, where
5902
+ // 1) the aggressive IC is capped by the trip count divided by VF
5903
+ // 2) the conservative IC is capped by the trip count divided by (VF * 2)
5904
+ // The final IC is selected in a way that the epilogue loop trip count is
5905
+ // minimized while maximizing the IC itself, so that we either run the
5906
+ // vector loop at least once if it generates a small epilogue loop, or else
5907
+ // we run the vector loop at least twice.
5908
+
5909
+ unsigned InterleaveCountUB = bit_floor (
5910
+ std::max (1u , std::min (AvailableTC / EstimatedVF, MaxInterleaveCount)));
5911
+ unsigned InterleaveCountLB = bit_floor (std::max (
5912
+ 1u , std::min (AvailableTC / (EstimatedVF * 2 ), MaxInterleaveCount)));
5913
+ MaxInterleaveCount = InterleaveCountLB;
5914
+
5915
+ if (InterleaveCountUB != InterleaveCountLB) {
5916
+ unsigned TailTripCountUB =
5917
+ (AvailableTC % (EstimatedVF * InterleaveCountUB));
5918
+ unsigned TailTripCountLB =
5919
+ (AvailableTC % (EstimatedVF * InterleaveCountLB));
5920
+ // If both produce the same scalar tail, maximize the IC to do the same work
5921
+ // in fewer vector loop iterations.
5922
+ if (TailTripCountUB == TailTripCountLB)
5923
+ MaxInterleaveCount = InterleaveCountUB;
5924
+ }
5925
+ } else if (BestKnownTC && *BestKnownTC > 0 ) {
5926
+ // At least one iteration must be scalar when this constraint holds. So the
5927
+ // maximum number of iterations available for interleaving is one less.
5928
+ unsigned AvailableTC = requiresScalarEpilogue (VF.isVector ())
5929
+ ? (*BestKnownTC) - 1
5930
+ : *BestKnownTC;
5931
+
5932
+ // If trip count is an estimated compile time constant, limit the
5933
+ // IC to be capped by the trip count divided by VF * 2, such that the vector
5934
+ // loop runs at least twice to make interleaving seem profitable when there
5935
+ // is an epilogue loop present. Since the exact trip count is not known, we
5936
+ // choose to be conservative in our IC estimate.
5937
+ MaxInterleaveCount = bit_floor (std::max (
5938
+ 1u , std::min (AvailableTC / (EstimatedVF * 2 ), MaxInterleaveCount)));
5921
5939
}
5922
5940
5923
5941
assert (MaxInterleaveCount > 0 &&
@@ -6021,8 +6039,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
6021
6039
6022
6040
// If there are scalar reductions and TTI has enabled aggressive
6023
6041
// interleaving for reductions, we will interleave to expose ILP.
6024
- if (InterleaveSmallLoopScalarReduction && VF.isScalar () &&
6025
- AggressivelyInterleaveReductions) {
6042
+ if (VF.isScalar () && AggressivelyInterleaveReductions) {
6026
6043
LLVM_DEBUG (dbgs () << " LV: Interleaving to expose ILP.\n " );
6027
6044
// Interleave no less than SmallIC but not as aggressive as the normal IC
6028
6045
// to satisfy the rare situation when resources are too limited.
0 commit comments