Skip to content

Commit 8277598

Browse files
authored
JIT: Tune CCMP for better Perf (#116445)
This enables more paths for CCMP by doing the following - Control when and how many switches are converted to CCMP - A switch conversion can span across blocks but CCMP did not check across blocks and hence converted potential switch candidates to ccmp partially hence reducing the effectiveness of switch. This has been handled in this PR to make sure existing switches do not regress. - Let more candidates for CCMP go through lowering - There were priori conditions for a CCMP to happen. This PR enables CCMP on more nodes while carefully checking for which node to convert to CCMP.
1 parent 923a346 commit 8277598

File tree

5 files changed

+115
-46
lines changed

5 files changed

+115
-46
lines changed

src/coreclr/jit/compiler.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6985,8 +6985,8 @@ class Compiler
69856985
public:
69866986
PhaseStatus optOptimizeBools();
69876987
PhaseStatus optRecognizeAndOptimizeSwitchJumps();
6988-
bool optSwitchConvert(BasicBlock* firstBlock, int testsCount, ssize_t* testValues, weight_t falseLikelihood, GenTree* nodeToTest);
6989-
bool optSwitchDetectAndConvert(BasicBlock* firstBlock, bool testingForConversion = false);
6988+
bool optSwitchConvert(BasicBlock* firstBlock, int testsCount, ssize_t* testValues, weight_t falseLikelihood, GenTree* nodeToTest, bool testingForConversion = false, BitVec* ccmpVec = nullptr);
6989+
bool optSwitchDetectAndConvert(BasicBlock* firstBlock, bool testingForConversion = false, BitVec* ccmpVec = nullptr);
69906990

69916991
PhaseStatus optInvertLoops(); // Invert loops so they're entered at top and tested at bottom.
69926992
PhaseStatus optOptimizeFlow(); // Simplify flow graph and do tail duplication

src/coreclr/jit/lower.cpp

Lines changed: 28 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11478,6 +11478,22 @@ bool Lowering::TryLowerAndNegativeOne(GenTreeOp* node, GenTree** nextNode)
1147811478
}
1147911479

1148011480
#if defined(TARGET_AMD64) || defined(TARGET_ARM64)
11481+
//------------------------------------------------------------------------
11482+
// CanConvertOpToCCMP : Checks whether operand can be converted to CCMP
11483+
//
11484+
// Arguments:
11485+
// operand - operand to check for CCMP conversion
11486+
// tree - parent of the operand
11487+
//
11488+
// Return Value:
11489+
// true if operand can be converted to CCMP
11490+
//
11491+
bool Lowering::CanConvertOpToCCMP(GenTree* operand, GenTree* tree)
11492+
{
11493+
return operand->OperIsCmpCompare() && varTypeIsIntegralOrI(operand->gtGetOp1()) &&
11494+
IsInvariantInRange(operand, tree);
11495+
}
11496+
1148111497
//------------------------------------------------------------------------
1148211498
// TryLowerAndOrToCCMP : Lower AND/OR of two conditions into test + CCMP + SETCC nodes.
1148311499
//
@@ -11519,16 +11535,22 @@ bool Lowering::TryLowerAndOrToCCMP(GenTreeOp* tree, GenTree** next)
1151911535
// by TryLowerConditionToFlagsNode.
1152011536
//
1152111537
GenCondition cond1;
11522-
if (op2->OperIsCmpCompare() && varTypeIsIntegralOrI(op2->gtGetOp1()) && IsInvariantInRange(op2, tree) &&
11523-
(op2->gtGetOp1()->IsIntegralConst() || !op2->gtGetOp1()->isContained()) &&
11524-
(op2->gtGetOp2() == nullptr || op2->gtGetOp2()->IsIntegralConst() || !op2->gtGetOp2()->isContained()) &&
11538+
bool canConvertOp2ToCCMP = CanConvertOpToCCMP(op2, tree);
11539+
bool canConvertOp1ToCCMP = CanConvertOpToCCMP(op1, tree);
11540+
11541+
if (canConvertOp2ToCCMP &&
11542+
(!canConvertOp1ToCCMP ||
11543+
((op2->gtGetOp1()->IsIntegralConst() || !op2->gtGetOp1()->isContained()) &&
11544+
(op2->gtGetOp2() == nullptr || op2->gtGetOp2()->IsIntegralConst() || !op2->gtGetOp2()->isContained()))) &&
1152511545
TryLowerConditionToFlagsNode(tree, op1, &cond1, false))
1152611546
{
1152711547
// Fall through, converting op2 to the CCMP
1152811548
}
11529-
else if (op1->OperIsCmpCompare() && varTypeIsIntegralOrI(op1->gtGetOp1()) && IsInvariantInRange(op1, tree) &&
11530-
(op1->gtGetOp1()->IsIntegralConst() || !op1->gtGetOp1()->isContained()) &&
11531-
(op1->gtGetOp2() == nullptr || op1->gtGetOp2()->IsIntegralConst() || !op1->gtGetOp2()->isContained()) &&
11549+
else if (canConvertOp1ToCCMP &&
11550+
(!op1->gtGetOp1()->isContained() || op1->gtGetOp1()->IsIntegralConst() ||
11551+
IsContainableMemoryOp(op1->gtGetOp1())) &&
11552+
(op1->gtGetOp2() == nullptr || !op1->gtGetOp2()->isContained() || op1->gtGetOp2()->IsIntegralConst() ||
11553+
IsContainableMemoryOp(op1->gtGetOp2())) &&
1153211554
TryLowerConditionToFlagsNode(tree, op2, &cond1, false))
1153311555
{
1153411556
std::swap(op1, op2);

src/coreclr/jit/lower.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@ class Lowering final : public Phase
8484
void ContainCheckLclHeap(GenTreeOp* node);
8585
void ContainCheckRet(GenTreeUnOp* ret);
8686
#if defined(TARGET_ARM64) || defined(TARGET_AMD64)
87+
bool CanConvertOpToCCMP(GenTree* operand, GenTree* tree);
8788
bool TryLowerAndOrToCCMP(GenTreeOp* tree, GenTree** next);
8889
insCflags TruthifyingFlags(GenCondition cond);
8990
void ContainCheckConditionalCompare(GenTreeCCMP* ccmp);

src/coreclr/jit/optimizebools.cpp

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1576,11 +1576,13 @@ PhaseStatus Compiler::optOptimizeBools()
15761576
printf("*************** In optOptimizeBools()\n");
15771577
}
15781578
#endif
1579-
bool change = false;
1580-
bool retry = false;
1581-
unsigned numCond = 0;
1582-
unsigned numPasses = 0;
1583-
unsigned stress = false;
1579+
bool change = false;
1580+
bool retry = false;
1581+
unsigned numCond = 0;
1582+
unsigned numPasses = 0;
1583+
unsigned stress = false;
1584+
BitVecTraits ccmpTraits(fgBBNumMax + 1, this);
1585+
BitVec ccmpVec = BitVecOps::MakeEmpty(&ccmpTraits);
15841586

15851587
do
15861588
{
@@ -1656,7 +1658,7 @@ PhaseStatus Compiler::optOptimizeBools()
16561658
// trigger or not
16571659
// else if ((compOpportunisticallyDependsOn(InstructionSet_APX) || JitConfig.JitEnableApxIfConv()) &&
16581660
// optBoolsDsc.optOptimizeCompareChainCondBlock())
1659-
else if (JitConfig.EnableApxConditionalChaining() && !optSwitchDetectAndConvert(b1, true) &&
1661+
else if (JitConfig.EnableApxConditionalChaining() && !optSwitchDetectAndConvert(b1, true, &ccmpVec) &&
16601662
optBoolsDsc.optOptimizeCompareChainCondBlock())
16611663
{
16621664
// The optimization will have merged b1 and b2. Retry the loop so that

src/coreclr/jit/switchrecognition.cpp

Lines changed: 76 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,9 @@
1111
#define SWITCH_MAX_DISTANCE ((TARGET_POINTER_SIZE * BITS_PER_BYTE) - 1)
1212
#define SWITCH_MIN_TESTS 3
1313

14+
// This is a heuristics based value tuned for optimal performance
15+
#define CONVERT_SWITCH_TO_CCMP_MIN_TEST 5
16+
1417
//-----------------------------------------------------------------------------
1518
// optRecognizeAndOptimizeSwitchJumps: Optimize range check for `x == cns1 || x == cns2 || x == cns3 ...`
1619
// pattern and convert it to a BBJ_SWITCH block (jump table), which then *might* be converted
@@ -160,18 +163,22 @@ bool IsConstantTestCondBlock(const BasicBlock* block,
160163
// testingForConversion - Test if its likely a switch conversion will happen.
161164
// Used to prevent a pessimization when optimizing for conditional chaining.
162165
// Done in this function to prevent maintaining the check in two places.
166+
// ccmpVec - BitVec to use to track all the nodes participating in a single switch
163167
//
164168
// Return Value:
165169
// True if the conversion was successful, false otherwise
166170
//
167-
bool Compiler::optSwitchDetectAndConvert(BasicBlock* firstBlock, bool testingForConversion)
171+
bool Compiler::optSwitchDetectAndConvert(BasicBlock* firstBlock, bool testingForConversion, BitVec* ccmpVec)
168172
{
169173
assert(firstBlock->KindIs(BBJ_COND));
170174

171-
GenTree* variableNode = nullptr;
172-
ssize_t cns = 0;
173-
BasicBlock* trueTarget = nullptr;
174-
BasicBlock* falseTarget = nullptr;
175+
GenTree* variableNode = nullptr;
176+
ssize_t cns = 0;
177+
BasicBlock* trueTarget = nullptr;
178+
BasicBlock* falseTarget = nullptr;
179+
int testValueIndex = 0;
180+
ssize_t testValues[SWITCH_MAX_DISTANCE] = {};
181+
weight_t falseLikelihood = firstBlock->GetFalseEdge()->getLikelihood();
175182

176183
// The algorithm is simple - we check that the given block is a constant test block
177184
// and then try to accumulate as many constant test blocks as possible. Once we hit
@@ -186,17 +193,30 @@ bool Compiler::optSwitchDetectAndConvert(BasicBlock* firstBlock, bool testingFor
186193
// TODO: make it more flexible and support cases like "x != cns1 && x != cns2 && ..."
187194
return false;
188195
}
196+
if (testingForConversion)
197+
{
198+
assert(ccmpVec != nullptr);
199+
BitVecTraits ccmpTraits(fgBBNumMax + 1, this);
200+
if (BitVecOps::IsMember(&ccmpTraits, *ccmpVec, firstBlock->bbNum))
201+
{
202+
BitVecOps::RemoveElemD(&ccmpTraits, *ccmpVec, firstBlock->bbNum);
203+
return true;
204+
}
205+
else
206+
{
207+
BitVecOps::ClearD(&ccmpTraits, *ccmpVec);
208+
}
209+
}
189210

190211
// No more than SWITCH_MAX_TABLE_SIZE blocks are allowed (arbitrary limit in this context)
191-
int testValueIndex = 0;
192-
ssize_t testValues[SWITCH_MAX_DISTANCE] = {};
193-
testValues[testValueIndex] = cns;
212+
testValueIndex = 0;
213+
testValues[testValueIndex] = cns;
194214
testValueIndex++;
195215

196216
// Track likelihood of reaching the false block
197217
//
198-
weight_t falseLikelihood = firstBlock->GetFalseEdge()->getLikelihood();
199-
const BasicBlock* prevBlock = firstBlock;
218+
falseLikelihood = firstBlock->GetFalseEdge()->getLikelihood();
219+
const BasicBlock* prevBlock = firstBlock;
200220

201221
// Now walk the chain of test blocks, and see if they are basically the same type of test
202222
for (BasicBlock *currBb = falseTarget, *currFalseTarget; currBb != nullptr; currBb = currFalseTarget)
@@ -209,8 +229,8 @@ bool Compiler::optSwitchDetectAndConvert(BasicBlock* firstBlock, bool testingFor
209229
{
210230
// Only the first conditional block can have multiple statements.
211231
// Stop searching and process what we already have.
212-
return !testingForConversion &&
213-
optSwitchConvert(firstBlock, testValueIndex, testValues, falseLikelihood, variableNode);
232+
return optSwitchConvert(firstBlock, testValueIndex, testValues, falseLikelihood, variableNode,
233+
testingForConversion, ccmpVec);
214234
}
215235

216236
// Inspect secondary blocks
@@ -220,29 +240,29 @@ bool Compiler::optSwitchDetectAndConvert(BasicBlock* firstBlock, bool testingFor
220240
if (currTrueTarget != trueTarget)
221241
{
222242
// This blocks jumps to a different target, stop searching and process what we already have.
223-
return !testingForConversion &&
224-
optSwitchConvert(firstBlock, testValueIndex, testValues, falseLikelihood, variableNode);
243+
return optSwitchConvert(firstBlock, testValueIndex, testValues, falseLikelihood, variableNode,
244+
testingForConversion, ccmpVec);
225245
}
226246

227247
if (!GenTree::Compare(currVariableNode, variableNode->gtEffectiveVal()))
228248
{
229249
// A different variable node is used, stop searching and process what we already have.
230-
return !testingForConversion &&
231-
optSwitchConvert(firstBlock, testValueIndex, testValues, falseLikelihood, variableNode);
250+
return optSwitchConvert(firstBlock, testValueIndex, testValues, falseLikelihood, variableNode,
251+
testingForConversion, ccmpVec);
232252
}
233253

234254
if (currBb->GetUniquePred(this) != prevBlock)
235255
{
236256
// Multiple preds in a secondary block, stop searching and process what we already have.
237-
return !testingForConversion &&
238-
optSwitchConvert(firstBlock, testValueIndex, testValues, falseLikelihood, variableNode);
257+
return optSwitchConvert(firstBlock, testValueIndex, testValues, falseLikelihood, variableNode,
258+
testingForConversion, ccmpVec);
239259
}
240260

241261
if (!BasicBlock::sameEHRegion(prevBlock, currBb))
242262
{
243263
// Current block is in a different EH region, stop searching and process what we already have.
244-
return !testingForConversion &&
245-
optSwitchConvert(firstBlock, testValueIndex, testValues, falseLikelihood, variableNode);
264+
return optSwitchConvert(firstBlock, testValueIndex, testValues, falseLikelihood, variableNode,
265+
testingForConversion, ccmpVec);
246266
}
247267

248268
// Ok we can work with that, add the test value to the list
@@ -252,27 +272,23 @@ bool Compiler::optSwitchDetectAndConvert(BasicBlock* firstBlock, bool testingFor
252272
if (testValueIndex == SWITCH_MAX_DISTANCE)
253273
{
254274
// Too many suitable tests found - stop and process what we already have.
255-
return !testingForConversion &&
256-
optSwitchConvert(firstBlock, testValueIndex, testValues, falseLikelihood, variableNode);
275+
return optSwitchConvert(firstBlock, testValueIndex, testValues, falseLikelihood, variableNode,
276+
testingForConversion, ccmpVec);
257277
}
258278

259279
if (isReversed)
260280
{
261281
// We only support reversed test (GT_NE) for the last block.
262-
return !testingForConversion &&
263-
optSwitchConvert(firstBlock, testValueIndex, testValues, falseLikelihood, variableNode);
282+
return optSwitchConvert(firstBlock, testValueIndex, testValues, falseLikelihood, variableNode,
283+
testingForConversion, ccmpVec);
264284
}
265-
266-
if (testingForConversion)
267-
return true;
268-
269285
prevBlock = currBb;
270286
}
271287
else
272288
{
273289
// Current block is not a suitable test, stop searching and process what we already have.
274-
return !testingForConversion &&
275-
optSwitchConvert(firstBlock, testValueIndex, testValues, falseLikelihood, variableNode);
290+
return optSwitchConvert(firstBlock, testValueIndex, testValues, falseLikelihood, variableNode,
291+
testingForConversion, ccmpVec);
276292
}
277293
}
278294
}
@@ -292,16 +308,30 @@ bool Compiler::optSwitchDetectAndConvert(BasicBlock* firstBlock, bool testingFor
292308
// testValues - Array of constants that are tested against the variable
293309
// falseLikelihood - Likelihood of control flow reaching the false block
294310
// nodeToTest - Variable node that is tested against the constants
311+
// testingForConversion - Test if its likely a switch conversion will happen.
312+
// Used to prevent a pessimization when optimizing for conditional chaining.
313+
// Done in this function to prevent maintaining the check in two places.
314+
// ccmpVec - BitVec to use to track all the nodes participating in a single switch
295315
//
296316
// Return Value:
297317
// True if the conversion was successful, false otherwise
298318
//
299-
bool Compiler::optSwitchConvert(
300-
BasicBlock* firstBlock, int testsCount, ssize_t* testValues, weight_t falseLikelihood, GenTree* nodeToTest)
319+
bool Compiler::optSwitchConvert(BasicBlock* firstBlock,
320+
int testsCount,
321+
ssize_t* testValues,
322+
weight_t falseLikelihood,
323+
GenTree* nodeToTest,
324+
bool testingForConversion,
325+
BitVec* ccmpVec)
301326
{
302327
assert(firstBlock->KindIs(BBJ_COND));
303328
assert(!varTypeIsSmall(nodeToTest));
304329

330+
if (testingForConversion && (testsCount < CONVERT_SWITCH_TO_CCMP_MIN_TEST))
331+
{
332+
return false;
333+
}
334+
305335
if (testsCount < SWITCH_MIN_TESTS)
306336
{
307337
// Early out - short chains.
@@ -376,6 +406,20 @@ bool Compiler::optSwitchConvert(
376406
FlowEdge* const trueEdge = firstBlock->GetTrueEdge();
377407
FlowEdge* const falseEdge = firstBlock->GetFalseEdge();
378408

409+
if (testingForConversion)
410+
{
411+
assert(ccmpVec != nullptr);
412+
BitVecTraits ccmpTraits(fgBBNumMax + 1, this);
413+
// Return if we are just checking for a possibility of a switch convert and not actually making the conversion
414+
// to switch here.
415+
BasicBlock* iterBlock = firstBlock;
416+
for (int i = 0; i < testsCount; i++)
417+
{
418+
BitVecOps::AddElemD(&ccmpTraits, *ccmpVec, iterBlock->bbNum);
419+
iterBlock = iterBlock->GetFalseTarget();
420+
}
421+
return true;
422+
}
379423
// Convert firstBlock to a switch block
380424
const unsigned jumpCount = static_cast<unsigned>(maxValue - minValue + 1);
381425
assert((jumpCount > 0) && (jumpCount <= SWITCH_MAX_DISTANCE + 1));

0 commit comments

Comments
 (0)