
Commit a29fe42

[AMDGPU][SILowerSGPRSpills] Spill SGPRs to virtual VGPRs
Currently, the custom SGPR spill lowering pass spills SGPRs into physical VGPR lanes, and the remaining VGPRs are used by regalloc for vector-regclass allocation. This imposes enough restrictions that SGPR spilling fails when there are not enough VGPRs, and we are then forced to spill the leftover into memory during PEI. The custom spill handling during PEI has many edge cases and breaks the compiler from time to time.

This patch implements spilling SGPRs into virtual VGPR lanes. Since we now split the register allocation for SGPRs and VGPRs, the virtual registers introduced for the spill lanes get allocated automatically in the subsequent regalloc invocation for VGPRs. Spilling to virtual registers always succeeds, even in high-pressure situations, and hence avoids most of the edge cases during PEI. The only custom SGPR spills left in PEI are for special registers such as the frame pointer, which is an unproblematic case.

Spilling CSRs into virtual VGPR lanes, however, could leave us with broken CFIs that corrupt frame unwinding in the debugger, causing either a crash or a terrible debugging experience. This happens when regalloc spills or splits the live ranges of these virtual VGPRs: CFIs would also have to be inserted at those intermediate points to propagate the CFI entries correctly, and that is not currently implemented in the compiler. As a short-term fix, we continue to spill CSR SGPRs into physical VGPR lanes so the debugger can compute the unwind information correctly.

Reviewed By: arsenm

Differential Revision: https://reviews.llvm.org/D124196

Change-Id: I1180639b1211b05e439132f9fc978860577c9017
1 parent 9729adc commit a29fe42
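For orientation before the diffs: a minimal sketch, condensed from the SILowerSGPRSpills.cpp hunks below, of the per-spill decision the reworked pass makes. TII, TRI, FuncInfo, Indexes, and LIS are the pass members set up in runOnMachineFunction; `isCalleeSavedFI` is a hypothetical stand-in for the `std::find` over the `CalleeSavedFIs` vector, and the trailing `true` literals mirror the extra arguments in the hunks (their parameter names are not shown in this diff). Bookkeeping and asserts are elided; this is not the literal pass code.

  // Sketch only -- condensed from the SILowerSGPRSpills.cpp hunks below.
  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
      if (!TII->isSGPRSpill(MI))
        continue;
      int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex();
      if (isCalleeSavedFI(FI)) {
        // CSR SGPRs keep using physical VGPR lanes so their CFIs stay static
        // for frame unwinding in the debugger.
        if (FuncInfo->allocateSGPRSpillToVGPRLane(MF, FI, true))
          TRI->eliminateSGPRToVGPRSpillFrameIndex(MI, FI, nullptr, Indexes,
                                                  LIS, true);
      } else if (FuncInfo->allocateSGPRSpillToVGPRLane(MF, FI)) {
        // All other SGPR spills go to lanes of virtual VGPRs; the later VGPR
        // regalloc run assigns the physical registers.
        TRI->eliminateSGPRToVGPRSpillFrameIndex(MI, FI, nullptr, Indexes, LIS);
      }
    }
  }
  // Afterwards the pass inserts an IMPLICIT_DEF for each new lane VGPR at a
  // dominating point and flags it as a WWM register (see the hunks below).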

File tree: 72 files changed (+5144 / -4786 lines); only the two files below are shown.


llvm/lib/Target/AMDGPU/SIFrameLowering.cpp

Lines changed: 8 additions & 8 deletions
@@ -101,10 +101,10 @@ static void getVGPRSpillLaneOrTempRegister(
             SGPR, PrologEpilogSGPRSaveRestoreInfo(
                       SGPRSaveKind::SPILL_TO_VGPR_LANE, FI));
 
-    LLVM_DEBUG(
-        auto Spill = MFI->getPrologEpilogSGPRSpillToVGPRLanes(FI).front();
-        dbgs() << printReg(SGPR, TRI) << " requires fallback spill to "
-               << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane << '\n';);
+    LLVM_DEBUG(auto Spill = MFI->getSGPRSpillToPhysicalVGPRLanes(FI).front();
+               dbgs() << printReg(SGPR, TRI) << " requires fallback spill to "
+                      << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane
+                      << '\n';);
   } else {
     // Remove dead <FI> index
     MF.getFrameInfo().RemoveStackObject(FI);
@@ -289,7 +289,7 @@ class PrologEpilogSGPRSpillBuilder {
 
     assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
     ArrayRef<SIRegisterInfo::SpilledReg> Spill =
-        FuncInfo->getPrologEpilogSGPRSpillToVGPRLanes(FI);
+        FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FI);
     assert(Spill.size() == NumSubRegs);
 
     for (unsigned I = 0; I < NumSubRegs; ++I) {
@@ -372,7 +372,7 @@ class PrologEpilogSGPRSpillBuilder {
   void restoreFromVGPRLane(const int FI) {
     assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
     ArrayRef<SIRegisterInfo::SpilledReg> Spill =
-        FuncInfo->getPrologEpilogSGPRSpillToVGPRLanes(FI);
+        FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FI);
     assert(Spill.size() == NumSubRegs);
 
     for (unsigned I = 0; I < NumSubRegs; ++I) {
@@ -1544,8 +1544,8 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
             TII->getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
         if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI,
                                               TRI->isAGPR(MRI, VReg))) {
-          // FIXME: change to enterBasicBlockEnd()
-          RS->enterBasicBlock(MBB);
+          RS->enterBasicBlockEnd(MBB);
+          RS->backward(MI);
           TRI->eliminateFrameIndex(MI, 0, FIOp, RS);
          SpillFIs.set(FI);
          continue;
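The last SIFrameLowering.cpp hunk resolves the old FIXME by switching the register scavenger from forward to backward tracking before eliminating the frame index. A minimal sketch of that pattern, assuming the RegScavenger API used above (the helper name and parameter list are for illustration only, not part of the commit):

  // Hedged sketch of the backward-scavenging pattern from the hunk above.
  static void lowerWithBackwardScavenging(MachineBasicBlock &MBB,
                                          MachineInstr &MI, unsigned FIOpIdx,
                                          const SIRegisterInfo *TRI,
                                          RegScavenger *RS) {
    // Seed the scavenger with the block's live-out state and walk it back to
    // MI, instead of tracking forward from the block entry.
    RS->enterBasicBlockEnd(MBB);
    RS->backward(MI);
    // Scavenging queries made while rewriting the frame index now reflect the
    // register state around MI.
    TRI->eliminateFrameIndex(MI, /*SPAdj=*/0, FIOpIdx, RS);
  }

The removed FIXME ("change to enterBasicBlockEnd()") indicates this was the direction the code was already expected to take.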

llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp

Lines changed: 131 additions & 20 deletions
@@ -22,6 +22,7 @@
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "SIMachineFunctionInfo.h"
 #include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/RegisterScavenging.h"
 #include "llvm/InitializePasses.h"
@@ -40,6 +41,7 @@ class SILowerSGPRSpills : public MachineFunctionPass {
   const SIInstrInfo *TII = nullptr;
   LiveIntervals *LIS = nullptr;
   SlotIndexes *Indexes = nullptr;
+  MachineDominatorTree *MDT = nullptr;
 
   // Save and Restore blocks of the current function. Typically there is a
   // single save block, unless Windows EH funclets are involved.
@@ -52,14 +54,25 @@ class SILowerSGPRSpills : public MachineFunctionPass {
   SILowerSGPRSpills() : MachineFunctionPass(ID) {}
 
   void calculateSaveRestoreBlocks(MachineFunction &MF);
-  bool spillCalleeSavedRegs(MachineFunction &MF);
+  bool spillCalleeSavedRegs(MachineFunction &MF,
+                            SmallVectorImpl<int> &CalleeSavedFIs);
+  void updateLaneVGPRDomInstr(
+      int FI, MachineBasicBlock *MBB, MachineBasicBlock::iterator InsertPt,
+      DenseMap<Register, MachineBasicBlock::iterator> &LaneVGPRDomInstr);
 
   bool runOnMachineFunction(MachineFunction &MF) override;
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<MachineDominatorTree>();
     AU.setPreservesAll();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
+
+  MachineFunctionProperties getClearedProperties() const override {
+    return MachineFunctionProperties()
+        .set(MachineFunctionProperties::Property::IsSSA)
+        .set(MachineFunctionProperties::Property::NoVRegs);
+  }
 };
 
 } // end anonymous namespace
@@ -70,6 +83,7 @@ INITIALIZE_PASS_BEGIN(SILowerSGPRSpills, DEBUG_TYPE,
                       "SI lower SGPR spill instructions", false, false)
 INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
 INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
 INITIALIZE_PASS_END(SILowerSGPRSpills, DEBUG_TYPE,
                     "SI lower SGPR spill instructions", false, false)
 
@@ -175,7 +189,8 @@ static void updateLiveness(MachineFunction &MF, ArrayRef<CalleeSavedInfo> CSI) {
   EntryBB.sortUniqueLiveIns();
 }
 
-bool SILowerSGPRSpills::spillCalleeSavedRegs(MachineFunction &MF) {
+bool SILowerSGPRSpills::spillCalleeSavedRegs(
+    MachineFunction &MF, SmallVectorImpl<int> &CalleeSavedFIs) {
   MachineRegisterInfo &MRI = MF.getRegInfo();
   const Function &F = MF.getFunction();
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
@@ -214,6 +229,7 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(MachineFunction &MF) {
                                            TRI->getSpillAlign(*RC), true);
 
         CSI.push_back(CalleeSavedInfo(Reg, JunkFI));
+        CalleeSavedFIs.push_back(JunkFI);
       }
     }
 
@@ -226,6 +242,7 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(MachineFunction &MF) {
       int JunkFI = MFI.CreateStackObject(TRI->getSpillSize(*RC),
                                          TRI->getSpillAlign(*RC), true);
       CSI.push_back(CalleeSavedInfo(RetAddrReg, JunkFI));
+      CalleeSavedFIs.push_back(JunkFI);
     }
 
     if (!CSI.empty()) {
@@ -245,20 +262,71 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(MachineFunction &MF) {
   return false;
 }
 
+void SILowerSGPRSpills::updateLaneVGPRDomInstr(
+    int FI, MachineBasicBlock *MBB, MachineBasicBlock::iterator InsertPt,
+    DenseMap<Register, MachineBasicBlock::iterator> &LaneVGPRDomInstr) {
+  // For the Def of a virtual LaneVPGR to dominate all its uses, we should
+  // insert an IMPLICIT_DEF before the dominating spill. Switching to a
+  // depth first order doesn't really help since the machine function can be in
+  // the unstructured control flow post-SSA. For each virtual register, hence
+  // finding the common dominator to get either the dominating spill or a block
+  // dominating all spills. Is there a better way to handle it?
+  SIMachineFunctionInfo *FuncInfo =
+      MBB->getParent()->getInfo<SIMachineFunctionInfo>();
+  ArrayRef<SIRegisterInfo::SpilledReg> VGPRSpills =
+      FuncInfo->getSGPRSpillToVirtualVGPRLanes(FI);
+  Register PrevLaneVGPR;
+  for (auto &Spill : VGPRSpills) {
+    if (PrevLaneVGPR == Spill.VGPR)
+      continue;
+
+    PrevLaneVGPR = Spill.VGPR;
+    auto I = LaneVGPRDomInstr.find(Spill.VGPR);
+    if (Spill.Lane == 0 && I == LaneVGPRDomInstr.end()) {
+      // Initially add the spill instruction itself for Insertion point.
+      LaneVGPRDomInstr[Spill.VGPR] = InsertPt;
+    } else {
+      assert(I != LaneVGPRDomInstr.end());
+      auto PrevInsertPt = I->second;
+      MachineBasicBlock *DomMBB = PrevInsertPt->getParent();
+      if (DomMBB == MBB) {
+        // The insertion point earlier selected in a predecessor block whose
+        // spills are currently being lowered. The earlier InsertPt would be
+        // the one just before the block terminator and it should be changed
+        // if we insert any new spill in it.
+        if (MDT->dominates(&*InsertPt, &*PrevInsertPt))
+          I->second = InsertPt;
+
+        continue;
+      }
+
+      // Find the common dominator block between PrevInsertPt and the
+      // current spill.
+      DomMBB = MDT->findNearestCommonDominator(DomMBB, MBB);
+      if (DomMBB == MBB)
+        I->second = InsertPt;
+      else if (DomMBB != PrevInsertPt->getParent())
+        I->second = &(*DomMBB->getFirstTerminator());
+    }
+  }
+}
+
 bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   TII = ST.getInstrInfo();
   TRI = &TII->getRegisterInfo();
 
   LIS = getAnalysisIfAvailable<LiveIntervals>();
   Indexes = getAnalysisIfAvailable<SlotIndexes>();
+  MDT = &getAnalysis<MachineDominatorTree>();
 
   assert(SaveBlocks.empty() && RestoreBlocks.empty());
 
   // First, expose any CSR SGPR spills. This is mostly the same as what PEI
   // does, but somewhat simpler.
   calculateSaveRestoreBlocks(MF);
-  bool HasCSRs = spillCalleeSavedRegs(MF);
+  SmallVector<int> CalleeSavedFIs;
+  bool HasCSRs = spillCalleeSavedRegs(MF, CalleeSavedFIs);
 
   MachineFrameInfo &MFI = MF.getFrameInfo();
   MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -272,6 +340,7 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
 
   bool MadeChange = false;
   bool NewReservedRegs = false;
+  bool SpilledToVirtVGPRLanes = false;
 
   // TODO: CSR VGPRs will never be spilled to AGPRs. These can probably be
   // handled as SpilledToReg in regular PrologEpilogInserter.
@@ -287,30 +356,69 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
     // To track the spill frame indices handled in this pass.
     BitVector SpillFIs(MFI.getObjectIndexEnd(), false);
 
+    // To track the IMPLICIT_DEF insertion point for the lane vgprs.
+    DenseMap<Register, MachineBasicBlock::iterator> LaneVGPRDomInstr;
+
     for (MachineBasicBlock &MBB : MF) {
       for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
         if (!TII->isSGPRSpill(MI))
          continue;
 
         int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex();
         assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
-        if (FuncInfo->allocateSGPRSpillToVGPRLane(MF, FI)) {
-          NewReservedRegs = true;
-          bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex(
-              MI, FI, nullptr, Indexes, LIS);
-          (void)Spilled;
-          assert(Spilled && "failed to spill SGPR to VGPR when allocated");
-          SpillFIs.set(FI);
+
+        bool IsCalleeSaveSGPRSpill =
+            std::find(CalleeSavedFIs.begin(), CalleeSavedFIs.end(), FI) !=
+            CalleeSavedFIs.end();
+        if (IsCalleeSaveSGPRSpill) {
+          // Spill callee-saved SGPRs into physical VGPR lanes.
+
+          // TODO: This is to ensure the CFIs are static for efficient frame
+          // unwinding in the debugger. Spilling them into virtual VGPR lanes
+          // involve regalloc to allocate the physical VGPRs and that might
+          // cause intermediate spill/split of such liveranges for successful
+          // allocation. This would result in broken CFI encoding unless the
+          // regalloc aware CFI generation to insert new CFIs along with the
+          // intermediate spills is implemented. There is no such support
+          // currently exist in the LLVM compiler.
+          if (FuncInfo->allocateSGPRSpillToVGPRLane(MF, FI, true)) {
+            NewReservedRegs = true;
+            bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex(
+                MI, FI, nullptr, Indexes, LIS, true);
+            (void)Spilled;
+            assert(Spilled &&
+                   "failed to spill SGPR to physical VGPR lane when allocated");
+          }
+        } else {
+          MachineInstrSpan MIS(&MI, &MBB);
+          if (FuncInfo->allocateSGPRSpillToVGPRLane(MF, FI)) {
+            bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex(
+                MI, FI, nullptr, Indexes, LIS);
+            (void)Spilled;
+            assert(Spilled &&
+                   "failed to spill SGPR to virtual VGPR lane when allocated");
+            SpillFIs.set(FI);
+            updateLaneVGPRDomInstr(FI, &MBB, MIS.begin(), LaneVGPRDomInstr);
+            SpilledToVirtVGPRLanes = true;
+          }
         }
       }
     }
 
-    // FIXME: Adding to live-ins redundant with reserving registers.
-    for (MachineBasicBlock &MBB : MF) {
-      for (auto Reg : FuncInfo->getSGPRSpillVGPRs())
-        MBB.addLiveIn(Reg);
-      MBB.sortUniqueLiveIns();
+    for (auto Reg : FuncInfo->getSGPRSpillVGPRs()) {
+      auto InsertPt = LaneVGPRDomInstr[Reg];
+      // Insert the IMPLICIT_DEF at the identified points.
+      auto MIB =
+          BuildMI(*InsertPt->getParent(), *InsertPt, InsertPt->getDebugLoc(),
+                  TII->get(AMDGPU::IMPLICIT_DEF), Reg);
+      FuncInfo->setFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG);
+      if (LIS) {
+        LIS->InsertMachineInstrInMaps(*MIB);
+        LIS->createAndComputeVirtRegInterval(Reg);
+      }
+    }
 
+    for (MachineBasicBlock &MBB : MF) {
       // FIXME: The dead frame indices are replaced with a null register from
       // the debug value instructions. We should instead, update it with the
      // correct register value. But not sure the register value alone is
@@ -338,6 +446,10 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
     // lane".
     FuncInfo->removeDeadFrameIndices(MF, /*ResetSGPRSpillStackIDs*/ false);
 
+    MadeChange = true;
+  }
+
+  if (SpilledToVirtVGPRLanes) {
     const TargetRegisterClass *RC =
         ST.isWave32() ? &AMDGPU::SGPR_32RegClass : &AMDGPU::SGPR_64RegClass;
     // Shift back the reserved SGPR for EXEC copy into the lowest range.
@@ -347,18 +459,17 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
     if (UnusedLowSGPR && TRI->getHWRegIndex(UnusedLowSGPR) <
                              TRI->getHWRegIndex(FuncInfo->getSGPRForEXECCopy()))
       FuncInfo->setSGPRForEXECCopy(UnusedLowSGPR);
-
-    MadeChange = true;
   } else {
-    // No SGPR spills and hence there won't be any WWM spills/copies. Reset the
-    // SGPR reserved for EXEC copy.
+    // No SGPR spills to virtual VGPR lanes and hence there won't be any WWM
+    // spills/copies. Reset the SGPR reserved for EXEC copy.
     FuncInfo->setSGPRForEXECCopy(AMDGPU::NoRegister);
   }
 
   SaveBlocks.clear();
   RestoreBlocks.clear();
 
-  // Updated the reserved registers with any VGPRs added for SGPR spills.
+  // Updated the reserved registers with any physical VGPRs added for SGPR
+  // spills.
   if (NewReservedRegs)
     MRI.freezeReservedRegs(MF);
