Skip to content

Commit 76e14de

Browse files
authored
[X86][BreakFalseDeps] Using reverse order for undef register selection (#137569)
BreakFalseDeps picks the best register for undef operands if instructions have a false dependency. The problem is that if the instruction is close to the beginning of the function, ReachingDefAnalysis is overly optimistic about the unused registers, which results in collisions with registers just defined in the caller. This patch changes the selection of the undef register to use reverse order, which reduces the probability of register collisions between caller and callee. It brings improvements in some of our internal benchmarks with a negligible effect on other benchmarks.
1 parent 6fb2a80 commit 76e14de

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

44 files changed

+973
-927
lines changed

llvm/include/llvm/CodeGen/RegisterClassInfo.h

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,8 @@ class RegisterClassInfo {
5050
// entry is valid when its tag matches.
5151
unsigned Tag = 0;
5252

53+
bool Reverse = false;
54+
5355
const MachineFunction *MF = nullptr;
5456
const TargetRegisterInfo *TRI = nullptr;
5557

@@ -86,9 +88,11 @@ class RegisterClassInfo {
8688
public:
8789
LLVM_ABI RegisterClassInfo();
8890

89-
/// runOnFunction - Prepare to answer questions about MF. This must be called
91+
/// runOnFunction - Prepare to answer questions about MF. Rev indicates to
92+
/// use the reversed raw order when computing the register order. This must be called
9093
/// before any other methods are used.
91-
LLVM_ABI void runOnMachineFunction(const MachineFunction &MF);
94+
LLVM_ABI void runOnMachineFunction(const MachineFunction &MF,
95+
bool Rev = false);
9296

9397
/// getNumAllocatableRegs - Returns the number of actually allocatable
9498
/// registers in RC in the current function.

llvm/include/llvm/CodeGen/TargetRegisterInfo.h

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ class TargetRegisterClass {
6868
const bool CoveredBySubRegs;
6969
const unsigned *SuperClasses;
7070
const uint16_t SuperClassesSize;
71-
ArrayRef<MCPhysReg> (*OrderFunc)(const MachineFunction&);
71+
ArrayRef<MCPhysReg> (*OrderFunc)(const MachineFunction &, bool Rev);
7272

7373
/// Return the register class ID number.
7474
unsigned getID() const { return MC->getID(); }
@@ -199,8 +199,9 @@ class TargetRegisterClass {
199199
/// other criteria.
200200
///
201201
/// By default, this method returns all registers in the class.
202-
ArrayRef<MCPhysReg> getRawAllocationOrder(const MachineFunction &MF) const {
203-
return OrderFunc ? OrderFunc(MF) : getRegisters();
202+
ArrayRef<MCPhysReg> getRawAllocationOrder(const MachineFunction &MF,
203+
bool Rev = false) const {
204+
return OrderFunc ? OrderFunc(MF, Rev) : getRegisters();
204205
}
205206

206207
/// Returns the combination of all lane masks of register in this class.

llvm/include/llvm/Target/Target.td

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -314,7 +314,7 @@ class RegisterClass<string namespace, list<ValueType> regTypes, int alignment,
314314
// to use in a given machine function. The code will be inserted in a
315315
// function like this:
316316
//
317-
// static inline unsigned f(const MachineFunction &MF) { ... }
317+
// static inline unsigned f(const MachineFunction &MF, bool Rev) { ... }
318318
//
319319
// The function should return 0 to select the default order defined by
320320
// MemberList, 1 to select the first AltOrders entry and so on.

llvm/lib/CodeGen/BreakFalseDeps.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -285,7 +285,7 @@ bool BreakFalseDeps::runOnMachineFunction(MachineFunction &mf) {
285285
TRI = MF->getSubtarget().getRegisterInfo();
286286
RDA = &getAnalysis<ReachingDefAnalysis>();
287287

288-
RegClassInfo.runOnMachineFunction(mf);
288+
RegClassInfo.runOnMachineFunction(mf, /*Rev=*/true);
289289

290290
LLVM_DEBUG(dbgs() << "********** BREAK FALSE DEPENDENCIES **********\n");
291291

llvm/lib/CodeGen/RegisterClassInfo.cpp

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,14 +39,16 @@ StressRA("stress-regalloc", cl::Hidden, cl::init(0), cl::value_desc("N"),
3939

4040
RegisterClassInfo::RegisterClassInfo() = default;
4141

42-
void RegisterClassInfo::runOnMachineFunction(const MachineFunction &mf) {
42+
void RegisterClassInfo::runOnMachineFunction(const MachineFunction &mf,
43+
bool Rev) {
4344
bool Update = false;
4445
MF = &mf;
4546

4647
auto &STI = MF->getSubtarget();
4748

4849
// Allocate new array the first time we see a new target.
49-
if (STI.getRegisterInfo() != TRI) {
50+
if (STI.getRegisterInfo() != TRI || Reverse != Rev) {
51+
Reverse = Rev;
5052
TRI = STI.getRegisterInfo();
5153
RegClass.reset(new RCInfo[TRI->getNumRegClasses()]);
5254
Update = true;
@@ -142,7 +144,12 @@ void RegisterClassInfo::compute(const TargetRegisterClass *RC) const {
142144

143145
// FIXME: Once targets reserve registers instead of removing them from the
144146
// allocation order, we can simply use begin/end here.
145-
ArrayRef<MCPhysReg> RawOrder = RC->getRawAllocationOrder(*MF);
147+
ArrayRef<MCPhysReg> RawOrder = RC->getRawAllocationOrder(*MF, Reverse);
148+
std::vector<MCPhysReg> ReverseOrder;
149+
if (Reverse) {
150+
llvm::append_range(ReverseOrder, reverse(RawOrder));
151+
RawOrder = ArrayRef<MCPhysReg>(ReverseOrder);
152+
}
146153
for (unsigned PhysReg : RawOrder) {
147154
// Remove reserved registers from the allocation order.
148155
if (Reserved.test(PhysReg))

llvm/lib/Target/X86/X86RegisterInfo.td

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -806,17 +806,37 @@ def VR512_0_15 : RegisterClass<"X86", [v16f32, v8f64, v64i8, v32i16, v16i32, v8i
806806
512, (sequence "ZMM%u", 0, 15)>;
807807

808808
// Scalar AVX-512 floating point registers.
809-
def FR32X : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 31)>;
809+
def FR32X : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 31)> {
810+
let AltOrders = [(add (sequence "XMM%u", 16, 31), (sequence "XMM%u", 0, 15))];
811+
let AltOrderSelect = [{
812+
return Rev;
813+
}];
814+
}
810815

811-
def FR64X : RegisterClass<"X86", [f64], 64, (add FR32X)>;
816+
def FR64X : RegisterClass<"X86", [f64], 64, (add FR32X)> {
817+
let AltOrders = [(add (sequence "XMM%u", 16, 31), (sequence "XMM%u", 0, 15))];
818+
let AltOrderSelect = [{
819+
return Rev;
820+
}];
821+
}
812822

813823
def FR16X : RegisterClass<"X86", [f16], 16, (add FR32X)> {let Size = 32;}
814824

815825
// Extended VR128 and VR256 for AVX-512 instructions
816826
def VR128X : RegisterClass<"X86", [v4f32, v2f64, v8f16, v8bf16, v16i8, v8i16, v4i32, v2i64, f128],
817-
128, (add FR32X)>;
827+
128, (add FR32X)> {
828+
let AltOrders = [(add (sequence "XMM%u", 16, 31), (sequence "XMM%u", 0, 15))];
829+
let AltOrderSelect = [{
830+
return Rev;
831+
}];
832+
}
818833
def VR256X : RegisterClass<"X86", [v8f32, v4f64, v16f16, v16bf16, v32i8, v16i16, v8i32, v4i64],
819-
256, (sequence "YMM%u", 0, 31)>;
834+
256, (sequence "YMM%u", 0, 31)> {
835+
let AltOrders = [(add (sequence "YMM%u", 16, 31), (sequence "YMM%u", 0, 15))];
836+
let AltOrderSelect = [{
837+
return Rev;
838+
}];
839+
}
820840

821841
// Mask registers
822842
def VK1 : RegisterClass<"X86", [v1i1], 16, (sequence "K%u", 0, 7)> {let Size = 16;}

llvm/test/CodeGen/X86/avx-cvt.ll

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ define <2 x double> @fpext01(<2 x double> %a0, <4 x float> %a1) nounwind {
108108
define double @funcA(ptr nocapture %e) nounwind uwtable readonly ssp {
109109
; CHECK-LABEL: funcA:
110110
; CHECK: # %bb.0:
111-
; CHECK-NEXT: vcvtsi2sdq (%rdi), %xmm0, %xmm0
111+
; CHECK-NEXT: vcvtsi2sdq (%rdi), %xmm15, %xmm0
112112
; CHECK-NEXT: retq
113113
%tmp1 = load i64, ptr %e, align 8
114114
%conv = sitofp i64 %tmp1 to double
@@ -118,7 +118,7 @@ define double @funcA(ptr nocapture %e) nounwind uwtable readonly ssp {
118118
define double @funcB(ptr nocapture %e) nounwind uwtable readonly ssp {
119119
; CHECK-LABEL: funcB:
120120
; CHECK: # %bb.0:
121-
; CHECK-NEXT: vcvtsi2sdl (%rdi), %xmm0, %xmm0
121+
; CHECK-NEXT: vcvtsi2sdl (%rdi), %xmm15, %xmm0
122122
; CHECK-NEXT: retq
123123
%tmp1 = load i32, ptr %e, align 4
124124
%conv = sitofp i32 %tmp1 to double
@@ -128,7 +128,7 @@ define double @funcB(ptr nocapture %e) nounwind uwtable readonly ssp {
128128
define float @funcC(ptr nocapture %e) nounwind uwtable readonly ssp {
129129
; CHECK-LABEL: funcC:
130130
; CHECK: # %bb.0:
131-
; CHECK-NEXT: vcvtsi2ssl (%rdi), %xmm0, %xmm0
131+
; CHECK-NEXT: vcvtsi2ssl (%rdi), %xmm15, %xmm0
132132
; CHECK-NEXT: retq
133133
%tmp1 = load i32, ptr %e, align 4
134134
%conv = sitofp i32 %tmp1 to float
@@ -138,7 +138,7 @@ define float @funcC(ptr nocapture %e) nounwind uwtable readonly ssp {
138138
define float @funcD(ptr nocapture %e) nounwind uwtable readonly ssp {
139139
; CHECK-LABEL: funcD:
140140
; CHECK: # %bb.0:
141-
; CHECK-NEXT: vcvtsi2ssq (%rdi), %xmm0, %xmm0
141+
; CHECK-NEXT: vcvtsi2ssq (%rdi), %xmm15, %xmm0
142142
; CHECK-NEXT: retq
143143
%tmp1 = load i64, ptr %e, align 8
144144
%conv = sitofp i64 %tmp1 to float
@@ -183,7 +183,7 @@ declare float @llvm.floor.f32(float %p)
183183
define float @floor_f32_load(ptr %aptr) optsize {
184184
; CHECK-LABEL: floor_f32_load:
185185
; CHECK: # %bb.0:
186-
; CHECK-NEXT: vroundss $9, (%rdi), %xmm0, %xmm0
186+
; CHECK-NEXT: vroundss $9, (%rdi), %xmm15, %xmm0
187187
; CHECK-NEXT: retq
188188
%a = load float, ptr %aptr
189189
%res = call float @llvm.floor.f32(float %a)
@@ -193,7 +193,7 @@ define float @floor_f32_load(ptr %aptr) optsize {
193193
define float @floor_f32_load_pgso(ptr %aptr) !prof !14 {
194194
; CHECK-LABEL: floor_f32_load_pgso:
195195
; CHECK: # %bb.0:
196-
; CHECK-NEXT: vroundss $9, (%rdi), %xmm0, %xmm0
196+
; CHECK-NEXT: vroundss $9, (%rdi), %xmm15, %xmm0
197197
; CHECK-NEXT: retq
198198
%a = load float, ptr %aptr
199199
%res = call float @llvm.floor.f32(float %a)
@@ -203,7 +203,7 @@ define float @floor_f32_load_pgso(ptr %aptr) !prof !14 {
203203
define double @nearbyint_f64_load(ptr %aptr) optsize {
204204
; CHECK-LABEL: nearbyint_f64_load:
205205
; CHECK: # %bb.0:
206-
; CHECK-NEXT: vroundsd $12, (%rdi), %xmm0, %xmm0
206+
; CHECK-NEXT: vroundsd $12, (%rdi), %xmm15, %xmm0
207207
; CHECK-NEXT: retq
208208
%a = load double, ptr %aptr
209209
%res = call double @llvm.nearbyint.f64(double %a)
@@ -213,7 +213,7 @@ define double @nearbyint_f64_load(ptr %aptr) optsize {
213213
define double @nearbyint_f64_load_pgso(ptr %aptr) !prof !14 {
214214
; CHECK-LABEL: nearbyint_f64_load_pgso:
215215
; CHECK: # %bb.0:
216-
; CHECK-NEXT: vroundsd $12, (%rdi), %xmm0, %xmm0
216+
; CHECK-NEXT: vroundsd $12, (%rdi), %xmm15, %xmm0
217217
; CHECK-NEXT: retq
218218
%a = load double, ptr %aptr
219219
%res = call double @llvm.nearbyint.f64(double %a)

0 commit comments

Comments
 (0)