From 38ec566ee899b593cd18a8303260864d74854c9e Mon Sep 17 00:00:00 2001 From: Michael Constant Date: Wed, 22 Jan 2025 03:13:37 -0800 Subject: [PATCH 1/2] Fix invalid instructions with large stack frames `X86FrameLowering::emitSPUpdate()` assumes that 64-bit targets use a 64-bit stack pointer, but that's not true on x32. When checking the stack pointer size, we need to look at `Uses64BitFramePtr` rather than `Is64Bit`. This avoids generating invalid instructions like `add esp, rcx`. For impossibly-large stack frames (4 GiB or larger with a 32-bit stack pointer), we were also generating invalid instructions like `mov eax, 5000000000`. The inline stack probe code already had a check for that situation; I've moved the check into `emitSPUpdate()`, so any attempt to allocate a 4 GiB stack frame with a 32-bit stack pointer will now trap rather than adjusting ESP by the wrong amount. This also fixes the "can't have 32-bit 16GB stack frame" assertion, which used to be triggerable by user code but is now correct. To help catch situations like this in the future, I've added `-verify-machineinstrs` to the stack clash tests that generate large stack frames. This fixes the expensive-checks buildbot failure caused by #113219. --- llvm/lib/Target/X86/X86FrameLowering.cpp | 30 ++++++++++------- llvm/test/CodeGen/X86/huge-stack-offset.ll | 8 ++--- .../CodeGen/X86/stack-clash-extra-huge.ll | 32 +++---------------- llvm/test/CodeGen/X86/stack-clash-huge.ll | 8 ++--- 4 files changed, 31 insertions(+), 47 deletions(-) diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp index 18de38b2d0159..a7b60afb7f547 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -234,6 +234,14 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB, MachineInstr::MIFlag Flag = isSub ? MachineInstr::FrameSetup : MachineInstr::FrameDestroy; + if (!Uses64BitFramePtr && !isUInt<32>(Offset)) { + // We're being asked to adjust a 32-bit stack pointer by 4 GiB or more. + // This might be unreachable code, so don't complain now; just trap if + // it's reached at runtime. + BuildMI(MBB, MBBI, DL, TII.get(X86::TRAP)); + return; + } + uint64_t Chunk = (1LL << 31) - 1; MachineFunction &MF = *MBB.getParent(); @@ -253,17 +261,19 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB, // Rather than emit a long series of instructions for large offsets, // load the offset into a register and do one sub/add unsigned Reg = 0; - unsigned Rax = (unsigned)(Is64Bit ? X86::RAX : X86::EAX); + unsigned Rax = (unsigned)(Uses64BitFramePtr ? X86::RAX : X86::EAX); if (isSub && !isEAXLiveIn(MBB)) Reg = Rax; else - Reg = TRI->findDeadCallerSavedReg(MBB, MBBI); + Reg = getX86SubSuperRegister(TRI->findDeadCallerSavedReg(MBB, MBBI), + Uses64BitFramePtr ? 64 : 32); - unsigned AddSubRROpc = - isSub ? getSUBrrOpcode(Is64Bit) : getADDrrOpcode(Is64Bit); + unsigned AddSubRROpc = isSub ? getSUBrrOpcode(Uses64BitFramePtr) + : getADDrrOpcode(Uses64BitFramePtr); if (Reg) { - BuildMI(MBB, MBBI, DL, TII.get(getMOVriOpcode(Is64Bit, Offset)), Reg) + BuildMI(MBB, MBBI, DL, TII.get(getMOVriOpcode(Uses64BitFramePtr, Offset)), + Reg) .addImm(Offset) .setMIFlag(Flag); MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(AddSubRROpc), StackPtr) @@ -279,7 +289,7 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB, // addq %rsp, %rax // xchg %rax, (%rsp) // movq (%rsp), %rsp - assert(Is64Bit && "can't have 32-bit 16GB stack frame"); + assert(Uses64BitFramePtr && "can't have 32-bit 16GB stack frame"); BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH64r)) .addReg(Rax, RegState::Kill) .setMIFlag(Flag); @@ -289,7 +299,8 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB, Offset = -(Offset - SlotSize); else Offset = Offset + SlotSize; - BuildMI(MBB, MBBI, DL, TII.get(getMOVriOpcode(Is64Bit, Offset)), Rax) + BuildMI(MBB, MBBI, DL, TII.get(getMOVriOpcode(Uses64BitFramePtr, Offset)), + Rax) .addImm(Offset) .setMIFlag(Flag); MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(X86::ADD64rr), Rax) @@ -826,10 +837,7 @@ void X86FrameLowering::emitStackProbeInlineGenericLoop( .addReg(StackPtr) .setMIFlag(MachineInstr::FrameSetup); } else { - // We're being asked to probe a stack frame that's 4 GiB or larger, - // but our stack pointer is only 32 bits. This might be unreachable - // code, so don't complain now; just trap if it's reached at runtime. - BuildMI(MBB, MBBI, DL, TII.get(X86::TRAP)); + llvm_unreachable("Offset too large for 32-bit stack pointer"); } // while in the loop, use loop-invariant reg for CFI, diff --git a/llvm/test/CodeGen/X86/huge-stack-offset.ll b/llvm/test/CodeGen/X86/huge-stack-offset.ll index e825328ccd89a..6629811a59b23 100644 --- a/llvm/test/CodeGen/X86/huge-stack-offset.ll +++ b/llvm/test/CodeGen/X86/huge-stack-offset.ll @@ -13,11 +13,9 @@ define void @foo() nounwind { ; CHECK-64-NEXT: addq [[RAX]], %rsp ; CHECK-32-LABEL: foo: -; CHECK-32: movl $50000000{{..}}, %eax -; CHECK-32-NEXT: subl %eax, %esp +; CHECK-32: ud2 ; CHECK-32-NOT: subl $2147483647, %esp -; CHECK-32: movl $50000000{{..}}, [[EAX:%e..]] -; CHECK-32-NEXT: addl [[EAX]], %esp +; CHECK-32: ud2 %1 = alloca [5000000000 x i8], align 16 call void @bar(ptr %1) ret void @@ -46,7 +44,7 @@ define i32 @foo3(i32 inreg %x) nounwind { ; CHECK-64-NEXT: subq %rax, %rsp ; CHECK-32-LABEL: foo3: -; CHECK-32: subl $2147483647, %esp +; CHECK-32: ud2 ; CHECK-32-NOT: movl ${{.*}}, %eax %1 = alloca [5000000000 x i8], align 16 call void @bar(ptr %1) diff --git a/llvm/test/CodeGen/X86/stack-clash-extra-huge.ll b/llvm/test/CodeGen/X86/stack-clash-extra-huge.ll index 59cbcd0689fbf..d9b20f50e9a88 100644 --- a/llvm/test/CodeGen/X86/stack-clash-extra-huge.ll +++ b/llvm/test/CodeGen/X86/stack-clash-extra-huge.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_sp -; RUN: llc -mtriple=x86_64-linux-android < %s | FileCheck -check-prefix=CHECK-X64 %s -; RUN: llc -mtriple=i686-linux-android < %s | FileCheck -check-prefix=CHECK-X86 %s -; RUN: llc -mtriple=x86_64-linux-gnux32 < %s | FileCheck -check-prefix=CHECK-X32 %s +; RUN: llc -mtriple=x86_64-linux-android -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK-X64 %s +; RUN: llc -mtriple=i686-linux-android -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK-X86 %s +; RUN: llc -mtriple=x86_64-linux-gnux32 -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK-X32 %s define i32 @foo() local_unnamed_addr #0 { ; CHECK-X64-LABEL: foo: @@ -30,44 +30,22 @@ define i32 @foo() local_unnamed_addr #0 { ; CHECK-X86-LABEL: foo: ; CHECK-X86: # %bb.0: ; CHECK-X86-NEXT: ud2 -; CHECK-X86-NEXT: .cfi_def_cfa_register %eax -; CHECK-X86-NEXT: .cfi_adjust_cfa_offset 4800000000 -; CHECK-X86-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1 -; CHECK-X86-NEXT: subl $4096, %esp # imm = 0x1000 -; CHECK-X86-NEXT: movl $0, (%esp) -; CHECK-X86-NEXT: cmpl %eax, %esp -; CHECK-X86-NEXT: jne .LBB0_1 -; CHECK-X86-NEXT: # %bb.2: -; CHECK-X86-NEXT: subl $12, %esp -; CHECK-X86-NEXT: .cfi_def_cfa_register %esp ; CHECK-X86-NEXT: .cfi_def_cfa_offset 4800000016 ; CHECK-X86-NEXT: movl $1, 392(%esp) ; CHECK-X86-NEXT: movl $1, 28792(%esp) ; CHECK-X86-NEXT: movl (%esp), %eax -; CHECK-X86-NEXT: movl $4800000012, %ecx # imm = 0x11E1A300C -; CHECK-X86-NEXT: addl %ecx, %esp +; CHECK-X86-NEXT: ud2 ; CHECK-X86-NEXT: .cfi_def_cfa_offset 4 ; CHECK-X86-NEXT: retl ; ; CHECK-X32-LABEL: foo: ; CHECK-X32: # %bb.0: ; CHECK-X32-NEXT: ud2 -; CHECK-X32-NEXT: .cfi_def_cfa_register %r11 -; CHECK-X32-NEXT: .cfi_adjust_cfa_offset 4799995904 -; CHECK-X32-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1 -; CHECK-X32-NEXT: subl $4096, %esp # imm = 0x1000 -; CHECK-X32-NEXT: movq $0, (%esp) -; CHECK-X32-NEXT: cmpl %r11d, %esp -; CHECK-X32-NEXT: jne .LBB0_1 -; CHECK-X32-NEXT: # %bb.2: -; CHECK-X32-NEXT: subl $3976, %esp # imm = 0xF88 -; CHECK-X32-NEXT: .cfi_def_cfa_register %rsp ; CHECK-X32-NEXT: .cfi_def_cfa_offset 4799999888 ; CHECK-X32-NEXT: movl $1, 264(%esp) ; CHECK-X32-NEXT: movl $1, 28664(%esp) ; CHECK-X32-NEXT: movl -128(%esp), %eax -; CHECK-X32-NEXT: movabsq $4799999880, %rcx # imm = 0x11E1A2F88 -; CHECK-X32-NEXT: addq %rcx, %esp +; CHECK-X32-NEXT: ud2 ; CHECK-X32-NEXT: .cfi_def_cfa_offset 8 ; CHECK-X32-NEXT: retq %a = alloca i32, i64 1200000000, align 16 diff --git a/llvm/test/CodeGen/X86/stack-clash-huge.ll b/llvm/test/CodeGen/X86/stack-clash-huge.ll index 03f028dfc2506..c9990773201f0 100644 --- a/llvm/test/CodeGen/X86/stack-clash-huge.ll +++ b/llvm/test/CodeGen/X86/stack-clash-huge.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_sp -; RUN: llc -mtriple=x86_64-linux-android < %s | FileCheck -check-prefix=CHECK-X64 %s -; RUN: llc -mtriple=i686-linux-android < %s | FileCheck -check-prefix=CHECK-X86 %s -; RUN: llc -mtriple=x86_64-linux-gnux32 < %s | FileCheck -check-prefix=CHECK-X32 %s +; RUN: llc -mtriple=x86_64-linux-android -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK-X64 %s +; RUN: llc -mtriple=i686-linux-android -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK-X86 %s +; RUN: llc -mtriple=x86_64-linux-gnux32 -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK-X32 %s define i32 @foo() local_unnamed_addr #0 { ; CHECK-X64-LABEL: foo: @@ -69,7 +69,7 @@ define i32 @foo() local_unnamed_addr #0 { ; CHECK-X32-NEXT: movl $1, 28664(%esp) ; CHECK-X32-NEXT: movl -128(%esp), %eax ; CHECK-X32-NEXT: movl $2399999880, %ecx # imm = 0x8F0D1788 -; CHECK-X32-NEXT: addq %rcx, %esp +; CHECK-X32-NEXT: addl %ecx, %esp ; CHECK-X32-NEXT: .cfi_def_cfa_offset 8 ; CHECK-X32-NEXT: retq %a = alloca i32, i64 600000000, align 16 From a6f2049b4fc44445ea8c4e829ab1b7b9bdcaeecf Mon Sep 17 00:00:00 2001 From: Michael Constant Date: Wed, 22 Jan 2025 19:12:38 -0800 Subject: [PATCH 2/2] don't worry about impossibly-large stack frames for now --- llvm/lib/Target/X86/X86FrameLowering.cpp | 13 +++------- llvm/test/CodeGen/X86/huge-stack-offset.ll | 8 +++--- .../CodeGen/X86/stack-clash-extra-huge.ll | 26 +++++++++++++++++-- 3 files changed, 33 insertions(+), 14 deletions(-) diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp index a7b60afb7f547..47cc6a18ef843 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -234,14 +234,6 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB, MachineInstr::MIFlag Flag = isSub ? MachineInstr::FrameSetup : MachineInstr::FrameDestroy; - if (!Uses64BitFramePtr && !isUInt<32>(Offset)) { - // We're being asked to adjust a 32-bit stack pointer by 4 GiB or more. - // This might be unreachable code, so don't complain now; just trap if - // it's reached at runtime. - BuildMI(MBB, MBBI, DL, TII.get(X86::TRAP)); - return; - } - uint64_t Chunk = (1LL << 31) - 1; MachineFunction &MF = *MBB.getParent(); @@ -837,7 +829,10 @@ void X86FrameLowering::emitStackProbeInlineGenericLoop( .addReg(StackPtr) .setMIFlag(MachineInstr::FrameSetup); } else { - llvm_unreachable("Offset too large for 32-bit stack pointer"); + // We're being asked to probe a stack frame that's 4 GiB or larger, + // but our stack pointer is only 32 bits. This might be unreachable + // code, so don't complain now; just trap if it's reached at runtime. + BuildMI(MBB, MBBI, DL, TII.get(X86::TRAP)); } // while in the loop, use loop-invariant reg for CFI, diff --git a/llvm/test/CodeGen/X86/huge-stack-offset.ll b/llvm/test/CodeGen/X86/huge-stack-offset.ll index 6629811a59b23..e825328ccd89a 100644 --- a/llvm/test/CodeGen/X86/huge-stack-offset.ll +++ b/llvm/test/CodeGen/X86/huge-stack-offset.ll @@ -13,9 +13,11 @@ define void @foo() nounwind { ; CHECK-64-NEXT: addq [[RAX]], %rsp ; CHECK-32-LABEL: foo: -; CHECK-32: ud2 +; CHECK-32: movl $50000000{{..}}, %eax +; CHECK-32-NEXT: subl %eax, %esp ; CHECK-32-NOT: subl $2147483647, %esp -; CHECK-32: ud2 +; CHECK-32: movl $50000000{{..}}, [[EAX:%e..]] +; CHECK-32-NEXT: addl [[EAX]], %esp %1 = alloca [5000000000 x i8], align 16 call void @bar(ptr %1) ret void @@ -44,7 +46,7 @@ define i32 @foo3(i32 inreg %x) nounwind { ; CHECK-64-NEXT: subq %rax, %rsp ; CHECK-32-LABEL: foo3: -; CHECK-32: ud2 +; CHECK-32: subl $2147483647, %esp ; CHECK-32-NOT: movl ${{.*}}, %eax %1 = alloca [5000000000 x i8], align 16 call void @bar(ptr %1) diff --git a/llvm/test/CodeGen/X86/stack-clash-extra-huge.ll b/llvm/test/CodeGen/X86/stack-clash-extra-huge.ll index d9b20f50e9a88..b8031056fd6b0 100644 --- a/llvm/test/CodeGen/X86/stack-clash-extra-huge.ll +++ b/llvm/test/CodeGen/X86/stack-clash-extra-huge.ll @@ -30,22 +30,44 @@ define i32 @foo() local_unnamed_addr #0 { ; CHECK-X86-LABEL: foo: ; CHECK-X86: # %bb.0: ; CHECK-X86-NEXT: ud2 +; CHECK-X86-NEXT: .cfi_def_cfa_register %eax +; CHECK-X86-NEXT: .cfi_adjust_cfa_offset 4800000000 +; CHECK-X86-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1 +; CHECK-X86-NEXT: subl $4096, %esp # imm = 0x1000 +; CHECK-X86-NEXT: movl $0, (%esp) +; CHECK-X86-NEXT: cmpl %eax, %esp +; CHECK-X86-NEXT: jne .LBB0_1 +; CHECK-X86-NEXT: # %bb.2: +; CHECK-X86-NEXT: subl $12, %esp +; CHECK-X86-NEXT: .cfi_def_cfa_register %esp ; CHECK-X86-NEXT: .cfi_def_cfa_offset 4800000016 ; CHECK-X86-NEXT: movl $1, 392(%esp) ; CHECK-X86-NEXT: movl $1, 28792(%esp) ; CHECK-X86-NEXT: movl (%esp), %eax -; CHECK-X86-NEXT: ud2 +; CHECK-X86-NEXT: movl $4800000012, %ecx # imm = 0x11E1A300C +; CHECK-X86-NEXT: addl %ecx, %esp ; CHECK-X86-NEXT: .cfi_def_cfa_offset 4 ; CHECK-X86-NEXT: retl ; ; CHECK-X32-LABEL: foo: ; CHECK-X32: # %bb.0: ; CHECK-X32-NEXT: ud2 +; CHECK-X32-NEXT: .cfi_def_cfa_register %r11 +; CHECK-X32-NEXT: .cfi_adjust_cfa_offset 4799995904 +; CHECK-X32-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1 +; CHECK-X32-NEXT: subl $4096, %esp # imm = 0x1000 +; CHECK-X32-NEXT: movq $0, (%esp) +; CHECK-X32-NEXT: cmpl %r11d, %esp +; CHECK-X32-NEXT: jne .LBB0_1 +; CHECK-X32-NEXT: # %bb.2: +; CHECK-X32-NEXT: subl $3976, %esp # imm = 0xF88 +; CHECK-X32-NEXT: .cfi_def_cfa_register %rsp ; CHECK-X32-NEXT: .cfi_def_cfa_offset 4799999888 ; CHECK-X32-NEXT: movl $1, 264(%esp) ; CHECK-X32-NEXT: movl $1, 28664(%esp) ; CHECK-X32-NEXT: movl -128(%esp), %eax -; CHECK-X32-NEXT: ud2 +; CHECK-X32-NEXT: movl $4799999880, %ecx # imm = 0x11E1A2F88 +; CHECK-X32-NEXT: addl %ecx, %esp ; CHECK-X32-NEXT: .cfi_def_cfa_offset 8 ; CHECK-X32-NEXT: retq %a = alloca i32, i64 1200000000, align 16