Skip to content

Commit a79985f

Browse files
author
tnfchris
committed
Add support for SVE stack clash probing.
This patch adds basic support for SVE stack clash protection. It is a first implementation and will use a loop to do the probing and stack adjustments. An example sequence is: .cfi_startproc mov x15, sp cntb x16, all, mul #11 add x16, x16, 304 .cfi_def_cfa_register 15 .SVLPSPL0: cmp x16, 61440 b.lt .SVLPEND0 sub sp, sp, 61440 str xzr, [sp, 0] sub x16, x16, 61440 b .SVLPSPL0 .SVLPEND0: sub sp, sp, x16 .cfi_escape 0xf,0xc,0x8f,0,0x92,0x2e,0,0x8,0x58,0x1e,0x23,0xb0,0x2,0x22 for a 64KB guard size, and for a 4KB guard size .cfi_startproc mov x15, sp cntb x16, all, mul #11 add x16, x16, 304 .cfi_def_cfa_register 15 .SVLPSPL0: cmp x16, 3072 b.lt .SVLPEND0 sub sp, sp, 3072 str xzr, [sp, 0] sub x16, x16, 3072 b .SVLPSPL0 .SVLPEND0: sub sp, sp, x16 .cfi_escape 0xf,0xc,0x8f,0,0x92,0x2e,0,0x8,0x58,0x1e,0x23,0xb0,0x2,0x22 This has about the same semantics as alloca, except we prioritize the common case where no probe is required. We also change the amount we adjust the stack and the probing interval to be the nearest value to `guard size - abi buffer` that fits in the 12-bit shifted immediate used by cmp. While this would mean we probe a bit more often than we require, in practice the amount of SVE vectors you'd need to spill is significant. Even more so to enter the loop more than once. gcc/ PR target/86486 * config/aarch64/aarch64-protos.h (aarch64_output_probe_sve_stack_clash): New. * config/aarch64/aarch64.c (aarch64_output_probe_sve_stack_clash, aarch64_clamp_to_uimm12_shift): New. (aarch64_allocate_and_probe_stack_space): Add SVE specific section. * config/aarch64/aarch64.md (probe_sve_stack_clash): New. gcc/testsuite/ PR target/86486 * gcc.target/aarch64/stack-check-prologue-16.c: New test. * gcc.target/aarch64/stack-check-cfa-3.c: New test. * gcc.target/aarch64/sve/struct_vect_24.c: New test. * gcc.target/aarch64/sve/struct_vect_24_run.c: New test. git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@264749 138bc75d-0d04-0410-961f-82ee72b054a4
1 parent b9315fa commit a79985f

File tree

9 files changed

+317
-5
lines changed

9 files changed

+317
-5
lines changed

gcc/ChangeLog

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,12 @@
1+
2018-10-01 Tamar Christina <[email protected]>
2+
3+
PR target/86486
4+
* config/aarch64/aarch64-protos.h (aarch64_output_probe_sve_stack_clash): New.
5+
* config/aarch64/aarch64.c (aarch64_output_probe_sve_stack_clash,
6+
aarch64_clamp_to_uimm12_shift): New.
7+
(aarch64_allocate_and_probe_stack_space): Add SVE specific section.
8+
* config/aarch64/aarch64.md (probe_sve_stack_clash): New.
9+
110
2018-10-01 Tamar Christina <[email protected]>
211

312
PR target/86486

gcc/config/aarch64/aarch64-protos.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -497,6 +497,7 @@ void aarch64_asm_output_labelref (FILE *, const char *);
497497
void aarch64_cpu_cpp_builtins (cpp_reader *);
498498
const char * aarch64_gen_far_branch (rtx *, int, const char *, const char *);
499499
const char * aarch64_output_probe_stack_range (rtx, rtx);
500+
const char * aarch64_output_probe_sve_stack_clash (rtx, rtx, rtx, rtx);
500501
void aarch64_err_no_fpadvsimd (machine_mode);
501502
void aarch64_expand_epilogue (bool);
502503
void aarch64_expand_mov_immediate (rtx, rtx, rtx (*) (rtx, rtx) = 0);

gcc/config/aarch64/aarch64.c

Lines changed: 150 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,7 @@ static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
166166
static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
167167
static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
168168
aarch64_addr_query_type);
169+
static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
169170

170171
/* Major revision number of the ARM Architecture implemented by the target. */
171172
unsigned aarch64_architecture_version;
@@ -4020,6 +4021,84 @@ aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
40204021
return "";
40214022
}
40224023

4024+
/* Emit the probe loop for doing stack clash probes and stack adjustments for
4025+
SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
4026+
of GUARD_SIZE. When a probe is emitted it is done at most
4027+
MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
4028+
at most MIN_PROBE_THRESHOLD. By the end of this function
4029+
BASE = BASE - ADJUSTMENT. */
4030+
4031+
const char *
4032+
aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
4033+
rtx min_probe_threshold, rtx guard_size)
4034+
{
4035+
/* This function is not allowed to use any instruction generation function
4036+
like gen_ and friends. If you do you'll likely ICE during CFG validation,
4037+
so instead emit the code you want using output_asm_insn. */
4038+
gcc_assert (flag_stack_clash_protection);
4039+
gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
4040+
gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
4041+
4042+
/* The minimum required allocation before the residual requires probing. */
4043+
HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
4044+
4045+
/* Clamp the value down to the nearest value that can be used with a cmp. */
4046+
residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
4047+
rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
4048+
4049+
gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
4050+
gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
4051+
4052+
static int labelno = 0;
4053+
char loop_start_lab[32];
4054+
char loop_end_lab[32];
4055+
rtx xops[2];
4056+
4057+
ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
4058+
ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
4059+
4060+
/* Emit loop start label. */
4061+
ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
4062+
4063+
/* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
4064+
xops[0] = adjustment;
4065+
xops[1] = probe_offset_value_rtx;
4066+
output_asm_insn ("cmp\t%0, %1", xops);
4067+
4068+
/* Branch to end if not enough adjustment to probe. */
4069+
fputs ("\tb.lt\t", asm_out_file);
4070+
assemble_name_raw (asm_out_file, loop_end_lab);
4071+
fputc ('\n', asm_out_file);
4072+
4073+
/* BASE = BASE - RESIDUAL_PROBE_GUARD. */
4074+
xops[0] = base;
4075+
xops[1] = probe_offset_value_rtx;
4076+
output_asm_insn ("sub\t%0, %0, %1", xops);
4077+
4078+
/* Probe at BASE. */
4079+
xops[1] = const0_rtx;
4080+
output_asm_insn ("str\txzr, [%0, %1]", xops);
4081+
4082+
/* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
4083+
xops[0] = adjustment;
4084+
xops[1] = probe_offset_value_rtx;
4085+
output_asm_insn ("sub\t%0, %0, %1", xops);
4086+
4087+
/* Branch to start if still more bytes to allocate. */
4088+
fputs ("\tb\t", asm_out_file);
4089+
assemble_name_raw (asm_out_file, loop_start_lab);
4090+
fputc ('\n', asm_out_file);
4091+
4092+
/* No probe leave. */
4093+
ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
4094+
4095+
/* BASE = BASE - ADJUSTMENT. */
4096+
xops[0] = base;
4097+
xops[1] = adjustment;
4098+
output_asm_insn ("sub\t%0, %0, %1", xops);
4099+
return "";
4100+
}
4101+
40234102
/* Determine whether a frame chain needs to be generated. */
40244103
static bool
40254104
aarch64_needs_frame_chain (void)
@@ -4877,21 +4956,73 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
48774956
}
48784957
}
48794958

4880-
HOST_WIDE_INT size;
48814959
/* If SIZE is not large enough to require probing, just adjust the stack and
48824960
exit. */
4883-
if (!poly_size.is_constant (&size)
4884-
|| known_lt (poly_size, min_probe_threshold)
4961+
if (known_lt (poly_size, min_probe_threshold)
48854962
|| !flag_stack_clash_protection)
48864963
{
48874964
aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
48884965
return;
48894966
}
48904967

4968+
HOST_WIDE_INT size;
4969+
/* Handle the SVE non-constant case first. */
4970+
if (!poly_size.is_constant (&size))
4971+
{
4972+
if (dump_file)
4973+
{
4974+
fprintf (dump_file, "Stack clash SVE prologue: ");
4975+
print_dec (poly_size, dump_file);
4976+
fprintf (dump_file, " bytes, dynamic probing will be required.\n");
4977+
}
4978+
4979+
/* First calculate the amount of bytes we're actually spilling. */
4980+
aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
4981+
poly_size, temp1, temp2, false, true);
4982+
4983+
rtx_insn *insn = get_last_insn ();
4984+
4985+
if (frame_related_p)
4986+
{
4987+
/* This is done to provide unwinding information for the stack
4988+
adjustments we're about to do, however to prevent the optimizers
4989+
from removing the R15 move and leaving the CFA note (which would be
4990+
very wrong) we tie the old and new stack pointer together.
4991+
The tie will expand to nothing but the optimizers will not touch
4992+
the instruction. */
4993+
rtx stack_ptr_copy = gen_rtx_REG (Pmode, R15_REGNUM);
4994+
emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
4995+
emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
4996+
4997+
/* We want the CFA independent of the stack pointer for the
4998+
duration of the loop. */
4999+
add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
5000+
RTX_FRAME_RELATED_P (insn) = 1;
5001+
}
5002+
5003+
rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
5004+
rtx guard_const = gen_int_mode (guard_size, Pmode);
5005+
5006+
insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
5007+
stack_pointer_rtx, temp1,
5008+
probe_const, guard_const));
5009+
5010+
/* Now reset the CFA register if needed. */
5011+
if (frame_related_p)
5012+
{
5013+
add_reg_note (insn, REG_CFA_DEF_CFA,
5014+
gen_rtx_PLUS (Pmode, stack_pointer_rtx,
5015+
gen_int_mode (poly_size, Pmode)));
5016+
RTX_FRAME_RELATED_P (insn) = 1;
5017+
}
5018+
5019+
return;
5020+
}
5021+
48915022
if (dump_file)
48925023
fprintf (dump_file,
4893-
"Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC " bytes"
4894-
", probing will be required.\n", size);
5024+
"Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
5025+
" bytes, probing will be required.\n", size);
48955026

48965027
/* Round size to the nearest multiple of guard_size, and calculate the
48975028
residual as the difference between the original size and the rounded
@@ -5494,6 +5625,20 @@ aarch64_uimm12_shift (HOST_WIDE_INT val)
54945625
);
54955626
}
54965627

5628+
/* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
5629+
that can be created with a left shift of 0 or 12. */
5630+
static HOST_WIDE_INT
5631+
aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
5632+
{
5633+
/* Check to see if the value fits in 24 bits, as that is the maximum we can
5634+
handle correctly. */
5635+
gcc_assert ((val & 0xffffff) == val);
5636+
5637+
if (((val & 0xfff) << 0) == val)
5638+
return val;
5639+
5640+
return val & (0xfff << 12);
5641+
}
54975642

54985643
/* Return true if val is an immediate that can be loaded into a
54995644
register by a MOVZ instruction. */

gcc/config/aarch64/aarch64.md

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6514,6 +6514,25 @@
65146514
[(set_attr "length" "32")]
65156515
)
65166516

6517+
;; This instruction is used to generate the stack clash stack adjustment and
6518+
;; probing loop. We can't change the control flow during prologue and epilogue
6519+
;; code generation. So we must emit a volatile unspec and expand it later on.
6520+
6521+
(define_insn "@probe_sve_stack_clash_<mode>"
6522+
[(set (match_operand:P 0 "register_operand" "=rk")
6523+
(unspec_volatile:P [(match_operand:P 1 "register_operand" "0")
6524+
(match_operand:P 2 "register_operand" "r")
6525+
(match_operand:P 3 "const_int_operand" "n")
6526+
(match_operand:P 4 "aarch64_plus_immediate" "L")]
6527+
UNSPECV_PROBE_STACK_RANGE))]
6528+
"TARGET_SVE"
6529+
{
6530+
return aarch64_output_probe_sve_stack_clash (operands[0], operands[2],
6531+
operands[3], operands[4]);
6532+
}
6533+
[(set_attr "length" "28")]
6534+
)
6535+
65176536
;; Named pattern for expanding thread pointer reference.
65186537
(define_expand "get_thread_pointerdi"
65196538
[(match_operand:DI 0 "register_operand" "=r")]

gcc/testsuite/ChangeLog

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,11 @@
1+
2018-10-01 Tamar Christina <[email protected]>
2+
3+
PR target/86486
4+
* gcc.target/aarch64/stack-check-prologue-16.c: New test.
5+
* gcc.target/aarch64/stack-check-cfa-3.c: New test.
6+
* gcc.target/aarch64/sve/struct_vect_24.c: New test.
7+
* gcc.target/aarch64/sve/struct_vect_24_run.c: New test.
8+
19
2018-10-01 Jeff Law <[email protected]>
210
Richard Sandiford <[email protected]>
311
Tamar Christina <[email protected]>
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
/* { dg-do compile } */
2+
/* { dg-options "-O3 -fopenmp-simd -march=armv8-a+sve -fstack-clash-protection --param stack-clash-protection-guard-size=16 -funwind-tables" } */
3+
/* { dg-require-effective-target supports_stack_clash_protection } */
4+
5+
#include "stack-check-prologue-16.c"
6+
7+
/* Checks that the CFA notes are correct for every sp adjustment, but we also
8+
need to make sure we can unwind correctly before the frame is set up. So
9+
check that we're emitting r15 with a copy of sp and setting the CFA there. */
10+
11+
/* { dg-final { scan-assembler-times {mov\tx15, sp} 1 } } */
12+
/* { dg-final { scan-assembler-times {\.cfi_def_cfa_register 15} 1 } } */
13+
/* { dg-final { scan-assembler-times {\.cfi_escape 0xf,0xc,0x8f,0,0x92,0x2e,0,.*} 1 } } */
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
/* { dg-do compile } */
2+
/* { dg-require-effective-target supports_stack_clash_protection } */
3+
/* { dg-options "-O3 -fopenmp-simd -march=armv8-a+sve -fstack-clash-protection --param stack-clash-protection-guard-size=16" } */
4+
5+
/* Invoke X (P##n) for n in [0, 7]. */
6+
#define REPEAT8(X, P) \
7+
X (P##0) X (P##1) X (P##2) X (P##3) X (P##4) X (P##5) X (P##6) X (P##7)
8+
9+
/* Invoke X (n) for all octal n in [0, 39]. */
10+
#define REPEAT40(X) \
11+
REPEAT8 (X, 0) REPEAT8 (X, 1) REPEAT8 (X, 2) REPEAT8 (X, 3) REPEAT8 (X, 4)
12+
13+
/* Expect vector work to be done, with spilling of vector registers. */
14+
void
15+
f2 (int x[40][100], int *y)
16+
{
17+
/* Try to force some spilling. */
18+
#define DECLARE(N) int y##N = y[N];
19+
REPEAT40 (DECLARE);
20+
#pragma omp simd
21+
for (int i = 0; i < 100; ++i)
22+
{
23+
#define INC(N) x[N][i] += y##N;
24+
REPEAT40 (INC);
25+
}
26+
}
27+
28+
/* SVE spill, requires probing as vector size is unknown at compile time. */
29+
30+
/* { dg-final { scan-assembler-times {str\s+xzr, \[sp, 0\]} 1 } } */
31+
/* { dg-final { scan-assembler-times {cmp\s+x[0-9]+, 61440} 1 } } */
32+
/* { dg-final { scan-assembler-times {sub\s+x[0-9]+, x[0-9]+, 61440} 1 } } */
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
/* { dg-do compile } */
2+
/* { dg-require-effective-target supports_stack_clash_protection } */
3+
/* { dg-options "-O3 -fopenmp-simd -fstack-clash-protection --param stack-clash-protection-guard-size=16" } */
4+
5+
#include <stdint.h>
6+
7+
#define N 50
8+
#define S 2 * 64 * 1024
9+
10+
/* Invoke X (P##n) for n in [0, 9]. */
11+
#define REPEAT8(X, P) \
12+
X (P##0) X (P##1) X (P##2) X (P##3) X (P##4) X (P##5) X (P##6) X (P##7) \
13+
X (P##8) X (P##9)
14+
15+
/* Invoke X (n) for all n in [0, 49]. */
16+
#define REPEAT50(X) \
17+
REPEAT8 (X, ) REPEAT8 (X, 1) REPEAT8 (X, 2) REPEAT8 (X, 3) REPEAT8 (X, 4)
18+
19+
/* Try to force some spilling. */
20+
#define DECLARE(N) int src##N = src[N * 4];
21+
#define INC(N) dest[i] += src##N;
22+
23+
#define TEST_LOOP(NAME, TYPE) \
24+
void __attribute__ ((noinline, noclone, simd)) \
25+
NAME (TYPE *restrict dest, TYPE *restrict src) \
26+
{ \
27+
REPEAT50 (DECLARE); \
28+
volatile char foo[S]; \
29+
foo[S-1]=1; \
30+
for (int i = 0; i < N; i++) \
31+
{ \
32+
REPEAT50 (INC); \
33+
} \
34+
}
35+
36+
#define TEST(NAME) \
37+
TEST_LOOP (NAME##_i32, int32_t) \
38+
TEST_LOOP (NAME##_i64, int64_t) \
39+
TEST_LOOP (NAME##_f32, float) \
40+
TEST_LOOP (NAME##_f64, double)
41+
42+
TEST (test)
43+
44+
/* Check the vectorized loop for stack clash probing. */
45+
46+
/* { dg-final { scan-assembler-times {str\s+xzr, \[sp, 0\]} 4 } } */
47+
/* { dg-final { scan-assembler-times {cmp\s+x[0-9]+, 61440} 4 } } */
48+
/* { dg-final { scan-assembler-times {sub\s+x[0-9]+, x[0-9]+, 61440} 4 } } */
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
/* { dg-do run { target aarch64_sve_hw } } */
2+
/* { dg-require-effective-target supports_stack_clash_protection } */
3+
/* { dg-options "-O3 -fopenmp-simd -fstack-clash-protection --param stack-clash-protection-guard-size=16" } */
4+
5+
#include "struct_vect_24.c"
6+
7+
#undef TEST_LOOP
8+
#define TEST_LOOP(NAME, TYPE) \
9+
{ \
10+
TYPE out[N]; \
11+
TYPE in[N * 4]; \
12+
for (int i = 0; i < N; ++i) \
13+
{ \
14+
out[i] = i * 7 / 2; \
15+
asm volatile ("" ::: "memory"); \
16+
} \
17+
for (int i = 0; i < N * 4; ++i) \
18+
{ \
19+
in[i] = i * 9 / 2; \
20+
asm volatile ("" ::: "memory"); \
21+
} \
22+
NAME (out, in); \
23+
for (int i = 0; i < N; ++i) \
24+
{ \
25+
TYPE expected = i * 7 / 2; \
26+
if (out[i] != out[0] + expected) \
27+
__builtin_abort (); \
28+
asm volatile ("" ::: "memory"); \
29+
} \
30+
}
31+
32+
int __attribute__ ((optimize (0)))
33+
main (void)
34+
{
35+
TEST (test);
36+
return 0;
37+
}

0 commit comments

Comments
 (0)