Skip to content

Commit 70035b6

Browse files
author
Jennifer Schmitz
committed
AArch64: Remove AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
This patch removes the AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS tunable and
use_new_vector_costs entry in aarch64-tuning-flags.def and makes the
AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS paths in the backend the default.
To that end, the function aarch64_use_new_vector_costs_p and its uses were
removed. To prevent costing vec_to_scalar operations with 0, as described in
https://gcc.gnu.org/pipermail/gcc-patches/2024-October/665481.html,
we adjusted vectorizable_store such that the variable n_adjacent_stores
also covers vec_to_scalar operations. This way vec_to_scalar operations
are not costed individually, but as a group.
As suggested by Richard Sandiford, the "known_ne" in the multilane-check
was replaced by "maybe_ne" in order to treat nunits==1+1X as a vector
rather than a scalar.

Two tests were adjusted due to changes in codegen. In both cases, the
old code performed loop unrolling once, but the new code does not:

Example from gcc.target/aarch64/sve/strided_load_2.c (compiled with
-O2 -ftree-vectorize -march=armv8.2-a+sve -mtune=generic -moverride=tune=none):

f_int64_t_32:
        cbz     w3, .L92
        mov     x4, 0
        uxtw    x3, w3
+       cntd    x5
+       whilelo p7.d, xzr, x3
+       mov     z29.s, w5
        mov     z31.s, w2
-       whilelo p6.d, xzr, x3
-       mov     x2, x3
-       index   z30.s, #0, #1
-       uqdecd  x2
-       ptrue   p5.b, all
-       whilelo p7.d, xzr, x2
+       index   z30.d, #0, #1
+       ptrue   p6.b, all
        .p2align 3,,7
.L94:
-       ld1d    z27.d, p7/z, [x0, #1, mul vl]
-       ld1d    z28.d, p6/z, [x0]
-       movprfx z29, z31
-       mul     z29.s, p5/m, z29.s, z30.s
-       incw    x4
-       uunpklo z0.d, z29.s
-       uunpkhi z29.d, z29.s
-       ld1d    z25.d, p6/z, [x1, z0.d, lsl 3]
-       ld1d    z26.d, p7/z, [x1, z29.d, lsl 3]
-       add     z25.d, z28.d, z25.d
+       ld1d    z27.d, p7/z, [x0, x4, lsl 3]
+       movprfx z28, z31
+       mul     z28.s, p6/m, z28.s, z30.s
+       ld1d    z26.d, p7/z, [x1, z28.d, uxtw 3]
        add     z26.d, z27.d, z26.d
-       st1d    z26.d, p7, [x0, #1, mul vl]
-       whilelo p7.d, x4, x2
-       st1d    z25.d, p6, [x0]
-       incw    z30.s
-       incb    x0, all, mul #2
-       whilelo p6.d, x4, x3
+       st1d    z26.d, p7, [x0, x4, lsl 3]
+       add     z30.s, z30.s, z29.s
+       incd    x4
+       whilelo p7.d, x4, x3
        b.any   .L94
.L92:
        ret

Example from gcc.target/aarch64/sve/strided_store_2.c (compiled with
-O2 -ftree-vectorize -march=armv8.2-a+sve -mtune=generic -moverride=tune=none):

f_int64_t_32:
        cbz     w3, .L84
-       addvl   x5, x1, #1
        mov     x4, 0
        uxtw    x3, w3
-       mov     z31.s, w2
+       cntd    x5
        whilelo p7.d, xzr, x3
-       mov     x2, x3
-       index   z30.s, #0, #1
-       uqdecd  x2
-       ptrue   p5.b, all
-       whilelo p6.d, xzr, x2
+       mov     z29.s, w5
+       mov     z31.s, w2
+       index   z30.d, #0, #1
+       ptrue   p6.b, all
        .p2align 3,,7
.L86:
-       ld1d    z28.d, p7/z, [x1, x4, lsl 3]
-       ld1d    z27.d, p6/z, [x5, x4, lsl 3]
-       movprfx z29, z30
-       mul     z29.s, p5/m, z29.s, z31.s
-       add     z28.d, z28.d, #1
-       uunpklo z26.d, z29.s
-       st1d    z28.d, p7, [x0, z26.d, lsl 3]
-       incw    x4
-       uunpkhi z29.d, z29.s
+       ld1d    z27.d, p7/z, [x1, x4, lsl 3]
+       movprfx z28, z30
+       mul     z28.s, p6/m, z28.s, z31.s
        add     z27.d, z27.d, #1
-       whilelo p6.d, x4, x2
-       st1d    z27.d, p7, [x0, z29.d, lsl 3]
-       incw    z30.s
+       st1d    z27.d, p7, [x0, z28.d, uxtw 3]
+       incd    x4
+       add     z30.s, z30.s, z29.s
        whilelo p7.d, x4, x3
        b.any   .L86
.L84:
        ret

The patch was bootstrapped and tested on aarch64-linux-gnu, no
regression.
OK for mainline?

Signed-off-by: Jennifer Schmitz <[email protected]>

gcc/
        * tree-vect-stmts.cc (vectorizable_store): Extend the use of
        n_adjacent_stores to also cover vec_to_scalar operations.
        * config/aarch64/aarch64-tuning-flags.def: Remove
        use_new_vector_costs as tuning option.
        * config/aarch64/aarch64.cc (aarch64_use_new_vector_costs_p):
        Remove.
        (aarch64_vector_costs::add_stmt_cost): Remove use of
        aarch64_use_new_vector_costs_p.
        (aarch64_vector_costs::finish_cost): Remove use of
        aarch64_use_new_vector_costs_p.
        * config/aarch64/tuning_models/cortexx925.h: Remove
        AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS.
        * config/aarch64/tuning_models/fujitsu_monaka.h: Likewise.
        * config/aarch64/tuning_models/generic_armv8_a.h: Likewise.
        * config/aarch64/tuning_models/generic_armv9_a.h: Likewise.
        * config/aarch64/tuning_models/neoverse512tvb.h: Likewise.
        * config/aarch64/tuning_models/neoversen2.h: Likewise.
        * config/aarch64/tuning_models/neoversen3.h: Likewise.
        * config/aarch64/tuning_models/neoversev1.h: Likewise.
        * config/aarch64/tuning_models/neoversev2.h: Likewise.
        * config/aarch64/tuning_models/neoversev3.h: Likewise.
        * config/aarch64/tuning_models/neoversev3ae.h: Likewise.

gcc/testsuite/
        * gcc.target/aarch64/sve/strided_load_2.c: Adjust expected outcome.
        * gcc.target/aarch64/sve/strided_store_2.c: Likewise.
1 parent e53277d commit 70035b6

16 files changed

+27
-50
lines changed

gcc/config/aarch64/aarch64-tuning-flags.def

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -38,8 +38,6 @@ AARCH64_EXTRA_TUNING_OPTION ("cheap_shift_extend", CHEAP_SHIFT_EXTEND)
3838

3939
AARCH64_EXTRA_TUNING_OPTION ("cse_sve_vl_constants", CSE_SVE_VL_CONSTANTS)
4040

41-
AARCH64_EXTRA_TUNING_OPTION ("use_new_vector_costs", USE_NEW_VECTOR_COSTS)
42-
4341
AARCH64_EXTRA_TUNING_OPTION ("matched_vector_throughput", MATCHED_VECTOR_THROUGHPUT)
4442

4543
AARCH64_EXTRA_TUNING_OPTION ("avoid_cross_loop_fma", AVOID_CROSS_LOOP_FMA)

gcc/config/aarch64/aarch64.cc

Lines changed: 4 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -16627,16 +16627,6 @@ aarch64_vectorize_create_costs (vec_info *vinfo, bool costing_for_scalar)
1662716627
return new aarch64_vector_costs (vinfo, costing_for_scalar);
1662816628
}
1662916629

16630-
/* Return true if the current CPU should use the new costs defined
16631-
in GCC 11. This should be removed for GCC 12 and above, with the
16632-
costs applying to all CPUs instead. */
16633-
static bool
16634-
aarch64_use_new_vector_costs_p ()
16635-
{
16636-
return (aarch64_tune_params.extra_tuning_flags
16637-
& AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS);
16638-
}
16639-
1664016630
/* Return the appropriate SIMD costs for vectors of type VECTYPE. */
1664116631
static const simd_vec_cost *
1664216632
aarch64_simd_vec_costs (tree vectype)
@@ -17555,7 +17545,7 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
1755517545

1755617546
/* Do one-time initialization based on the vinfo. */
1755717547
loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
17558-
if (!m_analyzed_vinfo && aarch64_use_new_vector_costs_p ())
17548+
if (!m_analyzed_vinfo)
1755917549
{
1756017550
if (loop_vinfo)
1756117551
analyze_loop_vinfo (loop_vinfo);
@@ -17573,7 +17563,7 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
1757317563

1757417564
/* Try to get a more accurate cost by looking at STMT_INFO instead
1757517565
of just looking at KIND. */
17576-
if (stmt_info && aarch64_use_new_vector_costs_p ())
17566+
if (stmt_info)
1757717567
{
1757817568
/* If we scalarize a strided store, the vectorizer costs one
1757917569
vec_to_scalar for each element. However, we can store the first
@@ -17638,7 +17628,7 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
1763817628
else
1763917629
m_num_last_promote_demote = 0;
1764017630

17641-
if (stmt_info && aarch64_use_new_vector_costs_p ())
17631+
if (stmt_info)
1764217632
{
1764317633
/* Account for any extra "embedded" costs that apply additively
1764417634
to the base cost calculated above. */
@@ -17999,9 +17989,7 @@ aarch64_vector_costs::finish_cost (const vector_costs *uncast_scalar_costs)
1799917989

1800017990
auto *scalar_costs
1800117991
= static_cast<const aarch64_vector_costs *> (uncast_scalar_costs);
18002-
if (loop_vinfo
18003-
&& m_vec_flags
18004-
&& aarch64_use_new_vector_costs_p ())
17992+
if (loop_vinfo && m_vec_flags)
1800517993
{
1800617994
m_costs[vect_body] = adjust_body_cost (loop_vinfo, scalar_costs,
1800717995
m_costs[vect_body]);

gcc/config/aarch64/tuning_models/cortexx925.h

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -221,7 +221,6 @@ static const struct tune_params cortexx925_tunings =
221221
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
222222
(AARCH64_EXTRA_TUNE_BASE
223223
| AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
224-
| AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
225224
| AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
226225
| AARCH64_EXTRA_TUNE_AVOID_PRED_RMW), /* tune_flags. */
227226
&generic_armv9a_prefetch_tune,

gcc/config/aarch64/tuning_models/fujitsu_monaka.h

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -55,7 +55,6 @@ static const struct tune_params fujitsu_monaka_tunings =
5555
0, /* max_case_values. */
5656
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
5757
(AARCH64_EXTRA_TUNE_BASE
58-
| AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
5958
| AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */
6059
&generic_prefetch_tune,
6160
AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */

gcc/config/aarch64/tuning_models/generic_armv8_a.h

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -183,7 +183,6 @@ static const struct tune_params generic_armv8_a_tunings =
183183
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
184184
(AARCH64_EXTRA_TUNE_BASE
185185
| AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
186-
| AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
187186
| AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */
188187
&generic_prefetch_tune,
189188
AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */

gcc/config/aarch64/tuning_models/generic_armv9_a.h

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -251,7 +251,6 @@ static const struct tune_params generic_armv9_a_tunings =
251251
0, /* max_case_values. */
252252
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
253253
(AARCH64_EXTRA_TUNE_BASE
254-
| AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
255254
| AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */
256255
&generic_armv9a_prefetch_tune,
257256
AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */

gcc/config/aarch64/tuning_models/neoverse512tvb.h

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -156,7 +156,6 @@ static const struct tune_params neoverse512tvb_tunings =
156156
0, /* max_case_values. */
157157
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
158158
(AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
159-
| AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
160159
| AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */
161160
&generic_armv9a_prefetch_tune,
162161
AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */

gcc/config/aarch64/tuning_models/neoversen2.h

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -219,7 +219,6 @@ static const struct tune_params neoversen2_tunings =
219219
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
220220
(AARCH64_EXTRA_TUNE_BASE
221221
| AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
222-
| AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
223222
| AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
224223
| AARCH64_EXTRA_TUNE_AVOID_PRED_RMW), /* tune_flags. */
225224
&generic_armv9a_prefetch_tune,

gcc/config/aarch64/tuning_models/neoversen3.h

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -219,7 +219,6 @@ static const struct tune_params neoversen3_tunings =
219219
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
220220
(AARCH64_EXTRA_TUNE_BASE
221221
| AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
222-
| AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
223222
| AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */
224223
&generic_armv9a_prefetch_tune,
225224
AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */

gcc/config/aarch64/tuning_models/neoversev1.h

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -228,7 +228,6 @@ static const struct tune_params neoversev1_tunings =
228228
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
229229
(AARCH64_EXTRA_TUNE_BASE
230230
| AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
231-
| AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
232231
| AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
233232
| AARCH64_EXTRA_TUNE_AVOID_PRED_RMW), /* tune_flags. */
234233
&generic_armv9a_prefetch_tune,

0 commit comments

Comments
 (0)