Open
Description
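LLVM generates inefficient code for loops that access an array with a constant stride. Rather than advancing the scalar base address by a fixed amount each iteration and reusing a loop-invariant offset vector, the vectorized loop keeps a vector of element indices, updates it with a vector add, and rescales it with a vector multiply on every iteration before the scatter store. These extra vector operations inside the loop body degrade performance.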
For example:
/*
 * Reproducer: writes 1 to every fifth element of `a`, i.e. a[0], a[5],
 * a[10], ..., a[5 * (n - 1)]. Nothing is written when n <= 0.
 * The caller must provide storage for at least 5 * (n - 1) + 1 doubles.
 */
void func(double* a, int n)
{
for (int i = 0; i < n; i++) {
/* Constant stride of 5 elements; this strided store is what gets
   vectorized into the scatter stores shown in the listings below. */
a[i*5] = 1;
}
}
SVE
.LBB0_4:
add z3.d, z2.d, z0.d
mul z2.d, z2.d, #40
subs x11, x11, x9
st1d { z1.d }, p0, [x0, z2.d]
mov z2.d, z3.d
b.ne .LBB0_4
AVX-512
.LBB0_4:
vpmullq ymm4, ymm0, ymm1
kxnorw k1, k0, k0
vscatterqpd qword ptr [rdi + ymm4] {k1}, ymm2
vpaddq ymm0, ymm0, ymm3
add rdx, -4
jne .LBB0_4