Closed
Description
LLVM cannot auto-vectorize the following code:
/// Returns the sum of at most the first `l` elements of `s`.
///
/// `take(l)` stops after `l` items or at the end of the slice, whichever
/// comes first, so an `l` larger than `s.len()` sums the whole slice and
/// `l == 0` (or an empty slice) yields `0`.
///
/// This iterator-adapter form (`take` followed by `sum`) is the one this
/// report says is not auto-vectorized (rustc 1.71.0, LLVM 16.0.5).
pub fn slice_sum(s: &[u64], l: usize) -> u64 {
s.iter().take(l).sum()
}
ASM
# Compiler output for `slice_sum`: a purely scalar loop, one u64 per
# iteration, with two separate exit tests per element.
#
# Register roles as used below:
#   rdi = base address of the slice data
#   rsi = slice length (elements on entry, bytes after the shl)
#   rdx = remaining `take` count (starts at `l`)
#   rcx = byte offset of the current element
#   rax = running sum / return value
# NOTE(review): the mapping of rdi/rsi/rdx to (s.ptr, s.len, l) is read off
# from how the registers are used here; confirm against the ABI in effect.
example::slice_sum:
# l == 0: nothing to take, return 0 via .LBB0_1.
test rdx, rdx
je .LBB0_1
# Convert the length to bytes (len * 8) so it is comparable with rcx.
shl rsi, 3
# offset = 0, sum = 0.
xor ecx, ecx
xor eax, eax
.LBB0_3:
# Exit test 1: the byte offset reached the end of the slice.
cmp rsi, rcx
je .LBB0_5
add rax, qword ptr [rdi + rcx]
add rcx, 8
# Exit test 2: the take counter reached zero.
dec rdx
jne .LBB0_3
.LBB0_5:
ret
.LBB0_1:
xor eax, eax
ret
But it can auto-vectorize the code when an explicit `for` loop is used instead of `fold` or `sum`:
/// Returns the sum of at most the first `l` elements of `s`, accumulating
/// with an explicit `for` loop.
///
/// Iterates over the same `s.iter().take(l)` sequence, so `l` larger than
/// `s.len()` sums the whole slice and `l == 0` (or an empty slice) yields `0`.
///
/// This explicit-loop form is the one this report says does get
/// auto-vectorized (rustc 1.71.0, LLVM 16.0.5, `target-cpu=x86-64-v3`).
pub fn slice_sum_loop(s: &[u64], l: usize) -> u64 {
// Running total; starts at zero so an empty iteration returns 0.
let mut acc = 0;
for i in s.iter().take(l) {
acc += *i;
}
acc
}
ASM
# Compiler output for `slice_sum_loop`: a vector main loop that consumes
# 16 u64 elements per iteration (4 ymm accumulators x 4 lanes), followed by a
# scalar tail loop for the remainder.
#
# Register roles as used below:
#   rdi = base address of the slice data
#   rsi = slice length in elements (later reused as the end pointer)
#   rdx = remaining `take` count (starts at `l`)
#   r8  = trip count - 1, then trip count after the inc
#   r9  = trip count rounded down to a multiple of 16
#   rax = scratch, then vector-loop element index, then the scalar sum
#   rcx = scratch, then the tail-loop element pointer
# NOTE(review): the mapping of rdi/rsi/rdx to (s.ptr, s.len, l) is read off
# from how the registers are used here; confirm against the ABI in effect.
example::slice_sum_loop:
# Result defaults to 0 for the two early exits below.
xor eax, eax
# l == 0: nothing to take.
test rdx, rdx
je .LBB1_10
# Empty slice: nothing to sum.
test rsi, rsi
je .LBB1_10
# rax = (len - 1) with bits 61 and above cleared; bzhi takes its bit index
# from the low byte of rcx, which is why only cl is written.
lea rax, [rsi - 1]
mov cl, 61
bzhi rax, rax, rcx
# r8 = min(rax, l - 1), unsigned: the number of loop iterations minus one.
lea r8, [rdx - 1]
cmp rax, r8
cmovb r8, rax
# At least 16 iterations are needed to enter the vector loop.
cmp r8, 15
jae .LBB1_4
# Fewer than 16: sum = 0, pointer = start of data, go straight to the tail.
xor eax, eax
mov rcx, rdi
jmp .LBB1_7
.LBB1_4:
# r8 = trip count; r9 = the part of it the vector loop handles.
inc r8
mov r9, r8
and r9, -16
# rcx = address of the first element left for the scalar tail.
lea rcx, [rdi + 8*r9]
# Reduce the take count by the elements the vector loop will consume.
sub rdx, r9
# Zero the four accumulators (the VEX xmm write also clears the upper ymm
# half) and the element index.
vpxor xmm0, xmm0, xmm0
xor eax, eax
vpxor xmm1, xmm1, xmm1
vpxor xmm2, xmm2, xmm2
vpxor xmm3, xmm3, xmm3
.LBB1_5:
# Four independent 4 x u64 partial sums over 128 contiguous bytes.
vpaddq ymm0, ymm0, ymmword ptr [rdi + 8*rax]
vpaddq ymm1, ymm1, ymmword ptr [rdi + 8*rax + 32]
vpaddq ymm2, ymm2, ymmword ptr [rdi + 8*rax + 64]
vpaddq ymm3, ymm3, ymmword ptr [rdi + 8*rax + 96]
add rax, 16
cmp r9, rax
jne .LBB1_5
# Combine the four accumulators into ymm0.
vpaddq ymm0, ymm1, ymm0
vpaddq ymm1, ymm3, ymm2
vpaddq ymm0, ymm1, ymm0
# Horizontal reduction: fold the upper 128 bits onto the lower ones, then
# fold the high qword onto the low qword (shuffle immediate 238 = 0xEE
# selects dwords 2,3,2,3).
vextracti128 xmm1, ymm0, 1
vpaddq xmm0, xmm0, xmm1
vpshufd xmm1, xmm0, 238
vpaddq xmm0, xmm0, xmm1
# rax = scalar sum of everything the vector loop processed.
vmovq rax, xmm0
# Trip count was an exact multiple of 16: no tail to run.
cmp r8, r9
je .LBB1_10
.LBB1_7:
# rsi = one-past-the-end pointer of the slice.
lea rsi, [rdi + 8*rsi]
# Bias the pointer by one element and the counter by one; the tail loop
# reads [rcx - 8] and tests the counter with a borrow.
add rcx, 8
dec rdx
.LBB1_8:
add rax, qword ptr [rcx - 8]
# Borrow means the counter was already 0: the take limit has been reached.
sub rdx, 1
jb .LBB1_10
# Advance the pointer; the compare uses the pre-advance value, so the loop
# ends once the element just added was the last one in the slice.
lea rdi, [rcx + 8]
cmp rcx, rsi
mov rcx, rdi
jne .LBB1_8
.LBB1_10:
# Shared exit for every path, including the early exits that never touched
# a ymm register.
vzeroupper
ret
$ rustc -C opt-level=3 -C target-cpu=x86-64-v3 ...
$ rustc --version --verbose
rustc 1.71.0 (8ede3aae2 2023-07-12)
binary: rustc
commit-hash: 8ede3aae28fe6e4d52b38157d7bfe0d3bceef225
commit-date: 2023-07-12
host: x86_64-unknown-linux-gnu
release: 1.71.0
LLVM version: 16.0.5
Compiler returned: 0