Skip to content

Missing auto-vectorization for take+sum #115160

Closed
@numas13

Description

@numas13

LLVM cannot auto-vectorize the following code:

/// Returns the sum of at most the first `l` elements of `s`.
///
/// This is the iterator-adaptor form (`take` followed by `sum`) that the
/// report identifies as not being auto-vectorized.
pub fn slice_sum(s: &[u64], l: usize) -> u64 {
    // `take(l)` ends after `l` items or at the end of the slice, whichever comes first.
    s.iter().take(l).sum()
}
ASM
example::slice_sum:
        test    rdx, rdx
        je      .LBB0_1
        shl     rsi, 3
        xor     ecx, ecx
        xor     eax, eax
.LBB0_3:
        cmp     rsi, rcx
        je      .LBB0_5
        add     rax, qword ptr [rdi + rcx]
        add     rcx, 8
        dec     rdx
        jne     .LBB0_3
.LBB0_5:
        ret
.LBB0_1:
        xor     eax, eax
        ret

But it can auto-vectorize the code when a `for` loop is used instead of `fold` or `sum`:

/// Returns the sum of at most the first `l` elements of `s`, accumulated in an
/// explicit `for` loop.
///
/// This is the hand-written loop form that the report identifies as being
/// auto-vectorized.
pub fn slice_sum_loop(s: &[u64], l: usize) -> u64 {
    // Running total; its type is inferred as `u64` from the return type.
    let mut acc = 0;
    // `take(l)` ends after `l` items or at the end of the slice, whichever comes first.
    for i in s.iter().take(l) {
        // `i` is a `&u64` borrowed from the slice; dereference it to add the value.
        acc += *i;
    }
    acc
}
ASM
example::slice_sum_loop:
        xor     eax, eax
        test    rdx, rdx
        je      .LBB1_10
        test    rsi, rsi
        je      .LBB1_10
        lea     rax, [rsi - 1]
        mov     cl, 61
        bzhi    rax, rax, rcx
        lea     r8, [rdx - 1]
        cmp     rax, r8
        cmovb   r8, rax
        cmp     r8, 15
        jae     .LBB1_4
        xor     eax, eax
        mov     rcx, rdi
        jmp     .LBB1_7
.LBB1_4:
        inc     r8
        mov     r9, r8
        and     r9, -16
        lea     rcx, [rdi + 8*r9]
        sub     rdx, r9
        vpxor   xmm0, xmm0, xmm0
        xor     eax, eax
        vpxor   xmm1, xmm1, xmm1
        vpxor   xmm2, xmm2, xmm2
        vpxor   xmm3, xmm3, xmm3
.LBB1_5:
        vpaddq  ymm0, ymm0, ymmword ptr [rdi + 8*rax]
        vpaddq  ymm1, ymm1, ymmword ptr [rdi + 8*rax + 32]
        vpaddq  ymm2, ymm2, ymmword ptr [rdi + 8*rax + 64]
        vpaddq  ymm3, ymm3, ymmword ptr [rdi + 8*rax + 96]
        add     rax, 16
        cmp     r9, rax
        jne     .LBB1_5
        vpaddq  ymm0, ymm1, ymm0
        vpaddq  ymm1, ymm3, ymm2
        vpaddq  ymm0, ymm1, ymm0
        vextracti128    xmm1, ymm0, 1
        vpaddq  xmm0, xmm0, xmm1
        vpshufd xmm1, xmm0, 238
        vpaddq  xmm0, xmm0, xmm1
        vmovq   rax, xmm0
        cmp     r8, r9
        je      .LBB1_10
.LBB1_7:
        lea     rsi, [rdi + 8*rsi]
        add     rcx, 8
        dec     rdx
.LBB1_8:
        add     rax, qword ptr [rcx - 8]
        sub     rdx, 1
        jb      .LBB1_10
        lea     rdi, [rcx + 8]
        cmp     rcx, rsi
        mov     rcx, rdi
        jne     .LBB1_8
.LBB1_10:
        vzeroupper
        ret
$ rustc -C opt-level=3 -C target-cpu=x86-64-v3 ...
$ rustc --version --verbose
rustc 1.71.0 (8ede3aae2 2023-07-12)
binary: rustc
commit-hash: 8ede3aae28fe6e4d52b38157d7bfe0d3bceef225
commit-date: 2023-07-12
host: x86_64-unknown-linux-gnu
release: 1.71.0
LLVM version: 16.0.5
Compiler returned: 0

https://rust.godbolt.org/z/o1hcvczTW

Metadata

Metadata

Assignees

Labels

A-iterators — Area: Iterators
I-slow — Issue: Problems and improvements with respect to performance of generated code.
T-libs — Relevant to the library team, which will review and decide on the PR/issue.

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions