From b3078d1bfc6aa1c92c0c85d9352c6fa2896b1adf Mon Sep 17 00:00:00 2001 From: medievalghoul <61852278+medievalghoul@users.noreply.github.com> Date: Mon, 22 Jul 2024 14:10:42 -0400 Subject: [PATCH 01/14] Uploaded test without optimization --- llvm/test/CodeGen/AArch64/avg-i128.ll | 136 +++++++++++++++++++ llvm/test/CodeGen/X86/avg-i128.ll | 184 ++++++++++++++++++++++++++ 2 files changed, 320 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/avg-i128.ll create mode 100644 llvm/test/CodeGen/X86/avg-i128.ll diff --git a/llvm/test/CodeGen/AArch64/avg-i128.ll b/llvm/test/CodeGen/AArch64/avg-i128.ll new file mode 100644 index 0000000000000..d1d2cd7dd0c89 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/avg-i128.ll @@ -0,0 +1,136 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64 < %s | FileCheck %s + +define i128 @avgflooru_i128(i128 %x, i128 %y) { +; CHECK-LABEL: avgflooru_i128: +; CHECK: // %bb.0: // %start +; CHECK-NEXT: eor x8, x3, x1 +; CHECK-NEXT: eor x9, x2, x0 +; CHECK-NEXT: and x10, x2, x0 +; CHECK-NEXT: extr x9, x8, x9, #1 +; CHECK-NEXT: lsr x8, x8, #1 +; CHECK-NEXT: and x11, x3, x1 +; CHECK-NEXT: adds x0, x10, x9 +; CHECK-NEXT: adc x1, x11, x8 +; CHECK-NEXT: ret +start: + %xor = xor i128 %y, %x + %lshr = lshr i128 %xor, 1 + %and = and i128 %y, %x + %add = add i128 %lshr, %and + ret i128 %add +} + +declare void @use(i8) + +define i128 @avgflooru_i128_multi_use(i128 %x, i128 %y) { +; CHECK-LABEL: avgflooru_i128_multi_use: +; CHECK: // %bb.0: // %start +; CHECK-NEXT: str x30, [sp, #-64]! // 8-byte Folded Spill +; CHECK-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w21, -24 +; CHECK-NEXT: .cfi_offset w22, -32 +; CHECK-NEXT: .cfi_offset w23, -40 +; CHECK-NEXT: .cfi_offset w24, -48 +; CHECK-NEXT: .cfi_offset w30, -64 +; CHECK-NEXT: eor x23, x3, x1 +; CHECK-NEXT: eor x24, x2, x0 +; CHECK-NEXT: mov x21, x1 +; CHECK-NEXT: mov x22, x0 +; CHECK-NEXT: mov x0, x24 +; CHECK-NEXT: mov x1, x23 +; CHECK-NEXT: mov x19, x3 +; CHECK-NEXT: mov x20, x2 +; CHECK-NEXT: bl use +; CHECK-NEXT: extr x24, x23, x24, #1 +; CHECK-NEXT: lsr x23, x23, #1 +; CHECK-NEXT: mov x0, x24 +; CHECK-NEXT: mov x1, x23 +; CHECK-NEXT: bl use +; CHECK-NEXT: and x8, x20, x22 +; CHECK-NEXT: and x9, x19, x21 +; CHECK-NEXT: adds x0, x8, x24 +; CHECK-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: adc x1, x9, x23 +; CHECK-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp], #64 // 8-byte Folded Reload +; CHECK-NEXT: ret +start: + %xor = xor i128 %y, %x + call void @use(i128 %xor) + %lshr = lshr i128 %xor, 1 + call void @use(i128 %lshr) + %and = and i128 %y, %x + %add = add i128 %lshr, %and + ret i128 %add +} + +define i128 @avgflooru_i128_negative(i128 %x, i128 %y) { +; CHECK-LABEL: avgflooru_i128_negative: +; CHECK: // %bb.0: // %start +; CHECK-NEXT: mvn x8, x0 +; CHECK-NEXT: and x9, x2, x0 +; CHECK-NEXT: mvn x10, x1 +; CHECK-NEXT: and x11, x3, x1 +; CHECK-NEXT: adds x0, x8, x9 +; CHECK-NEXT: adc x1, x10, x11 +; CHECK-NEXT: ret +start: + %xor = xor i128 %x, -1 + %and = and i128 %y, %x + %add = add i128 %xor, %and + ret i128 %add +} + +define i32 
@avgflooru_i128_negative2(i32 %x, i32 %y) {
+; CHECK-LABEL: avgflooru_i128_negative2:
+; CHECK: // %bb.0: // %start
+; CHECK-NEXT: mov w8, w1
+; CHECK-NEXT: add x8, x8, w0, uxtw
+; CHECK-NEXT: lsr x0, x8, #1
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
+start:
+ %xor = xor i32 %y, %x
+ %lshr = lshr i32 %xor, 1
+ %and = and i32 %y, %x
+ %add = add i32 %lshr, %and
+ ret i32 %add
+}
+
+define <2 x i128> @avgflooru_i128_vec(<2 x i128> %x, <2 x i128> %y) {
+; CHECK-LABEL: avgflooru_i128_vec:
+; CHECK: // %bb.0: // %start
+; CHECK-NEXT: eor x8, x4, x0
+; CHECK-NEXT: eor x9, x5, x1
+; CHECK-NEXT: eor x11, x6, x2
+; CHECK-NEXT: extr x8, x9, x8, #1
+; CHECK-NEXT: eor x12, x7, x3
+; CHECK-NEXT: and x13, x4, x0
+; CHECK-NEXT: lsr x9, x9, #1
+; CHECK-NEXT: extr x11, x12, x11, #1
+; CHECK-NEXT: and x10, x5, x1
+; CHECK-NEXT: adds x0, x13, x8
+; CHECK-NEXT: lsr x8, x12, #1
+; CHECK-NEXT: and x12, x6, x2
+; CHECK-NEXT: adc x1, x10, x9
+; CHECK-NEXT: adds x10, x12, x11
+; CHECK-NEXT: and x9, x7, x3
+; CHECK-NEXT: fmov d0, x10
+; CHECK-NEXT: adc x3, x9, x8
+; CHECK-NEXT: mov v0.d[1], x3
+; CHECK-NEXT: fmov x2, d0
+; CHECK-NEXT: ret
+start:
+ %xor = xor <2 x i128> %y, %x
+ %lshr = lshr <2 x i128> %xor, <i128 1, i128 1>
+ %and = and <2 x i128> %y, %x
+ %add = add <2 x i128> %lshr, %and
+ ret <2 x i128> %add
+}
diff --git a/llvm/test/CodeGen/X86/avg-i128.ll b/llvm/test/CodeGen/X86/avg-i128.ll
new file mode 100644
index 0000000000000..82827720865dc
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avg-i128.ll
@@ -0,0 +1,184 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=x86_64 < %s | FileCheck %s
+
+define i128 @avgflooru_i128(i128 %x, i128 %y) {
+; CHECK-LABEL: avgflooru_i128:
+; CHECK: # %bb.0: # %start
+; CHECK-NEXT: movq %rdx, %rax
+; CHECK-NEXT: xorq %rdi, %rax
+; CHECK-NEXT: movq %rcx, %r8
+; CHECK-NEXT: xorq %rsi, %r8
+; CHECK-NEXT: shrdq $1, %r8, %rax
+; CHECK-NEXT: andq %rsi, %rcx
+; CHECK-NEXT: shrq %r8
+; CHECK-NEXT: andq %rdi, %rdx
+; CHECK-NEXT: addq %rdx, %rax
+; CHECK-NEXT: adcq %rcx, %r8
+; CHECK-NEXT: movq %r8, %rdx
+; CHECK-NEXT: retq
+start:
+ %xor = xor i128 %y, %x
+ %lshr = lshr i128 %xor, 1
+ %and = and i128 %y, %x
+ %add = add i128 %lshr, %and
+ ret i128 %add
+}
+
+declare void @use(i8)
+
+define i128 @avgflooru_i128_multi_use(i128 %x, i128 %y) {
+; CHECK-LABEL: avgflooru_i128_multi_use:
+; CHECK: # %bb.0: # %start
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: pushq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: pushq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 56
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: .cfi_def_cfa_offset 64
+; CHECK-NEXT: .cfi_offset %rbx, -56
+; CHECK-NEXT: .cfi_offset %r12, -48
+; CHECK-NEXT: .cfi_offset %r13, -40
+; CHECK-NEXT: .cfi_offset %r14, -32
+; CHECK-NEXT: .cfi_offset %r15, -24
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movq %rcx, %rbx
+; CHECK-NEXT: movq %rdx, %r14
+; CHECK-NEXT: movq %rsi, %r15
+; CHECK-NEXT: movq %rdi, %r12
+; CHECK-NEXT: movq %rdx, %r13
+; CHECK-NEXT: xorq %rdi, %r13
+; CHECK-NEXT: movq %rcx, %rbp
+; CHECK-NEXT: xorq %rsi, %rbp
+; CHECK-NEXT: movq %r13, %rdi
+; CHECK-NEXT: movq %rbp, %rsi
+; CHECK-NEXT: callq use@PLT
+; CHECK-NEXT: shrdq $1, %rbp, %r13
+; CHECK-NEXT: shrq %rbp
+; CHECK-NEXT: movq %r13, %rdi
+; 
CHECK-NEXT: movq %rbp, %rsi
+; CHECK-NEXT: callq use@PLT
+; CHECK-NEXT: andq %r15, %rbx
+; CHECK-NEXT: andq %r12, %r14
+; CHECK-NEXT: addq %r13, %r14
+; CHECK-NEXT: adcq %rbp, %rbx
+; CHECK-NEXT: movq %r14, %rax
+; CHECK-NEXT: movq %rbx, %rdx
+; CHECK-NEXT: addq $8, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 56
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: popq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: popq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
+start:
+ %xor = xor i128 %y, %x
+ call void @use(i128 %xor)
+ %lshr = lshr i128 %xor, 1
+ call void @use(i128 %lshr)
+ %and = and i128 %y, %x
+ %add = add i128 %lshr, %and
+ ret i128 %add
+}
+
+define i128 @avgflooru_i128_negative(i128 %x, i128 %y) {
+; CHECK-LABEL: avgflooru_i128_negative:
+; CHECK: # %bb.0: # %start
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: andq %rsi, %rcx
+; CHECK-NEXT: notq %rsi
+; CHECK-NEXT: andq %rdi, %rdx
+; CHECK-NEXT: notq %rax
+; CHECK-NEXT: addq %rdx, %rax
+; CHECK-NEXT: adcq %rcx, %rsi
+; CHECK-NEXT: movq %rsi, %rdx
+; CHECK-NEXT: retq
+start:
+ %xor = xor i128 %x, -1
+ %and = and i128 %y, %x
+ %add = add i128 %xor, %and
+ ret i128 %add
+}
+
+define i32 @avgflooru_i128_negative2(i32 %x, i32 %y) {
+; CHECK-LABEL: avgflooru_i128_negative2:
+; CHECK: # %bb.0: # %start
+; CHECK-NEXT: movl %edi, %ecx
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: addq %rcx, %rax
+; CHECK-NEXT: shrq %rax
+; CHECK-NEXT: # kill: def $eax killed $eax killed $rax
+; CHECK-NEXT: retq
+start:
+ %xor = xor i32 %y, %x
+ %lshr = lshr i32 %xor, 1
+ %and = and i32 %y, %x
+ %add = add i32 %lshr, %and
+ ret i32 %add
+}
+
+define <2 x i128> @avgflooru_i128_vec(<2 x i128> %x, <2 x i128> %y) {
+; CHECK-LABEL: avgflooru_i128_vec:
+; CHECK: # %bb.0: # %start
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: .cfi_offset %rbx, -24
+; CHECK-NEXT: .cfi_offset %r14, -16
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r9
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbx
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; CHECK-NEXT: movq %r10, %r14
+; CHECK-NEXT: xorq %rsi, %r14
+; CHECK-NEXT: movq %rbx, %r11
+; CHECK-NEXT: xorq %rdx, %r11
+; CHECK-NEXT: shrdq $1, %r11, %r14
+; CHECK-NEXT: andq %rdx, %rbx
+; CHECK-NEXT: shrq %r11
+; CHECK-NEXT: andq %rsi, %r10
+; CHECK-NEXT: addq %r14, %r10
+; CHECK-NEXT: adcq %rbx, %r11
+; CHECK-NEXT: movq %r9, %rdx
+; CHECK-NEXT: xorq %rcx, %rdx
+; CHECK-NEXT: movq %rdi, %rsi
+; CHECK-NEXT: xorq %r8, %rsi
+; CHECK-NEXT: shrdq $1, %rsi, %rdx
+; CHECK-NEXT: andq %r8, %rdi
+; CHECK-NEXT: shrq %rsi
+; CHECK-NEXT: andq %rcx, %r9
+; CHECK-NEXT: addq %rdx, %r9
+; CHECK-NEXT: adcq %rdi, %rsi
+; CHECK-NEXT: movq %r9, 16(%rax)
+; CHECK-NEXT: movq %r10, (%rax)
+; CHECK-NEXT: movq %rsi, 24(%rax)
+; CHECK-NEXT: movq %r11, 8(%rax)
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
+start:
+ %xor = xor <2 x i128> %y, %x
+ %lshr = lshr <2 x i128> %xor, <i128 1, i128 1>
+ %and = and <2 x i128> %y, %x
+ %add = add <2 x i128> %lshr, %and
+ ret <2 x i128> %add
+}

From d3a909eb469b3ee12d83b7a76ed6929615d3a70b Mon Sep 17 00:00:00 2001
From: medievalghoul 
<61852278+medievalghoul@users.noreply.github.com> Date: Mon, 22 Jul 2024 14:16:50 -0400 Subject: [PATCH 02/14] Uploaded test with optimization --- llvm/test/CodeGen/AArch64/avg-i128.ll | 57 ++++++++---------- llvm/test/CodeGen/X86/avg-i128.ll | 83 ++++++++++----------------- 2 files changed, 54 insertions(+), 86 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/avg-i128.ll b/llvm/test/CodeGen/AArch64/avg-i128.ll index d1d2cd7dd0c89..75ee52decbb70 100644 --- a/llvm/test/CodeGen/AArch64/avg-i128.ll +++ b/llvm/test/CodeGen/AArch64/avg-i128.ll @@ -4,14 +4,12 @@ define i128 @avgflooru_i128(i128 %x, i128 %y) { ; CHECK-LABEL: avgflooru_i128: ; CHECK: // %bb.0: // %start -; CHECK-NEXT: eor x8, x3, x1 -; CHECK-NEXT: eor x9, x2, x0 -; CHECK-NEXT: and x10, x2, x0 -; CHECK-NEXT: extr x9, x8, x9, #1 -; CHECK-NEXT: lsr x8, x8, #1 -; CHECK-NEXT: and x11, x3, x1 -; CHECK-NEXT: adds x0, x10, x9 -; CHECK-NEXT: adc x1, x11, x8 +; CHECK-NEXT: adds x9, x0, x2 +; CHECK-NEXT: mov x8, #-9223372036854775808 // =0x8000000000000000 +; CHECK-NEXT: adcs x10, x1, x3 +; CHECK-NEXT: csel x1, x8, xzr, hs +; CHECK-NEXT: extr x0, x10, x9, #1 +; CHECK-NEXT: bfxil x1, x10, #1, #63 ; CHECK-NEXT: ret start: %xor = xor i128 %y, %x @@ -47,18 +45,18 @@ define i128 @avgflooru_i128_multi_use(i128 %x, i128 %y) { ; CHECK-NEXT: mov x19, x3 ; CHECK-NEXT: mov x20, x2 ; CHECK-NEXT: bl use -; CHECK-NEXT: extr x24, x23, x24, #1 -; CHECK-NEXT: lsr x23, x23, #1 -; CHECK-NEXT: mov x0, x24 -; CHECK-NEXT: mov x1, x23 +; CHECK-NEXT: extr x0, x23, x24, #1 +; CHECK-NEXT: lsr x1, x23, #1 ; CHECK-NEXT: bl use -; CHECK-NEXT: and x8, x20, x22 -; CHECK-NEXT: and x9, x19, x21 -; CHECK-NEXT: adds x0, x8, x24 +; CHECK-NEXT: adds x8, x22, x20 +; CHECK-NEXT: mov x10, #-9223372036854775808 // =0x8000000000000000 +; CHECK-NEXT: adcs x9, x21, x19 ; CHECK-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: adc x1, x9, x23 ; CHECK-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: csel x1, x10, xzr, hs ; CHECK-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: extr x0, x9, x8, #1 +; CHECK-NEXT: bfxil x1, x9, #1, #63 ; CHECK-NEXT: ldr x30, [sp], #64 // 8-byte Folded Reload ; CHECK-NEXT: ret start: @@ -107,23 +105,18 @@ start: define <2 x i128> @avgflooru_i128_vec(<2 x i128> %x, <2 x i128> %y) { ; CHECK-LABEL: avgflooru_i128_vec: ; CHECK: // %bb.0: // %start -; CHECK-NEXT: eor x8, x4, x0 -; CHECK-NEXT: eor x9, x5, x1 -; CHECK-NEXT: eor x11, x6, x2 -; CHECK-NEXT: extr x8, x9, x8, #1 -; CHECK-NEXT: eor x12, x7, x3 -; CHECK-NEXT: and x13, x4, x0 -; CHECK-NEXT: lsr x9, x9, #1 -; CHECK-NEXT: extr x11, x12, x11, #1 -; CHECK-NEXT: and x10, x5, x1 -; CHECK-NEXT: adds x0, x13, x8 -; CHECK-NEXT: lsr x8, x12, #1 -; CHECK-NEXT: and x12, x6, x2 -; CHECK-NEXT: adc x1, x10, x9 -; CHECK-NEXT: adds x10, x12, x11 -; CHECK-NEXT: and x9, x7, x3 +; CHECK-NEXT: adds x8, x0, x4 +; CHECK-NEXT: mov x10, #-9223372036854775808 // =0x8000000000000000 +; CHECK-NEXT: adcs x9, x1, x5 +; CHECK-NEXT: csel x1, x10, xzr, hs +; CHECK-NEXT: adds x11, x2, x6 +; CHECK-NEXT: extr x0, x9, x8, #1 +; CHECK-NEXT: adcs x12, x3, x7 +; CHECK-NEXT: bfxil x1, x9, #1, #63 +; CHECK-NEXT: csel x3, x10, xzr, hs +; CHECK-NEXT: extr x10, x12, x11, #1 +; CHECK-NEXT: bfxil x3, x12, #1, #63 ; CHECK-NEXT: fmov d0, x10 -; CHECK-NEXT: adc x3, x9, x8 ; CHECK-NEXT: mov v0.d[1], x3 ; CHECK-NEXT: fmov x2, d0 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/X86/avg-i128.ll b/llvm/test/CodeGen/X86/avg-i128.ll index 82827720865dc..e0e3283c308d7 100644 --- 
a/llvm/test/CodeGen/X86/avg-i128.ll +++ b/llvm/test/CodeGen/X86/avg-i128.ll @@ -4,17 +4,13 @@ define i128 @avgflooru_i128(i128 %x, i128 %y) { ; CHECK-LABEL: avgflooru_i128: ; CHECK: # %bb.0: # %start -; CHECK-NEXT: movq %rdx, %rax -; CHECK-NEXT: xorq %rdi, %rax -; CHECK-NEXT: movq %rcx, %r8 -; CHECK-NEXT: xorq %rsi, %r8 -; CHECK-NEXT: shrdq $1, %r8, %rax -; CHECK-NEXT: andq %rsi, %rcx -; CHECK-NEXT: shrq %r8 -; CHECK-NEXT: andq %rdi, %rdx +; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: addq %rdx, %rax -; CHECK-NEXT: adcq %rcx, %r8 -; CHECK-NEXT: movq %r8, %rdx +; CHECK-NEXT: adcq %rcx, %rsi +; CHECK-NEXT: setb %cl +; CHECK-NEXT: shrdq $1, %rsi, %rax +; CHECK-NEXT: movzbl %cl, %edx +; CHECK-NEXT: shldq $63, %rsi, %rdx ; CHECK-NEXT: retq start: %xor = xor i128 %y, %x @@ -65,12 +61,13 @@ define i128 @avgflooru_i128_multi_use(i128 %x, i128 %y) { ; CHECK-NEXT: movq %r13, %rdi ; CHECK-NEXT: movq %rbp, %rsi ; CHECK-NEXT: callq use@PLT -; CHECK-NEXT: andq %r15, %rbx -; CHECK-NEXT: andq %r12, %r14 -; CHECK-NEXT: addq %r13, %r14 -; CHECK-NEXT: adcq %rbp, %rbx -; CHECK-NEXT: movq %r14, %rax -; CHECK-NEXT: movq %rbx, %rdx +; CHECK-NEXT: addq %r14, %r12 +; CHECK-NEXT: adcq %rbx, %r15 +; CHECK-NEXT: setb %al +; CHECK-NEXT: shrdq $1, %r15, %r12 +; CHECK-NEXT: movzbl %al, %edx +; CHECK-NEXT: shldq $63, %r15, %rdx +; CHECK-NEXT: movq %r12, %rax ; CHECK-NEXT: addq $8, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 56 ; CHECK-NEXT: popq %rbx @@ -135,45 +132,23 @@ start: define <2 x i128> @avgflooru_i128_vec(<2 x i128> %x, <2 x i128> %y) { ; CHECK-LABEL: avgflooru_i128_vec: ; CHECK: # %bb.0: # %start -; CHECK-NEXT: pushq %r14 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: .cfi_def_cfa_offset 24 -; CHECK-NEXT: .cfi_offset %rbx, -24 -; CHECK-NEXT: .cfi_offset %r14, -16 ; CHECK-NEXT: movq %rdi, %rax -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r9 -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbx -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; CHECK-NEXT: movq %r10, %r14 -; CHECK-NEXT: xorq %rsi, %r14 -; CHECK-NEXT: movq %rbx, %r11 -; CHECK-NEXT: xorq %rdx, %r11 -; CHECK-NEXT: shrdq $1, %r11, %r14 -; CHECK-NEXT: andq %rdx, %rbx -; CHECK-NEXT: shrq %r11 -; CHECK-NEXT: andq %rsi, %r10 -; CHECK-NEXT: addq %r14, %r10 -; CHECK-NEXT: adcq %rbx, %r11 -; CHECK-NEXT: movq %r9, %rdx -; CHECK-NEXT: xorq %rcx, %rdx -; CHECK-NEXT: movq %rdi, %rsi -; CHECK-NEXT: xorq %r8, %rsi -; CHECK-NEXT: shrdq $1, %rsi, %rdx -; CHECK-NEXT: andq %r8, %rdi -; CHECK-NEXT: shrq %rsi -; CHECK-NEXT: andq %rcx, %r9 -; CHECK-NEXT: addq %rdx, %r9 -; CHECK-NEXT: adcq %rdi, %rsi -; CHECK-NEXT: movq %r9, 16(%rax) -; CHECK-NEXT: movq %r10, (%rax) -; CHECK-NEXT: movq %rsi, 24(%rax) -; CHECK-NEXT: movq %r11, 8(%rax) -; CHECK-NEXT: popq %rbx -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: popq %r14 -; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: addq {{[0-9]+}}(%rsp), %rsi +; CHECK-NEXT: adcq {{[0-9]+}}(%rsp), %rdx +; CHECK-NEXT: setb %dil +; CHECK-NEXT: movzbl %dil, %edi +; CHECK-NEXT: shldq $63, %rdx, %rdi +; CHECK-NEXT: addq {{[0-9]+}}(%rsp), %rcx +; CHECK-NEXT: adcq {{[0-9]+}}(%rsp), %r8 +; CHECK-NEXT: setb %r9b +; CHECK-NEXT: movzbl %r9b, %r9d +; CHECK-NEXT: shldq $63, %r8, %r9 +; CHECK-NEXT: shldq $63, %rsi, %rdx +; CHECK-NEXT: shldq $63, %rcx, %r8 +; CHECK-NEXT: movq %r8, 16(%rax) +; CHECK-NEXT: movq %rdx, (%rax) +; CHECK-NEXT: movq %r9, 24(%rax) +; CHECK-NEXT: movq %rdi, 8(%rax) ; CHECK-NEXT: retq start: %xor = xor <2 x i128> %y, %x From 8881fca9573772d6b89fc6b03451a5e55062b0ef Mon Sep 17 
00:00:00 2001 From: medievalghoul <61852278+medievalghoul@users.noreply.github.com> Date: Mon, 22 Jul 2024 14:18:58 -0400 Subject: [PATCH 03/14] the legalization of i128 --- .../CodeGen/SelectionDAG/TargetLowering.cpp | 24 ++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index c3a20b5044c5f..92795bd37a562 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -9318,7 +9318,8 @@ SDValue TargetLowering::expandAVG(SDNode *N, SelectionDAG &DAG) const { unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; assert((Opc == ISD::AVGFLOORS || Opc == ISD::AVGCEILS || Opc == ISD::AVGFLOORU || Opc == ISD::AVGCEILU) && - "Unknown AVG node"); + "Unknown AVG node"); + EVT SVT = VT.getScalarType(); // If the operands are already extended, we can add+shift. bool IsExt = @@ -9352,6 +9353,27 @@ SDValue TargetLowering::expandAVG(SDNode *N, SelectionDAG &DAG) const { } } + if (Opc == ISD::AVGFLOORU && SVT == MVT::i128) { + SDValue UAddWithOverflow = DAG.getNode(ISD::UADDO, dl, + DAG.getVTList(VT, MVT::i1), { RHS, LHS }); + + SDValue Sum = UAddWithOverflow.getValue(0); + SDValue Overflow = UAddWithOverflow.getValue(1); + + // Right shift the sum by 1 + SDValue One = DAG.getConstant(1, dl, VT); + SDValue LShrVal = DAG.getNode(ISD::SRL, dl, VT, Sum, One); + + // Creating the select instruction + APInt SignMin = APInt::getSignedMinValue(VT.getSizeInBits()); + SDValue SignMinVal = DAG.getConstant(SignMin, dl, VT); + SDValue ZeroOut = DAG.getConstant(0, dl, VT); + + SDValue SelectVal = DAG.getSelect(dl, VT, Overflow, SignMinVal, ZeroOut); + + return DAG.getNode(ISD::OR, dl, VT, LShrVal, SelectVal); + } + // avgceils(lhs, rhs) -> sub(or(lhs,rhs),ashr(xor(lhs,rhs),1)) // avgceilu(lhs, rhs) -> sub(or(lhs,rhs),lshr(xor(lhs,rhs),1)) // avgfloors(lhs, rhs) -> add(and(lhs,rhs),ashr(xor(lhs,rhs),1)) From 2bb628df46af2a9b43a0cf388d3caaee7dc10814 Mon Sep 17 00:00:00 2001 From: medievalghoul <61852278+medievalghoul@users.noreply.github.com> Date: Mon, 22 Jul 2024 20:06:56 -0400 Subject: [PATCH 04/14] Reuploaded the test without optimization --- llvm/test/CodeGen/AArch64/avg-i128.ll | 71 +++++++++-------- llvm/test/CodeGen/X86/avg-i128.ll | 105 ++++++++++++++------------ 2 files changed, 90 insertions(+), 86 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/avg-i128.ll b/llvm/test/CodeGen/AArch64/avg-i128.ll index 75ee52decbb70..73e3fef8d7e85 100644 --- a/llvm/test/CodeGen/AArch64/avg-i128.ll +++ b/llvm/test/CodeGen/AArch64/avg-i128.ll @@ -4,12 +4,14 @@ define i128 @avgflooru_i128(i128 %x, i128 %y) { ; CHECK-LABEL: avgflooru_i128: ; CHECK: // %bb.0: // %start -; CHECK-NEXT: adds x9, x0, x2 -; CHECK-NEXT: mov x8, #-9223372036854775808 // =0x8000000000000000 -; CHECK-NEXT: adcs x10, x1, x3 -; CHECK-NEXT: csel x1, x8, xzr, hs -; CHECK-NEXT: extr x0, x10, x9, #1 -; CHECK-NEXT: bfxil x1, x10, #1, #63 +; CHECK-NEXT: eor x8, x3, x1 +; CHECK-NEXT: eor x9, x2, x0 +; CHECK-NEXT: and x10, x2, x0 +; CHECK-NEXT: extr x9, x8, x9, #1 +; CHECK-NEXT: lsr x8, x8, #1 +; CHECK-NEXT: and x11, x3, x1 +; CHECK-NEXT: adds x0, x10, x9 +; CHECK-NEXT: adc x1, x11, x8 ; CHECK-NEXT: ret start: %xor = xor i128 %y, %x @@ -21,42 +23,34 @@ start: declare void @use(i8) -define i128 @avgflooru_i128_multi_use(i128 %x, i128 %y) { +define i128 @avgflooru_i128_multi_use(i128 %x, i128 %y) nounwind { ; CHECK-LABEL: avgflooru_i128_multi_use: ; CHECK: // %bb.0: 
// %start ; CHECK-NEXT: str x30, [sp, #-64]! // 8-byte Folded Spill ; CHECK-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: .cfi_offset w19, -8 -; CHECK-NEXT: .cfi_offset w20, -16 -; CHECK-NEXT: .cfi_offset w21, -24 -; CHECK-NEXT: .cfi_offset w22, -32 -; CHECK-NEXT: .cfi_offset w23, -40 -; CHECK-NEXT: .cfi_offset w24, -48 -; CHECK-NEXT: .cfi_offset w30, -64 ; CHECK-NEXT: eor x23, x3, x1 ; CHECK-NEXT: eor x24, x2, x0 +; CHECK-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: mov x21, x1 ; CHECK-NEXT: mov x22, x0 ; CHECK-NEXT: mov x0, x24 ; CHECK-NEXT: mov x1, x23 +; CHECK-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: mov x19, x3 ; CHECK-NEXT: mov x20, x2 ; CHECK-NEXT: bl use -; CHECK-NEXT: extr x0, x23, x24, #1 -; CHECK-NEXT: lsr x1, x23, #1 +; CHECK-NEXT: extr x24, x23, x24, #1 +; CHECK-NEXT: lsr x23, x23, #1 +; CHECK-NEXT: mov x0, x24 +; CHECK-NEXT: mov x1, x23 ; CHECK-NEXT: bl use -; CHECK-NEXT: adds x8, x22, x20 -; CHECK-NEXT: mov x10, #-9223372036854775808 // =0x8000000000000000 -; CHECK-NEXT: adcs x9, x21, x19 +; CHECK-NEXT: and x8, x20, x22 +; CHECK-NEXT: and x9, x19, x21 +; CHECK-NEXT: adds x0, x8, x24 ; CHECK-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: adc x1, x9, x23 ; CHECK-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: csel x1, x10, xzr, hs ; CHECK-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: extr x0, x9, x8, #1 -; CHECK-NEXT: bfxil x1, x9, #1, #63 ; CHECK-NEXT: ldr x30, [sp], #64 // 8-byte Folded Reload ; CHECK-NEXT: ret start: @@ -105,18 +99,23 @@ start: define <2 x i128> @avgflooru_i128_vec(<2 x i128> %x, <2 x i128> %y) { ; CHECK-LABEL: avgflooru_i128_vec: ; CHECK: // %bb.0: // %start -; CHECK-NEXT: adds x8, x0, x4 -; CHECK-NEXT: mov x10, #-9223372036854775808 // =0x8000000000000000 -; CHECK-NEXT: adcs x9, x1, x5 -; CHECK-NEXT: csel x1, x10, xzr, hs -; CHECK-NEXT: adds x11, x2, x6 -; CHECK-NEXT: extr x0, x9, x8, #1 -; CHECK-NEXT: adcs x12, x3, x7 -; CHECK-NEXT: bfxil x1, x9, #1, #63 -; CHECK-NEXT: csel x3, x10, xzr, hs -; CHECK-NEXT: extr x10, x12, x11, #1 -; CHECK-NEXT: bfxil x3, x12, #1, #63 +; CHECK-NEXT: eor x8, x4, x0 +; CHECK-NEXT: eor x9, x5, x1 +; CHECK-NEXT: eor x11, x6, x2 +; CHECK-NEXT: extr x8, x9, x8, #1 +; CHECK-NEXT: eor x12, x7, x3 +; CHECK-NEXT: and x13, x4, x0 +; CHECK-NEXT: lsr x9, x9, #1 +; CHECK-NEXT: extr x11, x12, x11, #1 +; CHECK-NEXT: and x10, x5, x1 +; CHECK-NEXT: adds x0, x13, x8 +; CHECK-NEXT: lsr x8, x12, #1 +; CHECK-NEXT: and x12, x6, x2 +; CHECK-NEXT: adc x1, x10, x9 +; CHECK-NEXT: adds x10, x12, x11 +; CHECK-NEXT: and x9, x7, x3 ; CHECK-NEXT: fmov d0, x10 +; CHECK-NEXT: adc x3, x9, x8 ; CHECK-NEXT: mov v0.d[1], x3 ; CHECK-NEXT: fmov x2, d0 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/X86/avg-i128.ll b/llvm/test/CodeGen/X86/avg-i128.ll index e0e3283c308d7..f0f9a5802a26d 100644 --- a/llvm/test/CodeGen/X86/avg-i128.ll +++ b/llvm/test/CodeGen/X86/avg-i128.ll @@ -4,13 +4,17 @@ define i128 @avgflooru_i128(i128 %x, i128 %y) { ; CHECK-LABEL: avgflooru_i128: ; CHECK: # %bb.0: # %start -; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: movq %rdx, %rax +; CHECK-NEXT: xorq %rdi, %rax +; CHECK-NEXT: movq %rcx, %r8 +; CHECK-NEXT: xorq %rsi, %r8 +; CHECK-NEXT: shrdq $1, %r8, %rax +; CHECK-NEXT: andq %rsi, %rcx +; CHECK-NEXT: shrq %r8 +; CHECK-NEXT: andq %rdi, %rdx ; 
CHECK-NEXT: addq %rdx, %rax -; CHECK-NEXT: adcq %rcx, %rsi -; CHECK-NEXT: setb %cl -; CHECK-NEXT: shrdq $1, %rsi, %rax -; CHECK-NEXT: movzbl %cl, %edx -; CHECK-NEXT: shldq $63, %rsi, %rdx +; CHECK-NEXT: adcq %rcx, %r8 +; CHECK-NEXT: movq %r8, %rdx ; CHECK-NEXT: retq start: %xor = xor i128 %y, %x @@ -22,29 +26,16 @@ start: declare void @use(i8) -define i128 @avgflooru_i128_multi_use(i128 %x, i128 %y) { +define i128 @avgflooru_i128_multi_use(i128 %x, i128 %y) nounwind { ; CHECK-LABEL: avgflooru_i128_multi_use: ; CHECK: # %bb.0: # %start ; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: pushq %r15 -; CHECK-NEXT: .cfi_def_cfa_offset 24 ; CHECK-NEXT: pushq %r14 -; CHECK-NEXT: .cfi_def_cfa_offset 32 ; CHECK-NEXT: pushq %r13 -; CHECK-NEXT: .cfi_def_cfa_offset 40 ; CHECK-NEXT: pushq %r12 -; CHECK-NEXT: .cfi_def_cfa_offset 48 ; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: .cfi_def_cfa_offset 56 ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: .cfi_offset %rbx, -56 -; CHECK-NEXT: .cfi_offset %r12, -48 -; CHECK-NEXT: .cfi_offset %r13, -40 -; CHECK-NEXT: .cfi_offset %r14, -32 -; CHECK-NEXT: .cfi_offset %r15, -24 -; CHECK-NEXT: .cfi_offset %rbp, -16 ; CHECK-NEXT: movq %rcx, %rbx ; CHECK-NEXT: movq %rdx, %r14 ; CHECK-NEXT: movq %rsi, %r15 @@ -61,27 +52,19 @@ define i128 @avgflooru_i128_multi_use(i128 %x, i128 %y) { ; CHECK-NEXT: movq %r13, %rdi ; CHECK-NEXT: movq %rbp, %rsi ; CHECK-NEXT: callq use@PLT -; CHECK-NEXT: addq %r14, %r12 -; CHECK-NEXT: adcq %rbx, %r15 -; CHECK-NEXT: setb %al -; CHECK-NEXT: shrdq $1, %r15, %r12 -; CHECK-NEXT: movzbl %al, %edx -; CHECK-NEXT: shldq $63, %r15, %rdx -; CHECK-NEXT: movq %r12, %rax +; CHECK-NEXT: andq %r15, %rbx +; CHECK-NEXT: andq %r12, %r14 +; CHECK-NEXT: addq %r13, %r14 +; CHECK-NEXT: adcq %rbp, %rbx +; CHECK-NEXT: movq %r14, %rax +; CHECK-NEXT: movq %rbx, %rdx ; CHECK-NEXT: addq $8, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 56 ; CHECK-NEXT: popq %rbx -; CHECK-NEXT: .cfi_def_cfa_offset 48 ; CHECK-NEXT: popq %r12 -; CHECK-NEXT: .cfi_def_cfa_offset 40 ; CHECK-NEXT: popq %r13 -; CHECK-NEXT: .cfi_def_cfa_offset 32 ; CHECK-NEXT: popq %r14 -; CHECK-NEXT: .cfi_def_cfa_offset 24 ; CHECK-NEXT: popq %r15 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: popq %rbp -; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq start: %xor = xor i128 %y, %x @@ -132,23 +115,45 @@ start: define <2 x i128> @avgflooru_i128_vec(<2 x i128> %x, <2 x i128> %y) { ; CHECK-LABEL: avgflooru_i128_vec: ; CHECK: # %bb.0: # %start +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: .cfi_offset %rbx, -24 +; CHECK-NEXT: .cfi_offset %r14, -16 ; CHECK-NEXT: movq %rdi, %rax -; CHECK-NEXT: addq {{[0-9]+}}(%rsp), %rsi -; CHECK-NEXT: adcq {{[0-9]+}}(%rsp), %rdx -; CHECK-NEXT: setb %dil -; CHECK-NEXT: movzbl %dil, %edi -; CHECK-NEXT: shldq $63, %rdx, %rdi -; CHECK-NEXT: addq {{[0-9]+}}(%rsp), %rcx -; CHECK-NEXT: adcq {{[0-9]+}}(%rsp), %r8 -; CHECK-NEXT: setb %r9b -; CHECK-NEXT: movzbl %r9b, %r9d -; CHECK-NEXT: shldq $63, %r8, %r9 -; CHECK-NEXT: shldq $63, %rsi, %rdx -; CHECK-NEXT: shldq $63, %rcx, %r8 -; CHECK-NEXT: movq %r8, 16(%rax) -; CHECK-NEXT: movq %rdx, (%rax) -; CHECK-NEXT: movq %r9, 24(%rax) -; CHECK-NEXT: movq %rdi, 8(%rax) +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r9 +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbx +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; CHECK-NEXT: movq %r10, %r14 +; CHECK-NEXT: xorq %rsi, %r14 +; 
CHECK-NEXT: movq %rbx, %r11 +; CHECK-NEXT: xorq %rdx, %r11 +; CHECK-NEXT: shrdq $1, %r11, %r14 +; CHECK-NEXT: andq %rdx, %rbx +; CHECK-NEXT: shrq %r11 +; CHECK-NEXT: andq %rsi, %r10 +; CHECK-NEXT: addq %r14, %r10 +; CHECK-NEXT: adcq %rbx, %r11 +; CHECK-NEXT: movq %r9, %rdx +; CHECK-NEXT: xorq %rcx, %rdx +; CHECK-NEXT: movq %rdi, %rsi +; CHECK-NEXT: xorq %r8, %rsi +; CHECK-NEXT: shrdq $1, %rsi, %rdx +; CHECK-NEXT: andq %r8, %rdi +; CHECK-NEXT: shrq %rsi +; CHECK-NEXT: andq %rcx, %r9 +; CHECK-NEXT: addq %rdx, %r9 +; CHECK-NEXT: adcq %rdi, %rsi +; CHECK-NEXT: movq %r9, 16(%rax) +; CHECK-NEXT: movq %r10, (%rax) +; CHECK-NEXT: movq %rsi, 24(%rax) +; CHECK-NEXT: movq %r11, 8(%rax) +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq start: %xor = xor <2 x i128> %y, %x From 74b4c96bcf501be23b8fde34772e38900f3c9168 Mon Sep 17 00:00:00 2001 From: medievalghoul <61852278+medievalghoul@users.noreply.github.com> Date: Mon, 22 Jul 2024 20:17:00 -0400 Subject: [PATCH 05/14] Reuploaded the test with optimization --- llvm/test/CodeGen/AArch64/avg-i128.ll | 64 ++++++------ llvm/test/CodeGen/X86/avg-i128.ll | 135 +++++++++++++------------- 2 files changed, 96 insertions(+), 103 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/avg-i128.ll b/llvm/test/CodeGen/AArch64/avg-i128.ll index 73e3fef8d7e85..e3d9f65327738 100644 --- a/llvm/test/CodeGen/AArch64/avg-i128.ll +++ b/llvm/test/CodeGen/AArch64/avg-i128.ll @@ -4,14 +4,13 @@ define i128 @avgflooru_i128(i128 %x, i128 %y) { ; CHECK-LABEL: avgflooru_i128: ; CHECK: // %bb.0: // %start -; CHECK-NEXT: eor x8, x3, x1 -; CHECK-NEXT: eor x9, x2, x0 -; CHECK-NEXT: and x10, x2, x0 -; CHECK-NEXT: extr x9, x8, x9, #1 -; CHECK-NEXT: lsr x8, x8, #1 -; CHECK-NEXT: and x11, x3, x1 -; CHECK-NEXT: adds x0, x10, x9 -; CHECK-NEXT: adc x1, x11, x8 +; CHECK-NEXT: adds x8, x0, x2 +; CHECK-NEXT: adcs x9, x1, x3 +; CHECK-NEXT: cset w10, hs +; CHECK-NEXT: extr x0, x9, x8, #1 +; CHECK-NEXT: lsl x10, x10, #63 +; CHECK-NEXT: csel x1, x10, xzr, hs +; CHECK-NEXT: bfxil x1, x9, #1, #63 ; CHECK-NEXT: ret start: %xor = xor i128 %y, %x @@ -39,18 +38,19 @@ define i128 @avgflooru_i128_multi_use(i128 %x, i128 %y) nounwind { ; CHECK-NEXT: mov x19, x3 ; CHECK-NEXT: mov x20, x2 ; CHECK-NEXT: bl use -; CHECK-NEXT: extr x24, x23, x24, #1 -; CHECK-NEXT: lsr x23, x23, #1 -; CHECK-NEXT: mov x0, x24 -; CHECK-NEXT: mov x1, x23 +; CHECK-NEXT: extr x0, x23, x24, #1 +; CHECK-NEXT: lsr x1, x23, #1 ; CHECK-NEXT: bl use -; CHECK-NEXT: and x8, x20, x22 -; CHECK-NEXT: and x9, x19, x21 -; CHECK-NEXT: adds x0, x8, x24 +; CHECK-NEXT: adds x8, x22, x20 +; CHECK-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: adcs x9, x21, x19 ; CHECK-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: adc x1, x9, x23 +; CHECK-NEXT: cset w10, hs ; CHECK-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: lsl x10, x10, #63 +; CHECK-NEXT: extr x0, x9, x8, #1 +; CHECK-NEXT: csel x1, x10, xzr, hs +; CHECK-NEXT: bfxil x1, x9, #1, #63 ; CHECK-NEXT: ldr x30, [sp], #64 // 8-byte Folded Reload ; CHECK-NEXT: ret start: @@ -99,23 +99,21 @@ start: define <2 x i128> @avgflooru_i128_vec(<2 x i128> %x, <2 x i128> %y) { ; CHECK-LABEL: avgflooru_i128_vec: ; CHECK: // %bb.0: // %start -; CHECK-NEXT: eor x8, x4, x0 -; CHECK-NEXT: eor x9, x5, x1 -; CHECK-NEXT: eor x11, x6, x2 -; CHECK-NEXT: extr x8, x9, x8, #1 -; CHECK-NEXT: eor x12, x7, x3 -; 
CHECK-NEXT: and x13, x4, x0 -; CHECK-NEXT: lsr x9, x9, #1 -; CHECK-NEXT: extr x11, x12, x11, #1 -; CHECK-NEXT: and x10, x5, x1 -; CHECK-NEXT: adds x0, x13, x8 -; CHECK-NEXT: lsr x8, x12, #1 -; CHECK-NEXT: and x12, x6, x2 -; CHECK-NEXT: adc x1, x10, x9 -; CHECK-NEXT: adds x10, x12, x11 -; CHECK-NEXT: and x9, x7, x3 +; CHECK-NEXT: adds x8, x0, x4 +; CHECK-NEXT: adcs x9, x1, x5 +; CHECK-NEXT: cset w10, hs +; CHECK-NEXT: extr x0, x9, x8, #1 +; CHECK-NEXT: lsl x10, x10, #63 +; CHECK-NEXT: csel x1, x10, xzr, hs +; CHECK-NEXT: adds x10, x2, x6 +; CHECK-NEXT: adcs x11, x3, x7 +; CHECK-NEXT: bfxil x1, x9, #1, #63 +; CHECK-NEXT: cset w12, hs +; CHECK-NEXT: extr x10, x11, x10, #1 +; CHECK-NEXT: lsl x12, x12, #63 ; CHECK-NEXT: fmov d0, x10 -; CHECK-NEXT: adc x3, x9, x8 +; CHECK-NEXT: csel x3, x12, xzr, hs +; CHECK-NEXT: bfxil x3, x11, #1, #63 ; CHECK-NEXT: mov v0.d[1], x3 ; CHECK-NEXT: fmov x2, d0 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/X86/avg-i128.ll b/llvm/test/CodeGen/X86/avg-i128.ll index f0f9a5802a26d..7b530be63c52f 100644 --- a/llvm/test/CodeGen/X86/avg-i128.ll +++ b/llvm/test/CodeGen/X86/avg-i128.ll @@ -4,17 +4,19 @@ define i128 @avgflooru_i128(i128 %x, i128 %y) { ; CHECK-LABEL: avgflooru_i128: ; CHECK: # %bb.0: # %start -; CHECK-NEXT: movq %rdx, %rax -; CHECK-NEXT: xorq %rdi, %rax -; CHECK-NEXT: movq %rcx, %r8 -; CHECK-NEXT: xorq %rsi, %r8 -; CHECK-NEXT: shrdq $1, %r8, %rax -; CHECK-NEXT: andq %rsi, %rcx -; CHECK-NEXT: shrq %r8 -; CHECK-NEXT: andq %rdi, %rdx +; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: addq %rdx, %rax -; CHECK-NEXT: adcq %rcx, %r8 -; CHECK-NEXT: movq %r8, %rdx +; CHECK-NEXT: adcq %rcx, %rsi +; CHECK-NEXT: setb %cl +; CHECK-NEXT: setb %dl +; CHECK-NEXT: movzbl %dl, %edi +; CHECK-NEXT: shlq $63, %rdi +; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: testb %cl, %cl +; CHECK-NEXT: cmovneq %rdi, %rdx +; CHECK-NEXT: shrdq $1, %rsi, %rax +; CHECK-NEXT: shrq %rsi +; CHECK-NEXT: orq %rsi, %rdx ; CHECK-NEXT: retq start: %xor = xor i128 %y, %x @@ -29,42 +31,44 @@ declare void @use(i8) define i128 @avgflooru_i128_multi_use(i128 %x, i128 %y) nounwind { ; CHECK-LABEL: avgflooru_i128_multi_use: ; CHECK: # %bb.0: # %start -; CHECK-NEXT: pushq %rbp ; CHECK-NEXT: pushq %r15 ; CHECK-NEXT: pushq %r14 ; CHECK-NEXT: pushq %r13 ; CHECK-NEXT: pushq %r12 ; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: pushq %rax -; CHECK-NEXT: movq %rcx, %rbx -; CHECK-NEXT: movq %rdx, %r14 ; CHECK-NEXT: movq %rsi, %r15 ; CHECK-NEXT: movq %rdi, %r12 -; CHECK-NEXT: movq %rdx, %r13 -; CHECK-NEXT: xorq %rdi, %r13 -; CHECK-NEXT: movq %rcx, %rbp -; CHECK-NEXT: xorq %rsi, %rbp -; CHECK-NEXT: movq %r13, %rdi -; CHECK-NEXT: movq %rbp, %rsi +; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: addq %rdx, %rbx +; CHECK-NEXT: movq %rsi, %r14 +; CHECK-NEXT: adcq %rcx, %r14 +; CHECK-NEXT: setb %al +; CHECK-NEXT: setb %sil +; CHECK-NEXT: movzbl %sil, %esi +; CHECK-NEXT: shlq $63, %rsi +; CHECK-NEXT: xorl %r13d, %r13d +; CHECK-NEXT: testb %al, %al +; CHECK-NEXT: cmovneq %rsi, %r13 +; CHECK-NEXT: xorq %rcx, %r15 +; CHECK-NEXT: xorq %rdx, %r12 +; CHECK-NEXT: movq %r12, %rdi +; CHECK-NEXT: movq %r15, %rsi ; CHECK-NEXT: callq use@PLT -; CHECK-NEXT: shrdq $1, %rbp, %r13 -; CHECK-NEXT: shrq %rbp -; CHECK-NEXT: movq %r13, %rdi -; CHECK-NEXT: movq %rbp, %rsi +; CHECK-NEXT: shrdq $1, %r15, %r12 +; CHECK-NEXT: shrq %r15 +; CHECK-NEXT: movq %r12, %rdi +; CHECK-NEXT: movq %r15, %rsi ; CHECK-NEXT: callq use@PLT -; CHECK-NEXT: andq %r15, %rbx -; CHECK-NEXT: andq %r12, %r14 -; CHECK-NEXT: addq %r13, %r14 -; CHECK-NEXT: adcq %rbp, %rbx -; CHECK-NEXT: movq 
%r14, %rax -; CHECK-NEXT: movq %rbx, %rdx -; CHECK-NEXT: addq $8, %rsp +; CHECK-NEXT: shrdq $1, %r14, %rbx +; CHECK-NEXT: shrq %r14 +; CHECK-NEXT: orq %r13, %r14 +; CHECK-NEXT: movq %rbx, %rax +; CHECK-NEXT: movq %r14, %rdx ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r12 ; CHECK-NEXT: popq %r13 ; CHECK-NEXT: popq %r14 ; CHECK-NEXT: popq %r15 -; CHECK-NEXT: popq %rbp ; CHECK-NEXT: retq start: %xor = xor i128 %y, %x @@ -115,45 +119,36 @@ start: define <2 x i128> @avgflooru_i128_vec(<2 x i128> %x, <2 x i128> %y) { ; CHECK-LABEL: avgflooru_i128_vec: ; CHECK: # %bb.0: # %start -; CHECK-NEXT: pushq %r14 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: .cfi_def_cfa_offset 24 -; CHECK-NEXT: .cfi_offset %rbx, -24 -; CHECK-NEXT: .cfi_offset %r14, -16 ; CHECK-NEXT: movq %rdi, %rax -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r9 -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbx -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; CHECK-NEXT: movq %r10, %r14 -; CHECK-NEXT: xorq %rsi, %r14 -; CHECK-NEXT: movq %rbx, %r11 -; CHECK-NEXT: xorq %rdx, %r11 -; CHECK-NEXT: shrdq $1, %r11, %r14 -; CHECK-NEXT: andq %rdx, %rbx -; CHECK-NEXT: shrq %r11 -; CHECK-NEXT: andq %rsi, %r10 -; CHECK-NEXT: addq %r14, %r10 -; CHECK-NEXT: adcq %rbx, %r11 -; CHECK-NEXT: movq %r9, %rdx -; CHECK-NEXT: xorq %rcx, %rdx -; CHECK-NEXT: movq %rdi, %rsi -; CHECK-NEXT: xorq %r8, %rsi -; CHECK-NEXT: shrdq $1, %rsi, %rdx -; CHECK-NEXT: andq %r8, %rdi -; CHECK-NEXT: shrq %rsi -; CHECK-NEXT: andq %rcx, %r9 -; CHECK-NEXT: addq %rdx, %r9 -; CHECK-NEXT: adcq %rdi, %rsi -; CHECK-NEXT: movq %r9, 16(%rax) -; CHECK-NEXT: movq %r10, (%rax) -; CHECK-NEXT: movq %rsi, 24(%rax) -; CHECK-NEXT: movq %r11, 8(%rax) -; CHECK-NEXT: popq %rbx -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: popq %r14 -; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: addq {{[0-9]+}}(%rsp), %rsi +; CHECK-NEXT: adcq {{[0-9]+}}(%rsp), %rdx +; CHECK-NEXT: setb %dil +; CHECK-NEXT: setb %r9b +; CHECK-NEXT: movzbl %r9b, %r9d +; CHECK-NEXT: shlq $63, %r9 +; CHECK-NEXT: xorl %r10d, %r10d +; CHECK-NEXT: testb %dil, %dil +; CHECK-NEXT: cmoveq %r10, %r9 +; CHECK-NEXT: addq {{[0-9]+}}(%rsp), %rcx +; CHECK-NEXT: adcq {{[0-9]+}}(%rsp), %r8 +; CHECK-NEXT: setb %dil +; CHECK-NEXT: setb %r11b +; CHECK-NEXT: movzbl %r11b, %r11d +; CHECK-NEXT: shlq $63, %r11 +; CHECK-NEXT: testb %dil, %dil +; CHECK-NEXT: cmoveq %r10, %r11 +; CHECK-NEXT: movq %rdx, %rdi +; CHECK-NEXT: shrq %rdi +; CHECK-NEXT: orq %r9, %rdi +; CHECK-NEXT: movq %r8, %r9 +; CHECK-NEXT: shrq %r9 +; CHECK-NEXT: orq %r11, %r9 +; CHECK-NEXT: shldq $63, %rsi, %rdx +; CHECK-NEXT: shldq $63, %rcx, %r8 +; CHECK-NEXT: movq %r8, 16(%rax) +; CHECK-NEXT: movq %rdx, (%rax) +; CHECK-NEXT: movq %r9, 24(%rax) +; CHECK-NEXT: movq %rdi, 8(%rax) ; CHECK-NEXT: retq start: %xor = xor <2 x i128> %y, %x From 7bbee40f884aadfe2dbcacfedf108081075456bc Mon Sep 17 00:00:00 2001 From: medievalghoul <61852278+medievalghoul@users.noreply.github.com> Date: Mon, 22 Jul 2024 20:18:49 -0400 Subject: [PATCH 06/14] Not limited to i128 anymore and other fixes --- llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 92795bd37a562..b1d49a7102e32 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -9319,7 +9319,6 @@ SDValue TargetLowering::expandAVG(SDNode *N, 
SelectionDAG &DAG) const { assert((Opc == ISD::AVGFLOORS || Opc == ISD::AVGCEILS || Opc == ISD::AVGFLOORU || Opc == ISD::AVGCEILU) && "Unknown AVG node"); - EVT SVT = VT.getScalarType(); // If the operands are already extended, we can add+shift. bool IsExt = @@ -9353,7 +9352,7 @@ SDValue TargetLowering::expandAVG(SDNode *N, SelectionDAG &DAG) const { } } - if (Opc == ISD::AVGFLOORU && SVT == MVT::i128) { + if (VT.isScalarInteger() && !isTypeLegal(VT)) { SDValue UAddWithOverflow = DAG.getNode(ISD::UADDO, dl, DAG.getVTList(VT, MVT::i1), { RHS, LHS }); @@ -9361,15 +9360,15 @@ SDValue TargetLowering::expandAVG(SDNode *N, SelectionDAG &DAG) const { SDValue Overflow = UAddWithOverflow.getValue(1); // Right shift the sum by 1 - SDValue One = DAG.getConstant(1, dl, VT); + SDValue One = DAG.getShiftAmountConstant(1, VT, dl); SDValue LShrVal = DAG.getNode(ISD::SRL, dl, VT, Sum, One); // Creating the select instruction - APInt SignMin = APInt::getSignedMinValue(VT.getSizeInBits()); - SDValue SignMinVal = DAG.getConstant(SignMin, dl, VT); - SDValue ZeroOut = DAG.getConstant(0, dl, VT); - - SDValue SelectVal = DAG.getSelect(dl, VT, Overflow, SignMinVal, ZeroOut); + SDValue ZeroOut = DAG.getConstant(0, dl, VT); + SDValue ZeroExtOverflow = DAG.getNode(ISD::ZERO_EXTEND, dl, VT, Overflow); + SDValue OverflowShl = DAG.getNode(ISD::SHL, dl, VT, ZeroExtOverflow, + DAG.getConstant(VT.getScalarSizeInBits() - 1, dl, VT)); + SDValue SelectVal = DAG.getSelect(dl, VT, Overflow, OverflowShl, ZeroOut); return DAG.getNode(ISD::OR, dl, VT, LShrVal, SelectVal); } From 9e3cb4c3ad3ad7e8f38849d5da12f1aebba99c3e Mon Sep 17 00:00:00 2001 From: medievalghoul <61852278+medievalghoul@users.noreply.github.com> Date: Tue, 23 Jul 2024 13:01:34 -0400 Subject: [PATCH 07/14] Renamed test files and added reasoning behind negative test cases --- .../test/CodeGen/AArch64/{avg-i128.ll => avgflooru-i128.ll} | 6 ++++++ llvm/test/CodeGen/X86/{avg-i128.ll => avgflooru-i128.ll} | 6 ++++++ 2 files changed, 12 insertions(+) rename llvm/test/CodeGen/AArch64/{avg-i128.ll => avgflooru-i128.ll} (94%) rename llvm/test/CodeGen/X86/{avg-i128.ll => avgflooru-i128.ll} (95%) diff --git a/llvm/test/CodeGen/AArch64/avg-i128.ll b/llvm/test/CodeGen/AArch64/avgflooru-i128.ll similarity index 94% rename from llvm/test/CodeGen/AArch64/avg-i128.ll rename to llvm/test/CodeGen/AArch64/avgflooru-i128.ll index e3d9f65327738..79a19dc45e76b 100644 --- a/llvm/test/CodeGen/AArch64/avg-i128.ll +++ b/llvm/test/CodeGen/AArch64/avgflooru-i128.ll @@ -63,6 +63,9 @@ start: ret i128 %add } +; the 'avgflooru_i128_negative` shouldn't combine because it's not +; an avgflooru operation, which is what we're targeting + define i128 @avgflooru_i128_negative(i128 %x, i128 %y) { ; CHECK-LABEL: avgflooru_i128_negative: ; CHECK: // %bb.0: // %start @@ -80,6 +83,9 @@ start: ret i128 %add } +; This negative test case shouldn't work, i32 is already properly +; handled in terms of legalization, compared to the i128 + define i32 @avgflooru_i128_negative2(i32 %x, i32 %y) { ; CHECK-LABEL: avgflooru_i128_negative2: ; CHECK: // %bb.0: // %start diff --git a/llvm/test/CodeGen/X86/avg-i128.ll b/llvm/test/CodeGen/X86/avgflooru-i128.ll similarity index 95% rename from llvm/test/CodeGen/X86/avg-i128.ll rename to llvm/test/CodeGen/X86/avgflooru-i128.ll index 7b530be63c52f..096630ad3c1f7 100644 --- a/llvm/test/CodeGen/X86/avg-i128.ll +++ b/llvm/test/CodeGen/X86/avgflooru-i128.ll @@ -80,6 +80,9 @@ start: ret i128 %add } +; This test case shouldn't combine because it's not +; an avgflooru operation + 
define i128 @avgflooru_i128_negative(i128 %x, i128 %y) { ; CHECK-LABEL: avgflooru_i128_negative: ; CHECK: # %bb.0: # %start @@ -99,6 +102,9 @@ start: ret i128 %add } +; This negative test case shouldn't combine, i32 is already properly +; handled in terms of legalization, compared to the i128 + define i32 @avgflooru_i128_negative2(i32 %x, i32 %y) { ; CHECK-LABEL: avgflooru_i128_negative2: ; CHECK: # %bb.0: # %start From 1b4573c29c505256612470050421a8bd40b7949a Mon Sep 17 00:00:00 2001 From: medievalghoul <61852278+medievalghoul@users.noreply.github.com> Date: Tue, 23 Jul 2024 13:02:32 -0400 Subject: [PATCH 08/14] proper format with other simple changes --- .../CodeGen/SelectionDAG/TargetLowering.cpp | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index b1d49a7102e32..aab44238cf819 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -9318,7 +9318,7 @@ SDValue TargetLowering::expandAVG(SDNode *N, SelectionDAG &DAG) const { unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; assert((Opc == ISD::AVGFLOORS || Opc == ISD::AVGCEILS || Opc == ISD::AVGFLOORU || Opc == ISD::AVGCEILU) && - "Unknown AVG node"); + "Unknown AVG node"); // If the operands are already extended, we can add+shift. bool IsExt = @@ -9352,9 +9352,9 @@ SDValue TargetLowering::expandAVG(SDNode *N, SelectionDAG &DAG) const { } } - if (VT.isScalarInteger() && !isTypeLegal(VT)) { - SDValue UAddWithOverflow = DAG.getNode(ISD::UADDO, dl, - DAG.getVTList(VT, MVT::i1), { RHS, LHS }); + if (Opc == ISD::AVGFLOORU && VT.isScalarInteger() && !isTypeLegal(VT)) { + SDValue UAddWithOverflow = + DAG.getNode(ISD::UADDO, dl, DAG.getVTList(VT, MVT::i1), {RHS, LHS}); SDValue Sum = UAddWithOverflow.getValue(0); SDValue Overflow = UAddWithOverflow.getValue(1); @@ -9362,15 +9362,15 @@ SDValue TargetLowering::expandAVG(SDNode *N, SelectionDAG &DAG) const { // Right shift the sum by 1 SDValue One = DAG.getShiftAmountConstant(1, VT, dl); SDValue LShrVal = DAG.getNode(ISD::SRL, dl, VT, Sum, One); - + // Creating the select instruction - SDValue ZeroOut = DAG.getConstant(0, dl, VT); - SDValue ZeroExtOverflow = DAG.getNode(ISD::ZERO_EXTEND, dl, VT, Overflow); - SDValue OverflowShl = DAG.getNode(ISD::SHL, dl, VT, ZeroExtOverflow, - DAG.getConstant(VT.getScalarSizeInBits() - 1, dl, VT)); - SDValue SelectVal = DAG.getSelect(dl, VT, Overflow, OverflowShl, ZeroOut); + SDValue ZeroOut = DAG.getConstant(0, dl, VT); + SDValue ZeroExtOverflow = DAG.getNode(ISD::ANY_EXTEND, dl, VT, Overflow); + SDValue OverflowShl = + DAG.getNode(ISD::SHL, dl, VT, ZeroExtOverflow, + DAG.getConstant(VT.getScalarSizeInBits() - 1, dl, VT)); - return DAG.getNode(ISD::OR, dl, VT, LShrVal, SelectVal); + return DAG.getNode(ISD::OR, dl, VT, LShrVal, OverflowShl); } // avgceils(lhs, rhs) -> sub(or(lhs,rhs),ashr(xor(lhs,rhs),1)) From de26c67891bfec68293950853af5f63332082a62 Mon Sep 17 00:00:00 2001 From: medievalghoul <61852278+medievalghoul@users.noreply.github.com> Date: Tue, 23 Jul 2024 13:28:39 -0400 Subject: [PATCH 09/14] remove unused variable --- llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index aab44238cf819..70d7de7b878b0 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ 
b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -9364,7 +9364,6 @@ SDValue TargetLowering::expandAVG(SDNode *N, SelectionDAG &DAG) const { SDValue LShrVal = DAG.getNode(ISD::SRL, dl, VT, Sum, One); // Creating the select instruction - SDValue ZeroOut = DAG.getConstant(0, dl, VT); SDValue ZeroExtOverflow = DAG.getNode(ISD::ANY_EXTEND, dl, VT, Overflow); SDValue OverflowShl = DAG.getNode(ISD::SHL, dl, VT, ZeroExtOverflow, From 3407bd48a338d86f60a3184c16b077734a59304c Mon Sep 17 00:00:00 2001 From: medievalghoul <61852278+medievalghoul@users.noreply.github.com> Date: Tue, 23 Jul 2024 13:29:53 -0400 Subject: [PATCH 10/14] regenerated the test files after removing the Select instruction --- llvm/test/CodeGen/AArch64/avgflooru-i128.ll | 32 +++----- llvm/test/CodeGen/X86/avgflooru-i128.ll | 82 ++++++++------------- 2 files changed, 43 insertions(+), 71 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/avgflooru-i128.ll b/llvm/test/CodeGen/AArch64/avgflooru-i128.ll index 79a19dc45e76b..d336c38f20799 100644 --- a/llvm/test/CodeGen/AArch64/avgflooru-i128.ll +++ b/llvm/test/CodeGen/AArch64/avgflooru-i128.ll @@ -8,9 +8,7 @@ define i128 @avgflooru_i128(i128 %x, i128 %y) { ; CHECK-NEXT: adcs x9, x1, x3 ; CHECK-NEXT: cset w10, hs ; CHECK-NEXT: extr x0, x9, x8, #1 -; CHECK-NEXT: lsl x10, x10, #63 -; CHECK-NEXT: csel x1, x10, xzr, hs -; CHECK-NEXT: bfxil x1, x9, #1, #63 +; CHECK-NEXT: extr x1, x10, x9, #1 ; CHECK-NEXT: ret start: %xor = xor i128 %y, %x @@ -45,12 +43,10 @@ define i128 @avgflooru_i128_multi_use(i128 %x, i128 %y) nounwind { ; CHECK-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: adcs x9, x21, x19 ; CHECK-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: cset w10, hs ; CHECK-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: lsl x10, x10, #63 +; CHECK-NEXT: cset w10, hs ; CHECK-NEXT: extr x0, x9, x8, #1 -; CHECK-NEXT: csel x1, x10, xzr, hs -; CHECK-NEXT: bfxil x1, x9, #1, #63 +; CHECK-NEXT: extr x1, x10, x9, #1 ; CHECK-NEXT: ldr x30, [sp], #64 // 8-byte Folded Reload ; CHECK-NEXT: ret start: @@ -64,7 +60,7 @@ start: } ; the 'avgflooru_i128_negative` shouldn't combine because it's not -; an avgflooru operation, which is what we're targeting +; an avgflooru operation, which is what we're targeting define i128 @avgflooru_i128_negative(i128 %x, i128 %y) { ; CHECK-LABEL: avgflooru_i128_negative: @@ -83,7 +79,7 @@ start: ret i128 %add } -; This negative test case shouldn't work, i32 is already properly +; This negative test case shouldn't work, i32 is already properly ; handled in terms of legalization, compared to the i128 define i32 @avgflooru_i128_negative2(i32 %x, i32 %y) { @@ -108,18 +104,14 @@ define <2 x i128> @avgflooru_i128_vec(<2 x i128> %x, <2 x i128> %y) { ; CHECK-NEXT: adds x8, x0, x4 ; CHECK-NEXT: adcs x9, x1, x5 ; CHECK-NEXT: cset w10, hs +; CHECK-NEXT: adds x11, x2, x6 ; CHECK-NEXT: extr x0, x9, x8, #1 -; CHECK-NEXT: lsl x10, x10, #63 -; CHECK-NEXT: csel x1, x10, xzr, hs -; CHECK-NEXT: adds x10, x2, x6 -; CHECK-NEXT: adcs x11, x3, x7 -; CHECK-NEXT: bfxil x1, x9, #1, #63 -; CHECK-NEXT: cset w12, hs -; CHECK-NEXT: extr x10, x11, x10, #1 -; CHECK-NEXT: lsl x12, x12, #63 -; CHECK-NEXT: fmov d0, x10 -; CHECK-NEXT: csel x3, x12, xzr, hs -; CHECK-NEXT: bfxil x3, x11, #1, #63 +; CHECK-NEXT: adcs x12, x3, x7 +; CHECK-NEXT: extr x1, x10, x9, #1 +; CHECK-NEXT: extr x11, x12, x11, #1 +; CHECK-NEXT: cset w13, hs +; CHECK-NEXT: extr x3, x13, x12, #1 +; CHECK-NEXT: fmov d0, x11 ; CHECK-NEXT: mov v0.d[1], x3 ; CHECK-NEXT: fmov x2, 
d0 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/X86/avgflooru-i128.ll b/llvm/test/CodeGen/X86/avgflooru-i128.ll index 096630ad3c1f7..da16a7da48ca6 100644 --- a/llvm/test/CodeGen/X86/avgflooru-i128.ll +++ b/llvm/test/CodeGen/X86/avgflooru-i128.ll @@ -8,15 +8,9 @@ define i128 @avgflooru_i128(i128 %x, i128 %y) { ; CHECK-NEXT: addq %rdx, %rax ; CHECK-NEXT: adcq %rcx, %rsi ; CHECK-NEXT: setb %cl -; CHECK-NEXT: setb %dl -; CHECK-NEXT: movzbl %dl, %edi -; CHECK-NEXT: shlq $63, %rdi -; CHECK-NEXT: xorl %edx, %edx -; CHECK-NEXT: testb %cl, %cl -; CHECK-NEXT: cmovneq %rdi, %rdx ; CHECK-NEXT: shrdq $1, %rsi, %rax -; CHECK-NEXT: shrq %rsi -; CHECK-NEXT: orq %rsi, %rdx +; CHECK-NEXT: movzbl %cl, %edx +; CHECK-NEXT: shldq $63, %rsi, %rdx ; CHECK-NEXT: retq start: %xor = xor i128 %y, %x @@ -31,44 +25,43 @@ declare void @use(i8) define i128 @avgflooru_i128_multi_use(i128 %x, i128 %y) nounwind { ; CHECK-LABEL: avgflooru_i128_multi_use: ; CHECK: # %bb.0: # %start +; CHECK-NEXT: pushq %rbp ; CHECK-NEXT: pushq %r15 ; CHECK-NEXT: pushq %r14 ; CHECK-NEXT: pushq %r13 ; CHECK-NEXT: pushq %r12 ; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: movq %rcx, %rbx +; CHECK-NEXT: movq %rdx, %r14 ; CHECK-NEXT: movq %rsi, %r15 ; CHECK-NEXT: movq %rdi, %r12 -; CHECK-NEXT: movq %rdi, %rbx -; CHECK-NEXT: addq %rdx, %rbx -; CHECK-NEXT: movq %rsi, %r14 -; CHECK-NEXT: adcq %rcx, %r14 -; CHECK-NEXT: setb %al -; CHECK-NEXT: setb %sil -; CHECK-NEXT: movzbl %sil, %esi -; CHECK-NEXT: shlq $63, %rsi -; CHECK-NEXT: xorl %r13d, %r13d -; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: cmovneq %rsi, %r13 -; CHECK-NEXT: xorq %rcx, %r15 -; CHECK-NEXT: xorq %rdx, %r12 -; CHECK-NEXT: movq %r12, %rdi -; CHECK-NEXT: movq %r15, %rsi +; CHECK-NEXT: movq %rdx, %r13 +; CHECK-NEXT: xorq %rdi, %r13 +; CHECK-NEXT: movq %rcx, %rbp +; CHECK-NEXT: xorq %rsi, %rbp +; CHECK-NEXT: movq %r13, %rdi +; CHECK-NEXT: movq %rbp, %rsi ; CHECK-NEXT: callq use@PLT -; CHECK-NEXT: shrdq $1, %r15, %r12 -; CHECK-NEXT: shrq %r15 -; CHECK-NEXT: movq %r12, %rdi -; CHECK-NEXT: movq %r15, %rsi +; CHECK-NEXT: shrdq $1, %rbp, %r13 +; CHECK-NEXT: shrq %rbp +; CHECK-NEXT: movq %r13, %rdi +; CHECK-NEXT: movq %rbp, %rsi ; CHECK-NEXT: callq use@PLT -; CHECK-NEXT: shrdq $1, %r14, %rbx -; CHECK-NEXT: shrq %r14 -; CHECK-NEXT: orq %r13, %r14 -; CHECK-NEXT: movq %rbx, %rax -; CHECK-NEXT: movq %r14, %rdx +; CHECK-NEXT: addq %r14, %r12 +; CHECK-NEXT: adcq %rbx, %r15 +; CHECK-NEXT: setb %al +; CHECK-NEXT: shrdq $1, %r15, %r12 +; CHECK-NEXT: movzbl %al, %edx +; CHECK-NEXT: shldq $63, %r15, %rdx +; CHECK-NEXT: movq %r12, %rax +; CHECK-NEXT: addq $8, %rsp ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r12 ; CHECK-NEXT: popq %r13 ; CHECK-NEXT: popq %r14 ; CHECK-NEXT: popq %r15 +; CHECK-NEXT: popq %rbp ; CHECK-NEXT: retq start: %xor = xor i128 %y, %x @@ -102,7 +95,7 @@ start: ret i128 %add } -; This negative test case shouldn't combine, i32 is already properly +; This negative test case shouldn't combine, i32 is already properly ; handled in terms of legalization, compared to the i128 define i32 @avgflooru_i128_negative2(i32 %x, i32 %y) { @@ -129,26 +122,13 @@ define <2 x i128> @avgflooru_i128_vec(<2 x i128> %x, <2 x i128> %y) { ; CHECK-NEXT: addq {{[0-9]+}}(%rsp), %rsi ; CHECK-NEXT: adcq {{[0-9]+}}(%rsp), %rdx ; CHECK-NEXT: setb %dil -; CHECK-NEXT: setb %r9b -; CHECK-NEXT: movzbl %r9b, %r9d -; CHECK-NEXT: shlq $63, %r9 -; CHECK-NEXT: xorl %r10d, %r10d -; CHECK-NEXT: testb %dil, %dil -; CHECK-NEXT: cmoveq %r10, %r9 +; CHECK-NEXT: movzbl %dil, %edi +; CHECK-NEXT: shldq $63, %rdx, %rdi 
; CHECK-NEXT: addq {{[0-9]+}}(%rsp), %rcx ; CHECK-NEXT: adcq {{[0-9]+}}(%rsp), %r8 -; CHECK-NEXT: setb %dil -; CHECK-NEXT: setb %r11b -; CHECK-NEXT: movzbl %r11b, %r11d -; CHECK-NEXT: shlq $63, %r11 -; CHECK-NEXT: testb %dil, %dil -; CHECK-NEXT: cmoveq %r10, %r11 -; CHECK-NEXT: movq %rdx, %rdi -; CHECK-NEXT: shrq %rdi -; CHECK-NEXT: orq %r9, %rdi -; CHECK-NEXT: movq %r8, %r9 -; CHECK-NEXT: shrq %r9 -; CHECK-NEXT: orq %r11, %r9 +; CHECK-NEXT: setb %r9b +; CHECK-NEXT: movzbl %r9b, %r9d +; CHECK-NEXT: shldq $63, %r8, %r9 ; CHECK-NEXT: shldq $63, %rsi, %rdx ; CHECK-NEXT: shldq $63, %rcx, %r8 ; CHECK-NEXT: movq %r8, 16(%rax) From 1fefa256689c4f9f72109391022474273f909c60 Mon Sep 17 00:00:00 2001 From: medievalghoul <61852278+medievalghoul@users.noreply.github.com> Date: Tue, 23 Jul 2024 16:41:25 -0400 Subject: [PATCH 11/14] test files without opt --- llvm/test/CodeGen/AArch64/avgflooru-i128.ll | 56 ++++++++------ llvm/test/CodeGen/X86/avgflooru-i128.ll | 83 ++++++++++++++------- 2 files changed, 87 insertions(+), 52 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/avgflooru-i128.ll b/llvm/test/CodeGen/AArch64/avgflooru-i128.ll index d336c38f20799..aeac2a81863c1 100644 --- a/llvm/test/CodeGen/AArch64/avgflooru-i128.ll +++ b/llvm/test/CodeGen/AArch64/avgflooru-i128.ll @@ -4,11 +4,14 @@ define i128 @avgflooru_i128(i128 %x, i128 %y) { ; CHECK-LABEL: avgflooru_i128: ; CHECK: // %bb.0: // %start -; CHECK-NEXT: adds x8, x0, x2 -; CHECK-NEXT: adcs x9, x1, x3 -; CHECK-NEXT: cset w10, hs -; CHECK-NEXT: extr x0, x9, x8, #1 -; CHECK-NEXT: extr x1, x10, x9, #1 +; CHECK-NEXT: eor x8, x3, x1 +; CHECK-NEXT: eor x9, x2, x0 +; CHECK-NEXT: and x10, x2, x0 +; CHECK-NEXT: extr x9, x8, x9, #1 +; CHECK-NEXT: lsr x8, x8, #1 +; CHECK-NEXT: and x11, x3, x1 +; CHECK-NEXT: adds x0, x10, x9 +; CHECK-NEXT: adc x1, x11, x8 ; CHECK-NEXT: ret start: %xor = xor i128 %y, %x @@ -36,17 +39,18 @@ define i128 @avgflooru_i128_multi_use(i128 %x, i128 %y) nounwind { ; CHECK-NEXT: mov x19, x3 ; CHECK-NEXT: mov x20, x2 ; CHECK-NEXT: bl use -; CHECK-NEXT: extr x0, x23, x24, #1 -; CHECK-NEXT: lsr x1, x23, #1 +; CHECK-NEXT: extr x24, x23, x24, #1 +; CHECK-NEXT: lsr x23, x23, #1 +; CHECK-NEXT: mov x0, x24 +; CHECK-NEXT: mov x1, x23 ; CHECK-NEXT: bl use -; CHECK-NEXT: adds x8, x22, x20 -; CHECK-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: adcs x9, x21, x19 +; CHECK-NEXT: and x8, x20, x22 +; CHECK-NEXT: and x9, x19, x21 +; CHECK-NEXT: adds x0, x8, x24 ; CHECK-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: adc x1, x9, x23 ; CHECK-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: cset w10, hs -; CHECK-NEXT: extr x0, x9, x8, #1 -; CHECK-NEXT: extr x1, x10, x9, #1 +; CHECK-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldr x30, [sp], #64 // 8-byte Folded Reload ; CHECK-NEXT: ret start: @@ -101,17 +105,23 @@ start: define <2 x i128> @avgflooru_i128_vec(<2 x i128> %x, <2 x i128> %y) { ; CHECK-LABEL: avgflooru_i128_vec: ; CHECK: // %bb.0: // %start -; CHECK-NEXT: adds x8, x0, x4 -; CHECK-NEXT: adcs x9, x1, x5 -; CHECK-NEXT: cset w10, hs -; CHECK-NEXT: adds x11, x2, x6 -; CHECK-NEXT: extr x0, x9, x8, #1 -; CHECK-NEXT: adcs x12, x3, x7 -; CHECK-NEXT: extr x1, x10, x9, #1 +; CHECK-NEXT: eor x8, x4, x0 +; CHECK-NEXT: eor x9, x5, x1 +; CHECK-NEXT: eor x11, x6, x2 +; CHECK-NEXT: extr x8, x9, x8, #1 +; CHECK-NEXT: eor x12, x7, x3 +; CHECK-NEXT: and x13, x4, x0 +; CHECK-NEXT: lsr x9, x9, #1 ; CHECK-NEXT: extr x11, x12, x11, #1 -; CHECK-NEXT: cset w13, hs -; 
CHECK-NEXT: extr x3, x13, x12, #1 -; CHECK-NEXT: fmov d0, x11 +; CHECK-NEXT: and x10, x5, x1 +; CHECK-NEXT: adds x0, x13, x8 +; CHECK-NEXT: lsr x8, x12, #1 +; CHECK-NEXT: and x12, x6, x2 +; CHECK-NEXT: adc x1, x10, x9 +; CHECK-NEXT: adds x10, x12, x11 +; CHECK-NEXT: and x9, x7, x3 +; CHECK-NEXT: fmov d0, x10 +; CHECK-NEXT: adc x3, x9, x8 ; CHECK-NEXT: mov v0.d[1], x3 ; CHECK-NEXT: fmov x2, d0 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/X86/avgflooru-i128.ll b/llvm/test/CodeGen/X86/avgflooru-i128.ll index da16a7da48ca6..a4c10f93a3193 100644 --- a/llvm/test/CodeGen/X86/avgflooru-i128.ll +++ b/llvm/test/CodeGen/X86/avgflooru-i128.ll @@ -4,13 +4,17 @@ define i128 @avgflooru_i128(i128 %x, i128 %y) { ; CHECK-LABEL: avgflooru_i128: ; CHECK: # %bb.0: # %start -; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: movq %rdx, %rax +; CHECK-NEXT: xorq %rdi, %rax +; CHECK-NEXT: movq %rcx, %r8 +; CHECK-NEXT: xorq %rsi, %r8 +; CHECK-NEXT: shrdq $1, %r8, %rax +; CHECK-NEXT: andq %rsi, %rcx +; CHECK-NEXT: shrq %r8 +; CHECK-NEXT: andq %rdi, %rdx ; CHECK-NEXT: addq %rdx, %rax -; CHECK-NEXT: adcq %rcx, %rsi -; CHECK-NEXT: setb %cl -; CHECK-NEXT: shrdq $1, %rsi, %rax -; CHECK-NEXT: movzbl %cl, %edx -; CHECK-NEXT: shldq $63, %rsi, %rdx +; CHECK-NEXT: adcq %rcx, %r8 +; CHECK-NEXT: movq %r8, %rdx ; CHECK-NEXT: retq start: %xor = xor i128 %y, %x @@ -48,13 +52,12 @@ define i128 @avgflooru_i128_multi_use(i128 %x, i128 %y) nounwind { ; CHECK-NEXT: movq %r13, %rdi ; CHECK-NEXT: movq %rbp, %rsi ; CHECK-NEXT: callq use@PLT -; CHECK-NEXT: addq %r14, %r12 -; CHECK-NEXT: adcq %rbx, %r15 -; CHECK-NEXT: setb %al -; CHECK-NEXT: shrdq $1, %r15, %r12 -; CHECK-NEXT: movzbl %al, %edx -; CHECK-NEXT: shldq $63, %r15, %rdx -; CHECK-NEXT: movq %r12, %rax +; CHECK-NEXT: andq %r15, %rbx +; CHECK-NEXT: andq %r12, %r14 +; CHECK-NEXT: addq %r13, %r14 +; CHECK-NEXT: adcq %rbp, %rbx +; CHECK-NEXT: movq %r14, %rax +; CHECK-NEXT: movq %rbx, %rdx ; CHECK-NEXT: addq $8, %rsp ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r12 @@ -118,23 +121,45 @@ start: define <2 x i128> @avgflooru_i128_vec(<2 x i128> %x, <2 x i128> %y) { ; CHECK-LABEL: avgflooru_i128_vec: ; CHECK: # %bb.0: # %start +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: .cfi_offset %rbx, -24 +; CHECK-NEXT: .cfi_offset %r14, -16 ; CHECK-NEXT: movq %rdi, %rax -; CHECK-NEXT: addq {{[0-9]+}}(%rsp), %rsi -; CHECK-NEXT: adcq {{[0-9]+}}(%rsp), %rdx -; CHECK-NEXT: setb %dil -; CHECK-NEXT: movzbl %dil, %edi -; CHECK-NEXT: shldq $63, %rdx, %rdi -; CHECK-NEXT: addq {{[0-9]+}}(%rsp), %rcx -; CHECK-NEXT: adcq {{[0-9]+}}(%rsp), %r8 -; CHECK-NEXT: setb %r9b -; CHECK-NEXT: movzbl %r9b, %r9d -; CHECK-NEXT: shldq $63, %r8, %r9 -; CHECK-NEXT: shldq $63, %rsi, %rdx -; CHECK-NEXT: shldq $63, %rcx, %r8 -; CHECK-NEXT: movq %r8, 16(%rax) -; CHECK-NEXT: movq %rdx, (%rax) -; CHECK-NEXT: movq %r9, 24(%rax) -; CHECK-NEXT: movq %rdi, 8(%rax) +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r9 +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbx +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; CHECK-NEXT: movq %r10, %r14 +; CHECK-NEXT: xorq %rsi, %r14 +; CHECK-NEXT: movq %rbx, %r11 +; CHECK-NEXT: xorq %rdx, %r11 +; CHECK-NEXT: shrdq $1, %r11, %r14 +; CHECK-NEXT: andq %rdx, %rbx +; CHECK-NEXT: shrq %r11 +; CHECK-NEXT: andq %rsi, %r10 +; CHECK-NEXT: addq %r14, %r10 +; CHECK-NEXT: adcq %rbx, %r11 +; CHECK-NEXT: movq %r9, %rdx +; CHECK-NEXT: xorq %rcx, %rdx +; CHECK-NEXT: movq %rdi, %rsi +; CHECK-NEXT: xorq 
%r8, %rsi +; CHECK-NEXT: shrdq $1, %rsi, %rdx +; CHECK-NEXT: andq %r8, %rdi +; CHECK-NEXT: shrq %rsi +; CHECK-NEXT: andq %rcx, %r9 +; CHECK-NEXT: addq %rdx, %r9 +; CHECK-NEXT: adcq %rdi, %rsi +; CHECK-NEXT: movq %r9, 16(%rax) +; CHECK-NEXT: movq %r10, (%rax) +; CHECK-NEXT: movq %rsi, 24(%rax) +; CHECK-NEXT: movq %r11, 8(%rax) +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq start: %xor = xor <2 x i128> %y, %x From 09ab20215b110696c5753116c4925106955e5fd5 Mon Sep 17 00:00:00 2001 From: medievalghoul <61852278+medievalghoul@users.noreply.github.com> Date: Tue, 23 Jul 2024 16:44:56 -0400 Subject: [PATCH 12/14] test files generated with opt --- llvm/test/CodeGen/AArch64/avgflooru-i128.ll | 56 ++++++-------- llvm/test/CodeGen/X86/avgflooru-i128.ll | 83 +++++++-------------- 2 files changed, 52 insertions(+), 87 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/avgflooru-i128.ll b/llvm/test/CodeGen/AArch64/avgflooru-i128.ll index aeac2a81863c1..d336c38f20799 100644 --- a/llvm/test/CodeGen/AArch64/avgflooru-i128.ll +++ b/llvm/test/CodeGen/AArch64/avgflooru-i128.ll @@ -4,14 +4,11 @@ define i128 @avgflooru_i128(i128 %x, i128 %y) { ; CHECK-LABEL: avgflooru_i128: ; CHECK: // %bb.0: // %start -; CHECK-NEXT: eor x8, x3, x1 -; CHECK-NEXT: eor x9, x2, x0 -; CHECK-NEXT: and x10, x2, x0 -; CHECK-NEXT: extr x9, x8, x9, #1 -; CHECK-NEXT: lsr x8, x8, #1 -; CHECK-NEXT: and x11, x3, x1 -; CHECK-NEXT: adds x0, x10, x9 -; CHECK-NEXT: adc x1, x11, x8 +; CHECK-NEXT: adds x8, x0, x2 +; CHECK-NEXT: adcs x9, x1, x3 +; CHECK-NEXT: cset w10, hs +; CHECK-NEXT: extr x0, x9, x8, #1 +; CHECK-NEXT: extr x1, x10, x9, #1 ; CHECK-NEXT: ret start: %xor = xor i128 %y, %x @@ -39,18 +36,17 @@ define i128 @avgflooru_i128_multi_use(i128 %x, i128 %y) nounwind { ; CHECK-NEXT: mov x19, x3 ; CHECK-NEXT: mov x20, x2 ; CHECK-NEXT: bl use -; CHECK-NEXT: extr x24, x23, x24, #1 -; CHECK-NEXT: lsr x23, x23, #1 -; CHECK-NEXT: mov x0, x24 -; CHECK-NEXT: mov x1, x23 +; CHECK-NEXT: extr x0, x23, x24, #1 +; CHECK-NEXT: lsr x1, x23, #1 ; CHECK-NEXT: bl use -; CHECK-NEXT: and x8, x20, x22 -; CHECK-NEXT: and x9, x19, x21 -; CHECK-NEXT: adds x0, x8, x24 +; CHECK-NEXT: adds x8, x22, x20 +; CHECK-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: adcs x9, x21, x19 ; CHECK-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: adc x1, x9, x23 ; CHECK-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: cset w10, hs +; CHECK-NEXT: extr x0, x9, x8, #1 +; CHECK-NEXT: extr x1, x10, x9, #1 ; CHECK-NEXT: ldr x30, [sp], #64 // 8-byte Folded Reload ; CHECK-NEXT: ret start: @@ -105,23 +101,17 @@ start: define <2 x i128> @avgflooru_i128_vec(<2 x i128> %x, <2 x i128> %y) { ; CHECK-LABEL: avgflooru_i128_vec: ; CHECK: // %bb.0: // %start -; CHECK-NEXT: eor x8, x4, x0 -; CHECK-NEXT: eor x9, x5, x1 -; CHECK-NEXT: eor x11, x6, x2 -; CHECK-NEXT: extr x8, x9, x8, #1 -; CHECK-NEXT: eor x12, x7, x3 -; CHECK-NEXT: and x13, x4, x0 -; CHECK-NEXT: lsr x9, x9, #1 +; CHECK-NEXT: adds x8, x0, x4 +; CHECK-NEXT: adcs x9, x1, x5 +; CHECK-NEXT: cset w10, hs +; CHECK-NEXT: adds x11, x2, x6 +; CHECK-NEXT: extr x0, x9, x8, #1 +; CHECK-NEXT: adcs x12, x3, x7 +; CHECK-NEXT: extr x1, x10, x9, #1 ; CHECK-NEXT: extr x11, x12, x11, #1 -; CHECK-NEXT: and x10, x5, x1 -; CHECK-NEXT: adds x0, x13, x8 -; CHECK-NEXT: lsr x8, x12, #1 -; CHECK-NEXT: and x12, x6, x2 -; CHECK-NEXT: adc x1, x10, 
x9 -; CHECK-NEXT: adds x10, x12, x11 -; CHECK-NEXT: and x9, x7, x3 -; CHECK-NEXT: fmov d0, x10 -; CHECK-NEXT: adc x3, x9, x8 +; CHECK-NEXT: cset w13, hs +; CHECK-NEXT: extr x3, x13, x12, #1 +; CHECK-NEXT: fmov d0, x11 ; CHECK-NEXT: mov v0.d[1], x3 ; CHECK-NEXT: fmov x2, d0 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/X86/avgflooru-i128.ll b/llvm/test/CodeGen/X86/avgflooru-i128.ll index a4c10f93a3193..da16a7da48ca6 100644 --- a/llvm/test/CodeGen/X86/avgflooru-i128.ll +++ b/llvm/test/CodeGen/X86/avgflooru-i128.ll @@ -4,17 +4,13 @@ define i128 @avgflooru_i128(i128 %x, i128 %y) { ; CHECK-LABEL: avgflooru_i128: ; CHECK: # %bb.0: # %start -; CHECK-NEXT: movq %rdx, %rax -; CHECK-NEXT: xorq %rdi, %rax -; CHECK-NEXT: movq %rcx, %r8 -; CHECK-NEXT: xorq %rsi, %r8 -; CHECK-NEXT: shrdq $1, %r8, %rax -; CHECK-NEXT: andq %rsi, %rcx -; CHECK-NEXT: shrq %r8 -; CHECK-NEXT: andq %rdi, %rdx +; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: addq %rdx, %rax -; CHECK-NEXT: adcq %rcx, %r8 -; CHECK-NEXT: movq %r8, %rdx +; CHECK-NEXT: adcq %rcx, %rsi +; CHECK-NEXT: setb %cl +; CHECK-NEXT: shrdq $1, %rsi, %rax +; CHECK-NEXT: movzbl %cl, %edx +; CHECK-NEXT: shldq $63, %rsi, %rdx ; CHECK-NEXT: retq start: %xor = xor i128 %y, %x @@ -52,12 +48,13 @@ define i128 @avgflooru_i128_multi_use(i128 %x, i128 %y) nounwind { ; CHECK-NEXT: movq %r13, %rdi ; CHECK-NEXT: movq %rbp, %rsi ; CHECK-NEXT: callq use@PLT -; CHECK-NEXT: andq %r15, %rbx -; CHECK-NEXT: andq %r12, %r14 -; CHECK-NEXT: addq %r13, %r14 -; CHECK-NEXT: adcq %rbp, %rbx -; CHECK-NEXT: movq %r14, %rax -; CHECK-NEXT: movq %rbx, %rdx +; CHECK-NEXT: addq %r14, %r12 +; CHECK-NEXT: adcq %rbx, %r15 +; CHECK-NEXT: setb %al +; CHECK-NEXT: shrdq $1, %r15, %r12 +; CHECK-NEXT: movzbl %al, %edx +; CHECK-NEXT: shldq $63, %r15, %rdx +; CHECK-NEXT: movq %r12, %rax ; CHECK-NEXT: addq $8, %rsp ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r12 @@ -121,45 +118,23 @@ start: define <2 x i128> @avgflooru_i128_vec(<2 x i128> %x, <2 x i128> %y) { ; CHECK-LABEL: avgflooru_i128_vec: ; CHECK: # %bb.0: # %start -; CHECK-NEXT: pushq %r14 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: .cfi_def_cfa_offset 24 -; CHECK-NEXT: .cfi_offset %rbx, -24 -; CHECK-NEXT: .cfi_offset %r14, -16 ; CHECK-NEXT: movq %rdi, %rax -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r9 -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbx -; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; CHECK-NEXT: movq %r10, %r14 -; CHECK-NEXT: xorq %rsi, %r14 -; CHECK-NEXT: movq %rbx, %r11 -; CHECK-NEXT: xorq %rdx, %r11 -; CHECK-NEXT: shrdq $1, %r11, %r14 -; CHECK-NEXT: andq %rdx, %rbx -; CHECK-NEXT: shrq %r11 -; CHECK-NEXT: andq %rsi, %r10 -; CHECK-NEXT: addq %r14, %r10 -; CHECK-NEXT: adcq %rbx, %r11 -; CHECK-NEXT: movq %r9, %rdx -; CHECK-NEXT: xorq %rcx, %rdx -; CHECK-NEXT: movq %rdi, %rsi -; CHECK-NEXT: xorq %r8, %rsi -; CHECK-NEXT: shrdq $1, %rsi, %rdx -; CHECK-NEXT: andq %r8, %rdi -; CHECK-NEXT: shrq %rsi -; CHECK-NEXT: andq %rcx, %r9 -; CHECK-NEXT: addq %rdx, %r9 -; CHECK-NEXT: adcq %rdi, %rsi -; CHECK-NEXT: movq %r9, 16(%rax) -; CHECK-NEXT: movq %r10, (%rax) -; CHECK-NEXT: movq %rsi, 24(%rax) -; CHECK-NEXT: movq %r11, 8(%rax) -; CHECK-NEXT: popq %rbx -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: popq %r14 -; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: addq {{[0-9]+}}(%rsp), %rsi +; CHECK-NEXT: adcq {{[0-9]+}}(%rsp), %rdx +; CHECK-NEXT: setb %dil +; CHECK-NEXT: movzbl %dil, %edi +; CHECK-NEXT: shldq $63, %rdx, %rdi +; CHECK-NEXT: addq {{[0-9]+}}(%rsp), %rcx +; CHECK-NEXT: 
adcq {{[0-9]+}}(%rsp), %r8 +; CHECK-NEXT: setb %r9b +; CHECK-NEXT: movzbl %r9b, %r9d +; CHECK-NEXT: shldq $63, %r8, %r9 +; CHECK-NEXT: shldq $63, %rsi, %rdx +; CHECK-NEXT: shldq $63, %rcx, %r8 +; CHECK-NEXT: movq %r8, 16(%rax) +; CHECK-NEXT: movq %rdx, (%rax) +; CHECK-NEXT: movq %r9, 24(%rax) +; CHECK-NEXT: movq %rdi, 8(%rax) ; CHECK-NEXT: retq start: %xor = xor <2 x i128> %y, %x From 58e3b63deef5cca3f7e8124d477919077de15fdf Mon Sep 17 00:00:00 2001 From: medievalghoul <61852278+medievalghoul@users.noreply.github.com> Date: Tue, 23 Jul 2024 17:39:16 -0400 Subject: [PATCH 13/14] regenerated for other tests --- llvm/test/CodeGen/RISCV/avgflooru.ll | 48 +++++++++++---------- llvm/test/CodeGen/X86/avgflooru-scalar.ll | 52 ++++++----------------- 2 files changed, 40 insertions(+), 60 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/avgflooru.ll b/llvm/test/CodeGen/RISCV/avgflooru.ll index b58aaab6aaf4a..fa88c3760e455 100644 --- a/llvm/test/CodeGen/RISCV/avgflooru.ll +++ b/llvm/test/CodeGen/RISCV/avgflooru.ll @@ -164,18 +164,20 @@ define i32 @test_ext_i32(i32 %a0, i32 %a1) nounwind { define i64 @test_fixed_i64(i64 %a0, i64 %a1) nounwind { ; RV32I-LABEL: test_fixed_i64: ; RV32I: # %bb.0: -; RV32I-NEXT: and a4, a1, a3 -; RV32I-NEXT: xor a1, a1, a3 -; RV32I-NEXT: srli a3, a1, 1 -; RV32I-NEXT: add a3, a4, a3 -; RV32I-NEXT: slli a1, a1, 31 -; RV32I-NEXT: xor a4, a0, a2 -; RV32I-NEXT: srli a4, a4, 1 -; RV32I-NEXT: or a1, a4, a1 -; RV32I-NEXT: and a2, a0, a2 -; RV32I-NEXT: add a0, a2, a1 +; RV32I-NEXT: add a4, a3, a1 +; RV32I-NEXT: add a0, a2, a0 ; RV32I-NEXT: sltu a1, a0, a2 -; RV32I-NEXT: add a1, a3, a1 +; RV32I-NEXT: add a2, a4, a1 +; RV32I-NEXT: beq a2, a3, .LBB6_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: sltu a1, a2, a3 +; RV32I-NEXT: .LBB6_2: +; RV32I-NEXT: slli a1, a1, 31 +; RV32I-NEXT: srli a3, a2, 1 +; RV32I-NEXT: or a1, a3, a1 +; RV32I-NEXT: slli a2, a2, 31 +; RV32I-NEXT: srli a0, a0, 1 +; RV32I-NEXT: or a0, a0, a2 ; RV32I-NEXT: ret ; ; RV64I-LABEL: test_fixed_i64: @@ -195,18 +197,20 @@ define i64 @test_fixed_i64(i64 %a0, i64 %a1) nounwind { define i64 @test_ext_i64(i64 %a0, i64 %a1) nounwind { ; RV32I-LABEL: test_ext_i64: ; RV32I: # %bb.0: -; RV32I-NEXT: and a4, a1, a3 -; RV32I-NEXT: xor a1, a1, a3 -; RV32I-NEXT: srli a3, a1, 1 -; RV32I-NEXT: add a3, a4, a3 -; RV32I-NEXT: slli a1, a1, 31 -; RV32I-NEXT: xor a4, a0, a2 -; RV32I-NEXT: srli a4, a4, 1 -; RV32I-NEXT: or a1, a4, a1 -; RV32I-NEXT: and a2, a0, a2 -; RV32I-NEXT: add a0, a2, a1 +; RV32I-NEXT: add a4, a3, a1 +; RV32I-NEXT: add a0, a2, a0 ; RV32I-NEXT: sltu a1, a0, a2 -; RV32I-NEXT: add a1, a3, a1 +; RV32I-NEXT: add a2, a4, a1 +; RV32I-NEXT: beq a2, a3, .LBB7_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: sltu a1, a2, a3 +; RV32I-NEXT: .LBB7_2: +; RV32I-NEXT: slli a1, a1, 31 +; RV32I-NEXT: srli a3, a2, 1 +; RV32I-NEXT: or a1, a3, a1 +; RV32I-NEXT: slli a2, a2, 31 +; RV32I-NEXT: srli a0, a0, 1 +; RV32I-NEXT: or a0, a0, a2 ; RV32I-NEXT: ret ; ; RV64I-LABEL: test_ext_i64: diff --git a/llvm/test/CodeGen/X86/avgflooru-scalar.ll b/llvm/test/CodeGen/X86/avgflooru-scalar.ll index d21c9d65ea9c8..0c91a9da5720a 100644 --- a/llvm/test/CodeGen/X86/avgflooru-scalar.ll +++ b/llvm/test/CodeGen/X86/avgflooru-scalar.ll @@ -168,26 +168,14 @@ define i32 @test_ext_i32(i32 %a0, i32 %a1) nounwind { define i64 @test_fixed_i64(i64 %a0, i64 %a1) nounwind { ; X86-LABEL: test_fixed_i64: ; X86: # %bb.0: -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl 
{{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: xorl %esi, %ebx -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: xorl %edi, %edx -; X86-NEXT: shrdl $1, %edx, %ebx -; X86-NEXT: andl %edi, %ecx -; X86-NEXT: shrl %edx -; X86-NEXT: andl %esi, %eax -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: adcl %ecx, %edx -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: addl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: adcl {{[0-9]+}}(%esp), %eax +; X86-NEXT: setb %dl +; X86-NEXT: movzbl %dl, %edx +; X86-NEXT: shldl $31, %eax, %edx +; X86-NEXT: shldl $31, %ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: test_fixed_i64: @@ -208,26 +196,14 @@ define i64 @test_fixed_i64(i64 %a0, i64 %a1) nounwind { define i64 @test_ext_i64(i64 %a0, i64 %a1) nounwind { ; X86-LABEL: test_ext_i64: ; X86: # %bb.0: -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: xorl %esi, %ebx -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: xorl %edi, %edx -; X86-NEXT: shrdl $1, %edx, %ebx -; X86-NEXT: andl %edi, %ecx -; X86-NEXT: shrl %edx -; X86-NEXT: andl %esi, %eax -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: adcl %ecx, %edx -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: addl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: adcl {{[0-9]+}}(%esp), %eax +; X86-NEXT: setb %dl +; X86-NEXT: movzbl %dl, %edx +; X86-NEXT: shldl $31, %eax, %edx +; X86-NEXT: shldl $31, %ecx, %eax ; X86-NEXT: retl ; ; X64-LABEL: test_ext_i64: From a9d80774de05eff556ff10f48106e08bb5ce3aed Mon Sep 17 00:00:00 2001 From: medievalghoul <61852278+medievalghoul@users.noreply.github.com> Date: Wed, 24 Jul 2024 10:17:22 -0400 Subject: [PATCH 14/14] removed comment and added expansion in comment --- llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 70d7de7b878b0..e0269867436f5 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -9352,6 +9352,7 @@ SDValue TargetLowering::expandAVG(SDNode *N, SelectionDAG &DAG) const { } } + // avgflooru(lhs, rhs) -> or(lshr(add(lhs, rhs),1),shl(overflow, typesize-1)) if (Opc == ISD::AVGFLOORU && VT.isScalarInteger() && !isTypeLegal(VT)) { SDValue UAddWithOverflow = DAG.getNode(ISD::UADDO, dl, DAG.getVTList(VT, MVT::i1), {RHS, LHS}); @@ -9363,7 +9364,6 @@ SDValue TargetLowering::expandAVG(SDNode *N, SelectionDAG &DAG) const { SDValue One = DAG.getShiftAmountConstant(1, VT, dl); SDValue LShrVal = DAG.getNode(ISD::SRL, dl, VT, Sum, One); - // Creating the select instruction SDValue ZeroExtOverflow = DAG.getNode(ISD::ANY_EXTEND, dl, VT, Overflow); SDValue OverflowShl = DAG.getNode(ISD::SHL, dl, VT, ZeroExtOverflow,
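
The expansion documented in the comment above computes avgflooru(x, y) = floor((x + y) / 2) by performing the full-width add, capturing the carry-out (the UADDO overflow bit), and folding that bit back in as the new most significant bit of the shifted sum. As a reference only — this sketch is not part of the patch, and the function names are illustrative — the same identity can be stated at plain uint64_t width in C++, comparing the generic xor/and expansion against the carry-based one this patch uses for illegal scalar types:

#include <cassert>
#include <cstdint>

// Generic expansion: avgflooru(x, y) = (x & y) + ((x ^ y) >> 1).
// This never overflows, but on an illegal wide type the (x ^ y) >> 1
// step turns into multi-word funnel shifts.
uint64_t avgflooru_generic(uint64_t x, uint64_t y) {
  return (x & y) + ((x ^ y) >> 1);
}

// Carry-based expansion:
// avgflooru(x, y) = or(lshr(add(x, y), 1), shl(overflow, typesize - 1)).
uint64_t avgflooru_uaddo(uint64_t x, uint64_t y) {
  uint64_t sum = x + y;               // wraps modulo 2^64, like ISD::UADDO's sum result
  uint64_t carry = sum < x ? 1 : 0;   // carry-out of the add, i.e. the overflow bit
  return (sum >> 1) | (carry << 63);  // the 65-bit true sum, shifted right by one
}

int main() {
  const uint64_t cases[][2] = {{0, 0},
                               {1, 2},
                               {~0ULL, 1},
                               {~0ULL, ~0ULL},
                               {1ULL << 63, 1ULL << 63}};
  for (const auto &c : cases)
    assert(avgflooru_generic(c[0], c[1]) == avgflooru_uaddo(c[0], c[1]));
  return 0;
}

Folding the carry into the top bit rather than shifting the xor is what lets the i128 lowerings in the test diffs above replace the eor/extr and xorq/shrdq sequences with a plain adds/adc followed by extr (AArch64) or add/adc followed by setb and shld (x86).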