[X86] combineINSERT_SUBVECTOR - directly fold to X86ISD::SUBV_BROADCAST_LOAD to prevent vector split infinite loop #145077
Conversation
…ST_LOAD to prevent vector split infinite loop

This reverts llvm#140919 / f1d03de, which could result in another fold trying to split the concatenation apart again before it was folded to a SUBV_BROADCAST_LOAD.
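For illustration, here is a minimal IR sketch of the pattern this combine handles (splatting the lower half of a full-width vector load into the upper half). The function and pointer names are hypothetical, and whether this exact snippet reaches the combine depends on how earlier folds shape the DAG; the regression test added to oddshuffles.ll in this patch is the authoritative reproducer for the infinite loop.

```llvm
; Hypothetical example: splat the lower <2 x i64> half of a <4 x i64> load
; into the upper half. When the DAG reaches the insert_subvector form of this
; pattern, the patch folds it directly to X86ISD::SUBV_BROADCAST_LOAD instead
; of a concat that a later fold could split apart again.
define <4 x i64> @splat_low_half(ptr %p) {
  %v = load <4 x i64>, ptr %p, align 8
  %s = shufflevector <4 x i64> %v, <4 x i64> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x i64> %s
}
```

Running this through llc with AVX enabled (e.g. `llc -mtriple=x86_64-- -mattr=+avx`) should make it easy to check that the lowering reuses the low 128-bit load for the upper half rather than re-splitting the vector.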
@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes

This reverts #140919 / f1d03de, which could result in another fold trying to split the concatenation apart again before it was folded to a SUBV_BROADCAST_LOAD.

Patch is 79.90 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/145077.diff

3 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 35d7b80844056..33083c0eba695 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -59465,8 +59465,7 @@ static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
}
// If we're splatting the lower half subvector of a full vector load into the
- // upper half, just splat the subvector directly, potentially creating a
- // subvector broadcast.
+ // upper half, attempt to create a subvector broadcast.
if ((int)IdxVal == (VecNumElts / 2) &&
Vec.getValueSizeInBits() == (2 * SubVec.getValueSizeInBits())) {
auto *VecLd = dyn_cast<LoadSDNode>(Vec);
@@ -59474,7 +59473,12 @@ static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
if (VecLd && SubLd &&
DAG.areNonVolatileConsecutiveLoads(
SubLd, VecLd, SubVec.getValueSizeInBits() / 8, 0)) {
- return concatSubVectors(SubVec, SubVec, DAG, dl);
+ SDValue BcastLd = getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, dl, OpVT,
+ SubVecVT, SubLd, 0, DAG);
+ SDValue NewSubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT,
+ BcastLd, DAG.getVectorIdxConstant(0, dl));
+ DCI.CombineTo(SubLd, NewSubVec, BcastLd.getValue(1));
+ return BcastLd;
}
}
diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll
index 6b9a86343ea10..4b0f75df83a76 100644
--- a/llvm/test/CodeGen/X86/oddshuffles.ll
+++ b/llvm/test/CodeGen/X86/oddshuffles.ll
@@ -2508,3 +2508,50 @@ define void @D107009(ptr %input, ptr %output) {
store <64 x i32> %i7, ptr %output, align 16
ret void
}
+
+; Ensure concatenation of repeated subvector loads occurs before the vector can be split apart.
+define void @split_v2i64_subvector_broadcast(ptr readonly align 8 captures(none) dereferenceable(64) %arg) {
+; SSE-LABEL: split_v2i64_subvector_broadcast:
+; SSE: # %bb.0:
+; SSE-NEXT: movups 8(%rdi), %xmm0
+; SSE-NEXT: movups 40(%rdi), %xmm1
+; SSE-NEXT: movaps %xmm0, %xmm2
+; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE-NEXT: movups %xmm0, (%rax)
+; SSE-NEXT: movups %xmm2, (%rax)
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: split_v2i64_subvector_broadcast:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
+; AVX1-NEXT: vmovupd %ymm0, (%rax)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: split_v2i64_subvector_broadcast:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovups 40(%rdi), %xmm0
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,1]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,1,1,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT: vmovups %ymm0, (%rax)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; XOP-LABEL: split_v2i64_subvector_broadcast:
+; XOP: # %bb.0:
+; XOP-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; XOP-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; XOP-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
+; XOP-NEXT: vmovupd %ymm0, (%rax)
+; XOP-NEXT: vzeroupper
+; XOP-NEXT: retq
+ %gep = getelementptr inbounds nuw i8, ptr %arg, i64 8
+ %load = load <6 x i64>, ptr %gep, align 8
+ %shuffle = shufflevector <6 x i64> %load, <6 x i64> poison, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ store <4 x i64> %shuffle, ptr poison, align 8
+ ret void
+}
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
index 25e489eef9d11..61bfee133d84e 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
@@ -10105,14 +10105,14 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-FCP-LABEL: store_i8_stride7_vf64:
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-FCP-NEXT: vmovdqa64 (%rax), %zmm16
+; AVX512BW-FCP-NEXT: vmovdqa64 (%rax), %zmm15
; AVX512BW-FCP-NEXT: vmovdqa 32(%rdx), %ymm0
-; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31]
-; AVX512BW-FCP-NEXT: vpshufb %ymm13, %ymm0, %ymm1
+; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31]
+; AVX512BW-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm1
; AVX512BW-FCP-NEXT: vmovdqa 32(%rcx), %ymm2
-; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128]
-; AVX512BW-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm3
-; AVX512BW-FCP-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128]
+; AVX512BW-FCP-NEXT: vpshufb %ymm13, %ymm2, %ymm4
+; AVX512BW-FCP-NEXT: vpor %ymm1, %ymm4, %ymm1
; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27]
; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128]
@@ -10122,105 +10122,106 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128]
-; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm3
-; AVX512BW-FCP-NEXT: vmovdqa 32(%rsi), %ymm5
-; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18]
-; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm5, %ymm6
-; AVX512BW-FCP-NEXT: vpor %ymm3, %ymm6, %ymm3
-; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128]
-; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1
-; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25]
-; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm5
-; AVX512BW-FCP-NEXT: vpor %ymm1, %ymm5, %ymm1
+; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm4
+; AVX512BW-FCP-NEXT: vmovdqa 32(%rsi), %ymm6
+; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18]
+; AVX512BW-FCP-NEXT: vpshufb %ymm8, %ymm6, %ymm5
+; AVX512BW-FCP-NEXT: vpor %ymm4, %ymm5, %ymm4
+; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128]
+; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1
+; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25]
+; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm6, %ymm6
+; AVX512BW-FCP-NEXT: vpor %ymm1, %ymm6, %ymm1
; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm5
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm4
; AVX512BW-FCP-NEXT: movabsq $3485998880071096368, %r10 # imm = 0x3060C183060C1830
; AVX512BW-FCP-NEXT: kmovq %r10, %k1
-; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm5 {%k1}
-; AVX512BW-FCP-NEXT: vmovdqa 32(%r9), %ymm3
+; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm4 {%k1}
+; AVX512BW-FCP-NEXT: vmovdqa 32(%r9), %ymm9
; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128]
-; AVX512BW-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm6
+; AVX512BW-FCP-NEXT: vpshufb %ymm0, %ymm9, %ymm6
; AVX512BW-FCP-NEXT: vmovdqa 32(%r8), %ymm7
; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128]
-; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm7, %ymm8
-; AVX512BW-FCP-NEXT: vpor %ymm6, %ymm8, %ymm8
-; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31]
-; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm7, %ymm10
-; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128]
-; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3
-; AVX512BW-FCP-NEXT: vpor %ymm3, %ymm10, %ymm3
-; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3]
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm8, %zmm3
+; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm7, %ymm10
+; AVX512BW-FCP-NEXT: vpor %ymm6, %ymm10, %ymm10
+; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31]
+; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm7, %ymm11
+; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128]
+; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm9
+; AVX512BW-FCP-NEXT: vpor %ymm11, %ymm9, %ymm9
+; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3]
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm10, %zmm9
; AVX512BW-FCP-NEXT: vmovdqa 32(%rax), %ymm10
-; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12]
-; AVX512BW-FCP-NEXT: vpermw %ymm10, %ymm4, %ymm11
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10]
+; AVX512BW-FCP-NEXT: vpermw %ymm10, %ymm11, %ymm11
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31]
; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10
; AVX512BW-FCP-NEXT: movabsq $145249953336295682, %rax # imm = 0x204081020408102
; AVX512BW-FCP-NEXT: kmovq %rax, %k2
-; AVX512BW-FCP-NEXT: vmovdqu8 %zmm10, %zmm3 {%k2}
+; AVX512BW-FCP-NEXT: vmovdqu8 %zmm10, %zmm9 {%k2}
; AVX512BW-FCP-NEXT: movabsq $-4357498600088870461, %rax # imm = 0xC3870E1C3870E1C3
; AVX512BW-FCP-NEXT: kmovq %rax, %k2
-; AVX512BW-FCP-NEXT: vmovdqu8 %zmm3, %zmm5 {%k2}
-; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm10
-; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm2
-; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %ymm11
-; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm11, %ymm3
-; AVX512BW-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2
-; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm3
-; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm8
-; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %xmm18
-; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm18[8],xmm3[8],xmm18[9],xmm3[9],xmm18[10],xmm3[10],xmm18[11],xmm3[11],xmm18[12],xmm3[12],xmm18[13],xmm3[13],xmm18[14],xmm3[14],xmm18[15],xmm3[15]
+; AVX512BW-FCP-NEXT: vmovdqu8 %zmm9, %zmm4 {%k2}
+; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm9
+; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm9, %ymm2
+; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %ymm10
+; AVX512BW-FCP-NEXT: vpshufb %ymm8, %ymm10, %ymm8
+; AVX512BW-FCP-NEXT: vpor %ymm2, %ymm8, %ymm2
+; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %xmm18
+; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm7
+; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %xmm23
+; AVX512BW-FCP-NEXT: vmovdqa64 32(%rsi), %xmm16
+; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm23[8],xmm18[8],xmm23[9],xmm18[9],xmm23[10],xmm18[10],xmm23[11],xmm18[11],xmm23[12],xmm18[12],xmm23[13],xmm18[13],xmm23[14],xmm18[14],xmm23[15],xmm18[15]
; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm17 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
-; AVX512BW-FCP-NEXT: vpshufb %xmm17, %xmm9, %xmm9
-; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1]
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm9, %zmm9
+; AVX512BW-FCP-NEXT: vpshufb %xmm17, %xmm8, %xmm8
+; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1]
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm8, %zmm8
+; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vpshufb %ymm12, %ymm11, %ymm2
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vpshufb %ymm13, %ymm12, %ymm2
-; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vpshufb %ymm14, %ymm13, %ymm14
-; AVX512BW-FCP-NEXT: vpor %ymm2, %ymm14, %ymm2
-; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15]
+; AVX512BW-FCP-NEXT: vpshufb %ymm13, %ymm12, %ymm13
+; AVX512BW-FCP-NEXT: vpor %ymm2, %ymm13, %ymm2
+; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15]
; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm19 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
-; AVX512BW-FCP-NEXT: vpshufb %xmm19, %xmm14, %xmm14
-; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1]
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm14, %zmm2
+; AVX512BW-FCP-NEXT: vpshufb %xmm19, %xmm13, %xmm13
+; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1]
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm13, %zmm2
; AVX512BW-FCP-NEXT: movabsq $435749860008887046, %rax # imm = 0x60C183060C18306
; AVX512BW-FCP-NEXT: kmovq %rax, %k2
-; AVX512BW-FCP-NEXT: vmovdqu8 %zmm2, %zmm9 {%k2}
+; AVX512BW-FCP-NEXT: vmovdqu8 %zmm2, %zmm8 {%k2}
+; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vpshufb %ymm0, %ymm13, %ymm0
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vpshufb %ymm0, %ymm14, %ymm0
-; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm15, %ymm1
+; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm14, %ymm1
; AVX512BW-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0
-; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15]
+; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15]
; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm20 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
; AVX512BW-FCP-NEXT: vpshufb %xmm20, %xmm1, %xmm1
; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6,6,1,6,0,7,1,0,7,14,9,14,8,15,9,8,15]
-; AVX512BW-FCP-NEXT: vpermw %zmm16, %zmm1, %zmm1
+; AVX512BW-FCP-NEXT: vpermw %zmm15, %zmm1, %zmm1
; AVX512BW-FCP-NEXT: movabsq $2323999253380730912, %rax # imm = 0x2040810204081020
; AVX512BW-FCP-NEXT: kmovq %rax, %k2
; AVX512BW-FCP-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2}
; AVX512BW-FCP-NEXT: movabsq $4066998693416279096, %rax # imm = 0x3870E1C3870E1C38
; AVX512BW-FCP-NEXT: kmovq %rax, %k2
-; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm9 {%k2}
+; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm8 {%k2}
; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm21 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128]
-; AVX512BW-FCP-NEXT: vpshufb %xmm21, %xmm18, %xmm0
+; AVX512BW-FCP-NEXT: vpshufb %xmm21, %xmm23, %xmm0
; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm22 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9]
-; AVX512BW-FCP-NEXT: vpshufb %xmm22, %xmm3, %xmm1
+; AVX512BW-FCP-NEXT: vpshufb %xmm22, %xmm18, %xmm1
; AVX512BW-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0
-; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm18[0],xmm3[1],xmm18[1],xmm3[2],xmm18[2],xmm3[3],xmm18[3],xmm3[4],xmm18[4],xmm3[5],xmm18[5],xmm3[6],xmm18[6],xmm3[7],xmm18[7]
+; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm18[0],xmm23[0],xmm18[1],xmm23[1],xmm18[2],xmm23[2],xmm18[3],xmm23[3],xmm18[4],xmm23[4],xmm18[5],xmm23[5],xmm18[6],xmm23[6],xmm18[7],xmm23[7]
; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
; AVX512BW-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm1[0,1,0,1],zmm0[0,1,0,1]
; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm26 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u]
-; AVX512BW-FCP-NEXT: vpshufb %xmm26, %xmm13, %xmm0
+; AVX512BW-FCP-NEXT: vpshufb %xmm26, %xmm12, %xmm0
; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm27 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u]
-; AVX512BW-FCP-NEXT: vpshufb %xmm27, %xmm12, %xmm1
+; AVX512BW-FCP-NEXT: vpshufb %xmm27, %xmm11, %xmm1
; AVX512BW-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0
-; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3],xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
+; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm23 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
; AVX512BW-FCP-NEXT: vpshufb %xmm23, %xmm1, %xmm1
; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[0,1,0,1]
@@ -10228,48 +10229,47 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-FCP-NEXT: kmovq %rax, %k2
; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm18 {%k2}
; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm24 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6]
-; AVX512BW-FCP-NEXT: vpshufb %xmm24, %xmm14, %xmm0
+; AVX512BW-FCP-NEXT: vpshufb %xmm24, %xmm13, %xmm0
; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm25 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128]
-; AVX512BW-FCP-NEXT: vpshufb %xmm25, %xmm15, %xmm1
+; AVX512BW-FCP-NEXT: vpshufb %xmm25, %xmm14, %xmm1
; AVX512BW-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0
-; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm15...
[truncated]
This fixes the infinite loop in x86 lowering in all code we've found so far. Thanks!
…ST_LOAD to prevent vector split infinite loop (llvm#145077)

This reverts llvm#140919 / f1d03de, which could result in another fold trying to split the concatenation apart again before it was folded to a SUBV_BROADCAST_LOAD.