[X86] combineINSERT_SUBVECTOR - directly fold to X86ISD::SUBV_BROADCAST_LOAD to prevent vector split infinite loop #145077


Merged
merged 1 commit into llvm:main from x86-concat-subv-broadcast on Jun 20, 2025

Conversation

@RKSimon (Collaborator) commented on Jun 20, 2025

This reverts #140919 / f1d03de, which could result in another fold trying to split the concatenation apart again before it was folded to a SUBV_BROADCAST_LOAD.

[X86] combineINSERT_SUBVECTOR - directly fold to X86ISD::SUBV_BROADCAST_LOAD to prevent vector split infinite loop

This reverts llvm#140919 / f1d03de - which could result in another fold trying to split the concatenation apart again before it was folded to a SUBV_BROADCAST_LOAD
@RKSimon requested a review from alexfh on June 20, 2025 at 17:22
@llvmbot (Member) commented on Jun 20, 2025

@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes

This reverts #140919 / f1d03de - which could result in another fold trying to split the concatenation apart again before it was folded to a SUBV_BROADCAST_LOAD
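
For context on the failure mode: the previous fold turned the insert into a plain concat_vectors node, and another combine could split that concatenation straight back apart before it was recognised as a SUBV_BROADCAST_LOAD, so the DAG combiner kept ping-ponging between the two forms. Folding directly to X86ISD::SUBV_BROADCAST_LOAD produces a node that neither combine rewrites again. The standalone C++ sketch below is purely illustrative (it uses no LLVM APIs; all names are invented) and shows how two rules that undo each other keep a fixed-point rewriter from terminating, while rewriting to a distinct terminal form converges.

// Hypothetical, self-contained sketch (not LLVM code): a tiny fixed-point
// rewriter where one rule folds to a concatenation and another splits it,
// mirroring the loop this patch avoids by folding to a terminal node.
#include <iostream>

enum class Node { InsertOfSplatLoad, Concat, SubvBroadcastLoad };

static const char *name(Node n) {
  switch (n) {
  case Node::InsertOfSplatLoad: return "insert_subvector(load, load)";
  case Node::Concat:            return "concat_vectors(sub, sub)";
  case Node::SubvBroadcastLoad: return "SUBV_BROADCAST_LOAD";
  }
  return "?";
}

// Old behaviour: fold the repeated-subvector insert into a plain concat...
static bool foldToConcat(Node &n) {
  if (n != Node::InsertOfSplatLoad) return false;
  n = Node::Concat;
  return true;
}
// ...which another combine is happy to split straight back apart.
static bool splitConcat(Node &n) {
  if (n != Node::Concat) return false;
  n = Node::InsertOfSplatLoad;
  return true;
}
// New behaviour: fold directly to a broadcast-load form no rule rewrites.
static bool foldToBroadcastLoad(Node &n) {
  if (n != Node::InsertOfSplatLoad) return false;
  n = Node::SubvBroadcastLoad;
  return true;
}

// Apply the rules until nothing changes, giving up after maxSteps.
template <typename... Rules>
static void combine(Node n, int maxSteps, Rules... rules) {
  for (int step = 0; step < maxSteps; ++step) {
    bool changed = (rules(n) || ...);
    std::cout << "  step " << step << ": " << name(n) << '\n';
    if (!changed) {
      std::cout << "  fixed point reached\n";
      return;
    }
  }
  std::cout << "  gave up: the rules keep undoing each other\n";
}

int main() {
  std::cout << "old fold (to concat) + split combine:\n";
  combine(Node::InsertOfSplatLoad, 6, foldToConcat, splitConcat);

  std::cout << "new fold (direct SUBV_BROADCAST_LOAD) + split combine:\n";
  combine(Node::InsertOfSplatLoad, 6, foldToBroadcastLoad, splitConcat);
  return 0;
}

In the actual patch below, the terminal form is the SUBV_BROADCAST_LOAD node built via getBROADCAST_LOAD, with DCI.CombineTo also replacing the original subvector load so its users see the broadcast's low half.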


Patch is 79.90 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/145077.diff

3 Files Affected:

  • (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+7-3)
  • (modified) llvm/test/CodeGen/X86/oddshuffles.ll (+47)
  • (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll (+278-278)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 35d7b80844056..33083c0eba695 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -59465,8 +59465,7 @@ static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
   }
 
   // If we're splatting the lower half subvector of a full vector load into the
-  // upper half, just splat the subvector directly, potentially creating a
-  // subvector broadcast.
+  // upper half, attempt to create a subvector broadcast.
   if ((int)IdxVal == (VecNumElts / 2) &&
       Vec.getValueSizeInBits() == (2 * SubVec.getValueSizeInBits())) {
     auto *VecLd = dyn_cast<LoadSDNode>(Vec);
@@ -59474,7 +59473,12 @@ static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
     if (VecLd && SubLd &&
         DAG.areNonVolatileConsecutiveLoads(
             SubLd, VecLd, SubVec.getValueSizeInBits() / 8, 0)) {
-      return concatSubVectors(SubVec, SubVec, DAG, dl);
+      SDValue BcastLd = getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, dl, OpVT,
+                                          SubVecVT, SubLd, 0, DAG);
+      SDValue NewSubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT,
+                                      BcastLd, DAG.getVectorIdxConstant(0, dl));
+      DCI.CombineTo(SubLd, NewSubVec, BcastLd.getValue(1));
+      return BcastLd;
     }
   }
 
diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll
index 6b9a86343ea10..4b0f75df83a76 100644
--- a/llvm/test/CodeGen/X86/oddshuffles.ll
+++ b/llvm/test/CodeGen/X86/oddshuffles.ll
@@ -2508,3 +2508,50 @@ define void @D107009(ptr %input, ptr %output) {
   store <64 x i32> %i7, ptr %output, align 16
   ret void
 }
+
+; Ensure concatenation of repeated subvector loads before vector can be split apart.
+define void @split_v2i64_subvector_broadcast(ptr readonly align 8 captures(none) dereferenceable(64) %arg) {
+; SSE-LABEL: split_v2i64_subvector_broadcast:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movups 8(%rdi), %xmm0
+; SSE-NEXT:    movups 40(%rdi), %xmm1
+; SSE-NEXT:    movaps %xmm0, %xmm2
+; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE-NEXT:    movups %xmm0, (%rax)
+; SSE-NEXT:    movups %xmm2, (%rax)
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: split_v2i64_subvector_broadcast:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; AVX1-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; AVX1-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
+; AVX1-NEXT:    vmovupd %ymm0, (%rax)
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: split_v2i64_subvector_broadcast:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovups 40(%rdi), %xmm0
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,1]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = mem[0,1,1,3]
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT:    vmovups %ymm0, (%rax)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; XOP-LABEL: split_v2i64_subvector_broadcast:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; XOP-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; XOP-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
+; XOP-NEXT:    vmovupd %ymm0, (%rax)
+; XOP-NEXT:    vzeroupper
+; XOP-NEXT:    retq
+  %gep = getelementptr inbounds nuw i8, ptr %arg, i64 8
+  %load = load <6 x i64>, ptr %gep, align 8
+  %shuffle = shufflevector <6 x i64> %load, <6 x i64> poison, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  store <4 x i64> %shuffle, ptr poison, align 8
+  ret void
+}
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
index 25e489eef9d11..61bfee133d84e 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
@@ -10105,14 +10105,14 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FCP-LABEL: store_i8_stride7_vf64:
 ; AVX512BW-FCP:       # %bb.0:
 ; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-FCP-NEXT:    vmovdqa64 (%rax), %zmm16
+; AVX512BW-FCP-NEXT:    vmovdqa64 (%rax), %zmm15
 ; AVX512BW-FCP-NEXT:    vmovdqa 32(%rdx), %ymm0
-; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm13, %ymm0, %ymm1
+; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm12 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm12, %ymm0, %ymm1
 ; AVX512BW-FCP-NEXT:    vmovdqa 32(%rcx), %ymm2
-; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm14, %ymm2, %ymm3
-; AVX512BW-FCP-NEXT:    vpor %ymm1, %ymm3, %ymm1
+; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm13, %ymm2, %ymm4
+; AVX512BW-FCP-NEXT:    vpor %ymm1, %ymm4, %ymm1
 ; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27]
 ; AVX512BW-FCP-NEXT:    vpshufb %ymm3, %ymm0, %ymm0
 ; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128]
@@ -10122,105 +10122,106 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqa 32(%rdi), %ymm1
 ; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm2, %ymm1, %ymm3
-; AVX512BW-FCP-NEXT:    vmovdqa 32(%rsi), %ymm5
-; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm9 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm9, %ymm5, %ymm6
-; AVX512BW-FCP-NEXT:    vpor %ymm3, %ymm6, %ymm3
-; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm4, %ymm1, %ymm1
-; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm4, %ymm5, %ymm5
-; AVX512BW-FCP-NEXT:    vpor %ymm1, %ymm5, %ymm1
+; AVX512BW-FCP-NEXT:    vpshufb %ymm2, %ymm1, %ymm4
+; AVX512BW-FCP-NEXT:    vmovdqa 32(%rsi), %ymm6
+; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm8, %ymm6, %ymm5
+; AVX512BW-FCP-NEXT:    vpor %ymm4, %ymm5, %ymm4
+; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
+; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm3, %ymm6, %ymm6
+; AVX512BW-FCP-NEXT:    vpor %ymm1, %ymm6, %ymm1
 ; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm5
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm4, %zmm4
 ; AVX512BW-FCP-NEXT:    movabsq $3485998880071096368, %r10 # imm = 0x3060C183060C1830
 ; AVX512BW-FCP-NEXT:    kmovq %r10, %k1
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm5 {%k1}
-; AVX512BW-FCP-NEXT:    vmovdqa 32(%r9), %ymm3
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm4 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqa 32(%r9), %ymm9
 ; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm0 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm0, %ymm3, %ymm6
+; AVX512BW-FCP-NEXT:    vpshufb %ymm0, %ymm9, %ymm6
 ; AVX512BW-FCP-NEXT:    vmovdqa 32(%r8), %ymm7
 ; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm1, %ymm7, %ymm8
-; AVX512BW-FCP-NEXT:    vpor %ymm6, %ymm8, %ymm8
-; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm4, %ymm7, %ymm10
-; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
-; AVX512BW-FCP-NEXT:    vpor %ymm3, %ymm10, %ymm3
-; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3]
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm8, %zmm3
+; AVX512BW-FCP-NEXT:    vpshufb %ymm1, %ymm7, %ymm10
+; AVX512BW-FCP-NEXT:    vpor %ymm6, %ymm10, %ymm10
+; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm3, %ymm7, %ymm11
+; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm3, %ymm9, %ymm9
+; AVX512BW-FCP-NEXT:    vpor %ymm11, %ymm9, %ymm9
+; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3]
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm10, %zmm9
 ; AVX512BW-FCP-NEXT:    vmovdqa 32(%rax), %ymm10
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm4 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12]
-; AVX512BW-FCP-NEXT:    vpermw %ymm10, %ymm4, %ymm11
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10]
+; AVX512BW-FCP-NEXT:    vpermw %ymm10, %ymm11, %ymm11
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31]
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm10, %zmm10
 ; AVX512BW-FCP-NEXT:    movabsq $145249953336295682, %rax # imm = 0x204081020408102
 ; AVX512BW-FCP-NEXT:    kmovq %rax, %k2
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm10, %zmm3 {%k2}
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm10, %zmm9 {%k2}
 ; AVX512BW-FCP-NEXT:    movabsq $-4357498600088870461, %rax # imm = 0xC3870E1C3870E1C3
 ; AVX512BW-FCP-NEXT:    kmovq %rax, %k2
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm3, %zmm5 {%k2}
-; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %ymm10
-; AVX512BW-FCP-NEXT:    vpshufb %ymm2, %ymm10, %ymm2
-; AVX512BW-FCP-NEXT:    vmovdqa (%rsi), %ymm11
-; AVX512BW-FCP-NEXT:    vpshufb %ymm9, %ymm11, %ymm3
-; AVX512BW-FCP-NEXT:    vpor %ymm2, %ymm3, %ymm2
-; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %xmm3
-; AVX512BW-FCP-NEXT:    vmovdqa 32(%rdi), %xmm8
-; AVX512BW-FCP-NEXT:    vmovdqa64 (%rsi), %xmm18
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm9 = xmm18[8],xmm3[8],xmm18[9],xmm3[9],xmm18[10],xmm3[10],xmm18[11],xmm3[11],xmm18[12],xmm3[12],xmm18[13],xmm3[13],xmm18[14],xmm3[14],xmm18[15],xmm3[15]
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm9, %zmm4 {%k2}
+; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %ymm9
+; AVX512BW-FCP-NEXT:    vpshufb %ymm2, %ymm9, %ymm2
+; AVX512BW-FCP-NEXT:    vmovdqa (%rsi), %ymm10
+; AVX512BW-FCP-NEXT:    vpshufb %ymm8, %ymm10, %ymm8
+; AVX512BW-FCP-NEXT:    vpor %ymm2, %ymm8, %ymm2
+; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %xmm18
+; AVX512BW-FCP-NEXT:    vmovdqa 32(%rdi), %xmm7
+; AVX512BW-FCP-NEXT:    vmovdqa64 (%rsi), %xmm23
+; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rsi), %xmm16
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm23[8],xmm18[8],xmm23[9],xmm18[9],xmm23[10],xmm18[10],xmm23[11],xmm18[11],xmm23[12],xmm18[12],xmm23[13],xmm18[13],xmm23[14],xmm18[14],xmm23[15],xmm18[15]
 ; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm17 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm17, %xmm9, %xmm9
-; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1]
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm9, %zmm9
+; AVX512BW-FCP-NEXT:    vpshufb %xmm17, %xmm8, %xmm8
+; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1]
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm8, %zmm8
+; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm12, %ymm11, %ymm2
 ; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm12 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm13, %ymm12, %ymm2
-; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm14, %ymm13, %ymm14
-; AVX512BW-FCP-NEXT:    vpor %ymm2, %ymm14, %ymm2
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm14 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm13, %ymm12, %ymm13
+; AVX512BW-FCP-NEXT:    vpor %ymm2, %ymm13, %ymm2
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm13 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15]
 ; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm19 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm19, %xmm14, %xmm14
-; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1]
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm14, %zmm2
+; AVX512BW-FCP-NEXT:    vpshufb %xmm19, %xmm13, %xmm13
+; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1]
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm13, %zmm2
 ; AVX512BW-FCP-NEXT:    movabsq $435749860008887046, %rax # imm = 0x60C183060C18306
 ; AVX512BW-FCP-NEXT:    kmovq %rax, %k2
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm2, %zmm9 {%k2}
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm2, %zmm8 {%k2}
+; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm0, %ymm13, %ymm0
 ; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm14 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm0, %ymm14, %ymm0
-; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm15 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm1, %ymm15, %ymm1
+; AVX512BW-FCP-NEXT:    vpshufb %ymm1, %ymm14, %ymm1
 ; AVX512BW-FCP-NEXT:    vpor %ymm0, %ymm1, %ymm0
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15]
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15]
 ; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm20 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
 ; AVX512BW-FCP-NEXT:    vpshufb %xmm20, %xmm1, %xmm1
 ; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
 ; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm1 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6,6,1,6,0,7,1,0,7,14,9,14,8,15,9,8,15]
-; AVX512BW-FCP-NEXT:    vpermw %zmm16, %zmm1, %zmm1
+; AVX512BW-FCP-NEXT:    vpermw %zmm15, %zmm1, %zmm1
 ; AVX512BW-FCP-NEXT:    movabsq $2323999253380730912, %rax # imm = 0x2040810204081020
 ; AVX512BW-FCP-NEXT:    kmovq %rax, %k2
 ; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k2}
 ; AVX512BW-FCP-NEXT:    movabsq $4066998693416279096, %rax # imm = 0x3870E1C3870E1C38
 ; AVX512BW-FCP-NEXT:    kmovq %rax, %k2
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm9 {%k2}
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm8 {%k2}
 ; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm21 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm21, %xmm18, %xmm0
+; AVX512BW-FCP-NEXT:    vpshufb %xmm21, %xmm23, %xmm0
 ; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm22 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm22, %xmm3, %xmm1
+; AVX512BW-FCP-NEXT:    vpshufb %xmm22, %xmm18, %xmm1
 ; AVX512BW-FCP-NEXT:    vpor %xmm0, %xmm1, %xmm0
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm18[0],xmm3[1],xmm18[1],xmm3[2],xmm18[2],xmm3[3],xmm18[3],xmm3[4],xmm18[4],xmm3[5],xmm18[5],xmm3[6],xmm18[6],xmm3[7],xmm18[7]
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm18[0],xmm23[0],xmm18[1],xmm23[1],xmm18[2],xmm23[2],xmm18[3],xmm23[3],xmm18[4],xmm23[4],xmm18[5],xmm23[5],xmm18[6],xmm23[6],xmm18[7],xmm23[7]
 ; AVX512BW-FCP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
 ; AVX512BW-FCP-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm18 = zmm1[0,1,0,1],zmm0[0,1,0,1]
 ; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm26 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm26, %xmm13, %xmm0
+; AVX512BW-FCP-NEXT:    vpshufb %xmm26, %xmm12, %xmm0
 ; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm27 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm27, %xmm12, %xmm1
+; AVX512BW-FCP-NEXT:    vpshufb %xmm27, %xmm11, %xmm1
 ; AVX512BW-FCP-NEXT:    vpor %xmm0, %xmm1, %xmm0
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3],xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
 ; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm23 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
 ; AVX512BW-FCP-NEXT:    vpshufb %xmm23, %xmm1, %xmm1
 ; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[0,1,0,1]
@@ -10228,48 +10229,47 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FCP-NEXT:    kmovq %rax, %k2
 ; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm18 {%k2}
 ; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm24 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm24, %xmm14, %xmm0
+; AVX512BW-FCP-NEXT:    vpshufb %xmm24, %xmm13, %xmm0
 ; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm25 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm25, %xmm15, %xmm1
+; AVX512BW-FCP-NEXT:    vpshufb %xmm25, %xmm14, %xmm1
 ; AVX512BW-FCP-NEXT:    vpor %xmm0, %xmm1, %xmm0
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm15...
[truncated]

@alexfh (Contributor) left a comment

This fixes the infinite loop in x86 lowering in all code we've found so far. Thanks!

@alexfh merged commit 1753aba into llvm:main on Jun 20, 2025
9 checks passed
@RKSimon deleted the x86-concat-subv-broadcast branch on June 21, 2025 at 15:09
Jaddyen pushed a commit to Jaddyen/llvm-project that referenced this pull request on Jun 23, 2025
[X86] combineINSERT_SUBVECTOR - directly fold to X86ISD::SUBV_BROADCAST_LOAD to prevent vector split infinite loop (llvm#145077)

This reverts llvm#140919 / f1d03de - which could result in another fold trying to split the concatenation apart again before it was folded to a SUBV_BROADCAST_LOAD