diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index e929dab429de5..48ecd87cc7720 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -58493,14 +58493,23 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, const APInt &SrcIdx0 = Src0.getConstantOperandAPInt(1); const APInt &SrcIdx1 = Src1.getConstantOperandAPInt(1); // concat(extract_subvector(v0), extract_subvector(v1)) -> vperm2x128. - // Only concat of subvector high halves which vperm2x128 is best at. + // Only concat of subvector high halves which vperm2x128 is best at or if + // it should fold into a subvector broadcast. if (VT.is256BitVector() && SrcVT0.is256BitVector() && - SrcVT1.is256BitVector() && SrcIdx0 == (NumSrcElts0 / 2) && - SrcIdx1 == (NumSrcElts1 / 2)) { - return DAG.getNode(X86ISD::VPERM2X128, DL, VT, - DAG.getBitcast(VT, Src0.getOperand(0)), - DAG.getBitcast(VT, Src1.getOperand(0)), - DAG.getTargetConstant(0x31, DL, MVT::i8)); + SrcVT1.is256BitVector()) { + assert((SrcIdx0 == 0 || SrcIdx0 == (NumSrcElts0 / 2)) && + (SrcIdx1 == 0 || SrcIdx1 == (NumSrcElts1 / 2)) && + "Bad subvector index"); + if ((SrcIdx0 == (NumSrcElts0 / 2) && SrcIdx1 == (NumSrcElts1 / 2)) || + (IsSplat && ISD::isNormalLoad(Src0.getOperand(0).getNode()))) { + unsigned Index = 0; + Index |= SrcIdx0 == 0 ? 0x00 : 0x01; + Index |= SrcIdx1 == 0 ? 0x20 : 0x30; + return DAG.getNode(X86ISD::VPERM2X128, DL, VT, + DAG.getBitcast(VT, Src0.getOperand(0)), + DAG.getBitcast(VT, Src1.getOperand(0)), + DAG.getTargetConstant(Index, DL, MVT::i8)); + } } // Widen extract_subvector // concat(extract_subvector(x,lo), extract_subvector(x,hi)) @@ -58662,7 +58671,6 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, break; } case X86ISD::SHUFP: { - // TODO: Add SHUFPD support if/when necessary. if (!IsSplat && (VT == MVT::v8f32 || (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) && @@ -58731,18 +58739,6 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, DAG.getNode(X86ISD::VPERMILPI, DL, FloatVT, Res, Op0.getOperand(1)); return DAG.getBitcast(VT, Res); } - if (!IsSplat && (VT == MVT::v4f64 || VT == MVT::v8f64)) { - unsigned NumSubElts = Op0.getValueType().getVectorNumElements(); - uint64_t Mask = (1ULL << NumSubElts) - 1; - uint64_t Idx = 0; - for (unsigned I = 0; I != NumOps; ++I) { - uint64_t SubIdx = Ops[I].getConstantOperandVal(1); - Idx |= (SubIdx & Mask) << (I * NumSubElts); - } - return DAG.getNode(X86ISD::VPERMILPI, DL, VT, - ConcatSubOperand(VT, Ops, 0), - DAG.getTargetConstant(Idx, DL, MVT::i8)); - } break; case X86ISD::VPERMILPV: if (!IsSplat && (VT.is256BitVector() || @@ -59313,6 +59309,59 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, return DAG.getBitcast(VT, Res); } + // We can always convert per-lane vXf64 shuffles into VSHUFPD. + if (!IsSplat && + (VT == MVT::v4f64 || (VT == MVT::v8f64 && Subtarget.useAVX512Regs())) && + all_of(Ops, [](SDValue Op) { + return Op.hasOneUse() && (Op.getOpcode() == X86ISD::MOVDDUP || + Op.getOpcode() == X86ISD::SHUFP || + Op.getOpcode() == X86ISD::VPERMILPI || + Op.getOpcode() == X86ISD::BLENDI || + Op.getOpcode() == X86ISD::UNPCKL || + Op.getOpcode() == X86ISD::UNPCKH); + })) { + // Collect the individual per-lane v2f64/v4f64 shuffles. 
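+    // Illustrative sketch (operand names a0/b0/a1/b1 are hypothetical):
+    // concatenating v2f64 unpcklpd(a0,b0) (per-lane mask <0,2>) with
+    // v2f64 unpckhpd(a1,b1) (per-lane mask <1,3>) collects
+    // LHS = concat(a0,a1), RHS = concat(b0,b1) and a SHUFPD immediate of
+    // 0b1100, i.e. lane 0 takes element 0 of each source and lane 1 takes
+    // element 1.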
+    MVT OpVT = Ops[0].getSimpleValueType();
+    unsigned NumOpElts = OpVT.getVectorNumElements();
+    SmallVector<SmallVector<SDValue, 2>, 4> SrcOps(NumOps);
+    SmallVector<SmallVector<int, 8>, 4> SrcMasks(NumOps);
+    if (all_of(seq<int>(NumOps), [&](int I) {
+          return getTargetShuffleMask(Ops[I], /*AllowSentinelZero=*/false,
+                                      SrcOps[I], SrcMasks[I]) &&
+                 !is128BitLaneCrossingShuffleMask(OpVT, SrcMasks[I]) &&
+                 SrcMasks[I].size() == NumOpElts &&
+                 all_of(SrcOps[I], [&OpVT](SDValue V) {
+                   return V.getValueType() == OpVT;
+                 });
+        })) {
+      // Concatenate the shuffle masks into SHUFPD mask and collect subops.
+      bool Unary = true;
+      unsigned SHUFPDMask = 0;
+      SmallVector<SDValue> LHS(NumOps), RHS(NumOps);
+      for (unsigned I = 0; I != NumOps; ++I) {
+        LHS[I] = SrcOps[I][SrcMasks[I][0] / NumOpElts];
+        RHS[I] = SrcOps[I][SrcMasks[I][1] / NumOpElts];
+        Unary &= LHS[I] == RHS[I];
+        for (unsigned J = 0; J != NumOpElts; ++J)
+          SHUFPDMask |= (SrcMasks[I][J] & 1) << ((I * NumOpElts) + J);
+      }
+      // Concat SHUFPD LHS/RHS operands - if they match then it will become a
+      // PERMILPD mask and we can always profitably concatenate them.
+      SDValue Concat0 =
+          combineConcatVectorOps(DL, VT, LHS, DAG, Subtarget, Depth + 1);
+      SDValue Concat1 =
+          combineConcatVectorOps(DL, VT, RHS, DAG, Subtarget, Depth + 1);
+      if (Unary || Concat0 || Concat1) {
+        Concat0 =
+            Concat0 ? Concat0 : DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS);
+        Concat1 =
+            Concat1 ? Concat1 : DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, RHS);
+        return DAG.getNode(X86ISD::SHUFP, DL, VT, Concat0, Concat1,
+                           DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
+      }
+    }
+  }
+
   return SDValue();
 }
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-2.ll
index 8d68f88249a9e..3e9fed78b56b4 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-2.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-2.ll
@@ -163,16 +163,14 @@ define void @store_i64_stride2_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve
 ;
 ; AVX-LABEL: store_i64_stride2_vf4:
 ; AVX: # %bb.0:
-; AVX-NEXT: vmovaps (%rsi), %xmm0
-; AVX-NEXT: vmovaps (%rdi), %xmm1
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1]
-; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
 ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
 ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1]
 ; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[3],ymm1[3]
 ; AVX-NEXT: vmovapd %ymm1, 32(%rdx)
-; AVX-NEXT: vmovaps %ymm0, (%rdx)
+; AVX-NEXT: vmovapd %ymm0, (%rdx)
 ; AVX-NEXT: vzeroupper
 ; AVX-NEXT: retq
 ;
@@ -343,16 +341,12 @@ define void @store_i64_stride2_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve
 ;
 ; AVX-LABEL: store_i64_stride2_vf8:
 ; AVX: # %bb.0:
-; AVX-NEXT: vmovaps (%rsi), %xmm0
-; AVX-NEXT: vmovaps 32(%rsi), %xmm1
-; AVX-NEXT: vmovaps (%rdi), %xmm2
-; AVX-NEXT: vmovaps 32(%rdi), %xmm3
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm2[1],xmm0[1]
-; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm1[1]
-; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm1[0]
-; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; AVX-NEXT: vshufpd
{{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3] +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1] +; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[3],ymm1[3] ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1] ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm3 = mem[0,1,0,1] ; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[3],ymm2[3] @@ -360,9 +354,9 @@ define void @store_i64_stride2_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.ve ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm4 = mem[0,1,0,1] ; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[3],ymm3[3] ; AVX-NEXT: vmovapd %ymm3, 96(%rdx) -; AVX-NEXT: vmovapd %ymm2, 32(%rdx) -; AVX-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX-NEXT: vmovaps %ymm0, (%rdx) +; AVX-NEXT: vmovapd %ymm2, 64(%rdx) +; AVX-NEXT: vmovapd %ymm1, (%rdx) +; AVX-NEXT: vmovapd %ymm0, 32(%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -617,26 +611,18 @@ define void @store_i64_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; ; AVX-LABEL: store_i64_stride2_vf16: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps (%rsi), %xmm0 -; AVX-NEXT: vmovaps 32(%rsi), %xmm1 -; AVX-NEXT: vmovaps 64(%rsi), %xmm2 -; AVX-NEXT: vmovaps 96(%rsi), %xmm3 -; AVX-NEXT: vmovaps (%rdi), %xmm4 -; AVX-NEXT: vmovaps 32(%rdi), %xmm5 -; AVX-NEXT: vmovaps 64(%rdi), %xmm6 -; AVX-NEXT: vmovaps 96(%rdi), %xmm7 -; AVX-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm7[1],xmm3[1] -; AVX-NEXT: vmovlhps {{.*#+}} xmm3 = xmm7[0],xmm3[0] -; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm3, %ymm3 -; AVX-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm6[1],xmm2[1] -; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm6[0],xmm2[0] -; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm2, %ymm2 -; AVX-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm4[1],xmm0[1] -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm4[0],xmm0[0] -; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 -; AVX-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm1[1] -; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm1[0] -; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3] +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1] +; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[3],ymm1[3] +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1] +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm3 = mem[0,1,0,1] +; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[3],ymm2[3] +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm3 = mem[0,1,0,1] +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm4 = mem[0,1,0,1] +; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[3],ymm3[3] ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm4 = mem[0,1,0,1] ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm5 = mem[0,1,0,1] ; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[3],ymm4[3] @@ -651,12 +637,12 @@ define void @store_i64_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX-NEXT: vshufpd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[3],ymm7[3] ; AVX-NEXT: vmovapd %ymm7, 32(%rdx) ; AVX-NEXT: vmovapd %ymm6, 96(%rdx) -; AVX-NEXT: vmovapd %ymm5, 160(%rdx) -; AVX-NEXT: vmovapd %ymm4, 224(%rdx) -; AVX-NEXT: vmovaps %ymm1, 64(%rdx) -; AVX-NEXT: vmovaps %ymm0, (%rdx) -; AVX-NEXT: vmovaps %ymm2, 128(%rdx) -; AVX-NEXT: vmovaps %ymm3, 192(%rdx) +; AVX-NEXT: vmovapd %ymm5, 64(%rdx) +; AVX-NEXT: vmovapd %ymm4, (%rdx) +; AVX-NEXT: vmovapd %ymm3, 160(%rdx) +; AVX-NEXT: vmovapd %ymm2, 128(%rdx) +; AVX-NEXT: 
vmovapd %ymm1, 192(%rdx) +; AVX-NEXT: vmovapd %ymm0, 224(%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -1117,47 +1103,31 @@ define void @store_i64_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; ; AVX-LABEL: store_i64_stride2_vf32: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps 224(%rsi), %xmm0 -; AVX-NEXT: vmovaps 224(%rdi), %xmm1 -; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1] -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 128(%rsi), %xmm1 -; AVX-NEXT: vmovaps 128(%rdi), %xmm2 -; AVX-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm2[1],xmm1[1] -; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX-NEXT: vmovaps (%rsi), %xmm2 -; AVX-NEXT: vmovaps 32(%rsi), %xmm3 -; AVX-NEXT: vmovaps 64(%rsi), %xmm4 -; AVX-NEXT: vmovaps 96(%rsi), %xmm5 -; AVX-NEXT: vmovaps (%rdi), %xmm6 -; AVX-NEXT: vmovaps 32(%rdi), %xmm7 -; AVX-NEXT: vmovaps 64(%rdi), %xmm8 -; AVX-NEXT: vmovaps 96(%rdi), %xmm9 -; AVX-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm6[1],xmm2[1] -; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm6[0],xmm2[0] -; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm2, %ymm2 -; AVX-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm7[1],xmm3[1] -; AVX-NEXT: vmovlhps {{.*#+}} xmm3 = xmm7[0],xmm3[0] -; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 -; AVX-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm8[1],xmm4[1] -; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm8[0],xmm4[0] -; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 -; AVX-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm9[1],xmm5[1] -; AVX-NEXT: vmovlhps {{.*#+}} xmm5 = xmm9[0],xmm5[0] -; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX-NEXT: vmovaps 160(%rsi), %xmm6 -; AVX-NEXT: vmovaps 160(%rdi), %xmm7 -; AVX-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm7[1],xmm6[1] -; AVX-NEXT: vmovlhps {{.*#+}} xmm6 = xmm7[0],xmm6[0] -; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm6 -; AVX-NEXT: vmovaps 192(%rsi), %xmm7 -; AVX-NEXT: vmovaps 192(%rdi), %xmm8 -; AVX-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm8[1],xmm7[1] -; AVX-NEXT: vmovlhps {{.*#+}} xmm7 = xmm8[0],xmm7[0] -; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm7 +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3] +; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1] +; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[3],ymm1[3] +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1] +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm3 = mem[0,1,0,1] +; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[3],ymm2[3] +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm3 = mem[0,1,0,1] +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm4 = mem[0,1,0,1] +; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[3],ymm3[3] +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm4 = mem[0,1,0,1] +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm5 = mem[0,1,0,1] +; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[3],ymm4[3] +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm5 = mem[0,1,0,1] +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm6 = mem[0,1,0,1] +; AVX-NEXT: vshufpd {{.*#+}} ymm5 = ymm6[0],ymm5[0],ymm6[3],ymm5[3] +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm6 = mem[0,1,0,1] +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm7 = mem[0,1,0,1] +; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[3],ymm6[3] +; 
AVX-NEXT: vbroadcastf128 {{.*#+}} ymm7 = mem[0,1,0,1] +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm8 = mem[0,1,0,1] +; AVX-NEXT: vshufpd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[3],ymm7[3] ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm8 = mem[0,1,0,1] ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm9 = mem[0,1,0,1] ; AVX-NEXT: vshufpd {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[3],ymm8[3] @@ -1188,17 +1158,17 @@ define void @store_i64_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX-NEXT: vmovapd %ymm12, 32(%rdx) ; AVX-NEXT: vmovapd %ymm11, 96(%rdx) ; AVX-NEXT: vmovapd %ymm10, 160(%rdx) -; AVX-NEXT: vmovapd %ymm9, 288(%rdx) -; AVX-NEXT: vmovapd %ymm8, 480(%rdx) -; AVX-NEXT: vmovaps %ymm7, 384(%rdx) -; AVX-NEXT: vmovaps %ymm6, 320(%rdx) -; AVX-NEXT: vmovaps %ymm5, 192(%rdx) -; AVX-NEXT: vmovaps %ymm4, 128(%rdx) -; AVX-NEXT: vmovaps %ymm3, 64(%rdx) -; AVX-NEXT: vmovaps %ymm2, (%rdx) -; AVX-NEXT: vmovaps %ymm1, 256(%rdx) +; AVX-NEXT: vmovapd %ymm9, 384(%rdx) +; AVX-NEXT: vmovapd %ymm8, 320(%rdx) +; AVX-NEXT: vmovapd %ymm7, 192(%rdx) +; AVX-NEXT: vmovapd %ymm6, 128(%rdx) +; AVX-NEXT: vmovapd %ymm5, 64(%rdx) +; AVX-NEXT: vmovapd %ymm4, (%rdx) +; AVX-NEXT: vmovapd %ymm3, 288(%rdx) +; AVX-NEXT: vmovapd %ymm2, 256(%rdx) +; AVX-NEXT: vmovapd %ymm1, 448(%rdx) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 448(%rdx) +; AVX-NEXT: vmovaps %ymm0, 480(%rdx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -2080,102 +2050,70 @@ define void @store_i64_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX-LABEL: store_i64_stride2_vf64: ; AVX: # %bb.0: ; AVX-NEXT: subq $424, %rsp # imm = 0x1A8 -; AVX-NEXT: vmovaps (%rsi), %xmm0 -; AVX-NEXT: vmovaps 32(%rsi), %xmm1 -; AVX-NEXT: vmovaps 64(%rsi), %xmm2 -; AVX-NEXT: vmovaps 96(%rsi), %xmm3 -; AVX-NEXT: vmovaps (%rdi), %xmm4 -; AVX-NEXT: vmovaps 32(%rdi), %xmm5 -; AVX-NEXT: vmovaps 64(%rdi), %xmm6 -; AVX-NEXT: vmovaps 96(%rdi), %xmm7 -; AVX-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm4[1],xmm0[1] -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm4[0],xmm0[0] -; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm5[1],xmm1[1] -; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm1[0] -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm6[1],xmm2[1] -; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm6[0],xmm2[0] -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm7[1],xmm3[1] -; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm3[0] -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 128(%rsi), %xmm0 -; AVX-NEXT: vmovaps 128(%rdi), %xmm1 -; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1] -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 160(%rsi), %xmm0 -; AVX-NEXT: vmovaps 160(%rdi), %xmm1 -; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1] -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 192(%rsi), %xmm0 -; AVX-NEXT: vmovaps 192(%rdi), %xmm1 -; AVX-NEXT: vunpckhpd {{.*#+}} 
xmm2 = xmm1[1],xmm0[1] -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 224(%rsi), %xmm0 -; AVX-NEXT: vmovaps 224(%rdi), %xmm1 -; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1] -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 256(%rsi), %xmm0 -; AVX-NEXT: vmovaps 256(%rdi), %xmm1 -; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1] -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 288(%rsi), %xmm0 -; AVX-NEXT: vmovaps 288(%rdi), %xmm1 -; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1] -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 320(%rsi), %xmm0 -; AVX-NEXT: vmovaps 320(%rdi), %xmm1 -; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1] -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 352(%rsi), %xmm0 -; AVX-NEXT: vmovaps 352(%rdi), %xmm1 -; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1] -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 384(%rsi), %xmm0 -; AVX-NEXT: vmovaps 384(%rdi), %xmm1 -; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1] -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX-NEXT: vmovaps 416(%rsi), %xmm0 -; AVX-NEXT: vmovaps 416(%rdi), %xmm1 -; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1] -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 448(%rsi), %xmm0 -; AVX-NEXT: vmovaps 448(%rdi), %xmm1 -; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1] -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovaps 480(%rsi), %xmm0 -; AVX-NEXT: vmovaps 480(%rdi), %xmm1 -; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1] -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3] +; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3] +; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] +; AVX-NEXT: vshufpd {{.*#+}} 
ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3] +; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3] +; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3] +; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3] +; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3] +; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3] +; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3] +; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3] +; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3] +; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3] +; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3] +; AVX-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3] +; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3] +; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] +; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3] +; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3] @@ 
-2226,53 +2164,53 @@ define void @store_i64_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm7 = mem[0,1,0,1] ; AVX-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0],ymm15[0],ymm7[3],ymm15[3] ; AVX-NEXT: vmovapd %ymm7, 992(%rdx) -; AVX-NEXT: vmovapd %ymm14, 928(%rdx) -; AVX-NEXT: vmovapd %ymm13, 864(%rdx) -; AVX-NEXT: vmovapd %ymm12, 800(%rdx) -; AVX-NEXT: vmovapd %ymm11, 736(%rdx) -; AVX-NEXT: vmovapd %ymm10, 672(%rdx) -; AVX-NEXT: vmovapd %ymm9, 608(%rdx) -; AVX-NEXT: vmovapd %ymm8, 544(%rdx) -; AVX-NEXT: vmovapd %ymm0, 480(%rdx) -; AVX-NEXT: vmovapd %ymm1, 416(%rdx) -; AVX-NEXT: vmovapd %ymm2, 352(%rdx) -; AVX-NEXT: vmovapd %ymm3, 288(%rdx) -; AVX-NEXT: vmovapd %ymm4, 224(%rdx) -; AVX-NEXT: vmovapd %ymm5, 160(%rdx) -; AVX-NEXT: vmovapd %ymm6, 96(%rdx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 32(%rdx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 960(%rdx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 896(%rdx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 832(%rdx) -; AVX-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 768(%rdx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 704(%rdx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 640(%rdx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 576(%rdx) +; AVX-NEXT: vmovapd %ymm14, 960(%rdx) +; AVX-NEXT: vmovapd %ymm13, 928(%rdx) +; AVX-NEXT: vmovapd %ymm12, 896(%rdx) +; AVX-NEXT: vmovapd %ymm11, 864(%rdx) +; AVX-NEXT: vmovapd %ymm10, 832(%rdx) +; AVX-NEXT: vmovapd %ymm9, 800(%rdx) +; AVX-NEXT: vmovapd %ymm8, 768(%rdx) +; AVX-NEXT: vmovapd %ymm0, 736(%rdx) +; AVX-NEXT: vmovapd %ymm1, 704(%rdx) +; AVX-NEXT: vmovapd %ymm2, 672(%rdx) +; AVX-NEXT: vmovapd %ymm3, 640(%rdx) +; AVX-NEXT: vmovapd %ymm4, 608(%rdx) +; AVX-NEXT: vmovapd %ymm5, 576(%rdx) +; AVX-NEXT: vmovapd %ymm6, 544(%rdx) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm0, 512(%rdx) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm0, 480(%rdx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm0, 448(%rdx) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm0, 416(%rdx) +; AVX-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm0, 384(%rdx) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm0, 352(%rdx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm0, 320(%rdx) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm0, 288(%rdx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm0, 256(%rdx) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm0, 224(%rdx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm0, 192(%rdx) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm0, 160(%rdx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm0, 128(%rdx) 
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm0, 96(%rdx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm0, 64(%rdx) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm0, 32(%rdx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm0, (%rdx) ; AVX-NEXT: addq $424, %rsp # imm = 0x1A8 ; AVX-NEXT: vzeroupper