@@ -58503,14 +58503,23 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
       const APInt &SrcIdx0 = Src0.getConstantOperandAPInt(1);
       const APInt &SrcIdx1 = Src1.getConstantOperandAPInt(1);
       // concat(extract_subvector(v0), extract_subvector(v1)) -> vperm2x128.
-      // Only concat of subvector high halves which vperm2x128 is best at.
+      // Only concat of subvector high halves which vperm2x128 is best at or if
+      // it should fold into a subvector broadcast.
       if (VT.is256BitVector() && SrcVT0.is256BitVector() &&
-          SrcVT1.is256BitVector() && SrcIdx0 == (NumSrcElts0 / 2) &&
-          SrcIdx1 == (NumSrcElts1 / 2)) {
-        return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
-                           DAG.getBitcast(VT, Src0.getOperand(0)),
-                           DAG.getBitcast(VT, Src1.getOperand(0)),
-                           DAG.getTargetConstant(0x31, DL, MVT::i8));
+          SrcVT1.is256BitVector()) {
+        assert((SrcIdx0 == 0 || SrcIdx0 == (NumSrcElts0 / 2)) &&
+               (SrcIdx1 == 0 || SrcIdx1 == (NumSrcElts1 / 2)) &&
+               "Bad subvector index");
+        if ((SrcIdx0 == (NumSrcElts0 / 2) && SrcIdx1 == (NumSrcElts1 / 2)) ||
+            (IsSplat && ISD::isNormalLoad(Src0.getOperand(0).getNode()))) {
+          unsigned Index = 0;
+          Index |= SrcIdx0 == 0 ? 0x00 : 0x01;
+          Index |= SrcIdx1 == 0 ? 0x20 : 0x30;
+          return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
+                             DAG.getBitcast(VT, Src0.getOperand(0)),
+                             DAG.getBitcast(VT, Src1.getOperand(0)),
+                             DAG.getTargetConstant(Index, DL, MVT::i8));
+        }
       }
       // Widen extract_subvector
       // concat(extract_subvector(x,lo), extract_subvector(x,hi))
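Aside (not part of the patch): a minimal standalone sketch of how the new Index immediate maps onto the VPERM2X128 encoding, where imm[1:0] selects the 128-bit lane written to the low half of the result and imm[5:4] selects the lane written to the high half. The helper name and its bool parameters are hypothetical; the patch computes the same values directly from SrcIdx0/SrcIdx1.

#include <cstdio>

// Hypothetical helper mirroring the Index computation above:
// imm[1:0] = 0/1 picks src0's low/high 128-bit lane for the result's low half,
// imm[5:4] = 2/3 picks src1's low/high 128-bit lane for the result's high half.
static unsigned buildVPerm2X128Imm(bool Src0TakesHighHalf, bool Src1TakesHighHalf) {
  unsigned Imm = 0;
  Imm |= Src0TakesHighHalf ? 0x01 : 0x00;
  Imm |= Src1TakesHighHalf ? 0x30 : 0x20;
  return Imm;
}

int main() {
  // Both extracts take the high halves: 0x31, the only immediate the old code emitted.
  printf("%#x\n", buildVPerm2X128Imm(true, true));
  // Both extracts take the low halves: 0x20, now reachable when the concat is a
  // splat of a loaded subvector and should fold into a subvector broadcast.
  printf("%#x\n", buildVPerm2X128Imm(false, false));
  return 0;
}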
@@ -58672,7 +58681,6 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
       break;
     }
     case X86ISD::SHUFP: {
-      // TODO: Add SHUFPD support if/when necessary.
       if (!IsSplat &&
           (VT == MVT::v8f32 ||
            (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) &&
@@ -58741,18 +58749,6 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
             DAG.getNode(X86ISD::VPERMILPI, DL, FloatVT, Res, Op0.getOperand(1));
         return DAG.getBitcast(VT, Res);
       }
-      if (!IsSplat && (VT == MVT::v4f64 || VT == MVT::v8f64)) {
-        unsigned NumSubElts = Op0.getValueType().getVectorNumElements();
-        uint64_t Mask = (1ULL << NumSubElts) - 1;
-        uint64_t Idx = 0;
-        for (unsigned I = 0; I != NumOps; ++I) {
-          uint64_t SubIdx = Ops[I].getConstantOperandVal(1);
-          Idx |= (SubIdx & Mask) << (I * NumSubElts);
-        }
-        return DAG.getNode(X86ISD::VPERMILPI, DL, VT,
-                           ConcatSubOperand(VT, Ops, 0),
-                           DAG.getTargetConstant(Idx, DL, MVT::i8));
-      }
       break;
     case X86ISD::VPERMILPV:
       if (!IsSplat && (VT.is256BitVector() ||
@@ -59323,6 +59319,59 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
     return DAG.getBitcast(VT, Res);
   }
 
+  // We can always convert per-lane vXf64 shuffles into VSHUFPD.
+  if (!IsSplat &&
+      (VT == MVT::v4f64 || (VT == MVT::v8f64 && Subtarget.useAVX512Regs())) &&
+      all_of(Ops, [](SDValue Op) {
+        return Op.hasOneUse() && (Op.getOpcode() == X86ISD::MOVDDUP ||
+                                  Op.getOpcode() == X86ISD::SHUFP ||
+                                  Op.getOpcode() == X86ISD::VPERMILPI ||
+                                  Op.getOpcode() == X86ISD::BLENDI ||
+                                  Op.getOpcode() == X86ISD::UNPCKL ||
+                                  Op.getOpcode() == X86ISD::UNPCKH);
+      })) {
+    // Collect the individual per-lane v2f64/v4f64 shuffles.
+    MVT OpVT = Ops[0].getSimpleValueType();
+    unsigned NumOpElts = OpVT.getVectorNumElements();
+    SmallVector<SmallVector<SDValue, 2>, 4> SrcOps(NumOps);
+    SmallVector<SmallVector<int, 8>, 4> SrcMasks(NumOps);
+    if (all_of(seq<int>(NumOps), [&](int I) {
+          return getTargetShuffleMask(Ops[I], /*AllowSentinelZero=*/false,
+                                      SrcOps[I], SrcMasks[I]) &&
+                 !is128BitLaneCrossingShuffleMask(OpVT, SrcMasks[I]) &&
+                 SrcMasks[I].size() == NumOpElts &&
+                 all_of(SrcOps[I], [&OpVT](SDValue V) {
+                   return V.getValueType() == OpVT;
+                 });
+        })) {
+      // Concatenate the shuffle masks into SHUFPD mask and collect subops.
+      bool Unary = true;
+      unsigned SHUFPDMask = 0;
+      SmallVector<SDValue, 4> LHS(NumOps), RHS(NumOps);
+      for (unsigned I = 0; I != NumOps; ++I) {
+        LHS[I] = SrcOps[I][SrcMasks[I][0] / NumOpElts];
+        RHS[I] = SrcOps[I][SrcMasks[I][1] / NumOpElts];
+        Unary &= LHS[I] == RHS[I];
+        for (unsigned J = 0; J != NumOpElts; ++J)
+          SHUFPDMask |= (SrcMasks[I][J] & 1) << ((I * NumOpElts) + J);
+      }
+      // Concat SHUFPD LHS/RHS operands - if they match then it will become a
+      // PERMILPD mask and we can always profitably concatenate them.
+      SDValue Concat0 =
+          combineConcatVectorOps(DL, VT, LHS, DAG, Subtarget, Depth + 1);
+      SDValue Concat1 =
+          combineConcatVectorOps(DL, VT, RHS, DAG, Subtarget, Depth + 1);
+      if (Unary || Concat0 || Concat1) {
+        Concat0 =
+            Concat0 ? Concat0 : DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS);
+        Concat1 =
+            Concat1 ? Concat1 : DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, RHS);
+        return DAG.getNode(X86ISD::SHUFP, DL, VT, Concat0, Concat1,
+                           DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
+      }
+    }
+  }
+
   return SDValue();
 }
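Aside (not part of the patch): a small self-contained sketch of how the SHUFPD immediate is accumulated in the new combine above; only the low bit of each per-lane mask element survives, shifted to that element's position in the widened vector. The std::vector-based helper and its names are hypothetical stand-ins for the SmallVector code in the patch.

#include <cstdio>
#include <vector>

// Hypothetical helper: fold per-128-bit-lane v2f64 shuffle masks into one
// SHUFPD immediate, the same bit placement as the inner loops in the patch.
static unsigned buildShufpdImm(const std::vector<std::vector<int>> &SrcMasks,
                               unsigned NumOpElts) {
  unsigned SHUFPDMask = 0;
  for (unsigned I = 0; I != SrcMasks.size(); ++I)
    for (unsigned J = 0; J != NumOpElts; ++J)
      SHUFPDMask |= (SrcMasks[I][J] & 1) << ((I * NumOpElts) + J);
  return SHUFPDMask;
}

int main() {
  // v4f64 built from two v2f64 ops: the first keeps <elt0, elt1>, the second
  // swaps to <elt1, elt0>; the combined immediate is 0b0110 = 0x6.
  std::vector<std::vector<int>> SrcMasks = {{0, 1}, {1, 0}};
  printf("%#x\n", buildShufpdImm(SrcMasks, /*NumOpElts=*/2));
  return 0;
}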