Skip to content

Commit 399865c

Browse files
authored
[X86] combineConcatVectorOps - concat per-lane v2f64/v4f64 shuffles into vXf64 vshufpd (#143017)
We can always concatenate v2f64/v4f64 per-lane shuffles into a single vshufpd instruction, assuming we can profitably concatenate at least one of its operands (or it's a unary shuffle). I was really hoping to get this into combineX86ShufflesRecursively, but it still can't handle concatenation/length changing as well as combineConcatVectorOps.
1 parent a029ece commit 399865c

File tree

2 files changed

+229
-242
lines changed

2 files changed

+229
-242
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 69 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -58503,14 +58503,23 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
5850358503
const APInt &SrcIdx0 = Src0.getConstantOperandAPInt(1);
5850458504
const APInt &SrcIdx1 = Src1.getConstantOperandAPInt(1);
5850558505
// concat(extract_subvector(v0), extract_subvector(v1)) -> vperm2x128.
58506-
// Only concat of subvector high halves which vperm2x128 is best at.
58506+
// Only concat of subvector high halves which vperm2x128 is best at or if
58507+
// it should fold into a subvector broadcast.
5850758508
if (VT.is256BitVector() && SrcVT0.is256BitVector() &&
58508-
SrcVT1.is256BitVector() && SrcIdx0 == (NumSrcElts0 / 2) &&
58509-
SrcIdx1 == (NumSrcElts1 / 2)) {
58510-
return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
58511-
DAG.getBitcast(VT, Src0.getOperand(0)),
58512-
DAG.getBitcast(VT, Src1.getOperand(0)),
58513-
DAG.getTargetConstant(0x31, DL, MVT::i8));
58509+
SrcVT1.is256BitVector()) {
58510+
assert((SrcIdx0 == 0 || SrcIdx0 == (NumSrcElts0 / 2)) &&
58511+
(SrcIdx1 == 0 || SrcIdx1 == (NumSrcElts1 / 2)) &&
58512+
"Bad subvector index");
58513+
if ((SrcIdx0 == (NumSrcElts0 / 2) && SrcIdx1 == (NumSrcElts1 / 2)) ||
58514+
(IsSplat && ISD::isNormalLoad(Src0.getOperand(0).getNode()))) {
58515+
unsigned Index = 0;
58516+
Index |= SrcIdx0 == 0 ? 0x00 : 0x01;
58517+
Index |= SrcIdx1 == 0 ? 0x20 : 0x30;
58518+
return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
58519+
DAG.getBitcast(VT, Src0.getOperand(0)),
58520+
DAG.getBitcast(VT, Src1.getOperand(0)),
58521+
DAG.getTargetConstant(Index, DL, MVT::i8));
58522+
}
5851458523
}
5851558524
// Widen extract_subvector
5851658525
// concat(extract_subvector(x,lo), extract_subvector(x,hi))
@@ -58672,7 +58681,6 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
5867258681
break;
5867358682
}
5867458683
case X86ISD::SHUFP: {
58675-
// TODO: Add SHUFPD support if/when necessary.
5867658684
if (!IsSplat &&
5867758685
(VT == MVT::v8f32 ||
5867858686
(VT == MVT::v16f32 && Subtarget.useAVX512Regs())) &&
@@ -58741,18 +58749,6 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
5874158749
DAG.getNode(X86ISD::VPERMILPI, DL, FloatVT, Res, Op0.getOperand(1));
5874258750
return DAG.getBitcast(VT, Res);
5874358751
}
58744-
if (!IsSplat && (VT == MVT::v4f64 || VT == MVT::v8f64)) {
58745-
unsigned NumSubElts = Op0.getValueType().getVectorNumElements();
58746-
uint64_t Mask = (1ULL << NumSubElts) - 1;
58747-
uint64_t Idx = 0;
58748-
for (unsigned I = 0; I != NumOps; ++I) {
58749-
uint64_t SubIdx = Ops[I].getConstantOperandVal(1);
58750-
Idx |= (SubIdx & Mask) << (I * NumSubElts);
58751-
}
58752-
return DAG.getNode(X86ISD::VPERMILPI, DL, VT,
58753-
ConcatSubOperand(VT, Ops, 0),
58754-
DAG.getTargetConstant(Idx, DL, MVT::i8));
58755-
}
5875658752
break;
5875758753
case X86ISD::VPERMILPV:
5875858754
if (!IsSplat && (VT.is256BitVector() ||
@@ -59323,6 +59319,59 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
5932359319
return DAG.getBitcast(VT, Res);
5932459320
}
5932559321

59322+
// We can always convert per-lane vXf64 shuffles into VSHUFPD.
59323+
if (!IsSplat &&
59324+
(VT == MVT::v4f64 || (VT == MVT::v8f64 && Subtarget.useAVX512Regs())) &&
59325+
all_of(Ops, [](SDValue Op) {
59326+
return Op.hasOneUse() && (Op.getOpcode() == X86ISD::MOVDDUP ||
59327+
Op.getOpcode() == X86ISD::SHUFP ||
59328+
Op.getOpcode() == X86ISD::VPERMILPI ||
59329+
Op.getOpcode() == X86ISD::BLENDI ||
59330+
Op.getOpcode() == X86ISD::UNPCKL ||
59331+
Op.getOpcode() == X86ISD::UNPCKH);
59332+
})) {
59333+
// Collect the individual per-lane v2f64/v4f64 shuffles.
59334+
MVT OpVT = Ops[0].getSimpleValueType();
59335+
unsigned NumOpElts = OpVT.getVectorNumElements();
59336+
SmallVector<SmallVector<SDValue, 2>, 4> SrcOps(NumOps);
59337+
SmallVector<SmallVector<int, 8>, 4> SrcMasks(NumOps);
59338+
if (all_of(seq<int>(NumOps), [&](int I) {
59339+
return getTargetShuffleMask(Ops[I], /*AllowSentinelZero=*/false,
59340+
SrcOps[I], SrcMasks[I]) &&
59341+
!is128BitLaneCrossingShuffleMask(OpVT, SrcMasks[I]) &&
59342+
SrcMasks[I].size() == NumOpElts &&
59343+
all_of(SrcOps[I], [&OpVT](SDValue V) {
59344+
return V.getValueType() == OpVT;
59345+
});
59346+
})) {
59347+
// Concatenate the shuffle masks into SHUFPD mask and collect subops.
59348+
bool Unary = true;
59349+
unsigned SHUFPDMask = 0;
59350+
SmallVector<SDValue, 4> LHS(NumOps), RHS(NumOps);
59351+
for (unsigned I = 0; I != NumOps; ++I) {
59352+
LHS[I] = SrcOps[I][SrcMasks[I][0] / NumOpElts];
59353+
RHS[I] = SrcOps[I][SrcMasks[I][1] / NumOpElts];
59354+
Unary &= LHS[I] == RHS[I];
59355+
for (unsigned J = 0; J != NumOpElts; ++J)
59356+
SHUFPDMask |= (SrcMasks[I][J] & 1) << ((I * NumOpElts) + J);
59357+
}
59358+
// Concat SHUFPD LHS/RHS operands - if they match then it will become a
59359+
// PERMILPD mask and we can always profitably concatenate them.
59360+
SDValue Concat0 =
59361+
combineConcatVectorOps(DL, VT, LHS, DAG, Subtarget, Depth + 1);
59362+
SDValue Concat1 =
59363+
combineConcatVectorOps(DL, VT, RHS, DAG, Subtarget, Depth + 1);
59364+
if (Unary || Concat0 || Concat1) {
59365+
Concat0 =
59366+
Concat0 ? Concat0 : DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS);
59367+
Concat1 =
59368+
Concat1 ? Concat1 : DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, RHS);
59369+
return DAG.getNode(X86ISD::SHUFP, DL, VT, Concat0, Concat1,
59370+
DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
59371+
}
59372+
}
59373+
}
59374+
5932659375
return SDValue();
5932759376
}
5932859377

0 commit comments

Comments
 (0)