llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (2 changes: 1 addition & 1 deletion)

@@ -15137,7 +15137,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
return foldedExt;
} else if (ISD::isNON_EXTLoad(N0.getNode()) &&
ISD::isUNINDEXEDLoad(N0.getNode()) &&
- TLI.isLoadExtLegal(ISD::EXTLOAD, VT, N0.getValueType())) {
+ TLI.isLoadExtLegalOrCustom(ISD::EXTLOAD, VT, N0.getValueType())) {
bool DoXform = true;
SmallVector<SDNode *, 4> SetCCs;
if (!N0.hasOneUse())
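The predicate swap above is the generic-DAG half of this change: the combine that folds (any_extend (load ...)) into a single extending load now also fires when the target marks that extload Custom rather than Legal. Roughly, the two TargetLowering queries differ only in which legalize actions they accept; the sketch below paraphrases llvm/include/llvm/CodeGen/TargetLowering.h and is not the verbatim implementation:

```cpp
// Sketch: isLoadExtLegal() accepts only Legal, so a target that custom-lowers
// an extload would previously block this DAG combine.
bool isLoadExtLegal(unsigned ExtType, EVT ValVT, EVT MemVT) const {
  return getLoadExtAction(ExtType, ValVT, MemVT) == Legal;
}

// isLoadExtLegalOrCustom() additionally accepts Custom, letting the combiner
// form extloads that the target will rewrite itself (as NVPTX does below).
bool isLoadExtLegalOrCustom(unsigned ExtType, EVT ValVT, EVT MemVT) const {
  LegalizeAction Action = getLoadExtAction(ExtType, ValVT, MemVT);
  return Action == Legal || Action == Custom;
}
```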
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp (174 changes: 74 additions & 100 deletions)

@@ -702,57 +702,66 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// intrinsics.
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);

- // Turn FP extload into load/fpextend
- setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
- // Turn FP truncstore into trunc + store.
- // FIXME: vector types should also be expanded
- setTruncStoreAction(MVT::f32, MVT::f16, Expand);
- setTruncStoreAction(MVT::f64, MVT::f16, Expand);
- setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
- setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
- setTruncStoreAction(MVT::f64, MVT::f32, Expand);
- setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
- setTruncStoreAction(MVT::v2f32, MVT::v2bf16, Expand);
+ // FP extload/truncstore is not legal in PTX. We need to expand all these.
+ for (auto FloatVTs :
+ {MVT::fp_valuetypes(), MVT::fp_fixedlen_vector_valuetypes()}) {
+ for (MVT ValVT : FloatVTs) {
+ for (MVT MemVT : FloatVTs) {
+ setLoadExtAction(ISD::EXTLOAD, ValVT, MemVT, Expand);
+ setTruncStoreAction(ValVT, MemVT, Expand);
+ }
+ }
+ }

- // PTX does not support load / store predicate registers
- setOperationAction(ISD::LOAD, MVT::i1, Custom);
- setOperationAction(ISD::STORE, MVT::i1, Custom);
+ // To improve CodeGen we'll legalize any-extend loads to zext loads. This is
+ // how they'll be lowered in ISel anyway, and by doing this a little earlier
+ // we allow for more DAG combine opportunities.
+ for (auto IntVTs :
+ {MVT::integer_valuetypes(), MVT::integer_fixedlen_vector_valuetypes()})
+ for (MVT ValVT : IntVTs)
+ for (MVT MemVT : IntVTs)
+ if (isTypeLegal(ValVT))
+ setLoadExtAction(ISD::EXTLOAD, ValVT, MemVT, Custom);

+ // PTX does not support load / store predicate registers
+ setOperationAction({ISD::LOAD, ISD::STORE}, MVT::i1, Custom);
for (MVT VT : MVT::integer_valuetypes()) {
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
- setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
+ setLoadExtAction({ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}, VT, MVT::i1,
+ Promote);
setTruncStoreAction(VT, MVT::i1, Expand);
}

+ // Disable generations of extload/truncstore for v2i16/v2i8. The generic
+ // expansion for these nodes when they are unaligned is incorrect if the
+ // type is a vector.
+ //
+ // TODO: Fix the generic expansion for these nodes found in
+ // TargetLowering::expandUnalignedLoad/Store.
+ setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::v2i16,
+ MVT::v2i8, Expand);
+ setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
+
+ // Register custom handling for illegal type loads/stores. We'll try to custom
+ // lower almost all illegal types and logic in the lowering will discard cases
+ // we can't handle.
+ setOperationAction({ISD::LOAD, ISD::STORE}, {MVT::i128, MVT::f128}, Custom);
+ for (MVT VT : MVT::fixedlen_vector_valuetypes())
+ if (!isTypeLegal(VT) && VT.getStoreSizeInBits() <= 256)
+ setOperationAction({ISD::STORE, ISD::LOAD}, VT, Custom);
+
+ // Custom legalization for LDU intrinsics.
+ // TODO: The logic to lower these is not very robust and we should rewrite it.
+ // Perhaps LDU should not be represented as an intrinsic at all.
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
+ for (MVT VT : MVT::fixedlen_vector_valuetypes())
+ if (IsPTXVectorType(VT))
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom);

setCondCodeAction({ISD::SETNE, ISD::SETEQ, ISD::SETUGE, ISD::SETULE,
ISD::SETUGT, ISD::SETULT, ISD::SETGT, ISD::SETLT,
ISD::SETGE, ISD::SETLE},
MVT::i1, Expand);

- // expand extload of vector of integers.
- setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::v2i16,
- MVT::v2i8, Expand);
- setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);

// This is legal in NVPTX
setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
@@ -767,24 +776,12 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// DEBUGTRAP can be lowered to PTX brkpt
setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);

- // Register custom handling for vector loads/stores
- for (MVT VT : MVT::fixedlen_vector_valuetypes())
- if (IsPTXVectorType(VT))
- setOperationAction({ISD::LOAD, ISD::STORE, ISD::INTRINSIC_W_CHAIN}, VT,
- Custom);
-
- setOperationAction({ISD::LOAD, ISD::STORE, ISD::INTRINSIC_W_CHAIN},
- {MVT::i128, MVT::f128}, Custom);
-
// Support varargs.
setOperationAction(ISD::VASTART, MVT::Other, Custom);
setOperationAction(ISD::VAARG, MVT::Other, Custom);
setOperationAction(ISD::VACOPY, MVT::Other, Expand);
setOperationAction(ISD::VAEND, MVT::Other, Expand);

- // Custom handling for i8 intrinsics
- setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
-
setOperationAction({ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX},
{MVT::i16, MVT::i32, MVT::i64}, Legal);

@@ -3092,39 +3089,14 @@ static void replaceLoadVector(SDNode *N, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &Results,
const NVPTXSubtarget &STI);

- SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
- if (Op.getValueType() == MVT::i1)
- return LowerLOADi1(Op, DAG);
-
- EVT VT = Op.getValueType();
-
- if (NVPTX::isPackedVectorTy(VT)) {
- // v2f32/v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on legalizer to
- // handle unaligned loads and have to handle it here.
- LoadSDNode *Load = cast<LoadSDNode>(Op);
- EVT MemVT = Load->getMemoryVT();
- if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
- MemVT, *Load->getMemOperand())) {
- SDValue Ops[2];
- std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
- return DAG.getMergeValues(Ops, SDLoc(Op));
- }
- }
-
- return SDValue();
- }

// v = ld i1* addr
// =>
// v1 = ld i8* addr (-> i16)
// v = trunc i16 to i1
- SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
- SDNode *Node = Op.getNode();
- LoadSDNode *LD = cast<LoadSDNode>(Node);
- SDLoc dl(Node);
+ static SDValue lowerLOADi1(LoadSDNode *LD, SelectionDAG &DAG) {
+ SDLoc dl(LD);
assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
- assert(Node->getValueType(0) == MVT::i1 &&
- "Custom lowering for i1 load only");
+ assert(LD->getValueType(0) == MVT::i1 && "Custom lowering for i1 load only");
SDValue newLD = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i16, LD->getChain(),
LD->getBasePtr(), LD->getPointerInfo(),
MVT::i8, LD->getAlign(),
@@ -3133,8 +3105,27 @@ SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
// The legalizer (the caller) is expecting two values from the legalized
// load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
// in LegalizeDAG.cpp which also uses MergeValues.
- SDValue Ops[] = { result, LD->getChain() };
- return DAG.getMergeValues(Ops, dl);
+ return DAG.getMergeValues({result, LD->getChain()}, dl);
}

+ SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+ LoadSDNode *LD = cast<LoadSDNode>(Op);
+
+ if (Op.getValueType() == MVT::i1)
+ return lowerLOADi1(LD, DAG);
+
+ // To improve CodeGen we'll legalize any-extend loads to zext loads. This is
+ // how they'll be lowered in ISel anyway, and by doing this a little earlier
+ // we allow for more DAG combine opportunities.
+ if (LD->getExtensionType() == ISD::EXTLOAD) {
+ assert(LD->getValueType(0).isInteger() && LD->getMemoryVT().isInteger() &&
+ "Unexpected fpext-load");
+ return DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(Op), Op.getValueType(),
+ LD->getChain(), LD->getBasePtr(), LD->getMemoryVT(),
+ LD->getMemOperand());
+ }
+
+ llvm_unreachable("Unexpected custom lowering for load");
+ }

SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
@@ -3144,17 +3135,6 @@ SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
if (VT == MVT::i1)
return LowerSTOREi1(Op, DAG);

- // v2f32/v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on legalizer to
- // handle unaligned stores and have to handle it here.
- if (NVPTX::isPackedVectorTy(VT) &&
- !allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
- VT, *Store->getMemOperand()))
- return expandUnalignedStore(Store, DAG);
-
- // v2f16/v2bf16/v2i16 don't need special handling.
- if (NVPTX::isPackedVectorTy(VT) && VT.is32BitVector())
- return SDValue();

// Lower store of any other vector type, including v2f32 as we want to break
// it apart since this is not a widely-supported type.
return LowerSTOREVector(Op, DAG);
@@ -4010,14 +3990,8 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
case Intrinsic::nvvm_ldu_global_i:
case Intrinsic::nvvm_ldu_global_f:
case Intrinsic::nvvm_ldu_global_p: {
- auto &DL = I.getDataLayout();
Info.opc = ISD::INTRINSIC_W_CHAIN;
- if (Intrinsic == Intrinsic::nvvm_ldu_global_i)
- Info.memVT = getValueType(DL, I.getType());
- else if(Intrinsic == Intrinsic::nvvm_ldu_global_p)
- Info.memVT = getPointerTy(DL);
- else
- Info.memVT = getValueType(DL, I.getType());
+ Info.memVT = getValueType(I.getDataLayout(), I.getType());
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Info.flags = MachineMemOperand::MOLoad;
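Taken together, the NVPTX changes mark integer EXTLOADs as Custom and then resolve them in LowerLOAD by pinning the undefined high bits to zero. The core rewrite, extracted here into a self-contained sketch (the helper name is ours for illustration; the real code lives inside NVPTXTargetLowering::LowerLOAD above):

```cpp
// An any-extending load leaves the bits above MemVT undefined, so replacing it
// with a zero-extending load of the same memory type is always sound. The
// payoff is that later DAG combines see known-zero high bits.
static SDValue lowerAnyExtLoadAsZExt(LoadSDNode *LD, SelectionDAG &DAG) {
  assert(LD->getExtensionType() == ISD::EXTLOAD && "expected any-extend load");
  return DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(LD), LD->getValueType(0),
                        LD->getChain(), LD->getBasePtr(), LD->getMemoryVT(),
                        LD->getMemOperand());
}
```

The Mips test updates below appear to show the same idea paying off through the shared combiner: with extending loads formed earlier, an i32 load feeding a 64-bit argument register becomes a single extending load (lwu, or lw straight into the argument register) instead of a load followed by an implicit-def and a register move.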
llvm/lib/Target/NVPTX/NVPTXISelLowering.h (2 changes: 0 additions & 2 deletions)

@@ -309,8 +309,6 @@ class NVPTXTargetLowering : public TargetLowering {
SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;

SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerLOADi1(SDValue Op, SelectionDAG &DAG) const;
-
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const;
llvm/test/CodeGen/Mips/implicit-sret.ll (16 changes: 4 additions & 12 deletions)

@@ -19,9 +19,7 @@ define internal void @test() unnamed_addr nounwind {
; CHECK-NEXT: ld $6, 24($sp)
; CHECK-NEXT: ld $5, 16($sp)
; CHECK-NEXT: ld $7, 32($sp)
- ; CHECK-NEXT: lw $1, 0($sp)
- ; CHECK-NEXT: # implicit-def: $a0_64
- ; CHECK-NEXT: move $4, $1
+ ; CHECK-NEXT: lw $4, 0($sp)
; CHECK-NEXT: jal use_sret
; CHECK-NEXT: nop
; CHECK-NEXT: ld $ra, 56($sp) # 8-byte Folded Reload
@@ -64,15 +62,9 @@ define internal void @test2() unnamed_addr nounwind {
; CHECK-NEXT: daddiu $4, $sp, 0
; CHECK-NEXT: jal implicit_sret_decl2
; CHECK-NEXT: nop
- ; CHECK-NEXT: lw $1, 20($sp)
- ; CHECK-NEXT: lw $2, 12($sp)
- ; CHECK-NEXT: lw $3, 4($sp)
- ; CHECK-NEXT: # implicit-def: $a0_64
- ; CHECK-NEXT: move $4, $3
- ; CHECK-NEXT: # implicit-def: $a1_64
- ; CHECK-NEXT: move $5, $2
- ; CHECK-NEXT: # implicit-def: $a2_64
- ; CHECK-NEXT: move $6, $1
+ ; CHECK-NEXT: lw $6, 20($sp)
+ ; CHECK-NEXT: lw $5, 12($sp)
+ ; CHECK-NEXT: lw $4, 4($sp)
; CHECK-NEXT: jal use_sret2
; CHECK-NEXT: nop
; CHECK-NEXT: ld $ra, 24($sp) # 8-byte Folded Reload
llvm/test/CodeGen/Mips/msa/basic_operations.ll (8 changes: 4 additions & 4 deletions)

@@ -1904,7 +1904,7 @@ define void @insert_v16i8_vidx(i32 signext %a) nounwind {
; N64-NEXT: daddu $1, $1, $25
; N64-NEXT: daddiu $1, $1, %lo(%neg(%gp_rel(insert_v16i8_vidx)))
; N64-NEXT: ld $2, %got_disp(i32)($1)
- ; N64-NEXT: lw $2, 0($2)
+ ; N64-NEXT: lwu $2, 0($2)
; N64-NEXT: andi $2, $2, 15
; N64-NEXT: ld $1, %got_disp(v16i8)($1)
; N64-NEXT: daddu $1, $1, $2
@@ -1953,7 +1953,7 @@ define void @insert_v8i16_vidx(i32 signext %a) nounwind {
; N64-NEXT: daddu $1, $1, $25
; N64-NEXT: daddiu $1, $1, %lo(%neg(%gp_rel(insert_v8i16_vidx)))
; N64-NEXT: ld $2, %got_disp(i32)($1)
- ; N64-NEXT: lw $2, 0($2)
+ ; N64-NEXT: lwu $2, 0($2)
; N64-NEXT: andi $2, $2, 7
; N64-NEXT: ld $1, %got_disp(v8i16)($1)
; N64-NEXT: dlsa $1, $2, $1, 1
@@ -2002,7 +2002,7 @@ define void @insert_v4i32_vidx(i32 signext %a) nounwind {
; N64-NEXT: daddu $1, $1, $25
; N64-NEXT: daddiu $1, $1, %lo(%neg(%gp_rel(insert_v4i32_vidx)))
; N64-NEXT: ld $2, %got_disp(i32)($1)
- ; N64-NEXT: lw $2, 0($2)
+ ; N64-NEXT: lwu $2, 0($2)
; N64-NEXT: andi $2, $2, 3
; N64-NEXT: ld $1, %got_disp(v4i32)($1)
; N64-NEXT: dlsa $1, $2, $1, 2
@@ -2053,7 +2053,7 @@ define void @insert_v2i64_vidx(i64 signext %a) nounwind {
; N64-NEXT: daddu $1, $1, $25
; N64-NEXT: daddiu $1, $1, %lo(%neg(%gp_rel(insert_v2i64_vidx)))
; N64-NEXT: ld $2, %got_disp(i32)($1)
- ; N64-NEXT: lw $2, 0($2)
+ ; N64-NEXT: lwu $2, 0($2)
; N64-NEXT: andi $2, $2, 1
; N64-NEXT: ld $1, %got_disp(v2i64)($1)
; N64-NEXT: dlsa $1, $2, $1, 3
llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll (10 changes: 5 additions & 5 deletions)

@@ -711,11 +711,11 @@ define <2 x bfloat> @test_copysign(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
; CHECK-NEXT: .reg .b32 %r<6>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
- ; CHECK-NEXT: ld.param.b32 %r1, [test_copysign_param_0];
- ; CHECK-NEXT: ld.param.b32 %r2, [test_copysign_param_1];
- ; CHECK-NEXT: and.b32 %r3, %r2, -2147450880;
- ; CHECK-NEXT: and.b32 %r4, %r1, 2147450879;
- ; CHECK-NEXT: or.b32 %r5, %r4, %r3;
+ ; CHECK-NEXT: ld.param.b32 %r1, [test_copysign_param_1];
+ ; CHECK-NEXT: and.b32 %r2, %r1, -2147450880;
+ ; CHECK-NEXT: ld.param.b32 %r3, [test_copysign_param_0];
+ ; CHECK-NEXT: and.b32 %r4, %r3, 2147450879;
+ ; CHECK-NEXT: or.b32 %r5, %r4, %r2;
; CHECK-NEXT: st.param.b32 [func_retval0], %r5;
; CHECK-NEXT: ret;
%r = call <2 x bfloat> @llvm.copysign.f16(<2 x bfloat> %a, <2 x bfloat> %b)