
Conversation

AlexMaclean
Member

The original version of this change inadvertently dropped b6e19b3. This version retains that fix, adds tests for it, and includes an explanation of why it is needed.
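
Judging from the diff below, the retained fix appears to be the v2i16/v2i8 extload/truncstore restriction (the new explanatory comment and the added i8x2-instructions.ll coverage). As a minimal sketch of that kind of case, written for this summary rather than taken from the patch: a low-alignment <2 x i8> load with a widening use, which a DAG combine could otherwise fold into a v2i16 <- v2i8 extload whose generic unaligned expansion is incorrect for vector types.

; Illustrative sketch only, not copied from the patch. With the
; v2i16 <- v2i8 extload kept at Expand, the combiner never forms a vector
; extload here, so the broken generic unaligned expansion is never reached.
define <2 x i16> @ext_v2i8(ptr %p) {
  %v = load <2 x i8>, ptr %p, align 1
  %e = zext <2 x i8> %v to <2 x i16>
  ret <2 x i16> %e
}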

@llvmbot
Member

llvmbot commented Aug 23, 2025

@llvm/pr-subscribers-backend-mips

@llvm/pr-subscribers-backend-nvptx

Author: Alex MacLean (AlexMaclean)

Changes

The original version of this change inadvertently dropped b6e19b3. This version retains that fix, adds tests for it, and includes an explanation of why it is needed.


Patch is 141.49 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/155063.diff

13 Files Affected:

  • (modified) llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (+1-1)
  • (modified) llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp (+74-100)
  • (modified) llvm/lib/Target/NVPTX/NVPTXISelLowering.h (-2)
  • (modified) llvm/test/CodeGen/Mips/implicit-sret.ll (+4-12)
  • (modified) llvm/test/CodeGen/Mips/msa/basic_operations.ll (+4-4)
  • (modified) llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll (+5-5)
  • (modified) llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll (+180-198)
  • (modified) llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll (+180-198)
  • (modified) llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll (+180-198)
  • (modified) llvm/test/CodeGen/NVPTX/cmpxchg.ll (+100-110)
  • (modified) llvm/test/CodeGen/NVPTX/i8x2-instructions.ll (+115)
  • (modified) llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll (+30-34)
  • (modified) llvm/test/CodeGen/NVPTX/mulwide.ll (+4-6)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 27b5a0d37b679..e733f680dc345 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -15137,7 +15137,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
       return foldedExt;
   } else if (ISD::isNON_EXTLoad(N0.getNode()) &&
              ISD::isUNINDEXEDLoad(N0.getNode()) &&
-             TLI.isLoadExtLegal(ISD::EXTLOAD, VT, N0.getValueType())) {
+             TLI.isLoadExtLegalOrCustom(ISD::EXTLOAD, VT, N0.getValueType())) {
     bool DoXform = true;
     SmallVector<SDNode *, 4> SetCCs;
     if (!N0.hasOneUse())
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index bb4bb1195f78b..997c33f1f6a76 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -702,57 +702,66 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
   // intrinsics.
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
 
-  // Turn FP extload into load/fpextend
-  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
-  // Turn FP truncstore into trunc + store.
-  // FIXME: vector types should also be expanded
-  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
-  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
-  setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
-  setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
-  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
-  setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
-  setTruncStoreAction(MVT::v2f32, MVT::v2bf16, Expand);
+  // FP extload/truncstore is not legal in PTX. We need to expand all these.
+  for (auto FloatVTs :
+       {MVT::fp_valuetypes(), MVT::fp_fixedlen_vector_valuetypes()}) {
+    for (MVT ValVT : FloatVTs) {
+      for (MVT MemVT : FloatVTs) {
+        setLoadExtAction(ISD::EXTLOAD, ValVT, MemVT, Expand);
+        setTruncStoreAction(ValVT, MemVT, Expand);
+      }
+    }
+  }
 
-  // PTX does not support load / store predicate registers
-  setOperationAction(ISD::LOAD, MVT::i1, Custom);
-  setOperationAction(ISD::STORE, MVT::i1, Custom);
+  // To improve CodeGen we'll legalize any-extend loads to zext loads. This is
+  // how they'll be lowered in ISel anyway, and by doing this a little earlier
+  // we allow for more DAG combine opportunities.
+  for (auto IntVTs :
+       {MVT::integer_valuetypes(), MVT::integer_fixedlen_vector_valuetypes()})
+    for (MVT ValVT : IntVTs)
+      for (MVT MemVT : IntVTs)
+        if (isTypeLegal(ValVT))
+          setLoadExtAction(ISD::EXTLOAD, ValVT, MemVT, Custom);
 
+  // PTX does not support load / store predicate registers
+  setOperationAction({ISD::LOAD, ISD::STORE}, MVT::i1, Custom);
   for (MVT VT : MVT::integer_valuetypes()) {
-    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
-    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
-    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
+    setLoadExtAction({ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}, VT, MVT::i1,
+                     Promote);
     setTruncStoreAction(VT, MVT::i1, Expand);
   }
 
+  // Disable generation of extload/truncstore for v2i16/v2i8. The generic
+  // expansion for these nodes when they are unaligned is incorrect if the
+  // type is a vector.
+  //
+  // TODO: Fix the generic expansion for these nodes found in
+  //       TargetLowering::expandUnalignedLoad/Store.
+  setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::v2i16,
+                   MVT::v2i8, Expand);
+  setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
+
+  // Register custom handling for illegal type loads/stores. We'll try to custom
+  // lower almost all illegal types and logic in the lowering will discard cases
+  // we can't handle.
+  setOperationAction({ISD::LOAD, ISD::STORE}, {MVT::i128, MVT::f128}, Custom);
+  for (MVT VT : MVT::fixedlen_vector_valuetypes())
+    if (!isTypeLegal(VT) && VT.getStoreSizeInBits() <= 256)
+      setOperationAction({ISD::STORE, ISD::LOAD}, VT, Custom);
+
+  // Custom legalization for LDU intrinsics.
+  // TODO: The logic to lower these is not very robust and we should rewrite it.
+  //       Perhaps LDU should not be represented as an intrinsic at all.
+  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
+  for (MVT VT : MVT::fixedlen_vector_valuetypes())
+    if (IsPTXVectorType(VT))
+      setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom);
+
   setCondCodeAction({ISD::SETNE, ISD::SETEQ, ISD::SETUGE, ISD::SETULE,
                      ISD::SETUGT, ISD::SETULT, ISD::SETGT, ISD::SETLT,
                      ISD::SETGE, ISD::SETLE},
                     MVT::i1, Expand);
 
-  // expand extload of vector of integers.
-  setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::v2i16,
-                   MVT::v2i8, Expand);
-  setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
-
   // This is legal in NVPTX
   setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
   setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
@@ -767,24 +776,12 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
   // DEBUGTRAP can be lowered to PTX brkpt
   setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
 
-  // Register custom handling for vector loads/stores
-  for (MVT VT : MVT::fixedlen_vector_valuetypes())
-    if (IsPTXVectorType(VT))
-      setOperationAction({ISD::LOAD, ISD::STORE, ISD::INTRINSIC_W_CHAIN}, VT,
-                         Custom);
-
-  setOperationAction({ISD::LOAD, ISD::STORE, ISD::INTRINSIC_W_CHAIN},
-                     {MVT::i128, MVT::f128}, Custom);
-
   // Support varargs.
   setOperationAction(ISD::VASTART, MVT::Other, Custom);
   setOperationAction(ISD::VAARG, MVT::Other, Custom);
   setOperationAction(ISD::VACOPY, MVT::Other, Expand);
   setOperationAction(ISD::VAEND, MVT::Other, Expand);
 
-  // Custom handling for i8 intrinsics
-  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
-
   setOperationAction({ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX},
                      {MVT::i16, MVT::i32, MVT::i64}, Legal);
 
@@ -3092,39 +3089,14 @@ static void replaceLoadVector(SDNode *N, SelectionDAG &DAG,
                               SmallVectorImpl<SDValue> &Results,
                               const NVPTXSubtarget &STI);
 
-SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
-  if (Op.getValueType() == MVT::i1)
-    return LowerLOADi1(Op, DAG);
-
-  EVT VT = Op.getValueType();
-
-  if (NVPTX::isPackedVectorTy(VT)) {
-    // v2f32/v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on legalizer to
-    // handle unaligned loads and have to handle it here.
-    LoadSDNode *Load = cast<LoadSDNode>(Op);
-    EVT MemVT = Load->getMemoryVT();
-    if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
-                                        MemVT, *Load->getMemOperand())) {
-      SDValue Ops[2];
-      std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
-      return DAG.getMergeValues(Ops, SDLoc(Op));
-    }
-  }
-
-  return SDValue();
-}
-
 // v = ld i1* addr
 //   =>
 // v1 = ld i8* addr (-> i16)
 // v = trunc i16 to i1
-SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
-  SDNode *Node = Op.getNode();
-  LoadSDNode *LD = cast<LoadSDNode>(Node);
-  SDLoc dl(Node);
+static SDValue lowerLOADi1(LoadSDNode *LD, SelectionDAG &DAG) {
+  SDLoc dl(LD);
   assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
-  assert(Node->getValueType(0) == MVT::i1 &&
-         "Custom lowering for i1 load only");
+  assert(LD->getValueType(0) == MVT::i1 && "Custom lowering for i1 load only");
   SDValue newLD = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i16, LD->getChain(),
                                  LD->getBasePtr(), LD->getPointerInfo(),
                                  MVT::i8, LD->getAlign(),
@@ -3133,8 +3105,27 @@ SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
   // The legalizer (the caller) is expecting two values from the legalized
   // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
   // in LegalizeDAG.cpp which also uses MergeValues.
-  SDValue Ops[] = { result, LD->getChain() };
-  return DAG.getMergeValues(Ops, dl);
+  return DAG.getMergeValues({result, LD->getChain()}, dl);
+}
+
+SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+  LoadSDNode *LD = cast<LoadSDNode>(Op);
+
+  if (Op.getValueType() == MVT::i1)
+    return lowerLOADi1(LD, DAG);
+
+  // To improve CodeGen we'll legalize any-extend loads to zext loads. This is
+  // how they'll be lowered in ISel anyway, and by doing this a little earlier
+  // we allow for more DAG combine opportunities.
+  if (LD->getExtensionType() == ISD::EXTLOAD) {
+    assert(LD->getValueType(0).isInteger() && LD->getMemoryVT().isInteger() &&
+           "Unexpected fpext-load");
+    return DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(Op), Op.getValueType(),
+                          LD->getChain(), LD->getBasePtr(), LD->getMemoryVT(),
+                          LD->getMemOperand());
+  }
+
+  llvm_unreachable("Unexpected custom lowering for load");
 }
 
 SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
@@ -3144,17 +3135,6 @@ SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
   if (VT == MVT::i1)
     return LowerSTOREi1(Op, DAG);
 
-  // v2f32/v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on legalizer to
-  // handle unaligned stores and have to handle it here.
-  if (NVPTX::isPackedVectorTy(VT) &&
-      !allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
-                                      VT, *Store->getMemOperand()))
-    return expandUnalignedStore(Store, DAG);
-
-  // v2f16/v2bf16/v2i16 don't need special handling.
-  if (NVPTX::isPackedVectorTy(VT) && VT.is32BitVector())
-    return SDValue();
-
   // Lower store of any other vector type, including v2f32 as we want to break
   // it apart since this is not a widely-supported type.
   return LowerSTOREVector(Op, DAG);
@@ -4010,14 +3990,8 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
   case Intrinsic::nvvm_ldu_global_i:
   case Intrinsic::nvvm_ldu_global_f:
   case Intrinsic::nvvm_ldu_global_p: {
-    auto &DL = I.getDataLayout();
     Info.opc = ISD::INTRINSIC_W_CHAIN;
-    if (Intrinsic == Intrinsic::nvvm_ldu_global_i)
-      Info.memVT = getValueType(DL, I.getType());
-    else if(Intrinsic == Intrinsic::nvvm_ldu_global_p)
-      Info.memVT = getPointerTy(DL);
-    else
-      Info.memVT = getValueType(DL, I.getType());
+    Info.memVT = getValueType(I.getDataLayout(), I.getType());
     Info.ptrVal = I.getArgOperand(0);
     Info.offset = 0;
     Info.flags = MachineMemOperand::MOLoad;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index 27f099e220976..e7f1a4b4c98c4 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -309,8 +309,6 @@ class NVPTXTargetLowering : public TargetLowering {
   SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
 
   SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerLOADi1(SDValue Op, SelectionDAG &DAG) const;
-
   SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/test/CodeGen/Mips/implicit-sret.ll b/llvm/test/CodeGen/Mips/implicit-sret.ll
index c8400abacaf8c..88e5119c5ed5b 100644
--- a/llvm/test/CodeGen/Mips/implicit-sret.ll
+++ b/llvm/test/CodeGen/Mips/implicit-sret.ll
@@ -19,9 +19,7 @@ define internal void @test() unnamed_addr nounwind {
 ; CHECK-NEXT:    ld $6, 24($sp)
 ; CHECK-NEXT:    ld $5, 16($sp)
 ; CHECK-NEXT:    ld $7, 32($sp)
-; CHECK-NEXT:    lw $1, 0($sp)
-; CHECK-NEXT:    # implicit-def: $a0_64
-; CHECK-NEXT:    move $4, $1
+; CHECK-NEXT:    lw $4, 0($sp)
 ; CHECK-NEXT:    jal use_sret
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    ld $ra, 56($sp) # 8-byte Folded Reload
@@ -64,15 +62,9 @@ define internal void @test2() unnamed_addr nounwind {
 ; CHECK-NEXT:    daddiu $4, $sp, 0
 ; CHECK-NEXT:    jal implicit_sret_decl2
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    lw $1, 20($sp)
-; CHECK-NEXT:    lw $2, 12($sp)
-; CHECK-NEXT:    lw $3, 4($sp)
-; CHECK-NEXT:    # implicit-def: $a0_64
-; CHECK-NEXT:    move $4, $3
-; CHECK-NEXT:    # implicit-def: $a1_64
-; CHECK-NEXT:    move $5, $2
-; CHECK-NEXT:    # implicit-def: $a2_64
-; CHECK-NEXT:    move $6, $1
+; CHECK-NEXT:    lw $6, 20($sp)
+; CHECK-NEXT:    lw $5, 12($sp)
+; CHECK-NEXT:    lw $4, 4($sp)
 ; CHECK-NEXT:    jal use_sret2
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/Mips/msa/basic_operations.ll b/llvm/test/CodeGen/Mips/msa/basic_operations.ll
index 4fc3f57aa002d..c3889372b322e 100644
--- a/llvm/test/CodeGen/Mips/msa/basic_operations.ll
+++ b/llvm/test/CodeGen/Mips/msa/basic_operations.ll
@@ -1904,7 +1904,7 @@ define void @insert_v16i8_vidx(i32 signext %a) nounwind {
 ; N64-NEXT:    daddu $1, $1, $25
 ; N64-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(insert_v16i8_vidx)))
 ; N64-NEXT:    ld $2, %got_disp(i32)($1)
-; N64-NEXT:    lw $2, 0($2)
+; N64-NEXT:    lwu $2, 0($2)
 ; N64-NEXT:    andi $2, $2, 15
 ; N64-NEXT:    ld $1, %got_disp(v16i8)($1)
 ; N64-NEXT:    daddu $1, $1, $2
@@ -1953,7 +1953,7 @@ define void @insert_v8i16_vidx(i32 signext %a) nounwind {
 ; N64-NEXT:    daddu $1, $1, $25
 ; N64-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(insert_v8i16_vidx)))
 ; N64-NEXT:    ld $2, %got_disp(i32)($1)
-; N64-NEXT:    lw $2, 0($2)
+; N64-NEXT:    lwu $2, 0($2)
 ; N64-NEXT:    andi $2, $2, 7
 ; N64-NEXT:    ld $1, %got_disp(v8i16)($1)
 ; N64-NEXT:    dlsa $1, $2, $1, 1
@@ -2002,7 +2002,7 @@ define void @insert_v4i32_vidx(i32 signext %a) nounwind {
 ; N64-NEXT:    daddu $1, $1, $25
 ; N64-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(insert_v4i32_vidx)))
 ; N64-NEXT:    ld $2, %got_disp(i32)($1)
-; N64-NEXT:    lw $2, 0($2)
+; N64-NEXT:    lwu $2, 0($2)
 ; N64-NEXT:    andi $2, $2, 3
 ; N64-NEXT:    ld $1, %got_disp(v4i32)($1)
 ; N64-NEXT:    dlsa $1, $2, $1, 2
@@ -2053,7 +2053,7 @@ define void @insert_v2i64_vidx(i64 signext %a) nounwind {
 ; N64-NEXT:    daddu $1, $1, $25
 ; N64-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(insert_v2i64_vidx)))
 ; N64-NEXT:    ld $2, %got_disp(i32)($1)
-; N64-NEXT:    lw $2, 0($2)
+; N64-NEXT:    lwu $2, 0($2)
 ; N64-NEXT:    andi $2, $2, 1
 ; N64-NEXT:    ld $1, %got_disp(v2i64)($1)
 ; N64-NEXT:    dlsa $1, $2, $1, 3
diff --git a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
index bd4c7775354ae..6c4ae1937e158 100644
--- a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
@@ -711,11 +711,11 @@ define <2 x bfloat> @test_copysign(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<6>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %r1, [test_copysign_param_0];
-; CHECK-NEXT:    ld.param.b32 %r2, [test_copysign_param_1];
-; CHECK-NEXT:    and.b32 %r3, %r2, -2147450880;
-; CHECK-NEXT:    and.b32 %r4, %r1, 2147450879;
-; CHECK-NEXT:    or.b32 %r5, %r4, %r3;
+; CHECK-NEXT:    ld.param.b32 %r1, [test_copysign_param_1];
+; CHECK-NEXT:    and.b32 %r2, %r1, -2147450880;
+; CHECK-NEXT:    ld.param.b32 %r3, [test_copysign_param_0];
+; CHECK-NEXT:    and.b32 %r4, %r3, 2147450879;
+; CHECK-NEXT:    or.b32 %r5, %r4, %r2;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r5;
 ; CHECK-NEXT:    ret;
   %r = call <2 x bfloat> @llvm.copysign.f16(<2 x bfloat> %a, <2 x bfloat> %b)
diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll
index 6e480996e7e6a..9717efb960f18 100644
--- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll
+++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll
@@ -7,7 +7,7 @@ define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<18>;
+; SM60-NEXT:    .reg .b32 %r<17>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
@@ -22,23 +22,22 @@ define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8
 ; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
 ; SM60-NEXT:    not.b32 %r2, %r11;
 ; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
-; SM60-NEXT:    and.b32 %r13, %r12, 255;
-; SM60-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM60-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
-; SM60-NEXT:    ld.global.b32 %r14, [%rd1];
-; SM60-NEXT:    and.b32 %r17, %r14, %r2;
+; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
+; SM60-NEXT:    and.b32 %r16, %r13, %r2;
 ; SM60-NEXT:  $L__BB0_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r15, %r17, %r3;
-; SM60-NEXT:    or.b32 %r16, %r17, %r4;
-; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15;
-; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r16;
+; SM60-NEXT:    or.b32 %r14, %r16, %r3;
+; SM60-NEXT:    or.b32 %r15, %r16, %r4;
+; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14;
+; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r15;
 ; SM60-NEXT:    @%p1 bra $L__BB0_3;
 ; SM60-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM60-NEXT:    // in Loop: Header=BB0_1 Depth=1
 ; SM60-NEXT:    and.b32 %r6, %r5, %r2;
-; SM60-NEXT:    setp.ne.b32 %p2, %r17, %r6;
-; SM60-NEXT:    mov.b32 %r17, %r6;
+; SM60-NEXT:    setp.ne.b32 %p2, %r16, %r6;
+; SM60-NEXT:    mov.b32 %r16, %r6;
 ; SM60-NEXT:    @%p2 bra $L__BB0_1;
 ; SM60-NEXT:  $L__BB0_3: // %partword.cmpxchg.end
 ; SM60-NEXT:    st.param.b32 [func_retval0], %r12;
@@ -52,7 +51,7 @@ define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %
 ; SM60:       {
 ; SM60-NEXT:    .reg .pred %p<3>;
 ; SM60-NEXT:    .reg .b16 %rs<2>;
-; SM60-NEXT:    .reg .b32 %r<18>;
+; SM60-NEXT:    .reg .b32 %r<17>;
 ; SM60-NEXT:    .reg .b64 %rd<3>;
 ; SM60-EMPTY:
 ; SM60-NEXT:  // %bb.0:
@@ -67,23 +66,22 @@ define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %
 ; SM60-NEXT:    shl.b32 %r11, %r10, %r1;
 ; SM60-NEXT:    not.b32 %r2, %r11;
 ; SM60-NEXT:    cvt.u32.u16 %r12, %rs1;
-; SM60-NEXT:    and.b32 %r13, %r12, 255;
-; SM60-NEXT:    shl.b32 %r3, %r13, %r1;
+; SM60-NEXT:    shl.b32 %r3, %r12, %r1;
 ; SM60-NEXT:    shl.b32 %r4, %r7, %r1;
-; SM60-NEXT:    ld.global.b32 %r14, [%rd1];
-; SM60-NEXT:    and.b32 %r17, %r14, %r2;
+; SM60-NEXT:    ld.global.b32 %r13, [%rd1];
+; SM60-NEXT:    and.b32 %r16, %r13, %r2;
 ; SM60-NEXT:  $L__BB1_1: // %partword.cmpxchg.loop
 ; SM60-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM60-NEXT:    or.b32 %r15, %r17, %r3;
-; SM60-NEXT:    or.b32 %r16, %r17, %r4;
-; SM60-NEXT:    atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15;
-; SM60-NEXT:    setp.eq.b32 %p1, %r5, %r16;
+; SM60-NEXT:    or.b32 %r14, %r16, %r3;
+; SM60-NEXT:    or.b32 %r15, %r16, %r4;
+; SM60-NEXT:    atom.cta.global...
[truncated]
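
To illustrate the any-extend-to-zext legalization described in the comments above (again a sketch written for this summary, not code from the patch): on NVPTX, i8 arithmetic is promoted to i16, so the load below becomes an any-extend load during type legalization. With this change, LowerLOAD rewrites that any-extend load into an equivalent zext load right away instead of leaving the choice to instruction selection, giving later DAG combines a known zero-extension to work with.

; Illustrative sketch only. i8 ops are promoted to i16 on NVPTX, so this
; load legalizes to an extending load; the custom lowering now turns the
; any-extend form into a zext load early.
define i8 @add_i8(ptr %p) {
  %b = load i8, ptr %p
  %c = add i8 %b, 1
  ret i8 %c
}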

@llvmbot
Member

llvmbot commented Aug 23, 2025

@llvm/pr-subscribers-llvm-selectiondag

@jhuber6
Contributor

Seems fine. Were you able to test the libc stuff locally?

@rupprecht
Collaborator

The JAX tests from before are still passing with this change patched; all good with me too. Thanks!

@AlexMaclean
Member Author

Seems fine. Were you able to test the libc stuff locally?

Yep! I was able to reproduce the regression and confirm that there are no errors with this version of the change.

@AlexMaclean AlexMaclean merged commit 8ab917a into llvm:main Aug 25, 2025
13 checks passed