diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index bb4bb1195f78b..3ee196dac9ca5 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -838,10 +838,11 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); // We have some custom DAG combine patterns for these nodes - setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::EXTRACT_VECTOR_ELT, ISD::FADD, - ISD::MUL, ISD::SHL, ISD::SREM, ISD::UREM, ISD::VSELECT, - ISD::BUILD_VECTOR, ISD::ADDRSPACECAST, ISD::LOAD, - ISD::STORE, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND}); + setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::BITCAST, + ISD::EXTRACT_VECTOR_ELT, ISD::FADD, ISD::MUL, ISD::SHL, + ISD::SREM, ISD::UREM, ISD::VSELECT, ISD::BUILD_VECTOR, + ISD::ADDRSPACECAST, ISD::LOAD, ISD::STORE, + ISD::ZERO_EXTEND, ISD::SIGN_EXTEND}); // setcc for f16x2 and bf16x2 needs special handling to prevent // legalizer's attempt to scalarize it due to v2i1 not being legal. @@ -5201,6 +5202,24 @@ static SDValue PerformFADDCombine(SDNode *N, return PerformFADDCombineWithOperands(N, N1, N0, DCI, OptLevel); } +static SDValue combineBitcast(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { + const SDValue &Input = N->getOperand(0); + const EVT FromVT = Input.getValueType(); + const EVT ToVT = N->getValueType(0); + + if (Input.getOpcode() == ISD::BUILD_VECTOR && ToVT == MVT::v2f32 && + FromVT == MVT::v2i32) { + // Pull in v2i32 build_vector through v2f32 bitcast to avoid legalizing the + // build_vector as bitwise ops. 
+ return DCI.DAG.getBuildVector( + MVT::v2f32, SDLoc(N), + {DCI.DAG.getBitcast(MVT::f32, Input.getOperand(0)), + DCI.DAG.getBitcast(MVT::f32, Input.getOperand(1))}); + } + + return SDValue(); +} + static SDValue PerformREMCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel) { @@ -5872,6 +5891,8 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N, return PerformADDCombine(N, DCI, OptLevel); case ISD::ADDRSPACECAST: return combineADDRSPACECAST(N, DCI); + case ISD::BITCAST: + return combineBitcast(N, DCI); case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: return combineMulWide(N, DCI, OptLevel); diff --git a/llvm/test/CodeGen/NVPTX/f32x2-convert-i32x2.ll b/llvm/test/CodeGen/NVPTX/f32x2-convert-i32x2.ll new file mode 100644 index 0000000000000..2bb1cade466bd --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/f32x2-convert-i32x2.ll @@ -0,0 +1,120 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mcpu=sm_90a -O0 -disable-post-ra -frame-pointer=all \ +; RUN: -verify-machineinstrs | FileCheck --check-prefixes=CHECK,CHECK-SM90A %s +; RUN: %if ptxas-12.7 %{ \ +; RUN: llc < %s -mcpu=sm_90a -O0 -disable-post-ra -frame-pointer=all \ +; RUN: -verify-machineinstrs | %ptxas-verify -arch=sm_90a \ +; RUN: %} +; RUN: llc < %s -mcpu=sm_100 -O0 -disable-post-ra -frame-pointer=all \ +; RUN: -verify-machineinstrs | FileCheck --check-prefixes=CHECK,CHECK-SM100 %s +; RUN: %if ptxas-12.7 %{ \ +; RUN: llc < %s -mcpu=sm_100 -O0 -disable-post-ra -frame-pointer=all \ +; RUN: -verify-machineinstrs | %ptxas-verify -arch=sm_100 \ +; RUN: %} + +; Test that v2i32 -> v2f32 conversions don't emit bitwise operations on i64. + +target triple = "nvptx64-nvidia-cuda" + +declare <2 x i32> @return_i32x2(i32 %0) + +; Test with v2i32. 
+define ptx_kernel void @store_i32x2(i32 %0, ptr %p) { +; CHECK-SM90A-LABEL: store_i32x2( +; CHECK-SM90A: { +; CHECK-SM90A-NEXT: .reg .b32 %r<6>; +; CHECK-SM90A-NEXT: .reg .b64 %rd<2>; +; CHECK-SM90A-EMPTY: +; CHECK-SM90A-NEXT: // %bb.0: +; CHECK-SM90A-NEXT: ld.param.b64 %rd1, [store_i32x2_param_1]; +; CHECK-SM90A-NEXT: ld.param.b32 %r1, [store_i32x2_param_0]; +; CHECK-SM90A-NEXT: { // callseq 0, 0 +; CHECK-SM90A-NEXT: .param .b32 param0; +; CHECK-SM90A-NEXT: .param .align 8 .b8 retval0[8]; +; CHECK-SM90A-NEXT: st.param.b32 [param0], %r1; +; CHECK-SM90A-NEXT: call.uni (retval0), return_i32x2, (param0); +; CHECK-SM90A-NEXT: ld.param.v2.b32 {%r2, %r3}, [retval0]; +; CHECK-SM90A-NEXT: } // callseq 0 +; CHECK-SM90A-NEXT: add.rn.f32 %r4, %r3, %r3; +; CHECK-SM90A-NEXT: add.rn.f32 %r5, %r2, %r2; +; CHECK-SM90A-NEXT: st.v2.b32 [%rd1], {%r5, %r4}; +; CHECK-SM90A-NEXT: ret; +; +; CHECK-SM100-LABEL: store_i32x2( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<4>; +; CHECK-SM100-NEXT: .reg .b64 %rd<4>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.b64 %rd1, [store_i32x2_param_1]; +; CHECK-SM100-NEXT: ld.param.b32 %r1, [store_i32x2_param_0]; +; CHECK-SM100-NEXT: { // callseq 0, 0 +; CHECK-SM100-NEXT: .param .b32 param0; +; CHECK-SM100-NEXT: .param .align 8 .b8 retval0[8]; +; CHECK-SM100-NEXT: st.param.b32 [param0], %r1; +; CHECK-SM100-NEXT: call.uni (retval0), return_i32x2, (param0); +; CHECK-SM100-NEXT: ld.param.v2.b32 {%r2, %r3}, [retval0]; +; CHECK-SM100-NEXT: } // callseq 0 +; CHECK-SM100-NEXT: mov.b64 %rd2, {%r2, %r3}; +; CHECK-SM100-NEXT: add.rn.f32x2 %rd3, %rd2, %rd2; +; CHECK-SM100-NEXT: st.b64 [%rd1], %rd3; +; CHECK-SM100-NEXT: ret; + %v = call <2 x i32> @return_i32x2(i32 %0) + %v.f32x2 = bitcast <2 x i32> %v to <2 x float> + %res = fadd <2 x float> %v.f32x2, %v.f32x2 + store <2 x float> %res, ptr %p, align 8 + ret void +} + +; Test with inline ASM returning { <1 x float>, <1 x float> }, which decays to +; v2i32. 
+define ptx_kernel void @inlineasm(ptr %p) {
+; CHECK-SM90A-LABEL: inlineasm(
+; CHECK-SM90A: {
+; CHECK-SM90A-NEXT: .reg .b32 %r<7>;
+; CHECK-SM90A-NEXT: .reg .b64 %rd<2>;
+; CHECK-SM90A-EMPTY:
+; CHECK-SM90A-NEXT: // %bb.0:
+; CHECK-SM90A-NEXT: ld.param.b64 %rd1, [inlineasm_param_0];
+; CHECK-SM90A-NEXT: mov.b32 %r3, 0;
+; CHECK-SM90A-NEXT: mov.b32 %r4, %r3;
+; CHECK-SM90A-NEXT: mov.b32 %r2, %r4;
+; CHECK-SM90A-NEXT: mov.b32 %r1, %r3;
+; CHECK-SM90A-NEXT: // begin inline asm
+; CHECK-SM90A-NEXT: // nop
+; CHECK-SM90A-NEXT: // end inline asm
+; CHECK-SM90A-NEXT: mul.rn.f32 %r5, %r2, 0f00000000;
+; CHECK-SM90A-NEXT: mul.rn.f32 %r6, %r1, 0f00000000;
+; CHECK-SM90A-NEXT: st.v2.b32 [%rd1], {%r6, %r5};
+; CHECK-SM90A-NEXT: ret;
+;
+; CHECK-SM100-LABEL: inlineasm(
+; CHECK-SM100: {
+; CHECK-SM100-NEXT: .reg .b32 %r<6>;
+; CHECK-SM100-NEXT: .reg .b64 %rd<5>;
+; CHECK-SM100-EMPTY:
+; CHECK-SM100-NEXT: // %bb.0:
+; CHECK-SM100-NEXT: ld.param.b64 %rd1, [inlineasm_param_0];
+; CHECK-SM100-NEXT: mov.b32 %r3, 0;
+; CHECK-SM100-NEXT: mov.b32 %r4, %r3;
+; CHECK-SM100-NEXT: mov.b32 %r2, %r4;
+; CHECK-SM100-NEXT: mov.b32 %r1, %r3;
+; CHECK-SM100-NEXT: // begin inline asm
+; CHECK-SM100-NEXT: // nop
+; CHECK-SM100-NEXT: // end inline asm
+; CHECK-SM100-NEXT: mov.b64 %rd2, {%r1, %r2};
+; CHECK-SM100-NEXT: mov.b32 %r5, 0f00000000;
+; CHECK-SM100-NEXT: mov.b64 %rd3, {%r5, %r5};
+; CHECK-SM100-NEXT: mul.rn.f32x2 %rd4, %rd2, %rd3;
+; CHECK-SM100-NEXT: st.b64 [%rd1], %rd4;
+; CHECK-SM100-NEXT: ret;
+ %r = call { <1 x float>, <1 x float> } asm sideeffect "// nop", "=f,=f,0,1"(<1 x float> zeroinitializer, <1 x float> zeroinitializer)
+ %i0 = extractvalue { <1 x float>, <1 x float> } %r, 0
+ %i1 = extractvalue { <1 x float>, <1 x float> } %r, 1
+ %i4 = shufflevector <1 x float> %i0, <1 x float> %i1, <2 x i32> <i32 0, i32 1>
+ %mul = fmul < 2 x float> %i4, zeroinitializer
+ store <2 x float> %mul, ptr %p, align 8
+ ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CHECK: {{.*}}