From 29a103da18c5fcf66480165b0969700bb4df60f8 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 20 Jan 2025 17:30:34 +0700 Subject: [PATCH] DAG: Fix vector_shuffle -> splat fold defining undef lanes For vector_shuffle splats with undef lanes in the mask, the fold to build_vector was filling those lanes with the splatted value instead of leaving them undef. Make the build_vector operand undef wherever the corresponding mask element is undef. This avoids AMDGPU test regressions in a future change. The output of test/CodeGen/X86/urem-seteq-illegal-types.ll looks worse, but I did not investigate why. --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 10 +- .../AMDGPU/shufflevector.v4i64.v3i64.ll | 787 +++++++++++------- .../CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll | 787 +++++++++++------- .../CodeGen/PowerPC/vector-reduce-fadd.ll | 20 +- llvm/test/CodeGen/WebAssembly/simd.ll | 30 - .../CodeGen/X86/urem-seteq-illegal-types.ll | 6 +- llvm/test/CodeGen/X86/vec_smulo.ll | 24 +- llvm/test/CodeGen/X86/vec_umulo.ll | 10 +- llvm/test/CodeGen/X86/widen_shuffle-1.ll | 5 +- 9 files changed, 1038 insertions(+), 641 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 33f0c8b5555e8..21d5e0a1b2953 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -26386,9 +26386,17 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { if (AllSame) return N0; - // Canonicalize any other splat as a build_vector. + // Canonicalize any other splat as a build_vector, but avoid defining any + // undefined elements in the mask.
SDValue Splatted = V->getOperand(SplatIndex); SmallVector Ops(NumElts, Splatted); + EVT EltVT = Splatted.getValueType(); + + for (unsigned i = 0; i != NumElts; ++i) { + if (SVN->getMaskElt(i) < 0) + Ops[i] = DAG.getUNDEF(EltVT); + } + SDValue NewBV = DAG.getBuildVector(V->getValueType(0), SDLoc(N), Ops); // We may have jumped through bitcasts, so the type of the diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v3i64.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v3i64.ll index b87c969c5bbdf..1851a34d0e560 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v3i64.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v3i64.ll @@ -61,13 +61,10 @@ define void @v_shuffle_v4i64_v3i64__1_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v3i64__1_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 ; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -75,13 +72,10 @@ define void @v_shuffle_v4i64_v3i64__1_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i64_v3i64__1_u_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -89,13 +83,10 @@ define void @v_shuffle_v4i64_v3i64__1_u_u_u(ptr 
addrspace(1) inreg %ptr) { ; GFX940-LABEL: v_shuffle_v4i64_v3i64__1_u_u_u: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 -; GFX940-NEXT: v_mov_b32_e32 v4, v2 -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -113,10 +104,9 @@ define void @v_shuffle_v4i64_v3i64__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -127,10 +117,9 @@ define void @v_shuffle_v4i64_v3i64__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -141,10 +130,9 @@ define void @v_shuffle_v4i64_v3i64__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v6, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v4 -; GFX940-NEXT: 
v_mov_b32_e32 v3, v5 -; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 -; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -168,13 +156,10 @@ define void @v_shuffle_v4i64_v3i64__4_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v3i64__4_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 ; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -182,13 +167,10 @@ define void @v_shuffle_v4i64_v3i64__4_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i64_v3i64__4_u_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -196,13 +178,10 @@ define void @v_shuffle_v4i64_v3i64__4_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX940-LABEL: v_shuffle_v4i64_v3i64__4_u_u_u: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def 
v[0:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 -; GFX940-NEXT: v_mov_b32_e32 v4, v2 -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -221,10 +200,9 @@ define void @v_shuffle_v4i64_v3i64__5_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -235,10 +213,9 @@ define void @v_shuffle_v4i64_v3i64__5_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -249,10 +226,9 @@ define void @v_shuffle_v4i64_v3i64__5_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v6, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v4 -; GFX940-NEXT: v_mov_b32_e32 v3, v5 -; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 -; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; 
GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -544,7 +520,6 @@ define void @v_shuffle_v4i64_v3i64__5_5_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 ; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -558,7 +533,6 @@ define void @v_shuffle_v4i64_v3i64__5_5_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -572,7 +546,6 @@ define void @v_shuffle_v4i64_v3i64__5_5_u_u(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: v_mov_b32_e32 v2, v4 ; GFX940-NEXT: v_mov_b32_e32 v3, v5 -; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -875,9 +848,12 @@ define void @v_shuffle_v4i64_v3i64__5_5_5_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 ; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ 
-889,9 +865,12 @@ define void @v_shuffle_v4i64_v3i64__5_5_5_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -903,9 +882,12 @@ define void @v_shuffle_v4i64_v3i64__5_5_5_u(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 ; GFX940-NEXT: v_mov_b32_e32 v2, v4 ; GFX940-NEXT: v_mov_b32_e32 v3, v5 -; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -2449,7 +2431,7 @@ define void @v_shuffle_v4i64_v3i64__u_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: v_mov_b32_e32 v4, v2 ; GFX900-NEXT: v_mov_b32_e32 v5, v3 ; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2463,7 +2445,7 @@ define void @v_shuffle_v4i64_v3i64__u_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: 
global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2477,7 +2459,7 @@ define void @v_shuffle_v4i64_v3i64__u_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: v_mov_b32_e32 v4, v2 ; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 -; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -2650,7 +2632,7 @@ define void @v_shuffle_v4i64_v3i64__3_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: v_mov_b32_e32 v4, v2 ; GFX900-NEXT: v_mov_b32_e32 v5, v3 ; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2664,7 +2646,7 @@ define void @v_shuffle_v4i64_v3i64__3_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2678,7 +2660,7 @@ define void @v_shuffle_v4i64_v3i64__3_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: v_mov_b32_e32 v4, v2 ; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 -; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -4691,13 +4673,10 @@ define void 
@v_shuffle_v4i64_v3i64__1_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v3i64__1_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 ; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -4705,13 +4684,10 @@ define void @v_shuffle_v4i64_v3i64__1_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i64_v3i64__1_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -4719,13 +4695,10 @@ define void @v_shuffle_v4i64_v3i64__1_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX940-LABEL: v_shuffle_v4i64_v3i64__1_3_3_3: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 -; GFX940-NEXT: v_mov_b32_e32 v4, v2 -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -4743,10 +4716,9 @@ define void @v_shuffle_v4i64_v3i64__2_3_3_3(ptr addrspace(1) inreg %ptr) { 
; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4757,10 +4729,9 @@ define void @v_shuffle_v4i64_v3i64__2_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4771,10 +4742,9 @@ define void @v_shuffle_v4i64_v3i64__2_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v6, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v4 -; GFX940-NEXT: v_mov_b32_e32 v3, v5 -; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 -; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -5609,7 +5579,7 @@ define void @v_shuffle_v4i64_v3i64__u_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: v_mov_b32_e32 v4, v2 ; GFX900-NEXT: v_mov_b32_e32 v5, v3 ; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: 
global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5623,7 +5593,7 @@ define void @v_shuffle_v4i64_v3i64__u_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5637,7 +5607,7 @@ define void @v_shuffle_v4i64_v3i64__u_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: v_mov_b32_e32 v4, v2 ; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 -; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -7046,8 +7016,10 @@ define void @v_shuffle_v4i64_v3i64__5_u_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 ; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7060,8 +7032,10 @@ define void @v_shuffle_v4i64_v3i64__5_u_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 
v6, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7074,8 +7048,10 @@ define void @v_shuffle_v4i64_v3i64__5_u_5_5(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: v_mov_b32_e32 v2, v4 ; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 -; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=v"() @@ -7408,13 +7384,14 @@ define void @v_shuffle_v4i64_v3i64__5_5_u_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4i64_v3i64__5_5_u_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 ; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -7422,13 +7399,14 @@ define void @v_shuffle_v4i64_v3i64__5_5_u_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4i64_v3i64__5_5_u_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: 
v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -7436,13 +7414,14 @@ define void @v_shuffle_v4i64_v3i64__5_5_u_5(ptr addrspace(1) inreg %ptr) { ; GFX940-LABEL: v_shuffle_v4i64_v3i64__5_5_u_5: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 ; GFX940-NEXT: v_mov_b32_e32 v2, v4 ; GFX940-NEXT: v_mov_b32_e32 v3, v5 -; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -7807,22 +7786,44 @@ define void @s_shuffle_v4i64_v3i64__0_u_u_u() { } define void @s_shuffle_v4i64_v3i64__1_u_u_u() { -; GFX9-LABEL: s_shuffle_v4i64_v3i64__1_u_u_u: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: s_mov_b32 s12, s10 -; GFX9-NEXT: s_mov_b32 s13, s11 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_shuffle_v4i64_v3i64__1_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX90A-LABEL: s_shuffle_v4i64_v3i64__1_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__1_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s2 +; GFX940-NEXT: s_mov_b32 s9, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) @@ -7830,59 +7831,99 @@ define void @s_shuffle_v4i64_v3i64__1_u_u_u() { } define void @s_shuffle_v4i64_v3i64__2_u_u_u() { -; GFX9-LABEL: s_shuffle_v4i64_v3i64__2_u_u_u: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: s_mov_b32 s14, s12 -; GFX9-NEXT: s_mov_b32 s15, s13 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v3i64__3_u_u_u() { -; GFX9-LABEL: s_shuffle_v4i64_v3i64__3_u_u_u: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; 
GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x i64> asm "; def $0", "=s"() - %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) - ret void -} - -define void @s_shuffle_v4i64_v3i64__4_u_u_u() { -; GFX9-LABEL: s_shuffle_v4i64_v3i64__4_u_u_u: +; GFX900-LABEL: s_shuffle_v4i64_v3i64__2_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__2_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__2_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s4 +; GFX940-NEXT: s_mov_b32 s9, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__3_u_u_u() { +; GFX9-LABEL: s_shuffle_v4i64_v3i64__3_u_u_u: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: s_mov_b32 s12, s10 -; GFX9-NEXT: 
s_mov_b32 s13, s11 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 -; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:15] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x i64> asm "; def $0", "=s"() + %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) + ret void +} + +define void @s_shuffle_v4i64_v3i64__4_u_u_u() { +; GFX900-LABEL: s_shuffle_v4i64_v3i64__4_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__4_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__4_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s2 +; GFX940-NEXT: s_mov_b32 s9, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> @@ -7891,22 +7932,40 @@ define void @s_shuffle_v4i64_v3i64__4_u_u_u() { } define void @s_shuffle_v4i64_v3i64__5_u_u_u() { -; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_u_u_u: -; GFX9: ; %bb.0: -; GFX9-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: s_mov_b32 s14, s12 -; GFX9-NEXT: s_mov_b32 s15, s13 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s4 +; GFX940-NEXT: s_mov_b32 s9, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> @@ -8150,22 +8209,50 @@ define void @s_shuffle_v4i64_v3i64__5_4_u_u() { } define void @s_shuffle_v4i64_v3i64__5_5_u_u() { -; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_5_u_u: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; 
GFX9-NEXT: s_mov_b32 s9, s13 -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: s_mov_b32 s14, s12 -; GFX9-NEXT: s_mov_b32 s15, s13 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s4 +; GFX940-NEXT: s_mov_b32 s9, s5 +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> @@ -8491,8 +8578,6 @@ define void @s_shuffle_v4i64_v3i64__5_5_5_u() { ; GFX9-NEXT: s_mov_b32 s9, s13 ; GFX9-NEXT: s_mov_b32 s10, s12 ; GFX9-NEXT: s_mov_b32 s11, 
s13 -; GFX9-NEXT: s_mov_b32 s14, s12 -; GFX9-NEXT: s_mov_b32 s15, s13 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:15] ; GFX9-NEXT: ;;#ASMEND @@ -8854,22 +8939,56 @@ define void @s_shuffle_v4i64_v3i64__5_5_5_5() { } define void @s_shuffle_v4i64_v3i64__u_0_0_0() { -; GFX9-LABEL: s_shuffle_v4i64_v3i64__u_0_0_0: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s10, s8 -; GFX9-NEXT: s_mov_b32 s11, s9 -; GFX9-NEXT: s_mov_b32 s12, s8 -; GFX9-NEXT: s_mov_b32 s13, s9 -; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_mov_b32 s15, s9 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_shuffle_v4i64_v3i64__u_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__u_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__u_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; 
GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: s_mov_b32 s12, s0 +; GFX940-NEXT: s_mov_b32 s13, s1 +; GFX940-NEXT: s_mov_b32 s14, s0 +; GFX940-NEXT: s_mov_b32 s15, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) @@ -9022,22 +9141,56 @@ define void @s_shuffle_v4i64_v3i64__2_0_0_0() { } define void @s_shuffle_v4i64_v3i64__3_0_0_0() { -; GFX9-LABEL: s_shuffle_v4i64_v3i64__3_0_0_0: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s10, s8 -; GFX9-NEXT: s_mov_b32 s11, s9 -; GFX9-NEXT: s_mov_b32 s12, s8 -; GFX9-NEXT: s_mov_b32 s13, s9 -; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_mov_b32 s15, s9 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_shuffle_v4i64_v3i64__3_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__3_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; 
GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__3_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: s_mov_b32 s12, s0 +; GFX940-NEXT: s_mov_b32 s13, s1 +; GFX940-NEXT: s_mov_b32 s14, s0 +; GFX940-NEXT: s_mov_b32 s15, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) @@ -9964,8 +10117,6 @@ define void @s_shuffle_v4i64_v3i64__u_1_1_1() { ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s[8:13] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 ; GFX9-NEXT: s_mov_b32 s12, s10 ; GFX9-NEXT: s_mov_b32 s13, s11 ; GFX9-NEXT: s_mov_b32 s14, s10 @@ -10054,8 +10205,6 @@ define void @s_shuffle_v4i64_v3i64__3_1_1_1() { ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s[8:13] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 ; GFX9-NEXT: s_mov_b32 s12, s10 ; GFX9-NEXT: s_mov_b32 s13, s11 ; GFX9-NEXT: s_mov_b32 s14, s10 @@ -10968,8 +11117,6 @@ define void @s_shuffle_v4i64_v3i64__u_2_2_2() { ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s[8:13] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 ; GFX9-NEXT: s_mov_b32 s10, s12 ; GFX9-NEXT: s_mov_b32 s11, s13 ; GFX9-NEXT: s_mov_b32 s14, s12 @@ -11058,8 +11205,6 @@ define void @s_shuffle_v4i64_v3i64__3_2_2_2() { ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: 
; def s[8:13] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 ; GFX9-NEXT: s_mov_b32 s10, s12 ; GFX9-NEXT: s_mov_b32 s11, s13 ; GFX9-NEXT: s_mov_b32 s14, s12 @@ -11992,22 +12137,44 @@ define void @s_shuffle_v4i64_v3i64__0_3_3_3() { } define void @s_shuffle_v4i64_v3i64__1_3_3_3() { -; GFX9-LABEL: s_shuffle_v4i64_v3i64__1_3_3_3: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: s_mov_b32 s12, s10 -; GFX9-NEXT: s_mov_b32 s13, s11 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_shuffle_v4i64_v3i64__1_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__1_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__1_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s2 +; GFX940-NEXT: s_mov_b32 s9, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: 
s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) @@ -12015,22 +12182,40 @@ define void @s_shuffle_v4i64_v3i64__1_3_3_3() { } define void @s_shuffle_v4i64_v3i64__2_3_3_3() { -; GFX9-LABEL: s_shuffle_v4i64_v3i64__2_3_3_3: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: s_mov_b32 s14, s12 -; GFX9-NEXT: s_mov_b32 s15, s13 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_shuffle_v4i64_v3i64__2_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__2_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__2_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s4 +; GFX940-NEXT: s_mov_b32 s9, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %shuf = shufflevector <3 x i64> %vec0, <3 x i64> poison, <4 x 
i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x i64> %shuf) @@ -12898,8 +13083,6 @@ define void @s_shuffle_v4i64_v3i64__u_4_4_4() { ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s[8:13] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 ; GFX9-NEXT: s_mov_b32 s12, s10 ; GFX9-NEXT: s_mov_b32 s13, s11 ; GFX9-NEXT: s_mov_b32 s14, s10 @@ -13871,8 +14054,6 @@ define void @s_shuffle_v4i64_v3i64__u_5_5_5() { ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s[8:13] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 ; GFX9-NEXT: s_mov_b32 s10, s12 ; GFX9-NEXT: s_mov_b32 s11, s13 ; GFX9-NEXT: s_mov_b32 s14, s12 @@ -14144,8 +14325,6 @@ define void @s_shuffle_v4i64_v3i64__5_u_5_5() { ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_mov_b32 s8, s12 ; GFX9-NEXT: s_mov_b32 s9, s13 -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 ; GFX9-NEXT: s_mov_b32 s14, s12 ; GFX9-NEXT: s_mov_b32 s15, s13 ; GFX9-NEXT: ;;#ASMSTART @@ -14447,22 +14626,56 @@ define void @s_shuffle_v4i64_v3i64__5_4_5_5() { } define void @s_shuffle_v4i64_v3i64__5_5_u_5() { -; GFX9-LABEL: s_shuffle_v4i64_v3i64__5_5_u_5: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: s_mov_b32 s14, s12 -; GFX9-NEXT: s_mov_b32 s15, s13 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_shuffle_v4i64_v3i64__5_5_u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 
s14, s12 +; GFX900-NEXT: s_mov_b32 s15, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4i64_v3i64__5_5_u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s14, s12 +; GFX90A-NEXT: s_mov_b32 s15, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4i64_v3i64__5_5_u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s4 +; GFX940-NEXT: s_mov_b32 s9, s5 +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: s_mov_b32 s14, s4 +; GFX940-NEXT: s_mov_b32 s15, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i64> asm "; def $0", "=s"() %vec1 = call <3 x i64> asm "; def $0", "=s"() %shuf = shufflevector <3 x i64> %vec0, <3 x i64> %vec1, <4 x i32> diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll index 2b46616c87f0d..7a509ffb8c159 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v3p0.ll @@ -61,13 +61,10 @@ define void @v_shuffle_v4p0_v3p0__1_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v3p0__1_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: 
;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 ; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -75,13 +72,10 @@ define void @v_shuffle_v4p0_v3p0__1_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p0_v3p0__1_u_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -89,13 +83,10 @@ define void @v_shuffle_v4p0_v3p0__1_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX940-LABEL: v_shuffle_v4p0_v3p0__1_u_u_u: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 -; GFX940-NEXT: v_mov_b32_e32 v4, v2 -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -113,10 +104,9 @@ define void @v_shuffle_v4p0_v3p0__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: 
v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -127,10 +117,9 @@ define void @v_shuffle_v4p0_v3p0__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -141,10 +130,9 @@ define void @v_shuffle_v4p0_v3p0__2_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v6, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v4 -; GFX940-NEXT: v_mov_b32_e32 v3, v5 -; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 -; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -168,13 +156,10 @@ define void @v_shuffle_v4p0_v3p0__4_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v3p0__4_u_u_u: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 ; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; 
GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -182,13 +167,10 @@ define void @v_shuffle_v4p0_v3p0__4_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p0_v3p0__4_u_u_u: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -196,13 +178,10 @@ define void @v_shuffle_v4p0_v3p0__4_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX940-LABEL: v_shuffle_v4p0_v3p0__4_u_u_u: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 -; GFX940-NEXT: v_mov_b32_e32 v4, v2 -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -221,10 +200,9 @@ define void @v_shuffle_v4p0_v3p0__5_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -235,10 +213,9 @@ define void 
@v_shuffle_v4p0_v3p0__5_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -249,10 +226,9 @@ define void @v_shuffle_v4p0_v3p0__5_u_u_u(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v6, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v4 -; GFX940-NEXT: v_mov_b32_e32 v3, v5 -; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 -; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -544,7 +520,6 @@ define void @v_shuffle_v4p0_v3p0__5_5_u_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 ; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -558,7 +533,6 @@ define void @v_shuffle_v4p0_v3p0__5_5_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ 
-572,7 +546,6 @@ define void @v_shuffle_v4p0_v3p0__5_5_u_u(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: v_mov_b32_e32 v2, v4 ; GFX940-NEXT: v_mov_b32_e32 v3, v5 -; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -875,9 +848,12 @@ define void @v_shuffle_v4p0_v3p0__5_5_5_u(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 ; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -889,9 +865,12 @@ define void @v_shuffle_v4p0_v3p0__5_5_5_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -903,9 +882,12 @@ define void @v_shuffle_v4p0_v3p0__5_5_5_u(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 ; 
GFX940-NEXT: v_mov_b32_e32 v2, v4 ; GFX940-NEXT: v_mov_b32_e32 v3, v5 -; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -2449,7 +2431,7 @@ define void @v_shuffle_v4p0_v3p0__u_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: v_mov_b32_e32 v4, v2 ; GFX900-NEXT: v_mov_b32_e32 v5, v3 ; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2463,7 +2445,7 @@ define void @v_shuffle_v4p0_v3p0__u_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2477,7 +2459,7 @@ define void @v_shuffle_v4p0_v3p0__u_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: v_mov_b32_e32 v4, v2 ; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 -; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -2650,7 +2632,7 @@ define void @v_shuffle_v4p0_v3p0__3_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: v_mov_b32_e32 v4, v2 ; GFX900-NEXT: v_mov_b32_e32 v5, v3 ; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; 
GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -2664,7 +2646,7 @@ define void @v_shuffle_v4p0_v3p0__3_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2678,7 +2660,7 @@ define void @v_shuffle_v4p0_v3p0__3_1_1_1(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: v_mov_b32_e32 v4, v2 ; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 -; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -4691,13 +4673,10 @@ define void @v_shuffle_v4p0_v3p0__1_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v3p0__1_3_3_3: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v4, v2 -; GFX900-NEXT: v_mov_b32_e32 v5, v3 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 ; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -4705,13 +4684,10 @@ define void @v_shuffle_v4p0_v3p0__1_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p0_v3p0__1_3_3_3: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, v2 -; 
GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -4719,13 +4695,10 @@ define void @v_shuffle_v4p0_v3p0__1_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX940-LABEL: v_shuffle_v4p0_v3p0__1_3_3_3: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 -; GFX940-NEXT: v_mov_b32_e32 v4, v2 -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -4743,10 +4716,9 @@ define void @v_shuffle_v4p0_v3p0__2_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: ; def v[0:5] ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_mov_b32_e32 v6, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, v4 -; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -4757,10 +4729,9 @@ define void @v_shuffle_v4p0_v3p0__2_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v4 -; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: 
s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -4771,10 +4742,9 @@ define void @v_shuffle_v4p0_v3p0__2_3_3_3(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v6, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v4 -; GFX940-NEXT: v_mov_b32_e32 v3, v5 -; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 -; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -5609,7 +5579,7 @@ define void @v_shuffle_v4p0_v3p0__u_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: v_mov_b32_e32 v4, v2 ; GFX900-NEXT: v_mov_b32_e32 v5, v3 ; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -5623,7 +5593,7 @@ define void @v_shuffle_v4p0_v3p0__u_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: v_mov_b32_e32 v4, v2 ; GFX90A-NEXT: v_mov_b32_e32 v5, v3 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -5637,7 +5607,7 @@ define void @v_shuffle_v4p0_v3p0__u_4_4_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: v_mov_b32_e32 v4, v2 ; GFX940-NEXT: v_mov_b32_e32 v5, v3 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 -; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = 
call <3 x ptr> asm "; def $0", "=v"() @@ -7046,8 +7016,10 @@ define void @v_shuffle_v4p0_v3p0__5_u_5_5(ptr addrspace(1) inreg %ptr) { ; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: v_mov_b32_e32 v3, v5 +; GFX900-NEXT: v_mov_b32_e32 v0, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, v5 ; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX900-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; @@ -7060,8 +7032,10 @@ define void @v_shuffle_v4p0_v3p0__5_u_5_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 +; GFX90A-NEXT: v_mov_b32_e32 v0, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7074,8 +7048,10 @@ define void @v_shuffle_v4p0_v3p0__5_u_5_5(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: v_mov_b32_e32 v2, v4 ; GFX940-NEXT: v_mov_b32_e32 v3, v5 +; GFX940-NEXT: v_mov_b32_e32 v0, v4 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 -; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=v"() @@ -7408,13 +7384,14 @@ define void @v_shuffle_v4p0_v3p0__5_5_u_5(ptr addrspace(1) inreg %ptr) { ; GFX900-LABEL: v_shuffle_v4p0_v3p0__5_5_u_5: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v6, 0 ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def 
v[0:5] ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_mov_b32_e32 v6, 0 +; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v4 ; GFX900-NEXT: v_mov_b32_e32 v3, v5 -; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 ; GFX900-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -7422,13 +7399,14 @@ define void @v_shuffle_v4p0_v3p0__5_5_u_5(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4p0_v3p0__5_5_u_5: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 +; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v5 -; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v6, v[2:5], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -7436,13 +7414,14 @@ define void @v_shuffle_v4p0_v3p0__5_5_u_5(ptr addrspace(1) inreg %ptr) { ; GFX940-LABEL: v_shuffle_v4p0_v3p0__5_5_u_5: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v6, 0 +; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 +; GFX940-NEXT: s_nop 1 ; GFX940-NEXT: v_mov_b32_e32 v2, v4 ; GFX940-NEXT: v_mov_b32_e32 v3, v5 -; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] offset:16 sc0 sc1 ; GFX940-NEXT: global_store_dwordx4 v6, v[2:5], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -7807,22 +7786,44 @@ define void @s_shuffle_v4p0_v3p0__0_u_u_u() { } define void 
@s_shuffle_v4p0_v3p0__1_u_u_u() { -; GFX9-LABEL: s_shuffle_v4p0_v3p0__1_u_u_u: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: s_mov_b32 s12, s10 -; GFX9-NEXT: s_mov_b32 s13, s11 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_shuffle_v4p0_v3p0__1_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__1_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__1_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s2 +; GFX940-NEXT: s_mov_b32 s9, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) @@ -7830,59 +7831,99 @@ define void @s_shuffle_v4p0_v3p0__1_u_u_u() { } define void 
@s_shuffle_v4p0_v3p0__2_u_u_u() { -; GFX9-LABEL: s_shuffle_v4p0_v3p0__2_u_u_u: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: s_mov_b32 s14, s12 -; GFX9-NEXT: s_mov_b32 s15, s13 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v3p0__3_u_u_u() { -; GFX9-LABEL: s_shuffle_v4p0_v3p0__3_u_u_u: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] - %vec0 = call <3 x ptr> asm "; def $0", "=s"() - %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> - call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) - ret void -} - -define void @s_shuffle_v4p0_v3p0__4_u_u_u() { -; GFX9-LABEL: s_shuffle_v4p0_v3p0__4_u_u_u: +; GFX900-LABEL: s_shuffle_v4p0_v3p0__2_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__2_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: 
s_shuffle_v4p0_v3p0__2_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s4 +; GFX940-NEXT: s_mov_b32 s9, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__3_u_u_u() { +; GFX9-LABEL: s_shuffle_v4p0_v3p0__3_u_u_u: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: s_mov_b32 s12, s10 -; GFX9-NEXT: s_mov_b32 s13, s11 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 -; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:15] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec0 = call <3 x ptr> asm "; def $0", "=s"() + %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> + call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) + ret void +} + +define void @s_shuffle_v4p0_v3p0__4_u_u_u() { +; GFX900-LABEL: s_shuffle_v4p0_v3p0__4_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__4_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 
s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__4_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s2 +; GFX940-NEXT: s_mov_b32 s9, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> @@ -7891,22 +7932,40 @@ define void @s_shuffle_v4p0_v3p0__4_u_u_u() { } define void @s_shuffle_v4p0_v3p0__5_u_u_u() { -; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_u_u_u: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: s_mov_b32 s14, s12 -; GFX9-NEXT: s_mov_b32 s15, s13 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_u_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_u_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX940-LABEL: s_shuffle_v4p0_v3p0__5_u_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s4 +; GFX940-NEXT: s_mov_b32 s9, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> @@ -8150,22 +8209,50 @@ define void @s_shuffle_v4p0_v3p0__5_4_u_u() { } define void @s_shuffle_v4p0_v3p0__5_5_u_u() { -; GFX9-LABEL: s_shuffle_v4p0_v3p0__5_5_u_u: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: s_mov_b32 s14, s12 -; GFX9-NEXT: s_mov_b32 s15, s13 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_u_u: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_u_u: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_5_u_u: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s4 +; GFX940-NEXT: s_mov_b32 s9, s5 +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> @@ -8491,8 +8578,6 @@ define void @s_shuffle_v4p0_v3p0__5_5_5_u() { ; GFX9-NEXT: s_mov_b32 s9, s13 ; GFX9-NEXT: s_mov_b32 s10, s12 ; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: s_mov_b32 s14, s12 -; GFX9-NEXT: s_mov_b32 s15, s13 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s[8:15] ; GFX9-NEXT: ;;#ASMEND @@ -8854,22 +8939,56 @@ define void @s_shuffle_v4p0_v3p0__5_5_5_5() { } define void @s_shuffle_v4p0_v3p0__u_0_0_0() { -; GFX9-LABEL: s_shuffle_v4p0_v3p0__u_0_0_0: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s10, s8 -; GFX9-NEXT: s_mov_b32 s11, s9 -; GFX9-NEXT: s_mov_b32 s12, s8 -; GFX9-NEXT: s_mov_b32 s13, s9 -; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_mov_b32 s15, s9 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_shuffle_v4p0_v3p0__u_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; 
GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__u_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__u_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: s_mov_b32 s12, s0 +; GFX940-NEXT: s_mov_b32 s13, s1 +; GFX940-NEXT: s_mov_b32 s14, s0 +; GFX940-NEXT: s_mov_b32 s15, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) @@ -9022,22 +9141,56 @@ define void @s_shuffle_v4p0_v3p0__2_0_0_0() { } define void @s_shuffle_v4p0_v3p0__3_0_0_0() { -; GFX9-LABEL: s_shuffle_v4p0_v3p0__3_0_0_0: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s10, s8 -; GFX9-NEXT: s_mov_b32 s11, s9 -; GFX9-NEXT: s_mov_b32 s12, s8 -; GFX9-NEXT: s_mov_b32 s13, s9 -; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_mov_b32 s15, s9 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; 
GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_shuffle_v4p0_v3p0__3_0_0_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s10, s4 +; GFX900-NEXT: s_mov_b32 s11, s5 +; GFX900-NEXT: s_mov_b32 s12, s4 +; GFX900-NEXT: s_mov_b32 s13, s5 +; GFX900-NEXT: s_mov_b32 s14, s4 +; GFX900-NEXT: s_mov_b32 s15, s5 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__3_0_0_0: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s10, s4 +; GFX90A-NEXT: s_mov_b32 s11, s5 +; GFX90A-NEXT: s_mov_b32 s12, s4 +; GFX90A-NEXT: s_mov_b32 s13, s5 +; GFX90A-NEXT: s_mov_b32 s14, s4 +; GFX90A-NEXT: s_mov_b32 s15, s5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__3_0_0_0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s10, s0 +; GFX940-NEXT: s_mov_b32 s11, s1 +; GFX940-NEXT: s_mov_b32 s12, s0 +; GFX940-NEXT: s_mov_b32 s13, s1 +; GFX940-NEXT: s_mov_b32 s14, s0 +; GFX940-NEXT: s_mov_b32 s15, s1 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) @@ -9964,8 +10117,6 @@ define void @s_shuffle_v4p0_v3p0__u_1_1_1() { ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s[8:13] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: 
s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 ; GFX9-NEXT: s_mov_b32 s12, s10 ; GFX9-NEXT: s_mov_b32 s13, s11 ; GFX9-NEXT: s_mov_b32 s14, s10 @@ -10054,8 +10205,6 @@ define void @s_shuffle_v4p0_v3p0__3_1_1_1() { ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s[8:13] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 ; GFX9-NEXT: s_mov_b32 s12, s10 ; GFX9-NEXT: s_mov_b32 s13, s11 ; GFX9-NEXT: s_mov_b32 s14, s10 @@ -10968,8 +11117,6 @@ define void @s_shuffle_v4p0_v3p0__u_2_2_2() { ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s[8:13] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 ; GFX9-NEXT: s_mov_b32 s10, s12 ; GFX9-NEXT: s_mov_b32 s11, s13 ; GFX9-NEXT: s_mov_b32 s14, s12 @@ -11058,8 +11205,6 @@ define void @s_shuffle_v4p0_v3p0__3_2_2_2() { ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s[8:13] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 ; GFX9-NEXT: s_mov_b32 s10, s12 ; GFX9-NEXT: s_mov_b32 s11, s13 ; GFX9-NEXT: s_mov_b32 s14, s12 @@ -11992,22 +12137,44 @@ define void @s_shuffle_v4p0_v3p0__0_3_3_3() { } define void @s_shuffle_v4p0_v3p0__1_3_3_3() { -; GFX9-LABEL: s_shuffle_v4p0_v3p0__1_3_3_3: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 -; GFX9-NEXT: s_mov_b32 s12, s10 -; GFX9-NEXT: s_mov_b32 s13, s11 -; GFX9-NEXT: s_mov_b32 s14, s10 -; GFX9-NEXT: s_mov_b32 s15, s11 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_shuffle_v4p0_v3p0__1_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s6 +; GFX900-NEXT: s_mov_b32 s9, s7 +; GFX900-NEXT: ;;#ASMSTART +; 
GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__1_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s6 +; GFX90A-NEXT: s_mov_b32 s9, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__1_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s2 +; GFX940-NEXT: s_mov_b32 s9, s3 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) @@ -12015,22 +12182,40 @@ define void @s_shuffle_v4p0_v3p0__1_3_3_3() { } define void @s_shuffle_v4p0_v3p0__2_3_3_3() { -; GFX9-LABEL: s_shuffle_v4p0_v3p0__2_3_3_3: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: s_mov_b32 s14, s12 -; GFX9-NEXT: s_mov_b32 s15, s13 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_shuffle_v4p0_v3p0__2_3_3_3: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[4:9] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__2_3_3_3: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[4:9] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__2_3_3_3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s4 +; GFX940-NEXT: s_mov_b32 s9, s5 +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> poison, <4 x i32> call void asm sideeffect "; use $0", "{s[8:15]}"(<4 x ptr> %shuf) @@ -12898,8 +13083,6 @@ define void @s_shuffle_v4p0_v3p0__u_4_4_4() { ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s[8:13] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s10 -; GFX9-NEXT: s_mov_b32 s9, s11 ; GFX9-NEXT: s_mov_b32 s12, s10 ; GFX9-NEXT: s_mov_b32 s13, s11 ; GFX9-NEXT: s_mov_b32 s14, s10 @@ -13871,8 +14054,6 @@ define void @s_shuffle_v4p0_v3p0__u_5_5_5() { ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s[8:13] ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 ; GFX9-NEXT: s_mov_b32 s10, s12 ; GFX9-NEXT: s_mov_b32 s11, s13 ; GFX9-NEXT: s_mov_b32 s14, s12 @@ -14144,8 +14325,6 @@ define void @s_shuffle_v4p0_v3p0__5_u_5_5() { ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_mov_b32 s8, s12 ; GFX9-NEXT: s_mov_b32 s9, s13 -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 ; GFX9-NEXT: s_mov_b32 s14, s12 ; GFX9-NEXT: s_mov_b32 s15, s13 ; GFX9-NEXT: ;;#ASMSTART @@ -14447,22 +14626,56 @@ define void @s_shuffle_v4p0_v3p0__5_4_5_5() { } define void @s_shuffle_v4p0_v3p0__5_5_u_5() { -; GFX9-LABEL: 
s_shuffle_v4p0_v3p0__5_5_u_5: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; def s[8:13] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_mov_b32 s8, s12 -; GFX9-NEXT: s_mov_b32 s9, s13 -; GFX9-NEXT: s_mov_b32 s10, s12 -; GFX9-NEXT: s_mov_b32 s11, s13 -; GFX9-NEXT: s_mov_b32 s14, s12 -; GFX9-NEXT: s_mov_b32 s15, s13 -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[8:15] -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: s_shuffle_v4p0_v3p0__5_5_u_5: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; def s[8:13] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s8, s12 +; GFX900-NEXT: s_mov_b32 s9, s13 +; GFX900-NEXT: s_mov_b32 s10, s12 +; GFX900-NEXT: s_mov_b32 s11, s13 +; GFX900-NEXT: s_mov_b32 s14, s12 +; GFX900-NEXT: s_mov_b32 s15, s13 +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use s[8:15] +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: s_shuffle_v4p0_v3p0__5_5_u_5: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def s[8:13] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b32 s8, s12 +; GFX90A-NEXT: s_mov_b32 s9, s13 +; GFX90A-NEXT: s_mov_b32 s10, s12 +; GFX90A-NEXT: s_mov_b32 s11, s13 +; GFX90A-NEXT: s_mov_b32 s14, s12 +; GFX90A-NEXT: s_mov_b32 s15, s13 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use s[8:15] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: s_shuffle_v4p0_v3p0__5_5_u_5: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ;;#ASMSTART +; GFX940-NEXT: ; def s[0:5] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_mov_b32 s8, s4 +; GFX940-NEXT: s_mov_b32 s9, s5 +; GFX940-NEXT: s_mov_b32 s10, s4 +; GFX940-NEXT: s_mov_b32 s11, s5 +; GFX940-NEXT: s_mov_b32 s14, s4 +; GFX940-NEXT: s_mov_b32 s15, s5 +; GFX940-NEXT: 
;;#ASMSTART +; GFX940-NEXT: ; use s[8:15] +; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr> asm "; def $0", "=s"() %vec1 = call <3 x ptr> asm "; def $0", "=s"() %shuf = shufflevector <3 x ptr> %vec0, <3 x ptr> %vec1, <4 x i32> diff --git a/llvm/test/CodeGen/PowerPC/vector-reduce-fadd.ll b/llvm/test/CodeGen/PowerPC/vector-reduce-fadd.ll index 4a036a7868c1a..95ff0d9a3a9c6 100644 --- a/llvm/test/CodeGen/PowerPC/vector-reduce-fadd.ll +++ b/llvm/test/CodeGen/PowerPC/vector-reduce-fadd.ll @@ -3628,15 +3628,15 @@ define dso_local ppc_fp128 @v2ppcf128_fast(<2 x ppc_fp128> %a) local_unnamed_add ; PWR9LE-LABEL: v2ppcf128_fast: ; PWR9LE: # %bb.0: # %entry ; PWR9LE-NEXT: mflr r0 -; PWR9LE-NEXT: stdu r1, -64(r1) -; PWR9LE-NEXT: std r0, 80(r1) +; PWR9LE-NEXT: stdu r1, -48(r1) +; PWR9LE-NEXT: std r0, 64(r1) ; PWR9LE-NEXT: bl __gcc_qadd ; PWR9LE-NEXT: nop ; PWR9LE-NEXT: stfd f2, 40(r1) ; PWR9LE-NEXT: stfd f1, 32(r1) ; PWR9LE-NEXT: lxv vs1, 32(r1) ; PWR9LE-NEXT: xxswapd vs2, vs1 -; PWR9LE-NEXT: addi r1, r1, 64 +; PWR9LE-NEXT: addi r1, r1, 48 ; PWR9LE-NEXT: ld r0, 16(r1) ; PWR9LE-NEXT: mtlr r0 ; PWR9LE-NEXT: blr @@ -3644,15 +3644,15 @@ define dso_local ppc_fp128 @v2ppcf128_fast(<2 x ppc_fp128> %a) local_unnamed_add ; PWR9BE-LABEL: v2ppcf128_fast: ; PWR9BE: # %bb.0: # %entry ; PWR9BE-NEXT: mflr r0 -; PWR9BE-NEXT: stdu r1, -144(r1) -; PWR9BE-NEXT: std r0, 160(r1) +; PWR9BE-NEXT: stdu r1, -128(r1) +; PWR9BE-NEXT: std r0, 144(r1) ; PWR9BE-NEXT: bl __gcc_qadd ; PWR9BE-NEXT: nop ; PWR9BE-NEXT: stfd f2, 120(r1) ; PWR9BE-NEXT: stfd f1, 112(r1) ; PWR9BE-NEXT: lxv vs1, 112(r1) ; PWR9BE-NEXT: xxswapd vs2, vs1 -; PWR9BE-NEXT: addi r1, r1, 144 +; PWR9BE-NEXT: addi r1, r1, 128 ; PWR9BE-NEXT: ld r0, 16(r1) ; PWR9BE-NEXT: mtlr r0 ; PWR9BE-NEXT: blr @@ -3661,13 +3661,13 @@ define dso_local ppc_fp128 @v2ppcf128_fast(<2 x ppc_fp128> %a) local_unnamed_add ; PWR10LE: # %bb.0: # %entry ; PWR10LE-NEXT: mflr r0 ; PWR10LE-NEXT: std r0, 16(r1) -; PWR10LE-NEXT: stdu r1, 
-64(r1) +; PWR10LE-NEXT: stdu r1, -48(r1) ; PWR10LE-NEXT: bl __gcc_qadd@notoc ; PWR10LE-NEXT: stfd f2, 40(r1) ; PWR10LE-NEXT: stfd f1, 32(r1) ; PWR10LE-NEXT: lxv vs1, 32(r1) ; PWR10LE-NEXT: xxswapd vs2, vs1 -; PWR10LE-NEXT: addi r1, r1, 64 +; PWR10LE-NEXT: addi r1, r1, 48 ; PWR10LE-NEXT: ld r0, 16(r1) ; PWR10LE-NEXT: mtlr r0 ; PWR10LE-NEXT: blr @@ -3676,14 +3676,14 @@ define dso_local ppc_fp128 @v2ppcf128_fast(<2 x ppc_fp128> %a) local_unnamed_add ; PWR10BE: # %bb.0: # %entry ; PWR10BE-NEXT: mflr r0 ; PWR10BE-NEXT: std r0, 16(r1) -; PWR10BE-NEXT: stdu r1, -144(r1) +; PWR10BE-NEXT: stdu r1, -128(r1) ; PWR10BE-NEXT: bl __gcc_qadd ; PWR10BE-NEXT: nop ; PWR10BE-NEXT: stfd f2, 120(r1) ; PWR10BE-NEXT: stfd f1, 112(r1) ; PWR10BE-NEXT: lxv vs1, 112(r1) ; PWR10BE-NEXT: xxswapd vs2, vs1 -; PWR10BE-NEXT: addi r1, r1, 144 +; PWR10BE-NEXT: addi r1, r1, 128 ; PWR10BE-NEXT: ld r0, 16(r1) ; PWR10BE-NEXT: mtlr r0 ; PWR10BE-NEXT: blr diff --git a/llvm/test/CodeGen/WebAssembly/simd.ll b/llvm/test/CodeGen/WebAssembly/simd.ll index 5ec9f6a2a321b..7228d5335a33f 100644 --- a/llvm/test/CodeGen/WebAssembly/simd.ll +++ b/llvm/test/CodeGen/WebAssembly/simd.ll @@ -481,21 +481,6 @@ define <16 x i8> @shuffle_undef_v16i8(<16 x i8> %x, <16 x i8> %y) { ; NO-SIMD128-LABEL: shuffle_undef_v16i8: ; NO-SIMD128: .functype shuffle_undef_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.store8 15($0), $2 -; NO-SIMD128-NEXT: i32.store8 14($0), $2 -; NO-SIMD128-NEXT: i32.store8 13($0), $2 -; NO-SIMD128-NEXT: i32.store8 12($0), $2 -; NO-SIMD128-NEXT: i32.store8 11($0), $2 -; NO-SIMD128-NEXT: i32.store8 10($0), $2 -; NO-SIMD128-NEXT: i32.store8 9($0), $2 -; NO-SIMD128-NEXT: i32.store8 8($0), $2 -; NO-SIMD128-NEXT: i32.store8 7($0), $2 -; NO-SIMD128-NEXT: i32.store8 6($0), $2 -; NO-SIMD128-NEXT: i32.store8 5($0), $2 -; 
NO-SIMD128-NEXT: i32.store8 4($0), $2 -; NO-SIMD128-NEXT: i32.store8 3($0), $2 -; NO-SIMD128-NEXT: i32.store8 2($0), $2 -; NO-SIMD128-NEXT: i32.store8 1($0), $2 ; NO-SIMD128-NEXT: i32.store8 0($0), $2 ; NO-SIMD128-NEXT: return %res = shufflevector <16 x i8> %x, <16 x i8> %y, @@ -994,13 +979,6 @@ define <8 x i16> @shuffle_undef_v8i16(<8 x i16> %x, <8 x i16> %y) { ; NO-SIMD128-LABEL: shuffle_undef_v8i16: ; NO-SIMD128: .functype shuffle_undef_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.store16 14($0), $2 -; NO-SIMD128-NEXT: i32.store16 12($0), $2 -; NO-SIMD128-NEXT: i32.store16 10($0), $2 -; NO-SIMD128-NEXT: i32.store16 8($0), $2 -; NO-SIMD128-NEXT: i32.store16 6($0), $2 -; NO-SIMD128-NEXT: i32.store16 4($0), $2 -; NO-SIMD128-NEXT: i32.store16 2($0), $2 ; NO-SIMD128-NEXT: i32.store16 0($0), $2 ; NO-SIMD128-NEXT: return %res = shufflevector <8 x i16> %x, <8 x i16> %y, @@ -1288,9 +1266,6 @@ define <4 x i32> @shuffle_undef_v4i32(<4 x i32> %x, <4 x i32> %y) { ; NO-SIMD128-LABEL: shuffle_undef_v4i32: ; NO-SIMD128: .functype shuffle_undef_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.store 12($0), $2 -; NO-SIMD128-NEXT: i32.store 8($0), $2 -; NO-SIMD128-NEXT: i32.store 4($0), $2 ; NO-SIMD128-NEXT: i32.store 0($0), $2 ; NO-SIMD128-NEXT: return %res = shufflevector <4 x i32> %x, <4 x i32> %y, @@ -1550,7 +1525,6 @@ define <2 x i64> @shuffle_undef_v2i64(<2 x i64> %x, <2 x i64> %y) { ; NO-SIMD128-LABEL: shuffle_undef_v2i64: ; NO-SIMD128: .functype shuffle_undef_v2i64 (i32, i64, i64, i64, i64) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i64.store 8($0), $2 ; NO-SIMD128-NEXT: i64.store 0($0), $2 ; NO-SIMD128-NEXT: return %res = shufflevector <2 x i64> %x, <2 x i64> %y, @@ -1819,9 +1793,6 @@ define <4 x float> @shuffle_undef_v4f32(<4 x float> %x, <4 x float> %y) { ; NO-SIMD128-LABEL: shuffle_undef_v4f32: 
; NO-SIMD128: .functype shuffle_undef_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: f32.store 12($0), $2 -; NO-SIMD128-NEXT: f32.store 8($0), $2 -; NO-SIMD128-NEXT: f32.store 4($0), $2 ; NO-SIMD128-NEXT: f32.store 0($0), $2 ; NO-SIMD128-NEXT: return %res = shufflevector <4 x float> %x, <4 x float> %y, @@ -2082,7 +2053,6 @@ define <2 x double> @shuffle_undef_v2f64(<2 x double> %x, <2 x double> %y) { ; NO-SIMD128-LABEL: shuffle_undef_v2f64: ; NO-SIMD128: .functype shuffle_undef_v2f64 (i32, f64, f64, f64, f64) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: f64.store 8($0), $2 ; NO-SIMD128-NEXT: f64.store 0($0), $2 ; NO-SIMD128-NEXT: return %res = shufflevector <2 x double> %x, <2 x double> %y, diff --git a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll index 28ac4496acb9b..97cc1f8a15694 100644 --- a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll @@ -141,8 +141,10 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind { ; SSE2-NEXT: pmuludq %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: movl $1463, %eax # imm = 0x5B7 +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: pmuludq %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2047,2047,2047,2047] ; SSE2-NEXT: movdqa %xmm0, %xmm3 diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll index 7e081310c35be..49cb7c707a14f 100644 --- a/llvm/test/CodeGen/X86/vec_smulo.ll +++ b/llvm/test/CodeGen/X86/vec_smulo.ll @@ -474,8 +474,6 @@ define <6 x i32> @smulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; 
SSE2-NEXT: pcmpgtd %xmm3, %xmm6 ; SSE2-NEXT: pand %xmm7, %xmm6 ; SSE2-NEXT: paddd %xmm8, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; SSE2-NEXT: pmuludq %xmm2, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] @@ -548,8 +546,6 @@ define <6 x i32> @smulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6 ; SSSE3-NEXT: pand %xmm7, %xmm6 ; SSSE3-NEXT: paddd %xmm8, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; SSSE3-NEXT: pmuludq %xmm2, %xmm1 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] @@ -578,25 +574,23 @@ define <6 x i32> @smulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: pmuldq %xmm2, %xmm0 ; SSE41-NEXT: pinsrd $3, %r8d, %xmm2 -; SSE41-NEXT: movl {{[0-9]+}}(%rsp), %edx +; SSE41-NEXT: movl {{[0-9]+}}(%rsp), %ecx ; SSE41-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero ; SSE41-NEXT: movd %r9d, %xmm4 ; SSE41-NEXT: movdqa %xmm4, %xmm5 ; SSE41-NEXT: pmuldq %xmm3, %xmm4 -; SSE41-NEXT: pinsrd $1, %edx, %xmm3 -; SSE41-NEXT: movl {{[0-9]+}}(%rsp), %esi -; SSE41-NEXT: pinsrd $1, %esi, %xmm5 +; SSE41-NEXT: pinsrd $1, %ecx, %xmm3 +; SSE41-NEXT: movl {{[0-9]+}}(%rsp), %edx +; SSE41-NEXT: pinsrd $1, %edx, %xmm5 ; SSE41-NEXT: pmulld %xmm3, %xmm5 ; SSE41-NEXT: pinsrd $3, {{[0-9]+}}(%rsp), %xmm1 -; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSE41-NEXT: movd %edx, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] -; SSE41-NEXT: movd %esi, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,0,0] +; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; SSE41-NEXT: movd %ecx, %xmm3 +; SSE41-NEXT: movd %edx, %xmm6 ; SSE41-NEXT: pmuldq %xmm3, %xmm6 ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = 
xmm3[0,1],xmm6[2,3],xmm3[4,5],xmm6[6,7] -; SSE41-NEXT: movq %xmm5, 16(%rcx) +; SSE41-NEXT: movq %xmm5, 16(%rsi) ; SSE41-NEXT: psrad $31, %xmm5 ; SSE41-NEXT: pcmpeqd %xmm3, %xmm5 ; SSE41-NEXT: pcmpeqd %xmm3, %xmm3 @@ -607,7 +601,7 @@ define <6 x i32> @smulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7] ; SSE41-NEXT: pmulld %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, (%rcx) +; SSE41-NEXT: movdqa %xmm1, (%rsi) ; SSE41-NEXT: psrad $31, %xmm1 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 ; SSE41-NEXT: pxor %xmm3, %xmm1 diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll index 68c6ca93576b7..62db6d234d301 100644 --- a/llvm/test/CodeGen/X86/vec_umulo.ll +++ b/llvm/test/CodeGen/X86/vec_umulo.ll @@ -394,8 +394,8 @@ define <6 x i32> @umulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = mem[0,0,0,0] -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = mem[0,0,0,0] +; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{.*#+}} xmm6 = mem[0],zero,zero,zero ; SSE2-NEXT: pmuludq %xmm2, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,3,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,3,2,3] @@ -444,8 +444,8 @@ define <6 x i32> @umulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = mem[0,0,0,0] -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = mem[0,0,0,0] +; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT: movd {{.*#+}} xmm6 = mem[0],zero,zero,zero ; SSSE3-NEXT: pmuludq %xmm2, 
%xmm6 ; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,3,2,3] ; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,3,2,3] @@ -492,9 +492,7 @@ define <6 x i32> @umulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; SSE41-NEXT: pcmpeqd %xmm6, %xmm6 ; SSE41-NEXT: pxor %xmm6, %xmm3 ; SSE41-NEXT: movd %edi, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,0,0] ; SSE41-NEXT: movd %r9d, %xmm8 -; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,0,0] ; SSE41-NEXT: pmuludq %xmm7, %xmm8 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3],xmm1[4,5],xmm8[6,7] diff --git a/llvm/test/CodeGen/X86/widen_shuffle-1.ll b/llvm/test/CodeGen/X86/widen_shuffle-1.ll index 3257936f62e3b..3d34205096afe 100644 --- a/llvm/test/CodeGen/X86/widen_shuffle-1.ll +++ b/llvm/test/CodeGen/X86/widen_shuffle-1.ll @@ -105,14 +105,13 @@ define void @shuf5(ptr %p) nounwind { ; X86-LABEL: shuf5: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movsd {{.*#+}} xmm0 = [33,33,33,33,33,33,33,33,0,0,0,0,0,0,0,0] +; X86-NEXT: movsd {{.*#+}} xmm0 = [33,33,u,u,u,u,u,u,0,0,u,u,u,u,u,u] ; X86-NEXT: movsd %xmm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: shuf5: ; X64: # %bb.0: -; X64-NEXT: movabsq $2387225703656530209, %rax # imm = 0x2121212121212121 -; X64-NEXT: movq %rax, (%rdi) +; X64-NEXT: movq $8481, (%rdi) # imm = 0x2121 ; X64-NEXT: retq %v = shufflevector <2 x i8> , <2 x i8> poison, <8 x i32> store <8 x i8> %v, ptr %p, align 8