diff --git a/projects/clr/hipamd/src/hip_memory.cpp b/projects/clr/hipamd/src/hip_memory.cpp index e7e8c74c322..3e5da69f48e 100644 --- a/projects/clr/hipamd/src/hip_memory.cpp +++ b/projects/clr/hipamd/src/hip_memory.cpp @@ -2806,6 +2806,14 @@ hipError_t ihipMemcpyBatch(void** dsts, void** srcs, size_t* sizes, size_t count } } + if (attrs != nullptr && !stream.device().settings().sdma_swap_supported_) { + for (size_t i = 0; i < numAttrs; ++i) { + if (attrs[i].flags & hipMemcpyFlagExtOpSwap) { + return hipErrorNotSupported; + } + } + } + // Classify copies by type and group them std::vector bufferCopyIndices; std::vector hostToHostIndices; diff --git a/projects/clr/rocclr/device/device.hpp b/projects/clr/rocclr/device/device.hpp index 06908d20e9c..086ccc779fa 100644 --- a/projects/clr/rocclr/device/device.hpp +++ b/projects/clr/rocclr/device/device.hpp @@ -693,7 +693,8 @@ class Settings : public amd::HeapObject { uint gwsInitSupported_ : 1; //!< Check if GWS is supported on this machine. 
uint kernel_arg_opt_ : 1; //!< Enables kernel arg optimization for blit kernels uint kernel_arg_impl_ : 2; //!< Kernel argument implementation - uint reserved_ : 14; + uint sdma_swap_supported_ : 1; //!< SDMA linear swap copy (gfx94x/gfx95x) + uint reserved_ : 13; }; uint value_; }; diff --git a/projects/clr/rocclr/device/rocm/rocblit.cpp b/projects/clr/rocclr/device/rocm/rocblit.cpp index d91ccf58fbf..ff3125d7b1b 100644 --- a/projects/clr/rocclr/device/rocm/rocblit.cpp +++ b/projects/clr/rocclr/device/rocm/rocblit.cpp @@ -786,7 +786,26 @@ bool DmaBlitManager::rocrCopyBufferBatch(const std::vector agent_groups; + struct MultiArrays { + std::vector srcs; + std::vector dsts; + std::vector dst_agents; + std::vector sizes; + }; + + MultiArrays swapPending; + hsa_agent_t swapSrcAgent = {}; + for (const auto& op : ops) { + if (op.type == HSA_AMD_MEMORY_COPY_OP_LINEAR_SWAP) { + assert(op.src_size == op.dst_size && "Asymmetric swap not yet supported"); + if (swapPending.srcs.empty()) swapSrcAgent = op.src_agent; + swapPending.srcs.push_back(op.src); + swapPending.dsts.push_back(op.dst); + swapPending.dst_agents.push_back(op.dst_agent); + swapPending.sizes.push_back(op.src_size); + continue; + } auto& ag = agent_groups[op.src_agent.handle]; ag.src_agent = op.src_agent; BcastKey bkey{op.src, op.size}; @@ -800,17 +819,27 @@ bool DmaBlitManager::rocrCopyBufferBatch(const std::vector srcs; - std::vector dsts; - std::vector dst_agents; - std::vector sizes; - }; std::vector multiStore; multiStore.reserve(agent_groups.size()); std::vector finalOps; + if (!swapPending.srcs.empty()) { + multiStore.push_back(std::move(swapPending)); + auto& stored = multiStore.back(); + + hsa_amd_memory_copy_op_t swap = {}; + swap.version = HSA_AMD_MEMORY_COPY_OP_VERSION; + swap.type = HSA_AMD_MEMORY_COPY_OP_LINEAR_SWAP; + swap.src_agent = swapSrcAgent; + swap.src_list = stored.srcs.data(); + swap.dst_list = stored.dsts.data(); + swap.dst_agent_list = stored.dst_agents.data(); + swap.size_list = 
stored.sizes.data(); + swap.num_entries = static_cast(stored.srcs.size()); + finalOps.push_back(swap); + } + for (auto& [agent_handle, ag] : agent_groups) { MultiArrays pending; @@ -823,7 +852,7 @@ bool DmaBlitManager::rocrCopyBufferBatch(const std::vector(be.dsts.size()); + bcast.num_entries = static_cast(be.dsts.size()); bcast.size = be.tmpl.size; finalOps.push_back(bcast); } else { @@ -848,7 +877,7 @@ bool DmaBlitManager::rocrCopyBufferBatch(const std::vector(stored.srcs.size()); + multi.num_entries = static_cast(stored.srcs.size()); finalOps.push_back(multi); } else if (pending.srcs.size() == 1) { hsa_amd_memory_copy_op_t linear = {}; @@ -872,20 +901,30 @@ bool DmaBlitManager::rocrCopyBufferBatch(const std::vector 0) { - for (uint32_t d = 0; d < op.num_dsts; ++d) { + } else if (op.type == HSA_AMD_MEMORY_COPY_OP_LINEAR_SWAP) { + for (uint32_t d = 0; d < op.num_entries; ++d) { + ClPrint(amd::LOG_DEBUG, amd::LOG_COPY2, + "HSA BatchCopy Swap [%u/%u] engineOp=%s, addr_a=%p, addr_b=%p, " + "size=%zu, wait_event=0x%zx, completion_signal=0x%zx", + d + 1, op.num_entries, EngineOpName(engine), op.src_list[d], + op.dst_list[d], op.size_list[d], + (wait_events.size() != 0) ? wait_events[0].handle : 0, + op.completion_signal.handle); + } + } else if (op.type == HSA_AMD_MEMORY_COPY_OP_LINEAR && op.num_entries > 0) { + for (uint32_t d = 0; d < op.num_entries; ++d) { ClPrint(amd::LOG_DEBUG, amd::LOG_COPY2, "HSA BatchCopy Multi [%u/%u] engineOp=%s, src=%p, dst=%p, " "size=%zu, wait_event=0x%zx, completion_signal=0x%zx", - d + 1, op.num_dsts, EngineOpName(engine), op.src_list[d], + d + 1, op.num_entries, EngineOpName(engine), op.src_list[d], op.dst_list[d], op.size_list[d], (wait_events.size() != 0) ? 
wait_events[0].handle : 0, op.completion_signal.handle); diff --git a/projects/clr/rocclr/device/rocm/rocsettings.cpp b/projects/clr/rocclr/device/rocm/rocsettings.cpp index cef0b5da6aa..b449a1e8826 100644 --- a/projects/clr/rocclr/device/rocm/rocsettings.cpp +++ b/projects/clr/rocclr/device/rocm/rocsettings.cpp @@ -144,6 +144,10 @@ bool Settings::create(bool fullProfile, const amd::Isa& isa, bool enableXNACK, b queue_pipe_dist_ = DEBUG_HIP_DYNAMIC_QUEUES == 2 ? true : false; } + if (gfxipMajor == 9 && gfxipMinor >= 4) { + sdma_swap_supported_ = true; + } + setKernelArgImpl(isa, isXgmi, hasValidHDPFlush); if (gfxipMajor >= 10) { diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/details/fmt.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/details/fmt.hpp index 99cfa19d999..8c48c767f98 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/details/fmt.hpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/details/fmt.hpp @@ -271,7 +271,7 @@ struct formatter } auto type = static_cast(op.type); - const bool is_linear_multi = (type == HSA_AMD_MEMORY_COPY_OP_LINEAR && op.num_dsts > 0); + const bool is_linear_multi = (type == HSA_AMD_MEMORY_COPY_OP_LINEAR && op.num_entries > 0); if(is_linear_multi && op.reserved0 != 0) { reserved += fmt::format(", reserved0={}", op.reserved0); @@ -282,12 +282,12 @@ struct formatter case HSA_AMD_MEMORY_COPY_OP_LINEAR_BROADCAST: return fmt::format_to( ctx.out(), - "[MEMORY_COPY_OP type={}, version={}, num_dsts={}, traffic_class={}, " + "[MEMORY_COPY_OP type={}, version={}, num_entries={}, traffic_class={}, " "completion_signal={}, src={}, src_agent={}, dst_list={}, " "dst_agent_list={}, size={}{}{}{}]", type, op.version, - op.num_dsts, + op.num_entries, op.traffic_class, op.completion_signal.handle, fmt::ptr(op.src), @@ -302,12 +302,12 @@ struct formatter case HSA_AMD_MEMORY_COPY_OP_LINEAR_SWAP: return fmt::format_to( ctx.out(), - "[MEMORY_COPY_OP type={}, version={}, num_dsts={}, 
traffic_class={}, " + "[MEMORY_COPY_OP type={}, version={}, num_entries={}, traffic_class={}, " "completion_signal={}, src={}, src_agent={}, dst={}, dst_agent={}, " "src_size={}, dst_size={}{}{}{}]", type, op.version, - op.num_dsts, + op.num_entries, op.traffic_class, op.completion_signal.handle, fmt::ptr(op.src), @@ -325,12 +325,12 @@ struct formatter { return fmt::format_to( ctx.out(), - "[MEMORY_COPY_OP type={}, version={}, num_dsts={}, traffic_class={}, " + "[MEMORY_COPY_OP type={}, version={}, num_entries={}, traffic_class={}, " "completion_signal={}, src_list={}, src_agent={}, dst_list={}, " "dst_agent_list={}, size_list={}{}{}{}]", type, op.version, - op.num_dsts, + op.num_entries, op.traffic_class, op.completion_signal.handle, fmt::ptr(op.src_list), @@ -344,12 +344,12 @@ struct formatter } return fmt::format_to( ctx.out(), - "[MEMORY_COPY_OP type={}, version={}, num_dsts={}, traffic_class={}, " + "[MEMORY_COPY_OP type={}, version={}, num_entries={}, traffic_class={}, " "completion_signal={}, src={}, src_agent={}, dst={}, dst_agent={}, " "size={}, unused_size={}{}{}{}]", type, op.version, - op.num_dsts, + op.num_entries, op.traffic_class, op.completion_signal.handle, fmt::ptr(op.src), @@ -366,12 +366,12 @@ struct formatter case HSA_AMD_MEMORY_COPY_OP_LINEAR_INDIRECT_SRCDST: return fmt::format_to( ctx.out(), - "[MEMORY_COPY_OP type={}, version={}, num_dsts={}, traffic_class={}, " + "[MEMORY_COPY_OP type={}, version={}, num_entries={}, traffic_class={}, " "completion_signal={}, src={}, src_agent={}, dst={}, dst_agent={}, " "size={}, unused_size={}{}{}{}]", type, op.version, - op.num_dsts, + op.num_entries, op.traffic_class, op.completion_signal.handle, fmt::ptr(op.src), diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_blit_sdma.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_blit_sdma.h index 770437777a5..ef08bbde4fa 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_blit_sdma.h +++ 
b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_blit_sdma.h @@ -103,6 +103,12 @@ class BlitSdmaBase : public core::Blit { virtual hsa_status_t SubmitLinearCopyBody(void* dst, const void* src, size_t size, core::Signal& prologue_signal, core::Signal& body_signal) = 0; + + virtual hsa_status_t SubmitLinearSwapBody(void* addr_a, void* addr_b, size_t size, + core::Signal& prologue_signal, + core::Signal& body_signal) = 0; + + virtual bool SwapSupported() const = 0; }; template class BlitSdma : public BlitSdmaBase { @@ -207,6 +213,12 @@ template class BlitSdma : public BlitSdmaBase { core::Signal& prologue_signal, core::Signal& body_signal) override; + hsa_status_t SubmitLinearSwapBody(void* addr_a, void* addr_b, size_t size, + core::Signal& prologue_signal, + core::Signal& body_signal) override; + + bool SwapSupported() const override { return swap_supported_; } + private: /// @brief Acquires the address into queue buffer where a new command /// packet of specified size could be written. The address that is @@ -261,6 +273,9 @@ template class BlitSdma : public BlitSdmaBase { void BuildBroadcastCopyCommand(char* cmd_addr, uint32_t num_copy_command, void* dst1, void* dst2, const void* src, size_t size); + void BuildSwapCopyCommand(char* cmd_addr, uint32_t num_copy_command, + void* addr_a, void* addr_b, size_t size); + void BuildCopyRectCommand(const std::function& append, const hsa_pitched_ptr_t* dst, const hsa_dim3_t* dst_offset, const hsa_pitched_ptr_t* src, const hsa_dim3_t* src_offset, @@ -334,6 +349,8 @@ template class BlitSdma : public BlitSdmaBase { static const uint32_t broadcast_copy_command_size_; + static const uint32_t swap_copy_command_size_; + static const uint32_t fill_command_size_; static const uint32_t fence_command_size_; @@ -392,6 +409,9 @@ template class BlitSdma : public BlitSdmaBase { /// True if SDMA supports multicast copy (one src -> multiple dst). bool multicast_supported_; + + /// True if SDMA supports linear swap copy (gfx94X+). 
+ bool swap_supported_; }; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h index e1143f35c74..7c06baad428 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h @@ -748,16 +748,28 @@ class GpuAgent : public GpuAgentInt { const hsa_amd_memory_copy_op_t& op, std::vector& dep_signals); - // Multi-linear copy: LINEAR op with num_dsts > 0, independent copies + // Multi-linear copy: LINEAR op with num_entries > 0, independent copies // (different src/dst/size per entry) sharing a single completion signal. // Uses prologue/body/epilogue fan-out across available SDMA engines. hsa_status_t DmaCopyMulti( const hsa_amd_memory_copy_op_t& op, std::vector& dep_signals); - // Common fan-out implementation shared by DmaCopyBroadcast and DmaCopyMulti. - // Submits prologue, per-entry copy bodies, and epilogue with one signal. - hsa_status_t DmaCopyFanOut( + // Linear swap: exchanges the contents of src and dst buffers. + // Only supported on gfx94X / gfx95X. Uses DmaCopyFanOutOp with + // HSA_AMD_MEMORY_COPY_OP_LINEAR_SWAP. + hsa_status_t DmaCopySwap( + const hsa_amd_memory_copy_op_t& op, + std::vector& dep_signals); + + // Common fan-out implementation shared by DmaCopyBroadcast, DmaCopyMulti, + // and swap operations. Submits prologue, per-entry bodies (selected by + // @p op), and epilogue with one signal. + // @p op is the hsa_amd_memory_copy_op_type_t from the public API; only + // HSA_AMD_MEMORY_COPY_OP_LINEAR and HSA_AMD_MEMORY_COPY_OP_LINEAR_SWAP are + // currently supported. 
+ hsa_status_t DmaCopyFanOutOp( + hsa_amd_memory_copy_op_type_t op, core::Signal& out_signal, std::vector& dep_signals, uint16_t num_entries, diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/sdma_registers.h b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/sdma_registers.h index 9115fe2f5b5..027ed01d48f 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/inc/sdma_registers.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/inc/sdma_registers.h @@ -64,6 +64,7 @@ const unsigned int SDMA_SUBOP_COPY_LINEAR = 0; // Broadcast linear copy uses the linear sub-op with the broadcast packet format. const unsigned int SDMA_SUBOP_COPY_LINEAR_BROADCAST = SDMA_SUBOP_COPY_LINEAR; const unsigned int SDMA_SUBOP_COPY_LINEAR_RECT = 4; +const unsigned int SDMA_SUBOP_COPY_SWAP = 9; const unsigned int SDMA_SUBOP_TIMESTAMP_GET_GLOBAL = 2; const unsigned int SDMA_SUBOP_USER_GCR = 1; const unsigned int SDMA_ATOMIC_ADD64 = 47; @@ -221,8 +222,77 @@ typedef struct SDMA_PKT_COPY_LINEAR_BROADCAST_TAG { } DST2_ADDR_HI_UNION; static const size_t kMaxSize_ = 0x3fffe0; + static const size_t kDstAlignMask_ = 0x1F; } SDMA_PKT_COPY_LINEAR_BROADCAST; +// linear copy (swap) packet (SDMA5.2+) +// Atomically swaps data between Address A and Address B. +// Addresses must be 64-byte aligned for gfx94/gfx95X. 
+typedef struct SDMA_PKT_COPY_LINEAR_SWAP_TAG { + union { + struct { + unsigned int op : 8; + unsigned int sub_op : 8; + unsigned int extra_info : 16; + }; + unsigned int DW_0_DATA; + } HEADER_UNION; + + union { + struct { + unsigned int count : 30; + unsigned int reserved_0 : 2; + }; + unsigned int DW_1_DATA; + } COUNT_UNION; + + union { + struct { + unsigned int reserved_0 : 16; + unsigned int dst_sw : 2; + unsigned int dst_cache_policy : 3; + unsigned int reserved_1 : 3; + unsigned int src_sw : 2; + unsigned int src_cache_policy : 3; + unsigned int reserved_2 : 3; + }; + unsigned int DW_2_DATA; + } PARAMETER_UNION; + + union { + struct { + unsigned int reserved_0 : 6; + unsigned int addr_a_31_6 : 26; + }; + unsigned int DW_3_DATA; + } ADDR_A_LO_UNION; + + union { + struct { + unsigned int addr_a_63_32 : 32; + }; + unsigned int DW_4_DATA; + } ADDR_A_HI_UNION; + + union { + struct { + unsigned int reserved_0 : 6; + unsigned int addr_b_31_6 : 26; + }; + unsigned int DW_5_DATA; + } ADDR_B_LO_UNION; + + union { + struct { + unsigned int addr_b_63_32 : 32; + }; + unsigned int DW_6_DATA; + } ADDR_B_HI_UNION; + + static const size_t kMaxSize_ = 0x3fffffe0; + static const size_t kAlignment_ = 64; +} SDMA_PKT_COPY_LINEAR_SWAP; + // linear sub-window (pre-GFX12) typedef struct SDMA_PKT_COPY_LINEAR_RECT_TAG { static const unsigned int pitch_bits = 19; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp index 741e49b8288..401e5addd48 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp @@ -84,6 +84,9 @@ const uint32_t BlitSdma::linear_copy_command_size_ = sizeof(SDMA_PKT_COP template const uint32_t BlitSdma::broadcast_copy_command_size_ = sizeof(SDMA_PKT_COPY_LINEAR_BROADCAST); +template +const uint32_t BlitSdma::swap_copy_command_size_ = 
sizeof(SDMA_PKT_COPY_LINEAR_SWAP); + template const uint32_t BlitSdma::fill_command_size_ = sizeof(SDMA_PKT_CONSTANT_FILL); @@ -128,7 +131,8 @@ BlitSdma::BlitSdma() queue_rptr_(nullptr), queue_doorbell_(nullptr), broadcast_supported_(false), - multicast_supported_(false) { + multicast_supported_(false), + swap_supported_(false) { std::memset(&queue_resource_, 0, sizeof(queue_resource_)); } @@ -188,11 +192,13 @@ hsa_status_t BlitSdma::Initialize(const core::Agent& agent, bool use_xgm broadcast_supported_ = true; } else if (major == 9) { broadcast_supported_ = (minor >= 4) || (minor == 0 && stepping >= 10); + swap_supported_ = (minor >= 4); } // Multicast not yet supported on any current hardware. multicast_supported_ = false; + // Allocate queue buffer. queue_start_addr_ = (char*)agent_->system_allocator()(kQueueSize, 0x1000, core::MemoryRegion::AllocateExecutable); @@ -999,6 +1005,33 @@ hsa_status_t BlitSdma::SubmitLinearCopyBody( prologue_signal, body_signal); } +template +hsa_status_t BlitSdma::SubmitLinearSwapBody( + void* addr_a, void* addr_b, size_t size, + core::Signal& prologue_signal, + core::Signal& body_signal) { + + if (!swap_supported_) + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + + // Addresses must be aligned for SWAP operation. Make this check here to + // avoid SDMA blit creation at top level. 
+ constexpr size_t kAlign = SDMA_PKT_COPY_LINEAR_SWAP::kAlignment_; + if ((reinterpret_cast(addr_a) & (kAlign - 1)) != 0 || + (reinterpret_cast(addr_b) & (kAlign - 1)) != 0) + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + + const size_t max_copy_size = SDMA_PKT_COPY_LINEAR_SWAP::kMaxSize_; + const uint32_t num_copy_command = + static_cast((size + max_copy_size - 1) / max_copy_size); + + std::vector buff(num_copy_command); + BuildSwapCopyCommand(reinterpret_cast(&buff[0]), num_copy_command, addr_a, addr_b, size); + + return SubmitBody(&buff[0], buff.size() * sizeof(SDMA_PKT_COPY_LINEAR_SWAP), size, + prologue_signal, body_signal); +} + template hsa_status_t BlitSdma::SubmitLinearCopyCommand(void* dst, const void* src, size_t size) { // Break the copy into multiple copy operation incase the copy size exceeds @@ -1053,6 +1086,13 @@ hsa_status_t BlitSdma::SubmitLinearCopyBroadcastCommand( return HSA_STATUS_ERROR_INVALID_ARGUMENT; } + constexpr size_t kMask = SDMA_PKT_COPY_LINEAR_BROADCAST::kDstAlignMask_; + for (size_t i = 0; i + 1 < dsts.size(); i += 2) { + if ((reinterpret_cast(dsts[i]) & kMask) != + (reinterpret_cast(dsts[i + 1]) & kMask)) + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + // Each broadcast packet copies from one src to two dsts. // An odd trailing destination falls back to a regular linear copy. const uint32_t num_pairs = static_cast(dsts.size() / 2); @@ -1390,6 +1430,9 @@ template void BlitSdma::BuildBroadcastCopyCommand(char* cmd_addr, uint32_t num_copy_command, void* dst1, void* dst2, const void* src, size_t size) { + constexpr size_t kMask = SDMA_PKT_COPY_LINEAR_BROADCAST::kDstAlignMask_; + assert((reinterpret_cast(dst1) & kMask) == + (reinterpret_cast(dst2) & kMask)); size_t cur_size = 0; const size_t max_copy_size = max_single_linear_copy_size_ ? 
max_single_linear_copy_size_ : kMaxSingleCopySize; @@ -1431,6 +1474,45 @@ void BlitSdma::BuildBroadcastCopyCommand(char* cmd_addr, uint32_t num_co assert(cur_size == size); } +template +void BlitSdma::BuildSwapCopyCommand(char* cmd_addr, uint32_t num_copy_command, + void* addr_a, void* addr_b, size_t size) { + constexpr size_t kAlign = SDMA_PKT_COPY_LINEAR_SWAP::kAlignment_; + assert((reinterpret_cast(addr_a) & (kAlign - 1)) == 0); + assert((reinterpret_cast(addr_b) & (kAlign - 1)) == 0); + + size_t cur_size = 0; + const size_t max_copy_size = SDMA_PKT_COPY_LINEAR_SWAP::kMaxSize_; + for (uint32_t i = 0; i < num_copy_command; ++i) { + const uint32_t copy_size = + static_cast(std::min((size - cur_size), max_copy_size)); + + void* cur_addr_a = static_cast(addr_a) + cur_size; + void* cur_addr_b = static_cast(addr_b) + cur_size; + + SDMA_PKT_COPY_LINEAR_SWAP* packet_addr = + reinterpret_cast(cmd_addr); + + memset(packet_addr, 0, sizeof(SDMA_PKT_COPY_LINEAR_SWAP)); + + packet_addr->HEADER_UNION.op = SDMA_OP_COPY; + packet_addr->HEADER_UNION.sub_op = SDMA_SUBOP_COPY_SWAP; + + packet_addr->COUNT_UNION.count = copy_size - 1; + + packet_addr->ADDR_A_LO_UNION.DW_3_DATA = ptrlow32(cur_addr_a); + packet_addr->ADDR_A_HI_UNION.addr_a_63_32 = ptrhigh32(cur_addr_a); + + packet_addr->ADDR_B_LO_UNION.DW_5_DATA = ptrlow32(cur_addr_b); + packet_addr->ADDR_B_HI_UNION.addr_b_63_32 = ptrhigh32(cur_addr_b); + + cmd_addr += swap_copy_command_size_; + cur_size += copy_size; + } + + assert(cur_size == size); +} + /* Copies are done in terms of elements (1, 2, 4, 8, or 16 bytes) and have alignment restrictions. Elements are coded by the log2 of the element size in bytes (ie. element 0=1 byte, 4=16 byte). 
diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp index 18accc3887a..29bc315fd55 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp @@ -770,7 +770,7 @@ core::Blit* GpuAgent::CreateBlitSdma(bool use_xgmi, int rec_eng) { switch (isa_->GetMajorVersion()) { case 9: sdma = new BlitSdmaV4(); - copy_size_override = (isa_->GetMinorVersion() > 4 || + copy_size_override = (isa_->GetMinorVersion() >= 4 || (isa_->GetMinorVersion() == 0 && isa_->GetStepping() == 10)) ? copy_size_overrides[1] : copy_size_overrides[0]; break; @@ -1334,7 +1334,8 @@ hsa_status_t GpuAgent::DmaPreferredEngine(core::Agent& dst_agent, core::Agent& s return HSA_STATUS_SUCCESS; } -hsa_status_t GpuAgent::DmaCopyFanOut( +hsa_status_t GpuAgent::DmaCopyFanOutOp( + hsa_amd_memory_copy_op_type_t op, core::Signal& out_signal, std::vector& dep_signals, uint16_t num_entries, @@ -1354,6 +1355,9 @@ hsa_status_t GpuAgent::DmaCopyFanOut( return HSA_STATUS_ERROR_INVALID_ARGUMENT; BlitSdmaBase* coordinator = static_cast((*coord_blit).get()); + if (op == HSA_AMD_MEMORY_COPY_OP_LINEAR_SWAP && !coordinator->SwapSupported()) + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + // Resolve per-entry SDMA engines. constexpr bool kUseRRBalancing = false; const uint32_t total_sdma = @@ -1456,11 +1460,14 @@ hsa_status_t GpuAgent::DmaCopyFanOut( out_signal.AddRelaxed(num_entries); } + const char* op_name = + (op == HSA_AMD_MEMORY_COPY_OP_LINEAR_SWAP) ? "Swap" : "Copy"; + // Prologue: dep polls, HDP flush, GCR invalidate, decrement prologue_signal. 
LogPrint(HSA_AMD_LOG_FLAG_SDMA, - "SDMA FanOut Prologue: engine %02u, num_entries=%u, dep_signal=0x%zx, " + "SDMA FanOut(%s) Prologue: engine %02u, num_entries=%u, dep_signal=0x%zx, " "completion_signal=0x%zx, prologue_signal=0x%zx", - BlitHostToDev, num_entries, + op_name, BlitHostToDev, num_entries, dep_signals.empty() ? 0 : core::Signal::Convert(dep_signals[0]).handle, core::Signal::Convert(&out_signal).handle, core::Signal::Convert(prologue_raw).handle); @@ -1468,27 +1475,37 @@ hsa_status_t GpuAgent::DmaCopyFanOut( *prologue_raw); if (stat != HSA_STATUS_SUCCESS) return stat; - // Fan out: one copy body per entry on its resolved engine. + // Fan out: one body per entry on its resolved engine. for (uint32_t d = 0; d < num_entries; ++d) { core::Signal& body_sig = use_body_signals ? *body_raw[d] : out_signal; LogPrint(HSA_AMD_LOG_FLAG_SDMA, - "SDMA FanOut Body[%u/%u]: engine %02u, src=%p, dst=%p, size=%zu, " + "SDMA FanOut(%s) Body[%u/%u]: engine %02u, src=%p, dst=%p, size=%zu, " "prologue_signal=0x%zx, body_signal=0x%zx", + op_name, d + 1, num_entries, engines[d].idx, src_list[d], dst_list[d], size_list[d], core::Signal::Convert(prologue_raw).handle, core::Signal::Convert(&body_sig).handle); - stat = engines[d].blit->SubmitLinearCopyBody( - dst_list[d], src_list[d], size_list[d], - *prologue_raw, body_sig); + switch (op) { + case HSA_AMD_MEMORY_COPY_OP_LINEAR_SWAP: + stat = engines[d].blit->SubmitLinearSwapBody( + dst_list[d], const_cast(src_list[d]), size_list[d], + *prologue_raw, body_sig); + break; + default: + stat = engines[d].blit->SubmitLinearCopyBody( + dst_list[d], src_list[d], size_list[d], + *prologue_raw, body_sig); + break; + } if (stat != HSA_STATUS_SUCCESS) return stat; } - // Epilogue: waits for all bodies, GCR writeback, end timestamp, signal → 0. + // Epilogue: waits for all bodies, GCR writeback, end timestamp, signal -> 0. 
LogPrint(HSA_AMD_LOG_FLAG_SDMA, - "SDMA FanOut Epilogue: engine %02u, completion_signal=0x%zx, " + "SDMA FanOut(%s) Epilogue: engine %02u, completion_signal=0x%zx, " "num_body_signals=%zu", - BlitHostToDev, + op_name, BlitHostToDev, core::Signal::Convert(&out_signal).handle, body_raw.size()); constexpr hsa_signal_value_t kWaitValue = 1; @@ -1502,7 +1519,7 @@ hsa_status_t GpuAgent::DmaCopyBroadcast( core::Signal* out_signal_obj = core::Signal::Convert(op.completion_signal); core::Signal& out_signal = *out_signal_obj; - const uint16_t num_dsts = op.num_dsts; + const uint16_t num_entries = op.num_entries; constexpr size_t kBroadcastMaxSize = 1024 * 1024; // Try HW broadcast for small transfers. @@ -1518,12 +1535,12 @@ hsa_status_t GpuAgent::DmaCopyBroadcast( out_signal.async_copy_agent(core::Agent::Convert(this->public_handle())); LogPrint(HSA_AMD_LOG_FLAG_SDMA, - "SDMA Broadcast using engine %02u, src=%p, num_dsts=%u, size=%zu, " + "SDMA Broadcast using engine %02u, src=%p, num_entries=%u, size=%zu, " "dep_signal=0x%zx, completion_signal=0x%zx", - BlitHostToDev, op.src, num_dsts, op.size, + BlitHostToDev, op.src, num_entries, op.size, dep_signals.empty() ? 0 : core::Signal::Convert(dep_signals[0]).handle, out_signal_obj->signal_); - std::vector dsts(op.dst_list, op.dst_list + num_dsts); + std::vector dsts(op.dst_list, op.dst_list + num_entries); return sdma_blit->SubmitLinearCopyBroadcastCommand( dsts, op.src, op.size, dep_signals, out_signal); } @@ -1531,12 +1548,12 @@ hsa_status_t GpuAgent::DmaCopyBroadcast( } // Fall back to fan-out: expand broadcast into per-entry arrays. 
- std::vector srcs(num_dsts, op.src); - std::vector sizes(num_dsts, op.size); + std::vector srcs(num_entries, op.src); + std::vector sizes(num_entries, op.size); - return DmaCopyFanOut(out_signal, dep_signals, num_dsts, - srcs.data(), op.dst_list, op.dst_agent_list, - sizes.data()); + return DmaCopyFanOutOp(HSA_AMD_MEMORY_COPY_OP_LINEAR, out_signal, dep_signals, + num_entries, srcs.data(), op.dst_list, + op.dst_agent_list, sizes.data()); } hsa_status_t GpuAgent::DmaCopyMulti( @@ -1546,10 +1563,22 @@ hsa_status_t GpuAgent::DmaCopyMulti( core::Signal* out_signal_obj = core::Signal::Convert(op.completion_signal); core::Signal& out_signal = *out_signal_obj; - return DmaCopyFanOut(out_signal, dep_signals, op.num_dsts, - const_cast(op.src_list), - op.dst_list, op.dst_agent_list, - op.size_list); + return DmaCopyFanOutOp(HSA_AMD_MEMORY_COPY_OP_LINEAR, out_signal, dep_signals, + op.num_entries, const_cast(op.src_list), + op.dst_list, op.dst_agent_list, op.size_list); +} + +hsa_status_t GpuAgent::DmaCopySwap( + const hsa_amd_memory_copy_op_t& op, + std::vector& dep_signals) { + + core::Signal* out_signal_obj = core::Signal::Convert(op.completion_signal); + core::Signal& out_signal = *out_signal_obj; + + return DmaCopyFanOutOp(HSA_AMD_MEMORY_COPY_OP_LINEAR_SWAP, out_signal, + dep_signals, op.num_entries, + const_cast(op.src_list), + op.dst_list, op.dst_agent_list, op.size_list); } hsa_status_t GpuAgent::DmaCopyBatch(const hsa_amd_memory_copy_op_t* ops, @@ -1568,7 +1597,7 @@ hsa_status_t GpuAgent::DmaCopyBatch(const hsa_amd_memory_copy_op_t* ops, switch (op.type) { case HSA_AMD_MEMORY_COPY_OP_LINEAR: { - if (op.num_dsts > 0) { + if (op.num_entries > 0) { status = DmaCopyMulti(op, dep_signals); } else { core::Agent* dst_agent = core::Agent::Convert(op.dst_agent); @@ -1589,10 +1618,13 @@ hsa_status_t GpuAgent::DmaCopyBatch(const hsa_amd_memory_copy_op_t* ops, status = DmaCopyBroadcast(op, dep_signals); break; case HSA_AMD_MEMORY_COPY_OP_LINEAR_SWAP: + if (op.num_entries == 0) + 
return HSA_STATUS_ERROR_INVALID_ARGUMENT; + status = DmaCopySwap(op, dep_signals); + break; case HSA_AMD_MEMORY_COPY_OP_LINEAR_INDIRECT_SRC: case HSA_AMD_MEMORY_COPY_OP_LINEAR_INDIRECT_DST: case HSA_AMD_MEMORY_COPY_OP_LINEAR_INDIRECT_SRCDST: - // Future implementation return HSA_STATUS_ERROR_INVALID_ARGUMENT; default: return HSA_STATUS_ERROR_INVALID_ARGUMENT; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp index ba7f9cad8f9..f5ff374a952 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/hsa_ext_amd.cpp @@ -433,13 +433,13 @@ hsa_status_t hsa_amd_memory_async_batch_copy(const hsa_amd_memory_copy_op_t* cop core::Agent* dst_agent = nullptr; switch (op.type) { case HSA_AMD_MEMORY_COPY_OP_LINEAR: - if (op.num_dsts > 0) { + if (op.num_entries > 0) { // Multi-linear: arrays of src/dst/size, one signal for all entries. 
if (op.src_list == nullptr || op.dst_list == nullptr || op.dst_agent_list == nullptr || op.size_list == nullptr || - op.num_dsts > 1024 || op.reserved0 != 0) + op.num_entries > 1024 || op.reserved0 != 0) return HSA_STATUS_ERROR_INVALID_ARGUMENT; - for (uint32_t d = 0; d < op.num_dsts; ++d) { + for (uint32_t d = 0; d < op.num_entries; ++d) { IS_BAD_PTR(op.src_list[d]); IS_BAD_PTR(op.dst_list[d]); core::Agent* da = core::Agent::Convert(op.dst_agent_list[d]); @@ -462,45 +462,59 @@ hsa_status_t hsa_amd_memory_async_batch_copy(const hsa_amd_memory_copy_op_t* cop IS_BAD_PTR(op.dst); dst_agent = core::Agent::Convert(op.dst_agent); IS_VALID(dst_agent); - if (op.num_dsts != 0 || op.unused_size != 0) + if (op.num_entries != 0 || op.unused_size != 0) return HSA_STATUS_ERROR_INVALID_ARGUMENT; break; case HSA_AMD_MEMORY_COPY_OP_LINEAR_BROADCAST: if (op.dst_list == nullptr || op.dst_agent_list == nullptr || - op.num_dsts == 0 || op.num_dsts > 1024 || op.unused_size != 0) + op.num_entries == 0 || op.num_entries > 1024 || op.unused_size != 0) return HSA_STATUS_ERROR_INVALID_ARGUMENT; - for (uint32_t d = 0; d < op.num_dsts; ++d) { + for (uint32_t d = 0; d < op.num_entries; ++d) { IS_BAD_PTR(op.dst_list[d]); core::Agent* da = core::Agent::Convert(op.dst_agent_list[d]); IS_VALID(da); } break; case HSA_AMD_MEMORY_COPY_OP_LINEAR_SWAP: - IS_BAD_PTR(op.dst); - dst_agent = core::Agent::Convert(op.dst_agent); - IS_VALID(dst_agent); - if (op.num_dsts != 0) return HSA_STATUS_ERROR_INVALID_ARGUMENT; - if (op.src_size == 0 || op.dst_size == 0) - return HSA_STATUS_ERROR_INVALID_ARGUMENT; + if (op.num_entries > 0) { + if (op.src_list == nullptr || op.dst_list == nullptr || + op.dst_agent_list == nullptr || op.size_list == nullptr || + op.num_entries > 1024 || op.reserved0 != 0) + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + for (uint32_t d = 0; d < op.num_entries; ++d) { + IS_BAD_PTR(op.src_list[d]); + IS_BAD_PTR(op.dst_list[d]); + core::Agent* da = core::Agent::Convert(op.dst_agent_list[d]); + 
IS_VALID(da); + if (op.size_list[d] == 0) + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + } else { + IS_BAD_PTR(op.dst); + dst_agent = core::Agent::Convert(op.dst_agent); + IS_VALID(dst_agent); + if (op.src_size == 0 || op.dst_size == 0) + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } break; default: return HSA_STATUS_ERROR_INVALID_ARGUMENT; } - const bool is_linear_multi = - (op.type == HSA_AMD_MEMORY_COPY_OP_LINEAR && op.num_dsts > 0); + const bool is_multi = + (op.num_entries > 0); bool has_work; - if (op.type == HSA_AMD_MEMORY_COPY_OP_LINEAR_SWAP) - has_work = (op.src_size > 0); - else if (is_linear_multi) + if (is_multi) has_work = true; + else if (op.type == HSA_AMD_MEMORY_COPY_OP_LINEAR_SWAP) + has_work = (op.src_size > 0); else has_work = (op.size > 0); if (has_work) { core::Agent* copy_agent = nullptr; - if (op.type == HSA_AMD_MEMORY_COPY_OP_LINEAR_BROADCAST || is_linear_multi) { + if (op.type == HSA_AMD_MEMORY_COPY_OP_LINEAR_BROADCAST || is_multi) { if (src_agent->device_type() != core::Agent::DeviceType::kAmdGpuDevice) return HSA_STATUS_ERROR_INVALID_AGENT; copy_agent = src_agent; diff --git a/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa_ext_amd.h b/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa_ext_amd.h index 74970a35bb5..1ea46cbc251 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa_ext_amd.h +++ b/projects/rocr-runtime/runtime/hsa-runtime/inc/hsa_ext_amd.h @@ -1865,7 +1865,7 @@ hsa_status_t HSA_API * @brief Type of memory copy operation within a batch. 
*/ typedef enum { - HSA_AMD_MEMORY_COPY_OP_LINEAR = 0, /**< Linear copy (num_dsts==0: single; num_dsts>0: multi) */ + HSA_AMD_MEMORY_COPY_OP_LINEAR = 0, /**< Linear copy (num_entries==0: single; num_entries>0: multi) */ HSA_AMD_MEMORY_COPY_OP_LINEAR_BROADCAST = 1, /**< Linear broadcast: single src -> multiple dsts */ HSA_AMD_MEMORY_COPY_OP_LINEAR_SWAP = 2, /**< Linear swap: exchange contents of src and dst */ HSA_AMD_MEMORY_COPY_OP_LINEAR_INDIRECT_SRC = 3, /**< Source address resolved via indirection */ @@ -1946,33 +1946,41 @@ typedef enum { * * Field usage per operation type: * - * LINEAR (default, single copy when num_dsts == 0): + * LINEAR (default, single copy when num_entries == 0): * src, src_agent -- source pointer and agent * dst, dst_agent -- destination pointer and agent * size -- copy size in bytes - * num_dsts -- 0 + * num_entries -- 0 * - * LINEAR (multi-copy when num_dsts > 0, one signal for all entries): - * src_list -- caller-owned array of num_dsts source pointers + * LINEAR (multi-copy when num_entries > 0, one signal for all entries): + * src_list -- caller-owned array of num_entries source pointers * src_agent -- common source agent (must be GPU) - * dst_list -- caller-owned array of num_dsts destination pointers - * dst_agent_list -- caller-owned array of num_dsts destination agents - * size_list -- caller-owned array of num_dsts copy sizes in bytes - * num_dsts -- number of entries (>= 1, <= 1024) + * dst_list -- caller-owned array of num_entries destination pointers + * dst_agent_list -- caller-owned array of num_entries destination agents + * size_list -- caller-owned array of num_entries copy sizes in bytes + * num_entries -- number of entries (>= 1, <= 1024) * * LINEAR_BROADCAST (single source -> multiple destinations): * src, src_agent -- source pointer and agent (must be GPU) - * dst_list -- caller-owned array of num_dsts destination pointers - * dst_agent_list -- caller-owned array of num_dsts destination agents + * dst_list -- 
caller-owned array of num_entries destination pointers + * dst_agent_list -- caller-owned array of num_entries destination agents * size -- copy size in bytes (same for every destination) - * num_dsts -- number of entries in dst_list / dst_agent_list (>= 1, <= 1024) + * num_entries -- number of entries in dst_list / dst_agent_list (>= 1, <= 1024) * - * LINEAR_SWAP (exchange contents of two buffers): + * LINEAR_SWAP (exchange contents of two buffers, multi-entry when num_entries > 0): + * src_list -- caller-owned array of num_entries source pointers + * src_agent -- common source agent (must be GPU) + * dst_list -- caller-owned array of num_entries destination pointers + * dst_agent_list -- caller-owned array of num_entries destination agents + * size_list -- caller-owned array of num_entries swap sizes in bytes + * num_entries -- number of entries (>= 1, <= 1024) + * + * LINEAR_SWAP (single, when num_entries == 0): * src, src_agent -- first buffer pointer and agent (modified in place) * dst, dst_agent -- second buffer pointer and agent (modified in place) * src_size -- size of the source region in bytes * dst_size -- size of the destination region in bytes - * num_dsts -- must be 0 + * num_entries -- 0 * * LINEAR_INDIRECT_SRC (source address resolved via indirection): * src -- void** pointing to the actual source address @@ -1992,7 +2000,7 @@ * For all INDIRECT_* types: * src_agent, dst_agent -- source and destination agents * size -- copy size in bytes - * num_dsts -- must be 0 + * num_entries -- must be 0 * * Future-proofing unions (reserved, must not be used): * src_agent_list -- reserved for future gather operations @@ -2001,12 +2009,12 @@ typedef struct hsa_amd_memory_copy_op_s { uint16_t version; /**< Struct version. Must be HSA_AMD_MEMORY_COPY_OP_VERSION.
*/ uint16_t type; /**< Operation type (hsa_amd_memory_copy_op_type_t) */ - uint16_t num_dsts; /**< LINEAR multi / BROADCAST: number of entries; others: must be 0 */ + uint16_t num_entries; /**< LINEAR multi / BROADCAST / SWAP: number of entries; others: must be 0 */ uint16_t traffic_class; /**< QoS traffic class. 0 = default/unspecified. */ hsa_signal_t completion_signal; /**< Completion signal for this operation */ union { void* src; /**< Source pointer (or void** for INDIRECT_SRC/SRCDST) */ - void** src_list; /**< LINEAR multi: caller-owned array of num_dsts source pointers */ + void** src_list; /**< LINEAR multi / SWAP multi: caller-owned array of num_entries source pointers */ }; union { hsa_agent_t src_agent; /**< Source agent */ @@ -2014,11 +2022,11 @@ typedef struct hsa_amd_memory_copy_op_s { }; union { hsa_agent_t dst_agent; /**< Destination agent (single-dst types) */ - hsa_agent_t* dst_agent_list; /**< LINEAR multi / BROADCAST: caller-owned array of num_dsts destination agents */ + hsa_agent_t* dst_agent_list; /**< LINEAR multi / BROADCAST / SWAP multi: caller-owned array of num_entries destination agents */ }; union { void* dst; /**< Destination pointer (or void** for INDIRECT_DST/SRCDST) */ - void** dst_list; /**< LINEAR multi / BROADCAST: caller-owned array of num_dsts destination pointers */ + void** dst_list; /**< LINEAR multi / BROADCAST / SWAP multi: caller-owned array of num_entries destination pointers */ }; union { struct { @@ -2030,7 +2038,7 @@ typedef struct hsa_amd_memory_copy_op_s { size_t dst_size; /**< SWAP: destination region size in bytes */ }; struct { - size_t* size_list; /**< LINEAR multi: caller-owned array of num_dsts copy sizes */ + size_t* size_list; /**< LINEAR multi / SWAP multi: caller-owned array of num_entries copy sizes */ size_t reserved0; /**< Must be 0 for LINEAR multi */ }; }; @@ -2065,7 +2073,7 @@ typedef struct hsa_amd_memory_copy_op_s { * * Each operation is self-describing via its @c type field.
A BROADCAST operation * is a single op that copies one source to multiple destinations via @c dst_list - * and @c num_dsts. A SWAP operation exchanges two buffers using @c src_size and + * and @c num_entries. A SWAP operation exchanges buffer contents, either batched via @c num_entries or as a single pair using @c src_size and * @c dst_size. * * @param[in] copy_ops Array of copy operation descriptors.