Skip to content

Commit a5865f6

Browse files
committed
rocr/clr: Add SDMA linear swap copy support
- Add SDMA_PKT_COPY_LINEAR_SWAP packet (SDMA_SUBOP_COPY_SWAP=9) with 30-bit count for gfx94X/gfx95X - Add BlitSdma::BuildSwapCopyCommand and SubmitLinearSwapBody to build and submit swap packets via the existing prologue/body/epilogue path - Refactor DmaCopyFanOut into DmaCopyFanOutOp parameterised by hsa_amd_memory_copy_op_type_t so copy and swap share the same fan-out logic without code duplication - Add GpuAgent::DmaCopySwap and wire HSA_AMD_MEMORY_COPY_OP_LINEAR_SWAP in DmaCopyBatch (rejects num_entries==0; swap always uses list form) - Check SwapSupported() early in DmaCopyFanOutOp before allocating signals to avoid resource leaks on unsupported hardware - CLR rocrCopyBufferBatch: group swap ops into multi-entry ops using the same MultiArrays struct as linear multi, bypassing broadcast grouping; set src_agent for validation routing; assert src_size == dst_size (asymmetric swap reserved for future use) - Update hsa_ext_amd.cpp validation for multi-entry swap (num_entries, src_list/dst_list/size_list, reserved0) and single-entry swap - Rename num_dsts -> num_entries across the public API and all callers; keep num_dsts as deprecated union alias for backward compatibility - Update LINEAR_SWAP docs to describe both multi-entry and single-entry forms Made-with: Cursor
1 parent 7beafab commit a5865f6

File tree

11 files changed

+348
-84
lines changed

11 files changed

+348
-84
lines changed

projects/clr/hipamd/src/hip_memory.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2806,6 +2806,14 @@ hipError_t ihipMemcpyBatch(void** dsts, void** srcs, size_t* sizes, size_t count
28062806
}
28072807
}
28082808

2809+
if (attrs != nullptr && !stream.device().settings().sdma_swap_supported_) {
2810+
for (size_t i = 0; i < numAttrs; ++i) {
2811+
if (attrs[i].flags & hipMemcpyFlagExtOpSwap) {
2812+
return hipErrorNotSupported;
2813+
}
2814+
}
2815+
}
2816+
28092817
// Classify copies by type and group them
28102818
std::vector<size_t> bufferCopyIndices;
28112819
std::vector<size_t> hostToHostIndices;

projects/clr/rocclr/device/device.hpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -693,7 +693,8 @@ class Settings : public amd::HeapObject {
693693
uint gwsInitSupported_ : 1; //!< Check if GWS is supported on this machine.
694694
uint kernel_arg_opt_ : 1; //!< Enables kernel arg optimization for blit kernels
695695
uint kernel_arg_impl_ : 2; //!< Kernel argument implementation
696-
uint reserved_ : 14;
696+
uint sdma_swap_supported_ : 1; //!< SDMA linear swap copy (gfx94x/gfx95x)
697+
uint reserved_ : 13;
697698
};
698699
uint value_;
699700
};

projects/clr/rocclr/device/rocm/rocblit.cpp

Lines changed: 52 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -786,7 +786,26 @@ bool DmaBlitManager::rocrCopyBufferBatch(const std::vector<hsa_amd_memory_copy_o
786786
};
787787
std::map<uint64_t, AgentGroup> agent_groups;
788788

789+
struct MultiArrays {
790+
std::vector<void*> srcs;
791+
std::vector<void*> dsts;
792+
std::vector<hsa_agent_t> dst_agents;
793+
std::vector<size_t> sizes;
794+
};
795+
796+
MultiArrays swapPending;
797+
hsa_agent_t swapSrcAgent = {};
798+
789799
for (const auto& op : ops) {
800+
if (op.type == HSA_AMD_MEMORY_COPY_OP_LINEAR_SWAP) {
801+
assert(op.src_size == op.dst_size && "Asymmetric swap not yet supported");
802+
if (swapPending.srcs.empty()) swapSrcAgent = op.src_agent;
803+
swapPending.srcs.push_back(op.src);
804+
swapPending.dsts.push_back(op.dst);
805+
swapPending.dst_agents.push_back(op.dst_agent);
806+
swapPending.sizes.push_back(op.src_size);
807+
continue;
808+
}
790809
auto& ag = agent_groups[op.src_agent.handle];
791810
ag.src_agent = op.src_agent;
792811
BcastKey bkey{op.src, op.size};
@@ -800,17 +819,27 @@ bool DmaBlitManager::rocrCopyBufferBatch(const std::vector<hsa_amd_memory_copy_o
800819

801820
gpu().Barriers().SetActiveEngine(engine);
802821

803-
struct MultiArrays {
804-
std::vector<void*> srcs;
805-
std::vector<void*> dsts;
806-
std::vector<hsa_agent_t> dst_agents;
807-
std::vector<size_t> sizes;
808-
};
809822
std::vector<MultiArrays> multiStore;
810823
multiStore.reserve(agent_groups.size());
811824

812825
std::vector<hsa_amd_memory_copy_op_t> finalOps;
813826

827+
if (!swapPending.srcs.empty()) {
828+
multiStore.push_back(std::move(swapPending));
829+
auto& stored = multiStore.back();
830+
831+
hsa_amd_memory_copy_op_t swap = {};
832+
swap.version = HSA_AMD_MEMORY_COPY_OP_VERSION;
833+
swap.type = HSA_AMD_MEMORY_COPY_OP_LINEAR_SWAP;
834+
swap.src_agent = swapSrcAgent;
835+
swap.src_list = stored.srcs.data();
836+
swap.dst_list = stored.dsts.data();
837+
swap.dst_agent_list = stored.dst_agents.data();
838+
swap.size_list = stored.sizes.data();
839+
swap.num_entries = static_cast<uint16_t>(stored.srcs.size());
840+
finalOps.push_back(swap);
841+
}
842+
814843
for (auto& [agent_handle, ag] : agent_groups) {
815844
MultiArrays pending;
816845

@@ -823,7 +852,7 @@ bool DmaBlitManager::rocrCopyBufferBatch(const std::vector<hsa_amd_memory_copy_o
823852
bcast.src_agent = ag.src_agent;
824853
bcast.dst_list = be.dsts.data();
825854
bcast.dst_agent_list = be.dst_agents.data();
826-
bcast.num_dsts = static_cast<uint16_t>(be.dsts.size());
855+
bcast.num_entries = static_cast<uint16_t>(be.dsts.size());
827856
bcast.size = be.tmpl.size;
828857
finalOps.push_back(bcast);
829858
} else {
@@ -848,7 +877,7 @@ bool DmaBlitManager::rocrCopyBufferBatch(const std::vector<hsa_amd_memory_copy_o
848877
multi.dst_list = stored.dsts.data();
849878
multi.dst_agent_list = stored.dst_agents.data();
850879
multi.size_list = stored.sizes.data();
851-
multi.num_dsts = static_cast<uint16_t>(stored.srcs.size());
880+
multi.num_entries = static_cast<uint16_t>(stored.srcs.size());
852881
finalOps.push_back(multi);
853882
} else if (pending.srcs.size() == 1) {
854883
hsa_amd_memory_copy_op_t linear = {};
@@ -872,20 +901,30 @@ bool DmaBlitManager::rocrCopyBufferBatch(const std::vector<hsa_amd_memory_copy_o
872901
for (size_t i = 0; i < finalOps.size(); ++i) {
873902
const auto& op = finalOps[i];
874903
if (op.type == HSA_AMD_MEMORY_COPY_OP_LINEAR_BROADCAST) {
875-
for (uint32_t d = 0; d < op.num_dsts; ++d) {
904+
for (uint32_t d = 0; d < op.num_entries; ++d) {
876905
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY2,
877906
"HSA BatchCopy Broadcast [%u/%u] engineOp=%s, src=%p, dst=%p, "
878907
"size=%zu, wait_event=0x%zx, completion_signal=0x%zx",
879-
d + 1, op.num_dsts, EngineOpName(engine), op.src, op.dst_list[d],
908+
d + 1, op.num_entries, EngineOpName(engine), op.src, op.dst_list[d],
880909
op.size, (wait_events.size() != 0) ? wait_events[0].handle : 0,
881910
op.completion_signal.handle);
882911
}
883-
} else if (op.type == HSA_AMD_MEMORY_COPY_OP_LINEAR && op.num_dsts > 0) {
884-
for (uint32_t d = 0; d < op.num_dsts; ++d) {
912+
} else if (op.type == HSA_AMD_MEMORY_COPY_OP_LINEAR_SWAP) {
913+
for (uint32_t d = 0; d < op.num_entries; ++d) {
914+
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY2,
915+
"HSA BatchCopy Swap [%u/%u] engineOp=%s, addr_a=%p, addr_b=%p, "
916+
"size=%zu, wait_event=0x%zx, completion_signal=0x%zx",
917+
d + 1, op.num_entries, EngineOpName(engine), op.src_list[d],
918+
op.dst_list[d], op.size_list[d],
919+
(wait_events.size() != 0) ? wait_events[0].handle : 0,
920+
op.completion_signal.handle);
921+
}
922+
} else if (op.type == HSA_AMD_MEMORY_COPY_OP_LINEAR && op.num_entries > 0) {
923+
for (uint32_t d = 0; d < op.num_entries; ++d) {
885924
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY2,
886925
"HSA BatchCopy Multi [%u/%u] engineOp=%s, src=%p, dst=%p, "
887926
"size=%zu, wait_event=0x%zx, completion_signal=0x%zx",
888-
d + 1, op.num_dsts, EngineOpName(engine), op.src_list[d],
927+
d + 1, op.num_entries, EngineOpName(engine), op.src_list[d],
889928
op.dst_list[d], op.size_list[d],
890929
(wait_events.size() != 0) ? wait_events[0].handle : 0,
891930
op.completion_signal.handle);

projects/clr/rocclr/device/rocm/rocsettings.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,10 @@ bool Settings::create(bool fullProfile, const amd::Isa& isa, bool enableXNACK, b
144144
queue_pipe_dist_ = DEBUG_HIP_DYNAMIC_QUEUES == 2 ? true : false;
145145
}
146146

147+
if (gfxipMajor == 9 && gfxipMinor >= 4) {
148+
sdma_swap_supported_ = true;
149+
}
150+
147151
setKernelArgImpl(isa, isXgmi, hasValidHDPFlush);
148152

149153
if (gfxipMajor >= 10) {

projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_blit_sdma.h

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,12 @@ class BlitSdmaBase : public core::Blit {
103103
virtual hsa_status_t SubmitLinearCopyBody(void* dst, const void* src, size_t size,
104104
core::Signal& prologue_signal,
105105
core::Signal& body_signal) = 0;
106+
107+
virtual hsa_status_t SubmitLinearSwapBody(void* addr_a, void* addr_b, size_t size,
108+
core::Signal& prologue_signal,
109+
core::Signal& body_signal) = 0;
110+
111+
virtual bool SwapSupported() const = 0;
106112
};
107113

108114
template <bool useGCR> class BlitSdma : public BlitSdmaBase {
@@ -207,6 +213,12 @@ template <bool useGCR> class BlitSdma : public BlitSdmaBase {
207213
core::Signal& prologue_signal,
208214
core::Signal& body_signal) override;
209215

216+
hsa_status_t SubmitLinearSwapBody(void* addr_a, void* addr_b, size_t size,
217+
core::Signal& prologue_signal,
218+
core::Signal& body_signal) override;
219+
220+
bool SwapSupported() const override { return swap_supported_; }
221+
210222
private:
211223
/// @brief Acquires the address into queue buffer where a new command
212224
/// packet of specified size could be written. The address that is
@@ -261,6 +273,9 @@ template <bool useGCR> class BlitSdma : public BlitSdmaBase {
261273
void BuildBroadcastCopyCommand(char* cmd_addr, uint32_t num_copy_command,
262274
void* dst1, void* dst2, const void* src, size_t size);
263275

276+
void BuildSwapCopyCommand(char* cmd_addr, uint32_t num_copy_command,
277+
void* addr_a, void* addr_b, size_t size);
278+
264279
void BuildCopyRectCommand(const std::function<void*(size_t)>& append,
265280
const hsa_pitched_ptr_t* dst, const hsa_dim3_t* dst_offset,
266281
const hsa_pitched_ptr_t* src, const hsa_dim3_t* src_offset,
@@ -334,6 +349,8 @@ template <bool useGCR> class BlitSdma : public BlitSdmaBase {
334349

335350
static const uint32_t broadcast_copy_command_size_;
336351

352+
static const uint32_t swap_copy_command_size_;
353+
337354
static const uint32_t fill_command_size_;
338355

339356
static const uint32_t fence_command_size_;
@@ -392,6 +409,9 @@ template <bool useGCR> class BlitSdma : public BlitSdmaBase {
392409

393410
/// True if SDMA supports multicast copy (one src -> multiple dst).
394411
bool multicast_supported_;
412+
413+
/// True if SDMA supports linear swap copy (gfx94X+).
414+
bool swap_supported_;
395415
};
396416

397417

projects/rocr-runtime/runtime/hsa-runtime/core/inc/amd_gpu_agent.h

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -748,16 +748,28 @@ class GpuAgent : public GpuAgentInt {
748748
const hsa_amd_memory_copy_op_t& op,
749749
std::vector<core::Signal*>& dep_signals);
750750

751-
// Multi-linear copy: LINEAR op with num_dsts > 0, independent copies
751+
// Multi-linear copy: LINEAR op with num_entries > 0, independent copies
752752
// (different src/dst/size per entry) sharing a single completion signal.
753753
// Uses prologue/body/epilogue fan-out across available SDMA engines.
754754
hsa_status_t DmaCopyMulti(
755755
const hsa_amd_memory_copy_op_t& op,
756756
std::vector<core::Signal*>& dep_signals);
757757

758-
// Common fan-out implementation shared by DmaCopyBroadcast and DmaCopyMulti.
759-
// Submits prologue, per-entry copy bodies, and epilogue with one signal.
760-
hsa_status_t DmaCopyFanOut(
758+
// Linear swap: exchanges the contents of src and dst buffers.
759+
// Only supported on gfx94X / gfx95X. Uses DmaCopyFanOutOp with
760+
// HSA_AMD_MEMORY_COPY_OP_LINEAR_SWAP.
761+
hsa_status_t DmaCopySwap(
762+
const hsa_amd_memory_copy_op_t& op,
763+
std::vector<core::Signal*>& dep_signals);
764+
765+
// Common fan-out implementation shared by DmaCopyBroadcast, DmaCopyMulti,
766+
// and swap operations. Submits prologue, per-entry bodies (selected by
767+
// @p op), and epilogue with one signal.
768+
// @p op is the hsa_amd_memory_copy_op_type_t from the public API; only
769+
// HSA_AMD_MEMORY_COPY_OP_LINEAR and HSA_AMD_MEMORY_COPY_OP_LINEAR_SWAP are
770+
// currently supported.
771+
hsa_status_t DmaCopyFanOutOp(
772+
hsa_amd_memory_copy_op_type_t op,
761773
core::Signal& out_signal,
762774
std::vector<core::Signal*>& dep_signals,
763775
uint16_t num_entries,

projects/rocr-runtime/runtime/hsa-runtime/core/inc/sdma_registers.h

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ const unsigned int SDMA_SUBOP_COPY_LINEAR = 0;
6464
// Broadcast linear copy uses the linear sub-op with the broadcast packet format.
6565
const unsigned int SDMA_SUBOP_COPY_LINEAR_BROADCAST = SDMA_SUBOP_COPY_LINEAR;
6666
const unsigned int SDMA_SUBOP_COPY_LINEAR_RECT = 4;
67+
const unsigned int SDMA_SUBOP_COPY_SWAP = 9;
6768
const unsigned int SDMA_SUBOP_TIMESTAMP_GET_GLOBAL = 2;
6869
const unsigned int SDMA_SUBOP_USER_GCR = 1;
6970
const unsigned int SDMA_ATOMIC_ADD64 = 47;
@@ -223,6 +224,70 @@ typedef struct SDMA_PKT_COPY_LINEAR_BROADCAST_TAG {
223224
static const size_t kMaxSize_ = 0x3fffe0;
224225
} SDMA_PKT_COPY_LINEAR_BROADCAST;
225226

227+
// linear copy (swap) packet (SDMA5.2+)
228+
// Atomically swaps data between Address A and Address B.
229+
typedef struct SDMA_PKT_COPY_LINEAR_SWAP_TAG {
230+
union {
231+
struct {
232+
unsigned int op : 8;
233+
unsigned int sub_op : 8;
234+
unsigned int extra_info : 16;
235+
};
236+
unsigned int DW_0_DATA;
237+
} HEADER_UNION;
238+
239+
union {
240+
struct {
241+
unsigned int count : 30;
242+
unsigned int reserved_0 : 2;
243+
};
244+
unsigned int DW_1_DATA;
245+
} COUNT_UNION;
246+
247+
union {
248+
struct {
249+
unsigned int reserved_0 : 16;
250+
unsigned int dst_sw : 2;
251+
unsigned int dst_cache_policy : 3;
252+
unsigned int reserved_1 : 3;
253+
unsigned int src_sw : 2;
254+
unsigned int src_cache_policy : 3;
255+
unsigned int reserved_2 : 3;
256+
};
257+
unsigned int DW_2_DATA;
258+
} PARAMETER_UNION;
259+
260+
union {
261+
struct {
262+
unsigned int addr_a_31_0 : 32;
263+
};
264+
unsigned int DW_3_DATA;
265+
} ADDR_A_LO_UNION;
266+
267+
union {
268+
struct {
269+
unsigned int addr_a_63_32 : 32;
270+
};
271+
unsigned int DW_4_DATA;
272+
} ADDR_A_HI_UNION;
273+
274+
union {
275+
struct {
276+
unsigned int addr_b_31_0 : 32;
277+
};
278+
unsigned int DW_5_DATA;
279+
} ADDR_B_LO_UNION;
280+
281+
union {
282+
struct {
283+
unsigned int addr_b_63_32 : 32;
284+
};
285+
unsigned int DW_6_DATA;
286+
} ADDR_B_HI_UNION;
287+
288+
static const size_t kMaxSize_ = 0x3fffffe0;
289+
} SDMA_PKT_COPY_LINEAR_SWAP;
290+
226291
// linear sub-window (pre-GFX12)
227292
typedef struct SDMA_PKT_COPY_LINEAR_RECT_TAG {
228293
static const unsigned int pitch_bits = 19;

0 commit comments

Comments
 (0)