Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions projects/clr/hipamd/src/hip_memory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2806,6 +2806,14 @@ hipError_t ihipMemcpyBatch(void** dsts, void** srcs, size_t* sizes, size_t count
}
}

if (attrs != nullptr && !stream.device().settings().sdma_swap_supported_) {
for (size_t i = 0; i < numAttrs; ++i) {
if (attrs[i].flags & hipMemcpyFlagExtOpSwap) {
return hipErrorNotSupported;
}
}
}

// Classify copies by type and group them
std::vector<size_t> bufferCopyIndices;
std::vector<size_t> hostToHostIndices;
Expand Down
3 changes: 2 additions & 1 deletion projects/clr/rocclr/device/device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -693,7 +693,8 @@ class Settings : public amd::HeapObject {
uint gwsInitSupported_ : 1; //!< Check if GWS is supported on this machine.
uint kernel_arg_opt_ : 1; //!< Enables kernel arg optimization for blit kernels
uint kernel_arg_impl_ : 2; //!< Kernel argument implementation
uint reserved_ : 14;
uint sdma_swap_supported_ : 1; //!< SDMA linear swap copy (gfx94x/gfx95x)
uint reserved_ : 13;
};
uint value_;
};
Expand Down
65 changes: 52 additions & 13 deletions projects/clr/rocclr/device/rocm/rocblit.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -786,7 +786,26 @@ bool DmaBlitManager::rocrCopyBufferBatch(const std::vector<hsa_amd_memory_copy_o
};
std::map<uint64_t, AgentGroup> agent_groups;

struct MultiArrays {
std::vector<void*> srcs;
std::vector<void*> dsts;
std::vector<hsa_agent_t> dst_agents;
std::vector<size_t> sizes;
};

MultiArrays swapPending;
hsa_agent_t swapSrcAgent = {};

for (const auto& op : ops) {
if (op.type == HSA_AMD_MEMORY_COPY_OP_LINEAR_SWAP) {
assert(op.src_size == op.dst_size && "Asymmetric swap not yet supported");
if (swapPending.srcs.empty()) swapSrcAgent = op.src_agent;
swapPending.srcs.push_back(op.src);
swapPending.dsts.push_back(op.dst);
swapPending.dst_agents.push_back(op.dst_agent);
swapPending.sizes.push_back(op.src_size);
continue;
}
auto& ag = agent_groups[op.src_agent.handle];
ag.src_agent = op.src_agent;
BcastKey bkey{op.src, op.size};
Expand All @@ -800,17 +819,27 @@ bool DmaBlitManager::rocrCopyBufferBatch(const std::vector<hsa_amd_memory_copy_o

gpu().Barriers().SetActiveEngine(engine);

struct MultiArrays {
std::vector<void*> srcs;
std::vector<void*> dsts;
std::vector<hsa_agent_t> dst_agents;
std::vector<size_t> sizes;
};
std::vector<MultiArrays> multiStore;
multiStore.reserve(agent_groups.size());

std::vector<hsa_amd_memory_copy_op_t> finalOps;

if (!swapPending.srcs.empty()) {
multiStore.push_back(std::move(swapPending));
auto& stored = multiStore.back();

hsa_amd_memory_copy_op_t swap = {};
swap.version = HSA_AMD_MEMORY_COPY_OP_VERSION;
swap.type = HSA_AMD_MEMORY_COPY_OP_LINEAR_SWAP;
swap.src_agent = swapSrcAgent;
swap.src_list = stored.srcs.data();
swap.dst_list = stored.dsts.data();
swap.dst_agent_list = stored.dst_agents.data();
swap.size_list = stored.sizes.data();
swap.num_entries = static_cast<uint16_t>(stored.srcs.size());
finalOps.push_back(swap);
}

for (auto& [agent_handle, ag] : agent_groups) {
MultiArrays pending;

Expand All @@ -823,7 +852,7 @@ bool DmaBlitManager::rocrCopyBufferBatch(const std::vector<hsa_amd_memory_copy_o
bcast.src_agent = ag.src_agent;
bcast.dst_list = be.dsts.data();
bcast.dst_agent_list = be.dst_agents.data();
bcast.num_dsts = static_cast<uint16_t>(be.dsts.size());
bcast.num_entries = static_cast<uint16_t>(be.dsts.size());
bcast.size = be.tmpl.size;
finalOps.push_back(bcast);
} else {
Expand All @@ -848,7 +877,7 @@ bool DmaBlitManager::rocrCopyBufferBatch(const std::vector<hsa_amd_memory_copy_o
multi.dst_list = stored.dsts.data();
multi.dst_agent_list = stored.dst_agents.data();
multi.size_list = stored.sizes.data();
multi.num_dsts = static_cast<uint16_t>(stored.srcs.size());
multi.num_entries = static_cast<uint16_t>(stored.srcs.size());
finalOps.push_back(multi);
} else if (pending.srcs.size() == 1) {
hsa_amd_memory_copy_op_t linear = {};
Expand All @@ -872,20 +901,30 @@ bool DmaBlitManager::rocrCopyBufferBatch(const std::vector<hsa_amd_memory_copy_o
for (size_t i = 0; i < finalOps.size(); ++i) {
const auto& op = finalOps[i];
if (op.type == HSA_AMD_MEMORY_COPY_OP_LINEAR_BROADCAST) {
for (uint32_t d = 0; d < op.num_dsts; ++d) {
for (uint32_t d = 0; d < op.num_entries; ++d) {
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY2,
"HSA BatchCopy Broadcast [%u/%u] engineOp=%s, src=%p, dst=%p, "
"size=%zu, wait_event=0x%zx, completion_signal=0x%zx",
d + 1, op.num_dsts, EngineOpName(engine), op.src, op.dst_list[d],
d + 1, op.num_entries, EngineOpName(engine), op.src, op.dst_list[d],
op.size, (wait_events.size() != 0) ? wait_events[0].handle : 0,
op.completion_signal.handle);
}
} else if (op.type == HSA_AMD_MEMORY_COPY_OP_LINEAR && op.num_dsts > 0) {
for (uint32_t d = 0; d < op.num_dsts; ++d) {
} else if (op.type == HSA_AMD_MEMORY_COPY_OP_LINEAR_SWAP) {
for (uint32_t d = 0; d < op.num_entries; ++d) {
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY2,
"HSA BatchCopy Swap [%u/%u] engineOp=%s, addr_a=%p, addr_b=%p, "
"size=%zu, wait_event=0x%zx, completion_signal=0x%zx",
d + 1, op.num_entries, EngineOpName(engine), op.src_list[d],
op.dst_list[d], op.size_list[d],
(wait_events.size() != 0) ? wait_events[0].handle : 0,
op.completion_signal.handle);
}
} else if (op.type == HSA_AMD_MEMORY_COPY_OP_LINEAR && op.num_entries > 0) {
for (uint32_t d = 0; d < op.num_entries; ++d) {
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY2,
"HSA BatchCopy Multi [%u/%u] engineOp=%s, src=%p, dst=%p, "
"size=%zu, wait_event=0x%zx, completion_signal=0x%zx",
d + 1, op.num_dsts, EngineOpName(engine), op.src_list[d],
d + 1, op.num_entries, EngineOpName(engine), op.src_list[d],
op.dst_list[d], op.size_list[d],
(wait_events.size() != 0) ? wait_events[0].handle : 0,
op.completion_signal.handle);
Expand Down
4 changes: 4 additions & 0 deletions projects/clr/rocclr/device/rocm/rocsettings.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,10 @@ bool Settings::create(bool fullProfile, const amd::Isa& isa, bool enableXNACK, b
queue_pipe_dist_ = DEBUG_HIP_DYNAMIC_QUEUES == 2 ? true : false;
}

if (gfxipMajor == 9 && gfxipMinor >= 4) {
sdma_swap_supported_ = true;
}

setKernelArgImpl(isa, isXgmi, hasValidHDPFlush);

if (gfxipMajor >= 10) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -271,7 +271,7 @@ struct formatter<hsa_amd_memory_copy_op_t>
}

auto type = static_cast<hsa_amd_memory_copy_op_type_t>(op.type);
const bool is_linear_multi = (type == HSA_AMD_MEMORY_COPY_OP_LINEAR && op.num_dsts > 0);
const bool is_linear_multi = (type == HSA_AMD_MEMORY_COPY_OP_LINEAR && op.num_entries > 0);
if(is_linear_multi && op.reserved0 != 0)
{
reserved += fmt::format(", reserved0={}", op.reserved0);
Expand All @@ -282,12 +282,12 @@ struct formatter<hsa_amd_memory_copy_op_t>
case HSA_AMD_MEMORY_COPY_OP_LINEAR_BROADCAST:
return fmt::format_to(
ctx.out(),
"[MEMORY_COPY_OP type={}, version={}, num_dsts={}, traffic_class={}, "
"[MEMORY_COPY_OP type={}, version={}, num_entries={}, traffic_class={}, "
"completion_signal={}, src={}, src_agent={}, dst_list={}, "
"dst_agent_list={}, size={}{}{}{}]",
type,
op.version,
op.num_dsts,
op.num_entries,
op.traffic_class,
op.completion_signal.handle,
fmt::ptr(op.src),
Expand All @@ -302,12 +302,12 @@ struct formatter<hsa_amd_memory_copy_op_t>
case HSA_AMD_MEMORY_COPY_OP_LINEAR_SWAP:
return fmt::format_to(
ctx.out(),
"[MEMORY_COPY_OP type={}, version={}, num_dsts={}, traffic_class={}, "
"[MEMORY_COPY_OP type={}, version={}, num_entries={}, traffic_class={}, "
"completion_signal={}, src={}, src_agent={}, dst={}, dst_agent={}, "
"src_size={}, dst_size={}{}{}{}]",
type,
op.version,
op.num_dsts,
op.num_entries,
op.traffic_class,
op.completion_signal.handle,
fmt::ptr(op.src),
Expand All @@ -325,12 +325,12 @@ struct formatter<hsa_amd_memory_copy_op_t>
{
return fmt::format_to(
ctx.out(),
"[MEMORY_COPY_OP type={}, version={}, num_dsts={}, traffic_class={}, "
"[MEMORY_COPY_OP type={}, version={}, num_entries={}, traffic_class={}, "
"completion_signal={}, src_list={}, src_agent={}, dst_list={}, "
"dst_agent_list={}, size_list={}{}{}{}]",
type,
op.version,
op.num_dsts,
op.num_entries,
op.traffic_class,
op.completion_signal.handle,
fmt::ptr(op.src_list),
Expand All @@ -344,12 +344,12 @@ struct formatter<hsa_amd_memory_copy_op_t>
}
return fmt::format_to(
ctx.out(),
"[MEMORY_COPY_OP type={}, version={}, num_dsts={}, traffic_class={}, "
"[MEMORY_COPY_OP type={}, version={}, num_entries={}, traffic_class={}, "
"completion_signal={}, src={}, src_agent={}, dst={}, dst_agent={}, "
"size={}, unused_size={}{}{}{}]",
type,
op.version,
op.num_dsts,
op.num_entries,
op.traffic_class,
op.completion_signal.handle,
fmt::ptr(op.src),
Expand All @@ -366,12 +366,12 @@ struct formatter<hsa_amd_memory_copy_op_t>
case HSA_AMD_MEMORY_COPY_OP_LINEAR_INDIRECT_SRCDST:
return fmt::format_to(
ctx.out(),
"[MEMORY_COPY_OP type={}, version={}, num_dsts={}, traffic_class={}, "
"[MEMORY_COPY_OP type={}, version={}, num_entries={}, traffic_class={}, "
"completion_signal={}, src={}, src_agent={}, dst={}, dst_agent={}, "
"size={}, unused_size={}{}{}{}]",
type,
op.version,
op.num_dsts,
op.num_entries,
op.traffic_class,
op.completion_signal.handle,
fmt::ptr(op.src),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,12 @@ class BlitSdmaBase : public core::Blit {
virtual hsa_status_t SubmitLinearCopyBody(void* dst, const void* src, size_t size,
core::Signal& prologue_signal,
core::Signal& body_signal) = 0;

virtual hsa_status_t SubmitLinearSwapBody(void* addr_a, void* addr_b, size_t size,
core::Signal& prologue_signal,
core::Signal& body_signal) = 0;

virtual bool SwapSupported() const = 0;
};

template <bool useGCR> class BlitSdma : public BlitSdmaBase {
Expand Down Expand Up @@ -207,6 +213,12 @@ template <bool useGCR> class BlitSdma : public BlitSdmaBase {
core::Signal& prologue_signal,
core::Signal& body_signal) override;

hsa_status_t SubmitLinearSwapBody(void* addr_a, void* addr_b, size_t size,
core::Signal& prologue_signal,
core::Signal& body_signal) override;

bool SwapSupported() const override { return swap_supported_; }

private:
/// @brief Acquires the address into queue buffer where a new command
/// packet of specified size could be written. The address that is
Expand Down Expand Up @@ -261,6 +273,9 @@ template <bool useGCR> class BlitSdma : public BlitSdmaBase {
void BuildBroadcastCopyCommand(char* cmd_addr, uint32_t num_copy_command,
void* dst1, void* dst2, const void* src, size_t size);

void BuildSwapCopyCommand(char* cmd_addr, uint32_t num_copy_command,
void* addr_a, void* addr_b, size_t size);

void BuildCopyRectCommand(const std::function<void*(size_t)>& append,
const hsa_pitched_ptr_t* dst, const hsa_dim3_t* dst_offset,
const hsa_pitched_ptr_t* src, const hsa_dim3_t* src_offset,
Expand Down Expand Up @@ -334,6 +349,8 @@ template <bool useGCR> class BlitSdma : public BlitSdmaBase {

static const uint32_t broadcast_copy_command_size_;

static const uint32_t swap_copy_command_size_;

static const uint32_t fill_command_size_;

static const uint32_t fence_command_size_;
Expand Down Expand Up @@ -392,6 +409,9 @@ template <bool useGCR> class BlitSdma : public BlitSdmaBase {

/// True if SDMA supports multicast copy (one src -> multiple dst).
bool multicast_supported_;

/// True if SDMA supports linear swap copy (gfx94X+).
bool swap_supported_;
};


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -748,16 +748,28 @@ class GpuAgent : public GpuAgentInt {
const hsa_amd_memory_copy_op_t& op,
std::vector<core::Signal*>& dep_signals);

// Multi-linear copy: LINEAR op with num_dsts > 0, independent copies
// Multi-linear copy: LINEAR op with num_entries > 0, independent copies
// (different src/dst/size per entry) sharing a single completion signal.
// Uses prologue/body/epilogue fan-out across available SDMA engines.
hsa_status_t DmaCopyMulti(
const hsa_amd_memory_copy_op_t& op,
std::vector<core::Signal*>& dep_signals);

// Common fan-out implementation shared by DmaCopyBroadcast and DmaCopyMulti.
// Submits prologue, per-entry copy bodies, and epilogue with one signal.
hsa_status_t DmaCopyFanOut(
// Linear swap: exchanges the contents of src and dst buffers.
// Only supported on gfx94X / gfx95X. Uses DmaCopyFanOutOp with
// HSA_AMD_MEMORY_COPY_OP_LINEAR_SWAP.
hsa_status_t DmaCopySwap(
const hsa_amd_memory_copy_op_t& op,
std::vector<core::Signal*>& dep_signals);

// Common fan-out implementation shared by DmaCopyBroadcast, DmaCopyMulti,
// and swap operations. Submits prologue, per-entry bodies (selected by
// @p op), and epilogue with one signal.
// @p op is the hsa_amd_memory_copy_op_type_t from the public API; only
// HSA_AMD_MEMORY_COPY_OP_LINEAR and HSA_AMD_MEMORY_COPY_OP_LINEAR_SWAP are
// currently supported.
hsa_status_t DmaCopyFanOutOp(
hsa_amd_memory_copy_op_type_t op,
core::Signal& out_signal,
std::vector<core::Signal*>& dep_signals,
uint16_t num_entries,
Expand Down
Loading
Loading