Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -1810,6 +1810,8 @@ initialize_rocprofv3()
"force configuration");
}

// Fix ROCM-1214: rocplaycap child skips configure, client_identifier is null
if(getenv("ROCPROFV3_PLAYBACK_CHILD") != nullptr) return;
ROCP_FATAL_IF(!client_identifier) << "nullptr to client identifier!";
ROCP_FATAL_IF(!client_finalizer && !tool::get_config().list_metrics)
<< "nullptr to client finalizer!"; // exception for listing metrics
Expand Down Expand Up @@ -3232,12 +3234,19 @@ wait_pid(pid_t _pid, int _opts = 0)
int _status = 0;
pid_t _pid_v = -1;
_opts |= WUNTRACED;
// Fix ROCM-1214: add timeout to avoid deadlock when parent/child wait each other
auto _deadline = std::chrono::steady_clock::now() + std::chrono::seconds{5};
do
{
if((_opts & WNOHANG) > 0)
{
std::this_thread::yield();
std::this_thread::sleep_for(std::chrono::milliseconds{100});
if(std::chrono::steady_clock::now() > _deadline)
{
ROCP_WARNING << fmt::format("wait_pid timeout waiting for child {}", _pid);
return std::nullopt;
}
}
_pid_v = waitpid(_pid, &_status, _opts);
} while(_pid_v == 0);
Expand Down Expand Up @@ -3418,8 +3427,19 @@ rocprofv3_error_signal_handler(int signo, siginfo_t* info, void* ucontext)
this_func,
signo);

finalize_rocprofv3(this_func);
if(tool::get_config().enable_process_sync) wait_peer_finished(this_pid, this_ppid);
// Fix ROCM-1214: skip finalize on SIGABRT — HSA already torn down
if(signo != SIGABRT)
{
finalize_rocprofv3(this_func);
if(tool::get_config().enable_process_sync) wait_peer_finished(this_pid, this_ppid);
}
else
{
ROCP_WARNING << "skipping finalize_rocprofv3 on SIGABRT to avoid re-entry";
flush();
generate_output(cleanup_mode::destroy);
_exit(134);
}

ROCP_INFO << fmt::format(
"[PPID={}][PID={}][TID={}][{}] rocprofv3 finalizing after signal {}... complete",
Expand All @@ -3429,7 +3449,8 @@ rocprofv3_error_signal_handler(int signo, siginfo_t* info, void* ucontext)
this_func,
signo);

if(get_chained_signals().at(signo))
// Fix ROCM-1214: skip chained handler for SIGABRT to avoid recursive abort
if(signo != SIGABRT && get_chained_signals().at(signo))
{
ROCP_INFO << fmt::format(
"[PPID={}][PID={}][TID={}][{}] rocprofv3 found chained signal handler for {}",
Expand Down Expand Up @@ -3517,6 +3538,8 @@ rocprofiler_configure(uint32_t version,
uint32_t priority,
rocprofiler_client_id_t* id)
{
// Fix ROCM-1214: skip initialization in rocplaycap child processes
if(getenv("ROCPROFV3_PLAYBACK_CHILD") != nullptr) return nullptr;
initialize_logging();

// set the client name
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -192,8 +192,9 @@ agent_async_handler(hsa_signal_value_t /*signal_v*/, void* data)
auto* buf = buffer::get_buffer(callback_data.buffer.handle);
if(!buf && callback_data.buffer != rocprofiler_buffer_id_t{.handle = 0})
{
ROCP_FATAL << fmt::format("Buffer {} destroyed before record was written",
callback_data.buffer.handle);
// Fix ROCM-1214: buffer destroyed before AQL completion callback (race on teardown)
ROCP_WARNING << fmt::format("Buffer {} destroyed before record was written (skipping)",
callback_data.buffer.handle);
return false;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,10 @@ class consumer_thread_t
valid.store(false);
cv.notify_all();

if(!exited) cv.wait(lk, [&] { return exited.load(); });
if(!exited)
cv.wait_for(lk, std::chrono::seconds(5), [&] {
return exited.load();
}); // Fix ROCM-1214: timeout to avoid hang
if(consumer.joinable()) consumer.join();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,14 @@ proccess_completed_cb(completed_cb_params_t&& params)

if(info->buffer)
{
buf = CHECK_NOTNULL(buffer::get_buffer(info->buffer->handle));
// Fix ROCM-1214: buffer may be destroyed before AQL callback (race on teardown)
buf = buffer::get_buffer(info->buffer->handle);
if(!buf)
{
ROCP_WARNING << fmt::format(
"Buffer {} destroyed before sample was processed (skipping)", info->buffer->handle);
return;
}
}

auto _corr_id_v =
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -542,8 +542,9 @@ queue_controller_sync()
// Finalization hook for the queue controller: walks the controller's queues through
// iterate_queues().
// NOTE(review): this text was captured from a rendered diff without +/- markers, so the
// two iterate_queues() lines below look like the pre-change (sync) and post-change (no-op)
// versions of one statement rather than two statements meant to coexist — confirm against
// the real file before relying on this reading.
void
queue_controller_fini()
{
// Fix ROCM-1214: skip sync() during fini — HSA runtime may already be torn down
if(get_queue_controller())
// Calls sync() on every queue; presumably the line the fix removes — TODO confirm.
get_queue_controller()->iterate_queues([](const Queue* _queue) { _queue->sync(); });
// Visits every queue but discards it (the cast to void only silences the unused
// parameter); presumably the replacement line. As written here it is not covered by the
// null check above — TODO confirm.
get_queue_controller()->iterate_queues([](const Queue* _queue) { (void) _queue; });
}

void
Expand Down
Loading