Skip to content

Commit e2155c1

Browse files
committed
[rocprofiler-sdk] - unable to collect PMC data
Fix unable to collect PMC data when running rocprofv3 --pmc with roccap play. The issue caused SSH disconnect and node destabilization due to duplicate /dev/kfd opens in rocplaycap child processes, HSA runtime teardown race conditions, and signal handler deadlock. Fix profiler initialization, teardown race conditions, and signal handler issues to ensure stable PMC data collection with roccap play. Note: companion fixes for rocplaycap will be submitted separately.
1 parent 3ad5411 commit e2155c1

File tree

5 files changed

+43
-8
lines changed

5 files changed

+43
-8
lines changed

projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/tool.cpp

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1810,6 +1810,8 @@ initialize_rocprofv3()
18101810
"force configuration");
18111811
}
18121812

1813+
// Fix ROCM-1214: rocplaycap child skips configure, client_identifier is null
1814+
if(getenv("ROCPROFV3_PLAYBACK_CHILD") != nullptr) return;
18131815
ROCP_FATAL_IF(!client_identifier) << "nullptr to client identifier!";
18141816
ROCP_FATAL_IF(!client_finalizer && !tool::get_config().list_metrics)
18151817
<< "nullptr to client finalizer!"; // exception for listing metrics
@@ -3232,12 +3234,19 @@ wait_pid(pid_t _pid, int _opts = 0)
32323234
int _status = 0;
32333235
pid_t _pid_v = -1;
32343236
_opts |= WUNTRACED;
3237+
// Fix ROCM-1214: add timeout to avoid deadlock when parent/child wait each other
3238+
auto _deadline = std::chrono::steady_clock::now() + std::chrono::seconds{5};
32353239
do
32363240
{
32373241
if((_opts & WNOHANG) > 0)
32383242
{
32393243
std::this_thread::yield();
32403244
std::this_thread::sleep_for(std::chrono::milliseconds{100});
3245+
if(std::chrono::steady_clock::now() > _deadline)
3246+
{
3247+
ROCP_WARNING << fmt::format("wait_pid timeout waiting for child {}", _pid);
3248+
return std::nullopt;
3249+
}
32413250
}
32423251
_pid_v = waitpid(_pid, &_status, _opts);
32433252
} while(_pid_v == 0);
@@ -3418,8 +3427,19 @@ rocprofv3_error_signal_handler(int signo, siginfo_t* info, void* ucontext)
34183427
this_func,
34193428
signo);
34203429

3421-
finalize_rocprofv3(this_func);
3422-
if(tool::get_config().enable_process_sync) wait_peer_finished(this_pid, this_ppid);
3430+
// Fix ROCM-1214: skip finalize on SIGABRT — HSA already torn down
3431+
if(signo != SIGABRT)
3432+
{
3433+
finalize_rocprofv3(this_func);
3434+
if(tool::get_config().enable_process_sync) wait_peer_finished(this_pid, this_ppid);
3435+
}
3436+
else
3437+
{
3438+
ROCP_WARNING << "skipping finalize_rocprofv3 on SIGABRT to avoid re-entry";
3439+
flush();
3440+
generate_output(cleanup_mode::destroy);
3441+
_exit(134);
3442+
}
34233443

34243444
ROCP_INFO << fmt::format(
34253445
"[PPID={}][PID={}][TID={}][{}] rocprofv3 finalizing after signal {}... complete",
@@ -3429,7 +3449,8 @@ rocprofv3_error_signal_handler(int signo, siginfo_t* info, void* ucontext)
34293449
this_func,
34303450
signo);
34313451

3432-
if(get_chained_signals().at(signo))
3452+
// Fix ROCM-1214: skip chained handler for SIGABRT to avoid recursive abort
3453+
if(signo != SIGABRT && get_chained_signals().at(signo))
34333454
{
34343455
ROCP_INFO << fmt::format(
34353456
"[PPID={}][PID={}][TID={}][{}] rocprofv3 found chained signal handler for {}",
@@ -3517,6 +3538,8 @@ rocprofiler_configure(uint32_t version,
35173538
uint32_t priority,
35183539
rocprofiler_client_id_t* id)
35193540
{
3541+
// Fix ROCM-1214: skip initialization in rocplaycap child processes
3542+
if(getenv("ROCPROFV3_PLAYBACK_CHILD") != nullptr) return nullptr;
35203543
initialize_logging();
35213544

35223545
// set the client name

projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/device_counting.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -192,8 +192,9 @@ agent_async_handler(hsa_signal_value_t /*signal_v*/, void* data)
192192
auto* buf = buffer::get_buffer(callback_data.buffer.handle);
193193
if(!buf && callback_data.buffer != rocprofiler_buffer_id_t{.handle = 0})
194194
{
195-
ROCP_FATAL << fmt::format("Buffer {} destroyed before record was written",
196-
callback_data.buffer.handle);
195+
// Fix ROCM-1214: buffer destroyed before AQL completion callback (race on teardown)
196+
ROCP_WARNING << fmt::format("Buffer {} destroyed before record was written (skipping)",
197+
callback_data.buffer.handle);
197198
return false;
198199
}
199200

projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/sample_consumer.hpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,10 @@ class consumer_thread_t
6262
valid.store(false);
6363
cv.notify_all();
6464

65-
if(!exited) cv.wait(lk, [&] { return exited.load(); });
65+
if(!exited)
66+
cv.wait_for(lk, std::chrono::seconds(5), [&] {
67+
return exited.load();
68+
}); // Fix ROCM-1214: timeout to avoid hang
6669
if(consumer.joinable()) consumer.join();
6770
}
6871

projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/sample_processing.cpp

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,14 @@ proccess_completed_cb(completed_cb_params_t&& params)
7272

7373
if(info->buffer)
7474
{
75-
buf = CHECK_NOTNULL(buffer::get_buffer(info->buffer->handle));
75+
// Fix ROCM-1214: buffer may be destroyed before AQL callback (race on teardown)
76+
buf = buffer::get_buffer(info->buffer->handle);
77+
if(!buf)
78+
{
79+
ROCP_WARNING << fmt::format(
80+
"Buffer {} destroyed before sample was processed (skipping)", info->buffer->handle);
81+
return;
82+
}
7683
}
7784

7885
auto _corr_id_v =

projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/queue_controller.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -542,8 +542,9 @@ queue_controller_sync()
542542
void
543543
queue_controller_fini()
544544
{
545+
// Fix ROCM-1214: skip sync() during fini — HSA runtime may already be torn down
545546
if(get_queue_controller())
546-
get_queue_controller()->iterate_queues([](const Queue* _queue) { _queue->sync(); });
547+
get_queue_controller()->iterate_queues([](const Queue* _queue) { (void) _queue; });
547548
}
548549

549550
void

0 commit comments

Comments
 (0)