Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 0 additions & 7 deletions paddle/phi/api/include/compat/ATen/core/TensorBody.h
Original file line number Diff line number Diff line change
Expand Up @@ -726,13 +726,6 @@ class Tensor : public TensorBase {
void record_stream(at::Stream s) const;
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void record_stream(at::cuda::CUDAStream s) const;
// TODO(youge325): Remove after DeepEP paddle branch is updated to use
// at::Stream
#ifdef PADDLE_WITH_HIP
void record_stream(hipStream_t s) const;
#else
void record_stream(cudaStream_t s) const;
#endif
#endif

Tensor var(int dim) const { return var(at::IntArrayRef{dim}, true, false); }
Expand Down
28 changes: 1 addition & 27 deletions paddle/phi/api/include/compat/ATen/ops/record_stream.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,32 +53,6 @@ inline void Tensor::record_stream(at::Stream s) const {
inline void Tensor::record_stream(at::cuda::CUDAStream s) const {
record_stream(static_cast<at::Stream>(s));
}

// TODO(youge325): Remove after DeepEP paddle branch is updated to use
// at::Stream
#ifdef PADDLE_WITH_HIP
inline void Tensor::record_stream(hipStream_t s) const {
auto dense_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(tensor_.impl());
PD_CHECK(dense_tensor != nullptr,
"record_stream only supports DenseTensor, but got a non-dense "
"tensor implementation.");
PD_CHECK(dense_tensor->place().GetType() != phi::AllocationType::CPU,
"record_stream is not supported for CPU tensors.");
paddle::memory::RecordStream(dense_tensor->Holder(), s);
}
#else
inline void Tensor::record_stream(cudaStream_t s) const {
auto dense_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(tensor_.impl());
PD_CHECK(dense_tensor != nullptr,
"record_stream only supports DenseTensor, but got a non-dense "
"tensor implementation.");
PD_CHECK(dense_tensor->place().GetType() != phi::AllocationType::CPU,
"record_stream is not supported for CPU tensors.");
paddle::memory::RecordStream(dense_tensor->Holder(),
reinterpret_cast<gpuStream_t>(s));
}
#endif
#endif

} // namespace at
19 changes: 0 additions & 19 deletions paddle/phi/api/include/compat/c10/core/Event.h
Original file line number Diff line number Diff line change
Expand Up @@ -95,25 +95,6 @@ struct Event final {
void record(const c10::cuda::CUDAStream& stream) { record(stream.unwrap()); }
#endif

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// TODO(youge325): Remove after DeepEP paddle branch is updated to use
// c10::Stream
#ifdef PADDLE_WITH_HIP
void record(const hipStream_t& stream) {
TORCH_CHECK(device_type_ == DeviceType::CUDA,
"Raw hipStream_t recording is only supported for CUDA events.");
RecordBackendEvent(stream, phi::backends::gpu::GetCurrentDeviceId());
}
#else
void record(const cudaStream_t& stream) {
TORCH_CHECK(
device_type_ == DeviceType::CUDA,
"Raw cudaStream_t recording is only supported for CUDA events.");
RecordBackendEvent(stream, phi::backends::gpu::GetCurrentDeviceId());
}
#endif
#endif

void block(const Stream& stream) const {
if (!was_marked_for_recording_) {
return;
Expand Down
4 changes: 4 additions & 0 deletions paddle/phi/api/include/compat/c10/core/Stream.h
Original file line number Diff line number Diff line change
Expand Up @@ -105,3 +105,7 @@ struct hash<c10::Stream> {
}
};
} // namespace std

namespace at {
using c10::Stream;
}
Comment on lines +108 to +111
Copy link

Copilot AI Apr 1, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Adding namespace at { using c10::Stream; } here conflicts with an existing at::Stream definition in paddle/phi/api/include/compat/ATen/core/TensorBody.h (currently using Stream = c10::Stream;). Since TensorBody.h includes <c10/core/Stream.h>, this will trigger a redefinition error when compiling any TU that includes ATen/core/TensorBody.h.

To fix: keep a single canonical at::Stream definition (either remove the Stream alias from TensorBody.h, or drop this new at::Stream export and rely on the existing one).

Suggested change
namespace at {
using c10::Stream;
}

Copilot uses AI. Check for mistakes.
10 changes: 10 additions & 0 deletions paddle/phi/api/include/compat/c10/cuda/CUDAStream.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,16 @@ inline void initTLSCurrentStreams() {

} // namespace

inline CUDAStream make_cuda_stream(cudaStream_t raw,
c10::DeviceIndex device_index) {
c10::StreamId sid =
static_cast<c10::StreamId>(reinterpret_cast<intptr_t>(raw));
return CUDAStream(
c10::Stream(c10::Stream::UNSAFE,
c10::Device(c10::DeviceType::CUDA, device_index),
sid));
}

CUDAStream getStreamFromPool(const int priority,
c10::DeviceIndex device_index) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
Expand Down
28 changes: 0 additions & 28 deletions paddle/phi/api/include/compat/c10/cuda/CUDAStream.h
Original file line number Diff line number Diff line change
Expand Up @@ -100,12 +100,6 @@ class CUDAStream {

Device device() const { return Device(DeviceType::CUDA, device_index()); }

#ifdef PADDLE_WITH_HIP
hipStream_t raw_stream() const { return stream(); }
#else
cudaStream_t raw_stream() const { return stream(); }
#endif

struct c10::StreamData3 pack3() const {
return stream_.pack3();
}
Expand Down Expand Up @@ -139,28 +133,6 @@ class CUDAStream {
Stream stream_;
};

#ifdef PADDLE_WITH_HIP
inline CUDAStream make_cuda_stream(hipStream_t raw,
c10::DeviceIndex device_index) {
c10::StreamId sid =
static_cast<c10::StreamId>(reinterpret_cast<intptr_t>(raw));
return CUDAStream(
c10::Stream(c10::Stream::UNSAFE,
c10::Device(c10::DeviceType::CUDA, device_index),
sid));
}
#else
inline CUDAStream make_cuda_stream(cudaStream_t raw,
c10::DeviceIndex device_index) {
c10::StreamId sid =
static_cast<c10::StreamId>(reinterpret_cast<intptr_t>(raw));
return CUDAStream(
c10::Stream(c10::Stream::UNSAFE,
c10::Device(c10::DeviceType::CUDA, device_index),
sid));
}
#endif

/**
* Get the current CUDA stream for the passed CUDA device, or for the
* current device if no device index is passed.
Expand Down
18 changes: 0 additions & 18 deletions test/cpp/compat/ATen_record_stream_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -51,17 +51,6 @@ using RecordCudaStreamMethod = void (at::Tensor::*)(at::cuda::CUDAStream) const;
[[maybe_unused]] static RecordCudaStreamMethod g_record_cuda_stream_method =
&at::Tensor::record_stream;

// Raw stream type is platform-specific:
// - CUDA: cudaStream_t (CUstream_st*)
// - HIP: hipStream_t (ihipStream_t*)
// Only test the raw stream overload on CUDA builds where cudaStream_t is
// consistently defined. HIP builds use hipStream_t which is a different type.
#if defined(PADDLE_WITH_CUDA)
using RecordRawCudaStreamMethod = void (at::Tensor::*)(cudaStream_t) const;
[[maybe_unused]] static RecordRawCudaStreamMethod
g_record_raw_cuda_stream_method = &at::Tensor::record_stream;
#endif

TEST_F(RecordStreamTest, CudaTensorCurrentCudaStream) {
if (!at::cuda::is_available()) {
return;
Expand All @@ -80,13 +69,6 @@ TEST_F(RecordStreamTest, CudaTensorDefaultCudaStream) {
EXPECT_NO_THROW(cuda_tensor.record_stream(default_stream));
}

TEST_F(RecordStreamTest, CudaTensorRawCudaStream) {
if (!at::cuda::is_available()) {
return;
}
auto stream = at::cuda::getCurrentCUDAStream();
EXPECT_NO_THROW(cuda_tensor.record_stream(stream.raw_stream()));
}
#endif // PADDLE_WITH_CUDA || PADDLE_WITH_HIP

// --- Error path: CPU tensor + CPU stream (record_stream does not support CPU
Expand Down
31 changes: 0 additions & 31 deletions test/cpp/compat/c10_Event_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -40,24 +40,6 @@ TEST(EventTest, CpuEventRecordThrows) {
EXPECT_THROW(event.recordOnce(stream), std::exception);
}

// Test device_count() works in both CPU and CUDA builds
TEST(EventTest, DeviceCount) {
c10::DeviceIndex count = c10::cuda::device_count();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// In CUDA builds, should return actual device count (>= 0)
EXPECT_GE(count, 0);
#else
// In CPU-only builds, should return 0
EXPECT_EQ(count, 0);
#endif
}

#ifdef PADDLE_WITH_CUDA
using RawEventRecordMethod = void (c10::Event::*)(const cudaStream_t&);
[[maybe_unused]] static RawEventRecordMethod g_raw_event_record_method =
&c10::Event::record;
#endif

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
TEST(EventTest, CudaEventLazyCreateAndRecord) {
if (!at::cuda::is_available()) {
Expand Down Expand Up @@ -111,19 +93,6 @@ TEST(EventTest, CudaEventElapsedTimeWithTimingEnabled) {
EXPECT_GE(elapsed_ms, 0.0);
}

#ifdef PADDLE_WITH_CUDA
TEST(EventTest, CudaEventRawStreamRecordCompatibility) {
if (!at::cuda::is_available()) {
return;
}
auto stream = c10::cuda::getCurrentCUDAStream();
c10::Event event(c10::DeviceType::CUDA);
EXPECT_NO_THROW(event.record(stream.raw_stream()));
EXPECT_EQ(event.device_index(), stream.device_index());
EXPECT_TRUE(event.was_marked_for_recording());
}
#endif

TEST(EventTest, CudaEventRejectsDifferentDeviceRecord) {
if (c10::cuda::device_count() < 2) {
return;
Expand Down
Loading