Skip to content

Commit 087b6d1

Browse files
authored
Particles: CUDA Array Interface (#86)
* Particles: CUDA Array Interface — Add the `__cuda_array_interface__` to particles.
* Particles: Allocator Support
* Allocators: Container, Tile, SoA
* Safe Compile Time: Available Arenas
* ParticleContainer: Simplify — Remove one unnecessary & untested combination.
1 parent e37a4e8 commit 087b6d1

12 files changed

+355
-136
lines changed

src/Base/Array4.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,7 @@ void make_Array4(py::module &m, std::string typestr)
159159
// __array_function__ feature requires NumPy 1.16 or later.
160160

161161

162-
// Nvidia GPUs: __cuda_array_interface__ v2
162+
// Nvidia GPUs: __cuda_array_interface__ v3
163163
// https://numba.readthedocs.io/en/latest/cuda/cuda_array_interface.html
164164
.def_property_readonly("__cuda_array_interface__", [](Array4<T> const & a4) {
165165
auto d = array_interface(a4);

src/Base/PODVector.cpp

Lines changed: 59 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,33 @@
1616
namespace py = pybind11;
1717
using namespace amrex;
1818

19+
namespace
20+
{
21+
/** CPU: __array_interface__ v3
22+
*
23+
* https://numpy.org/doc/stable/reference/arrays.interface.html
24+
*/
25+
template <class T, class Allocator = std::allocator<T> >
26+
py::dict
27+
array_interface(PODVector<T, Allocator> const & podvector)
28+
{
29+
auto d = py::dict();
30+
bool const read_only = false;
31+
d["data"] = py::make_tuple(std::intptr_t(podvector.dataPtr()), read_only);
32+
d["shape"] = py::make_tuple(podvector.size());
33+
d["strides"] = py::none();
34+
d["typestr"] = py::format_descriptor<T>::format();
35+
d["version"] = 3;
36+
return d;
37+
}
38+
}
39+
1940
template <class T, class Allocator = std::allocator<T> >
20-
void make_PODVector(py::module &m, std::string typestr)
41+
void make_PODVector(py::module &m, std::string typestr, std::string allocstr)
2142
{
22-
using PODVector_type=PODVector<T, Allocator>;
23-
auto const podv_name = std::string("PODVector_").append(typestr);
43+
using PODVector_type = PODVector<T, Allocator>;
44+
auto const podv_name = std::string("PODVector_").append(typestr)
45+
.append("_").append(allocstr);
2446

2547
py::class_<PODVector_type>(m, podv_name.c_str())
2648
.def("__repr__",
@@ -60,12 +82,26 @@ void make_PODVector(py::module &m, std::string typestr)
6082
// swap
6183

6284
.def_property_readonly("__array_interface__", [](PODVector_type const & podvector) {
63-
auto d = py::dict();
64-
bool const read_only = false;
65-
d["data"] = py::make_tuple(std::intptr_t(podvector.dataPtr()), read_only);
66-
d["shape"] = py::make_tuple(podvector.size());
67-
d["strides"] = py::none();
68-
d["typestr"] = py::format_descriptor<T>::format();
85+
return array_interface(podvector);
86+
})
87+
.def_property_readonly("__cuda_array_interface__", [](PODVector_type const & podvector) {
88+
// Nvidia GPUs: __cuda_array_interface__ v3
89+
// https://numba.readthedocs.io/en/latest/cuda/cuda_array_interface.html
90+
auto d = array_interface(podvector);
91+
92+
// data:
93+
// Because the user of the interface may or may not be in the same context, the most common case is to use cuPointerGetAttribute with CU_POINTER_ATTRIBUTE_DEVICE_POINTER in the CUDA driver API (or the equivalent CUDA Runtime API) to retrieve a device pointer that is usable in the currently active context.
94+
// TODO For zero-size arrays, use 0 here.
95+
96+
// None or integer
97+
// An optional stream upon which synchronization must take place at the point of consumption, either by synchronizing on the stream or enqueuing operations on the data on the given stream. Integer values in this entry are as follows:
98+
// 0: This is disallowed as it would be ambiguous between None and the default stream, and also between the legacy and per-thread default streams. Any use case where 0 might be given should either use None, 1, or 2 instead for clarity.
99+
// 1: The legacy default stream.
100+
// 2: The per-thread default stream.
101+
// Any other integer: a cudaStream_t represented as a Python integer.
102+
// When None, no synchronization is required.
103+
d["stream"] = py::none();
104+
69105
d["version"] = 3;
70106
return d;
71107
})
@@ -75,6 +111,20 @@ void make_PODVector(py::module &m, std::string typestr)
75111
;
76112
}
77113

114+
template <class T>
115+
void make_PODVector(py::module &m, std::string typestr)
116+
{
117+
// see Src/Base/AMReX_GpuContainers.H
118+
make_PODVector<T, std::allocator<T>> (m, typestr, "std");
119+
make_PODVector<T, amrex::ArenaAllocator<T>> (m, typestr, "arena");
120+
make_PODVector<T, amrex::PinnedArenaAllocator<T>> (m, typestr, "pinned");
121+
#ifdef AMREX_USE_GPU
122+
make_PODVector<T, amrex::DeviceArenaAllocator<T>> (m, typestr, "device");
123+
make_PODVector<T, amrex::ManagedArenaAllocator<T>> (m, typestr, "managed");
124+
make_PODVector<T, amrex::AsyncArenaAllocator<T>> (m, typestr, "async");
125+
#endif
126+
}
127+
78128
void init_PODVector(py::module& m) {
79129
make_PODVector<ParticleReal> (m, "real");
80130
make_PODVector<int> (m, "int");

src/Base/Vector.cpp

Lines changed: 42 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,31 @@
2020
namespace py = pybind11;
2121
using namespace amrex;
2222

23+
namespace
24+
{
25+
/** CPU: __array_interface__ v3
26+
*
27+
* https://numpy.org/doc/stable/reference/arrays.interface.html
28+
*/
29+
template <class T, class Allocator = std::allocator<T> >
30+
py::dict
31+
array_interface(Vector<T, Allocator> const & vector)
32+
{
33+
auto d = py::dict();
34+
bool const read_only = false;
35+
d["data"] = py::make_tuple(std::intptr_t(vector.dataPtr()), read_only);
36+
d["shape"] = py::make_tuple(vector.size());
37+
d["strides"] = py::none();
38+
d["typestr"] = py::format_descriptor<T>::format();
39+
d["version"] = 3;
40+
return d;
41+
}
42+
}
2343

2444
template <class T, class Allocator = std::allocator<T> >
2545
void make_Vector(py::module &m, std::string typestr)
2646
{
27-
using Vector_type=Vector<T, Allocator>;
47+
using Vector_type = Vector<T, Allocator>;
2848
auto const v_name = std::string("Vector_").append(typestr);
2949

3050
py::class_<Vector_type>(m, v_name.c_str())
@@ -47,15 +67,30 @@ void make_Vector(py::module &m, std::string typestr)
4767
.def("size", &Vector_type::size)
4868

4969
.def_property_readonly("__array_interface__", [](Vector_type const & vector) {
50-
auto d = py::dict();
51-
bool const read_only = false;
52-
d["data"] = py::make_tuple(std::intptr_t(vector.dataPtr()), read_only);
53-
d["shape"] = py::make_tuple(vector.size());
54-
d["strides"] = py::none();
55-
d["typestr"] = py::format_descriptor<T>::format();
70+
return array_interface(vector);
71+
})
72+
.def_property_readonly("__cuda_array_interface__", [](Vector_type const & vector) {
73+
// Nvidia GPUs: __cuda_array_interface__ v3
74+
// https://numba.readthedocs.io/en/latest/cuda/cuda_array_interface.html
75+
auto d = array_interface(vector);
76+
77+
// data:
78+
// Because the user of the interface may or may not be in the same context, the most common case is to use cuPointerGetAttribute with CU_POINTER_ATTRIBUTE_DEVICE_POINTER in the CUDA driver API (or the equivalent CUDA Runtime API) to retrieve a device pointer that is usable in the currently active context.
79+
// TODO For zero-size arrays, use 0 here.
80+
81+
// None or integer
82+
// An optional stream upon which synchronization must take place at the point of consumption, either by synchronizing on the stream or enqueuing operations on the data on the given stream. Integer values in this entry are as follows:
83+
// 0: This is disallowed as it would be ambiguous between None and the default stream, and also between the legacy and per-thread default streams. Any use case where 0 might be given should either use None, 1, or 2 instead for clarity.
84+
// 1: The legacy default stream.
85+
// 2: The per-thread default stream.
86+
// Any other integer: a cudaStream_t represented as a Python integer.
87+
// When None, no synchronization is required.
88+
d["stream"] = py::none();
89+
5690
d["version"] = 3;
5791
return d;
5892
})
93+
5994
// setter & getter
6095
.def("__setitem__", [](Vector_type & vector, int const idx, T const value){ vector[idx] = value; })
6196
.def("__getitem__", [](Vector_type & v, int const idx){ return v[idx]; })

src/Particle/ArrayOfStructs.cpp

Lines changed: 88 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -14,16 +14,63 @@
1414
namespace py = pybind11;
1515
using namespace amrex;
1616

17+
namespace
18+
{
19+
/** CPU: __array_interface__ v3
20+
*
21+
* https://numpy.org/doc/stable/reference/arrays.interface.html
22+
*/
23+
template <int NReal, int NInt,
24+
template<class> class Allocator=DefaultAllocator>
25+
py::dict
26+
array_interface(ArrayOfStructs<NReal, NInt, Allocator> const & aos)
27+
{
28+
using ParticleType = Particle<NReal, NInt>;
29+
using RealType = typename ParticleType::RealType;
30+
31+
auto d = py::dict();
32+
bool const read_only = false;
33+
d["data"] = py::make_tuple(std::intptr_t(aos.dataPtr()), read_only);
34+
d["shape"] = py::make_tuple(aos.size());
35+
d["strides"] = py::make_tuple(sizeof(ParticleType));
36+
d["typestr"] = "|V" + std::to_string(sizeof(ParticleType));
37+
py::list descr;
38+
descr.append(py::make_tuple("x", py::format_descriptor<RealType>::format()));
39+
#if (AMREX_SPACEDIM >= 2)
40+
descr.append(py::make_tuple("y", py::format_descriptor<RealType>::format()));
41+
#endif
42+
#if (AMREX_SPACEDIM >= 3)
43+
descr.append(py::make_tuple("z", py::format_descriptor<RealType>::format()));
44+
#endif
45+
if (NReal > 0) {
46+
for(int ii=0; ii < NReal; ii++) {
47+
descr.append(py::make_tuple("rdata_"+std::to_string(ii),py::format_descriptor<RealType>::format()));
48+
}
49+
}
50+
descr.append(py::make_tuple("cpuid", py::format_descriptor<uint64_t>::format()) );
51+
if (NInt > 0) {
52+
for(int ii=0; ii < NInt; ++ii) {
53+
descr.append(py::make_tuple("idata_"+std::to_string(ii),py::format_descriptor<int>::format()));
54+
}
55+
}
56+
57+
d["descr"] = descr;
58+
d["version"] = 3;
59+
return d;
60+
}
61+
}
1762

1863
template <int NReal, int NInt,
1964
template<class> class Allocator=DefaultAllocator>
20-
void make_ArrayOfStructs(py::module &m)
65+
void make_ArrayOfStructs(py::module &m, std::string allocstr)
2166
{
22-
using AOSType = ArrayOfStructs<NReal, NInt>;
67+
using AOSType = ArrayOfStructs<NReal, NInt, Allocator>;
2368
using ParticleType = Particle<NReal, NInt>;
24-
using RealType = typename ParticleType::RealType;
2569

26-
auto const aos_name = std::string("ArrayOfStructs_").append(std::to_string(NReal) + "_" + std::to_string(NInt));
70+
auto const aos_name = std::string("ArrayOfStructs_")
71+
.append(std::to_string(NReal)).append("_")
72+
.append(std::to_string(NInt)).append("_")
73+
.append(allocstr);
2774
py::class_<AOSType>(m, aos_name.c_str())
2875
.def(py::init())
2976
// TODO:
@@ -41,35 +88,29 @@ void make_ArrayOfStructs(py::module &m)
4188
.def("push_back", &AOSType::push_back)
4289
.def("pop_back", &AOSType::pop_back)
4390
.def("back", py::overload_cast<>(&AOSType::back),"get back member. Problem!!!!! this is perfo")
91+
4492
// setter & getter
4593
.def_property_readonly("__array_interface__", [](AOSType const & aos) {
46-
auto d = py::dict();
47-
bool const read_only = false;
48-
d["data"] = py::make_tuple(std::intptr_t(aos.dataPtr()), read_only);
49-
d["shape"] = py::make_tuple(aos.size());
50-
d["strides"] = py::make_tuple(sizeof(ParticleType));
51-
d["typestr"] = "|V" + std::to_string(sizeof(ParticleType));
52-
py::list descr;
53-
descr.append(py::make_tuple("x", py::format_descriptor<RealType>::format()));
54-
#if (AMREX_SPACEDIM >= 2)
55-
descr.append(py::make_tuple("y", py::format_descriptor<RealType>::format()));
56-
#endif
57-
#if (AMREX_SPACEDIM >= 3)
58-
descr.append(py::make_tuple("z", py::format_descriptor<RealType>::format()));
59-
#endif
60-
if (NReal > 0) {
61-
for(int ii=0; ii < NReal; ii++) {
62-
descr.append(py::make_tuple("rdata_"+std::to_string(ii),py::format_descriptor<RealType>::format()));
63-
}
64-
}
65-
descr.append(py::make_tuple("cpuid", py::format_descriptor<uint64_t>::format()) );
66-
if (NInt > 0) {
67-
for(int ii=0; ii < NInt; ++ii) {
68-
descr.append(py::make_tuple("idata_"+std::to_string(ii),py::format_descriptor<int>::format()));
69-
}
70-
}
94+
return array_interface(aos);
95+
})
96+
.def_property_readonly("__cuda_array_interface__", [](AOSType const & aos) {
97+
// Nvidia GPUs: __cuda_array_interface__ v3
98+
// https://numba.readthedocs.io/en/latest/cuda/cuda_array_interface.html
99+
auto d = array_interface(aos);
100+
101+
// data:
102+
// Because the user of the interface may or may not be in the same context, the most common case is to use cuPointerGetAttribute with CU_POINTER_ATTRIBUTE_DEVICE_POINTER in the CUDA driver API (or the equivalent CUDA Runtime API) to retrieve a device pointer that is usable in the currently active context.
103+
// TODO For zero-size arrays, use 0 here.
104+
105+
// None or integer
106+
// An optional stream upon which synchronization must take place at the point of consumption, either by synchronizing on the stream or enqueuing operations on the data on the given stream. Integer values in this entry are as follows:
107+
// 0: This is disallowed as it would be ambiguous between None and the default stream, and also between the legacy and per-thread default streams. Any use case where 0 might be given should either use None, 1, or 2 instead for clarity.
108+
// 1: The legacy default stream.
109+
// 2: The per-thread default stream.
110+
// Any other integer: a cudaStream_t represented as a Python integer.
111+
// When None, no synchronization is required.
112+
d["stream"] = py::none();
71113

72-
d["descr"] = descr;
73114
d["version"] = 3;
74115
return d;
75116
})
@@ -79,9 +120,22 @@ void make_ArrayOfStructs(py::module &m)
79120
;
80121
}
81122

123+
template <int NReal, int NInt>
124+
void make_ArrayOfStructs(py::module &m)
125+
{
126+
// see Src/Base/AMReX_GpuContainers.H
127+
make_ArrayOfStructs<NReal, NInt, std::allocator> (m, "std");
128+
make_ArrayOfStructs<NReal, NInt, amrex::ArenaAllocator> (m, "arena");
129+
make_ArrayOfStructs<NReal, NInt, amrex::PinnedArenaAllocator> (m, "pinned");
130+
#ifdef AMREX_USE_GPU
131+
make_ArrayOfStructs<NReal, NInt, amrex::DeviceArenaAllocator> (m, "device");
132+
make_ArrayOfStructs<NReal, NInt, amrex::ManagedArenaAllocator> (m, "managed");
133+
make_ArrayOfStructs<NReal, NInt, amrex::AsyncArenaAllocator> (m, "async");
134+
#endif
135+
}
136+
82137
void init_ArrayOfStructs(py::module& m) {
83-
make_ArrayOfStructs< 0, 0> (m);
84-
make_ArrayOfStructs< 7, 0> (m);
85-
make_ArrayOfStructs< 1, 1> (m);
86-
make_ArrayOfStructs< 2, 1> (m);
138+
make_ArrayOfStructs<0, 0> (m); // WarpX 22.07, ImpactX 22.07, HiPACE++ 22.07
139+
make_ArrayOfStructs<1, 1> (m); // test in ParticleContainer
140+
make_ArrayOfStructs<2, 1> (m); // test
87141
}

0 commit comments

Comments
 (0)