diff --git a/libc/src/__support/GPU/allocator.cpp b/libc/src/__support/GPU/allocator.cpp
index ecc0de1cb6ec3..66ab155e5c299 100644
--- a/libc/src/__support/GPU/allocator.cpp
+++ b/libc/src/__support/GPU/allocator.cpp
@@ -129,6 +129,14 @@ static inline constexpr T round_up(const T x) {
   return (x + N) & ~(N - 1);
 }
 
+// Perform a lane parallel memset on a uint32_t pointer.
+void uniform_memset(uint32_t *s, uint32_t c, uint32_t n, uint64_t uniform) {
+  uint64_t mask = gpu::get_lane_mask();
+  uint32_t workers = cpp::popcount(uniform);
+  for (uint32_t i = impl::lane_count(mask & uniform); i < n; i += workers)
+    s[i] = c;
+}
+
 } // namespace impl
 
 /// A slab allocator used to hand out identically sized slabs of memory.
@@ -157,10 +165,15 @@ struct Slab {
     Header *header = reinterpret_cast<Header *>(memory);
     header->chunk_size = chunk_size;
     header->global_index = global_index;
+  }
 
-    // This memset is expensive and likely not necessary for the current 'kfd'
-    // driver. Until zeroed pages are exposed by the API we must be careful.
-    __builtin_memset(get_bitfield(), 0, bitfield_bytes(chunk_size));
+  // Set the necessary bitfield bytes to zero in parallel using many lanes. This
+  // must be called before the bitfield can be accessed safely, memory is not
+  // guaranteed to be zero initialized in the current implementation.
+  void initialize(uint64_t uniform) {
+    uint32_t size = (bitfield_bytes(get_chunk_size()) + sizeof(uint32_t) - 1) /
+                    sizeof(uint32_t);
+    impl::uniform_memset(get_bitfield(), 0, size, uniform);
   }
 
   // Get the number of chunks that can theoretically fit inside this slab.
@@ -354,14 +367,7 @@ struct GuardPtr {
       void *raw = impl::rpc_allocate(sizeof(Slab));
       if (!raw)
         return nullptr;
-      Slab *mem = new (raw) Slab(cpp::forward<Args>(args)...);
-
-      cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
-      ptr.store(mem, cpp::MemoryOrder::RELAXED);
-      cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
-      if (!ref.acquire(n, count))
-        ref.reset(n, count);
-      return mem;
+      return new (raw) Slab(cpp::forward<Args>(args)...);
     }
 
     if (!expected || expected == reinterpret_cast<Slab *>(SENTINEL))
@@ -374,6 +380,16 @@ struct GuardPtr {
     return ptr.load(cpp::MemoryOrder::RELAXED);
   }
 
+  // Finalize the associated memory and signal that it is ready to use by
+  // resetting the counter.
+  void finalize(Slab *mem, uint32_t n, uint64_t &count) {
+    cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
+    ptr.store(mem, cpp::MemoryOrder::RELAXED);
+    cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
+    if (!ref.acquire(n, count))
+      ref.reset(n, count);
+  }
+
 public:
   // Attempt to lock access to the pointer, potentially creating it if empty.
   // The uniform mask represents which lanes share the same pointer. For each
@@ -392,6 +408,14 @@ struct GuardPtr {
     if (!result)
      return nullptr;
 
+    // We defer storing the newly allocated slab until now so that we can use
+    // multiple lanes to initialize it and release it for use.
+    if (count == cpp::numeric_limits<uint64_t>::max()) {
+      result->initialize(uniform);
+      if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(uniform)))
+        finalize(result, cpp::popcount(uniform), count);
+    }
+
     if (count != cpp::numeric_limits<uint64_t>::max())
       count = count - cpp::popcount(uniform) + impl::lane_count(uniform) + 1;
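
For readers less familiar with the wave-level idiom above, the following is a standalone CUDA sketch (illustrative only, not part of the patch) of the same lane-parallel memset idea: every active lane in the warp clears a strided subset of the words, so the work is divided across the lanes instead of being serialized on one thread. The names warp_memset and zero_buffer are hypothetical; the patch itself uses the libc-internal helpers gpu::get_lane_mask, impl::lane_count, and cpp::popcount.

// Standalone CUDA sketch (illustrative only, not part of the patch) of the
// lane-parallel memset pattern used by uniform_memset(): each active lane
// writes a strided subset of the words, so a full 32-lane warp clears the
// buffer roughly 32x faster than a single serialized thread.
#include <cstdint>
#include <cuda_runtime.h>

__device__ void warp_memset(uint32_t *s, uint32_t c, uint32_t n,
                            uint32_t active_mask) {
  // Number of lanes participating in the memset (the stride of the loop).
  uint32_t workers = __popc(active_mask);
  uint32_t lane = threadIdx.x % warpSize;
  // This lane's rank among the active lanes, i.e. its unique starting offset.
  uint32_t rank = __popc(active_mask & ((1u << lane) - 1u));
  for (uint32_t i = rank; i < n; i += workers)
    s[i] = c;
}

__global__ void zero_buffer(uint32_t *buf, uint32_t n) {
  // All lanes that reached this point cooperate on the same buffer.
  warp_memset(buf, 0, n, __activemask());
}

int main() {
  const uint32_t n = 1024;
  uint32_t *buf;
  cudaMalloc(&buf, n * sizeof(uint32_t));
  zero_buffer<<<1, 32>>>(buf, n);
  cudaDeviceSynchronize();
  cudaFree(buf);
  return 0;
}

In the patch, impl::lane_count(mask & uniform) appears to play the role of rank here: it gives each lane sharing the slab a distinct starting offset into the bitfield, while cpp::popcount(uniform) supplies the stride, so the lanes cover the whole bitfield without overlapping writes.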