Skip to content

Commit f70d90b

Browse files
Put arena cleanup nodes on a separate chunked list instead of at the ends of arena blocks.
The motivation is (a) better data locality during SerialArena::CleanupList and (b) simplification of arena layout. PiperOrigin-RevId: 631173641
1 parent 24f27c3 commit f70d90b

File tree

6 files changed

+253
-182
lines changed

6 files changed

+253
-182
lines changed

src/google/protobuf/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1185,6 +1185,7 @@ cc_test(
11851185
}),
11861186
deps = [
11871187
":arena",
1188+
":arena_cleanup",
11881189
":cc_test_protos",
11891190
":lite_test_util",
11901191
":port",

src/google/protobuf/arena.cc

Lines changed: 119 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -60,27 +60,43 @@ ArenaBlock* SentryArenaBlock() {
6060
}
6161
#endif
6262

63-
SizedPtr AllocateMemory(const AllocationPolicy* policy_ptr, size_t last_size,
64-
size_t min_bytes) {
63+
// Computes the byte size of the next allocation in a geometrically growing
// sequence.
//   last_size:  size of the previous allocation, or 0 if there was none.
//   start_size: size returned for the very first allocation.
//   max_size:   upper bound applied to the doubled size.
// Returns `start_size` when `last_size` is 0, otherwise
// min(2 * last_size, max_size). Note that `start_size` itself is returned
// as-is and is not clamped to `max_size`.
inline size_t AllocationSize(size_t last_size, size_t start_size,
64+
size_t max_size) {
65+
if (last_size == 0) return start_size;
66+
// Double the current block size, up to a limit.
67+
return std::min(2 * last_size, max_size);
68+
}
69+
70+
// Obtains `size` bytes of raw memory as directed by `policy`.
// If the policy has no custom `block_alloc` hook, the request is forwarded to
// AllocateAtLeast(size) and its result is returned unchanged
// (NOTE(review): presumably the reported size may exceed `size` -- confirm
// against AllocateAtLeast). Otherwise the hook is invoked and the returned
// SizedPtr reports exactly `size` bytes.
SizedPtr AllocateMemory(const AllocationPolicy& policy, size_t size) {
71+
if (policy.block_alloc == nullptr) {
72+
return AllocateAtLeast(size);
73+
}
74+
// The custom allocator only returns a pointer, so the requested size is
// recorded as the block size.
return {policy.block_alloc(size), size};
75+
}
76+
77+
SizedPtr AllocateBlock(const AllocationPolicy* policy_ptr, size_t last_size,
78+
size_t min_bytes) {
6579
AllocationPolicy policy; // default policy
6680
if (policy_ptr) policy = *policy_ptr;
67-
size_t size;
68-
if (last_size != 0) {
69-
// Double the current block size, up to a limit.
70-
auto max_size = policy.max_block_size;
71-
size = std::min(2 * last_size, max_size);
72-
} else {
73-
size = policy.start_block_size;
74-
}
81+
size_t size =
82+
AllocationSize(last_size, policy.start_block_size, policy.max_block_size);
7583
// Verify that min_bytes + kBlockHeaderSize won't overflow.
7684
ABSL_CHECK_LE(min_bytes, std::numeric_limits<size_t>::max() -
7785
SerialArena::kBlockHeaderSize);
7886
size = std::max(size, SerialArena::kBlockHeaderSize + min_bytes);
7987

80-
if (policy.block_alloc == nullptr) {
81-
return AllocateAtLeast(size);
82-
}
83-
return {policy.block_alloc(size), size};
88+
return AllocateMemory(policy, size);
89+
}
90+
91+
// Allocates memory for one chunk of the cleanup list.
//   policy_ptr: allocation policy to use; may be null, in which case
//               AllocateAtLeast is called directly.
//   last_size:  byte size of the previously allocated chunk, or 0 when this
//               is the first chunk.
// Chunk sizes start at kStartSize (64 bytes) and double with every call,
// capped at kMaxSize (4 KiB); see AllocationSize.
SizedPtr AllocateCleanupChunk(const AllocationPolicy* policy_ptr,
92+
size_t last_size) {
93+
constexpr size_t kStartSize = 64;
94+
constexpr size_t kMaxSize = 4 << 10;
95+
// The starting size must be a whole multiple of the node size.
static_assert(kStartSize % sizeof(cleanup::CleanupNode) == 0, "");
96+
97+
const size_t size = AllocationSize(last_size, kStartSize, kMaxSize);
98+
if (policy_ptr == nullptr) return AllocateAtLeast(size);
99+
return AllocateMemory(*policy_ptr, size);
84100
}
85101

86102
class GetDeallocator {
@@ -102,14 +118,95 @@ class GetDeallocator {
102118

103119
} // namespace
104120

121+
namespace cleanup {
122+
// Header of a single chunk in the cleanup list. The header is placed at the
// start of the chunk's allocation and the CleanupNode slots occupy the memory
// immediately after it.
struct ChunkList::Chunk {
123+
// Address of the first node slot, i.e. the byte right after this header.
CleanupNode* First() { return reinterpret_cast<CleanupNode*>(this + 1); }
124+
// Address of the last node slot that fits in this chunk (inclusive).
CleanupNode* Last() { return First() + Capacity() - 1; }
125+
// Number of whole CleanupNode slots that fit in an allocation of `size`
// bytes once the header is accounted for. `size` must be at least
// sizeof(Chunk); smaller values would wrap around in the unsigned
// subtraction.
static size_t Capacity(size_t size) {
126+
return (size - sizeof(Chunk)) / sizeof(CleanupNode);
127+
}
128+
// Number of node slots in this chunk.
size_t Capacity() const { return Capacity(size); }
129+
130+
// Next chunk in the list, or nullptr for the last one.
Chunk* next;
131+
// Total byte size of this chunk's allocation, header included.
size_t size;
132+
// Cleanup nodes follow.
133+
};
134+
135+
// Slow path for adding a cleanup node: expects the current chunk to be
// exhausted (or no chunk to exist yet), allocates a new chunk, makes it the
// head of the list, and then records the node in it.
//   elem:       object to be destroyed during cleanup.
//   destructor: function invoked on `elem` during cleanup.
//   arena:      supplies the allocation policy and is charged for the bytes
//               allocated for the new chunk.
void ChunkList::AddFallback(void* elem, void (*destructor)(void*),
136+
SerialArena& arena) {
137+
ABSL_DCHECK_EQ(next_, limit_);
138+
// Size the new chunk relative to the current head (0 means "first chunk").
SizedPtr mem = AllocateCleanupChunk(arena.parent_.AllocPolicy(),
139+
head_ == nullptr ? 0 : head_->size);
140+
arena.AddSpaceAllocated(mem.n);
141+
// Construct the chunk header in place and link it in front of the old head.
head_ = new (mem.p) Chunk{head_, mem.n};
142+
next_ = head_->First();
143+
prefetch_ptr_ = reinterpret_cast<char*>(next_);
144+
limit_ = next_ + Chunk::Capacity(mem.n);
145+
AddFromExisting(elem, destructor);
146+
}
147+
148+
// Runs the destructor of every registered node and releases every chunk.
// Nodes are visited from the most recently added to the oldest: the head
// chunk is walked backwards starting at `next_ - 1`, and each following chunk
// is walked backwards starting at its last slot. Each chunk is handed to the
// deallocator obtained from the arena's allocation policy once its nodes have
// been destroyed.
// This function does not modify head_, next_ or limit_, so on return they
// still refer to the released chunks.
void ChunkList::Cleanup(const SerialArena& arena) {
149+
Chunk* c = head_;
150+
if (c == nullptr) return;
151+
GetDeallocator deallocator(arena.parent_.AllocPolicy());
152+
153+
// Iterate backwards in order to destroy in the right order.
154+
CleanupNode* it = next_ - 1;
155+
while (true) {
156+
CleanupNode* first = c->First();
157+
// A prefetch distance of 8 here was chosen arbitrarily.
158+
constexpr int kPrefetchDistance = 8;
159+
// `prefetch` runs ahead of `it` (towards `first`) by up to
// kPrefetchDistance nodes.
CleanupNode* prefetch = it;
160+
// Prefetch the first kPrefetchDistance nodes.
161+
for (int i = 0; prefetch >= first && i < kPrefetchDistance;
162+
--prefetch, ++i) {
163+
prefetch->Prefetch();
164+
}
165+
// For the middle nodes, run destructor and prefetch the node
166+
// kPrefetchDistance after the current one.
167+
for (; prefetch >= first; --it, --prefetch) {
168+
it->Destroy();
169+
prefetch->Prefetch();
170+
}
171+
// Note: we could consider prefetching `next` chunk earlier.
172+
absl::PrefetchToLocalCacheNta(c->next);
173+
// Destroy the rest without prefetching.
174+
for (; it >= first; --it) {
175+
it->Destroy();
176+
}
177+
// Read the link before the chunk's memory is released.
Chunk* next = c->next;
178+
deallocator({c, c->size});
179+
if (next == nullptr) return;
180+
c = next;
181+
// Chunks other than the head are walked from their last slot.
it = c->Last();
182+
};
183+
}
184+
185+
// Returns the `elem` pointer of every registered cleanup node without running
// any destructor or freeing any chunk. The traversal mirrors Cleanup(): the
// head chunk is walked backwards from `next_ - 1`, then each following chunk
// backwards from its last slot. Returns an empty vector when the list has no
// chunks.
std::vector<void*> ChunkList::PeekForTesting() {
186+
std::vector<void*> ret;
187+
Chunk* c = head_;
188+
if (c == nullptr) return ret;
189+
// Iterate backwards to match destruction order.
190+
CleanupNode* it = next_ - 1;
191+
while (true) {
192+
CleanupNode* first = c->First();
193+
for (; it >= first; --it) {
194+
ret.push_back(it->elem);
195+
}
196+
c = c->next;
197+
if (c == nullptr) return ret;
198+
// Continue from the last slot of the next chunk.
it = c->Last();
199+
};
200+
}
201+
} // namespace cleanup
202+
105203
// It is guaranteed that this is constructed in `b`. IOW, this is not the first
106204
// arena and `b` cannot be sentry.
107205
SerialArena::SerialArena(ArenaBlock* b, ThreadSafeArena& parent)
108206
: ptr_{b->Pointer(kBlockHeaderSize + ThreadSafeArena::kSerialArenaSize)},
109207
limit_{b->Limit()},
110208
prefetch_ptr_(
111209
b->Pointer(kBlockHeaderSize + ThreadSafeArena::kSerialArenaSize)),
112-
prefetch_limit_(b->Limit()),
113210
head_{b},
114211
space_allocated_{b->size},
115212
parent_{parent} {
@@ -130,22 +227,7 @@ SerialArena::SerialArena(FirstSerialArena, ArenaBlock* b,
130227
}
131228

132229
std::vector<void*> SerialArena::PeekCleanupListForTesting() {
133-
std::vector<void*> res;
134-
135-
ArenaBlock* b = head();
136-
if (b->IsSentry()) return res;
137-
138-
const auto peek_list = [&](char* pos, char* end) {
139-
for (; pos != end; pos += cleanup::Size()) {
140-
cleanup::PeekNode(pos, res);
141-
}
142-
};
143-
144-
peek_list(limit_, b->Limit());
145-
for (b = b->next; b; b = b->next) {
146-
peek_list(reinterpret_cast<char*>(b->cleanup_nodes), b->Limit());
147-
}
148-
return res;
230+
return cleanup_list_.PeekForTesting();
149231
}
150232

151233
std::vector<void*> ThreadSafeArena::PeekCleanupListForTesting() {
@@ -223,25 +305,16 @@ void* SerialArena::AllocateFromStringBlockFallback() {
223305
PROTOBUF_NOINLINE
224306
void* SerialArena::AllocateAlignedWithCleanupFallback(
225307
size_t n, size_t align, void (*destructor)(void*)) {
226-
size_t required = AlignUpTo(n, align) + cleanup::Size();
308+
size_t required = AlignUpTo(n, align);
227309
AllocateNewBlock(required);
228310
return AllocateAlignedWithCleanup(n, align, destructor);
229311
}
230312

231-
PROTOBUF_NOINLINE
232-
void SerialArena::AddCleanupFallback(void* elem, void (*destructor)(void*)) {
233-
AllocateNewBlock(cleanup::Size());
234-
AddCleanupFromExisting(elem, destructor);
235-
}
236-
237313
void SerialArena::AllocateNewBlock(size_t n) {
238314
size_t used = 0;
239315
size_t wasted = 0;
240316
ArenaBlock* old_head = head();
241317
if (!old_head->IsSentry()) {
242-
// Sync limit to block
243-
old_head->cleanup_nodes = limit_;
244-
245318
// Record how much used in this block.
246319
used = static_cast<size_t>(ptr() - old_head->Pointer(kBlockHeaderSize));
247320
wasted = old_head->size - used - kBlockHeaderSize;
@@ -253,7 +326,7 @@ void SerialArena::AllocateNewBlock(size_t n) {
253326
// but with a CPU regression. The regression might have been an artifact of
254327
// the microbenchmark.
255328

256-
auto mem = AllocateMemory(parent_.AllocPolicy(), old_head->size, n);
329+
auto mem = AllocateBlock(parent_.AllocPolicy(), old_head->size, n);
257330
AddSpaceAllocated(mem.n);
258331
ThreadSafeArenaStats::RecordAllocateStats(parent_.arena_stats_.MutableStats(),
259332
/*used=*/used,
@@ -314,34 +387,6 @@ size_t SerialArena::FreeStringBlocks(StringBlock* string_block,
314387
return deallocated;
315388
}
316389

317-
void SerialArena::CleanupList() {
318-
ArenaBlock* b = head();
319-
if (b->IsSentry()) return;
320-
321-
b->cleanup_nodes = limit_;
322-
do {
323-
char* limit = b->Limit();
324-
char* it = reinterpret_cast<char*>(b->cleanup_nodes);
325-
ABSL_DCHECK(!b->IsSentry() || it == limit);
326-
// A prefetch distance of 8 here was chosen arbitrarily.
327-
char* prefetch = it;
328-
int prefetch_dist = 8;
329-
for (; prefetch < limit && --prefetch_dist; prefetch += cleanup::Size()) {
330-
cleanup::PrefetchNode(prefetch);
331-
}
332-
for (; prefetch < limit;
333-
it += cleanup::Size(), prefetch += cleanup::Size()) {
334-
cleanup::DestroyNode(it);
335-
cleanup::PrefetchNode(prefetch);
336-
}
337-
absl::PrefetchToLocalCacheNta(b->next);
338-
for (; it < limit; it += cleanup::Size()) {
339-
cleanup::DestroyNode(it);
340-
}
341-
b = b->next;
342-
} while (b);
343-
}
344-
345390
// Stores arrays of void* and SerialArena* instead of linked list of
346391
// SerialArena* to speed up traversing all SerialArena. The cost of walk is non
347392
// trivial when there are many nodes. Separately storing "ids" minimizes cache
@@ -550,7 +595,7 @@ ArenaBlock* ThreadSafeArena::FirstBlock(void* buf, size_t size,
550595

551596
SizedPtr mem;
552597
if (buf == nullptr || size < kBlockHeaderSize + kAllocPolicySize) {
553-
mem = AllocateMemory(&policy, 0, kAllocPolicySize);
598+
mem = AllocateBlock(&policy, 0, kAllocPolicySize);
554599
} else {
555600
mem = {buf, size};
556601
// Record user-owned block.
@@ -734,6 +779,8 @@ uint64_t ThreadSafeArena::Reset() {
734779
// Have to do this in a first pass, because some of the destructors might
735780
// refer to memory in other blocks.
736781
CleanupList();
782+
// Reset the first arena's cleanup list.
783+
first_arena_.cleanup_list_ = cleanup::ChunkList();
737784

738785
// Discard all blocks except the first one. Whether it is user-provided or
739786
// allocated, always reuse the first block for the first arena.
@@ -913,7 +960,7 @@ SerialArena* ThreadSafeArena::GetSerialArenaFallback(size_t n) {
913960
// have any blocks yet. So we'll allocate its first block now. It must be
914961
// big enough to host SerialArena and the pending request.
915962
serial = SerialArena::New(
916-
AllocateMemory(alloc_policy_.get(), 0, n + kSerialArenaSize), *this);
963+
AllocateBlock(alloc_policy_.get(), 0, n + kSerialArenaSize), *this);
917964

918965
AddSerialArena(id, serial);
919966
}

src/google/protobuf/arena_cleanup.h

Lines changed: 53 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,9 @@
2121
namespace google {
2222
namespace protobuf {
2323
namespace internal {
24+
25+
class SerialArena;
26+
2427
namespace cleanup {
2528

2629
// Helper function invoking the destructor of `object`
@@ -33,44 +36,61 @@ void arena_destruct_object(void* object) {
3336
// destroyed, and the function to destroy it (`destructor`)
3437
// elem must be aligned at minimum on a 4 byte boundary.
3538
struct CleanupNode {
39+
// Optimization: performs a prefetch on the elem for the cleanup node. We
40+
// explicitly use NTA prefetch here to avoid polluting remote caches: we are
41+
// destroying these instances, there is no purpose for these cache lines to
42+
// linger around in remote caches.
43+
ABSL_ATTRIBUTE_ALWAYS_INLINE void Prefetch() {
44+
// TODO: we should also prefetch the destructor code once
45+
// processors support code prefetching.
46+
absl::PrefetchToLocalCacheNta(elem);
47+
}
48+
49+
// Destroys the object referenced by the cleanup node.
50+
ABSL_ATTRIBUTE_ALWAYS_INLINE void Destroy() { destructor(elem); }
51+
3652
// Object to be destroyed; passed as the argument to `destructor`.
void* elem;
3753
// Function that Destroy() calls with `elem`.
void (*destructor)(void*);
3854
};
3955

40-
inline ABSL_ATTRIBUTE_ALWAYS_INLINE CleanupNode* ToCleanup(void* pos) {
41-
return reinterpret_cast<CleanupNode*>(pos);
42-
}
43-
44-
// Adds a cleanup entry at memory location `pos`.
45-
inline ABSL_ATTRIBUTE_ALWAYS_INLINE void CreateNode(void* pos, void* elem,
56+
// Manages the list of cleanup nodes in a chunked linked list. Chunks grow by
57+
// factors of two up to a limit. Trivially destructible, but Cleanup() must be
58+
// called before destruction.
59+
class ChunkList {
60+
public:
61+
PROTOBUF_ALWAYS_INLINE void Add(void* elem, void (*destructor)(void*),
62+
SerialArena& arena) {
63+
if (PROTOBUF_PREDICT_TRUE(next_ < limit_)) {
64+
AddFromExisting(elem, destructor);
65+
return;
66+
}
67+
AddFallback(elem, destructor, arena);
68+
}
69+
70+
// Runs all inserted cleanups and frees allocated chunks. Must be called
71+
// before destruction.
72+
void Cleanup(const SerialArena& arena);
73+
74+
private:
75+
struct Chunk;
76+
friend class internal::SerialArena;
77+
78+
void AddFallback(void* elem, void (*destructor)(void*), SerialArena& arena);
79+
ABSL_ATTRIBUTE_ALWAYS_INLINE void AddFromExisting(void* elem,
4680
void (*destructor)(void*)) {
47-
CleanupNode n = {elem, destructor};
48-
memcpy(pos, &n, sizeof(n));
49-
}
50-
51-
// Optimization: performs a prefetch on the elem for the cleanup node at `pos`.
52-
inline ABSL_ATTRIBUTE_ALWAYS_INLINE void PrefetchNode(void* pos) {
53-
// We explicitly use NTA prefetch here to avoid polluting remote caches: we
54-
// are destroying these instances, there is no purpose for these cache lines
55-
// to linger around in remote caches.
56-
absl::PrefetchToLocalCacheNta(ToCleanup(pos)->elem);
57-
}
58-
59-
// Destroys the object referenced by the cleanup node.
60-
inline ABSL_ATTRIBUTE_ALWAYS_INLINE void DestroyNode(void* pos) {
61-
CleanupNode* cleanup = ToCleanup(pos);
62-
cleanup->destructor(cleanup->elem);
63-
}
64-
65-
// Append in `out` the pointer to the to-be-cleaned object in `pos`.
66-
inline void PeekNode(void* pos, std::vector<void*>& out) {
67-
out.push_back(ToCleanup(pos)->elem);
68-
}
69-
70-
// Returns the required size for a cleanup node.
71-
constexpr ABSL_ATTRIBUTE_ALWAYS_INLINE size_t Size() {
72-
return sizeof(CleanupNode);
73-
}
81+
*next_++ = CleanupNode{elem, destructor};
82+
}
83+
84+
// Returns the pointers to the to-be-cleaned objects.
85+
std::vector<void*> PeekForTesting();
86+
87+
Chunk* head_ = nullptr;
88+
CleanupNode* next_ = nullptr;
89+
CleanupNode* limit_ = nullptr;
90+
// Current prefetch position. Data from `next_` up to but not including
91+
// `prefetch_ptr_` is software prefetched. Used in SerialArena prefetching.
92+
const char* prefetch_ptr_ = nullptr;
93+
};
7494

7595
} // namespace cleanup
7696
} // namespace internal

0 commit comments

Comments
 (0)