Skip to content

Commit 72a48f9

Browse files
Add BTI to branch targets when branch protection is enabled. This resolves
#23306. Since there's a measurable performance cost (~12% for a long varint-heavy message) on little and mid-size cores, these instructions aren't unconditionally emitted even though they're compatible (CPUs lacking FEAT_BTI interpret them as HINT/NOP). Even with these extra instructions, the assembly path is still much faster than the generic one. PiperOrigin-RevId: 834093313
1 parent 13ee2ce commit 72a48f9

File tree

3 files changed

+70
-4
lines changed

3 files changed

+70
-4
lines changed

upb/port/def.inc

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -398,6 +398,21 @@ Error, UINTPTR_MAX is undefined
398398
#define UPB_ARM64_ASM 0
399399
#endif
400400

401+
/* When compiling with branch protection, we need to ensure that all branch
402+
* targets in assembly use the appropriate landing pad instruction. These
403+
* instructions are backwards compatible with processors that don't have
404+
* FEAT_BTI and are treated as nops.
405+
*/
406+
#if UPB_ARM64_ASM && defined(__ARM_FEATURE_BTI_DEFAULT)
407+
#if __ARM_FEATURE_BTI_DEFAULT == 1
408+
#define UPB_ARM64_BTI_DEFAULT 1
409+
#else
410+
#define UPB_ARM64_BTI_DEFAULT 0
411+
#endif
412+
#else
413+
#define UPB_ARM64_BTI_DEFAULT 0
414+
#endif
415+
401416
/* This check is not fully robust: it does not require that we have "musttail"
402417
* support available. We need tail calls to avoid consuming arbitrary amounts
403418
* of stack space.

upb/port/undef.inc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,3 +83,4 @@
8383
#undef UPB_XSAN_STRUCT_SIZE
8484
#undef UPB_ENABLE_REF_CYCLE_CHECKS
8585
#undef UPB_ARM64_ASM
86+
#undef UPB_ARM64_BTI_DEFAULT

upb/wire/encode.c

Lines changed: 54 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,24 @@ static char* encode_fixed32(char* ptr, upb_encstate* e, uint32_t val) {
142142
#define UPB_PB_VARINT_MAX_LEN 10
143143

144144
#if UPB_ARM64_ASM
145+
// Each arm64 instruction encodes to 4 bytes, and it takes two instructions
146+
// to process each byte of output, so we branch ahead by (4 + 4) * skip to
147+
// avoid the remaining bytes. When BTI is on, we need to use specific
148+
// "landing pad" instructions, so we pad those with nop to make it a power
149+
// of 2, skipping 16 bytes at each stage instead of 8. This carries some
150+
// overhead, especially on in-order cores, so they're not included unless
151+
// building with branch protection.
152+
#if UPB_ARM64_BTI_DEFAULT
153+
// BTI is used with jc targets here because we don't control which register will
154+
// be used for addr; if it's x16 or x17 a `br` is treated like a call.
155+
#define UPB_BTI_JC "bti jc\n"
156+
#define UPB_BTI_NOP "nop\n"
157+
#define UPB_BTI_SHIFT_IMM "4\n"
158+
#else
159+
#define UPB_BTI_JC
160+
#define UPB_BTI_NOP
161+
#define UPB_BTI_SHIFT_IMM "3\n"
162+
#endif
145163
UPB_NOINLINE static char* encode_longvarint(char* ptr, upb_encstate* e,
146164
uint64_t val) {
147165
ptr = encode_reserve(ptr, e, UPB_PB_VARINT_MAX_LEN);
@@ -154,37 +172,66 @@ UPB_NOINLINE static char* encode_longvarint(char* ptr, upb_encstate* e,
154172
ptr += skip;
155173
uint64_t addr, mask;
156174
__asm__ volatile(
175+
// Formatter keeps merging short lines
176+
// clang-format off
157177
"adr %[addr], 0f\n"
158-
// Each arm64 instruction encodes to 4 bytes, and it takes two
159-
// intructions to process each byte of output, so we branch ahead by
160-
// (4 + 4) * skip to avoid the remaining bytes.
161-
"add %[addr], %[addr], %[cnt], lsl #3\n"
178+
"add %[addr], %[addr], %[cnt], lsl #" UPB_BTI_SHIFT_IMM
162179
"mov %w[mask], #0x80\n"
163180
"br %[addr]\n"
181+
".p2align " UPB_BTI_SHIFT_IMM
164182
"0:\n"
165183
// We don't need addr any more, but we've got the register for our whole
166184
// assembly block so we'll use it as scratch to store the shift+masked
167185
// values before storing them.
168186
// The following stores are unsigned offset stores:
169187
// strb Wt, [Xn, #imm]
188+
UPB_BTI_JC
170189
"orr %[addr], %[mask], %[val], lsr #56\n"
171190
"strb %w[addr], [%[ptr], #8]\n"
191+
UPB_BTI_NOP
192+
193+
UPB_BTI_JC
172194
"orr %[addr], %[mask], %[val], lsr #49\n"
173195
"strb %w[addr], [%[ptr], #7]\n"
196+
UPB_BTI_NOP
197+
198+
UPB_BTI_JC
174199
"orr %[addr], %[mask], %[val], lsr #42\n"
175200
"strb %w[addr], [%[ptr], #6]\n"
201+
UPB_BTI_NOP
202+
203+
UPB_BTI_JC
176204
"orr %[addr], %[mask], %[val], lsr #35\n"
177205
"strb %w[addr], [%[ptr], #5]\n"
206+
UPB_BTI_NOP
207+
208+
UPB_BTI_JC
178209
"orr %[addr], %[mask], %[val], lsr #28\n"
179210
"strb %w[addr], [%[ptr], #4]\n"
211+
UPB_BTI_NOP
212+
213+
UPB_BTI_JC
180214
"orr %w[addr], %w[mask], %w[val], lsr #21\n"
181215
"strb %w[addr], [%[ptr], #3]\n"
216+
UPB_BTI_NOP
217+
218+
UPB_BTI_JC
182219
"orr %w[addr], %w[mask], %w[val], lsr #14\n"
183220
"strb %w[addr], [%[ptr], #2]\n"
221+
UPB_BTI_NOP
222+
223+
UPB_BTI_JC
184224
"orr %w[addr], %w[mask], %w[val], lsr #7\n"
185225
"strb %w[addr], [%[ptr], #1]\n"
226+
UPB_BTI_NOP
227+
228+
UPB_BTI_JC
186229
"orr %w[addr], %w[val], #0x80\n"
187230
"strb %w[addr], [%[ptr]]\n"
231+
UPB_BTI_NOP
232+
233+
UPB_BTI_JC
234+
// clang-format on
188235
: [addr] "=&r"(addr), [mask] "=&r"(mask)
189236
: [val] "r"(val), [ptr] "r"(ptr), [cnt] "r"((uint64_t)skip)
190237
: "memory");
@@ -193,6 +240,9 @@ UPB_NOINLINE static char* encode_longvarint(char* ptr, upb_encstate* e,
193240
ptr[continuations] = val >> (7 * continuations);
194241
return ptr;
195242
}
243+
#undef UPB_BTI_JC
244+
#undef UPB_BTI_NOP
245+
#undef UPB_BTI_SHIFT_IMM
196246
#else
197247
UPB_NOINLINE
198248
static char* encode_longvarint(char* ptr, upb_encstate* e, uint64_t val) {

0 commit comments

Comments
 (0)