@@ -142,6 +142,24 @@ static char* encode_fixed32(char* ptr, upb_encstate* e, uint32_t val) {
 #define UPB_PB_VARINT_MAX_LEN 10
 
 #if UPB_ARM64_ASM
+// Each arm64 instruction encodes to 4 bytes, and it takes two instructions
+// to process each byte of output, so we branch ahead by (4 + 4) * skip to
+// jump over the stages we don't need. When BTI is on, each branch target
+// needs a specific "landing pad" instruction, so we pad each stage with a
+// nop to keep its size a power of 2, skipping 16 bytes per stage instead
+// of 8. This carries some overhead, especially on in-order cores, so the
+// landing pads are only emitted when building with branch protection.
+#if UPB_ARM64_BTI_DEFAULT
+// `bti jc` is used here because we don't control which register the
+// compiler allocates for addr; if it happens to be x16 or x17, the `br`
+// is treated like a call and needs a call-compatible landing pad.
+#define UPB_BTI_JC "bti jc\n"
+#define UPB_BTI_NOP "nop\n"
+#define UPB_BTI_SHIFT_IMM "4\n"
+#else
+#define UPB_BTI_JC
+#define UPB_BTI_NOP
+#define UPB_BTI_SHIFT_IMM "3\n"
+#endif
 UPB_NOINLINE static char* encode_longvarint(char* ptr, upb_encstate* e,
                                             uint64_t val) {
   ptr = encode_reserve(ptr, e, UPB_PB_VARINT_MAX_LEN);
@@ -154,37 +172,66 @@ UPB_NOINLINE static char* encode_longvarint(char* ptr, upb_encstate* e,
   ptr += skip;
   uint64_t addr, mask;
   __asm__ volatile(
+      // Formatter keeps merging short lines
+      // clang-format off
       "adr %[addr], 0f\n"
-      // Each arm64 instruction encodes to 4 bytes, and it takes two
-      // intructions to process each byte of output, so we branch ahead by
-      // (4 + 4) * skip to avoid the remaining bytes.
-      "add %[addr], %[addr], %[cnt], lsl #3\n"
+      "add %[addr], %[addr], %[cnt], lsl #" UPB_BTI_SHIFT_IMM
       "mov %w[mask], #0x80\n"
       "br %[addr]\n"
+      ".p2align " UPB_BTI_SHIFT_IMM
       "0:\n"
       // We don't need addr any more, but we've got the register for our whole
       // assembly block so we'll use it as scratch to store the shift+masked
       // values before storing them.
       // The following stores are unsigned offset stores:
       // strb Wt, [Xn, #imm]
+      UPB_BTI_JC
       "orr %[addr], %[mask], %[val], lsr #56\n"
       "strb %w[addr], [%[ptr], #8]\n"
+      UPB_BTI_NOP
+
+      UPB_BTI_JC
       "orr %[addr], %[mask], %[val], lsr #49\n"
       "strb %w[addr], [%[ptr], #7]\n"
+      UPB_BTI_NOP
+
+      UPB_BTI_JC
       "orr %[addr], %[mask], %[val], lsr #42\n"
       "strb %w[addr], [%[ptr], #6]\n"
+      UPB_BTI_NOP
+
+      UPB_BTI_JC
       "orr %[addr], %[mask], %[val], lsr #35\n"
       "strb %w[addr], [%[ptr], #5]\n"
+      UPB_BTI_NOP
+
+      UPB_BTI_JC
       "orr %[addr], %[mask], %[val], lsr #28\n"
       "strb %w[addr], [%[ptr], #4]\n"
+      UPB_BTI_NOP
+
+      UPB_BTI_JC
       "orr %w[addr], %w[mask], %w[val], lsr #21\n"
       "strb %w[addr], [%[ptr], #3]\n"
+      UPB_BTI_NOP
+
+      UPB_BTI_JC
       "orr %w[addr], %w[mask], %w[val], lsr #14\n"
       "strb %w[addr], [%[ptr], #2]\n"
+      UPB_BTI_NOP
+
+      UPB_BTI_JC
       "orr %w[addr], %w[mask], %w[val], lsr #7\n"
       "strb %w[addr], [%[ptr], #1]\n"
+      UPB_BTI_NOP
+
+      UPB_BTI_JC
       "orr %w[addr], %w[val], #0x80\n"
       "strb %w[addr], [%[ptr]]\n"
+      UPB_BTI_NOP
+
+      UPB_BTI_JC
+      // clang-format on
       : [addr] "=&r"(addr), [mask] "=&r"(mask)
       : [val] "r"(val), [ptr] "r"(ptr), [cnt] "r"((uint64_t)skip)
       : "memory");
@@ -193,6 +240,9 @@ UPB_NOINLINE static char* encode_longvarint(char* ptr, upb_encstate* e,
   ptr[continuations] = val >> (7 * continuations);
   return ptr;
 }
+#undef UPB_BTI_JC
+#undef UPB_BTI_NOP
+#undef UPB_BTI_SHIFT_IMM
 #else
 UPB_NOINLINE
 static char* encode_longvarint(char* ptr, upb_encstate* e, uint64_t val) {
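
For reference, here is what one stage of the unrolled sequence looks like once the macros above are expanded (shown for the lsr #56 / offset #8 stage; the other stages differ only in shift and store offset). With branch protection, a stage is four instructions (16 bytes), which is why the entry-point add uses lsl #4 and the table is aligned with .p2align 4; without it, a stage is two instructions (8 bytes) and the shift is #3. Register operands stay as the asm template names, since the real registers are compiler-allocated:

    // With UPB_ARM64_BTI_DEFAULT (UPB_BTI_SHIFT_IMM expands to "4"):
    "bti jc\n"                                 // landing pad; valid target for br, even via x16/x17
    "orr %[addr], %[mask], %[val], lsr #56\n"  // 0x80 | (val >> 56)
    "strb %w[addr], [%[ptr], #8]\n"
    "nop\n"                                    // pad the stage to 16 bytes

    // Without branch protection (UPB_BTI_JC / UPB_BTI_NOP expand to nothing,
    // UPB_BTI_SHIFT_IMM expands to "3"):
    "orr %[addr], %[mask], %[val], lsr #56\n"
    "strb %w[addr], [%[ptr], #8]\n"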
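
For readers who don't read arm64 assembly, here is a minimal portable sketch of the same technique: branching into the middle of an unrolled store sequence, approximated below with switch fall-through. This is illustrative only; the function name and the skip/continuations computation are my own, since those parts of encode_longvarint fall outside the hunks shown, but the shifts and the final store mirror the asm above.

    #include <stddef.h>
    #include <stdint.h>

    // Illustrative sketch, not upb code: encode `val` as a varint into `out`
    // (at least 10 bytes available) and return the encoded length.
    static size_t varint_encode_sketch(uint64_t val, char* out) {
      // Number of bytes that carry the 0x80 continuation bit.
      int continuations = 0;
      for (uint64_t v = val >> 7; v != 0; v >>= 7) continuations++;

      // The asm branches `skip` stages into the unrolled orr+strb sequence;
      // falling through a switch is the closest portable analogue.  Each
      // case stores one continuation byte, highest offset first.
      switch (continuations) {
        case 9: out[8] = (char)(0x80 | (val >> 56));  // fallthrough
        case 8: out[7] = (char)(0x80 | (val >> 49));  // fallthrough
        case 7: out[6] = (char)(0x80 | (val >> 42));  // fallthrough
        case 6: out[5] = (char)(0x80 | (val >> 35));  // fallthrough
        case 5: out[4] = (char)(0x80 | (val >> 28));  // fallthrough
        case 4: out[3] = (char)(0x80 | (val >> 21));  // fallthrough
        case 3: out[2] = (char)(0x80 | (val >> 14));  // fallthrough
        case 2: out[1] = (char)(0x80 | (val >> 7));   // fallthrough
        case 1: out[0] = (char)(0x80 | val);          // fallthrough
        default: break;
      }
      // The final byte has no continuation bit; this mirrors the tail store
      // `ptr[continuations] = val >> (7 * continuations)` after the asm block.
      out[continuations] = (char)(val >> (7 * continuations));
      return (size_t)continuations + 1;
    }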