Skip to content

Commit b583fa8

Browse files
committed
Improve: Faster utf8_count_neon w/out u64 unpacking in loop
1 parent 73da441 commit b583fa8

File tree

1 file changed

+6
-3
lines changed

1 file changed

+6
-3
lines changed

include/stringzilla/utf8.h

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1549,8 +1549,7 @@ SZ_PUBLIC sz_size_t sz_utf8_count_neon(sz_cptr_t text, sz_size_t length) {
15491549
uint8x16_t continuation_mask_vec = vdupq_n_u8(0xC0);
15501550
uint8x16_t continuation_pattern_vec = vdupq_n_u8(0x80);
15511551
sz_u8_t const *text_u8 = (sz_u8_t const *)text;
1552-
sz_size_t char_count = 0;
1553-
1552+
uint64x2_t char_count_vec = vdupq_n_u64(0);
15541553
while (length >= 16) {
15551554
text_vec.u8x16 = vld1q_u8(text_u8);
15561555
headers_vec.u8x16 = vandq_u8(text_vec.u8x16, continuation_mask_vec);
@@ -1560,25 +1559,29 @@ SZ_PUBLIC sz_size_t sz_utf8_count_neon(sz_cptr_t text, sz_size_t length) {
15601559
uint16x8_t sum16 = vpaddlq_u8(start_flags);
15611560
uint32x4_t sum32 = vpaddlq_u16(sum16);
15621561
uint64x2_t sum64 = vpaddlq_u32(sum32);
1563-
char_count += vgetq_lane_u64(sum64, 0) + vgetq_lane_u64(sum64, 1);
1562+
char_count_vec = vaddq_u64(char_count_vec, sum64);
15641563
text_u8 += 16;
15651564
length -= 16;
15661565
}
15671566

1567+
sz_size_t char_count = vgetq_lane_u64(char_count_vec, 0) + vgetq_lane_u64(char_count_vec, 1);
15681568
if (length) char_count += sz_utf8_count_serial((sz_cptr_t)text_u8, length);
15691569
return char_count;
15701570
}
15711571

15721572
SZ_PUBLIC sz_cptr_t sz_utf8_find_nth_neon(sz_cptr_t text, sz_size_t length, sz_size_t n) {
1573+
// TODO: Implement a NEON-accelerated version of sz_utf8_find_nth in the absence of a PDEP instruction.
15731574
return sz_utf8_find_nth_serial(text, length, n);
15741575
}
15751576

15761577
SZ_PUBLIC sz_cptr_t sz_utf8_unpack_chunk_neon( //
15771578
sz_cptr_t text, sz_size_t length, //
15781579
sz_rune_t *runes, sz_size_t runes_capacity, //
15791580
sz_size_t *runes_unpacked) {
1581+
// TODO: Implement a fast NEON version once we come up with an AVX-512 design.
15801582
return sz_utf8_unpack_chunk_serial(text, length, runes, runes_capacity, runes_unpacked);
15811583
}
1584+
15821585
#if defined(__clang__)
15831586
#pragma clang attribute pop
15841587
#elif defined(__GNUC__)

0 commit comments

Comments
 (0)