Skip to content

Commit 0259f58

Browse files
committed
Add: NEON UTF-8 tokenization kernels
1 parent 8c34baf commit 0259f58

File tree

4 files changed

+227
-13
lines changed

4 files changed

+227
-13
lines changed

include/stringzilla/find.h

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1617,7 +1617,7 @@ SZ_PUBLIC sz_cptr_t sz_rfind_byteset_ice(sz_cptr_t text, sz_size_t length, sz_by
16171617
#pragma GCC target("+simd")
16181618
#endif
16191619

1620-
SZ_INTERNAL sz_u64_t sz_vreinterpretq_u8_u4_(uint8x16_t vec) {
1620+
SZ_INTERNAL sz_u64_t sz_find_vreinterpretq_u8_u4_(uint8x16_t vec) {
16211621
// Use `vshrn` to produce a bitmask, similar to `movemask` in SSE.
16221622
// https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
16231623
return vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(vec), 4)), 0) & 0x8888888888888888ull;
@@ -1634,7 +1634,7 @@ SZ_PUBLIC sz_cptr_t sz_find_byte_neon(sz_cptr_t h, sz_size_t h_length, sz_cptr_t
16341634
// In Arm NEON we don't have a `movemask` to combine it with `ctz` and get the offset of the match.
16351635
// But assuming the `vmaxvq` is cheap, we can use it to find the first match, by blending (bitwise selecting)
16361636
// the vector with a relative offsets array.
1637-
matches = sz_vreinterpretq_u8_u4_(matches_vec.u8x16);
1637+
matches = sz_find_vreinterpretq_u8_u4_(matches_vec.u8x16);
16381638
if (matches) return h + sz_u64_ctz(matches) / 4;
16391639

16401640
h += 16, h_length -= 16;
@@ -1651,7 +1651,7 @@ SZ_PUBLIC sz_cptr_t sz_rfind_byte_neon(sz_cptr_t h, sz_size_t h_length, sz_cptr_
16511651
while (h_length >= 16) {
16521652
h_vec.u8x16 = vld1q_u8((sz_u8_t const *)h + h_length - 16);
16531653
matches_vec.u8x16 = vceqq_u8(h_vec.u8x16, n_vec.u8x16);
1654-
matches = sz_vreinterpretq_u8_u4_(matches_vec.u8x16);
1654+
matches = sz_find_vreinterpretq_u8_u4_(matches_vec.u8x16);
16551655
if (matches) return h + h_length - 1 - sz_u64_clz(matches) / 4;
16561656
h_length -= 16;
16571657
}
@@ -1676,7 +1676,7 @@ SZ_PUBLIC sz_u64_t sz_find_byteset_neon_register_( //
16761676
uint8x16_t matches_vec = vorrq_u8(matches_top_vec, matches_bottom_vec);
16771677
// Istead of pure `vandq_u8`, we can immediately broadcast a match presence across each 8-bit word.
16781678
matches_vec = vtstq_u8(matches_vec, byte_mask_vec);
1679-
return sz_vreinterpretq_u8_u4_(matches_vec);
1679+
return sz_find_vreinterpretq_u8_u4_(matches_vec);
16801680
}
16811681

16821682
SZ_PUBLIC sz_cptr_t sz_find_neon(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, sz_size_t n_length) {
@@ -1701,7 +1701,7 @@ SZ_PUBLIC sz_cptr_t sz_find_neon(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, s
17011701
h_last_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + 1));
17021702
matches_vec.u8x16 =
17031703
vandq_u8(vceqq_u8(h_first_vec.u8x16, n_first_vec.u8x16), vceqq_u8(h_last_vec.u8x16, n_last_vec.u8x16));
1704-
matches = sz_vreinterpretq_u8_u4_(matches_vec.u8x16);
1704+
matches = sz_find_vreinterpretq_u8_u4_(matches_vec.u8x16);
17051705
if (matches) return h + sz_u64_ctz(matches) / 4;
17061706
}
17071707
}
@@ -1723,7 +1723,7 @@ SZ_PUBLIC sz_cptr_t sz_find_neon(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, s
17231723
vceqq_u8(h_first_vec.u8x16, n_first_vec.u8x16), //
17241724
vceqq_u8(h_mid_vec.u8x16, n_mid_vec.u8x16)),
17251725
vceqq_u8(h_last_vec.u8x16, n_last_vec.u8x16));
1726-
matches = sz_vreinterpretq_u8_u4_(matches_vec.u8x16);
1726+
matches = sz_find_vreinterpretq_u8_u4_(matches_vec.u8x16);
17271727
if (matches) return h + sz_u64_ctz(matches) / 4;
17281728
}
17291729
}
@@ -1747,7 +1747,7 @@ SZ_PUBLIC sz_cptr_t sz_find_neon(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n, s
17471747
vceqq_u8(h_first_vec.u8x16, n_first_vec.u8x16), //
17481748
vceqq_u8(h_mid_vec.u8x16, n_mid_vec.u8x16)),
17491749
vceqq_u8(h_last_vec.u8x16, n_last_vec.u8x16));
1750-
matches = sz_vreinterpretq_u8_u4_(matches_vec.u8x16);
1750+
matches = sz_find_vreinterpretq_u8_u4_(matches_vec.u8x16);
17511751
while (matches) {
17521752
int potential_offset = sz_u64_ctz(matches) / 4;
17531753
if (sz_equal_neon(h + potential_offset, n, n_length)) return h + potential_offset;
@@ -1787,7 +1787,7 @@ SZ_PUBLIC sz_cptr_t sz_rfind_neon(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n,
17871787
vceqq_u8(h_first_vec.u8x16, n_first_vec.u8x16), //
17881788
vceqq_u8(h_mid_vec.u8x16, n_mid_vec.u8x16)),
17891789
vceqq_u8(h_last_vec.u8x16, n_last_vec.u8x16));
1790-
matches = sz_vreinterpretq_u8_u4_(matches_vec.u8x16);
1790+
matches = sz_find_vreinterpretq_u8_u4_(matches_vec.u8x16);
17911791
while (matches) {
17921792
int potential_offset = sz_u64_clz(matches) / 4;
17931793
if (sz_equal_neon(h + h_length - n_length - potential_offset, n, n_length))

include/stringzilla/utf8.h

Lines changed: 196 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1356,16 +1356,195 @@ SZ_PUBLIC sz_cptr_t sz_utf8_unpack_chunk_haswell( //
13561356
#pragma endregion // Haswell Implementation
13571357

13581358
#pragma region NEON Implementation
1359+
#if SZ_USE_NEON
1360+
#if defined(__clang__)
1361+
#pragma clang attribute push(__attribute__((target("+simd"))), apply_to = function)
1362+
#elif defined(__GNUC__)
1363+
#pragma GCC push_options
1364+
#pragma GCC target("+simd")
1365+
#endif
1366+
1367+
SZ_INTERNAL sz_u64_t sz_utf8_vreinterpretq_u8_u4_(uint8x16_t vec) {
1368+
// Use `vshrn` to produce a bitmask, similar to `movemask` in SSE.
1369+
// https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
1370+
return vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(vec), 4)), 0) & 0x8888888888888888ull;
1371+
}
13591372

13601373
SZ_PUBLIC sz_cptr_t sz_utf8_find_newline_neon(sz_cptr_t text, sz_size_t length, sz_size_t *matched_length) {
1374+
1375+
sz_u128_vec_t text_vec;
1376+
uint8x16_t n_vec = vdupq_n_u8('\n');
1377+
uint8x16_t v_vec = vdupq_n_u8('\v');
1378+
uint8x16_t f_vec = vdupq_n_u8('\f');
1379+
uint8x16_t r_vec = vdupq_n_u8('\r');
1380+
uint8x16_t xc2_vec = vdupq_n_u8(0xC2);
1381+
uint8x16_t x85_vec = vdupq_n_u8(0x85);
1382+
uint8x16_t xe2_vec = vdupq_n_u8(0xE2);
1383+
uint8x16_t x80_vec = vdupq_n_u8(0x80);
1384+
uint8x16_t xa8_vec = vdupq_n_u8(0xA8);
1385+
uint8x16_t xa9_vec = vdupq_n_u8(0xA9);
1386+
1387+
uint8x16_t drop1_vec = {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1388+
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00};
1389+
uint8x16_t drop2_vec = {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1390+
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00};
1391+
1392+
while (length >= 16) {
1393+
text_vec.u8x16 = vld1q_u8((sz_u8_t const *)text);
1394+
1395+
// 1-byte matches
1396+
uint8x16_t n_cmp = vceqq_u8(text_vec.u8x16, n_vec);
1397+
uint8x16_t v_cmp = vceqq_u8(text_vec.u8x16, v_vec);
1398+
uint8x16_t f_cmp = vceqq_u8(text_vec.u8x16, f_vec);
1399+
uint8x16_t r_cmp = vceqq_u8(text_vec.u8x16, r_vec);
1400+
uint8x16_t one_vec = vorrq_u8(vorrq_u8(n_cmp, v_cmp), vorrq_u8(f_cmp, r_cmp));
1401+
1402+
// 2- & 3-byte matches with shifted views
1403+
uint8x16_t t1 = vextq_u8(text_vec.u8x16, text_vec.u8x16, 1);
1404+
uint8x16_t t2 = vextq_u8(text_vec.u8x16, text_vec.u8x16, 2);
1405+
uint8x16_t rn_vec = vandq_u8(r_cmp, vceqq_u8(t1, n_vec));
1406+
uint8x16_t xc285_vec = vandq_u8(vceqq_u8(text_vec.u8x16, xc2_vec), vceqq_u8(t1, x85_vec));
1407+
uint8x16_t two_vec = vandq_u8(vorrq_u8(rn_vec, xc285_vec), drop1_vec); // Ignore last split match
1408+
1409+
uint8x16_t xe2_cmp = vceqq_u8(text_vec.u8x16, xe2_vec);
1410+
uint8x16_t e280_vec = vandq_u8(xe2_cmp, vceqq_u8(t1, x80_vec));
1411+
uint8x16_t e280a8_vec = vandq_u8(e280_vec, vceqq_u8(t2, xa8_vec));
1412+
uint8x16_t e280a9_vec = vandq_u8(e280_vec, vceqq_u8(t2, xa9_vec));
1413+
uint8x16_t three_vec = vandq_u8(vorrq_u8(e280a8_vec, e280a9_vec), drop2_vec); // Ignore last two split matches
1414+
1415+
// Quick presence check
1416+
uint8x16_t combined_vec = vorrq_u8(one_vec, vorrq_u8(two_vec, three_vec));
1417+
if (vmaxvq_u8(combined_vec)) {
1418+
1419+
// Late mask extraction only when a match exists
1420+
sz_u64_t one_mask = sz_utf8_vreinterpretq_u8_u4_(one_vec);
1421+
sz_u64_t two_mask = sz_utf8_vreinterpretq_u8_u4_(two_vec);
1422+
sz_u64_t three_mask = sz_utf8_vreinterpretq_u8_u4_(three_vec);
1423+
sz_u64_t combined_mask = one_mask | two_mask | three_mask;
1424+
1425+
int bit_index = sz_u64_ctz(combined_mask);
1426+
sz_u64_t first_match_mask = (sz_u64_t)1 << bit_index;
1427+
sz_size_t length_value = 1;
1428+
length_value += (first_match_mask & (two_mask | three_mask)) != 0;
1429+
length_value += (first_match_mask & three_mask) != 0;
1430+
*matched_length = length_value;
1431+
return text + (bit_index / 4);
1432+
}
1433+
text += 14;
1434+
length -= 14;
1435+
}
1436+
13611437
return sz_utf8_find_newline_serial(text, length, matched_length);
13621438
}
13631439

13641440
SZ_PUBLIC sz_cptr_t sz_utf8_find_whitespace_neon(sz_cptr_t text, sz_size_t length, sz_size_t *matched_length) {
1441+
1442+
sz_u128_vec_t text_vec;
1443+
uint8x16_t t_vec = vdupq_n_u8('\t');
1444+
uint8x16_t r_vec = vdupq_n_u8('\r');
1445+
uint8x16_t x20_vec = vdupq_n_u8(' ');
1446+
uint8x16_t xc2_vec = vdupq_n_u8(0xC2);
1447+
uint8x16_t x85_vec = vdupq_n_u8(0x85);
1448+
uint8x16_t xa0_vec = vdupq_n_u8(0xA0);
1449+
uint8x16_t xe1_vec = vdupq_n_u8(0xE1);
1450+
uint8x16_t xe2_vec = vdupq_n_u8(0xE2);
1451+
uint8x16_t xe3_vec = vdupq_n_u8(0xE3);
1452+
uint8x16_t x9a_vec = vdupq_n_u8(0x9A);
1453+
uint8x16_t x80_vec = vdupq_n_u8(0x80);
1454+
uint8x16_t x81_vec = vdupq_n_u8(0x81);
1455+
uint8x16_t x8d_vec = vdupq_n_u8(0x8D);
1456+
uint8x16_t xa8_vec = vdupq_n_u8(0xA8);
1457+
uint8x16_t xa9_vec = vdupq_n_u8(0xA9);
1458+
uint8x16_t xaf_vec = vdupq_n_u8(0xAF);
1459+
uint8x16_t x9f_vec = vdupq_n_u8(0x9F);
1460+
1461+
uint8x16_t drop1_vec = {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1462+
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00};
1463+
uint8x16_t drop2_vec = {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1464+
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00};
1465+
1466+
while (length >= 16) {
1467+
text_vec.u8x16 = vld1q_u8((sz_u8_t const *)text);
1468+
1469+
// 1-byte matches
1470+
uint8x16_t x20_cmp = vceqq_u8(text_vec.u8x16, x20_vec);
1471+
uint8x16_t range_cmp = vandq_u8(vcgeq_u8(text_vec.u8x16, t_vec), vcleq_u8(text_vec.u8x16, r_vec));
1472+
uint8x16_t one_vec = vorrq_u8(x20_cmp, range_cmp);
1473+
1474+
// 2-byte matches
1475+
uint8x16_t text1 = vextq_u8(text_vec.u8x16, text_vec.u8x16, 1);
1476+
uint8x16_t xc2_cmp = vceqq_u8(text_vec.u8x16, xc2_vec);
1477+
uint8x16_t two_vec =
1478+
vorrq_u8(vandq_u8(xc2_cmp, vceqq_u8(text1, x85_vec)), vandq_u8(xc2_cmp, vceqq_u8(text1, xa0_vec)));
1479+
two_vec = vandq_u8(two_vec, drop1_vec); // Ignore last split match
1480+
1481+
// 3-byte matches
1482+
uint8x16_t text2 = vextq_u8(text_vec.u8x16, text_vec.u8x16, 2);
1483+
uint8x16_t xe1_cmp = vceqq_u8(text_vec.u8x16, xe1_vec);
1484+
uint8x16_t xe2_cmp = vceqq_u8(text_vec.u8x16, xe2_vec);
1485+
uint8x16_t xe3_cmp = vceqq_u8(text_vec.u8x16, xe3_vec);
1486+
uint8x16_t x80_ge_cmp = vcgeq_u8(text2, x80_vec);
1487+
uint8x16_t x8d_le_cmp = vcleq_u8(text2, x8d_vec);
1488+
1489+
uint8x16_t ogham_vec = vandq_u8(xe1_cmp, vandq_u8(vceqq_u8(text1, x9a_vec), vceqq_u8(text2, x80_vec)));
1490+
uint8x16_t range_e280_vec =
1491+
vandq_u8(xe2_cmp, vandq_u8(vceqq_u8(text1, x80_vec), vandq_u8(x80_ge_cmp, x8d_le_cmp)));
1492+
uint8x16_t u2028_vec = vandq_u8(xe2_cmp, vandq_u8(vceqq_u8(text1, x80_vec), vceqq_u8(text2, xa8_vec)));
1493+
uint8x16_t u2029_vec = vandq_u8(xe2_cmp, vandq_u8(vceqq_u8(text1, x80_vec), vceqq_u8(text2, xa9_vec)));
1494+
uint8x16_t u202f_vec = vandq_u8(xe2_cmp, vandq_u8(vceqq_u8(text1, x80_vec), vceqq_u8(text2, xaf_vec)));
1495+
uint8x16_t u205f_vec = vandq_u8(xe2_cmp, vandq_u8(vceqq_u8(text1, x81_vec), vceqq_u8(text2, x9f_vec)));
1496+
uint8x16_t ideographic_vec = vandq_u8(xe3_cmp, vandq_u8(vceqq_u8(text1, x80_vec), vceqq_u8(text2, x80_vec)));
1497+
uint8x16_t three_vec = vorrq_u8(vorrq_u8(vorrq_u8(ogham_vec, range_e280_vec), vorrq_u8(u2028_vec, u2029_vec)),
1498+
vorrq_u8(vorrq_u8(u202f_vec, u205f_vec), ideographic_vec));
1499+
three_vec = vandq_u8(three_vec, drop2_vec); // Ignore last two split matches
1500+
1501+
uint8x16_t combined_vec = vorrq_u8(one_vec, vorrq_u8(two_vec, three_vec));
1502+
if (vmaxvq_u8(combined_vec)) {
1503+
// Late mask extraction only when a match exists
1504+
sz_u64_t one_mask = sz_utf8_vreinterpretq_u8_u4_(one_vec);
1505+
sz_u64_t two_mask = sz_utf8_vreinterpretq_u8_u4_(two_vec);
1506+
sz_u64_t three_mask = sz_utf8_vreinterpretq_u8_u4_(three_vec);
1507+
sz_u64_t combined_mask = one_mask | two_mask | three_mask;
1508+
1509+
int bit_index = sz_u64_ctz(combined_mask);
1510+
sz_u64_t first_match_mask = (sz_u64_t)1 << bit_index;
1511+
sz_size_t length_value = 1;
1512+
length_value += (first_match_mask & (two_mask | three_mask)) != 0;
1513+
length_value += (first_match_mask & three_mask) != 0;
1514+
*matched_length = length_value;
1515+
return text + (bit_index / 4);
1516+
}
1517+
text += 14;
1518+
length -= 14;
1519+
}
1520+
13651521
return sz_utf8_find_whitespace_serial(text, length, matched_length);
13661522
}
13671523

1368-
SZ_PUBLIC sz_size_t sz_utf8_count_neon(sz_cptr_t text, sz_size_t length) { return sz_utf8_count_serial(text, length); }
1524+
SZ_PUBLIC sz_size_t sz_utf8_count_neon(sz_cptr_t text, sz_size_t length) {
1525+
sz_u128_vec_t text_vec, headers_vec, continuation_vec;
1526+
uint8x16_t continuation_mask_vec = vdupq_n_u8(0xC0);
1527+
uint8x16_t continuation_pattern_vec = vdupq_n_u8(0x80);
1528+
sz_u8_t const *text_u8 = (sz_u8_t const *)text;
1529+
sz_size_t char_count = 0;
1530+
1531+
while (length >= 16) {
1532+
text_vec.u8x16 = vld1q_u8(text_u8);
1533+
headers_vec.u8x16 = vandq_u8(text_vec.u8x16, continuation_mask_vec);
1534+
continuation_vec.u8x16 = vceqq_u8(headers_vec.u8x16, continuation_pattern_vec);
1535+
// Convert 0xFF/0x00 into 1/0 and sum.
1536+
uint8x16_t start_flags = vshrq_n_u8(vmvnq_u8(continuation_vec.u8x16), 7);
1537+
uint16x8_t sum16 = vpaddlq_u8(start_flags);
1538+
uint32x4_t sum32 = vpaddlq_u16(sum16);
1539+
uint64x2_t sum64 = vpaddlq_u32(sum32);
1540+
char_count += vgetq_lane_u64(sum64, 0) + vgetq_lane_u64(sum64, 1);
1541+
text_u8 += 16;
1542+
length -= 16;
1543+
}
1544+
1545+
if (length) char_count += sz_utf8_count_serial((sz_cptr_t)text_u8, length);
1546+
return char_count;
1547+
}
13691548

13701549
SZ_PUBLIC sz_cptr_t sz_utf8_find_nth_neon(sz_cptr_t text, sz_size_t length, sz_size_t n) {
13711550
return sz_utf8_find_nth_serial(text, length, n);
@@ -1377,6 +1556,12 @@ SZ_PUBLIC sz_cptr_t sz_utf8_unpack_chunk_neon( //
13771556
sz_size_t *runes_unpacked) {
13781557
return sz_utf8_unpack_chunk_serial(text, length, runes, runes_capacity, runes_unpacked);
13791558
}
1559+
#if defined(__clang__)
1560+
#pragma clang attribute pop
1561+
#elif defined(__GNUC__)
1562+
#pragma GCC pop_options
1563+
#endif
1564+
#endif // SZ_USE_NEON
13801565

13811566
#pragma endregion // NEON Implementation
13821567

@@ -1389,6 +1574,8 @@ SZ_DYNAMIC sz_size_t sz_utf8_count(sz_cptr_t text, sz_size_t length) {
13891574
return sz_utf8_count_ice(text, length);
13901575
#elif SZ_USE_HASWELL
13911576
return sz_utf8_count_haswell(text, length);
1577+
#elif SZ_USE_NEON
1578+
return sz_utf8_count_neon(text, length);
13921579
#else
13931580
return sz_utf8_count_serial(text, length);
13941581
#endif
@@ -1399,6 +1586,8 @@ SZ_DYNAMIC sz_cptr_t sz_utf8_find_nth(sz_cptr_t text, sz_size_t length, sz_size_
13991586
return sz_utf8_find_nth_ice(text, length, n);
14001587
#elif SZ_USE_HASWELL
14011588
return sz_utf8_find_nth_haswell(text, length, n);
1589+
#elif SZ_USE_NEON
1590+
return sz_utf8_find_nth_neon(text, length, n);
14021591
#else
14031592
return sz_utf8_find_nth_serial(text, length, n);
14041593
#endif
@@ -1409,6 +1598,8 @@ SZ_DYNAMIC sz_cptr_t sz_utf8_find_newline(sz_cptr_t text, sz_size_t length, sz_s
14091598
return sz_utf8_find_newline_ice(text, length, matched_length);
14101599
#elif SZ_USE_HASWELL
14111600
return sz_utf8_find_newline_haswell(text, length, matched_length);
1601+
#elif SZ_USE_NEON
1602+
return sz_utf8_find_newline_neon(text, length, matched_length);
14121603
#else
14131604
return sz_utf8_find_newline_serial(text, length, matched_length);
14141605
#endif
@@ -1419,6 +1610,8 @@ SZ_DYNAMIC sz_cptr_t sz_utf8_find_whitespace(sz_cptr_t text, sz_size_t length, s
14191610
return sz_utf8_find_whitespace_ice(text, length, matched_length);
14201611
#elif SZ_USE_HASWELL
14211612
return sz_utf8_find_whitespace_haswell(text, length, matched_length);
1613+
#elif SZ_USE_NEON
1614+
return sz_utf8_find_whitespace_neon(text, length, matched_length);
14221615
#else
14231616
return sz_utf8_find_whitespace_serial(text, length, matched_length);
14241617
#endif
@@ -1430,6 +1623,8 @@ SZ_DYNAMIC sz_cptr_t sz_utf8_unpack_chunk(sz_cptr_t text, sz_size_t length, sz_r
14301623
return sz_utf8_unpack_chunk_ice(text, length, runes, runes_capacity, runes_unpacked);
14311624
#elif SZ_USE_HASWELL
14321625
return sz_utf8_unpack_chunk_haswell(text, length, runes, runes_capacity, runes_unpacked);
1626+
#elif SZ_USE_NEON
1627+
return sz_utf8_unpack_chunk_neon(text, length, runes, runes_capacity, runes_unpacked);
14331628
#else
14341629
return sz_utf8_unpack_chunk_serial(text, length, runes, runes_capacity, runes_unpacked);
14351630
#endif

scripts/bench_find.cpp

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -671,6 +671,18 @@ void bench_utf8_find_boundary(environment_t const &env) {
671671
matcher_from_sz_utf8_find_boundary<sz_utf8_find_whitespace_ice>>(env))
672672
.log(base_whitespace);
673673
#endif
674+
#if SZ_USE_NEON
675+
bench_unary( //
676+
env, "sz_utf8_find_newline_neon", base_newline_call,
677+
callable_for_utf8_find_boundary<sz::range_matches,
678+
matcher_from_sz_utf8_find_boundary<sz_utf8_find_newline_neon>>(env))
679+
.log(base_newline);
680+
bench_unary( //
681+
env, "sz_utf8_find_whitespace_neon", base_whitespace_call,
682+
callable_for_utf8_find_boundary<sz::range_matches,
683+
matcher_from_sz_utf8_find_boundary<sz_utf8_find_whitespace_neon>>(env))
684+
.log(base_whitespace);
685+
#endif
674686
}
675687

676688
#pragma endregion
@@ -692,4 +704,4 @@ int main(int argc, char const **argv) {
692704

693705
std::printf("All benchmarks passed.\n");
694706
return 0;
695-
}
707+
}

scripts/bench_token.cpp

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -231,15 +231,22 @@ void bench_utf8_count(environment_t const &env) {
231231
#if SZ_USE_ICE
232232
bench_unary(env, "sz_utf8_count_ice", validator, utf8_count_from_sz<sz_utf8_count_ice> {env}).log(base);
233233
#endif
234+
#if SZ_USE_NEON
235+
bench_unary(env, "sz_utf8_count_neon", validator, utf8_count_from_sz<sz_utf8_count_neon> {env}).log(base);
236+
#endif
234237
}
235238

236239
void bench_utf8_unpack(environment_t const &env) {
237240

238-
auto validator = utf8_unpack_from_sz<sz_utf8_unpack_chunk_serial> {env};
241+
auto validator = utf8_unpack_from_sz<sz_utf8_unpack_chunk_serial> {env, {}};
239242
bench_result_t base = bench_unary(env, "sz_utf8_unpack_chunk_serial", validator).log();
240243

241244
#if SZ_USE_ICE
242-
bench_unary(env, "sz_utf8_unpack_chunk_ice", validator, utf8_unpack_from_sz<sz_utf8_unpack_chunk_ice> {env})
245+
bench_unary(env, "sz_utf8_unpack_chunk_ice", validator, utf8_unpack_from_sz<sz_utf8_unpack_chunk_ice> {env, {}})
246+
.log(base);
247+
#endif
248+
#if SZ_USE_NEON
249+
bench_unary(env, "sz_utf8_unpack_chunk_neon", validator, utf8_unpack_from_sz<sz_utf8_unpack_chunk_neon> {env, {}})
243250
.log(base);
244251
#endif
245252
}
@@ -529,4 +536,4 @@ int main(int argc, char const **argv) {
529536

530537
std::printf("All benchmarks passed.\n");
531538
return 0;
532-
}
539+
}

0 commit comments

Comments
 (0)