@@ -1356,16 +1356,195 @@ SZ_PUBLIC sz_cptr_t sz_utf8_unpack_chunk_haswell( //
13561356#pragma endregion // Haswell Implementation
13571357
13581358#pragma region NEON Implementation
1359+ #if SZ_USE_NEON
1360+ #if defined(__clang__ )
1361+ #pragma clang attribute push(__attribute__((target("+simd"))), apply_to = function)
1362+ #elif defined(__GNUC__ )
1363+ #pragma GCC push_options
1364+ #pragma GCC target("+simd")
1365+ #endif
1366+
1367+ SZ_INTERNAL sz_u64_t sz_utf8_vreinterpretq_u8_u4_ (uint8x16_t vec ) {
1368+ // Use `vshrn` to produce a bitmask, similar to `movemask` in SSE.
1369+ // https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
1370+ return vget_lane_u64 (vreinterpret_u64_u8 (vshrn_n_u16 (vreinterpretq_u16_u8 (vec ), 4 )), 0 ) & 0x8888888888888888ull ;
1371+ }
13591372
13601373SZ_PUBLIC sz_cptr_t sz_utf8_find_newline_neon (sz_cptr_t text , sz_size_t length , sz_size_t * matched_length ) {
1374+
1375+ sz_u128_vec_t text_vec ;
1376+ uint8x16_t n_vec = vdupq_n_u8 ('\n' );
1377+ uint8x16_t v_vec = vdupq_n_u8 ('\v' );
1378+ uint8x16_t f_vec = vdupq_n_u8 ('\f' );
1379+ uint8x16_t r_vec = vdupq_n_u8 ('\r' );
1380+ uint8x16_t xc2_vec = vdupq_n_u8 (0xC2 );
1381+ uint8x16_t x85_vec = vdupq_n_u8 (0x85 );
1382+ uint8x16_t xe2_vec = vdupq_n_u8 (0xE2 );
1383+ uint8x16_t x80_vec = vdupq_n_u8 (0x80 );
1384+ uint8x16_t xa8_vec = vdupq_n_u8 (0xA8 );
1385+ uint8x16_t xa9_vec = vdupq_n_u8 (0xA9 );
1386+
1387+ uint8x16_t drop1_vec = {0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF ,
1388+ 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0x00 };
1389+ uint8x16_t drop2_vec = {0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF ,
1390+ 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0x00 , 0x00 };
1391+
1392+ while (length >= 16 ) {
1393+ text_vec .u8x16 = vld1q_u8 ((sz_u8_t const * )text );
1394+
1395+ // 1-byte matches
1396+ uint8x16_t n_cmp = vceqq_u8 (text_vec .u8x16 , n_vec );
1397+ uint8x16_t v_cmp = vceqq_u8 (text_vec .u8x16 , v_vec );
1398+ uint8x16_t f_cmp = vceqq_u8 (text_vec .u8x16 , f_vec );
1399+ uint8x16_t r_cmp = vceqq_u8 (text_vec .u8x16 , r_vec );
1400+ uint8x16_t one_vec = vorrq_u8 (vorrq_u8 (n_cmp , v_cmp ), vorrq_u8 (f_cmp , r_cmp ));
1401+
1402+ // 2- & 3-byte matches with shifted views
1403+ uint8x16_t t1 = vextq_u8 (text_vec .u8x16 , text_vec .u8x16 , 1 );
1404+ uint8x16_t t2 = vextq_u8 (text_vec .u8x16 , text_vec .u8x16 , 2 );
1405+ uint8x16_t rn_vec = vandq_u8 (r_cmp , vceqq_u8 (t1 , n_vec ));
1406+ uint8x16_t xc285_vec = vandq_u8 (vceqq_u8 (text_vec .u8x16 , xc2_vec ), vceqq_u8 (t1 , x85_vec ));
1407+ uint8x16_t two_vec = vandq_u8 (vorrq_u8 (rn_vec , xc285_vec ), drop1_vec ); // Ignore last split match
1408+
1409+ uint8x16_t xe2_cmp = vceqq_u8 (text_vec .u8x16 , xe2_vec );
1410+ uint8x16_t e280_vec = vandq_u8 (xe2_cmp , vceqq_u8 (t1 , x80_vec ));
1411+ uint8x16_t e280a8_vec = vandq_u8 (e280_vec , vceqq_u8 (t2 , xa8_vec ));
1412+ uint8x16_t e280a9_vec = vandq_u8 (e280_vec , vceqq_u8 (t2 , xa9_vec ));
1413+ uint8x16_t three_vec = vandq_u8 (vorrq_u8 (e280a8_vec , e280a9_vec ), drop2_vec ); // Ignore last two split matches
1414+
1415+ // Quick presence check
1416+ uint8x16_t combined_vec = vorrq_u8 (one_vec , vorrq_u8 (two_vec , three_vec ));
1417+ if (vmaxvq_u8 (combined_vec )) {
1418+
1419+ // Late mask extraction only when a match exists
1420+ sz_u64_t one_mask = sz_utf8_vreinterpretq_u8_u4_ (one_vec );
1421+ sz_u64_t two_mask = sz_utf8_vreinterpretq_u8_u4_ (two_vec );
1422+ sz_u64_t three_mask = sz_utf8_vreinterpretq_u8_u4_ (three_vec );
1423+ sz_u64_t combined_mask = one_mask | two_mask | three_mask ;
1424+
1425+ int bit_index = sz_u64_ctz (combined_mask );
1426+ sz_u64_t first_match_mask = (sz_u64_t )1 << bit_index ;
1427+ sz_size_t length_value = 1 ;
1428+ length_value += (first_match_mask & (two_mask | three_mask )) != 0 ;
1429+ length_value += (first_match_mask & three_mask ) != 0 ;
1430+ * matched_length = length_value ;
1431+ return text + (bit_index / 4 );
1432+ }
1433+ text += 14 ;
1434+ length -= 14 ;
1435+ }
1436+
13611437 return sz_utf8_find_newline_serial (text , length , matched_length );
13621438}
13631439
13641440SZ_PUBLIC sz_cptr_t sz_utf8_find_whitespace_neon (sz_cptr_t text , sz_size_t length , sz_size_t * matched_length ) {
1441+
1442+ sz_u128_vec_t text_vec ;
1443+ uint8x16_t t_vec = vdupq_n_u8 ('\t' );
1444+ uint8x16_t r_vec = vdupq_n_u8 ('\r' );
1445+ uint8x16_t x20_vec = vdupq_n_u8 (' ' );
1446+ uint8x16_t xc2_vec = vdupq_n_u8 (0xC2 );
1447+ uint8x16_t x85_vec = vdupq_n_u8 (0x85 );
1448+ uint8x16_t xa0_vec = vdupq_n_u8 (0xA0 );
1449+ uint8x16_t xe1_vec = vdupq_n_u8 (0xE1 );
1450+ uint8x16_t xe2_vec = vdupq_n_u8 (0xE2 );
1451+ uint8x16_t xe3_vec = vdupq_n_u8 (0xE3 );
1452+ uint8x16_t x9a_vec = vdupq_n_u8 (0x9A );
1453+ uint8x16_t x80_vec = vdupq_n_u8 (0x80 );
1454+ uint8x16_t x81_vec = vdupq_n_u8 (0x81 );
1455+ uint8x16_t x8d_vec = vdupq_n_u8 (0x8D );
1456+ uint8x16_t xa8_vec = vdupq_n_u8 (0xA8 );
1457+ uint8x16_t xa9_vec = vdupq_n_u8 (0xA9 );
1458+ uint8x16_t xaf_vec = vdupq_n_u8 (0xAF );
1459+ uint8x16_t x9f_vec = vdupq_n_u8 (0x9F );
1460+
1461+ uint8x16_t drop1_vec = {0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF ,
1462+ 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0x00 };
1463+ uint8x16_t drop2_vec = {0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF ,
1464+ 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0xFF , 0x00 , 0x00 };
1465+
1466+ while (length >= 16 ) {
1467+ text_vec .u8x16 = vld1q_u8 ((sz_u8_t const * )text );
1468+
1469+ // 1-byte matches
1470+ uint8x16_t x20_cmp = vceqq_u8 (text_vec .u8x16 , x20_vec );
1471+ uint8x16_t range_cmp = vandq_u8 (vcgeq_u8 (text_vec .u8x16 , t_vec ), vcleq_u8 (text_vec .u8x16 , r_vec ));
1472+ uint8x16_t one_vec = vorrq_u8 (x20_cmp , range_cmp );
1473+
1474+ // 2-byte matches
1475+ uint8x16_t text1 = vextq_u8 (text_vec .u8x16 , text_vec .u8x16 , 1 );
1476+ uint8x16_t xc2_cmp = vceqq_u8 (text_vec .u8x16 , xc2_vec );
1477+ uint8x16_t two_vec =
1478+ vorrq_u8 (vandq_u8 (xc2_cmp , vceqq_u8 (text1 , x85_vec )), vandq_u8 (xc2_cmp , vceqq_u8 (text1 , xa0_vec )));
1479+ two_vec = vandq_u8 (two_vec , drop1_vec ); // Ignore last split match
1480+
1481+ // 3-byte matches
1482+ uint8x16_t text2 = vextq_u8 (text_vec .u8x16 , text_vec .u8x16 , 2 );
1483+ uint8x16_t xe1_cmp = vceqq_u8 (text_vec .u8x16 , xe1_vec );
1484+ uint8x16_t xe2_cmp = vceqq_u8 (text_vec .u8x16 , xe2_vec );
1485+ uint8x16_t xe3_cmp = vceqq_u8 (text_vec .u8x16 , xe3_vec );
1486+ uint8x16_t x80_ge_cmp = vcgeq_u8 (text2 , x80_vec );
1487+ uint8x16_t x8d_le_cmp = vcleq_u8 (text2 , x8d_vec );
1488+
1489+ uint8x16_t ogham_vec = vandq_u8 (xe1_cmp , vandq_u8 (vceqq_u8 (text1 , x9a_vec ), vceqq_u8 (text2 , x80_vec )));
1490+ uint8x16_t range_e280_vec =
1491+ vandq_u8 (xe2_cmp , vandq_u8 (vceqq_u8 (text1 , x80_vec ), vandq_u8 (x80_ge_cmp , x8d_le_cmp )));
1492+ uint8x16_t u2028_vec = vandq_u8 (xe2_cmp , vandq_u8 (vceqq_u8 (text1 , x80_vec ), vceqq_u8 (text2 , xa8_vec )));
1493+ uint8x16_t u2029_vec = vandq_u8 (xe2_cmp , vandq_u8 (vceqq_u8 (text1 , x80_vec ), vceqq_u8 (text2 , xa9_vec )));
1494+ uint8x16_t u202f_vec = vandq_u8 (xe2_cmp , vandq_u8 (vceqq_u8 (text1 , x80_vec ), vceqq_u8 (text2 , xaf_vec )));
1495+ uint8x16_t u205f_vec = vandq_u8 (xe2_cmp , vandq_u8 (vceqq_u8 (text1 , x81_vec ), vceqq_u8 (text2 , x9f_vec )));
1496+ uint8x16_t ideographic_vec = vandq_u8 (xe3_cmp , vandq_u8 (vceqq_u8 (text1 , x80_vec ), vceqq_u8 (text2 , x80_vec )));
1497+ uint8x16_t three_vec = vorrq_u8 (vorrq_u8 (vorrq_u8 (ogham_vec , range_e280_vec ), vorrq_u8 (u2028_vec , u2029_vec )),
1498+ vorrq_u8 (vorrq_u8 (u202f_vec , u205f_vec ), ideographic_vec ));
1499+ three_vec = vandq_u8 (three_vec , drop2_vec ); // Ignore last two split matches
1500+
1501+ uint8x16_t combined_vec = vorrq_u8 (one_vec , vorrq_u8 (two_vec , three_vec ));
1502+ if (vmaxvq_u8 (combined_vec )) {
1503+ // Late mask extraction only when a match exists
1504+ sz_u64_t one_mask = sz_utf8_vreinterpretq_u8_u4_ (one_vec );
1505+ sz_u64_t two_mask = sz_utf8_vreinterpretq_u8_u4_ (two_vec );
1506+ sz_u64_t three_mask = sz_utf8_vreinterpretq_u8_u4_ (three_vec );
1507+ sz_u64_t combined_mask = one_mask | two_mask | three_mask ;
1508+
1509+ int bit_index = sz_u64_ctz (combined_mask );
1510+ sz_u64_t first_match_mask = (sz_u64_t )1 << bit_index ;
1511+ sz_size_t length_value = 1 ;
1512+ length_value += (first_match_mask & (two_mask | three_mask )) != 0 ;
1513+ length_value += (first_match_mask & three_mask ) != 0 ;
1514+ * matched_length = length_value ;
1515+ return text + (bit_index / 4 );
1516+ }
1517+ text += 14 ;
1518+ length -= 14 ;
1519+ }
1520+
13651521 return sz_utf8_find_whitespace_serial (text , length , matched_length );
13661522}
13671523
1368- SZ_PUBLIC sz_size_t sz_utf8_count_neon (sz_cptr_t text , sz_size_t length ) { return sz_utf8_count_serial (text , length ); }
1524+ SZ_PUBLIC sz_size_t sz_utf8_count_neon (sz_cptr_t text , sz_size_t length ) {
1525+ sz_u128_vec_t text_vec , headers_vec , continuation_vec ;
1526+ uint8x16_t continuation_mask_vec = vdupq_n_u8 (0xC0 );
1527+ uint8x16_t continuation_pattern_vec = vdupq_n_u8 (0x80 );
1528+ sz_u8_t const * text_u8 = (sz_u8_t const * )text ;
1529+ sz_size_t char_count = 0 ;
1530+
1531+ while (length >= 16 ) {
1532+ text_vec .u8x16 = vld1q_u8 (text_u8 );
1533+ headers_vec .u8x16 = vandq_u8 (text_vec .u8x16 , continuation_mask_vec );
1534+ continuation_vec .u8x16 = vceqq_u8 (headers_vec .u8x16 , continuation_pattern_vec );
1535+ // Convert 0xFF/0x00 into 1/0 and sum.
1536+ uint8x16_t start_flags = vshrq_n_u8 (vmvnq_u8 (continuation_vec .u8x16 ), 7 );
1537+ uint16x8_t sum16 = vpaddlq_u8 (start_flags );
1538+ uint32x4_t sum32 = vpaddlq_u16 (sum16 );
1539+ uint64x2_t sum64 = vpaddlq_u32 (sum32 );
1540+ char_count += vgetq_lane_u64 (sum64 , 0 ) + vgetq_lane_u64 (sum64 , 1 );
1541+ text_u8 += 16 ;
1542+ length -= 16 ;
1543+ }
1544+
1545+ if (length ) char_count += sz_utf8_count_serial ((sz_cptr_t )text_u8 , length );
1546+ return char_count ;
1547+ }
13691548
13701549SZ_PUBLIC sz_cptr_t sz_utf8_find_nth_neon (sz_cptr_t text , sz_size_t length , sz_size_t n ) {
13711550 return sz_utf8_find_nth_serial (text , length , n );
@@ -1377,6 +1556,12 @@ SZ_PUBLIC sz_cptr_t sz_utf8_unpack_chunk_neon( //
13771556 sz_size_t * runes_unpacked ) {
13781557 return sz_utf8_unpack_chunk_serial (text , length , runes , runes_capacity , runes_unpacked );
13791558}
1559+ #if defined(__clang__ )
1560+ #pragma clang attribute pop
1561+ #elif defined(__GNUC__ )
1562+ #pragma GCC pop_options
1563+ #endif
1564+ #endif // SZ_USE_NEON
13801565
13811566#pragma endregion // NEON Implementation
13821567
@@ -1389,6 +1574,8 @@ SZ_DYNAMIC sz_size_t sz_utf8_count(sz_cptr_t text, sz_size_t length) {
13891574 return sz_utf8_count_ice (text , length );
13901575#elif SZ_USE_HASWELL
13911576 return sz_utf8_count_haswell (text , length );
1577+ #elif SZ_USE_NEON
1578+ return sz_utf8_count_neon (text , length );
13921579#else
13931580 return sz_utf8_count_serial (text , length );
13941581#endif
@@ -1399,6 +1586,8 @@ SZ_DYNAMIC sz_cptr_t sz_utf8_find_nth(sz_cptr_t text, sz_size_t length, sz_size_
13991586 return sz_utf8_find_nth_ice (text , length , n );
14001587#elif SZ_USE_HASWELL
14011588 return sz_utf8_find_nth_haswell (text , length , n );
1589+ #elif SZ_USE_NEON
1590+ return sz_utf8_find_nth_neon (text , length , n );
14021591#else
14031592 return sz_utf8_find_nth_serial (text , length , n );
14041593#endif
@@ -1409,6 +1598,8 @@ SZ_DYNAMIC sz_cptr_t sz_utf8_find_newline(sz_cptr_t text, sz_size_t length, sz_s
14091598 return sz_utf8_find_newline_ice (text , length , matched_length );
14101599#elif SZ_USE_HASWELL
14111600 return sz_utf8_find_newline_haswell (text , length , matched_length );
1601+ #elif SZ_USE_NEON
1602+ return sz_utf8_find_newline_neon (text , length , matched_length );
14121603#else
14131604 return sz_utf8_find_newline_serial (text , length , matched_length );
14141605#endif
@@ -1419,6 +1610,8 @@ SZ_DYNAMIC sz_cptr_t sz_utf8_find_whitespace(sz_cptr_t text, sz_size_t length, s
14191610 return sz_utf8_find_whitespace_ice (text , length , matched_length );
14201611#elif SZ_USE_HASWELL
14211612 return sz_utf8_find_whitespace_haswell (text , length , matched_length );
1613+ #elif SZ_USE_NEON
1614+ return sz_utf8_find_whitespace_neon (text , length , matched_length );
14221615#else
14231616 return sz_utf8_find_whitespace_serial (text , length , matched_length );
14241617#endif
@@ -1430,6 +1623,8 @@ SZ_DYNAMIC sz_cptr_t sz_utf8_unpack_chunk(sz_cptr_t text, sz_size_t length, sz_r
14301623 return sz_utf8_unpack_chunk_ice (text , length , runes , runes_capacity , runes_unpacked );
14311624#elif SZ_USE_HASWELL
14321625 return sz_utf8_unpack_chunk_haswell (text , length , runes , runes_capacity , runes_unpacked );
1626+ #elif SZ_USE_NEON
1627+ return sz_utf8_unpack_chunk_neon (text , length , runes , runes_capacity , runes_unpacked );
14331628#else
14341629 return sz_utf8_unpack_chunk_serial (text , length , runes , runes_capacity , runes_unpacked );
14351630#endif
0 commit comments