@@ -1986,18 +1986,42 @@ SZ_PUBLIC sz_cptr_t sz_find_whitespace_utf8_ice(sz_cptr_t text, sz_size_t length
19861986 sz_u64_t one_byte_mask = _cvtmask64_u64 (
19871987 _kor_mask64 (_kor_mask64 (x20_mask , _kand_mask64 (t_mask , r_mask )), _kand_mask64 (x1c_ge_mask , x1f_le_mask )));
19881988
1989- // 2-byte indicators & matches
1990- __mmask64 xc2_mask = _mm512_cmpeq_epi8_mask (text_vec .zmm , xc2_vec .zmm );
1989+ // Instead of immediately checking for 2-byte and 3-byte matches with a ridiculous number of masks and
1990+ // comparisons, let's define a "fast path" for the following cases:
1991+ // - no whitespace is found in the range
1992+ // - a one-byte match comes before any possible prefix byte of a multi-byte match
1993+ __mmask64 xc2_mask = _mm512_mask_cmpeq_epi8_mask (0x7FFFFFFFFFFFFFFF , text_vec .zmm , xc2_vec .zmm );
1994+ __mmask64 xe1_mask = _mm512_mask_cmpeq_epi8_mask (0x3FFFFFFFFFFFFFFF , text_vec .zmm , xe1_vec .zmm );
1995+ __mmask64 xe2_mask = _mm512_mask_cmpeq_epi8_mask (0x3FFFFFFFFFFFFFFF , text_vec .zmm , xe2_vec .zmm );
1996+ __mmask64 xe3_mask = _mm512_mask_cmpeq_epi8_mask (0x3FFFFFFFFFFFFFFF , text_vec .zmm , xe3_vec .zmm );
1997+ sz_u64_t prefix_mask =
1998+ _cvtmask64_u64 (_kor_mask64 (_kor_mask64 (xc2_mask , xe1_mask ), _kor_mask64 (xe2_mask , xe3_mask )));
1999+
2000+ // Check if we matched the "fast path"
2001+ if (one_byte_mask ) {
2002+ if (prefix_mask ) {
2003+ int first_one_byte_offset = sz_u64_ctz (one_byte_mask );
2004+ int first_prefix_offset = sz_u64_ctz (prefix_mask );
2005+ if (first_one_byte_offset < first_prefix_offset ) {
2006+ * matched_length = 1 ;
2007+ return text + first_one_byte_offset ;
2008+ }
2009+ }
2010+ else {
2011+ int first_one_byte_offset = sz_u64_ctz (one_byte_mask );
2012+ * matched_length = 1 ;
2013+ return text + first_one_byte_offset ;
2014+ }
2015+ }
2016+
2017+ // 2-byte indicator suffixes & matches
19912018 __mmask64 x85_mask = _mm512_cmpeq_epi8_mask (text_vec .zmm , x85_vec .zmm );
19922019 __mmask64 xa0_mask = _mm512_cmpeq_epi8_mask (text_vec .zmm , xa0_vec .zmm );
19932020 __mmask64 xc285_mask = _kand_mask64 (xc2_mask , _kshiftri_mask64 (x85_mask , 1 )); // U+0085 NEL
19942021 __mmask64 xc2a0_mask = _kand_mask64 (xc2_mask , _kshiftri_mask64 (xa0_mask , 1 )); // U+00A0 NBSP
19952022 sz_u64_t two_byte_mask = _cvtmask64_u64 (_kor_mask64 (xc285_mask , xc2a0_mask ));
19962023
1997- // 3-byte indicators
1998- __mmask64 xe1_mask = _mm512_cmpeq_epi8_mask (text_vec .zmm , xe1_vec .zmm );
1999- __mmask64 xe2_mask = _mm512_cmpeq_epi8_mask (text_vec .zmm , xe2_vec .zmm );
2000- __mmask64 xe3_mask = _mm512_cmpeq_epi8_mask (text_vec .zmm , xe3_vec .zmm );
2024+ // 3-byte indicator suffixes
20012025 __mmask64 x9a_mask = _mm512_cmpeq_epi8_mask (text_vec .zmm , x9a_vec .zmm );
20022026 __mmask64 x80_mask = _mm512_cmpeq_epi8_mask (text_vec .zmm , x80_vec .zmm );
20032027 __mmask64 x81_mask = _mm512_cmpeq_epi8_mask (text_vec .zmm , x81_vec .zmm );
0 commit comments