Skip to content

Commit a3c407f

Browse files
committed
Improve: Fast path for UTF-8 whitespace search

Benchmarks:
- sz_find_newline_utf8_serial: 0.9 GB/s
- sz_find_whitespace_utf8_serial: 0.7 GB/s
- sz_find_newline_utf8_ice: 14.5 GB/s (16x speedup over serial)
- sz_find_whitespace_utf8_ice: 1.3 GB/s (1.85x speedup over serial)
1 parent 26b0074 commit a3c407f

File tree

1 file changed

+30
-6
lines changed

1 file changed

+30
-6
lines changed

include/stringzilla/find.h

Lines changed: 30 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1986,18 +1986,42 @@ SZ_PUBLIC sz_cptr_t sz_find_whitespace_utf8_ice(sz_cptr_t text, sz_size_t length
19861986
sz_u64_t one_byte_mask = _cvtmask64_u64(
19871987
_kor_mask64(_kor_mask64(x20_mask, _kand_mask64(t_mask, r_mask)), _kand_mask64(x1c_ge_mask, x1f_le_mask)));
19881988

1989-
// 2-byte indicators & matches
1990-
__mmask64 xc2_mask = _mm512_cmpeq_epi8_mask(text_vec.zmm, xc2_vec.zmm);
1989+
// Instead of immediately checking for 2-byte and 3-byte matches with a ridiculous number of masks and
1990+
// comparisons, let's define a "fast path" for following cases:
1991+
// - no whitespaces are found in the range
1992+
// - a one-byte match comes before any possible prefix byte of a multi-byte match
1993+
__mmask64 xc2_mask = _mm512_mask_cmpeq_epi8_mask(0x7FFFFFFFFFFFFFFF, text_vec.zmm, xc2_vec.zmm);
1994+
__mmask64 xe1_mask = _mm512_mask_cmpeq_epi8_mask(0x3FFFFFFFFFFFFFFF, text_vec.zmm, xe1_vec.zmm);
1995+
__mmask64 xe2_mask = _mm512_mask_cmpeq_epi8_mask(0x3FFFFFFFFFFFFFFF, text_vec.zmm, xe2_vec.zmm);
1996+
__mmask64 xe3_mask = _mm512_mask_cmpeq_epi8_mask(0x3FFFFFFFFFFFFFFF, text_vec.zmm, xe3_vec.zmm);
1997+
sz_u64_t prefix_mask =
1998+
_cvtmask64_u64(_kor_mask64(_kor_mask64(xc2_mask, xe1_mask), _kor_mask64(xe2_mask, xe3_mask)));
1999+
2000+
// Check if we matched the "fast path"
2001+
if (one_byte_mask) {
2002+
if (prefix_mask) {
2003+
int first_one_byte_offset = sz_u64_ctz(one_byte_mask);
2004+
int first_prefix_offset = sz_u64_ctz(prefix_mask);
2005+
if (first_one_byte_offset < first_prefix_offset) {
2006+
*matched_length = 1;
2007+
return text + first_one_byte_offset;
2008+
}
2009+
}
2010+
else {
2011+
int first_one_byte_offset = sz_u64_ctz(one_byte_mask);
2012+
*matched_length = 1;
2013+
return text + first_one_byte_offset;
2014+
}
2015+
}
2016+
2017+
// 2-byte indicators suffixes & matches
19912018
__mmask64 x85_mask = _mm512_cmpeq_epi8_mask(text_vec.zmm, x85_vec.zmm);
19922019
__mmask64 xa0_mask = _mm512_cmpeq_epi8_mask(text_vec.zmm, xa0_vec.zmm);
19932020
__mmask64 xc285_mask = _kand_mask64(xc2_mask, _kshiftri_mask64(x85_mask, 1)); // U+0085 NEL
19942021
__mmask64 xc2a0_mask = _kand_mask64(xc2_mask, _kshiftri_mask64(xa0_mask, 1)); // U+00A0 NBSP
19952022
sz_u64_t two_byte_mask = _cvtmask64_u64(_kor_mask64(xc285_mask, xc2a0_mask));
19962023

1997-
// 3-byte indicators
1998-
__mmask64 xe1_mask = _mm512_cmpeq_epi8_mask(text_vec.zmm, xe1_vec.zmm);
1999-
__mmask64 xe2_mask = _mm512_cmpeq_epi8_mask(text_vec.zmm, xe2_vec.zmm);
2000-
__mmask64 xe3_mask = _mm512_cmpeq_epi8_mask(text_vec.zmm, xe3_vec.zmm);
2024+
// 3-byte indicators suffixes
20012025
__mmask64 x9a_mask = _mm512_cmpeq_epi8_mask(text_vec.zmm, x9a_vec.zmm);
20022026
__mmask64 x80_mask = _mm512_cmpeq_epi8_mask(text_vec.zmm, x80_vec.zmm);
20032027
__mmask64 x81_mask = _mm512_cmpeq_epi8_mask(text_vec.zmm, x81_vec.zmm);

0 commit comments

Comments (0)