Skip to content

Commit e5c477d

Browse files
committed
Add: Hash-free search kernel for small needles
1 parent 8614658 commit e5c477d

File tree

1 file changed

+218
-6
lines changed

1 file changed

+218
-6
lines changed

include/stringzilla/utf8_case.h

Lines changed: 218 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1089,6 +1089,189 @@ SZ_INTERNAL sz_bool_t sz_utf8_is_fully_caseless_(sz_cptr_t str, sz_size_t len) {
10891089
return sz_true_k;
10901090
}
10911091

1092+
/**
1093+
* @brief Hash-free case-insensitive search for needles that fold to exactly 1 rune.
1094+
* Examples: 'a', 'A', 'б', 'Б' (but NOT 'ß' which folds to 'ss' = 2 runes).
1095+
*
1096+
* Single-pass algorithm: parses each source rune, folds it, checks if it produces
1097+
* exactly one rune matching the target. No iterator overhead, no verification needed.
1098+
*
1099+
* @param[in] target_folded The single folded rune to search for.
1100+
*/
1101+
SZ_INTERNAL sz_cptr_t sz_utf8_case_insensitive_find_1folded_serial_( //
1102+
sz_cptr_t haystack, sz_size_t haystack_length, //
1103+
sz_rune_t needle_folded, sz_size_t *match_length) {
1104+
1105+
sz_cptr_t const haystack_end = haystack + haystack_length;
1106+
1107+
// Each haystack rune may fold in up to 3 runes
1108+
sz_rune_t haystack_rune;
1109+
sz_rune_length_t haystack_rune_length;
1110+
1111+
// If we simply initialize the runes for zero, the code will break
1112+
// when the needle itself is the NUL character
1113+
sz_rune_t haystack_folded_runes[3] = {~needle_folded};
1114+
while (haystack < haystack_end) {
1115+
sz_rune_parse(haystack, &haystack_rune, &haystack_rune_length);
1116+
sz_unicode_fold_codepoint_(haystack_rune, haystack_folded_runes);
1117+
1118+
// Perform branchless equality check via arithmetic
1119+
sz_u32_t has_match = //
1120+
(haystack_folded_runes[0] == needle_folded) + //
1121+
(haystack_folded_runes[1] == needle_folded) + //
1122+
(haystack_folded_runes[2] == needle_folded);
1123+
1124+
if (has_match) {
1125+
*match_length = haystack_rune_length;
1126+
return haystack;
1127+
}
1128+
1129+
haystack += haystack_rune_length;
1130+
}
1131+
1132+
*match_length = 0;
1133+
return SZ_NULL_CHAR;
1134+
}
1135+
1136+
/**
1137+
* @brief Hash-free case-insensitive search for needles that fold to exactly 2 runes.
1138+
* Examples: 'ab', 'AB', 'ß' (folds to 'ss'), 'fi' (folds to 'fi').
1139+
*
1140+
* Single-pass sliding window over the folded rune stream. Handles expansions (ß→ss)
1141+
* by buffering folded runes from each source and tracking source boundaries.
1142+
*/
1143+
SZ_INTERNAL sz_cptr_t sz_utf8_case_insensitive_find_2folded_serial_( //
1144+
sz_cptr_t haystack, sz_size_t haystack_length, //
1145+
sz_rune_t first_needle_folded, sz_rune_t second_needle_folded, sz_size_t *match_length) {
1146+
1147+
sz_cptr_t const haystack_end = haystack + haystack_length;
1148+
1149+
// Each haystack rune may fold in up to 3 runes, but we also keep an extra slot
1150+
// for the last folded rune from the previous iterato step
1151+
sz_rune_t haystack_rune;
1152+
sz_rune_length_t haystack_rune_length, haystack_last_rune_length = sz_utf8_invalid_k;
1153+
1154+
// If we simply initialize the runes for zero, the code will break
1155+
// when the needle itself is the NUL character
1156+
sz_rune_t haystack_folded_runes[4] = {~first_needle_folded};
1157+
while (haystack < haystack_end) {
1158+
sz_rune_parse(haystack, &haystack_rune, &haystack_rune_length);
1159+
1160+
// Export into the last 3 rune entries of the 4-element array,
1161+
// keeping the first position with historical data untouched
1162+
sz_size_t folded_count = sz_unicode_fold_codepoint_(haystack_rune, haystack_folded_runes + 1);
1163+
1164+
// Perform branchless equality check via arithmetic
1165+
sz_u32_t has_match_f0 = first_needle_folded == haystack_folded_runes[0];
1166+
sz_u32_t has_match_f1 = first_needle_folded == haystack_folded_runes[1];
1167+
sz_u32_t has_match_f2 = first_needle_folded == haystack_folded_runes[2];
1168+
sz_u32_t has_match_s1 = second_needle_folded == haystack_folded_runes[1];
1169+
sz_u32_t has_match_s2 = second_needle_folded == haystack_folded_runes[2];
1170+
sz_u32_t has_match_s3 = second_needle_folded == haystack_folded_runes[3];
1171+
1172+
// Branchless match detection: each product is 0 or 1
1173+
sz_u32_t match_at_01 = has_match_f0 * has_match_s1;
1174+
sz_u32_t match_at_12 = has_match_f1 * has_match_s2;
1175+
sz_u32_t match_at_23 = has_match_f2 * has_match_s3;
1176+
sz_u32_t has_match = match_at_01 + match_at_12 + match_at_23;
1177+
1178+
if (has_match) {
1179+
// Only `match_at_01` spans sources; others are within current source
1180+
sz_size_t back_offset = match_at_01 * (sz_size_t)haystack_last_rune_length;
1181+
*match_length = (sz_size_t)haystack_rune_length + back_offset;
1182+
return haystack - back_offset;
1183+
}
1184+
1185+
haystack_folded_runes[0] = haystack_folded_runes[folded_count];
1186+
haystack_last_rune_length = haystack_rune_length;
1187+
haystack += haystack_rune_length;
1188+
}
1189+
1190+
*match_length = 0;
1191+
return SZ_NULL_CHAR;
1192+
}
1193+
1194+
/**
1195+
* @brief Hash-free case-insensitive search for needles that fold to exactly 3 runes.
1196+
* Examples: 'abc', 'ABC', 'aß' (folds to 'ass'), 'fia' (folds to 'fia').
1197+
*
1198+
* Single-pass sliding window of 3 folded runes over the haystack's folded stream.
1199+
* Handles expansions (ß→ss) by buffering and tracking source boundaries.
1200+
*/
1201+
SZ_INTERNAL sz_cptr_t sz_utf8_case_insensitive_find_3folded_serial_( //
1202+
sz_cptr_t haystack, sz_size_t haystack_length, //
1203+
sz_rune_t first_needle_folded, sz_rune_t second_needle_folded, sz_rune_t third_needle_folded,
1204+
sz_size_t *match_length) {
1205+
1206+
sz_cptr_t const haystack_end = haystack + haystack_length;
1207+
1208+
// Each haystack rune may fold in up to 3 runes, but we also keep an extra 2 slots
1209+
// for the last folded rune from the previous iteration step, and the one before that
1210+
sz_rune_t haystack_rune;
1211+
sz_rune_length_t haystack_rune_length, haystack_last_rune_length = sz_utf8_invalid_k,
1212+
haystack_preceding_rune_length = sz_utf8_invalid_k;
1213+
1214+
// Initialize historical slots with sentinels that can never match their respective needle positions
1215+
// This prevents false matches on first iterations when history is not yet populated
1216+
sz_rune_t haystack_folded_runes[5] = {~first_needle_folded, ~second_needle_folded, 0, 0, 0};
1217+
while (haystack < haystack_end) {
1218+
sz_rune_parse(haystack, &haystack_rune, &haystack_rune_length);
1219+
1220+
// Export into the last 3 rune entries of the 5-element array,
1221+
// keeping the first two positions with historical data untouched
1222+
sz_size_t folded_count = sz_unicode_fold_codepoint_(haystack_rune, haystack_folded_runes + 2);
1223+
1224+
// Perform branchless equality check via arithmetic
1225+
sz_u32_t has_match_f0 = first_needle_folded == haystack_folded_runes[0];
1226+
sz_u32_t has_match_f1 = first_needle_folded == haystack_folded_runes[1];
1227+
sz_u32_t has_match_f2 = first_needle_folded == haystack_folded_runes[2];
1228+
sz_u32_t has_match_s1 = second_needle_folded == haystack_folded_runes[1];
1229+
sz_u32_t has_match_s2 = second_needle_folded == haystack_folded_runes[2];
1230+
sz_u32_t has_match_s3 = second_needle_folded == haystack_folded_runes[3];
1231+
sz_u32_t has_match_t2 = third_needle_folded == haystack_folded_runes[2];
1232+
sz_u32_t has_match_t3 = third_needle_folded == haystack_folded_runes[3];
1233+
sz_u32_t has_match_t4 = third_needle_folded == haystack_folded_runes[4];
1234+
1235+
// Branchless match detection: each product is 0 or 1
1236+
sz_u32_t match_at_012 = has_match_f0 * has_match_s1 * has_match_t2;
1237+
sz_u32_t match_at_123 = has_match_f1 * has_match_s2 * has_match_t3;
1238+
sz_u32_t match_at_234 = has_match_f2 * has_match_s3 * has_match_t4;
1239+
sz_u32_t has_match = match_at_012 + match_at_123 + match_at_234;
1240+
1241+
if (has_match) {
1242+
// Compute back offset based on which position matched:
1243+
// - `match_at_012`: need preceding + last
1244+
// - `match_at_123`: need last
1245+
// - `match_at_234`: stay at current
1246+
sz_size_t back_for_last = (match_at_012 + match_at_123) * (sz_size_t)haystack_last_rune_length;
1247+
sz_size_t back_for_preceding = match_at_012 * (sz_size_t)haystack_preceding_rune_length;
1248+
sz_size_t back_offset = back_for_last + back_for_preceding;
1249+
*match_length = (sz_size_t)haystack_rune_length + back_offset;
1250+
return haystack - back_offset;
1251+
}
1252+
1253+
// Historical context update here is a bit trickier than in previous spaces
1254+
if (folded_count >= 2) {
1255+
haystack_folded_runes[0] = haystack_folded_runes[folded_count];
1256+
haystack_folded_runes[1] = haystack_folded_runes[folded_count + 1];
1257+
haystack_preceding_rune_length = sz_utf8_invalid_k;
1258+
haystack_last_rune_length = haystack_rune_length;
1259+
}
1260+
else {
1261+
sz_assert_(folded_count == 1);
1262+
haystack_folded_runes[0] = haystack_folded_runes[1];
1263+
haystack_folded_runes[1] = haystack_folded_runes[2];
1264+
haystack_preceding_rune_length = haystack_last_rune_length;
1265+
haystack_last_rune_length = haystack_rune_length;
1266+
}
1267+
1268+
haystack += haystack_rune_length;
1269+
}
1270+
1271+
*match_length = 0;
1272+
return SZ_NULL_CHAR;
1273+
}
1274+
10921275
/**
10931276
* @brief Rabin-Karp style case-insensitive UTF-8 substring search using a ring buffer.
10941277
* Uses a rolling hash over casefolded runes with O(1) updates per position.
@@ -1113,6 +1296,35 @@ SZ_PUBLIC sz_cptr_t sz_utf8_case_insensitive_find_serial( //
11131296
return SZ_NULL_CHAR;
11141297
}
11151298

1299+
// For short needles (up to 12 bytes, which may still fold to more than 3 runes), try hash-free search.
1300+
// We fold the needle first and dispatch based on the folded rune count.
1301+
// This avoids ring buffer setup, hash multiplier computation, and rolling hash updates.
1302+
if (needle_length <= 12) {
1303+
sz_rune_t folded[4]; // 4th slot accessed before loop exit
1304+
sz_size_t folded_count = 0;
1305+
sz_utf8_folded_iter_t iter;
1306+
sz_utf8_folded_iter_init_(&iter, needle, needle_length);
1307+
sz_rune_t rune;
1308+
while (folded_count < 4 && sz_utf8_folded_iter_next_(&iter, &rune)) folded[folded_count++] = rune;
1309+
1310+
// Dispatch based on folded rune count
1311+
switch (folded_count) {
1312+
case 1:
1313+
return sz_utf8_case_insensitive_find_1folded_serial_( //
1314+
haystack, haystack_length, //
1315+
folded[0], match_length);
1316+
case 2:
1317+
return sz_utf8_case_insensitive_find_2folded_serial_( //
1318+
haystack, haystack_length, //
1319+
folded[0], folded[1], match_length);
1320+
case 3:
1321+
return sz_utf8_case_insensitive_find_3folded_serial_( //
1322+
haystack, haystack_length, //
1323+
folded[0], folded[1], folded[2], match_length);
1324+
default: break; // 4+ folded runes: fall through to Rabin-Karp
1325+
}
1326+
}
1327+
11161328
sz_size_t const ring_capacity = 32;
11171329
sz_rune_t needle_runes[32];
11181330
sz_size_t needle_prefix_count = 0, needle_total_count = 0;
@@ -4748,18 +4960,18 @@ SZ_PUBLIC sz_cptr_t sz_utf8_case_insensitive_find_ice( //
47484960
// 3. Fall back to serial if no path meets its threshold
47494961

47504962
// Priority 1: ASCII path (broadest haystack compatibility, handles all byte values)
4751-
if (analysis.ascii.length >= 3)
4963+
if (analysis.ascii.length >= 1)
47524964
return sz_utf8_case_insensitive_find_ascii_ice_(haystack, haystack_length, needle, needle_length,
47534965
needle + analysis.ascii.start, analysis.ascii.length,
47544966
matched_length);
47554967

47564968
// Priority 2: Latin1 path (includes Latin-1 Supplement: ß, accented letters, etc.)
4757-
if (analysis.latin1.length >= 4)
4969+
if (analysis.latin1.length >= 2) // Smallest non-ASCII Latin1 codepoint is 2 bytes
47584970
return sz_utf8_case_insensitive_find_latin1_ice_(haystack, haystack_length, needle, needle_length,
47594971
&analysis.latin1, matched_length);
47604972

47614973
// Priority 3: Vietnamese path (includes Latin1 + Latin Extended Additional)
4762-
if (analysis.vietnamese.length >= 6)
4974+
if (analysis.vietnamese.length >= 3) // One Vietnamese codepoints are 3 bytes in size
47634975
return sz_utf8_case_insensitive_find_vietnamese_ice_(haystack, haystack_length, needle, needle_length,
47644976
&analysis.vietnamese, matched_length);
47654977

@@ -4773,13 +4985,13 @@ SZ_PUBLIC sz_cptr_t sz_utf8_case_insensitive_find_ice( //
47734985
if (analysis.armenian.length > best_script_len) best_script_len = analysis.armenian.length;
47744986

47754987
// Select among script-specific paths based on longest window
4776-
if (analysis.cyrillic.length == best_script_len && analysis.cyrillic.length >= 4)
4988+
if (analysis.cyrillic.length == best_script_len && analysis.cyrillic.length >= 2)
47774989
return sz_utf8_case_insensitive_find_cyrillic_ice_(haystack, haystack_length, needle, needle_length,
47784990
&analysis.cyrillic, matched_length);
4779-
if (analysis.greek.length == best_script_len && analysis.greek.length >= 4)
4991+
if (analysis.greek.length == best_script_len && analysis.greek.length >= 2)
47804992
return sz_utf8_case_insensitive_find_greek_ice_(haystack, haystack_length, needle, needle_length,
47814993
&analysis.greek, matched_length);
4782-
if (analysis.armenian.length == best_script_len && analysis.armenian.length >= 4)
4994+
if (analysis.armenian.length == best_script_len && analysis.armenian.length >= 2)
47834995
return sz_utf8_case_insensitive_find_armenian_ice_(haystack, haystack_length, needle, needle_length,
47844996
&analysis.armenian, matched_length);
47854997

0 commit comments

Comments
 (0)