@@ -3096,12 +3096,8 @@ typedef enum {
30963096 * - 'ffl' (U+FB04, EF AC 84) → "ffl" (66 66 6C)
30973097 * - 'n' (U+006E, 6E) - can't be first; can't follow 'ʼ' (U+02BC, CA BC) to avoid:
30983098 * - 'ʼn' (U+0149, C5 89) → "ʼn" (CA BC 6E)
3099- * - 's' (U+0073, 73) - can't be first or last; can't follow 's' (U+0073, 73);
3100- * can't precede 's' (U+0073, 73), 't' (U+0074, 74) to avoid:
3101- * - 'ß' (U+00DF, C3 9F) → "ss" (73 73)
3102- * - 'ẞ' (U+1E9E, E1 BA 9E) → "ss" (73 73)
3103- * - 'ſt' (U+FB05, EF AC 85) → "st" (73 74)
3104- * - 'st' (U+FB06, EF AC 86) → "st" (73 74)
3099+ * - 's' (U+0073, 73) - can't be present at all, because it's a folding target of the Long S sign:
3100+ * - 'ſ' (U+017F, C5 BF) → 's' (73)
31053101 * - 't' (U+0074, 74) - can't be first or last; can't follow 's' (U+0073, 73);
31063102 * can't precede '̈' (U+0308, CC 88) to avoid:
31073103 * - 'ẗ' (U+1E97, E1 BA 97) → "ẗ" (74 CC 88)
@@ -3161,8 +3157,10 @@ typedef enum {
31613157 * That sign is extremely rare in Western European languages, while the lowercase 'k' is obviously common
31623158 * in German and English. In French, Spanish, and Portuguese - less so. So we add one more check
31633159 * for 'K' (U+212A, E2 84 AA) in the haystack, and if detected, again - revert to serial.
3160+ * Similarly, we check for "ſ" (Latin Small Letter Long S, U+017F, C5 BF) which folds to 's' (U+0073, 73).
3161+ * It's archaic in modern languages but theoretically possible in historical texts.
31643162 *
3165- * So we inherit the following limitations from `sz_utf8_case_rune_safe_ascii_k`:
3163+ * So we allow both 's' and 'k' and inherit only the following limitations from `sz_utf8_case_rune_safe_ascii_k`:
31663164 *
31673165 * - 'i' (U+0069, 69) - can't be first or last; can't follow 'f' (U+0066, 66); can't precede '̇' (U+0307, CC 87)
31683166 * to avoid 'İ' (U+0130, C4 B0) → "i̇" (69 CC 87).
@@ -3233,7 +3231,7 @@ typedef enum {
32333231 * There is also a Unicode rule for folding the Kelvin 'K' (U+212A, E2 84 AA) into 'k' (U+006B, 6B).
32343232 * That sign is extremely rare in Western European languages, while the lowercase 'k' is very common in Turkish,
32353233 * Czech, Polish. So we add one more check for 'K' (U+212A, E2 84 AA) in the haystack, and if detected,
3236- * again - revert to serial.
3234+ * again - revert to serial. The same logic applies to "ſ" (Latin Small Letter Long S, U+017F, C5 BF) folding to 's'.
32373235 *
32383236 * The Turkish dotted 'İ' (U+0130, C4 B0) expands into a 3-byte sequence. We detect it when scanning through the
32393237 * haystack and fall back to the serial algorithm. That's pretty much the only triple-byte sequence we will
@@ -3360,10 +3358,11 @@ typedef enum {
33603358 * - 'ffl' (U+FB04, EF AC 84) → "ffl" (66 66 6C)
33613359 * - 'n' (U+006E, 6E) - can't be first; can't follow 'ʼ' (U+02BC, CA BC) to avoid:
33623360 * - 'ʼn' (U+0149, C5 89) → "ʼn" (CA BC 6E)
3363- * - 's' (U+0073, 73) - can't be first or last; can't follow 's' (U+0073, 73);
3364- * can't precede 's' (U+0073, 73), 't' (U+0074, 74) to avoid:
3365- * - 'ß' (U+00DF, C3 9F) → "ss" (73 73)
3366- * - 'ẞ' (U+1E9E, E1 BA 9E) → "ss" (73 73)
3361+ * - 's' (U+0073, 73) - can't be present at all, because it's a folding target of the Long S sign:
3362+ * - 'ſ' (U+017F, C5 BF) → 's' (73)
3363+ * - 't' (U+0074, 74) - can't be first or last; can't follow 's' (U+0073, 73);
3364+ * can't precede '̈' (U+0308, CC 88) to avoid:
3365+ * - 'ẗ' (U+1E97, E1 BA 97) → "ẗ" (74 CC 88)
33673366 * - 'ſt' (U+FB05, EF AC 85) → "st" (73 74)
33683367 * - 'st' (U+FB06, EF AC 86) → "st" (73 74)
33693368 * - 't' (U+0074, 74) - can't be first or last; can't follow 's' (U+0073, 73);
@@ -3446,7 +3445,7 @@ typedef enum {
34463445 * kernel path (sz_utf8_case_rune_safe_western_europe_k), not the Greek path. The Greek kernel
34473446 * only handles characters that originate in the Greek block.
34483447 *
3449- * We inherit ALL contextual ASCII limitations from `sz_utf8_case_rune_safe_ascii_k`:
3448+ * We inherit @b all contextual ASCII limitations from `sz_utf8_case_rune_safe_ascii_k`:
34503449 *
34513450 * - 'a' (U+0061, 61) - can't be last; can't precede 'ʾ' (U+02BE, CA BE) to avoid:
34523451 * - 'ẚ' (U+1E9A, E1 BA 9A) → "aʾ" (61 CA BE)
@@ -3473,10 +3472,11 @@ typedef enum {
34733472 * - 'ffl' (U+FB04, EF AC 84) → "ffl" (66 66 6C)
34743473 * - 'n' (U+006E, 6E) - can't be first; can't follow 'ʼ' (U+02BC, CA BC) to avoid:
34753474 * - 'ʼn' (U+0149, C5 89) → "ʼn" (CA BC 6E)
3476- * - 's' (U+0073, 73) - can't be first or last; can't follow 's' (U+0073, 73);
3477- * can't precede 's' (U+0073, 73), 't' (U+0074, 74) to avoid:
3478- * - 'ß' (U+00DF, C3 9F) → "ss" (73 73)
3479- * - 'ẞ' (U+1E9E, E1 BA 9E) → "ss" (73 73)
3475+ * - 's' (U+0073, 73) - can't be present at all, because it's a folding target of the Long S sign:
3476+ * - 'ſ' (U+017F, C5 BF) → 's' (73)
3477+ * - 't' (U+0074, 74) - can't be first or last; can't follow 's' (U+0073, 73);
3478+ * can't precede '̈' (U+0308, CC 88) to avoid:
3479+ * - 'ẗ' (U+1E97, E1 BA 97) → "ẗ" (74 CC 88)
34803480 * - 'ſt' (U+FB05, EF AC 85) → "st" (73 74)
34813481 * - 'st' (U+FB06, EF AC 86) → "st" (73 74)
34823482 * - 't' (U+0074, 74) - can't be first or last; can't follow 's' (U+0073, 73);
@@ -3514,9 +3514,51 @@ typedef enum {
35143514 * - D5 A1-BF: lowercase 'ա' (U+0561) through 'ի' (U+057F)
35153515 * - D6 80-86: lowercase 'լ' (U+0580) through 'ֆ' (U+0586)
35163516 *
3517- * We inherit ALL contextual ASCII limitations from `sz_utf8_case_rune_safe_ascii_k`
3518- * because mixed-script documents (e.g. dictionaries) may contain Latin ligatures,
3519- * and add more rules specific to Armenian due to several ligatures:
3517+ * We inherit @b all contextual ASCII limitations from `sz_utf8_case_rune_safe_ascii_k`:
3518+ *
3519+ * - 'a' (U+0061, 61) - can't be last; can't precede 'ʾ' (U+02BE, CA BE) to avoid:
3520+ * - 'ẚ' (U+1E9A, E1 BA 9A) → "aʾ" (61 CA BE)
3521+ * - 'f' (U+0066, 66) - can't be first or last; can't follow 'f' (U+0066, 66);
3522+ * can't precede 'f' (U+0066, 66), 'i' (U+0069, 69), 'l' (U+006C, 6C) to avoid:
3523+ * - 'ff' (U+FB00, EF AC 80) → "ff" (66 66)
3524+ * - 'fi' (U+FB01, EF AC 81) → "fi" (66 69)
3525+ * - 'fl' (U+FB02, EF AC 82) → "fl" (66 6C)
3526+ * - 'ffi' (U+FB03, EF AC 83) → "ffi" (66 66 69)
3527+ * - 'ffl' (U+FB04, EF AC 84) → "ffl" (66 66 6C)
3528+ * - 'h' (U+0068, 68) - can't be last; can't precede '̱' (U+0331, CC B1) to avoid:
3529+ * - 'ẖ' (U+1E96, E1 BA 96) → "ẖ" (68 CC B1)
3530+ * - 'i' (U+0069, 69) - can't be first or last; can't follow 'f' (U+0066, 66);
3531+ * can't precede '̇' (U+0307, CC 87) to avoid:
3532+ * - 'İ' (U+0130, C4 B0) → "i̇" (69 CC 87)
3533+ * - 'fi' (U+FB01, EF AC 81) → "fi" (66 69)
3534+ * - 'ffi' (U+FB03, EF AC 83) → "ffi" (66 66 69)
3535+ * - 'j' (U+006A, 6A) - can't be last; can't precede '̌' (U+030C, CC 8C) to avoid:
3536+ * - 'ǰ' (U+01F0, C7 B0) → "ǰ" (6A CC 8C)
3537+ * - 'k' (U+006B, 6B) - can't be present at all, because it's a folding target of the Kelvin sign:
3538+ * - 'K' (U+212A, E2 84 AA) → 'k' (6B)
3539+ * - 'l' (U+006C, 6C) - can't be first; can't follow 'f' (U+0066, 66) to avoid:
3540+ * - 'fl' (U+FB02, EF AC 82) → "fl" (66 6C)
3541+ * - 'ffl' (U+FB04, EF AC 84) → "ffl" (66 66 6C)
3542+ * - 'n' (U+006E, 6E) - can't be first; can't follow 'ʼ' (U+02BC, CA BC) to avoid:
3543+ * - 'ʼn' (U+0149, C5 89) → "ʼn" (CA BC 6E)
3544+ * - 's' (U+0073, 73) - can't be present at all, because it's a folding target of the Long S sign:
3545+ * - 'ſ' (U+017F, C5 BF) → 's' (73)
3546+ * - 't' (U+0074, 74) - can't be first or last; can't follow 's' (U+0073, 73);
3547+ * can't precede '̈' (U+0308, CC 88) to avoid:
3548+ * - 'ẗ' (U+1E97, E1 BA 97) → "ẗ" (74 CC 88)
3549+ * - 'ſt' (U+FB05, EF AC 85) → "st" (73 74)
3550+ * - 'st' (U+FB06, EF AC 86) → "st" (73 74)
3551+ * - 't' (U+0074, 74) - can't be first or last; can't follow 's' (U+0073, 73);
3552+ * can't precede '̈' (U+0308, CC 88) to avoid:
3553+ * - 'ẗ' (U+1E97, E1 BA 97) → "ẗ" (74 CC 88)
3554+ * - 'ſt' (U+FB05, EF AC 85) → "st" (73 74)
3555+ * - 'st' (U+FB06, EF AC 86) → "st" (73 74)
3556+ * - 'w' (U+0077, 77) - can't be last; can't precede '̊' (U+030A, CC 8A) to avoid:
3557+ * - 'ẘ' (U+1E98, E1 BA 98) → "ẘ" (77 CC 8A)
3558+ * - 'y' (U+0079, 79) - can't be last; can't precede '̊' (U+030A, CC 8A) to avoid:
3559+ * - 'ẙ' (U+1E99, E1 BA 99) → "ẙ" (79 CC 8A)
3560+ *
3561+ * We also add rules specific to Armenian ligatures:
35203562 *
35213563 * - 'և' (U+0587, Ech-Yiwn) → "եւ" ('ե' + 'ւ') - very common
35223564 * - 'ﬓ' (U+FB13, Men-Now) → "մն" ('մ' + 'ն') - quite rare
0 commit comments