Skip to content

Commit 2226e25

Browse files
committed
Docs: Policy for historical S sign 'ſ' (U+017F)
1 parent a9a7d85 commit 2226e25

File tree

1 file changed

+62
-20
lines changed

1 file changed

+62
-20
lines changed

include/stringzilla/utf8_case.h

Lines changed: 62 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -3096,12 +3096,8 @@ typedef enum {
30963096
* - 'ffl' (U+FB04, EF AC 84) → "ffl" (66 66 6C)
30973097
* - 'n' (U+006E, 6E) - can't be first; can't follow 'ʼ' (U+02BC, CA BC) to avoid:
30983098
* - 'ʼn' (U+0149, C5 89) → "ʼn" (CA BC 6E)
3099-
* - 's' (U+0073, 73) - can't be first or last; can't follow 's' (U+0073, 73);
3100-
* can't precede 's' (U+0073, 73), 't' (U+0074, 74) to avoid:
3101-
* - 'ß' (U+00DF, C3 9F) → "ss" (73 73)
3102-
* - 'ẞ' (U+1E9E, E1 BA 9E) → "ss" (73 73)
3103-
* - 'ſt' (U+FB05, EF AC 85) → "st" (73 74)
3104-
* - 'st' (U+FB06, EF AC 86) → "st" (73 74)
3099+
* - 's' (U+0073, 73) - can't be present at all, because it's a folding target of the old S sign:
3100+
* - 'ſ' (U+017F, C5 BF) → 's' (73)
31053101
* - 't' (U+0074, 74) - can't be first or last; can't follow 's' (U+0073, 73);
31063102
* can't precede '̈' (U+0308, CC 88) to avoid:
31073103
* - 'ẗ' (U+1E97, E1 BA 97) → "ẗ" (74 CC 88)
@@ -3161,8 +3157,10 @@ typedef enum {
31613157
* That sign is extremely rare in Western European languages, while the lowercase 'k' is obviously common
31623158
* in German and English. In French, Spanish, and Portuguese - less so. So we add one more check
31633159
* for 'K' (U+212A, E2 84 AA) in the haystack, and if detected, again - revert to serial.
3160+
* Similarly, we check for "ſ" (Latin Small Letter Long S, U+017F, C5 BF) which folds to 's' (U+0073, 73).
3161+
* It's archaic in modern languages but theoretically possible in historical texts.
31643162
*
3165-
* So we inherit the following limitations from `sz_utf8_case_rune_safe_ascii_k`:
3163+
* So we allow both 's' and 'k' and inherit only the following limitations from `sz_utf8_case_rune_safe_ascii_k`:
31663164
*
31673165
* - 'i' (U+0069, 69) - can't be first or last; can't follow 'f' (U+0066, 66); can't precede '̇' (U+0307, CC 87)
31683166
* to avoid 'İ' (U+0130, C4 B0) → "i̇" (69 CC 87).
@@ -3233,7 +3231,7 @@ typedef enum {
32333231
* There is also a Unicode rule for folding the Kelvin 'K' (U+212A, E2 84 AA) into 'k' (U+006B, 6B).
32343232
* That sign is extremely rare in Western European languages, while the lowercase 'k' is very common in Turkish,
32353233
* Czech, Polish. So we add one more check for 'K' (U+212A, E2 84 AA) in the haystack, and if detected,
3236-
* again - revert to serial.
3234+
* again - revert to serial. The same logic applies to 'ſ' (Latin Small Letter Long S, U+017F, C5 BF), which folds to 's' (U+0073, 73).
32373235
*
32383236
* The Turkish dotted 'İ' (U+0130, C4 B0) expands into a 3-byte sequence. We detect it when scanning through the
32393237
* haystack and fall back to the serial algorithm. That's pretty much the only triple-byte sequence we will
@@ -3360,10 +3358,11 @@ typedef enum {
33603358
* - 'ffl' (U+FB04, EF AC 84) → "ffl" (66 66 6C)
33613359
* - 'n' (U+006E, 6E) - can't be first; can't follow 'ʼ' (U+02BC, CA BC) to avoid:
33623360
* - 'ʼn' (U+0149, C5 89) → "ʼn" (CA BC 6E)
3363-
* - 's' (U+0073, 73) - can't be first or last; can't follow 's' (U+0073, 73);
3364-
* can't precede 's' (U+0073, 73), 't' (U+0074, 74) to avoid:
3365-
* - 'ß' (U+00DF, C3 9F) → "ss" (73 73)
3366-
* - 'ẞ' (U+1E9E, E1 BA 9E) → "ss" (73 73)
3361+
* - 's' (U+0073, 73) - can't be present at all, because it's a folding target of the old S sign:
3362+
* - 'ſ' (U+017F, C5 BF) → 's' (73)
3363+
* - 't' (U+0074, 74) - can't be first or last; can't follow 's' (U+0073, 73);
3364+
* can't precede '̈' (U+0308, CC 88) to avoid:
3365+
* - 'ẗ' (U+1E97, E1 BA 97) → "ẗ" (74 CC 88)
33673366
* - 'ſt' (U+FB05, EF AC 85) → "st" (73 74)
33683367
* - 'st' (U+FB06, EF AC 86) → "st" (73 74)
33693368
* - 't' (U+0074, 74) - can't be first or last; can't follow 's' (U+0073, 73);
@@ -3446,7 +3445,7 @@ typedef enum {
34463445
* kernel path (sz_utf8_case_rune_safe_western_europe_k), not the Greek path. The Greek kernel
34473446
* only handles characters that originate in the Greek block.
34483447
*
3449-
* We inherit ALL contextual ASCII limitations from `sz_utf8_case_rune_safe_ascii_k`:
3448+
* We inherit @b all contextual ASCII limitations from `sz_utf8_case_rune_safe_ascii_k`:
34503449
*
34513450
* - 'a' (U+0061, 61) - can't be last; can't precede 'ʾ' (U+02BE, CA BE) to avoid:
34523451
* - 'ẚ' (U+1E9A, E1 BA 9A) → "aʾ" (61 CA BE)
@@ -3473,10 +3472,11 @@ typedef enum {
34733472
* - 'ffl' (U+FB04, EF AC 84) → "ffl" (66 66 6C)
34743473
* - 'n' (U+006E, 6E) - can't be first; can't follow 'ʼ' (U+02BC, CA BC) to avoid:
34753474
* - 'ʼn' (U+0149, C5 89) → "ʼn" (CA BC 6E)
3476-
* - 's' (U+0073, 73) - can't be first or last; can't follow 's' (U+0073, 73);
3477-
* can't precede 's' (U+0073, 73), 't' (U+0074, 74) to avoid:
3478-
* - 'ß' (U+00DF, C3 9F) → "ss" (73 73)
3479-
* - 'ẞ' (U+1E9E, E1 BA 9E) → "ss" (73 73)
3475+
* - 's' (U+0073, 73) - can't be present at all, because it's a folding target of the old S sign:
3476+
* - 'ſ' (U+017F, C5 BF) → 's' (73)
3477+
* - 't' (U+0074, 74) - can't be first or last; can't follow 's' (U+0073, 73);
3478+
* can't precede '̈' (U+0308, CC 88) to avoid:
3479+
* - 'ẗ' (U+1E97, E1 BA 97) → "ẗ" (74 CC 88)
34803480
* - 'ſt' (U+FB05, EF AC 85) → "st" (73 74)
34813481
* - 'st' (U+FB06, EF AC 86) → "st" (73 74)
34823482
* - 't' (U+0074, 74) - can't be first or last; can't follow 's' (U+0073, 73);
@@ -3514,9 +3514,51 @@ typedef enum {
35143514
* - D5 A1-BF: lowercase 'ա' (U+0561) through 'տ' (U+057F)
35153515
* - D6 80-86: lowercase 'ր' (U+0580) through 'ֆ' (U+0586)
35163516
*
3517-
* We inherit ALL contextual ASCII limitations from `sz_utf8_case_rune_safe_ascii_k`
3518-
* because mixed-script documents (e.g. dictionaries) may contain Latin ligatures,
3519-
* and add more rules specific to Armenian due to several ligatures:
3517+
* We inherit @b all contextual ASCII limitations from `sz_utf8_case_rune_safe_ascii_k`:
3518+
*
3519+
* - 'a' (U+0061, 61) - can't be last; can't precede 'ʾ' (U+02BE, CA BE) to avoid:
3520+
* - 'ẚ' (U+1E9A, E1 BA 9A) → "aʾ" (61 CA BE)
3521+
* - 'f' (U+0066, 66) - can't be first or last; can't follow 'f' (U+0066, 66);
3522+
* can't precede 'f' (U+0066, 66), 'i' (U+0069, 69), 'l' (U+006C, 6C) to avoid:
3523+
* - 'ff' (U+FB00, EF AC 80) → "ff" (66 66)
3524+
* - 'fi' (U+FB01, EF AC 81) → "fi" (66 69)
3525+
* - 'fl' (U+FB02, EF AC 82) → "fl" (66 6C)
3526+
* - 'ffi' (U+FB03, EF AC 83) → "ffi" (66 66 69)
3527+
* - 'ffl' (U+FB04, EF AC 84) → "ffl" (66 66 6C)
3528+
* - 'h' (U+0068, 68) - can't be last; can't precede '̱' (U+0331, CC B1) to avoid:
3529+
* - 'ẖ' (U+1E96, E1 BA 96) → "ẖ" (68 CC B1)
3530+
* - 'i' (U+0069, 69) - can't be first or last; can't follow 'f' (U+0066, 66);
3531+
* can't precede '̇' (U+0307, CC 87) to avoid:
3532+
* - 'İ' (U+0130, C4 B0) → "i̇" (69 CC 87)
3533+
* - 'fi' (U+FB01, EF AC 81) → "fi" (66 69)
3534+
* - 'ffi' (U+FB03, EF AC 83) → "ffi" (66 66 69)
3535+
* - 'j' (U+006A, 6A) - can't be last; can't precede '̌' (U+030C, CC 8C) to avoid:
3536+
* - 'ǰ' (U+01F0, C7 B0) → "ǰ" (6A CC 8C)
3537+
* - 'k' (U+006B, 6B) - can't be present at all, because it's a folding target of the Kelvin sign:
3538+
* - 'K' (U+212A, E2 84 AA) → 'k' (6B)
3539+
* - 'l' (U+006C, 6C) - can't be first; can't follow 'f' (U+0066, 66) to avoid:
3540+
* - 'fl' (U+FB02, EF AC 82) → "fl" (66 6C)
3541+
* - 'ffl' (U+FB04, EF AC 84) → "ffl" (66 66 6C)
3542+
* - 'n' (U+006E, 6E) - can't be first; can't follow 'ʼ' (U+02BC, CA BC) to avoid:
3543+
* - 'ʼn' (U+0149, C5 89) → "ʼn" (CA BC 6E)
3544+
* - 's' (U+0073, 73) - can't be present at all, because it's a folding target of the old S sign:
3545+
* - 'ſ' (U+017F, C5 BF) → 's' (73)
3546+
* - 't' (U+0074, 74) - can't be first or last; can't follow 's' (U+0073, 73);
3547+
* can't precede '̈' (U+0308, CC 88) to avoid:
3548+
* - 'ẗ' (U+1E97, E1 BA 97) → "ẗ" (74 CC 88)
3549+
* - 'ſt' (U+FB05, EF AC 85) → "st" (73 74)
3550+
* - 'st' (U+FB06, EF AC 86) → "st" (73 74)
3551+
* - 't' (U+0074, 74) - can't be first or last; can't follow 's' (U+0073, 73);
3552+
* can't precede '̈' (U+0308, CC 88) to avoid:
3553+
* - 'ẗ' (U+1E97, E1 BA 97) → "ẗ" (74 CC 88)
3554+
* - 'ſt' (U+FB05, EF AC 85) → "st" (73 74)
3555+
* - 'st' (U+FB06, EF AC 86) → "st" (73 74)
3556+
* - 'w' (U+0077, 77) - can't be last; can't precede '̊' (U+030A, CC 8A) to avoid:
3557+
* - 'ẘ' (U+1E98, E1 BA 98) → "ẘ" (77 CC 8A)
3558+
* - 'y' (U+0079, 79) - can't be last; can't precede '̊' (U+030A, CC 8A) to avoid:
3559+
* - 'ẙ' (U+1E99, E1 BA 99) → "ẙ" (79 CC 8A)
3560+
*
3561+
* We also add rules specific to Armenian ligatures:
35203562
*
35213563
* - 'և' (U+0587, Ech-Yiwn) → "եւ" ('ե' + 'ւ') - very common
35223564
* - 'ﬓ' (U+FB13, Men-Now) → "մն" ('մ' + 'ն') - quite rare

0 commit comments

Comments
 (0)