Skip to content

Commit 9daa2a7

Browse files
committed
Improve: Case-folding bump from Unicode 16 to 17
PyTest now downloads the Unicode specification and compares all valid UTF-8 encoded characters via StringZilla's new `utf8_case_fold` API
1 parent 36fa527 commit 9daa2a7

File tree

3 files changed

+258
-2
lines changed

3 files changed

+258
-2
lines changed

include/stringzilla/utf8_unpack.h

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -317,6 +317,7 @@ SZ_INTERNAL sz_size_t sz_unicode_fold_codepoint_(sz_rune_t rune, sz_rune_t *fold
317317
if (rune >= 0x10D50 && rune <= 0x10D65) { folded[0] = rune + 0x20; return 1; } // Garay
318318
if (rune >= 0x118A0 && rune <= 0x118BF) { folded[0] = rune + 0x20; return 1; } // Warang Citi
319319
if (rune >= 0x16E40 && rune <= 0x16E5F) { folded[0] = rune + 0x20; return 1; } // Medefaidrin
320+
if (rune >= 0x16EA0 && rune <= 0x16EB8) { folded[0] = rune + 0x1B; return 1; } // Beria Erfe
320321
// 2-byte character ranges with +48 translations
321322
if (rune >= 0x0531 && rune <= 0x0556) { folded[0] = rune + 0x30; return 1; } // Armenian Ա-Ֆ → ա-ֆ
322323
if (rune >= 0x2C00 && rune <= 0x2C2F) { folded[0] = rune + 0x30; return 1; } // Glagolitic Ⰰ-Ⱟ → ⰰ-ⱟ
@@ -347,7 +348,6 @@ SZ_INTERNAL sz_size_t sz_unicode_fold_codepoint_(sz_rune_t rune, sz_rune_t *fold
347348
if (rune >= 0x1057C && rune <= 0x1058A) { folded[0] = rune + 0x27; return 1; } // Vithkuqi (+39)
348349
if (rune >= 0x1058C && rune <= 0x10592) { folded[0] = rune + 0x27; return 1; } // Vithkuqi (+39)
349350
if (rune >= 0x1E900 && rune <= 0x1E921) { folded[0] = rune + 0x22; return 1; } // Adlam 𞤀-𞤡 → 𞤢-𞥃 (+34)
350-
if (rune >= 0x16EA0 && rune <= 0x16EB8) { folded[0] = rune + 0x1B; return 1; } // Kawi (+27)
351351

352352
// Even/odd +1 mappings: uppercase at even codepoint, lowercase at odd (or vice versa)
353353
sz_u32_t is_even = ((rune & 1) == 0), is_odd = !is_even;
@@ -390,7 +390,12 @@ SZ_INTERNAL sz_size_t sz_unicode_fold_codepoint_(sz_rune_t rune, sz_rune_t *fold
390390
if (rune >= 0xA796 && rune <= 0xA7A8 && is_even) { folded[0] = rune + 1; return 1; } // Ꞗ-Ꞩ
391391
if (rune >= 0xA7B4 && rune <= 0xA7C2 && is_even) { folded[0] = rune + 1; return 1; } // Ꞵ-Ꟃ
392392
if (rune == 0xA7C7 || rune == 0xA7C9) { folded[0] = rune + 1; return 1; } // Ꟈ, Ꟊ
393-
if (rune >= 0xA7CC && rune <= 0xA7D8 && is_even) { folded[0] = rune + 1; return 1; } // Ꟍ-Ꟙ
393+
// Latin Extended-D: Only specific even codepoints
394+
if (rune == 0xA7CC || rune == 0xA7CE || rune == 0xA7D0 || rune == 0xA7D2 ||
395+
rune == 0xA7D4 || rune == 0xA7D6 || rune == 0xA7D8) {
396+
folded[0] = rune + 1;
397+
return 1;
398+
}
394399
if (rune == 0xA7DA) { folded[0] = 0xA7DB; return 1; } // Ꟛ → ꟛ
395400
if (rune == 0xA7F5) { folded[0] = 0xA7F6; return 1; } // Ꟶ → ꟶ
396401

python/stringzilla.c

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3151,6 +3151,72 @@ static PyObject *Str_like_translate(PyObject *self, PyObject *const *args, Py_ss
31513151
}
31523152
}
31533153

3154+
static char const doc_utf8_case_fold[] = //
3155+
"Apply Unicode case folding to a UTF-8 string.\n"
3156+
"\n"
3157+
"Case folding normalizes text for case-insensitive comparisons,\n"
3158+
"handling one-to-many expansions (e.g., German sharp S to 'ss').\n"
3159+
"\n"
3160+
"Args:\n"
3161+
" text (Str or str or bytes): The input UTF-8 string.\n"
3162+
"\n"
3163+
"Returns:\n"
3164+
" bytes: The case-folded UTF-8 string.\n"
3165+
"\n"
3166+
"Example:\n"
3167+
" >>> sz.utf8_case_fold('HELLO')\n"
3168+
" b'hello'\n"
3169+
" >>> sz.utf8_case_fold('Stra\\u00dfe') # German sharp S\n"
3170+
" b'strasse'";
3171+
3172+
static PyObject *Str_like_utf8_case_fold(PyObject *self, PyObject *const *args, Py_ssize_t positional_args_count,
3173+
PyObject *args_names_tuple) {
3174+
int is_member = self != NULL && PyObject_TypeCheck(self, &StrType);
3175+
Py_ssize_t nargs_expected = !is_member; // 0 if method, 1 if module function
3176+
3177+
if (positional_args_count != nargs_expected) {
3178+
PyErr_Format(PyExc_TypeError, "utf8_case_fold() takes exactly %zd argument(s)", nargs_expected);
3179+
return NULL;
3180+
}
3181+
3182+
// Reject keyword arguments
3183+
if (args_names_tuple && PyTuple_GET_SIZE(args_names_tuple) > 0) {
3184+
PyErr_SetString(PyExc_TypeError, "utf8_case_fold() takes no keyword arguments");
3185+
return NULL;
3186+
}
3187+
3188+
PyObject *str_obj = is_member ? self : args[0];
3189+
3190+
sz_string_view_t str;
3191+
if (!sz_py_export_string_like(str_obj, &str.start, &str.length)) {
3192+
wrap_current_exception("Argument must be string-like");
3193+
return NULL;
3194+
}
3195+
3196+
// Allocate buffer with 3x capacity for maximum expansion (e.g., some Greek characters)
3197+
sz_size_t max_result_length = str.length * 3;
3198+
if (max_result_length == 0) { return PyBytes_FromStringAndSize("", 0); }
3199+
3200+
PyObject *result_bytes = PyBytes_FromStringAndSize(NULL, max_result_length);
3201+
if (!result_bytes) {
3202+
PyErr_SetString(PyExc_MemoryError, "Unable to allocate memory for case-folded string");
3203+
return NULL;
3204+
}
3205+
3206+
sz_ptr_t destination = (sz_ptr_t)PyBytes_AS_STRING(result_bytes);
3207+
sz_size_t actual_length = sz_utf8_case_fold(str.start, str.length, destination);
3208+
3209+
// Resize to actual length if smaller than allocated
3210+
if (actual_length < max_result_length) {
3211+
if (_PyBytes_Resize(&result_bytes, actual_length) < 0) {
3212+
Py_XDECREF(result_bytes);
3213+
return NULL;
3214+
}
3215+
}
3216+
3217+
return result_bytes;
3218+
}
3219+
31543220
static char const doc_find_first_of[] = //
31553221
"Find the index of the first occurrence of any character from another string.\n"
31563222
"\n"
@@ -4252,6 +4318,7 @@ static PyMethodDef Str_methods[] = {
42524318
{"lstrip", (PyCFunction)Str_like_lstrip, SZ_METHOD_FLAGS, doc_lstrip},
42534319
{"rstrip", (PyCFunction)Str_like_rstrip, SZ_METHOD_FLAGS, doc_rstrip},
42544320
{"strip", (PyCFunction)Str_like_strip, SZ_METHOD_FLAGS, doc_strip},
4321+
{"utf8_case_fold", (PyCFunction)Str_like_utf8_case_fold, SZ_METHOD_FLAGS, doc_utf8_case_fold},
42554322

42564323
// Bidirectional operations
42574324
{"find", (PyCFunction)Str_like_find, SZ_METHOD_FLAGS, doc_find},
@@ -6357,6 +6424,7 @@ static PyMethodDef stringzilla_methods[] = {
63576424
{"lstrip", (PyCFunction)Str_like_lstrip, SZ_METHOD_FLAGS, doc_lstrip},
63586425
{"rstrip", (PyCFunction)Str_like_rstrip, SZ_METHOD_FLAGS, doc_rstrip},
63596426
{"strip", (PyCFunction)Str_like_strip, SZ_METHOD_FLAGS, doc_strip},
6427+
{"utf8_case_fold", (PyCFunction)Str_like_utf8_case_fold, SZ_METHOD_FLAGS, doc_utf8_case_fold},
63606428

63616429
// Bidirectional operations
63626430
{"find", (PyCFunction)Str_like_find, SZ_METHOD_FLAGS, doc_find},

scripts/test_stringzilla.py

Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1447,5 +1447,188 @@ def test_invalid_utf8_handling():
14471447
assert len(str_result) > 0
14481448

14491449

1450+
def test_unit_utf8_case_fold():
    """Smoke-test `utf8_case_fold` on ASCII text, the sharp-S expansion, and each accepted input form."""
    # `str` inputs through the module-level function, checked in order:
    # plain ASCII first, then the German sharp S expansion (ß → ss)
    text_cases = (
        ("HELLO", b"hello"),
        ("Hello World", b"hello world"),
        ("", b""),
        ("already lowercase", b"already lowercase"),
        ("Straße", b"strasse"),
        ("GROSSE", b"grosse"),
    )
    for text, folded in text_cases:
        assert sz.utf8_case_fold(text) == folded

    # The same operation exposed as a method on `Str`
    as_method = sz.Str("HELLO").utf8_case_fold()
    assert as_method == b"hello"

    # Raw `bytes` are accepted as well
    from_bytes = sz.utf8_case_fold(b"HELLO")
    assert from_bytes == b"hello"
1467+
1468+
1469+
@pytest.mark.parametrize(
    "input_str, expected",
    [
        ("A", b"a"),
        ("Z", b"z"),
        # Characters whose identity matters are written as escapes, so that editors or
        # normalizing tools can't silently replace them with look-alike ASCII sequences.
        ("\u00df", b"ss"),  # German sharp S (U+00DF)
        ("\u1e9e", b"ss"),  # Capital sharp S (U+1E9E)
        ("\ufb01", b"fi"),  # fi ligature (U+FB01)
        ("\ufb00", b"ff"),  # ff ligature (U+FB00)
        ("\ufb02", b"fl"),  # fl ligature (U+FB02)
        ("\ufb03", b"ffi"),  # ffi ligature (U+FB03)
        ("\ufb04", b"ffl"),  # ffl ligature (U+FB04)
        ("Σ", "σ".encode("utf-8")),  # Greek Sigma
        ("Ω", "ω".encode("utf-8")),  # Greek Omega
        ("Ä", "ä".encode("utf-8")),  # German umlaut
        ("É", "é".encode("utf-8")),  # French accent
        ("Ñ", "ñ".encode("utf-8")),  # Spanish tilde
    ],
)
def test_utf8_case_fold_expansions(input_str, expected):
    """Check `utf8_case_fold` against known single-character foldings, including one-to-many expansions.

    Args:
        input_str: The text to fold (a single character in every case above).
        expected: The UTF-8 bytes the folded result must equal.
    """
    assert sz.utf8_case_fold(input_str) == expected
1491+
1492+
1493+
def _parse_case_folding_file(filepath: str) -> Dict[int, bytes]:
1494+
"""Parse Unicode CaseFolding.txt into a dict: codepoint -> folded UTF-8 bytes.
1495+
1496+
Uses status C (common) and F (full) mappings for full case folding.
1497+
"""
1498+
folds = {}
1499+
with open(filepath, "r", encoding="utf-8") as f:
1500+
for line in f:
1501+
line = line.strip()
1502+
if not line or line.startswith("#"):
1503+
continue
1504+
parts = line.split(";")
1505+
if len(parts) < 3:
1506+
continue
1507+
status = parts[1].strip()
1508+
# C = common, F = full (for expansions like ß → ss)
1509+
# Skip S (simple) and T (Turkic) for full case folding
1510+
if status not in ("C", "F"):
1511+
continue
1512+
try:
1513+
codepoint = int(parts[0].strip(), 16)
1514+
# Mapping can be multiple codepoints separated by spaces (e.g., "0073 0073" for ß → ss)
1515+
target_cps = [int(x, 16) for x in parts[2].split("#")[0].strip().split()]
1516+
# Convert target codepoints to UTF-8 bytes
1517+
folded_str = "".join(chr(cp) for cp in target_cps)
1518+
folds[codepoint] = folded_str.encode("utf-8")
1519+
except (ValueError, IndexError):
1520+
continue
1521+
return folds
1522+
1523+
1524+
def _get_case_folding_rules(version: str = "17.0.0") -> Dict[int, bytes]:
    """Download and parse Unicode CaseFolding.txt, caching it in the system temp directory.

    The file is fetched into a temporary sibling file and only renamed to the cache
    path once the download has finished, so an interrupted transfer can't leave a
    truncated file behind that later runs would silently reuse.

    Args:
        version: Unicode version string (e.g., "17.0.0")

    Returns:
        Dict mapping codepoints to their folded UTF-8 bytes

    Note:
        If the file is not cached and can't be downloaded, the calling test is
        skipped via `pytest.skip` instead of failing.
    """
    import urllib.request

    cache_dir = tempfile.gettempdir()
    cache_path = os.path.join(cache_dir, f"CaseFolding-{version}.txt")

    # Use cached file if it exists
    if not os.path.exists(cache_path):
        url = f"https://www.unicode.org/Public/{version}/ucd/CaseFolding.txt"
        partial_path = None
        try:
            fd, partial_path = tempfile.mkstemp(prefix=f"CaseFolding-{version}-", suffix=".part", dir=cache_dir)
            os.close(fd)
            urllib.request.urlretrieve(url, partial_path)
            # Rename within one directory, so the cache file appears only when complete
            os.replace(partial_path, cache_path)
        except Exception as e:
            # Best-effort cleanup of the partial download; the skip below is what matters
            if partial_path is not None and os.path.exists(partial_path):
                try:
                    os.remove(partial_path)
                except OSError:
                    pass
            pytest.skip(f"Could not download CaseFolding.txt from {url}: {e}")

    return _parse_case_folding_file(cache_path)
1546+
1547+
1548+
def test_utf8_case_fold_all_codepoints():
    """Compare StringZilla case folding with Unicode 17.0 CaseFolding.txt rules.

    This test downloads the official Unicode 17.0 case folding data file to validate
    StringZilla's implementation, independent of Python's Unicode version.
    The file is cached in the system temp directory for subsequent runs.

    Every codepoint except the surrogates is folded on its own and compared with the
    table; codepoints absent from the table are expected to fold to themselves.
    Exceptions raised by `utf8_case_fold` are deliberately not caught, so a crash on
    any valid codepoint fails the test instead of being skipped over.
    """
    # Load Unicode 17.0 case folding rules (downloads and caches automatically)
    unicode_folds = _get_case_folding_rules("17.0.0")
    print(f"\nLoaded {len(unicode_folds)} case folding rules from Unicode 17.0")

    mismatches = []
    missing_folds = []
    extra_folds = []

    for codepoint in range(0x110000):
        # Skip surrogates (not valid in UTF-8); every remaining codepoint encodes cleanly
        if 0xD800 <= codepoint <= 0xDFFF:
            continue

        char = chr(codepoint)
        char_bytes = char.encode("utf-8")
        sz_folded = sz.utf8_case_fold(char)

        # Get expected folding from Unicode 17.0 rules
        # If not in the table, character maps to itself
        expected = unicode_folds.get(codepoint, char_bytes)
        if sz_folded == expected:
            continue

        entry = (f"U+{codepoint:04X}", repr(char), expected.hex(), sz_folded.hex())
        if codepoint in unicode_folds and sz_folded == char_bytes:
            missing_folds.append(entry)  # StringZilla didn't fold but should have
        elif codepoint not in unicode_folds and sz_folded != char_bytes:
            extra_folds.append(entry)  # StringZilla folded but shouldn't have
        else:
            mismatches.append(entry)  # Both fold but to different targets

    # Report statistics
    print(f" Missing folds (StringZilla should fold): {len(missing_folds)}")
    print(f" Extra folds (StringZilla shouldn't fold): {len(extra_folds)}")
    print(f" Wrong target (both fold differently): {len(mismatches)}")

    if missing_folds:
        print(f" First 5 missing: {missing_folds[:5]}")
    if extra_folds:
        print(f" First 5 extra: {extra_folds[:5]}")
    if mismatches:
        print(f" First 5 wrong: {mismatches[:5]}")

    total_errors = len(mismatches) + len(missing_folds) + len(extra_folds)
    assert total_errors == 0, (
        f"Found {total_errors} case folding errors vs Unicode 17.0: "
        f"{len(mismatches)} wrong targets, {len(missing_folds)} missing, {len(extra_folds)} extra. "
        f"First 10 overall: {(mismatches + missing_folds + extra_folds)[:10]}"
    )
1606+
1607+
1608+
@pytest.mark.parametrize("seed_value", SEED_VALUES)
def test_utf8_case_fold_random_strings(seed_value: int):
    """Cross-check `utf8_case_fold` against `str.casefold` on seeded random strings.

    Args:
        seed_value: Seed for the random generator, making each parametrized run reproducible.
    """
    seed(seed_value)

    # Round 1: strings made only of ASCII uppercase letters (A-Z)
    for _ in range(50):
        size = randint(1, 100)
        letters = [chr(randint(0x41, 0x5A)) for _ in range(size)]
        test_str = "".join(letters)
        python_folded = test_str.casefold().encode("utf-8")
        sz_folded = sz.utf8_case_fold(test_str)
        assert python_folded == sz_folded, f"Mismatch for: {test_str!r}"

    # Round 2: ASCII uppercase followed by half as many U+00C0..U+00FF characters
    # (Latin Extended, includes ß, etc.)
    for _ in range(50):
        size = randint(1, 50)
        ascii_part = [randint(0x41, 0x5A) for _ in range(size)]
        latin_part = [randint(0xC0, 0xFF) for _ in range(size // 2)]
        test_str = "".join(map(chr, ascii_part + latin_part))
        python_folded = test_str.casefold().encode("utf-8")
        sz_folded = sz.utf8_case_fold(test_str)
        assert python_folded == sz_folded, f"Mismatch for: {test_str!r}"
1631+
1632+
14501633
if __name__ == "__main__":
14511634
sys.exit(pytest.main(["-x", "-s", __file__]))

0 commit comments

Comments
 (0)