@@ -1447,5 +1447,188 @@ def test_invalid_utf8_handling():
14471447 assert len (str_result ) > 0
14481448
14491449
def test_unit_utf8_case_fold():
    """Smoke-test `utf8_case_fold` on a handful of short, hand-picked inputs.

    Exercises the module-level function with `str` and `bytes` arguments and
    the method form on `sz.Str`; every result is compared against a `bytes`
    literal.
    """
    # (input text, expected folded bytes), checked in order via the function form.
    text_cases = [
        # Plain ASCII, including the empty string and an already-folded input
        ("HELLO", b"hello"),
        ("Hello World", b"hello world"),
        ("", b""),
        ("already lowercase", b"already lowercase"),
        # German sharp S expands to two bytes (ß → ss)
        ("Straße", b"strasse"),
        ("GROSSE", b"grosse"),
    ]
    for text, folded in text_cases:
        assert sz.utf8_case_fold(text) == folded

    # Same operation exposed as a method on Str
    wrapped = sz.Str("HELLO")
    assert wrapped.utf8_case_fold() == b"hello"

    # The function form also accepts bytes
    assert sz.utf8_case_fold(b"HELLO") == b"hello"
1467+
1468+
@pytest.mark.parametrize(
    "input_str, expected",
    [
        ("A", b"a"),
        ("Z", b"z"),
        ("ß", b"ss"),  # German sharp S (U+00DF)
        ("ẞ", b"ss"),  # Capital sharp S (U+1E9E)
        # Ligatures are spelled with explicit escapes so the test input cannot
        # silently degrade to plain ASCII letters (e.g. through an editor or a
        # compatibility-normalizing copy/paste), which would make these cases
        # trivially pass without exercising the expansion.
        ("\ufb01", b"fi"),  # fi ligature (U+FB01)
        ("\ufb00", b"ff"),  # ff ligature (U+FB00)
        ("\ufb02", b"fl"),  # fl ligature (U+FB02)
        ("\ufb03", b"ffi"),  # ffi ligature (U+FB03)
        ("\ufb04", b"ffl"),  # ffl ligature (U+FB04)
        ("Σ", "σ".encode("utf-8")),  # Greek Sigma
        ("Ω", "ω".encode("utf-8")),  # Greek Omega
        ("Ä", "ä".encode("utf-8")),  # German umlaut
        ("É", "é".encode("utf-8")),  # French accent
        ("Ñ", "ñ".encode("utf-8")),  # Spanish tilde
    ],
)
def test_utf8_case_fold_expansions(input_str, expected):
    """Check known single-character foldings, including one-to-many expansions.

    Args:
        input_str: A one-codepoint string to fold.
        expected: The UTF-8 bytes `sz.utf8_case_fold` must return for it.
    """
    assert sz.utf8_case_fold(input_str) == expected
1491+
1492+
def _parse_case_folding_file(filepath: str) -> Dict[int, bytes]:
    """Build a codepoint -> folded UTF-8 bytes table from a CaseFolding.txt file.

    Only records whose status field is C (common) or F (full) are kept, which
    together make up full case folding; S (simple) and T (Turkic) records are
    ignored. Blank lines, comment lines, records with fewer than three
    `;`-separated fields, and records whose hex fields cannot be converted
    are skipped.

    Args:
        filepath: Path to a UTF-8 encoded file in CaseFolding.txt format.

    Returns:
        Dict mapping each source codepoint to the UTF-8 encoding of its folded
        form (possibly several codepoints, e.g. ß → "ss").
    """
    accepted_statuses = ("C", "F")
    table = {}
    with open(filepath, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            record = raw_line.strip()
            if record == "" or record.startswith("#"):
                continue

            fields = record.split(";")
            if len(fields) < 3:
                continue
            if fields[1].strip() not in accepted_statuses:
                continue

            try:
                source_cp = int(fields[0].strip(), 16)
                # The mapping field holds one or more space-separated hex
                # codepoints; anything after a '#' in it is a comment.
                mapping_text = fields[2].split("#")[0].strip()
                mapped_cps = [int(token, 16) for token in mapping_text.split()]
                folded_text = "".join(map(chr, mapped_cps))
                table[source_cp] = folded_text.encode("utf-8")
            except (ValueError, IndexError):
                # Malformed record: leave it out of the table.
                continue
    return table
1522+
1523+
def _get_case_folding_rules(version: str = "17.0.0") -> Dict[int, bytes]:
    """Download and parse Unicode CaseFolding.txt, caching it in the temp directory.

    The file is fetched only when no cached copy exists. The download is
    written to a per-process partial file and moved into place with
    `os.replace` once complete, so an interrupted transfer never leaves a
    truncated file at the cache path for later runs to pick up.

    If the download (or the final move) fails, the calling test is skipped
    via `pytest.skip` instead of failing.

    Args:
        version: Unicode version string (e.g., "17.0.0")

    Returns:
        Dict mapping codepoints to their folded UTF-8 bytes
    """
    import urllib.request

    cache_path = os.path.join(tempfile.gettempdir(), f"CaseFolding-{version}.txt")

    # Use cached file if it exists
    if not os.path.exists(cache_path):
        # No whitespace may surround the version segment: a space in the path
        # makes the URL invalid and the request is rejected.
        url = f"https://www.unicode.org/Public/{version}/ucd/CaseFolding.txt"
        # The PID suffix keeps concurrent test processes from writing to the
        # same partial file.
        partial_path = f"{cache_path}.{os.getpid()}.part"
        try:
            urllib.request.urlretrieve(url, partial_path)
            os.replace(partial_path, cache_path)
        except Exception as e:
            # Best-effort cleanup of whatever was written before the failure.
            try:
                os.remove(partial_path)
            except OSError:
                pass
            pytest.skip(f"Could not download CaseFolding.txt from {url} : {e} ")

    return _parse_case_folding_file(cache_path)
1546+
1547+
def test_utf8_case_fold_all_codepoints():
    """Compare StringZilla case folding with Unicode 17.0 CaseFolding.txt rules.

    Every non-surrogate codepoint in range(0x110000) is folded on its own with
    `sz.utf8_case_fold` and compared with the table returned by
    `_get_case_folding_rules("17.0.0")`; codepoints absent from the table are
    expected to fold to themselves. Using the data file rather than
    `str.casefold` keeps the check independent of Python's Unicode version.

    Disagreements are grouped into missing folds, extra folds, and wrong
    targets. Codepoints for which `sz.utf8_case_fold` raises `ValueError` or
    `UnicodeEncodeError` are recorded and counted as errors too, rather than
    being silently skipped.
    """
    # Load Unicode 17.0 case folding rules (downloads and caches automatically)
    unicode_folds = _get_case_folding_rules("17.0.0")
    print(f"\n Loaded {len(unicode_folds)} case folding rules from Unicode 17.0")

    mismatches = []
    missing_folds = []
    extra_folds = []
    failed_calls = []

    for codepoint in range(0x110000):
        # Skip surrogates (not valid in UTF-8)
        if 0xD800 <= codepoint <= 0xDFFF:
            continue

        # With surrogates excluded, chr() and encode() cannot fail here.
        char = chr(codepoint)
        char_bytes = char.encode("utf-8")

        try:
            sz_folded = sz.utf8_case_fold(char)
        except (ValueError, UnicodeEncodeError) as exc:
            # A raise on a valid codepoint is a library failure: report it
            # instead of dropping the codepoint from the comparison.
            failed_calls.append((f"U+{codepoint:04X} ", repr(char), repr(exc)))
            continue

        # Get expected folding from Unicode 17.0 rules
        # If not in the table, character maps to itself
        expected = unicode_folds.get(codepoint, char_bytes)
        if sz_folded == expected:
            continue

        entry = (f"U+{codepoint:04X} ", repr(char), expected.hex(), sz_folded.hex())
        if codepoint in unicode_folds and sz_folded == char_bytes:
            missing_folds.append(entry)  # StringZilla didn't fold but should have
        elif codepoint not in unicode_folds and sz_folded != char_bytes:
            extra_folds.append(entry)  # StringZilla folded but shouldn't have
        else:
            mismatches.append(entry)  # Both fold but to different targets

    # Report statistics
    print(f" Missing folds (StringZilla should fold): {len(missing_folds)} ")
    print(f" Extra folds (StringZilla shouldn't fold): {len(extra_folds)} ")
    print(f" Wrong target (both fold differently): {len(mismatches)} ")
    print(f" Calls that raised: {len(failed_calls)} ")

    if missing_folds:
        print(f" First 5 missing: {missing_folds[:5]} ")
    if extra_folds:
        print(f" First 5 extra: {extra_folds[:5]} ")
    if mismatches:
        print(f" First 5 wrong: {mismatches[:5]} ")
    if failed_calls:
        print(f" First 5 raised: {failed_calls[:5]} ")

    total_errors = len(mismatches) + len(missing_folds) + len(extra_folds) + len(failed_calls)
    assert total_errors == 0, (
        f"Found {total_errors} case folding errors vs Unicode 17.0: "
        f"{len(mismatches)} wrong targets, {len(missing_folds)} missing, {len(extra_folds)} extra. "
        f"First 10 overall: {(mismatches + missing_folds + extra_folds)[:10]} "
        f"Calls that raised: {len(failed_calls)} (first 5: {failed_calls[:5]})"
    )
1606+
1607+
@pytest.mark.parametrize("seed_value", SEED_VALUES)
def test_utf8_case_fold_random_strings(seed_value: int):
    """Cross-check `sz.utf8_case_fold` against `str.casefold` on seeded random strings.

    Two batches of 50 strings are generated per seed: pure ASCII uppercase,
    then ASCII uppercase followed by characters drawn from U+00C0..U+00FF.
    """
    seed(seed_value)

    # Batch 1: strings of 1..100 letters drawn from A-Z
    for _ in range(50):
        size = randint(1, 100)
        letters = [chr(randint(0x41, 0x5A)) for _ in range(size)]
        sample = "".join(letters)
        reference = sample.casefold().encode("utf-8")
        actual = sz.utf8_case_fold(sample)
        assert reference == actual, f"Mismatch for: {sample!r} "

    # Batch 2: 1..50 letters from A-Z, then half as many characters from the
    # U+00C0..U+00FF range (which includes ß)
    for _ in range(50):
        size = randint(1, 50)
        ascii_part = [randint(0x41, 0x5A) for _ in range(size)]
        latin_part = [randint(0xC0, 0xFF) for _ in range(size // 2)]
        sample = "".join(map(chr, ascii_part + latin_part))
        reference = sample.casefold().encode("utf-8")
        actual = sz.utf8_case_fold(sample)
        assert reference == actual, f"Mismatch for: {sample!r} "
1631+
1632+
if __name__ == "__main__":
    # Running this file directly hands it to pytest:
    # -x stops at the first failure, -s disables output capturing.
    pytest_args = ["-x", "-s", __file__]
    sys.exit(pytest.main(pytest_args))
0 commit comments