|
7 | 7 | """
|
8 | 8 |
|
9 | 9 | import hashlib
|
| 10 | +from http.client import HTTPException |
10 | 11 | import sys
|
11 | 12 | import unicodedata
|
12 | 13 | import unittest
|
13 |
| -from test.support import script_helper |
| 14 | +from test.support import open_urlresource, script_helper |
14 | 15 |
|
15 | 16 |
|
16 | 17 | class UnicodeMethodsTest(unittest.TestCase):
|
@@ -171,13 +172,6 @@ def test_combining(self):
|
171 | 172 | self.assertRaises(TypeError, self.db.combining)
|
172 | 173 | self.assertRaises(TypeError, self.db.combining, 'xx')
|
173 | 174 |
|
174 |
| - def test_normalize(self): |
175 |
| - self.assertRaises(TypeError, self.db.normalize) |
176 |
| - self.assertRaises(ValueError, self.db.normalize, 'unknown', 'xx') |
177 |
| - self.assertEqual(self.db.normalize('NFKC', ''), '') |
178 |
| - # The rest can be found in test_normalization.py |
179 |
| - # which requires an external file. |
180 |
| - |
181 | 175 | def test_pr29(self):
|
182 | 176 | # http://www.unicode.org/review/pr-29.html
|
183 | 177 | # See issues #1054943 and #10254.
|
@@ -208,9 +202,6 @@ def test_issue29456(self):
|
208 | 202 | self.assertEqual(self.db.normalize('NFC', u11a7_str_a), u11a7_str_b)
|
209 | 203 | self.assertEqual(self.db.normalize('NFC', u11c3_str_a), u11c3_str_b)
|
210 | 204 |
|
211 |
| - # For tests of unicodedata.is_normalized / self.db.is_normalized , |
212 |
| - # see test_normalization.py . |
213 |
| - |
214 | 205 | def test_east_asian_width(self):
|
215 | 206 | eaw = self.db.east_asian_width
|
216 | 207 | self.assertRaises(TypeError, eaw, b'a')
|
@@ -315,5 +306,102 @@ def test_linebreak_7643(self):
|
315 | 306 | self.assertEqual(len(lines), 1,
|
316 | 307 | r"\u%.4x should not be a linebreak" % i)
|
317 | 308 |
|
| 309 | +class NormalizationTest(unittest.TestCase): |
| 310 | + @staticmethod |
| 311 | + def check_version(testfile): |
| 312 | + hdr = testfile.readline() |
| 313 | + return unicodedata.unidata_version in hdr |
| 314 | + |
| 315 | + @staticmethod |
| 316 | + def unistr(data): |
| 317 | + data = [int(x, 16) for x in data.split(" ")] |
| 318 | + return "".join([chr(x) for x in data]) |
| 319 | + |
| 320 | + def test_normalization(self): |
| 321 | + TESTDATAFILE = "NormalizationTest.txt" |
| 322 | + TESTDATAURL = f"http://www.pythontest.net/unicode/{unicodedata.unidata_version}/{TESTDATAFILE}" |
| 323 | + |
| 324 | + # Hit the exception early |
| 325 | + try: |
| 326 | + testdata = open_urlresource(TESTDATAURL, encoding="utf-8", |
| 327 | + check=self.check_version) |
| 328 | + except PermissionError: |
| 329 | + self.skipTest(f"Permission error when downloading {TESTDATAURL} " |
| 330 | + f"into the test data directory") |
| 331 | + except (OSError, HTTPException): |
| 332 | + self.fail(f"Could not retrieve {TESTDATAURL}") |
| 333 | + |
| 334 | + with testdata: |
| 335 | + self.run_normalization_tests(testdata) |
| 336 | + |
| 337 | + def run_normalization_tests(self, testdata): |
| 338 | + part = None |
| 339 | + part1_data = {} |
| 340 | + |
| 341 | + def NFC(str): |
| 342 | + return unicodedata.normalize("NFC", str) |
| 343 | + |
| 344 | + def NFKC(str): |
| 345 | + return unicodedata.normalize("NFKC", str) |
| 346 | + |
| 347 | + def NFD(str): |
| 348 | + return unicodedata.normalize("NFD", str) |
| 349 | + |
| 350 | + def NFKD(str): |
| 351 | + return unicodedata.normalize("NFKD", str) |
| 352 | + |
| 353 | + for line in testdata: |
| 354 | + if '#' in line: |
| 355 | + line = line.split('#')[0] |
| 356 | + line = line.strip() |
| 357 | + if not line: |
| 358 | + continue |
| 359 | + if line.startswith("@Part"): |
| 360 | + part = line.split()[0] |
| 361 | + continue |
| 362 | + c1,c2,c3,c4,c5 = [self.unistr(x) for x in line.split(';')[:-1]] |
| 363 | + |
| 364 | + # Perform tests |
| 365 | + self.assertTrue(c2 == NFC(c1) == NFC(c2) == NFC(c3), line) |
| 366 | + self.assertTrue(c4 == NFC(c4) == NFC(c5), line) |
| 367 | + self.assertTrue(c3 == NFD(c1) == NFD(c2) == NFD(c3), line) |
| 368 | + self.assertTrue(c5 == NFD(c4) == NFD(c5), line) |
| 369 | + self.assertTrue(c4 == NFKC(c1) == NFKC(c2) == \ |
| 370 | + NFKC(c3) == NFKC(c4) == NFKC(c5), |
| 371 | + line) |
| 372 | + self.assertTrue(c5 == NFKD(c1) == NFKD(c2) == \ |
| 373 | + NFKD(c3) == NFKD(c4) == NFKD(c5), |
| 374 | + line) |
| 375 | + |
| 376 | + self.assertTrue(unicodedata.is_normalized("NFC", c2)) |
| 377 | + self.assertTrue(unicodedata.is_normalized("NFC", c4)) |
| 378 | + |
| 379 | + self.assertTrue(unicodedata.is_normalized("NFD", c3)) |
| 380 | + self.assertTrue(unicodedata.is_normalized("NFD", c5)) |
| 381 | + |
| 382 | + self.assertTrue(unicodedata.is_normalized("NFKC", c4)) |
| 383 | + self.assertTrue(unicodedata.is_normalized("NFKD", c5)) |
| 384 | + |
| 385 | + # Record part 1 data |
| 386 | + if part == "@Part1": |
| 387 | + part1_data[c1] = 1 |
| 388 | + |
| 389 | + # Perform tests for all other data |
| 390 | + for c in range(sys.maxunicode+1): |
| 391 | + X = chr(c) |
| 392 | + if X in part1_data: |
| 393 | + continue |
| 394 | + self.assertTrue(X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X), c) |
| 395 | + |
| 396 | + def test_edge_cases(self): |
| 397 | + self.assertRaises(TypeError, unicodedata.normalize) |
| 398 | + self.assertRaises(ValueError, unicodedata.normalize, 'unknown', 'xx') |
| 399 | + self.assertEqual(unicodedata.normalize('NFKC', ''), '') |
| 400 | + |
| 401 | + def test_bug_834676(self): |
| 402 | + # Check for bug 834676 |
| 403 | + unicodedata.normalize('NFC', '\ud55c\uae00') |
| 404 | + |
| 405 | + |
318 | 406 | if __name__ == "__main__":
|
319 | 407 | unittest.main()
|
0 commit comments