Skip to content

Commit 1ad0c77

Browse files
gnprice authored and benjaminp committed
bpo-38043: Move unicodedata.normalize tests into test_unicodedata. (GH-15712)
Having these in a separate file from the one that's named after the module in the usual way makes it very easy to miss them when looking for tests for these two functions. (In fact, when working recently on is_normalized, I'd been surprised to see no tests for it here and concluded the function had evaded being tested at all. I'd gone as far as to write up some tests myself before I spotted this other file.)

Mostly this just means moving all of the one file's code into the other, and moving code from the module toplevel to inside the test class to keep it tidily separate from the rest of the file's code.

There's one substantive change, which reduces by a bit the amount of code to be moved: we drop the `x > sys.maxunicode` conditional and all the `RangeError` logic behind it. Now if that condition ever occurs it will cause an error at `chr(x)`, and a test failure. That's the right result because, since PEP 393 in Python 3.3, there is no longer such a thing as an "unsupported character".
1 parent 5b00dd8 commit 1ad0c77

File tree

4 files changed

+102
-130
lines changed

4 files changed

+102
-130
lines changed

Lib/test/test_normalization.py

Lines changed: 0 additions & 117 deletions
This file was deleted.

Lib/test/test_ucn.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,6 @@
1212

1313
from test import support
1414
from http.client import HTTPException
15-
from test.test_normalization import check_version
1615

1716
try:
1817
from _testcapi import INT_MAX, PY_SSIZE_T_MAX, UINT_MAX
@@ -172,6 +171,9 @@ def test_named_sequences_sample(self):
172171

173172
def test_named_sequences_full(self):
174173
# Check all the named sequences
174+
def check_version(testfile):
175+
hdr = testfile.readline()
176+
return unicodedata.unidata_version in hdr
175177
url = ("http://www.pythontest.net/unicode/%s/NamedSequences.txt" %
176178
unicodedata.unidata_version)
177179
try:

Lib/test/test_unicodedata.py

Lines changed: 99 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -7,10 +7,11 @@
77
"""
88

99
import hashlib
10+
from http.client import HTTPException
1011
import sys
1112
import unicodedata
1213
import unittest
13-
from test.support import script_helper
14+
from test.support import open_urlresource, script_helper
1415

1516

1617
class UnicodeMethodsTest(unittest.TestCase):
@@ -171,13 +172,6 @@ def test_combining(self):
171172
self.assertRaises(TypeError, self.db.combining)
172173
self.assertRaises(TypeError, self.db.combining, 'xx')
173174

174-
def test_normalize(self):
175-
self.assertRaises(TypeError, self.db.normalize)
176-
self.assertRaises(ValueError, self.db.normalize, 'unknown', 'xx')
177-
self.assertEqual(self.db.normalize('NFKC', ''), '')
178-
# The rest can be found in test_normalization.py
179-
# which requires an external file.
180-
181175
def test_pr29(self):
182176
# http://www.unicode.org/review/pr-29.html
183177
# See issues #1054943 and #10254.
@@ -208,9 +202,6 @@ def test_issue29456(self):
208202
self.assertEqual(self.db.normalize('NFC', u11a7_str_a), u11a7_str_b)
209203
self.assertEqual(self.db.normalize('NFC', u11c3_str_a), u11c3_str_b)
210204

211-
# For tests of unicodedata.is_normalized / self.db.is_normalized ,
212-
# see test_normalization.py .
213-
214205
def test_east_asian_width(self):
215206
eaw = self.db.east_asian_width
216207
self.assertRaises(TypeError, eaw, b'a')
@@ -315,5 +306,102 @@ def test_linebreak_7643(self):
315306
self.assertEqual(len(lines), 1,
316307
r"\u%.4x should not be a linebreak" % i)
317308

309+
class NormalizationTest(unittest.TestCase):
    # Tests for unicodedata.normalize() and unicodedata.is_normalized().
    # The data-driven part reads NormalizationTest.txt for the Unicode
    # version given by unicodedata.unidata_version, obtained through
    # open_urlresource().

    @staticmethod
    def check_version(testfile):
        """Return True if the first line of *testfile* mentions
        unicodedata.unidata_version.

        Reads one line from *testfile*.  Passed as the ``check``
        argument of open_urlresource() by test_normalization().
        """
        hdr = testfile.readline()
        return unicodedata.unidata_version in hdr

    @staticmethod
    def unistr(data):
        """Convert space-separated hexadecimal code points to a str.

        For example, "0041 0301" gives "A\\u0301".  A value above
        sys.maxunicode makes chr() raise ValueError.
        """
        return "".join(chr(int(x, 16)) for x in data.split(" "))

    def test_normalization(self):
        filename = "NormalizationTest.txt"
        url = (f"http://www.pythontest.net/unicode/"
               f"{unicodedata.unidata_version}/{filename}")

        # Hit the exception early
        try:
            testdata = open_urlresource(url, encoding="utf-8",
                                        check=self.check_version)
        except PermissionError:
            self.skipTest(f"Permission error when downloading {url} "
                          f"into the test data directory")
        except (OSError, HTTPException):
            self.fail(f"Could not retrieve {url}")

        with testdata:
            self.run_normalization_tests(testdata)

    def run_normalization_tests(self, testdata):
        """Check every data line of *testdata*, then sweep all code points.

        *testdata* is an iterable of text lines.  Anything after a '#'
        is ignored, blank lines are skipped, and a line starting with
        "@Part" only records which part the following lines belong to.
        Every other line is split on ';' and, with the last piece
        dropped, must yield five fields c1..c5 of space-separated hex
        code points.

        Once the lines are exhausted, every single-character string up
        to sys.maxunicode that was not a c1 value in @Part1 must be left
        unchanged by all four normalization forms.
        """
        part = None
        # c1 values seen in @Part1; only membership matters, so a set.
        part1_chars = set()

        def NFC(s):
            return unicodedata.normalize("NFC", s)

        def NFKC(s):
            return unicodedata.normalize("NFKC", s)

        def NFD(s):
            return unicodedata.normalize("NFD", s)

        def NFKD(s):
            return unicodedata.normalize("NFKD", s)

        is_normalized = unicodedata.is_normalized

        for line in testdata:
            # Drop any trailing comment, then surrounding whitespace.
            line = line.partition('#')[0].strip()
            if not line:
                continue
            if line.startswith("@Part"):
                part = line.split()[0]
                continue
            # Each data line ends with ';', so the last piece is dropped.
            c1, c2, c3, c4, c5 = [self.unistr(x)
                                  for x in line.split(';')[:-1]]

            # Perform tests
            self.assertTrue(c2 == NFC(c1) == NFC(c2) == NFC(c3), line)
            self.assertTrue(c4 == NFC(c4) == NFC(c5), line)
            self.assertTrue(c3 == NFD(c1) == NFD(c2) == NFD(c3), line)
            self.assertTrue(c5 == NFD(c4) == NFD(c5), line)
            self.assertTrue(c4 == NFKC(c1) == NFKC(c2) ==
                            NFKC(c3) == NFKC(c4) == NFKC(c5),
                            line)
            self.assertTrue(c5 == NFKD(c1) == NFKD(c2) ==
                            NFKD(c3) == NFKD(c4) == NFKD(c5),
                            line)

            # Pass the data line as the message so that a failure
            # identifies which entry was wrongly reported.
            self.assertTrue(is_normalized("NFC", c2), line)
            self.assertTrue(is_normalized("NFC", c4), line)

            self.assertTrue(is_normalized("NFD", c3), line)
            self.assertTrue(is_normalized("NFD", c5), line)

            self.assertTrue(is_normalized("NFKC", c4), line)
            self.assertTrue(is_normalized("NFKD", c5), line)

            # Record part 1 data
            if part == "@Part1":
                part1_chars.add(c1)

        # Perform tests for all other data
        for c in range(sys.maxunicode + 1):
            X = chr(c)
            if X in part1_chars:
                continue
            self.assertTrue(X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X),
                            f"U+{c:04X}")

    def test_edge_cases(self):
        # Missing arguments, an unknown form name, and the empty string.
        self.assertRaises(TypeError, unicodedata.normalize)
        self.assertRaises(ValueError, unicodedata.normalize, 'unknown', 'xx')
        self.assertEqual(unicodedata.normalize('NFKC', ''), '')

    def test_bug_834676(self):
        # Check for bug 834676
        unicodedata.normalize('NFC', '\ud55c\uae00')
404+
405+
318406
if __name__ == "__main__":
319407
unittest.main()

PCbuild/lib.pyproj

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1191,7 +1191,6 @@
11911191
<Compile Include="test\test_netrc.py" />
11921192
<Compile Include="test\test_nis.py" />
11931193
<Compile Include="test\test_nntplib.py" />
1194-
<Compile Include="test\test_normalization.py" />
11951194
<Compile Include="test\test_ntpath.py" />
11961195
<Compile Include="test\test_numeric_tower.py" />
11971196
<Compile Include="test\test_opcodes.py" />

0 commit comments

Comments
 (0)