Use sys.maxunicode and MAX_UNICODE constants #15067


Closed
wants to merge 3 commits
124 changes: 80 additions & 44 deletions Lib/test/test_unicode.py
@@ -17,6 +17,37 @@
from test import support, string_tests
from test.support.script_helper import assert_python_failure

# First invalid code point past the valid range
INVALID_CODEPOINT = sys.maxunicode + 1

# Characters in the ASCII range (U+0000-U+007f)
ASCII_CHAR = "a"
ASCII_LAST_CHAR = "\x7f"
# Characters in the UCS1 ("latin1") range (U+0000-U+00ff),
# but not in the ASCII range
UCS1_CHAR = "\xe9"
UCS1_LAST_CHAR = "\xff"
# Characters in UCS2 ("BMP") range (U+0000-U+FFFF),
# but not in the UCS1 range
Contributor

In the normative Unicode jargon, UCS-2 isn't a set of characters -- it's an encoding. This range of characters (*) is called the BMP ("Basic Multilingual Plane"). UCS-2 is a partial encoding, which only works for the BMP.

See https://unicode.org/glossary/

This is all really about characters (or code points), not encodings. So I think the talk of UCS-2 is confusing here -- it's best to just say "BMP", which is the proper name of this range of characters.

(*) in fact of code points, but that doesn't matter here
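A quick illustration of that distinction (not part of the patch; just a check one can run in a REPL): BMP characters fit in a single UTF-16 code unit, while characters outside the BMP need a surrogate pair, which is exactly what plain UCS-2 cannot express.

# The last BMP code point encodes to one 16-bit unit; the first astral
# code point needs a surrogate pair (two units), which UCS-2 cannot represent.
assert len("\uffff".encode("utf-16-be")) == 2
assert len("\U00010000".encode("utf-16-be")) == 4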

UCS2_CHAR = "\u20ac" # euro sign
UCS2_LAST_CHAR = "\uffff"
# Characters in UCS4 ("Astral") range (U+0000-U+FFFF),
# but not in the UCS2 range
Contributor

"Astral" means specifically "not the BMP", so this isn't quite right. (Hm, also it should say "U+10FFFF".)

Perhaps just # Characters outside the BMP ("astral" characters).

UCS4_CHAR = '\U0001F355' # slice of pizza
UCS4_LAST_CHAR = chr(sys.maxunicode)

# Test characters of the ASCII range (U+0000-U+007f)
ASCII_CHARS = ("\x00", ASCII_CHAR, ASCII_LAST_CHAR)
# Test characters of the UCS1 range (U+0000-U+00ff), but not in ASCII range
UCS1_ONLY_CHARS = ("\x80", UCS1_CHAR, UCS1_LAST_CHAR)
# Test characters of the UCS1 range (U+0000-U+00ff)
UCS1_CHARS = ASCII_CHARS + UCS1_ONLY_CHARS
# Test characters of the Basic Multilingual Plane (U+0000-U+ffff)
BMP_CHARS = UCS1_CHARS + ("\u0100", UCS2_CHAR, UCS2_LAST_CHAR)
# Test characters of the full Unicode Character Set (U+0000-U+10ffff)
FULL_UCS_CHARS = BMP_CHARS + ('\U00010000', UCS4_CHAR, UCS4_LAST_CHAR)
Contributor

It looks like you don't end up using most of these sequences (except to define the next ones) -- mostly you use the all-inclusive FULL_UCS_CHARS, and in one place UCS1_ONLY_CHARS (but perhaps that too should be the full one; see below).

Which makes sense -- for basically all of the tests that use these sequences, the desired behavior is something like "it works correctly for every codepoint", and Python code shouldn't have to really know or care whether it fits in the BMP or ASCII or whatever.

In that case I think it would be clearest to just write one sequence. Combined with my naming suggestion below, that might look like

# Test characters spanning the full range.
EXAMPLE_CHARS = (
    "\x00", EXAMPLE_ASCII_CHAR, ASCII_LAST_CHAR,
    "\x80", EXAMPLE_UCS1_CHAR, UCS1_LAST_CHAR,
# ...
)
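For concreteness, one way the full version of that suggestion could look, spelled out with the boundary constants the patch already defines (a sketch of the reviewer's idea only, not code from the PR; EXAMPLE_CHARS is the reviewer's proposed name):

# Test characters spanning the full range (ASCII, Latin-1, rest of BMP, astral).
EXAMPLE_CHARS = (
    "\x00", ASCII_CHAR, ASCII_LAST_CHAR,        # U+0000..U+007F
    "\x80", UCS1_CHAR, UCS1_LAST_CHAR,          # U+0080..U+00FF
    "\u0100", UCS2_CHAR, UCS2_LAST_CHAR,        # U+0100..U+FFFF
    "\U00010000", UCS4_CHAR, UCS4_LAST_CHAR,    # U+10000..sys.maxunicode
)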



# Error handling (bad decoder return)
def search_function(encoding):
def decode1(input, errors="strict"):
@@ -78,7 +109,7 @@ def test_literals(self):
self.assertEqual('\uffff', '\U0000ffff')
self.assertRaises(SyntaxError, eval, '\'\\Ufffffffe\'')
self.assertRaises(SyntaxError, eval, '\'\\Uffffffff\'')
self.assertRaises(SyntaxError, eval, '\'\\U%08x\'' % 0x110000)
self.assertRaises(SyntaxError, eval, '\'\\U%08x\'' % INVALID_CODEPOINT)
# raw strings should not have unicode escapes
self.assertNotEqual(r"\u0020", " ")

@@ -341,12 +372,13 @@ def test_maketrans_translate(self):
"[]")
self.assertEqual("[a]".translate(str.maketrans({'a': 'XXX'})),
"[XXX]")
self.assertEqual("[a]".translate(str.maketrans({'a': '\xe9'})),
"[\xe9]")
self.assertEqual('axb'.translate(str.maketrans({'a': None, 'b': '123'})),
"x123")
self.assertEqual('axb'.translate(str.maketrans({'a': None, 'b': '\xe9'})),
"x\xe9")
for ch in UCS1_ONLY_CHARS:
Contributor

Is this really specific to characters up to U+00FF? If not, better to just use the full list FULL_UCS_CHARS.
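As an aside on that question: str.maketrans and str.translate accept replacement characters from anywhere in the Unicode range, not just Latin-1, so the loop could cover the full list. A minimal check, independent of the patch:

# A non-Latin-1 (astral) replacement works just as well as a Latin-1 one.
table = str.maketrans({"a": "\U0001F355"})   # U+1F355, slice of pizza
assert "[a]".translate(table) == "[\U0001F355]"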

self.assertEqual("[a]".translate(str.maketrans({'a': ch})),
f"[{ch}]")
self.assertEqual('axb'.translate(str.maketrans({'a': None, 'b': ch})),
f"x{ch}")

# test non-ASCII (don't take the fast-path)
self.assertEqual("[a]".translate(str.maketrans({'a': '<\xe9>'})),
@@ -361,9 +393,8 @@ def test_maketrans_translate(self):
"[<\u20ac>\xe9]")

# invalid Unicode characters
invalid_char = 0x10ffff+1
for before in "a\xe9\u20ac\U0010ffff":
mapping = str.maketrans({before: invalid_char})
for before in FULL_UCS_CHARS:
Contributor

This reads to me like "all characters in UCS" or "in Unicode". In fact I assumed it meant that (here and in similar loops below), and was a bit surprised you were making all these loops so comprehensive 🙂, before I scrolled back up to the top and read their definitions.

How about putting something like "EXAMPLE" in the names? That's what they really are -- arbitrary lists of example characters for testing.

mapping = str.maketrans({before: INVALID_CODEPOINT})
text = "[%s]" % before
self.assertRaises(ValueError, text.translate, mapping)

@@ -642,8 +673,9 @@ def test_isalpha(self):

def test_isascii(self):
super().test_isascii()
self.assertFalse("\u20ac".isascii())
self.assertFalse("\U0010ffff".isascii())
for ch in FULL_UCS_CHARS:
self.assertEqual(ch.isascii(), ord(ch) < 0x80,
hex(ord(ch)))

def test_isdecimal(self):
self.checkequalnofix(False, '', 'isdecimal')
@@ -861,12 +893,15 @@ def test_swapcase(self):

def test_center(self):
string_tests.CommonTest.test_center(self)
self.assertEqual('x'.center(2, '\U0010FFFF'),
'x\U0010FFFF')
self.assertEqual('x'.center(3, '\U0010FFFF'),
'\U0010FFFFx\U0010FFFF')
self.assertEqual('x'.center(4, '\U0010FFFF'),
'\U0010FFFFx\U0010FFFF\U0010FFFF')
for ch1 in FULL_UCS_CHARS:
for ch2 in FULL_UCS_CHARS:
with self.subTest(ch1=ch1, ch2=ch2):
self.assertEqual(ch1.center(2, ch2),
ch1 + ch2)
self.assertEqual(ch1.center(3, ch2),
ch2 + ch1 + ch2)
self.assertEqual(ch1.center(4, ch2),
ch2 + ch1 + ch2 + ch2)

@unittest.skipUnless(sys.maxsize == 2**31 - 1, "requires 32-bit system")
@support.cpython_only
@@ -1355,7 +1390,7 @@ def test_formatting(self):

self.assertEqual('%c' % 0x1234, '\u1234')
self.assertEqual('%c' % 0x21483, '\U00021483')
self.assertRaises(OverflowError, "%c".__mod__, (0x110000,))
self.assertRaises(OverflowError, "%c".__mod__, (INVALID_CODEPOINT,))
Contributor

FIRST_INVALID_CODEPOINT would I think capture the intent more precisely. (This is similar to a remark @serhiy-storchaka made elsewhere.)

self.assertEqual('%c' % '\U00021483', '\U00021483')
self.assertRaises(TypeError, "%c".__mod__, "aa")
self.assertRaises(ValueError, "%.1\u1032f".__mod__, (1.0/3))
@@ -1392,8 +1427,10 @@ def __str__(self):
self.assertEqual('%F' % INF, 'INF')

# PEP 393
self.assertEqual('%.1s' % "a\xe9\u20ac", 'a')
self.assertEqual('%.2s' % "a\xe9\u20ac", 'a\xe9')
text = ''.join(FULL_UCS_CHARS)
for length in range(len(text)):
fmt = f'%.{length}s'
self.assertEqual(fmt % text, text[:length])

#issue 19995
class PseudoInt:
@@ -1503,7 +1540,7 @@ def test_constructor(self):
'unicode remains unicode'
)

for text in ('ascii', '\xe9', '\u20ac', '\U0010FFFF'):
for text in FULL_UCS_CHARS:
subclass = StrSubclass(text)
self.assertEqual(str(subclass), text)
self.assertEqual(len(subclass), len(text))
@@ -2131,7 +2168,7 @@ def test_codecs(self):
# UTF-8 must be roundtrip safe for all code points
# (except surrogates, which are forbidden).
u = ''.join(map(chr, list(range(0, 0xd800)) +
list(range(0xe000, 0x110000))))
list(range(0xe000, sys.maxunicode + 1))))
for encoding in ('utf-8',):
self.assertEqual(str(u.encode(encoding),encoding), u)

@@ -2221,6 +2258,7 @@ def test_ucs4(self):
y = br'\U00100000'
x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
self.assertEqual(x, y)

y = br'\U00010000'
x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
self.assertEqual(x, y)
@@ -2297,7 +2335,7 @@ def test_raiseMemError(self):
ascii_struct_size = 24
compact_struct_size = 36

for char in ('a', '\xe9', '\u20ac', '\U0010ffff'):
for char in FULL_UCS_CHARS:
code = ord(char)
if code < 0x100:
char_size = 1 # sizeof(Py_UCS1)
@@ -2363,19 +2401,7 @@ def test_resize(self):
def test_compare(self):
# Issue #17615
N = 10
ascii = 'a' * N
ascii2 = 'z' * N
latin = '\x80' * N
latin2 = '\xff' * N
bmp = '\u0100' * N
bmp2 = '\uffff' * N
astral = '\U00100000' * N
astral2 = '\U0010ffff' * N
strings = (
ascii, ascii2,
latin, latin2,
bmp, bmp2,
astral, astral2)
strings = tuple(ch * N for ch in FULL_UCS_CHARS)
for text1, text2 in itertools.combinations(strings, 2):
equal = (text1 is text2)
self.assertEqual(text1 == text2, equal)
@@ -2398,6 +2424,15 @@ def test_compare(self):
self.assertTrue(copy1 <= copy2)
self.assertTrue(copy2 >= copy2)

ascii = ASCII_CHAR * N
ascii2 = ASCII_LAST_CHAR * N
latin = UCS1_CHAR * N
latin2 = UCS1_LAST_CHAR * N
bmp = UCS2_CHAR * N
bmp2 = UCS2_LAST_CHAR * N
astral = UCS4_CHAR * N
astral2 = UCS4_LAST_CHAR * N

self.assertTrue(ascii < ascii2)
self.assertTrue(ascii < latin)
self.assertTrue(ascii < bmp)
@@ -2536,10 +2571,10 @@ def check_format(expected, format, *args):
# test "%c"
check_format('\uabcd',
b'%c', c_int(0xabcd))
check_format('\U0010ffff',
b'%c', c_int(0x10ffff))
check_format(chr(sys.maxunicode),
b'%c', c_int(sys.maxunicode))
with self.assertRaises(OverflowError):
PyUnicode_FromFormat(b'%c', c_int(0x110000))
PyUnicode_FromFormat(b'%c', c_int(INVALID_CODEPOINT))
# Issue #18183
check_format('\U00010000\U00100000',
b'%c%c', c_int(0x10000), c_int(0x100000))
@@ -2706,8 +2741,9 @@ def check_format(expected, format, *args):
b'%100.80x', c_int(0x123))

# test %A
check_format(r"%A:'abc\xe9\uabcd\U0010ffff'",
b'%%A:%A', 'abc\xe9\uabcd\U0010ffff')
check_format(r"%%A:'abc\x%02x\u%04x\U%08x'"
% (ord(UCS1_CHAR), ord(UCS2_CHAR), ord(UCS4_CHAR)),
b'%%A:%A', 'abc' + UCS1_CHAR + UCS2_CHAR + UCS4_CHAR)

# test %V
check_format('repr=abc',
@@ -2767,7 +2803,7 @@ def test_aswidechar(self):
self.assertEqual(size, 7)
self.assertEqual(wchar, 'abc\0def\0')

nonbmp = chr(0x10ffff)
nonbmp = UCS4_CHAR
if sizeof(c_wchar) == 2:
buflen = 3
nchar = 2
Expand All @@ -2793,7 +2829,7 @@ def test_aswidecharstring(self):
self.assertEqual(size, 7)
self.assertEqual(wchar, 'abc\0def\0')

nonbmp = chr(0x10ffff)
nonbmp = UCS4_CHAR
if sizeof(c_wchar) == 2:
nchar = 2
else: # sizeof(c_wchar) == 4
@@ -2858,8 +2894,8 @@ def test_findchar(self):
self.assertEqual(unicode_findchar(str, ord(ch), 0, len(str), -1), i)

str = "!>_<!"
self.assertEqual(unicode_findchar(str, 0x110000, 0, len(str), 1), -1)
self.assertEqual(unicode_findchar(str, 0x110000, 0, len(str), -1), -1)
self.assertEqual(unicode_findchar(str, INVALID_CODEPOINT, 0, len(str), 1), -1)
self.assertEqual(unicode_findchar(str, INVALID_CODEPOINT, 0, len(str), -1), -1)
# start < end
self.assertEqual(unicode_findchar(str, ord('!'), 1, len(str)+1, 1), 4)
self.assertEqual(unicode_findchar(str, ord('!'), 1, len(str)+1, -1), 4)
4 changes: 2 additions & 2 deletions Objects/stringlib/codecs.h
@@ -220,10 +220,10 @@ STRINGLIB(utf8_decode)(const char **inptr, const char *end,
}
ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 -
((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80);
assert ((ch > 0xFFFF) && (ch <= 0x10FFFF));
assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
Member

Note that the current UTF-8 codec would not work for MAX_UNICODE > 0x001FFFFF.

Member Author

That's the UTF-8 decoder: it has to ensure that it doesn't produce a code point not supported by Python. (ch <= MAX_UNICODE) is the correct test here.
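For reference, the arithmetic behind both remarks above (a standalone sketch, not part of the patch): a 4-byte UTF-8 sequence carries 3 + 6 + 6 + 6 = 21 payload bits, so it tops out at 0x1FFFFF; MAX_UNICODE (0x10FFFF) sits below that, which is why the ch <= MAX_UNICODE comparison is the binding check in the decoder.

# Largest value a 4-byte UTF-8 sequence can encode: 21 payload bits.
FOUR_BYTE_MAX = (0x07 << 18) | (0x3F << 12) | (0x3F << 6) | 0x3F
assert FOUR_BYTE_MAX == 0x1FFFFF   # hence the "would not work for > 0x001FFFFF" remark
assert 0x10FFFF <= FOUR_BYTE_MAX   # MAX_UNICODE fits, so `ch <= MAX_UNICODE` is the decisive test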

s += 4;
if (STRINGLIB_MAX_CHAR <= 0xFFFF ||
(STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR))
(STRINGLIB_MAX_CHAR < MAX_UNICODE && ch > STRINGLIB_MAX_CHAR))
/* Out-of-range */
goto Return;
*p++ = ch;
2 changes: 1 addition & 1 deletion Objects/stringlib/find_max_char.h
@@ -54,7 +54,7 @@ STRINGLIB(find_max_char)(const STRINGLIB_CHAR *begin, const STRINGLIB_CHAR *end)
#define MAX_CHAR_ASCII 0x7f
#define MAX_CHAR_UCS1 0xff
#define MAX_CHAR_UCS2 0xffff
#define MAX_CHAR_UCS4 0x10ffff
#define MAX_CHAR_UCS4 MAX_UNICODE

Py_LOCAL_INLINE(Py_UCS4)
STRINGLIB(find_max_char)(const STRINGLIB_CHAR *begin, const STRINGLIB_CHAR *end)
19 changes: 10 additions & 9 deletions Objects/unicodeobject.c
@@ -90,8 +90,9 @@ NOTE: In the interpreter's initialization phase, some globals are currently
extern "C" {
#endif

/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
/* Maximum code point of Unicode 12.0: 0x10ffff (1,114,111) */
#define MAX_UNICODE 0x10ffff
#define MAX_UNICODE_RANGE "range(0x110000)"

#ifdef Py_DEBUG
# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
@@ -469,13 +470,13 @@ unicode_check_encoding_errors(const char *encoding, const char *errors)
}


/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
/* The max unicode value is always MAX_UNICODE while using the PEP-393 API.
This function is kept for backward compatibility with the old API. */
Py_UNICODE
PyUnicode_GetMax(void)
{
#ifdef Py_UNICODE_WIDE
return 0x10FFFF;
return MAX_UNICODE;
#else
/* This is actually an illegal character, so it should
not be passed to unichr. */
@@ -2771,7 +2772,7 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
int ordinal = va_arg(*vargs, int);
if (ordinal < 0 || ordinal > MAX_UNICODE) {
PyErr_SetString(PyExc_OverflowError,
"character argument not in range(0x110000)");
"character argument not in " MAX_UNICODE_RANGE);
return NULL;
}
if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
@@ -3209,7 +3210,7 @@ PyUnicode_FromOrdinal(int ordinal)
{
if (ordinal < 0 || ordinal > MAX_UNICODE) {
PyErr_SetString(PyExc_ValueError,
"chr() arg not in range(0x110000)");
"chr() arg not in " MAX_UNICODE_RANGE);
return NULL;
}

@@ -5562,13 +5563,13 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
endinpos = ((const char *)e) - starts;
}
else {
if (ch < 0x110000) {
if (ch <= MAX_UNICODE) {
if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
goto onError;
q += 4;
continue;
}
errmsg = "code point not in range(0x110000)";
errmsg = "code point not in " MAX_UNICODE_RANGE;
startinpos = ((const char *)q) - starts;
endinpos = startinpos + 4;
}
@@ -13677,7 +13678,7 @@ _PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
{
case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
case PyUnicode_4BYTE_KIND: maxchar = MAX_UNICODE; break;
default:
Py_UNREACHABLE();
}
@@ -14496,7 +14497,7 @@ formatchar(PyObject *v)

if (x < 0 || x > MAX_UNICODE) {
PyErr_SetString(PyExc_OverflowError,
"%c arg not in range(0x110000)");
"%c arg not in " MAX_UNICODE_RANGE);
return (Py_UCS4) -1;
}
