Use sys.maxunicode and MAX_UNICODE constants #15067
@@ -17,6 +17,37 @@
 from test import support, string_tests
 from test.support.script_helper import assert_python_failure

+# First invalid code point past the valid range
+INVALID_CODEPOINT = sys.maxunicode + 1
+
+# Characters in the ASCII range (U+0000-U+007f)
+ASCII_CHAR = "a"
+ASCII_LAST_CHAR = "\x7f"
+# Characters in the UCS1 ("latin1") range (U+0000-U+00ff),
+# but not in the ASCII range
+UCS1_CHAR = "\xe9"
+UCS1_LAST_CHAR = "\xff"
+# Characters in UCS2 ("BMP") range (U+0000-U+FFFF),
+# but not in the UCS1 range
+UCS2_CHAR = "\u20ac"  # euro sign
+UCS2_LAST_CHAR = "\uffff"
+# Characters in UCS4 ("Astral") range (U+0000-U+FFFF),
+# but not in the UCS2 range
Review comment: "Astral" means specifically "not the BMP", so this isn't quite right. (Hm, also it should say "U+10FFFF".) Perhaps just ...
+UCS4_CHAR = '\U0001F355'  # slice of pizza
+UCS4_LAST_CHAR = chr(sys.maxunicode)
+
+# Test characters of the ASCII range (U+0000-U+007f)
+ASCII_CHARS = ("\x00", ASCII_CHAR, ASCII_LAST_CHAR)
+# Test characters of the UCS1 range (U+0000-U+00ff), but not in ASCII range
+UCS1_ONLY_CHARS = ("\x80", UCS1_CHAR, UCS1_LAST_CHAR)
+# Test characters of the UCS1 range (U+0000-U+00ff)
+UCS1_CHARS = ASCII_CHARS + UCS1_ONLY_CHARS
+# Test characters of the Basic Multilingual Plane (U+0000-U+ffff)
+BMP_CHARS = UCS1_CHARS + ("\u0100", UCS2_CHAR, UCS2_LAST_CHAR)
+# Test characters of the full Unicode Character Set (U+0000-U+10ffff)
+FULL_UCS_CHARS = BMP_CHARS + ('\U00010000', UCS4_CHAR, UCS4_LAST_CHAR)
Review comment: It looks like you don't end up using most of these sequences (except to define the next ones) -- mostly just the all-inclusive one. Which makes sense -- for basically all of the tests that use these sequences, the desired behavior is something like "it works correctly for every code point", and Python code shouldn't have to really know or care whether it fits in the BMP or ASCII or whatever. In that case I think it would be clearest to just write one sequence. Combined with my naming suggestion below, that might look like:
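A minimal sketch of that single sequence (the name EXAMPLE_CHARACTERS and the grouping comments are assumed here, not taken verbatim from the review):

    import sys

    # Arbitrary example characters for testing, one group per
    # PEP 393 representation range (hypothetical sketch).
    EXAMPLE_CHARACTERS = (
        "\x00", "a", "\x7f",                              # ASCII
        "\x80", "\xe9", "\xff",                           # Latin-1, non-ASCII
        "\u0100", "\u20ac", "\uffff",                     # BMP, non-Latin-1
        "\U00010000", "\U0001F355", chr(sys.maxunicode),  # astral
    )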
+
+
 # Error handling (bad decoder return)
 def search_function(encoding):
     def decode1(input, errors="strict"):
@@ -78,7 +109,7 @@ def test_literals(self):
         self.assertEqual('\uffff', '\U0000ffff')
         self.assertRaises(SyntaxError, eval, '\'\\Ufffffffe\'')
         self.assertRaises(SyntaxError, eval, '\'\\Uffffffff\'')
-        self.assertRaises(SyntaxError, eval, '\'\\U%08x\'' % 0x110000)
+        self.assertRaises(SyntaxError, eval, '\'\\U%08x\'' % INVALID_CODEPOINT)
vstinner marked this conversation as resolved.

         # raw strings should not have unicode escapes
         self.assertNotEqual(r"\u0020", " ")
@@ -341,12 +372,13 @@ def test_maketrans_translate(self):
                          "[]")
         self.assertEqual("[a]".translate(str.maketrans({'a': 'XXX'})),
                          "[XXX]")
-        self.assertEqual("[a]".translate(str.maketrans({'a': '\xe9'})),
-                         "[\xe9]")
         self.assertEqual('axb'.translate(str.maketrans({'a': None, 'b': '123'})),
                          "x123")
-        self.assertEqual('axb'.translate(str.maketrans({'a': None, 'b': '\xe9'})),
-                         "x\xe9")
+        for ch in UCS1_ONLY_CHARS:
Review comment: Is this really specific to characters up to U+00FF? If not, better to just use the full list.
+            self.assertEqual("[a]".translate(str.maketrans({'a': ch})),
+                             f"[{ch}]")
+            self.assertEqual('axb'.translate(str.maketrans({'a': None, 'b': ch})),
+                             f"x{ch}")

         # test non-ASCII (don't take the fast-path)
         self.assertEqual("[a]".translate(str.maketrans({'a': '<\xe9>'})),
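On the question above about whether this is specific to characters up to U+00FF: a quick standalone check (illustrative only, not part of the patch) shows that translate() also accepts replacement characters outside Latin-1, so the loop could just as well run over the full example list.

    # Illustrative check, not part of the patch: translate() also accepts
    # an astral (non-BMP) replacement character.
    pizza = '\U0001F355'
    assert "[a]".translate(str.maketrans({'a': pizza})) == f"[{pizza}]"
    assert 'axb'.translate(str.maketrans({'a': None, 'b': pizza})) == f"x{pizza}"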
@@ -361,9 +393,8 @@ def test_maketrans_translate(self):
                          "[<\u20ac>\xe9]")

         # invalid Unicode characters
-        invalid_char = 0x10ffff+1
-        for before in "a\xe9\u20ac\U0010ffff":
-            mapping = str.maketrans({before: invalid_char})
+        for before in FULL_UCS_CHARS:
Review comment: This reads to me like "all characters in UCS" or "in Unicode". In fact I assumed it meant that (here and in similar loops below), and was a bit surprised you were making all these loops so comprehensive 🙂, before I scrolled back up to the top and read their definitions. How about putting something like "EXAMPLE" in the names? That's what they really are -- arbitrary lists of example characters for testing.
+            mapping = str.maketrans({before: INVALID_CODEPOINT})
             text = "[%s]" % before
             self.assertRaises(ValueError, text.translate, mapping)
@@ -642,8 +673,9 @@ def test_isalpha(self):

     def test_isascii(self):
         super().test_isascii()
-        self.assertFalse("\u20ac".isascii())
-        self.assertFalse("\U0010ffff".isascii())
+        for ch in FULL_UCS_CHARS:
+            self.assertEqual(ch.isascii(), ord(ch) < 0x80,
+                             hex(ord(ch)))

     def test_isdecimal(self):
         self.checkequalnofix(False, '', 'isdecimal')
@@ -861,12 +893,15 @@ def test_swapcase(self):

     def test_center(self):
         string_tests.CommonTest.test_center(self)
-        self.assertEqual('x'.center(2, '\U0010FFFF'),
-                         'x\U0010FFFF')
-        self.assertEqual('x'.center(3, '\U0010FFFF'),
-                         '\U0010FFFFx\U0010FFFF')
-        self.assertEqual('x'.center(4, '\U0010FFFF'),
-                         '\U0010FFFFx\U0010FFFF\U0010FFFF')
+        for ch1 in FULL_UCS_CHARS:
+            for ch2 in FULL_UCS_CHARS:
+                with self.subTest(ch1=ch1, ch2=ch2):
+                    self.assertEqual(ch1.center(2, ch2),
+                                     ch1 + ch2)
+                    self.assertEqual(ch1.center(3, ch2),
+                                     ch2 + ch1 + ch2)
+                    self.assertEqual(ch1.center(4, ch2),
+                                     ch2 + ch1 + ch2 + ch2)

     @unittest.skipUnless(sys.maxsize == 2**31 - 1, "requires 32-bit system")
     @support.cpython_only
@@ -1355,7 +1390,7 @@ def test_formatting(self):

         self.assertEqual('%c' % 0x1234, '\u1234')
         self.assertEqual('%c' % 0x21483, '\U00021483')
-        self.assertRaises(OverflowError, "%c".__mod__, (0x110000,))
+        self.assertRaises(OverflowError, "%c".__mod__, (INVALID_CODEPOINT,))
         self.assertEqual('%c' % '\U00021483', '\U00021483')
         self.assertRaises(TypeError, "%c".__mod__, "aa")
         self.assertRaises(ValueError, "%.1\u1032f".__mod__, (1.0/3))
@@ -1392,8 +1427,10 @@ def __str__(self):
         self.assertEqual('%F' % INF, 'INF')

         # PEP 393
-        self.assertEqual('%.1s' % "a\xe9\u20ac", 'a')
-        self.assertEqual('%.2s' % "a\xe9\u20ac", 'a\xe9')
+        text = ''.join(FULL_UCS_CHARS)
+        for length in range(len(text)):
+            fmt = f'%.{length}s'
+            self.assertEqual(fmt % text, text[:length])

         #issue 19995
         class PseudoInt:
@@ -1503,7 +1540,7 @@ def test_constructor(self):
             'unicode remains unicode'
         )

-        for text in ('ascii', '\xe9', '\u20ac', '\U0010FFFF'):
+        for text in FULL_UCS_CHARS:
             subclass = StrSubclass(text)
             self.assertEqual(str(subclass), text)
             self.assertEqual(len(subclass), len(text))
@@ -2131,7 +2168,7 @@ def test_codecs(self):
         # UTF-8 must be roundtrip safe for all code points
         # (except surrogates, which are forbidden).
         u = ''.join(map(chr, list(range(0, 0xd800)) +
-                             list(range(0xe000, 0x110000))))
+                             list(range(0xe000, sys.maxunicode + 1))))
         for encoding in ('utf-8',):
             self.assertEqual(str(u.encode(encoding),encoding), u)
@@ -2221,6 +2258,7 @@ def test_ucs4(self):
         y = br'\U00100000'
         x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
         self.assertEqual(x, y)
+
         y = br'\U00010000'
         x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
         self.assertEqual(x, y)
@@ -2297,7 +2335,7 @@ def test_raiseMemError(self):
             ascii_struct_size = 24
             compact_struct_size = 36

-        for char in ('a', '\xe9', '\u20ac', '\U0010ffff'):
+        for char in FULL_UCS_CHARS:
             code = ord(char)
             if code < 0x100:
                 char_size = 1 # sizeof(Py_UCS1)
@@ -2363,19 +2401,7 @@ def test_resize(self):
     def test_compare(self):
         # Issue #17615
         N = 10
-        ascii = 'a' * N
-        ascii2 = 'z' * N
-        latin = '\x80' * N
-        latin2 = '\xff' * N
-        bmp = '\u0100' * N
-        bmp2 = '\uffff' * N
-        astral = '\U00100000' * N
-        astral2 = '\U0010ffff' * N
-        strings = (
-            ascii, ascii2,
-            latin, latin2,
-            bmp, bmp2,
-            astral, astral2)
+        strings = tuple(ch * N for ch in FULL_UCS_CHARS)
         for text1, text2 in itertools.combinations(strings, 2):
             equal = (text1 is text2)
             self.assertEqual(text1 == text2, equal)
@@ -2398,6 +2424,15 @@ def test_compare(self):
         self.assertTrue(copy1 <= copy2)
         self.assertTrue(copy2 >= copy2)

+        ascii = ASCII_CHAR * N
+        ascii2 = ASCII_LAST_CHAR * N
+        latin = UCS1_CHAR * N
+        latin2 = UCS1_LAST_CHAR * N
+        bmp = UCS2_CHAR * N
+        bmp2 = UCS2_LAST_CHAR * N
+        astral = UCS4_CHAR * N
+        astral2 = UCS4_LAST_CHAR * N
+
         self.assertTrue(ascii < ascii2)
         self.assertTrue(ascii < latin)
         self.assertTrue(ascii < bmp)
@@ -2536,10 +2571,10 @@ def check_format(expected, format, *args):
         # test "%c"
         check_format('\uabcd',
                      b'%c', c_int(0xabcd))
-        check_format('\U0010ffff',
-                     b'%c', c_int(0x10ffff))
+        check_format(chr(sys.maxunicode),
+                     b'%c', c_int(sys.maxunicode))
         with self.assertRaises(OverflowError):
-            PyUnicode_FromFormat(b'%c', c_int(0x110000))
+            PyUnicode_FromFormat(b'%c', c_int(INVALID_CODEPOINT))
         # Issue #18183
         check_format('\U00010000\U00100000',
                      b'%c%c', c_int(0x10000), c_int(0x100000))
@@ -2706,8 +2741,9 @@ def check_format(expected, format, *args):
                      b'%100.80x', c_int(0x123))

         # test %A
-        check_format(r"%A:'abc\xe9\uabcd\U0010ffff'",
-                     b'%%A:%A', 'abc\xe9\uabcd\U0010ffff')
+        check_format(r"%%A:'abc\x%02x\u%04x\U%08x'"
+                     % (ord(UCS1_CHAR), ord(UCS2_CHAR), ord(UCS4_CHAR)),
+                     b'%%A:%A', 'abc' + UCS1_CHAR + UCS2_CHAR + UCS4_CHAR)

         # test %V
         check_format('repr=abc',
@@ -2767,7 +2803,7 @@ def test_aswidechar(self):
         self.assertEqual(size, 7)
         self.assertEqual(wchar, 'abc\0def\0')

-        nonbmp = chr(0x10ffff)
+        nonbmp = UCS4_CHAR
         if sizeof(c_wchar) == 2:
             buflen = 3
             nchar = 2
@@ -2793,7 +2829,7 @@ def test_aswidecharstring(self):
         self.assertEqual(size, 7)
         self.assertEqual(wchar, 'abc\0def\0')

-        nonbmp = chr(0x10ffff)
+        nonbmp = UCS4_CHAR
         if sizeof(c_wchar) == 2:
             nchar = 2
         else: # sizeof(c_wchar) == 4
@@ -2858,8 +2894,8 @@ def test_findchar(self):
             self.assertEqual(unicode_findchar(str, ord(ch), 0, len(str), -1), i)

         str = "!>_<!"
-        self.assertEqual(unicode_findchar(str, 0x110000, 0, len(str), 1), -1)
-        self.assertEqual(unicode_findchar(str, 0x110000, 0, len(str), -1), -1)
+        self.assertEqual(unicode_findchar(str, INVALID_CODEPOINT, 0, len(str), 1), -1)
+        self.assertEqual(unicode_findchar(str, INVALID_CODEPOINT, 0, len(str), -1), -1)
         # start < end
         self.assertEqual(unicode_findchar(str, ord('!'), 1, len(str)+1, 1), 4)
         self.assertEqual(unicode_findchar(str, ord('!'), 1, len(str)+1, -1), 4)
@@ -220,10 +220,10 @@ STRINGLIB(utf8_decode)(const char **inptr, const char *end,
         }
         ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 -
              ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80);
-        assert ((ch > 0xFFFF) && (ch <= 0x10FFFF));
+        assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
Review comment: Note that the current UTF-8 codec would not work for MAX_UNICODE > 0x001FFFFF.

Review comment: That's the UTF-8 decoder: it has to ensure that it doesn't produce a code point not supported by Python. (ch <= MAX_UNICODE) is the correct test here.
         s += 4;
         if (STRINGLIB_MAX_CHAR <= 0xFFFF ||
-            (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR))
+            (STRINGLIB_MAX_CHAR < MAX_UNICODE && ch > STRINGLIB_MAX_CHAR))
             /* Out-of-range */
             goto Return;
         *p++ = ch;
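The two comments above concern the range check in the 4-byte branch of the decoder. A small standalone sketch (plain Python rather than the C in this file, purely illustrative) replays the same arithmetic and shows why the check matters: a 4-byte sequence can in principle carry values up to 0x1FFFFF, so the result still has to be compared against MAX_UNICODE (0x10FFFF).

    # Replays the 4-byte UTF-8 arithmetic from the C decoder in Python.
    # Hypothetical illustration; not part of the patch.
    def decode_4byte(b1, b2, b3, b4):
        ch = (b1 << 18) + (b2 << 12) + (b3 << 6) + b4
        return ch - ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80)

    MAX_UNICODE = 0x10FFFF

    # The largest well-formed 4-byte sequence, F4 8F BF BF, decodes to U+10FFFF.
    assert decode_4byte(0xF4, 0x8F, 0xBF, 0xBF) == MAX_UNICODE
    # The raw arithmetic can go up to 0x1FFFFF (lead byte F7), which is why the
    # decoder still needs the `ch <= MAX_UNICODE` check after combining bytes.
    assert decode_4byte(0xF7, 0xBF, 0xBF, 0xBF) == 0x1FFFFF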
Review comment: In the normative Unicode jargon, UCS-2 isn't a set of characters -- it's an encoding. This range of characters (*) is called the BMP ("Basic Multilingual Plane"). UCS-2 is a partial encoding, which only works for the BMP.
See https://unicode.org/glossary/
This is all really about characters (or code points), not encodings. So I think the talk of UCS-2 is confusing here -- it's best to just say "BMP", which is the proper name of this range of characters.
(*) in fact of code points, but that doesn't matter here
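To make that distinction concrete (an illustrative snippet, not part of the patch): a BMP code point fits in a single UTF-16 code unit, while a code point outside the BMP ("astral") needs a surrogate pair -- exactly the case a fixed-width UCS-2 encoding cannot express.

    import sys

    BMP_MAX = 0xFFFF
    assert sys.maxunicode == 0x10FFFF

    # A BMP character occupies one UTF-16 code unit (2 bytes)...
    assert len("\u20ac".encode("utf-16-be")) == 2
    # ...while an astral character needs a surrogate pair (4 bytes),
    # which plain UCS-2 cannot represent.
    assert ord("\U0001F355") > BMP_MAX
    assert len("\U0001F355".encode("utf-16-be")) == 4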