From 8ae6408a7719604e1eb57b8ee0fcfdba94d3ae1b Mon Sep 17 00:00:00 2001 From: Hai Shi Date: Sun, 13 Sep 2020 02:03:17 +0800 Subject: [PATCH 01/11] encodings.normalize_encoding() should ignore non-ASCII letters --- Lib/encodings/__init__.py | 3 ++- Lib/test/test_source_encoding.py | 13 +++++++++++++ .../2020-09-13-02-02-18.bpo-39337.L3NXTt.rst | 2 ++ 3 files changed, 17 insertions(+), 1 deletion(-) create mode 100644 Misc/NEWS.d/next/Library/2020-09-13-02-02-18.bpo-39337.L3NXTt.rst diff --git a/Lib/encodings/__init__.py b/Lib/encodings/__init__.py index ddd5afdcf2dab0..4b37d3321c9033 100644 --- a/Lib/encodings/__init__.py +++ b/Lib/encodings/__init__.py @@ -61,7 +61,8 @@ def normalize_encoding(encoding): if c.isalnum() or c == '.': if punct and chars: chars.append('_') - chars.append(c) + if c.isascii(): + chars.append(c) punct = False else: punct = True diff --git a/Lib/test/test_source_encoding.py b/Lib/test/test_source_encoding.py index b410c03221bf32..eb24176c0a259b 100644 --- a/Lib/test/test_source_encoding.py +++ b/Lib/test/test_source_encoding.py @@ -226,5 +226,18 @@ def check_script_output(self, src, expected): self.assertEqual(res.out.rstrip(), expected) +class EncodingsTest(unittest.TestCase): + + def test_bpo39337(self): + """ + bpo-39337: similar to _Py_normalize_encoding(), + encodings.normalize_encoding() should ignore non-ASCII letters . + """ + import encodings + + out = encodings.normalize_encoding("���-8") + self.assertEqual(out, '8') + + if __name__ == "__main__": unittest.main() diff --git a/Misc/NEWS.d/next/Library/2020-09-13-02-02-18.bpo-39337.L3NXTt.rst b/Misc/NEWS.d/next/Library/2020-09-13-02-02-18.bpo-39337.L3NXTt.rst new file mode 100644 index 00000000000000..f734592f23c851 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2020-09-13-02-02-18.bpo-39337.L3NXTt.rst @@ -0,0 +1,2 @@ +similar to :c:func:`_Py_normalize_encoding`, +:func:`encodings.normalize_encoding` should ignore non-ASCII letters. From 0fcafb883cd25b0f1511c88245e29617f33f48b5 Mon Sep 17 00:00:00 2001 From: Hai Shi Date: Sun, 13 Sep 2020 02:06:20 +0800 Subject: [PATCH 02/11] update test update test --- Lib/test/test_source_encoding.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Lib/test/test_source_encoding.py b/Lib/test/test_source_encoding.py index eb24176c0a259b..a18ac241d0041a 100644 --- a/Lib/test/test_source_encoding.py +++ b/Lib/test/test_source_encoding.py @@ -14,11 +14,11 @@ class MiscSourceEncodingTest(unittest.TestCase): def test_pep263(self): self.assertEqual( - "�����".encode("utf-8"), + "ðÉÔÏÎ".encode("utf-8"), b'\xd0\x9f\xd0\xb8\xd1\x82\xd0\xbe\xd0\xbd' ) self.assertEqual( - "\�".encode("utf-8"), + "\ð".encode("utf-8"), b'\\\xd0\x9f' ) @@ -235,7 +235,7 @@ def test_bpo39337(self): """ import encodings - out = encodings.normalize_encoding("���-8") + out = encodings.normalize_encoding("кои-8") self.assertEqual(out, '8') From dea24d84d6b171c1ffc379b300c6ee31436683d9 Mon Sep 17 00:00:00 2001 From: Hai Shi Date: Sun, 13 Sep 2020 02:23:37 +0800 Subject: [PATCH 03/11] Revert "update test" This reverts commit 0fcafb883cd25b0f1511c88245e29617f33f48b5. --- Lib/test/test_source_encoding.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Lib/test/test_source_encoding.py b/Lib/test/test_source_encoding.py index a18ac241d0041a..eb24176c0a259b 100644 --- a/Lib/test/test_source_encoding.py +++ b/Lib/test/test_source_encoding.py @@ -14,11 +14,11 @@ class MiscSourceEncodingTest(unittest.TestCase): def test_pep263(self): self.assertEqual( - "ðÉÔÏÎ".encode("utf-8"), + "�����".encode("utf-8"), b'\xd0\x9f\xd0\xb8\xd1\x82\xd0\xbe\xd0\xbd' ) self.assertEqual( - "\ð".encode("utf-8"), + "\�".encode("utf-8"), b'\\\xd0\x9f' ) @@ -235,7 +235,7 @@ def test_bpo39337(self): """ import encodings - out = encodings.normalize_encoding("кои-8") + out = encodings.normalize_encoding("���-8") self.assertEqual(out, '8') From dbb0062bd00f1dc9fd889eac8de1a7e43f5f4fce Mon Sep 17 00:00:00 2001 From: Hai Shi Date: Sat, 3 Oct 2020 13:41:17 +0800 Subject: [PATCH 04/11] apply victor's comment --- Lib/test/test_source_encoding.py | 4 ++-- .../next/Library/2020-09-13-02-02-18.bpo-39337.L3NXTt.rst | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/Lib/test/test_source_encoding.py b/Lib/test/test_source_encoding.py index eb24176c0a259b..59261718cfb1ff 100644 --- a/Lib/test/test_source_encoding.py +++ b/Lib/test/test_source_encoding.py @@ -235,8 +235,8 @@ def test_bpo39337(self): """ import encodings - out = encodings.normalize_encoding("���-8") - self.assertEqual(out, '8') + out = encodings.normalize_encoding("utf\xE9\u20AC\U0010ffff-8") + self.assertEqual(out, 'utf_8') if __name__ == "__main__": diff --git a/Misc/NEWS.d/next/Library/2020-09-13-02-02-18.bpo-39337.L3NXTt.rst b/Misc/NEWS.d/next/Library/2020-09-13-02-02-18.bpo-39337.L3NXTt.rst index f734592f23c851..ea432d47c2831e 100644 --- a/Misc/NEWS.d/next/Library/2020-09-13-02-02-18.bpo-39337.L3NXTt.rst +++ b/Misc/NEWS.d/next/Library/2020-09-13-02-02-18.bpo-39337.L3NXTt.rst @@ -1,2 +1 @@ -similar to :c:func:`_Py_normalize_encoding`, -:func:`encodings.normalize_encoding` should ignore non-ASCII letters. +:func:`encodings.normalize_encoding` now ignores non-ASCII letters. From 3fa221f25ebc82096f806506f4e93d8494f88c97 Mon Sep 17 00:00:00 2001 From: Hai Shi Date: Fri, 9 Oct 2020 12:34:04 +0800 Subject: [PATCH 05/11] apply victor's comment --- Lib/test/test_codecs.py | 17 +++++++++++++++++ Lib/test/test_source_encoding.py | 13 ------------- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index 3dd56820cd1078..13e075e4947f17 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -3403,5 +3403,22 @@ def test_rot13_func(self): 'To be, or not to be, that is the question') +class EncodingNormalizationTest(unittest.TestCase): + + def test_bpo39337(self): + """ + bpo-39337: similar to _Py_normalize_encoding(), + encodings.normalize_encoding() should ignore non-ASCII letters. + """ + import encodings + + out = encodings.normalize_encoding('utf\xE9\u20AC\U0010ffff-8') + self.assertEqual(out, 'utf_8') + out = encodings.normalize_encoding('utf_8') + self.assertEqual(out, 'utf_8') + out = encodings.normalize_encoding('utf 8') + self.assertEqual(out, 'utf_8') + + if __name__ == "__main__": unittest.main() diff --git a/Lib/test/test_source_encoding.py b/Lib/test/test_source_encoding.py index 59261718cfb1ff..b410c03221bf32 100644 --- a/Lib/test/test_source_encoding.py +++ b/Lib/test/test_source_encoding.py @@ -226,18 +226,5 @@ def check_script_output(self, src, expected): self.assertEqual(res.out.rstrip(), expected) -class EncodingsTest(unittest.TestCase): - - def test_bpo39337(self): - """ - bpo-39337: similar to _Py_normalize_encoding(), - encodings.normalize_encoding() should ignore non-ASCII letters . - """ - import encodings - - out = encodings.normalize_encoding("utf\xE9\u20AC\U0010ffff-8") - self.assertEqual(out, 'utf_8') - - if __name__ == "__main__": unittest.main() From 2e73d13ef9e7999ac7fe575c5eb380f41d59cb37 Mon Sep 17 00:00:00 2001 From: hai shi Date: Sat, 10 Oct 2020 08:34:08 +0800 Subject: [PATCH 06/11] apply victor's comments --- Lib/test/test_codecs.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index 40d2da7da7b104..d579f23c25d414 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -3442,19 +3442,18 @@ def search_function(encoding): class EncodingNormalizationTest(unittest.TestCase): - def test_bpo39337(self): - """ - bpo-39337: similar to _Py_normalize_encoding(), - encodings.normalize_encoding() should ignore non-ASCII letters. - """ - import encodings - + def test_normalization(self): + # encodings.normalize_encoding() ignores non-ASCII letters. out = encodings.normalize_encoding('utf\xE9\u20AC\U0010ffff-8') self.assertEqual(out, 'utf_8') out = encodings.normalize_encoding('utf_8') self.assertEqual(out, 'utf_8') out = encodings.normalize_encoding('utf 8') self.assertEqual(out, 'utf_8') + out = encodings.normalize_encoding('UTF 8') + self.assertEqual(out, 'UTF_8') + out = encodings.normalize_encoding('utf...8') + self.assertEqual(out, 'utf...8') if __name__ == "__main__": From 95c1d980d08200d622d04ed559c0449cb102c9ec Mon Sep 17 00:00:00 2001 From: Hai Shi Date: Mon, 12 Oct 2020 08:35:46 +0800 Subject: [PATCH 07/11] apply victor's comment --- Lib/test/test_codecs.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index d579f23c25d414..7497c6615d2cd0 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -3417,7 +3417,7 @@ def test_rot13_func(self): class CodecNameNormalizationTest(unittest.TestCase): """Test codec name normalization""" - def test_normalized_encoding(self): + def test_codecs_lookup(self): FOUND = (1, 2, 3, 4) NOT_FOUND = (None, None, None, None) def search_function(encoding): @@ -3439,19 +3439,20 @@ def search_function(encoding): self.assertEqual(NOT_FOUND, codecs.lookup('BBB.8')) self.assertEqual(NOT_FOUND, codecs.lookup('a\xe9\u20ac-8')) - -class EncodingNormalizationTest(unittest.TestCase): - - def test_normalization(self): + def test_encodings_normalize_encoding(self): # encodings.normalize_encoding() ignores non-ASCII letters. - out = encodings.normalize_encoding('utf\xE9\u20AC\U0010ffff-8') - self.assertEqual(out, 'utf_8') out = encodings.normalize_encoding('utf_8') self.assertEqual(out, 'utf_8') + out = encodings.normalize_encoding('utf\xE9\u20AC\U0010ffff-8') + self.assertEqual(out, 'utf_8') out = encodings.normalize_encoding('utf 8') self.assertEqual(out, 'utf_8') + # encodings.normalize_encoding() doesn't convert + # characters to lower case. out = encodings.normalize_encoding('UTF 8') self.assertEqual(out, 'UTF_8') + out = encodings.normalize_encoding('utf.8') + self.assertEqual(out, 'utf.8') out = encodings.normalize_encoding('utf...8') self.assertEqual(out, 'utf...8') From 03bfd9b9266b99e88c4018be6e819b8595ff19fc Mon Sep 17 00:00:00 2001 From: Hai Shi Date: Mon, 12 Oct 2020 22:17:32 +0800 Subject: [PATCH 08/11] apply victor's comment --- Lib/test/test_codecs.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index 7497c6615d2cd0..641ffbd9ee408c 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -3441,20 +3441,16 @@ def search_function(encoding): def test_encodings_normalize_encoding(self): # encodings.normalize_encoding() ignores non-ASCII letters. - out = encodings.normalize_encoding('utf_8') - self.assertEqual(out, 'utf_8') - out = encodings.normalize_encoding('utf\xE9\u20AC\U0010ffff-8') - self.assertEqual(out, 'utf_8') - out = encodings.normalize_encoding('utf 8') - self.assertEqual(out, 'utf_8') + self.assertEqual(encodings.normalize_encoding('utf_8'), 'utf_8') + self.assertEqual( + encodings.normalize_encoding('utf\xE9\u20AC\U0010ffff-8'), + 'utf_8') + self.assertEqual(encodings.normalize_encoding('utf 8'), 'utf_8') # encodings.normalize_encoding() doesn't convert # characters to lower case. - out = encodings.normalize_encoding('UTF 8') - self.assertEqual(out, 'UTF_8') - out = encodings.normalize_encoding('utf.8') - self.assertEqual(out, 'utf.8') - out = encodings.normalize_encoding('utf...8') - self.assertEqual(out, 'utf...8') + self.assertEqual(encodings.normalize_encoding('UTF 8'), 'UTF_8') + self.assertEqual(encodings.normalize_encoding('utf.8'), 'utf.8') + self.assertEqual(encodings.normalize_encoding('utf...8'), 'utf...8') if __name__ == "__main__": From 38f28bd5ad2623c78b7570fb71abe30b7a78c3ec Mon Sep 17 00:00:00 2001 From: Hai Shi Date: Tue, 13 Oct 2020 08:02:39 +0800 Subject: [PATCH 09/11] apply victor's comment --- Lib/test/test_codecs.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index 641ffbd9ee408c..011f5474e3ca5c 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -3441,16 +3441,15 @@ def search_function(encoding): def test_encodings_normalize_encoding(self): # encodings.normalize_encoding() ignores non-ASCII letters. - self.assertEqual(encodings.normalize_encoding('utf_8'), 'utf_8') - self.assertEqual( - encodings.normalize_encoding('utf\xE9\u20AC\U0010ffff-8'), - 'utf_8') - self.assertEqual(encodings.normalize_encoding('utf 8'), 'utf_8') + normalize = encodings.normalize_encoding + self.assertEqual(normalize('utf_8'), 'utf_8') + self.assertEqual(normalize('utf\xE9\u20AC\U0010ffff-8'), 'utf_8') + self.assertEqual(normalize('utf 8'), 'utf_8') # encodings.normalize_encoding() doesn't convert # characters to lower case. - self.assertEqual(encodings.normalize_encoding('UTF 8'), 'UTF_8') - self.assertEqual(encodings.normalize_encoding('utf.8'), 'utf.8') - self.assertEqual(encodings.normalize_encoding('utf...8'), 'utf...8') + self.assertEqual(normalize('UTF 8'), 'UTF_8') + self.assertEqual(normalize('utf.8'), 'utf.8') + self.assertEqual(normalize('utf...8'), 'utf...8') if __name__ == "__main__": From 5982784bf66fc8dbd8dac0f3882af5844496b824 Mon Sep 17 00:00:00 2001 From: Hai Shi Date: Tue, 13 Oct 2020 17:06:30 +0800 Subject: [PATCH 10/11] apply victor's comment --- Doc/whatsnew/3.10.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Doc/whatsnew/3.10.rst b/Doc/whatsnew/3.10.rst index 4ada4be3b66715..fcf42d8837bf0f 100644 --- a/Doc/whatsnew/3.10.rst +++ b/Doc/whatsnew/3.10.rst @@ -186,6 +186,11 @@ by :func:`curses.color_content`, :func:`curses.init_color`, support is provided by the underlying ncurses library. (Contributed by Jeffrey Kintscher and Hans Petter Jansson in :issue:`36982`.) +encodings +--------- +:func:`encodings.normalize_encoding` now ignores non-ASCII letters. +(Contributed by Hai Shi in :issue:`39337`.) + glob ---- From 4ecb8a1bb1f5e973e8d3eaeb0685d849d4023555 Mon Sep 17 00:00:00 2001 From: Hai Shi Date: Wed, 14 Oct 2020 07:52:09 +0800 Subject: [PATCH 11/11] apply victor's comment --- Doc/whatsnew/3.10.rst | 2 +- Lib/test/test_codecs.py | 2 +- .../next/Library/2020-09-13-02-02-18.bpo-39337.L3NXTt.rst | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Doc/whatsnew/3.10.rst b/Doc/whatsnew/3.10.rst index fcf42d8837bf0f..c34c36ae102a05 100644 --- a/Doc/whatsnew/3.10.rst +++ b/Doc/whatsnew/3.10.rst @@ -188,7 +188,7 @@ support is provided by the underlying ncurses library. encodings --------- -:func:`encodings.normalize_encoding` now ignores non-ASCII letters. +:func:`encodings.normalize_encoding` now ignores non-ASCII characters. (Contributed by Hai Shi in :issue:`39337`.) glob diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index 011f5474e3ca5c..09ceef76eb098d 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -3440,7 +3440,7 @@ def search_function(encoding): self.assertEqual(NOT_FOUND, codecs.lookup('a\xe9\u20ac-8')) def test_encodings_normalize_encoding(self): - # encodings.normalize_encoding() ignores non-ASCII letters. + # encodings.normalize_encoding() ignores non-ASCII characters. normalize = encodings.normalize_encoding self.assertEqual(normalize('utf_8'), 'utf_8') self.assertEqual(normalize('utf\xE9\u20AC\U0010ffff-8'), 'utf_8') diff --git a/Misc/NEWS.d/next/Library/2020-09-13-02-02-18.bpo-39337.L3NXTt.rst b/Misc/NEWS.d/next/Library/2020-09-13-02-02-18.bpo-39337.L3NXTt.rst index ea432d47c2831e..c2b4dbe4d12e8e 100644 --- a/Misc/NEWS.d/next/Library/2020-09-13-02-02-18.bpo-39337.L3NXTt.rst +++ b/Misc/NEWS.d/next/Library/2020-09-13-02-02-18.bpo-39337.L3NXTt.rst @@ -1 +1 @@ -:func:`encodings.normalize_encoding` now ignores non-ASCII letters. +:func:`encodings.normalize_encoding` now ignores non-ASCII characters.