From 3541237753e8aaada05b8d3391a747f135b701d6 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Fri, 7 Jun 2024 22:25:24 +0200 Subject: [PATCH] gh-119182: Use strict error handler in PyUnicode_FromFormat() PyUnicode_FromFormat() now decodes the "%s" format argument from UTF-8 with the "strict" error handler, instead of the "replace" error handler. Remove the unused 'consumed' parameter of unicode_decode_utf8_writer(). --- Doc/c-api/unicode.rst | 7 ++++- Doc/whatsnew/3.14.rst | 5 ++++ Lib/test/test_capi/test_getargs.py | 3 +- Lib/test/test_capi/test_unicode.py | 29 ++++++++++--------- ...-06-07-22-38-08.gh-issue-119182.P3nXBm.rst | 3 ++ Objects/unicodeobject.c | 20 ++++--------- 6 files changed, 36 insertions(+), 31 deletions(-) create mode 100644 Misc/NEWS.d/next/C API/2024-06-07-22-38-08.gh-issue-119182.P3nXBm.rst diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 7320d035bab513..435fe8947de7af 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -487,7 +487,8 @@ APIs: * - ``s`` - :c:expr:`const char*` or :c:expr:`const wchar_t*` - - A null-terminated C character array. + - A null-terminated C character array. The argument is decoded from + UTF-8 with the "strict" error handler. * - ``p`` - :c:expr:`const void*` @@ -576,6 +577,10 @@ APIs: .. versionchanged:: 3.13 Support for ``%T``, ``%#T``, ``%N`` and ``%#N`` formats added. + .. versionchanged:: 3.14 + The ``"%s"`` format now decodes its argument from UTF-8 with the "strict" + error handler, instead of the "replace" error handler. + .. 
c:function:: PyObject* PyUnicode_FromFormatV(const char *format, va_list vargs) diff --git a/Doc/whatsnew/3.14.rst b/Doc/whatsnew/3.14.rst index b2dd80b64a691a..549400518b2454 100644 --- a/Doc/whatsnew/3.14.rst +++ b/Doc/whatsnew/3.14.rst @@ -261,6 +261,11 @@ New Features Porting to Python 3.14 ---------------------- +* :c:func:`PyUnicode_FromFormat` now decodes the ``"%s"`` format argument from + UTF-8 with the "strict" error handler, instead of the "replace" error + handler. + (Contributed by Victor Stinner in :gh:`119182`.) + Deprecated ---------- diff --git a/Lib/test/test_capi/test_getargs.py b/Lib/test/test_capi/test_getargs.py index 232aa2a80025dc..44452dbef263a5 100644 --- a/Lib/test/test_capi/test_getargs.py +++ b/Lib/test/test_capi/test_getargs.py @@ -1298,8 +1298,7 @@ def test_nonascii_keywords(self): self.assertEqual(parse((), {}, '|O', [invalid]), (NULL,)) self.assertEqual(parse((1,), {'b': 2}, 'O|O', [invalid, 'b']), (1, 2)) - with self.assertRaisesRegex(TypeError, - f"function missing required argument '{name}\ufffd'"): + with self.assertRaises(UnicodeDecodeError): parse((), {}, 'O', [invalid]) with self.assertRaisesRegex(UnicodeDecodeError, f"'utf-8' codec can't decode bytes? 
"): diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index a69f817c515ba7..6d889675be672b 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -384,12 +384,11 @@ def check_format(expected, format, *args): check_format('ascii\x7f=unicode\xe9', b'ascii\x7f=%U', 'unicode\xe9') - # non-ascii format, ascii argument: ensure that PyUnicode_FromFormatV() - # raises an error - self.assertRaisesRegex(ValueError, - r'^PyUnicode_FromFormatV\(\) expects an ASCII-encoded format ' - 'string, got a non-ASCII byte: 0xe9$', - PyUnicode_FromFormat, b'unicode\xe9=%s', 'ascii') + # "%s" format decodes its argument from UTF-8/strict + check_format('value=\u20ac', + b'value=%s', '\u20ac'.encode()) + with self.assertRaises(UnicodeDecodeError): + PyUnicode_FromFormat(b'value=%s', b'invalid\xe9') # test "%c" check_format('\uabcd', @@ -412,11 +411,13 @@ def check_format(expected, format, *args): check_format('%abc', b'%%%s', b'abc') - # truncated string + # test "%s" format with precision check_format('abc', b'%.3s', b'abcdef') - check_format('abc[\ufffd', - b'%.5s', 'abc[\u20ac]'.encode('utf8')) + with self.assertRaises(UnicodeDecodeError): + PyUnicode_FromFormat(b'%.5s', 'abc[\u20ac]'.encode('utf8')) + check_format('abc[\u20ac', + b'%.7s', 'abc[\u20ac]'.encode('utf8')) check_format("'\\u20acABC'", b'%A', '\u20acABC') check_format("'\\u20", @@ -431,8 +432,8 @@ def check_format(expected, format, *args): b'%.3U', '\u20acABCDEF') check_format('\u20acAB', b'%.3V', '\u20acABCDEF', None) - check_format('abc[\ufffd', - b'%.5V', None, 'abc[\u20ac]'.encode('utf8')) + with self.assertRaises(UnicodeDecodeError): + PyUnicode_FromFormat(b'%.5V', None, 'abc[\u20ac]'.encode('utf8')) # following tests comes from #7330 # test width modifier and precision modifier with %S @@ -723,9 +724,9 @@ class LocalType: check_format('repr=\u4eba\u6c11', b'repr=%V', None, b'\xe4\xba\xba\xe6\xb0\x91') - #Test replace error handler. 
- check_format('repr=abc\ufffd', - b'repr=%V', None, b'abc\xff') + # Test the "strict" error handler. + with self.assertRaises(UnicodeDecodeError): + PyUnicode_FromFormat(b'repr=%V', None, b'abc\xff') # Issue #33817: empty strings check_format('', diff --git a/Misc/NEWS.d/next/C API/2024-06-07-22-38-08.gh-issue-119182.P3nXBm.rst b/Misc/NEWS.d/next/C API/2024-06-07-22-38-08.gh-issue-119182.P3nXBm.rst new file mode 100644 index 00000000000000..e5824b6b4f0eb7 --- /dev/null +++ b/Misc/NEWS.d/next/C API/2024-06-07-22-38-08.gh-issue-119182.P3nXBm.rst @@ -0,0 +1,3 @@ +:c:func:`PyUnicode_FromFormat` now decodes the ``"%s"`` format argument from +UTF-8 with the "strict" error handler, instead of the "replace" error handler. +Patch by Victor Stinner. diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 3b0b4173408724..7ff5fd4dc18665 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -205,8 +205,7 @@ unicode_decode_utf8(const char *s, Py_ssize_t size, static int unicode_decode_utf8_writer(_PyUnicodeWriter *writer, const char *s, Py_ssize_t size, - _Py_error_handler error_handler, const char *errors, - Py_ssize_t *consumed); + _Py_error_handler error_handler, const char *errors); #ifdef Py_DEBUG static inline int unicode_is_finalizing(void); static int unicode_is_singleton(PyObject *unicode); @@ -2402,11 +2401,11 @@ unicode_fromformat_write_utf8(_PyUnicodeWriter *writer, const char *str, if (width < 0) { return unicode_decode_utf8_writer(writer, str, length, - _Py_ERROR_REPLACE, "replace", NULL); + _Py_ERROR_STRICT, "strict"); } - PyObject *unicode = PyUnicode_DecodeUTF8Stateful(str, length, - "replace", NULL); + PyObject *unicode = unicode_decode_utf8(str, length, + _Py_ERROR_STRICT, "strict", NULL); if (unicode == NULL) return -1; @@ -4930,13 +4929,9 @@ unicode_decode_utf8(const char *s, Py_ssize_t size, static int unicode_decode_utf8_writer(_PyUnicodeWriter *writer, const char *s, Py_ssize_t size, - _Py_error_handler error_handler, 
const char *errors, - Py_ssize_t *consumed) + _Py_error_handler error_handler, const char *errors) { if (size == 0) { - if (consumed) { - *consumed = 0; - } return 0; } @@ -4954,9 +4949,6 @@ unicode_decode_utf8_writer(_PyUnicodeWriter *writer, writer->pos += decoded; if (decoded == size) { - if (consumed) { - *consumed = size; - } return 0; } s += decoded; @@ -4964,7 +4956,7 @@ unicode_decode_utf8_writer(_PyUnicodeWriter *writer, } return unicode_decode_utf8_impl(writer, starts, s, end, - error_handler, errors, consumed); + error_handler, errors, NULL); }