From 3d5bca4d1fdaefcaaaeed7415c8f468fb4a2d8e7 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Fri, 7 Jun 2024 22:25:24 +0200 Subject: [PATCH 1/7] gh-119182: Decode PyUnicode_FromFormat() format from UTF-8 PyUnicode_FromFormat() now decodes the format string from UTF-8 with the "replace" error handler, instead of decoding it from ASCII. Remove unused 'consumed' parameter of unicode_decode_utf8_writer(). --- Doc/c-api/unicode.rst | 9 +++- Doc/whatsnew/3.14.rst | 4 ++ Lib/test/test_capi/test_unicode.py | 12 +++--- ...-06-07-22-38-08.gh-issue-119182.P3nXBm.rst | 3 ++ Objects/unicodeobject.c | 43 ++++++------------- 5 files changed, 34 insertions(+), 37 deletions(-) create mode 100644 Misc/NEWS.d/next/C API/2024-06-07-22-38-08.gh-issue-119182.P3nXBm.rst diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 7320d035bab513..1d7c8745cb63d1 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -387,7 +387,8 @@ APIs: arguments, calculate the size of the resulting Python Unicode string and return a string with the values formatted into it. The variable arguments must be C types and must correspond exactly to the format characters in the *format* - ASCII-encoded string. + string. The *format* string is decoded from UTF-8 with the "replace" error + handler. A conversion specifier contains two or more characters and has the following components, which must occur in this order: @@ -487,7 +488,8 @@ APIs: * - ``s`` - :c:expr:`const char*` or :c:expr:`const wchar_t*` - - A null-terminated C character array. + - A null-terminated C character array. :c:expr:`const char*` is decoded + from UTF-8 with the "replace" error handler. * - ``p`` - :c:expr:`const void*` @@ -576,6 +578,9 @@ APIs: .. versionchanged:: 3.13 Support for ``%T``, ``%#T``, ``%N`` and ``%#N`` formats added. + .. versionchanged:: 3.14 + The format string is now decoded from UTF-8 instead of ASCII. + .. c:function:: PyObject* PyUnicode_FromFormatV(const char *format, va_list vargs) diff --git a/Doc/whatsnew/3.14.rst b/Doc/whatsnew/3.14.rst index b2dd80b64a691a..21880f872f09cf 100644 --- a/Doc/whatsnew/3.14.rst +++ b/Doc/whatsnew/3.14.rst @@ -261,6 +261,10 @@ New Features Porting to Python 3.14 ---------------------- +* :c:func:`PyUnicode_FromFormat` now decodes the format string from UTF-8 with + the "replace" error handler, instead of decoding it from ASCII. + (Contributed by Victor Stinner in :gh:`119182`.) + Deprecated ---------- diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index a69f817c515ba7..2b7352aabeffa1 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -384,12 +384,12 @@ def check_format(expected, format, *args): check_format('ascii\x7f=unicode\xe9', b'ascii\x7f=%U', 'unicode\xe9') - # non-ascii format, ascii argument: ensure that PyUnicode_FromFormatV() - # raises an error - self.assertRaisesRegex(ValueError, - r'^PyUnicode_FromFormatV\(\) expects an ASCII-encoded format ' - 'string, got a non-ASCII byte: 0xe9$', - PyUnicode_FromFormat, b'unicode\xe9=%s', 'ascii') + # Non-ASCII format and non-ASCII arguments are both decoded + # from UTF-8/replace + check_format('unicode\xe9=\u20ac', + 'unicode\xe9=%s'.encode(), '\u20ac'.encode()) + check_format('invalid\ufffd=abc\ufffd', + b'invalid\xe9=%s', b'abc\xe9') # test "%c" check_format('\uabcd', diff --git a/Misc/NEWS.d/next/C API/2024-06-07-22-38-08.gh-issue-119182.P3nXBm.rst b/Misc/NEWS.d/next/C API/2024-06-07-22-38-08.gh-issue-119182.P3nXBm.rst new file mode 100644 index 00000000000000..71e5ae8579a800 --- /dev/null +++ b/Misc/NEWS.d/next/C API/2024-06-07-22-38-08.gh-issue-119182.P3nXBm.rst @@ -0,0 +1,3 @@ +:c:func:`PyUnicode_FromFormat` now decodes the format string from UTF-8 with +the "replace" error handler, instead of decoding it from ASCII. Patch by +Victor Stinner. diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 3b0b4173408724..cff227911298d0 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -205,8 +205,7 @@ unicode_decode_utf8(const char *s, Py_ssize_t size, static int unicode_decode_utf8_writer(_PyUnicodeWriter *writer, const char *s, Py_ssize_t size, - _Py_error_handler error_handler, const char *errors, - Py_ssize_t *consumed); + _Py_error_handler error_handler, const char *errors); #ifdef Py_DEBUG static inline int unicode_is_finalizing(void); static int unicode_is_singleton(PyObject *unicode); @@ -2402,7 +2401,7 @@ unicode_fromformat_write_utf8(_PyUnicodeWriter *writer, const char *str, if (width < 0) { return unicode_decode_utf8_writer(writer, str, length, - _Py_ERROR_REPLACE, "replace", NULL); + _Py_ERROR_REPLACE, "replace"); } PyObject *unicode = PyUnicode_DecodeUTF8Stateful(str, length, @@ -2896,28 +2895,21 @@ PyUnicode_FromFormatV(const char *format, va_list vargs) const char *p; Py_ssize_t len; - p = f; - do - { - if ((unsigned char)*p > 127) { - PyErr_Format(PyExc_ValueError, - "PyUnicode_FromFormatV() expects an ASCII-encoded format " - "string, got a non-ASCII byte: 0x%02x", - (unsigned char)*p); - goto fail; - } - p++; + p = strchr(f, '%'); + if (p != NULL) { + len = p - f; } - while (*p != '\0' && *p != '%'); - len = p - f; - - if (*p == '\0') + else { + len = strlen(f); writer.overallocate = 0; + } - if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0) + if (unicode_decode_utf8_writer(&writer, f, len, + _Py_ERROR_REPLACE, "replace") < 0) { goto fail; + } - f = p; + f += len; } } va_end(vargs2); @@ -4930,13 +4922,9 @@ unicode_decode_utf8(const char *s, Py_ssize_t size, static int unicode_decode_utf8_writer(_PyUnicodeWriter *writer, const char *s, Py_ssize_t size, - _Py_error_handler error_handler, const char *errors, - Py_ssize_t *consumed) + _Py_error_handler error_handler, const char *errors) { if (size == 0) { - if (consumed) { - *consumed = 0; - } return 0; } @@ -4954,9 +4942,6 @@ unicode_decode_utf8_writer(_PyUnicodeWriter *writer, writer->pos += decoded; if (decoded == size) { - if (consumed) { - *consumed = size; - } return 0; } s += decoded; @@ -4964,7 +4949,7 @@ unicode_decode_utf8_writer(_PyUnicodeWriter *writer, } return unicode_decode_utf8_impl(writer, starts, s, end, - error_handler, errors, consumed); + error_handler, errors, NULL); } From 6a879156a924df2dbe078cc95c6a3b549daf6f2b Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Fri, 7 Jun 2024 22:52:41 +0200 Subject: [PATCH 2/7] Update test_exceptions --- Lib/test/test_capi/test_exceptions.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/Lib/test/test_capi/test_exceptions.py b/Lib/test/test_capi/test_exceptions.py index c475b6d78d0c56..404cb39a8488d1 100644 --- a/Lib/test/test_capi/test_exceptions.py +++ b/Lib/test/test_capi/test_exceptions.py @@ -264,17 +264,22 @@ def test_format(self): PyErr_Format = getattr(pythonapi, name) PyErr_Format.argtypes = (py_object, c_char_p,) PyErr_Format.restype = py_object + with self.assertRaises(ZeroDivisionError) as e: PyErr_Format(ZeroDivisionError, b'%s %d', b'error', c_int(42)) self.assertEqual(e.exception.args, ('error 42',)) + + with self.assertRaises(ZeroDivisionError) as e: + PyErr_Format(ZeroDivisionError, b'invalid \xff') + self.assertEqual(e.exception.args, ('invalid \ufffd',)) + with self.assertRaises(ZeroDivisionError) as e: PyErr_Format(ZeroDivisionError, b'%s', 'помилка'.encode()) self.assertEqual(e.exception.args, ('помилка',)) with self.assertRaisesRegex(OverflowError, 'not in range'): PyErr_Format(ZeroDivisionError, b'%c', c_int(-1)) - with self.assertRaisesRegex(ValueError, 'format string'): - PyErr_Format(ZeroDivisionError, b'\xff') + self.assertRaises(SystemError, PyErr_Format, list, b'error') # CRASHES PyErr_Format(ZeroDivisionError, NULL) # CRASHES PyErr_Format(py_object(), b'error') @@ -377,7 +382,7 @@ def test_err_formatunraisable(self): self.assertEqual(str(cm.unraisable.exc_value), 'oops!') self.assertEqual(cm.unraisable.exc_traceback.tb_lineno, firstline + 15) - self.assertIsNone(cm.unraisable.err_msg) + self.assertEqual(cm.unraisable.err_msg, 'undecodable \ufffd') self.assertIsNone(cm.unraisable.object) with support.catch_unraisable_exception() as cm: @@ -401,7 +406,8 @@ def test_err_formatunraisable(self): support.captured_stderr() as stderr): formatunraisable(CustomError('oops!'), b'undecodable \xff') lines = stderr.getvalue().splitlines() - self.assertEqual(lines[0], 'Traceback (most recent call last):') + self.assertEqual(lines[0], 'undecodable \ufffd:') + self.assertEqual(lines[1], 'Traceback (most recent call last):') self.assertEqual(lines[-1], f'{__name__}.CustomError: oops!') with (support.swap_attr(sys, 'unraisablehook', None), From e830944769b3ee25bc251bc1b88a71f1c95d1e7c Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Tue, 11 Jun 2024 12:39:13 +0200 Subject: [PATCH 3/7] Use strict error handler --- Doc/c-api/unicode.rst | 2 +- Doc/whatsnew/3.14.rst | 2 +- Lib/test/test_capi/test_exceptions.py | 14 +++++--------- Lib/test/test_capi/test_unicode.py | 13 +++++++------ .../2024-06-07-22-38-08.gh-issue-119182.P3nXBm.rst | 2 +- Objects/unicodeobject.c | 7 ++++++- 6 files changed, 21 insertions(+), 19 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 1d7c8745cb63d1..187c8a4b595c93 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -387,7 +387,7 @@ APIs: arguments, calculate the size of the resulting Python Unicode string and return a string with the values formatted into it. The variable arguments must be C types and must correspond exactly to the format characters in the *format* - string. The *format* string is decoded from UTF-8 with the "replace" error + string. The *format* string is decoded from UTF-8 with the "strict" error handler. A conversion specifier contains two or more characters and has the following diff --git a/Doc/whatsnew/3.14.rst b/Doc/whatsnew/3.14.rst index 21880f872f09cf..2bfef2564af87f 100644 --- a/Doc/whatsnew/3.14.rst +++ b/Doc/whatsnew/3.14.rst @@ -262,7 +262,7 @@ Porting to Python 3.14 ---------------------- * :c:func:`PyUnicode_FromFormat` now decodes the format string from UTF-8 with - the "replace" error handler, instead of decoding it from ASCII. + the "strict" error handler, instead of decoding it from ASCII. (Contributed by Victor Stinner in :gh:`119182`.) Deprecated diff --git a/Lib/test/test_capi/test_exceptions.py b/Lib/test/test_capi/test_exceptions.py index 404cb39a8488d1..df9ff83c2bf6b3 100644 --- a/Lib/test/test_capi/test_exceptions.py +++ b/Lib/test/test_capi/test_exceptions.py @@ -264,21 +264,18 @@ def test_format(self): PyErr_Format = getattr(pythonapi, name) PyErr_Format.argtypes = (py_object, c_char_p,) PyErr_Format.restype = py_object - with self.assertRaises(ZeroDivisionError) as e: PyErr_Format(ZeroDivisionError, b'%s %d', b'error', c_int(42)) self.assertEqual(e.exception.args, ('error 42',)) - - with self.assertRaises(ZeroDivisionError) as e: - PyErr_Format(ZeroDivisionError, b'invalid \xff') - self.assertEqual(e.exception.args, ('invalid \ufffd',)) - with self.assertRaises(ZeroDivisionError) as e: PyErr_Format(ZeroDivisionError, b'%s', 'помилка'.encode()) self.assertEqual(e.exception.args, ('помилка',)) with self.assertRaisesRegex(OverflowError, 'not in range'): PyErr_Format(ZeroDivisionError, b'%c', c_int(-1)) + with self.assertRaisesRegex(ValueError, 'format string') as cm: + PyErr_Format(ZeroDivisionError, b'\xff') + self.assertIsInstance(cm.exception.__context__, UnicodeDecodeError) self.assertRaises(SystemError, PyErr_Format, list, b'error') # CRASHES PyErr_Format(ZeroDivisionError, NULL) @@ -382,7 +379,7 @@ def test_err_formatunraisable(self): self.assertEqual(str(cm.unraisable.exc_value), 'oops!') self.assertEqual(cm.unraisable.exc_traceback.tb_lineno, firstline + 15) - self.assertEqual(cm.unraisable.err_msg, 'undecodable \ufffd') + self.assertIsNone(cm.unraisable.err_msg) self.assertIsNone(cm.unraisable.object) with support.catch_unraisable_exception() as cm: @@ -406,8 +403,7 @@ def test_err_formatunraisable(self): support.captured_stderr() as stderr): formatunraisable(CustomError('oops!'), b'undecodable \xff') lines = stderr.getvalue().splitlines() - self.assertEqual(lines[0], 'undecodable \ufffd:') - self.assertEqual(lines[1], 'Traceback (most recent call last):') + self.assertEqual(lines[0], 'Traceback (most recent call last):') self.assertEqual(lines[-1], f'{__name__}.CustomError: oops!') with (support.swap_attr(sys, 'unraisablehook', None), diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index 2b7352aabeffa1..874fa87cf109ea 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -384,12 +384,13 @@ def check_format(expected, format, *args): check_format('ascii\x7f=unicode\xe9', b'ascii\x7f=%U', 'unicode\xe9') - # Non-ASCII format and non-ASCII arguments are both decoded - # from UTF-8/replace - check_format('unicode\xe9=\u20ac', - 'unicode\xe9=%s'.encode(), '\u20ac'.encode()) - check_format('invalid\ufffd=abc\ufffd', - b'invalid\xe9=%s', b'abc\xe9') + # The %s arguments are decoded from UTF-8/replace. + # The format string is decoded from UTF-8/strict. + check_format('value=utf8 \u20ac', + 'value=%s'.encode(), 'utf8 \u20ac'.encode()) + with self.assertRaisesRegex(ValueError, 'format string') as cm: + PyUnicode_FromFormat(b'invalid format string\xff: %s', b'abc') + self.assertIsInstance(cm.exception.__context__, UnicodeDecodeError) # test "%c" check_format('\uabcd', diff --git a/Misc/NEWS.d/next/C API/2024-06-07-22-38-08.gh-issue-119182.P3nXBm.rst b/Misc/NEWS.d/next/C API/2024-06-07-22-38-08.gh-issue-119182.P3nXBm.rst index 71e5ae8579a800..c70c34ac751d49 100644 --- a/Misc/NEWS.d/next/C API/2024-06-07-22-38-08.gh-issue-119182.P3nXBm.rst +++ b/Misc/NEWS.d/next/C API/2024-06-07-22-38-08.gh-issue-119182.P3nXBm.rst @@ -1,3 +1,3 @@ :c:func:`PyUnicode_FromFormat` now decodes the format string from UTF-8 with -the "replace" error handler, instead of decoding it from ASCII. Patch by +the "strict" error handler, instead of decoding it from ASCII. Patch by Victor Stinner. diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index cff227911298d0..f92e6073fdfc4a 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2905,7 +2905,12 @@ PyUnicode_FromFormatV(const char *format, va_list vargs) } if (unicode_decode_utf8_writer(&writer, f, len, - _Py_ERROR_REPLACE, "replace") < 0) { + _Py_ERROR_STRICT, "strict") < 0) { + PyObject *exc = PyErr_GetRaisedException(); + PyErr_Format(PyExc_ValueError, + "PyUnicode_FromFormatV() expects a valid UTF-8-encoded " + "format string, got an invalid UTF-8 string"); + _PyErr_ChainExceptions1(exc); goto fail; } From 242e6cb583e384ca67cf417ade13ead6c6927032 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Tue, 11 Jun 2024 13:10:30 +0200 Subject: [PATCH 4/7] Fix error handling Replace PyErr_Format() with PyErr_SetString() --- Objects/unicodeobject.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index f92e6073fdfc4a..7f7cdcf34d3bbd 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2907,7 +2907,7 @@ PyUnicode_FromFormatV(const char *format, va_list vargs) if (unicode_decode_utf8_writer(&writer, f, len, _Py_ERROR_STRICT, "strict") < 0) { PyObject *exc = PyErr_GetRaisedException(); - PyErr_Format(PyExc_ValueError, + PyErr_SetString(PyExc_ValueError, "PyUnicode_FromFormatV() expects a valid UTF-8-encoded " "format string, got an invalid UTF-8 string"); _PyErr_ChainExceptions1(exc); From d04269ff6ea5d129be59ef1668c93ec3fca88eae Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Tue, 11 Jun 2024 14:07:49 +0200 Subject: [PATCH 5/7] Add tests on truncated UTF-8 format strings --- Lib/test/test_capi/test_unicode.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index 874fa87cf109ea..3dadf7657056b8 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -380,7 +380,7 @@ def check_format(expected, format, *args): text = PyUnicode_FromFormat(format, *args) self.assertEqual(expected, text) - # ascii format, non-ascii argument + # ASCII format, non-ASCII %U argument check_format('ascii\x7f=unicode\xe9', b'ascii\x7f=%U', 'unicode\xe9') @@ -392,6 +392,12 @@ def check_format(expected, format, *args): PyUnicode_FromFormat(b'invalid format string\xff: %s', b'abc') self.assertIsInstance(cm.exception.__context__, UnicodeDecodeError) + # Truncated UTF-8 format strings + with self.assertRaisesRegex(ValueError, 'format string'): + PyUnicode_FromFormat(b'truncated utf8: \xc3') + with self.assertRaisesRegex(ValueError, 'format string'): + PyUnicode_FromFormat(b'truncated utf8: \xe2\x82') + # test "%c" check_format('\uabcd', b'%c', c_int(0xabcd)) From 94da5e7680e817a7f54b6b5137fe748b66be3f5f Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Tue, 11 Jun 2024 14:10:05 +0200 Subject: [PATCH 6/7] Don't mention the strict error handler --- Doc/c-api/unicode.rst | 3 +-- Doc/whatsnew/3.14.rst | 4 ++-- .../C API/2024-06-07-22-38-08.gh-issue-119182.P3nXBm.rst | 5 ++--- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 187c8a4b595c93..0470e6b91e3cc0 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -387,8 +387,7 @@ APIs: arguments, calculate the size of the resulting Python Unicode string and return a string with the values formatted into it. The variable arguments must be C types and must correspond exactly to the format characters in the *format* - string. The *format* string is decoded from UTF-8 with the "strict" error - handler. + string. The *format* string is decoded from UTF-8. A conversion specifier contains two or more characters and has the following components, which must occur in this order: diff --git a/Doc/whatsnew/3.14.rst b/Doc/whatsnew/3.14.rst index 2bfef2564af87f..8826a1120d625f 100644 --- a/Doc/whatsnew/3.14.rst +++ b/Doc/whatsnew/3.14.rst @@ -261,8 +261,8 @@ New Features Porting to Python 3.14 ---------------------- -* :c:func:`PyUnicode_FromFormat` now decodes the format string from UTF-8 with - the "strict" error handler, instead of decoding it from ASCII. +* :c:func:`PyUnicode_FromFormat` now decodes the format string from UTF-8, + instead of ASCII. (Contributed by Victor Stinner in :gh:`119182`.) Deprecated diff --git a/Misc/NEWS.d/next/C API/2024-06-07-22-38-08.gh-issue-119182.P3nXBm.rst b/Misc/NEWS.d/next/C API/2024-06-07-22-38-08.gh-issue-119182.P3nXBm.rst index c70c34ac751d49..995e4633e35eef 100644 --- a/Misc/NEWS.d/next/C API/2024-06-07-22-38-08.gh-issue-119182.P3nXBm.rst +++ b/Misc/NEWS.d/next/C API/2024-06-07-22-38-08.gh-issue-119182.P3nXBm.rst @@ -1,3 +1,2 @@ -:c:func:`PyUnicode_FromFormat` now decodes the format string from UTF-8 with -the "strict" error handler, instead of decoding it from ASCII. Patch by -Victor Stinner. +:c:func:`PyUnicode_FromFormat` now decodes the format string from UTF-8, +instead of ASCII. Patch by Victor Stinner. From 89fd69ab3eaa1a24648572c65bc129b28e6187a2 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Tue, 11 Jun 2024 20:33:22 +0200 Subject: [PATCH 7/7] Revert consumed parameter --- Objects/unicodeobject.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 7f7cdcf34d3bbd..a6817d53e8c9a0 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -205,7 +205,8 @@ unicode_decode_utf8(const char *s, Py_ssize_t size, static int unicode_decode_utf8_writer(_PyUnicodeWriter *writer, const char *s, Py_ssize_t size, - _Py_error_handler error_handler, const char *errors); + _Py_error_handler error_handler, const char *errors, + Py_ssize_t *consumed); #ifdef Py_DEBUG static inline int unicode_is_finalizing(void); static int unicode_is_singleton(PyObject *unicode); @@ -2401,7 +2402,7 @@ unicode_fromformat_write_utf8(_PyUnicodeWriter *writer, const char *str, if (width < 0) { return unicode_decode_utf8_writer(writer, str, length, - _Py_ERROR_REPLACE, "replace"); + _Py_ERROR_REPLACE, "replace", NULL); } PyObject *unicode = PyUnicode_DecodeUTF8Stateful(str, length, @@ -2905,7 +2906,8 @@ PyUnicode_FromFormatV(const char *format, va_list vargs) } if (unicode_decode_utf8_writer(&writer, f, len, - _Py_ERROR_STRICT, "strict") < 0) { + _Py_ERROR_STRICT, "strict", + NULL) < 0) { PyObject *exc = PyErr_GetRaisedException(); PyErr_SetString(PyExc_ValueError, "PyUnicode_FromFormatV() expects a valid UTF-8-encoded " @@ -4927,9 +4929,13 @@ unicode_decode_utf8(const char *s, Py_ssize_t size, static int unicode_decode_utf8_writer(_PyUnicodeWriter *writer, const char *s, Py_ssize_t size, - _Py_error_handler error_handler, const char *errors) + _Py_error_handler error_handler, const char *errors, + Py_ssize_t *consumed) { if (size == 0) { + if (consumed) { + *consumed = 0; + } return 0; } @@ -4947,6 +4953,9 @@ unicode_decode_utf8_writer(_PyUnicodeWriter *writer, writer->pos += decoded; if (decoded == size) { + if (consumed) { + *consumed = size; + } return 0; } s += decoded; @@ -4954,7 +4963,7 @@ unicode_decode_utf8_writer(_PyUnicodeWriter *writer, } return unicode_decode_utf8_impl(writer, starts, s, end, - error_handler, errors, NULL); + error_handler, errors, consumed); }