diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 7320d035bab513..0470e6b91e3cc0 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -387,7 +387,7 @@ APIs: arguments, calculate the size of the resulting Python Unicode string and return a string with the values formatted into it. The variable arguments must be C types and must correspond exactly to the format characters in the *format* - ASCII-encoded string. + string. The *format* string is decoded from UTF-8. A conversion specifier contains two or more characters and has the following components, which must occur in this order: @@ -487,7 +487,8 @@ APIs: * - ``s`` - :c:expr:`const char*` or :c:expr:`const wchar_t*` - - A null-terminated C character array. + - A null-terminated C character array. :c:expr:`const char*` is decoded + from UTF-8 with the "replace" error handler. * - ``p`` - :c:expr:`const void*` @@ -576,6 +577,9 @@ APIs: .. versionchanged:: 3.13 Support for ``%T``, ``%#T``, ``%N`` and ``%#N`` formats added. + .. versionchanged:: 3.14 + The format string is now decoded from UTF-8 instead of ASCII. + .. c:function:: PyObject* PyUnicode_FromFormatV(const char *format, va_list vargs) diff --git a/Doc/whatsnew/3.14.rst b/Doc/whatsnew/3.14.rst index b2dd80b64a691a..8826a1120d625f 100644 --- a/Doc/whatsnew/3.14.rst +++ b/Doc/whatsnew/3.14.rst @@ -261,6 +261,10 @@ New Features Porting to Python 3.14 ---------------------- +* :c:func:`PyUnicode_FromFormat` now decodes the format string from UTF-8, + instead of ASCII. + (Contributed by Victor Stinner in :gh:`119182`.) + Deprecated ---------- diff --git a/Lib/test/test_capi/test_exceptions.py b/Lib/test/test_capi/test_exceptions.py index c475b6d78d0c56..df9ff83c2bf6b3 100644 --- a/Lib/test/test_capi/test_exceptions.py +++ b/Lib/test/test_capi/test_exceptions.py @@ -273,8 +273,10 @@ def test_format(self): with self.assertRaisesRegex(OverflowError, 'not in range'): PyErr_Format(ZeroDivisionError, b'%c', c_int(-1)) - with self.assertRaisesRegex(ValueError, 'format string'): + with self.assertRaisesRegex(ValueError, 'format string') as cm: PyErr_Format(ZeroDivisionError, b'\xff') + self.assertIsInstance(cm.exception.__context__, UnicodeDecodeError) + self.assertRaises(SystemError, PyErr_Format, list, b'error') # CRASHES PyErr_Format(ZeroDivisionError, NULL) # CRASHES PyErr_Format(py_object(), b'error') diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index a69f817c515ba7..3dadf7657056b8 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -380,16 +380,23 @@ def check_format(expected, format, *args): text = PyUnicode_FromFormat(format, *args) self.assertEqual(expected, text) - # ascii format, non-ascii argument + # ASCII format, non-ASCII %U argument check_format('ascii\x7f=unicode\xe9', b'ascii\x7f=%U', 'unicode\xe9') - # non-ascii format, ascii argument: ensure that PyUnicode_FromFormatV() - # raises an error - self.assertRaisesRegex(ValueError, - r'^PyUnicode_FromFormatV\(\) expects an ASCII-encoded format ' - 'string, got a non-ASCII byte: 0xe9$', - PyUnicode_FromFormat, b'unicode\xe9=%s', 'ascii') + # The %s arguments are decoded from UTF-8/replace. + # The format string is decoded from UTF-8/strict. + check_format('value=utf8 \u20ac', + 'value=%s'.encode(), 'utf8 \u20ac'.encode()) + with self.assertRaisesRegex(ValueError, 'format string') as cm: + PyUnicode_FromFormat(b'invalid format string\xff: %s', b'abc') + self.assertIsInstance(cm.exception.__context__, UnicodeDecodeError) + + # Truncated UTF-8 format strings + with self.assertRaisesRegex(ValueError, 'format string'): + PyUnicode_FromFormat(b'truncated utf8: \xc3') + with self.assertRaisesRegex(ValueError, 'format string'): + PyUnicode_FromFormat(b'truncated utf8: \xe2\x82') # test "%c" check_format('\uabcd', diff --git a/Misc/NEWS.d/next/C API/2024-06-07-22-38-08.gh-issue-119182.P3nXBm.rst b/Misc/NEWS.d/next/C API/2024-06-07-22-38-08.gh-issue-119182.P3nXBm.rst new file mode 100644 index 00000000000000..995e4633e35eef --- /dev/null +++ b/Misc/NEWS.d/next/C API/2024-06-07-22-38-08.gh-issue-119182.P3nXBm.rst @@ -0,0 +1,2 @@ +:c:func:`PyUnicode_FromFormat` now decodes the format string from UTF-8, +instead of ASCII. Patch by Victor Stinner. diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 3b0b4173408724..a6817d53e8c9a0 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2896,28 +2896,27 @@ PyUnicode_FromFormatV(const char *format, va_list vargs) const char *p; Py_ssize_t len; - p = f; - do - { - if ((unsigned char)*p > 127) { - PyErr_Format(PyExc_ValueError, - "PyUnicode_FromFormatV() expects an ASCII-encoded format " - "string, got a non-ASCII byte: 0x%02x", - (unsigned char)*p); - goto fail; - } - p++; + p = strchr(f, '%'); + if (p != NULL) { + len = p - f; } - while (*p != '\0' && *p != '%'); - len = p - f; - - if (*p == '\0') + else { + len = strlen(f); writer.overallocate = 0; + } - if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0) + if (unicode_decode_utf8_writer(&writer, f, len, + _Py_ERROR_STRICT, "strict", + NULL) < 0) { + PyObject *exc = PyErr_GetRaisedException(); + PyErr_SetString(PyExc_ValueError, + "PyUnicode_FromFormatV() expects a valid UTF-8-encoded " + "format string, got an invalid UTF-8 string"); + _PyErr_ChainExceptions1(exc); goto fail; + } - f = p; + f += len; } } va_end(vargs2);