Skip to content

gh-119182: Use strict error handler in PyUnicode_FromFormat() #120307

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion Doc/c-api/unicode.rst
Original file line number Diff line number Diff line change
Expand Up @@ -487,7 +487,8 @@ APIs:

* - ``s``
- :c:expr:`const char*` or :c:expr:`const wchar_t*`
- A null-terminated C character array.
- A null-terminated C character array. The argument is decoded from
UTF-8 with the "strict" error handler.

* - ``p``
- :c:expr:`const void*`
Expand Down Expand Up @@ -576,6 +577,10 @@ APIs:
.. versionchanged:: 3.13
Support for ``%T``, ``%#T``, ``%N`` and ``%#N`` formats added.

.. versionchanged:: 3.14
The ``"%s"`` format now decodes its argument from UTF-8 with the "strict"
error handler, instead of the "replace" error handler.


.. c:function:: PyObject* PyUnicode_FromFormatV(const char *format, va_list vargs)

Expand Down
5 changes: 5 additions & 0 deletions Doc/whatsnew/3.14.rst
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,11 @@ New Features
Porting to Python 3.14
----------------------

* :c:func:`PyUnicode_FromFormat` now decodes the ``"%s"`` format argument from
UTF-8 with the "strict" error handler, instead of the "replace" error
handler.
(Contributed by Victor Stinner in :gh:`119182`.)

Deprecated
----------

Expand Down
3 changes: 1 addition & 2 deletions Lib/test/test_capi/test_getargs.py
Original file line number Diff line number Diff line change
Expand Up @@ -1298,8 +1298,7 @@ def test_nonascii_keywords(self):
self.assertEqual(parse((), {}, '|O', [invalid]), (NULL,))
self.assertEqual(parse((1,), {'b': 2}, 'O|O', [invalid, 'b']),
(1, 2))
with self.assertRaisesRegex(TypeError,
f"function missing required argument '{name}\ufffd'"):
with self.assertRaises(UnicodeDecodeError):
parse((), {}, 'O', [invalid])
with self.assertRaisesRegex(UnicodeDecodeError,
f"'utf-8' codec can't decode bytes? "):
Expand Down
29 changes: 15 additions & 14 deletions Lib/test/test_capi/test_unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -384,12 +384,11 @@ def check_format(expected, format, *args):
check_format('ascii\x7f=unicode\xe9',
b'ascii\x7f=%U', 'unicode\xe9')

# non-ascii format, ascii argument: ensure that PyUnicode_FromFormatV()
# raises an error
self.assertRaisesRegex(ValueError,
r'^PyUnicode_FromFormatV\(\) expects an ASCII-encoded format '
'string, got a non-ASCII byte: 0xe9$',
PyUnicode_FromFormat, b'unicode\xe9=%s', 'ascii')
# "%s" format decodes its argument from UTF-8/strict
check_format('value=\u20ac',
b'value=%s', '\u20ac'.encode())
with self.assertRaises(UnicodeDecodeError):
PyUnicode_FromFormat(b'value=%s', b'invalid\xe9')

# test "%c"
check_format('\uabcd',
Expand All @@ -412,11 +411,13 @@ def check_format(expected, format, *args):
check_format('%abc',
b'%%%s', b'abc')

# truncated string
# test "%s" format with precision
check_format('abc',
b'%.3s', b'abcdef')
check_format('abc[\ufffd',
b'%.5s', 'abc[\u20ac]'.encode('utf8'))
with self.assertRaises(UnicodeDecodeError):
PyUnicode_FromFormat(b'%.5s', 'abc[\u20ac]'.encode('utf8'))
check_format('abc[\u20ac',
b'%.7s', 'abc[\u20ac]'.encode('utf8'))
check_format("'\\u20acABC'",
b'%A', '\u20acABC')
check_format("'\\u20",
Expand All @@ -431,8 +432,8 @@ def check_format(expected, format, *args):
b'%.3U', '\u20acABCDEF')
check_format('\u20acAB',
b'%.3V', '\u20acABCDEF', None)
check_format('abc[\ufffd',
b'%.5V', None, 'abc[\u20ac]'.encode('utf8'))
with self.assertRaises(UnicodeDecodeError):
PyUnicode_FromFormat(b'%.5V', None, 'abc[\u20ac]'.encode('utf8'))

# following tests come from #7330
# test width modifier and precision modifier with %S
Expand Down Expand Up @@ -723,9 +724,9 @@ class LocalType:
check_format('repr=\u4eba\u6c11',
b'repr=%V', None, b'\xe4\xba\xba\xe6\xb0\x91')

#Test replace error handler.
check_format('repr=abc\ufffd',
b'repr=%V', None, b'abc\xff')
# Test the "strict" error handler.
with self.assertRaises(UnicodeDecodeError):
PyUnicode_FromFormat(b'repr=%V', None, b'abc\xff')

# Issue #33817: empty strings
check_format('',
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
:c:func:`PyUnicode_FromFormat` now decodes the ``"%s"`` format argument from
UTF-8 with the "strict" error handler, instead of the "replace" error handler.
Patch by Victor Stinner.
20 changes: 6 additions & 14 deletions Objects/unicodeobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -205,8 +205,7 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
static int
unicode_decode_utf8_writer(_PyUnicodeWriter *writer,
const char *s, Py_ssize_t size,
_Py_error_handler error_handler, const char *errors,
Py_ssize_t *consumed);
_Py_error_handler error_handler, const char *errors);
#ifdef Py_DEBUG
static inline int unicode_is_finalizing(void);
static int unicode_is_singleton(PyObject *unicode);
Expand Down Expand Up @@ -2402,11 +2401,11 @@ unicode_fromformat_write_utf8(_PyUnicodeWriter *writer, const char *str,

if (width < 0) {
return unicode_decode_utf8_writer(writer, str, length,
_Py_ERROR_REPLACE, "replace", NULL);
_Py_ERROR_STRICT, "strict");
}

PyObject *unicode = PyUnicode_DecodeUTF8Stateful(str, length,
"replace", NULL);
PyObject *unicode = unicode_decode_utf8(str, length,
_Py_ERROR_STRICT, "strict", NULL);
if (unicode == NULL)
return -1;

Expand Down Expand Up @@ -4930,13 +4929,9 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
static int
unicode_decode_utf8_writer(_PyUnicodeWriter *writer,
const char *s, Py_ssize_t size,
_Py_error_handler error_handler, const char *errors,
Py_ssize_t *consumed)
_Py_error_handler error_handler, const char *errors)
{
if (size == 0) {
if (consumed) {
*consumed = 0;
}
return 0;
}

Expand All @@ -4954,17 +4949,14 @@ unicode_decode_utf8_writer(_PyUnicodeWriter *writer,
writer->pos += decoded;

if (decoded == size) {
if (consumed) {
*consumed = size;
}
return 0;
}
s += decoded;
size -= decoded;
}

return unicode_decode_utf8_impl(writer, starts, s, end,
error_handler, errors, consumed);
error_handler, errors, NULL);
}


Expand Down
Loading