python · vstinner · Jun 7, 2024
diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst
@@ -487,7 +487,8 @@ APIs:
 
       * - ``s``
         - :c:expr:`const char*` or :c:expr:`const wchar_t*`
-        - A null-terminated C character array.
+        - A null-terminated C character array. A :c:expr:`const char*` argument
+          is decoded from UTF-8 with the "strict" error handler.
 
       * - ``p``
         - :c:expr:`const void*`
@@ -576,6 +577,10 @@ APIs:
    .. versionchanged:: 3.13
       Support for ``%T``, ``%#T``, ``%N`` and ``%#N`` formats added.
 
+   .. versionchanged:: 3.14
+      The ``%s`` and ``%V`` formats now decode their argument from UTF-8 with
+      the "strict" error handler, instead of the "replace" error handler.
+
 
 .. c:function:: PyObject* PyUnicode_FromFormatV(const char *format, va_list vargs)
 

diff --git a/Doc/whatsnew/3.14.rst b/Doc/whatsnew/3.14.rst
@@ -261,6 +261,11 @@ New Features
 Porting to Python 3.14
 ----------------------
 
+* :c:func:`PyUnicode_FromFormat` now decodes ``%s`` and ``%V`` format arguments
+  from UTF-8 with the "strict" error handler, instead of the "replace" error
+  handler.
+  (Contributed by Victor Stinner in :gh:`119182`.)
+
 Deprecated
 ----------
 

diff --git a/Lib/test/test_capi/test_getargs.py b/Lib/test/test_capi/test_getargs.py
@@ -1298,8 +1298,7 @@ def test_nonascii_keywords(self):
                 self.assertEqual(parse((), {}, '|O', [invalid]), (NULL,))
                 self.assertEqual(parse((1,), {'b': 2}, 'O|O', [invalid, 'b']),
                                     (1, 2))
-                with self.assertRaisesRegex(TypeError,
-                        f"function missing required argument '{name}\ufffd'"):
+                with self.assertRaises(UnicodeDecodeError):
                     parse((), {}, 'O', [invalid])
                 with self.assertRaisesRegex(UnicodeDecodeError,
                         f"'utf-8' codec can't decode bytes? "):

diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py
@@ -384,12 +384,11 @@ def check_format(expected, format, *args):
         check_format('ascii\x7f=unicode\xe9',
                      b'ascii\x7f=%U', 'unicode\xe9')
 
-        # non-ascii format, ascii argument: ensure that PyUnicode_FromFormatV()
-        # raises an error
-        self.assertRaisesRegex(ValueError,
-            r'^PyUnicode_FromFormatV\(\) expects an ASCII-encoded format '
-            'string, got a non-ASCII byte: 0xe9$',
-            PyUnicode_FromFormat, b'unicode\xe9=%s', 'ascii')
+        # "%s" format decodes its argument from UTF-8/strict
+        check_format('value=\u20ac',
+                     b'value=%s', '\u20ac'.encode())
+        with self.assertRaises(UnicodeDecodeError):
+            PyUnicode_FromFormat(b'value=%s', b'invalid\xe9')
 
         # test "%c"
         check_format('\uabcd',
@@ -412,11 +411,13 @@ def check_format(expected, format, *args):
         check_format('%abc',
                      b'%%%s', b'abc')
 
-        # truncated string
+        # test "%s" format with precision
         check_format('abc',
                      b'%.3s', b'abcdef')
-        check_format('abc[\ufffd',
-                     b'%.5s', 'abc[\u20ac]'.encode('utf8'))
+        with self.assertRaises(UnicodeDecodeError):
+            PyUnicode_FromFormat(b'%.5s', 'abc[\u20ac]'.encode('utf8'))
+        check_format('abc[\u20ac',
+                     b'%.7s', 'abc[\u20ac]'.encode('utf8'))
         check_format("'\\u20acABC'",
                      b'%A', '\u20acABC')
         check_format("'\\u20",
@@ -431,8 +432,8 @@ def check_format(expected, format, *args):
                      b'%.3U', '\u20acABCDEF')
         check_format('\u20acAB',
                      b'%.3V', '\u20acABCDEF', None)
-        check_format('abc[\ufffd',
-                     b'%.5V', None, 'abc[\u20ac]'.encode('utf8'))
+        with self.assertRaises(UnicodeDecodeError):
+            PyUnicode_FromFormat(b'%.5V', None, 'abc[\u20ac]'.encode('utf8'))
 
         # following tests comes from #7330
         # test width modifier and precision modifier with %S
@@ -723,9 +724,9 @@ class LocalType:
         check_format('repr=\u4eba\u6c11',
                      b'repr=%V', None, b'\xe4\xba\xba\xe6\xb0\x91')
 
-        #Test replace error handler.
-        check_format('repr=abc\ufffd',
-                     b'repr=%V', None, b'abc\xff')
+        # Test the "strict" error handler.
+        with self.assertRaises(UnicodeDecodeError):
+            PyUnicode_FromFormat(b'repr=%V', None, b'abc\xff')
 
         # Issue #33817: empty strings
         check_format('',

diff --git a/Misc/NEWS.d/next/C API/2024-06-07-22-38-08.gh-issue-119182.P3nXBm.rst b/Misc/NEWS.d/next/C API/2024-06-07-22-38-08.gh-issue-119182.P3nXBm.rst
@@ -0,0 +1,3 @@
+:c:func:`PyUnicode_FromFormat` now decodes ``%s`` and ``%V`` format arguments
+from UTF-8 with the "strict" error handler, instead of the "replace" error
+handler. Patch by Victor Stinner.
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
@@ -205,8 +205,7 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
 static int
 unicode_decode_utf8_writer(_PyUnicodeWriter *writer,
                            const char *s, Py_ssize_t size,
-                           _Py_error_handler error_handler, const char *errors,
-                           Py_ssize_t *consumed);
+                           _Py_error_handler error_handler, const char *errors);
 #ifdef Py_DEBUG
 static inline int unicode_is_finalizing(void);
 static int unicode_is_singleton(PyObject *unicode);
@@ -2402,11 +2401,11 @@ unicode_fromformat_write_utf8(_PyUnicodeWriter *writer, const char *str,
 
     if (width < 0) {
         return unicode_decode_utf8_writer(writer, str, length,
-                                          _Py_ERROR_REPLACE, "replace", NULL);
+                                          _Py_ERROR_STRICT, "strict");
     }
 
-    PyObject *unicode = PyUnicode_DecodeUTF8Stateful(str, length,
-                                                     "replace", NULL);
+    PyObject *unicode = unicode_decode_utf8(str, length,
+                                            _Py_ERROR_STRICT, "strict", NULL);
     if (unicode == NULL)
         return -1;
 
@@ -4930,13 +4929,9 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
 static int
 unicode_decode_utf8_writer(_PyUnicodeWriter *writer,
                            const char *s, Py_ssize_t size,
-                           _Py_error_handler error_handler, const char *errors,
-                           Py_ssize_t *consumed)
+                           _Py_error_handler error_handler, const char *errors)
 {
     if (size == 0) {
-        if (consumed) {
-            *consumed = 0;
-        }
         return 0;
     }
 
@@ -4954,17 +4949,14 @@ unicode_decode_utf8_writer(_PyUnicodeWriter *writer,
         writer->pos += decoded;
 
         if (decoded == size) {
-            if (consumed) {
-                *consumed = size;
-            }
             return 0;
         }
         s += decoded;
         size -= decoded;
     }
 
     return unicode_decode_utf8_impl(writer, starts, s, end,
-                                    error_handler, errors, consumed);
+                                    error_handler, errors, NULL);
 }