From d6d0203f20d3d88b46a0ace3b8db99c55305ed0c Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Thu, 20 Jun 2024 10:59:41 +0200 Subject: [PATCH 1/6] gh-119182: Add PyUnicodeWriter_WriteUCS4() function --- Doc/c-api/unicode.rst | 10 +++++ Doc/whatsnew/3.14.rst | 1 + Include/cpython/unicodeobject.h | 4 ++ Lib/test/test_capi/test_unicode.py | 18 +++++++- ...-06-07-22-12-30.gh-issue-119182.yt8Ar7.rst | 3 ++ Modules/_testcapi/unicode.c | 31 +++++++++++++ Objects/unicodeobject.c | 44 +++++++++++++++++++ 7 files changed, 110 insertions(+), 1 deletion(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 4ea20bde38c1db..3b4b5162daf24e 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -1563,6 +1563,16 @@ object. On success, return ``0``. On error, set an exception, leave the writer unchanged, and return ``-1``. +.. c:function:: int PyUnicodeWriter_WriteUCS4(PyUnicodeWriter *writer, Py_UCS4 *str, Py_ssize_t size) + + Writer the UCS4 string *str* into *writer*. + + *size* is a number of UCS4 characters. If *size* is equal to ``-1``, get the + string length (search the NUL character). + + On success, return ``0``. + On error, set an exception, leave the writer unchanged, and return ``-1``. + .. c:function:: int PyUnicodeWriter_WriteStr(PyUnicodeWriter *writer, PyObject *obj) Call :c:func:`PyObject_Str` on *obj* and write the output into *writer*. diff --git a/Doc/whatsnew/3.14.rst b/Doc/whatsnew/3.14.rst index 2eefa232cdcd02..806e4a9c62b4ea 100644 --- a/Doc/whatsnew/3.14.rst +++ b/Doc/whatsnew/3.14.rst @@ -291,6 +291,7 @@ New Features * :c:func:`PyUnicodeWriter_Finish`. * :c:func:`PyUnicodeWriter_WriteChar`. * :c:func:`PyUnicodeWriter_WriteUTF8`. + * :c:func:`PyUnicodeWriter_WriteUCS4`. * :c:func:`PyUnicodeWriter_WriteWideChar`. * :c:func:`PyUnicodeWriter_WriteStr`. * :c:func:`PyUnicodeWriter_WriteRepr`. diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h index 059bec8618c8d9..91799137101280 100644 --- a/Include/cpython/unicodeobject.h +++ b/Include/cpython/unicodeobject.h @@ -463,6 +463,10 @@ PyAPI_FUNC(int) PyUnicodeWriter_WriteWideChar( PyUnicodeWriter *writer, const wchar_t *str, Py_ssize_t size); +PyAPI_FUNC(int) PyUnicodeWriter_WriteUCS4( + PyUnicodeWriter *writer, + Py_UCS4 *str, + Py_ssize_t size); PyAPI_FUNC(int) PyUnicodeWriter_WriteStr( PyUnicodeWriter *writer, diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index 36106b0730dd26..f3c1e06fbdbe4f 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -1784,8 +1784,24 @@ def test_widechar(self): writer.write_widechar("latin1=\xE9") writer.write_widechar("-") writer.write_widechar("euro=\u20AC") + writer.write_char("-") + writer.write_ucs4("max=\U0010ffff", -1) writer.write_char('.') - self.assertEqual(writer.finish(), "latin1=\xE9-euro=\u20AC.") + self.assertEqual(writer.finish(), + "latin1=\xE9-euro=\u20AC-max=\U0010ffff.") + + def test_ucs4(self): + writer = self.create_writer(0) + writer.write_ucs4("ascii", -1) + writer.write_char("-") + writer.write_ucs4("latin1=\xe9", -1) + writer.write_char("-") + writer.write_ucs4("euro=\u20ac", -1) + writer.write_char("-") + writer.write_ucs4("max=\U0010ffff", -1) + writer.write_char(".") + self.assertEqual(writer.finish(), + "ascii-latin1=\xE9-euro=\u20AC-max=\U0010ffff.") @unittest.skipIf(ctypes is None, 'need ctypes') diff --git a/Misc/NEWS.d/next/C API/2024-06-07-22-12-30.gh-issue-119182.yt8Ar7.rst b/Misc/NEWS.d/next/C API/2024-06-07-22-12-30.gh-issue-119182.yt8Ar7.rst index 3d1384c9f3252f..243f290fbd47e2 100644 --- a/Misc/NEWS.d/next/C API/2024-06-07-22-12-30.gh-issue-119182.yt8Ar7.rst +++ b/Misc/NEWS.d/next/C API/2024-06-07-22-12-30.gh-issue-119182.yt8Ar7.rst @@ -5,9 +5,12 @@ Add a new :c:type:`PyUnicodeWriter` API to create a Python :class:`str` object: * :c:func:`PyUnicodeWriter_Finish`. * :c:func:`PyUnicodeWriter_WriteChar`. * :c:func:`PyUnicodeWriter_WriteUTF8`. +* :c:func:`PyUnicodeWriter_WriteUCS4`. +* :c:func:`PyUnicodeWriter_WriteWideChar`. * :c:func:`PyUnicodeWriter_WriteStr`. * :c:func:`PyUnicodeWriter_WriteRepr`. * :c:func:`PyUnicodeWriter_WriteSubstring`. * :c:func:`PyUnicodeWriter_Format`. +* :c:func:`PyUnicodeWriter_DecodeUTF8Stateful`. Patch by Victor Stinner. diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c index c723e087baa308..b8ecf53f4f8b9c 100644 --- a/Modules/_testcapi/unicode.c +++ b/Modules/_testcapi/unicode.c @@ -360,6 +360,36 @@ writer_write_widechar(PyObject *self_raw, PyObject *args) } +static PyObject* +writer_write_ucs4(PyObject *self_raw, PyObject *args) +{ + WriterObject *self = (WriterObject *)self_raw; + if (writer_check(self) < 0) { + return NULL; + } + + PyObject *str; + Py_ssize_t size; + if (!PyArg_ParseTuple(args, "Un", &str, &size)) { + return NULL; + } + Py_ssize_t len = PyUnicode_GET_LENGTH(str); + size = Py_MIN(size, len); + + Py_UCS4 *ucs4 = PyUnicode_AsUCS4Copy(str); + if (ucs4 == NULL) { + return NULL; + } + + int res = PyUnicodeWriter_WriteUCS4(self->writer, ucs4, size); + PyMem_Free(ucs4); + if (res < 0) { + return NULL; + } + Py_RETURN_NONE; +} + + static PyObject* writer_write_str(PyObject *self_raw, PyObject *args) { @@ -484,6 +514,7 @@ static PyMethodDef writer_methods[] = { {"write_char", _PyCFunction_CAST(writer_write_char), METH_VARARGS}, {"write_utf8", _PyCFunction_CAST(writer_write_utf8), METH_VARARGS}, {"write_widechar", _PyCFunction_CAST(writer_write_widechar), METH_VARARGS}, + {"write_ucs4", _PyCFunction_CAST(writer_write_ucs4), METH_VARARGS}, {"write_str", _PyCFunction_CAST(writer_write_str), METH_VARARGS}, {"write_repr", _PyCFunction_CAST(writer_write_repr), METH_VARARGS}, {"write_substring", _PyCFunction_CAST(writer_write_substring), METH_VARARGS}, diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 74a743812c9c78..b0a47032c84d91 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2289,6 +2289,50 @@ _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size) return res; } + +int +PyUnicodeWriter_WriteUCS4(PyUnicodeWriter *pub_writer, + Py_UCS4 *str, + Py_ssize_t size) +{ + _PyUnicodeWriter *writer = (_PyUnicodeWriter*)pub_writer; + + if (size < 0) { + size = 0; + for (; str[size] != '\0'; size++); + } + + if (size == 0) { + return 0; + } + + Py_UCS4 max_char = ucs4lib_find_max_char(str, str + size); + + if (_PyUnicodeWriter_Prepare(writer, size, max_char) < 0) { + return -1; + } + + int kind = writer->kind; + void *data = (Py_UCS1*)writer->data + writer->pos * kind; + if (kind == PyUnicode_1BYTE_KIND) { + _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, + str, str + size, + data); + } + else if (kind == PyUnicode_2BYTE_KIND) { + _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, + str, str + size, + data); + } + else { + memcpy(data, str, size * sizeof(Py_UCS4)); + } + writer->pos += size; + + return 0; +} + + PyObject* PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size) { From aefcbf898adee901bf3990310a039c6618237f8c Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Mon, 24 Jun 2024 14:33:02 +0200 Subject: [PATCH 2/6] size must be positive --- Doc/c-api/unicode.rst | 3 +-- Objects/unicodeobject.c | 5 +++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 3b4b5162daf24e..246cf47df62e78 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -1567,8 +1567,7 @@ object. Writer the UCS4 string *str* into *writer*. - *size* is a number of UCS4 characters. If *size* is equal to ``-1``, get the - string length (search the NUL character). + *size* is a number of UCS4 characters. On success, return ``0``. On error, set an exception, leave the writer unchanged, and return ``-1``. diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 2e814b42f36fe8..c6dc9f09151796 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2298,8 +2298,9 @@ PyUnicodeWriter_WriteUCS4(PyUnicodeWriter *pub_writer, _PyUnicodeWriter *writer = (_PyUnicodeWriter*)pub_writer; if (size < 0) { - size = 0; - for (; str[size] != '\0'; size++); + PyErr_SetString(PyExc_TypeError, + "size must be positive"); + return NULL; } if (size == 0) { From e18a47c3a708d5b0723b7945eb496cc720c2aa3f Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Mon, 24 Jun 2024 14:36:05 +0200 Subject: [PATCH 3/6] Use PyUnicodeWriter_WriteUCS4() on Solaris --- Objects/unicodeobject.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index c6dc9f09151796..6966120465a4ee 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2035,11 +2035,9 @@ PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *pub_writer, if (!converted) { return -1; } - PyObject *unicode = _PyUnicode_FromUCS4(converted, size); - PyMem_Free(converted); - int res = _PyUnicodeWriter_WriteStr(writer, unicode); - Py_DECREF(unicode); + int res = PyUnicodeWriter_WriteUCS4(pub_writer, converted, size); + PyMem_Free(converted); return res; } #endif From 056c42e2e152fd1cdd8c1595f3e67b7bd92f9b16 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Mon, 24 Jun 2024 14:58:17 +0200 Subject: [PATCH 4/6] Fix typo --- Objects/unicodeobject.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 6966120465a4ee..f7db0b5d1f27e6 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2298,7 +2298,7 @@ PyUnicodeWriter_WriteUCS4(PyUnicodeWriter *pub_writer, if (size < 0) { PyErr_SetString(PyExc_TypeError, "size must be positive"); - return NULL; + return -1; } if (size == 0) { From 8c7691a9dd46bf12c7f72c24fea8ea4d12e709fd Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Mon, 24 Jun 2024 15:13:29 +0200 Subject: [PATCH 5/6] Update tests --- Lib/test/test_capi/test_unicode.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index f3c1e06fbdbe4f..91737a54820209 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -1785,20 +1785,20 @@ def test_widechar(self): writer.write_widechar("-") writer.write_widechar("euro=\u20AC") writer.write_char("-") - writer.write_ucs4("max=\U0010ffff", -1) + writer.write_widechar("max=\U0010ffff") writer.write_char('.') self.assertEqual(writer.finish(), "latin1=\xE9-euro=\u20AC-max=\U0010ffff.") def test_ucs4(self): writer = self.create_writer(0) - writer.write_ucs4("ascii", -1) + writer.write_ucs4("ascii", 5) writer.write_char("-") - writer.write_ucs4("latin1=\xe9", -1) + writer.write_ucs4("latin1=\xe9", 8) writer.write_char("-") - writer.write_ucs4("euro=\u20ac", -1) + writer.write_ucs4("euro=\u20ac", 6) writer.write_char("-") - writer.write_ucs4("max=\U0010ffff", -1) + writer.write_ucs4("max=\U0010ffff", 5) writer.write_char(".") self.assertEqual(writer.finish(), "ascii-latin1=\xE9-euro=\u20AC-max=\U0010ffff.") From 76e66dd101ef4eca5704c70f82aa6aaa18ab0eda Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Mon, 24 Jun 2024 15:22:40 +0200 Subject: [PATCH 6/6] Add more tests --- Lib/test/test_capi/test_unicode.py | 20 +++++++++++++++++++- Objects/unicodeobject.c | 4 ++-- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index 91737a54820209..8f9def2e650e56 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -1792,7 +1792,7 @@ def test_widechar(self): def test_ucs4(self): writer = self.create_writer(0) - writer.write_ucs4("ascii", 5) + writer.write_ucs4("ascii IGNORED", 5) writer.write_char("-") writer.write_ucs4("latin1=\xe9", 8) writer.write_char("-") @@ -1803,6 +1803,24 @@ def test_ucs4(self): self.assertEqual(writer.finish(), "ascii-latin1=\xE9-euro=\u20AC-max=\U0010ffff.") + # Test some special characters + writer = self.create_writer(0) + # Lone surrogate character + writer.write_ucs4("lone\uDC80", 5) + writer.write_char("-") + # Surrogate pair + writer.write_ucs4("pair\uDBFF\uDFFF", 5) + writer.write_char("-") + writer.write_ucs4("null[\0]", 7) + self.assertEqual(writer.finish(), + "lone\udc80-pair\udbff-null[\0]") + + # invalid size + writer = self.create_writer(0) + with self.assertRaises(ValueError): + writer.write_ucs4("text", -1) + + @unittest.skipIf(ctypes is None, 'need ctypes') class PyUnicodeWriterFormatTest(unittest.TestCase): diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index f7db0b5d1f27e6..8b7e8dae6ee989 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2296,7 +2296,7 @@ PyUnicodeWriter_WriteUCS4(PyUnicodeWriter *pub_writer, _PyUnicodeWriter *writer = (_PyUnicodeWriter*)pub_writer; if (size < 0) { - PyErr_SetString(PyExc_TypeError, + PyErr_SetString(PyExc_ValueError, "size must be positive"); return -1; } @@ -13391,7 +13391,7 @@ PyUnicodeWriter* PyUnicodeWriter_Create(Py_ssize_t length) { if (length < 0) { - PyErr_SetString(PyExc_TypeError, + PyErr_SetString(PyExc_ValueError, "length must be positive"); return NULL; }