Skip to content

Commit 8aa73b7

Browse files
committed
gh-119182: Add PyUnicodeWriter_DecodeUTF8Stateful()
Add PyUnicodeWriter_WriteWideChar() and PyUnicodeWriter_DecodeUTF8Stateful() functions.
1 parent 5c4235c commit 8aa73b7

File tree

5 files changed

+209
-3
lines changed

5 files changed

+209
-3
lines changed

Doc/c-api/unicode.rst

Lines changed: 30 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1551,9 +1551,17 @@ object.
15511551
On success, return ``0``.
15521552
On error, set an exception, leave the writer unchanged, and return ``-1``.
15531553
1554-
To use a different error handler than ``strict``,
1555-
:c:func:`PyUnicode_DecodeUTF8` can be used with
1556-
:c:func:`PyUnicodeWriter_WriteStr`.
1554+
See also :c:func:`PyUnicodeWriter_DecodeUTF8Stateful`.
1555+
1556+
.. c:function:: int PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *writer, wchar_t *str, Py_ssize_t size)
1557+
1558+
Write the wide string *str* into *writer*.
1559+
1560+
*size* is a number of wide characters. If *size* is equal to ``-1``, call
1561+
``wcslen(str)`` to get the string length.
1562+
1563+
On success, return ``0``.
1564+
On error, set an exception, leave the writer unchanged, and return ``-1``.
15571565
15581566
.. c:function:: int PyUnicodeWriter_WriteStr(PyUnicodeWriter *writer, PyObject *obj)
15591567
@@ -1586,3 +1594,22 @@ object.
15861594
15871595
On success, return ``0``.
15881596
On error, set an exception, leave the writer unchanged, and return ``-1``.
1597+
1598+
.. c:function:: int PyUnicodeWriter_DecodeUTF8Stateful(PyUnicodeWriter *writer, const char *string, Py_ssize_t length, const char *errors, Py_ssize_t *consumed)
1599+
1600+
Decode the string *string* from UTF-8 with *errors* error handler and write the
1601+
output into *writer*.
1602+
1603+
*length* is the string length in bytes. If *length* is equal to ``-1``, call
1604+
``strlen(string)`` to get the string length.
1605+
1606+
*errors* is an error handler name, such as ``"replace"``. If *errors* is
1607+
``NULL``, use the strict error handler.
1608+
1609+
If *consumed* is not ``NULL``, set *\*consumed* to the number of decoded
1610+
bytes on success.
1611+
1612+
On success, return ``0``.
1613+
On error, set an exception, leave the writer unchanged, and return ``-1``.
1614+
1615+
See also :c:func:`PyUnicodeWriter_WriteUTF8`.

Doc/whatsnew/3.14.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -291,10 +291,12 @@ New Features
291291
* :c:func:`PyUnicodeWriter_Finish`.
292292
* :c:func:`PyUnicodeWriter_WriteChar`.
293293
* :c:func:`PyUnicodeWriter_WriteUTF8`.
294+
* :c:func:`PyUnicodeWriter_WriteWideChar`.
294295
* :c:func:`PyUnicodeWriter_WriteStr`.
295296
* :c:func:`PyUnicodeWriter_WriteRepr`.
296297
* :c:func:`PyUnicodeWriter_WriteSubstring`.
297298
* :c:func:`PyUnicodeWriter_Format`.
299+
* :c:func:`PyUnicodeWriter_DecodeUTF8Stateful`.
298300

299301
(Contributed by Victor Stinner in :gh:`119182`.)
300302

Include/cpython/unicodeobject.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -459,6 +459,10 @@ PyAPI_FUNC(int) PyUnicodeWriter_WriteUTF8(
459459
PyUnicodeWriter *writer,
460460
const char *str,
461461
Py_ssize_t size);
462+
PyAPI_FUNC(int) PyUnicodeWriter_WriteWideChar(
463+
PyUnicodeWriter *writer,
464+
wchar_t *str,
465+
Py_ssize_t size);
462466

463467
PyAPI_FUNC(int) PyUnicodeWriter_WriteStr(
464468
PyUnicodeWriter *writer,
@@ -475,6 +479,12 @@ PyAPI_FUNC(int) PyUnicodeWriter_Format(
475479
PyUnicodeWriter *writer,
476480
const char *format,
477481
...);
482+
PyAPI_FUNC(int) PyUnicodeWriter_DecodeUTF8Stateful(
483+
PyUnicodeWriter *writer,
484+
const char *string, /* UTF-8 encoded string */
485+
Py_ssize_t length, /* size of string */
486+
const char *errors, /* error handling */
487+
Py_ssize_t *consumed); /* bytes consumed */
478488

479489

480490
/* --- Private _PyUnicodeWriter API --------------------------------------- */

Modules/_testcapi/unicode.c

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -374,6 +374,88 @@ test_unicodewriter_recover_error(PyObject *self, PyObject *Py_UNUSED(args))
374374
}
375375

376376

377+
/* Exercise PyUnicodeWriter_DecodeUTF8Stateful() with the "ignore" and
   "replace" error handlers.  Per the expected result below, the invalid
   0xFF byte is dropped by the first call and becomes U+FFFD (EF BF BD in
   UTF-8) in the second. */
static PyObject *
test_unicodewriter_decode_utf8(PyObject *self, PyObject *Py_UNUSED(args))
{
    PyUnicodeWriter *writer = PyUnicodeWriter_Create(0);
    if (!writer) {
        return NULL;
    }

    // The writes run left to right; the first failure short-circuits the
    // rest and the writer is discarded.
    if (PyUnicodeWriter_DecodeUTF8Stateful(writer, "ign\xFFore", -1,
                                           "ignore", NULL) < 0
        || PyUnicodeWriter_WriteChar(writer, '-') < 0
        || PyUnicodeWriter_DecodeUTF8Stateful(writer, "replace\xFF", -1,
                                              "replace", NULL) < 0)
    {
        PyUnicodeWriter_Discard(writer);
        return NULL;
    }

    PyObject *str = PyUnicodeWriter_Finish(writer);
    if (!str) {
        return NULL;
    }
    assert(PyUnicode_EqualToUTF8(str, "ignore-replace\xef\xbf\xbd"));
    Py_DECREF(str);

    Py_RETURN_NONE;
}
408+
409+
410+
/* Exercise the *consumed* output of PyUnicodeWriter_DecodeUTF8Stateful().

   Checks that *consumed* receives the number of input bytes on success
   (including a byte skipped by the "ignore" handler), that it is reset to 0
   when decoding fails, and that a failed decode leaves the writer content
   untouched (the final string is "text-more"). */
static PyObject *
test_unicodewriter_decode_utf8_consumed(PyObject *self, PyObject *Py_UNUSED(args))
{
    PyUnicodeWriter *writer = PyUnicodeWriter_Create(0);
    if (writer == NULL) {
        return NULL;
    }
    Py_ssize_t consumed;
    int res;

    // valid string
    consumed = 12345;  // sentinel value: the call must overwrite it
    if (PyUnicodeWriter_DecodeUTF8Stateful(writer, "text", -1, NULL, &consumed) < 0) {
        goto error;
    }
    assert(consumed == 4);

    if (PyUnicodeWriter_WriteChar(writer, '-') < 0) {
        goto error;
    }

    // consumed is 0 if write fails
    consumed = 12345;
    // Keep the call outside assert(): an expression inside assert() is not
    // evaluated at all when NDEBUG is defined, which would skip the decode.
    res = PyUnicodeWriter_DecodeUTF8Stateful(writer, "invalid\xFF", -1, NULL, &consumed);
    assert(res < 0);
    (void)res;  // avoid an unused-variable warning when assert() is a no-op
    PyErr_Clear();
    assert(consumed == 0);

    // ignore error handler
    consumed = 12345;
    if (PyUnicodeWriter_DecodeUTF8Stateful(writer, "more\xFF", -1, "ignore", &consumed) < 0) {
        goto error;
    }
    assert(consumed == 5);

    PyObject *result = PyUnicodeWriter_Finish(writer);
    if (result == NULL) {
        return NULL;
    }
    assert(PyUnicode_EqualToUTF8(result, "text-more"));
    Py_DECREF(result);

    Py_RETURN_NONE;

error:
    PyUnicodeWriter_Discard(writer);
    return NULL;
}
457+
458+
377459
static PyObject *
378460
test_unicodewriter_format(PyObject *self, PyObject *Py_UNUSED(args))
379461
{
@@ -436,6 +518,42 @@ test_unicodewriter_format_recover_error(PyObject *self, PyObject *Py_UNUSED(args
436518
}
437519

438520

521+
/* Exercise PyUnicodeWriter_WriteWideChar() with explicit sizes and with
   size -1.  The explicit size 8 stops the first literal before " IGNORED";
   the expected result is compared as UTF-8. */
static PyObject *
test_unicodewriter_widechar(PyObject *self, PyObject *Py_UNUSED(args))
{
    PyUnicodeWriter *writer = PyUnicodeWriter_Create(0);
    if (!writer) {
        return NULL;
    }

    // Evaluated left to right; the first failing write stops the chain.
    int failed =
        PyUnicodeWriter_WriteWideChar(writer, L"latin1=\xE9 IGNORED", 8) < 0
        || PyUnicodeWriter_WriteWideChar(writer, L"-", 1) < 0
        || PyUnicodeWriter_WriteWideChar(writer, L"euro=\u20AC", -1) < 0
        || PyUnicodeWriter_WriteChar(writer, '.') < 0;
    if (failed) {
        PyUnicodeWriter_Discard(writer);
        return NULL;
    }

    PyObject *str = PyUnicodeWriter_Finish(writer);
    if (!str) {
        return NULL;
    }
    assert(PyUnicode_EqualToUTF8(str,
                                 "latin1=\xC3\xA9-euro=\xE2\x82\xAC."));
    Py_DECREF(str);

    Py_RETURN_NONE;
}
555+
556+
439557
static PyMethodDef TestMethods[] = {
440558
{"unicode_new", unicode_new, METH_VARARGS},
441559
{"unicode_fill", unicode_fill, METH_VARARGS},
@@ -448,8 +566,11 @@ static PyMethodDef TestMethods[] = {
448566
{"test_unicodewriter_utf8", test_unicodewriter_utf8, METH_NOARGS},
449567
{"test_unicodewriter_invalid_utf8", test_unicodewriter_invalid_utf8, METH_NOARGS},
450568
{"test_unicodewriter_recover_error", test_unicodewriter_recover_error, METH_NOARGS},
569+
{"test_unicodewriter_decode_utf8", test_unicodewriter_decode_utf8, METH_NOARGS},
570+
{"test_unicodewriter_decode_utf8_consumed", test_unicodewriter_decode_utf8_consumed, METH_NOARGS},
451571
{"test_unicodewriter_format", test_unicodewriter_format, METH_NOARGS},
452572
{"test_unicodewriter_format_recover_error", test_unicodewriter_format_recover_error, METH_NOARGS},
573+
{"test_unicodewriter_widechar", test_unicodewriter_widechar, METH_NOARGS},
453574
{NULL},
454575
};
455576

Objects/unicodeobject.c

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13500,6 +13500,52 @@ PyUnicodeWriter_WriteUTF8(PyUnicodeWriter *writer,
1350013500
return res;
1350113501
}
1350213502

13503+
13504+
int
13505+
PyUnicodeWriter_DecodeUTF8Stateful(PyUnicodeWriter *writer,
13506+
const char *string,
13507+
Py_ssize_t length,
13508+
const char *errors,
13509+
Py_ssize_t *consumed)
13510+
{
13511+
if (length < 0) {
13512+
length = strlen(string);
13513+
}
13514+
13515+
_PyUnicodeWriter *_writer = (_PyUnicodeWriter*)writer;
13516+
Py_ssize_t old_pos = _writer->pos;
13517+
int res = unicode_decode_utf8_writer(_writer, string, length,
13518+
_Py_ERROR_UNKNOWN, errors, consumed);
13519+
if (res < 0) {
13520+
_writer->pos = old_pos;
13521+
if (consumed) {
13522+
*consumed = 0;
13523+
}
13524+
}
13525+
return res;
13526+
}
13527+
13528+
13529+
int
13530+
PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *writer,
13531+
wchar_t *str,
13532+
Py_ssize_t size)
13533+
{
13534+
if (size < 0) {
13535+
size = wcslen(str);
13536+
}
13537+
PyObject *obj = PyUnicode_FromWideChar(str, size);
13538+
if (obj == NULL) {
13539+
return -1;
13540+
}
13541+
13542+
_PyUnicodeWriter *_writer = (_PyUnicodeWriter *)writer;
13543+
int res = _PyUnicodeWriter_WriteStr(_writer, obj);
13544+
Py_DECREF(obj);
13545+
return res;
13546+
}
13547+
13548+
1350313549
int
1350413550
_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
1350513551
const char *str, Py_ssize_t len)

0 commit comments

Comments
 (0)