Skip to content

gh-119396: Optimize PyUnicode_FromFormat() UTF-8 decoder #119398

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
May 22, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
203 changes: 141 additions & 62 deletions Objects/unicodeobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,11 @@ static PyObject *
unicode_decode_utf8(const char *s, Py_ssize_t size,
_Py_error_handler error_handler, const char *errors,
Py_ssize_t *consumed);
static int
unicode_decode_utf8_writer(_PyUnicodeWriter *writer,
const char *s, Py_ssize_t size,
_Py_error_handler error_handler, const char *errors,
Py_ssize_t *consumed);
#ifdef Py_DEBUG
static inline int unicode_is_finalizing(void);
static int unicode_is_singleton(PyObject *unicode);
Expand Down Expand Up @@ -2377,14 +2382,11 @@ unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
}

static int
unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
unicode_fromformat_write_utf8(_PyUnicodeWriter *writer, const char *str,
Py_ssize_t width, Py_ssize_t precision, int flags)
{
/* UTF-8 */
Py_ssize_t length;
PyObject *unicode;
int res;

if (precision == -1) {
length = strlen(str);
}
Expand All @@ -2394,11 +2396,19 @@ unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
length++;
}
}
unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);

if (width < 0) {
return unicode_decode_utf8_writer(writer, str, length,
_Py_ERROR_REPLACE, "replace", NULL);
}

PyObject *unicode = PyUnicode_DecodeUTF8Stateful(str, length,
"replace", NULL);
if (unicode == NULL)
return -1;

res = unicode_fromformat_write_str(writer, unicode, width, -1, flags);
int res = unicode_fromformat_write_str(writer, unicode,
width, -1, flags);
Py_DECREF(unicode);
return res;
}
Expand Down Expand Up @@ -2700,7 +2710,7 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
else {
/* UTF-8 */
const char *s = va_arg(*vargs, const char*);
if (unicode_fromformat_write_cstr(writer, s, width, precision, flags) < 0)
if (unicode_fromformat_write_utf8(writer, s, width, precision, flags) < 0)
return NULL;
}
break;
Expand Down Expand Up @@ -2739,7 +2749,7 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
}
else {
assert(str != NULL);
if (unicode_fromformat_write_cstr(writer, str, width, precision, flags) < 0)
if (unicode_fromformat_write_utf8(writer, str, width, precision, flags) < 0)
return NULL;
}
break;
Expand Down Expand Up @@ -4737,65 +4747,33 @@ ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
return p - start;
}

static PyObject *
unicode_decode_utf8(const char *s, Py_ssize_t size,
_Py_error_handler error_handler, const char *errors,
Py_ssize_t *consumed)
{
if (size == 0) {
if (consumed)
*consumed = 0;
_Py_RETURN_UNICODE_EMPTY();
}

/* ASCII is equivalent to the first 128 ordinals in Unicode. */
if (size == 1 && (unsigned char)s[0] < 128) {
if (consumed) {
*consumed = 1;
}
return get_latin1_char((unsigned char)s[0]);
}

const char *starts = s;
const char *end = s + size;

// fast path: try ASCII string.
PyObject *u = PyUnicode_New(size, 127);
if (u == NULL) {
return NULL;
}
s += ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
if (s == end) {
if (consumed) {
*consumed = size;
}
return u;
}

// Use _PyUnicodeWriter after fast path is failed.
_PyUnicodeWriter writer;
_PyUnicodeWriter_InitWithBuffer(&writer, u);
writer.pos = s - starts;

static int
unicode_decode_utf8_impl(_PyUnicodeWriter *writer,
const char *starts, const char *s, const char *end,
_Py_error_handler error_handler,
const char *errors,
Py_ssize_t *consumed)
{
Py_ssize_t startinpos, endinpos;
const char *errmsg = "";
PyObject *error_handler_obj = NULL;
PyObject *exc = NULL;

while (s < end) {
Py_UCS4 ch;
int kind = writer.kind;
int kind = writer->kind;

if (kind == PyUnicode_1BYTE_KIND) {
if (PyUnicode_IS_ASCII(writer.buffer))
ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
if (PyUnicode_IS_ASCII(writer->buffer))
ch = asciilib_utf8_decode(&s, end, writer->data, &writer->pos);
else
ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
ch = ucs1lib_utf8_decode(&s, end, writer->data, &writer->pos);
} else if (kind == PyUnicode_2BYTE_KIND) {
ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
ch = ucs2lib_utf8_decode(&s, end, writer->data, &writer->pos);
} else {
assert(kind == PyUnicode_4BYTE_KIND);
ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
ch = ucs4lib_utf8_decode(&s, end, writer->data, &writer->pos);
}

switch (ch) {
Expand Down Expand Up @@ -4826,7 +4804,9 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
endinpos = startinpos + ch - 1;
break;
default:
if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
// ch doesn't fit into kind, so change the buffer kind to write
// the character
if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
goto onError;
continue;
}
Expand All @@ -4840,7 +4820,7 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
break;

case _Py_ERROR_REPLACE:
if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
if (_PyUnicodeWriter_WriteCharInline(writer, 0xfffd) < 0)
goto onError;
s += (endinpos - startinpos);
break;
Expand All @@ -4849,13 +4829,13 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
{
Py_ssize_t i;

if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
if (_PyUnicodeWriter_PrepareKind(writer, PyUnicode_2BYTE_KIND) < 0)
goto onError;
for (i=startinpos; i<endinpos; i++) {
ch = (Py_UCS4)(unsigned char)(starts[i]);
PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
PyUnicode_WRITE(writer->kind, writer->data, writer->pos,
ch + 0xdc00);
writer.pos++;
writer->pos++;
}
s += (endinpos - startinpos);
break;
Expand All @@ -4866,8 +4846,13 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
errors, &error_handler_obj,
"utf-8", errmsg,
&starts, &end, &startinpos, &endinpos, &exc, &s,
&writer))
writer)) {
goto onError;
}

if (_PyUnicodeWriter_Prepare(writer, end - s, 127) < 0) {
return -1;
}
}
}

Expand All @@ -4877,13 +4862,107 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,

Py_XDECREF(error_handler_obj);
Py_XDECREF(exc);
return _PyUnicodeWriter_Finish(&writer);
return 0;

onError:
Py_XDECREF(error_handler_obj);
Py_XDECREF(exc);
_PyUnicodeWriter_Dealloc(&writer);
return NULL;
return -1;
}


static PyObject *
unicode_decode_utf8(const char *s, Py_ssize_t size,
_Py_error_handler error_handler, const char *errors,
Py_ssize_t *consumed)
{
if (size == 0) {
if (consumed) {
*consumed = 0;
}
_Py_RETURN_UNICODE_EMPTY();
}

/* ASCII is equivalent to the first 128 ordinals in Unicode. */
if (size == 1 && (unsigned char)s[0] < 128) {
if (consumed) {
*consumed = 1;
}
return get_latin1_char((unsigned char)s[0]);
}

// fast path: try ASCII string.
const char *starts = s;
const char *end = s + size;
PyObject *u = PyUnicode_New(size, 127);
if (u == NULL) {
return NULL;
}
Py_ssize_t decoded = ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
if (decoded == size) {
if (consumed) {
*consumed = size;
}
return u;
}
s += decoded;
size -= decoded;

// Use _PyUnicodeWriter after fast path is failed.
_PyUnicodeWriter writer;
_PyUnicodeWriter_InitWithBuffer(&writer, u);
writer.pos = decoded;

if (unicode_decode_utf8_impl(&writer, starts, s, end,
error_handler, errors,
consumed) < 0) {
_PyUnicodeWriter_Dealloc(&writer);
return NULL;
}
return _PyUnicodeWriter_Finish(&writer);
}


static int
unicode_decode_utf8_writer(_PyUnicodeWriter *writer,
const char *s, Py_ssize_t size,
_Py_error_handler error_handler, const char *errors,
Py_ssize_t *consumed)
{
if (size == 0) {
if (consumed) {
*consumed = 0;
}
return 0;
}

// fast path: try ASCII string.
if (_PyUnicodeWriter_Prepare(writer, size, 127) < 0) {
return -1;
}

const char *starts = s;
const char *end = s + size;
Py_ssize_t decoded = 0;
Py_UCS1 *dest = (Py_UCS1*)writer->data + writer->pos * writer->kind;
if (writer->kind == PyUnicode_1BYTE_KIND
&& _Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T))
{
decoded = ascii_decode(s, end, dest);
writer->pos += decoded;

if (decoded == size) {
if (consumed) {
*consumed = size;
}
return 0;
}
s += decoded;
size -= decoded;
}

return unicode_decode_utf8_impl(writer, starts, s, end,
error_handler, errors, consumed);
}


Expand Down
Loading