From 818ea5242eb03b2ef61af8d7b1b3071e44f16125 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Thu, 20 Jun 2024 14:44:00 +0200 Subject: [PATCH 1/3] gh-119182: Optimize PyUnicode_FromFormat() Use strchr() and ucs1lib_find_max_char() to optimize the code path formatting sub-strings between '%' formats. --- Objects/unicodeobject.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 1f8c89dd12a528..6f1dadfebe4219 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2891,31 +2891,31 @@ unicode_from_format(_PyUnicodeWriter *writer, const char *format, va_list vargs) goto fail; } else { - const char *p; + const char *p = strchr(f, '%'); Py_ssize_t len; - - p = f; - do - { - if ((unsigned char)*p > 127) { - PyErr_Format(PyExc_ValueError, - "PyUnicode_FromFormatV() expects an ASCII-encoded format " - "string, got a non-ASCII byte: 0x%02x", - (unsigned char)*p); - goto fail; - } - p++; + if (p != NULL) { + len = p - f; } - while (*p != '\0' && *p != '%'); - len = p - f; - - if (*p == '\0') + else { + len = strlen(f); writer->overallocate = 0; + } - if (_PyUnicodeWriter_WriteASCIIString(writer, f, len) < 0) + int is_ascii = (ucs1lib_find_max_char((Py_UCS1*)f, (Py_UCS1*)f + len) < 128); + if (!is_ascii) { + Py_ssize_t i; + for (i=0; i < len && (unsigned char)f[i] <= 127; i++); + PyErr_Format(PyExc_ValueError, + "PyUnicode_FromFormatV() expects an ASCII-encoded format " + "string, got a non-ASCII byte: 0x%02x", + (unsigned char)f[i]); goto fail; + } - f = p; + if (_PyUnicodeWriter_WriteASCIIString(writer, f, len) < 0) { + goto fail; + } + f += len; } } va_end(vargs2); From 57d71526a1c9c0ce5d6c5139b042be09cbe69156 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Thu, 20 Jun 2024 16:17:27 +0200 Subject: [PATCH 2/3] Run ucs1lib_find_max_char() only once --- Objects/unicodeobject.c | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 6f1dadfebe4219..37362db8142161 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2875,16 +2875,26 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer, static int unicode_from_format(_PyUnicodeWriter *writer, const char *format, va_list vargs) { - writer->min_length += strlen(format) + 100; + Py_ssize_t len = strlen(format); + writer->min_length += len + 100; writer->overallocate = 1; - va_list vargs2; - const char *f; - // Copy varags to be able to pass a reference to a subfunction. + va_list vargs2; va_copy(vargs2, vargs); - for (f = format; *f; ) { + int is_ascii = (ucs1lib_find_max_char((Py_UCS1*)format, (Py_UCS1*)format + len) < 128); + if (!is_ascii) { + Py_ssize_t i; + for (i=0; i < len && (unsigned char)format[i] <= 127; i++); + PyErr_Format(PyExc_ValueError, + "PyUnicode_FromFormatV() expects an ASCII-encoded format " + "string, got a non-ASCII byte: 0x%02x", + (unsigned char)format[i]); + goto fail; + } + + for (const char *f = format; *f; ) { if (*f == '%') { f = unicode_fromformat_arg(writer, f, &vargs2); if (f == NULL) @@ -2892,7 +2902,6 @@ unicode_from_format(_PyUnicodeWriter *writer, const char *format, va_list vargs) } else { const char *p = strchr(f, '%'); - Py_ssize_t len; if (p != NULL) { len = p - f; } @@ -2901,17 +2910,6 @@ unicode_from_format(_PyUnicodeWriter *writer, const char *format, va_list vargs) writer->overallocate = 0; } - int is_ascii = (ucs1lib_find_max_char((Py_UCS1*)f, (Py_UCS1*)f + len) < 128); - if (!is_ascii) { - Py_ssize_t i; - for (i=0; i < len && (unsigned char)f[i] <= 127; i++); - PyErr_Format(PyExc_ValueError, - "PyUnicode_FromFormatV() expects an ASCII-encoded format " - "string, got a non-ASCII byte: 0x%02x", - (unsigned char)f[i]); - goto fail; - } - if (_PyUnicodeWriter_WriteASCIIString(writer, f, len) < 0) { goto fail; } From 614e132c188717ac563a3fabc2d9fd40ebd35292 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Thu, 20 Jun 2024 20:40:55 +0200 Subject: [PATCH 3/3] Add comment --- Objects/unicodeobject.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 37362db8142161..e6feed47fbb2bf 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2883,6 +2883,8 @@ unicode_from_format(_PyUnicodeWriter *writer, const char *format, va_list vargs) va_list vargs2; va_copy(vargs2, vargs); + // _PyUnicodeWriter_WriteASCIIString() below requires the format string + // to be encoded to ASCII. int is_ascii = (ucs1lib_find_max_char((Py_UCS1*)format, (Py_UCS1*)format + len) < 128); if (!is_ascii) { Py_ssize_t i;