From aafa8e28af19f1875bc68595e85a18b1ad510387 Mon Sep 17 00:00:00 2001
From: Paul Moore
Date: Tue, 17 Jan 2023 17:08:08 +0000
Subject: [PATCH 1/2] Don't send partial UTF-8 sequences to the Windows API
---
Modules/_io/winconsoleio.c | 17 ++++++++++++++++-
1 file changed, 16 insertions(+), 1 deletion(-)
diff --git a/Modules/_io/winconsoleio.c b/Modules/_io/winconsoleio.c
index d5de64b4ac3dfd..4f41ab965e2e67 100644
--- a/Modules/_io/winconsoleio.c
+++ b/Modules/_io/winconsoleio.c
@@ -954,7 +954,7 @@ _io__WindowsConsoleIO_write_impl(winconsoleio *self, Py_buffer *b)
{
BOOL res = TRUE;
wchar_t *wbuf;
- DWORD len, wlen, n = 0;
+ DWORD len, wlen, orig_len, n = 0;
HANDLE handle;
if (self->fd == -1)
@@ -984,6 +984,21 @@ _io__WindowsConsoleIO_write_impl(winconsoleio *self, Py_buffer *b)
have to reduce and recalculate. */
while (wlen > 32766 / sizeof(wchar_t)) {
len /= 2;
+ orig_len = len;
+ /* Back up until the chunk ends on an ASCII byte (top bit is unset), so
+ * it never ends inside a multi-byte UTF-8 sequence. Fix for github issue 82052.
+ */
+ while (len > 0 && (((char *)b->buf)[len-1] & 0x80) != 0)
+ --len;
+ /* If we hit a length of 0, every byte in the halved chunk has its top
+ * bit set, i.e. the chunk contains no ASCII byte at all (for example,
+ * text made up entirely of multi-byte characters), so the scan above
+ * found no split point. In that case we just restore the original
+ * length and let the console API sort it out; note that this length
+ * may fall in the middle of a multi-byte UTF-8 sequence.
+ */
+ if (len == 0) {
+ len = orig_len;
+ }
wlen = MultiByteToWideChar(CP_UTF8, 0, b->buf, len, NULL, 0);
}
Py_END_ALLOW_THREADS
From e59680e0c990ea2d5bb315cb11fc4e959a1b61f3 Mon Sep 17 00:00:00 2001
From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com>
Date: Tue, 17 Jan 2023 18:18:01 +0000
Subject: [PATCH 2/2] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20blu?=
=?UTF-8?q?rb=5Fit.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
.../next/Windows/2023-01-17-18-17-58.gh-issue-82052.mWyysT.rst | 1 +
1 file changed, 1 insertion(+)
create mode 100644 Misc/NEWS.d/next/Windows/2023-01-17-18-17-58.gh-issue-82052.mWyysT.rst
diff --git a/Misc/NEWS.d/next/Windows/2023-01-17-18-17-58.gh-issue-82052.mWyysT.rst b/Misc/NEWS.d/next/Windows/2023-01-17-18-17-58.gh-issue-82052.mWyysT.rst
new file mode 100644
index 00000000000000..4f7ab200b85cba
--- /dev/null
+++ b/Misc/NEWS.d/next/Windows/2023-01-17-18-17-58.gh-issue-82052.mWyysT.rst
@@ -0,0 +1 @@
+Fixed an issue where writing more than 32K of Unicode output to the console screen in one go could result in mojibake.