From 16be08fdf55f3bda8f272a2225a8920028bfb122 Mon Sep 17 00:00:00 2001 From: joncrall Date: Sat, 29 Apr 2023 18:38:54 -0400 Subject: [PATCH 01/42] Very rough proof-of-concept MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 📜🤖 Added by blurb_it. --- ...-04-29-23-15-38.gh-issue-103997.BS3uVt.rst | 1 + Modules/main.c | 135 ++++++++++++++++++ 2 files changed, 136 insertions(+) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2023-04-29-23-15-38.gh-issue-103997.BS3uVt.rst diff --git a/Misc/NEWS.d/next/Core and Builtins/2023-04-29-23-15-38.gh-issue-103997.BS3uVt.rst b/Misc/NEWS.d/next/Core and Builtins/2023-04-29-23-15-38.gh-issue-103997.BS3uVt.rst new file mode 100644 index 00000000000000..8949f435731e34 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2023-04-29-23-15-38.gh-issue-103997.BS3uVt.rst @@ -0,0 +1 @@ +Strings passed to "-c" are now automatically dedented (common leading whitespace is removed). This allows "python -c" invocations to be indented in shell scripts without causing indentation errors. 
diff --git a/Modules/main.c b/Modules/main.c index 7edfeb3365b4c6..ef90c7d259ad93 100644 --- a/Modules/main.c +++ b/Modules/main.c @@ -229,12 +229,145 @@ pymain_import_readline(const PyConfig *config) } +/*_command_dedent(wchar_t *command) */ +/*{ */ +/* // NEW CODE: */ +/* // Remove common leading whitespace from the string */ +/* // Handle dedenting the command */ +/* // */ +/* int cmdlen = wcslen(command); */ +/* fprintf(stderr, "COMMAND: %ls\n", command); */ +/* fprintf(stderr, "cmdlen: %d\n", cmdlen); */ + +/* int num_newlines = 0; */ +/* int num_spaces = 0; */ +/* for (int i = 0; i < cmdlen; i++) */ +/* { */ +/* if (wcsncmp(command + i, L"\n", 1) == 0){ */ +/* num_newlines++; */ +/* } */ +/* } */ +/* int* line_endloc = (int*) malloc(sizeof(int) * num_newlines + 1); */ +/* int* line_lens = (int*) malloc(sizeof(int) * num_newlines + 1); */ +/* int* line_nleadingspaces = (int*) malloc(sizeof(int) * num_newlines + 1);*/ + +/* int curr_line = 0; */ +/* int curr_line = 0; */ +/* for (int i = 0; i < cmdlen; i++) */ +/* { */ +/* if (wcsncmp(command + i, L"\n", 1) == 0){ */ +/* num_newlines++; */ +/* } */ +/* } */ + +/* for (int i = 0; i < cmdlen; i++) */ +/* { */ +/* if (wcsncmp(command + i, L"\n", 1) == 0){ */ +/* num_newlines++; */ +/* } */ +/* if (wcsncmp(command + i, L" ", 1) == 0){ */ +/* num_spaces++; */ +/* } */ +/* fprintf(stderr, "command[%d] = '%lc'\n", i, command[i]); */ +/* } */ +/* fprintf(stderr, "num_newlines: %d\n", num_newlines); */ +/* fprintf(stderr, "num_spaces: %d\n", num_spaces); */ + +/*} */ + +PyObject* _unicode_dedent(PyObject *unicode) +{ + PyObject *lines = PyUnicode_Splitlines(unicode, 1); + /*PyObject_Print(lines, stdout, 0);*/ + /*fprintf(stdout, "\n");*/ + + Py_ssize_t num_lines = PyObject_Length(lines); + + PyObject* space = PyUnicode_FromWideChar(L" ", -1); + PyObject* emptystr = PyUnicode_FromWideChar(L"", -1); + PyObject* new_unicode; + + // Initialize leading space to a large value to indicate + // that it is uninitialized + 
Py_ssize_t effective_inf = PyObject_Length(unicode) + 1; + Py_ssize_t common_leading_spaces = effective_inf; + + for (Py_ssize_t line_idx = 0; line_idx < num_lines; line_idx ++) + { + PyObject* index = PyLong_FromSsize_t(line_idx); + PyObject* line = PyObject_GetItem(lines, index); + Py_ssize_t line_len = PyObject_Length(line); + + PyObject* striped_line = _PyUnicode_XStrip(line, 0, space); + Py_ssize_t stripline_len = PyObject_Length(striped_line); + + Py_ssize_t leading_spaces = line_len - stripline_len; + + // On non-empty lines, see if the amount of leading whitespace is less + // than current value. If so, update it. + if (line_len > 1) + { + if (leading_spaces < common_leading_spaces) { + common_leading_spaces = leading_spaces; + } + } + + /*fprintf(stdout, "Index: %d\n", line_idx); */ + /*fprintf(stdout, "Line Length: %d\n", line_len); */ + /*fprintf(stdout, "Strip Line Length: %d\n", stripline_len); */ + /*fprintf(stdout, "leading_spaces: %d\n", leading_spaces); */ + /*fprintf(stdout, "common_leading_spaces: %d\n", common_leading_spaces);*/ + /*fprintf(stdout, "Line: "); */ + /*PyObject_Print(PyObject_Repr(line), stdout, 1);*/ + //fprintf(stdout, "\n"); + Py_DECREF(line); + Py_DECREF(index); + } + + if (common_leading_spaces > 0 && common_leading_spaces < effective_inf){ + + // We found common leading whitespace, strip if off. 
+ PyObject* new_lines = PyList_New(num_lines); + for (Py_ssize_t line_idx = 0; line_idx < num_lines; line_idx ++) + { + PyObject* index = PyLong_FromSsize_t(line_idx); + PyObject* line = PyObject_GetItem(lines, index); + Py_ssize_t end = PyObject_Length(line); + Py_ssize_t start = common_leading_spaces; + if (end <= 1){ + start = 0; + } + PyObject* new_line = PyUnicode_Substring(line, start, end); + PyList_SetItem(new_lines, line_idx, new_line); + Py_DECREF(line); + Py_DECREF(index); + } + /*PyObject_Print(PyObject_Repr(new_lines), stdout, 0);*/ + //fprintf(stdout, "\n"); + + new_unicode = PyUnicode_Join(emptystr, new_lines); + + Py_DECREF(unicode); + /*PyObject_Print(PyObject_Repr(new_unicode), stdout, 0);*/ + } + else{ + new_unicode = unicode; + } + + //fprintf(stderr, "num_lines: %d\n", num_lines); + Py_DECREF(lines); + return new_unicode; +} + + static int pymain_run_command(wchar_t *command) { PyObject *unicode, *bytes; int ret; + //_command_dedent(wchar_t *command) + unicode = PyUnicode_FromWideChar(command, -1); if (unicode == NULL) { goto error; @@ -244,6 +377,8 @@ pymain_run_command(wchar_t *command) return pymain_exit_err_print(); } + unicode = _unicode_dedent(unicode); + bytes = PyUnicode_AsUTF8String(unicode); Py_DECREF(unicode); if (bytes == NULL) { From e88216b8f982bb2385aae3966ee96564bc802133 Mon Sep 17 00:00:00 2001 From: joncrall Date: Sat, 29 Apr 2023 19:30:25 -0400 Subject: [PATCH 02/42] Cleanups and comments --- Modules/main.c | 51 +++++++++++++++++++++++++------------------------- 1 file changed, 25 insertions(+), 26 deletions(-) diff --git a/Modules/main.c b/Modules/main.c index ef90c7d259ad93..122c8006aaf8ff 100644 --- a/Modules/main.c +++ b/Modules/main.c @@ -275,51 +275,43 @@ pymain_import_readline(const PyConfig *config) /*} */ + +/* Strip common leading whitespace from an input command */ PyObject* _unicode_dedent(PyObject *unicode) { - PyObject *lines = PyUnicode_Splitlines(unicode, 1); - /*PyObject_Print(lines, stdout, 0);*/ - 
/*fprintf(stdout, "\n");*/ - - Py_ssize_t num_lines = PyObject_Length(lines); PyObject* space = PyUnicode_FromWideChar(L" ", -1); PyObject* emptystr = PyUnicode_FromWideChar(L"", -1); PyObject* new_unicode; - // Initialize leading space to a large value to indicate + // Break up the input into lines + PyObject *lines = PyUnicode_Splitlines(unicode, 1); + + // Init leading space to a large value to indicate // that it is uninitialized Py_ssize_t effective_inf = PyObject_Length(unicode) + 1; Py_ssize_t common_leading_spaces = effective_inf; + Py_ssize_t num_lines = PyObject_Length(lines); for (Py_ssize_t line_idx = 0; line_idx < num_lines; line_idx ++) { PyObject* index = PyLong_FromSsize_t(line_idx); PyObject* line = PyObject_GetItem(lines, index); - Py_ssize_t line_len = PyObject_Length(line); - PyObject* striped_line = _PyUnicode_XStrip(line, 0, space); - Py_ssize_t stripline_len = PyObject_Length(striped_line); + // Determine the number of leading whitespace on this line. + Py_ssize_t line_len = PyObject_Length(line); + Py_ssize_t stripline_len = PyObject_Length(striped_line); Py_ssize_t leading_spaces = line_len - stripline_len; // On non-empty lines, see if the amount of leading whitespace is less // than current value. If so, update it. 
- if (line_len > 1) + if (line_len > 1 && leading_spaces < common_leading_spaces) { - if (leading_spaces < common_leading_spaces) { - common_leading_spaces = leading_spaces; - } + common_leading_spaces = leading_spaces; } - /*fprintf(stdout, "Index: %d\n", line_idx); */ - /*fprintf(stdout, "Line Length: %d\n", line_len); */ - /*fprintf(stdout, "Strip Line Length: %d\n", stripline_len); */ - /*fprintf(stdout, "leading_spaces: %d\n", leading_spaces); */ - /*fprintf(stdout, "common_leading_spaces: %d\n", common_leading_spaces);*/ - /*fprintf(stdout, "Line: "); */ - /*PyObject_Print(PyObject_Repr(line), stdout, 1);*/ - //fprintf(stdout, "\n"); + Py_DECREF(striped_line); Py_DECREF(line); Py_DECREF(index); } @@ -339,23 +331,28 @@ PyObject* _unicode_dedent(PyObject *unicode) } PyObject* new_line = PyUnicode_Substring(line, start, end); PyList_SetItem(new_lines, line_idx, new_line); + + Py_DECREF(new_line); Py_DECREF(line); Py_DECREF(index); } - /*PyObject_Print(PyObject_Repr(new_lines), stdout, 0);*/ - //fprintf(stdout, "\n"); - new_unicode = PyUnicode_Join(emptystr, new_lines); + Py_DECREF(new_lines); + + // We are going to return an updated version of "unicode" that the + // caller will decref, so need to decref the version we are replacing + // here. This feels fragile and like the wrong way to do this. + // Guidance here would be appreciated. Py_DECREF(unicode); - /*PyObject_Print(PyObject_Repr(new_unicode), stdout, 0);*/ } else{ new_unicode = unicode; } - //fprintf(stderr, "num_lines: %d\n", num_lines); Py_DECREF(lines); + Py_DECREF(space); + Py_DECREF(emptystr); return new_unicode; } @@ -366,6 +363,7 @@ pymain_run_command(wchar_t *command) PyObject *unicode, *bytes; int ret; + // Should the input be modified here with pure C? //_command_dedent(wchar_t *command) unicode = PyUnicode_FromWideChar(command, -1); @@ -377,6 +375,7 @@ pymain_run_command(wchar_t *command) return pymain_exit_err_print(); } + // Should the input be modified here with the Python C-API? 
unicode = _unicode_dedent(unicode); bytes = PyUnicode_AsUTF8String(unicode); From bcb7c77866ec856fc59a21a020a2f7d6a0b72fd2 Mon Sep 17 00:00:00 2001 From: joncrall Date: Sat, 29 Apr 2023 19:43:14 -0400 Subject: [PATCH 03/42] Fix bad decref, only trigger if command starts with a newline --- Modules/main.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/Modules/main.c b/Modules/main.c index 122c8006aaf8ff..352b355ed199e4 100644 --- a/Modules/main.c +++ b/Modules/main.c @@ -293,8 +293,7 @@ PyObject* _unicode_dedent(PyObject *unicode) Py_ssize_t common_leading_spaces = effective_inf; Py_ssize_t num_lines = PyObject_Length(lines); - for (Py_ssize_t line_idx = 0; line_idx < num_lines; line_idx ++) - { + for (Py_ssize_t line_idx = 0; line_idx < num_lines; line_idx ++) { PyObject* index = PyLong_FromSsize_t(line_idx); PyObject* line = PyObject_GetItem(lines, index); PyObject* striped_line = _PyUnicode_XStrip(line, 0, space); @@ -306,8 +305,7 @@ PyObject* _unicode_dedent(PyObject *unicode) // On non-empty lines, see if the amount of leading whitespace is less // than current value. If so, update it. - if (line_len > 1 && leading_spaces < common_leading_spaces) - { + if (line_len > 1 && leading_spaces < common_leading_spaces) { common_leading_spaces = leading_spaces; } @@ -316,23 +314,22 @@ PyObject* _unicode_dedent(PyObject *unicode) Py_DECREF(index); } - if (common_leading_spaces > 0 && common_leading_spaces < effective_inf){ + if (common_leading_spaces > 0 && common_leading_spaces < effective_inf) { // We found common leading whitespace, strip if off. 
PyObject* new_lines = PyList_New(num_lines); - for (Py_ssize_t line_idx = 0; line_idx < num_lines; line_idx ++) - { + for (Py_ssize_t line_idx = 0; line_idx < num_lines; line_idx ++) { PyObject* index = PyLong_FromSsize_t(line_idx); PyObject* line = PyObject_GetItem(lines, index); Py_ssize_t end = PyObject_Length(line); Py_ssize_t start = common_leading_spaces; - if (end <= 1){ + if (end <= 1) { start = 0; } PyObject* new_line = PyUnicode_Substring(line, start, end); PyList_SetItem(new_lines, line_idx, new_line); - Py_DECREF(new_line); + //Py_DECREF(new_line); // is it correct that we dont need to DECREF here? Py_DECREF(line); Py_DECREF(index); } @@ -346,7 +343,7 @@ PyObject* _unicode_dedent(PyObject *unicode) // Guidance here would be appreciated. Py_DECREF(unicode); } - else{ + else { new_unicode = unicode; } @@ -375,8 +372,11 @@ pymain_run_command(wchar_t *command) return pymain_exit_err_print(); } - // Should the input be modified here with the Python C-API? - unicode = _unicode_dedent(unicode); + // Only perform auto-dedent if the string starts with a newline + if (wcsncmp(command, L"\n", 1) == 0) { + // Should the input be modified here with the Python C-API? + unicode = _unicode_dedent(unicode); + } bytes = PyUnicode_AsUTF8String(unicode); Py_DECREF(unicode); From fb8985aaad69e3c346a8b5eaf2e56871b96028be Mon Sep 17 00:00:00 2001 From: joncrall Date: Sat, 29 Apr 2023 22:03:50 -0400 Subject: [PATCH 04/42] wchar dedent --- Modules/main.c | 187 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 137 insertions(+), 50 deletions(-) diff --git a/Modules/main.c b/Modules/main.c index 352b355ed199e4..9470e0e09d3659 100644 --- a/Modules/main.c +++ b/Modules/main.c @@ -228,57 +228,134 @@ pymain_import_readline(const PyConfig *config) } } +#ifdef MS_WINDOWS +# define WCSTOK wcstok_s +#else +# define WCSTOK wcstok +#endif + +//#define DEBUG_DEDENT + +/* Strip common leading whitespace from an input command + * Sort of works? 
+ * */ +wchar_t* _wcs_dedent(wchar_t *command) +{ + // Security problem, what is the right way to do this? + size_t nchars = wcslen(command); + +#ifdef DEBUG_DEDENT + fprintf(stderr, "\nSTART WCS_DEDENT\n"); + fprintf(stderr, "command: '%ls'\n", command); + fprintf(stderr, "nchars: %d\n", nchars); +#endif + + wchar_t *command_copy = (wchar_t *)PyMem_Malloc(nchars * sizeof(wchar_t)); + // wcscpy has security problems, what is the workaround? + wcscpy(command_copy, command); + // fprintf(stderr, "command_copy: '%ls'\n", command_copy); + + size_t num_common_leading_spaces = nchars + 1; + + // I wcstok has a problem because ignores multiple instances of the + // delimiter which may make programs behave differently if that newline + // belongs to a multiline string. However, it is ok if we just want to do a + // first pass over the data to find the common indentation on non-empty + // lines. + wchar_t *buffer; + wchar_t* line = WCSTOK(command_copy, L"\n", &buffer); + while (line) { + + // Move the pointer up to the first non-space character + wchar_t *first_nonspace = line; + while (wcsncmp(first_nonspace, L" ", 1) == 0){ + first_nonspace++; + } -/*_command_dedent(wchar_t *command) */ -/*{ */ -/* // NEW CODE: */ -/* // Remove common leading whitespace from the string */ -/* // Handle dedenting the command */ -/* // */ -/* int cmdlen = wcslen(command); */ -/* fprintf(stderr, "COMMAND: %ls\n", command); */ -/* fprintf(stderr, "cmdlen: %d\n", cmdlen); */ - -/* int num_newlines = 0; */ -/* int num_spaces = 0; */ -/* for (int i = 0; i < cmdlen; i++) */ -/* { */ -/* if (wcsncmp(command + i, L"\n", 1) == 0){ */ -/* num_newlines++; */ -/* } */ -/* } */ -/* int* line_endloc = (int*) malloc(sizeof(int) * num_newlines + 1); */ -/* int* line_lens = (int*) malloc(sizeof(int) * num_newlines + 1); */ -/* int* line_nleadingspaces = (int*) malloc(sizeof(int) * num_newlines + 1);*/ - -/* int curr_line = 0; */ -/* int curr_line = 0; */ -/* for (int i = 0; i < cmdlen; i++) */ -/* { */ -/* 
if (wcsncmp(command + i, L"\n", 1) == 0){ */ -/* num_newlines++; */ -/* } */ -/* } */ - -/* for (int i = 0; i < cmdlen; i++) */ -/* { */ -/* if (wcsncmp(command + i, L"\n", 1) == 0){ */ -/* num_newlines++; */ -/* } */ -/* if (wcsncmp(command + i, L" ", 1) == 0){ */ -/* num_spaces++; */ -/* } */ -/* fprintf(stderr, "command[%d] = '%lc'\n", i, command[i]); */ -/* } */ -/* fprintf(stderr, "num_newlines: %d\n", num_newlines); */ -/* fprintf(stderr, "num_spaces: %d\n", num_spaces); */ - -/*} */ + // Only check lines that contain non-whitespace characters + if (wcsncmp(first_nonspace, L"\0", 1)) { + + size_t num_leading_spaces = first_nonspace - line; + if (num_leading_spaces < num_common_leading_spaces) { + num_common_leading_spaces = num_leading_spaces; + } +#ifdef DEBUG_DEDENT + fprintf(stderr, "==========\n"); + fprintf(stderr, "line: '%ls'\n", line); + fprintf(stderr, "first_nonspace: '%ls'\n", first_nonspace); + fprintf(stderr, "num_common_leading_spaces: '%d'\n", num_common_leading_spaces); + fprintf(stderr, "num_leading_spaces: '%d'\n", num_leading_spaces); +#endif + } + line = WCSTOK(NULL, L"\n", &buffer); + } + PyMem_Free(command_copy); + + wchar_t *end_ptr = command + nchars; + wchar_t *curr_line_ptr = command; + wchar_t *next_line_ptr; + wchar_t *new_start_loc; + size_t new_line_len; + + // What is the correct way to ensure this is null terminated + wchar_t *new_command = (wchar_t *)PyMem_Malloc((nchars + 1) * sizeof(wchar_t)); + wmemset(new_command, NULL, nchars + 1); + wchar_t *curr_dst = new_command; + + while (curr_line_ptr != end_ptr) { + // Find the end of the current line. 
+ next_line_ptr = wcsstr(curr_line_ptr, L"\n"); + if (next_line_ptr == NULL) { + next_line_ptr = end_ptr; + } + else { + next_line_ptr++; + } + + size_t line_len = next_line_ptr - curr_line_ptr; + + if (line_len > num_common_leading_spaces){ + new_start_loc = curr_line_ptr + num_common_leading_spaces; + new_line_len = line_len - num_common_leading_spaces; + } + else { + new_start_loc = curr_line_ptr; + new_line_len = line_len; + } + + int offset = curr_line_ptr - command; + +#ifdef DEBUG_DEDENT + fprintf(stderr, "line_len: '%d'\n", line_len); + fprintf(stderr, "offset: '%d'\n", offset); +#endif + + // Copy the part of the line we want to keep to the new location + wcsncpy(curr_dst, new_start_loc, new_line_len); + curr_dst += new_line_len; + + curr_line_ptr = next_line_ptr; + } + + // FIXME: I'm sure this is not the memory safe way to do this, but I dont + // know what is. + command = new_command; + +#ifdef DEBUG_DEDENT + fprintf(stderr, "new_command: '%ls'\n", new_command); + fprintf(stderr, "\nEND WCS_DEDENT\n"); +#endif + return command; + +} /* Strip common leading whitespace from an input command */ PyObject* _unicode_dedent(PyObject *unicode) { + /*fprintf(stderr, "\nSTART unicode dedent\n"); */ + /*PyObject_Print(PyObject_Repr(unicode), stderr, 0);*/ + /*fprintf(stderr, "\n"); */ PyObject* space = PyUnicode_FromWideChar(L" ", -1); PyObject* emptystr = PyUnicode_FromWideChar(L"", -1); @@ -350,6 +427,11 @@ PyObject* _unicode_dedent(PyObject *unicode) Py_DECREF(lines); Py_DECREF(space); Py_DECREF(emptystr); + + /*PyObject_Print(PyObject_Repr(new_unicode), stderr, 0);*/ + /*fprintf(stderr, "\nEND unicode dedent\n"); */ + /*fprintf(stderr, "\n"); */ + return new_unicode; } @@ -361,7 +443,12 @@ pymain_run_command(wchar_t *command) int ret; // Should the input be modified here with pure C? 
- //_command_dedent(wchar_t *command) + if (wcsncmp(command, L"\n", 1) == 0) { + command = _wcs_dedent(command); + if (command == NULL) { + goto error; + } + } unicode = PyUnicode_FromWideChar(command, -1); if (unicode == NULL) { @@ -373,10 +460,10 @@ pymain_run_command(wchar_t *command) } // Only perform auto-dedent if the string starts with a newline - if (wcsncmp(command, L"\n", 1) == 0) { - // Should the input be modified here with the Python C-API? - unicode = _unicode_dedent(unicode); - } + /*if (wcsncmp(command, L"\n", 1) == 0) { */ + /* // Should the input be modified here with the Python C-API?*/ + /* unicode = _unicode_dedent(unicode); */ + /*} */ bytes = PyUnicode_AsUTF8String(unicode); Py_DECREF(unicode); From 26f27a84cec932a1174cd04b37af6110c820c287 Mon Sep 17 00:00:00 2001 From: joncrall Date: Sat, 29 Apr 2023 22:24:25 -0400 Subject: [PATCH 05/42] tweaks --- Modules/main.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/Modules/main.c b/Modules/main.c index 9470e0e09d3659..e395f26013568d 100644 --- a/Modules/main.c +++ b/Modules/main.c @@ -250,6 +250,10 @@ wchar_t* _wcs_dedent(wchar_t *command) fprintf(stderr, "nchars: %d\n", nchars); #endif + // Step 1: Find N = the common number leading whitespace chars + + // Create a copy of the command so we can use the descructive WCSTOK to + // tokenize it. wchar_t *command_copy = (wchar_t *)PyMem_Malloc(nchars * sizeof(wchar_t)); // wcscpy has security problems, what is the workaround? 
wcscpy(command_copy, command); @@ -283,8 +287,8 @@ wchar_t* _wcs_dedent(wchar_t *command) fprintf(stderr, "==========\n"); fprintf(stderr, "line: '%ls'\n", line); fprintf(stderr, "first_nonspace: '%ls'\n", first_nonspace); - fprintf(stderr, "num_common_leading_spaces: '%d'\n", num_common_leading_spaces); - fprintf(stderr, "num_leading_spaces: '%d'\n", num_leading_spaces); + fprintf(stderr, "num_common_leading_spaces: '%zu'\n", num_common_leading_spaces); + fprintf(stderr, "num_leading_spaces: '%zu'\n", num_leading_spaces); #endif } line = WCSTOK(NULL, L"\n", &buffer); @@ -297,9 +301,15 @@ wchar_t* _wcs_dedent(wchar_t *command) wchar_t *new_start_loc; size_t new_line_len; + // Step 2: Remove N leading whitespace chars from each line We do this by + // creating a new string and copying over each line one at a time and not + // copying over the leading whitespace + // What is the correct way to ensure this is null terminated + // Is it ok that this is overallocated? + // Would we want to mutate the input pointer instead? wchar_t *new_command = (wchar_t *)PyMem_Malloc((nchars + 1) * sizeof(wchar_t)); - wmemset(new_command, NULL, nchars + 1); + //wmemset(new_command, NULL, nchars + 1); wchar_t *curr_dst = new_command; while (curr_line_ptr != end_ptr) { @@ -323,11 +333,8 @@ wchar_t* _wcs_dedent(wchar_t *command) new_line_len = line_len; } - int offset = curr_line_ptr - command; - #ifdef DEBUG_DEDENT - fprintf(stderr, "line_len: '%d'\n", line_len); - fprintf(stderr, "offset: '%d'\n", offset); + fprintf(stderr, "line_len: '%zu'\n", line_len); #endif // Copy the part of the line we want to keep to the new location @@ -336,6 +343,8 @@ wchar_t* _wcs_dedent(wchar_t *command) curr_line_ptr = next_line_ptr; } + // null terminate the string (is this sufficient?) + (*curr_dst) = NULL; // FIXME: I'm sure this is not the memory safe way to do this, but I dont // know what is. 
From 417eff8e6f7a96390c703a8d4ba601356118ea65 Mon Sep 17 00:00:00 2001 From: joncrall Date: Sun, 30 Apr 2023 19:47:48 -0400 Subject: [PATCH 06/42] Use new char* implementation --- Modules/main.c | 216 +++++++++++++------------------------------------ 1 file changed, 54 insertions(+), 162 deletions(-) diff --git a/Modules/main.c b/Modules/main.c index e395f26013568d..8a86b395bb77d4 100644 --- a/Modules/main.c +++ b/Modules/main.c @@ -228,93 +228,79 @@ pymain_import_readline(const PyConfig *config) } } -#ifdef MS_WINDOWS -# define WCSTOK wcstok_s -#else -# define WCSTOK wcstok -#endif -//#define DEBUG_DEDENT +// #define DEBUG_DEDENT -/* Strip common leading whitespace from an input command - * Sort of works? - * */ -wchar_t* _wcs_dedent(wchar_t *command) -{ - // Security problem, what is the right way to do this? - size_t nchars = wcslen(command); +/* Strip common leading whitespace utf encoded string + * returns a new PyBytes object that must be deallocated + */ +PyObject* _pybytes_dedent(PyObject *bytes){ + char *input_data = PyBytes_AsString(bytes); + + // Security problem? what is the right way to do this? + Py_ssize_t nchars = strlen(input_data); + + // Allocate new data for the output + PyBytesObject *new_bytes = PyBytes_FromStringAndSize(NULL, nchars); + if (new_bytes == NULL) { + return NULL; + } + char *new_data = PyBytes_AsString(new_bytes); #ifdef DEBUG_DEDENT - fprintf(stderr, "\nSTART WCS_DEDENT\n"); - fprintf(stderr, "command: '%ls'\n", command); + fprintf(stderr, "\nSTART DEDENT\n"); + fprintf(stderr, "input_data: '%s'\n", input_data); fprintf(stderr, "nchars: %d\n", nchars); #endif // Step 1: Find N = the common number leading whitespace chars - // Create a copy of the command so we can use the descructive WCSTOK to - // tokenize it. - wchar_t *command_copy = (wchar_t *)PyMem_Malloc(nchars * sizeof(wchar_t)); - // wcscpy has security problems, what is the workaround? 
- wcscpy(command_copy, command); - // fprintf(stderr, "command_copy: '%ls'\n", command_copy); - - size_t num_common_leading_spaces = nchars + 1; - - // I wcstok has a problem because ignores multiple instances of the - // delimiter which may make programs behave differently if that newline - // belongs to a multiline string. However, it is ok if we just want to do a - // first pass over the data to find the common indentation on non-empty - // lines. - wchar_t *buffer; - wchar_t* line = WCSTOK(command_copy, L"\n", &buffer); - while (line) { + // Use the output array as a temporary buffer (because we haven't populated it yet) + // so we can use the descructive strtok to tokenize the input. + strcpy(new_data, input_data); + + Py_ssize_t num_common_leading_spaces = nchars + 1; + // Count the number of leading spaces on each line + char *line = strtok(new_data, "\n"); + while (line) { // Move the pointer up to the first non-space character - wchar_t *first_nonspace = line; - while (wcsncmp(first_nonspace, L" ", 1) == 0){ + char *first_nonspace = line; + while (strncmp(first_nonspace, " ", 1) == 0){ first_nonspace++; } - // Only check lines that contain non-whitespace characters - if (wcsncmp(first_nonspace, L"\0", 1)) { + if (strncmp(first_nonspace, "\0", 1)) { - size_t num_leading_spaces = first_nonspace - line; + Py_ssize_t num_leading_spaces = first_nonspace - line; if (num_leading_spaces < num_common_leading_spaces) { num_common_leading_spaces = num_leading_spaces; } #ifdef DEBUG_DEDENT fprintf(stderr, "==========\n"); - fprintf(stderr, "line: '%ls'\n", line); - fprintf(stderr, "first_nonspace: '%ls'\n", first_nonspace); + fprintf(stderr, "line: '%s'\n", line); + fprintf(stderr, "first_nonspace: '%s'\n", first_nonspace); fprintf(stderr, "num_common_leading_spaces: '%zu'\n", num_common_leading_spaces); fprintf(stderr, "num_leading_spaces: '%zu'\n", num_leading_spaces); #endif } - line = WCSTOK(NULL, L"\n", &buffer); + line = strtok(NULL, "\n"); } - 
PyMem_Free(command_copy); - wchar_t *end_ptr = command + nchars; - wchar_t *curr_line_ptr = command; - wchar_t *next_line_ptr; - wchar_t *new_start_loc; - size_t new_line_len; + char *end_ptr = input_data + nchars; + char *curr_line_ptr = input_data; + char *next_line_ptr; + char *new_start_loc; + Py_ssize_t new_line_len; // Step 2: Remove N leading whitespace chars from each line We do this by // creating a new string and copying over each line one at a time and not // copying over the leading whitespace - // What is the correct way to ensure this is null terminated - // Is it ok that this is overallocated? - // Would we want to mutate the input pointer instead? - wchar_t *new_command = (wchar_t *)PyMem_Malloc((nchars + 1) * sizeof(wchar_t)); - //wmemset(new_command, NULL, nchars + 1); - wchar_t *curr_dst = new_command; - + char *curr_dst = new_data; while (curr_line_ptr != end_ptr) { // Find the end of the current line. - next_line_ptr = wcsstr(curr_line_ptr, L"\n"); + next_line_ptr = strstr(curr_line_ptr, "\n"); if (next_line_ptr == NULL) { next_line_ptr = end_ptr; } @@ -322,7 +308,7 @@ wchar_t* _wcs_dedent(wchar_t *command) next_line_ptr++; } - size_t line_len = next_line_ptr - curr_line_ptr; + Py_ssize_t line_len = next_line_ptr - curr_line_ptr; if (line_len > num_common_leading_spaces){ new_start_loc = curr_line_ptr + num_common_leading_spaces; @@ -336,9 +322,8 @@ wchar_t* _wcs_dedent(wchar_t *command) #ifdef DEBUG_DEDENT fprintf(stderr, "line_len: '%zu'\n", line_len); #endif - // Copy the part of the line we want to keep to the new location - wcsncpy(curr_dst, new_start_loc, new_line_len); + strncpy(curr_dst, new_start_loc, new_line_len); curr_dst += new_line_len; curr_line_ptr = next_line_ptr; @@ -346,102 +331,13 @@ wchar_t* _wcs_dedent(wchar_t *command) // null terminate the string (is this sufficient?) (*curr_dst) = NULL; - // FIXME: I'm sure this is not the memory safe way to do this, but I dont - // know what is. 
- command = new_command; - #ifdef DEBUG_DEDENT - fprintf(stderr, "new_command: '%ls'\n", new_command); + fprintf(stderr, "new_data: '%s'\n", new_data); fprintf(stderr, "\nEND WCS_DEDENT\n"); #endif - return command; - -} - - -/* Strip common leading whitespace from an input command */ -PyObject* _unicode_dedent(PyObject *unicode) -{ - /*fprintf(stderr, "\nSTART unicode dedent\n"); */ - /*PyObject_Print(PyObject_Repr(unicode), stderr, 0);*/ - /*fprintf(stderr, "\n"); */ - - PyObject* space = PyUnicode_FromWideChar(L" ", -1); - PyObject* emptystr = PyUnicode_FromWideChar(L"", -1); - PyObject* new_unicode; - - // Break up the input into lines - PyObject *lines = PyUnicode_Splitlines(unicode, 1); - - // Init leading space to a large value to indicate - // that it is uninitialized - Py_ssize_t effective_inf = PyObject_Length(unicode) + 1; - Py_ssize_t common_leading_spaces = effective_inf; - - Py_ssize_t num_lines = PyObject_Length(lines); - for (Py_ssize_t line_idx = 0; line_idx < num_lines; line_idx ++) { - PyObject* index = PyLong_FromSsize_t(line_idx); - PyObject* line = PyObject_GetItem(lines, index); - PyObject* striped_line = _PyUnicode_XStrip(line, 0, space); - - // Determine the number of leading whitespace on this line. - Py_ssize_t line_len = PyObject_Length(line); - Py_ssize_t stripline_len = PyObject_Length(striped_line); - Py_ssize_t leading_spaces = line_len - stripline_len; - - // On non-empty lines, see if the amount of leading whitespace is less - // than current value. If so, update it. - if (line_len > 1 && leading_spaces < common_leading_spaces) { - common_leading_spaces = leading_spaces; - } - - Py_DECREF(striped_line); - Py_DECREF(line); - Py_DECREF(index); - } + return new_bytes; - if (common_leading_spaces > 0 && common_leading_spaces < effective_inf) { - // We found common leading whitespace, strip if off. 
- PyObject* new_lines = PyList_New(num_lines); - for (Py_ssize_t line_idx = 0; line_idx < num_lines; line_idx ++) { - PyObject* index = PyLong_FromSsize_t(line_idx); - PyObject* line = PyObject_GetItem(lines, index); - Py_ssize_t end = PyObject_Length(line); - Py_ssize_t start = common_leading_spaces; - if (end <= 1) { - start = 0; - } - PyObject* new_line = PyUnicode_Substring(line, start, end); - PyList_SetItem(new_lines, line_idx, new_line); - - //Py_DECREF(new_line); // is it correct that we dont need to DECREF here? - Py_DECREF(line); - Py_DECREF(index); - } - new_unicode = PyUnicode_Join(emptystr, new_lines); - - Py_DECREF(new_lines); - - // We are going to return an updated version of "unicode" that the - // caller will decref, so need to decref the version we are replacing - // here. This feels fragile and like the wrong way to do this. - // Guidance here would be appreciated. - Py_DECREF(unicode); - } - else { - new_unicode = unicode; - } - - Py_DECREF(lines); - Py_DECREF(space); - Py_DECREF(emptystr); - - /*PyObject_Print(PyObject_Repr(new_unicode), stderr, 0);*/ - /*fprintf(stderr, "\nEND unicode dedent\n"); */ - /*fprintf(stderr, "\n"); */ - - return new_unicode; } @@ -451,14 +347,6 @@ pymain_run_command(wchar_t *command) PyObject *unicode, *bytes; int ret; - // Should the input be modified here with pure C? 
- if (wcsncmp(command, L"\n", 1) == 0) { - command = _wcs_dedent(command); - if (command == NULL) { - goto error; - } - } - unicode = PyUnicode_FromWideChar(command, -1); if (unicode == NULL) { goto error; @@ -468,18 +356,22 @@ pymain_run_command(wchar_t *command) return pymain_exit_err_print(); } - // Only perform auto-dedent if the string starts with a newline - /*if (wcsncmp(command, L"\n", 1) == 0) { */ - /* // Should the input be modified here with the Python C-API?*/ - /* unicode = _unicode_dedent(unicode); */ - /*} */ - bytes = PyUnicode_AsUTF8String(unicode); Py_DECREF(unicode); if (bytes == NULL) { goto error; } + // Only perform auto-dedent if the string starts with a newline + if (strncmp(PyBytes_AsString(bytes), "\n", 1) == 0) { + PyObject *new_bytes = _pybytes_dedent(bytes); + if (new_bytes == NULL) { + goto error; + } + Py_DECREF(bytes); + bytes = new_bytes; + } + PyCompilerFlags cf = _PyCompilerFlags_INIT; cf.cf_flags |= PyCF_IGNORE_COOKIE; ret = PyRun_SimpleStringFlags(PyBytes_AsString(bytes), &cf); From 924e0a6897f452ac1cf3161ae2d9202e2acaa992 Mon Sep 17 00:00:00 2001 From: joncrall Date: Sun, 30 Apr 2023 19:48:55 -0400 Subject: [PATCH 07/42] Rename function --- Modules/main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Modules/main.c b/Modules/main.c index 8a86b395bb77d4..55ea0025520dac 100644 --- a/Modules/main.c +++ b/Modules/main.c @@ -234,7 +234,7 @@ pymain_import_readline(const PyConfig *config) /* Strip common leading whitespace utf encoded string * returns a new PyBytes object that must be deallocated */ -PyObject* _pybytes_dedent(PyObject *bytes){ +PyObject* _utf_8_bytes_dedent(PyObject *bytes){ char *input_data = PyBytes_AsString(bytes); // Security problem? what is the right way to do this? 
@@ -364,7 +364,7 @@ pymain_run_command(wchar_t *command) // Only perform auto-dedent if the string starts with a newline if (strncmp(PyBytes_AsString(bytes), "\n", 1) == 0) { - PyObject *new_bytes = _pybytes_dedent(bytes); + PyObject *new_bytes = _utf_8_bytes_dedent(bytes); if (new_bytes == NULL) { goto error; } From 9f956726057b98cbe3f6e40a1620b56ab62de912 Mon Sep 17 00:00:00 2001 From: joncrall Date: Sun, 30 Apr 2023 19:59:07 -0400 Subject: [PATCH 08/42] tweaks --- Modules/main.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Modules/main.c b/Modules/main.c index 55ea0025520dac..f69c322d4a5379 100644 --- a/Modules/main.c +++ b/Modules/main.c @@ -293,12 +293,12 @@ PyObject* _utf_8_bytes_dedent(PyObject *bytes){ char *new_start_loc; Py_ssize_t new_line_len; - // Step 2: Remove N leading whitespace chars from each line We do this by - // creating a new string and copying over each line one at a time and not - // copying over the leading whitespace + // Step 2: Remove N leading whitespace chars from each line by copying data + // (except leading spaces) from the input buffer to the output buffer one + // line at a time. char *curr_dst = new_data; - while (curr_line_ptr != end_ptr) { + while (curr_line_ptr < end_ptr) { // Find the end of the current line. 
next_line_ptr = strstr(curr_line_ptr, "\n"); if (next_line_ptr == NULL) { From 3f4a78bf047ab45f452bf89a4d27ae4bdb64e171 Mon Sep 17 00:00:00 2001 From: joncrall Date: Sun, 30 Apr 2023 20:05:34 -0400 Subject: [PATCH 09/42] More tweaks --- Modules/main.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/Modules/main.c b/Modules/main.c index f69c322d4a5379..9ba8de056639f3 100644 --- a/Modules/main.c +++ b/Modules/main.c @@ -231,17 +231,15 @@ pymain_import_readline(const PyConfig *config) // #define DEBUG_DEDENT -/* Strip common leading whitespace utf encoded string - * returns a new PyBytes object that must be deallocated - */ +/* Strip common leading whitespace utf encoded string */ PyObject* _utf_8_bytes_dedent(PyObject *bytes){ - char *input_data = PyBytes_AsString(bytes); + char *input_data; + Py_ssize_t nchars; - // Security problem? what is the right way to do this? - Py_ssize_t nchars = strlen(input_data); + PyBytes_AsStringAndSize(bytes, &input_data, &nchars); - // Allocate new data for the output - PyBytesObject *new_bytes = PyBytes_FromStringAndSize(NULL, nchars); + // Allocate new data for the output as a copy of the input + PyBytesObject *new_bytes = PyBytes_FromStringAndSize(input_data, nchars); if (new_bytes == NULL) { return NULL; } @@ -255,13 +253,11 @@ PyObject* _utf_8_bytes_dedent(PyObject *bytes){ // Step 1: Find N = the common number leading whitespace chars - // Use the output array as a temporary buffer (because we haven't populated it yet) // so we can use the descructive strtok to tokenize the input. 
- strcpy(new_data, input_data); - Py_ssize_t num_common_leading_spaces = nchars + 1; // Count the number of leading spaces on each line + // Use the output array as a temporary buffer (we will repopulate it later) char *line = strtok(new_data, "\n"); while (line) { // Move the pointer up to the first non-space character From 97f2079c46b68bd835f715435058e72bed891d23 Mon Sep 17 00:00:00 2001 From: joncrall Date: Sun, 30 Apr 2023 22:08:23 -0400 Subject: [PATCH 10/42] Replace strncmp with direct char comparison --- Modules/main.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Modules/main.c b/Modules/main.c index 9ba8de056639f3..27295a8cb0dfcb 100644 --- a/Modules/main.c +++ b/Modules/main.c @@ -262,11 +262,11 @@ PyObject* _utf_8_bytes_dedent(PyObject *bytes){ while (line) { // Move the pointer up to the first non-space character char *first_nonspace = line; - while (strncmp(first_nonspace, " ", 1) == 0){ + while (*first_nonspace == ' '){ first_nonspace++; } // Only check lines that contain non-whitespace characters - if (strncmp(first_nonspace, "\0", 1)) { + if (*first_nonspace != '\0') { Py_ssize_t num_leading_spaces = first_nonspace - line; if (num_leading_spaces < num_common_leading_spaces) { @@ -359,7 +359,7 @@ pymain_run_command(wchar_t *command) } // Only perform auto-dedent if the string starts with a newline - if (strncmp(PyBytes_AsString(bytes), "\n", 1) == 0) { + if (*PyBytes_AsString(bytes) == '\n') { PyObject *new_bytes = _utf_8_bytes_dedent(bytes); if (new_bytes == NULL) { goto error; From 04435eb4a79ae5e3940cfa804f1011287b901f68 Mon Sep 17 00:00:00 2001 From: joncrall Date: Sun, 30 Apr 2023 22:12:12 -0400 Subject: [PATCH 11/42] Remove debug code --- Modules/main.c | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/Modules/main.c b/Modules/main.c index 27295a8cb0dfcb..4e60b5e1e2bee0 100644 --- a/Modules/main.c +++ b/Modules/main.c @@ -229,8 +229,6 @@ pymain_import_readline(const PyConfig *config) } -// 
#define DEBUG_DEDENT - /* Strip common leading whitespace utf encoded string */ PyObject* _utf_8_bytes_dedent(PyObject *bytes){ char *input_data; @@ -245,12 +243,6 @@ PyObject* _utf_8_bytes_dedent(PyObject *bytes){ } char *new_data = PyBytes_AsString(new_bytes); -#ifdef DEBUG_DEDENT - fprintf(stderr, "\nSTART DEDENT\n"); - fprintf(stderr, "input_data: '%s'\n", input_data); - fprintf(stderr, "nchars: %d\n", nchars); -#endif - // Step 1: Find N = the common number leading whitespace chars // so we can use the descructive strtok to tokenize the input. @@ -272,13 +264,6 @@ PyObject* _utf_8_bytes_dedent(PyObject *bytes){ if (num_leading_spaces < num_common_leading_spaces) { num_common_leading_spaces = num_leading_spaces; } -#ifdef DEBUG_DEDENT - fprintf(stderr, "==========\n"); - fprintf(stderr, "line: '%s'\n", line); - fprintf(stderr, "first_nonspace: '%s'\n", first_nonspace); - fprintf(stderr, "num_common_leading_spaces: '%zu'\n", num_common_leading_spaces); - fprintf(stderr, "num_leading_spaces: '%zu'\n", num_leading_spaces); -#endif } line = strtok(NULL, "\n"); } @@ -315,9 +300,6 @@ PyObject* _utf_8_bytes_dedent(PyObject *bytes){ new_line_len = line_len; } -#ifdef DEBUG_DEDENT - fprintf(stderr, "line_len: '%zu'\n", line_len); -#endif // Copy the part of the line we want to keep to the new location strncpy(curr_dst, new_start_loc, new_line_len); curr_dst += new_line_len; @@ -327,13 +309,7 @@ PyObject* _utf_8_bytes_dedent(PyObject *bytes){ // null terminate the string (is this sufficient?) 
(*curr_dst) = NULL; -#ifdef DEBUG_DEDENT - fprintf(stderr, "new_data: '%s'\n", new_data); - fprintf(stderr, "\nEND WCS_DEDENT\n"); -#endif return new_bytes; - - } From 4c4eca9bc6228b6eebfdca2d30fa94396de6c91a Mon Sep 17 00:00:00 2001 From: joncrall Date: Sun, 30 Apr 2023 23:44:52 -0400 Subject: [PATCH 12/42] Made new function static --- Modules/main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Modules/main.c b/Modules/main.c index 4e60b5e1e2bee0..8959ee297f7e4f 100644 --- a/Modules/main.c +++ b/Modules/main.c @@ -230,7 +230,8 @@ pymain_import_readline(const PyConfig *config) /* Strip common leading whitespace utf encoded string */ -PyObject* _utf_8_bytes_dedent(PyObject *bytes){ +static PyObject* +_utf_8_bytes_dedent(PyObject *bytes){ char *input_data; Py_ssize_t nchars; From f9c969be644eda481c15595566cdb487127c0345 Mon Sep 17 00:00:00 2001 From: joncrall Date: Mon, 1 May 2023 01:23:05 -0400 Subject: [PATCH 13/42] Handwritten char iter and _PyBytesWriter_ --- Modules/main.c | 106 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 102 insertions(+), 4 deletions(-) diff --git a/Modules/main.c b/Modules/main.c index 8959ee297f7e4f..c00e3f06f55a35 100644 --- a/Modules/main.c +++ b/Modules/main.c @@ -234,9 +234,105 @@ static PyObject* _utf_8_bytes_dedent(PyObject *bytes){ char *input_data; Py_ssize_t nchars; + bool curr_reading_whitespace = true; + int curr_num_leading_spaces = 0; + int curr_num_leading_tabs = 0; + int num_common_leading_chars; + char c; PyBytes_AsStringAndSize(bytes, &input_data, &nchars); + int num_common_leading_spaces = nchars + 1; + int num_common_leading_tabs = nchars + 1; + + char *data_iter; + + data_iter = input_data; + while ( (c = *data_iter++) ){ + if (c == '\n') { + // Finished reading the line + if (!curr_reading_whitespace) { + // If the line had some non-whitespace characters + // update the current common leading tab/space count + if (curr_num_leading_spaces < num_common_leading_spaces) { + 
num_common_leading_spaces = curr_num_leading_spaces; + } + if (curr_num_leading_tabs < num_common_leading_tabs) { + num_common_leading_tabs = curr_num_leading_tabs; + } + } + // About to start reading a new line + curr_reading_whitespace = true; + curr_num_leading_spaces = 0; + curr_num_leading_tabs = 0; + } + else if (curr_reading_whitespace){ + if (c == ' ' && curr_num_leading_tabs == 0) { + curr_num_leading_spaces++; + } + else if (c == '\t' && curr_num_leading_spaces == 0) { + curr_num_leading_tabs++; + } + else { + // Encountered a non-whitespace character + curr_reading_whitespace = false; + } + } + } + if (num_common_leading_spaces > num_common_leading_tabs){ + num_common_leading_chars = num_common_leading_spaces; + } + else { + num_common_leading_chars = num_common_leading_tabs; + } + + if (num_common_leading_chars > 0) { + // We need to trigger a dedent + char *new_data; + char *curr_line_ptr = input_data; + char *next_line_ptr; + char *new_start_loc; + Py_ssize_t new_line_len; + _PyBytesWriter writer; + _PyBytesWriter_Init(&writer); + new_data = _PyBytesWriter_Alloc(&writer, nchars); + if (new_data == NULL) { + return NULL; + } + + data_iter = input_data; + c = *data_iter; + while (c) { + // Find the end of the current line. + while ( (c = *data_iter++) != '\n' ){ + if (c == NULL) { + break; + } + } + next_line_ptr = data_iter; + Py_ssize_t line_len = next_line_ptr - curr_line_ptr; + if (line_len > num_common_leading_chars){ + new_start_loc = curr_line_ptr + num_common_leading_chars; + new_line_len = line_len - num_common_leading_chars; + } + else { + new_start_loc = curr_line_ptr; + new_line_len = line_len; + } + // Copy this line over to the new buffer (removing common + // leading chars) + new_data = _PyBytesWriter_WriteBytes(&writer, new_data, new_start_loc, new_line_len); + curr_line_ptr = next_line_ptr; + } + PyObject *new_bytes = _PyBytesWriter_Finish(&writer, new_data); + return new_bytes; + } + else { + // No leading chars, no work to be done. 
+ return bytes; + } + +#if 0 // Allocate new data for the output as a copy of the input PyBytesObject *new_bytes = PyBytes_FromStringAndSize(input_data, nchars); if (new_bytes == NULL) { @@ -245,8 +341,6 @@ _utf_8_bytes_dedent(PyObject *bytes){ char *new_data = PyBytes_AsString(new_bytes); // Step 1: Find N = the common number leading whitespace chars - - // so we can use the descructive strtok to tokenize the input. Py_ssize_t num_common_leading_spaces = nchars + 1; // Count the number of leading spaces on each line @@ -309,8 +403,9 @@ _utf_8_bytes_dedent(PyObject *bytes){ } // null terminate the string (is this sufficient?) (*curr_dst) = NULL; - return new_bytes; +#endif + } @@ -341,7 +436,10 @@ pymain_run_command(wchar_t *command) if (new_bytes == NULL) { goto error; } - Py_DECREF(bytes); + if (new_bytes != bytes) { + // dedent allocated new bytes, replace the old with the new + Py_DECREF(bytes); + } bytes = new_bytes; } From 674f1e07d4f3ba7ba5702984017627af58e5e5b6 Mon Sep 17 00:00:00 2001 From: sunmy2019 <59365878+sunmy2019@users.noreply.github.com> Date: Mon, 1 May 2023 21:42:21 +0800 Subject: [PATCH 14/42] reimplement it to imitate `textwrap.dedent` --- Modules/main.c | 276 +++++++++++++++++++++---------------------------- 1 file changed, 115 insertions(+), 161 deletions(-) diff --git a/Modules/main.c b/Modules/main.c index c00e3f06f55a35..c3c4f1b7ce285c 100644 --- a/Modules/main.c +++ b/Modules/main.c @@ -228,187 +228,149 @@ pymain_import_readline(const PyConfig *config) } } +/* Strip common leading whitespace, just as textwrap.dedent. + It stoles 1 reference from bytes if succeeded, else it will return NULL. 
*/ +static PyObject *dedent_utf8_bytes(PyObject *bytes) { + if (bytes == NULL || !PyBytes_CheckExact(bytes)) { + return NULL; + } -/* Strip common leading whitespace utf encoded string */ -static PyObject* -_utf_8_bytes_dedent(PyObject *bytes){ - char *input_data; + char *start; Py_ssize_t nchars; - bool curr_reading_whitespace = true; - int curr_num_leading_spaces = 0; - int curr_num_leading_tabs = 0; - int num_common_leading_chars; - char c; - - PyBytes_AsStringAndSize(bytes, &input_data, &nchars); - - int num_common_leading_spaces = nchars + 1; - int num_common_leading_tabs = nchars + 1; - - char *data_iter; - - data_iter = input_data; - while ( (c = *data_iter++) ){ - if (c == '\n') { - // Finished reading the line - if (!curr_reading_whitespace) { - // If the line had some non-whitespace characters - // update the current common leading tab/space count - if (curr_num_leading_spaces < num_common_leading_spaces) { - num_common_leading_spaces = curr_num_leading_spaces; - } - if (curr_num_leading_tabs < num_common_leading_tabs) { - num_common_leading_tabs = curr_num_leading_tabs; + + if (PyBytes_AsStringAndSize(bytes, &start, &nchars) != 0) { + return NULL; + } + + char *end = start + nchars; + assert(start < end); + + char *candidate_start = NULL; + Py_ssize_t candidate_len = 0; + + for (char *iter = start; iter < end; ++iter) { + char *line_start = iter; + char *leading_whitespace_end = NULL; + + // scan the whole line + char c = 0; + while (iter < end && (c = *iter) != '\n') { + if (!leading_whitespace_end && c != ' ' && c != '\t') { + if (iter == line_start) { + // some line has no indent, fast exit! 
+ return bytes; } + leading_whitespace_end = iter; } - // About to start reading a new line - curr_reading_whitespace = true; - curr_num_leading_spaces = 0; - curr_num_leading_tabs = 0; - } - else if (curr_reading_whitespace){ - if (c == ' ' && curr_num_leading_tabs == 0) { - curr_num_leading_spaces++; - } - else if (c == '\t' && curr_num_leading_spaces == 0) { - curr_num_leading_tabs++; - } - else { - // Encountered a non-whitespace character - curr_reading_whitespace = false; - } + ++iter; } - } - if (num_common_leading_spaces > num_common_leading_tabs){ - num_common_leading_chars = num_common_leading_spaces; - } - else { - num_common_leading_chars = num_common_leading_tabs; - } - - if (num_common_leading_chars > 0) { - // We need to trigger a dedent - char *new_data; - char *curr_line_ptr = input_data; - char *next_line_ptr; - char *new_start_loc; - Py_ssize_t new_line_len; - _PyBytesWriter writer; - _PyBytesWriter_Init(&writer); - new_data = _PyBytesWriter_Alloc(&writer, nchars); - if (new_data == NULL) { - return NULL; + + // we reach the end of a line + + // if this line has all white space, skip it + if (!leading_whitespace_end) { + continue; } - data_iter = input_data; - c = *data_iter; - while (c) { - // Find the end of the current line. - while ( (c = *data_iter++) != '\n' ){ - if (c == NULL) { + if (!candidate_start) { + candidate_start = line_start; + candidate_len = leading_whitespace_end - line_start; + assert(candidate_len > 0); + } else { + /* We then compare with the current longest leading whitespace. + + [line_start, leading_whitespace_end) is the leading whitespace of + this line, + + [candidate_start, candidate_start + candidate_len) + is the leading whitespace of the current longest leading + whitespace. 
*/ + Py_ssize_t new_candidate_len = 0; + + for (char *candidate_iter = candidate_start, + *line_iter = line_start; + candidate_iter < candidate_start + candidate_len && + line_iter < leading_whitespace_end; + ++candidate_iter, ++line_iter) { + if (*candidate_iter != *line_iter) { break; } + ++new_candidate_len; } - next_line_ptr = data_iter; - Py_ssize_t line_len = next_line_ptr - curr_line_ptr; - if (line_len > num_common_leading_chars){ - new_start_loc = curr_line_ptr + num_common_leading_chars; - new_line_len = line_len - num_common_leading_chars; - } - else { - new_start_loc = curr_line_ptr; - new_line_len = line_len; + + candidate_len = new_candidate_len; + if (candidate_len == 0) { + return bytes; } - // Copy this line over to the new buffer (removing common - // leading chars) - new_data = _PyBytesWriter_WriteBytes(&writer, new_data, new_start_loc, new_line_len); - curr_line_ptr = next_line_ptr; } - PyObject *new_bytes = _PyBytesWriter_Finish(&writer, new_data); - return new_bytes; - } - else { - // No leading chars, no work to be done. 
- return bytes; - } - -#if 0 - // Allocate new data for the output as a copy of the input - PyBytesObject *new_bytes = PyBytes_FromStringAndSize(input_data, nchars); - if (new_bytes == NULL) { - return NULL; } - char *new_data = PyBytes_AsString(new_bytes); - // Step 1: Find N = the common number leading whitespace chars - Py_ssize_t num_common_leading_spaces = nchars + 1; + assert(candidate_len > 0); - // Count the number of leading spaces on each line - // Use the output array as a temporary buffer (we will repopulate it later) - char *line = strtok(new_data, "\n"); - while (line) { - // Move the pointer up to the first non-space character - char *first_nonspace = line; - while (*first_nonspace == ' '){ - first_nonspace++; - } - // Only check lines that contain non-whitespace characters - if (*first_nonspace != '\0') { + // trigger a dedent + char *p; + PyObject *new_bytes; + char *line_start; + Py_ssize_t new_line_len; + bool in_leading_space; + _PyBytesWriter writer; - Py_ssize_t num_leading_spaces = first_nonspace - line; - if (num_leading_spaces < num_common_leading_spaces) { - num_common_leading_spaces = num_leading_spaces; - } - } - line = strtok(NULL, "\n"); + _PyBytesWriter_Init(&writer); + p = _PyBytesWriter_Alloc(&writer, nchars); + if (p == NULL) { + goto error; } - char *end_ptr = input_data + nchars; - char *curr_line_ptr = input_data; - char *next_line_ptr; - char *new_start_loc; - Py_ssize_t new_line_len; + for (char *iter = start; iter < end; ++iter) { + line_start = iter; - // Step 2: Remove N leading whitespace chars from each line by copying data - // (except leading spaces) from the input buffer to the output buffer one - // line at a time. - - char *curr_dst = new_data; - while (curr_line_ptr < end_ptr) { - // Find the end of the current line. 
- next_line_ptr = strstr(curr_line_ptr, "\n"); - if (next_line_ptr == NULL) { - next_line_ptr = end_ptr; - } - else { - next_line_ptr++; + // iterate over a line + while (iter < end && *iter != '\n') { + if (in_leading_space && *iter != ' ' && *iter != '\t') { + in_leading_space = false; + } + ++iter; } - Py_ssize_t line_len = next_line_ptr - curr_line_ptr; + // invariant: *iter == '\n' or iter == end - if (line_len > num_common_leading_spaces){ - new_start_loc = curr_line_ptr + num_common_leading_spaces; - new_line_len = line_len - num_common_leading_spaces; + // if this line has all white space, write '\n' + if (in_leading_space) { + p = _PyBytesWriter_Prepare(&writer, p, 1); + if (p == NULL) { + goto error; + } + *p++ = '\n'; + continue; } - else { - new_start_loc = curr_line_ptr; - new_line_len = line_len; + + // copy [new_line_start + candidate_len, iter) to buffer, then append + // '\n' + new_line_len = iter - line_start - candidate_len; + assert(new_line_len >= 0); + p = _PyBytesWriter_Prepare(&writer, p, new_line_len + 1); + if (p == NULL) { + goto error; } + memcpy(p, line_start + candidate_len, new_line_len); - // Copy the part of the line we want to keep to the new location - strncpy(curr_dst, new_start_loc, new_line_len); - curr_dst += new_line_len; + p += new_line_len; - curr_line_ptr = next_line_ptr; + // this may always append '\n' at the end of the input + *p++ = '\n'; + } + + new_bytes = _PyBytesWriter_Finish(&writer, p); + if (new_bytes == NULL) { + goto error; } - // null terminate the string (is this sufficient?) 
- (*curr_dst) = NULL; return new_bytes; -#endif +error: + _PyBytesWriter_Dealloc(&writer); + return NULL; } - static int pymain_run_command(wchar_t *command) { @@ -430,17 +392,9 @@ pymain_run_command(wchar_t *command) goto error; } - // Only perform auto-dedent if the string starts with a newline - if (*PyBytes_AsString(bytes) == '\n') { - PyObject *new_bytes = _utf_8_bytes_dedent(bytes); - if (new_bytes == NULL) { - goto error; - } - if (new_bytes != bytes) { - // dedent allocated new bytes, replace the old with the new - Py_DECREF(bytes); - } - bytes = new_bytes; + bytes = dedent_utf8_bytes(bytes); + if (bytes == NULL) { + goto error; } PyCompilerFlags cf = _PyCompilerFlags_INIT; From 05d41692a664c18c11e31087e35ff6a484b11b7f Mon Sep 17 00:00:00 2001 From: sunmy2019 <59365878+sunmy2019@users.noreply.github.com> Date: Mon, 1 May 2023 22:15:15 +0800 Subject: [PATCH 15/42] fix missing initialization --- Modules/main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/Modules/main.c b/Modules/main.c index c3c4f1b7ce285c..0d7032f5e63164 100644 --- a/Modules/main.c +++ b/Modules/main.c @@ -323,6 +323,7 @@ static PyObject *dedent_utf8_bytes(PyObject *bytes) { for (char *iter = start; iter < end; ++iter) { line_start = iter; + in_leading_space = true; // iterate over a line while (iter < end && *iter != '\n') { From 9d53c4ef2b390c3e5a3c872cbc98f171dd0b5280 Mon Sep 17 00:00:00 2001 From: sunmy2019 <59365878+sunmy2019@users.noreply.github.com> Date: Mon, 1 May 2023 22:20:57 +0800 Subject: [PATCH 16/42] fix ref leak --- Modules/main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Modules/main.c b/Modules/main.c index 0d7032f5e63164..2b7fa7a97da775 100644 --- a/Modules/main.c +++ b/Modules/main.c @@ -357,7 +357,7 @@ static PyObject *dedent_utf8_bytes(PyObject *bytes) { p += new_line_len; - // this may always append '\n' at the end of the input + // this may always append '\n' at the end of `new_bytes` *p++ = '\n'; } @@ -365,6 +365,7 @@ static PyObject 
*dedent_utf8_bytes(PyObject *bytes) { if (new_bytes == NULL) { goto error; } + Py_DECREF(bytes); return new_bytes; error: From 689a13a4624b7b655b61b9271a1985ff475d51c5 Mon Sep 17 00:00:00 2001 From: sunmy2019 <59365878+sunmy2019@users.noreply.github.com> Date: Mon, 1 May 2023 22:25:10 +0800 Subject: [PATCH 17/42] fix empty string --- Modules/main.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Modules/main.c b/Modules/main.c index 2b7fa7a97da775..5758b2f702391d 100644 --- a/Modules/main.c +++ b/Modules/main.c @@ -9,6 +9,7 @@ #include "pycore_pystate.h" // _PyInterpreterState_GET() /* Includes for exit_sigint() */ +#include #include // perror() #ifdef HAVE_SIGNAL_H # include // SIGINT @@ -305,6 +306,9 @@ static PyObject *dedent_utf8_bytes(PyObject *bytes) { } } + if (candidate_len == 0) { + return bytes; + } assert(candidate_len > 0); // trigger a dedent From f0ac7ea9fc5b365602fb07e8159216b2609358fd Mon Sep 17 00:00:00 2001 From: sunmy2019 <59365878+sunmy2019@users.noreply.github.com> Date: Mon, 1 May 2023 22:29:16 +0800 Subject: [PATCH 18/42] nit: remove unnecessary variable --- Modules/main.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/Modules/main.c b/Modules/main.c index 5758b2f702391d..4267deefe6c60d 100644 --- a/Modules/main.c +++ b/Modules/main.c @@ -254,9 +254,8 @@ static PyObject *dedent_utf8_bytes(PyObject *bytes) { char *leading_whitespace_end = NULL; // scan the whole line - char c = 0; - while (iter < end && (c = *iter) != '\n') { - if (!leading_whitespace_end && c != ' ' && c != '\t') { + while (iter < end && *iter != '\n') { + if (!leading_whitespace_end && *iter != ' ' && *iter != '\t') { if (iter == line_start) { // some line has no indent, fast exit! 
return bytes; @@ -266,8 +265,6 @@ static PyObject *dedent_utf8_bytes(PyObject *bytes) { ++iter; } - // we reach the end of a line - // if this line has all white space, skip it if (!leading_whitespace_end) { continue; From 71cad010de23e64bf7a04d0b732a8a06bfd92870 Mon Sep 17 00:00:00 2001 From: sunmy2019 <59365878+sunmy2019@users.noreply.github.com> Date: Mon, 1 May 2023 22:29:54 +0800 Subject: [PATCH 19/42] remove unnecessary include --- Modules/main.c | 1 - 1 file changed, 1 deletion(-) diff --git a/Modules/main.c b/Modules/main.c index 4267deefe6c60d..75954fca0a2fca 100644 --- a/Modules/main.c +++ b/Modules/main.c @@ -9,7 +9,6 @@ #include "pycore_pystate.h" // _PyInterpreterState_GET() /* Includes for exit_sigint() */ -#include #include // perror() #ifdef HAVE_SIGNAL_H # include // SIGINT From 4549de811225ca846e5fb8928750cb31e6fe5c5f Mon Sep 17 00:00:00 2001 From: joncrall Date: Mon, 1 May 2023 12:44:48 -0400 Subject: [PATCH 20/42] Add test cases --- Lib/test/test_cmd_line.py | 69 +++++++++++++++++++++++++++++++++++++++ Modules/main.c | 2 +- 2 files changed, 70 insertions(+), 1 deletion(-) diff --git a/Lib/test/test_cmd_line.py b/Lib/test/test_cmd_line.py index 94298003063593..f9310a598c674a 100644 --- a/Lib/test/test_cmd_line.py +++ b/Lib/test/test_cmd_line.py @@ -901,6 +901,75 @@ def res2int(res): ) self.assertEqual(res2int(res), (6000, 6000)) + def test_cmd_dedent(self): + # test that -c auto-dedents its arguments + from textwrap import dedent + test_cases = [ + { + 'code': ''' + print('space-auto-dedent') + ''', + 'expected': b'space-auto-dedent', + }, + { + 'code': dedent(''' + ^^^print('tab-auto-dedent') + ''').replace('^', '\t'), + 'expected': b'tab-auto-dedent', + }, + { + 'code': dedent(''' + ^^if 1: + ^^^^print('mixed-auto-dedent-1') + ^^print('mixed-auto-dedent-2') + ''').replace('^', '\t \t'), + 'expected': b'mixed-auto-dedent-1\nmixed-auto-dedent-2', + }, + { + 'code': ''' + data = """$ + + this data has an empty newline above and a newline with 
spaces below $ + $ + """$ + if 1: $ + print(repr(data))$ + '''.replace('$', ''), + # Note: entirely blank lines are normalized to \n, even if they + # are part of a data string. This is consistent with + # textwrap.dedent behavior, but might not be intuitive. + 'expected': b"'\\n\\nthis data has an empty newline above and a newline with spaces below \\n\\n'", + }, + ] + for case in test_cases: + # Run the auto-dedent case + args1 = sys.executable, '-c', case['code'] + proc1 = subprocess.run(args1, stdout=subprocess.PIPE) + self.assertEqual(proc1.returncode, 0, proc1) + output1 = proc1.stdout.strip() + + # Manually dedent beforehand, check the result is the same. + args2 = sys.executable, '-c', dedent(case['code']) + proc2 = subprocess.run(args2, stdout=subprocess.PIPE) + self.assertEqual(proc2.returncode, 0, proc2) + output2 = proc2.stdout.strip() + + self.assertEqual(output1, output2) + self.assertEqual(output1, case['expected']) + + def test_cmd_dedent_failcase(self): + # Mixing tabs and spaces is not allowed + from textwrap import dedent + template = dedent( + ''' + -+if 1: + +-++ print('will fail') + ''') + code = template.replace('-', ' ').replace('+', '\t') + assert_python_failure('-c', code) + code = template.replace('-', '\t').replace('+', ' ') + assert_python_failure('-c', code) + @unittest.skipIf(interpreter_requires_environment(), 'Cannot run -I tests when PYTHON env vars are required.') diff --git a/Modules/main.c b/Modules/main.c index 75954fca0a2fca..bfcdd3d5f8c07b 100644 --- a/Modules/main.c +++ b/Modules/main.c @@ -229,7 +229,7 @@ pymain_import_readline(const PyConfig *config) } /* Strip common leading whitespace, just as textwrap.dedent. - It stoles 1 reference from bytes if succeeded, else it will return NULL. */ + It steals 1 reference from bytes if succeeded, else it will return NULL. 
*/ static PyObject *dedent_utf8_bytes(PyObject *bytes) { if (bytes == NULL || !PyBytes_CheckExact(bytes)) { return NULL; From 0c3b90b12ed9b9e8e3901b6ead172b8343742bf9 Mon Sep 17 00:00:00 2001 From: joncrall Date: Mon, 1 May 2023 13:46:03 -0400 Subject: [PATCH 21/42] Fix test on windows --- Lib/test/test_cmd_line.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/test/test_cmd_line.py b/Lib/test/test_cmd_line.py index f9310a598c674a..f518b7407dfdaf 100644 --- a/Lib/test/test_cmd_line.py +++ b/Lib/test/test_cmd_line.py @@ -955,7 +955,7 @@ def test_cmd_dedent(self): output2 = proc2.stdout.strip() self.assertEqual(output1, output2) - self.assertEqual(output1, case['expected']) + self.assertEqual(output1.replace(b'\r', b''), case['expected']) def test_cmd_dedent_failcase(self): # Mixing tabs and spaces is not allowed From 1f5b7463abe2a169b896db2e8086a9c498001ec8 Mon Sep 17 00:00:00 2001 From: joncrall Date: Mon, 1 May 2023 19:44:37 -0400 Subject: [PATCH 22/42] normalize windows line endings --- Lib/test/test_cmd_line.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/test/test_cmd_line.py b/Lib/test/test_cmd_line.py index f518b7407dfdaf..89f7a299fb901c 100644 --- a/Lib/test/test_cmd_line.py +++ b/Lib/test/test_cmd_line.py @@ -955,7 +955,7 @@ def test_cmd_dedent(self): output2 = proc2.stdout.strip() self.assertEqual(output1, output2) - self.assertEqual(output1.replace(b'\r', b''), case['expected']) + self.assertEqual(output1.replace(b'\r\n', b'\n'), case['expected']) def test_cmd_dedent_failcase(self): # Mixing tabs and spaces is not allowed From 1f17e23c3d4af2c082beebd1b855e1bc262e4d71 Mon Sep 17 00:00:00 2001 From: sunmy2019 <59365878+sunmy2019@users.noreply.github.com> Date: Thu, 4 May 2023 02:32:42 +0800 Subject: [PATCH 23/42] Update Modules/main.c Co-authored-by: Kirill Podoprigora <80244920+Eclips4@users.noreply.github.com> --- Modules/main.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git 
a/Modules/main.c b/Modules/main.c index bfcdd3d5f8c07b..b016783f69beb1 100644 --- a/Modules/main.c +++ b/Modules/main.c @@ -230,7 +230,9 @@ pymain_import_readline(const PyConfig *config) /* Strip common leading whitespace, just as textwrap.dedent. It steals 1 reference from bytes if succeeded, else it will return NULL. */ -static PyObject *dedent_utf8_bytes(PyObject *bytes) { +static PyObject * +dedent_utf8_bytes(PyObject *bytes) +{ if (bytes == NULL || !PyBytes_CheckExact(bytes)) { return NULL; } From c84616c1dacaebd8f75b2eb4c912860b43947d87 Mon Sep 17 00:00:00 2001 From: sunmy2019 <59365878+sunmy2019@users.noreply.github.com> Date: Sun, 23 Jul 2023 11:39:28 +0800 Subject: [PATCH 24/42] refactor code --- Include/internal/pycore_bytesobject.h | 25 +++ ...-04-29-23-15-38.gh-issue-103997.BS3uVt.rst | 8 +- Modules/main.c | 148 +++--------------- Objects/bytesobject.c | 147 +++++++++++++++++ 4 files changed, 197 insertions(+), 131 deletions(-) diff --git a/Include/internal/pycore_bytesobject.h b/Include/internal/pycore_bytesobject.h index 115c0c52c8f9a9..f1167c1de52233 100644 --- a/Include/internal/pycore_bytesobject.h +++ b/Include/internal/pycore_bytesobject.h @@ -122,6 +122,31 @@ PyAPI_FUNC(void*) _PyBytesWriter_WriteBytes(_PyBytesWriter *writer, const void *bytes, Py_ssize_t size); + +/** Dedent a UTF-8 encoded string. + * behavior is expected to match `textwrap.dedent` + * + * return value: + * 0, no need to dedent, writer untouched + * 1, success + * -1, failure + * + * str is the beginning of the string to dedent. + * expecting (str != NULL) + * + * len is the length of the string to dedent. + * expecting (len >= 0) + * + * writer is a _PyBytesWriter object to write the dedented string. + * expecting (writer != NULL) + * + * p points to a char* indicating the current position in the _PyBytesWriter. + * It is updated to the new position after writing the dedented string on exit. 
+ * expecting (p != NULL && *p != NULL) + */ +PyAPI_FUNC(int) +_PyBytes_Dedent(const char *str, Py_ssize_t len, _PyBytesWriter *writer, + char **p); #ifdef __cplusplus } #endif diff --git a/Misc/NEWS.d/next/Core and Builtins/2023-04-29-23-15-38.gh-issue-103997.BS3uVt.rst b/Misc/NEWS.d/next/Core and Builtins/2023-04-29-23-15-38.gh-issue-103997.BS3uVt.rst index 8949f435731e34..c02978d024bcc8 100644 --- a/Misc/NEWS.d/next/Core and Builtins/2023-04-29-23-15-38.gh-issue-103997.BS3uVt.rst +++ b/Misc/NEWS.d/next/Core and Builtins/2023-04-29-23-15-38.gh-issue-103997.BS3uVt.rst @@ -1 +1,7 @@ -Strings passed to "-c" are now automatically dedented (common leading whitespace is removed). This allows "python -c" invocations to be indented in shell scripts without causing indentation errors. +String arguments passed to "-c" are now automatically dedented as if by +:func:`textwrap.dedent`. This allows "python -c" invocations to be indented + in shell scripts without causing indentation errors. + +Add a private API :c:func:`_PyBytes_Dedent`. + +(Patch by Jon Crall and Steven Sun) diff --git a/Modules/main.c b/Modules/main.c index 0cd99628cdf64d..cd9a5b84f9a0b6 100644 --- a/Modules/main.c +++ b/Modules/main.c @@ -7,6 +7,7 @@ #include "pycore_pathconfig.h" // _PyPathConfig_ComputeSysPath0() #include "pycore_pylifecycle.h" // _Py_PreInitializeFromPyArgv() #include "pycore_pystate.h" // _PyInterpreterState_GET() +#include "pycore_bytesobject.h" // _PyBytesWriter, _PyBytes_Dedent() /* Includes for exit_sigint() */ #include // perror() @@ -229,150 +230,35 @@ pymain_import_readline(const PyConfig *config) } /* Strip common leading whitespace, just as textwrap.dedent. - It steals 1 reference from bytes if succeeded, else it will return NULL. */ + It returns a new reference. 
*/ static PyObject * dedent_utf8_bytes(PyObject *bytes) { - if (bytes == NULL || !PyBytes_CheckExact(bytes)) { - return NULL; - } + assert(bytes == NULL || !PyBytes_CheckExact(bytes->ob_type)); - char *start; Py_ssize_t nchars; - + char *start; if (PyBytes_AsStringAndSize(bytes, &start, &nchars) != 0) { return NULL; } - char *end = start + nchars; - assert(start < end); - - char *candidate_start = NULL; - Py_ssize_t candidate_len = 0; - - for (char *iter = start; iter < end; ++iter) { - char *line_start = iter; - char *leading_whitespace_end = NULL; - - // scan the whole line - while (iter < end && *iter != '\n') { - if (!leading_whitespace_end && *iter != ' ' && *iter != '\t') { - if (iter == line_start) { - // some line has no indent, fast exit! - return bytes; - } - leading_whitespace_end = iter; - } - ++iter; - } - - // if this line has all white space, skip it - if (!leading_whitespace_end) { - continue; - } - - if (!candidate_start) { - candidate_start = line_start; - candidate_len = leading_whitespace_end - line_start; - assert(candidate_len > 0); - } else { - /* We then compare with the current longest leading whitespace. - - [line_start, leading_whitespace_end) is the leading whitespace of - this line, - - [candidate_start, candidate_start + candidate_len) - is the leading whitespace of the current longest leading - whitespace. 
*/ - Py_ssize_t new_candidate_len = 0; - - for (char *candidate_iter = candidate_start, - *line_iter = line_start; - candidate_iter < candidate_start + candidate_len && - line_iter < leading_whitespace_end; - ++candidate_iter, ++line_iter) { - if (*candidate_iter != *line_iter) { - break; - } - ++new_candidate_len; - } - - candidate_len = new_candidate_len; - if (candidate_len == 0) { - return bytes; - } - } - } - - if (candidate_len == 0) { - return bytes; - } - assert(candidate_len > 0); - - // trigger a dedent - char *p; - PyObject *new_bytes; - char *line_start; - Py_ssize_t new_line_len; - bool in_leading_space; _PyBytesWriter writer; - _PyBytesWriter_Init(&writer); - p = _PyBytesWriter_Alloc(&writer, nchars); + char *p = _PyBytesWriter_Alloc(&writer, nchars); if (p == NULL) { - goto error; + return NULL; } - for (char *iter = start; iter < end; ++iter) { - line_start = iter; - in_leading_space = true; - - // iterate over a line - while (iter < end && *iter != '\n') { - if (in_leading_space && *iter != ' ' && *iter != '\t') { - in_leading_space = false; - } - ++iter; - } - - // invariant: *iter == '\n' or iter == end - - // if this line has all white space, write '\n' - if (in_leading_space) { - p = _PyBytesWriter_Prepare(&writer, p, 1); - if (p == NULL) { - goto error; - } - *p++ = '\n'; - continue; - } - - // copy [new_line_start + candidate_len, iter) to buffer, then append - // '\n' - new_line_len = iter - line_start - candidate_len; - assert(new_line_len >= 0); - p = _PyBytesWriter_Prepare(&writer, p, new_line_len + 1); - if (p == NULL) { - goto error; - } - memcpy(p, line_start + candidate_len, new_line_len); - - p += new_line_len; - - // this may always append '\n' at the end of `new_bytes` - *p++ = '\n'; + int ret = _PyBytes_Dedent(start, nchars, &writer, &p); + if (ret < 0) { + return NULL; } - - new_bytes = _PyBytesWriter_Finish(&writer, p); - if (new_bytes == NULL) { - goto error; + if (ret == 0) { + Py_INCREF(bytes); + 
_PyBytesWriter_Dealloc(&writer); + return bytes; } - Py_DECREF(bytes); - return new_bytes; - -error: - _PyBytesWriter_Dealloc(&writer); - return NULL; + return _PyBytesWriter_Finish(&writer, p); } static int @@ -396,10 +282,12 @@ pymain_run_command(wchar_t *command) goto error; } - bytes = dedent_utf8_bytes(bytes); - if (bytes == NULL) { + PyObject *new_bytes = dedent_utf8_bytes(bytes); + if (new_bytes == NULL) { + Py_DECREF(bytes); goto error; } + Py_SETREF(bytes, new_bytes); PyCompilerFlags cf = _PyCompilerFlags_INIT; cf.cf_flags |= PyCF_IGNORE_COOKIE; diff --git a/Objects/bytesobject.c b/Objects/bytesobject.c index 6b9231a9fa7693..8c82c6e1ab1dbe 100644 --- a/Objects/bytesobject.c +++ b/Objects/bytesobject.c @@ -3535,6 +3535,8 @@ _PyBytesWriter_WriteBytes(_PyBytesWriter *writer, void *ptr, } +/* Algorithms on bytes */ + void _PyBytes_Repeat(char* dest, Py_ssize_t len_dest, const char* src, Py_ssize_t len_src) @@ -3558,3 +3560,148 @@ _PyBytes_Repeat(char* dest, Py_ssize_t len_dest, } } +/** Dedent a UTF-8 encoded string. + * behavior is expected to match `textwrap.dedent` + * + * return value: + * 0, no need to dedent, writer untouched + * 1, success + * -1, failure + * + * str is the beginning of the string to dedent. + * expecting (str != NULL) + * + * len is the length of the string to dedent. + * expecting (len >= 0) + * + * writer is a _PyBytesWriter object to write the dedented string. + * expecting (writer != NULL) + * + * p points to a char* indicating the current position in the _PyBytesWriter. + * It is updated to the new position after writing the dedented string on exit. 
+ * expecting (p != NULL && *p != NULL) + */ +int +_PyBytes_Dedent(const char *str, Py_ssize_t len, _PyBytesWriter *writer, + char **p) +{ + assert(str); + assert(p != NULL && *p != NULL); + assert(writer); + + if (len <= 0) + return 0; + + const char *end = str + len; + assert(str < end); // prevent overflow when len is too large + + const char *candidate_start = NULL; + Py_ssize_t candidate_len = 0; + + for (const char *iter = str; iter < end; ++iter) { + const char *line_start = iter; + const char *leading_whitespace_end = NULL; + + // scan the whole line + while (iter < end && *iter != '\n') { + if (!leading_whitespace_end && *iter != ' ' && *iter != '\t') { + if (iter == line_start) { + // some line has no indent, fast exit! + return 0; + } + leading_whitespace_end = iter; + } + ++iter; + } + + // if this line has all white space, skip it + if (!leading_whitespace_end) { + continue; + } + + if (!candidate_start) { + candidate_start = line_start; + candidate_len = leading_whitespace_end - line_start; + assert(candidate_len > 0); + } else { + /* We then compare with the current longest leading whitespace. + + [line_start, leading_whitespace_end) is the leading whitespace of + this line, + + [candidate_start, candidate_start + candidate_len) + is the leading whitespace of the current longest leading + whitespace. 
*/ + Py_ssize_t new_candidate_len = 0; + + for (const char *candidate_iter = candidate_start, + *line_iter = line_start; + candidate_iter < candidate_start + candidate_len && + line_iter < leading_whitespace_end; + ++candidate_iter, ++line_iter) { + if (*candidate_iter != *line_iter) { + break; + } + ++new_candidate_len; + } + + candidate_len = new_candidate_len; + if (candidate_len == 0) { + return 0; + } + } + } + + assert(candidate_len >= 0); + if (candidate_len == 0) { + return 0; + } + + // trigger a dedent + + // prepare the writer + char *p_ = _PyBytesWriter_Prepare(writer, *p, len); + if (p_ == NULL) { + *p = NULL; + return -1; + } + + for (const char *iter = str; iter < end; ++iter) { + const char *line_start = iter; + bool in_leading_space = true; + + // iterate over a line to find the end of a line + while (iter < end && *iter != '\n') { + if (in_leading_space && *iter != ' ' && *iter != '\t') { + in_leading_space = false; + } + ++iter; + } + + // invariant: *iter == '\n' or iter == end + bool append_newline = iter < end; + + // if this line has all white space, write '\n' + if (in_leading_space && append_newline) { + *p_++ = '\n'; + continue; + } + + /* copy [new_line_start + candidate_len, iter) to buffer, then + conditionally append '\n' */ + + Py_ssize_t new_line_len = iter - line_start - candidate_len; + assert(new_line_len >= 0); + + memcpy(p_, line_start + candidate_len, new_line_len); + + p_ += new_line_len; + + if (append_newline) { + *p_++ = '\n'; + } + } + + *p = p_; + return 1; +} From a19b67564eb07767e2fc53c99cb21b09c2173e38 Mon Sep 17 00:00:00 2001 From: sunmy2019 <59365878+sunmy2019@users.noreply.github.com> Date: Sun, 23 Jul 2023 11:58:51 +0800 Subject: [PATCH 25/42] Apply suggestions from code review --- Include/internal/pycore_bytesobject.h | 2 +- Objects/bytesobject.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Include/internal/pycore_bytesobject.h b/Include/internal/pycore_bytesobject.h index 
f1167c1de52233..12beb1617705c3 100644 --- a/Include/internal/pycore_bytesobject.h +++ b/Include/internal/pycore_bytesobject.h @@ -136,7 +136,7 @@ PyAPI_FUNC(void*) _PyBytesWriter_WriteBytes(_PyBytesWriter *writer, * * len is the length of the string to dedent. * expecting (len >= 0) - * + * * writer is a _PyBytesWriter object to write the dedented string. * expecting (writer != NULL) * diff --git a/Objects/bytesobject.c b/Objects/bytesobject.c index 8c82c6e1ab1dbe..7c27eda7e39056 100644 --- a/Objects/bytesobject.c +++ b/Objects/bytesobject.c @@ -3573,7 +3573,7 @@ _PyBytes_Repeat(char* dest, Py_ssize_t len_dest, * * len is the length of the string to dedent. * expecting (len >= 0) - * + * * writer is a _PyBytesWriter object to write the dedented string. * expecting (writer != NULL) * @@ -3584,7 +3584,7 @@ _PyBytes_Repeat(char* dest, Py_ssize_t len_dest, int _PyBytes_Dedent(const char *str, Py_ssize_t len, _PyBytesWriter *writer, char **p) -{ +{ assert(str); assert(p != NULL && *p != NULL); assert(writer); From 7ce411f5fc4ddb76fc81a2d6b0e354de5546874b Mon Sep 17 00:00:00 2001 From: sunmy2019 <59365878+sunmy2019@users.noreply.github.com> Date: Sun, 23 Jul 2023 17:26:50 +0800 Subject: [PATCH 26/42] Update Misc/NEWS.d/next/Core and Builtins/2023-04-29-23-15-38.gh-issue-103997.BS3uVt.rst Co-authored-by: Inada Naoki --- .../2023-04-29-23-15-38.gh-issue-103997.BS3uVt.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Core and Builtins/2023-04-29-23-15-38.gh-issue-103997.BS3uVt.rst b/Misc/NEWS.d/next/Core and Builtins/2023-04-29-23-15-38.gh-issue-103997.BS3uVt.rst index c02978d024bcc8..adeb39570bcd9d 100644 --- a/Misc/NEWS.d/next/Core and Builtins/2023-04-29-23-15-38.gh-issue-103997.BS3uVt.rst +++ b/Misc/NEWS.d/next/Core and Builtins/2023-04-29-23-15-38.gh-issue-103997.BS3uVt.rst @@ -1,6 +1,6 @@ String arguments passed to "-c" are now automatically dedented as if by :func:`textwrap.dedent`. 
This allows "python -c" invocations to be indented - in shell scripts without causing indentation errors. +in shell scripts without causing indentation errors. Add a private API :c:func:`_PyBytes_Dedent`. From dea43017537845f82da1d14538fdcaab1b65837d Mon Sep 17 00:00:00 2001 From: sunmy2019 <59365878+sunmy2019@users.noreply.github.com> Date: Sun, 23 Jul 2023 18:20:47 +0800 Subject: [PATCH 27/42] resolve comments --- Include/internal/pycore_bytesobject.h | 49 +++++++++++----------- Modules/main.c | 20 ++++----- Objects/bytesobject.c | 59 +++++++++++---------------- 3 files changed, 58 insertions(+), 70 deletions(-) diff --git a/Include/internal/pycore_bytesobject.h b/Include/internal/pycore_bytesobject.h index 12beb1617705c3..c8d034b0e64d91 100644 --- a/Include/internal/pycore_bytesobject.h +++ b/Include/internal/pycore_bytesobject.h @@ -41,6 +41,30 @@ PyAPI_FUNC(void) _PyBytes_Repeat(char* dest, Py_ssize_t len_dest, const char* src, Py_ssize_t len_src); + +/** Dedent a UTF-8 encoded string. + * behavior is expected to match `textwrap.dedent` + * + * return value: + * 0, no need to dedent, `out_len` untouched + * 1, success + * + * `src` is the string to dedent. + * expecting `(src != NULL)` + * + * `src_len` is the length of `src`. + * + * `out` is a buffer for the result. + * expecting `(out != NULL)` + * + * `out_len` points to the length of `out`, and is updated to the length of the + * result upon success. Output buffer should be large enough to hold the result. + * expecting `(out_len != NULL && *out_len >= src_len)` + */ +PyAPI_FUNC(int) +_PyBytes_Dedent(const char *src, Py_ssize_t src_len, char* out, + Py_ssize_t* out_len); + /* --- _PyBytesWriter ----------------------------------------------------- */ /* The _PyBytesWriter structure is big: it contains an embedded "stack buffer". @@ -122,31 +146,6 @@ PyAPI_FUNC(void*) _PyBytesWriter_WriteBytes(_PyBytesWriter *writer, const void *bytes, Py_ssize_t size); - -/** Dedent a UTF-8 encoded string. 
- * behavior is expected to match `textwrap.dedent` - * - * return value: - * 0, no need to dedent, writer untouched - * 1, success - * -1, failure - * - * str is the beginning of the string to dedent. - * expecting (str != NULL) - * - * len is the length of the string to dedent. - * expecting (len >= 0) - * - * writer is a _PyBytesWriter object to write the dedented string. - * expecting (writer != NULL) - * - * p points to a char* indicating the current position in the _PyBytesWriter. - * It is updated to the new position after writing the dedented string on exit. - * expecting (p != NULL && *p != NULL) - */ -PyAPI_FUNC(int) -_PyBytes_Dedent(const char *str, Py_ssize_t len, _PyBytesWriter *writer, - char **p); #ifdef __cplusplus } #endif diff --git a/Modules/main.c b/Modules/main.c index cd9a5b84f9a0b6..6853858b01ef74 100644 --- a/Modules/main.c +++ b/Modules/main.c @@ -234,7 +234,7 @@ pymain_import_readline(const PyConfig *config) static PyObject * dedent_utf8_bytes(PyObject *bytes) { - assert(bytes == NULL || !PyBytes_CheckExact(bytes->ob_type)); + assert(bytes != NULL && PyBytes_CheckExact(bytes)); Py_ssize_t nchars; char *start; @@ -242,23 +242,23 @@ dedent_utf8_bytes(PyObject *bytes) return NULL; } - _PyBytesWriter writer; - _PyBytesWriter_Init(&writer); - char *p = _PyBytesWriter_Alloc(&writer, nchars); + char* p = PyMem_Malloc(nchars); if (p == NULL) { + PyErr_NoMemory(); return NULL; } - int ret = _PyBytes_Dedent(start, nchars, &writer, &p); - if (ret < 0) { - return NULL; - } + int ret = _PyBytes_Dedent(start, nchars, p, &nchars); + if (ret == 0) { Py_INCREF(bytes); - _PyBytesWriter_Dealloc(&writer); + PyMem_Free(p); return bytes; } - return _PyBytesWriter_Finish(&writer, p); + + PyObject* new_bytes = PyBytes_FromStringAndSize(p, nchars); + PyMem_Free(p); + return new_bytes; } static int diff --git a/Objects/bytesobject.c b/Objects/bytesobject.c index 7c27eda7e39056..1afae469d75211 100644 --- a/Objects/bytesobject.c +++ b/Objects/bytesobject.c @@ 
-3564,41 +3564,37 @@ _PyBytes_Repeat(char* dest, Py_ssize_t len_dest, * behavior is expected to match `textwrap.dedent` * * return value: - * 0, no need to dedent, writer untouched + * 0, no need to dedent, `out_len` untouched * 1, success - * -1, failure * - * str is the beginning of the string to dedent. - * expecting (str != NULL) + * `src` is the string to dedent. + * expecting `(src != NULL)` * - * len is the length of the string to dedent. - * expecting (len >= 0) + * `src_len` is the length of `src`. * - * writer is a _PyBytesWriter object to write the dedented string. - * expecting (writer != NULL) + * `out` is a buffer for the result. + * expecting `(out != NULL)` * - * p points to a char* indicating the current position in the _PyBytesWriter. - * It is updated to the new position after writing the dedented string on exit. - * expecting (p != NULL && *p != NULL) + * `out_len` points to the length of `out`, and is updated to the length of the + * result upon success. Output buffer should be large enough to hold the result. 
+ * expecting `(out_len != NULL && *out_len >= src_len)` */ int -_PyBytes_Dedent(const char *str, Py_ssize_t len, _PyBytesWriter *writer, - char **p) -{ - assert(str); - assert(p != NULL && *p != NULL); - assert(writer); +_PyBytes_Dedent(const char *src, Py_ssize_t src_len, char *out, + Py_ssize_t *out_len) { + assert(src && out && out_len); + assert(*out_len >= src_len); - if (len <= 0) + if (src_len <= 0) return 0; - const char *end = str + len; - assert(str < end); // prevent overflow when len is too large + const char *end = src + src_len; + assert(src < end); // prevent overflow when src_len is too large const char *candidate_start = NULL; Py_ssize_t candidate_len = 0; - for (const char *iter = str; iter < end; ++iter) { + for (const char *iter = src; iter < end; ++iter) { const char *line_start = iter; const char *leading_whitespace_end = NULL; @@ -3658,15 +3654,9 @@ _PyBytes_Dedent(const char *str, Py_ssize_t len, _PyBytesWriter *writer, } // trigger a dedent + char *out_start = out; - // prepare the writer - char *p_ = _PyBytesWriter_Prepare(writer, *p, len); - if (p_ == NULL) { - *p = NULL; - return -1; - } - - for (const char *iter = str; iter < end; ++iter) { + for (const char *iter = src; iter < end; ++iter) { const char *line_start = iter; bool in_leading_space = true; @@ -3683,7 +3673,7 @@ _PyBytes_Dedent(const char *str, Py_ssize_t len, _PyBytesWriter *writer, // if this line has all white space, write '\n' if (in_leading_space && append_newline) { - *p_++ = '\n'; + *out++ = '\n'; continue; } @@ -3693,15 +3683,14 @@ _PyBytes_Dedent(const char *str, Py_ssize_t len, _PyBytesWriter *writer, Py_ssize_t new_line_len = iter - line_start - candidate_len; assert(new_line_len >= 0); - memcpy(p_, line_start + candidate_len, new_line_len); + memcpy(out, line_start + candidate_len, new_line_len); - p_ += new_line_len; + out += new_line_len; if (append_newline) { - *p_++ = '\n'; + *out++ = '\n'; } } - - *p = p_; + *out_len = out - out_start; return 1; } From 
e06d40cd9f419291f31d14a563f4634b5af8bd85 Mon Sep 17 00:00:00 2001 From: sunmy2019 <59365878+sunmy2019@users.noreply.github.com> Date: Sun, 23 Jul 2023 19:46:55 +0800 Subject: [PATCH 28/42] Update Modules/main.c --- Modules/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Modules/main.c b/Modules/main.c index 6853858b01ef74..913bef7921eb4d 100644 --- a/Modules/main.c +++ b/Modules/main.c @@ -7,7 +7,7 @@ #include "pycore_pathconfig.h" // _PyPathConfig_ComputeSysPath0() #include "pycore_pylifecycle.h" // _Py_PreInitializeFromPyArgv() #include "pycore_pystate.h" // _PyInterpreterState_GET() -#include "pycore_bytesobject.h" // _PyBytesWriter, _PyBytes_Dedent() +#include "pycore_bytesobject.h" // _PyBytes_Dedent() /* Includes for exit_sigint() */ #include // perror() From a40d028324c4159a8526f3e69e2cd1765952b1bc Mon Sep 17 00:00:00 2001 From: sunmy2019 <59365878+sunmy2019@users.noreply.github.com> Date: Sun, 23 Jul 2023 23:12:26 +0800 Subject: [PATCH 29/42] rename `out` to `dest` --- Include/internal/pycore_bytesobject.h | 17 +++++++------- Objects/bytesobject.c | 33 ++++++++++++++------------- 2 files changed, 26 insertions(+), 24 deletions(-) diff --git a/Include/internal/pycore_bytesobject.h b/Include/internal/pycore_bytesobject.h index c8d034b0e64d91..411dbcc6edc9ea 100644 --- a/Include/internal/pycore_bytesobject.h +++ b/Include/internal/pycore_bytesobject.h @@ -46,7 +46,7 @@ _PyBytes_Repeat(char* dest, Py_ssize_t len_dest, * behavior is expected to match `textwrap.dedent` * * return value: - * 0, no need to dedent, `out_len` untouched + * 0, no need to dedent, `dest` buffer and `*dest_len` untouched * 1, success * * `src` is the string to dedent. @@ -54,16 +54,17 @@ _PyBytes_Repeat(char* dest, Py_ssize_t len_dest, * * `src_len` is the length of `src`. * - * `out` is a buffer for the result. - * expecting `(out != NULL)` + * `dest` is a buffer for the result. 
+ * expecting `(dest != NULL)` * - * `out_len` points to the length of `out`, and is updated to the length of the - * result upon success. Output buffer should be large enough to hold the result. - * expecting `(out_len != NULL && *out_len >= src_len)` + * `*dest_len` stores the length of `dest` on entry, and is updated to the + * length of the dedent result upon success. Output buffer should be large + * enough to hold the result. + * expecting `(dest_len != NULL && *dest_len >= src_len)` */ PyAPI_FUNC(int) -_PyBytes_Dedent(const char *src, Py_ssize_t src_len, char* out, - Py_ssize_t* out_len); +_PyBytes_Dedent(const char *src, Py_ssize_t src_len, char* dest, + Py_ssize_t* dest_len); /* --- _PyBytesWriter ----------------------------------------------------- */ diff --git a/Objects/bytesobject.c b/Objects/bytesobject.c index 1afae469d75211..6de0218459939c 100644 --- a/Objects/bytesobject.c +++ b/Objects/bytesobject.c @@ -3564,7 +3564,7 @@ _PyBytes_Repeat(char* dest, Py_ssize_t len_dest, * behavior is expected to match `textwrap.dedent` * * return value: - * 0, no need to dedent, `out_len` untouched + * 0, no need to dedent, `dest` buffer and `*dest_len` untouched * 1, success * * `src` is the string to dedent. @@ -3572,18 +3572,19 @@ _PyBytes_Repeat(char* dest, Py_ssize_t len_dest, * * `src_len` is the length of `src`. * - * `out` is a buffer for the result. - * expecting `(out != NULL)` + * `dest` is a buffer for the result. + * expecting `(dest != NULL)` * - * `out_len` points to the length of `out`, and is updated to the length of the - * result upon success. Output buffer should be large enough to hold the result. - * expecting `(out_len != NULL && *out_len >= src_len)` + * `*dest_len` stores the length of `dest` on entry, and is updated to the + * length of the dedent result upon success. Output buffer should be large + * enough to hold the result. 
+ * expecting `(dest_len != NULL && *dest_len >= src_len)` */ int -_PyBytes_Dedent(const char *src, Py_ssize_t src_len, char *out, - Py_ssize_t *out_len) { - assert(src && out && out_len); - assert(*out_len >= src_len); +_PyBytes_Dedent(const char *src, Py_ssize_t src_len, char *dest, + Py_ssize_t *dest_len) { + assert(src && dest && dest_len); + assert(*dest_len >= src_len); if (src_len <= 0) return 0; @@ -3654,7 +3655,7 @@ _PyBytes_Dedent(const char *src, Py_ssize_t src_len, char *out, } // trigger a dedent - char *out_start = out; + char *dest_start = dest; for (const char *iter = src; iter < end; ++iter) { const char *line_start = iter; @@ -3673,7 +3674,7 @@ _PyBytes_Dedent(const char *src, Py_ssize_t src_len, char *out, // if this line has all white space, write '\n' if (in_leading_space && append_newline) { - *out++ = '\n'; + *dest++ = '\n'; continue; } @@ -3683,14 +3684,14 @@ _PyBytes_Dedent(const char *src, Py_ssize_t src_len, char *out, Py_ssize_t new_line_len = iter - line_start - candidate_len; assert(new_line_len >= 0); - memcpy(out, line_start + candidate_len, new_line_len); + memcpy(dest, line_start + candidate_len, new_line_len); - out += new_line_len; + dest += new_line_len; if (append_newline) { - *out++ = '\n'; + *dest++ = '\n'; } } - *out_len = out - out_start; + *dest_len = dest - dest_start; return 1; } From 9569655476c2e6d91980e39489c7f0342c95dbab Mon Sep 17 00:00:00 2001 From: sunmy2019 <59365878+sunmy2019@users.noreply.github.com> Date: Mon, 24 Jul 2023 23:36:22 +0800 Subject: [PATCH 30/42] move to _PyUnicode_Dedent --- Include/internal/pycore_bytesobject.h | 25 ---- Include/internal/pycore_unicodeobject.h | 6 + ...-04-29-23-15-38.gh-issue-103997.BS3uVt.rst | 9 +- Modules/main.c | 43 +----- Objects/bytesobject.c | 136 ----------------- Objects/unicodeobject.c | 141 ++++++++++++++++++ 6 files changed, 156 insertions(+), 204 deletions(-) diff --git a/Include/internal/pycore_bytesobject.h b/Include/internal/pycore_bytesobject.h index 
411dbcc6edc9ea..115c0c52c8f9a9 100644 --- a/Include/internal/pycore_bytesobject.h +++ b/Include/internal/pycore_bytesobject.h @@ -41,31 +41,6 @@ PyAPI_FUNC(void) _PyBytes_Repeat(char* dest, Py_ssize_t len_dest, const char* src, Py_ssize_t len_src); - -/** Dedent a UTF-8 encoded string. - * behavior is expected to match `textwrap.dedent` - * - * return value: - * 0, no need to dedent, `dest` buffer and `*dest_len` untouched - * 1, success - * - * `src` is the string to dedent. - * expecting `(src != NULL)` - * - * `src_len` is the length of `src`. - * - * `dest` is a buffer for the result. - * expecting `(dest != NULL)` - * - * `*dest_len` stores the length of `dest` on entry, and is updated to the - * length of the dedent result upon success. Output buffer should be large - * enough to hold the result. - * expecting `(dest_len != NULL && *dest_len >= src_len)` - */ -PyAPI_FUNC(int) -_PyBytes_Dedent(const char *src, Py_ssize_t src_len, char* dest, - Py_ssize_t* dest_len); - /* --- _PyBytesWriter ----------------------------------------------------- */ /* The _PyBytesWriter structure is big: it contains an embedded "stack buffer". diff --git a/Include/internal/pycore_unicodeobject.h b/Include/internal/pycore_unicodeobject.h index ad59c3e385f2d3..fd10c4c0cba53d 100644 --- a/Include/internal/pycore_unicodeobject.h +++ b/Include/internal/pycore_unicodeobject.h @@ -339,6 +339,12 @@ extern Py_ssize_t _PyUnicode_InsertThousandsGrouping( PyObject *thousands_sep, Py_UCS4 *maxchar); +/* Dedent a string. + Behaviour is expected to be an exact match of `textwrap.dedent`. + Return a new reference on success, NULL with exception set on error. 
+ */ +PyAPI_FUNC(PyObject*) _PyUnicode_Dedent(PyObject *unicode); + /* --- Misc functions ----------------------------------------------------- */ extern PyObject* _PyUnicode_FormatLong(PyObject *, int, int, int); diff --git a/Misc/NEWS.d/next/Core and Builtins/2023-04-29-23-15-38.gh-issue-103997.BS3uVt.rst b/Misc/NEWS.d/next/Core and Builtins/2023-04-29-23-15-38.gh-issue-103997.BS3uVt.rst index adeb39570bcd9d..2e033ba98e12b3 100644 --- a/Misc/NEWS.d/next/Core and Builtins/2023-04-29-23-15-38.gh-issue-103997.BS3uVt.rst +++ b/Misc/NEWS.d/next/Core and Builtins/2023-04-29-23-15-38.gh-issue-103997.BS3uVt.rst @@ -1,7 +1,6 @@ -String arguments passed to "-c" are now automatically dedented as if by +String arguments passed to "-c" are now automatically dedented as if by :func:`textwrap.dedent`. This allows "python -c" invocations to be indented -in shell scripts without causing indentation errors. +in shell scripts without causing indentation errors. (Patch by Jon Crall and +Steven Sun) -Add a private API :c:func:`_PyBytes_Dedent`. - -(Patch by Jon Crall and Steven Sun) +Add an internal API :c:func:`_PyUnicode_Dedent`. (Patch by Steven Sun) diff --git a/Modules/main.c b/Modules/main.c index 913bef7921eb4d..27cdcb042584fa 100644 --- a/Modules/main.c +++ b/Modules/main.c @@ -229,37 +229,6 @@ pymain_import_readline(const PyConfig *config) } } -/* Strip common leading whitespace, just as textwrap.dedent. - It returns a new reference. 
*/ -static PyObject * -dedent_utf8_bytes(PyObject *bytes) -{ - assert(bytes != NULL && PyBytes_CheckExact(bytes)); - - Py_ssize_t nchars; - char *start; - if (PyBytes_AsStringAndSize(bytes, &start, &nchars) != 0) { - return NULL; - } - - char* p = PyMem_Malloc(nchars); - if (p == NULL) { - PyErr_NoMemory(); - return NULL; - } - - int ret = _PyBytes_Dedent(start, nchars, p, &nchars); - - if (ret == 0) { - Py_INCREF(bytes); - PyMem_Free(p); - return bytes; - } - - PyObject* new_bytes = PyBytes_FromStringAndSize(p, nchars); - PyMem_Free(p); - return new_bytes; -} static int pymain_run_command(wchar_t *command) @@ -276,18 +245,16 @@ pymain_run_command(wchar_t *command) return pymain_exit_err_print(); } - bytes = PyUnicode_AsUTF8String(unicode); - Py_DECREF(unicode); - if (bytes == NULL) { + Py_SETREF(unicode, _PyUnicode_Dedent(unicode)); + if (unicode == NULL) { goto error; } - PyObject *new_bytes = dedent_utf8_bytes(bytes); - if (new_bytes == NULL) { - Py_DECREF(bytes); + bytes = PyUnicode_AsUTF8String(unicode); + Py_DECREF(unicode); + if (bytes == NULL) { goto error; } - Py_SETREF(bytes, new_bytes); PyCompilerFlags cf = _PyCompilerFlags_INIT; cf.cf_flags |= PyCF_IGNORE_COOKIE; diff --git a/Objects/bytesobject.c b/Objects/bytesobject.c index 6de0218459939c..fdfef6cbcdeb06 100644 --- a/Objects/bytesobject.c +++ b/Objects/bytesobject.c @@ -3559,139 +3559,3 @@ _PyBytes_Repeat(char* dest, Py_ssize_t len_dest, } } } - -/** Dedent a UTF-8 encoded string. - * behavior is expected to match `textwrap.dedent` - * - * return value: - * 0, no need to dedent, `dest` buffer and `*dest_len` untouched - * 1, success - * - * `src` is the string to dedent. - * expecting `(src != NULL)` - * - * `src_len` is the length of `src`. - * - * `dest` is a buffer for the result. - * expecting `(dest != NULL)` - * - * `*dest_len` stores the length of `dest` on entry, and is updated to the - * length of the dedent result upon success. Output buffer should be large - * enough to hold the result. 
- * expecting `(dest_len != NULL && *dest_len >= src_len)` - */ -int -_PyBytes_Dedent(const char *src, Py_ssize_t src_len, char *dest, - Py_ssize_t *dest_len) { - assert(src && dest && dest_len); - assert(*dest_len >= src_len); - - if (src_len <= 0) - return 0; - - const char *end = src + src_len; - assert(src < end); // prevent overflow when src_len is too large - - const char *candidate_start = NULL; - Py_ssize_t candidate_len = 0; - - for (const char *iter = src; iter < end; ++iter) { - const char *line_start = iter; - const char *leading_whitespace_end = NULL; - - // scan the whole line - while (iter < end && *iter != '\n') { - if (!leading_whitespace_end && *iter != ' ' && *iter != '\t') { - if (iter == line_start) { - // some line has no indent, fast exit! - return 0; - } - leading_whitespace_end = iter; - } - ++iter; - } - - // if this line has all white space, skip it - if (!leading_whitespace_end) { - continue; - } - - if (!candidate_start) { - candidate_start = line_start; - candidate_len = leading_whitespace_end - line_start; - assert(candidate_len > 0); - } else { - /* We then compare with the current longest leading whitespace. - - [line_start, leading_whitespace_end) is the leading whitespace of - this line, - - [candidate_start, candidate_start + candidate_len) - is the leading whitespace of the current longest leading - whitespace. 
*/ - Py_ssize_t new_candidate_len = 0; - - for (const char *candidate_iter = candidate_start, - *line_iter = line_start; - candidate_iter < candidate_start + candidate_len && - line_iter < leading_whitespace_end; - ++candidate_iter, ++line_iter) { - if (*candidate_iter != *line_iter) { - break; - } - ++new_candidate_len; - } - - candidate_len = new_candidate_len; - if (candidate_len == 0) { - return 0; - } - } - } - - assert(candidate_len >= 0); - if (candidate_len == 0) { - return 0; - } - - // trigger a dedent - char *dest_start = dest; - - for (const char *iter = src; iter < end; ++iter) { - const char *line_start = iter; - bool in_leading_space = true; - - // iterate over a line to find the end of a line - while (iter < end && *iter != '\n') { - if (in_leading_space && *iter != ' ' && *iter != '\t') { - in_leading_space = false; - } - ++iter; - } - - // invariant: *iter == '\n' or iter == end - bool append_newline = iter < end; - - // if this line has all white space, write '\n' - if (in_leading_space && append_newline) { - *dest++ = '\n'; - continue; - } - - /* copy [new_line_start + candidate_len, iter) to buffer, then - conditionally append '\n' */ - - Py_ssize_t new_line_len = iter - line_start - candidate_len; - assert(new_line_len >= 0); - - memcpy(dest, line_start + candidate_len, new_line_len); - - dest += new_line_len; - - if (append_newline) { - *dest++ = '\n'; - } - } - *dest_len = dest - dest_start; - return 1; -} diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index fe2660c6ce6058..284185756f18bc 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -13343,6 +13343,147 @@ unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored)) return Py_BuildValue("(N)", copy); } +/* Dedent a string. + Behaviour is expected to be an exact match of `textwrap.dedent`. + Return a new reference on success, NULL with exception set on error. 
+ */ +PyAPI_FUNC(PyObject *) +_PyUnicode_Dedent(PyObject *unicode) +{ + Py_ssize_t src_len = 0; + const char *src = PyUnicode_AsUTF8AndSize(unicode, &src_len); + if (!src) { + return NULL; + } + if (src_len <= 0) { + Py_INCREF(unicode); + return unicode; + } + + const char *end = src + src_len; + + // [candidate_start, candidate_start + candidate_len) + // describes the current longest common leading whitespace + const char *candidate_start = NULL; + Py_ssize_t candidate_len = 0; + + for (const char *iter = src; iter < end; ++iter) { + const char *line_start = iter; + const char *leading_whitespace_end = NULL; + + // scan the whole line + while (iter < end && *iter != '\n') { + if (!leading_whitespace_end && *iter != ' ' && *iter != '\t') { + /* `iter` points to the first non-whitespace character + in this line */ + if (iter == line_start) { + // some line has no indent, fast exit! + Py_INCREF(unicode); + return unicode; + } + leading_whitespace_end = iter; + } + ++iter; + } + + // if this line has all white space, skip it + if (!leading_whitespace_end) { + continue; + } + + if (!candidate_start) { + // update the first leading whitespace + candidate_start = line_start; + candidate_len = leading_whitespace_end - line_start; + assert(candidate_len > 0); + } else { + /* We then compare with the current longest leading whitespace. + + [line_start, leading_whitespace_end) is the leading whitespace of + this line, + + [candidate_start, candidate_start + candidate_len) + is the leading whitespace of the current longest leading + whitespace. */ + Py_ssize_t new_candidate_len = 0; + + for (const char *candidate_iter = candidate_start, + *line_iter = line_start; + candidate_iter < candidate_start + candidate_len && + line_iter < leading_whitespace_end; + ++candidate_iter, ++line_iter) { + if (*candidate_iter != *line_iter) { + break; + } + ++new_candidate_len; + } + + candidate_len = new_candidate_len; + if (candidate_len == 0) { + // No common things now, fast exit! 
+ Py_INCREF(unicode); + return unicode; + } + } + } + + assert(candidate_len >= 0); + /* Final check for strings that contain nothing but whitespace. */ + if (candidate_len == 0) { + Py_INCREF(unicode); + return unicode; + } + + // now we should trigger a dedent + char *dest = PyMem_Malloc(src_len); + if (!dest) { + PyErr_NoMemory(); + return NULL; + } + char *dest_iter = dest; + + for (const char *iter = src; iter < end; ++iter) { + const char *line_start = iter; + bool in_leading_space = true; + + // iterate over a line to find the end of a line + while (iter < end && *iter != '\n') { + if (in_leading_space && *iter != ' ' && *iter != '\t') { + in_leading_space = false; + } + ++iter; + } + + // invariant: *iter == '\n' or iter == end + bool append_newline = iter < end; + + // if this line has all white space, write '\n' and continue + if (in_leading_space && append_newline) { + *dest_iter++ = '\n'; + continue; + } + + /* copy [new_line_start + candidate_len, iter) to buffer, then + conditionally append '\n' */ + + Py_ssize_t new_line_len = iter - line_start - candidate_len; + assert(new_line_len >= 0); + memcpy(dest_iter, line_start + candidate_len, new_line_len); + + dest_iter += new_line_len; + + if (append_newline) { + *dest_iter++ = '\n'; + } + } + + Py_ssize_t dest_len = dest_iter - dest; + + PyObject *res = PyUnicode_FromStringAndSize(dest, dest_len); + PyMem_Free(dest); + return res; +} + static PyMethodDef unicode_methods[] = { UNICODE_ENCODE_METHODDEF UNICODE_REPLACE_METHODDEF From 1735d0f900a357ce3825e43281b325da79ff594b Mon Sep 17 00:00:00 2001 From: sunmy2019 <59365878+sunmy2019@users.noreply.github.com> Date: Tue, 25 Jul 2023 00:13:06 +0800 Subject: [PATCH 31/42] Apply suggestions from code review clean up things --- Modules/main.c | 1 - Objects/bytesobject.c | 2 -- 2 files changed, 3 deletions(-) diff --git a/Modules/main.c b/Modules/main.c index 27cdcb042584fa..86505ac38418a9 100644 --- a/Modules/main.c +++ b/Modules/main.c @@ -7,7 +7,6 @@ #include 
"pycore_pathconfig.h" // _PyPathConfig_ComputeSysPath0() #include "pycore_pylifecycle.h" // _Py_PreInitializeFromPyArgv() #include "pycore_pystate.h" // _PyInterpreterState_GET() -#include "pycore_bytesobject.h" // _PyBytes_Dedent() /* Includes for exit_sigint() */ #include <stdio.h> // perror() diff --git a/Objects/bytesobject.c b/Objects/bytesobject.c index fdfef6cbcdeb06..42dac3a41fe03c 100644 --- a/Objects/bytesobject.c +++ b/Objects/bytesobject.c @@ -3535,8 +3535,6 @@ _PyBytesWriter_WriteBytes(_PyBytesWriter *writer, void *ptr, } -/* Algorithms on bytes */ - void _PyBytes_Repeat(char* dest, Py_ssize_t len_dest, const char* src, Py_ssize_t len_src) From d3681b71e2bb233870d0c2aeccecdbffdf9a8195 Mon Sep 17 00:00:00 2001 From: sunmy2019 <59365878+sunmy2019@users.noreply.github.com> Date: Tue, 25 Jul 2023 10:14:29 +0800 Subject: [PATCH 32/42] clean up things --- Objects/bytesobject.c | 1 + 1 file changed, 1 insertion(+) diff --git a/Objects/bytesobject.c b/Objects/bytesobject.c index 42dac3a41fe03c..6b9231a9fa7693 100644 --- a/Objects/bytesobject.c +++ b/Objects/bytesobject.c @@ -3557,3 +3557,4 @@ _PyBytes_Repeat(char* dest, Py_ssize_t len_dest, } } } + From d1b4cd17d747a9c960e596fc2ecfb1a6be95b106 Mon Sep 17 00:00:00 2001 From: Jon Crall Date: Thu, 10 Apr 2025 11:44:42 -0400 Subject: [PATCH 33/42] Update Misc/NEWS.d/next/Core and Builtins/2023-04-29-23-15-38.gh-issue-103997.BS3uVt.rst Co-authored-by: Adam Turner <9087854+AA-Turner@users.noreply.github.com> --- .../2023-04-29-23-15-38.gh-issue-103997.BS3uVt.rst | 2 -- 1 file changed, 2 deletions(-) diff --git a/Misc/NEWS.d/next/Core and Builtins/2023-04-29-23-15-38.gh-issue-103997.BS3uVt.rst b/Misc/NEWS.d/next/Core and Builtins/2023-04-29-23-15-38.gh-issue-103997.BS3uVt.rst index 2e033ba98e12b3..511ca8fa732fa6 100644 --- a/Misc/NEWS.d/next/Core and Builtins/2023-04-29-23-15-38.gh-issue-103997.BS3uVt.rst +++ b/Misc/NEWS.d/next/Core and Builtins/2023-04-29-23-15-38.gh-issue-103997.BS3uVt.rst @@ -2,5 +2,3 @@ String arguments 
passed to "-c" are now automatically dedented as if by :func:`textwrap.dedent`. This allows "python -c" invocations to be indented in shell scripts without causing indentation errors. (Patch by Jon Crall and Steven Sun) - -Add an internal API :c:func:`_PyUnicode_Dedent`. (Patch by Steven Sun) From e556bbffb23f2e1d436e09f6da591c7acf05e9d2 Mon Sep 17 00:00:00 2001 From: joncrall Date: Thu, 10 Apr 2025 11:49:30 -0400 Subject: [PATCH 34/42] lint: space in folder name --- .../2023-04-29-23-15-38.gh-issue-103997.BS3uVt.rst | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename Misc/NEWS.d/next/{Core and Builtins => Core_and_Builtins}/2023-04-29-23-15-38.gh-issue-103997.BS3uVt.rst (100%) diff --git a/Misc/NEWS.d/next/Core and Builtins/2023-04-29-23-15-38.gh-issue-103997.BS3uVt.rst b/Misc/NEWS.d/next/Core_and_Builtins/2023-04-29-23-15-38.gh-issue-103997.BS3uVt.rst similarity index 100% rename from Misc/NEWS.d/next/Core and Builtins/2023-04-29-23-15-38.gh-issue-103997.BS3uVt.rst rename to Misc/NEWS.d/next/Core_and_Builtins/2023-04-29-23-15-38.gh-issue-103997.BS3uVt.rst From 136c8b0892e08a6662fc4c0d0e922c419a774892 Mon Sep 17 00:00:00 2001 From: joncrall Date: Thu, 10 Apr 2025 11:54:46 -0400 Subject: [PATCH 35/42] Explicit include of pycore_unicodeobject.h --- Modules/main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/Modules/main.c b/Modules/main.c index 3a5c9caac3ce3a..ea1239ecc57f00 100644 --- a/Modules/main.c +++ b/Modules/main.c @@ -11,6 +11,7 @@ #include "pycore_pylifecycle.h" // _Py_PreInitializeFromPyArgv() #include "pycore_pystate.h" // _PyInterpreterState_GET() #include "pycore_pythonrun.h" // _PyRun_AnyFileObject() +#include "pycore_unicodeobject.h" // _PyUnicode_Dedent() /* Includes for exit_sigint() */ #include <stdio.h> // perror() From cd14a00bea12ba4dc326d008ec03ccbadfb2d627 Mon Sep 17 00:00:00 2001 From: sunmy2019 <59365878+sunmy2019@users.noreply.github.com> Date: Thu, 17 Apr 2025 23:21:18 +0800 Subject: [PATCH 36/42] Apply suggestions from code review 
Co-authored-by: Adam Turner <9087854+AA-Turner@users.noreply.github.com> --- Lib/test/test_cmd_line.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/test/test_cmd_line.py b/Lib/test/test_cmd_line.py index e1c6fb9c64380d..66768c28567c54 100644 --- a/Lib/test/test_cmd_line.py +++ b/Lib/test/test_cmd_line.py @@ -1096,13 +1096,13 @@ def test_cmd_dedent(self): args1 = sys.executable, '-c', case['code'] proc1 = subprocess.run(args1, stdout=subprocess.PIPE) self.assertEqual(proc1.returncode, 0, proc1) - output1 = proc1.stdout.strip() + output1 = proc1.stdout.strip().decode(encoding='utf-8') # Manually dedent beforehand, check the result is the same. args2 = sys.executable, '-c', dedent(case['code']) proc2 = subprocess.run(args2, stdout=subprocess.PIPE) self.assertEqual(proc2.returncode, 0, proc2) - output2 = proc2.stdout.strip() + output2 = proc2.stdout.strip().decode(encoding='utf-8') self.assertEqual(output1, output2) self.assertEqual(output1.replace(b'\r\n', b'\n'), case['expected']) From 07d2273ee1ad68689594a64e32c5e85b48facd97 Mon Sep 17 00:00:00 2001 From: sunmy2019 <59365878+sunmy2019@users.noreply.github.com> Date: Fri, 18 Apr 2025 04:32:14 +0800 Subject: [PATCH 37/42] Resolve Comments Co-authored-by: Adam Turner <9087854+AA-Turner@users.noreply.github.com> --- Lib/test/test_cmd_line.py | 54 ++++++++++++++++++++++----------------- 1 file changed, 30 insertions(+), 24 deletions(-) diff --git a/Lib/test/test_cmd_line.py b/Lib/test/test_cmd_line.py index 66768c28567c54..c4a38f80e0db4e 100644 --- a/Lib/test/test_cmd_line.py +++ b/Lib/test/test_cmd_line.py @@ -1055,28 +1055,32 @@ def test_cmd_dedent(self): # test that -c auto-dedents its arguments from textwrap import dedent test_cases = [ - { - 'code': ''' + ( + """ print('space-auto-dedent') - ''', - 'expected': b'space-auto-dedent', - }, - { - 'code': dedent(''' + """, + "space-auto-dedent", + ), + ( + dedent( + """ ^^^print('tab-auto-dedent') - ''').replace('^', '\t'), - 'expected': 
b'tab-auto-dedent', - }, - { - 'code': dedent(''' + """ + ).replace("^", "\t"), + "tab-auto-dedent", + ), + ( + dedent( + """ ^^if 1: ^^^^print('mixed-auto-dedent-1') ^^print('mixed-auto-dedent-2') - ''').replace('^', '\t \t'), - 'expected': b'mixed-auto-dedent-1\nmixed-auto-dedent-2', - }, - { - 'code': ''' + """ + ).replace("^", "\t \t"), + "mixed-auto-dedent-1\nmixed-auto-dedent-2", + ), + ( + ''' data = """$ this data has an empty newline above and a newline with spaces below $ @@ -1084,28 +1088,30 @@ def test_cmd_dedent(self): """$ if 1: $ print(repr(data))$ - '''.replace('$', ''), + '''.replace( + "$", "" + ), # Note: entirely blank lines are normalized to \n, even if they # are part of a data string. This is consistent with # textwrap.dedent behavior, but might not be intuitive. - 'expected': b"'\\n\\nthis data has an empty newline above and a newline with spaces below \\n\\n'", - }, + "'\\n\\nthis data has an empty newline above and a newline with spaces below \\n\\n'", + ), ] - for case in test_cases: + for code, expected in test_cases: # Run the auto-dedent case - args1 = sys.executable, '-c', case['code'] + args1 = sys.executable, '-c', code proc1 = subprocess.run(args1, stdout=subprocess.PIPE) self.assertEqual(proc1.returncode, 0, proc1) output1 = proc1.stdout.strip().decode(encoding='utf-8') # Manually dedent beforehand, check the result is the same. 
- args2 = sys.executable, '-c', dedent(case['code']) + args2 = sys.executable, '-c', dedent(code) proc2 = subprocess.run(args2, stdout=subprocess.PIPE) self.assertEqual(proc2.returncode, 0, proc2) output2 = proc2.stdout.strip().decode(encoding='utf-8') self.assertEqual(output1, output2) - self.assertEqual(output1.replace(b'\r\n', b'\n'), case['expected']) + self.assertEqual(output1.replace('\r\n', '\n'), expected) def test_cmd_dedent_failcase(self): # Mixing tabs and spaces is not allowed From ed6e17bdd4792386ce625b49b21bbd410692f925 Mon Sep 17 00:00:00 2001 From: sunmy2019 <59365878+sunmy2019@users.noreply.github.com> Date: Fri, 18 Apr 2025 11:30:22 +0800 Subject: [PATCH 38/42] Refactor implementation --- Include/internal/pycore_unicodeobject.h | 2 +- Lib/test/test_cmd_line.py | 11 +- Objects/unicodeobject.c | 130 ++++++++++++++---------- 3 files changed, 85 insertions(+), 58 deletions(-) diff --git a/Include/internal/pycore_unicodeobject.h b/Include/internal/pycore_unicodeobject.h index a5b2b28a1ab8b1..c85d53b89accdb 100644 --- a/Include/internal/pycore_unicodeobject.h +++ b/Include/internal/pycore_unicodeobject.h @@ -251,7 +251,7 @@ extern Py_ssize_t _PyUnicode_InsertThousandsGrouping( Behaviour is expected to be an exact match of `textwrap.dedent`. Return a new reference on success, NULL with exception set on error. 
*/ -PyAPI_FUNC(PyObject*) _PyUnicode_Dedent(PyObject *unicode); +extern PyObject* _PyUnicode_Dedent(PyObject *unicode); /* --- Misc functions ----------------------------------------------------- */ diff --git a/Lib/test/test_cmd_line.py b/Lib/test/test_cmd_line.py index c4a38f80e0db4e..e1d1d03d4ff698 100644 --- a/Lib/test/test_cmd_line.py +++ b/Lib/test/test_cmd_line.py @@ -17,6 +17,8 @@ spawn_python, kill_python, assert_python_ok, assert_python_failure, interpreter_requires_environment ) +from textwrap import dedent + if not support.has_subprocess_support: raise unittest.SkipTest("test module requires subprocess") @@ -1053,7 +1055,6 @@ def test_int_max_str_digits(self): def test_cmd_dedent(self): # test that -c auto-dedents its arguments - from textwrap import dedent test_cases = [ ( """ @@ -1096,6 +1097,14 @@ def test_cmd_dedent(self): # textwrap.dedent behavior, but might not be intuitive. "'\\n\\nthis data has an empty newline above and a newline with spaces below \\n\\n'", ), + ( + '', + '', + ), + ( + ' \t\n\t\n \t\t\t \t\t \t\n\t\t \n\n\n\t\t\t ', + '', + ), ] for code, expected in test_cases: # Run the auto-dedent case diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 2c1a803d22db2c..cc0fb70b5a66c2 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -14270,29 +14270,22 @@ unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored)) return Py_BuildValue("(N)", copy); } -/* Dedent a string. - Behaviour is expected to be an exact match of `textwrap.dedent`. - Return a new reference on success, NULL with exception set on error. 
- */ -PyAPI_FUNC(PyObject *) -_PyUnicode_Dedent(PyObject *unicode) -{ - Py_ssize_t src_len = 0; - const char *src = PyUnicode_AsUTF8AndSize(unicode, &src_len); - if (!src) { - return NULL; - } - if (src_len <= 0) { - Py_INCREF(unicode); - return unicode; - } - - const char *end = src + src_len; - - // [candidate_start, candidate_start + candidate_len) +/* +This function searches for the longest common leading whitespace +of all lines in [src, end). +It returns the length of the common leading whitespace and sets `output` to +point to the beginning of the common leading whitespace if length > 0. +*/ +static Py_ssize_t +search_longest_common_leading_whitespace( + const char * const src, + const char * const end, + const char * * output +) { + // [_start, _start + _len) // describes the current longest common leading whitespace - const char *candidate_start = NULL; - Py_ssize_t candidate_len = 0; + const char *_start = NULL; + Py_ssize_t _len = 0; for (const char *iter = src; iter < end; ++iter) { const char *line_start = iter; @@ -14305,8 +14298,7 @@ _PyUnicode_Dedent(PyObject *unicode) in this line */ if (iter == line_start) { // some line has no indent, fast exit! - Py_INCREF(unicode); - return unicode; + return 0; } leading_whitespace_end = iter; } @@ -14318,47 +14310,73 @@ _PyUnicode_Dedent(PyObject *unicode) continue; } - if (!candidate_start) { + if (!_start) { // update the first leading whitespace - candidate_start = line_start; - candidate_len = leading_whitespace_end - line_start; - assert(candidate_len > 0); - } else { + _start = line_start; + _len = leading_whitespace_end - line_start; + assert(_len > 0); + } + else { /* We then compare with the current longest leading whitespace. 
- [line_start, leading_whitespace_end) is the leading whitespace of - this line, + [line_start, leading_whitespace_end) is the leading + whitespace of this line, - [candidate_start, candidate_start + candidate_len) - is the leading whitespace of the current longest leading - whitespace. */ - Py_ssize_t new_candidate_len = 0; + [_start, _start + _len) is the leading whitespace of the + current longest leading whitespace. */ + Py_ssize_t new_len = 0; + const char *_iter = _start, *line_iter = line_start; - for (const char *candidate_iter = candidate_start, - *line_iter = line_start; - candidate_iter < candidate_start + candidate_len && - line_iter < leading_whitespace_end; - ++candidate_iter, ++line_iter) { - if (*candidate_iter != *line_iter) { - break; - } - ++new_candidate_len; + while (_iter < _start + _len && line_iter < leading_whitespace_end + && *_iter == *line_iter) + { + ++_iter; + ++line_iter; + ++new_len; } - candidate_len = new_candidate_len; - if (candidate_len == 0) { + _len = new_len; + if (_len == 0) { // No common things now, fast exit! - Py_INCREF(unicode); - return unicode; + return 0; } } } - assert(candidate_len >= 0); - /* Final check for strings that contain nothing but whitespace. */ - if (candidate_len == 0) { - Py_INCREF(unicode); - return unicode; + assert(_len >= 0); + if (_len > 0) { + *output = _start; + } + return _len; +} + +/* Dedent a string. + Behaviour is expected to be an exact match of `textwrap.dedent`. + Return a new reference on success, NULL with exception set on error. 
+ */ +PyObject * +_PyUnicode_Dedent(PyObject *unicode) +{ + Py_ssize_t src_len = 0; + const char *src = PyUnicode_AsUTF8AndSize(unicode, &src_len); + if (!src) { + return NULL; + } + assert(src_len >= 0); + if (src_len == 0) { + return Py_NewRef(unicode); + } + + const char *const end = src + src_len; + + // [whitespace_start, whitespace_start + whitespace_len) + // describes the current longest common leading whitespace + const char *whitespace_start = NULL; + Py_ssize_t whitespace_len = search_longest_common_leading_whitespace( + src, end, &whitespace_start); + + if (whitespace_len == 0) { + return Py_NewRef(unicode); } // now we should trigger a dedent @@ -14390,12 +14408,12 @@ _PyUnicode_Dedent(PyObject *unicode) continue; } - /* copy [new_line_start + candidate_len, iter) to buffer, then + /* copy [line_start + whitespace_len, iter) to buffer, then conditionally append '\n' */ - Py_ssize_t new_line_len = iter - line_start - candidate_len; + Py_ssize_t new_line_len = iter - line_start - whitespace_len; assert(new_line_len >= 0); - memcpy(dest_iter, line_start + candidate_len, new_line_len); + memcpy(dest_iter, line_start + whitespace_len, new_line_len); dest_iter += new_line_len; From 4c78c5772d0f47d5957cedb34574619e8101dfce Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Fri, 18 Apr 2025 15:55:01 +0900 Subject: [PATCH 39/42] Apply suggestions from code review --- Objects/unicodeobject.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index cc0fb70b5a66c2..577e350dfb4f05 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -14280,8 +14280,8 @@ static Py_ssize_t search_longest_common_leading_whitespace( const char * const src, const char * const end, - const char * * output -) { + const char * * output) +{ // [_start, _start + _len) // describes the current longest common leading whitespace const char *_start = NULL; @@ -14422,9 +14422,7 @@ 
_PyUnicode_Dedent(PyObject *unicode) } } - Py_ssize_t dest_len = dest_iter - dest; - - PyObject *res = PyUnicode_FromStringAndSize(dest, dest_len); + PyObject *res = PyUnicode_FromStringAndSize(dest, dest_iter - dest); PyMem_Free(dest); return res; } From 38d2a4ec693f2cea2192fd9ae3ab288e8297ef3a Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Fri, 18 Apr 2025 16:03:08 +0900 Subject: [PATCH 40/42] add what's new entry --- Doc/whatsnew/3.14.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Doc/whatsnew/3.14.rst b/Doc/whatsnew/3.14.rst index 7d469e83dc27ad..dda2f1a2a8f188 100644 --- a/Doc/whatsnew/3.14.rst +++ b/Doc/whatsnew/3.14.rst @@ -474,6 +474,11 @@ Other language changes explicitly overridden in the subclass. (Contributed by Tomasz Pytel in :gh:`132329`.) +* The command line option :option:`-c` now automatically dedents its code + argument before execution. The auto-dedentation behavior mirrors + :func:`textwrap.dedent`. + (Contributed by Jon Crall and Steven Sun in :gh:`103998`.) + .. _whatsnew314-pep765: PEP 765: Disallow return/break/continue that exit a finally block From 42b633095a2d9290eb7415243ce4d0aa1772f398 Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Fri, 18 Apr 2025 16:10:13 +0900 Subject: [PATCH 41/42] Document dedentation of command in version 3.14 --- Doc/using/cmdline.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Doc/using/cmdline.rst b/Doc/using/cmdline.rst index 9b5c6eb863e56d..2e9d8b7bf9590e 100644 --- a/Doc/using/cmdline.rst +++ b/Doc/using/cmdline.rst @@ -73,6 +73,9 @@ source. .. audit-event:: cpython.run_command command cmdoption-c + .. versionchanged:: 3.14 + *command* is automatically dedented before execution. + .. 
option:: -m Search :data:`sys.path` for the named module and execute its contents as From 98c17e5dc1764d9aa66f9706d72eed269e2b9993 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Fri, 18 Apr 2025 10:06:36 +0200 Subject: [PATCH 42/42] Apply suggestions from code review --- Doc/using/cmdline.rst | 2 +- Doc/whatsnew/3.14.rst | 1 + Objects/unicodeobject.c | 8 ++++---- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/Doc/using/cmdline.rst b/Doc/using/cmdline.rst index 2e9d8b7bf9590e..fa7c9cddf9c6d6 100644 --- a/Doc/using/cmdline.rst +++ b/Doc/using/cmdline.rst @@ -73,7 +73,7 @@ source. .. audit-event:: cpython.run_command command cmdoption-c - .. versionchanged:: 3.14 + .. versionchanged:: next *command* is automatically dedented before execution. .. option:: -m diff --git a/Doc/whatsnew/3.14.rst b/Doc/whatsnew/3.14.rst index dda2f1a2a8f188..aaa4702d53df93 100644 --- a/Doc/whatsnew/3.14.rst +++ b/Doc/whatsnew/3.14.rst @@ -479,6 +479,7 @@ Other language changes :func:`textwrap.dedent`. (Contributed by Jon Crall and Steven Sun in :gh:`103998`.) + .. _whatsnew314-pep765: PEP 765: Disallow return/break/continue that exit a finally block diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 577e350dfb4f05..e01a10fc19e904 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -14278,9 +14278,9 @@ point to the beginning of the common leading whitespace if length > 0. 
*/ static Py_ssize_t search_longest_common_leading_whitespace( - const char * const src, - const char * const end, - const char * * output) + const char *const src, + const char *const end, + const char **output) { // [_start, _start + _len) // describes the current longest common leading whitespace @@ -14328,7 +14328,7 @@ search_longest_common_leading_whitespace( const char *_iter = _start, *line_iter = line_start; while (_iter < _start + _len && line_iter < leading_whitespace_end - && *_iter == *line_iter) + && *_iter == *line_iter) { ++_iter; ++line_iter;