Skip to content

bpo-44317: Improve tokenizer errors with more informative locations #26555

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Jul 10, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions Lib/test/test_exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,9 +224,9 @@ def testSyntaxErrorOffset(self):
# Errors thrown by tokenizer.c
check('(0x+1)', 1, 3)
check('x = 0xI', 1, 6)
check('0010 + 2', 1, 4)
check('0010 + 2', 1, 1)
check('x = 32e-+4', 1, 8)
check('x = 0o9', 1, 6)
check('x = 0o9', 1, 7)
check('\u03b1 = 0xI', 1, 6)
check(b'\xce\xb1 = 0xI', 1, 6)
check(b'# -*- coding: iso8859-7 -*-\n\xe1 = 0xI', 2, 6,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Improve tokenizer errors with more informative locations. Patch by Pablo Galindo.
72 changes: 54 additions & 18 deletions Parser/tokenizer.c
Original file line number Diff line number Diff line change
Expand Up @@ -1067,19 +1067,13 @@ tok_backup(struct tok_state *tok, int c)
}
}


static int
syntaxerror(struct tok_state *tok, const char *format, ...)
_syntaxerror_range(struct tok_state *tok, const char *format,
int col_offset, int end_col_offset,
va_list vargs)
{
PyObject *errmsg, *errtext, *args;
va_list vargs;
#ifdef HAVE_STDARG_PROTOTYPES
va_start(vargs, format);
#else
va_start(vargs);
#endif
errmsg = PyUnicode_FromFormatV(format, vargs);
va_end(vargs);
if (!errmsg) {
goto error;
}
Expand All @@ -1089,7 +1083,14 @@ syntaxerror(struct tok_state *tok, const char *format, ...)
if (!errtext) {
goto error;
}
int offset = (int)PyUnicode_GET_LENGTH(errtext);

if (col_offset == -1) {
col_offset = (int)PyUnicode_GET_LENGTH(errtext);
}
if (end_col_offset == -1) {
end_col_offset = col_offset;
}

Py_ssize_t line_len = strcspn(tok->line_start, "\n");
if (line_len != tok->cur - tok->line_start) {
Py_DECREF(errtext);
Expand All @@ -1100,8 +1101,8 @@ syntaxerror(struct tok_state *tok, const char *format, ...)
goto error;
}

args = Py_BuildValue("(O(OiiN))", errmsg,
tok->filename, tok->lineno, offset, errtext);
args = Py_BuildValue("(O(OiiNii))", errmsg, tok->filename, tok->lineno,
col_offset, errtext, tok->lineno, end_col_offset);
if (args) {
PyErr_SetObject(PyExc_SyntaxError, args);
Py_DECREF(args);
Expand All @@ -1113,6 +1114,38 @@ syntaxerror(struct tok_state *tok, const char *format, ...)
return ERRORTOKEN;
}

/* Raise a SyntaxError whose message is built printf-style from `format`
   and the trailing arguments.  No explicit column range is supplied:
   both offsets are forwarded as -1, which tells _syntaxerror_range() to
   work them out on its own.  The helper's return value is passed back
   to the caller unchanged. */
static int
syntaxerror(struct tok_state *tok, const char *format, ...)
{
    int status;
    va_list ap;

#ifdef HAVE_STDARG_PROTOTYPES
    va_start(ap, format);
#else
    va_start(ap);
#endif
    status = _syntaxerror_range(tok, format, -1, -1, ap);
    va_end(ap);

    return status;
}

/* Raise a SyntaxError for an explicitly given column range.

   `col_offset` and `end_col_offset` are handed straight to
   _syntaxerror_range(); the message is built printf-style from `format`
   and the trailing arguments.  The helper's return value is passed back
   to the caller unchanged. */
static int
syntaxerror_known_range(struct tok_state *tok,
                        int col_offset, int end_col_offset,
                        const char *format, ...)
{
    int status;
    va_list ap;

#ifdef HAVE_STDARG_PROTOTYPES
    va_start(ap, format);
#else
    va_start(ap);
#endif
    status = _syntaxerror_range(tok, format,
                                col_offset, end_col_offset, ap);
    va_end(ap);

    return status;
}



static int
indenterror(struct tok_state *tok)
{
Expand Down Expand Up @@ -1578,12 +1611,12 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
c = tok_nextc(tok);
}
if (c < '0' || c >= '8') {
tok_backup(tok, c);
if (isdigit(c)) {
return syntaxerror(tok,
"invalid digit '%c' in octal literal", c);
}
else {
tok_backup(tok, c);
return syntaxerror(tok, "invalid octal literal");
}
}
Expand All @@ -1604,12 +1637,12 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
c = tok_nextc(tok);
}
if (c != '0' && c != '1') {
tok_backup(tok, c);
if (isdigit(c)) {
return syntaxerror(tok,
"invalid digit '%c' in binary literal", c);
}
else {
tok_backup(tok, c);
return syntaxerror(tok, "invalid binary literal");
}
}
Expand Down Expand Up @@ -1639,6 +1672,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
}
c = tok_nextc(tok);
}
char* zeros_end = tok->cur;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can tok->start be used instead of a new variable?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not here, because we want to highlight only the zeros, and tok->cur points at the end of the number.

if (isdigit(c)) {
nonzero = 1;
c = tok_decimal_tail(tok);
Expand All @@ -1659,10 +1693,12 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
else if (nonzero) {
/* Old-style octal: now disallowed. */
tok_backup(tok, c);
return syntaxerror(tok,
"leading zeros in decimal integer "
"literals are not permitted; "
"use an 0o prefix for octal integers");
return syntaxerror_known_range(
tok, (int)(tok->start + 1 - tok->line_start),
(int)(zeros_end - tok->line_start),
"leading zeros in decimal integer "
"literals are not permitted; "
"use an 0o prefix for octal integers");
}
}
}
Expand Down