Skip to content

Commit e9b20c5

Browse files
committed
bpo-44317: Improve tokenizer errors with more informative locations
1 parent fa106a6 commit e9b20c5

File tree

3 files changed

+57
-18
lines changed

3 files changed

+57
-18
lines changed

Lib/test/test_exceptions.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -224,9 +224,9 @@ def testSyntaxErrorOffset(self):
224224
# Errors thrown by tokenizer.c
225225
check('(0x+1)', 1, 3)
226226
check('x = 0xI', 1, 6)
227-
check('0010 + 2', 1, 4)
227+
check('0010 + 2', 1, 1)
228228
check('x = 32e-+4', 1, 8)
229-
check('x = 0o9', 1, 6)
229+
check('x = 0o9', 1, 7)
230230
check('\u03b1 = 0xI', 1, 6)
231231
check(b'\xce\xb1 = 0xI', 1, 6)
232232
check(b'# -*- coding: iso8859-7 -*-\n\xe1 = 0xI', 2, 6,
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Improve tokenizer errors with more informative locations. Patch by Pablo Galindo.

Parser/tokenizer.c

Lines changed: 54 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1067,19 +1067,13 @@ tok_backup(struct tok_state *tok, int c)
10671067
}
10681068
}
10691069

1070-
10711070
static int
1072-
syntaxerror(struct tok_state *tok, const char *format, ...)
1071+
_syntaxerror_range(struct tok_state *tok, const char *format,
1072+
Py_ssize_t col_offset, Py_ssize_t end_col_offset,
1073+
va_list vargs)
10731074
{
10741075
PyObject *errmsg, *errtext, *args;
1075-
va_list vargs;
1076-
#ifdef HAVE_STDARG_PROTOTYPES
1077-
va_start(vargs, format);
1078-
#else
1079-
va_start(vargs);
1080-
#endif
10811076
errmsg = PyUnicode_FromFormatV(format, vargs);
1082-
va_end(vargs);
10831077
if (!errmsg) {
10841078
goto error;
10851079
}
@@ -1089,7 +1083,14 @@ syntaxerror(struct tok_state *tok, const char *format, ...)
10891083
if (!errtext) {
10901084
goto error;
10911085
}
1092-
int offset = (int)PyUnicode_GET_LENGTH(errtext);
1086+
1087+
if (col_offset == 0) {
1088+
col_offset = (int)PyUnicode_GET_LENGTH(errtext);
1089+
}
1090+
if (end_col_offset == 0) {
1091+
col_offset = col_offset;
1092+
}
1093+
10931094
Py_ssize_t line_len = strcspn(tok->line_start, "\n");
10941095
if (line_len != tok->cur - tok->line_start) {
10951096
Py_DECREF(errtext);
@@ -1100,8 +1101,7 @@ syntaxerror(struct tok_state *tok, const char *format, ...)
11001101
goto error;
11011102
}
11021103

1103-
args = Py_BuildValue("(O(OiiN))", errmsg,
1104-
tok->filename, tok->lineno, offset, errtext);
1104+
args = Py_BuildValue("(O(OiiNii))", errmsg, tok->filename, tok->lineno, col_offset, errtext, tok->lineno, end_col_offset);
11051105
if (args) {
11061106
PyErr_SetObject(PyExc_SyntaxError, args);
11071107
Py_DECREF(args);
@@ -1113,6 +1113,36 @@ syntaxerror(struct tok_state *tok, const char *format, ...)
11131113
return ERRORTOKEN;
11141114
}
11151115

1116+
static int
1117+
syntaxerror(struct tok_state *tok, const char *format, ...) {
1118+
va_list vargs;
1119+
#ifdef HAVE_STDARG_PROTOTYPES
1120+
va_start(vargs, format);
1121+
#else
1122+
va_start(vargs);
1123+
#endif
1124+
int ret = _syntaxerror_range(tok, format, 0, 0, vargs);
1125+
va_end(vargs);
1126+
return ret;
1127+
}
1128+
1129+
static int
1130+
syntaxerror_known_range(struct tok_state *tok,
1131+
Py_ssize_t col_offset, Py_ssize_t end_col_offset,
1132+
const char *format, ...) {
1133+
va_list vargs;
1134+
#ifdef HAVE_STDARG_PROTOTYPES
1135+
va_start(vargs, format);
1136+
#else
1137+
va_start(vargs);
1138+
#endif
1139+
int ret = _syntaxerror_range(tok, format, col_offset, end_col_offset, vargs);
1140+
va_end(vargs);
1141+
return ret;
1142+
}
1143+
1144+
1145+
11161146
static int
11171147
indenterror(struct tok_state *tok)
11181148
{
@@ -1552,6 +1582,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
15521582
/* Number */
15531583
if (isdigit(c)) {
15541584
if (c == '0') {
1585+
char* number_start = tok->cur;
15551586
/* Hex, octal or binary -- maybe. */
15561587
c = tok_nextc(tok);
15571588
if (c == 'x' || c == 'X') {
@@ -1580,6 +1611,8 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
15801611
if (c < '0' || c >= '8') {
15811612
tok_backup(tok, c);
15821613
if (isdigit(c)) {
1614+
// Re-consume the invalid digit so the reported error location includes it
1615+
tok_nextc(tok);
15831616
return syntaxerror(tok,
15841617
"invalid digit '%c' in octal literal", c);
15851618
}
@@ -1606,6 +1639,8 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
16061639
if (c != '0' && c != '1') {
16071640
tok_backup(tok, c);
16081641
if (isdigit(c)) {
1642+
// Re-consume the invalid digit so the reported error location includes it
1643+
tok_nextc(tok);
16091644
return syntaxerror(tok,
16101645
"invalid digit '%c' in binary literal", c);
16111646
}
@@ -1639,6 +1674,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
16391674
}
16401675
c = tok_nextc(tok);
16411676
}
1677+
char* zeros_end = tok->cur;
16421678
if (isdigit(c)) {
16431679
nonzero = 1;
16441680
c = tok_decimal_tail(tok);
@@ -1659,10 +1695,12 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
16591695
else if (nonzero) {
16601696
/* Old-style octal: now disallowed. */
16611697
tok_backup(tok, c);
1662-
return syntaxerror(tok,
1663-
"leading zeros in decimal integer "
1664-
"literals are not permitted; "
1665-
"use an 0o prefix for octal integers");
1698+
return syntaxerror_known_range(
1699+
tok, number_start - tok->line_start,
1700+
zeros_end - tok->line_start,
1701+
"leading zeros in decimal integer "
1702+
"literals are not permitted; "
1703+
"use an 0o prefix for octal integers");
16661704
}
16671705
}
16681706
}

0 commit comments

Comments
 (0)