From c87cb49e35060db6aef591a56b07a21303dab010 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Fri, 13 Jun 2025 12:16:37 +0300 Subject: [PATCH 1/3] gh-135462: Fix quadratic complexity in processing special input in HTMLParser --- Lib/html/parser.py | 41 +++++--- Lib/test/test_htmlparser.py | 97 +++++++++++++++---- ...-06-13-15-55-22.gh-issue-135462.KBeJpc.rst | 2 + 3 files changed, 109 insertions(+), 31 deletions(-) create mode 100644 Misc/NEWS.d/next/Security/2025-06-13-15-55-22.gh-issue-135462.KBeJpc.rst diff --git a/Lib/html/parser.py b/Lib/html/parser.py index 1e30956fe24f83..de0f9a5c86b57b 100644 --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -27,6 +27,7 @@ attr_charref = re.compile(r'&(#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*)[;=]?') starttagopen = re.compile('<[a-zA-Z]') +endtagopen = re.compile('') commentclose = re.compile(r'--\s*>') # Note: @@ -195,7 +196,7 @@ def goahead(self, end): k = self.parse_pi(i) elif startswith("', i + 1) - if k < 0: - k = rawdata.find('<', i + 1) - if k < 0: - k = i + 1 - else: - k += 1 - if self.convert_charrefs and not self.cdata_elem: - self.handle_data(unescape(rawdata[i:k])) + if starttagopen.match(rawdata, i): # < + letter + pass + elif startswith("'), - ('comment', '/img'), - ('endtag', 'html<')]) + ('data', '\n')]) def test_starttag_junk_chars(self): + self._run_check("<", [('data', '<')]) + self._run_check("<>", [('data', '<>')]) + self._run_check("< >", [('data', '< >')]) + self._run_check("< ", [('data', '< ')]) self._run_check("", []) + self._run_check("<$>", [('data', '<$>')]) self._run_check("", [('comment', '$')]) self._run_check("", [('endtag', 'a')]) + self._run_check("", [('starttag', 'a", [('endtag', 'a'", [('data', "'", []) + self._run_check("", [('starttag', 'a$b', [])]) self._run_check("", [('startendtag', 'a$b', [])]) self._run_check("", [('starttag', 'a$b', [])]) self._run_check("", [('startendtag', 'a$b', [])]) + self._run_check("", [('endtag', 'a$b')]) def 
test_slashes_in_starttag(self): self._run_check('', [('startendtag', 'a', [('foo', 'var')])]) @@ -576,21 +583,50 @@ def test_EOF_in_charref(self): for html, expected in data: self._run_check(html, expected) - def test_EOF_in_comments_or_decls(self): + def test_eof_in_comments(self): data = [ - ('', [('comment', '-!>')]), + ('' '' '' @@ -604,6 +640,7 @@ def test_bogus_comments(self): '' # required '[' after CDATA ) expected = [ + ('comment', 'ELEMENT br EMPTY'), ('comment', ' not really a comment '), ('comment', ' not a comment either --'), ('comment', ' -- close enough --'), @@ -684,6 +721,26 @@ def test_convert_charrefs_dropped_text(self): ('endtag', 'a'), ('data', ' bar & baz')] ) + @support.requires_resource('cpu') + def test_eof_no_quadratic_complexity(self): + # Each of these examples used to take about an hour. + # Now they take a fraction of a second. + def check(source): + parser = html.parser.HTMLParser() + parser.feed(source) + parser.close() + n = 120_000 + check(" Date: Fri, 13 Jun 2025 17:17:38 +0300 Subject: [PATCH 2/3] Apply suggestions from code review Co-authored-by: Ezio Melotti --- Lib/html/parser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/html/parser.py b/Lib/html/parser.py index de0f9a5c86b57b..ba416e7fa6e3fe 100644 --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -204,12 +204,12 @@ def goahead(self, end): if k < 0: if not end: break - if starttagopen.match(rawdata, i): # < + letter + if starttagopen.match(rawdata, i): # < + letter pass elif startswith(" Date: Fri, 13 Jun 2025 17:27:54 +0300 Subject: [PATCH 3/3] Update a NEWS entry. 
---
 .../Security/2025-06-13-15-55-22.gh-issue-135462.KBeJpc.rst | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/Misc/NEWS.d/next/Security/2025-06-13-15-55-22.gh-issue-135462.KBeJpc.rst b/Misc/NEWS.d/next/Security/2025-06-13-15-55-22.gh-issue-135462.KBeJpc.rst
index 2d90b1dd980667..cf9aa8dbdf2efe 100644
--- a/Misc/NEWS.d/next/Security/2025-06-13-15-55-22.gh-issue-135462.KBeJpc.rst
+++ b/Misc/NEWS.d/next/Security/2025-06-13-15-55-22.gh-issue-135462.KBeJpc.rst
@@ -1,2 +1,4 @@
 Fix quadratic complexity in processing specially crafted input in
-:class:`html.parser.HTMLParser`.
+:class:`html.parser.HTMLParser`. End-of-file errors are now handled according
+to the HTML5 specs -- comments and declarations are automatically closed,
+tags are ignored.