Skip to content

Commit 5c0a5f3

Browse files
committed
bpo-32876: fix handling of invalid markup declarations.
1 parent 3d18b50 commit 5c0a5f3

File tree

2 files changed

+26
-19
lines changed

2 files changed

+26
-19
lines changed

Lib/html/parser.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -260,7 +260,7 @@ def parse_html_declaration(self, i):
260260
if rawdata[i:i+4] == '<!--':
261261
# this case is actually already handled in goahead()
262262
return self.parse_comment(i)
263-
elif rawdata[i:i+3] == '<![':
263+
elif rawdata[i:i+9] == '<![CDATA[':
264264
return self.parse_marked_section(i)
265265
elif rawdata[i:i+9].lower() == '<!doctype':
266266
# find the closing >
@@ -277,7 +277,7 @@ def parse_html_declaration(self, i):
277277
def parse_bogus_comment(self, i, report=1):
278278
rawdata = self.rawdata
279279
assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
280-
'parse_comment()')
280+
'parse_bogus_comment()')
281281
pos = rawdata.find('>', i+2)
282282
if pos == -1:
283283
return -1

Lib/test/test_htmlparser.py

Lines changed: 24 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -580,52 +580,58 @@ def test_unescape_method(self):
580580
s = '&quot;&#34;&#x22;&quot&#34&#x22&#bad;'
581581
self.assertEqual(p.unescape(s), unescape(s))
582582

583-
def test_broken_comments(self):
583+
def test_bogus_comments(self):
584584
html = ('<! not really a comment >'
585585
'<! not a comment either -->'
586586
'<! -- close enough -->'
587587
'<!><!<-- this was an empty comment>'
588-
'<!!! another bogus comment !!!>')
588+
'<!!! another bogus comment !!!>'
589+
# see #32876
590+
'<![with square brackets]!>'
591+
'<![\nmultiline\nbogusness\n]!>'
592+
'<![more brackets]-[and a hyphen]!>'
593+
'<![cdata[should be uppercase]]>')
589594
expected = [
590595
('comment', ' not really a comment '),
591596
('comment', ' not a comment either --'),
592597
('comment', ' -- close enough --'),
593598
('comment', ''),
594599
('comment', '<-- this was an empty comment'),
595600
('comment', '!! another bogus comment !!!'),
601+
('comment', '[with square brackets]!'),
602+
('comment', '[\nmultiline\nbogusness\n]!'),
603+
('comment', '[more brackets]-[and a hyphen]!'),
604+
('comment', '[cdata[should be uppercase]]'),
596605
]
597606
self._run_check(html, expected)
598607

599608
def test_broken_condcoms(self):
600609
# these condcoms are missing the '--' after '<!' and before the '>'
610+
# and they are considered bogus comments according to
611+
# "8.2.4.42. Markup declaration open state"
601612
html = ('<![if !(IE)]>broken condcom<![endif]>'
602613
'<![if ! IE]><link href="favicon.tiff"/><![endif]>'
603614
'<![if !IE 6]><img src="firefox.png" /><![endif]>'
604615
'<![if !ie 6]><b>foo</b><![endif]>'
605616
'<![if (!IE)|(lt IE 9)]><img src="mammoth.bmp" /><![endif]>')
606-
# According to the HTML5 specs sections "8.2.4.44 Bogus comment state"
607-
# and "8.2.4.45 Markup declaration open state", comment tokens should
608-
# be emitted instead of 'unknown decl', but calling unknown_decl
609-
# provides more flexibility.
610-
# See also Lib/_markupbase.py:parse_declaration
611617
expected = [
612-
('unknown decl', 'if !(IE)'),
618+
('comment', '[if !(IE)]'),
613619
('data', 'broken condcom'),
614-
('unknown decl', 'endif'),
615-
('unknown decl', 'if ! IE'),
620+
('comment', '[endif]'),
621+
('comment', '[if ! IE]'),
616622
('startendtag', 'link', [('href', 'favicon.tiff')]),
617-
('unknown decl', 'endif'),
618-
('unknown decl', 'if !IE 6'),
623+
('comment', '[endif]'),
624+
('comment', '[if !IE 6]'),
619625
('startendtag', 'img', [('src', 'firefox.png')]),
620-
('unknown decl', 'endif'),
621-
('unknown decl', 'if !ie 6'),
626+
('comment', '[endif]'),
627+
('comment', '[if !ie 6]'),
622628
('starttag', 'b', []),
623629
('data', 'foo'),
624630
('endtag', 'b'),
625-
('unknown decl', 'endif'),
626-
('unknown decl', 'if (!IE)|(lt IE 9)'),
631+
('comment', '[endif]'),
632+
('comment', '[if (!IE)|(lt IE 9)]'),
627633
('startendtag', 'img', [('src', 'mammoth.bmp')]),
628-
('unknown decl', 'endif')
634+
('comment', '[endif]')
629635
]
630636
self._run_check(html, expected)
631637

@@ -642,6 +648,7 @@ def test_convert_charrefs_dropped_text(self):
642648
)
643649

644650

651+
645652
class AttributesTestCase(TestCaseBase):
646653

647654
def test_attr_syntax(self):

0 commit comments

Comments
 (0)