Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions Lib/html/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,7 @@ def parse_html_declaration(self, i):
if rawdata[i:i+4] == '<!--':
# this case is actually already handled in goahead()
return self.parse_comment(i)
elif rawdata[i:i+3] == '<![':
elif rawdata[i:i+9] == '<![CDATA[':
return self.parse_marked_section(i)
elif rawdata[i:i+9].lower() == '<!doctype':
# find the closing >
Expand All @@ -277,7 +277,7 @@ def parse_html_declaration(self, i):
def parse_bogus_comment(self, i, report=1):
rawdata = self.rawdata
assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
'parse_comment()')
'parse_bogus_comment()')
pos = rawdata.find('>', i+2)
if pos == -1:
return -1
Expand Down
72 changes: 55 additions & 17 deletions Lib/test/test_htmlparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -539,55 +539,92 @@ def test_EOF_in_charref(self):
for html, expected in data:
self._run_check(html, expected)

def test_broken_comments(self):
def test_unescape_method(self):
from html import unescape
p = self.get_collector()
with self.assertWarns(DeprecationWarning):
s = '&quot;&#34;&#x22;&quot&#34&#x22&#bad;'
self.assertEqual(p.unescape(s), unescape(s))

def test_bogus_comments(self):
html = ('<! not really a comment >'
'<! not a comment either -->'
'<! -- close enough -->'
'<!><!<-- this was an empty comment>'
'<!!! another bogus comment !!!>')
'<!!! another bogus comment !!!>'
# see #32876
'<![with square brackets]!>'
'<![\nmultiline\nbogusness\n]!>'
'<![more brackets]-[and a hyphen]!>'
'<![cdata[should be uppercase]]>')
expected = [
('comment', ' not really a comment '),
('comment', ' not a comment either --'),
('comment', ' -- close enough --'),
('comment', ''),
('comment', '<-- this was an empty comment'),
('comment', '!! another bogus comment !!!'),
('comment', '[with square brackets]!'),
('comment', '[\nmultiline\nbogusness\n]!'),
('comment', '[more brackets]-[and a hyphen]!'),
('comment', '[cdata[should be uppercase]]'),
]
self._run_check(html, expected)

def test_broken_condcoms(self):
# these condcoms are missing the '--' after '<!' and before the '>'
# and they are considered bogus comments according to
# "8.2.4.42. Markup declaration open state"
html = ('<![if !(IE)]>broken condcom<![endif]>'
'<![if ! IE]><link href="favicon.tiff"/><![endif]>'
'<![if !IE 6]><img src="firefox.png" /><![endif]>'
'<![if !ie 6]><b>foo</b><![endif]>'
'<![if (!IE)|(lt IE 9)]><img src="mammoth.bmp" /><![endif]>')
# According to the HTML5 specs sections "8.2.4.44 Bogus comment state"
# and "8.2.4.45 Markup declaration open state", comment tokens should
# be emitted instead of 'unknown decl', but calling unknown_decl
# provides more flexibility.
# See also Lib/_markupbase.py:parse_declaration
expected = [
('unknown decl', 'if !(IE)'),
('comment', '[if !(IE)]'),
('data', 'broken condcom'),
('unknown decl', 'endif'),
('unknown decl', 'if ! IE'),
('comment', '[endif]'),
('comment', '[if ! IE]'),
('startendtag', 'link', [('href', 'favicon.tiff')]),
('unknown decl', 'endif'),
('unknown decl', 'if !IE 6'),
('comment', '[endif]'),
('comment', '[if !IE 6]'),
('startendtag', 'img', [('src', 'firefox.png')]),
('unknown decl', 'endif'),
('unknown decl', 'if !ie 6'),
('comment', '[endif]'),
('comment', '[if !ie 6]'),
('starttag', 'b', []),
('data', 'foo'),
('endtag', 'b'),
('unknown decl', 'endif'),
('unknown decl', 'if (!IE)|(lt IE 9)'),
('comment', '[endif]'),
('comment', '[if (!IE)|(lt IE 9)]'),
('startendtag', 'img', [('src', 'mammoth.bmp')]),
('unknown decl', 'endif')
('comment', '[endif]')
]
self._run_check(html, expected)

def test_cdata_declarations(self):
# More tests should be added. See also "8.2.4.42. Markup
# declaration open state", "8.2.4.69. CDATA section state",
# and issue 32876
html = ('<![CDATA[just some plain text]]>')
expected = [('unknown decl', 'CDATA[just some plain text')]
self._run_check(html, expected)

def test_cdata_declarations_multiline(self):
html = ('<code><![CDATA['
' if (a < b && a > b) {'
' printf("[<marquee>How?</marquee>]");'
' }'
']]></code>')
expected = [
('starttag', 'code', []),
('unknown decl',
'CDATA[ if (a < b && a > b) { '
'printf("[<marquee>How?</marquee>]"); }'),
('endtag', 'code')
]
self._run_check(html, expected)


def test_convert_charrefs_dropped_text(self):
# #23144: make sure that all the events are triggered when
# convert_charrefs is True, even if we don't call .close()
Expand All @@ -601,6 +638,7 @@ def test_convert_charrefs_dropped_text(self):
)



class AttributesTestCase(TestCaseBase):

def test_attr_syntax(self):
Expand Down
Loading