Skip to content

Commit e6d9fcb

Browse files
authored
bpo-31170: Write unit test for Expat 2.2.4 UTF-8 bug (#3570)
Non-regression tests for the Expat 2.2.3 UTF-8 decoder bug.
1 parent 49392c6 commit e6d9fcb

File tree

2 files changed

+34
-0
lines changed

2 files changed

+34
-0
lines changed

Lib/test/test_xml_etree.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
except UnicodeEncodeError:
3535
raise unittest.SkipTest("filename is not encodable to utf8")
3636
SIMPLE_NS_XMLFILE = findfile("simple-ns.xml", subdir="xmltestdata")
37+
UTF8_BUG_XMLFILE = findfile("expat224_utf8_bug.xml", subdir="xmltestdata")
3738

3839
SAMPLE_XML = """\
3940
<body>
@@ -1739,6 +1740,37 @@ def __eq__(self, other):
17391740
self.assertIsInstance(e[0].tag, str)
17401741
self.assertEqual(e[0].tag, 'changed')
17411742

1743+
def check_expat224_utf8_bug(self, text):
    """Check that *text* survives a parse as an XML attribute value.

    *text* is a bytes object; it is embedded as the value of the 'b'
    attribute of an <a/> element, the document is parsed, and the parsed
    attribute must equal *text* decoded as UTF-8.
    """
    document = b'<a b="%s"/>' % text
    element = ET.XML(document)
    parsed_value = element.get('b')
    self.assertEqual(parsed_value, text.decode('utf-8'))
1747+
1748+
def test_expat224_utf8_bug(self):
    # bpo-31170: Expat 2.2.3 had a bug in its UTF-8 decoder.
    # Check that Expat 2.2.4 fixed the bug.
    #
    # Test buffer bounds at odd and even positions: the payload is made of
    # two-byte UTF-8 sequences, so a one-byte ASCII prefix shifts every
    # sequence by one byte.
    payload = b'\xc3\xa0' * 1024
    for prefix in (b'', b'x'):
        self.check_expat224_utf8_bug(prefix + payload)
1759+
1760+
def test_expat224_utf8_bug_file(self):
    # bpo-31170: parse the UTF-8 fixture file and check that the 'b'
    # attribute seen by the parser matches the text stored in the file.
    with open(UTF8_BUG_XMLFILE, 'rb') as fp:
        raw = fp.read()
    root = ET.fromstring(raw)
    xmlattr = root.get('b')

    # "Parse" the XML file manually to extract the value of the 'b'
    # attribute of the <a b='xxx' /> XML element.
    prefix = "<a b='"
    suffix = "' />"
    text = raw.decode('utf-8').strip()
    # An XML parser normalizes every line break inside an attribute value
    # to a single space, whatever the line-ending convention of the file.
    # Replace '\r\n' first so that it yields one space, not two.
    for eol in ('\r\n', '\r', '\n'):
        text = text.replace(eol, ' ')
    text = text[len(prefix):-len(suffix)]
    self.assertEqual(xmlattr, text)
1772+
1773+
17421774

17431775
# --------------------------------------------------------------------
17441776

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
<a b='01234567890123456古人咏雪抽幽思骋妍辞竞险韵偶得一编奇绝辄擅美当时流声后代是以北门之风南山之雅梁园之简黄台之赋至今为作家称述尚矣及至洛阳之卧剡溪之兴灞桥之思亦皆传为故事钱塘沈履德先生隐居西湖两峰间孤高贞洁与雪同调方大雪满天皴肤粟背之际先生乃鹿中豹舄端居闭门或扶童曳杖踏遍六桥三竺时取古人诗讽咏之合唐宋元诸名家集句成诗得二百四十章联络通穿如出一人如呵一气气立于言表格备于篇中略无掇拾补凑之形非胸次包罗壮阔笔底驱走鲍谢欧苏诸公不能为此世称王荆公为集句擅长观其在钟山对雪仅题数篇未见有此噫嘻奇矣哉亦富矣哉予慕先生有袁安之节愧不能为慧可之立乃取新集命工传写使海内同好者知先生为博古传述之士而一新世人之耳目他日必有慕潜德阐幽光而剞劂以传者余实为之执殳矣
2+
弘治戊午仲冬望日慈溪杨子器衵于海虞官舍序毕诗部' />

0 commit comments

Comments
 (0)