Skip to content

Commit 5f5da72

Browse files
authored
bpo-31170: Write unit test for Expat 2.2.4 UTF-8 bug (#3570) (#3745)
Non-regression tests for the Expat 2.2.3 UTF-8 decoder bug. (cherry picked from commit e6d9fcb)
1 parent cf7197a commit 5f5da72

File tree

2 files changed

+33
-0
lines changed

2 files changed

+33
-0
lines changed

Lib/test/test_xml_etree.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030

3131
# Sample XML documents, located through findfile() with subdir="xmltestdata".
SIMPLE_XMLFILE = findfile("simple.xml", subdir="xmltestdata")
3232
SIMPLE_NS_XMLFILE = findfile("simple-ns.xml", subdir="xmltestdata")
33+
# Document opened by test_expat224_utf8_bug_file (bpo-31170 regression test).
UTF8_BUG_XMLFILE = findfile("expat224_utf8_bug.xml", subdir="xmltestdata")
3334

3435
SAMPLE_XML = """\
3536
<body>
@@ -1494,6 +1495,36 @@ def test_issue10777(self):
14941495
ET.register_namespace('test10777', 'http://myuri/')
14951496
ET.register_namespace('test10777', 'http://myuri/')
14961497

1498+
def check_expat224_utf8_bug(self, text):
    # Helper: embed the UTF-8 encoded bytes *text* as the value of the 'b'
    # attribute of an <a/> element, parse the document, and check that the
    # parsed attribute equals the decoded text.
    document = b'<a b="' + text + b'"/>'
    element = ET.XML(document)
    # Decode only after parsing so a parser failure is reported first.
    expected = text.decode('utf-8')
    self.assertEqual(element.get('b'), expected)
1502+
1503+
def test_expat224_utf8_bug(self):
    # bpo-31170: Expat 2.2.3 had a bug in its UTF-8 decoder.
    # Check that Expat 2.2.4 fixed the bug.
    #
    # The same run of two-byte sequences is checked twice: once as-is and
    # once shifted by a single ASCII byte, so that buffer bounds are
    # tested at both odd and even positions.
    two_byte_run = b'\xc3\xa0' * 1024
    for prefix in (b'', b'x'):
        self.check_expat224_utf8_bug(prefix + two_byte_run)
1514+
1515+
def test_expat224_utf8_bug_file(self):
    # bpo-31170: parse the sample document with ElementTree and compare
    # the value of the 'b' attribute against a value extracted by hand
    # from the raw bytes of the same file.
    with open(UTF8_BUG_XMLFILE, 'rb') as fp:
        raw = fp.read()
    root = ET.fromstring(raw)
    xmlattr = root.get('b')

    # "Parse" manually the XML file to extract the value of the 'b'
    # attribute of the <a b='xxx' /> XML element
    text = raw.decode('utf-8').strip()
    # An XML processor first normalizes every line end ("\r\n", lone "\r",
    # "\n") to "\n" and then turns each "\n" inside an attribute value into
    # a single space.  Mirror that for all three forms so the test does not
    # depend on the line-ending style of the checked-out data file.
    # "\r\n" must be handled first so it yields one space, not two.
    for line_end in ('\r\n', '\r', '\n'):
        text = text.replace(line_end, ' ')
    # Drop the leading "<a b='" (6 characters) and trailing "' />" (4).
    text = text[6:-4]
    self.assertEqual(xmlattr, text)
1527+
14971528

14981529
# --------------------------------------------------------------------
14991530

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
<a b='01234567890123456古人咏雪抽幽思骋妍辞竞险韵偶得一编奇绝辄擅美当时流声后代是以北门之风南山之雅梁园之简黄台之赋至今为作家称述尚矣及至洛阳之卧剡溪之兴灞桥之思亦皆传为故事钱塘沈履德先生隐居西湖两峰间孤高贞洁与雪同调方大雪满天皴肤粟背之际先生乃鹿中豹舄端居闭门或扶童曳杖踏遍六桥三竺时取古人诗讽咏之合唐宋元诸名家集句成诗得二百四十章联络通穿如出一人如呵一气气立于言表格备于篇中略无掇拾补凑之形非胸次包罗壮阔笔底驱走鲍谢欧苏诸公不能为此世称王荆公为集句擅长观其在钟山对雪仅题数篇未见有此噫嘻奇矣哉亦富矣哉予慕先生有袁安之节愧不能为慧可之立乃取新集命工传写使海内同好者知先生为博古传述之士而一新世人之耳目他日必有慕潜德阐幽光而剞劂以传者余实为之执殳矣
2+
弘治戊午仲冬望日慈溪杨子器衵于海虞官舍序毕诗部' />

0 commit comments

Comments
 (0)