Skip to content

gh-132637: Fix positional predicates in xpath when a default namespace is provided #132822

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 56 additions & 0 deletions Lib/test/test_xml_etree.py
Original file line number Diff line number Diff line change
Expand Up @@ -3168,6 +3168,47 @@ def test_find_xpath(self):
self.assertRaisesRegex(SyntaxError, 'XPath', e.find, './tag[last()-0]')
self.assertRaisesRegex(SyntaxError, 'XPath', e.find, './tag[last()+1]')

def test_find_xpath_namespaces(self):
    # Positional predicates ([N], [last()], [last()-N]) combined with a
    # default-namespace mapping passed through ``namespaces``.
    source = '''
<body xmlns="X">
<tag class='a'/>
<tag class='b'/>
<tag class='c'/>
<tag class='d'/>
</body>'''
    nsmap = {"": "X"}
    body = ET.XML(source)

    # Test for numeric indexing and last(): each path must select the
    # <tag> child carrying the paired 'class' attribute value.
    expected_class_by_path = (
        ('./tag[1]', 'a'),
        ('./tag[2]', 'b'),
        ('./tag[last()]', 'd'),
        ('./tag[last()-1]', 'c'),
        ('./tag[last()-2]', 'b'),
    )
    for path, expected_class in expected_class_by_path:
        found = body.find(path, namespaces=nsmap)
        self.assertEqual(found.attrib['class'], expected_class)

    # These positions must be rejected with a SyntaxError mentioning XPath.
    rejected_paths = (
        './tag[0]',
        './tag[-1]',
        './tag[last()-0]',
        './tag[last()+1]',
    )
    for path in rejected_paths:
        with self.assertRaisesRegex(SyntaxError, 'XPath'):
            body.find(path, namespaces=nsmap)

def test_findall(self):
e = ET.XML(SAMPLE_XML)
e[2] = ET.XML(SAMPLE_SECTION)
Expand Down Expand Up @@ -3307,6 +3348,21 @@ def test_findall_different_nsmaps(self):
self.assertEqual(len(root.findall(".//xx:b", namespaces=nsmap)), 2)
self.assertEqual(len(root.findall(".//b", namespaces=nsmap)), 1)

def test_findall_default_nsmap_position_predicate(self):
    # findall() with positional predicates when only the default namespace
    # is mapped: prefixed siblings (x:b, y:b) sit next to the unprefixed
    # <b> elements that the paths below are expected to select.
    tree = ET.XML('''
<a xmlns="default" xmlns:x="X" xmlns:y="Y">
<x:b><c/></x:b>
<b/>
<b/>
<c><x:b/><b/></c><y:b/>
</a>''')
    default_only = {'': 'default'}
    leading_b, trailing_b = tree[1], tree[2]

    # ".//b[1]" is expected to yield two elements, the first being tree[1].
    first_hits = tree.findall(".//b[1]", namespaces=default_only)
    self.assertEqual(len(first_hits), 2)
    self.assertEqual(first_hits[0], leading_b)

    # ".//b[last()]" is expected to yield tree[2] first.
    last_hits = tree.findall(".//b[last()]", namespaces=default_only)
    self.assertEqual(last_hits[0], trailing_b)

def test_findall_wildcard(self):
root = ET.XML('''
<a xmlns:x="X" xmlns:y="Y">
Expand Down
21 changes: 18 additions & 3 deletions Lib/xml/etree/ElementPath.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@
xpath_tokenizer_re = re.compile(
r"("
r"'[^']*'|\"[^\"]*\"|"
r"last\(\)|" # Pick out the only xpath function currently supported
r"::|"
r"//?|"
r"\.\.|"
Expand All @@ -71,6 +72,9 @@
r"\s+"
)

# Matches a string that consists solely of an integer, optionally preceded
# by - or +.  Meant to be used with .match(), which anchors the start; the
# trailing "$" anchors the end.
int_re = re.compile(r"[\+\-]?\d+$")

def xpath_tokenizer(pattern, namespaces=None):
default_namespace = namespaces.get('') if namespaces else None
parsing_attribute = False
Expand All @@ -85,11 +89,22 @@ def xpath_tokenizer(pattern, namespaces=None):
yield ttype, "{%s}%s" % (namespaces[prefix], uri)
except KeyError:
raise SyntaxError("prefix %r not found in prefix map" % prefix) from None
elif default_namespace and not parsing_attribute:
# We don't prepend the default_namespace when:
# - the tag is an attribute as the xml spec says default namespaces
# don't apply to attributes
# - when the tag is a number, possibly preceded by - or +, as these
# are not valid characters to start a tag with and are probably
# used as positional predicates.
elif default_namespace and not (parsing_attribute or int_re.match(tag)):
yield ttype, "{%s}%s" % (default_namespace, tag)
else:
yield token
parsing_attribute = False
elif ttype == 'last()':
# Split the matched 'last()' token into the two (ttype, tag) pairs that
# consumers of this generator expect: ('', 'last') followed by ('()', '')
yield ('', 'last')
yield ('()', '')
else:
yield token
parsing_attribute = ttype == '@'
Expand Down Expand Up @@ -266,7 +281,7 @@ def select_negated(context, result):
if (attr_value := elem.get(key)) is not None and attr_value != value:
yield elem
return select_negated if '!=' in signature else select
if signature == "-" and not re.match(r"\-?\d+$", predicate[0]):
if signature == "-" and not int_re.match(predicate[0]):
# [tag]
tag = predicate[0]
def select(context, result):
Expand All @@ -276,7 +291,7 @@ def select(context, result):
return select
if signature == ".='" or signature == ".!='" or (
(signature == "-='" or signature == "-!='")
and not re.match(r"\-?\d+$", predicate[0])):
and not int_re.match(predicate[0])):
# [.='value'] or [tag='value'] or [.!='value'] or [tag!='value']
tag = predicate[0]
value = predicate[-1]
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Fix ``xml.etree.ElementPath.xpath_tokenizer`` to correctly handle
positional predicates when a default namespace is provided in the
``namespaces`` argument.
Loading