From 0176f765d46f7a86e7fa566c7f9fa5e4869ab88c Mon Sep 17 00:00:00 2001 From: "Nam T. Nguyen" Date: Sun, 28 May 2017 16:15:52 -0700 Subject: [PATCH] urllib: Simplify splithost by calling into urlparse. The current regex-based splitting produces a wrong result. For example:: http://abc#@def Web browsers parse that URL as ``http://abc/#@def``, that is, the host is ``abc``, the path is ``/``, and the fragment is ``@def``, whereas the regex-based ``splithost`` does not stop at ``#`` and returns ``abc#@def`` as the host. --- Lib/test/test_urllib.py | 2 ++ Lib/urllib.py | 23 ++++++++--------------- 2 files changed, 10 insertions(+), 15 deletions(-) diff --git a/Lib/test/test_urllib.py b/Lib/test/test_urllib.py index 14de91e13dad19..0e14946a8b1e5f 100644 --- a/Lib/test/test_urllib.py +++ b/Lib/test/test_urllib.py @@ -878,6 +878,8 @@ def test_splithost(self): ('www.example.org:80', '')) self.assertEqual(splithost('/foo/bar/baz.html'), (None, '/foo/bar/baz.html')) + self.assertEqual(splithost('//127.0.0.1#@host.com'), + ('127.0.0.1', '/#@host.com')) def test_splituser(self): splituser = urllib.splituser diff --git a/Lib/urllib.py b/Lib/urllib.py index c3c8ef4b600484..931fd51c807dcd 100644 --- a/Lib/urllib.py +++ b/Lib/urllib.py @@ -30,7 +30,7 @@ import base64 import re -from urlparse import urljoin as basejoin +from urlparse import urljoin as basejoin, urlparse, urlunparse __all__ = ["urlopen", "URLopener", "FancyURLopener", "urlretrieve", "urlcleanup", "quote", "quote_plus", "unquote", "unquote_plus", @@ -1088,22 +1088,15 @@ def splittype(url): return scheme.lower(), url[len(scheme) + 1:] return None, url -_hostprog = None def splithost(url): """splithost('//host[:port]/path') --> 'host[:port]', '/path'.""" - global _hostprog - if _hostprog is None: - import re - _hostprog = re.compile('^//([^/?]*)(.*)$') - - match = _hostprog.match(url) - if match: - host_port = match.group(1) - path = match.group(2) - if path and not path.startswith('/'): - path = '/' + path - return host_port, path - return None, url + fields = urlparse(url) + path = fields.path + rest = 
[fields.params, fields.query, fields.fragment] + if not path and any(rest): + path = '/' + return (None if fields.netloc == '' else fields.netloc, + urlunparse(['', '', path] + rest)) _userprog = None def splituser(host):