Skip to content

bpo-30285: Optimize case-insensitive matching and searching #1482

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
May 9, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions Doc/whatsnew/3.7.rst
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,10 @@ Optimizations
using the :func:`os.scandir` function.
(Contributed by Serhiy Storchaka in :issue:`25996`.)

* Optimized case-insensitive matching and searching of :mod:`regular
expressions <re>`. Searching some patterns can now be up to 20 times faster.
(Contributed by Serhiy Storchaka in :issue:`30285`.)


Build and C API Changes
=======================
Expand Down
171 changes: 102 additions & 69 deletions Lib/sre_compile.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,13 +69,16 @@ def _compile(code, pattern, flags):
REPEATING_CODES = _REPEATING_CODES
SUCCESS_CODES = _SUCCESS_CODES
ASSERT_CODES = _ASSERT_CODES
iscased = None
tolower = None
fixes = None
if flags & SRE_FLAG_IGNORECASE and not flags & SRE_FLAG_LOCALE:
if flags & SRE_FLAG_UNICODE and not flags & SRE_FLAG_ASCII:
iscased = _sre.unicode_iscased
tolower = _sre.unicode_tolower
fixes = _ignorecase_fixes
else:
iscased = _sre.ascii_iscased
tolower = _sre.ascii_tolower
for op, av in pattern:
if op in LITERAL_CODES:
Expand All @@ -85,6 +88,9 @@ def _compile(code, pattern, flags):
elif flags & SRE_FLAG_LOCALE:
emit(OP_LOC_IGNORE[op])
emit(av)
elif not iscased(av):
emit(op)
emit(av)
else:
lo = tolower(av)
if fixes and lo in fixes:
Expand All @@ -101,14 +107,15 @@ def _compile(code, pattern, flags):
emit(OP_IGNORE[op])
emit(lo)
elif op is IN:
if not flags & SRE_FLAG_IGNORECASE:
emit(op)
elif flags & SRE_FLAG_LOCALE:
charset, hascased = _optimize_charset(av, iscased, tolower, fixes)
if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
emit(IN_LOC_IGNORE)
else:
elif hascased:
emit(IN_IGNORE)
else:
emit(IN)
skip = _len(code); emit(0)
_compile_charset(av, flags, code, tolower, fixes)
_compile_charset(charset, flags, code)
code[skip] = _len(code) - skip
elif op is ANY:
if flags & SRE_FLAG_DOTALL:
Expand Down Expand Up @@ -223,10 +230,10 @@ def _compile(code, pattern, flags):
else:
raise error("internal: unsupported operand type %r" % (op,))

def _compile_charset(charset, flags, code, fixup=None, fixes=None):
def _compile_charset(charset, flags, code):
# compile charset subprogram
emit = code.append
for op, av in _optimize_charset(charset, fixup, fixes):
for op, av in charset:
emit(op)
if op is NEGATE:
pass
Expand All @@ -250,11 +257,12 @@ def _compile_charset(charset, flags, code, fixup=None, fixes=None):
raise error("internal: unsupported set operator %r" % (op,))
emit(FAILURE)

def _optimize_charset(charset, fixup, fixes):
def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
# internal: optimize character set
out = []
tail = []
charmap = bytearray(256)
hascased = False
for op, av in charset:
while True:
try:
Expand All @@ -265,18 +273,24 @@ def _optimize_charset(charset, fixup, fixes):
if fixes and lo in fixes:
for k in fixes[lo]:
charmap[k] = 1
if not hascased and iscased(av):
hascased = True
else:
charmap[av] = 1
elif op is RANGE:
r = range(av[0], av[1]+1)
if fixup:
r = map(fixup, r)
if fixup and fixes:
for i in r:
charmap[i] = 1
if i in fixes:
for k in fixes[i]:
charmap[k] = 1
if fixes:
for i in map(fixup, r):
charmap[i] = 1
if i in fixes:
for k in fixes[i]:
charmap[k] = 1
else:
for i in map(fixup, r):
charmap[i] = 1
if not hascased:
hascased = any(map(iscased, r))
else:
for i in r:
charmap[i] = 1
Expand All @@ -290,11 +304,13 @@ def _optimize_charset(charset, fixup, fixes):
charmap += b'\0' * 0xff00
continue
# Character set contains non-BMP character codes.
# There are only two ranges of cased non-BMP characters:
# 10400-1044F (Deseret) and 118A0-118DF (Warang Citi),
# and for both ranges RANGE_IGNORE works.
if fixup and op is RANGE:
op = RANGE_IGNORE
if fixup:
hascased = True
# There are only two ranges of cased non-BMP characters:
# 10400-1044F (Deseret) and 118A0-118DF (Warang Citi),
# and for both ranges RANGE_IGNORE works.
if op is RANGE:
op = RANGE_IGNORE
tail.append((op, av))
break

Expand Down Expand Up @@ -322,17 +338,17 @@ def _optimize_charset(charset, fixup, fixes):
out.append((RANGE, (p, q - 1)))
out += tail
# if the case was changed or new representation is more compact
if fixup or len(out) < len(charset):
return out
if hascased or len(out) < len(charset):
return out, hascased
# else original character set is good enough
return charset
return charset, hascased

# use bitmap
if len(charmap) == 256:
data = _mk_bitmap(charmap)
out.append((CHARSET, data))
out += tail
return out
return out, hascased

# To represent a big charset, first a bitmap of all characters in the
# set is constructed. Then, this bitmap is sliced into chunks of 256
Expand Down Expand Up @@ -371,7 +387,7 @@ def _optimize_charset(charset, fixup, fixes):
data[0:0] = [block] + _bytes_to_codes(mapping)
out.append((BIGCHARSET, data))
out += tail
return out
return out, hascased

_CODEBITS = _sre.CODESIZE * 8
MAXCODE = (1 << _CODEBITS) - 1
Expand Down Expand Up @@ -414,19 +430,31 @@ def _generate_overlap_table(prefix):
table[i] = idx + 1
return table

def _get_literal_prefix(pattern):
def _get_iscased(flags):
    # internal: choose the "is this character cased?" predicate for flags.
    # Yields None when SRE_FLAG_IGNORECASE is not set, so callers can test
    # the result for truth before calling it.  Otherwise yields
    # _sre.unicode_iscased when SRE_FLAG_UNICODE is set and SRE_FLAG_ASCII
    # is not, and _sre.ascii_iscased in every remaining case.
    if not flags & SRE_FLAG_IGNORECASE:
        # case-sensitive matching: no predicate to hand out
        return None
    unicode_rules = flags & SRE_FLAG_UNICODE and not flags & SRE_FLAG_ASCII
    return _sre.unicode_iscased if unicode_rules else _sre.ascii_iscased

def _get_literal_prefix(pattern, flags):
# look for literal prefix
prefix = []
prefixappend = prefix.append
prefix_skip = None
iscased = _get_iscased(flags)
for op, av in pattern.data:
if op is LITERAL:
if iscased and iscased(av):
break
prefixappend(av)
elif op is SUBPATTERN:
group, add_flags, del_flags, p = av
if add_flags & SRE_FLAG_IGNORECASE:
flags1 = (flags | add_flags) & ~del_flags
if flags1 & SRE_FLAG_IGNORECASE and flags1 & SRE_FLAG_LOCALE:
break
prefix1, prefix_skip1, got_all = _get_literal_prefix(p)
prefix1, prefix_skip1, got_all = _get_literal_prefix(p, flags1)
if prefix_skip is None:
if group is not None:
prefix_skip = len(prefix)
Expand All @@ -441,46 +469,49 @@ def _get_literal_prefix(pattern):
return prefix, prefix_skip, True
return prefix, prefix_skip, False

def _get_charset_prefix(pattern):
charset = [] # not used
charsetappend = charset.append
if pattern.data:
def _get_charset_prefix(pattern, flags):
while True:
if not pattern.data:
return None
op, av = pattern.data[0]
if op is SUBPATTERN:
group, add_flags, del_flags, p = av
if p and not (add_flags & SRE_FLAG_IGNORECASE):
op, av = p[0]
if op is LITERAL:
charsetappend((op, av))
elif op is BRANCH:
c = []
cappend = c.append
for p in av[1]:
if not p:
break
op, av = p[0]
if op is LITERAL:
cappend((op, av))
else:
break
else:
charset = c
elif op is BRANCH:
c = []
cappend = c.append
for p in av[1]:
if not p:
break
op, av = p[0]
if op is LITERAL:
cappend((op, av))
else:
break
if op is not SUBPATTERN:
break
group, add_flags, del_flags, pattern = av
flags = (flags | add_flags) & ~del_flags
if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
return None

iscased = _get_iscased(flags)
if op is LITERAL:
if iscased and iscased(av):
return None
return [(op, av)]
elif op is BRANCH:
charset = []
charsetappend = charset.append
for p in av[1]:
if not p:
return None
op, av = p[0]
if op is LITERAL and not (iscased and iscased(av)):
charsetappend((op, av))
else:
charset = c
elif op is IN:
charset = av
return charset
return None
return charset
elif op is IN:
charset = av
if iscased:
for op, av in charset:
if op is LITERAL:
if iscased(av):
return None
elif op is RANGE:
if av[1] > 0xffff:
return None
if any(map(iscased, range(av[0], av[1]+1))):
return None
return charset
return None

def _compile_info(code, pattern, flags):
# internal: compile an info block. in the current version,
Expand All @@ -496,12 +527,12 @@ def _compile_info(code, pattern, flags):
prefix = []
prefix_skip = 0
charset = [] # not used
if not (flags & SRE_FLAG_IGNORECASE):
if not (flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE):
# look for literal prefix
prefix, prefix_skip, got_all = _get_literal_prefix(pattern)
prefix, prefix_skip, got_all = _get_literal_prefix(pattern, flags)
# if no prefix, look for charset prefix
if not prefix:
charset = _get_charset_prefix(pattern)
charset = _get_charset_prefix(pattern, flags)
## if prefix:
## print("*** PREFIX", prefix, prefix_skip)
## if charset:
Expand Down Expand Up @@ -536,6 +567,8 @@ def _compile_info(code, pattern, flags):
# generate overlap table
code.extend(_generate_overlap_table(prefix))
elif charset:
charset, hascased = _optimize_charset(charset)
assert not hascased
_compile_charset(charset, flags, code)
code[skip] = len(code) - skip

Expand Down
9 changes: 9 additions & 0 deletions Lib/test/test_re.py
Original file line number Diff line number Diff line change
Expand Up @@ -891,15 +891,24 @@ def test_case_helpers(self):
lo = ord(c.lower())
self.assertEqual(_sre.ascii_tolower(i), lo)
self.assertEqual(_sre.unicode_tolower(i), lo)
iscased = c in string.ascii_letters
self.assertEqual(_sre.ascii_iscased(i), iscased)
self.assertEqual(_sre.unicode_iscased(i), iscased)

for i in list(range(128, 0x1000)) + [0x10400, 0x10428]:
c = chr(i)
self.assertEqual(_sre.ascii_tolower(i), i)
if i != 0x0130:
self.assertEqual(_sre.unicode_tolower(i), ord(c.lower()))
iscased = c != c.lower() or c != c.upper()
self.assertFalse(_sre.ascii_iscased(i))
self.assertEqual(_sre.unicode_iscased(i),
c != c.lower() or c != c.upper())

self.assertEqual(_sre.ascii_tolower(0x0130), 0x0130)
self.assertEqual(_sre.unicode_tolower(0x0130), ord('i'))
self.assertFalse(_sre.ascii_iscased(0x0130))
self.assertTrue(_sre.unicode_iscased(0x0130))

def test_not_literal(self):
self.assertEqual(re.search(r"\s([^a])", " b").group(1), "b")
Expand Down
3 changes: 3 additions & 0 deletions Misc/NEWS
Original file line number Diff line number Diff line change
Expand Up @@ -320,6 +320,9 @@ Extension Modules
Library
-------

- bpo-30285: Optimized case-insensitive matching and searching of regular
expressions.

- bpo-29990: Fix range checking in GB18030 decoder. Original patch by Ma Lin.

- bpo-29979: rewrite cgi.parse_multipart, reusing the FieldStorage class and
Expand Down
34 changes: 34 additions & 0 deletions Modules/_sre.c
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,38 @@ _sre_getcodesize_impl(PyObject *module)
return sizeof(SRE_CODE);
}

/*[clinic input]
_sre.ascii_iscased -> bool

character: int
/

[clinic start generated code]*/

static int
_sre_ascii_iscased_impl(PyObject *module, int character)
/*[clinic end generated code: output=4f454b630fbd19a2 input=9f0bd952812c7ed3]*/
{
    /* A code point counts as "cased" when converting it with sre_lower()
       or sre_upper() yields a different value.  Returns 1 in that case
       and 0 otherwise. */
    const unsigned int code = (unsigned int)character;

    if (sre_lower(code) != code) {
        return 1;
    }
    /* Lowercasing left it unchanged; only uppercasing can still differ. */
    return sre_upper(code) != code;
}

/*[clinic input]
_sre.unicode_iscased -> bool

character: int
/

[clinic start generated code]*/

static int
_sre_unicode_iscased_impl(PyObject *module, int character)
/*[clinic end generated code: output=9c5ddee0dc2bc258 input=51e42c3b8dddb78e]*/
{
    /* A code point counts as "cased" when converting it with
       sre_lower_unicode() or sre_upper_unicode() yields a different
       value.  Returns 1 in that case and 0 otherwise. */
    const unsigned int code = (unsigned int)character;

    if (sre_lower_unicode(code) != code) {
        return 1;
    }
    /* Lowercasing left it unchanged; only uppercasing can still differ. */
    return sre_upper_unicode(code) != code;
}

/*[clinic input]
_sre.ascii_tolower -> int

Expand Down Expand Up @@ -2750,6 +2782,8 @@ static PyTypeObject Scanner_Type = {
static PyMethodDef _functions[] = {
_SRE_COMPILE_METHODDEF
_SRE_GETCODESIZE_METHODDEF
_SRE_ASCII_ISCASED_METHODDEF
_SRE_UNICODE_ISCASED_METHODDEF
_SRE_ASCII_TOLOWER_METHODDEF
_SRE_UNICODE_TOLOWER_METHODDEF
{NULL, NULL}
Expand Down
Loading