From 4c1534660747b5b0f575e9b7e256c0df6af0954a Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Fri, 5 May 2017 20:19:00 +0300 Subject: [PATCH 1/4] bpo-30285: Optimize case-insensitive matching and searching of regular expressions. --- Doc/whatsnew/3.7.rst | 4 + Lib/sre_compile.py | 159 +++++++++++++++++++++++++--------------- Lib/test/test_re.py | 9 +++ Misc/NEWS | 3 + Modules/_sre.c | 34 +++++++++ Modules/clinic/_sre.c.h | 64 +++++++++++++++- 6 files changed, 212 insertions(+), 61 deletions(-) diff --git a/Doc/whatsnew/3.7.rst b/Doc/whatsnew/3.7.rst index 7edf4fc3cf4269..93be21f2f5bea5 100644 --- a/Doc/whatsnew/3.7.rst +++ b/Doc/whatsnew/3.7.rst @@ -196,6 +196,10 @@ Optimizations using the :func:`os.scandir` function. (Contributed by Serhiy Storchaka in :issue:`25996`.) +* Optimized case-insensitive matching and searching of :mod:`regular + expressions <re>`. Searching some patterns can now be up to 20 times faster. + (Contributed by Serhiy Storchaka in :issue:`30285`.) + Build and C API Changes ======================= diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py index db8b8a2778f582..3940b765d34e1f 100644 --- a/Lib/sre_compile.py +++ b/Lib/sre_compile.py @@ -69,13 +69,16 @@ def _compile(code, pattern, flags): REPEATING_CODES = _REPEATING_CODES SUCCESS_CODES = _SUCCESS_CODES ASSERT_CODES = _ASSERT_CODES + iscased = None tolower = None fixes = None if flags & SRE_FLAG_IGNORECASE and not flags & SRE_FLAG_LOCALE: if flags & SRE_FLAG_UNICODE and not flags & SRE_FLAG_ASCII: + iscased = _sre.unicode_iscased tolower = _sre.unicode_tolower fixes = _ignorecase_fixes else: + iscased = _sre.ascii_iscased tolower = _sre.ascii_tolower for op, av in pattern: if op in LITERAL_CODES: @@ -85,6 +88,9 @@ def _compile(code, pattern, flags): elif flags & SRE_FLAG_LOCALE: emit(OP_LOC_IGNORE[op]) emit(av) + elif not iscased(av): + emit(op) + emit(av) else: lo = tolower(av) if fixes and lo in fixes: @@ -101,14 +107,10 @@ def _compile(code, pattern, flags): emit(OP_IGNORE[op]) 
emit(lo) elif op is IN: - if not flags & SRE_FLAG_IGNORECASE: - emit(op) - elif flags & SRE_FLAG_LOCALE: - emit(IN_LOC_IGNORE) - else: - emit(IN_IGNORE) + emit(op) skip = _len(code); emit(0) - _compile_charset(av, flags, code, tolower, fixes) + op = _compile_charset(av, flags, code, tolower, fixes) + code[skip-1] = op code[skip] = _len(code) - skip elif op is ANY: if flags & SRE_FLAG_DOTALL: @@ -226,7 +228,8 @@ def _compile(code, pattern, flags): def _compile_charset(charset, flags, code, fixup=None, fixes=None): # compile charset subprogram emit = code.append - for op, av in _optimize_charset(charset, fixup, fixes): + opcs, charset = _optimize_charset(charset, flags, fixup, fixes) + for op, av in charset: emit(op) if op is NEGATE: pass @@ -249,17 +252,26 @@ def _compile_charset(charset, flags, code, fixup=None, fixes=None): else: raise error("internal: unsupported set operator %r" % (op,)) emit(FAILURE) + return opcs -def _optimize_charset(charset, fixup, fixes): +def _optimize_charset(charset, flags, fixup, fixes): # internal: optimize character set out = [] tail = [] charmap = bytearray(256) + hascased = False + if fixup: + if flags & SRE_FLAG_UNICODE and not flags & SRE_FLAG_ASCII: + iscased = _sre.unicode_iscased + else: + iscased = _sre.ascii_iscased for op, av in charset: while True: try: if op is LITERAL: if fixup: + if not hascased and iscased(av): + hascased = True lo = fixup(av) charmap[lo] = 1 if fixes and lo in fixes: @@ -268,7 +280,7 @@ def _optimize_charset(charset, fixup, fixes): else: charmap[av] = 1 elif op is RANGE: - r = range(av[0], av[1]+1) + r = r0 = range(av[0], av[1]+1) if fixup: r = map(fixup, r) if fixup and fixes: @@ -280,6 +292,8 @@ def _optimize_charset(charset, fixup, fixes): else: for i in r: charmap[i] = 1 + if fixup and not hascased: + hascased = any(map(iscased, r0)) elif op is NEGATE: out.append((op, av)) else: @@ -295,9 +309,18 @@ def _optimize_charset(charset, fixup, fixes): # and for both ranges RANGE_IGNORE works. 
if fixup and op is RANGE: op = RANGE_IGNORE + if fixup: + hascased = True tail.append((op, av)) break + if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE: + op = IN_LOC_IGNORE + elif hascased: + op = IN_IGNORE + else: + op = IN + # compress character map runs = [] q = 0 @@ -322,17 +345,17 @@ def _optimize_charset(charset, fixup, fixes): out.append((RANGE, (p, q - 1))) out += tail # if the case was changed or new representation is more compact - if fixup or len(out) < len(charset): - return out + if hascased or len(out) < len(charset): + return op, out # else original character set is good enough - return charset + return op, charset # use bitmap if len(charmap) == 256: data = _mk_bitmap(charmap) out.append((CHARSET, data)) out += tail - return out + return op, out # To represent a big charset, first a bitmap of all characters in the # set is constructed. Then, this bitmap is sliced into chunks of 256 @@ -371,7 +394,7 @@ def _optimize_charset(charset, fixup, fixes): data[0:0] = [block] + _bytes_to_codes(mapping) out.append((BIGCHARSET, data)) out += tail - return out + return op, out _CODEBITS = _sre.CODESIZE * 8 MAXCODE = (1 << _CODEBITS) - 1 @@ -414,19 +437,31 @@ def _generate_overlap_table(prefix): table[i] = idx + 1 return table -def _get_literal_prefix(pattern): +def _get_iscased(flags): + if not flags & SRE_FLAG_IGNORECASE: + return None + elif flags & SRE_FLAG_UNICODE and not flags & SRE_FLAG_ASCII: + return _sre.unicode_iscased + else: + return _sre.ascii_iscased + +def _get_literal_prefix(pattern, flags): # look for literal prefix prefix = [] prefixappend = prefix.append prefix_skip = None + iscased = _get_iscased(flags) for op, av in pattern.data: if op is LITERAL: + if iscased and iscased(av): + break prefixappend(av) elif op is SUBPATTERN: group, add_flags, del_flags, p = av - if add_flags & SRE_FLAG_IGNORECASE: + flags1 = (flags | add_flags) & ~del_flags + if flags1 & SRE_FLAG_IGNORECASE and flags1 & SRE_FLAG_LOCALE: break - prefix1, 
prefix_skip1, got_all = _get_literal_prefix(p) + prefix1, prefix_skip1, got_all = _get_literal_prefix(p, flags1) if prefix_skip is None: if group is not None: prefix_skip = len(prefix) @@ -441,46 +476,49 @@ def _get_literal_prefix(pattern): return prefix, prefix_skip, True return prefix, prefix_skip, False -def _get_charset_prefix(pattern): - charset = [] # not used - charsetappend = charset.append - if pattern.data: +def _get_charset_prefix(pattern, flags): + while True: + if not pattern.data: + return None op, av = pattern.data[0] - if op is SUBPATTERN: - group, add_flags, del_flags, p = av - if p and not (add_flags & SRE_FLAG_IGNORECASE): - op, av = p[0] - if op is LITERAL: - charsetappend((op, av)) - elif op is BRANCH: - c = [] - cappend = c.append - for p in av[1]: - if not p: - break - op, av = p[0] - if op is LITERAL: - cappend((op, av)) - else: - break - else: - charset = c - elif op is BRANCH: - c = [] - cappend = c.append - for p in av[1]: - if not p: - break - op, av = p[0] - if op is LITERAL: - cappend((op, av)) - else: - break + if op is not SUBPATTERN: + break + group, add_flags, del_flags, pattern = av + flags = (flags | add_flags) & ~del_flags + if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE: + return None + + iscased = _get_iscased(flags) + if op is LITERAL: + if iscased and iscased(av): + return None + return [(op, av)] + elif op is BRANCH: + charset = [] + charsetappend = charset.append + for p in av[1]: + if not p: + return None + op, av = p[0] + if op is LITERAL and not (iscased and iscased(av)): + charsetappend((op, av)) else: - charset = c - elif op is IN: - charset = av - return charset + return None + return charset + elif op is IN: + charset = av + if iscased: + for op, av in charset: + if op is LITERAL: + if iscased(av): + return None + elif op is RANGE: + if av[1] > 0xffff: + return None + if any(map(iscased, range(av[0], av[1]+1))): + return None + return charset + return None def _compile_info(code, pattern, flags): # 
internal: compile an info block. in the current version, @@ -496,12 +534,12 @@ def _compile_info(code, pattern, flags): prefix = [] prefix_skip = 0 charset = [] # not used - if not (flags & SRE_FLAG_IGNORECASE): + if not (flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE): # look for literal prefix - prefix, prefix_skip, got_all = _get_literal_prefix(pattern) + prefix, prefix_skip, got_all = _get_literal_prefix(pattern, flags) # if no prefix, look for charset prefix if not prefix: - charset = _get_charset_prefix(pattern) + charset = _get_charset_prefix(pattern, flags) ## if prefix: ## print("*** PREFIX", prefix, prefix_skip) ## if charset: @@ -536,7 +574,8 @@ def _compile_info(code, pattern, flags): # generate overlap table code.extend(_generate_overlap_table(prefix)) elif charset: - _compile_charset(charset, flags, code) + op = _compile_charset(charset, flags, code) + assert op is IN code[skip] = len(code) - skip def isstring(obj): diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index b5b7cff9a2a812..3129f7e9888bc5 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -891,15 +891,24 @@ def test_case_helpers(self): lo = ord(c.lower()) self.assertEqual(_sre.ascii_tolower(i), lo) self.assertEqual(_sre.unicode_tolower(i), lo) + iscased = c in string.ascii_letters + self.assertEqual(_sre.ascii_iscased(i), iscased) + self.assertEqual(_sre.unicode_iscased(i), iscased) for i in list(range(128, 0x1000)) + [0x10400, 0x10428]: c = chr(i) self.assertEqual(_sre.ascii_tolower(i), i) if i != 0x0130: self.assertEqual(_sre.unicode_tolower(i), ord(c.lower())) + iscased = c != c.lower() or c != c.upper() + self.assertFalse(_sre.ascii_iscased(i)) + self.assertEqual(_sre.unicode_iscased(i), + c != c.lower() or c != c.upper()) self.assertEqual(_sre.ascii_tolower(0x0130), 0x0130) self.assertEqual(_sre.unicode_tolower(0x0130), ord('i')) + self.assertFalse(_sre.ascii_iscased(0x0130)) + self.assertTrue(_sre.unicode_iscased(0x0130)) def test_not_literal(self): 
self.assertEqual(re.search(r"\s([^a])", " b").group(1), "b") diff --git a/Misc/NEWS b/Misc/NEWS index 25619efddf14de..c291f279a94230 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -317,6 +317,9 @@ Extension Modules Library ------- +- bpo-30285: Optimized case-insensitive matching and searching of regular + expressions. + - bpo-30243: Removed the __init__ methods of _json's scanner and encoder. Misusing them could cause memory leaks or crashes. Now scanner and encoder objects are completely initialized in the __new__ methods. diff --git a/Modules/_sre.c b/Modules/_sre.c index a86c5f252b5e58..6873f1db438d41 100644 --- a/Modules/_sre.c +++ b/Modules/_sre.c @@ -273,6 +273,38 @@ _sre_getcodesize_impl(PyObject *module) return sizeof(SRE_CODE); } +/*[clinic input] +_sre.ascii_iscased -> bool + + character: int + / + +[clinic start generated code]*/ + +static int +_sre_ascii_iscased_impl(PyObject *module, int character) +/*[clinic end generated code: output=4f454b630fbd19a2 input=9f0bd952812c7ed3]*/ +{ + unsigned int ch = (unsigned int)character; + return ch != sre_lower(ch) || ch != sre_upper(ch); +} + +/*[clinic input] +_sre.unicode_iscased -> bool + + character: int + / + +[clinic start generated code]*/ + +static int +_sre_unicode_iscased_impl(PyObject *module, int character) +/*[clinic end generated code: output=9c5ddee0dc2bc258 input=51e42c3b8dddb78e]*/ +{ + unsigned int ch = (unsigned int)character; + return ch != sre_lower_unicode(ch) || ch != sre_upper_unicode(ch); +} + /*[clinic input] _sre.ascii_tolower -> int @@ -2750,6 +2782,8 @@ static PyTypeObject Scanner_Type = { static PyMethodDef _functions[] = { _SRE_COMPILE_METHODDEF _SRE_GETCODESIZE_METHODDEF + _SRE_ASCII_ISCASED_METHODDEF + _SRE_UNICODE_ISCASED_METHODDEF _SRE_ASCII_TOLOWER_METHODDEF _SRE_UNICODE_TOLOWER_METHODDEF {NULL, NULL} diff --git a/Modules/clinic/_sre.c.h b/Modules/clinic/_sre.c.h index 8056eda3b73d2f..1e606860386a71 100644 --- a/Modules/clinic/_sre.c.h +++ b/Modules/clinic/_sre.c.h @@ -29,6 +29,68 
@@ _sre_getcodesize(PyObject *module, PyObject *Py_UNUSED(ignored)) return return_value; } +PyDoc_STRVAR(_sre_ascii_iscased__doc__, +"ascii_iscased($module, character, /)\n" +"--\n" +"\n"); + +#define _SRE_ASCII_ISCASED_METHODDEF \ + {"ascii_iscased", (PyCFunction)_sre_ascii_iscased, METH_O, _sre_ascii_iscased__doc__}, + +static int +_sre_ascii_iscased_impl(PyObject *module, int character); + +static PyObject * +_sre_ascii_iscased(PyObject *module, PyObject *arg) +{ + PyObject *return_value = NULL; + int character; + int _return_value; + + if (!PyArg_Parse(arg, "i:ascii_iscased", &character)) { + goto exit; + } + _return_value = _sre_ascii_iscased_impl(module, character); + if ((_return_value == -1) && PyErr_Occurred()) { + goto exit; + } + return_value = PyBool_FromLong((long)_return_value); + +exit: + return return_value; +} + +PyDoc_STRVAR(_sre_unicode_iscased__doc__, +"unicode_iscased($module, character, /)\n" +"--\n" +"\n"); + +#define _SRE_UNICODE_ISCASED_METHODDEF \ + {"unicode_iscased", (PyCFunction)_sre_unicode_iscased, METH_O, _sre_unicode_iscased__doc__}, + +static int +_sre_unicode_iscased_impl(PyObject *module, int character); + +static PyObject * +_sre_unicode_iscased(PyObject *module, PyObject *arg) +{ + PyObject *return_value = NULL; + int character; + int _return_value; + + if (!PyArg_Parse(arg, "i:unicode_iscased", &character)) { + goto exit; + } + _return_value = _sre_unicode_iscased_impl(module, character); + if ((_return_value == -1) && PyErr_Occurred()) { + goto exit; + } + return_value = PyBool_FromLong((long)_return_value); + +exit: + return return_value; +} + PyDoc_STRVAR(_sre_ascii_tolower__doc__, "ascii_tolower($module, character, /)\n" "--\n" @@ -715,4 +777,4 @@ _sre_SRE_Scanner_search(ScannerObject *self, PyObject *Py_UNUSED(ignored)) { return _sre_SRE_Scanner_search_impl(self); } -/*[clinic end generated code: output=811e67d7f8f5052e input=a9049054013a1b77]*/ +/*[clinic end generated code: output=5fe47c49e475cccb 
input=a9049054013a1b77]*/ From 49c40692a7b511542193363c4f34952a0f3acee7 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sun, 7 May 2017 08:31:32 +0300 Subject: [PATCH 2/4] Simplification by Lisa Roach. --- Lib/sre_compile.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py index 3940b765d34e1f..5b27dd7e15bc77 100644 --- a/Lib/sre_compile.py +++ b/Lib/sre_compile.py @@ -261,10 +261,7 @@ def _optimize_charset(charset, flags, fixup, fixes): charmap = bytearray(256) hascased = False if fixup: - if flags & SRE_FLAG_UNICODE and not flags & SRE_FLAG_ASCII: - iscased = _sre.unicode_iscased - else: - iscased = _sre.ascii_iscased + iscased = _get_iscased(flags) for op, av in charset: while True: try: From 5e6e0e8dc5a57c4af94f9636dc81bc1741a0bc16 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sun, 7 May 2017 11:02:44 +0300 Subject: [PATCH 3/4] Additional refactoring. --- Lib/sre_compile.py | 39 +++++++++++++++++---------------------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py index 5b27dd7e15bc77..9a4fadda59d653 100644 --- a/Lib/sre_compile.py +++ b/Lib/sre_compile.py @@ -107,10 +107,15 @@ def _compile(code, pattern, flags): emit(OP_IGNORE[op]) emit(lo) elif op is IN: - emit(op) + charset, hascased = _optimize_charset(av, iscased, tolower, fixes) + if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE: + emit(IN_LOC_IGNORE) + elif hascased: + emit(IN_IGNORE) + else: + emit(IN) skip = _len(code); emit(0) - op = _compile_charset(av, flags, code, tolower, fixes) - code[skip-1] = op + _compile_charset(charset, flags, code) code[skip] = _len(code) - skip elif op is ANY: if flags & SRE_FLAG_DOTALL: @@ -225,10 +230,9 @@ def _compile(code, pattern, flags): else: raise error("internal: unsupported operand type %r" % (op,)) -def _compile_charset(charset, flags, code, fixup=None, fixes=None): +def _compile_charset(charset, flags, code): # compile 
charset subprogram emit = code.append - opcs, charset = _optimize_charset(charset, flags, fixup, fixes) for op, av in charset: emit(op) if op is NEGATE: @@ -252,16 +256,13 @@ def _compile_charset(charset, flags, code, fixup=None, fixes=None): else: raise error("internal: unsupported set operator %r" % (op,)) emit(FAILURE) - return opcs -def _optimize_charset(charset, flags, fixup, fixes): +def _optimize_charset(charset, iscased=None, fixup=None, fixes=None): # internal: optimize character set out = [] tail = [] charmap = bytearray(256) hascased = False - if fixup: - iscased = _get_iscased(flags) for op, av in charset: while True: try: @@ -311,13 +312,6 @@ def _optimize_charset(charset, flags, fixup, fixes): tail.append((op, av)) break - if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE: - op = IN_LOC_IGNORE - elif hascased: - op = IN_IGNORE - else: - op = IN - # compress character map runs = [] q = 0 @@ -343,16 +337,16 @@ def _optimize_charset(charset, flags, fixup, fixes): out += tail # if the case was changed or new representation is more compact if hascased or len(out) < len(charset): - return op, out + return out, hascased # else original character set is good enough - return op, charset + return charset, hascased # use bitmap if len(charmap) == 256: data = _mk_bitmap(charmap) out.append((CHARSET, data)) out += tail - return op, out + return out, hascased # To represent a big charset, first a bitmap of all characters in the # set is constructed. 
Then, this bitmap is sliced into chunks of 256 @@ -391,7 +385,7 @@ def _optimize_charset(charset, flags, fixup, fixes): data[0:0] = [block] + _bytes_to_codes(mapping) out.append((BIGCHARSET, data)) out += tail - return op, out + return out, hascased _CODEBITS = _sre.CODESIZE * 8 MAXCODE = (1 << _CODEBITS) - 1 @@ -571,8 +565,9 @@ def _compile_info(code, pattern, flags): # generate overlap table code.extend(_generate_overlap_table(prefix)) elif charset: - op = _compile_charset(charset, flags, code) - assert op is IN + charset, hascased = _optimize_charset(charset) + assert not hascased + _compile_charset(charset, flags, code) code[skip] = len(code) - skip def isstring(obj): From af992cbe8bb8240b615570755066ee823ab204ae Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Mon, 8 May 2017 19:31:52 +0300 Subject: [PATCH 4/4] More refactoring. --- Lib/sre_compile.py | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py index 9a4fadda59d653..cebecb93c0ab80 100644 --- a/Lib/sre_compile.py +++ b/Lib/sre_compile.py @@ -268,30 +268,32 @@ def _optimize_charset(charset, iscased=None, fixup=None, fixes=None): try: if op is LITERAL: if fixup: - if not hascased and iscased(av): - hascased = True lo = fixup(av) charmap[lo] = 1 if fixes and lo in fixes: for k in fixes[lo]: charmap[k] = 1 + if not hascased and iscased(av): + hascased = True else: charmap[av] = 1 elif op is RANGE: - r = r0 = range(av[0], av[1]+1) + r = range(av[0], av[1]+1) if fixup: - r = map(fixup, r) - if fixup and fixes: - for i in r: - charmap[i] = 1 - if i in fixes: - for k in fixes[i]: - charmap[k] = 1 + if fixes: + for i in map(fixup, r): + charmap[i] = 1 + if i in fixes: + for k in fixes[i]: + charmap[k] = 1 + else: + for i in map(fixup, r): + charmap[i] = 1 + if not hascased: + hascased = any(map(iscased, r)) else: for i in r: charmap[i] = 1 - if fixup and not hascased: - hascased = any(map(iscased, r0)) elif op is 
NEGATE: out.append((op, av)) else: @@ -302,13 +304,13 @@ def _optimize_charset(charset, iscased=None, fixup=None, fixes=None): charmap += b'\0' * 0xff00 continue # Character set contains non-BMP character codes. - # There are only two ranges of cased non-BMP characters: - # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi), - # and for both ranges RANGE_IGNORE works. - if fixup and op is RANGE: - op = RANGE_IGNORE if fixup: hascased = True + # There are only two ranges of cased non-BMP characters: + # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi), + # and for both ranges RANGE_IGNORE works. + if op is RANGE: + op = RANGE_IGNORE tail.append((op, av)) break