Skip to content

Commit 6d336a0

Browse files
bpo-30285: Optimize case-insensitive matching and searching of regular expressions. (#1482)
1 parent f93234b commit 6d336a0

File tree

6 files changed

+215
-70
lines changed

6 files changed

+215
-70
lines changed

Doc/whatsnew/3.7.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,10 @@ Optimizations
208208
using the :func:`os.scandir` function.
209209
(Contributed by Serhiy Storchaka in :issue:`25996`.)
210210

211+
* Optimized case-insensitive matching and searching of :mod:`regular
212+
expressions <re>`. Searching some patterns can now be up to 20 times faster.
213+
(Contributed by Serhiy Storchaka in :issue:`30285`.)
214+
211215

212216
Build and C API Changes
213217
=======================

Lib/sre_compile.py

Lines changed: 102 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -69,13 +69,16 @@ def _compile(code, pattern, flags):
6969
REPEATING_CODES = _REPEATING_CODES
7070
SUCCESS_CODES = _SUCCESS_CODES
7171
ASSERT_CODES = _ASSERT_CODES
72+
iscased = None
7273
tolower = None
7374
fixes = None
7475
if flags & SRE_FLAG_IGNORECASE and not flags & SRE_FLAG_LOCALE:
7576
if flags & SRE_FLAG_UNICODE and not flags & SRE_FLAG_ASCII:
77+
iscased = _sre.unicode_iscased
7678
tolower = _sre.unicode_tolower
7779
fixes = _ignorecase_fixes
7880
else:
81+
iscased = _sre.ascii_iscased
7982
tolower = _sre.ascii_tolower
8083
for op, av in pattern:
8184
if op in LITERAL_CODES:
@@ -85,6 +88,9 @@ def _compile(code, pattern, flags):
8588
elif flags & SRE_FLAG_LOCALE:
8689
emit(OP_LOC_IGNORE[op])
8790
emit(av)
91+
elif not iscased(av):
92+
emit(op)
93+
emit(av)
8894
else:
8995
lo = tolower(av)
9096
if fixes and lo in fixes:
@@ -101,14 +107,15 @@ def _compile(code, pattern, flags):
101107
emit(OP_IGNORE[op])
102108
emit(lo)
103109
elif op is IN:
104-
if not flags & SRE_FLAG_IGNORECASE:
105-
emit(op)
106-
elif flags & SRE_FLAG_LOCALE:
110+
charset, hascased = _optimize_charset(av, iscased, tolower, fixes)
111+
if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
107112
emit(IN_LOC_IGNORE)
108-
else:
113+
elif hascased:
109114
emit(IN_IGNORE)
115+
else:
116+
emit(IN)
110117
skip = _len(code); emit(0)
111-
_compile_charset(av, flags, code, tolower, fixes)
118+
_compile_charset(charset, flags, code)
112119
code[skip] = _len(code) - skip
113120
elif op is ANY:
114121
if flags & SRE_FLAG_DOTALL:
@@ -223,10 +230,10 @@ def _compile(code, pattern, flags):
223230
else:
224231
raise error("internal: unsupported operand type %r" % (op,))
225232

226-
def _compile_charset(charset, flags, code, fixup=None, fixes=None):
233+
def _compile_charset(charset, flags, code):
227234
# compile charset subprogram
228235
emit = code.append
229-
for op, av in _optimize_charset(charset, fixup, fixes):
236+
for op, av in charset:
230237
emit(op)
231238
if op is NEGATE:
232239
pass
@@ -250,11 +257,12 @@ def _compile_charset(charset, flags, code, fixup=None, fixes=None):
250257
raise error("internal: unsupported set operator %r" % (op,))
251258
emit(FAILURE)
252259

253-
def _optimize_charset(charset, fixup, fixes):
260+
def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
254261
# internal: optimize character set
255262
out = []
256263
tail = []
257264
charmap = bytearray(256)
265+
hascased = False
258266
for op, av in charset:
259267
while True:
260268
try:
@@ -265,18 +273,24 @@ def _optimize_charset(charset, fixup, fixes):
265273
if fixes and lo in fixes:
266274
for k in fixes[lo]:
267275
charmap[k] = 1
276+
if not hascased and iscased(av):
277+
hascased = True
268278
else:
269279
charmap[av] = 1
270280
elif op is RANGE:
271281
r = range(av[0], av[1]+1)
272282
if fixup:
273-
r = map(fixup, r)
274-
if fixup and fixes:
275-
for i in r:
276-
charmap[i] = 1
277-
if i in fixes:
278-
for k in fixes[i]:
279-
charmap[k] = 1
283+
if fixes:
284+
for i in map(fixup, r):
285+
charmap[i] = 1
286+
if i in fixes:
287+
for k in fixes[i]:
288+
charmap[k] = 1
289+
else:
290+
for i in map(fixup, r):
291+
charmap[i] = 1
292+
if not hascased:
293+
hascased = any(map(iscased, r))
280294
else:
281295
for i in r:
282296
charmap[i] = 1
@@ -290,11 +304,13 @@ def _optimize_charset(charset, fixup, fixes):
290304
charmap += b'\0' * 0xff00
291305
continue
292306
# Character set contains non-BMP character codes.
293-
# There are only two ranges of cased non-BMP characters:
294-
# 10400-1044F (Deseret) and 118A0-118DF (Warang Citi),
295-
# and for both ranges RANGE_IGNORE works.
296-
if fixup and op is RANGE:
297-
op = RANGE_IGNORE
307+
if fixup:
308+
hascased = True
309+
# There are only two ranges of cased non-BMP characters:
310+
# 10400-1044F (Deseret) and 118A0-118DF (Warang Citi),
311+
# and for both ranges RANGE_IGNORE works.
312+
if op is RANGE:
313+
op = RANGE_IGNORE
298314
tail.append((op, av))
299315
break
300316

@@ -322,17 +338,17 @@ def _optimize_charset(charset, fixup, fixes):
322338
out.append((RANGE, (p, q - 1)))
323339
out += tail
324340
# if the case was changed or new representation is more compact
325-
if fixup or len(out) < len(charset):
326-
return out
341+
if hascased or len(out) < len(charset):
342+
return out, hascased
327343
# else original character set is good enough
328-
return charset
344+
return charset, hascased
329345

330346
# use bitmap
331347
if len(charmap) == 256:
332348
data = _mk_bitmap(charmap)
333349
out.append((CHARSET, data))
334350
out += tail
335-
return out
351+
return out, hascased
336352

337353
# To represent a big charset, first a bitmap of all characters in the
338354
# set is constructed. Then, this bitmap is sliced into chunks of 256
@@ -371,7 +387,7 @@ def _optimize_charset(charset, fixup, fixes):
371387
data[0:0] = [block] + _bytes_to_codes(mapping)
372388
out.append((BIGCHARSET, data))
373389
out += tail
374-
return out
390+
return out, hascased
375391

376392
_CODEBITS = _sre.CODESIZE * 8
377393
MAXCODE = (1 << _CODEBITS) - 1
@@ -414,19 +430,31 @@ def _generate_overlap_table(prefix):
414430
table[i] = idx + 1
415431
return table
416432

417-
def _get_literal_prefix(pattern):
433+
def _get_iscased(flags):
434+
if not flags & SRE_FLAG_IGNORECASE:
435+
return None
436+
elif flags & SRE_FLAG_UNICODE and not flags & SRE_FLAG_ASCII:
437+
return _sre.unicode_iscased
438+
else:
439+
return _sre.ascii_iscased
440+
441+
def _get_literal_prefix(pattern, flags):
418442
# look for literal prefix
419443
prefix = []
420444
prefixappend = prefix.append
421445
prefix_skip = None
446+
iscased = _get_iscased(flags)
422447
for op, av in pattern.data:
423448
if op is LITERAL:
449+
if iscased and iscased(av):
450+
break
424451
prefixappend(av)
425452
elif op is SUBPATTERN:
426453
group, add_flags, del_flags, p = av
427-
if add_flags & SRE_FLAG_IGNORECASE:
454+
flags1 = (flags | add_flags) & ~del_flags
455+
if flags1 & SRE_FLAG_IGNORECASE and flags1 & SRE_FLAG_LOCALE:
428456
break
429-
prefix1, prefix_skip1, got_all = _get_literal_prefix(p)
457+
prefix1, prefix_skip1, got_all = _get_literal_prefix(p, flags1)
430458
if prefix_skip is None:
431459
if group is not None:
432460
prefix_skip = len(prefix)
@@ -441,46 +469,49 @@ def _get_literal_prefix(pattern):
441469
return prefix, prefix_skip, True
442470
return prefix, prefix_skip, False
443471

444-
def _get_charset_prefix(pattern):
445-
charset = [] # not used
446-
charsetappend = charset.append
447-
if pattern.data:
472+
def _get_charset_prefix(pattern, flags):
473+
while True:
474+
if not pattern.data:
475+
return None
448476
op, av = pattern.data[0]
449-
if op is SUBPATTERN:
450-
group, add_flags, del_flags, p = av
451-
if p and not (add_flags & SRE_FLAG_IGNORECASE):
452-
op, av = p[0]
453-
if op is LITERAL:
454-
charsetappend((op, av))
455-
elif op is BRANCH:
456-
c = []
457-
cappend = c.append
458-
for p in av[1]:
459-
if not p:
460-
break
461-
op, av = p[0]
462-
if op is LITERAL:
463-
cappend((op, av))
464-
else:
465-
break
466-
else:
467-
charset = c
468-
elif op is BRANCH:
469-
c = []
470-
cappend = c.append
471-
for p in av[1]:
472-
if not p:
473-
break
474-
op, av = p[0]
475-
if op is LITERAL:
476-
cappend((op, av))
477-
else:
478-
break
477+
if op is not SUBPATTERN:
478+
break
479+
group, add_flags, del_flags, pattern = av
480+
flags = (flags | add_flags) & ~del_flags
481+
if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
482+
return None
483+
484+
iscased = _get_iscased(flags)
485+
if op is LITERAL:
486+
if iscased and iscased(av):
487+
return None
488+
return [(op, av)]
489+
elif op is BRANCH:
490+
charset = []
491+
charsetappend = charset.append
492+
for p in av[1]:
493+
if not p:
494+
return None
495+
op, av = p[0]
496+
if op is LITERAL and not (iscased and iscased(av)):
497+
charsetappend((op, av))
479498
else:
480-
charset = c
481-
elif op is IN:
482-
charset = av
483-
return charset
499+
return None
500+
return charset
501+
elif op is IN:
502+
charset = av
503+
if iscased:
504+
for op, av in charset:
505+
if op is LITERAL:
506+
if iscased(av):
507+
return None
508+
elif op is RANGE:
509+
if av[1] > 0xffff:
510+
return None
511+
if any(map(iscased, range(av[0], av[1]+1))):
512+
return None
513+
return charset
514+
return None
484515

485516
def _compile_info(code, pattern, flags):
486517
# internal: compile an info block. in the current version,
@@ -496,12 +527,12 @@ def _compile_info(code, pattern, flags):
496527
prefix = []
497528
prefix_skip = 0
498529
charset = [] # not used
499-
if not (flags & SRE_FLAG_IGNORECASE):
530+
if not (flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE):
500531
# look for literal prefix
501-
prefix, prefix_skip, got_all = _get_literal_prefix(pattern)
532+
prefix, prefix_skip, got_all = _get_literal_prefix(pattern, flags)
502533
# if no prefix, look for charset prefix
503534
if not prefix:
504-
charset = _get_charset_prefix(pattern)
535+
charset = _get_charset_prefix(pattern, flags)
505536
## if prefix:
506537
## print("*** PREFIX", prefix, prefix_skip)
507538
## if charset:
@@ -536,6 +567,8 @@ def _compile_info(code, pattern, flags):
536567
# generate overlap table
537568
code.extend(_generate_overlap_table(prefix))
538569
elif charset:
570+
charset, hascased = _optimize_charset(charset)
571+
assert not hascased
539572
_compile_charset(charset, flags, code)
540573
code[skip] = len(code) - skip
541574

Lib/test/test_re.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -891,15 +891,24 @@ def test_case_helpers(self):
891891
lo = ord(c.lower())
892892
self.assertEqual(_sre.ascii_tolower(i), lo)
893893
self.assertEqual(_sre.unicode_tolower(i), lo)
894+
iscased = c in string.ascii_letters
895+
self.assertEqual(_sre.ascii_iscased(i), iscased)
896+
self.assertEqual(_sre.unicode_iscased(i), iscased)
894897

895898
for i in list(range(128, 0x1000)) + [0x10400, 0x10428]:
896899
c = chr(i)
897900
self.assertEqual(_sre.ascii_tolower(i), i)
898901
if i != 0x0130:
899902
self.assertEqual(_sre.unicode_tolower(i), ord(c.lower()))
903+
iscased = c != c.lower() or c != c.upper()
904+
self.assertFalse(_sre.ascii_iscased(i))
905+
self.assertEqual(_sre.unicode_iscased(i),
906+
c != c.lower() or c != c.upper())
900907

901908
self.assertEqual(_sre.ascii_tolower(0x0130), 0x0130)
902909
self.assertEqual(_sre.unicode_tolower(0x0130), ord('i'))
910+
self.assertFalse(_sre.ascii_iscased(0x0130))
911+
self.assertTrue(_sre.unicode_iscased(0x0130))
903912

904913
def test_not_literal(self):
905914
self.assertEqual(re.search(r"\s([^a])", " b").group(1), "b")

Misc/NEWS

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -320,6 +320,9 @@ Extension Modules
320320
Library
321321
-------
322322

323+
- bpo-30285: Optimized case-insensitive matching and searching of regular
324+
expressions.
325+
323326
- bpo-29990: Fix range checking in GB18030 decoder. Original patch by Ma Lin.
324327

325328
- bpo-29979: rewrite cgi.parse_multipart, reusing the FieldStorage class and

Modules/_sre.c

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -273,6 +273,38 @@ _sre_getcodesize_impl(PyObject *module)
273273
return sizeof(SRE_CODE);
274274
}
275275

276+
/*[clinic input]
277+
_sre.ascii_iscased -> bool
278+
279+
character: int
280+
/
281+
282+
[clinic start generated code]*/
283+
284+
static int
285+
_sre_ascii_iscased_impl(PyObject *module, int character)
286+
/*[clinic end generated code: output=4f454b630fbd19a2 input=9f0bd952812c7ed3]*/
287+
{
288+
unsigned int ch = (unsigned int)character;
289+
return ch != sre_lower(ch) || ch != sre_upper(ch);
290+
}
291+
292+
/*[clinic input]
293+
_sre.unicode_iscased -> bool
294+
295+
character: int
296+
/
297+
298+
[clinic start generated code]*/
299+
300+
static int
301+
_sre_unicode_iscased_impl(PyObject *module, int character)
302+
/*[clinic end generated code: output=9c5ddee0dc2bc258 input=51e42c3b8dddb78e]*/
303+
{
304+
unsigned int ch = (unsigned int)character;
305+
return ch != sre_lower_unicode(ch) || ch != sre_upper_unicode(ch);
306+
}
307+
276308
/*[clinic input]
277309
_sre.ascii_tolower -> int
278310
@@ -2750,6 +2782,8 @@ static PyTypeObject Scanner_Type = {
27502782
static PyMethodDef _functions[] = {
27512783
_SRE_COMPILE_METHODDEF
27522784
_SRE_GETCODESIZE_METHODDEF
2785+
_SRE_ASCII_ISCASED_METHODDEF
2786+
_SRE_UNICODE_ISCASED_METHODDEF
27532787
_SRE_ASCII_TOLOWER_METHODDEF
27542788
_SRE_UNICODE_TOLOWER_METHODDEF
27552789
{NULL, NULL}

0 commit comments

Comments
 (0)