@@ -69,13 +69,16 @@ def _compile(code, pattern, flags):
69
69
REPEATING_CODES = _REPEATING_CODES
70
70
SUCCESS_CODES = _SUCCESS_CODES
71
71
ASSERT_CODES = _ASSERT_CODES
72
+ iscased = None
72
73
tolower = None
73
74
fixes = None
74
75
if flags & SRE_FLAG_IGNORECASE and not flags & SRE_FLAG_LOCALE :
75
76
if flags & SRE_FLAG_UNICODE and not flags & SRE_FLAG_ASCII :
77
+ iscased = _sre .unicode_iscased
76
78
tolower = _sre .unicode_tolower
77
79
fixes = _ignorecase_fixes
78
80
else :
81
+ iscased = _sre .ascii_iscased
79
82
tolower = _sre .ascii_tolower
80
83
for op , av in pattern :
81
84
if op in LITERAL_CODES :
@@ -85,6 +88,9 @@ def _compile(code, pattern, flags):
85
88
elif flags & SRE_FLAG_LOCALE :
86
89
emit (OP_LOC_IGNORE [op ])
87
90
emit (av )
91
+ elif not iscased (av ):
92
+ emit (op )
93
+ emit (av )
88
94
else :
89
95
lo = tolower (av )
90
96
if fixes and lo in fixes :
@@ -101,14 +107,15 @@ def _compile(code, pattern, flags):
101
107
emit (OP_IGNORE [op ])
102
108
emit (lo )
103
109
elif op is IN :
104
- if not flags & SRE_FLAG_IGNORECASE :
105
- emit (op )
106
- elif flags & SRE_FLAG_LOCALE :
110
+ charset , hascased = _optimize_charset (av , iscased , tolower , fixes )
111
+ if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE :
107
112
emit (IN_LOC_IGNORE )
108
- else :
113
+ elif hascased :
109
114
emit (IN_IGNORE )
115
+ else :
116
+ emit (IN )
110
117
skip = _len (code ); emit (0 )
111
- _compile_charset (av , flags , code , tolower , fixes )
118
+ _compile_charset (charset , flags , code )
112
119
code [skip ] = _len (code ) - skip
113
120
elif op is ANY :
114
121
if flags & SRE_FLAG_DOTALL :
@@ -223,10 +230,10 @@ def _compile(code, pattern, flags):
223
230
else :
224
231
raise error ("internal: unsupported operand type %r" % (op ,))
225
232
226
- def _compile_charset (charset , flags , code , fixup = None , fixes = None ):
233
+ def _compile_charset (charset , flags , code ):
227
234
# compile charset subprogram
228
235
emit = code .append
229
- for op , av in _optimize_charset ( charset , fixup , fixes ) :
236
+ for op , av in charset :
230
237
emit (op )
231
238
if op is NEGATE :
232
239
pass
@@ -250,11 +257,12 @@ def _compile_charset(charset, flags, code, fixup=None, fixes=None):
250
257
raise error ("internal: unsupported set operator %r" % (op ,))
251
258
emit (FAILURE )
252
259
253
- def _optimize_charset (charset , fixup , fixes ):
260
+ def _optimize_charset (charset , iscased = None , fixup = None , fixes = None ):
254
261
# internal: optimize character set
255
262
out = []
256
263
tail = []
257
264
charmap = bytearray (256 )
265
+ hascased = False
258
266
for op , av in charset :
259
267
while True :
260
268
try :
@@ -265,18 +273,24 @@ def _optimize_charset(charset, fixup, fixes):
265
273
if fixes and lo in fixes :
266
274
for k in fixes [lo ]:
267
275
charmap [k ] = 1
276
+ if not hascased and iscased (av ):
277
+ hascased = True
268
278
else :
269
279
charmap [av ] = 1
270
280
elif op is RANGE :
271
281
r = range (av [0 ], av [1 ]+ 1 )
272
282
if fixup :
273
- r = map (fixup , r )
274
- if fixup and fixes :
275
- for i in r :
276
- charmap [i ] = 1
277
- if i in fixes :
278
- for k in fixes [i ]:
279
- charmap [k ] = 1
283
+ if fixes :
284
+ for i in map (fixup , r ):
285
+ charmap [i ] = 1
286
+ if i in fixes :
287
+ for k in fixes [i ]:
288
+ charmap [k ] = 1
289
+ else :
290
+ for i in map (fixup , r ):
291
+ charmap [i ] = 1
292
+ if not hascased :
293
+ hascased = any (map (iscased , r ))
280
294
else :
281
295
for i in r :
282
296
charmap [i ] = 1
@@ -290,11 +304,13 @@ def _optimize_charset(charset, fixup, fixes):
290
304
charmap += b'\0 ' * 0xff00
291
305
continue
292
306
# Character set contains non-BMP character codes.
293
- # There are only two ranges of cased non-BMP characters:
294
- # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi),
295
- # and for both ranges RANGE_IGNORE works.
296
- if fixup and op is RANGE :
297
- op = RANGE_IGNORE
307
+ if fixup :
308
+ hascased = True
309
+ # There are only two ranges of cased non-BMP characters:
310
+ # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi),
311
+ # and for both ranges RANGE_IGNORE works.
312
+ if op is RANGE :
313
+ op = RANGE_IGNORE
298
314
tail .append ((op , av ))
299
315
break
300
316
@@ -322,17 +338,17 @@ def _optimize_charset(charset, fixup, fixes):
322
338
out .append ((RANGE , (p , q - 1 )))
323
339
out += tail
324
340
# if the case was changed or new representation is more compact
325
- if fixup or len (out ) < len (charset ):
326
- return out
341
+ if hascased or len (out ) < len (charset ):
342
+ return out , hascased
327
343
# else original character set is good enough
328
- return charset
344
+ return charset , hascased
329
345
330
346
# use bitmap
331
347
if len (charmap ) == 256 :
332
348
data = _mk_bitmap (charmap )
333
349
out .append ((CHARSET , data ))
334
350
out += tail
335
- return out
351
+ return out , hascased
336
352
337
353
# To represent a big charset, first a bitmap of all characters in the
338
354
# set is constructed. Then, this bitmap is sliced into chunks of 256
@@ -371,7 +387,7 @@ def _optimize_charset(charset, fixup, fixes):
371
387
data [0 :0 ] = [block ] + _bytes_to_codes (mapping )
372
388
out .append ((BIGCHARSET , data ))
373
389
out += tail
374
- return out
390
+ return out , hascased
375
391
376
392
_CODEBITS = _sre .CODESIZE * 8
377
393
MAXCODE = (1 << _CODEBITS ) - 1
@@ -414,19 +430,31 @@ def _generate_overlap_table(prefix):
414
430
table [i ] = idx + 1
415
431
return table
416
432
417
- def _get_literal_prefix (pattern ):
433
def _get_iscased(flags):
    # internal: select the "is this character cased?" predicate for *flags*.
    #
    # Returns None when SRE_FLAG_IGNORECASE is not set, so callers can use
    # the result itself as the "case-insensitive matching is on" test.
    # Otherwise returns _sre.unicode_iscased when SRE_FLAG_UNICODE is set
    # and SRE_FLAG_ASCII is not, and _sre.ascii_iscased in every other case.
    if flags & SRE_FLAG_IGNORECASE:
        unicode_rules = flags & SRE_FLAG_UNICODE and not flags & SRE_FLAG_ASCII
        return _sre.unicode_iscased if unicode_rules else _sre.ascii_iscased
    return None
440
+
441
+ def _get_literal_prefix (pattern , flags ):
418
442
# look for literal prefix
419
443
prefix = []
420
444
prefixappend = prefix .append
421
445
prefix_skip = None
446
+ iscased = _get_iscased (flags )
422
447
for op , av in pattern .data :
423
448
if op is LITERAL :
449
+ if iscased and iscased (av ):
450
+ break
424
451
prefixappend (av )
425
452
elif op is SUBPATTERN :
426
453
group , add_flags , del_flags , p = av
427
- if add_flags & SRE_FLAG_IGNORECASE :
454
+ flags1 = (flags | add_flags ) & ~ del_flags
455
+ if flags1 & SRE_FLAG_IGNORECASE and flags1 & SRE_FLAG_LOCALE :
428
456
break
429
- prefix1 , prefix_skip1 , got_all = _get_literal_prefix (p )
457
+ prefix1 , prefix_skip1 , got_all = _get_literal_prefix (p , flags1 )
430
458
if prefix_skip is None :
431
459
if group is not None :
432
460
prefix_skip = len (prefix )
@@ -441,46 +469,49 @@ def _get_literal_prefix(pattern):
441
469
return prefix , prefix_skip , True
442
470
return prefix , prefix_skip , False
443
471
444
def _get_charset_prefix(pattern, flags):
    # internal: look for a charset prefix, i.e. a set of (op, av) items
    # describing the first element of *pattern*.
    #
    # Returns a list of charset items, or None when no prefix can be used:
    # the pattern is empty, a leading group switches on IGNORECASE together
    # with LOCALE, the first element is of an unsupported kind, or (with
    # IGNORECASE in effect) a candidate character is cased.

    # Descend through leading subpatterns, folding each group's added and
    # removed flags into the effective flags as we go.
    node = pattern
    while node.data:
        op, av = node.data[0]
        if op is not SUBPATTERN:
            break
        _group, add_flags, del_flags, node = av
        flags = (flags | add_flags) & ~del_flags
        if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
            return None
    else:
        # Ran into an empty (sub)pattern: there is no first element.
        return None

    # None unless IGNORECASE is in effect for the element we stopped at.
    iscased = _get_iscased(flags)

    if op is LITERAL:
        return None if iscased and iscased(av) else [(op, av)]

    if op is BRANCH:
        # Every alternative must begin with an acceptable literal.
        literals = []
        for alternative in av[1]:
            if not alternative:
                return None
            first_op, first_av = alternative[0]
            if first_op is not LITERAL or (iscased and iscased(first_av)):
                return None
            literals.append((first_op, first_av))
        return literals

    if op is IN:
        if iscased:
            for item_op, item_av in av:
                if item_op is LITERAL:
                    if iscased(item_av):
                        return None
                elif item_op is RANGE:
                    # A range reaching past U+FFFF is rejected outright
                    # rather than scanned; smaller ranges are checked
                    # code by code.
                    if item_av[1] > 0xffff:
                        return None
                    if any(map(iscased, range(item_av[0], item_av[1] + 1))):
                        return None
        # The set is handed back as-is (same list object as in the pattern).
        return av

    return None
484
515
485
516
def _compile_info (code , pattern , flags ):
486
517
# internal: compile an info block. in the current version,
@@ -496,12 +527,12 @@ def _compile_info(code, pattern, flags):
496
527
prefix = []
497
528
prefix_skip = 0
498
529
charset = [] # not used
499
- if not (flags & SRE_FLAG_IGNORECASE ):
530
+ if not (flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE ):
500
531
# look for literal prefix
501
- prefix , prefix_skip , got_all = _get_literal_prefix (pattern )
532
+ prefix , prefix_skip , got_all = _get_literal_prefix (pattern , flags )
502
533
# if no prefix, look for charset prefix
503
534
if not prefix :
504
- charset = _get_charset_prefix (pattern )
535
+ charset = _get_charset_prefix (pattern , flags )
505
536
## if prefix:
506
537
## print("*** PREFIX", prefix, prefix_skip)
507
538
## if charset:
@@ -536,6 +567,8 @@ def _compile_info(code, pattern, flags):
536
567
# generate overlap table
537
568
code .extend (_generate_overlap_table (prefix ))
538
569
elif charset :
570
+ charset , hascased = _optimize_charset (charset )
571
+ assert not hascased
539
572
_compile_charset (charset , flags , code )
540
573
code [skip ] = len (code ) - skip
541
574
0 commit comments