From 4c1534660747b5b0f575e9b7e256c0df6af0954a Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka <storchaka@gmail.com>
Date: Fri, 5 May 2017 20:19:00 +0300
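Subject: [PATCH 1/4] bpo-30285: Optimize case-insensitive matching and
 searching of regular expressions.

Add _sre.ascii_iscased() and _sre.unicode_iscased() and use them in
sre_compile so that, under IGNORECASE without LOCALE, literals and
character sets that contain no cased characters are compiled to the
plain case-sensitive opcodes.  The literal-prefix and charset-prefix
search optimizations are no longer disabled by IGNORECASE alone: they
are skipped for IGNORECASE+LOCALE and stop at the first cased
character otherwise.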
---
 Doc/whatsnew/3.7.rst    |   4 +
 Lib/sre_compile.py      | 159 +++++++++++++++++++++++++---------------
 Lib/test/test_re.py     |   9 +++
 Misc/NEWS               |   3 +
 Modules/_sre.c          |  34 +++++++++
 Modules/clinic/_sre.c.h |  64 +++++++++++++++-
 6 files changed, 212 insertions(+), 61 deletions(-)
diff --git a/Doc/whatsnew/3.7.rst b/Doc/whatsnew/3.7.rst
index 7edf4fc3cf4269..93be21f2f5bea5 100644
--- a/Doc/whatsnew/3.7.rst
+++ b/Doc/whatsnew/3.7.rst
@@ -196,6 +196,10 @@ Optimizations
   using the :func:`os.scandir` function.
   (Contributed by Serhiy Storchaka in :issue:`25996`.)
 
+* Optimized case-insensitive matching and searching of :mod:`regular
+  expressions <re>`.  Searching some patterns can now be up to 20 times faster.
+  (Contributed by Serhiy Storchaka in :issue:`30285`.)
+
 
 Build and C API Changes
 =======================
diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py
index db8b8a2778f582..3940b765d34e1f 100644
--- a/Lib/sre_compile.py
+++ b/Lib/sre_compile.py
@@ -69,13 +69,16 @@ def _compile(code, pattern, flags):
     REPEATING_CODES = _REPEATING_CODES
     SUCCESS_CODES = _SUCCESS_CODES
     ASSERT_CODES = _ASSERT_CODES
+    iscased = None
     tolower = None
     fixes = None
     if flags & SRE_FLAG_IGNORECASE and not flags & SRE_FLAG_LOCALE:
         if flags & SRE_FLAG_UNICODE and not flags & SRE_FLAG_ASCII:
+            iscased = _sre.unicode_iscased
             tolower = _sre.unicode_tolower
             fixes = _ignorecase_fixes
         else:
+            iscased = _sre.ascii_iscased
             tolower = _sre.ascii_tolower
     for op, av in pattern:
         if op in LITERAL_CODES:
@@ -85,6 +88,9 @@ def _compile(code, pattern, flags):
             elif flags & SRE_FLAG_LOCALE:
                 emit(OP_LOC_IGNORE[op])
                 emit(av)
+            elif not iscased(av):
+                emit(op)
+                emit(av)
             else:
                 lo = tolower(av)
                 if fixes and lo in fixes:
@@ -101,14 +107,10 @@ def _compile(code, pattern, flags):
                     emit(OP_IGNORE[op])
                     emit(lo)
         elif op is IN:
-            if not flags & SRE_FLAG_IGNORECASE:
-                emit(op)
-            elif flags & SRE_FLAG_LOCALE:
-                emit(IN_LOC_IGNORE)
-            else:
-                emit(IN_IGNORE)
+            emit(op)
             skip = _len(code); emit(0)
-            _compile_charset(av, flags, code, tolower, fixes)
+            op = _compile_charset(av, flags, code, tolower, fixes)
+            code[skip-1] = op
             code[skip] = _len(code) - skip
         elif op is ANY:
             if flags & SRE_FLAG_DOTALL:
@@ -226,7 +228,8 @@ def _compile(code, pattern, flags):
 def _compile_charset(charset, flags, code, fixup=None, fixes=None):
     # compile charset subprogram
     emit = code.append
-    for op, av in _optimize_charset(charset, fixup, fixes):
+    opcs, charset = _optimize_charset(charset, flags, fixup, fixes)
+    for op, av in charset:
         emit(op)
         if op is NEGATE:
             pass
@@ -249,17 +252,26 @@ def _compile_charset(charset, flags, code, fixup=None, fixes=None):
         else:
             raise error("internal: unsupported set operator %r" % (op,))
     emit(FAILURE)
+    return opcs
 
-def _optimize_charset(charset, fixup, fixes):
+def _optimize_charset(charset, flags, fixup, fixes):
     # internal: optimize character set
     out = []
     tail = []
     charmap = bytearray(256)
+    hascased = False
+    if fixup:
+        if flags & SRE_FLAG_UNICODE and not flags & SRE_FLAG_ASCII:
+            iscased = _sre.unicode_iscased
+        else:
+            iscased = _sre.ascii_iscased
     for op, av in charset:
         while True:
             try:
                 if op is LITERAL:
                     if fixup:
+                        if not hascased and iscased(av):
+                            hascased = True
                         lo = fixup(av)
                         charmap[lo] = 1
                         if fixes and lo in fixes:
@@ -268,7 +280,7 @@ def _optimize_charset(charset, fixup, fixes):
                     else:
                         charmap[av] = 1
                 elif op is RANGE:
-                    r = range(av[0], av[1]+1)
+                    r = r0 = range(av[0], av[1]+1)
                     if fixup:
                         r = map(fixup, r)
                     if fixup and fixes:
@@ -280,6 +292,8 @@ def _optimize_charset(charset, fixup, fixes):
                     else:
                         for i in r:
                             charmap[i] = 1
+                    if fixup and not hascased:
+                        hascased = any(map(iscased, r0))
                 elif op is NEGATE:
                     out.append((op, av))
                 else:
@@ -295,9 +309,18 @@ def _optimize_charset(charset, fixup, fixes):
                 # and for both ranges RANGE_IGNORE works.
                 if fixup and op is RANGE:
                     op = RANGE_IGNORE
+                if fixup:
+                    hascased = True
                 tail.append((op, av))
             break
 
+    if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
+        op = IN_LOC_IGNORE
+    elif hascased:
+        op = IN_IGNORE
+    else:
+        op = IN
+
     # compress character map
     runs = []
     q = 0
@@ -322,17 +345,17 @@ def _optimize_charset(charset, fixup, fixes):
                 out.append((RANGE, (p, q - 1)))
         out += tail
         # if the case was changed or new representation is more compact
-        if fixup or len(out) < len(charset):
-            return out
+        if hascased or len(out) < len(charset):
+            return op, out
         # else original character set is good enough
-        return charset
+        return op, charset
 
     # use bitmap
     if len(charmap) == 256:
         data = _mk_bitmap(charmap)
         out.append((CHARSET, data))
         out += tail
-        return out
+        return op, out
 
     # To represent a big charset, first a bitmap of all characters in the
     # set is constructed. Then, this bitmap is sliced into chunks of 256
@@ -371,7 +394,7 @@ def _optimize_charset(charset, fixup, fixes):
     data[0:0] = [block] + _bytes_to_codes(mapping)
     out.append((BIGCHARSET, data))
     out += tail
-    return out
+    return op, out
 
 _CODEBITS = _sre.CODESIZE * 8
 MAXCODE = (1 << _CODEBITS) - 1
@@ -414,19 +437,31 @@ def _generate_overlap_table(prefix):
             table[i] = idx + 1
     return table
 
-def _get_literal_prefix(pattern):
+def _get_iscased(flags):
+    if not flags & SRE_FLAG_IGNORECASE:
+        return None
+    elif flags & SRE_FLAG_UNICODE and not flags & SRE_FLAG_ASCII:
+        return _sre.unicode_iscased
+    else:
+        return _sre.ascii_iscased
+
+def _get_literal_prefix(pattern, flags):
     # look for literal prefix
     prefix = []
     prefixappend = prefix.append
     prefix_skip = None
+    iscased = _get_iscased(flags)
     for op, av in pattern.data:
         if op is LITERAL:
+            if iscased and iscased(av):
+                break
             prefixappend(av)
         elif op is SUBPATTERN:
             group, add_flags, del_flags, p = av
-            if add_flags & SRE_FLAG_IGNORECASE:
+            flags1 = (flags | add_flags) & ~del_flags
+            if flags1 & SRE_FLAG_IGNORECASE and flags1 & SRE_FLAG_LOCALE:
                 break
-            prefix1, prefix_skip1, got_all = _get_literal_prefix(p)
+            prefix1, prefix_skip1, got_all = _get_literal_prefix(p, flags1)
             if prefix_skip is None:
                 if group is not None:
                     prefix_skip = len(prefix)
@@ -441,46 +476,49 @@ def _get_literal_prefix(pattern):
         return prefix, prefix_skip, True
     return prefix, prefix_skip, False
 
-def _get_charset_prefix(pattern):
-    charset = [] # not used
-    charsetappend = charset.append
-    if pattern.data:
+def _get_charset_prefix(pattern, flags):
+    while True:
+        if not pattern.data:
+            return None
         op, av = pattern.data[0]
-        if op is SUBPATTERN:
-            group, add_flags, del_flags, p = av
-            if p and not (add_flags & SRE_FLAG_IGNORECASE):
-                op, av = p[0]
-                if op is LITERAL:
-                    charsetappend((op, av))
-                elif op is BRANCH:
-                    c = []
-                    cappend = c.append
-                    for p in av[1]:
-                        if not p:
-                            break
-                        op, av = p[0]
-                        if op is LITERAL:
-                            cappend((op, av))
-                        else:
-                            break
-                    else:
-                        charset = c
-        elif op is BRANCH:
-            c = []
-            cappend = c.append
-            for p in av[1]:
-                if not p:
-                    break
-                op, av = p[0]
-                if op is LITERAL:
-                    cappend((op, av))
-                else:
-                    break
+        if op is not SUBPATTERN:
+            break
+        group, add_flags, del_flags, pattern = av
+        flags = (flags | add_flags) & ~del_flags
+        if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
+            return None
+
+    iscased = _get_iscased(flags)
+    if op is LITERAL:
+        if iscased and iscased(av):
+            return None
+        return [(op, av)]
+    elif op is BRANCH:
+        charset = []
+        charsetappend = charset.append
+        for p in av[1]:
+            if not p:
+                return None
+            op, av = p[0]
+            if op is LITERAL and not (iscased and iscased(av)):
+                charsetappend((op, av))
             else:
-                charset = c
-        elif op is IN:
-            charset = av
-    return charset
+                return None
+        return charset
+    elif op is IN:
+        charset = av
+        if iscased:
+            for op, av in charset:
+                if op is LITERAL:
+                    if iscased(av):
+                        return None
+                elif op is RANGE:
+                    if av[1] > 0xffff:
+                        return None
+                    if any(map(iscased, range(av[0], av[1]+1))):
+                        return None
+        return charset
+    return None
 
 def _compile_info(code, pattern, flags):
     # internal: compile an info block.  in the current version,
@@ -496,12 +534,12 @@ def _compile_info(code, pattern, flags):
     prefix = []
     prefix_skip = 0
     charset = [] # not used
-    if not (flags & SRE_FLAG_IGNORECASE):
+    if not (flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE):
         # look for literal prefix
-        prefix, prefix_skip, got_all = _get_literal_prefix(pattern)
+        prefix, prefix_skip, got_all = _get_literal_prefix(pattern, flags)
         # if no prefix, look for charset prefix
         if not prefix:
-            charset = _get_charset_prefix(pattern)
+            charset = _get_charset_prefix(pattern, flags)
 ##     if prefix:
 ##         print("*** PREFIX", prefix, prefix_skip)
 ##     if charset:
@@ -536,7 +574,8 @@ def _compile_info(code, pattern, flags):
         # generate overlap table
         code.extend(_generate_overlap_table(prefix))
     elif charset:
-        _compile_charset(charset, flags, code)
+        op = _compile_charset(charset, flags, code)
+        assert op is IN
     code[skip] = len(code) - skip
 
 def isstring(obj):
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py
index b5b7cff9a2a812..3129f7e9888bc5 100644
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -891,15 +891,24 @@ def test_case_helpers(self):
             lo = ord(c.lower())
             self.assertEqual(_sre.ascii_tolower(i), lo)
             self.assertEqual(_sre.unicode_tolower(i), lo)
+            iscased = c in string.ascii_letters
+            self.assertEqual(_sre.ascii_iscased(i), iscased)
+            self.assertEqual(_sre.unicode_iscased(i), iscased)
 
         for i in list(range(128, 0x1000)) + [0x10400, 0x10428]:
             c = chr(i)
             self.assertEqual(_sre.ascii_tolower(i), i)
             if i != 0x0130:
                 self.assertEqual(_sre.unicode_tolower(i), ord(c.lower()))
+            iscased = c != c.lower() or c != c.upper()
+            self.assertFalse(_sre.ascii_iscased(i))
+            self.assertEqual(_sre.unicode_iscased(i),
+                             c != c.lower() or c != c.upper())
 
         self.assertEqual(_sre.ascii_tolower(0x0130), 0x0130)
         self.assertEqual(_sre.unicode_tolower(0x0130), ord('i'))
+        self.assertFalse(_sre.ascii_iscased(0x0130))
+        self.assertTrue(_sre.unicode_iscased(0x0130))
 
     def test_not_literal(self):
         self.assertEqual(re.search(r"\s([^a])", " b").group(1), "b")
diff --git a/Misc/NEWS b/Misc/NEWS
index 25619efddf14de..c291f279a94230 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -317,6 +317,9 @@ Extension Modules
 Library
 -------
 
+- bpo-30285: Optimized case-insensitive matching and searching of regular
+  expressions.
+
 - bpo-30243: Removed the __init__ methods of _json's scanner and encoder.
   Misusing them could cause memory leaks or crashes.  Now scanner and encoder
   objects are completely initialized in the __new__ methods.
diff --git a/Modules/_sre.c b/Modules/_sre.c
index a86c5f252b5e58..6873f1db438d41 100644
--- a/Modules/_sre.c
+++ b/Modules/_sre.c
@@ -273,6 +273,38 @@ _sre_getcodesize_impl(PyObject *module)
     return sizeof(SRE_CODE);
 }
 
+/*[clinic input]
+_sre.ascii_iscased -> bool
+
+    character: int
+    /
+
+[clinic start generated code]*/
+
+static int
+_sre_ascii_iscased_impl(PyObject *module, int character)
+/*[clinic end generated code: output=4f454b630fbd19a2 input=9f0bd952812c7ed3]*/
+{
+    unsigned int ch = (unsigned int)character;
+    return ch != sre_lower(ch) || ch != sre_upper(ch);
+}
+
+/*[clinic input]
+_sre.unicode_iscased -> bool
+
+    character: int
+    /
+
+[clinic start generated code]*/
+
+static int
+_sre_unicode_iscased_impl(PyObject *module, int character)
+/*[clinic end generated code: output=9c5ddee0dc2bc258 input=51e42c3b8dddb78e]*/
+{
+    unsigned int ch = (unsigned int)character;
+    return ch != sre_lower_unicode(ch) || ch != sre_upper_unicode(ch);
+}
+
 /*[clinic input]
 _sre.ascii_tolower -> int
 
@@ -2750,6 +2782,8 @@ static PyTypeObject Scanner_Type = {
 static PyMethodDef _functions[] = {
     _SRE_COMPILE_METHODDEF
     _SRE_GETCODESIZE_METHODDEF
+    _SRE_ASCII_ISCASED_METHODDEF
+    _SRE_UNICODE_ISCASED_METHODDEF
     _SRE_ASCII_TOLOWER_METHODDEF
     _SRE_UNICODE_TOLOWER_METHODDEF
     {NULL, NULL}
diff --git a/Modules/clinic/_sre.c.h b/Modules/clinic/_sre.c.h
index 8056eda3b73d2f..1e606860386a71 100644
--- a/Modules/clinic/_sre.c.h
+++ b/Modules/clinic/_sre.c.h
@@ -29,6 +29,68 @@ _sre_getcodesize(PyObject *module, PyObject *Py_UNUSED(ignored))
     return return_value;
 }
 
+PyDoc_STRVAR(_sre_ascii_iscased__doc__,
+"ascii_iscased($module, character, /)\n"
+"--\n"
+"\n");
+
+#define _SRE_ASCII_ISCASED_METHODDEF    \
+    {"ascii_iscased", (PyCFunction)_sre_ascii_iscased, METH_O, _sre_ascii_iscased__doc__},
+
+static int
+_sre_ascii_iscased_impl(PyObject *module, int character);
+
+static PyObject *
+_sre_ascii_iscased(PyObject *module, PyObject *arg)
+{
+    PyObject *return_value = NULL;
+    int character;
+    int _return_value;
+
+    if (!PyArg_Parse(arg, "i:ascii_iscased", &character)) {
+        goto exit;
+    }
+    _return_value = _sre_ascii_iscased_impl(module, character);
+    if ((_return_value == -1) && PyErr_Occurred()) {
+        goto exit;
+    }
+    return_value = PyBool_FromLong((long)_return_value);
+
+exit:
+    return return_value;
+}
+
+PyDoc_STRVAR(_sre_unicode_iscased__doc__,
+"unicode_iscased($module, character, /)\n"
+"--\n"
+"\n");
+
+#define _SRE_UNICODE_ISCASED_METHODDEF    \
+    {"unicode_iscased", (PyCFunction)_sre_unicode_iscased, METH_O, _sre_unicode_iscased__doc__},
+
+static int
+_sre_unicode_iscased_impl(PyObject *module, int character);
+
+static PyObject *
+_sre_unicode_iscased(PyObject *module, PyObject *arg)
+{
+    PyObject *return_value = NULL;
+    int character;
+    int _return_value;
+
+    if (!PyArg_Parse(arg, "i:unicode_iscased", &character)) {
+        goto exit;
+    }
+    _return_value = _sre_unicode_iscased_impl(module, character);
+    if ((_return_value == -1) && PyErr_Occurred()) {
+        goto exit;
+    }
+    return_value = PyBool_FromLong((long)_return_value);
+
+exit:
+    return return_value;
+}
+
 PyDoc_STRVAR(_sre_ascii_tolower__doc__,
 "ascii_tolower($module, character, /)\n"
 "--\n"
@@ -715,4 +777,4 @@ _sre_SRE_Scanner_search(ScannerObject *self, PyObject *Py_UNUSED(ignored))
 {
     return _sre_SRE_Scanner_search_impl(self);
 }
-/*[clinic end generated code: output=811e67d7f8f5052e input=a9049054013a1b77]*/
+/*[clinic end generated code: output=5fe47c49e475cccb input=a9049054013a1b77]*/

From 49c40692a7b511542193363c4f34952a0f3acee7 Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka <storchaka@gmail.com>
Date: Sun, 7 May 2017 08:31:32 +0300
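Subject: [PATCH 2/4] Simplification by Lisa Roach.

Replace the inline unicode/ascii iscased selection in
_optimize_charset() with a call to the _get_iscased(flags) helper.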
---
 Lib/sre_compile.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py
index 3940b765d34e1f..5b27dd7e15bc77 100644
--- a/Lib/sre_compile.py
+++ b/Lib/sre_compile.py
@@ -261,10 +261,7 @@ def _optimize_charset(charset, flags, fixup, fixes):
     charmap = bytearray(256)
     hascased = False
     if fixup:
-        if flags & SRE_FLAG_UNICODE and not flags & SRE_FLAG_ASCII:
-            iscased = _sre.unicode_iscased
-        else:
-            iscased = _sre.ascii_iscased
+        iscased = _get_iscased(flags)
     for op, av in charset:
         while True:
             try:

From 5e6e0e8dc5a57c4af94f9636dc81bc1741a0bc16 Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka <storchaka@gmail.com>
Date: Sun, 7 May 2017 11:02:44 +0300
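Subject: [PATCH 3/4] Additional refactoring.

_optimize_charset() now takes the iscased/fixup/fixes callables
instead of flags and returns (charset, hascased).  _compile() calls it
directly and chooses IN, IN_IGNORE or IN_LOC_IGNORE itself, so
_compile_charset() only emits an already optimized charset and no
longer returns an opcode.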
---
 Lib/sre_compile.py | 39 +++++++++++++++++----------------------
 1 file changed, 17 insertions(+), 22 deletions(-)

diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py
index 5b27dd7e15bc77..9a4fadda59d653 100644
--- a/Lib/sre_compile.py
+++ b/Lib/sre_compile.py
@@ -107,10 +107,15 @@ def _compile(code, pattern, flags):
                     emit(OP_IGNORE[op])
                     emit(lo)
         elif op is IN:
-            emit(op)
+            charset, hascased = _optimize_charset(av, iscased, tolower, fixes)
+            if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
+                emit(IN_LOC_IGNORE)
+            elif hascased:
+                emit(IN_IGNORE)
+            else:
+                emit(IN)
             skip = _len(code); emit(0)
-            op = _compile_charset(av, flags, code, tolower, fixes)
-            code[skip-1] = op
+            _compile_charset(charset, flags, code)
             code[skip] = _len(code) - skip
         elif op is ANY:
             if flags & SRE_FLAG_DOTALL:
@@ -225,10 +230,9 @@ def _compile(code, pattern, flags):
         else:
             raise error("internal: unsupported operand type %r" % (op,))
 
-def _compile_charset(charset, flags, code, fixup=None, fixes=None):
+def _compile_charset(charset, flags, code):
     # compile charset subprogram
     emit = code.append
-    opcs, charset = _optimize_charset(charset, flags, fixup, fixes)
     for op, av in charset:
         emit(op)
         if op is NEGATE:
@@ -252,16 +256,13 @@ def _compile_charset(charset, flags, code, fixup=None, fixes=None):
         else:
             raise error("internal: unsupported set operator %r" % (op,))
     emit(FAILURE)
-    return opcs
 
-def _optimize_charset(charset, flags, fixup, fixes):
+def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
     # internal: optimize character set
     out = []
     tail = []
     charmap = bytearray(256)
     hascased = False
-    if fixup:
-        iscased = _get_iscased(flags)
     for op, av in charset:
         while True:
             try:
@@ -311,13 +312,6 @@ def _optimize_charset(charset, flags, fixup, fixes):
                 tail.append((op, av))
             break
 
-    if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
-        op = IN_LOC_IGNORE
-    elif hascased:
-        op = IN_IGNORE
-    else:
-        op = IN
-
     # compress character map
     runs = []
     q = 0
@@ -343,16 +337,16 @@ def _optimize_charset(charset, flags, fixup, fixes):
         out += tail
         # if the case was changed or new representation is more compact
         if hascased or len(out) < len(charset):
-            return op, out
+            return out, hascased
         # else original character set is good enough
-        return op, charset
+        return charset, hascased
 
     # use bitmap
     if len(charmap) == 256:
         data = _mk_bitmap(charmap)
         out.append((CHARSET, data))
         out += tail
-        return op, out
+        return out, hascased
 
     # To represent a big charset, first a bitmap of all characters in the
     # set is constructed. Then, this bitmap is sliced into chunks of 256
@@ -391,7 +385,7 @@ def _optimize_charset(charset, flags, fixup, fixes):
     data[0:0] = [block] + _bytes_to_codes(mapping)
     out.append((BIGCHARSET, data))
     out += tail
-    return op, out
+    return out, hascased
 
 _CODEBITS = _sre.CODESIZE * 8
 MAXCODE = (1 << _CODEBITS) - 1
@@ -571,8 +565,9 @@ def _compile_info(code, pattern, flags):
         # generate overlap table
         code.extend(_generate_overlap_table(prefix))
     elif charset:
-        op = _compile_charset(charset, flags, code)
-        assert op is IN
+        charset, hascased = _optimize_charset(charset)
+        assert not hascased
+        _compile_charset(charset, flags, code)
     code[skip] = len(code) - skip
 
 def isstring(obj):

From af992cbe8bb8240b615570755066ee823ab204ae Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka <storchaka@gmail.com>
Date: Mon, 8 May 2017 19:31:52 +0300
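Subject: [PATCH 4/4] More refactoring.

In _optimize_charset(), group the RANGE handling under a single
"if fixup:" branch (dropping the r0 alias), run the hascased checks
after the charmap updates, and nest the RANGE_IGNORE substitution
inside the "if fixup:" block of the non-BMP fallback.  Intended as a
pure refactoring.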
---
 Lib/sre_compile.py | 36 +++++++++++++++++++-----------------
 1 file changed, 19 insertions(+), 17 deletions(-)

diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py
index 9a4fadda59d653..cebecb93c0ab80 100644
--- a/Lib/sre_compile.py
+++ b/Lib/sre_compile.py
@@ -268,30 +268,32 @@ def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
             try:
                 if op is LITERAL:
                     if fixup:
-                        if not hascased and iscased(av):
-                            hascased = True
                         lo = fixup(av)
                         charmap[lo] = 1
                         if fixes and lo in fixes:
                             for k in fixes[lo]:
                                 charmap[k] = 1
+                        if not hascased and iscased(av):
+                            hascased = True
                     else:
                         charmap[av] = 1
                 elif op is RANGE:
-                    r = r0 = range(av[0], av[1]+1)
+                    r = range(av[0], av[1]+1)
                     if fixup:
-                        r = map(fixup, r)
-                    if fixup and fixes:
-                        for i in r:
-                            charmap[i] = 1
-                            if i in fixes:
-                                for k in fixes[i]:
-                                    charmap[k] = 1
+                        if fixes:
+                            for i in map(fixup, r):
+                                charmap[i] = 1
+                                if i in fixes:
+                                    for k in fixes[i]:
+                                        charmap[k] = 1
+                        else:
+                            for i in map(fixup, r):
+                                charmap[i] = 1
+                        if not hascased:
+                            hascased = any(map(iscased, r))
                     else:
                         for i in r:
                             charmap[i] = 1
-                    if fixup and not hascased:
-                        hascased = any(map(iscased, r0))
                 elif op is NEGATE:
                     out.append((op, av))
                 else:
@@ -302,13 +304,13 @@ def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
                     charmap += b'\0' * 0xff00
                     continue
                 # Character set contains non-BMP character codes.
-                # There are only two ranges of cased non-BMP characters:
-                # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi),
-                # and for both ranges RANGE_IGNORE works.
-                if fixup and op is RANGE:
-                    op = RANGE_IGNORE
                 if fixup:
                     hascased = True
+                    # There are only two ranges of cased non-BMP characters:
+                    # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi),
+                    # and for both ranges RANGE_IGNORE works.
+                    if op is RANGE:
+                        op = RANGE_IGNORE
                 tail.append((op, av))
             break