Skip to content

Commit 7e16350

Browse files
authored
Remove obsolete workaround for slow encoding of unicode characters. (#30)
1 parent 081a8e2 commit 7e16350

File tree

3 files changed

+43
-89
lines changed

3 files changed

+43
-89
lines changed

canonicaljson.py

Lines changed: 3 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@
1515
# See the License for the specific language governing permissions and
1616
# limitations under the License.
1717

18-
import re
1918
import platform
2019

2120
from frozendict import frozendict
@@ -32,14 +31,6 @@ def _default(obj):
3231
obj.__class__.__name__)
3332

3433

35-
# ideally we'd set ensure_ascii=False, but the ensure_ascii codepath is so
36-
# much quicker (assuming c speedups are enabled) that it's actually much
37-
# quicker to let it do that and then substitute back (it's about 2.5x faster).
38-
#
39-
# (in any case, simplejson's ensure_ascii doesn't get U+2028 and U+2029 right,
40-
# as per https://github.com/simplejson/simplejson/issues/206).
41-
#
42-
4334
# Declare these in the module scope, but they get configured in
4435
# set_json_library.
4536
_canonical_encoder = None
@@ -56,97 +47,21 @@ def set_json_library(json_lib):
5647
"""
5748
global _canonical_encoder
5849
_canonical_encoder = json_lib.JSONEncoder(
59-
ensure_ascii=True,
50+
ensure_ascii=False,
6051
separators=(',', ':'),
6152
sort_keys=True,
6253
default=_default,
6354
)
6455

6556
global _pretty_encoder
6657
_pretty_encoder = json_lib.JSONEncoder(
67-
ensure_ascii=True,
58+
ensure_ascii=False,
6859
indent=4,
6960
sort_keys=True,
7061
default=_default,
7162
)
7263

7364

74-
# This regexp matches either `\uNNNN` or `\\`. We match '\\' (and leave it
75-
# unchanged) to make sure that the regex doesn't accidentally capture the uNNNN
76-
# in `\\uNNNN`, which is an escaped backslash followed by 'uNNNN'.
77-
_U_ESCAPE = re.compile(r"\\u([0-9a-f]{4})|\\\\")
78-
79-
80-
def _unascii(s):
81-
"""Unpack `\\uNNNN` escapes in 's' and encode the result as UTF-8
82-
83-
This method takes the output of the JSONEncoder and expands any \\uNNNN
84-
escapes it finds (except for \\u0000 to \\u001F, which are converted to
85-
\\xNN escapes).
86-
87-
For performance, it assumes that the input is valid JSON, and performs few
88-
sanity checks.
89-
"""
90-
91-
# make the fast path fast: if there are no matches in the string, the
92-
# whole thing is ascii. We have to turn it into a bytes, which is
93-
# quickest with encode('utf-8')
94-
m = _U_ESCAPE.search(s)
95-
if not m:
96-
return s.encode('utf-8')
97-
98-
# appending to a string (or a bytes) is slooow, so we accumulate sections
99-
# of string result in 'chunks', and join them all together later.
100-
# (It doesn't seem to make much difference whether we accumulate
101-
# utf8-encoded bytes, or strings which we utf-8 encode after rejoining)
102-
#
103-
chunks = []
104-
105-
# 'pos' tracks the index in 's' that we have processed into 'chunks' so
106-
# far.
107-
pos = 0
108-
109-
while m:
110-
start = m.start()
111-
end = m.end()
112-
113-
g = m.group(1)
114-
115-
if g is None:
116-
# escaped backslash: pass it through along with anything before the
117-
# match
118-
chunks.append(s[pos:end])
119-
else:
120-
# \uNNNN, but we have to watch out for surrogate pairs.
121-
#
122-
# str.encode("utf-8") complains about surrogates, so we have to
123-
# unpack them.
124-
c = int(g, 16)
125-
126-
if c < 0x20:
127-
# leave as a \uNNNN escape
128-
chunks.append(s[pos:end])
129-
else:
130-
if c & 0xfc00 == 0xd800 and s[end:end + 2] == '\\u':
131-
esc2 = s[end + 2:end + 6]
132-
c2 = int(esc2, 16)
133-
if c2 & 0xfc00 == 0xdc00:
134-
c = 0x10000 + (((c - 0xd800) << 10) |
135-
(c2 - 0xdc00))
136-
end += 6
137-
138-
chunks.append(s[pos:start])
139-
chunks.append(chr(c))
140-
141-
pos = end
142-
m = _U_ESCAPE.search(s, pos)
143-
144-
# pass through anything after the last match
145-
chunks.append(s[pos:])
146-
147-
return (''.join(chunks)).encode("utf-8")
148-
149-
15065
def encode_canonical_json(json_object):
15166
"""Encodes the shortest UTF-8 JSON encoding with dictionary keys
15267
lexicographically sorted by unicode code point.
@@ -157,7 +72,7 @@ def encode_canonical_json(json_object):
15772
Returns:
15873
bytes encoding the JSON object"""
15974
s = _canonical_encoder.encode(json_object)
160-
return _unascii(s)
75+
return s.encode("utf-8")
16176

16277

16378
def encode_pretty_printed_json(json_object):

setup.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,9 @@ def exec_file(path_segments, name):
4646
py_modules=["canonicaljson"],
4747
description="Canonical JSON",
4848
install_requires=[
49-
"simplejson>=3.6.5",
49+
# simplejson versions before 3.14.0 had a bug with some characters
50+
# (e.g. \u2028) if ensure_ascii was set to false.
51+
"simplejson>=3.14.0",
5052
"frozendict>=1.0",
5153
],
5254
zip_safe=True,

test_canonicaljson.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,43 @@ def test_encode_canonical(self):
6262
b'"\\\\u1234"',
6363
)
6464

65+
def test_ascii(self):
66+
"""
67+
Ensure the proper ASCII characters are escaped.
68+
69+
See https://matrix.org/docs/spec/appendices#grammar.
70+
"""
71+
# Some characters go to their common shorthands.
72+
escaped = {
73+
0x08: b'"\\b"',
74+
0x09: b'"\\t"',
75+
0x0A: b'"\\n"',
76+
0x0C: b'"\\f"',
77+
0x0D: b'"\\r"',
78+
0x22: b'"\\""',
79+
0x5C: b'"\\\\"',
80+
}
81+
for c, expected in escaped.items():
82+
self.assertEqual(encode_canonical_json(chr(c)), expected)
83+
84+
# Others go to the \uXXXX.
85+
hex_escaped = list(range(0x08)) + [0x0B] + list(range(0x0E, 0x20))
86+
for c in hex_escaped:
87+
self.assertEqual(
88+
encode_canonical_json(chr(c)),
89+
b'"\\u00%02x"' % (c,)
90+
)
91+
92+
# And other characters are passed unescaped.
93+
unescaped = (
94+
[0x20, 0x21] + list(range(0x23, 0x5C)) + list(range(0x5D, 0x7E)))
95+
for c in unescaped:
96+
c = chr(c)
97+
self.assertEqual(
98+
encode_canonical_json(c),
99+
b'"' + c.encode("ascii") + b'"'
100+
)
101+
65102
def test_encode_pretty_printed(self):
66103
self.assertEqual(encode_pretty_printed_json({}), b'{}')
67104

0 commit comments

Comments
 (0)