Skip to content

Commit 7e16350

Browse files
authored
Remove obsolete workaround for slow encoding of unicode characters. (#30)
1 parent 081a8e2 commit 7e16350

File tree

3 files changed

+43
-89
lines changed

3 files changed

+43
-89
lines changed

canonicaljson.py

Lines changed: 3 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@
1515
# See the License for the specific language governing permissions and
1616
# limitations under the License.
1717

18-
import re
1918
import platform
2019

2120
from frozendict import frozendict
@@ -32,14 +31,6 @@ def _default(obj):
3231
obj.__class__.__name__)
3332

3433

35-
# ideally we'd set ensure_ascii=False, but the ensure_ascii codepath is so
36-
# much quicker (assuming c speedups are enabled) that it's actually much
37-
# quicker to let it do that and then substitute back (it's about 2.5x faster).
38-
#
39-
# (in any case, simplejson's ensure_ascii doesn't get U+2028 and U+2029 right,
40-
# as per https://github.com/simplejson/simplejson/issues/206).
41-
#
42-
4334
# Declare these in the module scope, but they get configured in
4435
# set_json_library.
4536
_canonical_encoder = None
@@ -56,97 +47,21 @@ def set_json_library(json_lib):
5647
"""
5748
global _canonical_encoder
5849
_canonical_encoder = json_lib.JSONEncoder(
59-
ensure_ascii=True,
50+
ensure_ascii=False,
6051
separators=(',', ':'),
6152
sort_keys=True,
6253
default=_default,
6354
)
6455

6556
global _pretty_encoder
6657
_pretty_encoder = json_lib.JSONEncoder(
67-
ensure_ascii=True,
58+
ensure_ascii=False,
6859
indent=4,
6960
sort_keys=True,
7061
default=_default,
7162
)
7263

7364

74-
# This regexp matches either `\uNNNN` or `\\`. We match '\\' (and leave it
75-
# unchanged) to make sure that the regex doesn't accidentally capture the uNNNN
76-
# in `\\uNNNN`, which is an escaped backslash followed by 'uNNNN'.
77-
_U_ESCAPE = re.compile(r"\\u([0-9a-f]{4})|\\\\")
78-
79-
80-
def _unascii(s):
81-
"""Unpack `\\uNNNN` escapes in 's' and encode the result as UTF-8
82-
83-
This method takes the output of the JSONEncoder and expands any \\uNNNN
84-
escapes it finds (except for \\u0000 to \\u001F, which are converted to
85-
\\xNN escapes).
86-
87-
For performance, it assumes that the input is valid JSON, and performs few
88-
sanity checks.
89-
"""
90-
91-
# make the fast path fast: if there are no matches in the string, the
92-
# whole thing is ascii. We have to turn it into a bytes, which is
93-
# quickest with encode('utf-8')
94-
m = _U_ESCAPE.search(s)
95-
if not m:
96-
return s.encode('utf-8')
97-
98-
# appending to a string (or a bytes) is slooow, so we accumulate sections
99-
# of string result in 'chunks', and join them all together later.
100-
# (It doesn't seem to make much difference whether we accumulate
101-
# utf8-encoded bytes, or strings which we utf-8 encode after rejoining)
102-
#
103-
chunks = []
104-
105-
# 'pos' tracks the index in 's' that we have processed into 'chunks' so
106-
# far.
107-
pos = 0
108-
109-
while m:
110-
start = m.start()
111-
end = m.end()
112-
113-
g = m.group(1)
114-
115-
if g is None:
116-
# escaped backslash: pass it through along with anything before the
117-
# match
118-
chunks.append(s[pos:end])
119-
else:
120-
# \uNNNN, but we have to watch out for surrogate pairs.
121-
#
122-
# str.encode("utf-8") complains about surrogates, so we have to
123-
# unpack them.
124-
c = int(g, 16)
125-
126-
if c < 0x20:
127-
# leave as a \uNNNN escape
128-
chunks.append(s[pos:end])
129-
else:
130-
if c & 0xfc00 == 0xd800 and s[end:end + 2] == '\\u':
131-
esc2 = s[end + 2:end + 6]
132-
c2 = int(esc2, 16)
133-
if c2 & 0xfc00 == 0xdc00:
134-
c = 0x10000 + (((c - 0xd800) << 10) |
135-
(c2 - 0xdc00))
136-
end += 6
137-
138-
chunks.append(s[pos:start])
139-
chunks.append(chr(c))
140-
141-
pos = end
142-
m = _U_ESCAPE.search(s, pos)
143-
144-
# pass through anything after the last match
145-
chunks.append(s[pos:])
146-
147-
return (''.join(chunks)).encode("utf-8")
148-
149-
15065
def encode_canonical_json(json_object):
15166
"""Encodes the shortest UTF-8 JSON encoding with dictionary keys
15267
lexicographically sorted by unicode code point.
@@ -157,7 +72,7 @@ def encode_canonical_json(json_object):
15772
Returns:
15873
bytes encoding the JSON object"""
15974
s = _canonical_encoder.encode(json_object)
160-
return _unascii(s)
75+
return s.encode("utf-8")
16176

16277

16378
def encode_pretty_printed_json(json_object):

setup.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,9 @@ def exec_file(path_segments, name):
4646
py_modules=["canonicaljson"],
4747
description="Canonical JSON",
4848
install_requires=[
49-
"simplejson>=3.6.5",
49+
# simplejson versions before 3.14.0 had a bug with some characters
50+
# (e.g. \u2028) if ensure_ascii was set to false.
51+
"simplejson>=3.14.0",
5052
"frozendict>=1.0",
5153
],
5254
zip_safe=True,

test_canonicaljson.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,43 @@ def test_encode_canonical(self):
6262
b'"\\\\u1234"',
6363
)
6464

65+
def test_ascii(self):
66+
"""
67+
Ensure the proper ASCII characters are escaped.
68+
69+
See https://matrix.org/docs/spec/appendices#grammar.
70+
"""
71+
# Some characters go to their common shorthands.
72+
escaped = {
73+
0x08: b'"\\b"',
74+
0x09: b'"\\t"',
75+
0x0A: b'"\\n"',
76+
0x0C: b'"\\f"',
77+
0x0D: b'"\\r"',
78+
0x22: b'"\\""',
79+
0x5C: b'"\\\\"',
80+
}
81+
for c, expected in escaped.items():
82+
self.assertEqual(encode_canonical_json(chr(c)), expected)
83+
84+
# Others go to the \uXXXX.
85+
hex_escaped = list(range(0x08)) + [0x0B] + list(range(0x0E, 0x20))
86+
for c in hex_escaped:
87+
self.assertEqual(
88+
encode_canonical_json(chr(c)),
89+
b'"\\u00%02x"' % (c,)
90+
)
91+
92+
# And other characters are passed unescaped.
93+
unescaped = (
94+
[0x20, 0x21] + list(range(0x23, 0x5C)) + list(range(0x5D, 0x7E)))
95+
for c in unescaped:
96+
c = chr(c)
97+
self.assertEqual(
98+
encode_canonical_json(c),
99+
b'"' + c.encode("ascii") + b'"'
100+
)
101+
65102
def test_encode_pretty_printed(self):
66103
self.assertEqual(encode_pretty_printed_json({}), b'{}')
67104

0 commit comments

Comments
 (0)