Skip to content

Commit 2b9c51d

Browse files
committed
Encode with ensure_ascii=True and then fix up the resulting \uNNNN escapes
This turns out to be much quicker. It also allows us to fix #2.
1 parent 5b224ac commit 2b9c51d

File tree

3 files changed

+94
-5
lines changed

3 files changed

+94
-5
lines changed

canonicaljson.py

Lines changed: 80 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,10 @@
1515
# See the License for the specific language governing permissions and
1616
# limitations under the License.
1717

18-
# using simplejson rather than regular json gives approximately a 25%
18+
import re
19+
from six import unichr, PY2
20+
21+
# using simplejson rather than regular json gives approximately a 100%
1922
# performance improvement (as measured on python 2.7.12/simplejson 3.13.2)
2023
import simplejson as json
2124

@@ -31,21 +34,94 @@ def _default(obj):
3134
obj.__class__.__name__)
3235

3336

37+
# We would prefer ensure_ascii=False here, but simplejson's ensure_ascii
# codepath is so much faster (when the C speedups are available) that it
# wins overall — roughly 2.5x — even though we then have to substitute the
# \uNNNN escapes back ourselves in _unascii.
#
# It also sidesteps a simplejson bug: with ensure_ascii=False, U+2028 and
# U+2029 are not handled correctly
# (see https://github.com/simplejson/simplejson/issues/206).
#
_canonical_encoder = json.JSONEncoder(
    sort_keys=True,
    separators=(',', ':'),
    ensure_ascii=True,
    default=_default,
)

_pretty_encoder = json.JSONEncoder(
    sort_keys=True,
    indent=4,
    ensure_ascii=True,
    default=_default,
)
4857

58+
# Matches either a `\uNNNN` escape or a literal `\\`. The `\\` alternative
# is matched (and passed through unchanged) purely so that the regex cannot
# mistake the `uNNNN` in `\\uNNNN` — an escaped backslash followed by the
# text 'uNNNN' — for a unicode escape.
_U_ESCAPE = re.compile(r"\\u([0-9a-f]{4})|\\\\")


def _unascii(s):
    """Expand `\\uNNNN` escapes in `s` and return the result as UTF-8 bytes.

    Takes the output of the JSONEncoder (which, with ensure_ascii=True, is
    pure ASCII) and replaces every `\\uNNNN` escape with the character it
    denotes, folding UTF-16 surrogate pairs into single codepoints.

    For performance this assumes the input is valid JSON and performs few
    sanity checks.
    """
    # Fast path: no escapes means the string is pure ascii. On python 2 a
    # str is already bytes; on python 3 we must encode (utf-8 of ascii is
    # the identity, and encode() is the quickest way to get bytes).
    match = _U_ESCAPE.search(s)
    if not match:
        if PY2:
            return s
        return s.encode('utf-8')

    # Repeated string/bytes concatenation is slow, so collect the pieces in
    # a list and join once at the end. (Accumulating utf8-encoded bytes vs
    # joining strings and encoding afterwards makes little difference.)
    pieces = []

    # index into 's' up to which everything has already been copied out
    cursor = 0

    while match:
        hex_digits = match.group(1)
        tail = match.end()

        if hex_digits is None:
            # matched an escaped backslash: copy it through verbatim,
            # together with any text before it
            pieces.append(s[cursor:tail])
        else:
            codepoint = int(hex_digits, 16)

            # a high surrogate followed by another \uNNNN escape may be the
            # first half of a surrogate pair — fold the pair into one
            # codepoint (presumably the encoder only emits well-formed
            # pairs; a lone surrogate would be left as-is)
            if codepoint & 0xfc00 == 0xd800 and s[tail:tail + 2] == '\\u':
                low = int(s[tail + 2:tail + 6], 16)
                if low & 0xfc00 == 0xdc00:
                    codepoint = (
                        0x10000 + (((codepoint - 0xd800) << 10) |
                                   (low - 0xdc00))
                    )
                    tail += 6

            pieces.append(s[cursor:match.start()])
            pieces.append(unichr(codepoint))

        cursor = tail
        match = _U_ESCAPE.search(s, cursor)

    # anything after the final match is copied through unchanged
    pieces.append(s[cursor:])

    return ''.join(pieces).encode('utf-8')
124+
49125

50126
def encode_canonical_json(json_object):
51127
"""Encodes the shortest UTF-8 JSON encoding with dictionary keys
@@ -56,9 +132,8 @@ def encode_canonical_json(json_object):
56132
57133
Returns:
58134
bytes encoding the JSON object"""
59-
60135
s = _canonical_encoder.encode(json_object)
61-
return s.encode("UTF-8")
136+
return _unascii(s)
62137

63138

64139
def encode_pretty_printed_json(json_object):

setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ def exec_file(path_segments, name):
4747
install_requires=[
4848
"simplejson>=3.6.5",
4949
"frozendict>=0.4",
50+
"six",
5051
],
5152
long_description=read_file(("README.rst",)),
5253
keywords="json",

test_canonicaljson.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,19 @@ def test_encode_canonical(self):
3535
u"la merde amusée": u"💩",
3636
}), b'{"la merde amus\xc3\xa9e":"\xF0\x9F\x92\xA9"}')
3737

38+
# so should U+2028 and U+2029
39+
self.assertEquals(
40+
encode_canonical_json({u"spaces": u"\u2028 \u2029"}),
41+
b'{"spaces":"\xe2\x80\xa8 \xe2\x80\xa9"}',
42+
)
43+
44+
# but we need to watch out for 'u1234' after backslash, which should
45+
# get encoded to an escaped backslash, followed by u1234
46+
self.assertEquals(
47+
encode_canonical_json(u"\\u1234"),
48+
b'"\\\\u1234"',
49+
)
50+
3851
def test_encode_pretty_printed(self):
3952
self.assertEquals(encode_pretty_printed_json({}), b'{}')
4053

0 commit comments

Comments
 (0)