Skip to content

Commit 8ed84b6

Browse files
committed
Encode with ensure_ascii=True and then fix
This turns out to be way quicker. It also allows us to fix #2.
1 parent 5b224ac commit 8ed84b6

File tree

3 files changed

+92
-5
lines changed

3 files changed

+92
-5
lines changed

canonicaljson.py

Lines changed: 78 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,10 @@
1515
# See the License for the specific language governing permissions and
1616
# limitations under the License.
1717

18-
# using simplejson rather than regular json gives approximately a 25%
18+
import re
19+
from six import unichr, PY2
20+
21+
# using simplejson rather than regular json gives approximately a 100%
1922
# performance improvement (as measured on python 2.7.12/simplejson 3.13.2)
2023
import simplejson as json
2124

@@ -31,21 +34,92 @@ def _default(obj):
3134
obj.__class__.__name__)
3235

3336

37+
# We would ideally pass ensure_ascii=False here, but with the C speedups
# enabled the ensure_ascii=True codepath is so much faster (about 2.5x) that
# it wins even though we then have to undo the \uNNNN escaping ourselves.
#
# (Besides, simplejson's ensure_ascii=False handling of U+2028/U+2029 is
# broken: https://github.com/simplejson/simplejson/issues/206.)
#
# Encoder producing the canonical form: ascii-escaped, no whitespace in the
# separators, keys sorted, with `_default` handling non-JSON-native types.
_canonical_encoder = json.JSONEncoder(
    default=_default,
    ensure_ascii=True,
    separators=(',', ':'),
    sort_keys=True,
)
4050

41-
4251
# Encoder for the pretty-printed form: ascii-escaped output, 4-space
# indentation, keys sorted, with `_default` handling non-JSON-native types.
_pretty_encoder = json.JSONEncoder(
    default=_default,
    ensure_ascii=True,
    indent=4,
    sort_keys=True,
)
4857

58+
# This regexp matches either `\uNNNN` or `\\`. We match '\\' (and leave it
59+
# unchanged) to make sure that the regex doesn't accidentally capture the uNNNN
60+
# in `\\uNNNN`, which is an escaped backslash followed by 'uNNNN'.
61+
_U_ESCAPE = re.compile(r"\\u([0-9a-f]{4})|\\\\")
62+
63+
64+
def _unascii(s):
65+
"""Unpack `\\uNNNN` escapes in 's' and encode the result as UTF-8
66+
67+
This method takes the output of the JSONEncoder and expands any \\uNNNN
68+
escapes it finds.
69+
70+
For performance, it assumes that the input is valid JSON, and performs few
71+
sanity checks.
72+
"""
73+
74+
# make the fast path fast: if there are no matches in the string, the
75+
# whole thing is ascii. On python 2, that means we're done. On python 3,
76+
# we have to turn it into a bytes, which is quickest with encode('utf-8')
77+
m = _U_ESCAPE.search(s)
78+
if not m:
79+
return s if PY2 else s.encode('utf-8')
80+
81+
# appending to a string (or a bytes) is slooow, so we accumulate sections
82+
# of string result in 'chunks', and join them all together later.
83+
# (It doesn't seem to make much difference whether we accumulate
84+
# utf8-encoded bytes, or strings which we utf-8 encode after rejoining)
85+
#
86+
chunks = []
87+
88+
# 'pos' tracks the index in 's' that we have processed into 'chunks' so
89+
# far.
90+
pos = 0
91+
92+
while m:
93+
start = m.start()
94+
end = m.end()
95+
96+
g = m.group(1)
97+
98+
if g is None:
99+
# escaped backslash: pass it through along with anything before the
100+
# match
101+
chunks.append(s[pos:end])
102+
else:
103+
# \uNNNN, but we have to watch out for surrogate pairs
104+
c = int(g, 16)
105+
106+
if c & 0xfc00 == 0xd800 and s[end:end + 2] == '\\u':
107+
esc2 = s[end + 2:end + 6]
108+
c2 = int(esc2, 16)
109+
if c2 & 0xfc00 == 0xdc00:
110+
c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00))
111+
end += 6
112+
chunks.append(s[pos:start])
113+
chunks.append(unichr(c))
114+
115+
pos = end
116+
m = _U_ESCAPE.search(s, pos)
117+
118+
# pass through anything after the last match
119+
chunks.append(s[pos:])
120+
121+
return (''.join(chunks)).encode("utf-8")
122+
49123

50124
def encode_canonical_json(json_object):
    """Encodes the shortest UTF-8 JSON encoding with dictionary keys
    lexicographically sorted by unicode code point.

    Args:
        json_object (dict): The JSON object to encode.

    Returns:
        bytes encoding the JSON object"""
    # The encoder produces an ascii str with \uNNNN escapes; _unascii turns
    # that back into UTF-8 bytes.
    return _unascii(_canonical_encoder.encode(json_object))
62135

63136

64137
def encode_pretty_printed_json(json_object):

setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ def exec_file(path_segments, name):
4747
install_requires=[
4848
"simplejson>=3.6.5",
4949
"frozendict>=0.4",
50+
"six",
5051
],
5152
long_description=read_file(("README.rst",)),
5253
keywords="json",

test_canonicaljson.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,19 @@ def test_encode_canonical(self):
3535
u"la merde amusée": u"💩",
3636
}), b'{"la merde amus\xc3\xa9e":"\xF0\x9F\x92\xA9"}')
3737

38+
# so should U+2028 and U+2029
39+
self.assertEquals(
40+
encode_canonical_json({u"spaces": u"\u2028 \u2029"}),
41+
b'{"spaces":"\xe2\x80\xa8 \xe2\x80\xa9"}',
42+
)
43+
44+
# but we need to watch out for 'u1234' after backslash, which should
45+
# get encoded to an escaped backslash, followed by u1234
46+
self.assertEquals(
47+
encode_canonical_json(u"\\u1234"),
48+
b'"\\\\u1234"',
49+
)
50+
3851
def test_encode_pretty_printed(self):
3952
self.assertEquals(encode_pretty_printed_json({}), b'{}')
4053

0 commit comments

Comments
 (0)