Skip to content

Commit 070160c

Browse files
committed
Merge remote-tracking branch 'origin/master' into clokep/iter-methods
2 parents bca38f7 + 7e16350 commit 070160c

File tree

3 files changed

+43
-81
lines changed

3 files changed

+43
-81
lines changed

canonicaljson.py

Lines changed: 3 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@
1515
# See the License for the specific language governing permissions and
1616
# limitations under the License.
1717

18-
import re
1918
import platform
2019

2120
from frozendict import frozendict
@@ -58,97 +57,21 @@ def set_json_library(json_lib):
5857

5958
global _canonical_encoder
6059
_canonical_encoder = json_lib.JSONEncoder(
61-
ensure_ascii=True,
60+
ensure_ascii=False,
6261
separators=(',', ':'),
6362
sort_keys=True,
6463
default=_default,
6564
)
6665

6766
global _pretty_encoder
6867
_pretty_encoder = json_lib.JSONEncoder(
69-
ensure_ascii=True,
68+
ensure_ascii=False,
7069
indent=4,
7170
sort_keys=True,
7271
default=_default,
7372
)
7473

7574

76-
# This regexp matches either `\uNNNN` or `\\`. We match '\\' (and leave it
77-
# unchanged) to make sure that the regex doesn't accidentally capture the uNNNN
78-
# in `\\uNNNN`, which is an escaped backslash followed by 'uNNNN'.
79-
_U_ESCAPE = re.compile(r"\\u([0-9a-f]{4})|\\\\")
80-
81-
82-
def _unascii(s):
83-
"""Unpack `\\uNNNN` escapes in 's' and encode the result as UTF-8
84-
85-
This method takes the output of the JSONEncoder and expands any \\uNNNN
86-
escapes it finds (except for \\u0000 to \\u001F, which are converted to
87-
\\xNN escapes).
88-
89-
For performance, it assumes that the input is valid JSON, and performs few
90-
sanity checks.
91-
"""
92-
93-
# make the fast path fast: if there are no matches in the string, the
94-
# whole thing is ascii. We have to turn it into a bytes, which is
95-
# quickest with encode('utf-8')
96-
m = _U_ESCAPE.search(s)
97-
if not m:
98-
return s.encode('utf-8')
99-
100-
# appending to a string (or a bytes) is slooow, so we accumulate sections
101-
# of string result in 'chunks', and join them all together later.
102-
# (It doesn't seem to make much difference whether we accumulate
103-
# utf8-encoded bytes, or strings which we utf-8 encode after rejoining)
104-
#
105-
chunks = []
106-
107-
# 'pos' tracks the index in 's' that we have processed into 'chunks' so
108-
# far.
109-
pos = 0
110-
111-
while m:
112-
start = m.start()
113-
end = m.end()
114-
115-
g = m.group(1)
116-
117-
if g is None:
118-
# escaped backslash: pass it through along with anything before the
119-
# match
120-
chunks.append(s[pos:end])
121-
else:
122-
# \uNNNN, but we have to watch out for surrogate pairs.
123-
#
124-
# str.encode("utf-8") complains about surrogates, so we have to
125-
# unpack them.
126-
c = int(g, 16)
127-
128-
if c < 0x20:
129-
# leave as a \uNNNN escape
130-
chunks.append(s[pos:end])
131-
else:
132-
if c & 0xfc00 == 0xd800 and s[end:end + 2] == '\\u':
133-
esc2 = s[end + 2:end + 6]
134-
c2 = int(esc2, 16)
135-
if c2 & 0xfc00 == 0xdc00:
136-
c = 0x10000 + (((c - 0xd800) << 10) |
137-
(c2 - 0xdc00))
138-
end += 6
139-
140-
chunks.append(s[pos:start])
141-
chunks.append(chr(c))
142-
143-
pos = end
144-
m = _U_ESCAPE.search(s, pos)
145-
146-
# pass through anything after the last match
147-
chunks.append(s[pos:])
148-
149-
return (''.join(chunks)).encode("utf-8")
150-
151-
15275
def encode_canonical_json(json_object):
15376
"""Encodes the shortest UTF-8 JSON encoding with dictionary keys
15477
lexicographically sorted by unicode code point.
@@ -159,7 +82,7 @@ def encode_canonical_json(json_object):
15982
Returns:
16083
bytes encoding the JSON object"""
16184
s = _canonical_encoder.encode(json_object)
162-
return _unascii(s)
85+
return s.encode("utf-8")
16386

16487

16588
def iterencode_canonical_json(json_object):

setup.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,9 @@ def exec_file(path_segments, name):
4646
py_modules=["canonicaljson"],
4747
description="Canonical JSON",
4848
install_requires=[
49-
"simplejson>=3.6.5",
49+
# simplejson versions before 3.14.0 had a bug with some characters
50+
# (e.g. \u2028) if ensure_ascii was set to false.
51+
"simplejson>=3.14.0",
5052
"frozendict>=1.0",
5153
],
5254
zip_safe=True,

test_canonicaljson.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,43 @@ def test_encode_canonical(self):
6767
# Iteratively encoding should work.
6868
self.assertEqual(list(iterencode_canonical_json({})), [b'{}'])
6969

70+
def test_ascii(self):
71+
"""
72+
Ensure the proper ASCII characters are escaped.
73+
74+
See https://matrix.org/docs/spec/appendices#grammar.
75+
"""
76+
# Some characters go to their common shorthands.
77+
escaped = {
78+
0x08: b'"\\b"',
79+
0x09: b'"\\t"',
80+
0x0A: b'"\\n"',
81+
0x0C: b'"\\f"',
82+
0x0D: b'"\\r"',
83+
0x22: b'"\\""',
84+
0x5C: b'"\\\\"',
85+
}
86+
for c, expected in escaped.items():
87+
self.assertEqual(encode_canonical_json(chr(c)), expected)
88+
89+
# Others go to the \uXXXX.
90+
hex_escaped = list(range(0x08)) + [0x0B] + list(range(0x0E, 0x20))
91+
for c in hex_escaped:
92+
self.assertEqual(
93+
encode_canonical_json(chr(c)),
94+
b'"\\u00%02x"' % (c,)
95+
)
96+
97+
# And other characters are passed unescaped.
98+
unescaped = (
99+
[0x20, 0x21] + list(range(0x23, 0x5C)) + list(range(0x5D, 0x7E)))
100+
for c in unescaped:
101+
c = chr(c)
102+
self.assertEqual(
103+
encode_canonical_json(c),
104+
b'"' + c.encode("ascii") + b'"'
105+
)
106+
70107
def test_encode_pretty_printed(self):
71108
self.assertEqual(encode_pretty_printed_json({}), b'{}')
72109
self.assertEqual(list(iterencode_pretty_printed_json({})), [b'{}'])

0 commit comments

Comments
 (0)