15
15
# See the License for the specific language governing permissions and
16
16
# limitations under the License.
17
17
18
- import re
19
18
import platform
20
19
21
20
from frozendict import frozendict
@@ -32,14 +31,6 @@ def _default(obj):
32
31
obj .__class__ .__name__ )
33
32
34
33
35
- # ideally we'd set ensure_ascii=False, but the ensure_ascii codepath is so
36
- # much quicker (assuming c speedups are enabled) that it's actually much
37
- # quicker to let it do that and then substitute back (it's about 2.5x faster).
38
- #
39
- # (in any case, simplejson's ensure_ascii doesn't get U+2028 and U+2029 right,
40
- # as per https://github.com/simplejson/simplejson/issues/206).
41
- #
42
-
43
34
# Declare these in the module scope, but they get configured in
44
35
# set_json_library.
45
36
_canonical_encoder = None
@@ -56,97 +47,21 @@ def set_json_library(json_lib):
56
47
"""
57
48
global _canonical_encoder
58
49
_canonical_encoder = json_lib .JSONEncoder (
59
- ensure_ascii = True ,
50
+ ensure_ascii = False ,
60
51
separators = (',' , ':' ),
61
52
sort_keys = True ,
62
53
default = _default ,
63
54
)
64
55
65
56
global _pretty_encoder
66
57
_pretty_encoder = json_lib .JSONEncoder (
67
- ensure_ascii = True ,
58
+ ensure_ascii = False ,
68
59
indent = 4 ,
69
60
sort_keys = True ,
70
61
default = _default ,
71
62
)
72
63
73
64
74
- # This regexp matches either `\uNNNN` or `\\`. We match '\\' (and leave it
75
- # unchanged) to make sure that the regex doesn't accidentally capture the uNNNN
76
- # in `\\uNNNN`, which is an escaped backslash followed by 'uNNNN'.
77
- _U_ESCAPE = re .compile (r"\\u([0-9a-f]{4})|\\\\" )
78
-
79
-
80
- def _unascii (s ):
81
- """Unpack `\\uNNNN` escapes in 's' and encode the result as UTF-8
82
-
83
- This method takes the output of the JSONEncoder and expands any \\uNNNN
84
- escapes it finds (except for \\u0000 to \\u001F, which are converted to
85
- \\xNN escapes).
86
-
87
- For performance, it assumes that the input is valid JSON, and performs few
88
- sanity checks.
89
- """
90
-
91
- # make the fast path fast: if there are no matches in the string, the
92
- # whole thing is ascii. We have to turn it into a bytes, which is
93
- # quickest with encode('utf-8')
94
- m = _U_ESCAPE .search (s )
95
- if not m :
96
- return s .encode ('utf-8' )
97
-
98
- # appending to a string (or a bytes) is slooow, so we accumulate sections
99
- # of string result in 'chunks', and join them all together later.
100
- # (It doesn't seem to make much difference whether we accumulate
101
- # utf8-encoded bytes, or strings which we utf-8 encode after rejoining)
102
- #
103
- chunks = []
104
-
105
- # 'pos' tracks the index in 's' that we have processed into 'chunks' so
106
- # far.
107
- pos = 0
108
-
109
- while m :
110
- start = m .start ()
111
- end = m .end ()
112
-
113
- g = m .group (1 )
114
-
115
- if g is None :
116
- # escaped backslash: pass it through along with anything before the
117
- # match
118
- chunks .append (s [pos :end ])
119
- else :
120
- # \uNNNN, but we have to watch out for surrogate pairs.
121
- #
122
- # str.encode("utf-8") complains about surrogates, so we have to
123
- # unpack them.
124
- c = int (g , 16 )
125
-
126
- if c < 0x20 :
127
- # leave as a \uNNNN escape
128
- chunks .append (s [pos :end ])
129
- else :
130
- if c & 0xfc00 == 0xd800 and s [end :end + 2 ] == '\\ u' :
131
- esc2 = s [end + 2 :end + 6 ]
132
- c2 = int (esc2 , 16 )
133
- if c2 & 0xfc00 == 0xdc00 :
134
- c = 0x10000 + (((c - 0xd800 ) << 10 ) |
135
- (c2 - 0xdc00 ))
136
- end += 6
137
-
138
- chunks .append (s [pos :start ])
139
- chunks .append (chr (c ))
140
-
141
- pos = end
142
- m = _U_ESCAPE .search (s , pos )
143
-
144
- # pass through anything after the last match
145
- chunks .append (s [pos :])
146
-
147
- return ('' .join (chunks )).encode ("utf-8" )
148
-
149
-
150
65
def encode_canonical_json (json_object ):
151
66
"""Encodes the shortest UTF-8 JSON encoding with dictionary keys
152
67
lexicographically sorted by unicode code point.
@@ -157,7 +72,7 @@ def encode_canonical_json(json_object):
157
72
Returns:
158
73
bytes encoding the JSON object"""
159
74
s = _canonical_encoder .encode (json_object )
160
- return _unascii ( s )
75
+ return s . encode ( "utf-8" )
161
76
162
77
163
78
def encode_pretty_printed_json (json_object ):
0 commit comments