# See the License for the specific language governing permissions and
# limitations under the License.

-import re
import platform

from frozendict import frozendict
@@ -58,97 +57,21 @@ def set_json_library(json_lib):
    global _canonical_encoder
    _canonical_encoder = json_lib.JSONEncoder(
-        ensure_ascii=True,
+        ensure_ascii=False,
        separators=(',', ':'),
        sort_keys=True,
        default=_default,
    )

    global _pretty_encoder
    _pretty_encoder = json_lib.JSONEncoder(
-        ensure_ascii=True,
+        ensure_ascii=False,
        indent=4,
        sort_keys=True,
        default=_default,
    )
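Note: a minimal sketch of what this flag change does, using the stdlib `json` module as a stand-in for `json_lib` (the stand-in is an assumption; the real encoder is whichever library was passed to `set_json_library`):

```python
import json

# Stand-in for the configured encoders; stdlib json is assumed here.
old = json.JSONEncoder(ensure_ascii=True, separators=(',', ':'), sort_keys=True)
new = json.JSONEncoder(ensure_ascii=False, separators=(',', ':'), sort_keys=True)

print(old.encode({"k": "ü"}))  # {"k":"\u00fc"} -- escaped, ASCII-only
print(new.encode({"k": "ü"}))  # {"k":"ü"}      -- raw code point, already UTF-8-ready
```

With `ensure_ascii=False` the encoder emits non-ASCII characters directly, which is why the `\uNNNN` unescaping pass below can be deleted.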


-# This regexp matches either `\uNNNN` or `\\`. We match '\\' (and leave it
-# unchanged) to make sure that the regex doesn't accidentally capture the uNNNN
-# in `\\uNNNN`, which is an escaped backslash followed by 'uNNNN'.
-_U_ESCAPE = re.compile(r"\\u([0-9a-f]{4})|\\\\")
-
-
-def _unascii(s):
-    """Unpack `\\uNNNN` escapes in 's' and encode the result as UTF-8
-
-    This method takes the output of the JSONEncoder and expands any \\uNNNN
-    escapes it finds (except for \\u0000 to \\u001F, which are converted to
-    \\xNN escapes).
-
-    For performance, it assumes that the input is valid JSON, and performs few
-    sanity checks.
-    """
-
-    # make the fast path fast: if there are no matches in the string, the
-    # whole thing is ascii. We have to turn it into a bytes, which is
-    # quickest with encode('utf-8')
-    m = _U_ESCAPE.search(s)
-    if not m:
-        return s.encode('utf-8')
-
-    # appending to a string (or a bytes) is slooow, so we accumulate sections
-    # of string result in 'chunks', and join them all together later.
-    # (It doesn't seem to make much difference whether we accumulate
-    # utf8-encoded bytes, or strings which we utf-8 encode after rejoining)
-    #
-    chunks = []
-
-    # 'pos' tracks the index in 's' that we have processed into 'chunks' so
-    # far.
-    pos = 0
-
-    while m:
-        start = m.start()
-        end = m.end()
-
-        g = m.group(1)
-
-        if g is None:
-            # escaped backslash: pass it through along with anything before
-            # the match
-            chunks.append(s[pos:end])
-        else:
-            # \uNNNN, but we have to watch out for surrogate pairs.
-            #
-            # str.encode("utf-8") complains about surrogates, so we have to
-            # unpack them.
-            c = int(g, 16)
-
-            if c < 0x20:
-                # leave as a \uNNNN escape
-                chunks.append(s[pos:end])
-            else:
-                if c & 0xfc00 == 0xd800 and s[end:end + 2] == '\\u':
-                    esc2 = s[end + 2:end + 6]
-                    c2 = int(esc2, 16)
-                    if c2 & 0xfc00 == 0xdc00:
-                        c = 0x10000 + (((c - 0xd800) << 10) |
-                                       (c2 - 0xdc00))
-                        end += 6
-
-                chunks.append(s[pos:start])
-                chunks.append(chr(c))
-
-        pos = end
-        m = _U_ESCAPE.search(s, pos)
-
-    # pass through anything after the last match
-    chunks.append(s[pos:])
-
-    return (''.join(chunks)).encode("utf-8")
-
-
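For reference, a minimal sketch of the surrogate-pair arithmetic the deleted `_unascii` performed; the values are illustrative (U+1F600), not taken from the original code or tests:

```python
# High and low UTF-16 surrogates for U+1F600, as _unascii would parse
# them out of a "\ud83d\ude00" escape sequence.
c, c2 = 0xD83D, 0xDE00
assert c & 0xFC00 == 0xD800    # c is in the high-surrogate range
assert c2 & 0xFC00 == 0xDC00   # c2 is in the low-surrogate range

# Recombine the pair into a single code point, as in the deleted code.
code_point = 0x10000 + (((c - 0xD800) << 10) | (c2 - 0xDC00))
assert code_point == 0x1F600
assert chr(code_point).encode("utf-8") == b"\xf0\x9f\x98\x80"
```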
def encode_canonical_json(json_object):
    """Encodes the shortest UTF-8 JSON encoding with dictionary keys
    lexicographically sorted by unicode code point.
@@ -159,7 +82,7 @@ def encode_canonical_json(json_object):
    Returns:
        bytes encoding the JSON object"""
    s = _canonical_encoder.encode(json_object)
-    return _unascii(s)
+    return s.encode("utf-8")


def iterencode_canonical_json(json_object):
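A minimal sketch of the new return path, again using stdlib `json` as an assumed stand-in for the configured `_canonical_encoder`: with `ensure_ascii=False`, a plain `str.encode("utf-8")` already yields the canonical UTF-8 bytes, so no unescaping pass is needed:

```python
import json

# Assumed stand-in for _canonical_encoder after this change.
encoder = json.JSONEncoder(ensure_ascii=False, separators=(',', ':'), sort_keys=True)

def encode_canonical(obj):
    # New return path from the diff: encode to str, then UTF-8 encode.
    return encoder.encode(obj).encode("utf-8")

assert encode_canonical({"b": 1, "a": "日本語"}) == \
    b'{"a":"\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e","b":1}'
```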