 # See the License for the specific language governing permissions and
 # limitations under the License.

-# using simplejson rather than regular json gives approximately a 25%
+import re
+from six import unichr, PY2
+
+# using simplejson rather than regular json gives approximately a 100%
 # performance improvement (as measured on python 2.7.12/simplejson 3.13.2)
 import simplejson as json

@@ -31,21 +34,92 @@ def _default(obj):
                     obj.__class__.__name__)


+# ideally we'd set ensure_ascii=False, but the ensure_ascii codepath is so
+# much quicker (assuming c speedups are enabled) that it's actually much
+# quicker to let it do that and then substitute back (it's about 2.5x faster).
+#
+# (in any case, simplejson's ensure_ascii doesn't get U+2028 and U+2029 right,
+# as per https://github.com/simplejson/simplejson/issues/206).
+#
 _canonical_encoder = json.JSONEncoder(
-    ensure_ascii=False,
+    ensure_ascii=True,
     separators=(',', ':'),
     sort_keys=True,
     default=_default,
 )

-
 _pretty_encoder = json.JSONEncoder(
     ensure_ascii=True,
     indent=4,
     sort_keys=True,
     default=_default,
 )

+# This regexp matches either `\uNNNN` or `\\`. We match '\\' (and leave it
+# unchanged) to make sure that the regex doesn't accidentally capture the uNNNN
+# in `\\uNNNN`, which is an escaped backslash followed by 'uNNNN'.
+_U_ESCAPE = re.compile(r"\\u([0-9a-f]{4})|\\\\")
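+# (illustrative note, not part of the original patch: on the six characters
+# r'\u00e9' this regex matches and captures '00e9', while on r'\\u00e9' the
+# second alternative consumes the escaped backslash and leaves 'u00e9' alone.)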
+
+
+def _unascii(s):
+    """Unpack `\\uNNNN` escapes in 's' and encode the result as UTF-8
+
+    This method takes the output of the JSONEncoder and expands any \\uNNNN
+    escapes it finds.
+
+    For performance, it assumes that the input is valid JSON, and performs few
+    sanity checks.
+    """
+
+    # make the fast path fast: if there are no matches in the string, the
+    # whole thing is ascii. On python 2, that means we're done. On python 3,
+    # we have to turn it into a bytes, which is quickest with encode('utf-8')
+    m = _U_ESCAPE.search(s)
+    if not m:
+        return s if PY2 else s.encode('utf-8')
+
+    # appending to a string (or a bytes) is slooow, so we accumulate sections
+    # of string result in 'chunks', and join them all together later.
+    # (It doesn't seem to make much difference whether we accumulate
+    # utf8-encoded bytes, or strings which we utf-8 encode after rejoining)
+    #
+    chunks = []
+
+    # 'pos' tracks the index in 's' that we have processed into 'chunks' so
+    # far.
+    pos = 0
+
+    while m:
+        start = m.start()
+        end = m.end()
+
+        g = m.group(1)
+
+        if g is None:
+            # escaped backslash: pass it through along with anything before the
+            # match
+            chunks.append(s[pos:end])
+        else:
+            # \uNNNN, but we have to watch out for surrogate pairs
+            c = int(g, 16)
+
+            if c & 0xfc00 == 0xd800 and s[end:end + 2] == '\\u':
+                esc2 = s[end + 2:end + 6]
+                c2 = int(esc2, 16)
+                if c2 & 0xfc00 == 0xdc00:
+                    c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00))
+                    end += 6
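+                    # worked example (added note, not in the original patch):
+                    # for '\ud83d\ude00', c=0xd83d and c2=0xde00, giving
+                    # c = 0x10000 + ((0x3d << 10) | 0x200) = 0x1f600 (U+1F600)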
+            chunks.append(s[pos:start])
+            chunks.append(unichr(c))
+
+        pos = end
+        m = _U_ESCAPE.search(s, pos)
+
+    # pass through anything after the last match
+    chunks.append(s[pos:])
+
+    return (''.join(chunks)).encode("utf-8")
+

 def encode_canonical_json(json_object):
     """Encodes the shortest UTF-8 JSON encoding with dictionary keys
@@ -56,9 +130,8 @@ def encode_canonical_json(json_object):

     Returns:
         bytes encoding the JSON object"""
-
     s = _canonical_encoder.encode(json_object)
-    return s.encode("UTF-8")
+    return _unascii(s)


 def encode_pretty_printed_json(json_object):
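A rough usage sketch of the patched function (illustrative, not part of the
commit): with ensure_ascii=True the encoder emits \uNNNN escapes, and
_unascii then expands them back into shortest-form UTF-8 bytes, including
recombining surrogate pairs for characters outside the BMP.

    >>> encode_canonical_json({"a": u"\u2028"})
    b'{"a":"\xe2\x80\xa8"}'
    >>> encode_canonical_json({"smile": u"\U0001f600"})
    b'{"smile":"\xf0\x9f\x98\x80"}'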