# See the License for the specific language governing permissions and
# limitations under the License.
- # using simplejson rather than regular json gives approximately a 25%
+ import re
+ from six import unichr, PY2
+
+ # using simplejson rather than regular json gives approximately a 100%
# performance improvement (as measured on python 2.7.12/simplejson 3.13.2)
import simplejson as json
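(The 25% -> 100% figure above is an empirical claim; a rough way to sanity-check it on your own interpreter is a timeit comparison along these lines. This is a sketch, not part of the commit, and the sample object is made up:)

    # rough micro-benchmark sketch comparing the stdlib json encoder with
    # simplejson's C-accelerated encoder (hypothetical sample data)
    import timeit

    setup = """
    import json
    import simplejson
    obj = {"key%d" % i: "value" * 10 for i in range(100)}
    """

    print(timeit.timeit("json.dumps(obj, sort_keys=True)", setup=setup, number=10000))
    print(timeit.timeit("simplejson.dumps(obj, sort_keys=True)", setup=setup, number=10000))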
@@ -33,21 +36,92 @@ def _default(obj):
                    obj.__class__.__name__)


+ # ideally we'd set ensure_ascii=False, but the ensure_ascii codepath is so
+ # much quicker (assuming c speedups are enabled) that it's actually much
+ # quicker to let it do that and then substitute back (it's about 2.5x faster).
+ #
+ # (in any case, simplejson's ensure_ascii doesn't get U+2028 and U+2029 right,
+ # as per https://github.com/simplejson/simplejson/issues/206).
+ #
_canonical_encoder = json.JSONEncoder(
-     ensure_ascii=False,
+     ensure_ascii=True,
    separators=(',', ':'),
    sort_keys=True,
    default=_default,
)
-
_pretty_encoder = json.JSONEncoder(
    ensure_ascii=True,
    indent=4,
    sort_keys=True,
    default=_default,
)
+ # This regexp matches either `\uNNNN` or `\\`. We match '\\' (and leave it
+ # unchanged) to make sure that the regex doesn't accidentally capture the uNNNN
+ # in `\\uNNNN`, which is an escaped backslash followed by 'uNNNN'.
+ _U_ESCAPE = re.compile(r"\\u([0-9a-f]{4})|\\\\")
+
+
+ def _unascii(s):
+     """Unpack `\\uNNNN` escapes in 's' and encode the result as UTF-8
+
+     This method takes the output of the JSONEncoder and expands any \\uNNNN
+     escapes it finds.
+
+     For performance, it assumes that the input is valid JSON, and performs few
+     sanity checks.
+     """
+
+     # make the fast path fast: if there are no matches in the string, the
+     # whole thing is ascii. On python 2, that means we're done. On python 3,
+     # we have to turn it into a bytes, which is quickest with encode('utf-8')
+     m = _U_ESCAPE.search(s)
+     if not m:
+         return s if PY2 else s.encode('utf-8')
+
+     # appending to a string (or a bytes) is slooow, so we accumulate sections
+     # of string result in 'chunks', and join them all together later.
+     # (It doesn't seem to make much difference whether we accumulate
+     # utf8-encoded bytes, or strings which we utf-8 encode after rejoining)
+     #
+     chunks = []
+
+     # 'pos' tracks the index in 's' that we have processed into 'chunks' so
+     # far.
+     pos = 0
+
+     while m:
+         start = m.start()
+         end = m.end()
+
+         g = m.group(1)
+
+         if g is None:
+             # escaped backslash: pass it through along with anything before the
+             # match
+             chunks.append(s[pos:end])
+         else:
+             # \uNNNN, but we have to watch out for surrogate pairs
+             c = int(g, 16)
+
+             if c & 0xfc00 == 0xd800 and s[end:end + 2] == '\\u':
+                 esc2 = s[end + 2:end + 6]
+                 c2 = int(esc2, 16)
+                 if c2 & 0xfc00 == 0xdc00:
+                     c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00))
+                     end += 6
+
+             chunks.append(s[pos:start])
+             chunks.append(unichr(c))
+
+         pos = end
+         m = _U_ESCAPE.search(s, pos)
+
+     # pass through anything after the last match
+     chunks.append(s[pos:])
+
+     return (''.join(chunks)).encode("utf-8")
+
def encode_canonical_json(json_object):
"""Encodes the shortest UTF-8 JSON encoding with dictionary keys
@@ -58,9 +132,8 @@ def encode_canonical_json(json_object):
    Returns:
        bytes encoding the JSON object"""
-
    s = _canonical_encoder.encode(json_object)
-     return s.encode("UTF-8")
+     return _unascii(s)


def encode_pretty_printed_json(json_object):
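(Taken together, encode_canonical_json still returns UTF-8 bytes, but non-ASCII text now reaches the output via the ensure_ascii=True fast path plus _unascii. A small usage sketch of the public API after this change, with output shown as Python 3 bytes literals:)

    from canonicaljson import encode_canonical_json

    # keys are sorted, separators are compact, and the output is UTF-8 bytes
    encode_canonical_json({"b": 1, "a": 2})   # b'{"a":2,"b":1}'

    # non-ASCII strings come out as raw UTF-8 rather than \uNNNN escapes
    encode_canonical_json({"k": "\u00e9"})    # b'{"k":"\xc3\xa9"}'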