@@ -42,7 +42,11 @@ def build_char_map(
42
42
pass
43
43
# I consider the space_code is available on one byte
44
44
if isinstance (space_code , str ):
45
- sp = space_code .encode ("charmap" )[0 ]
45
+ try : # one byte
46
+ sp = space_code .encode ("charmap" )[0 ]
47
+ except Exception :
48
+ sp = space_code .encode ("utf-16-be" )
49
+ sp = sp [0 ] + 256 * sp [1 ]
46
50
else :
47
51
sp = space_code
48
52
sp_width = compute_space_width (ft , sp , space_width )
@@ -52,12 +56,12 @@ def build_char_map(
52
56
float (sp_width / 2 ),
53
57
encoding ,
54
58
# https://github.com/python/mypy/issues/4374
55
- map_dict , # type: ignore
56
- ) # type: ignore
59
+ map_dict ,
60
+ )
57
61
58
62
59
63
# used when missing data, e.g. font def missing
60
- unknown_char_map : Tuple [str , float , Union [str , Dict [int , str ]], Dict ] = (
64
+ unknown_char_map : Tuple [str , float , Union [str , Dict [int , str ]], Dict [ Any , Any ] ] = (
61
65
"Unknown" ,
62
66
9999 ,
63
67
dict (zip (range (256 ), ["�" ] * 256 )),
@@ -104,15 +108,15 @@ def parse_encoding(
104
108
encoding : Union [str , List [str ], Dict [int , str ]] = []
105
109
if "/Encoding" not in ft :
106
110
try :
107
- if "/BaseFont" in ft and ft ["/BaseFont" ] in charset_encoding :
111
+ if "/BaseFont" in ft and cast ( str , ft ["/BaseFont" ]) in charset_encoding :
108
112
encoding = dict (
109
113
zip (range (256 ), charset_encoding [cast (str , ft ["/BaseFont" ])])
110
114
)
111
115
else :
112
116
encoding = "charmap"
113
117
return encoding , _default_fonts_space_width [cast (str , ft ["/BaseFont" ])]
114
118
except Exception :
115
- if ft ["/Subtype" ] == "/Type1" :
119
+ if cast ( str , ft ["/Subtype" ]) == "/Type1" :
116
120
return "charmap" , space_code
117
121
else :
118
122
return "" , space_code
@@ -163,19 +167,31 @@ def parse_encoding(
163
167
164
168
def parse_to_unicode(
    ft: DictionaryObject, space_code: int
) -> Tuple[Dict[Any, Any], int, List[int]]:
    """Build the translation table described by a font's ``/ToUnicode`` entry.

    Args:
        ft: Font dictionary; only its ``/ToUnicode`` entry is looked at here.
        space_code: Code to report for the space character when the table
            does not contain an entry that maps to ``" "``.

    Returns:
        A ``(map_dict, space_code, int_entry)`` tuple. ``map_dict`` holds the
        translations, with ``map_dict[-1]`` being the number of bytes to
        convert; ``int_entry`` lists the cmap keys as integers. When the font
        has no ``/ToUnicode`` entry the result is ``({}, space_code, [])``.
        NOTE(review): the returned space code is taken from the keys of
        ``map_dict`` when a space mapping exists, so it may not be an int
        despite the annotation -- confirm against callers.
    """
    # Translation table; map_dict[-1] holds the number of bytes to convert.
    map_dict: Dict[Any, Any] = {}

    # cmap keys as int, used to correct the encoding.
    int_entry: List[int] = []

    if "/ToUnicode" not in ft:
        return {}, space_code, []

    # State carried from one cmap line to the next: are we currently inside
    # a bfrange section / a bfchar section?
    in_range: bool = False
    in_char: bool = False
    for raw_line in prepare_cm(ft).split(b"\n"):
        in_range, in_char = process_cm_line(
            raw_line.strip(b" "), in_range, in_char, map_dict, int_entry
        )

    # The last key whose translation is a plain space becomes the space code.
    space_keys = [key for key, value in map_dict.items() if value == " "]
    if space_keys:
        space_code = space_keys[-1]
    return map_dict, space_code, int_entry
192
+
193
+
194
+ def prepare_cm (ft : DictionaryObject ) -> bytes :
179
195
cm : bytes = cast (DecodedStreamObject , ft ["/ToUnicode" ]).get_data ()
180
196
# cm must be normalized first: PDFs printed to PDF from Word may be missing line breaks
181
197
cm = (
@@ -204,71 +220,84 @@ def parse_to_unicode(
204
220
.replace (b"]" , b" ]\n " )
205
221
.replace (b"\r " , b"\n " )
206
222
)
223
+ return cm
207
224
208
- for l in cm .split (b"\n " ):
209
- if l in (b"" , b" " ) or l [0 ] == 37 : # 37 = %
210
- continue
211
- if b"beginbfrange" in l :
212
- process_rg = True
213
- elif b"endbfrange" in l :
214
- process_rg = False
215
- elif b"beginbfchar" in l :
216
- process_char = True
217
- elif b"endbfchar" in l :
218
- process_char = False
219
- elif process_rg :
220
- lst = [x for x in l .split (b" " ) if x ]
221
- a = int (lst [0 ], 16 )
222
- b = int (lst [1 ], 16 )
223
- nbi = len (lst [0 ])
224
- map_dict [- 1 ] = nbi // 2
225
- fmt = b"%%0%dX" % nbi
226
- if lst [2 ] == b"[" :
227
- for sq in lst [3 :]:
228
- if sq == b"]" :
229
- break
230
- map_dict [
231
- unhexlify (fmt % a ).decode (
232
- "charmap" if map_dict [- 1 ] == 1 else "utf-16-be" ,
233
- "surrogatepass" ,
234
- )
235
- ] = unhexlify (sq ).decode ("utf-16-be" , "surrogatepass" )
236
- int_entry .append (a )
237
- a += 1
238
- else :
239
- c = int (lst [2 ], 16 )
240
- fmt2 = b"%%0%dX" % max (4 , len (lst [2 ]))
241
- while a <= b :
242
- map_dict [
243
- unhexlify (fmt % a ).decode (
244
- "charmap" if map_dict [- 1 ] == 1 else "utf-16-be" ,
245
- "surrogatepass" ,
246
- )
247
- ] = unhexlify (fmt2 % c ).decode ("utf-16-be" , "surrogatepass" )
248
- int_entry .append (a )
249
- a += 1
250
- c += 1
251
- elif process_char :
252
- lst = [x for x in l .split (b" " ) if x ]
253
- map_dict [- 1 ] = len (lst [0 ]) // 2
254
- while len (lst ) > 1 :
255
- map_to = ""
256
- # placeholder (see above) means empty string
257
- if lst [1 ] != b"." :
258
- map_to = unhexlify (lst [1 ]).decode (
259
- "utf-16-be" , "surrogatepass"
260
- ) # join is here as some cases where the code was split
261
- map_dict [
262
- unhexlify (lst [0 ]).decode (
263
- "charmap" if map_dict [- 1 ] == 1 else "utf-16-be" , "surrogatepass"
264
- )
265
- ] = map_to
266
- int_entry .append (int (lst [0 ], 16 ))
267
- lst = lst [2 :]
268
- for a , value in map_dict .items ():
269
- if value == " " :
270
- space_code = a
271
- return map_dict , space_code , int_entry
225
+
226
def process_cm_line(
    l: bytes,
    process_rg: bool,
    process_char: bool,
    map_dict: Dict[Any, Any],
    int_entry: List[int],
) -> Tuple[bool, bool]:
    """Handle one cmap line and return the updated section flags.

    Args:
        l: The cmap line to handle.
        process_rg: True while the current line lies inside a bfrange section.
        process_char: True while the current line lies inside a bfchar section.
        map_dict: Translation table, filled in place by the section parsers.
        int_entry: List of cmap keys as int, filled in place by the parsers.

    Returns:
        The ``(process_rg, process_char)`` flags to use for the next line.
    """
    # Blank lines and comment lines (leading "%") leave everything untouched.
    if l in (b"", b" ") or l.startswith(b"%"):
        return process_rg, process_char

    # Section markers only flip the matching flag.
    if b"beginbfrange" in l:
        return True, process_char
    if b"endbfrange" in l:
        return False, process_char
    if b"beginbfchar" in l:
        return process_rg, True
    if b"endbfchar" in l:
        return process_rg, False

    # Data line: hand it to the parser of the active section, range first.
    if process_rg:
        parse_bfrange(l, map_dict, int_entry)
    elif process_char:
        parse_bfchar(l, map_dict, int_entry)
    return process_rg, process_char
248
+
249
+
250
def parse_bfrange(l: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) -> None:
    """Register the mappings described by one line of a bfrange section.

    The line is a sequence of space-separated hexadecimal tokens in one of
    two layouts:

    * ``first last [ dst1 dst2 ... ]`` -- consecutive source codes starting
      at ``first`` receive the listed destinations one after the other,
      until ``]`` or the end of the line;
    * ``first last dst`` -- every code from ``first`` to ``last`` inclusive
      receives consecutive destinations starting at ``dst``.

    Args:
        l: The bfrange line to parse.
        map_dict: Translation table updated in place; ``map_dict[-1]`` is set
            to half the number of hex digits of the first token.
        int_entry: Updated in place with every mapped source code as int.
    """
    tokens = [tok for tok in l.split(b" ") if tok]
    first = int(tokens[0], 16)
    last = int(tokens[1], 16)
    hex_digits = len(tokens[0])
    map_dict[-1] = hex_digits // 2

    # Loop invariants: source codes are always rendered with the width of the
    # first token and decoded with a codec chosen from the code size.
    key_fmt = b"%%0%dX" % hex_digits
    key_codec = "charmap" if map_dict[-1] == 1 else "utf-16-be"

    def key_for(code: int) -> str:
        # Source code -> the string used as key in map_dict.
        return unhexlify(key_fmt % code).decode(key_codec, "surrogatepass")

    if tokens[2] == b"[":
        code = first
        for tok in tokens[3:]:
            if tok == b"]":
                break
            if tok == b".":
                # b"." is the empty-string placeholder that parse_bfchar
                # recognises too; it is not hexadecimal data.
                target = ""
            else:
                target = unhexlify(tok).decode("utf-16-be", "surrogatepass")
            map_dict[key_for(code)] = target
            int_entry.append(code)
            code += 1
    else:
        dst = int(tokens[2], 16)
        # Destinations are written with at least 4 hex digits (one UTF-16 unit).
        dst_fmt = b"%%0%dX" % max(4, len(tokens[2]))
        for offset, code in enumerate(range(first, last + 1)):
            target = unhexlify(dst_fmt % (dst + offset)).decode(
                "utf-16-be", "surrogatepass"
            )
            map_dict[key_for(code)] = target
            int_entry.append(code)
282
+
283
+
284
def parse_bfchar(l: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) -> None:
    """Register the mappings described by one line of a bfchar section.

    The line is read as space-separated ``source destination`` pairs of
    hexadecimal tokens; a destination of ``b"."`` is a placeholder meaning
    the empty string. A trailing token without a partner is ignored.

    Args:
        l: The bfchar line to parse.
        map_dict: Translation table updated in place; ``map_dict[-1]`` is set
            to half the number of hex digits of the first token.
        int_entry: Updated in place with every mapped source code as int.
    """
    tokens = [tok for tok in l.split(b" ") if tok]
    if not tokens:
        # Only separators on the line: nothing to register.
        return
    map_dict[-1] = len(tokens[0]) // 2
    # The codec for source codes depends only on the code size set above.
    key_codec = "charmap" if map_dict[-1] == 1 else "utf-16-be"

    # Walk the (source, destination) pairs without re-slicing the token list.
    for src, dst in zip(tokens[0::2], tokens[1::2]):
        if dst == b".":
            map_to = ""
        else:
            map_to = unhexlify(dst).decode("utf-16-be", "surrogatepass")
        map_dict[unhexlify(src).decode(key_codec, "surrogatepass")] = map_to
        int_entry.append(int(src, 16))
272
301
273
302
274
303
def compute_space_width (
@@ -285,7 +314,7 @@ def compute_space_width(
285
314
except Exception :
286
315
w1 [- 1 ] = 1000.0
287
316
if "/W" in ft1 :
288
- w = list (ft1 ["/W" ]) # type: ignore
317
+ w = list (ft1 ["/W" ])
289
318
else :
290
319
w = []
291
320
while len (w ) > 0 :
0 commit comments