
Commit 80ff23b

Merge branch 'main' into outline-nomenclature

2 parents: 9537c4b + 844f238
20 files changed: 350 additions, 144 deletions

.pre-commit-config.yaml

Lines changed: 2 additions & 2 deletions

@@ -30,7 +30,7 @@ repos:
     hooks:
     -   id: isort
 -   repo: https://github.com/psf/black
-    rev: 22.3.0
+    rev: 22.6.0
     hooks:
     -   id: black
         args: [--target-version, py36]
@@ -40,7 +40,7 @@ repos:
     -   id: blacken-docs
         additional_dependencies: [black==22.1.0]
 -   repo: https://github.com/asottile/pyupgrade
-    rev: v2.34.0
+    rev: v2.37.2
     hooks:
     -   id: pyupgrade
         args: [--py36-plus]

CHANGELOG.md

Lines changed: 29 additions & 0 deletions

@@ -1,5 +1,34 @@
 # CHANGELOG
 
+## Version 2.8.0, 2022-07-24
+
+### New Features (ENH)
+- Add writer.add_annotation, page.annotations, and generic.AnnotationBuilder (#1120)
+
+### Bug Fixes (BUG)
+- Set /AS for /Btn form fields in writer (#1161)
+- Ignore if /Perms verify failed (#1157)
+
+### Robustness (ROB)
+- Cope with utf16 character for space calculation (#1155)
+- Cope with null params for FitH / FitV destination (#1152)
+- Handle outlines without valid destination (#1076)
+
+### Developer Experience (DEV)
+- Introduce _utils.logger_warning (#1148)
+
+### Maintenance (MAINT)
+- Break up parse_to_unicode (#1162)
+- Add diagnostic output to exception in read_from_stream (#1159)
+- Reduce PdfReader.read complexity (#1151)
+
+### Testing (TST)
+- Add workflow tests found by arc testing (#1154)
+- Decrypt file which is not encrypted (#1149)
+- Test CryptRC4 encryption class; test image extraction filters (#1147)
+
+Full Changelog: https://github.com/py-pdf/PyPDF2/compare/2.7.0...2.8.0
+
 ## Version 2.7.0, 2022-07-21
 
 ### New Features (ENH)
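
The headline entry above is the new annotation API (#1120). A minimal sketch of how it is intended to be used, assuming the AnnotationBuilder.free_text builder and the add_annotation keyword names shipped with 2.8.0; file names are placeholders:

    from PyPDF2 import PdfReader, PdfWriter
    from PyPDF2.generic import AnnotationBuilder

    reader = PdfReader("example.pdf")  # placeholder input file
    writer = PdfWriter()
    writer.add_page(reader.pages[0])

    # Free-text annotation; rect is (x0, y0, x1, y1) in default user space units.
    note = AnnotationBuilder.free_text("Hello World", rect=(50, 550, 200, 650))
    writer.add_annotation(page_number=0, annotation=note)

    with open("annotated.pdf", "wb") as fp:
        writer.write(fp)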

Makefile

Lines changed: 3 additions & 0 deletions

@@ -31,3 +31,6 @@ mutation-results:
 
 benchmark:
 	pytest tests/bench.py
+
+mypy:
+	mypy PyPDF2 --ignore-missing-imports --check-untyped --strict

PyPDF2/_cmap.py

Lines changed: 109 additions & 80 deletions

@@ -42,7 +42,11 @@ def build_char_map(
         pass
     # I conside the space_code is available on one byte
     if isinstance(space_code, str):
-        sp = space_code.encode("charmap")[0]
+        try:  # one byte
+            sp = space_code.encode("charmap")[0]
+        except Exception:
+            sp = space_code.encode("utf-16-be")
+            sp = sp[0] + 256 * sp[1]
     else:
         sp = space_code
     sp_width = compute_space_width(ft, sp, space_width)
@@ -52,12 +56,12 @@ def build_char_map(
         float(sp_width / 2),
         encoding,
         # https://github.com/python/mypy/issues/4374
-        map_dict,  # type: ignore
-    )  # type: ignore
+        map_dict,
+    )
 
 
 # used when missing data, e.g. font def missing
-unknown_char_map: Tuple[str, float, Union[str, Dict[int, str]], Dict] = (
+unknown_char_map: Tuple[str, float, Union[str, Dict[int, str]], Dict[Any, Any]] = (
     "Unknown",
     9999,
     dict(zip(range(256), ["�"] * 256)),
@@ -104,15 +108,15 @@ def parse_encoding(
     encoding: Union[str, List[str], Dict[int, str]] = []
     if "/Encoding" not in ft:
         try:
-            if "/BaseFont" in ft and ft["/BaseFont"] in charset_encoding:
+            if "/BaseFont" in ft and cast(str, ft["/BaseFont"]) in charset_encoding:
                 encoding = dict(
                     zip(range(256), charset_encoding[cast(str, ft["/BaseFont"])])
                 )
             else:
                 encoding = "charmap"
             return encoding, _default_fonts_space_width[cast(str, ft["/BaseFont"])]
         except Exception:
-            if ft["/Subtype"] == "/Type1":
+            if cast(str, ft["/Subtype"]) == "/Type1":
                 return "charmap", space_code
             else:
                 return "", space_code
@@ -163,19 +167,31 @@ def parse_encoding(
 
 def parse_to_unicode(
     ft: DictionaryObject, space_code: int
-) -> Tuple[Dict, int, List[int]]:
-    map_dict: Dict[
-        Any, Any
-    ] = (
-        {}
-    )  # will store all translation code and map_dict[-1] we will have the number of bytes to convert
-    int_entry: List[
-        int
-    ] = []  # will provide the list of cmap keys as int to correct encoding
+) -> Tuple[Dict[Any, Any], int, List[int]]:
+    # will store all translation code
+    # and map_dict[-1] we will have the number of bytes to convert
+    map_dict: Dict[Any, Any] = {}
+
+    # will provide the list of cmap keys as int to correct encoding
+    int_entry: List[int] = []
+
     if "/ToUnicode" not in ft:
         return {}, space_code, []
     process_rg: bool = False
     process_char: bool = False
+    cm = prepare_cm(ft)
+    for l in cm.split(b"\n"):
+        process_rg, process_char = process_cm_line(
+            l.strip(b" "), process_rg, process_char, map_dict, int_entry
+        )
+
+    for a, value in map_dict.items():
+        if value == " ":
+            space_code = a
+    return map_dict, space_code, int_entry
+
+
+def prepare_cm(ft: DictionaryObject) -> bytes:
     cm: bytes = cast(DecodedStreamObject, ft["/ToUnicode"]).get_data()
     # we need to prepare cm before due to missing return line in pdf printed to pdf from word
     cm = (
@@ -204,71 +220,84 @@ def parse_to_unicode(
         .replace(b"]", b" ]\n ")
         .replace(b"\r", b"\n")
     )
+    return cm
 
-    for l in cm.split(b"\n"):
-        if l in (b"", b" ") or l[0] == 37:  # 37 = %
-            continue
-        if b"beginbfrange" in l:
-            process_rg = True
-        elif b"endbfrange" in l:
-            process_rg = False
-        elif b"beginbfchar" in l:
-            process_char = True
-        elif b"endbfchar" in l:
-            process_char = False
-        elif process_rg:
-            lst = [x for x in l.split(b" ") if x]
-            a = int(lst[0], 16)
-            b = int(lst[1], 16)
-            nbi = len(lst[0])
-            map_dict[-1] = nbi // 2
-            fmt = b"%%0%dX" % nbi
-            if lst[2] == b"[":
-                for sq in lst[3:]:
-                    if sq == b"]":
-                        break
-                    map_dict[
-                        unhexlify(fmt % a).decode(
-                            "charmap" if map_dict[-1] == 1 else "utf-16-be",
-                            "surrogatepass",
-                        )
-                    ] = unhexlify(sq).decode("utf-16-be", "surrogatepass")
-                    int_entry.append(a)
-                    a += 1
-            else:
-                c = int(lst[2], 16)
-                fmt2 = b"%%0%dX" % max(4, len(lst[2]))
-                while a <= b:
-                    map_dict[
-                        unhexlify(fmt % a).decode(
-                            "charmap" if map_dict[-1] == 1 else "utf-16-be",
-                            "surrogatepass",
-                        )
-                    ] = unhexlify(fmt2 % c).decode("utf-16-be", "surrogatepass")
-                    int_entry.append(a)
-                    a += 1
-                    c += 1
-        elif process_char:
-            lst = [x for x in l.split(b" ") if x]
-            map_dict[-1] = len(lst[0]) // 2
-            while len(lst) > 1:
-                map_to = ""
-                # placeholder (see above) means empty string
-                if lst[1] != b".":
-                    map_to = unhexlify(lst[1]).decode(
-                        "utf-16-be", "surrogatepass"
-                    )  # join is here as some cases where the code was split
-                map_dict[
-                    unhexlify(lst[0]).decode(
-                        "charmap" if map_dict[-1] == 1 else "utf-16-be", "surrogatepass"
-                    )
-                ] = map_to
-                int_entry.append(int(lst[0], 16))
-                lst = lst[2:]
-    for a, value in map_dict.items():
-        if value == " ":
-            space_code = a
-    return map_dict, space_code, int_entry
+
+def process_cm_line(
+    l: bytes,
+    process_rg: bool,
+    process_char: bool,
+    map_dict: Dict[Any, Any],
+    int_entry: List[int],
+) -> Tuple[bool, bool]:
+    if l in (b"", b" ") or l[0] == 37:  # 37 = %
+        return process_rg, process_char
+    if b"beginbfrange" in l:
+        process_rg = True
+    elif b"endbfrange" in l:
+        process_rg = False
+    elif b"beginbfchar" in l:
+        process_char = True
+    elif b"endbfchar" in l:
+        process_char = False
+    elif process_rg:
+        parse_bfrange(l, map_dict, int_entry)
+    elif process_char:
+        parse_bfchar(l, map_dict, int_entry)
+    return process_rg, process_char
+
+
+def parse_bfrange(l: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) -> None:
+    lst = [x for x in l.split(b" ") if x]
+    a = int(lst[0], 16)
+    b = int(lst[1], 16)
+    nbi = len(lst[0])
+    map_dict[-1] = nbi // 2
+    fmt = b"%%0%dX" % nbi
+    if lst[2] == b"[":
+        for sq in lst[3:]:
+            if sq == b"]":
+                break
+            map_dict[
+                unhexlify(fmt % a).decode(
+                    "charmap" if map_dict[-1] == 1 else "utf-16-be",
+                    "surrogatepass",
+                )
+            ] = unhexlify(sq).decode("utf-16-be", "surrogatepass")
+            int_entry.append(a)
+            a += 1
+    else:
+        c = int(lst[2], 16)
+        fmt2 = b"%%0%dX" % max(4, len(lst[2]))
+        while a <= b:
+            map_dict[
+                unhexlify(fmt % a).decode(
+                    "charmap" if map_dict[-1] == 1 else "utf-16-be",
+                    "surrogatepass",
+                )
+            ] = unhexlify(fmt2 % c).decode("utf-16-be", "surrogatepass")
+            int_entry.append(a)
+            a += 1
+            c += 1
+
+
+def parse_bfchar(l: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) -> None:
+    lst = [x for x in l.split(b" ") if x]
+    map_dict[-1] = len(lst[0]) // 2
+    while len(lst) > 1:
+        map_to = ""
+        # placeholder (see above) means empty string
+        if lst[1] != b".":
+            map_to = unhexlify(lst[1]).decode(
+                "utf-16-be", "surrogatepass"
+            )  # join is here as some cases where the code was split
+        map_dict[
+            unhexlify(lst[0]).decode(
+                "charmap" if map_dict[-1] == 1 else "utf-16-be", "surrogatepass"
+            )
+        ] = map_to
+        int_entry.append(int(lst[0], 16))
+        lst = lst[2:]
 
 
 def compute_space_width(
@@ -285,7 +314,7 @@ def compute_space_width(
         except Exception:
             w1[-1] = 1000.0
         if "/W" in ft1:
-            w = list(ft1["/W"])  # type: ignore
+            w = list(ft1["/W"])
         else:
            w = []
        while len(w) > 0:
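
For context on what parse_bfrange and parse_bfchar now handle in isolation: a /ToUnicode CMap maps raw font byte codes to Unicode text via pairs of hex strings. A standalone sketch (not PyPDF2 code) that decodes one bfchar block in the same spirit, using binascii.unhexlify as the refactored helpers do:

    from binascii import unhexlify

    # Minimal beginbfchar section as it appears inside a /ToUnicode stream:
    # each line maps a 2-byte font code (left) to a UTF-16-BE string (right).
    excerpt = b"""2 beginbfchar
    <0041> <0041>
    <0042> <FB01>
    endbfchar"""

    for line in excerpt.split(b"\n")[1:-1]:
        src, dst = (x.strip(b"<>") for x in line.split())
        code = unhexlify(src)                      # e.g. b"\x00\x41"
        text = unhexlify(dst).decode("utf-16-be")  # e.g. "A" or the "fi" ligature
        print(int.from_bytes(code, "big"), "->", text)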

PyPDF2/_encryption.py

Lines changed: 4 additions & 3 deletions

@@ -29,8 +29,9 @@
 import random
 import struct
 from enum import IntEnum
-from typing import Optional, Tuple, Union, cast
+from typing import Any, Dict, Optional, Tuple, Union, cast
 
+from PyPDF2._utils import logger_warning
 from PyPDF2.errors import DependencyError
 from PyPDF2.generic import (
     ArrayObject,
@@ -565,7 +566,7 @@ def verify_perms(
     @staticmethod
     def generate_values(
         user_pwd: bytes, owner_pwd: bytes, key: bytes, p: int, metadata_encrypted: bool
-    ) -> dict:
+    ) -> Dict[Any, Any]:
         u_value, ue_value = AlgV5.compute_U_value(user_pwd, key)
         o_value, oe_value = AlgV5.compute_O_value(owner_pwd, key, u_value)
         perms = AlgV5.compute_Perms_value(key, p, metadata_encrypted)
@@ -826,7 +827,7 @@ def verify_v5(self, password: bytes) -> Tuple[bytes, PasswordType]:
         P = (P + 0x100000000) % 0x100000000  # maybe < 0
         metadata_encrypted = self.entry.get("/EncryptMetadata", True)
         if not AlgV5.verify_perms(key, perms, P, metadata_encrypted):
-            return b"", PasswordType.NOT_DECRYPTED
+            logger_warning("ignore '/Perms' verify failed", __name__)
         return key, rc
 
     @staticmethod
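
The practical effect of the verify_v5 change: a failed /Perms check now logs a warning and decryption continues, instead of reporting the file as NOT_DECRYPTED. Assuming _utils.logger_warning (only imported here) routes through the standard logging module with the passed __name__, the warning can be surfaced or muted like any other logger:

    import logging

    # Surface PyPDF2 warnings such as "ignore '/Perms' verify failed" ...
    logging.basicConfig(level=logging.WARNING)

    # ... or mute just the encryption module (logger name assumed to follow the
    # __name__ argument passed to logger_warning in the hunk above).
    logging.getLogger("PyPDF2._encryption").setLevel(logging.ERROR)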

PyPDF2/_merger.py

Lines changed: 3 additions & 3 deletions

@@ -567,12 +567,12 @@ def find_outline_item(
 
         for i, oi_enum in enumerate(root):
             if isinstance(oi_enum, list):
-                # b is still an inner node
+                # oi_enum is still an inner node
                 # (OutlineType, if recursive types were supported by mypy)
                 res = self.find_outline_item(outline_item, oi_enum)  # type: ignore
                 if res:
                     return [i] + res
-            elif oi_enum == outline_item or oi_enum["/Title"] == outline_item:
+            elif oi_enum == outline_item or cast(Dict[Any, Any], oi_enum["/Title"]) == outline_item:
                 # we found a leaf node
                 return [i]
 
@@ -689,7 +689,7 @@ def add_named_destination(self, title: str, pagenum: int) -> None:
 
 class PdfFileMerger(PdfMerger):  # pragma: no cover
     def __init__(self, *args: Any, **kwargs: Any) -> None:
-        deprecate_with_replacement("PdfFileMerger", "PdfMerge")
+        deprecate_with_replacement("PdfFileMerger", "PdfMerger")
 
         if "strict" not in kwargs and len(args) < 1:
            kwargs["strict"] = True  # maintain the default
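
The second hunk only corrects the deprecation message ("PdfMerge" -> "PdfMerger"); the replacement class itself is unchanged. For reference, a minimal PdfMerger sketch (file names are placeholders):

    from PyPDF2 import PdfMerger

    merger = PdfMerger()        # preferred over the deprecated PdfFileMerger
    merger.append("first.pdf")  # placeholder paths
    merger.append("second.pdf")
    merger.write("merged.pdf")
    merger.close()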

PyPDF2/_page.py

Lines changed: 4 additions & 4 deletions

@@ -506,9 +506,9 @@ def _merge_page(
         # Combine /ProcSet sets.
         new_resources[NameObject(RES.PROC_SET)] = ArrayObject(
             frozenset(
-                original_resources.get(RES.PROC_SET, ArrayObject()).get_object()  # type: ignore
+                original_resources.get(RES.PROC_SET, ArrayObject()).get_object()
             ).union(
-                frozenset(page2resources.get(RES.PROC_SET, ArrayObject()).get_object())  # type: ignore
+                frozenset(page2resources.get(RES.PROC_SET, ArrayObject()).get_object())
             )
         )
 
@@ -1248,7 +1248,7 @@ def process_operation(operator: bytes, operands: List) -> None:
                    cmaps[operands[0]][2],
                    cmaps[operands[0]][3],
                    operands[0],
-                )  # type:ignore
+                )
            except KeyError:  # font not found
                _space_width = unknown_char_map[1]
                cmap = (
@@ -1395,7 +1395,7 @@ def process_operation(operator: bytes, operands: List) -> None:
                except IndexError:
                    pass
                try:
-                    xobj = resources_dict["/XObject"]  # type: ignore
+                    xobj = resources_dict["/XObject"]
                    if xobj[operands[0]]["/Subtype"] != "/Image":  # type: ignore
                        # output += text
                        text = self.extract_xform_text(xobj[operands[0]], space_width)  # type: ignore
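
These hunks only drop type: ignore comments inside the text-extraction path; behaviour is unchanged. For orientation, the code above sits behind the public extract_text entry point ("example.pdf" is a placeholder):

    from PyPDF2 import PdfReader

    reader = PdfReader("example.pdf")
    # extract_text() walks the content-stream operators handled by
    # process_operation above, using the character maps built in _cmap.py.
    print(reader.pages[0].extract_text())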
