Skip to content

Commit 2b9f5e2

Browse files
authored
TLDR-922: replace pylzma by py7zr (#547)
1 parent 6715b4a commit 2b9f5e2

File tree

5 files changed

+16
-10
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -11,6 +11,7 @@ dedoc/version.py
1111

1212
# Distribution / packaging
1313
.Python
14+
etc/
1415
env/
1516
build/
1617
develop-eggs/

dedoc/readers/archive_reader/archive_reader.py

Lines changed: 13 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -85,14 +85,19 @@ def __read_rar_archive(self, path: str, tmp_dir: str, need_content_analysis: boo
8585
yield self.__save_archive_file(tmp_dir=tmp_dir, file_name=name, file=file, need_content_analysis=need_content_analysis)
8686

8787
def __read_7z_archive(self, path: str, tmp_dir: str, need_content_analysis: bool) -> Iterator[AttachedFile]:
88-
import py7zlib
89-
90-
with open(path, "rb") as content:
91-
arch_file = py7zlib.Archive7z(content)
92-
names = arch_file.getnames()
93-
for name in names:
94-
file = arch_file.getmember(name)
95-
yield self.__save_archive_file(tmp_dir=tmp_dir, file_name=name, file=file, need_content_analysis=need_content_analysis)
88+
import os
89+
import py7zr
90+
import tempfile
91+
92+
with tempfile.TemporaryDirectory() as tmpdir:
93+
with py7zr.SevenZipFile(path, "r") as arch_file:
94+
arch_file.extractall(tmpdir)
95+
96+
for dir_path, _, file_names in os.walk(tmpdir):
97+
for file_name in file_names:
98+
file_path = os.path.join(dir_path, file_name)
99+
with open(file_path, "rb") as file:
100+
yield self.__save_archive_file(tmp_dir=tmp_dir, file_name=file_name, file=file, need_content_analysis=need_content_analysis)
96101

97102
def __save_archive_file(self, tmp_dir: str, file_name: str, file: IO[bytes], need_content_analysis: bool) -> AttachedFile:
98103
import os

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -18,7 +18,7 @@ pdf2image==1.10.0 #1.14.0 - there are converting artifacts '№' != '№\n\x0c'
1818
pdfminer.six>=20211012,<=20231228
1919
piexif==1.1.3
2020
puremagic>=1.0,<2.0 # needs libmagic to be installed in the system
21-
pylzma==0.5.0
21+
py7zr~=1.0
2222
pypdf>=4.0,<6.0
2323
pytesseract==0.3.10
2424
python-docx==0.8.11
-242 Bytes
Binary file not shown.

tests/unit_tests/test_module_attachment_extractor.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -98,7 +98,7 @@ def test_archive_with_slash(self) -> None:
9898
Tests attachment extraction from archives with files containing slash symbol in the name
9999
"""
100100
file_name_template = "attachments.{}"
101-
for extension in "7z", "tar", "tar.gz", "zip":
101+
for extension in "tar", "tar.gz", "zip":
102102
file_name = file_name_template.format(extension)
103103
files = self.__get_list_of_files_in_archive(file_name)
104104
self.assertEqual(2, len(files))

0 commit comments

Comments
 (0)