Skip to content

Commit 6dc74db

Browse files
uranusjr authored and KOLANICH committed
Refactor ELF parsing logic to standalone class (pypa#553)
1 parent 871ae00 commit 6dc74db

File tree

6 files changed

+266
-319
lines changed

6 files changed

+266
-319
lines changed

packaging/_elffile.py

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
"""
2+
ELF file parser.
3+
4+
This provides a class ``ELFFile`` that parses an ELF executable through an
5+
interface similar to ``ZipFile``. Only the read interface is implemented.
6+
7+
Based on: https://gist.github.com/lyssdod/f51579ae8d93c8657a5564aefc2ffbca
8+
ELF header: https://refspecs.linuxfoundation.org/elf/gabi4+/ch4.eheader.html
9+
"""
10+
11+
import enum
12+
import os
13+
import struct
14+
from typing import IO, Optional, Tuple
15+
16+
17+
class ELFInvalid(ValueError):
    """
    Raised when a file cannot be parsed as an ELF executable.

    Subclasses ``ValueError`` so callers that already catch ``ValueError``
    keep working.
    """
19+
20+
21+
class EIClass(enum.IntEnum):
    """
    Values of the ``EI_CLASS`` identification byte (the file's bitness).
    """

    C32 = 1  # 32-bit objects.
    C64 = 2  # 64-bit objects.
24+
25+
26+
class EIData(enum.IntEnum):
    """
    Values of the ``EI_DATA`` identification byte (the data encoding).
    """

    Lsb = 1  # Little-endian (least significant byte first).
    Msb = 2  # Big-endian (most significant byte first).
29+
30+
31+
class EMachine(enum.IntEnum):
    """
    Values of the ``e_machine`` header field (the target architecture).
    """

    I386 = 3
    S390 = 22
    Arm = 40
    X8664 = 62
    AArc64 = 183
    # Correctly spelled alias. ``AArc64`` stays the canonical member so that
    # existing references and ``EMachine(183).name`` are unchanged.
    AArch64 = 183
37+
38+
39+
class ELFFile:
    """
    Representation of an ELF executable.

    The identification bytes and the file header are read from ``f`` when the
    object is constructed, starting at the stream's current position. The
    stream must stay open for as long as ``interpreter`` may be accessed,
    because that property seeks and reads on demand.

    Public attributes set by the constructor:

    * ``capacity``: the ``EI_CLASS`` identification byte (1 = 32-bit,
      2 = 64-bit).
    * ``encoding``: the ``EI_DATA`` identification byte (1 = little-endian,
      2 = big-endian).
    * ``machine``: the ``e_machine`` header field.
    * ``flags``: the ``e_flags`` header field.

    :raises ELFInvalid: if the identification or header cannot be read, the
        magic number is wrong, or the class/encoding pair is not recognized.
    """

    # Program header type marking the entry that holds the interpreter path.
    _PT_INTERP = 3

    def __init__(self, f: IO[bytes]) -> None:
        self._f = f

        try:
            ident = self._read("16B")
        except struct.error as e:
            raise ELFInvalid("unable to parse identification") from e
        magic = bytes(ident[:4])
        if magic != b"\x7fELF":
            raise ELFInvalid(f"invalid magic: {magic!r}")

        self.capacity = ident[4]  # File class (bitness).
        self.encoding = ident[5]  # Data structure encoding (endianness).

        try:
            # e_fmt: Format of the file header fields following e_ident.
            # p_fmt: Format of one program header entry.
            # p_idx: Indexes to find p_type, p_offset, and p_filesz.
            e_fmt, self._p_fmt, self._p_idx = {
                (1, 1): ("<HHIIIIIHHH", "<IIIIIIII", (0, 1, 4)),  # 32-bit LSB.
                (1, 2): (">HHIIIIIHHH", ">IIIIIIII", (0, 1, 4)),  # 32-bit MSB.
                (2, 1): ("<HHIQQQIHHH", "<IIQQQQQQ", (0, 2, 5)),  # 64-bit LSB.
                (2, 2): (">HHIQQQIHHH", ">IIQQQQQQ", (0, 2, 5)),  # 64-bit MSB.
            }[(self.capacity, self.encoding)]
        except KeyError as e:
            raise ELFInvalid(
                f"unrecognized capacity ({self.capacity}) or "
                f"encoding ({self.encoding})"
            ) from e

        try:
            (
                _,
                self.machine,  # Architecture type.
                _,
                _,
                self._e_phoff,  # Offset of the program header table.
                _,
                self.flags,  # Processor-specific flags.
                _,
                self._e_phentsize,  # Size of one program header entry.
                self._e_phnum,  # Number of program header entries.
            ) = self._read(e_fmt)
        except struct.error as e:
            raise ELFInvalid("unable to parse machine and section information") from e

    def _read(self, fmt: str) -> Tuple[int, ...]:
        """
        Read and unpack ``fmt`` from the current position of the stream.

        :raises struct.error: if fewer bytes are available than ``fmt`` needs.
        """
        return struct.unpack(fmt, self._f.read(struct.calcsize(fmt)))

    @property
    def interpreter(self) -> Optional[str]:
        """
        The path recorded in the ``PT_INTERP`` program header.

        Returns ``None`` when no program header entry of that type is found.
        Accessing this property moves the position of the underlying stream.
        """
        p_type_idx, p_offset_idx, p_filesz_idx = self._p_idx
        for index in range(self._e_phnum):
            self._f.seek(self._e_phoff + self._e_phentsize * index)
            try:
                data = self._read(self._p_fmt)
            except struct.error:
                # Entry could not be read in full; skip it and try the next.
                continue
            if data[p_type_idx] != self._PT_INTERP:
                continue
            self._f.seek(data[p_offset_idx])
            # The stored path is NUL-terminated; drop the terminator.
            return os.fsdecode(self._f.read(data[p_filesz_idx])).strip("\0")
        return None

packaging/_manylinux.py

Lines changed: 45 additions & 108 deletions
Original file line numberDiff line numberDiff line change
@@ -1,121 +1,58 @@
11
import collections
2+
import contextlib
23
import functools
34
import os
45
import re
5-
import struct
66
import sys
77
import warnings
8-
from typing import IO, Dict, Iterator, NamedTuple, Optional, Tuple
9-
10-
11-
# Python does not provide platform information at sufficient granularity to
12-
# identify the architecture of the running executable in some cases, so we
13-
# determine it dynamically by reading the information from the running
14-
# process. This only applies on Linux, which uses the ELF format.
15-
class _ELFFileHeader:
16-
# https://en.wikipedia.org/wiki/Executable_and_Linkable_Format#File_header
17-
class _InvalidELFFileHeader(ValueError):
18-
"""
19-
An invalid ELF file header was found.
20-
"""
21-
22-
ELF_MAGIC_NUMBER = 0x7F454C46
23-
ELFCLASS32 = 1
24-
ELFCLASS64 = 2
25-
ELFDATA2LSB = 1
26-
ELFDATA2MSB = 2
27-
EM_386 = 3
28-
EM_S390 = 22
29-
EM_ARM = 40
30-
EM_X86_64 = 62
31-
EF_ARM_ABIMASK = 0xFF000000
32-
EF_ARM_ABI_VER5 = 0x05000000
33-
EF_ARM_ABI_FLOAT_HARD = 0x00000400
34-
35-
def __init__(self, file: IO[bytes]) -> None:
36-
def unpack(fmt: str) -> int:
37-
try:
38-
data = file.read(struct.calcsize(fmt))
39-
result: Tuple[int, ...] = struct.unpack(fmt, data)
40-
except struct.error:
41-
raise _ELFFileHeader._InvalidELFFileHeader()
42-
return result[0]
43-
44-
self.e_ident_magic = unpack(">I")
45-
if self.e_ident_magic != self.ELF_MAGIC_NUMBER:
46-
raise _ELFFileHeader._InvalidELFFileHeader()
47-
self.e_ident_class = unpack("B")
48-
if self.e_ident_class not in {self.ELFCLASS32, self.ELFCLASS64}:
49-
raise _ELFFileHeader._InvalidELFFileHeader()
50-
self.e_ident_data = unpack("B")
51-
if self.e_ident_data not in {self.ELFDATA2LSB, self.ELFDATA2MSB}:
52-
raise _ELFFileHeader._InvalidELFFileHeader()
53-
self.e_ident_version = unpack("B")
54-
self.e_ident_osabi = unpack("B")
55-
self.e_ident_abiversion = unpack("B")
56-
self.e_ident_pad = file.read(7)
57-
format_h = "<H" if self.e_ident_data == self.ELFDATA2LSB else ">H"
58-
format_i = "<I" if self.e_ident_data == self.ELFDATA2LSB else ">I"
59-
format_q = "<Q" if self.e_ident_data == self.ELFDATA2LSB else ">Q"
60-
format_p = format_i if self.e_ident_class == self.ELFCLASS32 else format_q
61-
self.e_type = unpack(format_h)
62-
self.e_machine = unpack(format_h)
63-
self.e_version = unpack(format_i)
64-
self.e_entry = unpack(format_p)
65-
self.e_phoff = unpack(format_p)
66-
self.e_shoff = unpack(format_p)
67-
self.e_flags = unpack(format_i)
68-
self.e_ehsize = unpack(format_h)
69-
self.e_phentsize = unpack(format_h)
70-
self.e_phnum = unpack(format_h)
71-
self.e_shentsize = unpack(format_h)
72-
self.e_shnum = unpack(format_h)
73-
self.e_shstrndx = unpack(format_h)
74-
75-
76-
def _get_elf_header() -> Optional[_ELFFileHeader]:
8+
from typing import Dict, Generator, Iterator, NamedTuple, Optional, Tuple
9+
10+
from ._elffile import EIClass, EIData, ELFFile, EMachine
11+
12+
EF_ARM_ABIMASK = 0xFF000000
13+
EF_ARM_ABI_VER5 = 0x05000000
14+
EF_ARM_ABI_FLOAT_HARD = 0x00000400
15+
16+
17+
@contextlib.contextmanager
18+
def _parse_elf(path: str) -> Generator[Optional[ELFFile], None, None]:
7719
try:
78-
with open(sys.executable, "rb") as f:
79-
elf_header = _ELFFileHeader(f)
80-
except (OSError, TypeError, _ELFFileHeader._InvalidELFFileHeader):
81-
return None
82-
return elf_header
20+
with open(path, "rb") as f:
21+
yield ELFFile(f)
22+
except (OSError, TypeError, ValueError):
23+
yield None
8324

8425

85-
def _is_linux_armhf() -> bool:
26+
def _is_linux_armhf(executable: str) -> bool:
8627
# hard-float ABI can be detected from the ELF header of the given
8728
# executable
8829
# https://static.docs.arm.com/ihi0044/g/aaelf32.pdf
89-
elf_header = _get_elf_header()
90-
if elf_header is None:
91-
return False
92-
result = elf_header.e_ident_class == elf_header.ELFCLASS32
93-
result &= elf_header.e_ident_data == elf_header.ELFDATA2LSB
94-
result &= elf_header.e_machine == elf_header.EM_ARM
95-
result &= (
96-
elf_header.e_flags & elf_header.EF_ARM_ABIMASK
97-
) == elf_header.EF_ARM_ABI_VER5
98-
result &= (
99-
elf_header.e_flags & elf_header.EF_ARM_ABI_FLOAT_HARD
100-
) == elf_header.EF_ARM_ABI_FLOAT_HARD
101-
return result
102-
103-
104-
def _is_linux_i686() -> bool:
105-
elf_header = _get_elf_header()
106-
if elf_header is None:
107-
return False
108-
result = elf_header.e_ident_class == elf_header.ELFCLASS32
109-
result &= elf_header.e_ident_data == elf_header.ELFDATA2LSB
110-
result &= elf_header.e_machine == elf_header.EM_386
111-
return result
30+
with _parse_elf(executable) as f:
31+
return (
32+
f is not None
33+
and f.capacity == EIClass.C32
34+
and f.encoding == EIData.Lsb
35+
and f.machine == EMachine.Arm
36+
and f.flags & EF_ARM_ABIMASK == EF_ARM_ABI_VER5
37+
and f.flags & EF_ARM_ABI_FLOAT_HARD == EF_ARM_ABI_FLOAT_HARD
38+
)
39+
40+
41+
def _is_linux_i686(executable: str) -> bool:
42+
with _parse_elf(executable) as f:
43+
return (
44+
f is not None
45+
and f.capacity == EIClass.C32
46+
and f.encoding == EIData.Lsb
47+
and f.machine == EMachine.I386
48+
)
11249

11350

114-
def _have_compatible_abi(arch: str) -> bool:
51+
def _have_compatible_abi(executable: str, arch: str) -> bool:
11552
if arch == "armv7l":
116-
return _is_linux_armhf()
53+
return _is_linux_armhf(executable)
11754
if arch == "i686":
118-
return _is_linux_i686()
55+
return _is_linux_i686(executable)
11956
return arch in {"x86_64", "aarch64", "ppc64", "ppc64le", "s390x"}
12057

12158

@@ -141,10 +78,10 @@ def _glibc_version_string_confstr() -> Optional[str]:
14178
# platform module.
14279
# https://github.com/python/cpython/blob/fcf1d003bf4f0100c/Lib/platform.py#L175-L183
14380
try:
144-
# os.confstr("CS_GNU_LIBC_VERSION") returns a string like "glibc 2.17".
145-
version_string = os.confstr("CS_GNU_LIBC_VERSION")
81+
# Should be a string like "glibc 2.17".
82+
version_string: str = getattr(os, "confstr")("CS_GNU_LIBC_VERSION")
14683
assert version_string is not None
147-
_, version = version_string.split()
84+
_, version = version_string.rsplit()
14885
except (AssertionError, AttributeError, OSError, ValueError):
14986
# os.confstr() or CS_GNU_LIBC_VERSION not available (or a bad value)...
15087
return None
@@ -211,8 +148,8 @@ def _parse_glibc_version(version_str: str) -> Tuple[int, int]:
211148
m = re.match(r"(?P<major>[0-9]+)\.(?P<minor>[0-9]+)", version_str)
212149
if not m:
213150
warnings.warn(
214-
"Expected glibc version with 2 components major.minor,"
215-
" got: %s" % version_str,
151+
f"Expected glibc version with 2 components major.minor,"
152+
f" got: {version_str}",
216153
RuntimeWarning,
217154
)
218155
return -1, -1
@@ -265,7 +202,7 @@ def _is_compatible(name: str, arch: str, version: _GLibCVersion) -> bool:
265202

266203

267204
def platform_tags(linux: str, arch: str) -> Iterator[str]:
268-
if not _have_compatible_abi(arch):
205+
if not _have_compatible_abi(sys.executable, arch):
269206
return
270207
# Oldest glibc to be supported regardless of architecture is (2, 17).
271208
too_old_glibc2 = _GLibCVersion(2, 16)

packaging/_musllinux.py

Lines changed: 8 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -4,70 +4,13 @@
44
linked against musl, and what musl version is used.
55
"""
66

7-
import contextlib
87
import functools
9-
import operator
10-
import os
118
import re
12-
import struct
139
import subprocess
1410
import sys
15-
from typing import IO, Iterator, NamedTuple, Optional, Tuple
11+
from typing import Iterator, NamedTuple, Optional
1612

17-
18-
def _read_unpacked(f: IO[bytes], fmt: str) -> Tuple[int, ...]:
19-
return struct.unpack(fmt, f.read(struct.calcsize(fmt)))
20-
21-
22-
def _parse_ld_musl_from_elf(f: IO[bytes]) -> Optional[str]:
23-
"""Detect musl libc location by parsing the Python executable.
24-
25-
Based on: https://gist.github.com/lyssdod/f51579ae8d93c8657a5564aefc2ffbca
26-
ELF header: https://refspecs.linuxfoundation.org/elf/gabi4+/ch4.eheader.html
27-
"""
28-
f.seek(0)
29-
try:
30-
ident = _read_unpacked(f, "16B")
31-
except struct.error:
32-
return None
33-
if ident[:4] != tuple(b"\x7fELF"): # Invalid magic, not ELF.
34-
return None
35-
f.seek(struct.calcsize("HHI"), 1) # Skip file type, machine, and version.
36-
37-
try:
38-
# e_fmt: Format for program header.
39-
# p_fmt: Format for section header.
40-
# p_idx: Indexes to find p_type, p_offset, and p_filesz.
41-
e_fmt, p_fmt, p_idx = {
42-
(1, 1): ("<IIIIHHH", "<IIIIIIII", (0, 1, 4)), # 32-bit LSB.
43-
(1, 2): (">IIIIHHH", ">IIIIIIII", (0, 1, 4)), # 32-bit MSB.
44-
(2, 1): ("<QQQIHHH", "<IIQQQQQQ", (0, 2, 5)), # 64-bit LSB.
45-
(2, 2): (">QQQIHHH", ">IIQQQQQQ", (0, 2, 5)), # 64-bit MSB.
46-
}[(ident[4], ident[5])]
47-
except KeyError:
48-
return None
49-
else:
50-
p_get = operator.itemgetter(*p_idx)
51-
52-
# Find the interpreter section and return its content.
53-
try:
54-
_, e_phoff, _, _, _, e_phentsize, e_phnum = _read_unpacked(f, e_fmt)
55-
except struct.error:
56-
return None
57-
for i in range(e_phnum + 1):
58-
f.seek(e_phoff + e_phentsize * i)
59-
try:
60-
p_type, p_offset, p_filesz = p_get(_read_unpacked(f, p_fmt))
61-
except struct.error:
62-
return None
63-
if p_type != 3: # Not PT_INTERP.
64-
continue
65-
f.seek(p_offset)
66-
interpreter = os.fsdecode(f.read(p_filesz)).strip("\0")
67-
if "musl" not in interpreter:
68-
return None
69-
return interpreter
70-
return None
13+
from ._elffile import ELFFile
7114

7215

7316
class _MuslVersion(NamedTuple):
@@ -97,13 +40,12 @@ def _get_musl_version(executable: str) -> Optional[_MuslVersion]:
9740
Version 1.2.2
9841
Dynamic Program Loader
9942
"""
100-
with contextlib.ExitStack() as stack:
101-
try:
102-
f = stack.enter_context(open(executable, "rb"))
103-
except OSError:
104-
return None
105-
ld = _parse_ld_musl_from_elf(f)
106-
if not ld:
43+
try:
44+
with open(executable, "rb") as f:
45+
ld = ELFFile(f).interpreter
46+
except (OSError, TypeError, ValueError):
47+
return None
48+
if ld is None or "musl" not in ld:
10749
return None
10850
proc = subprocess.run([ld], stderr=subprocess.PIPE, universal_newlines=True)
10951
return _parse_musl_version(proc.stderr)

0 commit comments

Comments
 (0)