Skip to content

Commit 96b6c12

Browse files
committed
gh-103477: Read and write gzip header and trailer with zlib
RHEL, SLES and Ubuntu for IBM zSystems (aka s390x) ship with a zlib optimization [1] that significantly improves deflate and inflate performance on this platform by using a specialized CPU instruction. This instruction not only compresses the data, but also computes a checksum. At the moment Python's gzip support performs compression and checksum calculation separately, which creates unnecessary overhead on s390x. The reason is that Python needs to write specific values into the gzip header; and when this support was introduced in 1997, there was indeed no better way to do this. Since v1.2.2.1 (2011) zlib provides inflateGetHeader() and deflateSetHeader() functions for that, so Python does not have to deal with the exact header and trailer format anymore. Add new interfaces to zlibmodule.c that make use of these functions: * Add mtime argument to zlib.compress(). * Add mtime and fname arguments to zlib.compressobj(). * Add gz_header_mtime and gz_header_done properties to ZlibDecompressor. In Python modules, replace raw streams with gzip streams, make use of these new interfaces, and remove all mentions of crc32. In addition to the new interfaces above, there is an additional change in behavior that the users can see: for malformed gzip headers and trailers, decompression now raises zlib.error instead of BadGzipFile. However, this is allowed by today's spec. 📜🤖 NEWS entry added by blurb_it. [1] madler/zlib#410
1 parent 2b6e877 commit 96b6c12

File tree

11 files changed

+205
-219
lines changed

11 files changed

+205
-219
lines changed

Include/internal/pycore_global_objects_fini_generated.h

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Include/internal/pycore_global_strings.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -409,6 +409,7 @@ struct _Py_global_strings {
409409
STRUCT_FOR_ID(fix_imports)
410410
STRUCT_FOR_ID(flags)
411411
STRUCT_FOR_ID(flush)
412+
STRUCT_FOR_ID(fname)
412413
STRUCT_FOR_ID(follow_symlinks)
413414
STRUCT_FOR_ID(format)
414415
STRUCT_FOR_ID(frequency)
@@ -522,6 +523,7 @@ struct _Py_global_strings {
522523
STRUCT_FOR_ID(modules)
523524
STRUCT_FOR_ID(mro)
524525
STRUCT_FOR_ID(msg)
526+
STRUCT_FOR_ID(mtime)
525527
STRUCT_FOR_ID(mycmp)
526528
STRUCT_FOR_ID(n)
527529
STRUCT_FOR_ID(n_arg)

Include/internal/pycore_runtime_init_generated.h

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Include/internal/pycore_unicodeobject_generated.h

Lines changed: 6 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Lib/gzip.py

Lines changed: 33 additions & 147 deletions
Original file line numberDiff line numberDiff line change
@@ -200,20 +200,20 @@ def __init__(self, filename=None, mode=None,
200200
FutureWarning, 2)
201201
self.mode = WRITE
202202
self._init_write(filename)
203+
if mtime is None:
204+
mtime = int(time.time())
203205
self.compress = zlib.compressobj(compresslevel,
204206
zlib.DEFLATED,
205-
-zlib.MAX_WBITS,
207+
16 + zlib.MAX_WBITS,
206208
zlib.DEF_MEM_LEVEL,
207-
0)
208-
self._write_mtime = mtime
209+
0,
210+
mtime=mtime,
211+
fname=self._encode_fname())
209212
else:
210213
raise ValueError("Invalid mode: {!r}".format(mode))
211214

212215
self.fileobj = fileobj
213216

214-
if self.mode == WRITE:
215-
self._write_gzip_header(compresslevel)
216-
217217
@property
218218
def mtime(self):
219219
"""Last modification time read from stream, or None"""
@@ -225,15 +225,12 @@ def __repr__(self):
225225

226226
def _init_write(self, filename):
227227
self.name = filename
228-
self.crc = zlib.crc32(b"")
229228
self.size = 0
230229
self.writebuf = []
231230
self.bufsize = 0
232231
self.offset = 0 # Current file offset for seek(), tell(), etc
233232

234-
def _write_gzip_header(self, compresslevel):
235-
self.fileobj.write(b'\037\213') # magic header
236-
self.fileobj.write(b'\010') # compression method
233+
def _encode_fname(self):
237234
try:
238235
# RFC 1952 requires the FNAME field to be Latin-1. Do not
239236
# include filenames that cannot be represented that way.
@@ -244,24 +241,7 @@ def _write_gzip_header(self, compresslevel):
244241
fname = fname[:-3]
245242
except UnicodeEncodeError:
246243
fname = b''
247-
flags = 0
248-
if fname:
249-
flags = FNAME
250-
self.fileobj.write(chr(flags).encode('latin-1'))
251-
mtime = self._write_mtime
252-
if mtime is None:
253-
mtime = time.time()
254-
write32u(self.fileobj, int(mtime))
255-
if compresslevel == _COMPRESS_LEVEL_BEST:
256-
xfl = b'\002'
257-
elif compresslevel == _COMPRESS_LEVEL_FAST:
258-
xfl = b'\004'
259-
else:
260-
xfl = b'\000'
261-
self.fileobj.write(xfl)
262-
self.fileobj.write(b'\377')
263-
if fname:
264-
self.fileobj.write(fname + b'\000')
244+
return fname
265245

266246
def write(self,data):
267247
self._check_not_closed()
@@ -282,7 +262,6 @@ def write(self,data):
282262
if length > 0:
283263
self.fileobj.write(self.compress.compress(data))
284264
self.size += length
285-
self.crc = zlib.crc32(data, self.crc)
286265
self.offset += length
287266

288267
return length
@@ -326,9 +305,6 @@ def close(self):
326305
try:
327306
if self.mode == WRITE:
328307
fileobj.write(self.compress.flush())
329-
write32u(fileobj, self.crc)
330-
# self.size may exceed 2 GiB, or even 4 GiB
331-
write32u(fileobj, self.size & 0xffffffff)
332308
elif self.mode == READ:
333309
self._buffer.close()
334310
finally:
@@ -409,62 +385,17 @@ def _read_exact(fp, n):
409385
return data
410386

411387

412-
def _read_gzip_header(fp):
413-
'''Read a gzip header from `fp` and progress to the end of the header.
414-
415-
Returns last mtime if header was present or None otherwise.
416-
'''
417-
magic = fp.read(2)
418-
if magic == b'':
419-
return None
420-
421-
if magic != b'\037\213':
422-
raise BadGzipFile('Not a gzipped file (%r)' % magic)
423-
424-
(method, flag, last_mtime) = struct.unpack("<BBIxx", _read_exact(fp, 8))
425-
if method != 8:
426-
raise BadGzipFile('Unknown compression method')
427-
428-
if flag & FEXTRA:
429-
# Read & discard the extra field, if present
430-
extra_len, = struct.unpack("<H", _read_exact(fp, 2))
431-
_read_exact(fp, extra_len)
432-
if flag & FNAME:
433-
# Read and discard a null-terminated string containing the filename
434-
while True:
435-
s = fp.read(1)
436-
if not s or s==b'\000':
437-
break
438-
if flag & FCOMMENT:
439-
# Read and discard a null-terminated string containing a comment
440-
while True:
441-
s = fp.read(1)
442-
if not s or s==b'\000':
443-
break
444-
if flag & FHCRC:
445-
_read_exact(fp, 2) # Read & discard the 16-bit header CRC
446-
return last_mtime
447-
448-
449388
class _GzipReader(_compression.DecompressReader):
450389
def __init__(self, fp):
451390
super().__init__(_PaddedFile(fp), zlib._ZlibDecompressor,
452-
wbits=-zlib.MAX_WBITS)
391+
wbits=16 + zlib.MAX_WBITS)
453392
# Set flag indicating start of a new member
454393
self._new_member = True
455394
self._last_mtime = None
456395

457396
def _init_read(self):
458-
self._crc = zlib.crc32(b"")
459397
self._stream_size = 0 # Decompressed size of unconcatenated stream
460398

461-
def _read_gzip_header(self):
462-
last_mtime = _read_gzip_header(self._fp)
463-
if last_mtime is None:
464-
return False
465-
self._last_mtime = last_mtime
466-
return True
467-
468399
def read(self, size=-1):
469400
if size < 0:
470401
return self.readall()
@@ -478,33 +409,35 @@ def read(self, size=-1):
478409
while True:
479410
if self._decompressor.eof:
480411
# Ending case: we've come to the end of a member in the file,
481-
# so finish up this member, and read a new gzip header.
482-
# Check the CRC and file size, and set the flag so we read
483-
# a new member
412+
# so finish up this member and set the flag, so that we read a
413+
# new member
484414
self._read_eof()
485415
self._new_member = True
486416
self._decompressor = self._decomp_factory(
487417
**self._decomp_args)
488418

489-
if self._new_member:
490-
# If the _new_member flag is set, we have to
491-
# jump to the next member, if there is one.
492-
self._init_read()
493-
if not self._read_gzip_header():
494-
self._size = self._pos
495-
return b""
496-
self._new_member = False
497-
498419
# Read a chunk of data from the file
499420
if self._decompressor.needs_input:
500421
buf = self._fp.read(READ_BUFFER_SIZE)
422+
if self._new_member:
423+
# If the _new_member flag is set, we have to
424+
# jump to the next member, if there is one.
425+
self._init_read()
426+
if not buf:
427+
self._size = self._pos
428+
return b""
429+
self._new_member = False
501430
uncompress = self._decompressor.decompress(buf, size)
502431
else:
432+
assert not self._new_member
503433
uncompress = self._decompressor.decompress(b"", size)
504434

435+
if self._decompressor.gz_header_done:
436+
self._last_mtime = self._decompressor.gz_header_mtime
437+
505438
if self._decompressor.unused_data != b"":
506439
# Prepend the already read bytes to the fileobj so they can
507-
# be seen by _read_eof() and _read_gzip_header()
440+
# be seen by _read_eof()
508441
self._fp.prepend(self._decompressor.unused_data)
509442

510443
if uncompress != b"":
@@ -513,23 +446,12 @@ def read(self, size=-1):
513446
raise EOFError("Compressed file ended before the "
514447
"end-of-stream marker was reached")
515448

516-
self._crc = zlib.crc32(uncompress, self._crc)
517449
self._stream_size += len(uncompress)
518450
self._pos += len(uncompress)
519451
return uncompress
520452

521453
def _read_eof(self):
522454
# We've read to the end of the file
523-
# We check that the computed CRC and size of the
524-
# uncompressed data matches the stored values. Note that the size
525-
# stored is the true file size mod 2**32.
526-
crc32, isize = struct.unpack("<II", _read_exact(self._fp, 8))
527-
if crc32 != self._crc:
528-
raise BadGzipFile("CRC check failed %s != %s" % (hex(crc32),
529-
hex(self._crc)))
530-
elif isize != (self._stream_size & 0xffffffff):
531-
raise BadGzipFile("Incorrect length of data produced")
532-
533455
# Gzip files can be padded with zeroes and still have archives.
534456
# Consume all zero bytes and set the file position to the first
535457
# non-zero byte. See http://www.gzip.org/#faq8
@@ -544,68 +466,32 @@ def _rewind(self):
544466
self._new_member = True
545467

546468

547-
def _create_simple_gzip_header(compresslevel: int,
548-
mtime = None) -> bytes:
549-
"""
550-
Write a simple gzip header with no extra fields.
551-
:param compresslevel: Compresslevel used to determine the xfl bytes.
552-
:param mtime: The mtime (must support conversion to a 32-bit integer).
553-
:return: A bytes object representing the gzip header.
554-
"""
555-
if mtime is None:
556-
mtime = time.time()
557-
if compresslevel == _COMPRESS_LEVEL_BEST:
558-
xfl = 2
559-
elif compresslevel == _COMPRESS_LEVEL_FAST:
560-
xfl = 4
561-
else:
562-
xfl = 0
563-
# Pack ID1 and ID2 magic bytes, method (8=deflate), header flags (no extra
564-
# fields added to header), mtime, xfl and os (255 for unknown OS).
565-
return struct.pack("<BBBBLBB", 0x1f, 0x8b, 8, 0, int(mtime), xfl, 255)
566-
567-
568469
def compress(data, compresslevel=_COMPRESS_LEVEL_BEST, *, mtime=None):
569470
"""Compress data in one shot and return the compressed string.
570471
571472
compresslevel sets the compression level in range of 0-9.
572473
mtime can be used to set the modification time. The modification time is
573474
set to the current time by default.
574475
"""
575-
if mtime == 0:
576-
# Use zlib as it creates the header with 0 mtime by default.
577-
# This is faster and with less overhead.
578-
return zlib.compress(data, level=compresslevel, wbits=31)
579-
header = _create_simple_gzip_header(compresslevel, mtime)
580-
trailer = struct.pack("<LL", zlib.crc32(data), (len(data) & 0xffffffff))
581-
# Wbits=-15 creates a raw deflate block.
582-
return (header + zlib.compress(data, level=compresslevel, wbits=-15) +
583-
trailer)
476+
if mtime is None:
477+
mtime = int(time.time())
478+
return zlib.compress(data, level=compresslevel, wbits=31, mtime=mtime)
584479

585480

586481
def decompress(data):
587482
"""Decompress a gzip compressed string in one shot.
588483
Return the decompressed string.
589484
"""
590485
decompressed_members = []
591-
while True:
592-
fp = io.BytesIO(data)
593-
if _read_gzip_header(fp) is None:
594-
return b"".join(decompressed_members)
595-
# Use a zlib raw deflate compressor
596-
do = zlib.decompressobj(wbits=-zlib.MAX_WBITS)
597-
# Read all the data except the header
598-
decompressed = do.decompress(data[fp.tell():])
599-
if not do.eof or len(do.unused_data) < 8:
486+
while data:
487+
do = zlib.decompressobj(wbits=16 + zlib.MAX_WBITS)
488+
decompressed = do.decompress(data)
489+
if not do.eof:
600490
raise EOFError("Compressed file ended before the end-of-stream "
601491
"marker was reached")
602-
crc, length = struct.unpack("<II", do.unused_data[:8])
603-
if crc != zlib.crc32(decompressed):
604-
raise BadGzipFile("CRC check failed")
605-
if length != (len(decompressed) & 0xffffffff):
606-
raise BadGzipFile("Incorrect length of data produced")
607492
decompressed_members.append(decompressed)
608-
data = do.unused_data[8:].lstrip(b"\x00")
493+
data = do.unused_data.lstrip(b"\x00")
494+
return b"".join(decompressed_members)
609495

610496

611497
def main():

0 commit comments

Comments
 (0)