diff --git a/Lib/tarfile.py b/Lib/tarfile.py
index b5b28cff419a71..d96d35524c6f54 100644
--- a/Lib/tarfile.py
+++ b/Lib/tarfile.py
@@ -380,7 +380,6 @@ def __init__(self, name, mode, comptype, fileobj, bufsize,
                 except ImportError:
                     raise CompressionError("bz2 module is not available") from None
                 if mode == "r":
-                    self.dbuf = b""
                     self.cmp = bz2.BZ2Decompressor()
                     self.exception = OSError
                 else:
@@ -392,7 +391,6 @@ def __init__(self, name, mode, comptype, fileobj, bufsize,
                 except ImportError:
                     raise CompressionError("lzma module is not available") from None
                 if mode == "r":
-                    self.dbuf = b""
                     self.cmp = lzma.LZMADecompressor()
                     self.exception = lzma.LZMAError
                 else:
@@ -403,7 +401,6 @@ def __init__(self, name, mode, comptype, fileobj, bufsize,
                 except ImportError:
                     raise CompressionError("compression.zstd module is not available") from None
                 if mode == "r":
-                    self.dbuf = b""
                     self.cmp = zstd.ZstdDecompressor()
                     self.exception = zstd.ZstdError
                 else:
@@ -485,7 +482,6 @@ def _init_read_gz(self):
         """Initialize for reading a gzip compressed fileobj.
         """
         self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS)
-        self.dbuf = b""
 
         # taken from gzip.GzipFile with some alterations
         if self.__read(2) != b"\037\213":
@@ -543,26 +539,52 @@ def _read(self, size):
         if self.comptype == "tar":
             return self.__read(size)
 
-        c = len(self.dbuf)
-        t = [self.dbuf]
+        c = 0
+        t = []
         while c < size:
-            # Skip underlying buffer to avoid unaligned double buffering.
-            if self.buf:
-                buf = self.buf
-                self.buf = b""
+            if self.comptype == "gz":
+                # zlib's interface differs from the other decompressors:
+                # input it did not process is left in unconsumed_tail.
+                if self.buf:
+                    cbuf = self.buf
+                    self.buf = b""
+                else:
+                    cbuf = self.fileobj.read(self.bufsize)
+                    if not cbuf:
+                        break
+
+                try:
+                    dbuf = self.cmp.decompress(cbuf, size - c)
+                    self.buf = self.cmp.unconsumed_tail
+                except self.exception as e:
+                    raise ReadError("invalid compressed data") from e
             else:
-                buf = self.fileobj.read(self.bufsize)
-                if not buf:
-                    break
-            try:
-                buf = self.cmp.decompress(buf)
-            except self.exception as e:
-                raise ReadError("invalid compressed data") from e
-            t.append(buf)
-            c += len(buf)
+                # Other decompressors have needs_input.
+                # decompress() can buffer data internally.
+                if self.cmp.eof:
+                    # End of the compressed stream: decompress() would
+                    # raise EOFError, so return a short read instead.
+                    break
+                if self.cmp.needs_input:
+                    cbuf = self.fileobj.read(self.bufsize)
+                    if not cbuf:
+                        break
+                else:
+                    cbuf = b""
+
+                try:
+                    dbuf = self.cmp.decompress(cbuf, size - c)
+                except self.exception as e:
+                    raise ReadError("invalid compressed data") from e
+
+            t.append(dbuf)
+            c += len(dbuf)
+
         t = b"".join(t)
-        self.dbuf = t[size:]
-        return t[:size]
+        if len(t) > size:
+            # This would only happen if decompress() has a bug.
+            raise ReadError("decompress() returned too much data")
+        return t
 
     def __read(self, size):
         """Return size bytes from stream. If internal buffer is empty,
diff --git a/Misc/ACKS b/Misc/ACKS
index 234d0d2d0a2a16..14f0db7549534b 100644
--- a/Misc/ACKS
+++ b/Misc/ACKS
@@ -144,6 +144,7 @@ Bas van Beek
 Ian Beer
 Stefan Behnel
 Reimer Behrends
+Tomi Belan
 Maxime Bélanger
 Ben Bell
 Thomas Bellman
diff --git a/Misc/NEWS.d/next/Library/2024-07-02-20-57-43.gh-issue-121109.Tp6R2s.rst b/Misc/NEWS.d/next/Library/2024-07-02-20-57-43.gh-issue-121109.Tp6R2s.rst
new file mode 100644
index 00000000000000..eca6014e4a0aed
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2024-07-02-20-57-43.gh-issue-121109.Tp6R2s.rst
@@ -0,0 +1,2 @@
+Fix :mod:`tarfile` performance issue when reading archives in streaming mode
+(e.g. ``r|*``).