From 24110fbb9830695074cade73058d91f18f242b31 Mon Sep 17 00:00:00 2001 From: Tomi Belan Date: Tue, 2 Jul 2024 22:41:22 +0200 Subject: [PATCH 1/3] Fix performance of tarfile reading with "r|*" --- Lib/tarfile.py | 18 ++++++++++-------- Misc/ACKS | 1 + ...4-07-02-20-57-43.gh-issue-121109.Tp6R2s.rst | 2 ++ 3 files changed, 13 insertions(+), 8 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2024-07-02-20-57-43.gh-issue-121109.Tp6R2s.rst diff --git a/Lib/tarfile.py b/Lib/tarfile.py index b5b28cff419a71..043d0cc4239ee3 100644 --- a/Lib/tarfile.py +++ b/Lib/tarfile.py @@ -380,7 +380,6 @@ def __init__(self, name, mode, comptype, fileobj, bufsize, except ImportError: raise CompressionError("bz2 module is not available") from None if mode == "r": - self.dbuf = b"" self.cmp = bz2.BZ2Decompressor() self.exception = OSError else: @@ -392,7 +391,6 @@ def __init__(self, name, mode, comptype, fileobj, bufsize, except ImportError: raise CompressionError("lzma module is not available") from None if mode == "r": - self.dbuf = b"" self.cmp = lzma.LZMADecompressor() self.exception = lzma.LZMAError else: @@ -485,7 +483,6 @@ def _init_read_gz(self): """Initialize for reading a gzip compressed fileobj. """ self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS) - self.dbuf = b"" # taken from gzip.GzipFile with some alterations if self.__read(2) != b"\037\213": @@ -543,26 +540,31 @@ def _read(self, size): if self.comptype == "tar": return self.__read(size) - c = len(self.dbuf) - t = [self.dbuf] + c = 0 + t = [] while c < size: # Skip underlying buffer to avoid unaligned double buffering. 
if self.buf: buf = self.buf self.buf = b"" + elif self.comptype != "gz" and not self.cmp.needs_input: + buf = b"" else: buf = self.fileobj.read(self.bufsize) if not buf: break try: - buf = self.cmp.decompress(buf) + buf = self.cmp.decompress(buf, size - c) + if self.comptype == "gz": + self.buf = self.cmp.unconsumed_tail except self.exception as e: raise ReadError("invalid compressed data") from e t.append(buf) c += len(buf) t = b"".join(t) - self.dbuf = t[size:] - return t[:size] + if len(t) > size: + raise ReadError("decompress() returned too much data") + return t def __read(self, size): """Return size bytes from stream. If internal buffer is empty, diff --git a/Misc/ACKS b/Misc/ACKS index 234d0d2d0a2a16..14f0db7549534b 100644 --- a/Misc/ACKS +++ b/Misc/ACKS @@ -144,6 +144,7 @@ Bas van Beek Ian Beer Stefan Behnel Reimer Behrends +Tomi Belan Maxime Bélanger Ben Bell Thomas Bellman diff --git a/Misc/NEWS.d/next/Library/2024-07-02-20-57-43.gh-issue-121109.Tp6R2s.rst b/Misc/NEWS.d/next/Library/2024-07-02-20-57-43.gh-issue-121109.Tp6R2s.rst new file mode 100644 index 00000000000000..eca6014e4a0aed --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-07-02-20-57-43.gh-issue-121109.Tp6R2s.rst @@ -0,0 +1,2 @@ +Fix :mod:`tarfile` performance issue when reading archives in streaming mode +(e.g. ``r|*``). 
From 42a7a3d41b119302bd8c178b88493d03c76e5da0 Mon Sep 17 00:00:00 2001 From: Tomi Belan Date: Sat, 23 May 2026 21:19:29 +0200 Subject: [PATCH 2/3] Merge zstd additions --- Lib/tarfile.py | 1 - 1 file changed, 1 deletion(-) diff --git a/Lib/tarfile.py b/Lib/tarfile.py index 043d0cc4239ee3..3173fcbdcfaf08 100644 --- a/Lib/tarfile.py +++ b/Lib/tarfile.py @@ -401,7 +401,6 @@ def __init__(self, name, mode, comptype, fileobj, bufsize, except ImportError: raise CompressionError("compression.zstd module is not available") from None if mode == "r": - self.dbuf = b"" self.cmp = zstd.ZstdDecompressor() self.exception = zstd.ZstdError else: From e61bccf04fc2367bc93e4dd427b8c99ef138393f Mon Sep 17 00:00:00 2001 From: Tomi Belan Date: Sat, 23 May 2026 23:34:18 +0200 Subject: [PATCH 3/3] Refactor by splitting gzip and non-gzip branch --- Lib/tarfile.py | 51 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 34 insertions(+), 17 deletions(-) diff --git a/Lib/tarfile.py b/Lib/tarfile.py index 3173fcbdcfaf08..d96d35524c6f54 100644 --- a/Lib/tarfile.py +++ b/Lib/tarfile.py @@ -542,26 +542,43 @@ def _read(self, size): c = 0 t = [] while c < size: - # Skip underlying buffer to avoid unaligned double buffering. - if self.buf: - buf = self.buf - self.buf = b"" - elif self.comptype != "gz" and not self.cmp.needs_input: - buf = b"" - else: - buf = self.fileobj.read(self.bufsize) - if not buf: - break - try: - buf = self.cmp.decompress(buf, size - c) - if self.comptype == "gz": + if self.comptype == "gz": + # The zlib interface differs from the others: input it did not + # consume (because of max_length) is left in unconsumed_tail. 
+ if self.buf: + cbuf = self.buf + self.buf = b"" + else: + cbuf = self.fileobj.read(self.bufsize) + if not cbuf: + break + + try: + dbuf = self.cmp.decompress(cbuf, size - c) self.buf = self.cmp.unconsumed_tail - except self.exception as e: - raise ReadError("invalid compressed data") from e - t.append(buf) - c += len(buf) + except self.exception as e: + raise ReadError("invalid compressed data") from e + else: + # Other decompressors have needs_input. + # decompress() can buffer data internally. + if self.cmp.needs_input: + cbuf = self.fileobj.read(self.bufsize) + if not cbuf: + break + else: + cbuf = b"" + + try: + dbuf = self.cmp.decompress(cbuf, size - c) + except self.exception as e: + raise ReadError("invalid compressed data") from e + + t.append(dbuf) + c += len(dbuf) + t = b"".join(t) if len(t) > size: + # This would only happen if decompress() has a bug. raise ReadError("decompress() returned too much data") return t