From 44738a0f8d8e8eb71a0ab3020b68759076430c77 Mon Sep 17 00:00:00 2001 From: Jason Raitz Date: Fri, 22 May 2026 11:35:16 -0400 Subject: [PATCH 1/7] AP-687 - given a UTC modified datetime string for the requested file, we now check if the file exists locally and has a newer mtime --- CHANGELOG.md | 11 +++++++++++ tind_client/client.py | 23 ++++++++++++++++++++--- 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2436a3c..fd7b2b7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,17 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [Unreleased (Targeting 0.2.5)] + +### Added +- Optional parameter to `fetch_file()` with a modified time of the remote file pulled from the TIND API +- `fetch_file()` uses this to avoid unnecessary downloads if a file already exists at the target + location and has a modified time that is newer than the requested file + +### Changed +- slight change to raise a file not downloaded error if `tind_download()` fails to return a written file path + + ## [0.2.4] ### Added diff --git a/tind_client/client.py b/tind_client/client.py index 77b9883..b3da18f 100644 --- a/tind_client/client.py +++ b/tind_client/client.py @@ -3,12 +3,14 @@ """ import json +import logging import os import re from io import StringIO from pathlib import Path from typing import Any, Iterator import xml.etree.ElementTree as E +from datetime import datetime, timezone from pymarc import Record from pymarc.marcxml import parse_xml_to_array @@ -16,6 +18,7 @@ from .api import tind_get, tind_download from .errors import RecordNotFoundError, TINDError +logger = logging.getLogger(__name__) NS = "http://www.loc.gov/MARC21/slim" E.register_namespace("", NS) @@ -69,12 +72,15 @@ def fetch_metadata(self, record: str) -> Record: return records[0] - def fetch_file(self, file_url: str, output_dir: str = "") -> str: - """Download a file from TIND and save it locally. + def fetch_file(self, file_url: str, output_dir: str = "", modified: str = "") -> str: + """Download a file from TIND and save it locally. If the file already exists in the output + directory and has a local modified timestamp that is newer than supplied ``modified`` + timestamp, the file will not be re-downloaded. :param str file_url: The TIND file download URL. :param str output_dir: Directory in which to save the file. Falls back to ``default_storage_dir`` when empty. + :param str modified: Optional modified timestamp from the file metadata returned by TIND :raises AuthorizationError: When the TIND API key is invalid or the file is restricted. :raises ValueError: When ``file_url`` is not a valid TIND file download URL. :raises RecordNotFoundError: When the file is invalid or not found. @@ -84,9 +90,20 @@ def fetch_file(self, file_url: str, output_dir: str = "") -> str: raise ValueError("URL is not a valid TIND file download URL.") output_target = output_dir or self.default_storage_dir + + expected_filename = file_url.rstrip("/").split("/")[-2] + expected_path = Path(output_target) / expected_filename + + if modified and expected_path.exists(): + meta_mtime = datetime.fromisoformat(modified).replace(tzinfo=timezone.utc) + local_mtime = datetime.fromtimestamp(expected_path.stat().st_mtime, tz=timezone.utc) + if local_mtime >= meta_mtime: + logger.debug("Cached file at (%s) is newer; skipping download.", expected_path) + return str(expected_path) + (status, saved_to) = tind_download(file_url, output_dir=output_target, api_key=self.api_key) - if status != 200: + if status != 200 or not saved_to: raise RecordNotFoundError("Referenced file could not be downloaded.") return saved_to From 4ce7ca22a2bad3488431734f7abbe8239935e122 Mon Sep 17 00:00:00 2001 From: Jason Raitz <145712254+jason-raitz@users.noreply.github.com> Date: Tue, 26 May 2026 11:54:24 -0400 Subject: [PATCH 2/7] Update tind_client/client.py Co-authored-by: Anna Wilcox --- tind_client/client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tind_client/client.py b/tind_client/client.py index b3da18f..0e680a4 100644 --- a/tind_client/client.py +++ b/tind_client/client.py @@ -6,11 +6,11 @@ import logging import os import re +from datetime import datetime, timezone from io import StringIO from pathlib import Path from typing import Any, Iterator import xml.etree.ElementTree as E -from datetime import datetime, timezone from pymarc import Record from pymarc.marcxml import parse_xml_to_array From 5d5f9a997e5b5953268636c9461c4d944e231aa7 Mon Sep 17 00:00:00 2001 From: Jason Raitz <145712254+jason-raitz@users.noreply.github.com> Date: Tue, 26 May 2026 11:55:59 -0400 Subject: [PATCH 3/7] Apply suggestions from code review Co-authored-by: Anna Wilcox --- tind_client/client.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tind_client/client.py b/tind_client/client.py index 0e680a4..ca86eb7 100644 --- a/tind_client/client.py +++ b/tind_client/client.py @@ -73,14 +73,16 @@ def fetch_metadata(self, record: str) -> Record: return records[0] def fetch_file(self, file_url: str, output_dir: str = "", modified: str = "") -> str: - """Download a file from TIND and save it locally. If the file already exists in the output - directory and has a local modified timestamp that is newer than supplied ``modified`` + """Download a file from TIND and save it locally. + + If the file already exists in the output directory and was modified at or after a supplied ``modified`` timestamp, the file will not be re-downloaded. :param str file_url: The TIND file download URL. :param str output_dir: Directory in which to save the file. Falls back to ``default_storage_dir`` when empty. - :param str modified: Optional modified timestamp from the file metadata returned by TIND + :param str modified: Optional modified timestamp from the file metadata returned by TIND. + If not specified, the file will always be downloaded. :raises AuthorizationError: When the TIND API key is invalid or the file is restricted. :raises ValueError: When ``file_url`` is not a valid TIND file download URL. :raises RecordNotFoundError: When the file is invalid or not found. From 651654e9bbb750531c760a4d12833ad9320b46fa Mon Sep 17 00:00:00 2001 From: Jason Raitz Date: Tue, 26 May 2026 12:00:03 -0400 Subject: [PATCH 4/7] line too long --- tind_client/client.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tind_client/client.py b/tind_client/client.py index ca86eb7..e973f62 100644 --- a/tind_client/client.py +++ b/tind_client/client.py @@ -74,9 +74,9 @@ def fetch_metadata(self, record: str) -> Record: def fetch_file(self, file_url: str, output_dir: str = "", modified: str = "") -> str: """Download a file from TIND and save it locally. - - If the file already exists in the output directory and was modified at or after a supplied ``modified`` - timestamp, the file will not be re-downloaded. + + If the file already exists in the output directory and was modified at or after a supplied + ``modified`` timestamp, the file will not be re-downloaded. :param str file_url: The TIND file download URL. :param str output_dir: Directory in which to save the file. From 823111eedfd857c73c01c007ca3b7fd6c084f3ac Mon Sep 17 00:00:00 2001 From: Jason Raitz Date: Wed, 27 May 2026 10:47:18 -0400 Subject: [PATCH 5/7] changes from code reviews --- CHANGELOG.md | 9 +++---- tests/test_fetch.py | 55 +++++++++++++++++++++++++++++++++++++++++++ tind_client/client.py | 13 +++++----- 3 files changed, 67 insertions(+), 10 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fd7b2b7..83c1a3d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,12 +8,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased (Targeting 0.2.5)] ### Added -- Optional parameter to `fetch_file()` with a modified time of the remote file pulled from the TIND API -- `fetch_file()` uses this to avoid unnecessary downloads if a file already exists at the target - location and has a modified time that is newer than the requested file +- Optional parameter to `fetch_file()` with a modified time of the remote file pulled from the TIND API. + `fetch_file()` uses this to avoid unnecessary downloads if a file already exists at the target + location and has a modified time that is newer than the requested file. +- Added tests for this new behavior. ### Changed -- slight change to raise a file not downloaded error if `tind_download()` fails to return a written file path +- slight change to raise a file not downloaded error if `tind_download()` fails to return a written file path. ## [0.2.4] diff --git a/tests/test_fetch.py b/tests/test_fetch.py index 768725b..fcdb434 100644 --- a/tests/test_fetch.py +++ b/tests/test_fetch.py @@ -5,7 +5,9 @@ import json import xml.etree.ElementTree as E +from datetime import datetime, timezone from pathlib import Path +from unittest.mock import MagicMock, patch import pytest import requests_mock as req_mock # noqa: F401 — activates the requests_mock fixture @@ -87,6 +89,59 @@ def test_fetch_file_not_found( client.fetch_file(url, output_dir=str(tmp_path)) +def test_fetch_file_skips_download_when_local_file_is_newer( + tmp_path: Path, + client: TINDClient, +) -> None: + """fetch_file returns the cached path without downloading when local mtime >= meta_mtime.""" + + url = f"{BASE_URL}/api/v1/record/12345/files/some-image.jpg/download/?version=1" + cached = tmp_path / "some-image.jpg" + cached.write_bytes(b"cached content") + + meta_mtime = datetime(2026, 1, 1, 0, 0, 0, tzinfo=timezone.utc) + local_mtime = datetime(2026, 1, 3, 0, 0, 0, tzinfo=timezone.utc) # newer + + mock_stat = MagicMock() + mock_stat.st_mtime = local_mtime.timestamp() + + with patch.object(Path, "stat", return_value=mock_stat): + result = client.fetch_file(url, output_dir=str(tmp_path), meta_mtime=meta_mtime) + + assert result == str(cached) + + +def test_fetch_file_redownloads_when_local_file_is_older( + requests_mock: req_mock.Mocker, + tmp_path: Path, + client: TINDClient, +) -> None: + """fetch_file re-downloads when local mtime < meta_mtime.""" + + url = f"{BASE_URL}/api/v1/record/12345/files/some-other-image.jpg/download/?version=1" + cached = tmp_path / "some-other-image.jpg" + cached.write_bytes(b"stale content") + + meta_mtime = datetime(2026, 1, 3, 0, 0, 0, tzinfo=timezone.utc) + local_mtime = datetime(2026, 1, 1, 0, 0, 0, tzinfo=timezone.utc) # older + + mock_stat = MagicMock() + mock_stat.st_mtime = local_mtime.timestamp() + + requests_mock.get( + url, + content=b"fresh content", + status_code=200, + headers={"Content-Disposition": 'attachment; filename="some-other-image.jpg"'}, + ) + + with patch.object(Path, "stat", return_value=mock_stat): + result = client.fetch_file(url, output_dir=str(tmp_path), meta_mtime=meta_mtime) + + assert result.endswith("some-other-image.jpg") + assert Path(result).read_bytes() == b"fresh content" + + # --------------------------------------------------------------------------- # fetch_file_metadata # --------------------------------------------------------------------------- diff --git a/tind_client/client.py b/tind_client/client.py index e973f62..a5a9a36 100644 --- a/tind_client/client.py +++ b/tind_client/client.py @@ -72,17 +72,19 @@ def fetch_metadata(self, record: str) -> Record: return records[0] - def fetch_file(self, file_url: str, output_dir: str = "", modified: str = "") -> str: + def fetch_file( + self, file_url: str, output_dir: str = "", meta_mtime: datetime | None = None + ) -> str: """Download a file from TIND and save it locally. If the file already exists in the output directory and was modified at or after a supplied - ``modified`` timestamp, the file will not be re-downloaded. + ``meta_mtime`` timestamp, the file will not be re-downloaded. :param str file_url: The TIND file download URL. :param str output_dir: Directory in which to save the file. Falls back to ``default_storage_dir`` when empty. - :param str modified: Optional modified timestamp from the file metadata returned by TIND. - If not specified, the file will always be downloaded. + :param datetime meta_mtime: Optional modified timestamp from the file metadata returned by + TIND. If not specified, the file will always be downloaded. :raises AuthorizationError: When the TIND API key is invalid or the file is restricted. :raises ValueError: When ``file_url`` is not a valid TIND file download URL. :raises RecordNotFoundError: When the file is invalid or not found. @@ -96,8 +98,7 @@ def fetch_file(self, file_url: str, output_dir: str = "", modified: str = "") -> expected_filename = file_url.rstrip("/").split("/")[-2] expected_path = Path(output_target) / expected_filename - if modified and expected_path.exists(): - meta_mtime = datetime.fromisoformat(modified).replace(tzinfo=timezone.utc) + if meta_mtime and expected_path.exists(): local_mtime = datetime.fromtimestamp(expected_path.stat().st_mtime, tz=timezone.utc) if local_mtime >= meta_mtime: logger.debug("Cached file at (%s) is newer; skipping download.", expected_path) From 3ec2ff6fa0fbea7e76cda2f0118eb632916df694 Mon Sep 17 00:00:00 2001 From: Jason Raitz Date: Wed, 27 May 2026 11:15:42 -0400 Subject: [PATCH 6/7] fixed url filename parsing --- CHANGELOG.md | 6 +++++- tind_client/api.py | 2 +- tind_client/client.py | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 83c1a3d..676bbe8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,7 +14,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added tests for this new behavior. ### Changed -- slight change to raise a file not downloaded error if `tind_download()` fails to return a written file path. +- Raises a file not downloaded error if `tind_download()` fails to return a written file path. + +### Fixed +- Found a bug in the `fetch_file()` method where the "backup" filename parsing logic would always return + `download` as the filename. ## [0.2.4] diff --git a/tind_client/api.py b/tind_client/api.py index 2d7105f..1212101 100644 --- a/tind_client/api.py +++ b/tind_client/api.py @@ -87,7 +87,7 @@ def tind_download(url: str, output_dir: str, api_key: str) -> Tuple[int, str]: return status, "" # Fall-back to the file name in the URL if it isn't included in the response. - output_filename = url.rstrip("/").split("/")[-2] + output_filename = url.split("?")[0].rstrip("/").split("/")[-2] # See if we can extract the filename from the response headers. if "Content-Disposition" in resp.headers: diff --git a/tind_client/client.py b/tind_client/client.py index a5a9a36..7b2c0f0 100644 --- a/tind_client/client.py +++ b/tind_client/client.py @@ -95,7 +95,7 @@ def fetch_file( output_target = output_dir or self.default_storage_dir - expected_filename = file_url.rstrip("/").split("/")[-2] + expected_filename = file_url.split("?")[0].rstrip("/").split("/")[-2] expected_path = Path(output_target) / expected_filename if meta_mtime and expected_path.exists(): From 7e18eb9797b1d543a03cd0bf6e79d239675c47fa Mon Sep 17 00:00:00 2001 From: Jason Raitz Date: Wed, 27 May 2026 13:34:41 -0400 Subject: [PATCH 7/7] version bump --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 676bbe8..b6acc43 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,7 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [Unreleased (Targeting 0.2.5)] +## [0.2.5] ### Added - Optional parameter to `fetch_file()` with a modified time of the remote file pulled from the TIND API.