diff --git a/CHANGELOG.md b/CHANGELOG.md index 2436a3c..b6acc43 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,22 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.2.5] + +### Added +- Optional parameter to `fetch_file()` with a modified time of the remote file pulled from the TIND API. + `fetch_file()` uses this to avoid unnecessary downloads if a file already exists at the target + location and has a modified time that is newer than the requested file. +- Added tests for this new behavior. + +### Changed +- Raises a file not downloaded error if `tind_download()` fails to return a written file path. + +### Fixed +- Found a bug in the `fetch_file()` method where the "backup" filename parsing logic would always return + `download` as the filename. + + ## [0.2.4] ### Added diff --git a/tests/test_fetch.py b/tests/test_fetch.py index 768725b..fcdb434 100644 --- a/tests/test_fetch.py +++ b/tests/test_fetch.py @@ -5,7 +5,9 @@ import json import xml.etree.ElementTree as E +from datetime import datetime, timezone from pathlib import Path +from unittest.mock import MagicMock, patch import pytest import requests_mock as req_mock # noqa: F401 — activates the requests_mock fixture @@ -87,6 +89,59 @@ def test_fetch_file_not_found( client.fetch_file(url, output_dir=str(tmp_path)) +def test_fetch_file_skips_download_when_local_file_is_newer( + tmp_path: Path, + client: TINDClient, +) -> None: + """fetch_file returns the cached path without downloading when local mtime >= meta_mtime.""" + + url = f"{BASE_URL}/api/v1/record/12345/files/some-image.jpg/download/?version=1" + cached = tmp_path / "some-image.jpg" + cached.write_bytes(b"cached content") + + meta_mtime = datetime(2026, 1, 1, 0, 0, 0, tzinfo=timezone.utc) + local_mtime = datetime(2026, 1, 3, 0, 0, 0, tzinfo=timezone.utc) # newer + + mock_stat = MagicMock() + mock_stat.st_mtime = local_mtime.timestamp() + + with patch.object(Path, "stat", return_value=mock_stat): + result = client.fetch_file(url, output_dir=str(tmp_path), meta_mtime=meta_mtime) + + assert result == str(cached) + + +def test_fetch_file_redownloads_when_local_file_is_older( + requests_mock: req_mock.Mocker, + tmp_path: Path, + client: TINDClient, +) -> None: + """fetch_file re-downloads when local mtime < meta_mtime.""" + + url = f"{BASE_URL}/api/v1/record/12345/files/some-other-image.jpg/download/?version=1" + cached = tmp_path / "some-other-image.jpg" + cached.write_bytes(b"stale content") + + meta_mtime = datetime(2026, 1, 3, 0, 0, 0, tzinfo=timezone.utc) + local_mtime = datetime(2026, 1, 1, 0, 0, 0, tzinfo=timezone.utc) # older + + mock_stat = MagicMock() + mock_stat.st_mtime = local_mtime.timestamp() + + requests_mock.get( + url, + content=b"fresh content", + status_code=200, + headers={"Content-Disposition": 'attachment; filename="some-other-image.jpg"'}, + ) + + with patch.object(Path, "stat", return_value=mock_stat): + result = client.fetch_file(url, output_dir=str(tmp_path), meta_mtime=meta_mtime) + + assert result.endswith("some-other-image.jpg") + assert Path(result).read_bytes() == b"fresh content" + + # --------------------------------------------------------------------------- # fetch_file_metadata # --------------------------------------------------------------------------- diff --git a/tind_client/api.py b/tind_client/api.py index 2d7105f..1212101 100644 --- a/tind_client/api.py +++ b/tind_client/api.py @@ -87,7 +87,7 @@ def tind_download(url: str, output_dir: str, api_key: str) -> Tuple[int, str]: return status, "" # Fall-back to the file name in the URL if it isn't included in the response. - output_filename = url.rstrip("/").split("/")[-2] + output_filename = url.split("?")[0].rstrip("/").split("/")[-2] # See if we can extract the filename from the response headers. if "Content-Disposition" in resp.headers: diff --git a/tind_client/client.py b/tind_client/client.py index 77b9883..7b2c0f0 100644 --- a/tind_client/client.py +++ b/tind_client/client.py @@ -3,8 +3,10 @@ """ import json +import logging import os import re +from datetime import datetime, timezone from io import StringIO from pathlib import Path from typing import Any, Iterator @@ -16,6 +18,7 @@ from .api import tind_get, tind_download from .errors import RecordNotFoundError, TINDError +logger = logging.getLogger(__name__) NS = "http://www.loc.gov/MARC21/slim" E.register_namespace("", NS) @@ -69,12 +72,19 @@ def fetch_metadata(self, record: str) -> Record: return records[0] - def fetch_file(self, file_url: str, output_dir: str = "") -> str: + def fetch_file( + self, file_url: str, output_dir: str = "", meta_mtime: datetime | None = None + ) -> str: """Download a file from TIND and save it locally. + If the file already exists in the output directory and was modified at or after a supplied + ``meta_mtime`` timestamp, the file will not be re-downloaded. + :param str file_url: The TIND file download URL. :param str output_dir: Directory in which to save the file. Falls back to ``default_storage_dir`` when empty. + :param datetime meta_mtime: Optional modified timestamp from the file metadata returned by + TIND. If not specified, the file will always be downloaded. :raises AuthorizationError: When the TIND API key is invalid or the file is restricted. :raises ValueError: When ``file_url`` is not a valid TIND file download URL. :raises RecordNotFoundError: When the file is invalid or not found. @@ -84,9 +94,19 @@ def fetch_file(self, file_url: str, output_dir: str = "") -> str: raise ValueError("URL is not a valid TIND file download URL.") output_target = output_dir or self.default_storage_dir + + expected_filename = file_url.split("?")[0].rstrip("/").split("/")[-2] + expected_path = Path(output_target) / expected_filename + + if meta_mtime and expected_path.exists(): + local_mtime = datetime.fromtimestamp(expected_path.stat().st_mtime, tz=timezone.utc) + if local_mtime >= meta_mtime: + logger.debug("Cached file at (%s) is newer; skipping download.", expected_path) + return str(expected_path) + (status, saved_to) = tind_download(file_url, output_dir=output_target, api_key=self.api_key) - if status != 200: + if status != 200 or not saved_to: raise RecordNotFoundError("Referenced file could not be downloaded.") return saved_to