Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,22 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [0.2.5]

### Added
- Optional parameter to `fetch_file()` with a modified time of the remote file pulled from the TIND API.
`fetch_file()` uses this to avoid unnecessary downloads if a file already exists at the target
location and has a modified time that is newer than the requested file.
- Added tests for this new behavior.

### Changed
- Raises a file not downloaded error if `tind_download()` fails to return a written file path.

### Fixed
- Found a bug in the `fetch_file()` method where the "backup" filename parsing logic would always return
`download` as the filename.


## [0.2.4]

### Added
Expand Down
55 changes: 55 additions & 0 deletions tests/test_fetch.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@
import json
import xml.etree.ElementTree as E

from datetime import datetime, timezone
from pathlib import Path
from unittest.mock import MagicMock, patch

import pytest
import requests_mock as req_mock # noqa: F401 — activates the requests_mock fixture
Expand Down Expand Up @@ -87,6 +89,59 @@ def test_fetch_file_not_found(
client.fetch_file(url, output_dir=str(tmp_path))


def test_fetch_file_skips_download_when_local_file_is_newer(
tmp_path: Path,
client: TINDClient,
) -> None:
"""fetch_file returns the cached path without downloading when local mtime >= meta_mtime."""

url = f"{BASE_URL}/api/v1/record/12345/files/some-image.jpg/download/?version=1"
cached = tmp_path / "some-image.jpg"
cached.write_bytes(b"cached content")

meta_mtime = datetime(2026, 1, 1, 0, 0, 0, tzinfo=timezone.utc)
local_mtime = datetime(2026, 1, 3, 0, 0, 0, tzinfo=timezone.utc) # newer

mock_stat = MagicMock()
mock_stat.st_mtime = local_mtime.timestamp()

with patch.object(Path, "stat", return_value=mock_stat):
result = client.fetch_file(url, output_dir=str(tmp_path), meta_mtime=meta_mtime)

assert result == str(cached)


def test_fetch_file_redownloads_when_local_file_is_older(
requests_mock: req_mock.Mocker,
tmp_path: Path,
client: TINDClient,
) -> None:
"""fetch_file re-downloads when local mtime < meta_mtime."""

url = f"{BASE_URL}/api/v1/record/12345/files/some-other-image.jpg/download/?version=1"
cached = tmp_path / "some-other-image.jpg"
cached.write_bytes(b"stale content")

meta_mtime = datetime(2026, 1, 3, 0, 0, 0, tzinfo=timezone.utc)
local_mtime = datetime(2026, 1, 1, 0, 0, 0, tzinfo=timezone.utc) # older

mock_stat = MagicMock()
mock_stat.st_mtime = local_mtime.timestamp()

requests_mock.get(
url,
content=b"fresh content",
status_code=200,
headers={"Content-Disposition": 'attachment; filename="some-other-image.jpg"'},
)

with patch.object(Path, "stat", return_value=mock_stat):
result = client.fetch_file(url, output_dir=str(tmp_path), meta_mtime=meta_mtime)

assert result.endswith("some-other-image.jpg")
assert Path(result).read_bytes() == b"fresh content"


# ---------------------------------------------------------------------------
# fetch_file_metadata
# ---------------------------------------------------------------------------
Expand Down
2 changes: 1 addition & 1 deletion tind_client/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def tind_download(url: str, output_dir: str, api_key: str) -> Tuple[int, str]:
return status, ""

# Fall-back to the file name in the URL if it isn't included in the response.
output_filename = url.rstrip("/").split("/")[-2]
output_filename = url.split("?")[0].rstrip("/").split("/")[-2]

# See if we can extract the filename from the response headers.
if "Content-Disposition" in resp.headers:
Expand Down
24 changes: 22 additions & 2 deletions tind_client/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,10 @@
"""

import json
import logging
import os
import re
from datetime import datetime, timezone
from io import StringIO
from pathlib import Path
from typing import Any, Iterator
Expand All @@ -16,6 +18,7 @@
from .api import tind_get, tind_download
from .errors import RecordNotFoundError, TINDError

logger = logging.getLogger(__name__)

NS = "http://www.loc.gov/MARC21/slim"
E.register_namespace("", NS)
Expand Down Expand Up @@ -69,12 +72,19 @@ def fetch_metadata(self, record: str) -> Record:

return records[0]

def fetch_file(self, file_url: str, output_dir: str = "") -> str:
def fetch_file(
self, file_url: str, output_dir: str = "", meta_mtime: datetime | None = None
) -> str:
"""Download a file from TIND and save it locally.

If the file already exists in the output directory and was modified at or after a supplied
``meta_mtime`` timestamp, the file will not be re-downloaded.

:param str file_url: The TIND file download URL.
:param str output_dir: Directory in which to save the file.
Falls back to ``default_storage_dir`` when empty.
:param datetime meta_mtime: Optional modified timestamp from the file metadata returned by
TIND. If not specified, the file will always be downloaded.
:raises AuthorizationError: When the TIND API key is invalid or the file is restricted.
:raises ValueError: When ``file_url`` is not a valid TIND file download URL.
:raises RecordNotFoundError: When the file is invalid or not found.
Expand All @@ -84,9 +94,19 @@ def fetch_file(self, file_url: str, output_dir: str = "") -> str:
raise ValueError("URL is not a valid TIND file download URL.")

output_target = output_dir or self.default_storage_dir

expected_filename = file_url.split("?")[0].rstrip("/").split("/")[-2]
expected_path = Path(output_target) / expected_filename

if meta_mtime and expected_path.exists():
local_mtime = datetime.fromtimestamp(expected_path.stat().st_mtime, tz=timezone.utc)
if local_mtime >= meta_mtime:
logger.debug("Cached file at (%s) is newer; skipping download.", expected_path)
return str(expected_path)

(status, saved_to) = tind_download(file_url, output_dir=output_target, api_key=self.api_key)

if status != 200:
if status != 200 or not saved_to:
raise RecordNotFoundError("Referenced file could not be downloaded.")

return saved_to
Expand Down