def cells(path: str | None = None) -> SpatialData:
    """
    Cells dataset.

    Download the ``cells`` example dataset and load it as a :class:`~spatialdata.SpatialData`
    object. The download is hash-verified and cached, so repeated calls reuse the local copy
    instead of downloading again.

    Parameters
    ----------
    path
        Directory in which to cache the downloaded data. A leading ``~`` is expanded to the
        user's home directory. If `None`, the default OS cache location is used
        (:func:`pooch.os_cache` for ``"spatialdata"``).

    Returns
    -------
    SpatialData object with the cells dataset.
    """
    import importlib.resources
    from pathlib import Path

    import pooch
    from scverse_misc.datasets import fetch, parse_registry

    if path is None:
        cache_dir = Path(pooch.os_cache("spatialdata"))
    else:
        # ``Path`` never expands "~" by itself; without this, "~/data" would name a literal
        # "~" directory relative to the current working directory.
        cache_dir = Path(path).expanduser()

    # The registry is shipped as package data next to this module; ``as_file`` hands
    # ``parse_registry`` a concrete filesystem path for the duration of the ``with`` block.
    registry = importlib.resources.files("spatialdata").joinpath("datasets.yaml")
    with importlib.resources.as_file(registry) as registry_path:
        base_url, datasets = parse_registry(registry_path)

    # Only the parsed values are needed from here on, so the fetch runs outside the context.
    sdata: SpatialData = fetch(datasets["cells"], cache_dir, base_url=base_url)
    return sdata
def test_cells_registry() -> None:
    # Offline check: the registry bundled with the package can be parsed and its
    # ``cells`` entry carries the expected type, archive name, checksum and URL.
    from scverse_misc.datasets import parse_registry

    packaged_yaml = importlib.resources.files("spatialdata").joinpath("datasets.yaml")
    with importlib.resources.as_file(packaged_yaml) as yaml_path:
        root_url, registry_entries = parse_registry(yaml_path)

    assert root_url == "https://exampledata.scverse.org/spatialdata/"

    cells_entry = registry_entries["cells"]
    assert cells_entry.type == "spatialdata"

    archive = cells_entry.file(name="cells.zip")
    assert archive.sha256 == "dc9613cb9e16fd2cd8d83f3a9586eeda4af5ba8ba366f1066efb51305820c5fb"
    assert archive.resolve_url(root_url) == "https://exampledata.scverse.org/spatialdata/cells.zip"


@pytest.mark.slow
def test_cells_download(tmp_path) -> None:
    # Real download (~3 MB) from the scverse example data bucket into a throwaway
    # cache directory; deselect with `-m "not slow"`.
    cache_dir = str(tmp_path)
    loaded = cells(path=cache_dir)
    assert isinstance(loaded, SpatialData)