Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions src/spatialdata_io/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -724,6 +724,12 @@ def visium_hd_wrapper(
default=True,
help="Whether to read cells annotations in the AnnData table. [default: True]",
)
@click.option(
"--cells-analysis",
type=bool,
default=True,
help="Whether to read the onboard secondary analysis (clustering/PCA/UMAP/diffexp) into the table. [default: True]",
)
@click.option(
"--gex-only",
type=bool,
Expand Down Expand Up @@ -762,6 +768,7 @@ def xenium_wrapper(
morphology_focus: bool = True,
aligned_images: bool = True,
cells_table: bool = True,
cells_analysis: bool = True,
gex_only: bool = True,
imread_kwargs: str = "{}",
image_models_kwargs: str = "{}",
Expand All @@ -782,6 +789,7 @@ def xenium_wrapper(
morphology_focus=morphology_focus,
aligned_images=aligned_images,
cells_table=cells_table,
cells_analysis=cells_analysis,
gex_only=gex_only,
imread_kwargs=_parse_json_param(imread_kwargs, "imread_kwargs"),
image_models_kwargs=_parse_json_param(image_models_kwargs, "image_models_kwargs"),
Expand Down
12 changes: 12 additions & 0 deletions src/spatialdata_io/_constants/_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,18 @@ class XeniumKeys(ModeEnum):
EXPLORER_SELECTION_Y = "Y"
EXPLORER_SELECTION_KEY = "Selection"

# secondary analysis (the ``analysis/`` folder: clustering / pca / umap / diffexp)
ANALYSIS_DIR = "analysis"
ANALYSIS_CLUSTERING_DIR = "clustering"
ANALYSIS_PCA_DIR = "pca"
ANALYSIS_UMAP_DIR = "umap"
ANALYSIS_DIFFEXP_DIR = "diffexp"
ANALYSIS_CLUSTERS_FILE = "clusters.csv"
ANALYSIS_PROJECTION_FILE = "projection.csv"
ANALYSIS_DIFFEXP_FILE = "differential_expression.csv"
ANALYSIS_BARCODE = "Barcode"
ANALYSIS_CLUSTER = "Cluster"


@unique
class VisiumKeys(ModeEnum):
Expand Down
102 changes: 102 additions & 0 deletions src/spatialdata_io/readers/xenium.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,7 @@ def xenium(
morphology_focus: bool = True,
aligned_images: bool = True,
cells_table: bool = True,
cells_analysis: bool = True,
n_jobs: int | None = None,
gex_only: bool = True,
imread_kwargs: Mapping[str, Any] = MappingProxyType({}),
Expand Down Expand Up @@ -325,6 +326,13 @@ def xenium(
`False` and use the `xenium_aligned_image` function directly.
cells_table
Whether to read the cell annotations in the `AnnData` table.
cells_analysis
Whether to read the Xenium onboard secondary analysis (the ``analysis/`` folder) into the table, when present.
Clustering results (``analysis/clustering``) are added as categorical columns in ``table.obs`` (one per
clustering, joined to the cells by ``cell_id``); PCA and UMAP projections are added to ``table.obsm`` as
``"X_pca"`` / ``"X_umap"``; and differential-expression tables (``analysis/diffexp``) are added to
``table.uns["diffexp"]``. Requires ``cells_table=True``; a missing ``analysis/`` folder (e.g. re-segmented
data) is a no-op.
n_jobs
.. deprecated::
``n_jobs`` is not used anymore and will be removed in a future release. The reading time of shapes is now
Expand Down Expand Up @@ -412,6 +420,9 @@ def xenium(
if not cells_as_circles:
table.uns[TableModel.ATTRS_KEY][TableModel.INSTANCE_KEY] = "cell_labels"

if cells_analysis:
_add_cells_analysis(table, path)

# --- read elements ---
polygons = {}
labels = {}
Expand Down Expand Up @@ -482,6 +493,97 @@ def _decode_cell_id_column(cell_id_column: pd.Series) -> pd.Series:
return cell_id_column


def _add_cells_analysis(table: AnnData, path: Path) -> None:
"""Enrich the cell table in place with the Xenium onboard secondary analysis.

Reads the ``analysis/`` folder of the Xenium output, joining everything to the
cells by the ``Barcode`` column, which is the Xenium ``cell_id`` (and hence
``table.obs_names``). Cells absent from a given result (e.g. filtered out by QC
before clustering) receive a missing value rather than being dropped.

- ``analysis/clustering/<name>/clusters.csv`` -> one categorical column per
clustering in ``table.obs`` (named ``<name>``, e.g. ``gene_expression_graphclust``).
- ``analysis/pca/<name>/projection.csv`` -> ``table.obsm["X_pca"]``.
- ``analysis/umap/<name>/projection.csv`` -> ``table.obsm["X_umap"]``.
- ``analysis/diffexp/<name>/differential_expression.csv`` ->
``table.uns["diffexp"][<name>]``.

A missing ``analysis/`` folder is a no-op (e.g. re-segmented data, or a
matrix-only export).
"""
analysis_dir = path / XeniumKeys.ANALYSIS_DIR
if not analysis_dir.is_dir():
return
barcode = str(XeniumKeys.ANALYSIS_BARCODE)
cluster = str(XeniumKeys.ANALYSIS_CLUSTER)
# The clustering/projection CSVs key on the Xenium ``cell_id`` barcode. Join on
# the ``cell_id`` obs column rather than ``obs_names``: depending on the Xenium
# Analyzer version the table index may be the barcode OR a positional integer,
# but the ``cell_id`` column is always the barcode. ``join_keys`` stays in table
# row order, so reindexing to it keeps everything row-aligned.
cell_id_col = str(XeniumKeys.CELL_ID)
if cell_id_col in table.obs.columns:
join_keys = pd.Index([str(x) for x in table.obs[cell_id_col]])
else:
join_keys = pd.Index([str(x) for x in table.obs_names])

# clustering -> categorical obs columns
clustering_dir = analysis_dir / XeniumKeys.ANALYSIS_CLUSTERING_DIR
if clustering_dir.is_dir():
for sub in sorted(p for p in clustering_dir.iterdir() if p.is_dir()):
csv = sub / XeniumKeys.ANALYSIS_CLUSTERS_FILE
if not csv.is_file():
continue
df = pd.read_csv(csv, dtype={barcode: str})
labels = df.set_index(barcode)[cluster].reindex(join_keys)
# Cluster ids are 1-based ints; store as string categories (idiomatic
# for scanpy/squidpy) so "1" never becomes "1.0" via the NaN upcast.
str_labels = [None if pd.isna(v) else str(int(v)) for v in labels]
table.obs[sub.name] = pd.Categorical(str_labels)

# pca / umap projections -> obsm
pca = _read_projection(analysis_dir / XeniumKeys.ANALYSIS_PCA_DIR, join_keys, barcode)
if pca is not None:
table.obsm["X_pca"] = pca
umap = _read_projection(analysis_dir / XeniumKeys.ANALYSIS_UMAP_DIR, join_keys, barcode)
if umap is not None:
table.obsm["X_umap"] = umap

# differential expression -> uns
diffexp_dir = analysis_dir / XeniumKeys.ANALYSIS_DIFFEXP_DIR
if diffexp_dir.is_dir():
diffexp: dict[str, pd.DataFrame] = {}
for sub in sorted(p for p in diffexp_dir.iterdir() if p.is_dir()):
csv = sub / XeniumKeys.ANALYSIS_DIFFEXP_FILE
if csv.is_file():
diffexp[sub.name] = pd.read_csv(csv)
if diffexp:
table.uns["diffexp"] = diffexp


def _read_projection(group_dir: Path, join_keys: pd.Index, barcode: str) -> ArrayLike | None:
"""Read a ``<name>/projection.csv`` under ``group_dir`` into an obsm-shaped array.

Returns an ``(n_obs, n_components)`` float array aligned to ``join_keys`` (the cell-id
barcodes in table row order; rows absent from the projection become NaN), or ``None``
when ``group_dir`` has no projection. If several projections exist (rare), the one with
the most components is used.
"""
if not group_dir.is_dir():
return None
best: ArrayLike | None = None
best_cols = -1
for sub in sorted(p for p in group_dir.iterdir() if p.is_dir()):
csv = sub / XeniumKeys.ANALYSIS_PROJECTION_FILE
if not csv.is_file():
continue
df = pd.read_csv(csv, dtype={barcode: str}).set_index(barcode)
arr = df.reindex(join_keys).to_numpy(dtype=np.float32)
if arr.shape[1] > best_cols:
best, best_cols = arr, arr.shape[1]
return best


def _get_polygons(
path: Path,
file: str,
Expand Down
78 changes: 78 additions & 0 deletions tests/test_xenium.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,17 @@
from tempfile import TemporaryDirectory

import numpy as np
import pandas as pd
import pytest
from anndata import AnnData
from click.testing import CliRunner
from pytest_mock import MockerFixture
from spatialdata import match_table_to_element, read_zarr
from spatialdata.models import TableModel, get_table_keys

from spatialdata_io.__main__ import xenium_wrapper
from spatialdata_io.readers.xenium import (
_add_cells_analysis,
_cell_id_str_from_prefix_suffix_uint32_reference,
cell_id_str_from_prefix_suffix_uint32,
prefix_suffix_uint32_from_cell_id_str,
Expand Down Expand Up @@ -324,3 +327,78 @@ def test_cli_xenium_valid_json_forwarded(
assert result.exit_code == 0, result.output
call_kwargs = mock_xenium.call_args.kwargs
assert call_kwargs[kwarg_param] == {"chunks": 512}


def _write_clusters(folder: Path, barcodes: list[str], clusters: list[int]) -> None:
folder.mkdir(parents=True)
pd.DataFrame({"Barcode": barcodes, "Cluster": clusters}).to_csv(folder / "clusters.csv", index=False)


def _write_projection(folder: Path, barcodes: list[str], cols: dict[str, list[float]]) -> None:
folder.mkdir(parents=True)
pd.DataFrame({"Barcode": barcodes, **cols}).to_csv(folder / "projection.csv", index=False)


def test_xenium_cells_analysis(tmp_path: Path) -> None:
"""``_add_cells_analysis`` joins onboard analysis to the table by ``cell_id``.

Covers: clustering -> categorical obs (joined by barcode, reordered, with a
cell absent from a clustering left missing), pca/umap -> obsm (aligned, missing
rows NaN), and diffexp -> uns.
"""
obs_names = ["a-1", "b-1", "c-1", "d-1"]
adata = AnnData(X=np.zeros((4, 2), dtype=np.float32))
adata.obs_names = obs_names

analysis = tmp_path / "analysis"
# graphclust covers 3 of 4 cells (d-1 unclustered -> missing); rows scrambled.
_write_clusters(analysis / "clustering" / "gene_expression_graphclust", ["c-1", "a-1", "b-1"], [3, 1, 2])
_write_clusters(
analysis / "clustering" / "gene_expression_kmeans_2_clusters", ["a-1", "b-1", "c-1", "d-1"], [1, 2, 1, 2]
)
# pca covers 3 of 4 cells; umap covers all.
_write_projection(
analysis / "pca" / "gene_expression_2_components",
["a-1", "b-1", "c-1"],
{"PC-1": [0.1, 0.2, 0.3], "PC-2": [1.0, 2.0, 3.0]},
)
_write_projection(
analysis / "umap" / "gene_expression_2_components",
["a-1", "b-1", "c-1", "d-1"],
{"UMAP-1": [1.0, 2.0, 3.0, 4.0], "UMAP-2": [4.0, 3.0, 2.0, 1.0]},
)
de_dir = analysis / "diffexp" / "gene_expression_graphclust"
de_dir.mkdir(parents=True)
pd.DataFrame({"Feature ID": ["g1"], "Feature Name": ["G1"], "Cluster 1 Mean Counts": [0.5]}).to_csv(
de_dir / "differential_expression.csv", index=False
)

_add_cells_analysis(adata, tmp_path)

# clustering -> categorical obs, joined by barcode (NOT row position), missing -> NaN.
gc = adata.obs["gene_expression_graphclust"]
assert str(gc.dtype) == "category"
assert list(gc[:3]) == ["1", "2", "3"]
assert pd.isna(gc.iloc[3])
assert list(gc.cat.categories) == ["1", "2", "3"] # string categories, no "1.0"
assert list(adata.obs["gene_expression_kmeans_2_clusters"]) == ["1", "2", "1", "2"]

# pca/umap -> obsm, aligned to obs order; cells absent from a projection are NaN.
assert adata.obsm["X_pca"].shape == (4, 2)
assert adata.obsm["X_pca"][0, 0] == np.float32(0.1) # a-1 joined correctly
assert np.isnan(adata.obsm["X_pca"][3]).all() # d-1 absent from pca
assert adata.obsm["X_umap"].shape == (4, 2)
assert not np.isnan(adata.obsm["X_umap"]).any()

# diffexp -> uns
assert "gene_expression_graphclust" in adata.uns["diffexp"]


def test_xenium_cells_analysis_missing_folder_is_noop(tmp_path: Path) -> None:
adata = AnnData(X=np.zeros((2, 2), dtype=np.float32))
adata.obs_names = ["a-1", "b-1"]
_add_cells_analysis(adata, tmp_path) # no analysis/ folder present
assert "X_pca" not in adata.obsm
assert "X_umap" not in adata.obsm
assert not any(c.startswith("gene_expression_") for c in adata.obs.columns)
assert "diffexp" not in adata.uns
Loading