From f95d31a5cd6e2111b649a26a04a35f46ffe2f6bf Mon Sep 17 00:00:00 2001 From: Tomatokeftes <129113023+Tomatokeftes@users.noreply.github.com> Date: Thu, 18 Jun 2026 10:42:46 +0200 Subject: [PATCH 1/3] feat(xenium): import onboard secondary analysis into the table Add a `cells_analysis` option (default True) to `xenium()` that reads the Xenium output's `analysis/` folder into the cell table when present: - `analysis/clustering/<name>/clusters.csv` -> one categorical column per clustering in `table.obs` (e.g. `gene_expression_graphclust`, `gene_expression_kmeans_10_clusters`), joined to the cells by `cell_id` (the CSV `Barcode`). Cells absent from a clustering (filtered by QC) get a missing value rather than being dropped. - `analysis/pca/<name>/projection.csv` -> `table.obsm["X_pca"]`. - `analysis/umap/<name>/projection.csv` -> `table.obsm["X_umap"]`. - `analysis/diffexp/<name>/differential_expression.csv` -> `table.uns["diffexp"][<name>]`. Until now `xenium()` imported only the raw outputs (boundaries, transcripts, images, cell-feature matrix); the onboard secondary analysis was dropped, so the 10x-computed clusters/embeddings had to be recomputed downstream. Joining by `cell_id` keeps everything aligned to the shapes/table index. A missing `analysis/` folder is a no-op (e.g. re-segmented data, matrix-only exports). Adds self-contained unit tests for the parser (join-by-barcode, missing-cell handling, obsm alignment, no-op when the folder is absent). 
--- src/spatialdata_io/_constants/_constants.py | 12 +++ src/spatialdata_io/readers/xenium.py | 92 +++++++++++++++++++++ tests/test_xenium.py | 78 +++++++++++++++++ 3 files changed, 182 insertions(+) diff --git a/src/spatialdata_io/_constants/_constants.py b/src/spatialdata_io/_constants/_constants.py index 46f983b1..17a6076c 100644 --- a/src/spatialdata_io/_constants/_constants.py +++ b/src/spatialdata_io/_constants/_constants.py @@ -162,6 +162,18 @@ class XeniumKeys(ModeEnum): EXPLORER_SELECTION_Y = "Y" EXPLORER_SELECTION_KEY = "Selection" + # secondary analysis (the ``analysis/`` folder: clustering / pca / umap / diffexp) + ANALYSIS_DIR = "analysis" + ANALYSIS_CLUSTERING_DIR = "clustering" + ANALYSIS_PCA_DIR = "pca" + ANALYSIS_UMAP_DIR = "umap" + ANALYSIS_DIFFEXP_DIR = "diffexp" + ANALYSIS_CLUSTERS_FILE = "clusters.csv" + ANALYSIS_PROJECTION_FILE = "projection.csv" + ANALYSIS_DIFFEXP_FILE = "differential_expression.csv" + ANALYSIS_BARCODE = "Barcode" + ANALYSIS_CLUSTER = "Cluster" + @unique class VisiumKeys(ModeEnum): diff --git a/src/spatialdata_io/readers/xenium.py b/src/spatialdata_io/readers/xenium.py index fcf2b6e1..022cec5c 100644 --- a/src/spatialdata_io/readers/xenium.py +++ b/src/spatialdata_io/readers/xenium.py @@ -272,6 +272,7 @@ def xenium( morphology_focus: bool = True, aligned_images: bool = True, cells_table: bool = True, + cells_analysis: bool = True, n_jobs: int | None = None, gex_only: bool = True, imread_kwargs: Mapping[str, Any] = MappingProxyType({}), @@ -325,6 +326,13 @@ def xenium( `False` and use the `xenium_aligned_image` function directly. cells_table Whether to read the cell annotations in the `AnnData` table. + cells_analysis + Whether to read the Xenium onboard secondary analysis (the ``analysis/`` folder) into the table, when present. 
+ Clustering results (``analysis/clustering``) are added as categorical columns in ``table.obs`` (one per + clustering, joined to the cells by ``cell_id``); PCA and UMAP projections are added to ``table.obsm`` as + ``"X_pca"`` / ``"X_umap"``; and differential-expression tables (``analysis/diffexp``) are added to + ``table.uns["diffexp"]``. Requires ``cells_table=True``; a missing ``analysis/`` folder (e.g. re-segmented + data) is a no-op. n_jobs .. deprecated:: ``n_jobs`` is not used anymore and will be removed in a future release. The reading time of shapes is now @@ -412,6 +420,9 @@ def xenium( if not cells_as_circles: table.uns[TableModel.ATTRS_KEY][TableModel.INSTANCE_KEY] = "cell_labels" + if cells_analysis: + _add_cells_analysis(table, path) + # --- read elements --- polygons = {} labels = {} @@ -482,6 +493,87 @@ def _decode_cell_id_column(cell_id_column: pd.Series) -> pd.Series: return cell_id_column +def _add_cells_analysis(table: AnnData, path: Path) -> None: + """Enrich the cell table in place with the Xenium onboard secondary analysis. + + Reads the ``analysis/`` folder of the Xenium output, joining everything to the + cells by the ``Barcode`` column, which is the Xenium ``cell_id`` (and hence + ``table.obs_names``). Cells absent from a given result (e.g. filtered out by QC + before clustering) receive a missing value rather than being dropped. + + - ``analysis/clustering/<name>/clusters.csv`` -> one categorical column per + clustering in ``table.obs`` (named ``<name>``, e.g. ``gene_expression_graphclust``). + - ``analysis/pca/<name>/projection.csv`` -> ``table.obsm["X_pca"]``. + - ``analysis/umap/<name>/projection.csv`` -> ``table.obsm["X_umap"]``. + - ``analysis/diffexp/<name>/differential_expression.csv`` -> + ``table.uns["diffexp"][<name>]``. + + A missing ``analysis/`` folder is a no-op (e.g. re-segmented data, or a + matrix-only export). 
+ """ + analysis_dir = path / XeniumKeys.ANALYSIS_DIR + if not analysis_dir.is_dir(): + return + obs_names = table.obs_names + barcode = str(XeniumKeys.ANALYSIS_BARCODE) + cluster = str(XeniumKeys.ANALYSIS_CLUSTER) + + # clustering -> categorical obs columns + clustering_dir = analysis_dir / XeniumKeys.ANALYSIS_CLUSTERING_DIR + if clustering_dir.is_dir(): + for sub in sorted(p for p in clustering_dir.iterdir() if p.is_dir()): + csv = sub / XeniumKeys.ANALYSIS_CLUSTERS_FILE + if not csv.is_file(): + continue + df = pd.read_csv(csv, dtype={barcode: str}) + labels = df.set_index(barcode)[cluster].reindex(obs_names) + # Cluster ids are 1-based ints; store as string categories (idiomatic + # for scanpy/squidpy) so "1" never becomes "1.0" via the NaN upcast. + str_labels = [None if pd.isna(v) else str(int(v)) for v in labels] + table.obs[sub.name] = pd.Categorical(str_labels) + + # pca / umap projections -> obsm + pca = _read_projection(analysis_dir / XeniumKeys.ANALYSIS_PCA_DIR, obs_names, barcode) + if pca is not None: + table.obsm["X_pca"] = pca + umap = _read_projection(analysis_dir / XeniumKeys.ANALYSIS_UMAP_DIR, obs_names, barcode) + if umap is not None: + table.obsm["X_umap"] = umap + + # differential expression -> uns + diffexp_dir = analysis_dir / XeniumKeys.ANALYSIS_DIFFEXP_DIR + if diffexp_dir.is_dir(): + diffexp: dict[str, pd.DataFrame] = {} + for sub in sorted(p for p in diffexp_dir.iterdir() if p.is_dir()): + csv = sub / XeniumKeys.ANALYSIS_DIFFEXP_FILE + if csv.is_file(): + diffexp[sub.name] = pd.read_csv(csv) + if diffexp: + table.uns["diffexp"] = diffexp + + +def _read_projection(group_dir: Path, obs_names: pd.Index, barcode: str) -> ArrayLike | None: + """Read a ``/projection.csv`` under ``group_dir`` into an obsm-shaped array. + + Returns an ``(n_obs, n_components)`` float array aligned to ``obs_names`` (rows absent from + the projection become NaN), or ``None`` when ``group_dir`` has no projection. 
If several + projections exist (rare), the one with the most components is used. + """ + if not group_dir.is_dir(): + return None + best: ArrayLike | None = None + best_cols = -1 + for sub in sorted(p for p in group_dir.iterdir() if p.is_dir()): + csv = sub / XeniumKeys.ANALYSIS_PROJECTION_FILE + if not csv.is_file(): + continue + df = pd.read_csv(csv, dtype={barcode: str}).set_index(barcode) + arr = df.reindex(obs_names).to_numpy(dtype=np.float32) + if arr.shape[1] > best_cols: + best, best_cols = arr, arr.shape[1] + return best + + def _get_polygons( path: Path, file: str, diff --git a/tests/test_xenium.py b/tests/test_xenium.py index 17903888..b6d33c84 100644 --- a/tests/test_xenium.py +++ b/tests/test_xenium.py @@ -3,7 +3,9 @@ from tempfile import TemporaryDirectory import numpy as np +import pandas as pd import pytest +from anndata import AnnData from click.testing import CliRunner from pytest_mock import MockerFixture from spatialdata import match_table_to_element, read_zarr @@ -11,6 +13,7 @@ from spatialdata_io.__main__ import xenium_wrapper from spatialdata_io.readers.xenium import ( + _add_cells_analysis, _cell_id_str_from_prefix_suffix_uint32_reference, cell_id_str_from_prefix_suffix_uint32, prefix_suffix_uint32_from_cell_id_str, @@ -324,3 +327,78 @@ def test_cli_xenium_valid_json_forwarded( assert result.exit_code == 0, result.output call_kwargs = mock_xenium.call_args.kwargs assert call_kwargs[kwarg_param] == {"chunks": 512} + + +def _write_clusters(folder: Path, barcodes: list[str], clusters: list[int]) -> None: + folder.mkdir(parents=True) + pd.DataFrame({"Barcode": barcodes, "Cluster": clusters}).to_csv(folder / "clusters.csv", index=False) + + +def _write_projection(folder: Path, barcodes: list[str], cols: dict[str, list[float]]) -> None: + folder.mkdir(parents=True) + pd.DataFrame({"Barcode": barcodes, **cols}).to_csv(folder / "projection.csv", index=False) + + +def test_xenium_cells_analysis(tmp_path: Path) -> None: + """``_add_cells_analysis`` 
joins onboard analysis to the table by ``cell_id``. + + Covers: clustering -> categorical obs (joined by barcode, reordered, with a + cell absent from a clustering left missing), pca/umap -> obsm (aligned, missing + rows NaN), and diffexp -> uns. + """ + obs_names = ["a-1", "b-1", "c-1", "d-1"] + adata = AnnData(X=np.zeros((4, 2), dtype=np.float32)) + adata.obs_names = obs_names + + analysis = tmp_path / "analysis" + # graphclust covers 3 of 4 cells (d-1 unclustered -> missing); rows scrambled. + _write_clusters(analysis / "clustering" / "gene_expression_graphclust", ["c-1", "a-1", "b-1"], [3, 1, 2]) + _write_clusters( + analysis / "clustering" / "gene_expression_kmeans_2_clusters", ["a-1", "b-1", "c-1", "d-1"], [1, 2, 1, 2] + ) + # pca covers 3 of 4 cells; umap covers all. + _write_projection( + analysis / "pca" / "gene_expression_2_components", + ["a-1", "b-1", "c-1"], + {"PC-1": [0.1, 0.2, 0.3], "PC-2": [1.0, 2.0, 3.0]}, + ) + _write_projection( + analysis / "umap" / "gene_expression_2_components", + ["a-1", "b-1", "c-1", "d-1"], + {"UMAP-1": [1.0, 2.0, 3.0, 4.0], "UMAP-2": [4.0, 3.0, 2.0, 1.0]}, + ) + de_dir = analysis / "diffexp" / "gene_expression_graphclust" + de_dir.mkdir(parents=True) + pd.DataFrame({"Feature ID": ["g1"], "Feature Name": ["G1"], "Cluster 1 Mean Counts": [0.5]}).to_csv( + de_dir / "differential_expression.csv", index=False + ) + + _add_cells_analysis(adata, tmp_path) + + # clustering -> categorical obs, joined by barcode (NOT row position), missing -> NaN. + gc = adata.obs["gene_expression_graphclust"] + assert str(gc.dtype) == "category" + assert list(gc[:3]) == ["1", "2", "3"] + assert pd.isna(gc.iloc[3]) + assert list(gc.cat.categories) == ["1", "2", "3"] # string categories, no "1.0" + assert list(adata.obs["gene_expression_kmeans_2_clusters"]) == ["1", "2", "1", "2"] + + # pca/umap -> obsm, aligned to obs order; cells absent from a projection are NaN. 
+ assert adata.obsm["X_pca"].shape == (4, 2) + assert adata.obsm["X_pca"][0, 0] == np.float32(0.1) # a-1 joined correctly + assert np.isnan(adata.obsm["X_pca"][3]).all() # d-1 absent from pca + assert adata.obsm["X_umap"].shape == (4, 2) + assert not np.isnan(adata.obsm["X_umap"]).any() + + # diffexp -> uns + assert "gene_expression_graphclust" in adata.uns["diffexp"] + + +def test_xenium_cells_analysis_missing_folder_is_noop(tmp_path: Path) -> None: + adata = AnnData(X=np.zeros((2, 2), dtype=np.float32)) + adata.obs_names = ["a-1", "b-1"] + _add_cells_analysis(adata, tmp_path) # no analysis/ folder present + assert "X_pca" not in adata.obsm + assert "X_umap" not in adata.obsm + assert not any(c.startswith("gene_expression_") for c in adata.obs.columns) + assert "diffexp" not in adata.uns From 0739d5ca7675c402ad31039c9c02403374cbfcb4 Mon Sep 17 00:00:00 2001 From: Tomatokeftes <129113023+Tomatokeftes@users.noreply.github.com> Date: Thu, 18 Jun 2026 11:21:33 +0200 Subject: [PATCH 2/3] fix(xenium): join onboard analysis on cell_id column, not the table index The analysis CSVs key on the cell_id barcode; join clustering + projections on the cell_id obs column instead of obs_names so the import stays correct even when the table index is positional rather than the barcode. --- src/spatialdata_io/readers/xenium.py | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/src/spatialdata_io/readers/xenium.py b/src/spatialdata_io/readers/xenium.py index 022cec5c..b606bb99 100644 --- a/src/spatialdata_io/readers/xenium.py +++ b/src/spatialdata_io/readers/xenium.py @@ -514,9 +514,18 @@ def _add_cells_analysis(table: AnnData, path: Path) -> None: analysis_dir = path / XeniumKeys.ANALYSIS_DIR if not analysis_dir.is_dir(): return - obs_names = table.obs_names barcode = str(XeniumKeys.ANALYSIS_BARCODE) cluster = str(XeniumKeys.ANALYSIS_CLUSTER) + # The clustering/projection CSVs key on the Xenium ``cell_id`` barcode. 
Join on + # the ``cell_id`` obs column rather than ``obs_names``: depending on the Xenium + # Analyzer version the table index may be the barcode OR a positional integer, + # but the ``cell_id`` column is always the barcode. ``join_keys`` stays in table + # row order, so reindexing to it keeps everything row-aligned. + cell_id_col = str(XeniumKeys.CELL_ID) + if cell_id_col in table.obs.columns: + join_keys = pd.Index([str(x) for x in table.obs[cell_id_col]]) + else: + join_keys = pd.Index([str(x) for x in table.obs_names]) # clustering -> categorical obs columns clustering_dir = analysis_dir / XeniumKeys.ANALYSIS_CLUSTERING_DIR @@ -526,17 +535,17 @@ def _add_cells_analysis(table: AnnData, path: Path) -> None: if not csv.is_file(): continue df = pd.read_csv(csv, dtype={barcode: str}) - labels = df.set_index(barcode)[cluster].reindex(obs_names) + labels = df.set_index(barcode)[cluster].reindex(join_keys) # Cluster ids are 1-based ints; store as string categories (idiomatic # for scanpy/squidpy) so "1" never becomes "1.0" via the NaN upcast. 
str_labels = [None if pd.isna(v) else str(int(v)) for v in labels] table.obs[sub.name] = pd.Categorical(str_labels) # pca / umap projections -> obsm - pca = _read_projection(analysis_dir / XeniumKeys.ANALYSIS_PCA_DIR, obs_names, barcode) + pca = _read_projection(analysis_dir / XeniumKeys.ANALYSIS_PCA_DIR, join_keys, barcode) if pca is not None: table.obsm["X_pca"] = pca - umap = _read_projection(analysis_dir / XeniumKeys.ANALYSIS_UMAP_DIR, obs_names, barcode) + umap = _read_projection(analysis_dir / XeniumKeys.ANALYSIS_UMAP_DIR, join_keys, barcode) if umap is not None: table.obsm["X_umap"] = umap @@ -552,12 +561,13 @@ def _add_cells_analysis(table: AnnData, path: Path) -> None: table.uns["diffexp"] = diffexp -def _read_projection(group_dir: Path, obs_names: pd.Index, barcode: str) -> ArrayLike | None: +def _read_projection(group_dir: Path, join_keys: pd.Index, barcode: str) -> ArrayLike | None: """Read a ``/projection.csv`` under ``group_dir`` into an obsm-shaped array. - Returns an ``(n_obs, n_components)`` float array aligned to ``obs_names`` (rows absent from - the projection become NaN), or ``None`` when ``group_dir`` has no projection. If several - projections exist (rare), the one with the most components is used. + Returns an ``(n_obs, n_components)`` float array aligned to ``join_keys`` (the cell-id + barcodes in table row order; rows absent from the projection become NaN), or ``None`` + when ``group_dir`` has no projection. If several projections exist (rare), the one with + the most components is used. 
""" if not group_dir.is_dir(): return None @@ -568,7 +578,7 @@ def _read_projection(group_dir: Path, obs_names: pd.Index, barcode: str) -> Arra if not csv.is_file(): continue df = pd.read_csv(csv, dtype={barcode: str}).set_index(barcode) - arr = df.reindex(obs_names).to_numpy(dtype=np.float32) + arr = df.reindex(join_keys).to_numpy(dtype=np.float32) if arr.shape[1] > best_cols: best, best_cols = arr, arr.shape[1] return best From 687a283a81012880850fbd738a097628becb9462 Mon Sep 17 00:00:00 2001 From: Tomatokeftes <129113023+Tomatokeftes@users.noreply.github.com> Date: Thu, 18 Jun 2026 14:19:54 +0200 Subject: [PATCH 3/3] fix(xenium): expose cells_analysis as a CLI option The CLI-completeness test (test_cli_exposes_all_reader_params) requires every xenium() parameter to have a matching click option in xenium_wrapper. Add the --cells-analysis option + param + pass-through for the new cells_analysis kwarg. --- src/spatialdata_io/__main__.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/spatialdata_io/__main__.py b/src/spatialdata_io/__main__.py index f0a5f9f7..84947489 100644 --- a/src/spatialdata_io/__main__.py +++ b/src/spatialdata_io/__main__.py @@ -724,6 +724,12 @@ def visium_hd_wrapper( default=True, help="Whether to read cells annotations in the AnnData table. [default: True]", ) +@click.option( + "--cells-analysis", + type=bool, + default=True, + help="Whether to read the onboard secondary analysis (clustering/PCA/UMAP/diffexp) into the table. 
[default: True]", +) @click.option( "--gex-only", type=bool, @@ -762,6 +768,7 @@ def xenium_wrapper( morphology_focus: bool = True, aligned_images: bool = True, cells_table: bool = True, + cells_analysis: bool = True, gex_only: bool = True, imread_kwargs: str = "{}", image_models_kwargs: str = "{}", @@ -782,6 +789,7 @@ def xenium_wrapper( morphology_focus=morphology_focus, aligned_images=aligned_images, cells_table=cells_table, + cells_analysis=cells_analysis, gex_only=gex_only, imread_kwargs=_parse_json_param(imread_kwargs, "imread_kwargs"), image_models_kwargs=_parse_json_param(image_models_kwargs, "image_models_kwargs"),