From f95d31a5cd6e2111b649a26a04a35f46ffe2f6bf Mon Sep 17 00:00:00 2001
From: Tomatokeftes <129113023+Tomatokeftes@users.noreply.github.com>
Date: Thu, 18 Jun 2026 10:42:46 +0200
Subject: [PATCH 1/3] feat(xenium): import onboard secondary analysis into the
 table

Add a `cells_analysis` option (default True) to `xenium()` that reads the
Xenium output's `analysis/` folder into the cell table when present:

- `analysis/clustering/<name>/clusters.csv` -> one categorical column per
  clustering in `table.obs` (e.g. `gene_expression_graphclust`,
  `gene_expression_kmeans_10_clusters`), joined to the cells by `cell_id`
  (the CSV `Barcode`). Cells absent from a clustering (filtered by QC) get a
  missing value rather than being dropped.
- `analysis/pca/<name>/projection.csv`  -> `table.obsm["X_pca"]`.
- `analysis/umap/<name>/projection.csv` -> `table.obsm["X_umap"]`.
- `analysis/diffexp/<name>/differential_expression.csv` ->
  `table.uns["diffexp"][<name>]`.

Until now `xenium()` imported only the raw outputs (boundaries, transcripts,
images, cell-feature matrix); the onboard secondary analysis was dropped, so
the 10x-computed clusters/embeddings had to be recomputed downstream. Joining
by `cell_id` keeps everything aligned to the shapes/table index. A missing
`analysis/` folder is a no-op (e.g. re-segmented data, matrix-only exports).

Adds self-contained unit tests for the parser (join-by-barcode, missing-cell
handling, obsm alignment, no-op when the folder is absent).
---
 src/spatialdata_io/_constants/_constants.py | 12 +++
 src/spatialdata_io/readers/xenium.py        | 92 +++++++++++++++++++++
 tests/test_xenium.py                        | 78 +++++++++++++++++
 3 files changed, 182 insertions(+)
diff --git a/src/spatialdata_io/_constants/_constants.py b/src/spatialdata_io/_constants/_constants.py
index 46f983b1..17a6076c 100644
--- a/src/spatialdata_io/_constants/_constants.py
+++ b/src/spatialdata_io/_constants/_constants.py
@@ -162,6 +162,18 @@ class XeniumKeys(ModeEnum):
     EXPLORER_SELECTION_Y = "Y"
     EXPLORER_SELECTION_KEY = "Selection"
 
+    # secondary analysis (the ``analysis/`` folder: clustering / pca / umap / diffexp)
+    ANALYSIS_DIR = "analysis"
+    ANALYSIS_CLUSTERING_DIR = "clustering"
+    ANALYSIS_PCA_DIR = "pca"
+    ANALYSIS_UMAP_DIR = "umap"
+    ANALYSIS_DIFFEXP_DIR = "diffexp"
+    ANALYSIS_CLUSTERS_FILE = "clusters.csv"
+    ANALYSIS_PROJECTION_FILE = "projection.csv"
+    ANALYSIS_DIFFEXP_FILE = "differential_expression.csv"
+    ANALYSIS_BARCODE = "Barcode"
+    ANALYSIS_CLUSTER = "Cluster"
+
 
 @unique
 class VisiumKeys(ModeEnum):
diff --git a/src/spatialdata_io/readers/xenium.py b/src/spatialdata_io/readers/xenium.py
index fcf2b6e1..022cec5c 100644
--- a/src/spatialdata_io/readers/xenium.py
+++ b/src/spatialdata_io/readers/xenium.py
@@ -272,6 +272,7 @@ def xenium(
     morphology_focus: bool = True,
     aligned_images: bool = True,
     cells_table: bool = True,
+    cells_analysis: bool = True,
     n_jobs: int | None = None,
     gex_only: bool = True,
     imread_kwargs: Mapping[str, Any] = MappingProxyType({}),
@@ -325,6 +326,13 @@ def xenium(
         `False` and use the `xenium_aligned_image` function directly.
     cells_table
         Whether to read the cell annotations in the `AnnData` table.
+    cells_analysis
+        Whether to read the Xenium onboard secondary analysis (the ``analysis/`` folder) into the table, when present.
+        Clustering results (``analysis/clustering``) are added as categorical columns in ``table.obs`` (one per
+        clustering, joined to the cells by ``cell_id``); PCA and UMAP projections are added to ``table.obsm`` as
+        ``"X_pca"`` / ``"X_umap"``; and differential-expression tables (``analysis/diffexp``) are added to
+        ``table.uns["diffexp"]``. Requires ``cells_table=True``; a missing ``analysis/`` folder (e.g. re-segmented
+        data) is a no-op.
     n_jobs
         .. deprecated::
             ``n_jobs`` is not used anymore and will be removed in a future release. The reading time of shapes is now
@@ -412,6 +420,9 @@ def xenium(
                 if not cells_as_circles:
                     table.uns[TableModel.ATTRS_KEY][TableModel.INSTANCE_KEY] = "cell_labels"
 
+        if cells_analysis:
+            _add_cells_analysis(table, path)
+
     # --- read elements ---
     polygons = {}
     labels = {}
@@ -482,6 +493,87 @@ def _decode_cell_id_column(cell_id_column: pd.Series) -> pd.Series:
     return cell_id_column
 
 
+def _add_cells_analysis(table: AnnData, path: Path) -> None:
+    """Enrich the cell table in place with the Xenium onboard secondary analysis.
+
+    Reads the ``analysis/`` folder of the Xenium output, joining everything to the
+    cells by the ``Barcode`` column, which is the Xenium ``cell_id`` (and hence
+    ``table.obs_names``). Cells absent from a given result (e.g. filtered out by QC
+    before clustering) receive a missing value rather than being dropped.
+
+    - ``analysis/clustering/<name>/clusters.csv`` -> one categorical column per
+      clustering in ``table.obs`` (named ``<name>``, e.g. ``gene_expression_graphclust``).
+    - ``analysis/pca/<name>/projection.csv``  -> ``table.obsm["X_pca"]``.
+    - ``analysis/umap/<name>/projection.csv`` -> ``table.obsm["X_umap"]``.
+    - ``analysis/diffexp/<name>/differential_expression.csv`` ->
+      ``table.uns["diffexp"][<name>]``.
+
+    A missing ``analysis/`` folder is a no-op (e.g. re-segmented data, or a
+    matrix-only export).
+    """
+    analysis_dir = path / XeniumKeys.ANALYSIS_DIR
+    if not analysis_dir.is_dir():
+        return
+    obs_names = table.obs_names
+    barcode = str(XeniumKeys.ANALYSIS_BARCODE)
+    cluster = str(XeniumKeys.ANALYSIS_CLUSTER)
+
+    # clustering -> categorical obs columns
+    clustering_dir = analysis_dir / XeniumKeys.ANALYSIS_CLUSTERING_DIR
+    if clustering_dir.is_dir():
+        for sub in sorted(p for p in clustering_dir.iterdir() if p.is_dir()):
+            csv = sub / XeniumKeys.ANALYSIS_CLUSTERS_FILE
+            if not csv.is_file():
+                continue
+            df = pd.read_csv(csv, dtype={barcode: str})
+            labels = df.set_index(barcode)[cluster].reindex(obs_names)
+            # Cluster ids are 1-based ints; store as string categories (idiomatic
+            # for scanpy/squidpy) so "1" never becomes "1.0" via the NaN upcast.
+            str_labels = [None if pd.isna(v) else str(int(v)) for v in labels]
+            table.obs[sub.name] = pd.Categorical(str_labels)
+
+    # pca / umap projections -> obsm
+    pca = _read_projection(analysis_dir / XeniumKeys.ANALYSIS_PCA_DIR, obs_names, barcode)
+    if pca is not None:
+        table.obsm["X_pca"] = pca
+    umap = _read_projection(analysis_dir / XeniumKeys.ANALYSIS_UMAP_DIR, obs_names, barcode)
+    if umap is not None:
+        table.obsm["X_umap"] = umap
+
+    # differential expression -> uns
+    diffexp_dir = analysis_dir / XeniumKeys.ANALYSIS_DIFFEXP_DIR
+    if diffexp_dir.is_dir():
+        diffexp: dict[str, pd.DataFrame] = {}
+        for sub in sorted(p for p in diffexp_dir.iterdir() if p.is_dir()):
+            csv = sub / XeniumKeys.ANALYSIS_DIFFEXP_FILE
+            if csv.is_file():
+                diffexp[sub.name] = pd.read_csv(csv)
+        if diffexp:
+            table.uns["diffexp"] = diffexp
+
+
+def _read_projection(group_dir: Path, obs_names: pd.Index, barcode: str) -> ArrayLike | None:
+    """Read a ``<name>/projection.csv`` under ``group_dir`` into an obsm-shaped array.
+
+    Returns an ``(n_obs, n_components)`` float array aligned to ``obs_names`` (rows absent from
+    the projection become NaN), or ``None`` when ``group_dir`` has no projection. If several
+    projections exist (rare), the one with the most components is used.
+    """
+    if not group_dir.is_dir():
+        return None
+    best: ArrayLike | None = None
+    best_cols = -1
+    for sub in sorted(p for p in group_dir.iterdir() if p.is_dir()):
+        csv = sub / XeniumKeys.ANALYSIS_PROJECTION_FILE
+        if not csv.is_file():
+            continue
+        df = pd.read_csv(csv, dtype={barcode: str}).set_index(barcode)
+        arr = df.reindex(obs_names).to_numpy(dtype=np.float32)
+        if arr.shape[1] > best_cols:
+            best, best_cols = arr, arr.shape[1]
+    return best
+
+
 def _get_polygons(
     path: Path,
     file: str,
diff --git a/tests/test_xenium.py b/tests/test_xenium.py
index 17903888..b6d33c84 100644
--- a/tests/test_xenium.py
+++ b/tests/test_xenium.py
@@ -3,7 +3,9 @@
 from tempfile import TemporaryDirectory
 
 import numpy as np
+import pandas as pd
 import pytest
+from anndata import AnnData
 from click.testing import CliRunner
 from pytest_mock import MockerFixture
 from spatialdata import match_table_to_element, read_zarr
@@ -11,6 +13,7 @@
 
 from spatialdata_io.__main__ import xenium_wrapper
 from spatialdata_io.readers.xenium import (
+    _add_cells_analysis,
     _cell_id_str_from_prefix_suffix_uint32_reference,
     cell_id_str_from_prefix_suffix_uint32,
     prefix_suffix_uint32_from_cell_id_str,
@@ -324,3 +327,78 @@ def test_cli_xenium_valid_json_forwarded(
     assert result.exit_code == 0, result.output
     call_kwargs = mock_xenium.call_args.kwargs
     assert call_kwargs[kwarg_param] == {"chunks": 512}
+
+
+def _write_clusters(folder: Path, barcodes: list[str], clusters: list[int]) -> None:
+    folder.mkdir(parents=True)
+    pd.DataFrame({"Barcode": barcodes, "Cluster": clusters}).to_csv(folder / "clusters.csv", index=False)
+
+
+def _write_projection(folder: Path, barcodes: list[str], cols: dict[str, list[float]]) -> None:
+    folder.mkdir(parents=True)
+    pd.DataFrame({"Barcode": barcodes, **cols}).to_csv(folder / "projection.csv", index=False)
+
+
+def test_xenium_cells_analysis(tmp_path: Path) -> None:
+    """``_add_cells_analysis`` joins onboard analysis to the table by ``cell_id``.
+
+    Covers: clustering -> categorical obs (joined by barcode, reordered, with a
+    cell absent from a clustering left missing), pca/umap -> obsm (aligned, missing
+    rows NaN), and diffexp -> uns.
+    """
+    obs_names = ["a-1", "b-1", "c-1", "d-1"]
+    adata = AnnData(X=np.zeros((4, 2), dtype=np.float32))
+    adata.obs_names = obs_names
+
+    analysis = tmp_path / "analysis"
+    # graphclust covers 3 of 4 cells (d-1 unclustered -> missing); rows scrambled.
+    _write_clusters(analysis / "clustering" / "gene_expression_graphclust", ["c-1", "a-1", "b-1"], [3, 1, 2])
+    _write_clusters(
+        analysis / "clustering" / "gene_expression_kmeans_2_clusters", ["a-1", "b-1", "c-1", "d-1"], [1, 2, 1, 2]
+    )
+    # pca covers 3 of 4 cells; umap covers all.
+    _write_projection(
+        analysis / "pca" / "gene_expression_2_components",
+        ["a-1", "b-1", "c-1"],
+        {"PC-1": [0.1, 0.2, 0.3], "PC-2": [1.0, 2.0, 3.0]},
+    )
+    _write_projection(
+        analysis / "umap" / "gene_expression_2_components",
+        ["a-1", "b-1", "c-1", "d-1"],
+        {"UMAP-1": [1.0, 2.0, 3.0, 4.0], "UMAP-2": [4.0, 3.0, 2.0, 1.0]},
+    )
+    de_dir = analysis / "diffexp" / "gene_expression_graphclust"
+    de_dir.mkdir(parents=True)
+    pd.DataFrame({"Feature ID": ["g1"], "Feature Name": ["G1"], "Cluster 1 Mean Counts": [0.5]}).to_csv(
+        de_dir / "differential_expression.csv", index=False
+    )
+
+    _add_cells_analysis(adata, tmp_path)
+
+    # clustering -> categorical obs, joined by barcode (NOT row position), missing -> NaN.
+    gc = adata.obs["gene_expression_graphclust"]
+    assert str(gc.dtype) == "category"
+    assert list(gc[:3]) == ["1", "2", "3"]
+    assert pd.isna(gc.iloc[3])
+    assert list(gc.cat.categories) == ["1", "2", "3"]  # string categories, no "1.0"
+    assert list(adata.obs["gene_expression_kmeans_2_clusters"]) == ["1", "2", "1", "2"]
+
+    # pca/umap -> obsm, aligned to obs order; cells absent from a projection are NaN.
+    assert adata.obsm["X_pca"].shape == (4, 2)
+    assert adata.obsm["X_pca"][0, 0] == np.float32(0.1)  # a-1 joined correctly
+    assert np.isnan(adata.obsm["X_pca"][3]).all()  # d-1 absent from pca
+    assert adata.obsm["X_umap"].shape == (4, 2)
+    assert not np.isnan(adata.obsm["X_umap"]).any()
+
+    # diffexp -> uns
+    assert "gene_expression_graphclust" in adata.uns["diffexp"]
+
+
+def test_xenium_cells_analysis_missing_folder_is_noop(tmp_path: Path) -> None:
+    adata = AnnData(X=np.zeros((2, 2), dtype=np.float32))
+    adata.obs_names = ["a-1", "b-1"]
+    _add_cells_analysis(adata, tmp_path)  # no analysis/ folder present
+    assert "X_pca" not in adata.obsm
+    assert "X_umap" not in adata.obsm
+    assert not any(c.startswith("gene_expression_") for c in adata.obs.columns)
+    assert "diffexp" not in adata.uns

From 0739d5ca7675c402ad31039c9c02403374cbfcb4 Mon Sep 17 00:00:00 2001
From: Tomatokeftes <129113023+Tomatokeftes@users.noreply.github.com>
Date: Thu, 18 Jun 2026 11:21:33 +0200
Subject: [PATCH 2/3] fix(xenium): join onboard analysis on cell_id column, not
 the table index

The analysis CSVs key on the cell_id barcode; join clustering + projections on
the cell_id obs column instead of obs_names so the import stays correct even
when the table index is positional rather than the barcode.
---
 src/spatialdata_io/readers/xenium.py | 28 +++++++++++++++++++---------
 1 file changed, 19 insertions(+), 9 deletions(-)

diff --git a/src/spatialdata_io/readers/xenium.py b/src/spatialdata_io/readers/xenium.py
index 022cec5c..b606bb99 100644
--- a/src/spatialdata_io/readers/xenium.py
+++ b/src/spatialdata_io/readers/xenium.py
@@ -514,9 +514,18 @@ def _add_cells_analysis(table: AnnData, path: Path) -> None:
     analysis_dir = path / XeniumKeys.ANALYSIS_DIR
     if not analysis_dir.is_dir():
         return
-    obs_names = table.obs_names
     barcode = str(XeniumKeys.ANALYSIS_BARCODE)
     cluster = str(XeniumKeys.ANALYSIS_CLUSTER)
+    # The clustering/projection CSVs key on the Xenium ``cell_id`` barcode. Join on
+    # the ``cell_id`` obs column rather than ``obs_names``: depending on the Xenium
+    # Analyzer version the table index may be the barcode OR a positional integer,
+    # but the ``cell_id`` column is always the barcode. ``join_keys`` stays in table
+    # row order, so reindexing to it keeps everything row-aligned.
+    cell_id_col = str(XeniumKeys.CELL_ID)
+    if cell_id_col in table.obs.columns:
+        join_keys = pd.Index([str(x) for x in table.obs[cell_id_col]])
+    else:
+        join_keys = pd.Index([str(x) for x in table.obs_names])
 
     # clustering -> categorical obs columns
     clustering_dir = analysis_dir / XeniumKeys.ANALYSIS_CLUSTERING_DIR
@@ -526,17 +535,17 @@ def _add_cells_analysis(table: AnnData, path: Path) -> None:
             if not csv.is_file():
                 continue
             df = pd.read_csv(csv, dtype={barcode: str})
-            labels = df.set_index(barcode)[cluster].reindex(obs_names)
+            labels = df.set_index(barcode)[cluster].reindex(join_keys)
             # Cluster ids are 1-based ints; store as string categories (idiomatic
             # for scanpy/squidpy) so "1" never becomes "1.0" via the NaN upcast.
             str_labels = [None if pd.isna(v) else str(int(v)) for v in labels]
             table.obs[sub.name] = pd.Categorical(str_labels)
 
     # pca / umap projections -> obsm
-    pca = _read_projection(analysis_dir / XeniumKeys.ANALYSIS_PCA_DIR, obs_names, barcode)
+    pca = _read_projection(analysis_dir / XeniumKeys.ANALYSIS_PCA_DIR, join_keys, barcode)
     if pca is not None:
         table.obsm["X_pca"] = pca
-    umap = _read_projection(analysis_dir / XeniumKeys.ANALYSIS_UMAP_DIR, obs_names, barcode)
+    umap = _read_projection(analysis_dir / XeniumKeys.ANALYSIS_UMAP_DIR, join_keys, barcode)
     if umap is not None:
         table.obsm["X_umap"] = umap
 
@@ -552,12 +561,13 @@ def _add_cells_analysis(table: AnnData, path: Path) -> None:
             table.uns["diffexp"] = diffexp
 
 
-def _read_projection(group_dir: Path, obs_names: pd.Index, barcode: str) -> ArrayLike | None:
+def _read_projection(group_dir: Path, join_keys: pd.Index, barcode: str) -> ArrayLike | None:
     """Read a ``<name>/projection.csv`` under ``group_dir`` into an obsm-shaped array.
 
-    Returns an ``(n_obs, n_components)`` float array aligned to ``obs_names`` (rows absent from
-    the projection become NaN), or ``None`` when ``group_dir`` has no projection. If several
-    projections exist (rare), the one with the most components is used.
+    Returns an ``(n_obs, n_components)`` float array aligned to ``join_keys`` (the cell-id
+    barcodes in table row order; rows absent from the projection become NaN), or ``None``
+    when ``group_dir`` has no projection. If several projections exist (rare), the one with
+    the most components is used.
     """
     if not group_dir.is_dir():
         return None
@@ -568,7 +578,7 @@ def _read_projection(group_dir: Path, obs_names: pd.Index, barcode: str) -> Arra
         if not csv.is_file():
             continue
         df = pd.read_csv(csv, dtype={barcode: str}).set_index(barcode)
-        arr = df.reindex(obs_names).to_numpy(dtype=np.float32)
+        arr = df.reindex(join_keys).to_numpy(dtype=np.float32)
         if arr.shape[1] > best_cols:
             best, best_cols = arr, arr.shape[1]
     return best

From 687a283a81012880850fbd738a097628becb9462 Mon Sep 17 00:00:00 2001
From: Tomatokeftes <129113023+Tomatokeftes@users.noreply.github.com>
Date: Thu, 18 Jun 2026 14:19:54 +0200
Subject: [PATCH 3/3] fix(xenium): expose cells_analysis as a CLI option

The CLI-completeness test (test_cli_exposes_all_reader_params) requires every
xenium() parameter to have a matching click option in xenium_wrapper. Add the
--cells-analysis option + param + pass-through for the new cells_analysis kwarg.
---
 src/spatialdata_io/__main__.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/spatialdata_io/__main__.py b/src/spatialdata_io/__main__.py
index f0a5f9f7..84947489 100644
--- a/src/spatialdata_io/__main__.py
+++ b/src/spatialdata_io/__main__.py
@@ -724,6 +724,12 @@ def visium_hd_wrapper(
     default=True,
     help="Whether to read cells annotations in the AnnData table. [default: True]",
 )
+@click.option(
+    "--cells-analysis",
+    type=bool,
+    default=True,
+    help="Whether to read the onboard secondary analysis (clustering/PCA/UMAP/diffexp) into the table. [default: True]",
+)
 @click.option(
     "--gex-only",
     type=bool,
@@ -762,6 +768,7 @@ def xenium_wrapper(
     morphology_focus: bool = True,
     aligned_images: bool = True,
     cells_table: bool = True,
+    cells_analysis: bool = True,
     gex_only: bool = True,
     imread_kwargs: str = "{}",
     image_models_kwargs: str = "{}",
@@ -782,6 +789,7 @@ def xenium_wrapper(
         morphology_focus=morphology_focus,
         aligned_images=aligned_images,
         cells_table=cells_table,
+        cells_analysis=cells_analysis,
         gex_only=gex_only,
         imread_kwargs=_parse_json_param(imread_kwargs, "imread_kwargs"),
         image_models_kwargs=_parse_json_param(image_models_kwargs, "image_models_kwargs"),