scverse · Tomatokeftes · Jun 18, 2026 · Jun 18, 2026 · Jun 18, 2026
diff --git a/src/spatialdata_io/__main__.py b/src/spatialdata_io/__main__.py
@@ -724,6 +724,12 @@ def visium_hd_wrapper(
     default=True,
     help="Whether to read cells annotations in the AnnData table. [default: True]",
 )
+@click.option(
+    "--cells-analysis",
+    type=bool,
+    default=True,
+    help="Whether to read the onboard secondary analysis (clustering/PCA/UMAP/diffexp) into the table. [default: True]",
+)
 @click.option(
     "--gex-only",
     type=bool,
@@ -762,6 +768,7 @@ def xenium_wrapper(
     morphology_focus: bool = True,
     aligned_images: bool = True,
     cells_table: bool = True,
+    cells_analysis: bool = True,
     gex_only: bool = True,
     imread_kwargs: str = "{}",
     image_models_kwargs: str = "{}",
@@ -782,6 +789,7 @@ def xenium_wrapper(
         morphology_focus=morphology_focus,
         aligned_images=aligned_images,
         cells_table=cells_table,
+        cells_analysis=cells_analysis,
         gex_only=gex_only,
         imread_kwargs=_parse_json_param(imread_kwargs, "imread_kwargs"),
         image_models_kwargs=_parse_json_param(image_models_kwargs, "image_models_kwargs"),

diff --git a/src/spatialdata_io/_constants/_constants.py b/src/spatialdata_io/_constants/_constants.py
@@ -162,6 +162,18 @@ class XeniumKeys(ModeEnum):
     EXPLORER_SELECTION_Y = "Y"
     EXPLORER_SELECTION_KEY = "Selection"
 
+    # secondary analysis (the ``analysis/`` folder: clustering / pca / umap / diffexp)
+    ANALYSIS_DIR = "analysis"
+    ANALYSIS_CLUSTERING_DIR = "clustering"
+    ANALYSIS_PCA_DIR = "pca"
+    ANALYSIS_UMAP_DIR = "umap"
+    ANALYSIS_DIFFEXP_DIR = "diffexp"
+    ANALYSIS_CLUSTERS_FILE = "clusters.csv"
+    ANALYSIS_PROJECTION_FILE = "projection.csv"
+    ANALYSIS_DIFFEXP_FILE = "differential_expression.csv"
+    ANALYSIS_BARCODE = "Barcode"
+    ANALYSIS_CLUSTER = "Cluster"
+
 
 @unique
 class VisiumKeys(ModeEnum):

diff --git a/src/spatialdata_io/readers/xenium.py b/src/spatialdata_io/readers/xenium.py
@@ -272,6 +272,7 @@ def xenium(
     morphology_focus: bool = True,
     aligned_images: bool = True,
     cells_table: bool = True,
+    cells_analysis: bool = True,
     n_jobs: int | None = None,
     gex_only: bool = True,
     imread_kwargs: Mapping[str, Any] = MappingProxyType({}),
@@ -325,6 +326,13 @@ def xenium(
         `False` and use the `xenium_aligned_image` function directly.
     cells_table
         Whether to read the cell annotations in the `AnnData` table.
+    cells_analysis
+        Whether to read the Xenium onboard secondary analysis (the ``analysis/`` folder) into the table, when present.
+        Clustering results (``analysis/clustering``) are added as categorical columns in ``table.obs`` (one per
+        clustering, joined to the cells by ``cell_id``); PCA and UMAP projections are added to ``table.obsm`` as
+        ``"X_pca"`` / ``"X_umap"``; and differential-expression tables (``analysis/diffexp``) are added to
+        ``table.uns["diffexp"]``. Requires ``cells_table=True``; a missing ``analysis/`` folder (e.g. re-segmented
+        data) is a no-op.
     n_jobs
         .. deprecated::
             ``n_jobs`` is not used anymore and will be removed in a future release. The reading time of shapes is now
@@ -412,6 +420,9 @@ def xenium(
                 if not cells_as_circles:
                     table.uns[TableModel.ATTRS_KEY][TableModel.INSTANCE_KEY] = "cell_labels"
 
+        if cells_analysis:
+            _add_cells_analysis(table, path)
+
     # --- read elements ---
     polygons = {}
     labels = {}
@@ -482,6 +493,97 @@ def _decode_cell_id_column(cell_id_column: pd.Series) -> pd.Series:
     return cell_id_column
 
 
+def _add_cells_analysis(table: AnnData, path: Path) -> None:
+    """Enrich the cell table in place with the Xenium onboard secondary analysis.
+
+    Reads the ``analysis/`` folder of the Xenium output, joining everything to the
+    cells by the ``Barcode`` column, which is the Xenium ``cell_id`` (and hence
+    ``table.obs_names``). Cells absent from a given result (e.g. filtered out by QC
+    before clustering) receive a missing value rather than being dropped.
+
+    - ``analysis/clustering/<name>/clusters.csv`` -> one categorical column per
+      clustering in ``table.obs`` (named ``<name>``, e.g. ``gene_expression_graphclust``).
+    - ``analysis/pca/<name>/projection.csv``  -> ``table.obsm["X_pca"]``.
+    - ``analysis/umap/<name>/projection.csv`` -> ``table.obsm["X_umap"]``.
+    - ``analysis/diffexp/<name>/differential_expression.csv`` ->
+      ``table.uns["diffexp"][<name>]``.
+
+    A missing ``analysis/`` folder is a no-op (e.g. re-segmented data, or a
+    matrix-only export).
+    """
+    analysis_dir = path / XeniumKeys.ANALYSIS_DIR
+    if not analysis_dir.is_dir():
+        return
+    barcode = str(XeniumKeys.ANALYSIS_BARCODE)
+    cluster = str(XeniumKeys.ANALYSIS_CLUSTER)
+    # The clustering/projection CSVs key on the Xenium ``cell_id`` barcode. Join on
+    # the ``cell_id`` obs column rather than ``obs_names``: depending on the Xenium
+    # Analyzer version the table index may be the barcode OR a positional integer,
+    # but the ``cell_id`` column is always the barcode. ``join_keys`` stays in table
+    # row order, so reindexing to it keeps everything row-aligned.
+    cell_id_col = str(XeniumKeys.CELL_ID)
+    if cell_id_col in table.obs.columns:
+        join_keys = pd.Index([str(x) for x in table.obs[cell_id_col]])
+    else:
+        join_keys = pd.Index([str(x) for x in table.obs_names])
+
+    # clustering -> categorical obs columns
+    clustering_dir = analysis_dir / XeniumKeys.ANALYSIS_CLUSTERING_DIR
+    if clustering_dir.is_dir():
+        for sub in sorted(p for p in clustering_dir.iterdir() if p.is_dir()):
+            csv = sub / XeniumKeys.ANALYSIS_CLUSTERS_FILE
+            if not csv.is_file():
+                continue
+            df = pd.read_csv(csv, dtype={barcode: str})
+            labels = df.set_index(barcode)[cluster].reindex(join_keys)
+            # Cluster ids are 1-based ints; store as string categories (idiomatic
+            # for scanpy/squidpy) so "1" never becomes "1.0" via the NaN upcast.
+            str_labels = [None if pd.isna(v) else str(int(v)) for v in labels]
+            table.obs[sub.name] = pd.Categorical(str_labels)
+
+    # pca / umap projections -> obsm
+    pca = _read_projection(analysis_dir / XeniumKeys.ANALYSIS_PCA_DIR, join_keys, barcode)
+    if pca is not None:
+        table.obsm["X_pca"] = pca
+    umap = _read_projection(analysis_dir / XeniumKeys.ANALYSIS_UMAP_DIR, join_keys, barcode)
+    if umap is not None:
+        table.obsm["X_umap"] = umap
+
+    # differential expression -> uns
+    diffexp_dir = analysis_dir / XeniumKeys.ANALYSIS_DIFFEXP_DIR
+    if diffexp_dir.is_dir():
+        diffexp: dict[str, pd.DataFrame] = {}
+        for sub in sorted(p for p in diffexp_dir.iterdir() if p.is_dir()):
+            csv = sub / XeniumKeys.ANALYSIS_DIFFEXP_FILE
+            if csv.is_file():
+                diffexp[sub.name] = pd.read_csv(csv)
+        if diffexp:
+            table.uns["diffexp"] = diffexp
+
+
+def _read_projection(group_dir: Path, join_keys: pd.Index, barcode: str) -> ArrayLike | None:
+    """Read a ``<name>/projection.csv`` under ``group_dir`` into an obsm-shaped array.
+
+    Returns an ``(n_obs, n_components)`` float array aligned to ``join_keys`` (the cell-id
+    barcodes in table row order; rows absent from the projection become NaN), or ``None``
+    when ``group_dir`` has no projection. If several projections exist (rare), the one with
+    the most components is used.
+    """
+    if not group_dir.is_dir():
+        return None
+    best: ArrayLike | None = None
+    best_cols = -1
+    for sub in sorted(p for p in group_dir.iterdir() if p.is_dir()):
+        csv = sub / XeniumKeys.ANALYSIS_PROJECTION_FILE
+        if not csv.is_file():
+            continue
+        df = pd.read_csv(csv, dtype={barcode: str}).set_index(barcode)
+        arr = df.reindex(join_keys).to_numpy(dtype=np.float32)
+        if arr.shape[1] > best_cols:
+            best, best_cols = arr, arr.shape[1]
+    return best
+
+
 def _get_polygons(
     path: Path,
     file: str,

diff --git a/tests/test_xenium.py b/tests/test_xenium.py
@@ -3,14 +3,17 @@
 from tempfile import TemporaryDirectory
 
 import numpy as np
+import pandas as pd
 import pytest
+from anndata import AnnData
 from click.testing import CliRunner
 from pytest_mock import MockerFixture
 from spatialdata import match_table_to_element, read_zarr
 from spatialdata.models import TableModel, get_table_keys
 
 from spatialdata_io.__main__ import xenium_wrapper
 from spatialdata_io.readers.xenium import (
+    _add_cells_analysis,
     _cell_id_str_from_prefix_suffix_uint32_reference,
     cell_id_str_from_prefix_suffix_uint32,
     prefix_suffix_uint32_from_cell_id_str,
@@ -324,3 +327,78 @@ def test_cli_xenium_valid_json_forwarded(
     assert result.exit_code == 0, result.output
     call_kwargs = mock_xenium.call_args.kwargs
     assert call_kwargs[kwarg_param] == {"chunks": 512}
+
+
+def _write_clusters(folder: Path, barcodes: list[str], clusters: list[int]) -> None:
+    folder.mkdir(parents=True)
+    pd.DataFrame({"Barcode": barcodes, "Cluster": clusters}).to_csv(folder / "clusters.csv", index=False)
+
+
+def _write_projection(folder: Path, barcodes: list[str], cols: dict[str, list[float]]) -> None:
+    folder.mkdir(parents=True)
+    pd.DataFrame({"Barcode": barcodes, **cols}).to_csv(folder / "projection.csv", index=False)
+
+
+def test_xenium_cells_analysis(tmp_path: Path) -> None:
+    """``_add_cells_analysis`` joins onboard analysis to the table by ``cell_id``.
+
+    Covers: clustering -> categorical obs (joined by barcode, reordered, with a
+    cell absent from a clustering left missing), pca/umap -> obsm (aligned, missing
+    rows NaN), and diffexp -> uns.
+    """
+    obs_names = ["a-1", "b-1", "c-1", "d-1"]
+    adata = AnnData(X=np.zeros((4, 2), dtype=np.float32))
+    adata.obs_names = obs_names
+
+    analysis = tmp_path / "analysis"
+    # graphclust covers 3 of 4 cells (d-1 unclustered -> missing); rows scrambled.
+    _write_clusters(analysis / "clustering" / "gene_expression_graphclust", ["c-1", "a-1", "b-1"], [3, 1, 2])
+    _write_clusters(
+        analysis / "clustering" / "gene_expression_kmeans_2_clusters", ["a-1", "b-1", "c-1", "d-1"], [1, 2, 1, 2]
+    )
+    # pca covers 3 of 4 cells; umap covers all.
+    _write_projection(
+        analysis / "pca" / "gene_expression_2_components",
+        ["a-1", "b-1", "c-1"],
+        {"PC-1": [0.1, 0.2, 0.3], "PC-2": [1.0, 2.0, 3.0]},
+    )
+    _write_projection(
+        analysis / "umap" / "gene_expression_2_components",
+        ["a-1", "b-1", "c-1", "d-1"],
+        {"UMAP-1": [1.0, 2.0, 3.0, 4.0], "UMAP-2": [4.0, 3.0, 2.0, 1.0]},
+    )
+    de_dir = analysis / "diffexp" / "gene_expression_graphclust"
+    de_dir.mkdir(parents=True)
+    pd.DataFrame({"Feature ID": ["g1"], "Feature Name": ["G1"], "Cluster 1 Mean Counts": [0.5]}).to_csv(
+        de_dir / "differential_expression.csv", index=False
+    )
+
+    _add_cells_analysis(adata, tmp_path)
+
+    # clustering -> categorical obs, joined by barcode (NOT row position), missing -> NaN.
+    gc = adata.obs["gene_expression_graphclust"]
+    assert str(gc.dtype) == "category"
+    assert list(gc[:3]) == ["1", "2", "3"]
+    assert pd.isna(gc.iloc[3])
+    assert list(gc.cat.categories) == ["1", "2", "3"]  # string categories, no "1.0"
+    assert list(adata.obs["gene_expression_kmeans_2_clusters"]) == ["1", "2", "1", "2"]
+
+    # pca/umap -> obsm, aligned to obs order; cells absent from a projection are NaN.
+    assert adata.obsm["X_pca"].shape == (4, 2)
+    assert adata.obsm["X_pca"][0, 0] == np.float32(0.1)  # a-1 joined correctly
+    assert np.isnan(adata.obsm["X_pca"][3]).all()  # d-1 absent from pca
+    assert adata.obsm["X_umap"].shape == (4, 2)
+    assert not np.isnan(adata.obsm["X_umap"]).any()
+
+    # diffexp -> uns
+    assert "gene_expression_graphclust" in adata.uns["diffexp"]
+
+
+def test_xenium_cells_analysis_missing_folder_is_noop(tmp_path: Path) -> None:
+    adata = AnnData(X=np.zeros((2, 2), dtype=np.float32))
+    adata.obs_names = ["a-1", "b-1"]
+    _add_cells_analysis(adata, tmp_path)  # no analysis/ folder present
+    assert "X_pca" not in adata.obsm
+    assert "X_umap" not in adata.obsm
+    assert not any(c.startswith("gene_expression_") for c in adata.obs.columns)
+    assert "diffexp" not in adata.uns