Add support for getting data from census_spatial_sequencing to get_anndata, get_obs, get_var (#1383)

ivirshup · ebezzi · web-flow · commit 19a2ca31ad56 · 2025-04-01T14:32:30.000-07:00
* Add support for getting data from census_spatial_sequencing to get_anndata

* Document modality argument to get_anndata

* Add support for modality argument to get_obs, get_var

* Start on presence matrix (possible bug)

* Fix bug and correct test

* Apply suggestions from code review

Co-authored-by: Emanuele Bezzi <ebezzi@chanzuckerberg.com>

---------

Co-authored-by: Emanuele Bezzi <ebezzi@chanzuckerberg.com>
diff --git a/api/python/cellxgene_census/src/cellxgene_census/_experiment.py b/api/python/cellxgene_census/src/cellxgene_census/_experiment.py
@@ -18,7 +18,11 @@ def _get_experiment_name(organism: str) -> str:
     return re.sub(r"[ ]+", "_", organism).lower()
 
 
-def _get_experiment(census: soma.Collection, organism: str) -> soma.Experiment:
+def _get_experiment(
+    census: soma.Collection,
+    organism: str,
+    modality: str = "census_data",
+) -> soma.Experiment:
     """Given a census :class:`tiledbsoma.Collection`, return the experiment for the named organism.
     Organism matching is somewhat flexible, attempting to map from human-friendly
     names to the underlying collection element name.
@@ -49,7 +53,7 @@ def _get_experiment(census: soma.Collection, organism: str) -> soma.Experiment:
 
     if exp_name not in census["census_data"]:
         raise ValueError(f"Unknown organism {organism} - does not exist")
-    exp = census["census_data"][exp_name]
+    exp = census[modality][exp_name]
     if exp.soma_type != "SOMAExperiment":
         raise ValueError(f"Unknown organism {organism} - not a SOMA Experiment")
 
diff --git a/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py b/api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py
@@ -42,6 +42,7 @@ def get_anndata(
     var_embeddings: Sequence[str] | None = (),
     obs_column_names: Sequence[str] | None = None,
     var_column_names: Sequence[str] | None = None,
+    modality: str = "census_data",
 ) -> anndata.AnnData:
     """Convenience wrapper around :class:`tiledbsoma.Experiment` query, to build and execute a query,
     and return it as an :class:`anndata.AnnData` object.
@@ -89,6 +90,9 @@ def get_anndata(
             Columns to fetch for ``obs`` dataframe.
         var_column_names:
             Columns to fetch for ``var`` dataframe.
+        modality:
+            Which modality to query, can be one of ``"census_data"`` or ``"census_spatial_sequencing"``.
+            Defaults to ``"census_data"``.
 
     Returns:
         An :class:`anndata.AnnData` object containing the census slice.
@@ -103,7 +107,7 @@ def get_anndata(
 
         >>> get_anndata(census, "Homo sapiens", obs_coords=slice(0, 1000))
     """
-    exp = _get_experiment(census, organism)
+    exp = _get_experiment(census, organism, modality)
     obs_coords = (slice(None),) if obs_coords is None else (obs_coords,)
     var_coords = (slice(None),) if var_coords is None else (var_coords,)
 
@@ -147,6 +151,9 @@ def get_anndata(
 
         # If obs_embeddings or var_embeddings are defined, inject them in the appropriate slot
         if obs_embeddings or var_embeddings:
+            if modality == "census_spatial_sequencing":
+                raise ValueError("Embeddings are not supported for the spatial sequencing collection at this time.")
+
             from .experimental._embedding import _get_embedding, get_embedding_metadata_by_name
 
             census_version = _extract_census_version(census)
@@ -176,12 +183,13 @@ def _get_axis_metadata(
     census: soma.Collection,
     axis: Literal["obs", "var"],
     organism: str,
+    modality: str = "census_data",
     *,
     value_filter: str | None = None,
     coords: SparseDFCoord | None = slice(None),
     column_names: Sequence[str] | None = None,
 ) -> pd.DataFrame:
-    exp = _get_experiment(census, organism)
+    exp = _get_experiment(census, organism, modality=modality)
     coords = (slice(None),) if coords is None else (coords,)
     if axis == "obs":
         df = exp.obs
@@ -202,6 +210,7 @@ def get_obs(
     value_filter: str | None = None,
     coords: SparseDFCoord | None = slice(None),
     column_names: Sequence[str] | None = None,
+    modality: str = "census_data",
 ) -> pd.DataFrame:
     """Get the observation metadata for a query on the census.
 
@@ -218,12 +227,21 @@ def get_obs(
             May be an ``int``, a list of ``int``, or a slice. The default, ``None``, selects all.
         column_names:
             Columns to fetch.
+        modality:
+            Which modality to query, can be one of ``"census_data"`` or ``"census_spatial_sequencing"``.
+            Defaults to ``"census_data"``.
 
     Returns:
         A :class:`pandas.DataFrame` object containing metadata for the queried slice.
     """
     return _get_axis_metadata(
-        census, "obs", organism, value_filter=value_filter, coords=coords, column_names=column_names
+        census,
+        "obs",
+        organism,
+        value_filter=value_filter,
+        coords=coords,
+        column_names=column_names,
+        modality=modality,
     )
 
 
@@ -234,6 +252,7 @@ def get_var(
     value_filter: str | None = None,
     coords: SparseDFCoord | None = slice(None),
     column_names: Sequence[str] | None = None,
+    modality: str = "census_data",
 ) -> pd.DataFrame:
     """Get the variable metadata for a query on the census.
 
@@ -250,10 +269,19 @@ def get_var(
             May be an ``int``, a list of ``int``, or a slice. The default, ``None``, selects all.
         column_names:
             Columns to fetch.
+        modality:
+            Which modality to query, can be one of ``"census_data"`` or ``"census_spatial_sequencing"``.
+            Defaults to ``"census_data"``.
 
     Returns:
         A :class:`pandas.DataFrame` object containing metadata for the queried slice.
     """
     return _get_axis_metadata(
-        census, "var", organism, value_filter=value_filter, coords=coords, column_names=column_names
+        census,
+        "var",
+        organism,
+        value_filter=value_filter,
+        coords=coords,
+        column_names=column_names,
+        modality=modality,
     )
diff --git a/api/python/cellxgene_census/src/cellxgene_census/_presence_matrix.py b/api/python/cellxgene_census/src/cellxgene_census/_presence_matrix.py
@@ -17,6 +17,7 @@ def get_presence_matrix(
     census: soma.Collection,
     organism: str,
     measurement_name: str = "RNA",
+    modality: str = "census_data",
 ) -> sparse.csr_matrix:
     """Read the feature dataset presence matrix and return as a :class:`scipy.sparse.csr_array`. The
     returned sparse matrix is indexed on the first dimension by the dataset ``soma_joinid`` values,
@@ -29,6 +30,9 @@ def get_presence_matrix(
             The organism to query, usually one of ``"Homo sapiens"`` or ``"Mus musculus"``.
         measurement_name:
             The measurement object to query. Deafults to ``"RNA"``.
+        modality:
+            Which modality to query, can be one of ``"census_data"`` or ``"census_spatial_sequencing"``.
+            Defaults to ``"census_data"``.
 
     Returns:
         A :class:`scipy.sparse.csr_array` object containing the presence matrix.
@@ -44,6 +48,6 @@ def get_presence_matrix(
         <321x60554 sparse array of type '<class 'numpy.uint8'>'
         with 6441269 stored elements in Compressed Sparse Row format>
     """
-    exp = _get_experiment(census, organism)
+    exp = _get_experiment(census, organism, modality=modality)
     presence = exp.ms[measurement_name]["feature_dataset_presence_matrix"]
     return presence.read((slice(None),)).coos().concat().to_scipy().tocsr()
diff --git a/api/python/cellxgene_census/tests/test_get_anndata.py b/api/python/cellxgene_census/tests/test_get_anndata.py
@@ -33,12 +33,14 @@ def test_get_anndata_value_filter(census: soma.Collection) -> None:
 
 
 @pytest.mark.live_corpus
-def test_get_anndata_coords(census: soma.Collection) -> None:
+@pytest.mark.parametrize("modality", ["census_data", "census_spatial_sequencing"])
+def test_get_anndata_coords(census: soma.Collection, modality: str) -> None:
     ad = cellxgene_census.get_anndata(
         census,
         organism="Mus musculus",
         obs_coords=slice(1000),
         var_coords=slice(2000),
+        modality=modality,
     )
 
     assert ad is not None
@@ -119,10 +121,12 @@ def test_get_anndata_two_layers(census: soma.Collection, layers: list[str]) -> N
 
 
 @pytest.mark.live_corpus
-def test_get_anndata_wrong_layer_names(census: soma.Collection) -> None:
+@pytest.mark.parametrize("modality", ["census_data", "census_spatial_sequencing"])
+def test_get_anndata_wrong_layer_names(census: soma.Collection, modality: str) -> None:
     with pytest.raises(ValueError) as raise_info:
         cellxgene_census.get_anndata(
             census,
+            modality=modality,
             organism="Homo sapiens",
             X_name="this_layer_name_is_bad",
             obs_coords=slice(100),
@@ -134,6 +138,7 @@ def test_get_anndata_wrong_layer_names(census: soma.Collection) -> None:
     with pytest.raises(ValueError) as raise_info:
         cellxgene_census.get_anndata(
             census,
+            modality=modality,
             organism="Homo sapiens",
             X_name="raw",
             X_layers=["this_layer_name_is_bad"],
@@ -334,11 +339,12 @@ def _map_to_get_anndata_args(query: dict[str, Any], axis: Literal["obs", "var"])
         pytest.param({"value_filter": "tissue_general == 'vasculature'"}, id="value_filter"),
     ],
 )
-def test_get_obs(lts_census: soma.Collection, query: dict[str, Any]) -> None:
+@pytest.mark.parametrize("modality", ["census_data", "census_spatial_sequencing"])
+def test_get_obs(lts_census: soma.Collection, query: dict[str, Any], modality: str) -> None:
     adata_obs = cellxgene_census.get_anndata(
-        lts_census, organism="Mus musculus", **_map_to_get_anndata_args(query, "obs")
+        lts_census, organism="Mus musculus", modality=modality, **_map_to_get_anndata_args(query, "obs")
     ).obs
-    only_obs = cellxgene_census.get_obs(lts_census, "Mus musculus", **query)
+    only_obs = cellxgene_census.get_obs(lts_census, "Mus musculus", modality=modality, **query)
     # account for a difference:
     only_obs.index = only_obs.index.astype(str)
 
@@ -360,11 +366,16 @@ def test_get_obs(lts_census: soma.Collection, query: dict[str, Any]) -> None:
         pytest.param({"value_filter": "feature_name in ['Gm53058', '0610010K14Rik']"}, id="value_filter"),
     ],
 )
-def test_get_var(lts_census: soma.Collection, query: dict[str, Any]) -> None:
+@pytest.mark.parametrize("modality", ["census_data", "census_spatial_sequencing"])
+def test_get_var(lts_census: soma.Collection, query: dict[str, Any], modality: str) -> None:
     adata_var = cellxgene_census.get_anndata(
-        lts_census, organism="Mus musculus", obs_coords=slice(0), **_map_to_get_anndata_args(query, "var")
+        lts_census,
+        organism="Mus musculus",
+        obs_coords=slice(0),
+        modality=modality,
+        **_map_to_get_anndata_args(query, "var"),
     ).var
-    only_var = cellxgene_census.get_var(lts_census, "Mus musculus", **query)
+    only_var = cellxgene_census.get_var(lts_census, "Mus musculus", modality=modality, **query)
     # AnnData instantiation converts the index to string, so we match that behaviour for comparisons sake
     only_var.index = only_var.index.astype(str)
 
diff --git a/api/python/cellxgene_census/tests/test_get_helpers.py b/api/python/cellxgene_census/tests/test_get_helpers.py
@@ -25,12 +25,13 @@ def test_get_experiment(census: soma.Collection) -> None:
 
 @pytest.mark.live_corpus
 @pytest.mark.parametrize("organism", ["homo_sapiens", "mus_musculus"])
-def test_get_presence_matrix(organism: str, census: soma.Collection) -> None:
+@pytest.mark.parametrize("modality", ["census_data", "census_spatial_sequencing"])
+def test_get_presence_matrix(organism: str, census: soma.Collection, modality: str) -> None:
     census_datasets = census["census_info"]["datasets"].read().concat().to_pandas()
 
-    pm = cellxgene_census.get_presence_matrix(census, organism)
+    pm = cellxgene_census.get_presence_matrix(census, organism, modality=modality)
     assert isinstance(pm, scipy.sparse.csr_matrix)
     assert pm.shape[0] == len(census_datasets)
     assert pm.shape[1] == len(
-        census["census_data"][organism].ms["RNA"].var.read(column_names=["soma_joinid"]).concat().to_pandas()
+        census[modality][organism].ms["RNA"].var.read(column_names=["soma_joinid"]).concat().to_pandas()
     )