Skip to content

Commit 19a2ca3

Browse files
ivirshup and ebezzi authored
Add support for getting data from census_spatial_sequencing to get_anndata, get_obs, get_var (#1383)
* Add support for getting data from census_spatial_sequencing to get_anndata * Document modality argument to get_anndata * Add support for modality argument to get_obs, get_var * Start on presence matrix (possible bug) * Fix bug and correct test * Apply suggestions from code review Co-authored-by: Emanuele Bezzi <[email protected]> --------- Co-authored-by: Emanuele Bezzi <[email protected]>
1 parent aef4506 commit 19a2ca3

File tree

5 files changed

+66
-18
lines changed

5 files changed

+66
-18
lines changed

api/python/cellxgene_census/src/cellxgene_census/_experiment.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,11 @@ def _get_experiment_name(organism: str) -> str:
1818
return re.sub(r"[ ]+", "_", organism).lower()
1919

2020

21-
def _get_experiment(census: soma.Collection, organism: str) -> soma.Experiment:
21+
def _get_experiment(
22+
census: soma.Collection,
23+
organism: str,
24+
modality: str = "census_data",
25+
) -> soma.Experiment:
2226
"""Given a census :class:`tiledbsoma.Collection`, return the experiment for the named organism.
2327
Organism matching is somewhat flexible, attempting to map from human-friendly
2428
names to the underlying collection element name.
@@ -49,7 +53,7 @@ def _get_experiment(census: soma.Collection, organism: str) -> soma.Experiment:
4953

5054
if exp_name not in census["census_data"]:
5155
raise ValueError(f"Unknown organism {organism} - does not exist")
52-
exp = census["census_data"][exp_name]
56+
exp = census[modality][exp_name]
5357
if exp.soma_type != "SOMAExperiment":
5458
raise ValueError(f"Unknown organism {organism} - not a SOMA Experiment")
5559

api/python/cellxgene_census/src/cellxgene_census/_get_anndata.py

Lines changed: 32 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ def get_anndata(
4242
var_embeddings: Sequence[str] | None = (),
4343
obs_column_names: Sequence[str] | None = None,
4444
var_column_names: Sequence[str] | None = None,
45+
modality: str = "census_data",
4546
) -> anndata.AnnData:
4647
"""Convenience wrapper around :class:`tiledbsoma.Experiment` query, to build and execute a query,
4748
and return it as an :class:`anndata.AnnData` object.
@@ -89,6 +90,9 @@ def get_anndata(
8990
Columns to fetch for ``obs`` dataframe.
9091
var_column_names:
9192
Columns to fetch for ``var`` dataframe.
93+
modality:
94+
Which modality to query, can be one of ``"census_data"`` or ``"census_spatial_sequencing"``.
95+
Defaults to ``"census_data"``.
9296
9397
Returns:
9498
An :class:`anndata.AnnData` object containing the census slice.
@@ -103,7 +107,7 @@ def get_anndata(
103107
104108
>>> get_anndata(census, "Homo sapiens", obs_coords=slice(0, 1000))
105109
"""
106-
exp = _get_experiment(census, organism)
110+
exp = _get_experiment(census, organism, modality)
107111
obs_coords = (slice(None),) if obs_coords is None else (obs_coords,)
108112
var_coords = (slice(None),) if var_coords is None else (var_coords,)
109113

@@ -147,6 +151,9 @@ def get_anndata(
147151

148152
# If obs_embeddings or var_embeddings are defined, inject them in the appropriate slot
149153
if obs_embeddings or var_embeddings:
154+
if modality == "census_spatial_sequencing":
155+
raise ValueError("Embeddings are not supported for the spatial sequencing collection at this time.")
156+
150157
from .experimental._embedding import _get_embedding, get_embedding_metadata_by_name
151158

152159
census_version = _extract_census_version(census)
@@ -176,12 +183,13 @@ def _get_axis_metadata(
176183
census: soma.Collection,
177184
axis: Literal["obs", "var"],
178185
organism: str,
186+
modality: str = "census_data",
179187
*,
180188
value_filter: str | None = None,
181189
coords: SparseDFCoord | None = slice(None),
182190
column_names: Sequence[str] | None = None,
183191
) -> pd.DataFrame:
184-
exp = _get_experiment(census, organism)
192+
exp = _get_experiment(census, organism, modality=modality)
185193
coords = (slice(None),) if coords is None else (coords,)
186194
if axis == "obs":
187195
df = exp.obs
@@ -202,6 +210,7 @@ def get_obs(
202210
value_filter: str | None = None,
203211
coords: SparseDFCoord | None = slice(None),
204212
column_names: Sequence[str] | None = None,
213+
modality: str = "census_data",
205214
) -> pd.DataFrame:
206215
"""Get the observation metadata for a query on the census.
207216
@@ -218,12 +227,21 @@ def get_obs(
218227
May be an ``int``, a list of ``int``, or a slice. The default, ``None``, selects all.
219228
column_names:
220229
Columns to fetch.
230+
modality:
231+
Which modality to query, can be one of ``"census_data"`` or ``"census_spatial_sequencing"``.
232+
Defaults to ``"census_data"``.
221233
222234
Returns:
223235
A :class:`pandas.DataFrame` object containing metadata for the queried slice.
224236
"""
225237
return _get_axis_metadata(
226-
census, "obs", organism, value_filter=value_filter, coords=coords, column_names=column_names
238+
census,
239+
"obs",
240+
organism,
241+
value_filter=value_filter,
242+
coords=coords,
243+
column_names=column_names,
244+
modality=modality,
227245
)
228246

229247

@@ -234,6 +252,7 @@ def get_var(
234252
value_filter: str | None = None,
235253
coords: SparseDFCoord | None = slice(None),
236254
column_names: Sequence[str] | None = None,
255+
modality: str = "census_data",
237256
) -> pd.DataFrame:
238257
"""Get the variable metadata for a query on the census.
239258
@@ -250,10 +269,19 @@ def get_var(
250269
May be an ``int``, a list of ``int``, or a slice. The default, ``None``, selects all.
251270
column_names:
252271
Columns to fetch.
272+
modality:
273+
Which modality to query, can be one of ``"census_data"`` or ``"census_spatial_sequencing"``.
274+
Defaults to ``"census_data"``.
253275
254276
Returns:
255277
A :class:`pandas.DataFrame` object containing metadata for the queried slice.
256278
"""
257279
return _get_axis_metadata(
258-
census, "var", organism, value_filter=value_filter, coords=coords, column_names=column_names
280+
census,
281+
"var",
282+
organism,
283+
value_filter=value_filter,
284+
coords=coords,
285+
column_names=column_names,
286+
modality=modality,
259287
)

api/python/cellxgene_census/src/cellxgene_census/_presence_matrix.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ def get_presence_matrix(
1717
census: soma.Collection,
1818
organism: str,
1919
measurement_name: str = "RNA",
20+
modality: str = "census_data",
2021
) -> sparse.csr_matrix:
2122
"""Read the feature dataset presence matrix and return as a :class:`scipy.sparse.csr_array`. The
2223
returned sparse matrix is indexed on the first dimension by the dataset ``soma_joinid`` values,
@@ -29,6 +30,9 @@ def get_presence_matrix(
2930
The organism to query, usually one of ``"Homo sapiens"`` or ``"Mus musculus"``.
3031
measurement_name:
3132
The measurement object to query. Defaults to ``"RNA"``.
33+
modality:
34+
Which modality to query, can be one of ``"census_data"`` or ``"census_spatial_sequencing"``.
35+
Defaults to ``"census_data"``.
3236
3337
Returns:
3438
A :class:`scipy.sparse.csr_array` object containing the presence matrix.
@@ -44,6 +48,6 @@ def get_presence_matrix(
4448
<321x60554 sparse array of type '<class 'numpy.uint8'>'
4549
with 6441269 stored elements in Compressed Sparse Row format>
4650
"""
47-
exp = _get_experiment(census, organism)
51+
exp = _get_experiment(census, organism, modality=modality)
4852
presence = exp.ms[measurement_name]["feature_dataset_presence_matrix"]
4953
return presence.read((slice(None),)).coos().concat().to_scipy().tocsr()

api/python/cellxgene_census/tests/test_get_anndata.py

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -33,12 +33,14 @@ def test_get_anndata_value_filter(census: soma.Collection) -> None:
3333

3434

3535
@pytest.mark.live_corpus
36-
def test_get_anndata_coords(census: soma.Collection) -> None:
36+
@pytest.mark.parametrize("modality", ["census_data", "census_spatial_sequencing"])
37+
def test_get_anndata_coords(census: soma.Collection, modality: str) -> None:
3738
ad = cellxgene_census.get_anndata(
3839
census,
3940
organism="Mus musculus",
4041
obs_coords=slice(1000),
4142
var_coords=slice(2000),
43+
modality=modality,
4244
)
4345

4446
assert ad is not None
@@ -119,10 +121,12 @@ def test_get_anndata_two_layers(census: soma.Collection, layers: list[str]) -> N
119121

120122

121123
@pytest.mark.live_corpus
122-
def test_get_anndata_wrong_layer_names(census: soma.Collection) -> None:
124+
@pytest.mark.parametrize("modality", ["census_data", "census_spatial_sequencing"])
125+
def test_get_anndata_wrong_layer_names(census: soma.Collection, modality: str) -> None:
123126
with pytest.raises(ValueError) as raise_info:
124127
cellxgene_census.get_anndata(
125128
census,
129+
modality=modality,
126130
organism="Homo sapiens",
127131
X_name="this_layer_name_is_bad",
128132
obs_coords=slice(100),
@@ -134,6 +138,7 @@ def test_get_anndata_wrong_layer_names(census: soma.Collection) -> None:
134138
with pytest.raises(ValueError) as raise_info:
135139
cellxgene_census.get_anndata(
136140
census,
141+
modality=modality,
137142
organism="Homo sapiens",
138143
X_name="raw",
139144
X_layers=["this_layer_name_is_bad"],
@@ -334,11 +339,12 @@ def _map_to_get_anndata_args(query: dict[str, Any], axis: Literal["obs", "var"])
334339
pytest.param({"value_filter": "tissue_general == 'vasculature'"}, id="value_filter"),
335340
],
336341
)
337-
def test_get_obs(lts_census: soma.Collection, query: dict[str, Any]) -> None:
342+
@pytest.mark.parametrize("modality", ["census_data", "census_spatial_sequencing"])
343+
def test_get_obs(lts_census: soma.Collection, query: dict[str, Any], modality: str) -> None:
338344
adata_obs = cellxgene_census.get_anndata(
339-
lts_census, organism="Mus musculus", **_map_to_get_anndata_args(query, "obs")
345+
lts_census, organism="Mus musculus", modality=modality, **_map_to_get_anndata_args(query, "obs")
340346
).obs
341-
only_obs = cellxgene_census.get_obs(lts_census, "Mus musculus", **query)
347+
only_obs = cellxgene_census.get_obs(lts_census, "Mus musculus", modality=modality, **query)
342348
# account for a difference:
343349
only_obs.index = only_obs.index.astype(str)
344350

@@ -360,11 +366,16 @@ def test_get_obs(lts_census: soma.Collection, query: dict[str, Any]) -> None:
360366
pytest.param({"value_filter": "feature_name in ['Gm53058', '0610010K14Rik']"}, id="value_filter"),
361367
],
362368
)
363-
def test_get_var(lts_census: soma.Collection, query: dict[str, Any]) -> None:
369+
@pytest.mark.parametrize("modality", ["census_data", "census_spatial_sequencing"])
370+
def test_get_var(lts_census: soma.Collection, query: dict[str, Any], modality: str) -> None:
364371
adata_var = cellxgene_census.get_anndata(
365-
lts_census, organism="Mus musculus", obs_coords=slice(0), **_map_to_get_anndata_args(query, "var")
372+
lts_census,
373+
organism="Mus musculus",
374+
obs_coords=slice(0),
375+
modality=modality,
376+
**_map_to_get_anndata_args(query, "var"),
366377
).var
367-
only_var = cellxgene_census.get_var(lts_census, "Mus musculus", **query)
378+
only_var = cellxgene_census.get_var(lts_census, "Mus musculus", modality=modality, **query)
368379
# AnnData instantiation converts the index to string, so we match that behaviour for comparisons sake
369380
only_var.index = only_var.index.astype(str)
370381

api/python/cellxgene_census/tests/test_get_helpers.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,12 +25,13 @@ def test_get_experiment(census: soma.Collection) -> None:
2525

2626
@pytest.mark.live_corpus
2727
@pytest.mark.parametrize("organism", ["homo_sapiens", "mus_musculus"])
28-
def test_get_presence_matrix(organism: str, census: soma.Collection) -> None:
28+
@pytest.mark.parametrize("modality", ["census_data", "census_spatial_sequencing"])
29+
def test_get_presence_matrix(organism: str, census: soma.Collection, modality: str) -> None:
2930
census_datasets = census["census_info"]["datasets"].read().concat().to_pandas()
3031

31-
pm = cellxgene_census.get_presence_matrix(census, organism)
32+
pm = cellxgene_census.get_presence_matrix(census, organism, modality=modality)
3233
assert isinstance(pm, scipy.sparse.csr_matrix)
3334
assert pm.shape[0] == len(census_datasets)
3435
assert pm.shape[1] == len(
35-
census["census_data"][organism].ms["RNA"].var.read(column_names=["soma_joinid"]).concat().to_pandas()
36+
census[modality][organism].ms["RNA"].var.read(column_names=["soma_joinid"]).concat().to_pandas()
3637
)

0 commit comments

Comments
 (0)