[builder] Upgrade to CELLxGENE schema 5.1 (#1192)

Bento007 · ebezzi · web-flow · commit bfbde13ba7b7 · 2024-06-28T08:41:00.000-07:00
* feat: upgrade census builder to schema 5.1

* feat: upgrade COG version

* update tests

* Apply suggestions from code review

* bump census schema version

---------

Co-authored-by: Emanuele Bezzi <ebezzi@chanzuckerberg.com>
diff --git a/tools/cellxgene_census_builder/pyproject.toml b/tools/cellxgene_census_builder/pyproject.toml
@@ -36,7 +36,7 @@ dependencies= [
     #    https://github.com/TileDB-Inc/TileDB/blob/dev/format_spec/FORMAT_SPEC.md
     "tiledbsoma==1.9.3",
     "cellxgene-census==1.12.0",
-    "cellxgene-ontology-guide==0.6.1",
+    "cellxgene-ontology-guide==1.0.0",
     "scipy==1.12.0",
     "fsspec[http]==2024.3.1",
     "s3fs==2024.3.1",
diff --git a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/globals.py b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/globals.py
@@ -11,9 +11,9 @@
 # DataFrame columns. True is enabled, False is disabled.
 USE_ARROW_DICTIONARY = True
 
-CENSUS_SCHEMA_VERSION = "2.0.1"
+CENSUS_SCHEMA_VERSION = "2.1.0"
 
-CXG_SCHEMA_VERSION = "5.0.0"  # the CELLxGENE schema version supported
+CXG_SCHEMA_VERSION = "5.1.0"  # the CELLxGENE schema version supported
 
 # Columns expected in the census_datasets dataframe
 CENSUS_DATASETS_TABLE_SPEC = TableSpec.create(
diff --git a/tools/cellxgene_census_builder/tests/anndata/test_anndata.py b/tools/cellxgene_census_builder/tests/anndata/test_anndata.py
@@ -265,7 +265,7 @@ def test_empty_estimated_density(tmp_path: pathlib.Path) -> None:
     adata = anndata.AnnData(
         obs=pd.DataFrame(), var=pd.DataFrame({"feature_id": [0, 1, 2]}), X=sparse.csr_matrix((0, 3), dtype=np.float32)
     )
-    adata.uns["schema_version"] = "5.0.0"
+    adata.uns["schema_version"] = "5.1.0"
     adata.write_h5ad(path)
 
     with open_anndata(path) as ad:
@@ -297,7 +297,7 @@ def test_open_anndata_raw_X(tmp_path: pathlib.Path) -> None:
         var=pd.DataFrame({"feature_id": [0, 1, 2]}),
         X=sparse.csr_matrix((2, 3), dtype=np.float32),
         raw={"X": sparse.csr_matrix((2, 4), dtype=np.float32)},
-        uns={"schema_version": "5.0.0"},
+        uns={"schema_version": "5.1.0"},
     )
     adata.write_h5ad(path)
 
@@ -410,7 +410,7 @@ def test_multi_species_filter(
             index=[f"feature_{i}" for i in range(n_vars)],
         ),
         X=sparse.random(n_obs, n_vars, format="csr", dtype=np.float32),
-        uns={"schema_version": "5.0.0"},
+        uns={"schema_version": "5.1.0"},
     )
     path = (tmp_path / "species.h5ad").as_posix()
     adata.write_h5ad(path)
diff --git a/tools/cellxgene_census_builder/tests/conftest.py b/tools/cellxgene_census_builder/tests/conftest.py
@@ -116,7 +116,7 @@ def get_anndata(
     uns["batch_condition"] = np.array(["a", "b"], dtype="object")
 
     # Need to carefully set the corpora schema versions in order for tests to pass.
-    uns["schema_version"] = "5.0.0"  # type: ignore
+    uns["schema_version"] = "5.1.0"  # type: ignore
 
     return anndata.AnnData(X=X, obs=obs, var=var, obsm=obsm, uns=uns)
 
diff --git a/tools/cellxgene_census_builder/tests/test_manifest.py b/tools/cellxgene_census_builder/tests/test_manifest.py
@@ -65,7 +65,7 @@ def test_load_manifest_from_cxg(empty_blocklist: str) -> None:
                 "collection_doi_label": "Publication 1",
                 "citation": "citation",
                 "title": "dataset #1",
-                "schema_version": "5.0.0",
+                "schema_version": "5.1.0",
                 "assets": [
                     {
                         "filesize": 123,
@@ -90,7 +90,7 @@ def test_load_manifest_from_cxg(empty_blocklist: str) -> None:
                 "collection_doi_label": "Publication 2",
                 "citation": "citation",
                 "title": "dataset #2",
-                "schema_version": "5.0.0",
+                "schema_version": "5.1.0",
                 "assets": [{"filesize": 456, "filetype": "H5AD", "url": "https://fake.url/dataset_id_2.h5ad"}],
                 "dataset_version_id": "dataset_id_2",
                 "cell_count": 11,
@@ -122,7 +122,7 @@ def test_load_manifest_from_cxg_errors_on_datasets_with_old_schema(
                 "collection_doi_label": "Publication 1",
                 "citation": "citation",
                 "title": "dataset #1",
-                "schema_version": "5.0.0",
+                "schema_version": "5.1.0",
                 "assets": [{"filesize": 123, "filetype": "H5AD", "url": "https://fake.url/dataset_id_1.h5ad"}],
                 "dataset_version_id": "dataset_id_1",
                 "cell_count": 10,
@@ -166,7 +166,7 @@ def test_load_manifest_from_cxg_excludes_datasets_with_no_assets(
                 "collection_doi": None,
                 "citation": "citation",
                 "title": "dataset #1",
-                "schema_version": "5.0.0",
+                "schema_version": "5.1.0",
                 "assets": [{"filesize": 123, "filetype": "H5AD", "url": "https://fake.url/dataset_id_1.h5ad"}],
                 "dataset_version_id": "dataset_id_1",
                 "cell_count": 10,
@@ -179,7 +179,7 @@ def test_load_manifest_from_cxg_excludes_datasets_with_no_assets(
                 "collection_doi": None,
                 "citation": "citation",
                 "title": "dataset #2",
-                "schema_version": "5.0.0",
+                "schema_version": "5.1.0",
                 "assets": [],
                 "dataset_version_id": "dataset_id_2",
                 "cell_count": 10,