laminlabs · Zethson · Jun 25, 2025 · Jun 25, 2025 · Jul 1, 2025 · Jul 1, 2025
diff --git a/lamindb/curators/__init__.py b/lamindb/curators/__init__.py
@@ -8,6 +8,7 @@
    MuDataCurator
    SpatialDataCurator
    TiledbsomaExperimentCurator
+   CxGCurator
 
 Modules.
 
@@ -19,23 +20,23 @@
 """
 
 from ._legacy import (  # backward compat
-    CellxGeneAnnDataCatManager,
     PertAnnDataCatManager,
 )
 from .core import (
     AnnDataCurator,
+    CxGCurator,
     DataFrameCurator,
     MuDataCurator,
     SpatialDataCurator,
     TiledbsomaExperimentCurator,
 )
 
 __all__ = [
-    "CellxGeneAnnDataCatManager",
     "PertAnnDataCatManager",
     "AnnDataCurator",
     "DataFrameCurator",
     "MuDataCurator",
     "SpatialDataCurator",
     "TiledbsomaExperimentCurator",
+    "CxGCurator",
 ]
diff --git a/lamindb/curators/_cellxgene_schemas/__init__.py b/lamindb/curators/_cellxgene_schemas/__init__.py
@@ -1,11 +1,16 @@
+from typing import Literal
+
 import pandas as pd
 from lamin_utils import logger
 from lamindb_setup.core.upath import UPath
 
 from lamindb.base.types import FieldAttr
-from lamindb.models import SQLRecord, ULabel
+from lamindb.models import Feature, Schema, SQLRecord, ULabel
 from lamindb.models._from_values import _format_values
 
+CELLxGENESchemaVersions = Literal["4.0.0", "5.0.0", "5.1.0", "5.2.0", "5.3.0"]
+
+# These names are reserved by the CELLxGENE Schema and are not allowed to be used as obs columns
 RESERVED_NAMES = {
     "ethnicity",
     "ethnicity_ontology_term_id",
@@ -35,7 +40,6 @@ def _get_cxg_categoricals() -> dict[str, FieldAttr]:
         "development_stage_ontology_term_id": bt.DevelopmentalStage.ontology_id,
         "disease": bt.Disease.name,
         "disease_ontology_term_id": bt.Disease.ontology_id,
-        # "donor_id": "str",  via pandera
         "self_reported_ethnicity": bt.Ethnicity.name,
         "self_reported_ethnicity_ontology_term_id": bt.Ethnicity.ontology_id,
         "sex": bt.Phenotype.name,
@@ -46,6 +50,7 @@ def _get_cxg_categoricals() -> dict[str, FieldAttr]:
         "tissue_type": ULabel.name,
         "organism": bt.Organism.name,
         "organism_ontology_term_id": bt.Organism.ontology_id,
+        "donor_id": str,
     }
 
 
@@ -110,10 +115,16 @@ def _fetch_bionty_source(entity: str, organism: str) -> SQLRecord | None:  # typ
                 name=row.source,
                 version=row.version,
             ).one_or_none()
+            # if the source was not found, we register it from bionty-assets
             if source is None:
-                logger.error(
-                    f"Could not find source: {entity}\n"
-                    "    → consider running `bionty.core.sync_public_sources()`"
+                source = getattr(bt, entity).add_source(
+                    bt.Source.using("laminlabs/bionty-assets")
+                    .get(
+                        entity=f"bionty.{entity}",
+                        version=row.version,
+                        organism=row.organism,
+                    )
+                    .save()
                 )
             return source
 
@@ -127,16 +138,23 @@ def _fetch_bionty_source(entity: str, organism: str) -> SQLRecord | None:  # typ
 
     key_to_source: dict[str, bt.Source] = {}
     for key, field in categoricals.items():
-        if field.field.model.__get_module_name__() == "bionty":
-            entity = field.field.model.__name__
-            key_to_source[key] = _fetch_bionty_source(entity, organism)
+        if hasattr(field, "field"):
+            if field.field.model.__get_module_name__() == "bionty":
+                entity = field.field.model.__name__
+                key_to_source[key] = _fetch_bionty_source(entity, organism)
+        else:
+            key_to_source[key] = field
     key_to_source["var_index"] = _fetch_bionty_source("Gene", organism)
 
     return key_to_source
 
 
 def _init_categoricals_additional_values() -> None:
-    """Add additional values from CellxGene schema to the DB."""
+    """Add additional values from CellxGene schema to the instance.
+
+    CELLxGENE schemas use specific (control) values that are not available
+    in the ontologies. Therefore, we save them to the instance.
+    """
     import bionty as bt
 
     # Note: if you add another control below, be mindful to change the if condition that
@@ -150,7 +168,7 @@ def _init_categoricals_additional_values() -> None:
         # "normal" in Disease
         normal = bt.Phenotype.from_source(
             ontology_id="PATO:0000461",
-            source=bt.Source.get(name="pato", version="2024-03-28"),
+            source=bt.Source.get(name="pato", currently_used=True),
         )
         bt.Disease(
             uid=normal.uid,
@@ -196,3 +214,51 @@ def _init_categoricals_additional_values() -> None:
             ULabel(
                 name=name, type=suspension_type, description="From CellxGene schema."
             ).save()
+
+
+def _get_cxg_schema(
+    schema_version: CELLxGENESchemaVersions, sources: dict[str, SQLRecord]
+) -> Schema:
+    """Generate a `~lamindb.Schema` for a specific CELLxGENE schema version.
+
+    Builds and saves a `var` schema, an `obs` schema, and a composite `AnnData`
+    schema that references both through its slots.
+
+    Args:
+        schema_version: The CELLxGENE schema version. It is only used to name
+            the generated schemas.
+        sources: Maps `.obs` column names to the source that constrains their
+            values. Must contain the key `"var_index"`, which constrains the
+            `var` index; every other key must be a key of
+            `_get_cxg_categoricals()`.
+
+    Returns:
+        The saved composite `AnnData` schema.
+
+    Raises:
+        KeyError: If `sources` lacks `"var_index"` or holds a key that is not
+            a CELLxGENE categorical.
+    """
+    import bionty as bt
+
+    categoricals = _get_cxg_categoricals()
+
+    # NOTE(review): this schema passes `dtype="DataFrame"` while the obs schema
+    # below passes `otype="DataFrame"` -- confirm that the difference is intended.
+    var_schema = Schema(
+        name=f"CELLxGENE var of version {schema_version}",
+        index=Feature(
+            name="var_index",
+            dtype=bt.Gene.ensembl_gene_id,
+            cat_filters={"source": sources["var_index"]},
+        ).save(),
+        itype=Feature,
+        dtype="DataFrame",
+        minimal_set=True,
+        coerce_dtype=True,
+    ).save()
+
+    obs_features = []
+    for field, source in sources.items():
+        # the var index is handled by `var_schema` above
+        if field == "var_index":
+            continue
+        feature_kwargs = {"name": field, "dtype": categoricals[field]}
+        # Entries such as `donor_id` map to a plain dtype (e.g. `str`) rather
+        # than to a source record; a source filter only makes sense for records.
+        if isinstance(source, SQLRecord):
+            feature_kwargs["cat_filters"] = {"source": source}
+        obs_features.append(Feature(**feature_kwargs).save())
+
+    obs_schema = Schema(
+        name=f"CELLxGENE obs of version {schema_version}",
+        features=obs_features,
+        otype="DataFrame",
+        minimal_set=True,
+        coerce_dtype=True,
+    ).save()
+
+    full_cxg_schema = Schema(
+        name=f"CELLxGENE AnnData schema of version {schema_version}",
+        otype="AnnData",
+        minimal_set=True,
+        coerce_dtype=True,
+        slots={"var": var_schema, "obs": obs_schema},
+    ).save()
+
+    return full_cxg_schema
diff --git a/lamindb/curators/_cellxgene_schemas/schema_versions.csv b/lamindb/curators/_cellxgene_schemas/schema_versions.csv
@@ -41,3 +41,14 @@ schema_version,entity,organism,source,version
 5.2.0,Tissue,all,uberon,2024-08-07
 5.2.0,Gene,human,ensembl,release-110
 5.2.0,Gene,mouse,ensembl,release-110
+5.3.0,CellType,all,cl,2025-02-13
+5.3.0,ExperimentalFactor,all,efo,3.75.0
+5.3.0,Ethnicity,human,hancestro,3.0
+5.3.0,DevelopmentalStage,human,hsapdv,2025-01-23
+5.3.0,DevelopmentalStage,mouse,mmusdv,2025-01-23
+5.3.0,Disease,all,mondo,2025-02-04
+5.3.0,Organism,all,ncbitaxon,2024-11-25
+5.3.0,Phenotype,all,pato,2025-02-01
+5.3.0,Tissue,all,uberon,2025-01-15
+5.3.0,Gene,human,ensembl,release-113
+5.3.0,Gene,mouse,ensembl,release-113
diff --git a/lamindb/curators/_legacy.py b/lamindb/curators/_legacy.py
@@ -1318,7 +1318,7 @@ def save_artifact(
 class CellxGeneAnnDataCatManager(AnnDataCatManager):
     """Categorical manager for `AnnData` respecting the CELLxGENE schema.
 
-    This will be superceded by a schema-based curation flow.
+    This will be superseded by a schema-based curation flow.
     """
 
     cxg_categoricals_defaults = {
@@ -1369,6 +1369,9 @@ def __init__(
         # Filter categoricals based on what's present in adata
         if categoricals is None:
             categoricals = self._get_cxg_categoricals()
+
+            # backwards compatibility
+            categoricals.pop("donor_id", None)
         categoricals = _restrict_obs_fields(adata.obs, categoricals)
 
         # Configure sources
@@ -1703,6 +1706,9 @@ def _configure_categoricals(self, adata: ad.AnnData):
             "pert_target": "unknown",
         }
 
+        # backwards compatibility
+        categoricals.pop("donor_id", None)
+
         return categoricals, categoricals_defaults
 
     def _configure_sources(self, adata: ad.AnnData):

diff --git a/lamindb/curators/core.py b/lamindb/curators/core.py
@@ -16,7 +16,7 @@
 import copy
 import re
 from collections.abc import Iterable
-from typing import TYPE_CHECKING, Any, Callable
+from typing import TYPE_CHECKING, Any, Callable, Literal
 
 import lamindb_setup as ln_setup
 import numpy as np
@@ -26,6 +26,11 @@
 from lamindb_setup.core._docs import doc_args
 
 from lamindb.base.types import FieldAttr  # noqa
+from lamindb.curators._cellxgene_schemas import (
+    CELLxGENESchemaVersions,
+    _get_cxg_categoricals,
+    _get_cxg_schema,
+)
 from lamindb.models import (
     Artifact,
     Feature,
@@ -463,9 +468,11 @@ def __init__(
         slot: str | None = None,
     ) -> None:
         super().__init__(dataset=dataset, schema=schema)
+
         categoricals = []
         features = []
         feature_ids: set[int] = set()
+
         if schema.flexible:
             features += Feature.filter(name__in=self._dataset.keys()).list()
             feature_ids = {feature.id for feature in features}
@@ -488,6 +495,7 @@ def __init__(
                 features.extend(schema_features)
         else:
             assert schema.itype is not None  # noqa: S101
+
         pandera_columns = {}
         if features or schema._index_feature_uid is not None:
             # populate features
@@ -540,18 +548,23 @@ def __init__(
                     "list[cat["
                 ):
                     # validate categoricals if the column is required or if the column is present
-                    if required or feature.name in self._dataset.keys():
+                    # but exclude the index feature from column categoricals
+                    if (required or feature.name in self._dataset.keys()) and (
+                        schema._index_feature_uid is None
+                        or feature.uid != schema._index_feature_uid
+                    ):
                         categoricals.append(feature)
-            if schema._index_feature_uid is not None:
-                # in almost no case, an index should have a pandas.CategoricalDtype in a DataFrame
-                # so, we're typing it as `str` here
+            # in almost no case, an index should have a pandas.CategoricalDtype in a DataFrame
+            # so, we're typing it as `str` here
+            if schema.index is not None:
                 index = pandera.Index(
                     schema.index.dtype
                     if not schema.index.dtype.startswith("cat")
                     else str
                 )
             else:
                 index = None
+
             self._pandera_schema = pandera.DataFrameSchema(
                 pandera_columns,
                 coerce=schema.coerce_dtype,
@@ -986,6 +999,81 @@ def __init__(
         self._columns_field = self._var_fields
 
 
+class CxGCurator(SlotsCurator):
+    """Curator for `AnnData` objects that should adhere to a specific CELLxGENE Schema version.
+
+    Args:
+        dataset: The AnnData-like object to validate & annotate.
+        schema_version: A CELLxGENE Schema version that defines the validation constraints.
+        organism: The organism of the Schema.
+        defaults: Default values that are set if columns or column values are missing.
+            They are applied to ``dataset.obs`` before the sources are resolved.
+        extra_sources: A dictionary mapping ``.obs.columns`` to Source records.
+            These extra sources are joined with the CELLxGENE fixed sources and
+            take precedence over them for identical keys.
+            Use this parameter when subclassing.
+
+    Attributes:
+        schema_version: The CELLxGENE Schema version passed at construction.
+        schema_reference: URL of the ``schema.md`` document of that version.
+
+    Raises:
+        InvalidArgument: If the dataset is not AnnData-like.
+
+    Example:
+
+        .. literalinclude:: scripts/curate_cxg.py
+            :language: python
+            :caption: curate_cxg.py
+    """
+
+    def __init__(
+        self,
+        dataset: AnnData | Artifact,
+        schema_version: CELLxGENESchemaVersions,
+        *,
+        organism: Literal["human", "mouse"] = "human",
+        defaults: dict[str, str] | None = None,
+        extra_sources: dict[str, SQLRecord] | None = None,
+    ) -> None:
+        from ._cellxgene_schemas import (
+            _add_defaults_to_obs,
+            _create_sources,
+            _init_categoricals_additional_values,
+            _restrict_obs_fields,
+        )
+
+        # NOTE(review): `dataset.obs` is read below before the base class is
+        # initialized, although the signature also allows an `Artifact` --
+        # confirm that `Artifact` inputs are supported on this code path.
+
+        # Add defaults first to ensure that we fetch valid sources
+        if defaults:
+            _add_defaults_to_obs(dataset.obs, defaults)
+
+        # Filter categoricals based on what's present in the dataset
+        present_categoricals = _restrict_obs_fields(
+            dataset.obs, _get_cxg_categoricals()
+        )
+
+        sources = _create_sources(present_categoricals, schema_version, organism)
+        # These sources are not a part of the cellxgene schema but rather passed through.
+        # This is useful when other Curators extend the CELLxGENE curator
+        if extra_sources:
+            sources = sources | extra_sources
+        # `_get_cxg_schema` returns an already saved schema, no extra save needed
+        cxg_schema = _get_cxg_schema(schema_version, sources=sources)
+        super().__init__(dataset=dataset, schema=cxg_schema)
+
+        if not data_is_scversedatastructure(self._dataset, "AnnData"):
+            raise InvalidArgument("dataset must be AnnData-like.")
+
+        self.schema_version = schema_version
+        self.schema_reference = f"https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/{schema_version}/schema.md"
+
+        # One DataFrameCurator per supported slot; the "var.T" slot validates
+        # the transposed `var` DataFrame.
+        self._slots = {
+            slot: DataFrameCurator(
+                (
+                    getattr(self._dataset, slot.removesuffix(".T")).T
+                    if slot == "var.T"
+                    else getattr(self._dataset, slot)
+                ),
+                slot_schema,
+                slot=slot,
+            )
+            for slot, slot_schema in cxg_schema.slots.items()
+            if slot in {"obs", "var", "var.T", "uns"}
+        }
+
+        _init_categoricals_additional_values()
+
+
 class CatVector:
     """Vector with categorical values."""