Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .github/workflows/full-unittests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,6 @@ jobs:
- name: install python dependencies (including experimental)
run: |
python -m pip install -U pip setuptools setuptools_scm wheel
pip install --use-pep517 accumulation-tree # Geneformer dependency needs --use-pep517 for Cython
pip install -r ./api/python/cellxgene_census/scripts/requirements-dev.txt
pip install './api/python/cellxgene_census/[experimental]'

Expand Down
1 change: 0 additions & 1 deletion .github/workflows/py-dependency-check.yml
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,6 @@ jobs:
- name: Install dependencies (including experimental)
run: |
python -m pip install -U pip setuptools wheel
pip install --use-pep517 accumulation-tree # Geneformer dependency needs --use-pep517 for Cython
GIT_CLONE_PROTECTION_ACTIVE=false pip install -U -r ./api/python/cellxgene_census/scripts/requirements-dev.txt
pip install -U cellxgene-census[experimental]

Expand Down
1 change: 0 additions & 1 deletion .github/workflows/py-unittests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@ jobs:
- name: Install dependencies (including experimental)
run: |
python -m pip install -U pip setuptools wheel
pip install --use-pep517 accumulation-tree # Geneformer dependency needs --use-pep517 for Cython
GIT_CLONE_PROTECTION_ACTIVE=false pip install -r ./api/python/cellxgene_census/scripts/requirements-dev.txt
pip install -e './api/python/cellxgene_census/[experimental]'
- name: Report Dependency Versions
Expand Down
7 changes: 2 additions & 5 deletions api/python/cellxgene_census/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -44,12 +44,8 @@ experimental = [
"torchdata~=0.7,<0.10",
"scikit-learn>=1.2",
"scikit-misc>=0.3",
"datasets~=2.0",
"tdigest~=0.5",
"tiledb-vector-search~=0.11", # newest version compatible with tiledbsoma's version of TileDB Embedded
# Not expressible in pyproject.toml:
#"git+https://huggingface.co/ctheodoris/Geneformer",
# instead, experimental/ml/geneformer_tokenizer.py catches ImportError to ask user to install that.
"psutil",
]
spatial = [
"spatialdata>=0.2.5",
Expand Down Expand Up @@ -162,4 +158,5 @@ markers = [
"expensive: too expensive to run regularly or in CI",
"experimental: tests for the `experimental` package",
"lts_compat_check: check for compatibility with an LTS build",
"geneformer: Geneformer tests (not run in CI due to dependency issues)",
]
2 changes: 0 additions & 2 deletions api/python/cellxgene_census/scripts/requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,4 @@ requests-mock
twine
coverage
nbqa
git+https://huggingface.co/ctheodoris/Geneformer@ebc1e096; python_version>='3.10'
transformers[torch]<4.50 # Pinned Geneformer@ebc1e096 import AdamW from transformers, which was removed in 4.50
proxy.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import uuid
import warnings
from abc import ABC, abstractmethod
from collections.abc import Generator
from typing import Any

import scipy.sparse
from datasets import Dataset
from tiledbsoma import Experiment, ExperimentAxisQuery


Expand All @@ -15,6 +15,9 @@ class CellDatasetBuilder(ExperimentAxisQuery, ABC): # type: ignore
into a Dataset item, and may also override `__init__()` and context `__enter__()`
to perform any necessary preprocessing.

**DEPRECATION NOTICE:** this class is planned for removal from the cellxgene_census API and
will be migrated to git:cellxgene-census/tools/models/geneformer.

The base class inherits ExperimentAxisQuery, so typical usage would be:

```
Expand Down Expand Up @@ -55,12 +58,19 @@ def __init__(
super().__init__(experiment, measurement_name, **kwargs)
self.layer_name = layer_name
self.block_size = block_size
warnings.warn(
"cellxgene_census.experimental.ml.huggingface will be removed from API in an upcoming release and migrated to git:cellxgene-census/tools/models/geneformer",
FutureWarning,
stacklevel=2,
)

def build(self, from_generator_kwargs: dict[str, Any] | None = None) -> Dataset:
def build(self, from_generator_kwargs: dict[str, Any] | None = None) -> "Dataset": # type: ignore # noqa: F821
"""Build the dataset from query results.

- `from_generator_kwargs`: kwargs passed through to `Dataset.from_generator()`
"""
# late binding to simplify CI dependencies:
from datasets import Dataset

def gen() -> Generator[dict[str, Any], None, None]:
for Xblock, (block_cell_joinids, _) in (
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,10 @@ class GeneformerTokenizer(CellDatasetBuilder):
cell in CELLxGENE Census ExperimentAxisQuery results (human).

This class requires the Geneformer package to be installed separately with:
`pip install git+https://huggingface.co/ctheodoris/Geneformer@ebc1e096`
`pip install 'transformers[torch]<4.50' git+https://huggingface.co/ctheodoris/Geneformer@ebc1e096`

**DEPRECATION NOTICE:** this class is planned for removal from the cellxgene_census API and
will be migrated to git:cellxgene-census/tools/models/geneformer.

Example usage:

Expand Down Expand Up @@ -127,7 +130,7 @@ def _load_geneformer_data(
# pyproject.toml can't express Geneformer git+https dependency
raise ImportError(
"Please install Geneformer with: "
"pip install git+https://huggingface.co/ctheodoris/Geneformer@ebc1e096"
"pip install transformers[torch]<4.50 git+https://huggingface.co/ctheodoris/Geneformer@ebc1e096"
) from None
if not token_dictionary_file:
token_dictionary_file = geneformer.tokenizer.TOKEN_DICTIONARY_FILE
Expand Down
2 changes: 1 addition & 1 deletion api/python/cellxgene_census/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import pytest
import tiledbsoma as soma

TEST_MARKERS_SKIPPED_BY_DEFAULT = ["expensive", "experimental"]
TEST_MARKERS_SKIPPED_BY_DEFAULT = ["expensive", "experimental", "geneformer"]

# tiledb will complain if this isn't set and a process is spawned. May cause segfaults on the proxy test if this isn't set.
multiprocessing.set_start_method("spawn", force=True)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,3 @@
import sys

import datasets
import pytest
import tiledbsoma
from py.path import local as Path
Expand All @@ -10,15 +7,21 @@

CENSUS_VERSION_FOR_GENEFORMER_TESTS = "2023-12-15"

"""
NOTE: These tests have been disabled by default (by @pytest.mark.geneformer, which is listed in
api/python/cellxgene_census/tests/conftest.py:TEST_MARKERS_SKIPPED_BY_DEFAULT). This is because the
Geneformer package dependencies tend to cause more CI issues than usage justifies. To run them
locally as needed, use `pytest -m geneformer --geneformer` (not a typo).
"""


@pytest.mark.skip("Needs to be investigated.")
@pytest.mark.experimental
@pytest.mark.live_corpus
@pytest.mark.geneformer
def test_GeneformerTokenizer_correctness(tmpdir: Path) -> None:
"""
Test that GeneformerTokenizer produces the same token sequences as the original
geneformer.TranscriptomeTokenizer (modulo a small tolerance on Spearman rank correlation)
"""
import datasets
from geneformer import TranscriptomeTokenizer

from cellxgene_census.experimental.ml.huggingface import GeneformerTokenizer
Expand Down Expand Up @@ -86,9 +89,7 @@ def test_GeneformerTokenizer_correctness(tmpdir: Path) -> None:
assert identical / len(cell_ids) >= EXACT_THRESHOLD


@pytest.mark.skipif(sys.version_info < (3, 10), reason="requires python3.10 or higher")
@pytest.mark.experimental
@pytest.mark.live_corpus
@pytest.mark.geneformer
def test_GeneformerTokenizer_docstring_example() -> None:
from cellxgene_census.experimental.ml.huggingface import GeneformerTokenizer

Expand Down
Loading