diff --git a/pyproject.toml b/pyproject.toml
index 8960a86..043af8d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -85,6 +85,7 @@ overrides.matrix.extras.features = [
 ]
 overrides.matrix.extras.dependencies = [
     { if = [ "full" ], value = "scipy-stubs" },
+    { if = [ "full" ], value = "scikit-learn" },
 ]
 
 [[tool.hatch.envs.hatch-test.matrix]]
diff --git a/tests/test_stats.py b/tests/test_stats.py
index 8f76dc2..3fb4565 100644
--- a/tests/test_stats.py
+++ b/tests/test_stats.py
@@ -223,6 +223,42 @@ def test_mean_var(
     np.testing.assert_array_almost_equal(var, var_expected)  # type: ignore[arg-type]
 
 
+@pytest.mark.skipif(not find_spec("sklearn"), reason="sklearn not installed")
+@pytest.mark.array_type(Flags.Sparse, skip=Flags.Matrix | Flags.Dask | Flags.Disk | Flags.Gpu)
+@pytest.mark.parametrize("axis", [0, 1])
+def test_mean_var_sparse_64(array_type: ArrayType[types.CSArray], axis: Literal[0, 1]) -> None:
+    """Check that `mean_var` agrees with sklearn `mean_variance_axis` on float64 input."""
+    from sklearn.utils.sparsefuncs import mean_variance_axis
+
+    mtx = array_type.random((10000, 1000), dtype=np.float64)
+
+    mean_fau, var_fau = stats.mean_var(mtx, axis=axis)
+    mean_skl, var_skl = mean_variance_axis(mtx, axis)
+
+    np.testing.assert_allclose(mean_fau, mean_skl, rtol=1.0e-5, atol=1.0e-8)
+    np.testing.assert_allclose(var_fau, var_skl, rtol=1.0e-5, atol=1.0e-8)
+
+
+@pytest.mark.skipif(not find_spec("sklearn"), reason="sklearn not installed")
+@pytest.mark.array_type(Flags.Sparse, skip=Flags.Matrix | Flags.Dask | Flags.Disk | Flags.Gpu)
+def test_mean_var_sparse_32(array_type: ArrayType[types.CSArray]) -> None:
+    """Check that our float32 results deviate less from float64 than those of sklearn."""
+    from sklearn.utils.sparsefuncs import mean_variance_axis
+
+    mtx64 = array_type.random((10000, 1000), dtype=np.float64)
+    mtx32 = mtx64.astype(np.float32)
+
+    fau, skl = {}, {}
+    for n_bit, mtx in [(32, mtx32), (64, mtx64)]:
+        fau[n_bit] = stats.mean_var(mtx, axis=0)
+        skl[n_bit] = mean_variance_axis(mtx, 0)
+
+    for idx, stat in enumerate(["mean", "var"]):
+        resid_fau = np.mean(np.abs(fau[64][idx] - fau[32][idx]))
+        resid_skl = np.mean(np.abs(skl[64][idx] - skl[32][idx]))
+        assert resid_fau < resid_skl, f"{stat}: {resid_fau} >= {resid_skl}"
+
+
 @pytest.mark.array_type(skip={Flags.Disk, *ATS_CUPY_SPARSE})
 @pytest.mark.parametrize(
     ("axis", "expected"),
diff --git a/typings/sklearn/utils/sparsefuncs.pyi b/typings/sklearn/utils/sparsefuncs.pyi
new file mode 100644
index 0000000..2a73b01
--- /dev/null
+++ b/typings/sklearn/utils/sparsefuncs.pyi
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: MPL-2.0
+from typing import Literal
+
+import numpy as np
+from numpy.typing import NDArray
+from scipy.sparse import csc_array, csc_matrix, csr_array, csr_matrix
+
+def mean_variance_axis(
+    X: csc_array | csc_matrix | csr_array | csr_matrix,  # noqa: N803
+    axis: Literal[0, 1],
+    weights: NDArray[np.floating] | None = None,
+    return_sum_weights: bool = False,
+) -> tuple[NDArray[np.float64], NDArray[np.float64]]: ...