diff --git a/doc/whats-new.rst b/doc/whats-new.rst index cad2767018c..a393a114579 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -35,6 +35,9 @@ Deprecations Bug fixes ~~~~~~~~~ +- Fix binning when ``labels`` is specified. (:issue:`7766`). + By `Deepak Cherian <https://github.com/dcherian>`_. + Documentation ~~~~~~~~~~~~~ diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py index b786fa60af9..a6d63b7e95f 100644 --- a/xarray/core/groupby.py +++ b/xarray/core/groupby.py @@ -338,7 +338,8 @@ def _factorize_bins( if (codes == -1).all(): raise ValueError(f"None of the data falls within bins with edges {bins!r}") full_index = binned.categories - unique_values = np.sort(binned.unique().dropna()) + uniques = np.sort(pd.unique(codes)) + unique_values = full_index[uniques[uniques != -1]] group_indices = [g for g in _codes_to_groups(codes, len(full_index)) if g] if len(group_indices) == 0: diff --git a/xarray/tests/test_groupby.py b/xarray/tests/test_groupby.py index 13e26954950..4488bda3eca 100644 --- a/xarray/tests/test_groupby.py +++ b/xarray/tests/test_groupby.py @@ -1373,7 +1373,22 @@ def test_groupby_multidim_map(self): @pytest.mark.parametrize("use_flox", [True, False]) @pytest.mark.parametrize("coords", [np.arange(4), np.arange(4)[::-1], [2, 0, 3, 1]]) - def test_groupby_bins(self, coords: np.typing.ArrayLike, use_flox: bool) -> None: + @pytest.mark.parametrize( + "cut_kwargs", + ( + {"labels": None, "include_lowest": True}, + {"labels": None, "include_lowest": False}, + {"labels": ["a", "b"]}, + {"labels": [1.2, 3.5]}, + {"labels": ["b", "a"]}, + ), + ) + def test_groupby_bins( + self, + coords: np.typing.ArrayLike, + use_flox: bool, + cut_kwargs: dict, + ) -> None: array = DataArray( np.arange(4), dims="dim_0", coords={"dim_0": coords}, name="a" ) @@ -1384,11 +1399,10 @@ def test_groupby_bins(self, coords: np.typing.ArrayLike, use_flox: bool) -> None bins = [0, 1.5, 5] df = array.to_dataframe() - df["dim_0_bins"] = pd.cut(array["dim_0"], bins) + df["dim_0_bins"] = 
pd.cut(array["dim_0"], bins, **cut_kwargs) expected_df = df.groupby("dim_0_bins").sum() # TODO: can't convert df with IntervalIndex to Xarray - expected = ( expected_df.reset_index(drop=True) .to_xarray() @@ -1397,25 +1411,55 @@ def test_groupby_bins(self, coords: np.typing.ArrayLike, use_flox: bool) -> None ) with xr.set_options(use_flox=use_flox): - actual = array.groupby_bins("dim_0", bins=bins).sum() + actual = array.groupby_bins("dim_0", bins=bins, **cut_kwargs).sum() assert_identical(expected, actual) - actual = array.groupby_bins("dim_0", bins=bins, labels=[1.2, 3.5]).sum() - assert_identical(expected.assign_coords(dim_0_bins=[1.2, 3.5]), actual) - - actual = array.groupby_bins("dim_0", bins=bins).map(lambda x: x.sum()) + actual = array.groupby_bins("dim_0", bins=bins, **cut_kwargs).map( + lambda x: x.sum() + ) assert_identical(expected, actual) # make sure original array dims are unchanged assert len(array.dim_0) == 4 - da = xr.DataArray(np.ones((2, 3, 4))) - bins = [-1, 0, 1, 2] - with xr.set_options(use_flox=False): - actual = da.groupby_bins("dim_0", bins).mean(...) - with xr.set_options(use_flox=True): - expected = da.groupby_bins("dim_0", bins).mean(...) - assert_allclose(actual, expected) + def test_groupby_bins_ellipsis(self): + da = xr.DataArray(np.ones((2, 3, 4))) + bins = [-1, 0, 1, 2] + with xr.set_options(use_flox=False): + actual = da.groupby_bins("dim_0", bins).mean(...) + with xr.set_options(use_flox=True): + expected = da.groupby_bins("dim_0", bins).mean(...) 
+ assert_allclose(actual, expected) + + @pytest.mark.parametrize("use_flox", [True, False]) + def test_groupby_bins_gives_correct_subset(self, use_flox: bool) -> None: + # GH7766 + rng = np.random.default_rng(42) + coords = rng.normal(5, 5, 1000) + bins = np.logspace(-4, 1, 10) + labels = [ + "one", + "two", + "three", + "four", + "five", + "six", + "seven", + "eight", + "nine", + ] + # xArray + # Make a mock dataarray + darr = xr.DataArray(coords, coords=[coords], dims=["coords"]) + expected = xr.DataArray( + [np.nan, np.nan, 1, 1, 1, 8, 31, 104, 542], + dims="coords_bins", + coords={"coords_bins": labels}, + ) + gb = darr.groupby_bins("coords", bins, labels=labels) + with xr.set_options(use_flox=use_flox): + actual = gb.count() + assert_identical(actual, expected) def test_groupby_bins_empty(self): array = DataArray(np.arange(4), [("x", range(4))])