From 8826c144ab4735d9dc8f6df02e016fa06d38a922 Mon Sep 17 00:00:00 2001 From: Mark Harfouche Date: Sun, 4 Dec 2022 22:29:02 -0500 Subject: [PATCH 1/6] Avoid instantiating entire dataset by getting the nbytes in an array Using `.data` accidentally tries to load the whole lazy arrays into memory. Sad. --- xarray/core/variable.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index bb988392f50..8cf46c4b88a 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -403,8 +403,8 @@ def nbytes(self) -> int: If the underlying data array does not include ``nbytes``, estimates the bytes consumed based on the ``size`` and ``dtype``. """ - if hasattr(self.data, "nbytes"): - return self.data.nbytes + if hasattr(self._data, "nbytes"): + return self._data.nbytes else: return self.size * self.dtype.itemsize From f934b90746a09e41d3929bc4691209b98e430b54 Mon Sep 17 00:00:00 2001 From: Mark Harfouche Date: Sun, 4 Dec 2022 22:33:31 -0500 Subject: [PATCH 2/6] DOC: Add release note for bugfix. --- doc/whats-new.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index c28812b100e..5f01a4371ac 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -35,6 +35,9 @@ Deprecations Bug fixes ~~~~~~~~~ +- Accessing the property ``.nbytes`` of a DataArray, or Variable no longer + accidentally triggers loading the variable into memory. + Documentation ~~~~~~~~~~~~~ From a04ba205f624d54abd39db34b51b501e91fc2471 Mon Sep 17 00:00:00 2001 From: Mark Harfouche Date: Sun, 4 Dec 2022 22:53:50 -0500 Subject: [PATCH 3/6] Add test to ensure that number of bytes of sparse array is correctly reported --- xarray/tests/test_dataarray.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 8184fe1955c..eb728bd6f1c 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -3259,6 +3259,21 @@ def test_from_series_sparse(self) -> None: actual_sparse.data = actual_sparse.data.todense() assert_identical(actual_sparse, actual_dense) + @requires_sparse + def test_sparse_nbytes(self) -> None: + # https://github.com/pydata/xarray/issues/4842#issue-793245791 + df = pd.DataFrame() + df["x"] = np.repeat(np.arange(10), 10) + df["y"] = np.repeat(np.arange(10), 10) + df["time"] = np.tile(pd.date_range("2000-01-01", "2000-03-10", freq="W"), 10) + df["rate"] = 10.0 + df = df.set_index(["time", "y", "x"]) + + sparse_ds = xr.Dataset.from_dataframe(df, sparse=True) + rate = sparse_ds["rate"] + assert rate.nbytes < 8000 + assert rate.size * rate.dtype.itemsize == 8000 + @requires_sparse def test_from_multiindex_series_sparse(self) -> None: # regression test for GH4019 From 362eeccd52c5f4c546a45566cd2fc10715910580 Mon Sep 17 00:00:00 2001 From: Mark Harfouche Date: Tue, 6 Dec 2022 21:31:52 -0500 Subject: [PATCH 4/6] Add suggested test using InaccessibleArray --- xarray/tests/test_dataarray.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index eb728bd6f1c..768d8810f31 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -38,6 +38,7 @@ assert_identical, assert_no_warnings, has_dask, + InaccessibleArray, raise_if_dask_computes, requires_bottleneck, requires_cupy, @@ -3292,6 +3293,21 @@ def test_from_multiindex_series_sparse(self) -> None: np.testing.assert_equal(actual_coords, expected_coords) + def test_nbytes_does_not_load_data(self) -> None: + array = InaccessibleArray(np.zeros((3, 3), dtype='uint8')) + da = xr.DataArray(array, dims=["x", "y"]) + + # If xarray tries to instantiate the InaccessibleArray to compute + # nbytes, the following will raise an error. + # However, it should still be able to accurately give us information + # about the number of bytes from the metadata + assert da.nbytes == 9 + # Here we confirm that this does not depend on array having the + # nbytes property, since it isn't really required by the array + # interface. nbytes is more a property of arrays that have been + # cast to numpy arrays. + assert not hasattr(array, 'nbytes') + def test_to_and_from_empty_series(self) -> None: # GH697 expected = pd.Series([], dtype=np.float64) From 58d3834394294630eeadef3ec83e208ac35fe3f4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 7 Dec 2022 02:33:26 +0000 Subject: [PATCH 5/6] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/tests/test_dataarray.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 768d8810f31..7ae21f7965f 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -30,6 +30,7 @@ from xarray.core.types import QueryEngineOptions, QueryParserOptions from xarray.core.utils import is_scalar from xarray.tests import ( + InaccessibleArray, ReturnItem, assert_allclose, assert_array_equal, @@ -38,7 +39,6 @@ assert_identical, assert_no_warnings, has_dask, - InaccessibleArray, raise_if_dask_computes, requires_bottleneck, requires_cupy, @@ -3294,7 +3294,7 @@ def test_from_multiindex_series_sparse(self) -> None: np.testing.assert_equal(actual_coords, expected_coords) def test_nbytes_does_not_load_data(self) -> None: - array = InaccessibleArray(np.zeros((3, 3), dtype='uint8')) + array = InaccessibleArray(np.zeros((3, 3), dtype="uint8")) da = xr.DataArray(array, dims=["x", "y"]) # If xarray tries to instantiate the InaccessibleArray to compute @@ -3306,7 +3306,7 @@ def test_nbytes_does_not_load_data(self) -> None: # nbytes property, since it isn't really required by the array # interface. nbytes is more a property of arrays that have been # cast to numpy arrays. - assert not hasattr(array, 'nbytes') + assert not hasattr(array, "nbytes") def test_to_and_from_empty_series(self) -> None: # GH697 From 171c9325c3eaed5311efaec955ddc7afee9060a9 Mon Sep 17 00:00:00 2001 From: Mark Harfouche Date: Wed, 7 Dec 2022 07:51:11 -0500 Subject: [PATCH 6/6] Remove duplicate test --- xarray/tests/test_dataarray.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 7ae21f7965f..46987ab6c60 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -3260,21 +3260,6 @@ def test_from_series_sparse(self) -> None: actual_sparse.data = actual_sparse.data.todense() assert_identical(actual_sparse, actual_dense) - @requires_sparse - def test_sparse_nbytes(self) -> None: - # https://github.com/pydata/xarray/issues/4842#issue-793245791 - df = pd.DataFrame() - df["x"] = np.repeat(np.arange(10), 10) - df["y"] = np.repeat(np.arange(10), 10) - df["time"] = np.tile(pd.date_range("2000-01-01", "2000-03-10", freq="W"), 10) - df["rate"] = 10.0 - df = df.set_index(["time", "y", "x"]) - - sparse_ds = xr.Dataset.from_dataframe(df, sparse=True) - rate = sparse_ds["rate"] - assert rate.nbytes < 8000 - assert rate.size * rate.dtype.itemsize == 8000 - @requires_sparse def test_from_multiindex_series_sparse(self) -> None: # regression test for GH4019