From 9024a67d62aba27d58a173220dcea6cb8ff5456d Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Wed, 23 Sep 2020 13:09:26 -0700 Subject: [PATCH 1/2] Simplify and restore old behavior for deep-copies Intended to fix https://github.com/pydata/xarray/issues/4449 Needs tests! --- xarray/core/indexing.py | 14 ++++--- xarray/core/variable.py | 83 +++++++++++++---------------------------- 2 files changed, 34 insertions(+), 63 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 9627f431cb6..a3c5971cc46 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -1,3 +1,4 @@ +import copy import enum import functools import operator @@ -1466,13 +1467,14 @@ def __repr__(self) -> str: type(self).__name__, self.array, self.dtype ) - def copy(self, deep: bool = True) -> "PandasIndexAdapter": - # Not the same as just writing `self.array.copy(deep=deep)`, as - # shallow copies of the underlying numpy.ndarrays become deep ones - # upon pickling + def __copy__(self): + # We don't use `copy.copy(self.array)`, as shallow copies of the + # underlying numpy.ndarrays become deep ones upon pickling # >>> len(pickle.dumps((self.array, self.array))) # 4000281 # >>> len(pickle.dumps((self.array, self.array.copy(deep=False)))) # 8000341 - array = self.array.copy(deep=True) if deep else self.array - return PandasIndexAdapter(array, self._dtype) + return PandasIndexAdapter(self.array, self._dtype) + + def __deepcopy__(self, memo): + return PandasIndexAdapter(copy.deepcopy(self.array, memo), self._dtype) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index c55e61cb816..ca4c04cab5c 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -927,50 +927,55 @@ def copy(self, deep=True, data=None): pandas.DataFrame.copy """ if data is None: - data = self._data - - if isinstance(data, indexing.MemoryCachedArray): - # don't share caching between copies - data = indexing.MemoryCachedArray(data.array) - - if deep: - data = copy.deepcopy(data) - + return copy.deepcopy(self) if deep else copy.copy(self) else: data = as_compatible_data(data) if self.shape != data.shape: raise ValueError( - "Data shape {} must match shape of object {}".format( - data.shape, self.shape - ) + f"Data shape {data.shape} must match shape of object" + f"{self.shape}" ) + return self._replace(data=data) + def __copy__(self): + data = self._data + if isinstance(data, indexing.MemoryCachedArray): + # don't share mutable caches between copies + # TODO: Can we remove this special case? It seems unnecessary and + # inconsistent with how shallow copies work with NumPy array data. + data = indexing.MemoryCachedArray(data.array) # note: # dims is already an immutable tuple # attributes and encoding will be copied when the new Array is created return self._replace(data=data) + def __deepcopy__(self, memo): + data = self._data + if is_duck_array(data) or isinstance(data, PandasIndexAdapter): + data = copy.deepcopy(data, memo) + else: + # TODO: remove this legacy code path? It exists to ensure that + # xarray's lazy backend array objects get loaded into memory by a + # copy, but that purpose is better served by load() or compute(). + # See https://github.com/pydata/xarray/issues/4449 for discussion. + data = np.asarray(data) + copied = self._replace(data=data) + memo[id(self)] = copied + return copied + def _replace( self, dims=_default, data=_default, attrs=_default, encoding=_default ) -> "Variable": if dims is _default: dims = copy.copy(self._dims) if data is _default: - data = copy.copy(self.data) + data = self.data if attrs is _default: attrs = copy.copy(self._attrs) if encoding is _default: encoding = copy.copy(self._encoding) return type(self)(dims, data, attrs, encoding, fastpath=True) - def __copy__(self): - return self.copy(deep=False) - - def __deepcopy__(self, memo=None): - # memo does nothing but is required for compatibility with - # copy.deepcopy - return self.copy(deep=True) - # mutable objects should not be hashable # https://github.com/python/mypy/issues/4266 __hash__ = None # type: ignore @@ -2432,42 +2437,6 @@ def concat(cls, variables, dim="concat_dim", positions=None, shortcut=False): return cls(first_var.dims, data, attrs) - def copy(self, deep=True, data=None): - """Returns a copy of this object. - - `deep` is ignored since data is stored in the form of - pandas.Index, which is already immutable. Dimensions, attributes - and encodings are always copied. - - Use `data` to create a new object with the same structure as - original but entirely new data. - - Parameters - ---------- - deep : bool, optional - Deep is ignored when data is given. Whether the data array is - loaded into memory and copied onto the new object. Default is True. - data : array_like, optional - Data to use in the new object. Must have same shape as original. - - Returns - ------- - object : Variable - New object with dimensions, attributes, encodings, and optionally - data copied from original. - """ - if data is None: - data = self._data.copy(deep=deep) - else: - data = as_compatible_data(data) - if self.shape != data.shape: - raise ValueError( - "Data shape {} must match shape of object {}".format( - data.shape, self.shape - ) - ) - return type(self)(self.dims, data, self._attrs, self._encoding, fastpath=True) - def equals(self, other, equiv=None): # if equiv is specified, super up if equiv is not None: From b503e8ef0bedca1765fe34490902651541120b51 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Wed, 23 Sep 2020 13:46:16 -0700 Subject: [PATCH 2/2] simplify copy methods on PandasIndexAdapter --- xarray/core/indexing.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index a3c5971cc46..767374f9f93 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -1467,14 +1467,6 @@ def __repr__(self) -> str: type(self).__name__, self.array, self.dtype ) - def __copy__(self): - # We don't use `copy.copy(self.array)`, as shallow copies of the - # underlying numpy.ndarrays become deep ones upon pickling - # >>> len(pickle.dumps((self.array, self.array))) - # 4000281 - # >>> len(pickle.dumps((self.array, self.array.copy(deep=False)))) - # 8000341 - return PandasIndexAdapter(self.array, self._dtype) - def __deepcopy__(self, memo): - return PandasIndexAdapter(copy.deepcopy(self.array, memo), self._dtype) + # pandas.Index is (mostly) immutable + return PandasIndexAdapter(self.array, self._dtype)