From 8370e3968478d50b538b12a0c9b665330456cb2f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 5 Sep 2019 16:34:48 -0500 Subject: [PATCH 1/7] REF/ENH: Refactor NDFrame finalization In preperation for https://github.com/pandas-dev/pandas/issues/27108 (disallowing duplicates), we need to enhance our metadata propagation. *We need a way for a particiular attribute to deterimine how it's propagated for a particular method*. Our current method of metadata propagation lacked two features 1. It only copies an attribute from a source NDFrame to a new NDFrame. There is no way to propagate metadata from a collection of NDFrames (say from `pd.concat`) to a new NDFrame. 2. It only and always copies the attribute. This is not always appropriate when dealing with a collection of input NDFrames, as the source attributes may differ. The resolution of conflicts will differ by attribute (for `Series.name` we might throw away the name. For `Series.allow_duplicates`, any Series disallowing duplicates should mean the output disallows duplicates) --- pandas/core/_meta.py | 146 ++++++++++++++++++++++++++ pandas/core/generic.py | 7 +- pandas/tests/generic/test_metadata.py | 49 +++++++++ 3 files changed, 198 insertions(+), 4 deletions(-) create mode 100644 pandas/core/_meta.py create mode 100644 pandas/tests/generic/test_metadata.py diff --git a/pandas/core/_meta.py b/pandas/core/_meta.py new file mode 100644 index 0000000000000..07294a2e7d89f --- /dev/null +++ b/pandas/core/_meta.py @@ -0,0 +1,146 @@ +""" +Metadata propagation through pandas operations. + +This module contains the infrastructure for propagating ``NDFrame._metadata`` +through operations. We perform an operation (say :meth:`pandas.Series.copy`) that +returns an ``NDFrame`` and would like to propagate the metadata (say ``Series.name``) +from ``self`` to the new ``NDFrame``. + +.. note:: + + Currently, pandas doesn't provide a clean, documented API on + + * which methods call finalize + * the types passed to finalize for each method + + This is a known limitation we would like to address in the future. +""" +from collections import defaultdict +from functools import wraps +from typing import TYPE_CHECKING, Any, Callable, Union + +from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries + +if TYPE_CHECKING: + from pandas.core.generic import NDFrame + +dispatch = defaultdict(dict) +dispatch_method_type = Union[Callable[..., "NDFrame"], str] + + +def key_of(method): + if isinstance(method, str): + # TODO: figure out if this is OK. May be necessary when we have + # things like pd.merge and DataFrame.merge that hit the same finalize. + return method + elif method: + return method.__module__, method.__name__ + + +class PandasMetadata: + """ + Dispatch metadata finalization for pandas metadata. + + Users should instantiate a single `PandasMetadata` instance + for their piece of metadata and register finalizers for various + pandas methods using :meth:`PandsaMetadata.register`. + + Parameters + ---------- + name : str + The name of the attribute being finalized. + + Examples + -------- + >>> maxmeta = PandasMetadata("attr") + + Register a finalizer for a given pandas method: + + >>> @maxmeta.register(pd.concat) + ... def _(new, concatenator): + ... new.attr = max(x.attr_meta for x in concatenator.objs) + + >>> pd.DataFrame._metadata = ['attr'] + >>> x = pd.DataFrame({"x"}); x.attr = 1 + >>> y = pd.DataFrame({"y"}); y.attr = 2 + >>> pd.concat([x, y]).attr + 2 + """ + + def __init__(self, name: str): + self.name = name + + def register(self, pandas_method: dispatch_method_type): + """ + A decorator to register a finalizer for a specific pandas method. + + Parameters + ---------- + pandas_method : callable or str + A pandas method, like :meth:`pandas.concat`, that this finalizer + should be used for. The function being decorated will be called + with the relevant arguments (typically the output and the source NDFrame). + When `NDFrame.__finalize__` is called as a result of `pandas_method`, + the registered finalizer will be called. + """ + + def decorate(func): + # TODO: warn of collisions? + dispatch[key_of(pandas_method)][self.name] = func + + @wraps(func) + def wrapper(*args, **kwargs): + return func(*args, **kwargs) + + return wrapper + + return decorate + + +def default_finalizer(new: "NDFrame", other: Any, *, name: str): + """ + The default finalizer when this method, attribute hasn't been overridden. + + This copies the ``_metadata`` attribute from ``other`` to ``self``, modifying + ``self`` inplace. + + Parameters + ---------- + new : NDFrame + The newly created NDFrame being finalized. + other : NDFrame + The source NDFrame attributes will be extracted from. + """ + object.__setattr__(new, name, getattr(other, name, None)) + + +# ---------------------------------------------------------------------------- +# Pandas Internals. + + +def ndframe_finalize(new: "NDFrame", other: Any, method: dispatch_method_type): + """ + Finalize a new NDFrame. + + The finalizer is looked up from finalizers registered with PandasMetadata. + `new` is modified inplace, and nothing is returned. + + Parameters + ---------- + new : NDFrame + other : NDFrame + Or a list of them? TBD + method : callable or str + """ + # To avoid one isinstance per _metadata name, we check up front. + # Most of the time `other` is an ndframe, but in some cases (e.g. concat) + # it's `_Concatenator` object + other_is_ndframe = isinstance(other, (ABCSeries, ABCDataFrame)) + + for name in new._metadata: + finalizer = dispatch.get(key_of(method), {}).get(name) + + if finalizer: + finalizer(new, other) + elif other_is_ndframe: + default_finalizer(new, other, name=name) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index b427b1f0ac858..43c198d5707fe 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -62,6 +62,7 @@ import pandas as pd from pandas._typing import Dtype, FilePathOrBuffer from pandas.core import missing, nanops +from pandas.core._meta import ndframe_finalize import pandas.core.algorithms as algos from pandas.core.base import PandasObject, SelectionMixin import pandas.core.common as com @@ -5175,9 +5176,7 @@ def __finalize__(self, other, method=None, **kwargs): types of propagation actions based on this """ - if isinstance(other, NDFrame): - for name in self._metadata: - object.__setattr__(self, name, getattr(other, name, None)) + ndframe_finalize(self, other, method) return self def __getattr__(self, name): @@ -6016,7 +6015,7 @@ def copy(self, deep=True): dtype: object """ data = self._data.copy(deep=deep) - return self._constructor(data).__finalize__(self) + return self._constructor(data).__finalize__(self, NDFrame.copy) def __copy__(self, deep=True): return self.copy(deep=deep) diff --git a/pandas/tests/generic/test_metadata.py b/pandas/tests/generic/test_metadata.py new file mode 100644 index 0000000000000..8912d7214e3fc --- /dev/null +++ b/pandas/tests/generic/test_metadata.py @@ -0,0 +1,49 @@ +import pytest + +import pandas as pd +from pandas.core._meta import PandasMetadata + +mymeta = PandasMetadata("attr") + + +@mymeta.register(pd.core.generic.NDFrame.copy) +def _(new, other): + new.attr = other.attr + 1 + + +@mymeta.register("concat") +def _(new, other): + assert isinstance(other, pd.core.reshape.concat._Concatenator) + new.attr = sum(x.attr for x in other.objs) + + +@pytest.fixture +def custom_meta(monkeypatch): + original_metadata = [] + + for cls in [pd.Series, pd.DataFrame]: + original_metadata.append(cls._metadata) + custom_metadata = cls._metadata.copy() + custom_metadata.append("attr") + + monkeypatch.setattr(cls, "_metadata", custom_metadata) + + +def test_custom_finalizer(custom_meta): + + df = pd.DataFrame({"A": [1, 2]}) + df.attr = 0 + + result = df.copy() + assert result.attr == 1 + + +def test_concat(custom_meta): + df1 = pd.DataFrame({"A": [1, 2]}) + df1.attr = 2 + + df2 = pd.DataFrame({"A": [1, 2]}) + df2.attr = 3 + + result = pd.concat([df1, df2]) + assert result.attr == 5 From d7bb99c022408b5c23cd780b82c677f566710725 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 9 Sep 2019 08:55:29 -0500 Subject: [PATCH 2/7] refactor --- pandas/core/_meta.py | 159 ++++++++++---------------- pandas/core/generic.py | 17 ++- pandas/tests/generic/test_metadata.py | 15 ++- 3 files changed, 81 insertions(+), 110 deletions(-) diff --git a/pandas/core/_meta.py b/pandas/core/_meta.py index 07294a2e7d89f..55907af6c7083 100644 --- a/pandas/core/_meta.py +++ b/pandas/core/_meta.py @@ -5,46 +5,43 @@ through operations. We perform an operation (say :meth:`pandas.Series.copy`) that returns an ``NDFrame`` and would like to propagate the metadata (say ``Series.name``) from ``self`` to the new ``NDFrame``. +""" +from typing import Any, Dict -.. note:: +from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries - Currently, pandas doesn't provide a clean, documented API on +from pandas._typing import FrameOrSeries - * which methods call finalize - * the types passed to finalize for each method - This is a known limitation we would like to address in the future. -""" -from collections import defaultdict -from functools import wraps -from typing import TYPE_CHECKING, Any, Callable, Union +class PandasMetadataType(type): + """ + Metaclass controlling creation of metadata finalizers. -from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries + This ensures we have one finalizer instance per name, and + provides a place to look up finalizer per name. + """ -if TYPE_CHECKING: - from pandas.core.generic import NDFrame + # TODO(Py35): Replace metaclass with __subclass_init__ -dispatch = defaultdict(dict) -dispatch_method_type = Union[Callable[..., "NDFrame"], str] + _instances = {} # type: Dict[str, "PandasMetadata"] + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) -def key_of(method): - if isinstance(method, str): - # TODO: figure out if this is OK. May be necessary when we have - # things like pd.merge and DataFrame.merge that hit the same finalize. - return method - elif method: - return method.__module__, method.__name__ + def __call__(self, *args: Any, **kwds: Any) -> Any: + name = args[0] + if name in self._instances: + return self._instances[name] + else: + new = super().__call__(name, *args, **kwds) + self._instances[name] = new + return new -class PandasMetadata: +class PandasMetadata(metaclass=PandasMetadataType): """ Dispatch metadata finalization for pandas metadata. - Users should instantiate a single `PandasMetadata` instance - for their piece of metadata and register finalizers for various - pandas methods using :meth:`PandsaMetadata.register`. - Parameters ---------- name : str @@ -52,95 +49,61 @@ class PandasMetadata: Examples -------- - >>> maxmeta = PandasMetadata("attr") + If you want the default resolution (copy from a source NDFrame + to a new NDFrame), you can just create an instance - Register a finalizer for a given pandas method: + >>> mymeta = PandasMetadata("mymeta") - >>> @maxmeta.register(pd.concat) - ... def _(new, concatenator): - ... new.attr = max(x.attr_meta for x in concatenator.objs) + If you need custom metadata resolution, you'll need to subclass. - >>> pd.DataFrame._metadata = ['attr'] - >>> x = pd.DataFrame({"x"}); x.attr = 1 - >>> y = pd.DataFrame({"y"}); y.attr = 2 - >>> pd.concat([x, y]).attr - 2 + >>> class IncrementMetadata: + ... def finalize_default(self, new, other): + ... setattr(new, self.attr, getattr(other, self.name, -1) + 1) + + >>> increment_metadata = IncrementMetadata("attr") """ def __init__(self, name: str): self.name = name - def register(self, pandas_method: dispatch_method_type): + def __repr__(self): + return "PandasMetadata(name='{}')".format(self.name) + + def finalize_default(self, new: "FrameOrSeries", other: Any): + # TODO: tighten the type on Any. Union[NDFrame, _Concatenator, ...] """ - A decorator to register a finalizer for a specific pandas method. + The default finalizer when this method, attribute hasn't been overridden. + + This copies the ``_metadata`` attribute from ``other`` to ``self``, modifying + ``self`` inplace. Parameters ---------- - pandas_method : callable or str - A pandas method, like :meth:`pandas.concat`, that this finalizer - should be used for. The function being decorated will be called - with the relevant arguments (typically the output and the source NDFrame). - When `NDFrame.__finalize__` is called as a result of `pandas_method`, - the registered finalizer will be called. + new : NDFrame + The newly created NDFrame being finalized. + other : Any + The source object attributes will be extracted from. """ + # TODO: check perf on this isinstance. + if isinstance(other, (ABCSeries, ABCDataFrame)): + object.__setattr__(new, self.name, getattr(other, self.name, None)) - def decorate(func): - # TODO: warn of collisions? - dispatch[key_of(pandas_method)][self.name] = func + def finalize_copy(self, new: "FrameOrSeries", other: "FrameOrSeries"): + return self.finalize_default(new, other) - @wraps(func) - def wrapper(*args, **kwargs): - return func(*args, **kwargs) + def finalize_concat(self, new: "FrameOrSeries", other: Any): + # TODO: type other as _Concatenator + self.finalize_default(new, other) - return wrapper - return decorate +class NameMetadata(PandasMetadata): + """Finalization for Series.name""" -def default_finalizer(new: "NDFrame", other: Any, *, name: str): - """ - The default finalizer when this method, attribute hasn't been overridden. - - This copies the ``_metadata`` attribute from ``other`` to ``self``, modifying - ``self`` inplace. +# TODO: having to create this here feels weird. +name_metadata = NameMetadata("name") - Parameters - ---------- - new : NDFrame - The newly created NDFrame being finalized. - other : NDFrame - The source NDFrame attributes will be extracted from. - """ - object.__setattr__(new, name, getattr(other, name, None)) - - -# ---------------------------------------------------------------------------- -# Pandas Internals. - - -def ndframe_finalize(new: "NDFrame", other: Any, method: dispatch_method_type): - """ - Finalize a new NDFrame. - - The finalizer is looked up from finalizers registered with PandasMetadata. - `new` is modified inplace, and nothing is returned. - - Parameters - ---------- - new : NDFrame - other : NDFrame - Or a list of them? TBD - method : callable or str - """ - # To avoid one isinstance per _metadata name, we check up front. - # Most of the time `other` is an ndframe, but in some cases (e.g. concat) - # it's `_Concatenator` object - other_is_ndframe = isinstance(other, (ABCSeries, ABCDataFrame)) - - for name in new._metadata: - finalizer = dispatch.get(key_of(method), {}).get(name) - - if finalizer: - finalizer(new, other) - elif other_is_ndframe: - default_finalizer(new, other, name=name) +# For backwards compat. Do we care about this? +# We can pretty easily deprecate, require subclasses to make their +# own instance. +default_finalizer = PandasMetadata("pandas") diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 43c198d5707fe..740ca9b2362a5 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -62,7 +62,7 @@ import pandas as pd from pandas._typing import Dtype, FilePathOrBuffer from pandas.core import missing, nanops -from pandas.core._meta import ndframe_finalize +from pandas.core._meta import PandasMetadataType, default_finalizer import pandas.core.algorithms as algos from pandas.core.base import PandasObject, SelectionMixin import pandas.core.common as com @@ -5164,7 +5164,7 @@ def pipe(self, func, *args, **kwargs): # ---------------------------------------------------------------------- # Attribute access - def __finalize__(self, other, method=None, **kwargs): + def __finalize__(self, other, method=None): """ Propagate metadata from other to self. @@ -5176,7 +5176,16 @@ def __finalize__(self, other, method=None, **kwargs): types of propagation actions based on this """ - ndframe_finalize(self, other, method) + for name in self._metadata: + finalizer = PandasMetadataType._instances.get(name, default_finalizer) + # TODO: measure perf of this vs. a dict. + if method == "copy": + finalizer.finalize_copy(self, other) + elif method == "concat": + finalizer.finalize_concat(self, other) + else: + finalizer.finalize_default(self, other) + return self def __getattr__(self, name): @@ -6015,7 +6024,7 @@ def copy(self, deep=True): dtype: object """ data = self._data.copy(deep=deep) - return self._constructor(data).__finalize__(self, NDFrame.copy) + return self._constructor(data).__finalize__(self, method="copy") def __copy__(self, deep=True): return self.copy(deep=deep) diff --git a/pandas/tests/generic/test_metadata.py b/pandas/tests/generic/test_metadata.py index 8912d7214e3fc..9bee9a79b509c 100644 --- a/pandas/tests/generic/test_metadata.py +++ b/pandas/tests/generic/test_metadata.py @@ -3,18 +3,17 @@ import pandas as pd from pandas.core._meta import PandasMetadata -mymeta = PandasMetadata("attr") +class MyMeta(PandasMetadata): + def finalize_copy(self, new, other): + new.attr = other.attr + 1 -@mymeta.register(pd.core.generic.NDFrame.copy) -def _(new, other): - new.attr = other.attr + 1 + def finalize_concat(self, new, other): + assert isinstance(other, pd.core.reshape.concat._Concatenator) + new.attr = sum(x.attr for x in other.objs) -@mymeta.register("concat") -def _(new, other): - assert isinstance(other, pd.core.reshape.concat._Concatenator) - new.attr = sum(x.attr for x in other.objs) +mymeta = MyMeta("attr") @pytest.fixture From b05782c0127dc0eceb59ff489680d1903f01d113 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 9 Sep 2019 09:08:16 -0500 Subject: [PATCH 3/7] move to meta --- pandas/core/generic.py | 2 +- pandas/core/{_meta.py => meta.py} | 0 pandas/tests/generic/test_metadata.py | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) rename pandas/core/{_meta.py => meta.py} (100%) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 58d2b7a7e5ba7..5ae08bb2267ee 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -62,7 +62,6 @@ import pandas as pd from pandas._typing import Dtype, FilePathOrBuffer from pandas.core import missing, nanops -from pandas.core._meta import PandasMetadataType, default_finalizer import pandas.core.algorithms as algos from pandas.core.base import PandasObject, SelectionMixin import pandas.core.common as com @@ -77,6 +76,7 @@ from pandas.core.indexes.period import Period, PeriodIndex import pandas.core.indexing as indexing from pandas.core.internals import BlockManager +from pandas.core.meta import PandasMetadataType, default_finalizer from pandas.core.ops import _align_method_FRAME from pandas.io.formats import format as fmt diff --git a/pandas/core/_meta.py b/pandas/core/meta.py similarity index 100% rename from pandas/core/_meta.py rename to pandas/core/meta.py diff --git a/pandas/tests/generic/test_metadata.py b/pandas/tests/generic/test_metadata.py index 9bee9a79b509c..9899abf207a55 100644 --- a/pandas/tests/generic/test_metadata.py +++ b/pandas/tests/generic/test_metadata.py @@ -1,7 +1,7 @@ import pytest import pandas as pd -from pandas.core._meta import PandasMetadata +from pandas.core.meta import PandasMetadata class MyMeta(PandasMetadata): From 53576eb77151e24ddbf5edb37aaa3ad68f2e9379 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 9 Sep 2019 10:11:21 -0500 Subject: [PATCH 4/7] mypy --- pandas/core/meta.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/meta.py b/pandas/core/meta.py index 55907af6c7083..f2bbbbdf9d492 100644 --- a/pandas/core/meta.py +++ b/pandas/core/meta.py @@ -28,8 +28,7 @@ class PandasMetadataType(type): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - def __call__(self, *args: Any, **kwds: Any) -> Any: - name = args[0] + def __call__(self, name, *args: Any, **kwds: Any) -> Any: # type: ignore if name in self._instances: return self._instances[name] else: From 3009732e5a437b5f95f543e583c521f31e61bf3a Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 9 Sep 2019 13:23:22 -0500 Subject: [PATCH 5/7] fixed subclass --- pandas/core/generic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 5ae08bb2267ee..3538e4c3ba852 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -76,7 +76,7 @@ from pandas.core.indexes.period import Period, PeriodIndex import pandas.core.indexing as indexing from pandas.core.internals import BlockManager -from pandas.core.meta import PandasMetadataType, default_finalizer +from pandas.core.meta import PandasMetadata from pandas.core.ops import _align_method_FRAME from pandas.io.formats import format as fmt @@ -5177,7 +5177,7 @@ def __finalize__(self, other, method=None): """ for name in self._metadata: - finalizer = PandasMetadataType._instances.get(name, default_finalizer) + finalizer = PandasMetadata(name) # TODO: measure perf of this vs. a dict. if method == "copy": finalizer.finalize_copy(self, other) From 710d73a07d6b875e59ac641ea7b575736ef0e1ec Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 10 Sep 2019 11:58:35 -0500 Subject: [PATCH 6/7] define API --- pandas/_typing.py | 8 ++ pandas/core/frame.py | 2 +- pandas/core/generic.py | 5 + pandas/core/indexes/multi.py | 4 +- pandas/core/meta.py | 233 +++++++++++++++++++++++++++++++++-- pandas/core/series.py | 1 + 6 files changed, 241 insertions(+), 12 deletions(-) diff --git a/pandas/_typing.py b/pandas/_typing.py index 37a5d7945955d..8cffe3c9de0ad 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -15,6 +15,14 @@ from pandas.core.sparse.series import SparseSeries # noqa: F401 from pandas.core.generic import NDFrame # noqa: F401 + # TODO: Are we OK with these becoming semi-public? + from pandas.core.reshape.concat import _Concatenator as Concatenator # noqa: F401 + from pandas.core.reshape.merge import _OrderedMerge as OrderedMerge # noqa: F401 + from pandas.core.reshape.merge import _AsOfMerge as AsOfMerge # noqa: F401 + from pandas.core.reshape.merge import ( # noqa: F401 + _MergeOperation as MergeOperation, + ) + AnyArrayLike = TypeVar( "AnyArrayLike", "ExtensionArray", "Index", "Series", "SparseSeries", np.ndarray diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 16fece1c7eb8b..34cafe2b6c5df 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2920,7 +2920,7 @@ def _getitem_multilevel(self, key): result = self._constructor( new_values, index=self.index, columns=result_columns ) - result = result.__finalize__(self) + result = result.__finalize__(self, method="geitem_multilevel") # If there is only one column being returned, and its name is # either an empty string, or a tuple with an empty string as its diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 3538e4c3ba852..b59a9e2e833b4 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5183,6 +5183,11 @@ def __finalize__(self, other, method=None): finalizer.finalize_copy(self, other) elif method == "concat": finalizer.finalize_concat(self, other) + elif method == "getitem_multilevel": + finalizer.getitem_multilevel(self, other) + elif method == "sort_index": + finalizer.getitem_sort_index(self, other) + else: finalizer.finalize_default(self, other) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 761862b9f30e9..122cf721d934c 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1490,7 +1490,9 @@ def _try_mi(k): new_index = maybe_droplevels(new_index, k) return series._constructor( new_values, index=new_index, name=series.name - ).__finalize__(self) + ).__finalize__( + self + ) # TODO: figure out finalize API try: return self._engine.get_value(s, k) diff --git a/pandas/core/meta.py b/pandas/core/meta.py index f2bbbbdf9d492..0c5ee307c87d4 100644 --- a/pandas/core/meta.py +++ b/pandas/core/meta.py @@ -6,11 +6,19 @@ returns an ``NDFrame`` and would like to propagate the metadata (say ``Series.name``) from ``self`` to the new ``NDFrame``. """ -from typing import Any, Dict +from typing import TYPE_CHECKING, Any, Dict, Union from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries -from pandas._typing import FrameOrSeries +if TYPE_CHECKING: + from pandas._typing import ( + FrameOrSeries, + Series, + Concatenator, + OrderedMerge, + MergeOperation, + AsOfMerge, + ) class PandasMetadataType(type): @@ -56,7 +64,7 @@ class PandasMetadata(metaclass=PandasMetadataType): If you need custom metadata resolution, you'll need to subclass. >>> class IncrementMetadata: - ... def finalize_default(self, new, other): + ... def default(self, new, other): ... setattr(new, self.attr, getattr(other, self.name, -1) + 1) >>> increment_metadata = IncrementMetadata("attr") @@ -68,8 +76,9 @@ def __init__(self, name: str): def __repr__(self): return "PandasMetadata(name='{}')".format(self.name) - def finalize_default(self, new: "FrameOrSeries", other: Any): - # TODO: tighten the type on Any. Union[NDFrame, _Concatenator, ...] + def default( + self, new: "FrameOrSeries", other: Union["FrameOrSeries", "Concatenator"] + ): """ The default finalizer when this method, attribute hasn't been overridden. @@ -87,12 +96,216 @@ def finalize_default(self, new: "FrameOrSeries", other: Any): if isinstance(other, (ABCSeries, ABCDataFrame)): object.__setattr__(new, self.name, getattr(other, self.name, None)) - def finalize_copy(self, new: "FrameOrSeries", other: "FrameOrSeries"): - return self.finalize_default(new, other) + def sort_index(self, new: "FrameOrSeries", other: "FrameOrSeries"): + self.default(new, other) - def finalize_concat(self, new: "FrameOrSeries", other: Any): - # TODO: type other as _Concatenator - self.finalize_default(new, other) + def sort_values(self, new: "FrameOrSeries", other: "FrameOrSeries"): + self.default(new, other) + + def copy(self, new: "FrameOrSeries", other: "FrameOrSeries"): + return self.default(new, other) + + def rename(self, new: "FrameOrSeries", other: "FrameOrSeries"): + return self.default(new, other) + + # TODO: these are private methods. + def getitem_multilevel(self, new: "FrameOrSeries", other: "FrameOrSeries"): + self.default(new, other) + + def simple_replace(self, new: "FrameOrSeries", other: "FrameOrSeries"): + return self.default(new, other) + + def array_wrap(self, new: "FrameOrSeries", other: "FrameOrSeries"): + return self.default(new, other) + + def align_frame(self, new: "FrameOrSeries", other: "FrameOrSeries"): + # TODO: Check the types here. We end up calling __finalize__ twice. + # left.__finalize__(self) + # right.__finalize__(other) + # I think the type on right is Frame->Frame. + return self.default(new, other) + + def align_series(self, new: "FrameOrSeries", other: "FrameOrSeries"): + # TODO: Check the types here. We end up calling __finalize__ twice. + # left.__finalize__(self) + # right.__finalize__(other) + # I think the type on right is Series->Series. + return self.default(new, other) + + def consolidate(self, new: "FrameOrSeries", other: "FrameOrSeries"): + return self.default(new, other) + + def convert(self, new: "FrameOrSeries", other: "FrameOrSeries"): # XXX: private + return self.default(new, other) + + def get_bool_data( + self, new: "FrameOrSeries", other: "FrameOrSeries" + ): # XXX: private + return self.default(new, other) + + def get_numeric_data( + self, new: "FrameOrSeries", other: "FrameOrSeries" + ): # XXX: private + return self.default(new, other) + + def reindex_with_indexers( + self, new: "FrameOrSeries", other: "FrameOrSeries" + ): # XXX: private + return self.default(new, other) + + def slice(self, new: "FrameOrSeries", other: "FrameOrSeries"): # XXX: private + return self.default(new, other) + + def to_dict_of_blocks( + self, new: "FrameOrSeries", other: "FrameOrSeries" + ): # XXX: private + return self.default(new, other) + + def where(self, new: "FrameOrSeries", other: "FrameOrSeries"): # XXX: private + return self.default(new, other) + + def astype(self, new: "FrameOrSeries", other: "FrameOrSeries"): + return self.default(new, other) + + def fillna(self, new: "FrameOrSeries", other: "FrameOrSeries"): + return self.default(new, other) + + def infer_objects(self, new: "FrameOrSeries", other: "FrameOrSeries"): + return self.default(new, other) + + def interpolate(self, new: "FrameOrSeries", other: "FrameOrSeries"): + return self.default(new, other) + + def rank(self, new: "FrameOrSeries", other: "FrameOrSeries"): + return self.default(new, other) + + def replace(self, new: "FrameOrSeries", other: "FrameOrSeries"): + return self.default(new, other) + + def shift(self, new: "FrameOrSeries", other: "FrameOrSeries"): + return self.default(new, other) + + def slice_shift(self, new: "FrameOrSeries", other: "FrameOrSeries"): + return self.default(new, other) + + def swapaxes(self, new: "FrameOrSeries", other: "FrameOrSeries"): + return self.default(new, other) + + def take(self, new: "FrameOrSeries", other: "FrameOrSeries"): + return self.default(new, other) + + def transpose(self, new: "FrameOrSeries", other: "FrameOrSeries"): + return self.default(new, other) + + def tshift(self, new: "FrameOrSeries", other: "FrameOrSeries"): + return self.default(new, other) + + def tz_convert(self, new: "FrameOrSeries", other: "FrameOrSeries"): + return self.default(new, other) + + def tz_localize(self, new: "FrameOrSeries", other: "FrameOrSeries"): + return self.default(new, other) + + def cum_func(self, new: "FrameOrSeries", other: "FrameOrSeries"): # XXX: private + return self.default(new, other) + + # ops + # TODO: constrcut_result, wrapper + # def construct_result(self): + def flex_wrapper(self, new: "FrameOrSeries", other: Any): + return self.default(new, other) + + # ---------------------------------------------------------------------------------- + # Series + # TODO: How should we namespace these? + def series_getitem(self, new: "Series", other: "Series"): + return self.default(new, other) + + def series_get_values(self, new: "Series", other: "Series"): # XXX: private + return self.default(new, other) + + def series_get_values_tuple(self, new: "Series", other: "Series"): # XXX: private + return self.default(new, other) + + def series_apply(self, new: "Series", other: "Series"): + # TODO: We don't call __finalize__ when expanding to a DataFrame. May need to + # update the type + return self.default(new, other) + + def series_argsort(self, new: "Series", other: "Series"): + return self.default(new, other) + + def series_count(self, new: "Series", other: "Series"): + return self.default(new, other) + + def series_diff(self, new: "Series", other: "Series"): + return self.default(new, other) + + def series_dot(self, new: "Series", other: "Series"): + return self.default(new, other) + + def series_isin(self, new: "Series", other: "Series"): + return self.default(new, other) + + def series_map(self, new: "Series", other: "Series"): + return self.default(new, other) + + def series_repeat(self, new: "Series", other: "Series"): + return self.default(new, other) + + def series_reset_index(self, new: "Series", other: "Series"): + return self.default(new, other) + + def series_round(self, new: "Series", other: "Series"): + return self.default(new, other) + + def series_sort_index(self, new: "Series", other: "Series"): + return self.default(new, other) + + def series_sort_values(self, new: "Series", other: "Series"): + return self.default(new, other) + + def series_swaplevel(self, new: "Series", other: "Series"): + return self.default(new, other) + + def series_take(self, new: "Series", other: "Series"): + return self.default(new, other) + + def series_to_period(self, new: "Series", other: "Series"): + return self.default(new, other) + + def series_to_sparse(self, new: "Series", other: "Series"): + return self.default(new, other) + + def series_to_timestamp(self, new: "Series", other: "Series"): + return self.default(new, other) + + def series_view(self, new: "Series", other: "Series"): + return self.default(new, other) + + def series_duplicated(self, new: "Series", other: "Series"): + return self.default(new, other) + + # Other finalizers + def concat(self, new: "FrameOrSeries", other: "Concatenator"): + self.default(new, other) + + def merge( + self, new: "FrameOrSeries", other: "MergeOperation" + ): # todo: specialize? merge_asof? + self.default(new, other) + + def ordered_merge( + self, new: "FrameOrSeries", other: "OrderedMerge" + ): # todo: specialize? merge_asof? + self.default(new, other) + + def asof_merge( + self, new: "FrameOrSeries", other: "AsOfMerge" + ): # todo: specialize? merge_asof? + self.default(new, other) + + # TODO: Sparse? Just ignoring for now I think. class NameMetadata(PandasMetadata): diff --git a/pandas/core/series.py b/pandas/core/series.py index 10d50e89ca92e..bcfd1884172e2 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3999,6 +3999,7 @@ def f(x): if len(mapped) and isinstance(mapped[0], Series): # GH 25959 use pd.array instead of tolist # so extension arrays can be used + # TODO: would like to apply finalize here. return self._constructor_expanddim(pd.array(mapped), index=self.index) else: return self._constructor(mapped, index=self.index).__finalize__(self) From ecf3989ae5d6e4d73b4f250b07b8dfa11531e60c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 10 Sep 2019 12:13:59 -0500 Subject: [PATCH 7/7] simplify --- pandas/_typing.py | 8 - pandas/core/frame.py | 2 +- pandas/core/generic.py | 15 +- pandas/core/indexes/multi.py | 4 +- pandas/core/meta.py | 255 ++++---------------------- pandas/tests/generic/test_metadata.py | 16 +- 6 files changed, 49 insertions(+), 251 deletions(-) diff --git a/pandas/_typing.py b/pandas/_typing.py index 8cffe3c9de0ad..37a5d7945955d 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -15,14 +15,6 @@ from pandas.core.sparse.series import SparseSeries # noqa: F401 from pandas.core.generic import NDFrame # noqa: F401 - # TODO: Are we OK with these becoming semi-public? - from pandas.core.reshape.concat import _Concatenator as Concatenator # noqa: F401 - from pandas.core.reshape.merge import _OrderedMerge as OrderedMerge # noqa: F401 - from pandas.core.reshape.merge import _AsOfMerge as AsOfMerge # noqa: F401 - from pandas.core.reshape.merge import ( # noqa: F401 - _MergeOperation as MergeOperation, - ) - AnyArrayLike = TypeVar( "AnyArrayLike", "ExtensionArray", "Index", "Series", "SparseSeries", np.ndarray diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 34cafe2b6c5df..16fece1c7eb8b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2920,7 +2920,7 @@ def _getitem_multilevel(self, key): result = self._constructor( new_values, index=self.index, columns=result_columns ) - result = result.__finalize__(self, method="geitem_multilevel") + result = result.__finalize__(self) # If there is only one column being returned, and its name is # either an empty string, or a tuple with an empty string as its diff --git a/pandas/core/generic.py b/pandas/core/generic.py index b59a9e2e833b4..3eabeafe296ae 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5178,18 +5178,7 @@ def __finalize__(self, other, method=None): """ for name in self._metadata: finalizer = PandasMetadata(name) - # TODO: measure perf of this vs. a dict. - if method == "copy": - finalizer.finalize_copy(self, other) - elif method == "concat": - finalizer.finalize_concat(self, other) - elif method == "getitem_multilevel": - finalizer.getitem_multilevel(self, other) - elif method == "sort_index": - finalizer.getitem_sort_index(self, other) - - else: - finalizer.finalize_default(self, other) + finalizer.finalize(self, other, method) return self @@ -6029,7 +6018,7 @@ def copy(self, deep=True): dtype: object """ data = self._data.copy(deep=deep) - return self._constructor(data).__finalize__(self, method="copy") + return self._constructor(data).__finalize__(self) def __copy__(self, deep=True): return self.copy(deep=deep) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 122cf721d934c..761862b9f30e9 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1490,9 +1490,7 @@ def _try_mi(k): new_index = maybe_droplevels(new_index, k) return series._constructor( new_values, index=new_index, name=series.name - ).__finalize__( - self - ) # TODO: figure out finalize API + ).__finalize__(self) try: return self._engine.get_value(s, k) diff --git a/pandas/core/meta.py b/pandas/core/meta.py index 0c5ee307c87d4..bf2617b446a39 100644 --- a/pandas/core/meta.py +++ b/pandas/core/meta.py @@ -6,19 +6,12 @@ returns an ``NDFrame`` and would like to propagate the metadata (say ``Series.name``) from ``self`` to the new ``NDFrame``. """ -from typing import TYPE_CHECKING, Any, Dict, Union +from typing import TYPE_CHECKING, Any, Dict from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries if TYPE_CHECKING: - from pandas._typing import ( - FrameOrSeries, - Series, - Concatenator, - OrderedMerge, - MergeOperation, - AsOfMerge, - ) + from pandas._typing import FrameOrSeries class PandasMetadataType(type): @@ -76,9 +69,36 @@ def __init__(self, name: str): def __repr__(self): return "PandasMetadata(name='{}')".format(self.name) - def default( - self, new: "FrameOrSeries", other: Union["FrameOrSeries", "Concatenator"] - ): + def finalize(self, new: "FrameOrSeries", other: Any, method): + """ + Run the finalization for `method`. + + Parameters + ---------- + new : DataFrame or Series + other : Any + One of the following types + + * DataFrame + * Series + * Concatenator + * MergeOperation + + method : str + The source method. + + Returns + ------- + None + Expected to operate inplace. + + Notes + ----- + The default implementation simply calls ``.default``, ignoring `method`. + """ + self.default(new, other) + + def default(self, new: "FrameOrSeries", other: Any): """ The default finalizer when this method, attribute hasn't been overridden. @@ -96,217 +116,6 @@ def default( if isinstance(other, (ABCSeries, ABCDataFrame)): object.__setattr__(new, self.name, getattr(other, self.name, None)) - def sort_index(self, new: "FrameOrSeries", other: "FrameOrSeries"): - self.default(new, other) - - def sort_values(self, new: "FrameOrSeries", other: "FrameOrSeries"): - self.default(new, other) - - def copy(self, new: "FrameOrSeries", other: "FrameOrSeries"): - return self.default(new, other) - - def rename(self, new: "FrameOrSeries", other: "FrameOrSeries"): - return self.default(new, other) - - # TODO: these are private methods. - def getitem_multilevel(self, new: "FrameOrSeries", other: "FrameOrSeries"): - self.default(new, other) - - def simple_replace(self, new: "FrameOrSeries", other: "FrameOrSeries"): - return self.default(new, other) - - def array_wrap(self, new: "FrameOrSeries", other: "FrameOrSeries"): - return self.default(new, other) - - def align_frame(self, new: "FrameOrSeries", other: "FrameOrSeries"): - # TODO: Check the types here. We end up calling __finalize__ twice. - # left.__finalize__(self) - # right.__finalize__(other) - # I think the type on right is Frame->Frame. - return self.default(new, other) - - def align_series(self, new: "FrameOrSeries", other: "FrameOrSeries"): - # TODO: Check the types here. We end up calling __finalize__ twice. - # left.__finalize__(self) - # right.__finalize__(other) - # I think the type on right is Series->Series. - return self.default(new, other) - - def consolidate(self, new: "FrameOrSeries", other: "FrameOrSeries"): - return self.default(new, other) - - def convert(self, new: "FrameOrSeries", other: "FrameOrSeries"): # XXX: private - return self.default(new, other) - - def get_bool_data( - self, new: "FrameOrSeries", other: "FrameOrSeries" - ): # XXX: private - return self.default(new, other) - - def get_numeric_data( - self, new: "FrameOrSeries", other: "FrameOrSeries" - ): # XXX: private - return self.default(new, other) - - def reindex_with_indexers( - self, new: "FrameOrSeries", other: "FrameOrSeries" - ): # XXX: private - return self.default(new, other) - - def slice(self, new: "FrameOrSeries", other: "FrameOrSeries"): # XXX: private - return self.default(new, other) - - def to_dict_of_blocks( - self, new: "FrameOrSeries", other: "FrameOrSeries" - ): # XXX: private - return self.default(new, other) - - def where(self, new: "FrameOrSeries", other: "FrameOrSeries"): # XXX: private - return self.default(new, other) - - def astype(self, new: "FrameOrSeries", other: "FrameOrSeries"): - return self.default(new, other) - - def fillna(self, new: "FrameOrSeries", other: "FrameOrSeries"): - return self.default(new, other) - - def infer_objects(self, new: "FrameOrSeries", other: "FrameOrSeries"): - return self.default(new, other) - - def interpolate(self, new: "FrameOrSeries", other: "FrameOrSeries"): - return self.default(new, other) - - def rank(self, new: "FrameOrSeries", other: "FrameOrSeries"): - return self.default(new, other) - - def replace(self, new: "FrameOrSeries", other: "FrameOrSeries"): - return self.default(new, other) - - def shift(self, new: "FrameOrSeries", other: "FrameOrSeries"): - return self.default(new, other) - - def slice_shift(self, new: "FrameOrSeries", other: "FrameOrSeries"): - return self.default(new, other) - - def swapaxes(self, new: "FrameOrSeries", other: "FrameOrSeries"): - return self.default(new, other) - - def take(self, new: "FrameOrSeries", other: "FrameOrSeries"): - return self.default(new, other) - - def transpose(self, new: "FrameOrSeries", other: "FrameOrSeries"): - return self.default(new, other) - - def tshift(self, new: "FrameOrSeries", other: "FrameOrSeries"): - return self.default(new, other) - - def tz_convert(self, new: "FrameOrSeries", other: "FrameOrSeries"): - return self.default(new, other) - - def tz_localize(self, new: "FrameOrSeries", other: "FrameOrSeries"): - return self.default(new, other) - - def cum_func(self, new: "FrameOrSeries", other: "FrameOrSeries"): # XXX: private - return self.default(new, other) - - # ops - # TODO: constrcut_result, wrapper - # def construct_result(self): - def flex_wrapper(self, new: "FrameOrSeries", other: Any): - return self.default(new, other) - - # ---------------------------------------------------------------------------------- - # Series - # TODO: How should we namespace these? - def series_getitem(self, new: "Series", other: "Series"): - return self.default(new, other) - - def series_get_values(self, new: "Series", other: "Series"): # XXX: private - return self.default(new, other) - - def series_get_values_tuple(self, new: "Series", other: "Series"): # XXX: private - return self.default(new, other) - - def series_apply(self, new: "Series", other: "Series"): - # TODO: We don't call __finalize__ when expanding to a DataFrame. May need to - # update the type - return self.default(new, other) - - def series_argsort(self, new: "Series", other: "Series"): - return self.default(new, other) - - def series_count(self, new: "Series", other: "Series"): - return self.default(new, other) - - def series_diff(self, new: "Series", other: "Series"): - return self.default(new, other) - - def series_dot(self, new: "Series", other: "Series"): - return self.default(new, other) - - def series_isin(self, new: "Series", other: "Series"): - return self.default(new, other) - - def series_map(self, new: "Series", other: "Series"): - return self.default(new, other) - - def series_repeat(self, new: "Series", other: "Series"): - return self.default(new, other) - - def series_reset_index(self, new: "Series", other: "Series"): - return self.default(new, other) - - def series_round(self, new: "Series", other: "Series"): - return self.default(new, other) - - def series_sort_index(self, new: "Series", other: "Series"): - return self.default(new, other) - - def series_sort_values(self, new: "Series", other: "Series"): - return self.default(new, other) - - def series_swaplevel(self, new: "Series", other: "Series"): - return self.default(new, other) - - def series_take(self, new: "Series", other: "Series"): - return self.default(new, other) - - def series_to_period(self, new: "Series", other: "Series"): - return self.default(new, other) - - def series_to_sparse(self, new: "Series", other: "Series"): - return self.default(new, other) - - def series_to_timestamp(self, new: "Series", other: "Series"): - return self.default(new, other) - - def series_view(self, new: "Series", other: "Series"): - return self.default(new, other) - - def series_duplicated(self, new: "Series", other: "Series"): - return self.default(new, other) - - # Other finalizers - def concat(self, new: "FrameOrSeries", other: "Concatenator"): - self.default(new, other) - - def merge( - self, new: "FrameOrSeries", other: "MergeOperation" - ): # todo: specialize? merge_asof? - self.default(new, other) - - def ordered_merge( - self, new: "FrameOrSeries", other: "OrderedMerge" - ): # todo: specialize? merge_asof? - self.default(new, other) - - def asof_merge( - self, new: "FrameOrSeries", other: "AsOfMerge" - ): # todo: specialize? merge_asof? - self.default(new, other) - - # TODO: Sparse? Just ignoring for now I think. - class NameMetadata(PandasMetadata): """Finalization for Series.name""" diff --git a/pandas/tests/generic/test_metadata.py b/pandas/tests/generic/test_metadata.py index 9899abf207a55..0a7ca78f11f99 100644 --- a/pandas/tests/generic/test_metadata.py +++ b/pandas/tests/generic/test_metadata.py @@ -5,7 +5,15 @@ class MyMeta(PandasMetadata): - def finalize_copy(self, new, other): + def finalize(self, new, other, method): + if method == "concat": + self.finalize_concat(new, other) + elif method == "copy": + self.finalize_copy(new, other) + else: + super().finalize(new, other, method) + + def default(self, new, other): new.attr = other.attr + 1 def finalize_concat(self, new, other): @@ -28,7 +36,8 @@ def custom_meta(monkeypatch): monkeypatch.setattr(cls, "_metadata", custom_metadata) -def test_custom_finalizer(custom_meta): +@pytest.mark.usefixtures("custom_meta") +def test_custom_finalizer(): df = pd.DataFrame({"A": [1, 2]}) df.attr = 0 @@ -37,7 +46,8 @@ def test_custom_finalizer(custom_meta): assert result.attr == 1 -def test_concat(custom_meta): +@pytest.mark.usefixtures("custom_meta") +def test_concat(): df1 = pd.DataFrame({"A": [1, 2]}) df1.attr = 2