From 4c5eddd63e94bacddb96bf61f81a6a8fcd9c33f0 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 20 Aug 2020 21:19:10 -0700 Subject: [PATCH 01/71] REF: remove unnecesary try/except --- pandas/core/groupby/generic.py | 69 ++++++++++++++++------------------ 1 file changed, 33 insertions(+), 36 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 166631e69f523..51532a75d2d4a 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -31,7 +31,7 @@ import numpy as np from pandas._libs import lib -from pandas._typing import FrameOrSeries, FrameOrSeriesUnion +from pandas._typing import ArrayLike, FrameOrSeries, FrameOrSeriesUnion from pandas.util._decorators import Appender, Substitution, doc from pandas.core.dtypes.cast import ( @@ -60,6 +60,7 @@ validate_func_kwargs, ) import pandas.core.algorithms as algorithms +from pandas.core.arrays import ExtensionArray from pandas.core.base import DataError, SpecificationError import pandas.core.common as com from pandas.core.construction import create_series_with_explicit_dtype @@ -1034,32 +1035,31 @@ def _cython_agg_blocks( no_result = object() - def cast_result_block(result, block: "Block", how: str) -> "Block": - # see if we can cast the block to the desired dtype + def cast_agg_result(result, values: ArrayLike, how: str) -> ArrayLike: + # see if we can cast the values to the desired dtype # this may not be the original dtype assert not isinstance(result, DataFrame) assert result is not no_result - dtype = maybe_cast_result_dtype(block.dtype, how) + dtype = maybe_cast_result_dtype(values.dtype, how) result = maybe_downcast_numeric(result, dtype) - if block.is_extension and isinstance(result, np.ndarray): - # e.g. block.values was an IntegerArray - # (1, N) case can occur if block.values was Categorical + if isinstance(values, ExtensionArray) and isinstance(result, np.ndarray): + # e.g. values was an IntegerArray + # (1, N) case can occur if values was Categorical # and result is ndarray[object] # TODO(EA2D): special casing not needed with 2D EAs assert result.ndim == 1 or result.shape[0] == 1 try: # Cast back if feasible - result = type(block.values)._from_sequence( - result.ravel(), dtype=block.values.dtype + result = type(values)._from_sequence( + result.ravel(), dtype=values.dtype ) except (ValueError, TypeError): # reshape to be valid for non-Extension Block result = result.reshape(1, -1) - agg_block: "Block" = block.make_block(result) - return agg_block + return result def blk_func(block: "Block") -> List["Block"]: new_blocks: List["Block"] = [] @@ -1093,33 +1093,30 @@ def blk_func(block: "Block") -> List["Block"]: # Categoricals. This will done by later self._reindex_output() # Doing it here creates an error. See GH#34951 sgb = get_groupby(obj, self.grouper, observed=True) - try: - result = sgb.aggregate(lambda x: alt(x, axis=self.axis)) - except TypeError: - # we may have an exception in trying to aggregate - # continue and exclude the block - raise + result = sgb.aggregate(lambda x: alt(x, axis=self.axis)) + + result = cast(DataFrame, result) + # unwrap DataFrame to get array + if len(result._mgr.blocks) != 1: + # We've split an object block! Everything we've assumed + # about a single block input returning a single block output + # is a lie. To keep the code-path for the typical non-split case + # clean, we choose to clean up this mess later on. 
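# A minimal runnable sketch (an aside, not part of this diff) of the
# cast-back step in cast_agg_result above: a cython aggregation over an
# extension-array-backed block can come back as a plain ndarray, and
# _from_sequence rebuilds the original EA dtype. The sample values are
# hypothetical; pandas 1.x behavior assumed.
import numpy as np
import pandas as pd

ea_values = pd.array([1, 2, None], dtype="Int64")  # e.g. block.values
agg_result = np.array([3.0])                       # e.g. what a cython "sum" returned
restored = type(ea_values)._from_sequence(agg_result.ravel(), dtype=ea_values.dtype)
print(restored.dtype)  # Int64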
+ assert len(locs) == result.shape[1] + for i, loc in enumerate(locs): + agg_block = result.iloc[:, [i]]._mgr.blocks[0] + agg_block.mgr_locs = [loc] + new_blocks.append(agg_block) else: - result = cast(DataFrame, result) - # unwrap DataFrame to get array - if len(result._mgr.blocks) != 1: - # We've split an object block! Everything we've assumed - # about a single block input returning a single block output - # is a lie. To keep the code-path for the typical non-split case - # clean, we choose to clean up this mess later on. - assert len(locs) == result.shape[1] - for i, loc in enumerate(locs): - agg_block = result.iloc[:, [i]]._mgr.blocks[0] - agg_block.mgr_locs = [loc] - new_blocks.append(agg_block) - else: - result = result._mgr.blocks[0].values - if isinstance(result, np.ndarray) and result.ndim == 1: - result = result.reshape(1, -1) - agg_block = cast_result_block(result, block, how) - new_blocks = [agg_block] + result = result._mgr.blocks[0].values + if isinstance(result, np.ndarray) and result.ndim == 1: + result = result.reshape(1, -1) + res_values = cast_agg_result(result, block.values, how) + agg_block = block.make_block(res_values) + new_blocks = [agg_block] else: - agg_block = cast_result_block(result, block, how) + res_values = cast_agg_result(result, block.values, how) + agg_block = block.make_block(res_values) new_blocks = [agg_block] return new_blocks From 42649fbb855a895ee5818d7dc80bdbd0ce0e9f5a Mon Sep 17 00:00:00 2001 From: Karthik Mathur <22126205+mathurk1@users.noreply.github.com> Date: Fri, 21 Aug 2020 17:34:51 -0500 Subject: [PATCH 02/71] TST: add test for agg on ordered categorical cols (#35630) --- .../tests/groupby/aggregate/test_aggregate.py | 79 +++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index ce9d4b892d775..8fe450fe6abfc 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -1063,6 +1063,85 @@ def test_groupby_get_by_index(): pd.testing.assert_frame_equal(res, expected) +@pytest.mark.parametrize( + "grp_col_dict, exp_data", + [ + ({"nr": "min", "cat_ord": "min"}, {"nr": [1, 5], "cat_ord": ["a", "c"]}), + ({"cat_ord": "min"}, {"cat_ord": ["a", "c"]}), + ({"nr": "min"}, {"nr": [1, 5]}), + ], +) +def test_groupby_single_agg_cat_cols(grp_col_dict, exp_data): + # test single aggregations on ordered categorical cols GHGH27800 + + # create the result dataframe + input_df = pd.DataFrame( + { + "nr": [1, 2, 3, 4, 5, 6, 7, 8], + "cat_ord": list("aabbccdd"), + "cat": list("aaaabbbb"), + } + ) + + input_df = input_df.astype({"cat": "category", "cat_ord": "category"}) + input_df["cat_ord"] = input_df["cat_ord"].cat.as_ordered() + result_df = input_df.groupby("cat").agg(grp_col_dict) + + # create expected dataframe + cat_index = pd.CategoricalIndex( + ["a", "b"], categories=["a", "b"], ordered=False, name="cat", dtype="category" + ) + + expected_df = pd.DataFrame(data=exp_data, index=cat_index) + + tm.assert_frame_equal(result_df, expected_df) + + +@pytest.mark.parametrize( + "grp_col_dict, exp_data", + [ + ({"nr": ["min", "max"], "cat_ord": "min"}, [(1, 4, "a"), (5, 8, "c")]), + ({"nr": "min", "cat_ord": ["min", "max"]}, [(1, "a", "b"), (5, "c", "d")]), + ({"cat_ord": ["min", "max"]}, [("a", "b"), ("c", "d")]), + ], +) +def test_groupby_combined_aggs_cat_cols(grp_col_dict, exp_data): + # test combined aggregations on ordered categorical cols GH27800 + + # create the result dataframe + 
input_df = pd.DataFrame( + { + "nr": [1, 2, 3, 4, 5, 6, 7, 8], + "cat_ord": list("aabbccdd"), + "cat": list("aaaabbbb"), + } + ) + + input_df = input_df.astype({"cat": "category", "cat_ord": "category"}) + input_df["cat_ord"] = input_df["cat_ord"].cat.as_ordered() + result_df = input_df.groupby("cat").agg(grp_col_dict) + + # create expected dataframe + cat_index = pd.CategoricalIndex( + ["a", "b"], categories=["a", "b"], ordered=False, name="cat", dtype="category" + ) + + # unpack the grp_col_dict to create the multi-index tuple + # this tuple will be used to create the expected dataframe index + multi_index_list = [] + for k, v in grp_col_dict.items(): + if isinstance(v, list): + for value in v: + multi_index_list.append([k, value]) + else: + multi_index_list.append([k, v]) + multi_index = pd.MultiIndex.from_tuples(tuple(multi_index_list)) + + expected_df = pd.DataFrame(data=exp_data, columns=multi_index, index=cat_index) + + tm.assert_frame_equal(result_df, expected_df) + + def test_nonagg_agg(): # GH 35490 - Single/Multiple agg of non-agg function give same results # TODO: agg should raise for functions that don't aggregate From 47121ddc1c655f428c6c3fcea8fbf02eba85600a Mon Sep 17 00:00:00 2001 From: tkmz-n <60312218+tkmz-n@users.noreply.github.com> Date: Sat, 22 Aug 2020 07:42:50 +0900 Subject: [PATCH 03/71] TST: resample does not yield empty groups (#10603) (#35799) --- pandas/tests/resample/test_timedelta.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/pandas/tests/resample/test_timedelta.py b/pandas/tests/resample/test_timedelta.py index 0fbb60c176b30..3fa85e62d028c 100644 --- a/pandas/tests/resample/test_timedelta.py +++ b/pandas/tests/resample/test_timedelta.py @@ -150,3 +150,18 @@ def test_resample_timedelta_edge_case(start, end, freq, resample_freq): tm.assert_index_equal(result.index, expected_index) assert result.index.freq == expected_index.freq assert not np.isnan(result[-1]) + + +def test_resample_with_timedelta_yields_no_empty_groups(): + # GH 10603 + df = pd.DataFrame( + np.random.normal(size=(10000, 4)), + index=pd.timedelta_range(start="0s", periods=10000, freq="3906250n"), + ) + result = df.loc["1s":, :].resample("3s").apply(lambda x: len(x)) + + expected = pd.DataFrame( + [[768.0] * 4] * 12 + [[528.0] * 4], + index=pd.timedelta_range(start="1s", periods=13, freq="3s"), + ) + tm.assert_frame_equal(result, expected) From 1decb3e0ee1923a29b8eded7507bcb783b3870d0 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 21 Aug 2020 18:48:02 -0700 Subject: [PATCH 04/71] revert accidental rebase --- pandas/core/groupby/generic.py | 61 ++++++++++++++++++---------------- 1 file changed, 32 insertions(+), 29 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 4b1f6cfe0a662..60e23b14eaf09 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -30,7 +30,7 @@ import numpy as np from pandas._libs import lib -from pandas._typing import ArrayLike, FrameOrSeries, FrameOrSeriesUnion +from pandas._typing import FrameOrSeries, FrameOrSeriesUnion from pandas.util._decorators import Appender, Substitution, doc from pandas.core.dtypes.cast import ( @@ -59,7 +59,6 @@ validate_func_kwargs, ) import pandas.core.algorithms as algorithms -from pandas.core.arrays import ExtensionArray from pandas.core.base import DataError, SpecificationError import pandas.core.common as com from pandas.core.construction import create_series_with_explicit_dtype @@ -1034,31 +1033,32 @@ def _cython_agg_blocks( no_result = object() 
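# A minimal sketch (an aside, not part of this diff) of the sentinel
# pattern behind no_result above: a fresh object() is identity-unique, so
# it flags "no aggregation produced yet" without ever colliding with a
# real result value, None and NaN included.
sentinel = object()
result = sentinel
# ... attempt an aggregation that may leave result unset ...
if result is sentinel:
    pass  # exclude the block / fall back to another path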
- def cast_agg_result(result, values: ArrayLike, how: str) -> ArrayLike: - # see if we can cast the values to the desired dtype + def cast_result_block(result, block: "Block", how: str) -> "Block": + # see if we can cast the block to the desired dtype # this may not be the original dtype assert not isinstance(result, DataFrame) assert result is not no_result - dtype = maybe_cast_result_dtype(values.dtype, how) + dtype = maybe_cast_result_dtype(block.dtype, how) result = maybe_downcast_numeric(result, dtype) - if isinstance(values, ExtensionArray) and isinstance(result, np.ndarray): - # e.g. values was an IntegerArray - # (1, N) case can occur if values was Categorical + if block.is_extension and isinstance(result, np.ndarray): + # e.g. block.values was an IntegerArray + # (1, N) case can occur if block.values was Categorical # and result is ndarray[object] # TODO(EA2D): special casing not needed with 2D EAs assert result.ndim == 1 or result.shape[0] == 1 try: # Cast back if feasible - result = type(values)._from_sequence( - result.ravel(), dtype=values.dtype + result = type(block.values)._from_sequence( + result.ravel(), dtype=block.values.dtype ) except (ValueError, TypeError): # reshape to be valid for non-Extension Block result = result.reshape(1, -1) - return result + agg_block: "Block" = block.make_block(result) + return agg_block def blk_func(block: "Block") -> List["Block"]: new_blocks: List["Block"] = [] @@ -1092,25 +1092,28 @@ def blk_func(block: "Block") -> List["Block"]: # Categoricals. This will done by later self._reindex_output() # Doing it here creates an error. See GH#34951 sgb = get_groupby(obj, self.grouper, observed=True) - result = sgb.aggregate(lambda x: alt(x, axis=self.axis)) - - assert isinstance(result, (Series, DataFrame)) # for mypy - # In the case of object dtype block, it may have been split - # in the operation. We un-split here. - result = result._consolidate() - assert isinstance(result, (Series, DataFrame)) # for mypy - assert len(result._mgr.blocks) == 1 - - # unwrap DataFrame to get array - result = result._mgr.blocks[0].values - if isinstance(result, np.ndarray) and result.ndim == 1: - result = result.reshape(1, -1) - res_values = cast_agg_result(result, block.values, how) - agg_block = block.make_block(res_values) - new_blocks = [agg_block] + try: + result = sgb.aggregate(lambda x: alt(x, axis=self.axis)) + except TypeError: + # we may have an exception in trying to aggregate + # continue and exclude the block + raise + else: + assert isinstance(result, (Series, DataFrame)) # for mypy + # In the case of object dtype block, it may have been split + # in the operation. We un-split here. 
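# A minimal sketch (an aside, not part of this diff) of the un-splitting
# that _consolidate() performs below, written against the private
# _mgr/_consolidate internals the hunk itself uses; data is hypothetical.
# Adding a second object column leaves two object blocks until
# consolidation merges them back into one.
import pandas as pd

df = pd.DataFrame({"a": ["x", "y"]})
df["b"] = ["u", "v"]                       # appended as a separate object block
print(len(df._mgr.blocks))                 # 2
print(len(df._consolidate()._mgr.blocks))  # 1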
+ result = result._consolidate() + assert isinstance(result, (Series, DataFrame)) # for mypy + assert len(result._mgr.blocks) == 1 + + # unwrap DataFrame to get array + result = result._mgr.blocks[0].values + if isinstance(result, np.ndarray) and result.ndim == 1: + result = result.reshape(1, -1) + agg_block = cast_result_block(result, block, how) + new_blocks = [agg_block] else: - res_values = cast_agg_result(result, block.values, how) - agg_block = block.make_block(res_values) + agg_block = cast_result_block(result, block, how) new_blocks = [agg_block] return new_blocks From 51205a51dd75c791848c353e9af3d8b46aa4afd6 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 26 Aug 2020 18:51:45 -0700 Subject: [PATCH 05/71] REF/BUG: don't go through cython for EA indexes --- pandas/core/groupby/generic.py | 50 +++++++++++++++++++++++++++++----- pandas/core/groupby/ops.py | 5 ++++ 2 files changed, 48 insertions(+), 7 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 2afa56b50c3c7..36db78a77c511 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -74,7 +74,14 @@ get_groupby, ) from pandas.core.groupby.numba_ import generate_numba_func, split_for_numba -from pandas.core.indexes.api import Index, MultiIndex, all_indexes_same +from pandas.core.indexes.api import ( + DatetimeIndex, + Index, + MultiIndex, + PeriodIndex, + TimedeltaIndex, + all_indexes_same, +) import pandas.core.indexes.base as ibase from pandas.core.internals import BlockManager, make_block from pandas.core.series import Series @@ -262,17 +269,46 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) if self.grouper.nkeys > 1: return self._python_agg_general(func, *args, **kwargs) - try: - return self._python_agg_general(func, *args, **kwargs) - except (ValueError, KeyError): - # TODO: KeyError is raised in _python_agg_general, - # see see test_groupby.test_basic + if isinstance( + self._selected_obj.index, (DatetimeIndex, TimedeltaIndex, PeriodIndex) + ): + # using _python_agg_general would end up incorrectly patching + # _index_data in reduction.pyx result = self._aggregate_named(func, *args, **kwargs) + else: + try: + return self._python_agg_general(func, *args, **kwargs) + except (ValueError, KeyError): + # TODO: KeyError is raised in _python_agg_general, + # see see test_groupby.test_basic + result = self._aggregate_named(func, *args, **kwargs) index = Index(sorted(result), name=self.grouper.names[0]) + if isinstance(index, (DatetimeIndex, TimedeltaIndex)): + # TODO: do we _always_ want to do this? + # shouldnt this be done later in eg _wrap_aggregated_output? + index = index._with_freq("infer") + + result_index = self.grouper.result_index + + if ( + result_index.dtype == index.dtype + and result_index.freq is not None + and index.freq is None + ): + # TODO: will dtype equality always hold? + if len(index) == 1: + index.freq = result_index.freq + + elif len(index) == 2: + if index[0] + result_index.freq == index[1]: + # infer_freq doesn't handle length-2 indexes + index.freq = result_index.freq + ret = create_series_with_explicit_dtype( result, index=index, dtype_if_empty=object ) + ret.name = self._selected_obj.name # test_metadata_propagation_indiv if not self.as_index: # pragma: no cover print("Warning, ignoring as_index=True") @@ -478,7 +514,7 @@ def _get_index() -> Index: def _aggregate_named(self, func, *args, **kwargs): result = {} - for name, group in self: + for name, group in self: # TODO: could we have duplicate names? 
group.name = name output = func(group, *args, **kwargs) if isinstance(output, (Series, Index, np.ndarray)): diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index c6171a55359fe..66a9f1353d3c5 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -672,6 +672,11 @@ def _aggregate_series_pure_python( # e.g. test_agg_lambda_with_timezone lambda e: e.head(1) # FIXME: are we potentially losing important res.index info? res = res.item() + elif group.dtype == object: + # TODO: is this at all right? + # e.g. test_agg_over_numpy_arrays where we have entries + # that are each ndarrays + pass else: raise ValueError("Function does not reduce") result = np.empty(ngroups, dtype="O") From f453c5b3c74a86d4012b9478a3b64204f7cd81dc Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 26 Aug 2020 20:46:43 -0700 Subject: [PATCH 06/71] Implement _aggregate_maybe_named --- pandas/core/groupby/generic.py | 30 +++++++++++++++++++++++++++--- pandas/core/groupby/ops.py | 5 ----- 2 files changed, 27 insertions(+), 8 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 36db78a77c511..9b72157ddd087 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -274,14 +274,14 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) ): # using _python_agg_general would end up incorrectly patching # _index_data in reduction.pyx - result = self._aggregate_named(func, *args, **kwargs) + result = self._aggregate_maybe_named(func, *args, **kwargs) else: try: return self._python_agg_general(func, *args, **kwargs) except (ValueError, KeyError): # TODO: KeyError is raised in _python_agg_general, # see see test_groupby.test_basic - result = self._aggregate_named(func, *args, **kwargs) + result = self._aggregate_maybe_named(func, *args, **kwargs) index = Index(sorted(result), name=self.grouper.names[0]) if isinstance(index, (DatetimeIndex, TimedeltaIndex)): @@ -511,11 +511,35 @@ def _get_index() -> Index: ) return self._reindex_output(result) + def _aggregate_maybe_named(self, func, *args, **kwargs): + """ + Try the named-aggregator first, then unnamed, which better matches + what libreduction does. + """ + try: + return self._aggregate_named(func, *args, **kwargs) + except KeyError: + return self._aggregate_unnamed(func, *args, **kwargs) + def _aggregate_named(self, func, *args, **kwargs): result = {} for name, group in self: # TODO: could we have duplicate names? - group.name = name + group.name = name # only difference vs _aggregate_unnamed + output = func(group, *args, **kwargs) + if isinstance(output, (Series, Index, np.ndarray)): + raise ValueError("Must produce aggregated value") + result[name] = output + + return result + + def _aggregate_unnamed(self, func, *args, **kwargs): + """ + Pure-python analogue of what _python_agg_general does. + """ + result = {} + + for name, group in self: # TODO: could we have duplicate names? output = func(group, *args, **kwargs) if isinstance(output, (Series, Index, np.ndarray)): raise ValueError("Must produce aggregated value") diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 66a9f1353d3c5..c6171a55359fe 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -672,11 +672,6 @@ def _aggregate_series_pure_python( # e.g. test_agg_lambda_with_timezone lambda e: e.head(1) # FIXME: are we potentially losing important res.index info? res = res.item() - elif group.dtype == object: - # TODO: is this at all right? - # e.g. 
test_agg_over_numpy_arrays where we have entries - # that are each ndarrays - pass else: raise ValueError("Function does not reduce") result = np.empty(ngroups, dtype="O") From 2ae2124fab275218268b680f5d5ce9e4bbefebe9 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 27 Aug 2020 09:01:00 -0700 Subject: [PATCH 07/71] de-duplicate --- pandas/core/groupby/generic.py | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 9b72157ddd087..7927a77141b3d 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -517,32 +517,28 @@ def _aggregate_maybe_named(self, func, *args, **kwargs): what libreduction does. """ try: - return self._aggregate_named(func, *args, **kwargs) + return self._aggregate_named(func, *args, named=True, **kwargs) except KeyError: - return self._aggregate_unnamed(func, *args, **kwargs) + return self._aggregate_named(func, *args, named=False, **kwargs) - def _aggregate_named(self, func, *args, **kwargs): + def _aggregate_named(self, func, *args, named: bool = True, **kwargs): result = {} for name, group in self: # TODO: could we have duplicate names? - group.name = name # only difference vs _aggregate_unnamed - output = func(group, *args, **kwargs) - if isinstance(output, (Series, Index, np.ndarray)): - raise ValueError("Must produce aggregated value") - result[name] = output + if named: + group.name = name - return result - - def _aggregate_unnamed(self, func, *args, **kwargs): - """ - Pure-python analogue of what _python_agg_general does. - """ - result = {} - - for name, group in self: # TODO: could we have duplicate names? output = func(group, *args, **kwargs) if isinstance(output, (Series, Index, np.ndarray)): - raise ValueError("Must produce aggregated value") + if ( + isinstance(output, Series) + and len(output) == 1 + and name in output.index + ): + # FIXME: kludge for test_resampler_grouper.test_apply + output = output.iloc[0] + else: + raise ValueError("Must produce aggregated value") result[name] = output return result From 98a91a321ff758682bff573ecee1b0bf2e0e6d2e Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 27 Aug 2020 14:48:49 -0700 Subject: [PATCH 08/71] avoid passing RangeIndex to libreduction --- pandas/core/groupby/ops.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index c6171a55359fe..98e4539adbe24 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -45,7 +45,7 @@ from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame from pandas.core.groupby import base, grouper -from pandas.core.indexes.api import Index, MultiIndex, ensure_index +from pandas.core.indexes.api import Index, MultiIndex, RangeIndex, ensure_index from pandas.core.series import Series from pandas.core.sorting import ( compress_group_index, @@ -620,8 +620,10 @@ def agg_series( # TODO: can we get a performant workaround for EAs backed by ndarray? 
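# A minimal sketch (an aside, not part of this diff) of why the named path
# in _aggregate_named above sets group.name: with .agg, each group Series
# carries its group key as .name, so a UDF can read it. Data is
# hypothetical.
import pandas as pd

ser = pd.Series([1, 2, 3, 4])
out = ser.groupby([10, 10, 20, 20]).agg(lambda g: g.sum() * g.name)
print(out)  # key 10 -> 30, key 20 -> 140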
return self._aggregate_series_pure_python(obj, func) - elif obj.index._has_complex_internals: + elif obj.index._has_complex_internals or isinstance(obj.index, RangeIndex): # Preempt TypeError in _aggregate_series_fast + # exclude RangeIndex because patching it in libreduction would + # silently be incorrect return self._aggregate_series_pure_python(obj, func) try: From c230f72b502446ac3c4a65fe7c79c7314b158bb0 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 1 Sep 2020 19:29:18 -0700 Subject: [PATCH 09/71] simplify --- pandas/core/groupby/generic.py | 23 ++--------------------- pandas/tests/resample/test_base.py | 13 ++++++++----- 2 files changed, 10 insertions(+), 26 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 45833a882fc0f..20dfb3e8fddd8 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -282,27 +282,8 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) # see see test_groupby.test_basic result = self._aggregate_maybe_named(func, *args, **kwargs) - index = Index(sorted(result), name=self.grouper.names[0]) - if isinstance(index, (DatetimeIndex, TimedeltaIndex)): - # TODO: do we _always_ want to do this? - # shouldnt this be done later in eg _wrap_aggregated_output? - index = index._with_freq("infer") - - result_index = self.grouper.result_index - - if ( - result_index.dtype == index.dtype - and result_index.freq is not None - and index.freq is None - ): - # TODO: will dtype equality always hold? - if len(index) == 1: - index.freq = result_index.freq - - elif len(index) == 2: - if index[0] + result_index.freq == index[1]: - # infer_freq doesn't handle length-2 indexes - index.freq = result_index.freq + index = self.grouper.result_index + assert index.name == self.grouper.names[0] ret = create_series_with_explicit_dtype( result, index=index, dtype_if_empty=object diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py index 28d33ebb23c20..5827b1f456bd7 100644 --- a/pandas/tests/resample/test_base.py +++ b/pandas/tests/resample/test_base.py @@ -195,14 +195,17 @@ def test_resample_empty_dtypes(index, dtype, resample_method): @all_ts -def test_apply_to_empty_series(empty_series_dti): +@pytest.mark.parametrize("freq", ["M", "D", "H"]) +def test_apply_to_empty_series(empty_series_dti, freq): # GH 14313 s = empty_series_dti - for freq in ["M", "D", "H"]: - result = s.resample(freq).apply(lambda x: 1) - expected = s.resample(freq).apply(np.sum) - tm.assert_series_equal(result, expected, check_dtype=False) + result = s.resample(freq).apply(lambda x: 1) + expected = s.resample(freq).apply(np.sum) + + assert result.index.dtype == expected.index.dtype + + tm.assert_series_equal(result, expected, check_dtype=False) @all_ts From ba48381784fb7fe74c7042ded640661ede9a21e7 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 25 Jun 2020 12:35:37 -0700 Subject: [PATCH 10/71] REF: dont set ndarray.data in libreduction --- pandas/_libs/reduction.pyx | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 8161b5c5c2b11..84f53e0b061eb 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -296,7 +296,7 @@ cdef class Slider: Only handles contiguous data for now """ cdef: - ndarray values, buf + ndarray values, buf, orig_buf Py_ssize_t stride, orig_len, orig_stride char *orig_data @@ -308,6 +308,7 @@ cdef class Slider: values = values.copy() self.values = values + 
self.orig_buf = buf self.buf = buf self.stride = values.strides[0] @@ -315,21 +316,14 @@ cdef class Slider: self.orig_len = self.buf.shape[0] self.orig_stride = self.buf.strides[0] - self.buf.data = self.values.data - self.buf.strides[0] = self.stride - cdef move(self, int start, int end): """ For slicing """ - self.buf.data = self.values.data + self.stride * start - self.buf.shape[0] = end - start + self.buf = self.values[start:end] cdef reset(self): - - self.buf.shape[0] = self.orig_len - self.buf.data = self.orig_data - self.buf.strides[0] = self.orig_stride + self.buf = self.orig_buf class InvalidApply(Exception): From e52db7dd031243a3c56219e2aef63e36804105ae Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 22 Jul 2020 09:30:04 -0700 Subject: [PATCH 11/71] less test failures --- pandas/_libs/reduction.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 84f53e0b061eb..09cbbe0602319 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -51,7 +51,7 @@ cdef class _BaseGrouper: # See the comment in indexes/base.py about _index_data. # We need this for EA-backed indexes that have a reference # to a 1-d ndarray like datetime / timedelta / period. - object.__setattr__(cached_ityp, '_index_data', islider.buf) + object.__setattr__(cached_ityp, '_data', islider.buf) cached_ityp._engine.clear_mapping() cached_ityp._cache.clear() # e.g. inferred_freq must go object.__setattr__(cached_typ._mgr._block, 'values', vslider.buf) @@ -353,7 +353,7 @@ def apply_frame_axis0(object frame, object f, object names, slider.move(starts[i], ends[i]) item_cache.clear() # ugh - chunk = slider.dummy + chunk = slider.frame[starts[i]:ends[i]] object.__setattr__(chunk, 'name', names[i]) try: From 972359fd6543f6d6d0e33a18c5ea5006fe99958c Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 18 Aug 2020 15:49:09 -0700 Subject: [PATCH 12/71] port solution from #35417 --- pandas/_libs/reduction.pyx | 37 +++++++++++++++++++++++++++++-------- 1 file changed, 29 insertions(+), 8 deletions(-) diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 09cbbe0602319..4c7664833d967 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -400,7 +400,8 @@ cdef class BlockSlider: object frame, dummy, index int nblocks Slider idx_slider - list blocks + list blocks, blk_values + ndarray orig_blklocs, orig_blknos cdef: char **base_ptrs @@ -414,20 +415,27 @@ cdef class BlockSlider: self.dummy = frame[:0] self.index = self.dummy.index - self.blocks = [b.values for b in self.dummy._mgr.blocks] + # GH#35417 attributes we need to restore at each step in case + # the function modified them. + mgr = self.dummy._mgr + self.orig_blklocs = mgr.blklocs + self.orig_blknos = mgr.blknos + self.blocks = [x for x in self.dummy._mgr.blocks] - for x in self.blocks: + self.blk_values = [b.values for b in self.dummy._mgr.blocks] + + for x in self.blk_values: util.set_array_not_contiguous(x) - self.nblocks = len(self.blocks) + self.nblocks = len(self.blk_values) # See the comment in indexes/base.py about _index_data. # We need this for EA-backed indexes that have a reference to a 1-d # ndarray like datetime / timedelta / period. 
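# A minimal sketch (an aside, not part of this diff) of the view-based
# move()/reset() that the Slider changes above switch to: numpy basic
# slicing returns a view, so "moving" the buffer is just re-slicing the
# parent array, with no raw-pointer surgery to undo afterwards.
import numpy as np

values = np.arange(10)
orig_buf = values[:0]   # initial buffer
buf = values[3:6]       # move(3, 6): a view sharing memory with values
buf[0] = 99
print(values[3])        # 99 -- confirms buf is a view, not a copy
buf = orig_buf          # reset()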
self.idx_slider = Slider( self.frame.index._index_data, self.dummy.index._index_data) - self.base_ptrs = malloc(sizeof(char*) * len(self.blocks)) - for i, block in enumerate(self.blocks): + self.base_ptrs = malloc(sizeof(char*) * len(self.blk_values)) + for i, block in enumerate(self.blk_values): self.base_ptrs[i] = (block).data def __dealloc__(self): @@ -438,9 +446,11 @@ cdef class BlockSlider: ndarray arr Py_ssize_t i + self._restore_blocks() + # move blocks for i in range(self.nblocks): - arr = self.blocks[i] + arr = self.blk_values[i] # axis=1 is the frame's axis=0 arr.data = self.base_ptrs[i] + arr.strides[1] * start @@ -453,14 +463,25 @@ cdef class BlockSlider: self.index._engine.clear_mapping() self.index._cache.clear() # e.g. inferred_freq must go + cdef _restore_blocks(self): + """ + Ensure that we have the original blocks, blknos, and blklocs. + """ + mgr = self.dummy._mgr + mgr.blocks = self.blocks + mgr._blklocs = self.orig_blklocs + mgr._blknos = self.orig_blknos + cdef reset(self): cdef: ndarray arr Py_ssize_t i + self._restore_blocks() + # reset blocks for i in range(self.nblocks): - arr = self.blocks[i] + arr = self.blk_values[i] # axis=1 is the frame's axis=0 arr.data = self.base_ptrs[i] From 28f6ca58cae7ecf6ba3cf26e778e52e720667ae0 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 31 Aug 2020 19:54:45 -0700 Subject: [PATCH 13/71] dont pass Series with DTI to SeriesGrouper --- pandas/core/groupby/ops.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 4ce81ac00ddd6..0688d90a1019d 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -45,7 +45,14 @@ from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame from pandas.core.groupby import base, grouper -from pandas.core.indexes.api import Index, MultiIndex, RangeIndex, ensure_index +from pandas.core.indexes.api import ( + DatetimeIndex, + Index, + MultiIndex, + RangeIndex, + TimedeltaIndex, + ensure_index, +) from pandas.core.series import Series from pandas.core.sorting import ( compress_group_index, @@ -616,7 +623,9 @@ def agg_series(self, obj: Series, func: F, *args, **kwargs): # TODO: can we get a performant workaround for EAs backed by ndarray? 
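# A minimal sketch (an aside, not part of this diff) of the GH#35417
# snapshot/restore done by _restore_blocks above, written against the
# private BlockManager attributes the hunk itself uses; pandas 1.1
# internals assumed, data hypothetical. The applied UDF can consolidate or
# reorder the dummy frame's blocks, so they are re-pinned at every step.
import pandas as pd

dummy = pd.DataFrame({"a": [1, 2], "b": [3.0, 4.0]})
mgr = dummy._mgr
orig_blklocs, orig_blknos = mgr.blklocs, mgr.blknos  # snapshot once
orig_blocks = list(mgr.blocks)
# ... the user function may have mutated mgr in place here ...
mgr.blocks = orig_blocks       # restore before the next slide
mgr._blklocs = orig_blklocs
mgr._blknos = orig_blknos
print(len(mgr.blocks))         # 2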
return self._aggregate_series_pure_python(obj, func) - elif obj.index._has_complex_internals or isinstance(obj.index, RangeIndex): + elif obj.index._has_complex_internals or isinstance( + obj.index, (RangeIndex, DatetimeIndex, TimedeltaIndex) + ): # Preempt TypeError in _aggregate_series_fast # exclude RangeIndex because patching it in libreduction would # silently be incorrect From 0aa2a54ddabbb01e0f0178cd1b3ce960a19683cb Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 4 Sep 2020 19:51:04 -0700 Subject: [PATCH 14/71] De-privatize (#36130) --- pandas/core/dtypes/dtypes.py | 4 +-- pandas/core/indexes/datetimes.py | 4 +-- pandas/core/indexing.py | 4 +-- pandas/core/util/hashing.py | 8 ++--- pandas/io/formats/format.py | 4 +-- pandas/io/formats/style.py | 20 ++++++------- pandas/plotting/_matplotlib/core.py | 29 +++++++++---------- pandas/plotting/_matplotlib/timeseries.py | 10 +++---- .../tests/indexing/multiindex/test_slice.py | 4 +-- pandas/tests/indexing/test_indexing.py | 12 ++++---- 10 files changed, 48 insertions(+), 51 deletions(-) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 8dc500dddeafa..e321fdd9b3a9b 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -395,7 +395,7 @@ def _hash_categories(categories, ordered: Ordered = True) -> int: from pandas.core.dtypes.common import DT64NS_DTYPE, is_datetime64tz_dtype from pandas.core.util.hashing import ( - _combine_hash_arrays, + combine_hash_arrays, hash_array, hash_tuples, ) @@ -427,7 +427,7 @@ def _hash_categories(categories, ordered: Ordered = True) -> int: ) else: cat_array = [cat_array] - hashed = _combine_hash_arrays(iter(cat_array), num_items=len(cat_array)) + hashed = combine_hash_arrays(iter(cat_array), num_items=len(cat_array)) return np.bitwise_xor.reduce(hashed) @classmethod diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 6dcb9250812d0..3fd93a8159041 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -354,9 +354,9 @@ def _mpl_repr(self): @property def _formatter_func(self): - from pandas.io.formats.format import _get_format_datetime64 + from pandas.io.formats.format import get_format_datetime64 - formatter = _get_format_datetime64(is_dates_only=self._is_dates_only) + formatter = get_format_datetime64(is_dates_only=self._is_dates_only) return lambda x: f"'{formatter(x, tz=self.tz)}'" # -------------------------------------------------------------------- diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index cfb17b9498a36..fe2fec1c52063 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -2291,7 +2291,7 @@ def need_slice(obj) -> bool: ) -def _non_reducing_slice(slice_): +def non_reducing_slice(slice_): """ Ensure that a slice doesn't reduce to a Series or Scalar. @@ -2330,7 +2330,7 @@ def pred(part) -> bool: return tuple(slice_) -def _maybe_numeric_slice(df, slice_, include_bool=False): +def maybe_numeric_slice(df, slice_, include_bool: bool = False): """ Want nice defaults for background_gradient that don't break with non-numeric data. But if slice_ is passed go with that. 
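A minimal sketch (an aside, not part of this patch) of the two slicing
helpers renamed above, assuming the post-rename import path; data is
hypothetical. non_reducing_slice wraps a key so .loc keeps a DataFrame
instead of reducing to a Series, which is what Styler relies on:

    import pandas as pd
    from pandas.core.indexing import maybe_numeric_slice, non_reducing_slice

    df = pd.DataFrame({"A": [1, 2], "B": ["x", "y"]})
    print(df.loc[non_reducing_slice(0)])         # one-row DataFrame, not a Series
    print(maybe_numeric_slice(df, slice_=None))  # defaults to the numeric columns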
diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index d79b9f4092325..df082c7285ae8 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -24,7 +24,7 @@ _default_hash_key = "0123456789123456" -def _combine_hash_arrays(arrays, num_items: int): +def combine_hash_arrays(arrays, num_items: int): """ Parameters ---------- @@ -108,7 +108,7 @@ def hash_pandas_object( for _ in [None] ) arrays = itertools.chain([h], index_iter) - h = _combine_hash_arrays(arrays, 2) + h = combine_hash_arrays(arrays, 2) h = Series(h, index=obj.index, dtype="uint64", copy=False) @@ -131,7 +131,7 @@ def hash_pandas_object( # keep `hashes` specifically a generator to keep mypy happy _hashes = itertools.chain(hashes, index_hash_generator) hashes = (x for x in _hashes) - h = _combine_hash_arrays(hashes, num_items) + h = combine_hash_arrays(hashes, num_items) h = Series(h, index=obj.index, dtype="uint64", copy=False) else: @@ -175,7 +175,7 @@ def hash_tuples(vals, encoding="utf8", hash_key: str = _default_hash_key): hashes = ( _hash_categorical(cat, encoding=encoding, hash_key=hash_key) for cat in vals ) - h = _combine_hash_arrays(hashes, len(vals)) + h = combine_hash_arrays(hashes, len(vals)) if is_tuple: h = h[0] diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 3d441f6e737bc..3dc4290953360 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1624,7 +1624,7 @@ def _format_datetime64_dateonly( return x._date_repr -def _get_format_datetime64( +def get_format_datetime64( is_dates_only: bool, nat_rep: str = "NaT", date_format: None = None ) -> Callable: @@ -1656,7 +1656,7 @@ def _format_strings(self) -> List[str]: """ we by definition have a TZ """ values = self.values.astype(object) is_dates_only = _is_dates_only(values) - formatter = self.formatter or _get_format_datetime64( + formatter = self.formatter or get_format_datetime64( is_dates_only, date_format=self.date_format ) fmt_values = [formatter(x) for x in values] diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 3bbb5271bce61..023557dd6494d 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -36,7 +36,7 @@ import pandas.core.common as com from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame -from pandas.core.indexing import _maybe_numeric_slice, _non_reducing_slice +from pandas.core.indexing import maybe_numeric_slice, non_reducing_slice jinja2 = import_optional_dependency("jinja2", extra="DataFrame.style requires jinja2.") @@ -475,7 +475,7 @@ def format(self, formatter, subset=None, na_rep: Optional[str] = None) -> "Style row_locs = range(len(self.data)) col_locs = range(len(self.data.columns)) else: - subset = _non_reducing_slice(subset) + subset = non_reducing_slice(subset) if len(subset) == 1: subset = subset, self.data.columns @@ -633,7 +633,7 @@ def _apply( **kwargs, ) -> "Styler": subset = slice(None) if subset is None else subset - subset = _non_reducing_slice(subset) + subset = non_reducing_slice(subset) data = self.data.loc[subset] if axis is not None: result = data.apply(func, axis=axis, result_type="expand", **kwargs) @@ -725,7 +725,7 @@ def _applymap(self, func: Callable, subset=None, **kwargs) -> "Styler": func = partial(func, **kwargs) # applymap doesn't take kwargs? 
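# A minimal sketch (an aside, not part of this diff) of the hashing helper
# made public above, using its post-rename import path; inputs are
# hypothetical. combine_hash_arrays folds several uint64 hash streams into
# one, which is how hash_pandas_object mixes index and column hashes.
import numpy as np
from pandas.core.util.hashing import combine_hash_arrays, hash_array

h1 = hash_array(np.array([1, 2, 3]))
h2 = hash_array(np.array(["x", "y", "z"], dtype=object))
combined = combine_hash_arrays(iter([h1, h2]), num_items=2)
print(combined.dtype, combined.shape)  # uint64 (3,)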
if subset is None: subset = pd.IndexSlice[:] - subset = _non_reducing_slice(subset) + subset = non_reducing_slice(subset) result = self.data.loc[subset].applymap(func) self._update_ctx(result) return self @@ -985,7 +985,7 @@ def hide_columns(self, subset) -> "Styler": ------- self : Styler """ - subset = _non_reducing_slice(subset) + subset = non_reducing_slice(subset) hidden_df = self.data.loc[subset] self.hidden_columns = self.columns.get_indexer_for(hidden_df.columns) return self @@ -1087,8 +1087,8 @@ def background_gradient( of the data is extended by ``low * (x.max() - x.min())`` and ``high * (x.max() - x.min())`` before normalizing. """ - subset = _maybe_numeric_slice(self.data, subset) - subset = _non_reducing_slice(subset) + subset = maybe_numeric_slice(self.data, subset) + subset = non_reducing_slice(subset) self.apply( self._background_gradient, cmap=cmap, @@ -1322,8 +1322,8 @@ def bar( "(eg: color=['#d65f5f', '#5fba7d'])" ) - subset = _maybe_numeric_slice(self.data, subset) - subset = _non_reducing_slice(subset) + subset = maybe_numeric_slice(self.data, subset) + subset = non_reducing_slice(subset) self.apply( self._bar, subset=subset, @@ -1390,7 +1390,7 @@ def _highlight_handler( axis: Optional[Axis] = None, max_: bool = True, ) -> "Styler": - subset = _non_reducing_slice(_maybe_numeric_slice(self.data, subset)) + subset = non_reducing_slice(maybe_numeric_slice(self.data, subset)) self.apply( self._highlight_extrema, color=color, axis=axis, subset=subset, max_=max_ ) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 147e4efd74bc3..c1ba7881165f1 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -33,6 +33,13 @@ from pandas.plotting._matplotlib.compat import _mpl_ge_3_0_0 from pandas.plotting._matplotlib.converter import register_pandas_matplotlib_converters from pandas.plotting._matplotlib.style import get_standard_colors +from pandas.plotting._matplotlib.timeseries import ( + decorate_axes, + format_dateaxis, + maybe_convert_index, + maybe_resample, + use_dynamic_x, +) from pandas.plotting._matplotlib.tools import ( create_subplots, flatten_axes, @@ -1074,15 +1081,11 @@ def _is_ts_plot(self) -> bool: return not self.x_compat and self.use_index and self._use_dynamic_x() def _use_dynamic_x(self): - from pandas.plotting._matplotlib.timeseries import _use_dynamic_x - - return _use_dynamic_x(self._get_ax(0), self.data) + return use_dynamic_x(self._get_ax(0), self.data) def _make_plot(self): if self._is_ts_plot(): - from pandas.plotting._matplotlib.timeseries import _maybe_convert_index - - data = _maybe_convert_index(self._get_ax(0), self.data) + data = maybe_convert_index(self._get_ax(0), self.data) x = data.index # dummy, not used plotf = self._ts_plot @@ -1142,24 +1145,18 @@ def _plot( @classmethod def _ts_plot(cls, ax: "Axes", x, data, style=None, **kwds): - from pandas.plotting._matplotlib.timeseries import ( - _decorate_axes, - _maybe_resample, - format_dateaxis, - ) - # accept x to be consistent with normal plot func, # x is not passed to tsplot as it uses data.index as x coordinate # column_num must be in kwds for stacking purpose - freq, data = _maybe_resample(data, ax, kwds) + freq, data = maybe_resample(data, ax, kwds) # Set ax with freq info - _decorate_axes(ax, freq, kwds) + decorate_axes(ax, freq, kwds) # digging deeper if hasattr(ax, "left_ax"): - _decorate_axes(ax.left_ax, freq, kwds) + decorate_axes(ax.left_ax, freq, kwds) if hasattr(ax, "right_ax"): - _decorate_axes(ax.right_ax, 
freq, kwds) + decorate_axes(ax.right_ax, freq, kwds) ax._plot_data.append((data, cls._kind, kwds)) lines = cls._plot(ax, data.index, data.values, style=style, **kwds) diff --git a/pandas/plotting/_matplotlib/timeseries.py b/pandas/plotting/_matplotlib/timeseries.py index fd89a093d25a4..f8faac6a6a026 100644 --- a/pandas/plotting/_matplotlib/timeseries.py +++ b/pandas/plotting/_matplotlib/timeseries.py @@ -32,7 +32,7 @@ # Plotting functions and monkey patches -def _maybe_resample(series: "Series", ax: "Axes", kwargs): +def maybe_resample(series: "Series", ax: "Axes", kwargs): # resample against axes freq if necessary freq, ax_freq = _get_freq(ax, series) @@ -105,7 +105,7 @@ def _replot_ax(ax: "Axes", freq, kwargs): ax._plot_data = [] ax.clear() - _decorate_axes(ax, freq, kwargs) + decorate_axes(ax, freq, kwargs) lines = [] labels = [] @@ -128,7 +128,7 @@ def _replot_ax(ax: "Axes", freq, kwargs): return lines, labels -def _decorate_axes(ax: "Axes", freq, kwargs): +def decorate_axes(ax: "Axes", freq, kwargs): """Initialize axes for time-series plotting""" if not hasattr(ax, "_plot_data"): ax._plot_data = [] @@ -193,7 +193,7 @@ def _get_freq(ax: "Axes", series: "Series"): return freq, ax_freq -def _use_dynamic_x(ax: "Axes", data: FrameOrSeriesUnion) -> bool: +def use_dynamic_x(ax: "Axes", data: FrameOrSeriesUnion) -> bool: freq = _get_index_freq(data.index) ax_freq = _get_ax_freq(ax) @@ -235,7 +235,7 @@ def _get_index_freq(index: "Index") -> Optional[BaseOffset]: return freq -def _maybe_convert_index(ax: "Axes", data): +def maybe_convert_index(ax: "Axes", data): # tsplot converts automatically, but don't want to convert index # over and over for DataFrames if isinstance(data.index, (ABCDatetimeIndex, ABCPeriodIndex)): diff --git a/pandas/tests/indexing/multiindex/test_slice.py b/pandas/tests/indexing/multiindex/test_slice.py index 532bb4f2e6dac..ec0391a2ccc26 100644 --- a/pandas/tests/indexing/multiindex/test_slice.py +++ b/pandas/tests/indexing/multiindex/test_slice.py @@ -6,7 +6,7 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, Timestamp import pandas._testing as tm -from pandas.core.indexing import _non_reducing_slice +from pandas.core.indexing import non_reducing_slice from pandas.tests.indexing.common import _mklbl @@ -739,7 +739,7 @@ def test_non_reducing_slice_on_multiindex(self): df = pd.DataFrame(dic, index=[0, 1]) idx = pd.IndexSlice slice_ = idx[:, idx["b", "d"]] - tslice_ = _non_reducing_slice(slice_) + tslice_ = non_reducing_slice(slice_) result = df.loc[tslice_] expected = pd.DataFrame({("b", "d"): [4, 1]}) diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 5b7f013d5de31..a080c5d169215 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -12,7 +12,7 @@ import pandas as pd from pandas import DataFrame, Index, NaT, Series import pandas._testing as tm -from pandas.core.indexing import _maybe_numeric_slice, _non_reducing_slice +from pandas.core.indexing import maybe_numeric_slice, non_reducing_slice from pandas.tests.indexing.common import _mklbl # ------------------------------------------------------------------------ @@ -822,7 +822,7 @@ def test_range_in_series_indexing(self, size): def test_non_reducing_slice(self, slc): df = DataFrame([[0, 1], [2, 3]]) - tslice_ = _non_reducing_slice(slc) + tslice_ = non_reducing_slice(slc) assert isinstance(df.loc[tslice_], DataFrame) def test_list_slice(self): @@ -831,18 +831,18 @@ def test_list_slice(self): df = 
DataFrame({"A": [1, 2], "B": [3, 4]}, index=["A", "B"]) expected = pd.IndexSlice[:, ["A"]] for subset in slices: - result = _non_reducing_slice(subset) + result = non_reducing_slice(subset) tm.assert_frame_equal(df.loc[result], df.loc[expected]) def test_maybe_numeric_slice(self): df = DataFrame({"A": [1, 2], "B": ["c", "d"], "C": [True, False]}) - result = _maybe_numeric_slice(df, slice_=None) + result = maybe_numeric_slice(df, slice_=None) expected = pd.IndexSlice[:, ["A"]] assert result == expected - result = _maybe_numeric_slice(df, None, include_bool=True) + result = maybe_numeric_slice(df, None, include_bool=True) expected = pd.IndexSlice[:, ["A", "C"]] - result = _maybe_numeric_slice(df, [1]) + result = maybe_numeric_slice(df, [1]) expected = [1] assert result == expected From 0164b8a85366cfc8b696dc0b41a5da11ca5995e2 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 5 Sep 2020 03:56:30 +0100 Subject: [PATCH 15/71] TYP: misc fixes for numpy types (#36098) --- pandas/_typing.py | 2 +- pandas/core/algorithms.py | 7 +++---- pandas/core/arrays/categorical.py | 2 +- pandas/core/construction.py | 6 ++++-- pandas/core/dtypes/cast.py | 4 ++-- 5 files changed, 11 insertions(+), 10 deletions(-) diff --git a/pandas/_typing.py b/pandas/_typing.py index f8af92e07c674..74bfc9134c3af 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -62,7 +62,7 @@ # other Dtype = Union[ - "ExtensionDtype", str, np.dtype, Type[Union[str, float, int, complex, bool]] + "ExtensionDtype", str, np.dtype, Type[Union[str, float, int, complex, bool, object]] ] DtypeObj = Union[np.dtype, "ExtensionDtype"] FilePathOrBuffer = Union[str, Path, IO[AnyStr], IOBase] diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 9d75d21c5637a..f297c7165208f 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -6,7 +6,7 @@ import operator from textwrap import dedent -from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union +from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union, cast from warnings import catch_warnings, simplefilter, warn import numpy as np @@ -60,7 +60,7 @@ from pandas.core.indexers import validate_indices if TYPE_CHECKING: - from pandas import DataFrame, Series + from pandas import Categorical, DataFrame, Series _shared_docs: Dict[str, str] = {} @@ -429,8 +429,7 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: if is_categorical_dtype(comps): # TODO(extension) # handle categoricals - # error: "ExtensionArray" has no attribute "isin" [attr-defined] - return comps.isin(values) # type: ignore[attr-defined] + return cast("Categorical", comps).isin(values) comps, dtype = _ensure_data(comps) values, _ = _ensure_data(values, dtype=dtype) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 27b1afdb438cb..ec85ec47d625c 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2316,7 +2316,7 @@ def _concat_same_type(self, to_concat): return union_categoricals(to_concat) - def isin(self, values): + def isin(self, values) -> np.ndarray: """ Check whether `values` are contained in Categorical. 
diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 02b8ed17244cd..9d6c2789af25b 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -335,7 +335,7 @@ def array( return result -def extract_array(obj, extract_numpy: bool = False): +def extract_array(obj: AnyArrayLike, extract_numpy: bool = False) -> ArrayLike: """ Extract the ndarray or ExtensionArray from a Series or Index. @@ -383,7 +383,9 @@ def extract_array(obj, extract_numpy: bool = False): if extract_numpy and isinstance(obj, ABCPandasArray): obj = obj.to_numpy() - return obj + # error: Incompatible return value type (got "Index", expected "ExtensionArray") + # error: Incompatible return value type (got "Series", expected "ExtensionArray") + return obj # type: ignore[return-value] def sanitize_array( diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 1489e08d82bf0..7c5aafcbbc7e9 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1488,7 +1488,7 @@ def find_common_type(types: List[DtypeObj]) -> DtypeObj: if has_bools: for t in types: if is_integer_dtype(t) or is_float_dtype(t) or is_complex_dtype(t): - return object + return np.dtype("object") return np.find_common_type(types, []) @@ -1550,7 +1550,7 @@ def construct_1d_arraylike_from_scalar( elif isinstance(dtype, np.dtype) and dtype.kind in ("U", "S"): # we need to coerce to object dtype to avoid # to allow numpy to take our string as a scalar value - dtype = object + dtype = np.dtype("object") if not isna(value): value = ensure_str(value) From 48b5847f71c0e8935ae1fb3e78bdeaa47871461d Mon Sep 17 00:00:00 2001 From: Jonathan Shreckengost Date: Fri, 4 Sep 2020 23:10:49 -0400 Subject: [PATCH 16/71] Comma cleanup (#36082) --- .../tests/indexes/datetimes/test_datetime.py | 2 +- .../tests/indexes/datetimes/test_timezones.py | 2 +- .../tests/indexes/multi/test_constructors.py | 6 +++--- pandas/tests/indexes/multi/test_isin.py | 2 +- pandas/tests/indexes/test_base.py | 2 +- .../indexes/timedeltas/test_scalar_compat.py | 8 ++++---- .../indexes/timedeltas/test_searchsorted.py | 2 +- pandas/tests/indexing/common.py | 4 +--- pandas/tests/indexing/test_callable.py | 18 ++++++------------ pandas/tests/indexing/test_check_indexer.py | 8 +++----- pandas/tests/indexing/test_coercion.py | 2 +- pandas/tests/indexing/test_floats.py | 14 ++++---------- 12 files changed, 27 insertions(+), 43 deletions(-) diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index 7bb1d98086a91..8e2ac4feb7ded 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -51,7 +51,7 @@ def test_reindex_with_same_tz(self): "2010-01-02 00:00:00", ] expected1 = DatetimeIndex( - expected_list1, dtype="datetime64[ns, UTC]", freq=None, + expected_list1, dtype="datetime64[ns, UTC]", freq=None ) expected2 = np.array([0] + [-1] * 21 + [23], dtype=np.dtype("intp")) tm.assert_index_equal(result1, expected1) diff --git a/pandas/tests/indexes/datetimes/test_timezones.py b/pandas/tests/indexes/datetimes/test_timezones.py index ea68e8759c123..233835bb4b5f7 100644 --- a/pandas/tests/indexes/datetimes/test_timezones.py +++ b/pandas/tests/indexes/datetimes/test_timezones.py @@ -799,7 +799,7 @@ def test_dti_from_tzaware_datetime(self, tz): @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) def test_dti_tz_constructors(self, tzstr): - """ Test different DatetimeIndex constructions with timezone + """Test different 
DatetimeIndex constructions with timezone Follow-up of GH#4229 """ arr = ["11/10/2005 08:00:00", "11/10/2005 09:00:00"] diff --git a/pandas/tests/indexes/multi/test_constructors.py b/pandas/tests/indexes/multi/test_constructors.py index 1157c7f8bb962..16af884c89e9e 100644 --- a/pandas/tests/indexes/multi/test_constructors.py +++ b/pandas/tests/indexes/multi/test_constructors.py @@ -741,18 +741,18 @@ def test_raise_invalid_sortorder(): with pytest.raises(ValueError, match=r".* sortorder 2 with lexsort_depth 1.*"): MultiIndex( - levels=levels, codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 2, 1]], sortorder=2, + levels=levels, codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 2, 1]], sortorder=2 ) with pytest.raises(ValueError, match=r".* sortorder 1 with lexsort_depth 0.*"): MultiIndex( - levels=levels, codes=[[0, 0, 1, 0, 1, 1], [0, 1, 0, 2, 2, 1]], sortorder=1, + levels=levels, codes=[[0, 0, 1, 0, 1, 1], [0, 1, 0, 2, 2, 1]], sortorder=1 ) def test_datetimeindex(): idx1 = pd.DatetimeIndex( - ["2013-04-01 9:00", "2013-04-02 9:00", "2013-04-03 9:00"] * 2, tz="Asia/Tokyo", + ["2013-04-01 9:00", "2013-04-02 9:00", "2013-04-03 9:00"] * 2, tz="Asia/Tokyo" ) idx2 = pd.date_range("2010/01/01", periods=6, freq="M", tz="US/Eastern") idx = MultiIndex.from_arrays([idx1, idx2]) diff --git a/pandas/tests/indexes/multi/test_isin.py b/pandas/tests/indexes/multi/test_isin.py index 122263e6ec198..b369b9a50954e 100644 --- a/pandas/tests/indexes/multi/test_isin.py +++ b/pandas/tests/indexes/multi/test_isin.py @@ -78,7 +78,7 @@ def test_isin_level_kwarg(): @pytest.mark.parametrize( "labels,expected,level", [ - ([("b", np.nan)], np.array([False, False, True]), None,), + ([("b", np.nan)], np.array([False, False, True]), None), ([np.nan, "a"], np.array([True, True, False]), 0), (["d", np.nan], np.array([False, True, True]), 1), ], diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index aee4b16621b4d..7720db9d98ebf 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -2426,7 +2426,7 @@ def test_index_with_tuple_bool(self): # TODO: remove tupleize_cols=False once correct behaviour is restored # TODO: also this op right now produces FutureWarning from numpy idx = Index([("a", "b"), ("b", "c"), ("c", "a")], tupleize_cols=False) - result = idx == ("c", "a",) + result = idx == ("c", "a") expected = np.array([False, False, True]) tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/indexes/timedeltas/test_scalar_compat.py b/pandas/tests/indexes/timedeltas/test_scalar_compat.py index 16c19b8d00380..6a2238d90b590 100644 --- a/pandas/tests/indexes/timedeltas/test_scalar_compat.py +++ b/pandas/tests/indexes/timedeltas/test_scalar_compat.py @@ -104,18 +104,18 @@ def test_round(self): "L", t1a, TimedeltaIndex( - ["-1 days +00:00:00", "-2 days +23:58:58", "-2 days +23:57:56"], + ["-1 days +00:00:00", "-2 days +23:58:58", "-2 days +23:57:56"] ), ), ( "S", t1a, TimedeltaIndex( - ["-1 days +00:00:00", "-2 days +23:58:58", "-2 days +23:57:56"], + ["-1 days +00:00:00", "-2 days +23:58:58", "-2 days +23:57:56"] ), ), - ("12T", t1c, TimedeltaIndex(["-1 days", "-1 days", "-1 days"],),), - ("H", t1c, TimedeltaIndex(["-1 days", "-1 days", "-1 days"],),), + ("12T", t1c, TimedeltaIndex(["-1 days", "-1 days", "-1 days"])), + ("H", t1c, TimedeltaIndex(["-1 days", "-1 days", "-1 days"])), ("d", t1c, TimedeltaIndex([-1, -1, -1], unit="D")), ]: diff --git a/pandas/tests/indexes/timedeltas/test_searchsorted.py b/pandas/tests/indexes/timedeltas/test_searchsorted.py index 
4806a9acff96f..3cf45931cf6b7 100644 --- a/pandas/tests/indexes/timedeltas/test_searchsorted.py +++ b/pandas/tests/indexes/timedeltas/test_searchsorted.py @@ -17,7 +17,7 @@ def test_searchsorted_different_argument_classes(self, klass): tm.assert_numpy_array_equal(result, expected) @pytest.mark.parametrize( - "arg", [[1, 2], ["a", "b"], [Timestamp("2020-01-01", tz="Europe/London")] * 2], + "arg", [[1, 2], ["a", "b"], [Timestamp("2020-01-01", tz="Europe/London")] * 2] ) def test_searchsorted_invalid_argument_dtype(self, arg): idx = TimedeltaIndex(["1 day", "2 days", "3 days"]) diff --git a/pandas/tests/indexing/common.py b/pandas/tests/indexing/common.py index 9cc031001f81c..656d25bec2a6b 100644 --- a/pandas/tests/indexing/common.py +++ b/pandas/tests/indexing/common.py @@ -144,9 +144,7 @@ def check_values(self, f, func, values=False): tm.assert_almost_equal(result, expected) - def check_result( - self, method, key, typs=None, axes=None, fails=None, - ): + def check_result(self, method, key, typs=None, axes=None, fails=None): def _eq(axis, obj, key): """ compare equal for these 2 keys """ axified = _axify(obj, key, axis) diff --git a/pandas/tests/indexing/test_callable.py b/pandas/tests/indexing/test_callable.py index 621417eb38d94..bf51c3e5d1695 100644 --- a/pandas/tests/indexing/test_callable.py +++ b/pandas/tests/indexing/test_callable.py @@ -17,15 +17,11 @@ def test_frame_loc_callable(self): res = df.loc[lambda x: x.A > 2] tm.assert_frame_equal(res, df.loc[df.A > 2]) - res = df.loc[ - lambda x: x.A > 2, - ] # noqa: E231 - tm.assert_frame_equal(res, df.loc[df.A > 2,]) # noqa: E231 + res = df.loc[lambda x: x.A > 2] # noqa: E231 + tm.assert_frame_equal(res, df.loc[df.A > 2]) # noqa: E231 - res = df.loc[ - lambda x: x.A > 2, - ] # noqa: E231 - tm.assert_frame_equal(res, df.loc[df.A > 2,]) # noqa: E231 + res = df.loc[lambda x: x.A > 2] # noqa: E231 + tm.assert_frame_equal(res, df.loc[df.A > 2]) # noqa: E231 res = df.loc[lambda x: x.B == "b", :] tm.assert_frame_equal(res, df.loc[df.B == "b", :]) @@ -94,10 +90,8 @@ def test_frame_loc_callable_labels(self): res = df.loc[lambda x: ["A", "C"]] tm.assert_frame_equal(res, df.loc[["A", "C"]]) - res = df.loc[ - lambda x: ["A", "C"], - ] # noqa: E231 - tm.assert_frame_equal(res, df.loc[["A", "C"],]) # noqa: E231 + res = df.loc[lambda x: ["A", "C"]] # noqa: E231 + tm.assert_frame_equal(res, df.loc[["A", "C"]]) # noqa: E231 res = df.loc[lambda x: ["A", "C"], :] tm.assert_frame_equal(res, df.loc[["A", "C"], :]) diff --git a/pandas/tests/indexing/test_check_indexer.py b/pandas/tests/indexing/test_check_indexer.py index 69d4065234d93..865ecb129cdfa 100644 --- a/pandas/tests/indexing/test_check_indexer.py +++ b/pandas/tests/indexing/test_check_indexer.py @@ -32,7 +32,7 @@ def test_valid_input(indexer, expected): @pytest.mark.parametrize( - "indexer", [[True, False, None], pd.array([True, False, None], dtype="boolean")], + "indexer", [[True, False, None], pd.array([True, False, None], dtype="boolean")] ) def test_boolean_na_returns_indexer(indexer): # https://github.com/pandas-dev/pandas/issues/31503 @@ -61,7 +61,7 @@ def test_bool_raise_length(indexer): @pytest.mark.parametrize( - "indexer", [[0, 1, None], pd.array([0, 1, pd.NA], dtype="Int64")], + "indexer", [[0, 1, None], pd.array([0, 1, pd.NA], dtype="Int64")] ) def test_int_raise_missing_values(indexer): array = np.array([1, 2, 3]) @@ -89,9 +89,7 @@ def test_raise_invalid_array_dtypes(indexer): check_array_indexer(array, indexer) -@pytest.mark.parametrize( - "indexer", [None, Ellipsis, 
slice(0, 3), (None,)], -) +@pytest.mark.parametrize("indexer", [None, Ellipsis, slice(0, 3), (None,)]) def test_pass_through_non_array_likes(indexer): array = np.array([1, 2, 3]) diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index 1c5f00ff754a4..752ecd47fe089 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -87,7 +87,7 @@ def _assert_setitem_series_conversion( # tm.assert_series_equal(temp, expected_series) @pytest.mark.parametrize( - "val,exp_dtype", [(1, object), (1.1, object), (1 + 1j, object), (True, object)], + "val,exp_dtype", [(1, object), (1.1, object), (1 + 1j, object), (True, object)] ) def test_setitem_series_object(self, val, exp_dtype): obj = pd.Series(list("abcd")) diff --git a/pandas/tests/indexing/test_floats.py b/pandas/tests/indexing/test_floats.py index 18b9898e7d800..c48e0a129e161 100644 --- a/pandas/tests/indexing/test_floats.py +++ b/pandas/tests/indexing/test_floats.py @@ -181,9 +181,7 @@ def test_scalar_with_mixed(self): expected = 3 assert result == expected - @pytest.mark.parametrize( - "index_func", [tm.makeIntIndex, tm.makeRangeIndex], - ) + @pytest.mark.parametrize("index_func", [tm.makeIntIndex, tm.makeRangeIndex]) @pytest.mark.parametrize("klass", [Series, DataFrame]) def test_scalar_integer(self, index_func, klass): @@ -405,7 +403,7 @@ def test_slice_integer(self): @pytest.mark.parametrize("l", [slice(2, 4.0), slice(2.0, 4), slice(2.0, 4.0)]) def test_integer_positional_indexing(self, l): - """ make sure that we are raising on positional indexing + """make sure that we are raising on positional indexing w.r.t. an integer index """ s = Series(range(2, 6), index=range(2, 6)) @@ -425,9 +423,7 @@ def test_integer_positional_indexing(self, l): with pytest.raises(TypeError, match=msg): s.iloc[l] - @pytest.mark.parametrize( - "index_func", [tm.makeIntIndex, tm.makeRangeIndex], - ) + @pytest.mark.parametrize("index_func", [tm.makeIntIndex, tm.makeRangeIndex]) def test_slice_integer_frame_getitem(self, index_func): # similar to above, but on the getitem dim (of a DataFrame) @@ -486,9 +482,7 @@ def test_slice_integer_frame_getitem(self, index_func): s[l] @pytest.mark.parametrize("l", [slice(3.0, 4), slice(3, 4.0), slice(3.0, 4.0)]) - @pytest.mark.parametrize( - "index_func", [tm.makeIntIndex, tm.makeRangeIndex], - ) + @pytest.mark.parametrize("index_func", [tm.makeIntIndex, tm.makeRangeIndex]) def test_float_slice_getitem_with_integer_index_raises(self, l, index_func): # similar to above, but on the getitem dim (of a DataFrame) From 4abfaea1a6212162f3059e164ebd7bdcd28c9ae4 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 4 Sep 2020 20:11:39 -0700 Subject: [PATCH 17/71] CLN: remove unused args/kwargs (#36129) --- pandas/core/groupby/generic.py | 1 + pandas/core/groupby/groupby.py | 2 ++ pandas/core/groupby/ops.py | 8 ++++---- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 0a04ac54a0c66..01c186473a065 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1121,6 +1121,7 @@ def blk_func(bvalues: ArrayLike) -> ArrayLike: assert how == "ohlc" raise + # We get here with a) EADtypes and b) object dtype obj: Union[Series, DataFrame] # call our grouper again with only this block if isinstance(bvalues, ExtensionArray): diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 651af2d314251..6ef2e67030881 100644 --- 
a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1012,6 +1012,8 @@ def _agg_general( # raised in _get_cython_function, in some cases can # be trimmed by implementing cython funcs for more dtypes pass + else: + raise # apply a non-cython aggregation result = self.aggregate(lambda x: npfunc(x, axis=self.axis)) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 0688d90a1019d..3ba3c8a0eddc8 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -608,7 +608,7 @@ def _transform( return result - def agg_series(self, obj: Series, func: F, *args, **kwargs): + def agg_series(self, obj: Series, func: F): # Caller is responsible for checking ngroups != 0 assert self.ngroups != 0 @@ -660,7 +660,7 @@ def _aggregate_series_fast(self, obj: Series, func: F): result, counts = grouper.get_result() return result, counts - def _aggregate_series_pure_python(self, obj: Series, func: F, *args, **kwargs): + def _aggregate_series_pure_python(self, obj: Series, func: F): group_index, _, ngroups = self.group_info counts = np.zeros(ngroups, dtype=int) @@ -669,7 +669,7 @@ def _aggregate_series_pure_python(self, obj: Series, func: F, *args, **kwargs): splitter = get_splitter(obj, group_index, ngroups, axis=0) for label, group in splitter: - res = func(group, *args, **kwargs) + res = func(group) if result is None: if isinstance(res, (Series, Index, np.ndarray)): @@ -846,7 +846,7 @@ def groupings(self) -> "List[grouper.Grouping]": for lvl, name in zip(self.levels, self.names) ] - def agg_series(self, obj: Series, func: F, *args, **kwargs): + def agg_series(self, obj: Series, func: F): # Caller is responsible for checking ngroups != 0 assert self.ngroups != 0 assert len(self.bins) > 0 # otherwise we'd get IndexError in get_result From 3b4be0243326f969d8b4227caa1ceb12706d6f07 Mon Sep 17 00:00:00 2001 From: David Kwong Date: Sat, 5 Sep 2020 13:15:03 +1000 Subject: [PATCH 18/71] BUG: Fix DataFrame.groupby().apply() for NaN groups with dropna=False (#35951) --- doc/source/whatsnew/v1.2.0.rst | 3 +- pandas/core/reshape/concat.py | 6 ++- pandas/tests/groupby/test_groupby_dropna.py | 53 +++++++++++++++++++++ 3 files changed, 59 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index e65daa439a225..aa3255e673797 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -214,7 +214,8 @@ Performance improvements Bug fixes ~~~~~~~~~ - +- Bug in :meth:`DataFrameGroupBy.apply` raising error with ``np.nan`` group(s) when ``dropna=False`` (:issue:`35889`) +- Categorical ^^^^^^^^^^^ diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 299b68c6e71e0..9b94dae8556f6 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -11,6 +11,7 @@ from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries +from pandas.core.dtypes.missing import isna from pandas.core.arrays.categorical import ( factorize_from_iterable, @@ -624,10 +625,11 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiInde for hlevel, level in zip(zipped, levels): to_concat = [] for key, index in zip(hlevel, indexes): - mask = level == key + # Find matching codes, include matching nan values as equal. 
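+ # Note: NaN never compares equal to itself (np.nan == np.nan is False),
+ # so the old ``mask = level == key`` produced an all-False mask for the
+ # NaN group that ``groupby(..., dropna=False)`` introduces.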
+ mask = (isna(level) & isna(key)) | (level == key)
if not mask.any():
raise ValueError(f"Key {key} not in level {level}")
- i = np.nonzero(level == key)[0][0]
+ i = np.nonzero(mask)[0][0]
to_concat.append(np.repeat(i, len(index)))
codes_list.append(np.concatenate(to_concat))
diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py
index d1501111cb22b..66db06eeebdfb 100644
--- a/pandas/tests/groupby/test_groupby_dropna.py
+++ b/pandas/tests/groupby/test_groupby_dropna.py
@@ -274,3 +274,56 @@ def test_groupby_dropna_datetime_like_data(
expected = pd.DataFrame({"values": values}, index=pd.Index(indexes, name="dt"))
tm.assert_frame_equal(grouped, expected)
+
+
+@pytest.mark.parametrize(
+ "dropna, data, selected_data, levels",
+ [
+ pytest.param(
+ False,
+ {"groups": ["a", "a", "b", np.nan], "values": [10, 10, 20, 30]},
+ {"values": [0, 1, 0, 0]},
+ ["a", "b", np.nan],
+ id="dropna_false_has_nan",
+ ),
+ pytest.param(
+ True,
+ {"groups": ["a", "a", "b", np.nan], "values": [10, 10, 20, 30]},
+ {"values": [0, 1, 0]},
+ None,
+ id="dropna_true_has_nan",
+ ),
+ pytest.param(
+ # no nan in "groups"; dropna=True|False should be same.
+ False,
+ {"groups": ["a", "a", "b", "c"], "values": [10, 10, 20, 30]},
+ {"values": [0, 1, 0, 0]},
+ None,
+ id="dropna_false_no_nan",
+ ),
+ pytest.param(
+ # no nan in "groups"; dropna=True|False should be same.
+ True,
+ {"groups": ["a", "a", "b", "c"], "values": [10, 10, 20, 30]},
+ {"values": [0, 1, 0, 0]},
+ None,
+ id="dropna_true_no_nan",
+ ),
+ ],
+)
+def test_groupby_apply_with_dropna_for_multi_index(dropna, data, selected_data, levels):
+ # GH 35889
+
+ df = pd.DataFrame(data)
+ gb = df.groupby("groups", dropna=dropna)
+ result = gb.apply(lambda grp: pd.DataFrame({"values": range(len(grp))}))
+
+ mi_tuples = tuple(zip(data["groups"], selected_data["values"]))
+ mi = pd.MultiIndex.from_tuples(mi_tuples, names=["groups", None])
+ # MultiIndex.from_* currently drops NA from the levels, so the NA level
+ # has to be added back manually afterwards.
+ if not dropna and levels:
+ mi = mi.set_levels(levels, level="groups")
+
+ expected = pd.DataFrame(selected_data, index=mi)
+ tm.assert_frame_equal(result, expected)
From fb18f47bce85d6090ad34d9788684387024fb98d Mon Sep 17 00:00:00 2001
From: patrick <61934744+phofl@users.noreply.github.com>
Date: Sat, 5 Sep 2020 05:18:12 +0200
Subject: [PATCH 19/71] Bug 29764 groupby loses index name sometimes (#36121)

---
doc/source/whatsnew/v1.2.0.rst | 1 +
pandas/core/groupby/generic.py | 1 +
pandas/tests/groupby/test_groupby.py | 23 +++++++++++++++++++++
3 files changed, 25 insertions(+)

diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index aa3255e673797..3b252202c14c5 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -312,6 +312,7 @@ Groupby/resample/rolling
- Bug in :meth:`DataFrameGroupby.apply` would drop a :class:`CategoricalIndex` when grouped on. (:issue:`35792`)
- Bug when subsetting columns on a :class:`~pandas.core.groupby.DataFrameGroupBy` (e.g. ``df.groupby('a')[['b']])``) would reset the attributes ``axis``, ``dropna``, ``group_keys``, ``level``, ``mutated``, ``sort``, and ``squeeze`` to their default values.
(:issue:`9959`)
- Bug in :meth:`DataFrameGroupby.tshift` failing to raise ``ValueError`` when a frequency cannot be inferred for the index of a group (:issue:`35937`)
+- Bug in :meth:`DataFrame.groupby` not always maintaining the column index name for ``any``, ``all``, ``bfill``, ``ffill``, ``shift`` (:issue:`29764`)
- Reshaping
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index 01c186473a065..9fda3c7a6b354 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -1732,6 +1732,7 @@ def _wrap_transformed_output(
"""
indexed_output = {key.position: val for key, val in output.items()}
columns = Index(key.label for key in output)
+ columns.name = self.obj.columns.name
result = self.obj._constructor(indexed_output)
result.columns = columns
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index eec9e8064d584..e0196df7ceac0 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -2111,3 +2111,26 @@ def test_subsetting_columns_keeps_attrs(klass, attr, value):
expected = df.groupby("a", **{attr: value})
result = expected[["b"]] if klass is DataFrame else expected["b"]
assert getattr(result, attr) == getattr(expected, attr)
+
+
+@pytest.mark.parametrize("func", ["sum", "any", "shift"])
+def test_groupby_column_index_name_lost(func):
+ # GH: 29764 groupby loses index name sometimes
+ expected = pd.Index(["a"], name="idx")
+ df = pd.DataFrame([[1]], columns=expected)
+ df_grouped = df.groupby([1])
+ result = getattr(df_grouped, func)().columns
+ tm.assert_index_equal(result, expected)
+
+
+@pytest.mark.parametrize("func", ["ffill", "bfill"])
+def test_groupby_column_index_name_lost_fill_funcs(func):
+ # GH: 29764 groupby loses index name sometimes
+ df = pd.DataFrame(
+ [[1, 1.0, -1.0], [1, np.nan, np.nan], [1, 2.0, -2.0]],
+ columns=pd.Index(["type", "a", "b"], name="idx"),
+ )
+ df_grouped = df.groupby(["type"])[["a", "b"]]
+ result = getattr(df_grouped, func)().columns
+ expected = pd.Index(["a", "b"], name="idx")
+ tm.assert_index_equal(result, expected)
From d26090d035616ef2879ecc28d9497c8f64040537 Mon Sep 17 00:00:00 2001
From: Simon Hawkins
Date: Sat, 5 Sep 2020 04:18:59 +0100
Subject: [PATCH 20/71] STY: add code check for use of builtin filter function (#36089)

---
ci/code_checks.sh | 4 ++++
1 file changed, 4 insertions(+)

diff --git a/ci/code_checks.sh b/ci/code_checks.sh
index 2e0f27fefca0b..6006d09bc3e78 100755
--- a/ci/code_checks.sh
+++ b/ci/code_checks.sh
@@ -179,6 +179,10 @@ if [[ -z "$CHECK" || "$CHECK" == "patterns" ]]; then
invgrep -R --include="*.py" -E "super\(\w*, (self|cls)\)" pandas
RET=$(($RET + $?)) ; echo $MSG "DONE"
+ MSG='Check for use of builtin filter function' ; echo $MSG
+ invgrep -R --include="*.py" -P '(?
Date: Fri, 4 Sep 2020 20:21:49 -0700 Subject: [PATCH 21/71] BUG: df.replace with numeric values and str to_replace (#36093) --- doc/source/user_guide/missing_data.rst | 26 ----- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/core/array_algos/replace.py | 95 ++++++++++++++++++ pandas/core/generic.py | 14 --- pandas/core/internals/blocks.py | 27 ++++- pandas/core/internals/managers.py | 104 +------------------- pandas/tests/frame/methods/test_replace.py | 15 ++- pandas/tests/series/methods/test_replace.py | 5 +- 8 files changed, 136 insertions(+), 151 deletions(-) create mode 100644 pandas/core/array_algos/replace.py diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst index 2e68a0598bb71..28206192dd161 100644 --- a/doc/source/user_guide/missing_data.rst +++ b/doc/source/user_guide/missing_data.rst @@ -689,32 +689,6 @@ You can also operate on the DataFrame in place: df.replace(1.5, np.nan, inplace=True) -.. warning:: - - When replacing multiple ``bool`` or ``datetime64`` objects, the first - argument to ``replace`` (``to_replace``) must match the type of the value - being replaced. For example, - - .. code-block:: python - - >>> s = pd.Series([True, False, True]) - >>> s.replace({'a string': 'new value', True: False}) # raises - TypeError: Cannot compare types 'ndarray(dtype=bool)' and 'str' - - will raise a ``TypeError`` because one of the ``dict`` keys is not of the - correct type for replacement. - - However, when replacing a *single* object such as, - - .. ipython:: python - - s = pd.Series([True, False, True]) - s.replace('a string', 'another string') - - the original ``NDFrame`` object will be returned untouched. We're working on - unifying this API, but for backwards compatibility reasons we cannot break - the latter behavior. See :issue:`6354` for more details. - Missing data casting rules and indexing --------------------------------------- diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 3b252202c14c5..8b28a4439e1da 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -339,6 +339,7 @@ ExtensionArray Other ^^^^^ - Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` incorrectly raising ``AssertionError`` instead of ``ValueError`` when invalid parameter combinations are passed (:issue:`36045`) +- Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` with numeric values and string ``to_replace`` (:issue:`34789`) - .. --------------------------------------------------------------------------- diff --git a/pandas/core/array_algos/replace.py b/pandas/core/array_algos/replace.py new file mode 100644 index 0000000000000..6ac3cc1f9f2fe --- /dev/null +++ b/pandas/core/array_algos/replace.py @@ -0,0 +1,95 @@ +""" +Methods used by Block.replace and related methods. +""" +import operator +import re +from typing import Optional, Pattern, Union + +import numpy as np + +from pandas._typing import ArrayLike, Scalar + +from pandas.core.dtypes.common import ( + is_datetimelike_v_numeric, + is_numeric_v_string_like, + is_scalar, +) +from pandas.core.dtypes.missing import isna + + +def compare_or_regex_search( + a: ArrayLike, + b: Union[Scalar, Pattern], + regex: bool = False, + mask: Optional[ArrayLike] = None, +) -> Union[ArrayLike, bool]: + """ + Compare two array_like inputs of the same shape or two scalar values + + Calls operator.eq or re.search, depending on regex argument. If regex is + True, perform an element-wise regex matching. 
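+ With ``regex=True``, elements that are not strings never match.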
+ + Parameters + ---------- + a : array_like + b : scalar or regex pattern + regex : bool, default False + mask : array_like or None (default) + + Returns + ------- + mask : array_like of bool + """ + + def _check_comparison_types( + result: Union[ArrayLike, bool], a: ArrayLike, b: Union[Scalar, Pattern] + ): + """ + Raises an error if the two arrays (a,b) cannot be compared. + Otherwise, returns the comparison result as expected. + """ + if is_scalar(result) and isinstance(a, np.ndarray): + type_names = [type(a).__name__, type(b).__name__] + + if isinstance(a, np.ndarray): + type_names[0] = f"ndarray(dtype={a.dtype})" + + raise TypeError( + f"Cannot compare types {repr(type_names[0])} and {repr(type_names[1])}" + ) + + if not regex: + op = lambda x: operator.eq(x, b) + else: + op = np.vectorize( + lambda x: bool(re.search(b, x)) + if isinstance(x, str) and isinstance(b, (str, Pattern)) + else False + ) + + # GH#32621 use mask to avoid comparing to NAs + if mask is None and isinstance(a, np.ndarray) and not isinstance(b, np.ndarray): + mask = np.reshape(~(isna(a)), a.shape) + if isinstance(a, np.ndarray): + a = a[mask] + + if is_numeric_v_string_like(a, b): + # GH#29553 avoid deprecation warnings from numpy + return np.zeros(a.shape, dtype=bool) + + elif is_datetimelike_v_numeric(a, b): + # GH#29553 avoid deprecation warnings from numpy + _check_comparison_types(False, a, b) + return False + + result = op(a) + + if isinstance(result, np.ndarray) and mask is not None: + # The shape of the mask can differ to that of the result + # since we may compare only a subset of a's or b's elements + tmp = np.zeros(mask.shape, dtype=np.bool_) + tmp[mask] = result + result = tmp + + _check_comparison_types(result, a, b) + return result diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 2af323ccc1dd3..93c945638a174 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6561,20 +6561,6 @@ def replace( 1 new new 2 bait xyz - Note that when replacing multiple ``bool`` or ``datetime64`` objects, - the data types in the `to_replace` parameter must match the data - type of the value being replaced: - - >>> df = pd.DataFrame({{'A': [True, False, True], - ... 'B': [False, True, False]}}) - >>> df.replace({{'a string': 'new value', True: False}}) # raises - Traceback (most recent call last): - ... - TypeError: Cannot compare types 'ndarray(dtype=bool)' and 'str' - - This raises a ``TypeError`` because one of the ``dict`` keys is not of - the correct type for replacement. 
- Compare the behavior of ``s.replace({{'a': None}})`` and ``s.replace('a', None)`` to understand the peculiarities of the `to_replace` parameter: diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index b2305736f9d46..3bcd4debbf41a 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -11,7 +11,7 @@ from pandas._libs.internals import BlockPlacement from pandas._libs.tslibs import conversion from pandas._libs.tslibs.timezones import tz_compare -from pandas._typing import ArrayLike +from pandas._typing import ArrayLike, Scalar from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.cast import ( @@ -59,6 +59,7 @@ from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna, isna_compat import pandas.core.algorithms as algos +from pandas.core.array_algos.replace import compare_or_regex_search from pandas.core.array_algos.transforms import shift from pandas.core.arrays import ( Categorical, @@ -792,7 +793,6 @@ def _replace_list( self, src_list: List[Any], dest_list: List[Any], - masks: List[np.ndarray], inplace: bool = False, regex: bool = False, ) -> List["Block"]: @@ -801,11 +801,28 @@ def _replace_list( """ src_len = len(src_list) - 1 + def comp(s: Scalar, mask: np.ndarray, regex: bool = False) -> np.ndarray: + """ + Generate a bool array by perform an equality check, or perform + an element-wise regular expression matching + """ + if isna(s): + return ~mask + + s = com.maybe_box_datetimelike(s) + return compare_or_regex_search(self.values, s, regex, mask) + + # Calculate the mask once, prior to the call of comp + # in order to avoid repeating the same computations + mask = ~isna(self.values) + + masks = [comp(s, mask, regex) for s in src_list] + rb = [self if inplace else self.copy()] for i, (src, dest) in enumerate(zip(src_list, dest_list)): new_rb: List["Block"] = [] for blk in rb: - m = masks[i][blk.mgr_locs.indexer] + m = masks[i] convert = i == src_len # only convert once at the end result = blk._replace_coerce( mask=m, @@ -2908,7 +2925,9 @@ def _extract_bool_array(mask: ArrayLike) -> np.ndarray: """ if isinstance(mask, ExtensionArray): # We could have BooleanArray, Sparse[bool], ... 
- mask = np.asarray(mask, dtype=np.bool_) + # Except for BooleanArray, this is equivalent to just + # np.asarray(mask, dtype=bool) + mask = mask.to_numpy(dtype=bool, na_value=False) assert isinstance(mask, np.ndarray), type(mask) assert mask.dtype == bool, mask.dtype diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 753b949f7c802..57a4a8c2ace8a 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1,14 +1,11 @@ from collections import defaultdict import itertools -import operator -import re from typing import ( Any, DefaultDict, Dict, List, Optional, - Pattern, Sequence, Tuple, TypeVar, @@ -19,7 +16,7 @@ import numpy as np from pandas._libs import internals as libinternals, lib -from pandas._typing import ArrayLike, DtypeObj, Label, Scalar +from pandas._typing import ArrayLike, DtypeObj, Label from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.cast import ( @@ -29,12 +26,9 @@ ) from pandas.core.dtypes.common import ( DT64NS_DTYPE, - is_datetimelike_v_numeric, is_dtype_equal, is_extension_array_dtype, is_list_like, - is_numeric_v_string_like, - is_scalar, ) from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.dtypes import ExtensionDtype @@ -44,7 +38,6 @@ import pandas.core.algorithms as algos from pandas.core.arrays.sparse import SparseDtype from pandas.core.base import PandasObject -import pandas.core.common as com from pandas.core.construction import extract_array from pandas.core.indexers import maybe_convert_indices from pandas.core.indexes.api import Index, ensure_index @@ -628,31 +621,10 @@ def replace_list( """ do a list replace """ inplace = validate_bool_kwarg(inplace, "inplace") - # figure out our mask apriori to avoid repeated replacements - values = self.as_array() - - def comp(s: Scalar, mask: np.ndarray, regex: bool = False): - """ - Generate a bool array by perform an equality check, or perform - an element-wise regular expression matching - """ - if isna(s): - return ~mask - - s = com.maybe_box_datetimelike(s) - return _compare_or_regex_search(values, s, regex, mask) - - # Calculate the mask once, prior to the call of comp - # in order to avoid repeating the same computations - mask = ~isna(values) - - masks = [comp(s, mask, regex) for s in src_list] - bm = self.apply( "_replace_list", src_list=src_list, dest_list=dest_list, - masks=masks, inplace=inplace, regex=regex, ) @@ -1900,80 +1872,6 @@ def _merge_blocks( return blocks -def _compare_or_regex_search( - a: ArrayLike, - b: Union[Scalar, Pattern], - regex: bool = False, - mask: Optional[ArrayLike] = None, -) -> Union[ArrayLike, bool]: - """ - Compare two array_like inputs of the same shape or two scalar values - - Calls operator.eq or re.search, depending on regex argument. If regex is - True, perform an element-wise regex matching. - - Parameters - ---------- - a : array_like - b : scalar or regex pattern - regex : bool, default False - mask : array_like or None (default) - - Returns - ------- - mask : array_like of bool - """ - - def _check_comparison_types( - result: Union[ArrayLike, bool], a: ArrayLike, b: Union[Scalar, Pattern] - ): - """ - Raises an error if the two arrays (a,b) cannot be compared. - Otherwise, returns the comparison result as expected. 
- """ - if is_scalar(result) and isinstance(a, np.ndarray): - type_names = [type(a).__name__, type(b).__name__] - - if isinstance(a, np.ndarray): - type_names[0] = f"ndarray(dtype={a.dtype})" - - raise TypeError( - f"Cannot compare types {repr(type_names[0])} and {repr(type_names[1])}" - ) - - if not regex: - op = lambda x: operator.eq(x, b) - else: - op = np.vectorize( - lambda x: bool(re.search(b, x)) - if isinstance(x, str) and isinstance(b, (str, Pattern)) - else False - ) - - # GH#32621 use mask to avoid comparing to NAs - if mask is None and isinstance(a, np.ndarray) and not isinstance(b, np.ndarray): - mask = np.reshape(~(isna(a)), a.shape) - if isinstance(a, np.ndarray): - a = a[mask] - - if is_datetimelike_v_numeric(a, b) or is_numeric_v_string_like(a, b): - # GH#29553 avoid deprecation warnings from numpy - _check_comparison_types(False, a, b) - return False - - result = op(a) - - if isinstance(result, np.ndarray) and mask is not None: - # The shape of the mask can differ to that of the result - # since we may compare only a subset of a's or b's elements - tmp = np.zeros(mask.shape, dtype=np.bool_) - tmp[mask] = result - result = tmp - - _check_comparison_types(result, a, b) - return result - - def _fast_count_smallints(arr: np.ndarray) -> np.ndarray: """Faster version of set(arr) for sequences of small numbers.""" counts = np.bincount(arr.astype(np.int_)) diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 83dfd42ae2a6e..ea2488dfc0877 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -1131,8 +1131,19 @@ def test_replace_bool_with_bool(self): def test_replace_with_dict_with_bool_keys(self): df = DataFrame({0: [True, False], 1: [False, True]}) - with pytest.raises(TypeError, match="Cannot compare types .+"): - df.replace({"asdf": "asdb", True: "yes"}) + result = df.replace({"asdf": "asdb", True: "yes"}) + expected = DataFrame({0: ["yes", False], 1: [False, "yes"]}) + tm.assert_frame_equal(result, expected) + + def test_replace_dict_strings_vs_ints(self): + # GH#34789 + df = pd.DataFrame({"Y0": [1, 2], "Y1": [3, 4]}) + result = df.replace({"replace_string": "test"}) + + tm.assert_frame_equal(result, df) + + result = df["Y0"].replace({"replace_string": "test"}) + tm.assert_series_equal(result, df["Y0"]) def test_replace_truthy(self): df = DataFrame({"a": [True, True]}) diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index ccaa005369a1c..e255d46e81851 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -218,8 +218,9 @@ def test_replace_bool_with_bool(self): def test_replace_with_dict_with_bool_keys(self): s = pd.Series([True, False, True]) - with pytest.raises(TypeError, match="Cannot compare types .+"): - s.replace({"asdf": "asdb", True: "yes"}) + result = s.replace({"asdf": "asdb", True: "yes"}) + expected = pd.Series(["yes", False, "yes"]) + tm.assert_series_equal(result, expected) def test_replace2(self): N = 100 From 0569e29f3bfbb05cbd41ba3faae30571e7059ef8 Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Fri, 4 Sep 2020 22:33:49 -0500 Subject: [PATCH 22/71] CLN: resolve UserWarning in `pandas/plotting/_matplotlib/core.py` #35945 (#35946) --- pandas/plotting/_matplotlib/core.py | 2 +- pandas/tests/plotting/test_frame.py | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/pandas/plotting/_matplotlib/core.py 
b/pandas/plotting/_matplotlib/core.py index c1ba7881165f1..f0b35e1cd2a74 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -1223,8 +1223,8 @@ def get_label(i): if self._need_to_set_index: xticks = ax.get_xticks() xticklabels = [get_label(x) for x in xticks] - ax.set_xticklabels(xticklabels) ax.xaxis.set_major_locator(FixedLocator(xticks)) + ax.set_xticklabels(xticklabels) condition = ( not self._use_dynamic_x() diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index ee43e5d7072fe..9ab697cb57690 100644 --- a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -2796,10 +2796,12 @@ def test_table(self): _check_plot_works(df.plot, table=True) _check_plot_works(df.plot, table=df) - ax = df.plot() - assert len(ax.tables) == 0 - plotting.table(ax, df.T) - assert len(ax.tables) == 1 + # GH 35945 UserWarning + with tm.assert_produces_warning(None): + ax = df.plot() + assert len(ax.tables) == 0 + plotting.table(ax, df.T) + assert len(ax.tables) == 1 def test_errorbar_scatter(self): df = DataFrame(np.random.randn(5, 2), index=range(5), columns=["x", "y"]) From 7b2d4370c42187ba6be770824d041525d91968f8 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska <48889395+arw2019@users.noreply.github.com> Date: Sat, 5 Sep 2020 06:50:57 -0400 Subject: [PATCH 23/71] add note about missing values to Categorical docstring (#36125) --- pandas/core/arrays/categorical.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index ec85ec47d625c..c3c9009dda659 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -280,6 +280,19 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject): ['a', 'b', 'c', 'a', 'b', 'c'] Categories (3, object): ['a', 'b', 'c'] + Missing values are not included as a category. + + >>> c = pd.Categorical([1, 2, 3, 1, 2, 3, np.nan]) + >>> c + [1, 2, 3, 1, 2, 3, NaN] + Categories (3, int64): [1, 2, 3] + + However, their presence is indicated in the `codes` attribute + by code `-1`. + + >>> c.codes + array([ 0, 1, 2, 0, 1, 2, -1], dtype=int8) + Ordered `Categoricals` can be sorted according to the custom order of the categories and can have a min and max value. 
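As a brief illustration of the docstring addition above (a hypothetical session, not part of the patch): because missing values are not stored as a category, the ``-1`` sentinel in ``codes`` agrees exactly with ``pd.isna`` on the Categorical:

>>> import numpy as np
>>> import pandas as pd
>>> c = pd.Categorical([1, 2, 3, 1, 2, 3, np.nan])
>>> ((c.codes == -1) == pd.isna(c)).all()
True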
From 7e906865ba6bd005c3a11ba199693b730749a8d0 Mon Sep 17 00:00:00 2001 From: Sarthak Vineet Kumar Date: Sat, 5 Sep 2020 18:09:11 +0530 Subject: [PATCH 24/71] CLN removing trailing commas (#36101) --- pandas/tests/io/test_sql.py | 3 --- pandas/tests/io/test_stata.py | 4 ++-- pandas/tests/plotting/test_frame.py | 4 ++-- pandas/tests/resample/test_datetime_index.py | 10 ++++------ .../tests/reshape/merge/test_merge_index_as_string.py | 4 ++-- pandas/tests/reshape/test_crosstab.py | 4 ++-- pandas/tests/reshape/test_get_dummies.py | 2 +- 7 files changed, 13 insertions(+), 18 deletions(-) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index a7e3162ed7b73..1edcc937f72c3 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -2349,9 +2349,6 @@ def date_format(dt): def format_query(sql, *args): - """ - - """ processed_args = [] for arg in args: if isinstance(arg, float) and isna(arg): diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 6d7fec803a8e0..88f61390957a6 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -1153,7 +1153,7 @@ def test_read_chunks_117( from_frame = parsed.iloc[pos : pos + chunksize, :].copy() from_frame = self._convert_categorical(from_frame) tm.assert_frame_equal( - from_frame, chunk, check_dtype=False, check_datetimelike_compat=True, + from_frame, chunk, check_dtype=False, check_datetimelike_compat=True ) pos += chunksize @@ -1251,7 +1251,7 @@ def test_read_chunks_115( from_frame = parsed.iloc[pos : pos + chunksize, :].copy() from_frame = self._convert_categorical(from_frame) tm.assert_frame_equal( - from_frame, chunk, check_dtype=False, check_datetimelike_compat=True, + from_frame, chunk, check_dtype=False, check_datetimelike_compat=True ) pos += chunksize diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index 9ab697cb57690..128a7bdb6730a 100644 --- a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -1321,7 +1321,7 @@ def test_scatter_with_c_column_name_with_colors(self, cmap): def test_plot_scatter_with_s(self): # this refers to GH 32904 - df = DataFrame(np.random.random((10, 3)) * 100, columns=["a", "b", "c"],) + df = DataFrame(np.random.random((10, 3)) * 100, columns=["a", "b", "c"]) ax = df.plot.scatter(x="a", y="b", s="c") tm.assert_numpy_array_equal(df["c"].values, right=ax.collections[0].get_sizes()) @@ -1716,7 +1716,7 @@ def test_hist_df(self): def test_hist_weights(self, weights): # GH 33173 np.random.seed(0) - df = pd.DataFrame(dict(zip(["A", "B"], np.random.randn(2, 100,)))) + df = pd.DataFrame(dict(zip(["A", "B"], np.random.randn(2, 100)))) ax1 = _check_plot_works(df.plot, kind="hist", weights=weights) ax2 = _check_plot_works(df.plot, kind="hist") diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index e7637a598403f..59a0183304c76 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -124,7 +124,7 @@ def test_resample_integerarray(): result = ts.resample("3T").mean() expected = Series( - [1, 4, 7], index=pd.date_range("1/1/2000", periods=3, freq="3T"), dtype="Int64", + [1, 4, 7], index=pd.date_range("1/1/2000", periods=3, freq="3T"), dtype="Int64" ) tm.assert_series_equal(result, expected) @@ -764,7 +764,7 @@ def test_resample_origin(): @pytest.mark.parametrize( - "origin", ["invalid_value", "epch", "startday", "startt", "2000-30-30", object()], + "origin", ["invalid_value", 
"epch", "startday", "startt", "2000-30-30", object()] ) def test_resample_bad_origin(origin): rng = date_range("2000-01-01 00:00:00", "2000-01-01 02:00", freq="s") @@ -777,9 +777,7 @@ def test_resample_bad_origin(origin): ts.resample("5min", origin=origin) -@pytest.mark.parametrize( - "offset", ["invalid_value", "12dayys", "2000-30-30", object()], -) +@pytest.mark.parametrize("offset", ["invalid_value", "12dayys", "2000-30-30", object()]) def test_resample_bad_offset(offset): rng = date_range("2000-01-01 00:00:00", "2000-01-01 02:00", freq="s") ts = Series(np.random.randn(len(rng)), index=rng) @@ -1595,7 +1593,7 @@ def test_downsample_dst_at_midnight(): "America/Havana", ambiguous=True ) dti = pd.DatetimeIndex(dti, freq="D") - expected = DataFrame([7.5, 28.0, 44.5], index=dti,) + expected = DataFrame([7.5, 28.0, 44.5], index=dti) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/merge/test_merge_index_as_string.py b/pandas/tests/reshape/merge/test_merge_index_as_string.py index 08614d04caf4b..d20d93370ec7e 100644 --- a/pandas/tests/reshape/merge/test_merge_index_as_string.py +++ b/pandas/tests/reshape/merge/test_merge_index_as_string.py @@ -29,7 +29,7 @@ def df2(): @pytest.fixture(params=[[], ["outer"], ["outer", "inner"]]) def left_df(request, df1): - """ Construct left test DataFrame with specified levels + """Construct left test DataFrame with specified levels (any of 'outer', 'inner', and 'v1') """ levels = request.param @@ -41,7 +41,7 @@ def left_df(request, df1): @pytest.fixture(params=[[], ["outer"], ["outer", "inner"]]) def right_df(request, df2): - """ Construct right test DataFrame with specified levels + """Construct right test DataFrame with specified levels (any of 'outer', 'inner', and 'v2') """ levels = request.param diff --git a/pandas/tests/reshape/test_crosstab.py b/pandas/tests/reshape/test_crosstab.py index 6f5550a6f8209..1aadcfdc30f1b 100644 --- a/pandas/tests/reshape/test_crosstab.py +++ b/pandas/tests/reshape/test_crosstab.py @@ -354,7 +354,7 @@ def test_crosstab_normalize(self): crosstab(df.a, df.b, normalize="columns"), ) tm.assert_frame_equal( - crosstab(df.a, df.b, normalize=0), crosstab(df.a, df.b, normalize="index"), + crosstab(df.a, df.b, normalize=0), crosstab(df.a, df.b, normalize="index") ) row_normal_margins = DataFrame( @@ -377,7 +377,7 @@ def test_crosstab_normalize(self): crosstab(df.a, df.b, normalize="index", margins=True), row_normal_margins ) tm.assert_frame_equal( - crosstab(df.a, df.b, normalize="columns", margins=True), col_normal_margins, + crosstab(df.a, df.b, normalize="columns", margins=True), col_normal_margins ) tm.assert_frame_equal( crosstab(df.a, df.b, normalize=True, margins=True), all_normal_margins diff --git a/pandas/tests/reshape/test_get_dummies.py b/pandas/tests/reshape/test_get_dummies.py index c003bfa6a239a..ce13762ea8f86 100644 --- a/pandas/tests/reshape/test_get_dummies.py +++ b/pandas/tests/reshape/test_get_dummies.py @@ -161,7 +161,7 @@ def test_get_dummies_unicode(self, sparse): s = [e, eacute, eacute] res = get_dummies(s, prefix="letter", sparse=sparse) exp = DataFrame( - {"letter_e": [1, 0, 0], f"letter_{eacute}": [0, 1, 1]}, dtype=np.uint8, + {"letter_e": [1, 0, 0], f"letter_{eacute}": [0, 1, 1]}, dtype=np.uint8 ) if sparse: exp = exp.apply(SparseArray, fill_value=0) From 0d2b936286475bb4aa24bc81c2418b7b999fd483 Mon Sep 17 00:00:00 2001 From: Thomas Dickson Date: Sat, 5 Sep 2020 15:44:26 +0100 Subject: [PATCH 25/71] Updated series documentation to close #35406 (#36139) --- pandas/core/series.py | 
6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 9d84ce4b9ab2e..d8fdaa2a60252 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -164,9 +164,9 @@ class Series(base.IndexOpsMixin, generic.NDFrame): index : array-like or Index (1d) Values must be hashable and have the same length as `data`. Non-unique index values are allowed. Will default to - RangeIndex (0, 1, 2, ..., n) if not provided. If both a dict and index - sequence are used, the index will override the keys found in the - dict. + RangeIndex (0, 1, 2, ..., n) if not provided. If data is dict-like + and index is None, then the values in the index are used to + reindex the Series after it is created using the keys in the data. dtype : str, numpy.dtype, or ExtensionDtype, optional Data type for the output Series. If not specified, this will be inferred from `data`. From 2bcc15632b0f1faedef76318dfca4669eed70bfe Mon Sep 17 00:00:00 2001 From: joooeey Date: Sat, 5 Sep 2020 16:49:09 +0200 Subject: [PATCH 26/71] BUG: repair 'style' kwd handling in DataFrame.plot (#21003) (#33821) --- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/plotting/_matplotlib/core.py | 27 ++++++++++++++++----------- pandas/tests/plotting/test_frame.py | 18 ++++++++++++++++++ pandas/tests/plotting/test_series.py | 2 +- 4 files changed, 36 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 8b28a4439e1da..39e53daf516c4 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -299,7 +299,7 @@ I/O Plotting ^^^^^^^^ -- +- Bug in :meth:`DataFrame.plot` where a marker letter in the ``style`` keyword sometimes causes a ``ValueError`` (:issue:`21003`) - Groupby/resample/rolling diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index f0b35e1cd2a74..def4a1dc3f5c4 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -1,4 +1,3 @@ -import re from typing import TYPE_CHECKING, List, Optional, Tuple import warnings @@ -55,6 +54,15 @@ from matplotlib.axis import Axis +def _color_in_style(style: str) -> bool: + """ + Check if there is a color letter in the style string. + """ + from matplotlib.colors import BASE_COLORS + + return not set(BASE_COLORS).isdisjoint(style) + + class MPLPlot: """ Base class for assembling a pandas plot using matplotlib @@ -200,8 +208,6 @@ def __init__( self._validate_color_args() def _validate_color_args(self): - import matplotlib.colors - if ( "color" in self.kwds and self.nseries == 1 @@ -233,13 +239,12 @@ def _validate_color_args(self): styles = [self.style] # need only a single match for s in styles: - for char in s: - if char in matplotlib.colors.BASE_COLORS: - raise ValueError( - "Cannot pass 'style' string with a color symbol and " - "'color' keyword argument. Please use one or the other or " - "pass 'style' without a color symbol" - ) + if _color_in_style(s): + raise ValueError( + "Cannot pass 'style' string with a color symbol and " + "'color' keyword argument. 
Please use one or the "
+ "other or pass 'style' without a color symbol"
+ )
def _iter_data(self, data=None, keep_index=False, fillna=None):
if data is None:
@@ -739,7 +744,7 @@ def _apply_style_colors(self, colors, kwds, col_num, label):
style = self.style
has_color = "color" in kwds or self.colormap is not None
- nocolor_style = style is None or re.match("[a-z]+", style) is None
+ nocolor_style = style is None or not _color_in_style(style)
if (has_color or self.subplots) and nocolor_style:
if isinstance(colors, dict):
kwds["color"] = colors[label]
diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py
index 128a7bdb6730a..3b3902647390d 100644
--- a/pandas/tests/plotting/test_frame.py
+++ b/pandas/tests/plotting/test_frame.py
@@ -205,6 +205,24 @@ def test_color_and_style_arguments(self):
with pytest.raises(ValueError):
df.plot(color=["red", "black"], style=["k-", "r--"])
+ @pytest.mark.parametrize(
+ "color, expected",
+ [
+ ("green", ["green"] * 4),
+ (["yellow", "red", "green", "blue"], ["yellow", "red", "green", "blue"]),
+ ],
+ )
+ def test_color_and_marker(self, color, expected):
+ # GH 21003
+ df = DataFrame(np.random.random((7, 4)))
+ ax = df.plot(color=color, style="d--")
+ # check colors
+ result = [i.get_color() for i in ax.lines]
+ assert result == expected
+ # check markers and linestyles
+ assert all(i.get_linestyle() == "--" for i in ax.lines)
+ assert all(i.get_marker() == "d" for i in ax.lines)
+
def test_nonnumeric_exclude(self):
df = DataFrame({"A": ["x", "y", "z"], "B": [1, 2, 3]})
ax = df.plot()
diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py
index c296e2a6278c5..85c06b2e7b748 100644
--- a/pandas/tests/plotting/test_series.py
+++ b/pandas/tests/plotting/test_series.py
@@ -958,7 +958,7 @@ def test_plot_no_numeric_data(self):
def test_style_single_ok(self):
s = pd.Series([1, 2])
ax = s.plot(style="s", color="C3")
- assert ax.lines[0].get_color() == ["C3"]
+ assert ax.lines[0].get_color() == "C3"
@pytest.mark.parametrize(
"index_name, old_label, new_label",
From 55bdb16faf90e0e102b1ce45c234e240357c8005 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?=
Date: Sat, 5 Sep 2020 10:50:03 -0400
Subject: [PATCH 27/71] BUG/ENH: to_pickle/read_pickle support compression for file objects (#35736)

---
doc/source/whatsnew/v1.2.0.rst | 1 +
pandas/_typing.py | 4 ++--
pandas/core/frame.py | 4 ++--
pandas/io/common.py | 24 +++++++++--------------
pandas/io/formats/csvs.py | 15 ++++-----------
pandas/io/json/_json.py | 11 ++---------
pandas/io/parsers.py | 13 +++++--------
pandas/io/pickle.py | 10 ++--------
pandas/io/stata.py | 30 +++++------------------------
pandas/tests/io/test_pickle.py | 29 +++++++++++++++++++++++++++++
10 files changed, 61 insertions(+), 80 deletions(-)

diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index 39e53daf516c4..b1229a5d5823d 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -295,6 +295,7 @@ I/O
- :meth:`to_csv` passes compression arguments for `'gzip'` always to `gzip.GzipFile` (:issue:`28103`)
- :meth:`to_csv` did not support zip compression for binary file object not having a filename (:issue: `35058`)
- :meth:`to_csv` and :meth:`read_csv` did not honor `compression` and `encoding` for path-like objects that are internally converted to file-like objects (:issue:`35677`, :issue:`26124`, and :issue:`32392`)
+- :meth:`to_pickle` and :meth:`read_pickle` did not support compression for file-objects
(:issue:`26237`, :issue:`29054`, and :issue:`29570`) Plotting ^^^^^^^^ diff --git a/pandas/_typing.py b/pandas/_typing.py index 74bfc9134c3af..b237013ac7805 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -116,7 +116,7 @@ # compression keywords and compression -CompressionDict = Mapping[str, Optional[Union[str, int, bool]]] +CompressionDict = Dict[str, Any] CompressionOptions = Optional[Union[str, CompressionDict]] @@ -138,6 +138,6 @@ class IOargs(Generic[ModeVar, EncodingVar]): filepath_or_buffer: FileOrBuffer encoding: EncodingVar - compression: CompressionOptions + compression: CompressionDict should_close: bool mode: Union[ModeVar, str] diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c48bec9b670ad..1713743b98bff 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -27,7 +27,6 @@ Iterable, Iterator, List, - Mapping, Optional, Sequence, Set, @@ -49,6 +48,7 @@ ArrayLike, Axes, Axis, + CompressionOptions, Dtype, FilePathOrBuffer, FrameOrSeriesUnion, @@ -2062,7 +2062,7 @@ def to_stata( variable_labels: Optional[Dict[Label, str]] = None, version: Optional[int] = 114, convert_strl: Optional[Sequence[Label]] = None, - compression: Union[str, Mapping[str, str], None] = "infer", + compression: CompressionOptions = "infer", storage_options: StorageOptions = None, ) -> None: """ diff --git a/pandas/io/common.py b/pandas/io/common.py index 2b13d54ec3aed..a80b89569f429 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -205,11 +205,13 @@ def get_filepath_or_buffer( """ filepath_or_buffer = stringify_path(filepath_or_buffer) + # handle compression dict + compression_method, compression = get_compression_method(compression) + compression_method = infer_compression(filepath_or_buffer, compression_method) + compression = dict(compression, method=compression_method) + # bz2 and xz do not write the byte order mark for utf-16 and utf-32 # print a warning when writing such files - compression_method = infer_compression( - filepath_or_buffer, get_compression_method(compression)[0] - ) if ( mode and "w" in mode @@ -238,7 +240,7 @@ def get_filepath_or_buffer( content_encoding = req.headers.get("Content-Encoding", None) if content_encoding == "gzip": # Override compression based on Content-Encoding header - compression = "gzip" + compression = {"method": "gzip"} reader = BytesIO(req.read()) req.close() return IOargs( @@ -374,11 +376,7 @@ def get_compression_method( if isinstance(compression, Mapping): compression_args = dict(compression) try: - # error: Incompatible types in assignment (expression has type - # "Union[str, int, None]", variable has type "Optional[str]") - compression_method = compression_args.pop( # type: ignore[assignment] - "method" - ) + compression_method = compression_args.pop("method") except KeyError as err: raise ValueError("If mapping, compression must have key 'method'") from err else: @@ -652,12 +650,8 @@ def __init__( super().__init__(file, mode, **kwargs_zip) # type: ignore[arg-type] def write(self, data): - archive_name = self.filename - if self.archive_name is not None: - archive_name = self.archive_name - if archive_name is None: - # ZipFile needs a non-empty string - archive_name = "zip" + # ZipFile needs a non-empty string + archive_name = self.archive_name or self.filename or "zip" super().writestr(archive_name, data) @property diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 270caec022fef..15cd5c026c6b6 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -21,12 +21,7 @@ ) from 
pandas.core.dtypes.missing import notna -from pandas.io.common import ( - get_compression_method, - get_filepath_or_buffer, - get_handle, - infer_compression, -) +from pandas.io.common import get_filepath_or_buffer, get_handle class CSVFormatter: @@ -60,17 +55,15 @@ def __init__( if path_or_buf is None: path_or_buf = StringIO() - # Extract compression mode as given, if dict - compression, self.compression_args = get_compression_method(compression) - self.compression = infer_compression(path_or_buf, compression) - ioargs = get_filepath_or_buffer( path_or_buf, encoding=encoding, - compression=self.compression, + compression=compression, mode=mode, storage_options=storage_options, ) + self.compression = ioargs.compression.pop("method") + self.compression_args = ioargs.compression self.path_or_buf = ioargs.filepath_or_buffer self.should_close = ioargs.should_close self.mode = ioargs.mode diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 7a3b76ff7e3d0..a4d923fdbe45a 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -19,12 +19,7 @@ from pandas.core.construction import create_series_with_explicit_dtype from pandas.core.reshape.concat import concat -from pandas.io.common import ( - get_compression_method, - get_filepath_or_buffer, - get_handle, - infer_compression, -) +from pandas.io.common import get_compression_method, get_filepath_or_buffer, get_handle from pandas.io.json._normalize import convert_to_line_delimits from pandas.io.json._table_schema import build_table_schema, parse_table_schema from pandas.io.parsers import _validate_integer @@ -66,6 +61,7 @@ def to_json( ) path_or_buf = ioargs.filepath_or_buffer should_close = ioargs.should_close + compression = ioargs.compression if lines and orient != "records": raise ValueError("'lines' keyword only valid when 'orient' is records") @@ -616,9 +612,6 @@ def read_json( if encoding is None: encoding = "utf-8" - compression_method, compression = get_compression_method(compression) - compression_method = infer_compression(path_or_buf, compression_method) - compression = dict(compression, method=compression_method) ioargs = get_filepath_or_buffer( path_or_buf, encoding=encoding, diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index c6ef5221e7ead..a0466c5ac6b57 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -63,12 +63,7 @@ from pandas.core.series import Series from pandas.core.tools import datetimes as tools -from pandas.io.common import ( - get_filepath_or_buffer, - get_handle, - infer_compression, - validate_header_arg, -) +from pandas.io.common import get_filepath_or_buffer, get_handle, validate_header_arg from pandas.io.date_converters import generic_parser # BOM character (byte order mark) @@ -424,9 +419,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): if encoding is not None: encoding = re.sub("_", "-", encoding).lower() kwds["encoding"] = encoding - compression = kwds.get("compression", "infer") - compression = infer_compression(filepath_or_buffer, compression) # TODO: get_filepath_or_buffer could return # Union[FilePathOrBuffer, s3fs.S3File, gcsfs.GCSFile] @@ -1976,6 +1969,10 @@ def __init__(self, src, **kwds): encoding = kwds.get("encoding") + # parsers.TextReader doesn't support compression dicts + if isinstance(kwds.get("compression"), dict): + kwds["compression"] = kwds["compression"]["method"] + if kwds.get("compression") is None and encoding: if isinstance(src, str): src = open(src, "rb") diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 
857a2d1b69be4..655deb5ca3779 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -92,11 +92,8 @@ def to_pickle( mode="wb", storage_options=storage_options, ) - compression = ioargs.compression - if not isinstance(ioargs.filepath_or_buffer, str) and compression == "infer": - compression = None f, fh = get_handle( - ioargs.filepath_or_buffer, "wb", compression=compression, is_text=False + ioargs.filepath_or_buffer, "wb", compression=ioargs.compression, is_text=False ) if protocol < 0: protocol = pickle.HIGHEST_PROTOCOL @@ -196,11 +193,8 @@ def read_pickle( ioargs = get_filepath_or_buffer( filepath_or_buffer, compression=compression, storage_options=storage_options ) - compression = ioargs.compression - if not isinstance(ioargs.filepath_or_buffer, str) and compression == "infer": - compression = None f, fh = get_handle( - ioargs.filepath_or_buffer, "rb", compression=compression, is_text=False + ioargs.filepath_or_buffer, "rb", compression=ioargs.compression, is_text=False ) # 1) try standard library Pickle diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 34d520004cc65..b3b16e04a5d9e 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -16,18 +16,7 @@ from pathlib import Path import struct import sys -from typing import ( - Any, - AnyStr, - BinaryIO, - Dict, - List, - Mapping, - Optional, - Sequence, - Tuple, - Union, -) +from typing import Any, AnyStr, BinaryIO, Dict, List, Optional, Sequence, Tuple, Union import warnings from dateutil.relativedelta import relativedelta @@ -58,13 +47,7 @@ from pandas.core.indexes.base import Index from pandas.core.series import Series -from pandas.io.common import ( - get_compression_method, - get_filepath_or_buffer, - get_handle, - infer_compression, - stringify_path, -) +from pandas.io.common import get_filepath_or_buffer, get_handle, stringify_path _version_error = ( "Version of given Stata file is {version}. 
pandas supports importing " @@ -1976,9 +1959,6 @@ def _open_file_binary_write( return fname, False, None # type: ignore[return-value] elif isinstance(fname, (str, Path)): # Extract compression mode as given, if dict - compression_typ, compression_args = get_compression_method(compression) - compression_typ = infer_compression(fname, compression_typ) - compression = dict(compression_args, method=compression_typ) ioargs = get_filepath_or_buffer( fname, mode="wb", compression=compression, storage_options=storage_options ) @@ -2235,7 +2215,7 @@ def __init__( time_stamp: Optional[datetime.datetime] = None, data_label: Optional[str] = None, variable_labels: Optional[Dict[Label, str]] = None, - compression: Union[str, Mapping[str, str], None] = "infer", + compression: CompressionOptions = "infer", storage_options: StorageOptions = None, ): super().__init__() @@ -3118,7 +3098,7 @@ def __init__( data_label: Optional[str] = None, variable_labels: Optional[Dict[Label, str]] = None, convert_strl: Optional[Sequence[Label]] = None, - compression: Union[str, Mapping[str, str], None] = "infer", + compression: CompressionOptions = "infer", storage_options: StorageOptions = None, ): # Copy to new list since convert_strl might be modified later @@ -3523,7 +3503,7 @@ def __init__( variable_labels: Optional[Dict[Label, str]] = None, convert_strl: Optional[Sequence[Label]] = None, version: Optional[int] = None, - compression: Union[str, Mapping[str, str], None] = "infer", + compression: CompressionOptions = "infer", storage_options: StorageOptions = None, ): if version is None: diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 6331113ab8945..d1c6705dd7a6f 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -14,7 +14,9 @@ import datetime import glob import gzip +import io import os +from pathlib import Path import pickle import shutil from warnings import catch_warnings, simplefilter @@ -486,3 +488,30 @@ def test_read_pickle_with_subclass(): tm.assert_series_equal(result[0], expected[0]) assert isinstance(result[1], MyTz) + + +def test_pickle_binary_object_compression(compression): + """ + Read/write from binary file-objects w/wo compression. 
+
+ GH 26237, GH 29054, and GH 29570
+ """
+ df = tm.makeDataFrame()
+
+ # reference for compression
+ with tm.ensure_clean() as path:
+ df.to_pickle(path, compression=compression)
+ reference = Path(path).read_bytes()
+
+ # write
+ buffer = io.BytesIO()
+ df.to_pickle(buffer, compression=compression)
+ buffer.seek(0)
+
+ # gzip and zip save the filename: cannot compare the compressed content
+ assert buffer.getvalue() == reference or compression in ("gzip", "zip")
+
+ # read
+ read_df = pd.read_pickle(buffer, compression=compression)
+ buffer.seek(0)
+ tm.assert_frame_equal(df, read_df)
From 40008d01bcd2f4e5c7b456afa90f1b661f863831 Mon Sep 17 00:00:00 2001
From: Simon Hawkins
Date: Sat, 5 Sep 2020 15:55:11 +0100
Subject: [PATCH 28/71] TYP: Check for use of Union[Series, DataFrame] instead of FrameOrSeriesUnion alias (#36137)

---
ci/code_checks.sh | 8 ++++++++
pandas/core/apply.py | 16 +++++++--------
pandas/core/groupby/generic.py | 10 +++++-----
pandas/core/groupby/grouper.py | 2 +-
pandas/core/reshape/merge.py | 10 +++++-----
pandas/core/reshape/pivot.py | 4 ++--
pandas/io/pytables.py | 6 +++---
7 files changed, 31 insertions(+), 25 deletions(-)

diff --git a/ci/code_checks.sh b/ci/code_checks.sh
index 6006d09bc3e78..8ee579cd25203 100755
--- a/ci/code_checks.sh
+++ b/ci/code_checks.sh
@@ -230,6 +230,9 @@ if [[ -z "$CHECK" || "$CHECK" == "patterns" ]]; then
invgrep -R --include=*.{py,pyx} '!r}' pandas
RET=$(($RET + $?)) ; echo $MSG "DONE"
+ # -------------------------------------------------------------------------
+ # Type annotations
+
MSG='Check for use of comment-based annotation syntax' ; echo $MSG
invgrep -R --include="*.py" -P '# type: (?!ignore)' pandas
RET=$(($RET + $?)) ; echo $MSG "DONE"
@@ -238,6 +241,11 @@
invgrep -R --include="*.py" -P '# type:\s?ignore(?!\[)' pandas
RET=$(($RET + $?)) ; echo $MSG "DONE"
+ MSG='Check for use of Union[Series, DataFrame] instead of FrameOrSeriesUnion alias' ; echo $MSG
+ invgrep -R --include="*.py" --exclude=_typing.py -E 'Union\[.*(Series.*DataFrame|DataFrame.*Series).*\]' pandas
+ RET=$(($RET + $?)) ; echo $MSG "DONE"
+
+ # -------------------------------------------------------------------------
MSG='Check for use of foo.__class__ instead of type(foo)' ; echo $MSG
invgrep -R --include=*.{py,pyx} '\.__class__' pandas
RET=$(($RET + $?)) ; echo $MSG "DONE"
diff --git a/pandas/core/apply.py b/pandas/core/apply.py
index 99a9e1377563c..bbf832f33065b 100644
--- a/pandas/core/apply.py
+++ b/pandas/core/apply.py
@@ -1,12 +1,12 @@
import abc
import inspect
-from typing import TYPE_CHECKING, Any, Dict, Iterator, Optional, Tuple, Type, Union
+from typing import TYPE_CHECKING, Any, Dict, Iterator, Optional, Tuple, Type
import numpy as np
from pandas._config import option_context
-from pandas._typing import Axis
+from pandas._typing import Axis, FrameOrSeriesUnion
from pandas.util._decorators import cache_readonly
from pandas.core.dtypes.common import is_dict_like, is_list_like, is_sequence
@@ -73,7 +73,7 @@ def series_generator(self) -> Iterator["Series"]:
@abc.abstractmethod
def wrap_results_for_axis(
self, results: ResType, res_index: "Index"
- ) -> Union["Series", "DataFrame"]:
+ ) -> FrameOrSeriesUnion:
pass
# ---------------------------------------------------------------
@@ -289,9 +289,7 @@ def apply_series_generator(self) -> Tuple[ResType, "Index"]:
return results, res_index
- def wrap_results(
- self, results: ResType, res_index: "Index"
- ) -> Union["Series", "DataFrame"]:
+ def
wrap_results(self, results: ResType, res_index: "Index") -> FrameOrSeriesUnion: from pandas import Series # see if we can infer the results @@ -335,7 +333,7 @@ def result_columns(self) -> "Index": def wrap_results_for_axis( self, results: ResType, res_index: "Index" - ) -> Union["Series", "DataFrame"]: + ) -> FrameOrSeriesUnion: """ return the results for the rows """ if self.result_type == "reduce": @@ -408,9 +406,9 @@ def result_columns(self) -> "Index": def wrap_results_for_axis( self, results: ResType, res_index: "Index" - ) -> Union["Series", "DataFrame"]: + ) -> FrameOrSeriesUnion: """ return the results for the columns """ - result: Union["Series", "DataFrame"] + result: FrameOrSeriesUnion # we have requested to expand if self.result_type == "expand": diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 9fda3c7a6b354..728e1ee4653fd 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -325,7 +325,7 @@ def _aggregate_multiple_funcs(self, arg): arg = zip(columns, arg) - results: Dict[base.OutputKey, Union[Series, DataFrame]] = {} + results: Dict[base.OutputKey, FrameOrSeriesUnion] = {} for idx, (name, func) in enumerate(arg): obj = self @@ -349,7 +349,7 @@ def _wrap_series_output( self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]], index: Optional[Index], - ) -> Union[Series, DataFrame]: + ) -> FrameOrSeriesUnion: """ Wraps the output of a SeriesGroupBy operation into the expected result. @@ -372,7 +372,7 @@ def _wrap_series_output( indexed_output = {key.position: val for key, val in output.items()} columns = Index(key.label for key in output) - result: Union[Series, DataFrame] + result: FrameOrSeriesUnion if len(output) > 1: result = self.obj._constructor_expanddim(indexed_output, index=index) result.columns = columns @@ -390,7 +390,7 @@ def _wrap_aggregated_output( self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]], index: Optional[Index], - ) -> Union[Series, DataFrame]: + ) -> FrameOrSeriesUnion: """ Wraps the output of a SeriesGroupBy aggregation into the expected result. 
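For reference, the alias these hunks switch to lives in pandas/_typing.py; below is a rough sketch of its definition and of a made-up call site (`summarize` is illustrative and not from the patch; the exact definition in _typing.py may differ slightly):

    from typing import TYPE_CHECKING, Union

    if TYPE_CHECKING:
        # imported only for type checking, avoiding a circular import at runtime
        from pandas import DataFrame, Series

    # one name for "either a Series or a DataFrame"
    FrameOrSeriesUnion = Union["DataFrame", "Series"]

    def summarize(obj: "FrameOrSeriesUnion") -> "FrameOrSeriesUnion":
        """A function that accepts and returns either container."""
        ...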
@@ -1122,7 +1122,7 @@ def blk_func(bvalues: ArrayLike) -> ArrayLike: raise # We get here with a) EADtypes and b) object dtype - obj: Union[Series, DataFrame] + obj: FrameOrSeriesUnion # call our grouper again with only this block if isinstance(bvalues, ExtensionArray): # TODO(EA2D): special case not needed with 2D EAs diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 6678edc3821c8..59ea7781025c4 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -393,7 +393,7 @@ class Grouping: ---------- index : Index grouper : - obj Union[DataFrame, Series]: + obj : DataFrame or Series name : Label level : observed : bool, default False diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 602ff226f8878..f1c5486222ea1 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -6,14 +6,14 @@ import datetime from functools import partial import string -from typing import TYPE_CHECKING, Optional, Tuple, Union +from typing import TYPE_CHECKING, Optional, Tuple import warnings import numpy as np from pandas._libs import Timedelta, hashtable as libhashtable, lib import pandas._libs.join as libjoin -from pandas._typing import ArrayLike, FrameOrSeries +from pandas._typing import ArrayLike, FrameOrSeries, FrameOrSeriesUnion from pandas.errors import MergeError from pandas.util._decorators import Appender, Substitution @@ -51,7 +51,7 @@ from pandas.core.sorting import is_int64_overflow_possible if TYPE_CHECKING: - from pandas import DataFrame, Series # noqa:F401 + from pandas import DataFrame # noqa:F401 @Substitution("\nleft : DataFrame") @@ -575,8 +575,8 @@ class _MergeOperation: def __init__( self, - left: Union["Series", "DataFrame"], - right: Union["Series", "DataFrame"], + left: FrameOrSeriesUnion, + right: FrameOrSeriesUnion, how: str = "inner", on=None, left_on=None, diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 969ac56e41860..842a42f80e1b7 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -12,7 +12,7 @@ import numpy as np -from pandas._typing import Label +from pandas._typing import FrameOrSeriesUnion, Label from pandas.util._decorators import Appender, Substitution from pandas.core.dtypes.cast import maybe_downcast_to_dtype @@ -200,7 +200,7 @@ def pivot_table( def _add_margins( - table: Union["Series", "DataFrame"], + table: FrameOrSeriesUnion, data, values, rows, diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 0913627324c48..e850a101a0a63 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -16,7 +16,7 @@ from pandas._libs import lib, writers as libwriters from pandas._libs.tslibs import timezones -from pandas._typing import ArrayLike, FrameOrSeries, Label +from pandas._typing import ArrayLike, FrameOrSeries, FrameOrSeriesUnion, Label from pandas.compat._optional import import_optional_dependency from pandas.compat.pickle_compat import patch_pickle from pandas.errors import PerformanceWarning @@ -2566,7 +2566,7 @@ class Fixed: pandas_kind: str format_type: str = "fixed" # GH#30962 needed by dask - obj_type: Type[Union[DataFrame, Series]] + obj_type: Type[FrameOrSeriesUnion] ndim: int encoding: str parent: HDFStore @@ -4442,7 +4442,7 @@ class AppendableFrameTable(AppendableTable): pandas_kind = "frame_table" table_type = "appendable_frame" ndim = 2 - obj_type: Type[Union[DataFrame, Series]] = DataFrame + obj_type: Type[FrameOrSeriesUnion] = DataFrame @property def is_transposed(self) -> bool: From 
6f2ca9231621603d13b1bfcd592e860a6179fb81 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 5 Sep 2020 16:03:54 +0100 Subject: [PATCH 29/71] TYP: remove string literals for type annotations in pandas\core\frame.py (#36140) --- pandas/core/frame.py | 104 +++++++++++++++++++++---------------------- 1 file changed, 52 insertions(+), 52 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1713743b98bff..29d6fb9aa7d56 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -420,7 +420,7 @@ class DataFrame(NDFrame): _typ = "dataframe" @property - def _constructor(self) -> Type["DataFrame"]: + def _constructor(self) -> Type[DataFrame]: return DataFrame _constructor_sliced: Type[Series] = Series @@ -1233,7 +1233,7 @@ def __rmatmul__(self, other): # IO methods (to / from other formats) @classmethod - def from_dict(cls, data, orient="columns", dtype=None, columns=None) -> "DataFrame": + def from_dict(cls, data, orient="columns", dtype=None, columns=None) -> DataFrame: """ Construct DataFrame from dict of array-like or dicts. @@ -1671,7 +1671,7 @@ def from_records( columns=None, coerce_float=False, nrows=None, - ) -> "DataFrame": + ) -> DataFrame: """ Convert structured or record ndarray to DataFrame. @@ -2012,7 +2012,7 @@ def _from_arrays( index, dtype: Optional[Dtype] = None, verify_integrity: bool = True, - ) -> "DataFrame": + ) -> DataFrame: """ Create DataFrame from a list of arrays corresponding to the columns. @@ -2720,7 +2720,7 @@ def memory_usage(self, index=True, deep=False) -> Series: ).append(result) return result - def transpose(self, *args, copy: bool = False) -> "DataFrame": + def transpose(self, *args, copy: bool = False) -> DataFrame: """ Transpose index and columns. @@ -2843,7 +2843,7 @@ def transpose(self, *args, copy: bool = False) -> "DataFrame": return result.__finalize__(self, method="transpose") @property - def T(self) -> "DataFrame": + def T(self) -> DataFrame: return self.transpose() # ---------------------------------------------------------------------- @@ -3503,7 +3503,7 @@ def eval(self, expr, inplace=False, **kwargs): return _eval(expr, inplace=inplace, **kwargs) - def select_dtypes(self, include=None, exclude=None) -> "DataFrame": + def select_dtypes(self, include=None, exclude=None) -> DataFrame: """ Return a subset of the DataFrame's columns based on the column dtypes. @@ -3667,7 +3667,7 @@ def insert(self, loc, column, value, allow_duplicates=False) -> None: value = self._sanitize_column(column, value, broadcast=False) self._mgr.insert(loc, column, value, allow_duplicates=allow_duplicates) - def assign(self, **kwargs) -> "DataFrame": + def assign(self, **kwargs) -> DataFrame: r""" Assign new columns to a DataFrame. @@ -3965,7 +3965,7 @@ def _reindex_columns( allow_dups=False, ) - def _reindex_multi(self, axes, copy, fill_value) -> "DataFrame": + def _reindex_multi(self, axes, copy, fill_value) -> DataFrame: """ We are guaranteed non-Nones in the axes. 
""" @@ -3998,7 +3998,7 @@ def align( limit=None, fill_axis=0, broadcast_axis=None, - ) -> "DataFrame": + ) -> DataFrame: return super().align( other, join=join, @@ -4067,7 +4067,7 @@ def set_axis(self, labels, axis: Axis = 0, inplace: bool = False): ("tolerance", None), ], ) - def reindex(self, *args, **kwargs) -> "DataFrame": + def reindex(self, *args, **kwargs) -> DataFrame: axes = validate_axis_style_args(self, args, kwargs, "labels", "reindex") kwargs.update(axes) # Pop these, since the values are in `kwargs` under different names @@ -4229,7 +4229,7 @@ def rename( inplace: bool = False, level: Optional[Level] = None, errors: str = "ignore", - ) -> Optional["DataFrame"]: + ) -> Optional[DataFrame]: """ Alter axes labels. @@ -4357,7 +4357,7 @@ def fillna( inplace=False, limit=None, downcast=None, - ) -> Optional["DataFrame"]: + ) -> Optional[DataFrame]: return super().fillna( value=value, method=method, @@ -4465,7 +4465,7 @@ def _replace_columnwise( return res.__finalize__(self) @doc(NDFrame.shift, klass=_shared_doc_kwargs["klass"]) - def shift(self, periods=1, freq=None, axis=0, fill_value=None) -> "DataFrame": + def shift(self, periods=1, freq=None, axis=0, fill_value=None) -> DataFrame: return super().shift( periods=periods, freq=freq, axis=axis, fill_value=fill_value ) @@ -4666,7 +4666,7 @@ def reset_index( inplace: bool = False, col_level: Hashable = 0, col_fill: Label = "", - ) -> Optional["DataFrame"]: + ) -> Optional[DataFrame]: """ Reset the index, or a level of it. @@ -4910,20 +4910,20 @@ def _maybe_casted_values(index, labels=None): # Reindex-based selection methods @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"]) - def isna(self) -> "DataFrame": + def isna(self) -> DataFrame: result = self._constructor(self._data.isna(func=isna)) return result.__finalize__(self, method="isna") @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"]) - def isnull(self) -> "DataFrame": + def isnull(self) -> DataFrame: return self.isna() @doc(NDFrame.notna, klass=_shared_doc_kwargs["klass"]) - def notna(self) -> "DataFrame": + def notna(self) -> DataFrame: return ~self.isna() @doc(NDFrame.notna, klass=_shared_doc_kwargs["klass"]) - def notnull(self) -> "DataFrame": + def notnull(self) -> DataFrame: return ~self.isna() def dropna(self, axis=0, how="any", thresh=None, subset=None, inplace=False): @@ -5074,7 +5074,7 @@ def drop_duplicates( keep: Union[str, bool] = "first", inplace: bool = False, ignore_index: bool = False, - ) -> Optional["DataFrame"]: + ) -> Optional[DataFrame]: """ Return DataFrame with duplicate rows removed. @@ -5168,7 +5168,7 @@ def duplicated( self, subset: Optional[Union[Hashable, Sequence[Hashable]]] = None, keep: Union[str, bool] = "first", - ) -> "Series": + ) -> Series: """ Return boolean Series denoting duplicate rows. @@ -5619,7 +5619,7 @@ def value_counts( return counts - def nlargest(self, n, columns, keep="first") -> "DataFrame": + def nlargest(self, n, columns, keep="first") -> DataFrame: """ Return the first `n` rows ordered by `columns` in descending order. @@ -5728,7 +5728,7 @@ def nlargest(self, n, columns, keep="first") -> "DataFrame": """ return algorithms.SelectNFrame(self, n=n, keep=keep, columns=columns).nlargest() - def nsmallest(self, n, columns, keep="first") -> "DataFrame": + def nsmallest(self, n, columns, keep="first") -> DataFrame: """ Return the first `n` rows ordered by `columns` in ascending order. 
@@ -5830,7 +5830,7 @@ def nsmallest(self, n, columns, keep="first") -> "DataFrame": self, n=n, keep=keep, columns=columns ).nsmallest() - def swaplevel(self, i=-2, j=-1, axis=0) -> "DataFrame": + def swaplevel(self, i=-2, j=-1, axis=0) -> DataFrame: """ Swap levels i and j in a MultiIndex on a particular axis. @@ -5861,7 +5861,7 @@ def swaplevel(self, i=-2, j=-1, axis=0) -> "DataFrame": result.columns = result.columns.swaplevel(i, j) return result - def reorder_levels(self, order, axis=0) -> "DataFrame": + def reorder_levels(self, order, axis=0) -> DataFrame: """ Rearrange index levels using input order. May not drop or duplicate levels. @@ -5894,7 +5894,7 @@ def reorder_levels(self, order, axis=0) -> "DataFrame": # ---------------------------------------------------------------------- # Arithmetic / combination related - def _combine_frame(self, other: "DataFrame", func, fill_value=None): + def _combine_frame(self, other: DataFrame, func, fill_value=None): # at this point we have `self._indexed_same(other)` if fill_value is None: @@ -5914,7 +5914,7 @@ def _arith_op(left, right): new_data = ops.dispatch_to_series(self, other, _arith_op) return new_data - def _construct_result(self, result) -> "DataFrame": + def _construct_result(self, result) -> DataFrame: """ Wrap the result of an arithmetic, comparison, or logical operation. @@ -6031,11 +6031,11 @@ def _construct_result(self, result) -> "DataFrame": @Appender(_shared_docs["compare"] % _shared_doc_kwargs) def compare( self, - other: "DataFrame", + other: DataFrame, align_axis: Axis = 1, keep_shape: bool = False, keep_equal: bool = False, - ) -> "DataFrame": + ) -> DataFrame: return super().compare( other=other, align_axis=align_axis, @@ -6044,8 +6044,8 @@ def compare( ) def combine( - self, other: "DataFrame", func, fill_value=None, overwrite=True - ) -> "DataFrame": + self, other: DataFrame, func, fill_value=None, overwrite=True + ) -> DataFrame: """ Perform column-wise combine with another DataFrame. @@ -6212,7 +6212,7 @@ def combine( # convert_objects just in case return self._constructor(result, index=new_index, columns=new_columns) - def combine_first(self, other: "DataFrame") -> "DataFrame": + def combine_first(self, other: DataFrame) -> DataFrame: """ Update null elements with value in the same location in `other`. @@ -6718,7 +6718,7 @@ def groupby( @Substitution("") @Appender(_shared_docs["pivot"]) - def pivot(self, index=None, columns=None, values=None) -> "DataFrame": + def pivot(self, index=None, columns=None, values=None) -> DataFrame: from pandas.core.reshape.pivot import pivot return pivot(self, index=index, columns=columns, values=values) @@ -6870,7 +6870,7 @@ def pivot_table( dropna=True, margins_name="All", observed=False, - ) -> "DataFrame": + ) -> DataFrame: from pandas.core.reshape.pivot import pivot_table return pivot_table( @@ -7056,7 +7056,7 @@ def stack(self, level=-1, dropna=True): def explode( self, column: Union[str, Tuple], ignore_index: bool = False - ) -> "DataFrame": + ) -> DataFrame: """ Transform each element of a list-like to a row, replicating index values. 
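A quick sketch of the explode behavior summarized above (independent of the annotation change in this patch):

    import pandas as pd

    df = pd.DataFrame({"A": [[1, 2], [3]], "B": ["x", "y"]})
    df.explode("A")
    # each list element in "A" becomes its own row, with the index
    # value and the "B" value repeated:
    #    A  B
    # 0  1  x
    # 0  2  x
    # 1  3  y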
@@ -7211,7 +7211,7 @@ def melt( value_name="value", col_level=None, ignore_index=True, - ) -> "DataFrame": + ) -> DataFrame: return melt( self, @@ -7299,7 +7299,7 @@ def melt( 1 255.0""" ), ) - def diff(self, periods: int = 1, axis: Axis = 0) -> "DataFrame": + def diff(self, periods: int = 1, axis: Axis = 0) -> DataFrame: bm_axis = self._get_block_manager_axis(axis) self._consolidate_inplace() @@ -7462,7 +7462,7 @@ def _aggregate(self, arg, axis=0, *args, **kwargs): klass=_shared_doc_kwargs["klass"], axis=_shared_doc_kwargs["axis"], ) - def transform(self, func, axis=0, *args, **kwargs) -> "DataFrame": + def transform(self, func, axis=0, *args, **kwargs) -> DataFrame: axis = self._get_axis_number(axis) if axis == 1: return self.T.transform(func, *args, **kwargs).T @@ -7616,7 +7616,7 @@ def apply(self, func, axis=0, raw=False, result_type=None, args=(), **kwds): ) return op.get_result() - def applymap(self, func) -> "DataFrame": + def applymap(self, func) -> DataFrame: """ Apply a function to a Dataframe elementwise. @@ -7678,7 +7678,7 @@ def infer(x): def append( self, other, ignore_index=False, verify_integrity=False, sort=False - ) -> "DataFrame": + ) -> DataFrame: """ Append rows of `other` to the end of caller, returning a new object. @@ -7818,7 +7818,7 @@ def append( def join( self, other, on=None, how="left", lsuffix="", rsuffix="", sort=False - ) -> "DataFrame": + ) -> DataFrame: """ Join columns of another DataFrame. @@ -8009,7 +8009,7 @@ def merge( copy=True, indicator=False, validate=None, - ) -> "DataFrame": + ) -> DataFrame: from pandas.core.reshape.merge import merge return merge( @@ -8028,7 +8028,7 @@ def merge( validate=validate, ) - def round(self, decimals=0, *args, **kwargs) -> "DataFrame": + def round(self, decimals=0, *args, **kwargs) -> DataFrame: """ Round a DataFrame to a variable number of decimal places. @@ -8142,7 +8142,7 @@ def _series_round(s, decimals): # ---------------------------------------------------------------------- # Statistical methods, etc. - def corr(self, method="pearson", min_periods=1) -> "DataFrame": + def corr(self, method="pearson", min_periods=1) -> DataFrame: """ Compute pairwise correlation of columns, excluding NA/null values. @@ -8233,7 +8233,7 @@ def corr(self, method="pearson", min_periods=1) -> "DataFrame": def cov( self, min_periods: Optional[int] = None, ddof: Optional[int] = 1 - ) -> "DataFrame": + ) -> DataFrame: """ Compute pairwise covariance of columns, excluding NA/null values. @@ -8636,7 +8636,7 @@ def func(values): else: return op(values, axis=axis, skipna=skipna, **kwds) - def _get_data(axis_matters: bool) -> "DataFrame": + def _get_data(axis_matters: bool) -> DataFrame: if filter_type is None: data = self._get_numeric_data() elif filter_type == "bool": @@ -8937,7 +8937,7 @@ def _get_agg_axis(self, axis_num: int) -> Index: else: raise ValueError(f"Axis must be 0 or 1 (got {repr(axis_num)})") - def mode(self, axis=0, numeric_only=False, dropna=True) -> "DataFrame": + def mode(self, axis=0, numeric_only=False, dropna=True) -> DataFrame: """ Get the mode(s) of each element along the selected axis. @@ -9122,7 +9122,7 @@ def quantile(self, q=0.5, axis=0, numeric_only=True, interpolation="linear"): def to_timestamp( self, freq=None, how: str = "start", axis: Axis = 0, copy: bool = True - ) -> "DataFrame": + ) -> DataFrame: """ Cast to DatetimeIndex of timestamps, at *beginning* of period. 
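The unquoted forward references in these hunks only work because the annotations are never evaluated at definition time; the enabling mechanism is outside the hunks shown here. A minimal sketch of the idea, assuming PEP 563 postponed evaluation via the __future__ import:

    from __future__ import annotations  # annotations are stored as strings, not evaluated

    from typing import Type

    class DataFrame:
        @property
        def _constructor(self) -> Type[DataFrame]:
            # without postponed evaluation, DataFrame is not yet bound while the
            # class body executes, so this unquoted annotation would raise NameError
            return DataFrame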
@@ -9151,7 +9151,7 @@ def to_timestamp( setattr(new_obj, axis_name, new_ax) return new_obj - def to_period(self, freq=None, axis: Axis = 0, copy: bool = True) -> "DataFrame": + def to_period(self, freq=None, axis: Axis = 0, copy: bool = True) -> DataFrame: """ Convert DataFrame from DatetimeIndex to PeriodIndex. @@ -9180,7 +9180,7 @@ def to_period(self, freq=None, axis: Axis = 0, copy: bool = True) -> "DataFrame" setattr(new_obj, axis_name, new_ax) return new_obj - def isin(self, values) -> "DataFrame": + def isin(self, values) -> DataFrame: """ Whether each element in the DataFrame is contained in values. @@ -9287,10 +9287,10 @@ def isin(self, values) -> "DataFrame": _info_axis_number = 1 _info_axis_name = "columns" - index: "Index" = properties.AxisProperty( + index: Index = properties.AxisProperty( axis=1, doc="The index (row labels) of the DataFrame." ) - columns: "Index" = properties.AxisProperty( + columns: Index = properties.AxisProperty( axis=0, doc="The column labels of the DataFrame." ) From e7668950713670b86f0ac024147908376d3b79be Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 5 Sep 2020 10:53:21 -0700 Subject: [PATCH 30/71] STY+CI: check for private function access across modules (#36144) --- Makefile | 7 +++ ci/code_checks.sh | 8 ++++ pandas/_libs/algos.pyx | 14 +++--- pandas/core/algorithms.py | 8 ++-- pandas/core/arrays/sparse/array.py | 2 +- pandas/core/internals/blocks.py | 2 +- pandas/core/missing.py | 2 +- pandas/plotting/_matplotlib/compat.py | 10 ++-- pandas/plotting/_matplotlib/core.py | 4 +- pandas/plotting/_matplotlib/tools.py | 2 +- pandas/tests/plotting/common.py | 8 ++-- pandas/tests/plotting/test_frame.py | 4 +- pandas/tests/plotting/test_misc.py | 4 +- scripts/validate_unwanted_patterns.py | 69 +++++++++++++++++++++++++-- 14 files changed, 111 insertions(+), 33 deletions(-) diff --git a/Makefile b/Makefile index f26689ab65ba5..4a9a48992f92f 100644 --- a/Makefile +++ b/Makefile @@ -25,3 +25,10 @@ doc: cd doc; \ python make.py clean; \ python make.py html + +check: + python3 scripts/validate_unwanted_patterns.py \ + --validation-type="private_function_across_module" \ + --included-file-extensions="py" \ + --excluded-file-paths=pandas/tests,asv_bench/,pandas/_vendored \ + pandas/ diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 8ee579cd25203..875f1dbb83ce3 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -116,6 +116,14 @@ if [[ -z "$CHECK" || "$CHECK" == "lint" ]]; then fi RET=$(($RET + $?)) ; echo $MSG "DONE" + MSG='Check for use of private module attribute access' ; echo $MSG + if [[ "$GITHUB_ACTIONS" == "true" ]]; then + $BASE_DIR/scripts/validate_unwanted_patterns.py --validation-type="private_function_across_module" --included-file-extensions="py" --excluded-file-paths=pandas/tests,asv_bench/,pandas/_vendored --format="##[error]{source_path}:{line_number}:{msg}" pandas/ + else + $BASE_DIR/scripts/validate_unwanted_patterns.py --validation-type="private_function_across_module" --included-file-extensions="py" --excluded-file-paths=pandas/tests,asv_bench/,pandas/_vendored pandas/ + fi + RET=$(($RET + $?)) ; echo $MSG "DONE" + echo "isort --version-number" isort --version-number diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 0a70afda893cf..c4723a5f064c7 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -412,7 +412,7 @@ ctypedef fused algos_t: uint8_t -def _validate_limit(nobs: int, limit=None) -> int: +def validate_limit(nobs: int, limit=None) -> int: """ Check that the `limit` argument is a positive 
integer. @@ -452,7 +452,7 @@ def pad(ndarray[algos_t] old, ndarray[algos_t] new, limit=None): indexer = np.empty(nright, dtype=np.int64) indexer[:] = -1 - lim = _validate_limit(nright, limit) + lim = validate_limit(nright, limit) if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: return indexer @@ -509,7 +509,7 @@ def pad_inplace(algos_t[:] values, const uint8_t[:] mask, limit=None): if N == 0: return - lim = _validate_limit(N, limit) + lim = validate_limit(N, limit) val = values[0] for i in range(N): @@ -537,7 +537,7 @@ def pad_2d_inplace(algos_t[:, :] values, const uint8_t[:, :] mask, limit=None): if N == 0: return - lim = _validate_limit(N, limit) + lim = validate_limit(N, limit) for j in range(K): fill_count = 0 @@ -593,7 +593,7 @@ def backfill(ndarray[algos_t] old, ndarray[algos_t] new, limit=None) -> ndarray: indexer = np.empty(nright, dtype=np.int64) indexer[:] = -1 - lim = _validate_limit(nright, limit) + lim = validate_limit(nright, limit) if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: return indexer @@ -651,7 +651,7 @@ def backfill_inplace(algos_t[:] values, const uint8_t[:] mask, limit=None): if N == 0: return - lim = _validate_limit(N, limit) + lim = validate_limit(N, limit) val = values[N - 1] for i in range(N - 1, -1, -1): @@ -681,7 +681,7 @@ def backfill_2d_inplace(algos_t[:, :] values, if N == 0: return - lim = _validate_limit(N, limit) + lim = validate_limit(N, limit) for j in range(K): fill_count = 0 diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index f297c7165208f..50ec3714f454b 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -60,7 +60,7 @@ from pandas.core.indexers import validate_indices if TYPE_CHECKING: - from pandas import Categorical, DataFrame, Series + from pandas import Categorical, DataFrame, Series # noqa:F401 _shared_docs: Dict[str, str] = {} @@ -767,7 +767,7 @@ def value_counts( counts = result._values else: - keys, counts = _value_counts_arraylike(values, dropna) + keys, counts = value_counts_arraylike(values, dropna) result = Series(counts, index=keys, name=name) @@ -780,8 +780,8 @@ def value_counts( return result -# Called once from SparseArray -def _value_counts_arraylike(values, dropna: bool): +# Called once from SparseArray, otherwise could be private +def value_counts_arraylike(values, dropna: bool): """ Parameters ---------- diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 1531f7b292365..47c960dc969d6 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -735,7 +735,7 @@ def value_counts(self, dropna=True): """ from pandas import Index, Series - keys, counts = algos._value_counts_arraylike(self.sp_values, dropna=dropna) + keys, counts = algos.value_counts_arraylike(self.sp_values, dropna=dropna) fcounts = self.sp_index.ngaps if fcounts > 0: if self._null_fill_value and dropna: diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 3bcd4debbf41a..9f4e535dc787d 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -390,7 +390,7 @@ def fillna( mask = isna(self.values) if limit is not None: - limit = libalgos._validate_limit(None, limit=limit) + limit = libalgos.validate_limit(None, limit=limit) mask[mask.cumsum(self.ndim - 1) > limit] = False if not self._can_hold_na: diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 7802c5cbdbfb3..be66b19d10064 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -228,7 +228,7 @@ def 
interpolate_1d( ) # default limit is unlimited GH #16282 - limit = algos._validate_limit(nobs=None, limit=limit) + limit = algos.validate_limit(nobs=None, limit=limit) # These are sets of index pointers to invalid values... i.e. {0, 1, etc... all_nans = set(np.flatnonzero(invalid)) diff --git a/pandas/plotting/_matplotlib/compat.py b/pandas/plotting/_matplotlib/compat.py index 7f107f18eca25..964596d9b6319 100644 --- a/pandas/plotting/_matplotlib/compat.py +++ b/pandas/plotting/_matplotlib/compat.py @@ -17,8 +17,8 @@ def inner(): return inner -_mpl_ge_2_2_3 = _mpl_version("2.2.3", operator.ge) -_mpl_ge_3_0_0 = _mpl_version("3.0.0", operator.ge) -_mpl_ge_3_1_0 = _mpl_version("3.1.0", operator.ge) -_mpl_ge_3_2_0 = _mpl_version("3.2.0", operator.ge) -_mpl_ge_3_3_0 = _mpl_version("3.3.0", operator.ge) +mpl_ge_2_2_3 = _mpl_version("2.2.3", operator.ge) +mpl_ge_3_0_0 = _mpl_version("3.0.0", operator.ge) +mpl_ge_3_1_0 = _mpl_version("3.1.0", operator.ge) +mpl_ge_3_2_0 = _mpl_version("3.2.0", operator.ge) +mpl_ge_3_3_0 = _mpl_version("3.3.0", operator.ge) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index def4a1dc3f5c4..8275c0991e464 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -29,7 +29,7 @@ import pandas.core.common as com from pandas.io.formats.printing import pprint_thing -from pandas.plotting._matplotlib.compat import _mpl_ge_3_0_0 +from pandas.plotting._matplotlib.compat import mpl_ge_3_0_0 from pandas.plotting._matplotlib.converter import register_pandas_matplotlib_converters from pandas.plotting._matplotlib.style import get_standard_colors from pandas.plotting._matplotlib.timeseries import ( @@ -944,7 +944,7 @@ def _plot_colorbar(self, ax: "Axes", **kwds): img = ax.collections[-1] cbar = self.fig.colorbar(img, ax=ax, **kwds) - if _mpl_ge_3_0_0(): + if mpl_ge_3_0_0(): # The workaround below is no longer necessary. 
return diff --git a/pandas/plotting/_matplotlib/tools.py b/pandas/plotting/_matplotlib/tools.py index 98aaab6838fba..c5b44f37150bb 100644 --- a/pandas/plotting/_matplotlib/tools.py +++ b/pandas/plotting/_matplotlib/tools.py @@ -307,7 +307,7 @@ def handle_shared_axes( sharey: bool, ): if nplots > 1: - if compat._mpl_ge_3_2_0(): + if compat.mpl_ge_3_2_0(): row_num = lambda x: x.get_subplotspec().rowspan.start col_num = lambda x: x.get_subplotspec().colspan.start else: diff --git a/pandas/tests/plotting/common.py b/pandas/tests/plotting/common.py index b753c96af6290..9301a29933d45 100644 --- a/pandas/tests/plotting/common.py +++ b/pandas/tests/plotting/common.py @@ -28,10 +28,10 @@ def setup_method(self, method): mpl.rcdefaults() - self.mpl_ge_2_2_3 = compat._mpl_ge_2_2_3() - self.mpl_ge_3_0_0 = compat._mpl_ge_3_0_0() - self.mpl_ge_3_1_0 = compat._mpl_ge_3_1_0() - self.mpl_ge_3_2_0 = compat._mpl_ge_3_2_0() + self.mpl_ge_2_2_3 = compat.mpl_ge_2_2_3() + self.mpl_ge_3_0_0 = compat.mpl_ge_3_0_0() + self.mpl_ge_3_1_0 = compat.mpl_ge_3_1_0() + self.mpl_ge_3_2_0 = compat.mpl_ge_3_2_0() self.bp_n_objects = 7 self.polycollection_factor = 2 diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index 3b3902647390d..d2b22c7a4c2e3 100644 --- a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -51,7 +51,7 @@ def _assert_xtickslabels_visibility(self, axes, expected): @pytest.mark.xfail(reason="Waiting for PR 34334", strict=True) @pytest.mark.slow def test_plot(self): - from pandas.plotting._matplotlib.compat import _mpl_ge_3_1_0 + from pandas.plotting._matplotlib.compat import mpl_ge_3_1_0 df = self.tdf _check_plot_works(df.plot, grid=False) @@ -69,7 +69,7 @@ def test_plot(self): self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) df = DataFrame({"x": [1, 2], "y": [3, 4]}) - if _mpl_ge_3_1_0(): + if mpl_ge_3_1_0(): msg = "'Line2D' object has no property 'blarg'" else: msg = "Unknown property blarg" diff --git a/pandas/tests/plotting/test_misc.py b/pandas/tests/plotting/test_misc.py index 130acaa8bcd58..0208ab3e0225b 100644 --- a/pandas/tests/plotting/test_misc.py +++ b/pandas/tests/plotting/test_misc.py @@ -96,7 +96,7 @@ def test_bootstrap_plot(self): class TestDataFramePlots(TestPlotBase): @td.skip_if_no_scipy def test_scatter_matrix_axis(self): - from pandas.plotting._matplotlib.compat import _mpl_ge_3_0_0 + from pandas.plotting._matplotlib.compat import mpl_ge_3_0_0 scatter_matrix = plotting.scatter_matrix @@ -105,7 +105,7 @@ def test_scatter_matrix_axis(self): # we are plotting multiples on a sub-plot with tm.assert_produces_warning( - UserWarning, raise_on_extra_warnings=_mpl_ge_3_0_0() + UserWarning, raise_on_extra_warnings=mpl_ge_3_0_0() ): axes = _check_plot_works( scatter_matrix, filterwarnings="always", frame=df, range_padding=0.1 diff --git a/scripts/validate_unwanted_patterns.py b/scripts/validate_unwanted_patterns.py index 193fef026a96b..1a6d8cc8b9914 100755 --- a/scripts/validate_unwanted_patterns.py +++ b/scripts/validate_unwanted_patterns.py @@ -16,9 +16,7 @@ import sys import token import tokenize -from typing import IO, Callable, FrozenSet, Iterable, List, Tuple - -PATHS_TO_IGNORE: Tuple[str, ...] 
= ("asv_bench/env",)
+from typing import IO, Callable, FrozenSet, Iterable, List, Set, Tuple
 
 
 def _get_literal_string_prefix_len(token_string: str) -> int:
@@ -114,6 +112,58 @@ def bare_pytest_raises(file_obj: IO[str]) -> Iterable[Tuple[int, str]]:
         )
 
 
+PRIVATE_FUNCTIONS_ALLOWED = {"sys._getframe"}  # no known alternative
+
+
+def private_function_across_module(file_obj: IO[str]) -> Iterable[Tuple[int, str]]:
+    """
+    Checking that a private function is not used across modules.
+    Parameters
+    ----------
+    file_obj : IO
+        File-like object containing the Python code to validate.
+    Yields
+    ------
+    line_number : int
+        Line number of the private function that is used across modules.
+    msg : str
+        Explanation of the error.
+    """
+    contents = file_obj.read()
+    tree = ast.parse(contents)
+
+    imported_modules: Set[str] = set()
+
+    for node in ast.walk(tree):
+        if isinstance(node, (ast.Import, ast.ImportFrom)):
+            for module in node.names:
+                module_fqdn = module.name if module.asname is None else module.asname
+                imported_modules.add(module_fqdn)
+
+        if not isinstance(node, ast.Call):
+            continue
+
+        try:
+            module_name = node.func.value.id
+            function_name = node.func.attr
+        except AttributeError:
+            continue
+
+        # Exception section #
+
+        # (Debatable) Class case
+        if module_name[0].isupper():
+            continue
+        # (Debatable) Dunder methods case
+        elif function_name.startswith("__") and function_name.endswith("__"):
+            continue
+        elif module_name + "." + function_name in PRIVATE_FUNCTIONS_ALLOWED:
+            continue
+
+        if module_name in imported_modules and function_name.startswith("_"):
+            yield (node.lineno, f"Private function '{module_name}.{function_name}'")
+
+
 def strings_to_concatenate(file_obj: IO[str]) -> Iterable[Tuple[int, str]]:
     """
     This test case is necessary after 'Black' (https://github.com/psf/black),
@@ -293,6 +343,7 @@ def main(
     source_path: str,
     output_format: str,
     file_extensions_to_check: str,
+    excluded_file_paths: str,
 ) -> bool:
     """
     Main entry point of the script.
@@ -305,6 +356,10 @@ def main(
         Source path representing path to a file/directory.
     output_format : str
         Output format of the error message.
+    file_extensions_to_check : str
+        Comma separated values of what file extensions to check.
+    excluded_file_paths : str
+        Comma separated values of what file paths to exclude during the check.
 
     Returns
     -------
@@ -325,6 +380,7 @@ def main(
     FILE_EXTENSIONS_TO_CHECK: FrozenSet[str] = frozenset(
         file_extensions_to_check.split(",")
     )
+    PATHS_TO_IGNORE = frozenset(excluded_file_paths.split(","))
 
     if os.path.isfile(source_path):
         file_path = source_path
@@ -362,6 +418,7 @@ def main(
 if __name__ == "__main__":
     available_validation_types: List[str] = [
         "bare_pytest_raises",
+        "private_function_across_module",
         "strings_to_concatenate",
         "strings_with_wrong_placed_whitespace",
     ]
@@ -389,6 +446,11 @@ def main(
         default="py,pyx,pxd,pxi",
         help="Comma separated file extensions to check.",
     )
+    parser.add_argument(
+        "--excluded-file-paths",
+        default="asv_bench/env",
+        help="Comma separated file paths to exclude during the check.",
+    )
 
     args = parser.parse_args()
 
@@ -398,5 +460,6 @@ def main(
             source_path=args.path,
             output_format=args.format,
             file_extensions_to_check=args.included_file_extensions,
+            excluded_file_paths=args.excluded_file_paths,
         )
     )
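The effect of the new check, sketched against the rename this patch performs (an illustration, not part of the patch; value_counts_arraylike is an internal helper, used here only to show what the checker flags):

    import numpy as np

    import pandas.core.algorithms as algorithms

    values = np.array([1, 1, 2], dtype=np.int64)

    # flagged by private_function_across_module: a leading-underscore function
    # reached through an imported module
    # keys, counts = algorithms._value_counts_arraylike(values, dropna=True)

    # fine after the renames in this patch: the helper is now public
    keys, counts = algorithms.value_counts_arraylike(values, dropna=True)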
From 65407bcee68ca604e2cba85039ef79ebce9c1254 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Sat, 5 Sep 2020 10:54:30 -0700
Subject: [PATCH 31/71] CLN: unused case in compare_or_regex_search (#36143)

---
 pandas/core/array_algos/replace.py | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/pandas/core/array_algos/replace.py b/pandas/core/array_algos/replace.py
index 6ac3cc1f9f2fe..09f9aefd64096 100644
--- a/pandas/core/array_algos/replace.py
+++ b/pandas/core/array_algos/replace.py
@@ -3,7 +3,7 @@
 """
 import operator
 import re
-from typing import Optional, Pattern, Union
+from typing import Pattern, Union
 
 import numpy as np
 
@@ -14,14 +14,10 @@
     is_numeric_v_string_like,
     is_scalar,
 )
-from pandas.core.dtypes.missing import isna
 
 
 def compare_or_regex_search(
-    a: ArrayLike,
-    b: Union[Scalar, Pattern],
-    regex: bool = False,
-    mask: Optional[ArrayLike] = None,
+    a: ArrayLike, b: Union[Scalar, Pattern], regex: bool, mask: ArrayLike,
 ) -> Union[ArrayLike, bool]:
     """
     Compare two array_like inputs of the same shape or two scalar values
@@ -33,8 +29,8 @@ def compare_or_regex_search(
     ----------
     a : array_like
     b : scalar or regex pattern
-    regex : bool, default False
-    mask : array_like or None (default)
+    regex : bool
+    mask : array_like
 
     Returns
     -------
@@ -68,8 +64,6 @@ def _check_comparison_types(
         )
 
     # GH#32621 use mask to avoid comparing to NAs
-    if mask is None and isinstance(a, np.ndarray) and not isinstance(b, np.ndarray):
-        mask = np.reshape(~(isna(a)), a.shape)
     if isinstance(a, np.ndarray):
         a = a[mask]
 
From 238de4c4c3075351faacb74a392273884951474b Mon Sep 17 00:00:00 2001
From: Matthew Roeschke
Date: Sat, 5 Sep 2020 12:40:06 -0700
Subject: [PATCH 32/71] REF: window/test_dtypes.py with pytest idioms (#35918)

---
 pandas/tests/window/conftest.py    |  31 +++
 pandas/tests/window/test_dtypes.py | 315 ++++++++---------------------
 2 files changed, 118 insertions(+), 228 deletions(-)

diff --git a/pandas/tests/window/conftest.py b/pandas/tests/window/conftest.py
index eb8252d5731be..7f03fa2a5ea0d 100644
--- a/pandas/tests/window/conftest.py
+++ b/pandas/tests/window/conftest.py
@@ -308,3 +308,34 @@ def which(request):
 def halflife_with_times(request):
     """Halflife argument for EWM when times is specified."""
     return request.param
+
+
+@pytest.fixture(
+    params=[
+        "object",
+        "category",
+        "int8",
+        "int16",
+        "int32",
+        "int64",
+        "uint8",
+        "uint16",
+        "uint32",
+        "uint64",
+        "float16",
+        "float32",
+        "float64",
+        "m8[ns]",
+        "M8[ns]",
+        pytest.param(
+            "datetime64[ns, UTC]",
+            marks=pytest.mark.skip(
+                "direct creation of extension dtype datetime64[ns, 
UTC] " + "is not supported ATM" + ), + ), + ] +) +def dtypes(request): + """Dtypes for window tests""" + return request.param diff --git a/pandas/tests/window/test_dtypes.py b/pandas/tests/window/test_dtypes.py index 0aa5bf019ff5e..245b48b351684 100644 --- a/pandas/tests/window/test_dtypes.py +++ b/pandas/tests/window/test_dtypes.py @@ -1,5 +1,3 @@ -from itertools import product - import numpy as np import pytest @@ -10,234 +8,95 @@ # gh-12373 : rolling functions error on float32 data # make sure rolling functions works for different dtypes # -# NOTE that these are yielded tests and so _create_data -# is explicitly called. -# # further note that we are only checking rolling for fully dtype # compliance (though both expanding and ewm inherit) -class Dtype: - window = 2 - - funcs = { - "count": lambda v: v.count(), - "max": lambda v: v.max(), - "min": lambda v: v.min(), - "sum": lambda v: v.sum(), - "mean": lambda v: v.mean(), - "std": lambda v: v.std(), - "var": lambda v: v.var(), - "median": lambda v: v.median(), - } - - def get_expects(self): - expects = { - "sr1": { - "count": Series([1, 2, 2, 2, 2], dtype="float64"), - "max": Series([np.nan, 1, 2, 3, 4], dtype="float64"), - "min": Series([np.nan, 0, 1, 2, 3], dtype="float64"), - "sum": Series([np.nan, 1, 3, 5, 7], dtype="float64"), - "mean": Series([np.nan, 0.5, 1.5, 2.5, 3.5], dtype="float64"), - "std": Series([np.nan] + [np.sqrt(0.5)] * 4, dtype="float64"), - "var": Series([np.nan, 0.5, 0.5, 0.5, 0.5], dtype="float64"), - "median": Series([np.nan, 0.5, 1.5, 2.5, 3.5], dtype="float64"), +def get_dtype(dtype, coerce_int=None): + if coerce_int is False and "int" in dtype: + return None + if dtype != "category": + return np.dtype(dtype) + return dtype + + +@pytest.mark.parametrize( + "method, data, expected_data, coerce_int", + [ + ("count", np.arange(5), [1, 2, 2, 2, 2], True), + ("count", np.arange(10, 0, -2), [1, 2, 2, 2, 2], True), + ("count", [0, 1, 2, np.nan, 4], [1, 2, 2, 1, 1], False), + ("max", np.arange(5), [np.nan, 1, 2, 3, 4], True), + ("max", np.arange(10, 0, -2), [np.nan, 10, 8, 6, 4], True), + ("max", [0, 1, 2, np.nan, 4], [np.nan, 1, 2, np.nan, np.nan], False), + ("min", np.arange(5), [np.nan, 0, 1, 2, 3], True), + ("min", np.arange(10, 0, -2), [np.nan, 8, 6, 4, 2], True), + ("min", [0, 1, 2, np.nan, 4], [np.nan, 0, 1, np.nan, np.nan], False), + ("sum", np.arange(5), [np.nan, 1, 3, 5, 7], True), + ("sum", np.arange(10, 0, -2), [np.nan, 18, 14, 10, 6], True), + ("sum", [0, 1, 2, np.nan, 4], [np.nan, 1, 3, np.nan, np.nan], False), + ("mean", np.arange(5), [np.nan, 0.5, 1.5, 2.5, 3.5], True), + ("mean", np.arange(10, 0, -2), [np.nan, 9, 7, 5, 3], True), + ("mean", [0, 1, 2, np.nan, 4], [np.nan, 0.5, 1.5, np.nan, np.nan], False), + ("std", np.arange(5), [np.nan] + [np.sqrt(0.5)] * 4, True), + ("std", np.arange(10, 0, -2), [np.nan] + [np.sqrt(2)] * 4, True), + ( + "std", + [0, 1, 2, np.nan, 4], + [np.nan] + [np.sqrt(0.5)] * 2 + [np.nan] * 2, + False, + ), + ("var", np.arange(5), [np.nan, 0.5, 0.5, 0.5, 0.5], True), + ("var", np.arange(10, 0, -2), [np.nan, 2, 2, 2, 2], True), + ("var", [0, 1, 2, np.nan, 4], [np.nan, 0.5, 0.5, np.nan, np.nan], False), + ("median", np.arange(5), [np.nan, 0.5, 1.5, 2.5, 3.5], True), + ("median", np.arange(10, 0, -2), [np.nan, 9, 7, 5, 3], True), + ("median", [0, 1, 2, np.nan, 4], [np.nan, 0.5, 1.5, np.nan, np.nan], False), + ], +) +def test_series_dtypes(method, data, expected_data, coerce_int, dtypes): + s = Series(data, dtype=get_dtype(dtypes, coerce_int=coerce_int)) + if dtypes in ("m8[ns]", 
"M8[ns]") and method != "count": + msg = "No numeric types to aggregate" + with pytest.raises(DataError, match=msg): + getattr(s.rolling(2), method)() + else: + result = getattr(s.rolling(2), method)() + expected = Series(expected_data, dtype="float64") + tm.assert_almost_equal(result, expected) + + +@pytest.mark.parametrize( + "method, expected_data", + [ + ("count", {0: Series([1, 2, 2, 2, 2]), 1: Series([1, 2, 2, 2, 2])}), + ("max", {0: Series([np.nan, 2, 4, 6, 8]), 1: Series([np.nan, 3, 5, 7, 9])}), + ("min", {0: Series([np.nan, 0, 2, 4, 6]), 1: Series([np.nan, 1, 3, 5, 7])}), + ( + "sum", + {0: Series([np.nan, 2, 6, 10, 14]), 1: Series([np.nan, 4, 8, 12, 16])}, + ), + ("mean", {0: Series([np.nan, 1, 3, 5, 7]), 1: Series([np.nan, 2, 4, 6, 8])}), + ( + "std", + { + 0: Series([np.nan] + [np.sqrt(2)] * 4), + 1: Series([np.nan] + [np.sqrt(2)] * 4), }, - "sr2": { - "count": Series([1, 2, 2, 2, 2], dtype="float64"), - "max": Series([np.nan, 10, 8, 6, 4], dtype="float64"), - "min": Series([np.nan, 8, 6, 4, 2], dtype="float64"), - "sum": Series([np.nan, 18, 14, 10, 6], dtype="float64"), - "mean": Series([np.nan, 9, 7, 5, 3], dtype="float64"), - "std": Series([np.nan] + [np.sqrt(2)] * 4, dtype="float64"), - "var": Series([np.nan, 2, 2, 2, 2], dtype="float64"), - "median": Series([np.nan, 9, 7, 5, 3], dtype="float64"), - }, - "sr3": { - "count": Series([1, 2, 2, 1, 1], dtype="float64"), - "max": Series([np.nan, 1, 2, np.nan, np.nan], dtype="float64"), - "min": Series([np.nan, 0, 1, np.nan, np.nan], dtype="float64"), - "sum": Series([np.nan, 1, 3, np.nan, np.nan], dtype="float64"), - "mean": Series([np.nan, 0.5, 1.5, np.nan, np.nan], dtype="float64"), - "std": Series( - [np.nan] + [np.sqrt(0.5)] * 2 + [np.nan] * 2, dtype="float64" - ), - "var": Series([np.nan, 0.5, 0.5, np.nan, np.nan], dtype="float64"), - "median": Series([np.nan, 0.5, 1.5, np.nan, np.nan], dtype="float64"), - }, - "df": { - "count": DataFrame( - {0: Series([1, 2, 2, 2, 2]), 1: Series([1, 2, 2, 2, 2])}, - dtype="float64", - ), - "max": DataFrame( - {0: Series([np.nan, 2, 4, 6, 8]), 1: Series([np.nan, 3, 5, 7, 9])}, - dtype="float64", - ), - "min": DataFrame( - {0: Series([np.nan, 0, 2, 4, 6]), 1: Series([np.nan, 1, 3, 5, 7])}, - dtype="float64", - ), - "sum": DataFrame( - { - 0: Series([np.nan, 2, 6, 10, 14]), - 1: Series([np.nan, 4, 8, 12, 16]), - }, - dtype="float64", - ), - "mean": DataFrame( - {0: Series([np.nan, 1, 3, 5, 7]), 1: Series([np.nan, 2, 4, 6, 8])}, - dtype="float64", - ), - "std": DataFrame( - { - 0: Series([np.nan] + [np.sqrt(2)] * 4), - 1: Series([np.nan] + [np.sqrt(2)] * 4), - }, - dtype="float64", - ), - "var": DataFrame( - {0: Series([np.nan, 2, 2, 2, 2]), 1: Series([np.nan, 2, 2, 2, 2])}, - dtype="float64", - ), - "median": DataFrame( - {0: Series([np.nan, 1, 3, 5, 7]), 1: Series([np.nan, 2, 4, 6, 8])}, - dtype="float64", - ), - }, - } - return expects - - def _create_dtype_data(self, dtype): - sr1 = Series(np.arange(5), dtype=dtype) - sr2 = Series(np.arange(10, 0, -2), dtype=dtype) - sr3 = sr1.copy() - sr3[3] = np.NaN - df = DataFrame(np.arange(10).reshape((5, 2)), dtype=dtype) - - data = {"sr1": sr1, "sr2": sr2, "sr3": sr3, "df": df} - - return data - - def _create_data(self): - self.data = self._create_dtype_data(self.dtype) - self.expects = self.get_expects() - - def test_dtypes(self): - self._create_data() - for f_name, d_name in product(self.funcs.keys(), self.data.keys()): - - f = self.funcs[f_name] - d = self.data[d_name] - exp = self.expects[d_name][f_name] - self.check_dtypes(f, f_name, d, d_name, 
exp)
-
-    def check_dtypes(self, f, f_name, d, d_name, exp):
-        roll = d.rolling(window=self.window)
-        result = f(roll)
-        tm.assert_almost_equal(result, exp)
-
-
-class TestDtype_object(Dtype):
-    dtype = object
-
-
-class Dtype_integer(Dtype):
-    pass
-
-
-class TestDtype_int8(Dtype_integer):
-    dtype = np.int8
-
-
-class TestDtype_int16(Dtype_integer):
-    dtype = np.int16
-
-
-class TestDtype_int32(Dtype_integer):
-    dtype = np.int32
-
-
-class TestDtype_int64(Dtype_integer):
-    dtype = np.int64
-
-
-class Dtype_uinteger(Dtype):
-    pass
-
-
-class TestDtype_uint8(Dtype_uinteger):
-    dtype = np.uint8
-
-
-class TestDtype_uint16(Dtype_uinteger):
-    dtype = np.uint16
-
-
-class TestDtype_uint32(Dtype_uinteger):
-    dtype = np.uint32
-
-
-class TestDtype_uint64(Dtype_uinteger):
-    dtype = np.uint64
-
-
-class Dtype_float(Dtype):
-    pass
-
-
-class TestDtype_float16(Dtype_float):
-    dtype = np.float16
-
-
-class TestDtype_float32(Dtype_float):
-    dtype = np.float32
-
-
-class TestDtype_float64(Dtype_float):
-    dtype = np.float64
-
-
-class TestDtype_category(Dtype):
-    dtype = "category"
-    include_df = False
-
-    def _create_dtype_data(self, dtype):
-        sr1 = Series(range(5), dtype=dtype)
-        sr2 = Series(range(10, 0, -2), dtype=dtype)
-
-        data = {"sr1": sr1, "sr2": sr2}
-
-        return data
-
-
-class DatetimeLike(Dtype):
-    def check_dtypes(self, f, f_name, d, d_name, exp):
-
-        roll = d.rolling(window=self.window)
-        if f_name == "count":
-            result = f(roll)
-            tm.assert_almost_equal(result, exp)
-
-        else:
-            msg = "No numeric types to aggregate"
-            with pytest.raises(DataError, match=msg):
-                f(roll)
-
-
-class TestDtype_timedelta(DatetimeLike):
-    dtype = np.dtype("m8[ns]")
-
-
-class TestDtype_datetime(DatetimeLike):
-    dtype = np.dtype("M8[ns]")
-
-
-class TestDtype_datetime64UTC(DatetimeLike):
-    dtype = "datetime64[ns, UTC]"
-
-    def _create_data(self):
-        pytest.skip(
-            "direct creation of extension dtype "
-            "datetime64[ns, UTC] is not supported ATM"
-        )
+def test_dataframe_dtypes(method, expected_data, dtypes):
+    if dtypes == "category":
+        pytest.skip("Category dataframe testing not implemented.")
+    df = DataFrame(np.arange(10).reshape((5, 2)), dtype=get_dtype(dtypes))
+    if dtypes in ("m8[ns]", "M8[ns]") and method != "count":
+        msg = "No numeric types to aggregate"
+        with pytest.raises(DataError, match=msg):
+            getattr(df.rolling(2), method)()
+    else:
+        result = getattr(df.rolling(2), method)()
+        expected = DataFrame(expected_data, dtype="float64")
+        tm.assert_frame_equal(result, expected)
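A small sketch of the dtype contract these tests pin down (illustrative; datetime-like inputs support only count, while numeric inputs aggregate to float64):

    import numpy as np
    import pandas as pd

    s = pd.Series(np.arange(5), dtype="float32")
    s.rolling(2).mean()  # works; the result comes back as float64

    dt = pd.Series(pd.date_range("2020", periods=5))
    dt.rolling(2).count()  # count works for datetime-like values
    # dt.rolling(2).mean()  # would raise DataError: No numeric types to aggregate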
From 13c0dd320191ed5124d63cf076987d5c75bc573e Mon Sep 17 00:00:00 2001
From: Fangchen Li
Date: Sat, 5 Sep 2020 14:50:43 -0500
Subject: [PATCH 33/71] DOC: add userwarning doc about mpl #35684 (#36145)

---
 doc/source/whatsnew/v1.2.0.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index b1229a5d5823d..d7d2e3cf876ca 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -301,7 +301,7 @@ Plotting
 ^^^^^^^^
 
 - Bug in :meth:`DataFrame.plot` where a marker letter in the ``style`` keyword sometimes causes a ``ValueError`` (:issue:`21003`)
--
+- :meth:`DataFrame.plot` and :meth:`Series.plot` raise ``UserWarning`` about usage of FixedFormatter and FixedLocator (:issue:`35684` and :issue:`35945`)
 
 Groupby/resample/rolling
 ^^^^^^^^^^^^^^^^^^^^^^^^
From 435a1d09f61c1f47d51e9fc85ed0386c34073b05 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Sat, 5 Sep 2020 10:53:21 -0700
Subject: [PATCH 34/71] BUG: item_cache invalidation in get_numeric_data (#35882)

---
 doc/source/whatsnew/v1.1.2.rst              |  1 +
 pandas/core/internals/managers.py           |  1 -
 pandas/tests/frame/methods/test_cov_corr.py | 17 +++++++++++++++++
 3 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst
index d1a66256454ca..6935a64c7572f 100644
--- a/doc/source/whatsnew/v1.1.2.rst
+++ b/doc/source/whatsnew/v1.1.2.rst
@@ -36,6 +36,7 @@ Bug fixes
 - Bug in :meth:`Float64Index.__contains__` incorrectly raising ``TypeError`` instead of returning ``False`` (:issue:`35788`)
 - Bug in :meth:`Series.dt.isocalendar` and :meth:`DatetimeIndex.isocalendar` that returned incorrect year for certain dates (:issue:`36032`)
 - Bug in :class:`DataFrame` indexing returning an incorrect :class:`Series` in some cases when the series has been altered and a cache not invalidated (:issue:`33675`)
+- Bug in :meth:`DataFrame.corr` causing subsequent indexing lookups to be incorrect (:issue:`35882`)
 
 .. ---------------------------------------------------------------------------
 
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
index 57a4a8c2ace8a..13bc6a2e82195 100644
--- a/pandas/core/internals/managers.py
+++ b/pandas/core/internals/managers.py
@@ -691,7 +691,6 @@ def get_numeric_data(self, copy: bool = False) -> "BlockManager":
         copy : bool, default False
             Whether to copy the blocks
         """
-        self._consolidate_inplace()
         return self._combine([b for b in self.blocks if b.is_numeric], copy)
 
     def _combine(self: T, blocks: List[Block], copy: bool = True) -> T:
diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py
index d3548b639572d..f307acd8c2178 100644
--- a/pandas/tests/frame/methods/test_cov_corr.py
+++ b/pandas/tests/frame/methods/test_cov_corr.py
@@ -191,6 +191,23 @@ def test_corr_nullable_integer(self, nullable_column, other_column, method):
         expected = pd.DataFrame(np.ones((2, 2)), columns=["a", "b"], index=["a", "b"])
         tm.assert_frame_equal(result, expected)
 
+    def test_corr_item_cache(self):
+        # Check that corr does not lead to incorrect entries in item_cache
+
+        df = pd.DataFrame({"A": range(10)})
+        df["B"] = range(10)[::-1]
+
+        ser = df["A"]  # populate item_cache
+        assert len(df._mgr.blocks) == 2
+
+        _ = df.corr()
+
+        # Check that the corr didn't break link between ser and df
+        ser.values[0] = 99
+        assert df.loc[0, "A"] == 99
+        assert df["A"] is ser
+        assert df.values[0, 0] == 99
+
 
 class TestDataFrameCorrWith:
     def test_corrwith(self, datetime_frame):
From 0d287523dce419be1cda1b18003a7d80d9d618ab Mon Sep 17 00:00:00 2001
From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com>
Date: Sat, 5 Sep 2020 17:18:51 -0400
Subject: [PATCH 35/71] Make MultiIndex.get_loc raise for unhashable type (#35914)

Co-authored-by: Jeff Reback
---
 doc/source/whatsnew/v1.1.2.rst                      | 1 +
 pandas/core/indexes/multi.py                        | 5 +++--
 pandas/tests/frame/indexing/test_indexing.py        | 2 +-
 pandas/tests/indexing/multiindex/test_multiindex.py | 8 ++++++++
 pandas/tests/series/indexing/test_setitem.py        | 11 ++++++++++-
 5 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst
index 6935a64c7572f..c6cfcc6730112 100644
--- a/doc/source/whatsnew/v1.1.2.rst
+++ b/doc/source/whatsnew/v1.1.2.rst
@@ -17,6 +17,7 @@ Fixed regressions
 - Regression in 
:meth:`DatetimeIndex.intersection` incorrectly raising ``AssertionError`` when intersecting against a list (:issue:`35876`) - Fix regression in updating a column inplace (e.g. using ``df['col'].fillna(.., inplace=True)``) (:issue:`35731`) - Performance regression for :meth:`RangeIndex.format` (:issue:`35712`) +- Regression where :meth:`MultiIndex.get_loc` would return a slice spanning the full index when passed an empty list (:issue:`35878`) - Fix regression in invalid cache after an indexing operation; this can manifest when setting which does not update the data (:issue:`35521`) - Regression in :meth:`DataFrame.replace` where a ``TypeError`` would be raised when attempting to replace elements of type :class:`Interval` (:issue:`35931`) - Fix regression in pickle roundtrip of the ``closed`` attribute of :class:`IntervalIndex` (:issue:`35658`) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index f66b009e6d505..080ece8547479 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2725,6 +2725,8 @@ def get_loc(self, key, method=None): "currently supported for MultiIndex" ) + hash(key) + def _maybe_to_slice(loc): """convert integer indexer to boolean mask or slice if possible""" if not isinstance(loc, np.ndarray) or loc.dtype != "int64": @@ -2739,8 +2741,7 @@ def _maybe_to_slice(loc): mask[loc] = True return mask - if not isinstance(key, (tuple, list)): - # not including list here breaks some indexing, xref #30892 + if not isinstance(key, tuple): loc = self._get_level_indexer(key, level=0) return _maybe_to_slice(loc) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index d27487dfb8aaa..e4549dfb3e68d 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -2111,7 +2111,7 @@ def test_type_error_multiindex(self): ) dg = df.pivot_table(index="i", columns="c", values=["x", "y"]) - with pytest.raises(TypeError, match="is an invalid key"): + with pytest.raises(TypeError, match="unhashable type"): dg[:, 0] index = Index(range(2), name="i") diff --git a/pandas/tests/indexing/multiindex/test_multiindex.py b/pandas/tests/indexing/multiindex/test_multiindex.py index 5e5fcd3db88d8..4565d79c632de 100644 --- a/pandas/tests/indexing/multiindex/test_multiindex.py +++ b/pandas/tests/indexing/multiindex/test_multiindex.py @@ -1,4 +1,5 @@ import numpy as np +import pytest import pandas._libs.index as _index from pandas.errors import PerformanceWarning @@ -83,3 +84,10 @@ def test_nested_tuples_duplicates(self): df3 = df.copy(deep=True) df3.loc[[(dti[0], "a")], "c2"] = 1.0 tm.assert_frame_equal(df3, expected) + + def test_multiindex_get_loc_list_raises(self): + # https://github.com/pandas-dev/pandas/issues/35878 + idx = pd.MultiIndex.from_tuples([("a", 1), ("b", 2)]) + msg = "unhashable type" + with pytest.raises(TypeError, match=msg): + idx.get_loc([]) diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index 3463de25ad91b..593d1c78a19e2 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -1,6 +1,7 @@ import numpy as np -from pandas import NaT, Series, date_range +from pandas import MultiIndex, NaT, Series, date_range +import pandas.testing as tm class TestSetitemDT64Values: @@ -17,3 +18,11 @@ def test_setitem_none_nan(self): series[5:7] = np.nan assert series[6] is NaT + + def test_setitem_multiindex_empty_slice(self): + # 
https://github.com/pandas-dev/pandas/issues/35878 + idx = MultiIndex.from_tuples([("a", 1), ("b", 2)]) + result = Series([1, 2], index=idx) + expected = result.copy() + result.loc[[]] = 0 + tm.assert_series_equal(result, expected) From 29c0bc2d858c807d30f5826a84609bfe07176e37 Mon Sep 17 00:00:00 2001 From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> Date: Sat, 5 Sep 2020 18:36:42 -0400 Subject: [PATCH 36/71] ENH: Make explode work for sets (#35637) --- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/_libs/reshape.pyx | 6 ++++-- pandas/core/frame.py | 7 ++++--- pandas/core/series.py | 7 ++++--- pandas/tests/frame/methods/test_explode.py | 8 ++++++++ pandas/tests/series/methods/test_explode.py | 8 ++++++++ 6 files changed, 29 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index d7d2e3cf876ca..ff9e803b4990a 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -103,7 +103,7 @@ Other enhancements - Added :meth:`~DataFrame.set_flags` for setting table-wide flags on a ``Series`` or ``DataFrame`` (:issue:`28394`) - :class:`Index` with object dtype supports division and multiplication (:issue:`34160`) -- +- :meth:`DataFrame.explode` and :meth:`Series.explode` now support exploding of sets (:issue:`35614`) - .. _whatsnew_120.api_breaking.python: diff --git a/pandas/_libs/reshape.pyx b/pandas/_libs/reshape.pyx index 5c6c15fb50fed..75dbb4b74aabd 100644 --- a/pandas/_libs/reshape.pyx +++ b/pandas/_libs/reshape.pyx @@ -124,7 +124,8 @@ def explode(ndarray[object] values): counts = np.zeros(n, dtype='int64') for i in range(n): v = values[i] - if c_is_list_like(v, False): + + if c_is_list_like(v, True): if len(v): counts[i] += len(v) else: @@ -138,8 +139,9 @@ def explode(ndarray[object] values): for i in range(n): v = values[i] - if c_is_list_like(v, False): + if c_is_list_like(v, True): if len(v): + v = list(v) for j in range(len(v)): result[count] = v[j] count += 1 diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 29d6fb9aa7d56..150d6e24dbb86 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7091,10 +7091,11 @@ def explode( Notes ----- - This routine will explode list-likes including lists, tuples, + This routine will explode list-likes including lists, tuples, sets, Series, and np.ndarray. The result dtype of the subset rows will - be object. Scalars will be returned unchanged. Empty list-likes will - result in a np.nan for that row. + be object. Scalars will be returned unchanged, and empty list-likes will + result in a np.nan for that row. In addition, the ordering of rows in the + output will be non-deterministic when exploding sets. Examples -------- diff --git a/pandas/core/series.py b/pandas/core/series.py index d8fdaa2a60252..6cbd93135a2ca 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3829,10 +3829,11 @@ def explode(self, ignore_index: bool = False) -> "Series": Notes ----- - This routine will explode list-likes including lists, tuples, + This routine will explode list-likes including lists, tuples, sets, Series, and np.ndarray. The result dtype of the subset rows will - be object. Scalars will be returned unchanged. Empty list-likes will - result in a np.nan for that row. + be object. Scalars will be returned unchanged, and empty list-likes will + result in a np.nan for that row. In addition, the ordering of elements in + the output will be non-deterministic when exploding sets. 
Examples -------- diff --git a/pandas/tests/frame/methods/test_explode.py b/pandas/tests/frame/methods/test_explode.py index 2bbe8ac2d5b81..bd0901387eeed 100644 --- a/pandas/tests/frame/methods/test_explode.py +++ b/pandas/tests/frame/methods/test_explode.py @@ -172,3 +172,11 @@ def test_ignore_index(): {"id": [0, 0, 10, 10], "values": list("abcd")}, index=[0, 1, 2, 3] ) tm.assert_frame_equal(result, expected) + + +def test_explode_sets(): + # https://github.com/pandas-dev/pandas/issues/35614 + df = pd.DataFrame({"a": [{"x", "y"}], "b": [1]}, index=[1]) + result = df.explode(column="a").sort_values(by="a") + expected = pd.DataFrame({"a": ["x", "y"], "b": [1, 1]}, index=[1, 1]) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/series/methods/test_explode.py b/pandas/tests/series/methods/test_explode.py index 4b65e042f7b02..1f0fbd1cc5ecb 100644 --- a/pandas/tests/series/methods/test_explode.py +++ b/pandas/tests/series/methods/test_explode.py @@ -126,3 +126,11 @@ def test_ignore_index(): result = s.explode(ignore_index=True) expected = pd.Series([1, 2, 3, 4], index=[0, 1, 2, 3], dtype=object) tm.assert_series_equal(result, expected) + + +def test_explode_sets(): + # https://github.com/pandas-dev/pandas/issues/35614 + s = pd.Series([{"a", "b", "c"}], index=[1]) + result = s.explode().sort_values() + expected = pd.Series(["a", "b", "c"], index=[1, 1, 1]) + tm.assert_series_equal(result, expected) From c67b7076957ff43d74de34760c643a8fe3815d2d Mon Sep 17 00:00:00 2001 From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> Date: Sat, 5 Sep 2020 19:13:44 -0400 Subject: [PATCH 37/71] BUG: Don't raise when constructing Series from ordered set (#36054) --- doc/source/whatsnew/v1.1.2.rst | 1 + pandas/core/construction.py | 9 ++++++--- pandas/tests/series/test_constructors.py | 10 ++++++++++ 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index c6cfcc6730112..b8f6d0e52d058 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -35,6 +35,7 @@ Bug fixes - Bug in :meth:`DataFrame.apply` with ``result_type="reduce"`` returning with incorrect index (:issue:`35683`) - Bug in :meth:`DateTimeIndex.format` and :meth:`PeriodIndex.format` with ``name=True`` setting the first item to ``"None"`` where it should be ``""`` (:issue:`35712`) - Bug in :meth:`Float64Index.__contains__` incorrectly raising ``TypeError`` instead of returning ``False`` (:issue:`35788`) +- Bug in :class:`Series` constructor incorrectly raising a ``TypeError`` when passed an ordered set (:issue:`36044`) - Bug in :meth:`Series.dt.isocalendar` and :meth:`DatetimeIndex.isocalendar` that returned incorrect year for certain dates (:issue:`36032`) - Bug in :class:`DataFrame` indexing returning an incorrect :class:`Series` in some cases when the series has been altered and a cache not invalidated (:issue:`33675`) - Bug in :meth:`DataFrame.corr` causing subsequent indexing lookups to be incorrect (:issue:`35882`) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 9d6c2789af25b..3812c306b8eb4 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -438,7 +438,12 @@ def sanitize_array( subarr = subarr.copy() return subarr - elif isinstance(data, (list, tuple)) and len(data) > 0: + elif isinstance(data, (list, tuple, abc.Set, abc.ValuesView)) and len(data) > 0: + if isinstance(data, set): + # Raise only for unordered sets, e.g., not for 
dict_keys + raise TypeError("Set type is unordered") + data = list(data) + if dtype is not None: subarr = _try_cast(data, dtype, copy, raise_cast_failure) else: @@ -450,8 +455,6 @@ def sanitize_array( # GH#16804 arr = np.arange(data.start, data.stop, data.step, dtype="int64") subarr = _try_cast(arr, dtype, copy, raise_cast_failure) - elif isinstance(data, abc.Set): - raise TypeError("Set type is unordered") elif lib.is_scalar(data) and index is not None and dtype is not None: data = maybe_cast_to_datetime(data, dtype) if not lib.is_scalar(data): diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index bcf7039ec9039..ce078059479b4 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -1464,3 +1464,13 @@ def test_constructor_sparse_datetime64(self, values): arr = pd.arrays.SparseArray(values, dtype=dtype) expected = pd.Series(arr) tm.assert_series_equal(result, expected) + + def test_construction_from_ordered_collection(self): + # https://github.com/pandas-dev/pandas/issues/36044 + result = Series({"a": 1, "b": 2}.keys()) + expected = Series(["a", "b"]) + tm.assert_series_equal(result, expected) + + result = Series({"a": 1, "b": 2}.values()) + expected = Series([1, 2]) + tm.assert_series_equal(result, expected) From b8181f47af2d1e939a5e20382efc7da7aa0164c2 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 6 Sep 2020 18:58:32 +0200 Subject: [PATCH 38/71] REGR: append tz-aware DataFrame with tz-naive values (#36115) --- doc/source/whatsnew/v1.1.2.rst | 1 + pandas/core/dtypes/concat.py | 6 ++++-- pandas/core/internals/concat.py | 8 ++++++-- pandas/tests/reshape/test_concat.py | 17 +++++++++++++++++ 4 files changed, 28 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index b8f6d0e52d058..f0adc951a5f99 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -16,6 +16,7 @@ Fixed regressions ~~~~~~~~~~~~~~~~~ - Regression in :meth:`DatetimeIndex.intersection` incorrectly raising ``AssertionError`` when intersecting against a list (:issue:`35876`) - Fix regression in updating a column inplace (e.g. 
using ``df['col'].fillna(.., inplace=True)``) (:issue:`35731`) +- Fix regression in :meth:`DataFrame.append` mixing tz-aware and tz-naive datetime columns (:issue:`35460`) - Performance regression for :meth:`RangeIndex.format` (:issue:`35712`) - Regression where :meth:`MultiIndex.get_loc` would return a slice spanning the full index when passed an empty list (:issue:`35878`) - Fix regression in invalid cache after an indexing operation; this can manifest when setting which does not update the data (:issue:`35521`) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 9902016475b22..dd005752a4832 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -148,15 +148,17 @@ def is_nonempty(x) -> bool: any_ea = any(is_extension_array_dtype(x.dtype) for x in to_concat) if any_ea: + # we ignore axis here, as internally concatting with EAs is always + # for axis=0 if not single_dtype: target_dtype = find_common_type([x.dtype for x in to_concat]) to_concat = [_cast_to_common_type(arr, target_dtype) for arr in to_concat] - if isinstance(to_concat[0], ExtensionArray) and axis == 0: + if isinstance(to_concat[0], ExtensionArray): cls = type(to_concat[0]) return cls._concat_same_type(to_concat) else: - return np.concatenate(to_concat, axis=axis) + return np.concatenate(to_concat) elif _contains_datetime or "timedelta" in typs: return concat_datetime(to_concat, axis=axis, typs=typs) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index b45f0890cafa4..513c5fed1ca62 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -24,7 +24,7 @@ from pandas.core.dtypes.missing import isna import pandas.core.algorithms as algos -from pandas.core.arrays import ExtensionArray +from pandas.core.arrays import DatetimeArray, ExtensionArray from pandas.core.internals.blocks import make_block from pandas.core.internals.managers import BlockManager @@ -335,9 +335,13 @@ def _concatenate_join_units(join_units, concat_axis, copy): # the non-EA values are 2D arrays with shape (1, n) to_concat = [t if isinstance(t, ExtensionArray) else t[0, :] for t in to_concat] concat_values = concat_compat(to_concat, axis=0) - if not isinstance(concat_values, ExtensionArray): + if not isinstance(concat_values, ExtensionArray) or ( + isinstance(concat_values, DatetimeArray) and concat_values.tz is None + ): # if the result of concat is not an EA but an ndarray, reshape to # 2D to put it a non-EA Block + # special case DatetimeArray, which *is* an EA, but is put in a + # consolidated 2D block concat_values = np.atleast_2d(concat_values) else: concat_values = concat_compat(to_concat, axis=concat_axis) diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 38cf2cc2402a1..90705f827af25 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -1110,6 +1110,23 @@ def test_append_empty_frame_to_series_with_dateutil_tz(self): result = df.append([s, s], ignore_index=True) tm.assert_frame_equal(result, expected) + def test_append_empty_tz_frame_with_datetime64ns(self): + # https://github.com/pandas-dev/pandas/issues/35460 + df = pd.DataFrame(columns=["a"]).astype("datetime64[ns, UTC]") + + # pd.NaT gets inferred as tz-naive, so append result is tz-naive + result = df.append({"a": pd.NaT}, ignore_index=True) + expected = pd.DataFrame({"a": [pd.NaT]}).astype("datetime64[ns]") + tm.assert_frame_equal(result, expected) + + # also test with typed value to 
append + df = pd.DataFrame(columns=["a"]).astype("datetime64[ns, UTC]") + result = df.append( + pd.Series({"a": pd.NaT}, dtype="datetime64[ns]"), ignore_index=True + ) + expected = pd.DataFrame({"a": [pd.NaT]}).astype("datetime64[ns]") + tm.assert_frame_equal(result, expected) + class TestConcatenate: def test_concat_copy(self): From 88b5e100971e3b435eb7f14ecc5ec469d4cc1dfa Mon Sep 17 00:00:00 2001 From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> Date: Sun, 6 Sep 2020 12:59:43 -0400 Subject: [PATCH 39/71] BUG: Respect errors="ignore" during extension astype (#35979) --- doc/source/whatsnew/v1.1.2.rst | 1 + pandas/core/internals/blocks.py | 9 ++++++-- pandas/tests/frame/methods/test_astype.py | 22 +++++++++++++++++++ pandas/tests/series/methods/test_astype.py | 25 +++++++++++++++++++++- 4 files changed, 54 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index f0adc951a5f99..1e946d325ace1 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -34,6 +34,7 @@ Bug fixes - Bug in :meth:`DataFrame.eval` with ``object`` dtype column binary operations (:issue:`35794`) - Bug in :class:`Series` constructor raising a ``TypeError`` when constructing sparse datetime64 dtypes (:issue:`35762`) - Bug in :meth:`DataFrame.apply` with ``result_type="reduce"`` returning with incorrect index (:issue:`35683`) +- Bug in :meth:`Series.astype` and :meth:`DataFrame.astype` not respecting the ``errors`` argument when set to ``"ignore"`` for extension dtypes (:issue:`35471`) - Bug in :meth:`DateTimeIndex.format` and :meth:`PeriodIndex.format` with ``name=True`` setting the first item to ``"None"`` where it should be ``""`` (:issue:`35712`) - Bug in :meth:`Float64Index.__contains__` incorrectly raising ``TypeError`` instead of returning ``False`` (:issue:`35788`) - Bug in :class:`Series` constructor incorrectly raising a ``TypeError`` when passed an ordered set (:issue:`36044`) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 9f4e535dc787d..263c7c2b6940a 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -581,8 +581,13 @@ def astype(self, dtype, copy: bool = False, errors: str = "raise"): # force the copy here if self.is_extension: - # TODO: Should we try/except this astype? 
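
The user-facing effect of the try/except introduced in this hunk, as a minimal sketch (requires a build with this fix; mirrors the tests added below):

    import pandas as pd

    ser = pd.Series(["x", "y"], dtype="string")
    # an impossible cast now returns the input unchanged under errors="ignore"
    # instead of propagating the ValueError from the extension array
    out = ser.astype(float, errors="ignore")
    assert out.equals(ser)
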
- values = self.values.astype(dtype) + try: + values = self.values.astype(dtype) + except (ValueError, TypeError): + if errors == "ignore": + values = self.values + else: + raise else: if issubclass(dtype.type, str): diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index b0fd0496ea81e..d3f256259b15f 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -8,6 +8,7 @@ CategoricalDtype, DataFrame, DatetimeTZDtype, + Interval, IntervalDtype, NaT, Series, @@ -565,3 +566,24 @@ def test_astype_empty_dtype_dict(self): result = df.astype(dict()) tm.assert_frame_equal(result, df) assert result is not df + + @pytest.mark.parametrize( + "df", + [ + DataFrame(Series(["x", "y", "z"], dtype="string")), + DataFrame(Series(["x", "y", "z"], dtype="category")), + DataFrame(Series(3 * [Timestamp("2020-01-01", tz="UTC")])), + DataFrame(Series(3 * [Interval(0, 1)])), + ], + ) + @pytest.mark.parametrize("errors", ["raise", "ignore"]) + def test_astype_ignores_errors_for_extension_dtypes(self, df, errors): + # https://github.com/pandas-dev/pandas/issues/35471 + if errors == "ignore": + expected = df + result = df.astype(float, errors=errors) + tm.assert_frame_equal(result, expected) + else: + msg = "(Cannot cast)|(could not convert)" + with pytest.raises((ValueError, TypeError), match=msg): + df.astype(float, errors=errors) diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py index 9fdc4179de2e1..b9d90a9fc63dd 100644 --- a/pandas/tests/series/methods/test_astype.py +++ b/pandas/tests/series/methods/test_astype.py @@ -1,4 +1,6 @@ -from pandas import Series, date_range +import pytest + +from pandas import Interval, Series, Timestamp, date_range import pandas._testing as tm @@ -23,3 +25,24 @@ def test_astype_dt64tz_to_str(self): dtype=object, ) tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "values", + [ + Series(["x", "y", "z"], dtype="string"), + Series(["x", "y", "z"], dtype="category"), + Series(3 * [Timestamp("2020-01-01", tz="UTC")]), + Series(3 * [Interval(0, 1)]), + ], + ) + @pytest.mark.parametrize("errors", ["raise", "ignore"]) + def test_astype_ignores_errors_for_extension_dtypes(self, values, errors): + # https://github.com/pandas-dev/pandas/issues/35471 + if errors == "ignore": + expected = values + result = values.astype(float, errors="ignore") + tm.assert_series_equal(result, expected) + else: + msg = "(Cannot cast)|(could not convert)" + with pytest.raises((ValueError, TypeError), match=msg): + values.astype(float, errors=errors) From f9ce5792a6fb3ec9b7ed42fe6c7cd018756973ab Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 6 Sep 2020 10:05:35 -0700 Subject: [PATCH 40/71] De-privatize imported names (#36156) --- pandas/_libs/hashtable.pyx | 4 ++-- pandas/_libs/hashtable_class_helper.pxi.in | 6 +++--- pandas/_libs/hashtable_func_helper.pxi.in | 2 +- pandas/_libs/parsers.pyx | 8 ++++---- pandas/_testing.py | 8 ++++---- pandas/compat/__init__.py | 4 ++-- pandas/core/algorithms.py | 4 ++-- pandas/core/arrays/base.py | 4 ++-- pandas/core/arrays/masked.py | 4 ++-- pandas/core/computation/check.py | 10 +++++----- pandas/core/computation/eval.py | 6 +++--- pandas/core/computation/expressions.py | 10 +++++----- pandas/core/computation/ops.py | 6 +++--- pandas/core/frame.py | 4 ++-- pandas/core/indexes/multi.py | 4 ++-- pandas/core/internals/__init__.py | 4 ++-- pandas/core/internals/blocks.py | 
4 ++-- pandas/core/internals/managers.py | 6 +++--- pandas/core/sorting.py | 2 +- pandas/core/window/common.py | 4 ++-- pandas/core/window/ewm.py | 6 +++--- pandas/core/window/rolling.py | 6 +++--- pandas/io/common.py | 6 +++--- pandas/io/excel/_base.py | 2 +- pandas/io/excel/_odfreader.py | 4 ++-- pandas/io/excel/_openpyxl.py | 4 ++-- pandas/io/excel/_pyxlsb.py | 4 ++-- pandas/io/excel/_xlrd.py | 4 ++-- pandas/io/formats/format.py | 8 ++++---- pandas/io/formats/printing.py | 4 ++-- pandas/tests/computation/test_compat.py | 6 +++--- pandas/tests/computation/test_eval.py | 12 ++++++------ pandas/tests/extension/json/array.py | 2 +- pandas/tests/frame/test_arithmetic.py | 4 ++-- pandas/tests/frame/test_query_eval.py | 6 +++--- pandas/tests/io/formats/test_format.py | 2 +- pandas/tests/io/test_pickle.py | 6 +++--- pandas/tests/test_algos.py | 2 +- .../moments/test_moments_consistency_rolling.py | 4 ++-- pandas/tests/window/test_pairwise.py | 2 +- pandas/util/_test_decorators.py | 4 ++-- 41 files changed, 101 insertions(+), 101 deletions(-) diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index ffaf6d6505955..5a0cddb0af197 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -56,7 +56,7 @@ from pandas._libs.missing cimport checknull cdef int64_t NPY_NAT = util.get_nat() -_SIZE_HINT_LIMIT = (1 << 20) + 7 +SIZE_HINT_LIMIT = (1 << 20) + 7 cdef Py_ssize_t _INIT_VEC_CAP = 128 @@ -176,7 +176,7 @@ def unique_label_indices(const int64_t[:] labels): ndarray[int64_t, ndim=1] arr Int64VectorData *ud = idx.data - kh_resize_int64(table, min(n, _SIZE_HINT_LIMIT)) + kh_resize_int64(table, min(n, SIZE_HINT_LIMIT)) with nogil: for i in range(n): diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index e0e026fe7cb5e..5e4da96d57e42 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -268,7 +268,7 @@ cdef class {{name}}HashTable(HashTable): def __cinit__(self, int64_t size_hint=1): self.table = kh_init_{{dtype}}() if size_hint is not None: - size_hint = min(size_hint, _SIZE_HINT_LIMIT) + size_hint = min(size_hint, SIZE_HINT_LIMIT) kh_resize_{{dtype}}(self.table, size_hint) def __len__(self) -> int: @@ -603,7 +603,7 @@ cdef class StringHashTable(HashTable): def __init__(self, int64_t size_hint=1): self.table = kh_init_str() if size_hint is not None: - size_hint = min(size_hint, _SIZE_HINT_LIMIT) + size_hint = min(size_hint, SIZE_HINT_LIMIT) kh_resize_str(self.table, size_hint) def __dealloc__(self): @@ -916,7 +916,7 @@ cdef class PyObjectHashTable(HashTable): def __init__(self, int64_t size_hint=1): self.table = kh_init_pymap() if size_hint is not None: - size_hint = min(size_hint, _SIZE_HINT_LIMIT) + size_hint = min(size_hint, SIZE_HINT_LIMIT) kh_resize_pymap(self.table, size_hint) def __dealloc__(self): diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index 0cc0a6b192df5..fcd081f563f92 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -138,7 +138,7 @@ def duplicated_{{dtype}}(const {{c_type}}[:] values, object keep='first'): kh_{{ttype}}_t *table = kh_init_{{ttype}}() ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool') - kh_resize_{{ttype}}(table, min(n, _SIZE_HINT_LIMIT)) + kh_resize_{{ttype}}(table, min(n, SIZE_HINT_LIMIT)) if keep not in ('last', 'first', False): raise ValueError('keep must be either "first", "last" or False') diff --git 
a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index fa77af6bd5a25..811e28b830921 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -67,7 +67,7 @@ from pandas._libs.khash cimport ( khiter_t, ) -from pandas.compat import _get_lzma_file, _import_lzma +from pandas.compat import get_lzma_file, import_lzma from pandas.errors import DtypeWarning, EmptyDataError, ParserError, ParserWarning from pandas.core.dtypes.common import ( @@ -82,7 +82,7 @@ from pandas.core.dtypes.common import ( ) from pandas.core.dtypes.concat import union_categoricals -lzma = _import_lzma() +lzma = import_lzma() cdef: float64_t INF = np.inf @@ -638,9 +638,9 @@ cdef class TextReader: f'zip file {zip_names}') elif self.compression == 'xz': if isinstance(source, str): - source = _get_lzma_file(lzma)(source, 'rb') + source = get_lzma_file(lzma)(source, 'rb') else: - source = _get_lzma_file(lzma)(filename=source) + source = get_lzma_file(lzma)(filename=source) else: raise ValueError(f'Unrecognized compression type: ' f'{self.compression}') diff --git a/pandas/_testing.py b/pandas/_testing.py index 04d36749a3d8c..7dba578951deb 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -25,7 +25,7 @@ from pandas._libs.lib import no_default import pandas._libs.testing as _testing from pandas._typing import Dtype, FilePathOrBuffer, FrameOrSeries -from pandas.compat import _get_lzma_file, _import_lzma +from pandas.compat import get_lzma_file, import_lzma from pandas.core.dtypes.common import ( is_bool, @@ -70,7 +70,7 @@ from pandas.io.common import urlopen from pandas.io.formats.printing import pprint_thing -lzma = _import_lzma() +lzma = import_lzma() _N = 30 _K = 4 @@ -243,7 +243,7 @@ def decompress_file(path, compression): elif compression == "bz2": f = bz2.BZ2File(path, "rb") elif compression == "xz": - f = _get_lzma_file(lzma)(path, "rb") + f = get_lzma_file(lzma)(path, "rb") elif compression == "zip": zip_file = zipfile.ZipFile(path) zip_names = zip_file.namelist() @@ -288,7 +288,7 @@ def write_to_compressed(compression, path, data, dest="test"): elif compression == "bz2": compress_method = bz2.BZ2File elif compression == "xz": - compress_method = _get_lzma_file(lzma) + compress_method = get_lzma_file(lzma) else: raise ValueError(f"Unrecognized compression type: {compression}") diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index f2018a5c01711..57e378758cc78 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -77,7 +77,7 @@ def is_platform_mac() -> bool: return sys.platform == "darwin" -def _import_lzma(): +def import_lzma(): """ Importing the `lzma` module. @@ -97,7 +97,7 @@ def _import_lzma(): warnings.warn(msg) -def _get_lzma_file(lzma): +def get_lzma_file(lzma): """ Importing the `LZMAFile` class from the `lzma` module. 
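
Downstream call sites use the renamed helpers like so (a sketch; the output filename is hypothetical and lzma support is assumed to be compiled into the interpreter):

    from pandas.compat import get_lzma_file, import_lzma

    lzma = import_lzma()            # the stdlib lzma module, or a warning if absent
    LZMAFile = get_lzma_file(lzma)  # guarded access to lzma.LZMAFile

    with LZMAFile("demo.xz", "wb") as fh:  # hypothetical path
        fh.write(b"compressed payload")
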
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 50ec3714f454b..57e63daff29e4 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -462,7 +462,7 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: return f(comps, values) -def _factorize_array( +def factorize_array( values, na_sentinel: int = -1, size_hint=None, na_value=None, mask=None ) -> Tuple[np.ndarray, np.ndarray]: """ @@ -671,7 +671,7 @@ def factorize( else: na_value = None - codes, uniques = _factorize_array( + codes, uniques = factorize_array( values, na_sentinel=na_sentinel, size_hint=size_hint, na_value=na_value ) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 8193d65b3b30c..0c8efda5fc588 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -31,7 +31,7 @@ from pandas.core.dtypes.missing import isna from pandas.core import ops -from pandas.core.algorithms import _factorize_array, unique +from pandas.core.algorithms import factorize_array, unique from pandas.core.missing import backfill_1d, pad_1d from pandas.core.sorting import nargminmax, nargsort @@ -845,7 +845,7 @@ def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, "ExtensionArray" # Complete control over factorization. arr, na_value = self._values_for_factorize() - codes, uniques = _factorize_array( + codes, uniques = factorize_array( arr, na_sentinel=na_sentinel, na_value=na_value ) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 1237dea5c1a64..31274232e2525 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -17,7 +17,7 @@ from pandas.core.dtypes.missing import isna, notna from pandas.core import nanops -from pandas.core.algorithms import _factorize_array, take +from pandas.core.algorithms import factorize_array, take from pandas.core.array_algos import masked_reductions from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin from pandas.core.indexers import check_array_indexer @@ -287,7 +287,7 @@ def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ExtensionArray]: arr = self._data mask = self._mask - codes, uniques = _factorize_array(arr, na_sentinel=na_sentinel, mask=mask) + codes, uniques = factorize_array(arr, na_sentinel=na_sentinel, mask=mask) # the hashtables don't handle all different types of bits uniques = uniques.astype(self.dtype.numpy_dtype, copy=False) diff --git a/pandas/core/computation/check.py b/pandas/core/computation/check.py index 4d205909b9e2e..6c7261b3b33c9 100644 --- a/pandas/core/computation/check.py +++ b/pandas/core/computation/check.py @@ -1,10 +1,10 @@ from pandas.compat._optional import import_optional_dependency ne = import_optional_dependency("numexpr", raise_on_missing=False, on_version="warn") -_NUMEXPR_INSTALLED = ne is not None -if _NUMEXPR_INSTALLED: - _NUMEXPR_VERSION = ne.__version__ +NUMEXPR_INSTALLED = ne is not None +if NUMEXPR_INSTALLED: + NUMEXPR_VERSION = ne.__version__ else: - _NUMEXPR_VERSION = None + NUMEXPR_VERSION = None -__all__ = ["_NUMEXPR_INSTALLED", "_NUMEXPR_VERSION"] +__all__ = ["NUMEXPR_INSTALLED", "NUMEXPR_VERSION"] diff --git a/pandas/core/computation/eval.py b/pandas/core/computation/eval.py index b74f99fca21c7..f6a7935142a32 100644 --- a/pandas/core/computation/eval.py +++ b/pandas/core/computation/eval.py @@ -38,10 +38,10 @@ def _check_engine(engine: Optional[str]) -> str: str Engine name. 
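
The renamed flag drives engine selection; the decision reduces to the following sketch (pick_engine is illustrative only, not the actual pandas source):

    from pandas.core.computation.check import NUMEXPR_INSTALLED

    def pick_engine(engine=None):
        # mirrors _check_engine: default to numexpr only when it is importable
        if engine is None:
            return "numexpr" if NUMEXPR_INSTALLED else "python"
        if engine == "numexpr" and not NUMEXPR_INSTALLED:
            raise ImportError("'numexpr' is not installed or an unsupported version")
        return engine
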
""" - from pandas.core.computation.check import _NUMEXPR_INSTALLED + from pandas.core.computation.check import NUMEXPR_INSTALLED if engine is None: - engine = "numexpr" if _NUMEXPR_INSTALLED else "python" + engine = "numexpr" if NUMEXPR_INSTALLED else "python" if engine not in _engines: valid_engines = list(_engines.keys()) @@ -53,7 +53,7 @@ def _check_engine(engine: Optional[str]) -> str: # that won't necessarily be import-able) # Could potentially be done on engine instantiation if engine == "numexpr": - if not _NUMEXPR_INSTALLED: + if not NUMEXPR_INSTALLED: raise ImportError( "'numexpr' is not installed or an unsupported version. Cannot use " "engine='numexpr' for query/eval if 'numexpr' is not installed" diff --git a/pandas/core/computation/expressions.py b/pandas/core/computation/expressions.py index a9c0cb0571446..d2c08c343ab4b 100644 --- a/pandas/core/computation/expressions.py +++ b/pandas/core/computation/expressions.py @@ -15,15 +15,15 @@ from pandas.core.dtypes.generic import ABCDataFrame -from pandas.core.computation.check import _NUMEXPR_INSTALLED +from pandas.core.computation.check import NUMEXPR_INSTALLED from pandas.core.ops import roperator -if _NUMEXPR_INSTALLED: +if NUMEXPR_INSTALLED: import numexpr as ne _TEST_MODE = None _TEST_RESULT: List[bool] = list() -_USE_NUMEXPR = _NUMEXPR_INSTALLED +_USE_NUMEXPR = NUMEXPR_INSTALLED _evaluate = None _where = None @@ -40,7 +40,7 @@ def set_use_numexpr(v=True): # set/unset to use numexpr global _USE_NUMEXPR - if _NUMEXPR_INSTALLED: + if NUMEXPR_INSTALLED: _USE_NUMEXPR = v # choose what we are going to do @@ -53,7 +53,7 @@ def set_use_numexpr(v=True): def set_numexpr_threads(n=None): # if we are using numexpr, set the threads to n # otherwise reset - if _NUMEXPR_INSTALLED and _USE_NUMEXPR: + if NUMEXPR_INSTALLED and _USE_NUMEXPR: if n is None: n = ne.detect_number_of_cores() ne.set_num_threads(n) diff --git a/pandas/core/computation/ops.py b/pandas/core/computation/ops.py index b2144c45c6323..1fb3910b8577d 100644 --- a/pandas/core/computation/ops.py +++ b/pandas/core/computation/ops.py @@ -600,11 +600,11 @@ def __repr__(self) -> str: class FuncNode: def __init__(self, name: str): - from pandas.core.computation.check import _NUMEXPR_INSTALLED, _NUMEXPR_VERSION + from pandas.core.computation.check import NUMEXPR_INSTALLED, NUMEXPR_VERSION if name not in _mathops or ( - _NUMEXPR_INSTALLED - and _NUMEXPR_VERSION < LooseVersion("2.6.9") + NUMEXPR_INSTALLED + and NUMEXPR_VERSION < LooseVersion("2.6.9") and name in ("floor", "ceil") ): raise ValueError(f'"{name}" is not a supported function') diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 150d6e24dbb86..e1a889bf79d95 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5257,7 +5257,7 @@ def duplicated( 4 True dtype: bool """ - from pandas._libs.hashtable import _SIZE_HINT_LIMIT, duplicated_int64 + from pandas._libs.hashtable import SIZE_HINT_LIMIT, duplicated_int64 from pandas.core.sorting import get_group_index @@ -5266,7 +5266,7 @@ def duplicated( def f(vals): labels, shape = algorithms.factorize( - vals, size_hint=min(len(self), _SIZE_HINT_LIMIT) + vals, size_hint=min(len(self), SIZE_HINT_LIMIT) ) return labels.astype("i8", copy=False), len(shape) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 080ece8547479..e49a23935efbd 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1342,9 +1342,9 @@ def format( ) if adjoin: - from pandas.io.formats.format import _get_adjustment + from 
pandas.io.formats.format import get_adjustment - adj = _get_adjustment() + adj = get_adjustment() return adj.adjoin(space, *result_levels).split("\n") else: return result_levels diff --git a/pandas/core/internals/__init__.py b/pandas/core/internals/__init__.py index e12e0d7760ea7..fbccac1c2af67 100644 --- a/pandas/core/internals/__init__.py +++ b/pandas/core/internals/__init__.py @@ -10,8 +10,8 @@ IntBlock, ObjectBlock, TimeDeltaBlock, - _safe_reshape, make_block, + safe_reshape, ) from pandas.core.internals.concat import concatenate_block_managers from pandas.core.internals.managers import ( @@ -33,7 +33,7 @@ "IntBlock", "ObjectBlock", "TimeDeltaBlock", - "_safe_reshape", + "safe_reshape", "make_block", "BlockManager", "SingleBlockManager", diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 263c7c2b6940a..c8da04fbbf987 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1678,7 +1678,7 @@ def putmask( if isinstance(new, (np.ndarray, ExtensionArray)) and len(new) == len(mask): new = new[mask] - mask = _safe_reshape(mask, new_values.shape) + mask = safe_reshape(mask, new_values.shape) new_values[mask] = new return [self.make_block(values=new_values)] @@ -2820,7 +2820,7 @@ def _block_shape(values: ArrayLike, ndim: int = 1) -> ArrayLike: return values -def _safe_reshape(arr, new_shape): +def safe_reshape(arr, new_shape): """ If possible, reshape `arr` to have shape `new_shape`, with a couple of exceptions (see gh-13012): diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 13bc6a2e82195..3f446874ffd0e 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -47,10 +47,10 @@ DatetimeTZBlock, ExtensionBlock, ObjectValuesExtensionBlock, - _safe_reshape, extend_blocks, get_block_type, make_block, + safe_reshape, ) from pandas.core.internals.ops import blockwise_all, operate_blockwise @@ -1015,7 +1015,7 @@ def value_getitem(placement): else: if value.ndim == self.ndim - 1: - value = _safe_reshape(value, (1,) + value.shape) + value = safe_reshape(value, (1,) + value.shape) def value_getitem(placement): return value @@ -1138,7 +1138,7 @@ def insert(self, loc: int, item: Label, value, allow_duplicates: bool = False): if value.ndim == self.ndim - 1 and not is_extension_array_dtype(value.dtype): # TODO(EA2D): special case not needed with 2D EAs - value = _safe_reshape(value, (1,) + value.shape) + value = safe_reshape(value, (1,) + value.shape) block = make_block(values=value, ndim=self.ndim, placement=slice(loc, loc + 1)) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 8bdd466ae6f33..d03b2f29521b7 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -520,7 +520,7 @@ def compress_group_index(group_index, sort: bool = True): space can be huge, so this function compresses it, by computing offsets (comp_ids) into the list of unique labels (obs_group_ids). 
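
The capped size hint below bounds the initial hash-table allocation; the pattern with the renamed constant looks like this (internal pandas API, sketch only):

    from pandas._libs import hashtable

    n = 50_000_000
    # never pre-size beyond SIZE_HINT_LIMIT buckets, even for huge inputs
    table = hashtable.Int64HashTable(min(n, hashtable.SIZE_HINT_LIMIT))
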
""" - size_hint = min(len(group_index), hashtable._SIZE_HINT_LIMIT) + size_hint = min(len(group_index), hashtable.SIZE_HINT_LIMIT) table = hashtable.Int64HashTable(size_hint) group_index = ensure_int64(group_index) diff --git a/pandas/core/window/common.py b/pandas/core/window/common.py index 2f3058db4493b..df60d2dcf5e84 100644 --- a/pandas/core/window/common.py +++ b/pandas/core/window/common.py @@ -92,7 +92,7 @@ def f(x, name=name, *args): return self._groupby.apply(f) -def _flex_binary_moment(arg1, arg2, f, pairwise=False): +def flex_binary_moment(arg1, arg2, f, pairwise=False): if not ( isinstance(arg1, (np.ndarray, ABCSeries, ABCDataFrame)) @@ -222,7 +222,7 @@ def dataframe_from_int_dict(data, frame_template): return dataframe_from_int_dict(results, arg1) else: - return _flex_binary_moment(arg2, arg1, f) + return flex_binary_moment(arg2, arg1, f) def zsqrt(x): diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index 1913b51a68c15..2bd36d8bff155 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -15,7 +15,7 @@ import pandas.core.common as common from pandas.core.window.common import _doc_template, _shared_docs, zsqrt -from pandas.core.window.rolling import _flex_binary_moment, _Rolling +from pandas.core.window.rolling import _Rolling, flex_binary_moment _bias_template = """ Parameters @@ -416,7 +416,7 @@ def _get_cov(X, Y): ) return X._wrap_result(cov) - return _flex_binary_moment( + return flex_binary_moment( self._selected_obj, other._selected_obj, _get_cov, pairwise=bool(pairwise) ) @@ -470,6 +470,6 @@ def _cov(x, y): corr = cov / zsqrt(x_var * y_var) return X._wrap_result(corr) - return _flex_binary_moment( + return flex_binary_moment( self._selected_obj, other._selected_obj, _get_corr, pairwise=bool(pairwise) ) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 558c0eeb0ea65..4c4ec4d700b7f 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -54,8 +54,8 @@ from pandas.core.window.common import ( WindowGroupByMixin, _doc_template, - _flex_binary_moment, _shared_docs, + flex_binary_moment, zsqrt, ) from pandas.core.window.indexers import ( @@ -1774,7 +1774,7 @@ def _get_cov(X, Y): bias_adj = count / (count - ddof) return (mean(X * Y) - mean(X) * mean(Y)) * bias_adj - return _flex_binary_moment( + return flex_binary_moment( self._selected_obj, other._selected_obj, _get_cov, pairwise=bool(pairwise) ) @@ -1913,7 +1913,7 @@ def _get_corr(a, b): return a.cov(b, **kwargs) / (a.std(**kwargs) * b.std(**kwargs)) - return _flex_binary_moment( + return flex_binary_moment( self._selected_obj, other._selected_obj, _get_corr, pairwise=bool(pairwise) ) diff --git a/pandas/io/common.py b/pandas/io/common.py index a80b89569f429..3f130401558dd 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -40,12 +40,12 @@ ModeVar, StorageOptions, ) -from pandas.compat import _get_lzma_file, _import_lzma +from pandas.compat import get_lzma_file, import_lzma from pandas.compat._optional import import_optional_dependency from pandas.core.dtypes.common import is_file_like -lzma = _import_lzma() +lzma = import_lzma() _VALID_URLS = set(uses_relative + uses_netloc + uses_params) @@ -562,7 +562,7 @@ def get_handle( # XZ Compression elif compression == "xz": - f = _get_lzma_file(lzma)(path_or_buf, mode) + f = get_lzma_file(lzma)(path_or_buf, mode) # Unrecognized Compression else: diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 74eb65521f5b2..87343c22ad4e9 100644 --- 
a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -346,7 +346,7 @@ def read_excel( ) -class _BaseExcelReader(metaclass=abc.ABCMeta): +class BaseExcelReader(metaclass=abc.ABCMeta): def __init__(self, filepath_or_buffer, storage_options: StorageOptions = None): # If filepath_or_buffer is a url, load the data into a BytesIO if is_url(filepath_or_buffer): diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index 6cbca59aed97e..02575ab878f6e 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -7,10 +7,10 @@ import pandas as pd -from pandas.io.excel._base import _BaseExcelReader +from pandas.io.excel._base import BaseExcelReader -class _ODFReader(_BaseExcelReader): +class _ODFReader(BaseExcelReader): """ Read tables out of OpenDocument formatted files. diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 89b581da6ed31..f395127902101 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -5,7 +5,7 @@ from pandas._typing import FilePathOrBuffer, Scalar, StorageOptions from pandas.compat._optional import import_optional_dependency -from pandas.io.excel._base import ExcelWriter, _BaseExcelReader +from pandas.io.excel._base import BaseExcelReader, ExcelWriter from pandas.io.excel._util import validate_freeze_panes if TYPE_CHECKING: @@ -438,7 +438,7 @@ def write_cells( setattr(xcell, k, v) -class _OpenpyxlReader(_BaseExcelReader): +class _OpenpyxlReader(BaseExcelReader): def __init__( self, filepath_or_buffer: FilePathOrBuffer, diff --git a/pandas/io/excel/_pyxlsb.py b/pandas/io/excel/_pyxlsb.py index c15a52abe4d53..069c3a2eaa643 100644 --- a/pandas/io/excel/_pyxlsb.py +++ b/pandas/io/excel/_pyxlsb.py @@ -3,10 +3,10 @@ from pandas._typing import FilePathOrBuffer, Scalar, StorageOptions from pandas.compat._optional import import_optional_dependency -from pandas.io.excel._base import _BaseExcelReader +from pandas.io.excel._base import BaseExcelReader -class _PyxlsbReader(_BaseExcelReader): +class _PyxlsbReader(BaseExcelReader): def __init__( self, filepath_or_buffer: FilePathOrBuffer, diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index a7fb519af61c6..9057106fb08e5 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -5,10 +5,10 @@ from pandas._typing import StorageOptions from pandas.compat._optional import import_optional_dependency -from pandas.io.excel._base import _BaseExcelReader +from pandas.io.excel._base import BaseExcelReader -class _XlrdReader(_BaseExcelReader): +class _XlrdReader(BaseExcelReader): def __init__(self, filepath_or_buffer, storage_options: StorageOptions = None): """ Reader using xlrd engine. 
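
All four engine readers above share the renamed base class; a skeletal third-party reader would look like this (the class name is hypothetical, and load_workbook is one of several abstract hooks the base class expects):

    from pandas.io.excel._base import BaseExcelReader

    class _MyEngineReader(BaseExcelReader):  # hypothetical engine wrapper
        def load_workbook(self, filepath_or_buffer):
            # a real reader would open the workbook with its engine here
            raise NotImplementedError
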
diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 3dc4290953360..53b2b533215f0 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -256,7 +256,7 @@ def __init__( float_format = get_option("display.float_format") self.float_format = float_format self.dtype = dtype - self.adj = _get_adjustment() + self.adj = get_adjustment() self._chk_truncate() @@ -439,7 +439,7 @@ def _get_pad(t): return [x.rjust(_get_pad(x)) for x in texts] -def _get_adjustment() -> TextAdjustment: +def get_adjustment() -> TextAdjustment: use_east_asian_width = get_option("display.unicode.east_asian_width") if use_east_asian_width: return EastAsianTextAdjustment() @@ -628,7 +628,7 @@ def __init__( self.columns = frame.columns self._chk_truncate() - self.adj = _get_adjustment() + self.adj = get_adjustment() def _chk_truncate(self) -> None: """ @@ -1733,7 +1733,7 @@ def _make_fixed_width( return strings if adj is None: - adj = _get_adjustment() + adj = get_adjustment() max_len = max(adj.len(x) for x in strings) diff --git a/pandas/io/formats/printing.py b/pandas/io/formats/printing.py index 23daab725ec65..edc6fbfff61d7 100644 --- a/pandas/io/formats/printing.py +++ b/pandas/io/formats/printing.py @@ -321,7 +321,7 @@ def format_object_summary( summary string """ from pandas.io.formats.console import get_console_size - from pandas.io.formats.format import _get_adjustment + from pandas.io.formats.format import get_adjustment display_width, _ = get_console_size() if display_width is None: @@ -350,7 +350,7 @@ def format_object_summary( is_truncated = n > max_seq_items # adj can optionally handle unicode eastern asian width - adj = _get_adjustment() + adj = get_adjustment() def _extend_line( s: str, line: str, value: str, display_width: int, next_line_prefix: str diff --git a/pandas/tests/computation/test_compat.py b/pandas/tests/computation/test_compat.py index b3fbd8c17d8bf..ead102f532a20 100644 --- a/pandas/tests/computation/test_compat.py +++ b/pandas/tests/computation/test_compat.py @@ -12,16 +12,16 @@ def test_compat(): # test we have compat with our version of nu - from pandas.core.computation.check import _NUMEXPR_INSTALLED + from pandas.core.computation.check import NUMEXPR_INSTALLED try: import numexpr as ne ver = ne.__version__ if LooseVersion(ver) < LooseVersion(VERSIONS["numexpr"]): - assert not _NUMEXPR_INSTALLED + assert not NUMEXPR_INSTALLED else: - assert _NUMEXPR_INSTALLED + assert NUMEXPR_INSTALLED except ImportError: pytest.skip("not testing numexpr version compat") diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index 853ab00853d1b..49066428eb16c 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -18,7 +18,7 @@ from pandas import DataFrame, Series, compat, date_range import pandas._testing as tm from pandas.core.computation import pytables -from pandas.core.computation.check import _NUMEXPR_VERSION +from pandas.core.computation.check import NUMEXPR_VERSION from pandas.core.computation.engines import NumExprClobberingError, _engines import pandas.core.computation.expr as expr from pandas.core.computation.expr import ( @@ -26,7 +26,7 @@ PandasExprVisitor, PythonExprVisitor, ) -from pandas.core.computation.expressions import _NUMEXPR_INSTALLED, _USE_NUMEXPR +from pandas.core.computation.expressions import _USE_NUMEXPR, NUMEXPR_INSTALLED from pandas.core.computation.ops import ( _arith_ops_syms, _binary_math_ops, @@ -43,7 +43,7 @@ marks=pytest.mark.skipif( engine == 
"numexpr" and not _USE_NUMEXPR, reason=f"numexpr enabled->{_USE_NUMEXPR}, " - f"installed->{_NUMEXPR_INSTALLED}", + f"installed->{NUMEXPR_INSTALLED}", ), ) for engine in _engines @@ -60,15 +60,15 @@ def parser(request): @pytest.fixture def ne_lt_2_6_9(): - if _NUMEXPR_INSTALLED and _NUMEXPR_VERSION >= LooseVersion("2.6.9"): + if NUMEXPR_INSTALLED and NUMEXPR_VERSION >= LooseVersion("2.6.9"): pytest.skip("numexpr is >= 2.6.9") return "numexpr" @pytest.fixture def unary_fns_for_ne(): - if _NUMEXPR_INSTALLED: - if _NUMEXPR_VERSION >= LooseVersion("2.6.9"): + if NUMEXPR_INSTALLED: + if NUMEXPR_VERSION >= LooseVersion("2.6.9"): return _unary_math_ops else: return tuple(x for x in _unary_math_ops if x not in ("floor", "ceil")) diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index 447a6108fc3c7..e3cdeb9c1951f 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -189,7 +189,7 @@ def _concat_same_type(cls, to_concat): def _values_for_factorize(self): frozen = self._values_for_argsort() if len(frozen) == 0: - # _factorize_array expects 1-d array, this is a len-0 2-d array. + # factorize_array expects 1-d array, this is a len-0 2-d array. frozen = frozen.ravel() return frozen, () diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index e17357e9845b5..70d0b4e9e835c 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -11,7 +11,7 @@ from pandas import DataFrame, MultiIndex, Series import pandas._testing as tm import pandas.core.common as com -from pandas.core.computation.expressions import _MIN_ELEMENTS, _NUMEXPR_INSTALLED +from pandas.core.computation.expressions import _MIN_ELEMENTS, NUMEXPR_INSTALLED from pandas.tests.frame.common import _check_mixed_float, _check_mixed_int # ------------------------------------------------------------------- @@ -375,7 +375,7 @@ def test_floordiv_axis0(self): result2 = df.floordiv(ser.values, axis=0) tm.assert_frame_equal(result2, expected) - @pytest.mark.skipif(not _NUMEXPR_INSTALLED, reason="numexpr not installed") + @pytest.mark.skipif(not NUMEXPR_INSTALLED, reason="numexpr not installed") @pytest.mark.parametrize("opname", ["floordiv", "pow"]) def test_floordiv_axis0_numexpr_path(self, opname): # case that goes through numexpr and has to fall back to masked_arith_op diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index 56d178daee7fd..2994482fa5139 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -9,7 +9,7 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, date_range import pandas._testing as tm -from pandas.core.computation.check import _NUMEXPR_INSTALLED +from pandas.core.computation.check import NUMEXPR_INSTALLED PARSERS = "python", "pandas" ENGINES = "python", pytest.param("numexpr", marks=td.skip_if_no_ne) @@ -39,7 +39,7 @@ def setup_method(self, method): def test_query_default(self): # GH 12749 - # this should always work, whether _NUMEXPR_INSTALLED or not + # this should always work, whether NUMEXPR_INSTALLED or not df = self.df result = df.query("A>0") tm.assert_frame_equal(result, self.expected1) @@ -65,7 +65,7 @@ def test_query_python(self): def test_query_numexpr(self): df = self.df - if _NUMEXPR_INSTALLED: + if NUMEXPR_INSTALLED: result = df.query("A>0", engine="numexpr") tm.assert_frame_equal(result, self.expected1) result = df.eval("A+1", engine="numexpr") diff 
--git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 22942ed75d0f3..1fb957505987f 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -226,7 +226,7 @@ def test_repr_truncation(self): r = repr(df) r = r[r.find("\n") + 1 :] - adj = fmt._get_adjustment() + adj = fmt.get_adjustment() for line, value in zip(r.split("\n"), df["B"]): if adj.len(value) + 1 > max_len: diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index d1c6705dd7a6f..2241fe7013568 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -24,7 +24,7 @@ import pytest -from pandas.compat import _get_lzma_file, _import_lzma, is_platform_little_endian +from pandas.compat import get_lzma_file, import_lzma, is_platform_little_endian import pandas.util._test_decorators as td import pandas as pd @@ -33,7 +33,7 @@ from pandas.tseries.offsets import Day, MonthEnd -lzma = _import_lzma() +lzma = import_lzma() @pytest.fixture(scope="module") @@ -268,7 +268,7 @@ def compress_file(self, src_path, dest_path, compression): with zipfile.ZipFile(dest_path, "w", compression=zipfile.ZIP_DEFLATED) as f: f.write(src_path, os.path.basename(src_path)) elif compression == "xz": - f = _get_lzma_file(lzma)(dest_path, "w") + f = get_lzma_file(lzma)(dest_path, "w") else: msg = f"Unrecognized compression type: {compression}" raise ValueError(msg) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 72a679d980641..ec7413514d430 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -303,7 +303,7 @@ def test_parametrized_factorize_na_value_default(self, data): ], ) def test_parametrized_factorize_na_value(self, data, na_value): - codes, uniques = algos._factorize_array(data, na_value=na_value) + codes, uniques = algos.factorize_array(data, na_value=na_value) expected_uniques = data[[1, 3]] expected_codes = np.array([-1, 0, -1, 1], dtype=np.intp) tm.assert_numpy_array_equal(codes, expected_codes) diff --git a/pandas/tests/window/moments/test_moments_consistency_rolling.py b/pandas/tests/window/moments/test_moments_consistency_rolling.py index 158b994cf03ae..dfcbdde466d44 100644 --- a/pandas/tests/window/moments/test_moments_consistency_rolling.py +++ b/pandas/tests/window/moments/test_moments_consistency_rolling.py @@ -10,7 +10,7 @@ import pandas as pd from pandas import DataFrame, DatetimeIndex, Index, Series import pandas._testing as tm -from pandas.core.window.common import _flex_binary_moment +from pandas.core.window.common import flex_binary_moment from pandas.tests.window.common import ( check_pairwise_moment, moments_consistency_cov_data, @@ -150,7 +150,7 @@ def test_flex_binary_moment(): # don't blow the stack msg = "arguments to moment function must be of type np.ndarray/Series/DataFrame" with pytest.raises(TypeError, match=msg): - _flex_binary_moment(5, 6, None) + flex_binary_moment(5, 6, None) def test_corr_sanity(): diff --git a/pandas/tests/window/test_pairwise.py b/pandas/tests/window/test_pairwise.py index 7425cc5df4c2f..7f4e85b385b2d 100644 --- a/pandas/tests/window/test_pairwise.py +++ b/pandas/tests/window/test_pairwise.py @@ -41,7 +41,7 @@ def compare(self, result, expected): @pytest.mark.parametrize("f", [lambda x: x.cov(), lambda x: x.corr()]) def test_no_flex(self, f): - # DataFrame methods (which do not call _flex_binary_moment()) + # DataFrame methods (which do not call flex_binary_moment()) results = [f(df) for df in self.df1s] for (df, result) in 
zip(self.df1s, results): diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index 78facd6694635..94c252eca1671 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -35,7 +35,7 @@ def test_foo(): from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy import _np_version -from pandas.core.computation.expressions import _NUMEXPR_INSTALLED, _USE_NUMEXPR +from pandas.core.computation.expressions import _USE_NUMEXPR, NUMEXPR_INSTALLED def safe_import(mod_name: str, min_version: Optional[str] = None): @@ -196,7 +196,7 @@ def skip_if_no(package: str, min_version: Optional[str] = None): ) skip_if_no_ne = pytest.mark.skipif( not _USE_NUMEXPR, - reason=f"numexpr enabled->{_USE_NUMEXPR}, installed->{_NUMEXPR_INSTALLED}", + reason=f"numexpr enabled->{_USE_NUMEXPR}, installed->{NUMEXPR_INSTALLED}", ) From 911e997579536a059b2438b6b861b0de18a6778c Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 6 Sep 2020 10:08:26 -0700 Subject: [PATCH 41/71] REF: share more EA methods (#36154) --- pandas/core/arrays/_mixins.py | 33 +++++++- pandas/core/arrays/categorical.py | 126 ++--------------------------- pandas/core/arrays/datetimelike.py | 28 ++----- pandas/core/arrays/numpy_.py | 12 +-- 4 files changed, 45 insertions(+), 154 deletions(-) diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index 2976747d66dfa..8b79f8ce66756 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -4,9 +4,10 @@ from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError -from pandas.util._decorators import cache_readonly +from pandas.util._decorators import cache_readonly, doc -from pandas.core.algorithms import take, unique +from pandas.core.algorithms import searchsorted, take, unique +from pandas.core.array_algos.transforms import shift from pandas.core.arrays.base import ExtensionArray _T = TypeVar("_T", bound="NDArrayBackedExtensionArray") @@ -120,3 +121,31 @@ def repeat(self: _T, repeats, axis=None) -> _T: def unique(self: _T) -> _T: new_data = unique(self._ndarray) return self._from_backing_data(new_data) + + @classmethod + @doc(ExtensionArray._concat_same_type) + def _concat_same_type(cls, to_concat, axis: int = 0): + dtypes = {str(x.dtype) for x in to_concat} + if len(dtypes) != 1: + raise ValueError("to_concat must have the same dtype (tz)", dtypes) + + new_values = [x._ndarray for x in to_concat] + new_values = np.concatenate(new_values, axis=axis) + return to_concat[0]._from_backing_data(new_values) + + @doc(ExtensionArray.searchsorted) + def searchsorted(self, value, side="left", sorter=None): + return searchsorted(self._ndarray, value, side=side, sorter=sorter) + + @doc(ExtensionArray.shift) + def shift(self, periods=1, fill_value=None, axis=0): + + fill_value = self._validate_shift_value(fill_value) + new_values = shift(self._ndarray, periods, axis, fill_value) + + return self._from_backing_data(new_values) + + def _validate_shift_value(self, fill_value): + # TODO: after deprecation in datetimelikearraymixin is enforced, + # we can remove this and ust validate_fill_value directly + return self._validate_fill_value(fill_value) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index c3c9009dda659..02305479bef67 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -44,8 +44,7 @@ from pandas.core.accessor import PandasDelegate, delegate_names import pandas.core.algorithms as 
algorithms from pandas.core.algorithms import _get_data_algo, factorize, take_1d, unique1d -from pandas.core.array_algos.transforms import shift -from pandas.core.arrays._mixins import _T, NDArrayBackedExtensionArray +from pandas.core.arrays._mixins import NDArrayBackedExtensionArray from pandas.core.base import ( ExtensionArray, NoNewAttributesMixin, @@ -1193,35 +1192,6 @@ def map(self, mapper): __le__ = _cat_compare_op(operator.le) __ge__ = _cat_compare_op(operator.ge) - def shift(self, periods, fill_value=None): - """ - Shift Categorical by desired number of periods. - - Parameters - ---------- - periods : int - Number of periods to move, can be positive or negative - fill_value : object, optional - The scalar value to use for newly introduced missing values. - - .. versionadded:: 0.24.0 - - Returns - ------- - shifted : Categorical - """ - # since categoricals always have ndim == 1, an axis parameter - # doesn't make any sense here. - codes = self.codes - if codes.ndim > 1: - raise NotImplementedError("Categorical with ndim > 1.") - - fill_value = self._validate_fill_value(fill_value) - - codes = shift(codes, periods, axis=0, fill_value=fill_value) - - return self._constructor(codes, dtype=self.dtype, fastpath=True) - def _validate_fill_value(self, fill_value): """ Convert a user-facing fill_value to a representation to use with our @@ -1383,20 +1353,6 @@ def notna(self): notnull = notna - def dropna(self): - """ - Return the Categorical without null values. - - Missing values (-1 in .codes) are detected. - - Returns - ------- - valid : Categorical - """ - result = self[self.notna()] - - return result - def value_counts(self, dropna=True): """ Return a Series containing counts of each category. @@ -1749,81 +1705,6 @@ def fillna(self, value=None, method=None, limit=None): return self._constructor(codes, dtype=self.dtype, fastpath=True) - def take(self: _T, indexer, allow_fill: bool = False, fill_value=None) -> _T: - """ - Take elements from the Categorical. - - Parameters - ---------- - indexer : sequence of int - The indices in `self` to take. The meaning of negative values in - `indexer` depends on the value of `allow_fill`. - allow_fill : bool, default False - How to handle negative values in `indexer`. - - * False: negative values in `indices` indicate positional indices - from the right. This is similar to - :func:`numpy.take`. - - * True: negative values in `indices` indicate missing values - (the default). These values are set to `fill_value`. Any other - other negative values raise a ``ValueError``. - - .. versionchanged:: 1.0.0 - - Default value changed from ``True`` to ``False``. - - fill_value : object - The value to use for `indices` that are missing (-1), when - ``allow_fill=True``. This should be the category, i.e. a value - in ``self.categories``, not a code. - - Returns - ------- - Categorical - This Categorical will have the same categories and ordered as - `self`. - - See Also - -------- - Series.take : Similar method for Series. - numpy.ndarray.take : Similar method for NumPy arrays. - - Examples - -------- - >>> cat = pd.Categorical(['a', 'a', 'b']) - >>> cat - ['a', 'a', 'b'] - Categories (2, object): ['a', 'b'] - - Specify ``allow_fill==False`` to have negative indices mean indexing - from the right. - - >>> cat.take([0, -1, -2], allow_fill=False) - ['a', 'b', 'a'] - Categories (2, object): ['a', 'b'] - - With ``allow_fill=True``, indices equal to ``-1`` mean "missing" - values that should be filled with the `fill_value`, which is - ``np.nan`` by default. 
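
The docstring examples deleted here remain valid: take and shift are now simply inherited from NDArrayBackedExtensionArray, e.g. this sketch against a build with the patch:

    import pandas as pd

    cat = pd.Categorical(["a", "a", "b"])
    # -1 marks missing positions when allow_fill=True; the fill defaults to NaN
    print(cat.take([0, -1, 1], allow_fill=True))  # ['a', NaN, 'b'], plus Categories line
    print(cat.shift(1, fill_value="a"))           # ['a', 'a', 'a'], plus Categories line
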
- - >>> cat.take([0, -1, -1], allow_fill=True) - ['a', NaN, NaN] - Categories (2, object): ['a', 'b'] - - The fill value can be specified. - - >>> cat.take([0, -1, -1], allow_fill=True, fill_value='a') - ['a', 'a', 'a'] - Categories (2, object): ['a', 'b'] - - Specifying a fill value that's not in ``self.categories`` - will raise a ``ValueError``. - """ - return NDArrayBackedExtensionArray.take( - self, indexer, allow_fill=allow_fill, fill_value=fill_value - ) - # ------------------------------------------------------------------ # NDArrayBackedExtensionArray compat @@ -1861,6 +1742,9 @@ def __contains__(self, key) -> bool: return contains(self, key, container=self._codes) + # ------------------------------------------------------------------ + # Rendering Methods + def _tidy_repr(self, max_vals=10, footer=True) -> str: """ a short repr displaying only max_vals and an optional (but default @@ -1959,6 +1843,8 @@ def __repr__(self) -> str: return result + # ------------------------------------------------------------------ + def _maybe_coerce_indexer(self, indexer): """ return an indexer coerced to the codes dtype diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 5a44f87400b79..a5b8032974fa4 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -54,9 +54,8 @@ from pandas.core import missing, nanops, ops from pandas.core.algorithms import checked_add_with_arr, unique1d, value_counts -from pandas.core.array_algos.transforms import shift from pandas.core.arrays._mixins import _T, NDArrayBackedExtensionArray -from pandas.core.arrays.base import ExtensionArray, ExtensionOpsMixin +from pandas.core.arrays.base import ExtensionOpsMixin import pandas.core.common as com from pandas.core.construction import array, extract_array from pandas.core.indexers import check_array_indexer @@ -672,18 +671,11 @@ def view(self, dtype=None): @classmethod def _concat_same_type(cls, to_concat, axis: int = 0): - - # do not pass tz to set because tzlocal cannot be hashed - dtypes = {str(x.dtype) for x in to_concat} - if len(dtypes) != 1: - raise ValueError("to_concat must have the same dtype (tz)", dtypes) + new_obj = super()._concat_same_type(to_concat, axis) obj = to_concat[0] dtype = obj.dtype - i8values = [x.asi8 for x in to_concat] - values = np.concatenate(i8values, axis=axis) - new_freq = None if is_period_dtype(dtype): new_freq = obj.freq @@ -697,11 +689,13 @@ def _concat_same_type(cls, to_concat, axis: int = 0): if all(pair[0][-1] + obj.freq == pair[1][0] for pair in pairs): new_freq = obj.freq - return cls._simple_new(values, dtype=dtype, freq=new_freq) + new_obj._freq = new_freq + return new_obj def copy(self: DatetimeLikeArrayT) -> DatetimeLikeArrayT: - values = self.asi8.copy() - return type(self)._simple_new(values, dtype=self.dtype, freq=self.freq) + new_obj = super().copy() + new_obj._freq = self.freq + return new_obj def _values_for_factorize(self): return self.asi8, iNaT @@ -713,14 +707,6 @@ def _from_factorized(cls, values, original): def _values_for_argsort(self): return self._data - @Appender(ExtensionArray.shift.__doc__) - def shift(self, periods=1, fill_value=None, axis=0): - - fill_value = self._validate_shift_value(fill_value) - new_values = shift(self._data, periods, axis, fill_value) - - return type(self)._simple_new(new_values, dtype=self.dtype) - # ------------------------------------------------------------------ # Validation Methods # TODO: try to de-duplicate these, ensure identical behavior diff --git 
a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 23a4a70734c81..588d68514649a 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -7,7 +7,6 @@ from pandas._libs import lib from pandas._typing import Scalar from pandas.compat.numpy import function as nv -from pandas.util._decorators import doc from pandas.util._validators import validate_fillna_kwargs from pandas.core.dtypes.dtypes import ExtensionDtype @@ -16,10 +15,9 @@ from pandas import compat from pandas.core import nanops, ops -from pandas.core.algorithms import searchsorted from pandas.core.array_algos import masked_reductions from pandas.core.arrays._mixins import NDArrayBackedExtensionArray -from pandas.core.arrays.base import ExtensionArray, ExtensionOpsMixin +from pandas.core.arrays.base import ExtensionOpsMixin from pandas.core.construction import extract_array from pandas.core.indexers import check_array_indexer from pandas.core.missing import backfill_1d, pad_1d @@ -189,10 +187,6 @@ def _from_sequence(cls, scalars, dtype=None, copy: bool = False) -> "PandasArray def _from_factorized(cls, values, original) -> "PandasArray": return cls(values) - @classmethod - def _concat_same_type(cls, to_concat) -> "PandasArray": - return cls(np.concatenate(to_concat)) - def _from_backing_data(self, arr: np.ndarray) -> "PandasArray": return type(self)(arr) @@ -423,10 +417,6 @@ def to_numpy( return result - @doc(ExtensionArray.searchsorted) - def searchsorted(self, value, side="left", sorter=None): - return searchsorted(self.to_numpy(), value, side=side, sorter=sorter) - # ------------------------------------------------------------------------ # Ops From 4480b4a40a6c50749dd9c885a0807acca70f2326 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Sun, 6 Sep 2020 13:11:04 -0400 Subject: [PATCH 42/71] CLN: Separate transform tests (#36146) --- pandas/tests/frame/apply/test_frame_apply.py | 49 +------------ .../tests/frame/apply/test_frame_transform.py | 72 +++++++++++++++++++ pandas/tests/frame/common.py | 24 +++++++ .../tests/series/apply/test_series_apply.py | 31 +------- .../series/apply/test_series_transform.py | 59 +++++++++++++++ 5 files changed, 157 insertions(+), 78 deletions(-) create mode 100644 pandas/tests/frame/apply/test_frame_transform.py create mode 100644 pandas/tests/series/apply/test_series_transform.py diff --git a/pandas/tests/frame/apply/test_frame_apply.py b/pandas/tests/frame/apply/test_frame_apply.py index 5a1e448beb40f..bc09501583e2c 100644 --- a/pandas/tests/frame/apply/test_frame_apply.py +++ b/pandas/tests/frame/apply/test_frame_apply.py @@ -1,7 +1,6 @@ from collections import OrderedDict from datetime import datetime from itertools import chain -import operator import warnings import numpy as np @@ -14,6 +13,7 @@ import pandas._testing as tm from pandas.core.apply import frame_apply from pandas.core.base import SpecificationError +from pandas.tests.frame.common import zip_frames @pytest.fixture @@ -1058,25 +1058,6 @@ def test_consistency_for_boxed(self, box, int_frame_const_col): tm.assert_frame_equal(result, expected) -def zip_frames(frames, axis=1): - """ - take a list of frames, zip them together under the - assumption that these all have the first frames' index/columns. 
- - Returns - ------- - new_frame : DataFrame - """ - if axis == 1: - columns = frames[0].columns - zipped = [f.loc[:, c] for c in columns for f in frames] - return pd.concat(zipped, axis=1) - else: - index = frames[0].index - zipped = [f.loc[i, :] for i in index for f in frames] - return pd.DataFrame(zipped) - - class TestDataFrameAggregate: def test_agg_transform(self, axis, float_frame): other_axis = 1 if axis in {0, "index"} else 0 @@ -1087,16 +1068,10 @@ def test_agg_transform(self, axis, float_frame): f_sqrt = np.sqrt(float_frame) # ufunc - result = float_frame.transform(np.sqrt, axis=axis) expected = f_sqrt.copy() - tm.assert_frame_equal(result, expected) - result = float_frame.apply(np.sqrt, axis=axis) tm.assert_frame_equal(result, expected) - result = float_frame.transform(np.sqrt, axis=axis) - tm.assert_frame_equal(result, expected) - # list-like result = float_frame.apply([np.sqrt], axis=axis) expected = f_sqrt.copy() @@ -1110,9 +1085,6 @@ def test_agg_transform(self, axis, float_frame): ) tm.assert_frame_equal(result, expected) - result = float_frame.transform([np.sqrt], axis=axis) - tm.assert_frame_equal(result, expected) - # multiple items in list # these are in the order as if we are applying both # functions per series and then concatting @@ -1128,38 +1100,19 @@ def test_agg_transform(self, axis, float_frame): ) tm.assert_frame_equal(result, expected) - result = float_frame.transform([np.abs, "sqrt"], axis=axis) - tm.assert_frame_equal(result, expected) - def test_transform_and_agg_err(self, axis, float_frame): # cannot both transform and agg - msg = "transforms cannot produce aggregated results" - with pytest.raises(ValueError, match=msg): - float_frame.transform(["max", "min"], axis=axis) - msg = "cannot combine transform and aggregation operations" with pytest.raises(ValueError, match=msg): with np.errstate(all="ignore"): float_frame.agg(["max", "sqrt"], axis=axis) - with pytest.raises(ValueError, match=msg): - with np.errstate(all="ignore"): - float_frame.transform(["max", "sqrt"], axis=axis) - df = pd.DataFrame({"A": range(5), "B": 5}) def f(): with np.errstate(all="ignore"): df.agg({"A": ["abs", "sum"], "B": ["mean", "max"]}, axis=axis) - @pytest.mark.parametrize("method", ["abs", "shift", "pct_change", "cumsum", "rank"]) - def test_transform_method_name(self, method): - # GH 19760 - df = pd.DataFrame({"A": [-1, 2]}) - result = df.transform(method) - expected = operator.methodcaller(method)(df) - tm.assert_frame_equal(result, expected) - def test_demo(self): # demonstration tests df = pd.DataFrame({"A": range(5), "B": 5}) diff --git a/pandas/tests/frame/apply/test_frame_transform.py b/pandas/tests/frame/apply/test_frame_transform.py new file mode 100644 index 0000000000000..3a345215482ed --- /dev/null +++ b/pandas/tests/frame/apply/test_frame_transform.py @@ -0,0 +1,72 @@ +import operator + +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm +from pandas.tests.frame.common import zip_frames + + +def test_agg_transform(axis, float_frame): + other_axis = 1 if axis in {0, "index"} else 0 + + with np.errstate(all="ignore"): + + f_abs = np.abs(float_frame) + f_sqrt = np.sqrt(float_frame) + + # ufunc + result = float_frame.transform(np.sqrt, axis=axis) + expected = f_sqrt.copy() + tm.assert_frame_equal(result, expected) + + result = float_frame.transform(np.sqrt, axis=axis) + tm.assert_frame_equal(result, expected) + + # list-like + expected = f_sqrt.copy() + if axis in {0, "index"}: + expected.columns = pd.MultiIndex.from_product( + 
[float_frame.columns, ["sqrt"]] + ) + else: + expected.index = pd.MultiIndex.from_product([float_frame.index, ["sqrt"]]) + result = float_frame.transform([np.sqrt], axis=axis) + tm.assert_frame_equal(result, expected) + + # multiple items in list + # these are in the order as if we are applying both + # functions per series and then concatting + expected = zip_frames([f_abs, f_sqrt], axis=other_axis) + if axis in {0, "index"}: + expected.columns = pd.MultiIndex.from_product( + [float_frame.columns, ["absolute", "sqrt"]] + ) + else: + expected.index = pd.MultiIndex.from_product( + [float_frame.index, ["absolute", "sqrt"]] + ) + result = float_frame.transform([np.abs, "sqrt"], axis=axis) + tm.assert_frame_equal(result, expected) + + +def test_transform_and_agg_err(axis, float_frame): + # cannot both transform and agg + msg = "transforms cannot produce aggregated results" + with pytest.raises(ValueError, match=msg): + float_frame.transform(["max", "min"], axis=axis) + + msg = "cannot combine transform and aggregation operations" + with pytest.raises(ValueError, match=msg): + with np.errstate(all="ignore"): + float_frame.transform(["max", "sqrt"], axis=axis) + + +@pytest.mark.parametrize("method", ["abs", "shift", "pct_change", "cumsum", "rank"]) +def test_transform_method_name(method): + # GH 19760 + df = pd.DataFrame({"A": [-1, 2]}) + result = df.transform(method) + expected = operator.methodcaller(method)(df) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/common.py b/pandas/tests/frame/common.py index 463a140972ab5..73e60ff389038 100644 --- a/pandas/tests/frame/common.py +++ b/pandas/tests/frame/common.py @@ -1,3 +1,8 @@ +from typing import List + +from pandas import DataFrame, concat + + def _check_mixed_float(df, dtype=None): # float16 are most likely to be upcasted to float32 dtypes = dict(A="float32", B="float32", C="float16", D="float64") @@ -29,3 +34,22 @@ def _check_mixed_int(df, dtype=None): assert df.dtypes["C"] == dtypes["C"] if dtypes.get("D"): assert df.dtypes["D"] == dtypes["D"] + + +def zip_frames(frames: List[DataFrame], axis: int = 1) -> DataFrame: + """ + take a list of frames, zip them together under the + assumption that these all have the first frames' index/columns. 
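(A quick sketch of what this helper produces, with invented frame values; under ``axis=1`` the input frames' columns are interleaved column by column:)

>>> from pandas import DataFrame
>>> df = DataFrame({"a": [1, 4], "b": [2, 5]})
>>> zip_frames([df, df ** 2], axis=1).columns.tolist()
['a', 'a', 'b', 'b']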
+ + Returns + ------- + new_frame : DataFrame + """ + if axis == 1: + columns = frames[0].columns + zipped = [f.loc[:, c] for c in columns for f in frames] + return concat(zipped, axis=1) + else: + index = frames[0].index + zipped = [f.loc[i, :] for i in index for f in frames] + return DataFrame(zipped) diff --git a/pandas/tests/series/apply/test_series_apply.py b/pandas/tests/series/apply/test_series_apply.py index 308398642895c..b948317f32062 100644 --- a/pandas/tests/series/apply/test_series_apply.py +++ b/pandas/tests/series/apply/test_series_apply.py @@ -209,25 +209,16 @@ def test_transform(self, string_series): f_abs = np.abs(string_series) # ufunc - result = string_series.transform(np.sqrt) expected = f_sqrt.copy() - tm.assert_series_equal(result, expected) - result = string_series.apply(np.sqrt) tm.assert_series_equal(result, expected) # list-like - result = string_series.transform([np.sqrt]) + result = string_series.apply([np.sqrt]) expected = f_sqrt.to_frame().copy() expected.columns = ["sqrt"] tm.assert_frame_equal(result, expected) - result = string_series.transform([np.sqrt]) - tm.assert_frame_equal(result, expected) - - result = string_series.transform(["sqrt"]) - tm.assert_frame_equal(result, expected) - # multiple items in list # these are in the order as if we are applying both functions per # series and then concatting @@ -236,10 +227,6 @@ def test_transform(self, string_series): result = string_series.apply([np.sqrt, np.abs]) tm.assert_frame_equal(result, expected) - result = string_series.transform(["sqrt", "abs"]) - expected.columns = ["sqrt", "abs"] - tm.assert_frame_equal(result, expected) - # dict, provide renaming expected = pd.concat([f_sqrt, f_abs], axis=1) expected.columns = ["foo", "bar"] @@ -250,19 +237,11 @@ def test_transform(self, string_series): def test_transform_and_agg_error(self, string_series): # we are trying to transform with an aggregator - msg = "transforms cannot produce aggregated results" - with pytest.raises(ValueError, match=msg): - string_series.transform(["min", "max"]) - msg = "cannot combine transform and aggregation" with pytest.raises(ValueError, match=msg): with np.errstate(all="ignore"): string_series.agg(["sqrt", "max"]) - with pytest.raises(ValueError, match=msg): - with np.errstate(all="ignore"): - string_series.transform(["sqrt", "max"]) - msg = "cannot perform both aggregation and transformation" with pytest.raises(ValueError, match=msg): with np.errstate(all="ignore"): @@ -463,14 +442,6 @@ def test_agg_cython_table_raises(self, series, func, expected): # e.g. 
Series('a b'.split()).cumprod() will raise series.agg(func) - def test_transform_none_to_type(self): - # GH34377 - df = pd.DataFrame({"a": [None]}) - - msg = "DataFrame constructor called with incompatible data and dtype" - with pytest.raises(TypeError, match=msg): - df.transform({"a": int}) - class TestSeriesMap: def test_map(self, datetime_series): diff --git a/pandas/tests/series/apply/test_series_transform.py b/pandas/tests/series/apply/test_series_transform.py new file mode 100644 index 0000000000000..8bc3d2dc4d0db --- /dev/null +++ b/pandas/tests/series/apply/test_series_transform.py @@ -0,0 +1,59 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + + +def test_transform(string_series): + # transforming functions + + with np.errstate(all="ignore"): + f_sqrt = np.sqrt(string_series) + f_abs = np.abs(string_series) + + # ufunc + result = string_series.transform(np.sqrt) + expected = f_sqrt.copy() + tm.assert_series_equal(result, expected) + + # list-like + result = string_series.transform([np.sqrt]) + expected = f_sqrt.to_frame().copy() + expected.columns = ["sqrt"] + tm.assert_frame_equal(result, expected) + + result = string_series.transform([np.sqrt]) + tm.assert_frame_equal(result, expected) + + result = string_series.transform(["sqrt"]) + tm.assert_frame_equal(result, expected) + + # multiple items in list + # these are in the order as if we are applying both functions per + # series and then concatting + expected = pd.concat([f_sqrt, f_abs], axis=1) + result = string_series.transform(["sqrt", "abs"]) + expected.columns = ["sqrt", "abs"] + tm.assert_frame_equal(result, expected) + + +def test_transform_and_agg_error(string_series): + # we are trying to transform with an aggregator + msg = "transforms cannot produce aggregated results" + with pytest.raises(ValueError, match=msg): + string_series.transform(["min", "max"]) + + msg = "cannot combine transform and aggregation operations" + with pytest.raises(ValueError, match=msg): + with np.errstate(all="ignore"): + string_series.transform(["sqrt", "max"]) + + +def test_transform_none_to_type(): + # GH34377 + df = pd.DataFrame({"a": [None]}) + + msg = "DataFrame constructor called with incompatible data and dtype" + with pytest.raises(TypeError, match=msg): + df.transform({"a": int}) From c2a0eac713ef21244a6c4c1846bfad863a96bb9b Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Sun, 6 Sep 2020 13:24:03 -0400 Subject: [PATCH 43/71] CLN: _wrap_applied_output (#36160) --- pandas/core/groupby/generic.py | 191 ++++++++++++++++----------------- 1 file changed, 91 insertions(+), 100 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 728e1ee4653fd..f428085cf441a 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1229,113 +1229,104 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): return self.obj._constructor() elif isinstance(first_not_none, DataFrame): return self._concat_objects(keys, values, not_indexed_same=not_indexed_same) - else: - key_index = self.grouper.result_index if self.as_index else None - - if isinstance(first_not_none, Series): - # this is to silence a DeprecationWarning - # TODO: Remove when default dtype of empty Series is object - kwargs = first_not_none._construct_axes_dict() - backup = create_series_with_explicit_dtype( - dtype_if_empty=object, **kwargs - ) - - values = [x if (x is not None) else backup for x in values] - v = 
values[0] - - if not isinstance(v, (np.ndarray, Index, Series)) and self.as_index: - # values are not series or array-like but scalars - # self._selection_name not passed through to Series as the - # result should not take the name of original selection - # of columns - return self.obj._constructor_sliced(values, index=key_index) + key_index = self.grouper.result_index if self.as_index else None + + if isinstance(first_not_none, Series): + # this is to silence a DeprecationWarning + # TODO: Remove when default dtype of empty Series is object + kwargs = first_not_none._construct_axes_dict() + backup = create_series_with_explicit_dtype(dtype_if_empty=object, **kwargs) + + values = [x if (x is not None) else backup for x in values] + + v = values[0] + + if not isinstance(v, (np.ndarray, Index, Series)) and self.as_index: + # values are not series or array-like but scalars + # self._selection_name not passed through to Series as the + # result should not take the name of original selection + # of columns + return self.obj._constructor_sliced(values, index=key_index) + + if isinstance(v, Series): + all_indexed_same = all_indexes_same((x.index for x in values)) + + # GH3596 + # provide a reduction (Frame -> Series) if groups are + # unique + if self.squeeze: + applied_index = self._selected_obj._get_axis(self.axis) + singular_series = len(values) == 1 and applied_index.nlevels == 1 + + # assign the name to this series + if singular_series: + values[0].name = keys[0] + + # GH2893 + # we have series in the values array, we want to + # produce a series: + # if any of the sub-series are not indexed the same + # OR we don't have a multi-index and we have only a + # single values + return self._concat_objects( + keys, values, not_indexed_same=not_indexed_same + ) + # still a series + # path added as of GH 5545 + elif all_indexed_same: + from pandas.core.reshape.concat import concat + + return concat(values) + + if not all_indexed_same: + # GH 8467 + return self._concat_objects(keys, values, not_indexed_same=True) + + # Combine values + # vstack+constructor is faster than concat and handles MI-columns + stacked_values = np.vstack([np.asarray(v) for v in values]) + + if self.axis == 0: + index = key_index + columns = v.index.copy() + if columns.name is None: + # GH6124 - propagate name of Series when it's consistent + names = {v.name for v in values} + if len(names) == 1: + columns.name = list(names)[0] else: - if isinstance(v, Series): - all_indexed_same = all_indexes_same((x.index for x in values)) - - # GH3596 - # provide a reduction (Frame -> Series) if groups are - # unique - if self.squeeze: - applied_index = self._selected_obj._get_axis(self.axis) - singular_series = ( - len(values) == 1 and applied_index.nlevels == 1 - ) - - # assign the name to this series - if singular_series: - values[0].name = keys[0] - - # GH2893 - # we have series in the values array, we want to - # produce a series: - # if any of the sub-series are not indexed the same - # OR we don't have a multi-index and we have only a - # single values - return self._concat_objects( - keys, values, not_indexed_same=not_indexed_same - ) - - # still a series - # path added as of GH 5545 - elif all_indexed_same: - from pandas.core.reshape.concat import concat - - return concat(values) - - if not all_indexed_same: - # GH 8467 - return self._concat_objects(keys, values, not_indexed_same=True) - - # Combine values - # vstack+constructor is faster than concat and handles MI-columns - stacked_values = np.vstack([np.asarray(v) for v in 
values]) - - if self.axis == 0: - index = key_index - columns = v.index.copy() - if columns.name is None: - # GH6124 - propagate name of Series when it's consistent - names = {v.name for v in values} - if len(names) == 1: - columns.name = list(names)[0] - else: - index = v.index - columns = key_index - stacked_values = stacked_values.T - - result = self.obj._constructor( - stacked_values, index=index, columns=columns - ) + index = v.index + columns = key_index + stacked_values = stacked_values.T - elif not self.as_index: - # We add grouping column below, so create a frame here - result = DataFrame( - values, index=key_index, columns=[self._selection] - ) - else: - # GH#1738: values is list of arrays of unequal lengths - # fall through to the outer else clause - # TODO: sure this is right? we used to do this - # after raising AttributeError above - return self.obj._constructor_sliced( - values, index=key_index, name=self._selection_name - ) + result = self.obj._constructor(stacked_values, index=index, columns=columns) - # if we have date/time like in the original, then coerce dates - # as we are stacking can easily have object dtypes here - so = self._selected_obj - if so.ndim == 2 and so.dtypes.apply(needs_i8_conversion).any(): - result = _recast_datetimelike_result(result) - else: - result = result._convert(datetime=True) + elif not self.as_index: + # We add grouping column below, so create a frame here + result = DataFrame(values, index=key_index, columns=[self._selection]) + else: + # GH#1738: values is list of arrays of unequal lengths + # fall through to the outer else clause + # TODO: sure this is right? we used to do this + # after raising AttributeError above + return self.obj._constructor_sliced( + values, index=key_index, name=self._selection_name + ) + + # if we have date/time like in the original, then coerce dates + # as we are stacking can easily have object dtypes here + so = self._selected_obj + if so.ndim == 2 and so.dtypes.apply(needs_i8_conversion).any(): + result = _recast_datetimelike_result(result) + else: + result = result._convert(datetime=True) - if not self.as_index: - self._insert_inaxis_grouper_inplace(result) + if not self.as_index: + self._insert_inaxis_grouper_inplace(result) - return self._reindex_output(result) + return self._reindex_output(result) def _transform_general( self, func, *args, engine="cython", engine_kwargs=None, **kwargs From 366f63cfc25201f8fc354a9c51d03f0e974f6f32 Mon Sep 17 00:00:00 2001 From: Alex Kirko Date: Sun, 6 Sep 2020 20:47:40 +0300 Subject: [PATCH 44/71] BUG: allow missing values in Index when calling Index.sort_values (#35604) --- doc/source/whatsnew/v1.2.0.rst | 3 +- pandas/conftest.py | 23 ++++++++ pandas/core/indexes/base.py | 27 ++++++++-- .../tests/indexes/interval/test_interval.py | 2 +- pandas/tests/indexes/period/test_ops.py | 16 ++++-- pandas/tests/indexes/test_common.py | 52 ++++++++++++++++++- 6 files changed, 112 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index ff9e803b4990a..b4fdbf9588ffe 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -270,8 +270,9 @@ Interval Indexing ^^^^^^^^ + - Bug in :meth:`PeriodIndex.get_loc` incorrectly raising ``ValueError`` on non-datelike strings instead of ``KeyError``, causing similar errors in :meth:`Series.__geitem__`, :meth:`Series.__contains__`, and :meth:`Series.loc.__getitem__` (:issue:`34240`) -- +- Bug in :meth:`Index.sort_values` where, when empty values were passed, the method 
would break by trying to compare missing values instead of pushing them to the end of the sort order. (:issue:`35584`) - Missing diff --git a/pandas/conftest.py b/pandas/conftest.py index 0878380d00837..5474005a63b8e 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -437,6 +437,29 @@ def index(request): index_fixture2 = index +@pytest.fixture(params=indices_dict.keys()) +def index_with_missing(request): + """ + Fixture for indices with missing values + """ + if request.param in ["int", "uint", "range", "empty", "repeats"]: + pytest.xfail("missing values not supported") + # GH 35538. Use deep copy to avoid illusive bug on np-dev + # Azure pipeline that writes into indices_dict despite copy + ind = indices_dict[request.param].copy(deep=True) + vals = ind.values + if request.param in ["tuples", "mi-with-dt64tz-level", "multi"]: + # For setting missing values in the top level of MultiIndex + vals = ind.tolist() + vals[0] = tuple([None]) + vals[0][1:] + vals[-1] = tuple([None]) + vals[-1][1:] + return MultiIndex.from_tuples(vals) + else: + vals[0] = None + vals[-1] = None + return type(ind)(vals) + + # ---------------------------------------------------------------- # Series' # ---------------------------------------------------------------- diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 65b5dfb6df911..a1bc8a4659b24 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -88,7 +88,7 @@ import pandas.core.missing as missing from pandas.core.ops import get_op_result_name from pandas.core.ops.invalid import make_invalid_op -from pandas.core.sorting import ensure_key_mapped +from pandas.core.sorting import ensure_key_mapped, nargsort from pandas.core.strings import StringMethods from pandas.io.formats.printing import ( @@ -4443,7 +4443,11 @@ def asof_locs(self, where, mask): return result def sort_values( - self, return_indexer=False, ascending=True, key: Optional[Callable] = None + self, + return_indexer=False, + ascending=True, + na_position: str_t = "last", + key: Optional[Callable] = None, ): """ Return a sorted copy of the index. @@ -4457,6 +4461,12 @@ def sort_values( Should the indices that would sort the index be returned. ascending : bool, default True Should the index values be sorted in an ascending order. + na_position : {'first' or 'last'}, default 'last' + Argument 'first' puts NaNs at the beginning, 'last' puts NaNs at + the end. + + .. versionadded:: 1.2.0 + key : callable, optional If not None, apply the key function to the index values before sorting. This is similar to the `key` argument in the @@ -4497,9 +4507,16 @@ def sort_values( """ idx = ensure_key_mapped(self, key) - _as = idx.argsort() - if not ascending: - _as = _as[::-1] + # GH 35584. 
Sort missing values according to na_position kwarg + # ignore na_position for MutiIndex + if not isinstance(self, ABCMultiIndex): + _as = nargsort( + items=idx, ascending=ascending, na_position=na_position, key=key + ) + else: + _as = idx.argsort() + if not ascending: + _as = _as[::-1] sorted_index = self.take(_as) diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py index a20e542b1edd7..42849e0bbb5c7 100644 --- a/pandas/tests/indexes/interval/test_interval.py +++ b/pandas/tests/indexes/interval/test_interval.py @@ -618,7 +618,7 @@ def test_sort_values(self, closed): expected = IntervalIndex([Interval(0, 1), Interval(1, 2), np.nan]) tm.assert_index_equal(result, expected) - result = index.sort_values(ascending=False) + result = index.sort_values(ascending=False, na_position="first") expected = IntervalIndex([np.nan, Interval(1, 2), Interval(0, 1)]) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/period/test_ops.py b/pandas/tests/indexes/period/test_ops.py index e7dd76584d780..d1b34c315b682 100644 --- a/pandas/tests/indexes/period/test_ops.py +++ b/pandas/tests/indexes/period/test_ops.py @@ -174,9 +174,6 @@ def _check_freq(index, expected_index): ordered, indexer = idx.sort_values(return_indexer=True, ascending=False) tm.assert_index_equal(ordered, expected[::-1]) - - exp = np.array([2, 1, 3, 4, 0]) - tm.assert_numpy_array_equal(indexer, exp, check_dtype=False) _check_freq(ordered, idx) pidx = PeriodIndex(["2011", "2013", "NaT", "2011"], name="pidx", freq="D") @@ -333,3 +330,16 @@ def test_freq_setter_deprecated(self): # warning for setter with pytest.raises(AttributeError, match="can't set attribute"): idx.freq = pd.offsets.Day() + + +@pytest.mark.xfail(reason="Datetime-like sort_values currently unstable (GH 35922)") +def test_order_stability_compat(): + # GH 35584. The new implementation of sort_values for Index.sort_values + # is stable when sorting in descending order. Datetime-like sort_values + # currently aren't stable. 
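(Usage sketch of the new ``na_position`` keyword with invented values; the reprs shown assume the ``Float64Index`` display of pandas 1.x:)

>>> import numpy as np
>>> import pandas as pd
>>> idx = pd.Index([1.0, np.nan, 3.0, 2.0])
>>> idx.sort_values()  # missing values are pushed to the end by default
Float64Index([1.0, 2.0, 3.0, nan], dtype='float64')
>>> idx.sort_values(na_position="first")
Float64Index([nan, 1.0, 2.0, 3.0], dtype='float64')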
xfail should be removed after + # the implementations' behavior is synchronized (xref GH 35922) + pidx = PeriodIndex(["2011", "2013", "2015", "2012", "2011"], name="pidx", freq="A") + iidx = Index([2011, 2013, 2015, 2012, 2011], name="idx") + ordered1, indexer1 = pidx.sort_values(return_indexer=True, ascending=False) + ordered2, indexer2 = iidx.sort_values(return_indexer=True, ascending=False) + tm.assert_numpy_array_equal(indexer1, indexer2) diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index db260b71e7186..aa6b395176b06 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -13,7 +13,14 @@ from pandas.core.dtypes.common import is_period_dtype, needs_i8_conversion import pandas as pd -from pandas import CategoricalIndex, MultiIndex, RangeIndex +from pandas import ( + CategoricalIndex, + DatetimeIndex, + MultiIndex, + PeriodIndex, + RangeIndex, + TimedeltaIndex, +) import pandas._testing as tm @@ -391,3 +398,46 @@ def test_astype_preserves_name(self, index, dtype): assert result.names == index.names else: assert result.name == index.name + + +@pytest.mark.parametrize("na_position", [None, "middle"]) +def test_sort_values_invalid_na_position(index_with_missing, na_position): + if isinstance(index_with_missing, (DatetimeIndex, PeriodIndex, TimedeltaIndex)): + # datetime-like indices will get na_position kwarg as part of + # synchronizing duplicate-sorting behavior, because we currently expect + # them, other indices, and Series to sort differently (xref 35922) + pytest.xfail("sort_values does not support na_position kwarg") + elif isinstance(index_with_missing, (CategoricalIndex, MultiIndex)): + pytest.xfail("missing value sorting order not defined for index type") + + if na_position not in ["first", "last"]: + with pytest.raises( + ValueError, match=f"invalid na_position: {na_position}", + ): + index_with_missing.sort_values(na_position=na_position) + + +@pytest.mark.parametrize("na_position", ["first", "last"]) +def test_sort_values_with_missing(index_with_missing, na_position): + # GH 35584. 
Test that sort_values works with missing values, + # sort non-missing and place missing according to na_position + + if isinstance(index_with_missing, (DatetimeIndex, PeriodIndex, TimedeltaIndex)): + # datetime-like indices will get na_position kwarg as part of + # synchronizing duplicate-sorting behavior, because we currently expect + # them, other indices, and Series to sort differently (xref 35922) + pytest.xfail("sort_values does not support na_position kwarg") + elif isinstance(index_with_missing, (CategoricalIndex, MultiIndex)): + pytest.xfail("missing value sorting order not defined for index type") + + missing_count = np.sum(index_with_missing.isna()) + not_na_vals = index_with_missing[index_with_missing.notna()].values + sorted_values = np.sort(not_na_vals) + if na_position == "first": + sorted_values = np.concatenate([[None] * missing_count, sorted_values]) + else: + sorted_values = np.concatenate([sorted_values, [None] * missing_count]) + expected = type(index_with_missing)(sorted_values) + + result = index_with_missing.sort_values(na_position=na_position) + tm.assert_index_equal(result, expected) From 8631f2e5e5c0c5d95e3381702ffbd34b282df8a2 Mon Sep 17 00:00:00 2001 From: Honfung Wong Date: Mon, 7 Sep 2020 01:49:26 +0800 Subject: [PATCH 45/71] BUG: extra leading space in to_string when index=False (#36094) --- doc/source/whatsnew/v1.2.0.rst | 5 ++- pandas/io/formats/format.py | 28 +++++++++++----- pandas/tests/io/formats/test_format.py | 42 +++++++++++++++++++++--- pandas/tests/io/formats/test_to_latex.py | 22 ++++++------- 4 files changed, 71 insertions(+), 26 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index b4fdbf9588ffe..9a778acba4764 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -214,8 +214,6 @@ Performance improvements Bug fixes ~~~~~~~~~ -- Bug in :meth:`DataFrameGroupBy.apply` raising error with ``np.nan`` group(s) when ``dropna=False`` (:issue:`35889`) -- Categorical ^^^^^^^^^^^ @@ -257,7 +255,7 @@ Conversion Strings ^^^^^^^ - +- Bug in :meth:`Series.to_string`, :meth:`DataFrame.to_string`, and :meth:`DataFrame.to_latex` adding a leading space when ``index=False`` (:issue:`24980`) - - @@ -315,6 +313,7 @@ Groupby/resample/rolling - Bug when subsetting columns on a :class:`~pandas.core.groupby.DataFrameGroupBy` (e.g. ``df.groupby('a')[['b']])``) would reset the attributes ``axis``, ``dropna``, ``group_keys``, ``level``, ``mutated``, ``sort``, and ``squeeze`` to their default values. 
(:issue:`9959`) - Bug in :meth:`DataFrameGroupby.tshift` failing to raise ``ValueError`` when a frequency cannot be inferred for the index of a group (:issue:`35937`) - Bug in :meth:`DataFrame.groupby` does not always maintain column index name for ``any``, ``all``, ``bfill``, ``ffill``, ``shift`` (:issue:`29764`) +- Bug in :meth:`DataFrameGroupBy.apply` raising error with ``np.nan`` group(s) when ``dropna=False`` (:issue:`35889`) - Reshaping diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 53b2b533215f0..70e38c3106bdb 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -345,6 +345,7 @@ def _get_formatted_values(self) -> List[str]: None, float_format=self.float_format, na_rep=self.na_rep, + leading_space=self.index, ) def to_string(self) -> str: @@ -960,6 +961,7 @@ def _format_col(self, i: int) -> List[str]: na_rep=self.na_rep, space=self.col_space.get(frame.columns[i]), decimal=self.decimal, + leading_space=self.index, ) def to_html( @@ -1111,7 +1113,7 @@ def format_array( space: Optional[Union[str, int]] = None, justify: str = "right", decimal: str = ".", - leading_space: Optional[bool] = None, + leading_space: Optional[bool] = True, quoting: Optional[int] = None, ) -> List[str]: """ @@ -1127,7 +1129,7 @@ def format_array( space justify decimal - leading_space : bool, optional + leading_space : bool, optional, default True Whether the array should be formatted with a leading space. When an array as a column of a Series or DataFrame, we do want the leading space to pad between columns. @@ -1194,7 +1196,7 @@ def __init__( decimal: str = ".", quoting: Optional[int] = None, fixed_width: bool = True, - leading_space: Optional[bool] = None, + leading_space: Optional[bool] = True, ): self.values = values self.digits = digits @@ -1395,9 +1397,11 @@ def format_values_with(float_format): float_format: Optional[FloatFormatType] if self.float_format is None: if self.fixed_width: - float_format = partial( - "{value: .{digits:d}f}".format, digits=self.digits - ) + if self.leading_space is True: + fmt_str = "{value: .{digits:d}f}" + else: + fmt_str = "{value:.{digits:d}f}" + float_format = partial(fmt_str.format, digits=self.digits) else: float_format = self.float_format else: @@ -1429,7 +1433,11 @@ def format_values_with(float_format): ).any() if has_small_values or (too_long and has_large_values): - float_format = partial("{value: .{digits:d}e}".format, digits=self.digits) + if self.leading_space is True: + fmt_str = "{value: .{digits:d}e}" + else: + fmt_str = "{value:.{digits:d}e}" + float_format = partial(fmt_str.format, digits=self.digits) formatted_values = format_values_with(float_format) return formatted_values @@ -1444,7 +1452,11 @@ def _format_strings(self) -> List[str]: class IntArrayFormatter(GenericArrayFormatter): def _format_strings(self) -> List[str]: - formatter = self.formatter or (lambda x: f"{x: d}") + if self.leading_space is False: + formatter_str = lambda x: f"{x:d}".format(x=x) + else: + formatter_str = lambda x: f"{x: d}".format(x=x) + formatter = self.formatter or formatter_str fmt_values = [formatter(x) for x in self.values] return fmt_values diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 1fb957505987f..f00fa6274fca2 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -1546,11 +1546,11 @@ def test_to_string_no_index(self): df_s = df.to_string(index=False) # Leading space is expected for positive numbers. 
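(With this fix, ``index=False`` output no longer carries the leading pad column; a sketch with an invented frame, padding assumed from the updated expectations in this hunk:)

>>> import pandas as pd
>>> print(pd.DataFrame({"x": [11, 22]}).to_string(index=False))
 x
11
22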
- expected = " x y z\n 11 33 AAA\n 22 -44 " + expected = " x y z\n11 33 AAA\n22 -44 " assert df_s == expected df_s = df[["y", "x", "z"]].to_string(index=False) - expected = " y x z\n 33 11 AAA\n-44 22 " + expected = " y x z\n 33 11 AAA\n-44 22 " assert df_s == expected def test_to_string_line_width_no_index(self): @@ -1565,7 +1565,7 @@ def test_to_string_line_width_no_index(self): df = DataFrame({"x": [11, 22, 33], "y": [4, 5, 6]}) df_s = df.to_string(line_width=1, index=False) - expected = " x \\\n 11 \n 22 \n 33 \n\n y \n 4 \n 5 \n 6 " + expected = " x \\\n11 \n22 \n33 \n\n y \n 4 \n 5 \n 6 " assert df_s == expected @@ -2269,7 +2269,7 @@ def test_to_string_without_index(self): # GH 11729 Test index=False option s = Series([1, 2, 3, 4]) result = s.to_string(index=False) - expected = " 1\n" + " 2\n" + " 3\n" + " 4" + expected = "1\n" + "2\n" + "3\n" + "4" assert result == expected def test_unicode_name_in_footer(self): @@ -3391,3 +3391,37 @@ def test_filepath_or_buffer_bad_arg_raises(float_frame, method): msg = "buf is not a file name and it has no write method" with pytest.raises(TypeError, match=msg): getattr(float_frame, method)(buf=object()) + + +@pytest.mark.parametrize( + "input_array, expected", + [ + ("a", "a"), + (["a", "b"], "a\nb"), + ([1, "a"], "1\na"), + (1, "1"), + ([0, -1], " 0\n-1"), + (1.0, "1.0"), + ([" a", " b"], " a\n b"), + ([".1", "1"], ".1\n 1"), + (["10", "-10"], " 10\n-10"), + ], +) +def test_format_remove_leading_space_series(input_array, expected): + # GH: 24980 + s = pd.Series(input_array).to_string(index=False) + assert s == expected + + +@pytest.mark.parametrize( + "input_array, expected", + [ + ({"A": ["a"]}, "A\na"), + ({"A": ["a", "b"], "B": ["c", "dd"]}, "A B\na c\nb dd"), + ({"A": ["a", 1], "B": ["aa", 1]}, "A B\na aa\n1 1"), + ], +) +def test_format_remove_leading_space_dataframe(input_array, expected): + # GH: 24980 + df = pd.DataFrame(input_array).to_string(index=False) + assert df == expected diff --git a/pandas/tests/io/formats/test_to_latex.py b/pandas/tests/io/formats/test_to_latex.py index 96a9ed2b86cf4..9dfd851e91c65 100644 --- a/pandas/tests/io/formats/test_to_latex.py +++ b/pandas/tests/io/formats/test_to_latex.py @@ -50,10 +50,10 @@ def test_to_latex(self, float_frame): withoutindex_result = df.to_latex(index=False) withoutindex_expected = r"""\begin{tabular}{rl} \toprule - a & b \\ + a & b \\ \midrule - 1 & b1 \\ - 2 & b2 \\ + 1 & b1 \\ + 2 & b2 \\ \bottomrule \end{tabular} """ @@ -413,7 +413,7 @@ def test_to_latex_longtable(self): withoutindex_result = df.to_latex(index=False, longtable=True) withoutindex_expected = r"""\begin{longtable}{rl} \toprule - a & b \\ + a & b \\ \midrule \endhead \midrule @@ -423,8 +423,8 @@ def test_to_latex_longtable(self): \bottomrule \endlastfoot - 1 & b1 \\ - 2 & b2 \\ + 1 & b1 \\ + 2 & b2 \\ \end{longtable} """ @@ -663,8 +663,8 @@ def test_to_latex_no_header(self): withoutindex_result = df.to_latex(index=False, header=False) withoutindex_expected = r"""\begin{tabular}{rl} \toprule - 1 & b1 \\ - 2 & b2 \\ +1 & b1 \\ +2 & b2 \\ \bottomrule \end{tabular} """ @@ -690,10 +690,10 @@ def test_to_latex_specified_header(self): withoutindex_result = df.to_latex(header=["AA", "BB"], index=False) withoutindex_expected = r"""\begin{tabular}{rl} \toprule -AA & BB \\ +AA & BB \\ \midrule - 1 & b1 \\ - 2 & b2 \\ + 1 & b1 \\ + 2 & b2 \\ \bottomrule \end{tabular} """ From 66b3b5aeb0becc0c0d5657f2980e5454b1d59db5 Mon Sep 17 00:00:00 2001 From: Harsh Sharma <51477130+hs2361@users.noreply.github.com> Date: Mon, 7 Sep 2020 
16:46:11 +0530 Subject: [PATCH 46/71] BUG: shows correct package name when import_optional_dependency is ca… (#36134) --- doc/source/whatsnew/v1.1.2.rst | 1 + pandas/compat/_optional.py | 21 +++++++++++++++++++-- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index 1e946d325ace1..da261907565a1 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -41,6 +41,7 @@ Bug fixes - Bug in :meth:`Series.dt.isocalendar` and :meth:`DatetimeIndex.isocalendar` that returned incorrect year for certain dates (:issue:`36032`) - Bug in :class:`DataFrame` indexing returning an incorrect :class:`Series` in some cases when the series has been altered and a cache not invalidated (:issue:`33675`) - Bug in :meth:`DataFrame.corr` causing subsequent indexing lookups to be incorrect (:issue:`35882`) +- Bug in :meth:`import_optional_dependency` returning incorrect package names in cases where package name is different from import name (:issue:`35948`) .. --------------------------------------------------------------------------- diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 689c7c889ef66..40688a3978cfc 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -33,6 +33,19 @@ "numba": "0.46.0", } +# A mapping from import name to package name (on PyPI) for packages where +# these two names are different. + +INSTALL_MAPPING = { + "bs4": "beautifulsoup4", + "bottleneck": "Bottleneck", + "lxml.etree": "lxml", + "odf": "odfpy", + "pandas_gbq": "pandas-gbq", + "sqlalchemy": "SQLAlchemy", + "jinja2": "Jinja2", +} + def _get_version(module: types.ModuleType) -> str: version = getattr(module, "__version__", None) @@ -82,9 +95,13 @@ def import_optional_dependency( is False, or when the package's version is too old and `on_version` is ``'warn'``. """ + + package_name = INSTALL_MAPPING.get(name) + install_name = package_name if package_name is not None else name + msg = ( f"Missing optional dependency '{install_name}'. {extra} " f"Use pip or conda to install {install_name}."
) try: module = importlib.import_module(name) From 4550cf1de59bcc0bf7f5e00c1d08ea2bbe15210e Mon Sep 17 00:00:00 2001 From: ivanovmg <41443370+ivanovmg@users.noreply.github.com> Date: Tue, 8 Sep 2020 01:58:50 +0700 Subject: [PATCH 47/71] REF: simplify latex formatting (#35872) --- pandas/io/formats/format.py | 7 +- pandas/io/formats/latex.py | 778 +++++++++++++++++------ pandas/tests/io/formats/test_to_latex.py | 102 +++ 3 files changed, 694 insertions(+), 193 deletions(-) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 70e38c3106bdb..623dc6e6bad91 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -939,17 +939,18 @@ def to_latex( """ from pandas.io.formats.latex import LatexFormatter - return LatexFormatter( + latex_formatter = LatexFormatter( self, - column_format=column_format, longtable=longtable, + column_format=column_format, multicolumn=multicolumn, multicolumn_format=multicolumn_format, multirow=multirow, caption=caption, label=label, position=position, - ).get_result(buf=buf, encoding=encoding) + ) + return latex_formatter.get_result(buf=buf, encoding=encoding) def _format_col(self, i: int) -> List[str]: frame = self.tr_frame diff --git a/pandas/io/formats/latex.py b/pandas/io/formats/latex.py index 715b8bbdf5672..8080d953da308 100644 --- a/pandas/io/formats/latex.py +++ b/pandas/io/formats/latex.py @@ -1,7 +1,8 @@ """ Module for formatting output data in Latex. """ -from typing import IO, List, Optional, Tuple +from abc import ABC, abstractmethod +from typing import IO, Iterator, List, Optional, Type import numpy as np @@ -10,56 +11,95 @@ from pandas.io.formats.format import DataFrameFormatter, TableFormatter -class LatexFormatter(TableFormatter): - """ - Used to render a DataFrame to a LaTeX tabular/longtable environment output. +class RowStringConverter(ABC): + r"""Converter for dataframe rows into LaTeX strings. Parameters ---------- formatter : `DataFrameFormatter` - column_format : str, default None - The columns format as specified in `LaTeX table format - `__ e.g 'rcl' for 3 columns - longtable : boolean, default False - Use a longtable environment instead of tabular. + Instance of `DataFrameFormatter`. + multicolumn: bool, optional + Whether to use \multicolumn macro. + multicolumn_format: str, optional + Multicolumn format. + multirow: bool, optional + Whether to use \multirow macro. 
- See Also - -------- - HTMLFormatter """ def __init__( self, formatter: DataFrameFormatter, - column_format: Optional[str] = None, - longtable: bool = False, multicolumn: bool = False, multicolumn_format: Optional[str] = None, multirow: bool = False, - caption: Optional[str] = None, - label: Optional[str] = None, - position: Optional[str] = None, ): self.fmt = formatter self.frame = self.fmt.frame - self.bold_rows = self.fmt.bold_rows - self.column_format = column_format - self.longtable = longtable self.multicolumn = multicolumn self.multicolumn_format = multicolumn_format self.multirow = multirow - self.caption = caption - self.label = label - self.escape = self.fmt.escape - self.position = position - self._table_float = any(p is not None for p in (caption, label, position)) + self.clinebuf: List[List[int]] = [] + self.strcols = self._get_strcols() + self.strrows: List[List[str]] = ( + list(zip(*self.strcols)) # type: ignore[arg-type] + ) + + def get_strrow(self, row_num: int) -> str: + """Get string representation of the row.""" + row = self.strrows[row_num] + + is_multicol = ( + row_num < self.column_levels and self.fmt.header and self.multicolumn + ) + + is_multirow = ( + row_num >= self.header_levels + and self.fmt.index + and self.multirow + and self.index_levels > 1 + ) + + is_cline_maybe_required = is_multirow and row_num < len(self.strrows) - 1 + + crow = self._preprocess_row(row) + + if is_multicol: + crow = self._format_multicolumn(crow) + if is_multirow: + crow = self._format_multirow(crow, row_num) + + lst = [] + lst.append(" & ".join(crow)) + lst.append(" \\\\") + if is_cline_maybe_required: + cline = self._compose_cline(row_num, len(self.strcols)) + lst.append(cline) + return "".join(lst) + + @property + def _header_row_num(self) -> int: + """Number of rows in header.""" + return self.header_levels if self.fmt.header else 0 + + @property + def index_levels(self) -> int: + """Integer number of levels in index.""" + return self.frame.index.nlevels + + @property + def column_levels(self) -> int: + return self.frame.columns.nlevels + + @property + def header_levels(self) -> int: + nlevels = self.column_levels + if self.fmt.has_index_names and self.fmt.show_index_names: + nlevels += 1 + return nlevels - def write_result(self, buf: IO[str]) -> None: - """ - Render a DataFrame to a LaTeX tabular, longtable, or table/tabular - environment output. 
- """ - # string representation of the columns + def _get_strcols(self) -> List[List[str]]: + """String representation of the columns.""" if len(self.frame.columns) == 0 or len(self.frame.index) == 0: info_line = ( f"Empty {type(self.frame).__name__}\n" @@ -70,12 +110,6 @@ def write_result(self, buf: IO[str]) -> None: else: strcols = self.fmt._to_str_columns() - def get_col_type(dtype): - if issubclass(dtype.type, np.number): - return "r" - else: - return "l" - # reestablish the MultiIndex that has been joined by _to_str_column if self.fmt.index and isinstance(self.frame.index, ABCMultiIndex): out = self.frame.index.format( @@ -107,89 +141,19 @@ def pad_empties(x): # Get rid of old multiindex column and add new ones strcols = out + strcols[1:] + return strcols - if self.column_format is None: - dtypes = self.frame.dtypes._values - column_format = "".join(map(get_col_type, dtypes)) - if self.fmt.index: - index_format = "l" * self.frame.index.nlevels - column_format = index_format + column_format - elif not isinstance(self.column_format, str): # pragma: no cover - raise AssertionError( - f"column_format must be str or unicode, not {type(column_format)}" - ) + def _preprocess_row(self, row: List[str]) -> List[str]: + """Preprocess elements of the row.""" + if self.fmt.escape: + crow = _escape_symbols(row) else: - column_format = self.column_format - - self._write_tabular_begin(buf, column_format) - - buf.write("\\toprule\n") + crow = [x if x else "{}" for x in row] + if self.fmt.bold_rows and self.fmt.index: + crow = _convert_to_bold(crow, self.index_levels) + return crow - ilevels = self.frame.index.nlevels - clevels = self.frame.columns.nlevels - nlevels = clevels - if self.fmt.has_index_names and self.fmt.show_index_names: - nlevels += 1 - strrows = list(zip(*strcols)) - self.clinebuf: List[List[int]] = [] - - for i, row in enumerate(strrows): - if i == nlevels and self.fmt.header: - buf.write("\\midrule\n") # End of header - if self.longtable: - buf.write("\\endhead\n") - buf.write("\\midrule\n") - buf.write( - f"\\multicolumn{{{len(row)}}}{{r}}" - "{{Continued on next page}} \\\\\n" - ) - buf.write("\\midrule\n") - buf.write("\\endfoot\n\n") - buf.write("\\bottomrule\n") - buf.write("\\endlastfoot\n") - if self.escape: - # escape backslashes first - crow = [ - ( - x.replace("\\", "\\textbackslash ") - .replace("_", "\\_") - .replace("%", "\\%") - .replace("$", "\\$") - .replace("#", "\\#") - .replace("{", "\\{") - .replace("}", "\\}") - .replace("~", "\\textasciitilde ") - .replace("^", "\\textasciicircum ") - .replace("&", "\\&") - if (x and x != "{}") - else "{}" - ) - for x in row - ] - else: - crow = [x if x else "{}" for x in row] - if self.bold_rows and self.fmt.index: - # bold row labels - crow = [ - f"\\textbf{{{x}}}" - if j < ilevels and x.strip() not in ["", "{}"] - else x - for j, x in enumerate(crow) - ] - if i < clevels and self.fmt.header and self.multicolumn: - # sum up columns to multicolumns - crow = self._format_multicolumn(crow, ilevels) - if i >= nlevels and self.fmt.index and self.multirow and ilevels > 1: - # sum up rows to multirows - crow = self._format_multirow(crow, ilevels, i, strrows) - buf.write(" & ".join(crow)) - buf.write(" \\\\\n") - if self.multirow and i < len(strrows) - 1: - self._print_cline(buf, i, len(strcols)) - - self._write_tabular_end(buf) - - def _format_multicolumn(self, row: List[str], ilevels: int) -> List[str]: + def _format_multicolumn(self, row: List[str]) -> List[str]: r""" Combine columns belonging to a group to a single multicolumn 
entry according to self.multicolumn_format @@ -199,7 +163,7 @@ def _format_multicolumn(self, row: List[str], ilevels: int) -> List[str]: will become \multicolumn{3}{l}{a} & b & \multicolumn{2}{l}{c} """ - row2 = list(row[:ilevels]) + row2 = row[: self.index_levels] ncol = 1 coltext = "" @@ -214,7 +178,7 @@ def append_col(): else: row2.append(coltext) - for c in row[ilevels:]: + for c in row[self.index_levels :]: # if next col has text, write the previous if c.strip(): if coltext: @@ -229,9 +193,7 @@ def append_col(): append_col() return row2 - def _format_multirow( - self, row: List[str], ilevels: int, i: int, rows: List[Tuple[str, ...]] - ) -> List[str]: + def _format_multirow(self, row: List[str], i: int) -> List[str]: r""" Check following rows, whether row should be a multirow @@ -241,10 +203,10 @@ def _format_multirow( b & 0 & \cline{1-2} b & 0 & """ - for j in range(ilevels): + for j in range(self.index_levels): if row[j].strip(): nrow = 1 - for r in rows[i + 1 :]: + for r in self.strrows[i + 1 :]: if not r[j].strip(): nrow += 1 else: @@ -256,88 +218,524 @@ def _format_multirow( self.clinebuf.append([i + nrow - 1, j + 1]) return row - def _print_cline(self, buf: IO[str], i: int, icol: int) -> None: + def _compose_cline(self, i: int, icol: int) -> str: """ - Print clines after multirow-blocks are finished. + Create clines after multirow-blocks are finished. """ + lst = [] for cl in self.clinebuf: if cl[0] == i: - buf.write(f"\\cline{{{cl[1]:d}-{icol:d}}}\n") - # remove entries that have been written to buffer - self.clinebuf = [x for x in self.clinebuf if x[0] != i] + lst.append(f"\n\\cline{{{cl[1]:d}-{icol:d}}}") + # remove entries that have been written to buffer + self.clinebuf = [x for x in self.clinebuf if x[0] != i] + return "".join(lst) + + +class RowStringIterator(RowStringConverter): + """Iterator over rows of the header or the body of the table.""" + + @abstractmethod + def __iter__(self) -> Iterator[str]: + """Iterate over LaTeX string representations of rows.""" + + +class RowHeaderIterator(RowStringIterator): + """Iterator for the table header rows.""" + + def __iter__(self) -> Iterator[str]: + for row_num in range(len(self.strrows)): + if row_num < self._header_row_num: + yield self.get_strrow(row_num) + + +class RowBodyIterator(RowStringIterator): + """Iterator for the table body rows.""" + + def __iter__(self) -> Iterator[str]: + for row_num in range(len(self.strrows)): + if row_num >= self._header_row_num: + yield self.get_strrow(row_num) - def _write_tabular_begin(self, buf, column_format: str): - """ - Write the beginning of a tabular environment or - nested table/tabular environments including caption and label. + +class TableBuilderAbstract(ABC): + """ + Abstract table builder producing string representation of LaTeX table. + + Parameters + ---------- + formatter : `DataFrameFormatter` + Instance of `DataFrameFormatter`. + column_format: str, optional + Column format, for example, 'rcl' for three columns. + multicolumn: bool, optional + Use multicolumn to enhance MultiIndex columns. + multicolumn_format: str, optional + The alignment for multicolumns, similar to column_format. + multirow: bool, optional + Use multirow to enhance MultiIndex rows. + caption: str, optional + Table caption. + label: str, optional + LaTeX label. + position: str, optional + Float placement specifier, for example, 'htb'. 
+ """ + + def __init__( + self, + formatter: DataFrameFormatter, + column_format: Optional[str] = None, + multicolumn: bool = False, + multicolumn_format: Optional[str] = None, + multirow: bool = False, + caption: Optional[str] = None, + label: Optional[str] = None, + position: Optional[str] = None, + ): + self.fmt = formatter + self.column_format = column_format + self.multicolumn = multicolumn + self.multicolumn_format = multicolumn_format + self.multirow = multirow + self.caption = caption + self.label = label + self.position = position + + def get_result(self) -> str: + """String representation of LaTeX table.""" + elements = [ + self.env_begin, + self.top_separator, + self.header, + self.middle_separator, + self.env_body, + self.bottom_separator, + self.env_end, + ] + result = "\n".join([item for item in elements if item]) + trailing_newline = "\n" + result += trailing_newline + return result + + @property + @abstractmethod + def env_begin(self) -> str: + """Beginning of the environment.""" + + @property + @abstractmethod + def top_separator(self) -> str: + """Top level separator.""" + + @property + @abstractmethod + def header(self) -> str: + """Header lines.""" + + @property + @abstractmethod + def middle_separator(self) -> str: + """Middle level separator.""" + + @property + @abstractmethod + def env_body(self) -> str: + """Environment body.""" + + @property + @abstractmethod + def bottom_separator(self) -> str: + """Bottom level separator.""" + + @property + @abstractmethod + def env_end(self) -> str: + """End of the environment.""" + + +class GenericTableBuilder(TableBuilderAbstract): + """Table builder producing string representation of LaTeX table.""" + + @property + def header(self) -> str: + iterator = self._create_row_iterator(over="header") + return "\n".join(list(iterator)) + + @property + def top_separator(self) -> str: + return "\\toprule" + + @property + def middle_separator(self) -> str: + return "\\midrule" if self._is_separator_required() else "" + + @property + def env_body(self) -> str: + iterator = self._create_row_iterator(over="body") + return "\n".join(list(iterator)) + + def _is_separator_required(self) -> bool: + return bool(self.header and self.env_body) + + @property + def _position_macro(self) -> str: + r"""Position macro, extracted from self.position, like [h].""" + return f"[{self.position}]" if self.position else "" + + @property + def _caption_macro(self) -> str: + r"""Caption macro, extracted from self.caption, like \caption{cap}.""" + return f"\\caption{{{self.caption}}}" if self.caption else "" + + @property + def _label_macro(self) -> str: + r"""Label macro, extracted from self.label, like \label{ref}.""" + return f"\\label{{{self.label}}}" if self.label else "" + + def _create_row_iterator(self, over: str) -> RowStringIterator: + """Create iterator over header or body of the table. Parameters ---------- - buf : string or file handle - File path or object. If not specified, the result is returned as - a string. - column_format : str - The columns format as specified in `LaTeX table format - `__ e.g 'rcl' - for 3 columns + over : {'body', 'header'} + Over what to iterate. + + Returns + ------- + RowStringIterator + Iterator over body or header. 
""" - if self._table_float: - # then write output in a nested table/tabular or longtable environment - if self.caption is None: - caption_ = "" - else: - caption_ = f"\n\\caption{{{self.caption}}}" + iterator_kind = self._select_iterator(over) + return iterator_kind( + formatter=self.fmt, + multicolumn=self.multicolumn, + multicolumn_format=self.multicolumn_format, + multirow=self.multirow, + ) + + def _select_iterator(self, over: str) -> Type[RowStringIterator]: + """Select proper iterator over table rows.""" + if over == "header": + return RowHeaderIterator + elif over == "body": + return RowBodyIterator + else: + msg = f"'over' must be either 'header' or 'body', but {over} was provided" + raise ValueError(msg) + + +class LongTableBuilder(GenericTableBuilder): + """Concrete table builder for longtable. + + >>> from pandas import DataFrame + >>> from pandas.io.formats import format as fmt + >>> df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) + >>> formatter = fmt.DataFrameFormatter(df) + >>> builder = LongTableBuilder(formatter, caption='caption', label='lab', + ... column_format='lrl') + >>> table = builder.get_result() + >>> print(table) + \\begin{longtable}{lrl} + \\caption{caption} + \\label{lab}\\\\ + \\toprule + {} & a & b \\\\ + \\midrule + \\endhead + \\midrule + \\multicolumn{3}{r}{{Continued on next page}} \\\\ + \\midrule + \\endfoot + + \\bottomrule + \\endlastfoot + 0 & 1 & b1 \\\\ + 1 & 2 & b2 \\\\ + \\end{longtable} + + """ - if self.label is None: - label_ = "" - else: - label_ = f"\n\\label{{{self.label}}}" + @property + def env_begin(self) -> str: + first_row = ( + f"\\begin{{longtable}}{self._position_macro}{{{self.column_format}}}" + ) + elements = [first_row, f"{self._caption_and_label()}"] + return "\n".join([item for item in elements if item]) + + def _caption_and_label(self) -> str: + if self.caption or self.label: + double_backslash = "\\\\" + elements = [f"{self._caption_macro}", f"{self._label_macro}"] + caption_and_label = "\n".join([item for item in elements if item]) + caption_and_label += double_backslash + return caption_and_label + else: + return "" + + @property + def middle_separator(self) -> str: + iterator = self._create_row_iterator(over="header") + elements = [ + "\\midrule", + "\\endhead", + "\\midrule", + f"\\multicolumn{{{len(iterator.strcols)}}}{{r}}" + "{{Continued on next page}} \\\\", + "\\midrule", + "\\endfoot\n", + "\\bottomrule", + "\\endlastfoot", + ] + if self._is_separator_required(): + return "\n".join(elements) + return "" + + @property + def bottom_separator(self) -> str: + return "" + + @property + def env_end(self) -> str: + return "\\end{longtable}" + + +class RegularTableBuilder(GenericTableBuilder): + """Concrete table builder for regular table. + + >>> from pandas import DataFrame + >>> from pandas.io.formats import format as fmt + >>> df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) + >>> formatter = fmt.DataFrameFormatter(df) + >>> builder = RegularTableBuilder(formatter, caption='caption', label='lab', + ... 
column_format='lrc') + >>> table = builder.get_result() + >>> print(table) + \\begin{table} + \\centering + \\caption{caption} + \\label{lab} + \\begin{tabular}{lrc} + \\toprule + {} & a & b \\\\ + \\midrule + 0 & 1 & b1 \\\\ + 1 & 2 & b2 \\\\ + \\bottomrule + \\end{tabular} + \\end{table} + + """ - if self.position is None: - position_ = "" - else: - position_ = f"[{self.position}]" + @property + def env_begin(self) -> str: + elements = [ + f"\\begin{{table}}{self._position_macro}", + "\\centering", + f"{self._caption_macro}", + f"{self._label_macro}", + f"\\begin{{tabular}}{{{self.column_format}}}", + ] + return "\n".join([item for item in elements if item]) + + @property + def bottom_separator(self) -> str: + return "\\bottomrule" + + @property + def env_end(self) -> str: + return "\n".join(["\\end{tabular}", "\\end{table}"]) + + +class TabularBuilder(GenericTableBuilder): + """Concrete table builder for tabular environment. + + >>> from pandas import DataFrame + >>> from pandas.io.formats import format as fmt + >>> df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) + >>> formatter = fmt.DataFrameFormatter(df) + >>> builder = TabularBuilder(formatter, column_format='lrc') + >>> table = builder.get_result() + >>> print(table) + \\begin{tabular}{lrc} + \\toprule + {} & a & b \\\\ + \\midrule + 0 & 1 & b1 \\\\ + 1 & 2 & b2 \\\\ + \\bottomrule + \\end{tabular} + + """ - if self.longtable: - table_ = f"\\begin{{longtable}}{position_}{{{column_format}}}" - tabular_ = "\n" - else: - table_ = f"\\begin{{table}}{position_}\n\\centering" - tabular_ = f"\n\\begin{{tabular}}{{{column_format}}}\n" - - if self.longtable and (self.caption is not None or self.label is not None): - # a double-backslash is required at the end of the line - # as discussed here: - # https://tex.stackexchange.com/questions/219138 - backlash_ = "\\\\" - else: - backlash_ = "" - buf.write(f"{table_}{caption_}{label_}{backlash_}{tabular_}") - else: - if self.longtable: - tabletype_ = "longtable" - else: - tabletype_ = "tabular" - buf.write(f"\\begin{{{tabletype_}}}{{{column_format}}}\n") + @property + def env_begin(self) -> str: + return f"\\begin{{tabular}}{{{self.column_format}}}" + + @property + def bottom_separator(self) -> str: + return "\\bottomrule" + + @property + def env_end(self) -> str: + return "\\end{tabular}" + + +class LatexFormatter(TableFormatter): + """ + Used to render a DataFrame to a LaTeX tabular/longtable environment output. + + Parameters + ---------- + formatter : `DataFrameFormatter` + column_format : str, default None + The columns format as specified in `LaTeX table format + `__ e.g 'rcl' for 3 columns + + See Also + -------- + HTMLFormatter + """ + + def __init__( + self, + formatter: DataFrameFormatter, + longtable: bool = False, + column_format: Optional[str] = None, + multicolumn: bool = False, + multicolumn_format: Optional[str] = None, + multirow: bool = False, + caption: Optional[str] = None, + label: Optional[str] = None, + position: Optional[str] = None, + ): + self.fmt = formatter + self.frame = self.fmt.frame + self.longtable = longtable + self.column_format = column_format # type: ignore[assignment] + self.multicolumn = multicolumn + self.multicolumn_format = multicolumn_format + self.multirow = multirow + self.caption = caption + self.label = label + self.position = position - def _write_tabular_end(self, buf): + def write_result(self, buf: IO[str]) -> None: """ - Write the end of a tabular environment or nested table/tabular - environment. 
+ Render a DataFrame to a LaTeX tabular, longtable, or table/tabular + environment output. + """ + table_string = self.builder.get_result() + buf.write(table_string) - Parameters - ---------- - buf : string or file handle - File path or object. If not specified, the result is returned as - a string. + @property + def builder(self) -> TableBuilderAbstract: + """Concrete table builder. + Returns + ------- + TableBuilder """ + builder = self._select_builder() + return builder( + formatter=self.fmt, + column_format=self.column_format, + multicolumn=self.multicolumn, + multicolumn_format=self.multicolumn_format, + multirow=self.multirow, + caption=self.caption, + label=self.label, + position=self.position, + ) + + def _select_builder(self) -> Type[TableBuilderAbstract]: + """Select proper table builder.""" if self.longtable: - buf.write("\\end{longtable}\n") + return LongTableBuilder + if any([self.caption, self.label, self.position]): + return RegularTableBuilder + return TabularBuilder + + @property + def column_format(self) -> str: + """Column format.""" + return self._column_format + + @column_format.setter + def column_format(self, input_column_format: Optional[str]) -> None: + """Setter for column format.""" + if input_column_format is None: + self._column_format = ( + self._get_index_format() + self._get_column_format_based_on_dtypes() + ) + elif not isinstance(input_column_format, str): + raise ValueError( + f"column_format must be str or unicode, " + f"not {type(input_column_format)}" + ) else: - buf.write("\\bottomrule\n") - buf.write("\\end{tabular}\n") - if self._table_float: - buf.write("\\end{table}\n") - else: - pass + self._column_format = input_column_format + + def _get_column_format_based_on_dtypes(self) -> str: + """Get column format based on data type. + + Right alignment for numbers and left - for strings. + """ + + def get_col_type(dtype): + if issubclass(dtype.type, np.number): + return "r" + return "l" + + dtypes = self.frame.dtypes._values + return "".join(map(get_col_type, dtypes)) + + def _get_index_format(self) -> str: + """Get index column format.""" + return "l" * self.frame.index.nlevels if self.fmt.index else "" + + +def _escape_symbols(row: List[str]) -> List[str]: + """Carry out string replacements for special symbols. + + Parameters + ---------- + row : list + List of string, that may contain special symbols. + + Returns + ------- + list + list of strings with the special symbols replaced. 
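+
+    Notes
+    -----
+    Empty strings and the ``{}`` placeholder are emitted as ``{}`` and are
+    never escaped.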
+ """ + return [ + ( + x.replace("\\", "\\textbackslash ") + .replace("_", "\\_") + .replace("%", "\\%") + .replace("$", "\\$") + .replace("#", "\\#") + .replace("{", "\\{") + .replace("}", "\\}") + .replace("~", "\\textasciitilde ") + .replace("^", "\\textasciicircum ") + .replace("&", "\\&") + if (x and x != "{}") + else "{}" + ) + for x in row + ] + + +def _convert_to_bold(crow: List[str], ilevels: int) -> List[str]: + """Convert elements in ``crow`` to bold.""" + return [ + f"\\textbf{{{x}}}" if j < ilevels and x.strip() not in ["", "{}"] else x + for j, x in enumerate(crow) + ] + + +if __name__ == "__main__": + import doctest + + doctest.testmod() diff --git a/pandas/tests/io/formats/test_to_latex.py b/pandas/tests/io/formats/test_to_latex.py index 9dfd851e91c65..a98644250b328 100644 --- a/pandas/tests/io/formats/test_to_latex.py +++ b/pandas/tests/io/formats/test_to_latex.py @@ -7,6 +7,14 @@ from pandas import DataFrame, Series import pandas._testing as tm +from pandas.io.formats.format import DataFrameFormatter +from pandas.io.formats.latex import ( + RegularTableBuilder, + RowBodyIterator, + RowHeaderIterator, + RowStringConverter, +) + class TestToLatex: def test_to_latex_filename(self, float_frame): @@ -60,6 +68,16 @@ def test_to_latex(self, float_frame): assert withoutindex_result == withoutindex_expected + @pytest.mark.parametrize( + "bad_column_format", + [5, 1.2, ["l", "r"], ("r", "c"), {"r", "c", "l"}, dict(a="r", b="l")], + ) + def test_to_latex_bad_column_format(self, bad_column_format): + df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) + msg = r"column_format must be str or unicode" + with pytest.raises(ValueError, match=msg): + df.to_latex(column_format=bad_column_format) + def test_to_latex_format(self, float_frame): # GH Bug #9402 float_frame.to_latex(column_format="ccc") @@ -930,3 +948,87 @@ def test_to_latex_multindex_header(self): \end{tabular} """ assert observed == expected + + +class TestTableBuilder: + @pytest.fixture + def dataframe(self): + return DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) + + @pytest.fixture + def table_builder(self, dataframe): + return RegularTableBuilder(formatter=DataFrameFormatter(dataframe)) + + def test_create_row_iterator(self, table_builder): + iterator = table_builder._create_row_iterator(over="header") + assert isinstance(iterator, RowHeaderIterator) + + def test_create_body_iterator(self, table_builder): + iterator = table_builder._create_row_iterator(over="body") + assert isinstance(iterator, RowBodyIterator) + + def test_create_body_wrong_kwarg_raises(self, table_builder): + with pytest.raises(ValueError, match="must be either 'header' or 'body'"): + table_builder._create_row_iterator(over="SOMETHING BAD") + + +class TestRowStringConverter: + @pytest.mark.parametrize( + "row_num, expected", + [ + (0, r"{} & Design & ratio & xy \\"), + (1, r"0 & 1 & 4 & 10 \\"), + (2, r"1 & 2 & 5 & 11 \\"), + ], + ) + def test_get_strrow_normal_without_escape(self, row_num, expected): + df = DataFrame({r"Design": [1, 2, 3], r"ratio": [4, 5, 6], r"xy": [10, 11, 12]}) + row_string_converter = RowStringConverter( + formatter=DataFrameFormatter(df, escape=True), + ) + assert row_string_converter.get_strrow(row_num=row_num) == expected + + @pytest.mark.parametrize( + "row_num, expected", + [ + (0, r"{} & Design \# & ratio, \% & x\&y \\"), + (1, r"0 & 1 & 4 & 10 \\"), + (2, r"1 & 2 & 5 & 11 \\"), + ], + ) + def test_get_strrow_normal_with_escape(self, row_num, expected): + df = DataFrame( + {r"Design #": [1, 2, 3], r"ratio, %": [4, 5, 6], r"x&y": 
[10, 11, 12]} + ) + row_string_converter = RowStringConverter( + formatter=DataFrameFormatter(df, escape=True), + ) + assert row_string_converter.get_strrow(row_num=row_num) == expected + + @pytest.mark.parametrize( + "row_num, expected", + [ + (0, r"{} & \multicolumn{2}{r}{c1} & \multicolumn{2}{r}{c2} & c3 \\"), + (1, r"{} & 0 & 1 & 0 & 1 & 0 \\"), + (2, r"0 & 0 & 5 & 0 & 5 & 0 \\"), + ], + ) + def test_get_strrow_multindex_multicolumn(self, row_num, expected): + df = DataFrame( + { + ("c1", 0): {x: x for x in range(5)}, + ("c1", 1): {x: x + 5 for x in range(5)}, + ("c2", 0): {x: x for x in range(5)}, + ("c2", 1): {x: x + 5 for x in range(5)}, + ("c3", 0): {x: x for x in range(5)}, + } + ) + + row_string_converter = RowStringConverter( + formatter=DataFrameFormatter(df), + multicolumn=True, + multicolumn_format="r", + multirow=True, + ) + + assert row_string_converter.get_strrow(row_num=row_num) == expected From 7db9d22e1a52a757c95a6772319572252770b77d Mon Sep 17 00:00:00 2001 From: Jonathan Shreckengost Date: Mon, 7 Sep 2020 15:04:42 -0400 Subject: [PATCH 48/71] Comma cleanup (#36168) --- pandas/tests/indexing/test_iloc.py | 2 +- pandas/tests/indexing/test_indexing.py | 2 +- pandas/tests/indexing/test_loc.py | 33 +++++++------------- pandas/tests/internals/test_internals.py | 8 ++--- pandas/tests/io/formats/test_css.py | 12 +++---- pandas/tests/io/formats/test_info.py | 12 +++---- pandas/tests/io/json/test_compression.py | 2 +- pandas/tests/io/json/test_pandas.py | 10 ++---- pandas/tests/io/parser/test_c_parser_only.py | 4 +-- pandas/tests/io/parser/test_parse_dates.py | 4 +-- pandas/tests/io/parser/test_usecols.py | 2 +- 11 files changed, 34 insertions(+), 57 deletions(-) diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 4fae01ec710fd..bfb62835add93 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -56,7 +56,7 @@ def test_is_scalar_access(self): assert ser.iloc._is_scalar_access((1,)) df = ser.to_frame() - assert df.iloc._is_scalar_access((1, 0,)) + assert df.iloc._is_scalar_access((1, 0)) def test_iloc_exceeds_bounds(self): diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index a080c5d169215..ca8a3ddc95575 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -1004,7 +1004,7 @@ def test_extension_array_cross_section(): def test_extension_array_cross_section_converts(): # all numeric columns -> numeric series df = pd.DataFrame( - {"A": pd.array([1, 2], dtype="Int64"), "B": np.array([1, 2])}, index=["a", "b"], + {"A": pd.array([1, 2], dtype="Int64"), "B": np.array([1, 2])}, index=["a", "b"] ) result = df.loc["a"] expected = pd.Series([1, 1], dtype="Int64", index=["A", "B"], name="a") diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 193800fae751f..e42d9679464d8 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -29,13 +29,11 @@ def test_loc_getitem_label_out_of_range(self): # out of range label self.check_result( - "loc", "f", typs=["ints", "uints", "labels", "mixed", "ts"], fails=KeyError, + "loc", "f", typs=["ints", "uints", "labels", "mixed", "ts"], fails=KeyError ) self.check_result("loc", "f", typs=["floats"], fails=KeyError) self.check_result("loc", "f", typs=["floats"], fails=KeyError) - self.check_result( - "loc", 20, typs=["ints", "uints", "mixed"], fails=KeyError, - ) + self.check_result("loc", 20, typs=["ints", "uints", "mixed"], 
fails=KeyError) self.check_result("loc", 20, typs=["labels"], fails=KeyError) self.check_result("loc", 20, typs=["ts"], axes=0, fails=KeyError) self.check_result("loc", 20, typs=["floats"], axes=0, fails=KeyError) @@ -46,26 +44,24 @@ def test_loc_getitem_label_list(self): pass def test_loc_getitem_label_list_with_missing(self): + self.check_result("loc", [0, 1, 2], typs=["empty"], fails=KeyError) self.check_result( - "loc", [0, 1, 2], typs=["empty"], fails=KeyError, - ) - self.check_result( - "loc", [0, 2, 10], typs=["ints", "uints", "floats"], axes=0, fails=KeyError, + "loc", [0, 2, 10], typs=["ints", "uints", "floats"], axes=0, fails=KeyError ) self.check_result( - "loc", [3, 6, 7], typs=["ints", "uints", "floats"], axes=1, fails=KeyError, + "loc", [3, 6, 7], typs=["ints", "uints", "floats"], axes=1, fails=KeyError ) # GH 17758 - MultiIndex and missing keys self.check_result( - "loc", [(1, 3), (1, 4), (2, 5)], typs=["multi"], axes=0, fails=KeyError, + "loc", [(1, 3), (1, 4), (2, 5)], typs=["multi"], axes=0, fails=KeyError ) def test_loc_getitem_label_list_fails(self): # fails self.check_result( - "loc", [20, 30, 40], typs=["ints", "uints"], axes=1, fails=KeyError, + "loc", [20, 30, 40], typs=["ints", "uints"], axes=1, fails=KeyError ) def test_loc_getitem_label_array_like(self): @@ -95,18 +91,14 @@ def test_loc_getitem_label_slice(self): ) self.check_result( - "loc", slice("20130102", "20130104"), typs=["ts"], axes=1, fails=TypeError, + "loc", slice("20130102", "20130104"), typs=["ts"], axes=1, fails=TypeError ) - self.check_result( - "loc", slice(2, 8), typs=["mixed"], axes=0, fails=TypeError, - ) - self.check_result( - "loc", slice(2, 8), typs=["mixed"], axes=1, fails=KeyError, - ) + self.check_result("loc", slice(2, 8), typs=["mixed"], axes=0, fails=TypeError) + self.check_result("loc", slice(2, 8), typs=["mixed"], axes=1, fails=KeyError) self.check_result( - "loc", slice(2, 4, 2), typs=["mixed"], axes=0, fails=TypeError, + "loc", slice(2, 4, 2), typs=["mixed"], axes=0, fails=TypeError ) def test_setitem_from_duplicate_axis(self): @@ -669,8 +661,7 @@ def test_loc_setitem_with_scalar_index(self, indexer, value): (1, ["A", "B", "C"]), np.array([7, 8, 9], dtype=np.int64), pd.DataFrame( - [[1, 2, np.nan], [7, 8, 9], [5, 6, np.nan]], - columns=["A", "B", "C"], + [[1, 2, np.nan], [7, 8, 9], [5, 6, np.nan]], columns=["A", "B", "C"] ), ), ( diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 06ccdd2484a2a..1d73d1e35728b 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -892,16 +892,16 @@ def assert_reindex_indexer_is_ok(mgr, axis, new_labels, indexer, fill_value): fill_value, ) assert_reindex_indexer_is_ok( - mgr, ax, mgr.axes[ax][::-1], np.arange(mgr.shape[ax]), fill_value, + mgr, ax, mgr.axes[ax][::-1], np.arange(mgr.shape[ax]), fill_value ) assert_reindex_indexer_is_ok( - mgr, ax, mgr.axes[ax], np.arange(mgr.shape[ax])[::-1], fill_value, + mgr, ax, mgr.axes[ax], np.arange(mgr.shape[ax])[::-1], fill_value ) assert_reindex_indexer_is_ok( mgr, ax, pd.Index(["foo", "bar", "baz"]), [0, 0, 0], fill_value ) assert_reindex_indexer_is_ok( - mgr, ax, pd.Index(["foo", "bar", "baz"]), [-1, 0, -1], fill_value, + mgr, ax, pd.Index(["foo", "bar", "baz"]), [-1, 0, -1], fill_value ) assert_reindex_indexer_is_ok( mgr, @@ -913,7 +913,7 @@ def assert_reindex_indexer_is_ok(mgr, axis, new_labels, indexer, fill_value): if mgr.shape[ax] >= 3: assert_reindex_indexer_is_ok( - mgr, ax, pd.Index(["foo", "bar", 
"baz"]), [0, 1, 2], fill_value, + mgr, ax, pd.Index(["foo", "bar", "baz"]), [0, 1, 2], fill_value ) diff --git a/pandas/tests/io/formats/test_css.py b/pandas/tests/io/formats/test_css.py index 9383f86e335fa..785904fafd31a 100644 --- a/pandas/tests/io/formats/test_css.py +++ b/pandas/tests/io/formats/test_css.py @@ -99,11 +99,11 @@ def test_css_side_shorthands(shorthand, expansions): top, right, bottom, left = expansions assert_resolves( - f"{shorthand}: 1pt", {top: "1pt", right: "1pt", bottom: "1pt", left: "1pt"}, + f"{shorthand}: 1pt", {top: "1pt", right: "1pt", bottom: "1pt", left: "1pt"} ) assert_resolves( - f"{shorthand}: 1pt 4pt", {top: "1pt", right: "4pt", bottom: "1pt", left: "4pt"}, + f"{shorthand}: 1pt 4pt", {top: "1pt", right: "4pt", bottom: "1pt", left: "4pt"} ) assert_resolves( @@ -189,9 +189,7 @@ def test_css_absolute_font_size(size, relative_to, resolved): inherited = None else: inherited = {"font-size": relative_to} - assert_resolves( - f"font-size: {size}", {"font-size": resolved}, inherited=inherited, - ) + assert_resolves(f"font-size: {size}", {"font-size": resolved}, inherited=inherited) @pytest.mark.parametrize( @@ -225,6 +223,4 @@ def test_css_relative_font_size(size, relative_to, resolved): inherited = None else: inherited = {"font-size": relative_to} - assert_resolves( - f"font-size: {size}", {"font-size": resolved}, inherited=inherited, - ) + assert_resolves(f"font-size: {size}", {"font-size": resolved}, inherited=inherited) diff --git a/pandas/tests/io/formats/test_info.py b/pandas/tests/io/formats/test_info.py index 877bd1650ae60..7000daeb9b575 100644 --- a/pandas/tests/io/formats/test_info.py +++ b/pandas/tests/io/formats/test_info.py @@ -299,7 +299,7 @@ def test_info_memory_usage(): DataFrame(1, index=["a"], columns=["A"]).memory_usage(index=True) DataFrame(1, index=["a"], columns=["A"]).index.nbytes df = DataFrame( - data=1, index=MultiIndex.from_product([["a"], range(1000)]), columns=["A"], + data=1, index=MultiIndex.from_product([["a"], range(1000)]), columns=["A"] ) df.index.nbytes df.memory_usage(index=True) @@ -336,7 +336,7 @@ def test_info_memory_usage_deep_pypy(): @pytest.mark.skipif(PYPY, reason="PyPy getsizeof() fails by design") def test_usage_via_getsizeof(): df = DataFrame( - data=1, index=MultiIndex.from_product([["a"], range(1000)]), columns=["A"], + data=1, index=MultiIndex.from_product([["a"], range(1000)]), columns=["A"] ) mem = df.memory_usage(deep=True).sum() # sys.getsizeof will call the .memory_usage with @@ -359,16 +359,14 @@ def test_info_memory_usage_qualified(): buf = StringIO() df = DataFrame( - 1, columns=list("ab"), index=MultiIndex.from_product([range(3), range(3)]), + 1, columns=list("ab"), index=MultiIndex.from_product([range(3), range(3)]) ) df.info(buf=buf) assert "+" not in buf.getvalue() buf = StringIO() df = DataFrame( - 1, - columns=list("ab"), - index=MultiIndex.from_product([range(3), ["foo", "bar"]]), + 1, columns=list("ab"), index=MultiIndex.from_product([range(3), ["foo", "bar"]]) ) df.info(buf=buf) assert "+" in buf.getvalue() @@ -384,7 +382,7 @@ def memory_usage(f): N = 100 M = len(uppercase) index = MultiIndex.from_product( - [list(uppercase), date_range("20160101", periods=N)], names=["id", "date"], + [list(uppercase), date_range("20160101", periods=N)], names=["id", "date"] ) df = DataFrame({"value": np.random.randn(N * M)}, index=index) diff --git a/pandas/tests/io/json/test_compression.py b/pandas/tests/io/json/test_compression.py index c0e3220454bf1..a41af9886c617 100644 --- 
a/pandas/tests/io/json/test_compression.py +++ b/pandas/tests/io/json/test_compression.py @@ -45,7 +45,7 @@ def test_with_s3_url(compression, s3_resource, s3so): s3_resource.Bucket("pandas-test").put_object(Key="test-1", Body=f) roundtripped_df = pd.read_json( - "s3://pandas-test/test-1", compression=compression, storage_options=s3so, + "s3://pandas-test/test-1", compression=compression, storage_options=s3so ) tm.assert_frame_equal(df, roundtripped_df) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 59d64e1a6e909..13152f01abb04 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -745,11 +745,7 @@ def test_reconstruction_index(self): def test_path(self, float_frame, int_frame, datetime_frame): with tm.ensure_clean("test.json") as path: - for df in [ - float_frame, - int_frame, - datetime_frame, - ]: + for df in [float_frame, int_frame, datetime_frame]: df.to_json(path) read_json(path) @@ -1706,9 +1702,7 @@ def test_to_s3(self, s3_resource, s3so): # GH 28375 mock_bucket_name, target_file = "pandas-test", "test.json" df = DataFrame({"x": [1, 2, 3], "y": [2, 4, 6]}) - df.to_json( - f"s3://{mock_bucket_name}/{target_file}", storage_options=s3so, - ) + df.to_json(f"s3://{mock_bucket_name}/{target_file}", storage_options=s3so) timeout = 5 while True: if target_file in ( diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py index 50179fc1ec4b8..50d5fb3e49c2a 100644 --- a/pandas/tests/io/parser/test_c_parser_only.py +++ b/pandas/tests/io/parser/test_c_parser_only.py @@ -646,9 +646,7 @@ def test_1000_sep_with_decimal( tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize( - "float_precision", [None, "high", "round_trip"], -) +@pytest.mark.parametrize("float_precision", [None, "high", "round_trip"]) @pytest.mark.parametrize( "value,expected", [ diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index ed947755e3419..833186b69c63b 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -1439,7 +1439,7 @@ def test_parse_timezone(all_parsers): end="2018-01-04 09:05:00", freq="1min", tz=pytz.FixedOffset(540), - ), + ) ), freq=None, ) @@ -1553,5 +1553,5 @@ def test_missing_parse_dates_column_raises( msg = f"Missing column provided to 'parse_dates': '{missing_cols}'" with pytest.raises(ValueError, match=msg): parser.read_csv( - content, sep=",", names=names, usecols=usecols, parse_dates=parse_dates, + content, sep=",", names=names, usecols=usecols, parse_dates=parse_dates ) diff --git a/pandas/tests/io/parser/test_usecols.py b/pandas/tests/io/parser/test_usecols.py index d4e049cc3fcc2..7e9c9866a666d 100644 --- a/pandas/tests/io/parser/test_usecols.py +++ b/pandas/tests/io/parser/test_usecols.py @@ -199,7 +199,7 @@ def test_usecols_with_whitespace(all_parsers): # Column selection by index. ([0, 1], DataFrame(data=[[1000, 2000], [4000, 5000]], columns=["2", "0"])), # Column selection by name. 
- (["0", "1"], DataFrame(data=[[2000, 3000], [5000, 6000]], columns=["0", "1"]),), + (["0", "1"], DataFrame(data=[[2000, 3000], [5000, 6000]], columns=["0", "1"])), ], ) def test_usecols_with_integer_like_header(all_parsers, usecols, expected): From 172c626b217f03bb3357d168e993bd2947dad31e Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 7 Sep 2020 20:05:50 +0100 Subject: [PATCH 49/71] TST: test_datetime64_factorize on 32bit (#36192) --- pandas/tests/test_algos.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index ec7413514d430..a2c2ae22a0b62 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -256,7 +256,7 @@ def test_datetime64_factorize(self, writable): # GH35650 Verify whether read-only datetime64 array can be factorized data = np.array([np.datetime64("2020-01-01T00:00:00.000")]) data.setflags(write=writable) - expected_codes = np.array([0], dtype=np.int64) + expected_codes = np.array([0], dtype=np.intp) expected_uniques = np.array( ["2020-01-01T00:00:00.000000000"], dtype="datetime64[ns]" ) From f895c6a73361269dea2a115690510b4b9adcd0df Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 7 Sep 2020 20:39:15 +0100 Subject: [PATCH 50/71] TST: update test_series_factorize_na_sentinel_none for 32bit (#36191) --- pandas/tests/base/test_factorize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/base/test_factorize.py b/pandas/tests/base/test_factorize.py index 9fad9856d53cc..f8cbadb987d29 100644 --- a/pandas/tests/base/test_factorize.py +++ b/pandas/tests/base/test_factorize.py @@ -34,7 +34,7 @@ def test_series_factorize_na_sentinel_none(): ser = pd.Series(values) codes, uniques = ser.factorize(na_sentinel=None) - expected_codes = np.array([0, 1, 0, 2], dtype="int64") + expected_codes = np.array([0, 1, 0, 2], dtype=np.intp) expected_uniques = pd.Index([1.0, 2.0, np.nan]) tm.assert_numpy_array_equal(codes, expected_codes) From bb5b86a6130769225aa2b5884a277398a639f25c Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 7 Sep 2020 20:41:42 +0100 Subject: [PATCH 51/71] DOC: move release note for #36155 (#36187) --- doc/source/whatsnew/v1.1.2.rst | 1 + doc/source/whatsnew/v1.2.0.rst | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index da261907565a1..e9cba3de56920 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -50,6 +50,7 @@ Bug fixes Other ~~~~~ - :meth:`factorize` now supports ``na_sentinel=None`` to include NaN in the uniques of the values and remove ``dropna`` keyword which was unintentionally exposed to public facing API in 1.1 version from :meth:`factorize` (:issue:`35667`) +- :meth:`DataFrame.plot` and meth:`Series.plot` raise ``UserWarning`` about usage of FixedFormatter and FixedLocator (:issue:`35684` and :issue:`35945`) .. 
--------------------------------------------------------------------------- diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 9a778acba4764..ccaae9f996425 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -300,7 +300,6 @@ Plotting ^^^^^^^^ - Bug in :meth:`DataFrame.plot` where a marker letter in the ``style`` keyword sometimes causes a ``ValueError`` (:issue:`21003`) -- meth:`DataFrame.plot` and meth:`Series.plot` raise ``UserWarning`` about usage of FixedFormatter and FixedLocator (:issue:`35684` and :issue:`35945`) Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ From de2a1dcdb41b564ddef302bffa005fff72340f45 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 7 Sep 2020 12:56:18 -0700 Subject: [PATCH 52/71] REF: use _validate_foo pattern in Categorical (#36181) --- pandas/core/arrays/categorical.py | 31 ++++++++++++++++++++++--------- pandas/core/indexes/category.py | 11 +++-------- 2 files changed, 25 insertions(+), 17 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 02305479bef67..228e630f95863 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1192,6 +1192,26 @@ def map(self, mapper): __le__ = _cat_compare_op(operator.le) __ge__ = _cat_compare_op(operator.ge) + def _validate_insert_value(self, value) -> int: + code = self.categories.get_indexer([value]) + if (code == -1) and not (is_scalar(value) and isna(value)): + raise TypeError( + "cannot insert an item into a CategoricalIndex " + "that is not already an existing category" + ) + return code[0] + + def _validate_searchsorted_value(self, value): + # searchsorted is very performance sensitive. By converting codes + # to same dtype as self.codes, we get much faster performance. + if is_scalar(value): + codes = self.categories.get_loc(value) + codes = self.codes.dtype.type(codes) + else: + locs = [self.categories.get_loc(x) for x in value] + codes = np.array(locs, dtype=self.codes.dtype) + return codes + def _validate_fill_value(self, fill_value): """ Convert a user-facing fill_value to a representation to use with our @@ -1299,15 +1319,8 @@ def memory_usage(self, deep=False): @doc(_shared_docs["searchsorted"], klass="Categorical") def searchsorted(self, value, side="left", sorter=None): - # searchsorted is very performance sensitive. By converting codes - # to same dtype as self.codes, we get much faster performance. 
- if is_scalar(value): - codes = self.categories.get_loc(value) - codes = self.codes.dtype.type(codes) - else: - locs = [self.categories.get_loc(x) for x in value] - codes = np.array(locs, dtype=self.codes.dtype) - return self.codes.searchsorted(codes, side=side, sorter=sorter) + value = self._validate_searchsorted_value(value) + return self.codes.searchsorted(value, side=side, sorter=sorter) def isna(self): """ diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index cbb30763797d1..d38f77aaceb01 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -20,7 +20,7 @@ pandas_dtype, ) from pandas.core.dtypes.dtypes import CategoricalDtype -from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna, notna +from pandas.core.dtypes.missing import is_valid_nat_for_dtype, notna from pandas.core import accessor from pandas.core.algorithms import take_1d @@ -734,15 +734,10 @@ def insert(self, loc: int, item): ValueError if the item is not in the categories """ - code = self.categories.get_indexer([item]) - if (code == -1) and not (is_scalar(item) and isna(item)): - raise TypeError( - "cannot insert an item into a CategoricalIndex " - "that is not already an existing category" - ) + code = self._data._validate_insert_value(item) codes = self.codes - codes = np.concatenate((codes[:loc], code, codes[loc:])) + codes = np.concatenate((codes[:loc], [code], codes[loc:])) return self._create_from_codes(codes) def _concat(self, to_concat, name): From d9de663cae80dbf718b08ab852f3056da7b64559 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 7 Sep 2020 13:46:33 -0700 Subject: [PATCH 53/71] DTA/TDA/PA use self._data instead of self.asi8 for self._ndarray (#36171) --- pandas/core/arrays/datetimelike.py | 50 +++++++++++--------- pandas/core/arrays/datetimes.py | 4 ++ pandas/core/arrays/period.py | 4 ++ pandas/core/arrays/timedeltas.py | 4 ++ pandas/tests/frame/indexing/test_datetime.py | 4 +- 5 files changed, 43 insertions(+), 23 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index a5b8032974fa4..a218745db0a44 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -27,7 +27,7 @@ from pandas.compat import set_function_name from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError, NullFrequencyError, PerformanceWarning -from pandas.util._decorators import Appender, Substitution +from pandas.util._decorators import Appender, Substitution, cache_readonly from pandas.util._validators import validate_fillna_kwargs from pandas.core.dtypes.common import ( @@ -175,6 +175,14 @@ def _scalar_from_string(self, value: str) -> DTScalarOrNaT: """ raise AbstractMethodError(self) + @classmethod + def _rebox_native(cls, value: int) -> Union[int, np.datetime64, np.timedelta64]: + """ + Box an integer unboxed via _unbox_scalar into the native type for + the underlying ndarray. + """ + raise AbstractMethodError(cls) + def _unbox_scalar(self, value: DTScalarOrNaT) -> int: """ Unbox the integer value of a scalar `value`. 
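+
+        ``_rebox_native`` performs the inverse conversion, boxing the
+        unboxed integer back into the underlying ndarray's native scalar
+        type (e.g. ``np.datetime64`` for DatetimeArray).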
@@ -458,18 +466,15 @@ class DatetimeLikeArrayMixin( # ------------------------------------------------------------------ # NDArrayBackedExtensionArray compat - # TODO: make this a cache_readonly; need to get around _index_data - # kludge in libreduction - @property + @cache_readonly def _ndarray(self) -> np.ndarray: - # NB: A bunch of Interval tests fail if we use ._data - return self.asi8 + return self._data def _from_backing_data(self: _T, arr: np.ndarray) -> _T: # Note: we do not retain `freq` - # error: Too many arguments for "NDArrayBackedExtensionArray" - # error: Unexpected keyword argument "dtype" for "NDArrayBackedExtensionArray" - return type(self)(arr, dtype=self.dtype) # type: ignore[call-arg] + return type(self)._simple_new( # type: ignore[attr-defined] + arr, dtype=self.dtype + ) # ------------------------------------------------------------------ @@ -526,7 +531,7 @@ def __array__(self, dtype=None) -> np.ndarray: # used for Timedelta/DatetimeArray, overwritten by PeriodArray if is_object_dtype(dtype): return np.array(list(self), dtype=object) - return self._data + return self._ndarray def __getitem__(self, key): """ @@ -536,7 +541,7 @@ def __getitem__(self, key): if lib.is_integer(key): # fast-path - result = self._data[key] + result = self._ndarray[key] if self.ndim == 1: return self._box_func(result) return self._simple_new(result, dtype=self.dtype) @@ -557,7 +562,7 @@ def __getitem__(self, key): key = check_array_indexer(self, key) freq = self._get_getitem_freq(key) - result = self._data[key] + result = self._ndarray[key] if lib.is_scalar(result): return self._box_func(result) return self._simple_new(result, dtype=self.dtype, freq=freq) @@ -612,7 +617,7 @@ def __setitem__( value = self._validate_setitem_value(value) key = check_array_indexer(self, key) - self._data[key] = value + self._ndarray[key] = value self._maybe_clear_freq() def _maybe_clear_freq(self): @@ -663,8 +668,8 @@ def astype(self, dtype, copy=True): def view(self, dtype=None): if dtype is None or dtype is self.dtype: - return type(self)(self._data, dtype=self.dtype) - return self._data.view(dtype=dtype) + return type(self)(self._ndarray, dtype=self.dtype) + return self._ndarray.view(dtype=dtype) # ------------------------------------------------------------------ # ExtensionArray Interface @@ -705,7 +710,7 @@ def _from_factorized(cls, values, original): return cls(values, dtype=original.dtype) def _values_for_argsort(self): - return self._data + return self._ndarray # ------------------------------------------------------------------ # Validation Methods @@ -722,7 +727,7 @@ def _validate_fill_value(self, fill_value): Returns ------- - fill_value : np.int64 + fill_value : np.int64, np.datetime64, or np.timedelta64 Raises ------ @@ -736,7 +741,8 @@ def _validate_fill_value(self, fill_value): fill_value = self._validate_scalar(fill_value, msg) except TypeError as err: raise ValueError(msg) from err - return self._unbox(fill_value) + rv = self._unbox(fill_value) + return self._rebox_native(rv) def _validate_shift_value(self, fill_value): # TODO(2.0): once this deprecation is enforced, use _validate_fill_value @@ -951,9 +957,9 @@ def value_counts(self, dropna=False): from pandas import Index, Series if dropna: - values = self[~self.isna()]._data + values = self[~self.isna()]._ndarray else: - values = self._data + values = self._ndarray cls = type(self) @@ -1044,9 +1050,9 @@ def fillna(self, value=None, method=None, limit=None): else: func = missing.backfill_1d - values = self._data + values = self._ndarray if not 
is_period_dtype(self.dtype):
-            # For PeriodArray self._data is i8, which gets copied
+            # For PeriodArray self._ndarray is i8, which gets copied
             # by `func`. Otherwise we need to make a copy manually
             # to avoid modifying `self` in-place.
             values = values.copy()
diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py
index 1bea3a9eb137e..d913e7be9ae5f 100644
--- a/pandas/core/arrays/datetimes.py
+++ b/pandas/core/arrays/datetimes.py
@@ -446,6 +446,10 @@ def _generate_range(
     # -----------------------------------------------------------------
     # DatetimeLike Interface

+    @classmethod
+    def _rebox_native(cls, value: int) -> np.datetime64:
+        return np.int64(value).view("M8[ns]")
+
     def _unbox_scalar(self, value):
         if not isinstance(value, self._scalar_type) and value is not NaT:
             raise ValueError("'value' should be a Timestamp.")
diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py
index cc39ffb5d1203..c3a9430736969 100644
--- a/pandas/core/arrays/period.py
+++ b/pandas/core/arrays/period.py
@@ -253,6 +253,10 @@ def _generate_range(cls, start, end, periods, freq, fields):
     # -----------------------------------------------------------------
     # DatetimeLike Interface

+    @classmethod
+    def _rebox_native(cls, value: int) -> np.int64:
+        return np.int64(value)
+
     def _unbox_scalar(self, value: Union[Period, NaTType]) -> int:
         if value is NaT:
             return value.value
diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py
index 2d694c469b3a9..485ebb49a376d 100644
--- a/pandas/core/arrays/timedeltas.py
+++ b/pandas/core/arrays/timedeltas.py
@@ -271,6 +271,10 @@ def _generate_range(cls, start, end, periods, freq, closed=None):
     # ----------------------------------------------------------------
     # DatetimeLike Interface

+    @classmethod
+    def _rebox_native(cls, value: int) -> np.timedelta64:
+        return np.int64(value).view("m8[ns]")
+
     def _unbox_scalar(self, value):
         if not isinstance(value, self._scalar_type) and value is not NaT:
             raise ValueError("'value' should be a Timedelta.")
diff --git a/pandas/tests/frame/indexing/test_datetime.py b/pandas/tests/frame/indexing/test_datetime.py
index 1937a4c380dc9..1866ac341def6 100644
--- a/pandas/tests/frame/indexing/test_datetime.py
+++ b/pandas/tests/frame/indexing/test_datetime.py
@@ -23,7 +23,9 @@ def test_setitem(self, timezone_frame):
         b1 = df._mgr.blocks[1]
         b2 = df._mgr.blocks[2]
         tm.assert_extension_array_equal(b1.values, b2.values)
-        assert id(b1.values._data.base) != id(b2.values._data.base)
+        b1base = b1.values._data.base
+        b2base = b2.values._data.base
+        assert b1base is None or (id(b1base) != id(b2base))

         # with nan
         df2 = df.copy()

From 7cb14217612c8b253e02a86991835b1af30a43f2 Mon Sep 17 00:00:00 2001
From: Thomas Dickson
Date: Mon, 7 Sep 2020 21:47:39 +0100
Subject: [PATCH 54/71] TST: verify groupby doesn't alter uint64s to floats #30859 (#36164)

---
 pandas/tests/groupby/test_groupby.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index e0196df7ceac0..69397228dd941 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -1183,6 +1183,18 @@ def test_groupby_dtype_inference_empty():
     tm.assert_frame_equal(result, expected, by_blocks=True)


+def test_groupby_uint64_float_conversion():
+    # GH 30859: groupby sometimes converted uint64 values to floats
+    df = pd.DataFrame({"first": [1], "second": [1], "value": [16148277970000000000]})
+    result = df.groupby(["first", "second"])["value"].max()
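+    # 16148277970000000000 exceeds both the int64 range and float64's
+    # 2**53 integer precision, so a lossy intermediate cast would change it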
+ expected = pd.Series( + [16148277970000000000], + pd.MultiIndex.from_product([[1], [1]], names=["first", "second"]), + name="value", + ) + tm.assert_series_equal(result, expected) + + def test_groupby_list_infer_array_like(df): result = df.groupby(list(df["A"])).mean() expected = df.groupby(df["A"]).mean() From f79614082e03da46e842c42bd817059a2cb99359 Mon Sep 17 00:00:00 2001 From: patrick <61934744+phofl@users.noreply.github.com> Date: Mon, 7 Sep 2020 23:06:29 +0200 Subject: [PATCH 55/71] Fix compressed multiindex for output of groupby.rolling (#36152) --- doc/source/whatsnew/v1.1.2.rst | 1 + pandas/core/window/rolling.py | 10 +++++----- pandas/tests/window/test_grouper.py | 21 +++++++++++++++++++++ 3 files changed, 27 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index e9cba3de56920..28ce49c11b3f0 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -23,6 +23,7 @@ Fixed regressions - Regression in :meth:`DataFrame.replace` where a ``TypeError`` would be raised when attempting to replace elements of type :class:`Interval` (:issue:`35931`) - Fix regression in pickle roundtrip of the ``closed`` attribute of :class:`IntervalIndex` (:issue:`35658`) - Fixed regression in :meth:`DataFrameGroupBy.agg` where a ``ValueError: buffer source array is read-only`` would be raised when the underlying array is read-only (:issue:`36014`) +- Fixed regression in :meth:`Series.groupby.rolling` number of levels of :class:`MultiIndex` in input was compressed to one (:issue:`36018`) - .. --------------------------------------------------------------------------- diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 4c4ec4d700b7f..235bd5364af02 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -2211,17 +2211,17 @@ def _apply( # Compose MultiIndex result from grouping levels then rolling level # Aggregate the MultiIndex data as tuples then the level names grouped_object_index = self.obj.index - grouped_index_name = [grouped_object_index.name] + grouped_index_name = [*grouped_object_index.names] groupby_keys = [grouping.name for grouping in self._groupby.grouper._groupings] result_index_names = groupby_keys + grouped_index_name result_index_data = [] for key, values in self._groupby.grouper.indices.items(): for value in values: - if not is_list_like(key): - data = [key, grouped_object_index[value]] - else: - data = [*key, grouped_object_index[value]] + data = [ + *com.maybe_make_list(key), + *com.maybe_make_list(grouped_object_index[value]), + ] result_index_data.append(tuple(data)) result_index = MultiIndex.from_tuples( diff --git a/pandas/tests/window/test_grouper.py b/pandas/tests/window/test_grouper.py index 170bf100b3891..cb85ad7584da7 100644 --- a/pandas/tests/window/test_grouper.py +++ b/pandas/tests/window/test_grouper.py @@ -372,3 +372,24 @@ def test_groupby_subset_rolling_subset_with_closed(self): name="column1", ) tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("func", ["max", "min"]) + def test_groupby_rolling_index_changed(self, func): + # GH: #36018 nlevels of MultiIndex changed + ds = Series( + [1, 2, 2], + index=pd.MultiIndex.from_tuples( + [("a", "x"), ("a", "y"), ("c", "z")], names=["1", "2"] + ), + name="a", + ) + + result = getattr(ds.groupby(ds).rolling(2), func)() + expected = Series( + [np.nan, np.nan, 2.0], + index=pd.MultiIndex.from_tuples( + [(1, "a", "x"), (2, "a", "y"), (2, "c", "z")], names=["a", "1", "2"] + ), + 
name="a", + ) + tm.assert_series_equal(result, expected) From c962e70175b73c956c5e1fee43b0d40311d08329 Mon Sep 17 00:00:00 2001 From: patrick <61934744+phofl@users.noreply.github.com> Date: Mon, 7 Sep 2020 23:11:29 +0200 Subject: [PATCH 56/71] TST: DataFrame.replace: TypeError: Cannot compare types 'ndarray(dtype=int64)' and 'unicode' (#36202) --- pandas/tests/frame/methods/test_replace.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index ea2488dfc0877..a77753ed9f9d0 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -1599,3 +1599,11 @@ def test_replace_intervals(self): result = df.replace({"a": {pd.Interval(0, 1): "x"}}) expected = pd.DataFrame({"a": ["x", "x"]}) tm.assert_frame_equal(result, expected) + + def test_replace_unicode(self): + # GH: 16784 + columns_values_map = {"positive": {"正面": 1, "中立": 1, "负面": 0}} + df1 = pd.DataFrame({"positive": np.ones(3)}) + result = df1.replace(columns_values_map) + expected = pd.DataFrame({"positive": np.ones(3)}) + tm.assert_frame_equal(result, expected) From 22b547b64fc42fb5b41b854f322d1dd42aefeea2 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 7 Sep 2020 14:43:12 -0700 Subject: [PATCH 57/71] REF: collect methods by topic (#36173) --- pandas/core/arrays/categorical.py | 148 +++++++++++++++++------------- pandas/core/indexes/category.py | 29 +++--- pandas/core/indexes/datetimes.py | 3 + pandas/core/indexes/interval.py | 64 +++++++------ pandas/core/indexes/multi.py | 2 + pandas/core/indexes/numeric.py | 42 +++++---- pandas/core/indexes/period.py | 67 +++++++------- pandas/core/indexes/range.py | 5 + pandas/core/indexes/timedeltas.py | 5 + 9 files changed, 212 insertions(+), 153 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 228e630f95863..58847528d2183 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -393,56 +393,6 @@ def __init__( self._dtype = self._dtype.update_dtype(dtype) self._codes = coerce_indexer_dtype(codes, dtype.categories) - @property - def categories(self): - """ - The categories of this categorical. - - Setting assigns new values to each category (effectively a rename of - each individual category). - - The assigned value has to be a list-like object. All items must be - unique and the number of items in the new categories must be the same - as the number of items in the old categories. - - Assigning to `categories` is a inplace operation! - - Raises - ------ - ValueError - If the new categories do not validate as categories or if the - number of new categories is unequal the number of old categories - - See Also - -------- - rename_categories : Rename categories. - reorder_categories : Reorder categories. - add_categories : Add new categories. - remove_categories : Remove the specified categories. - remove_unused_categories : Remove categories which are not used. - set_categories : Set the categories to the specified ones. - """ - return self.dtype.categories - - @categories.setter - def categories(self, categories): - new_dtype = CategoricalDtype(categories, ordered=self.ordered) - if self.dtype.categories is not None and len(self.dtype.categories) != len( - new_dtype.categories - ): - raise ValueError( - "new categories need to have the same number of " - "items as the old categories!" 
- ) - self._dtype = new_dtype - - @property - def ordered(self) -> Ordered: - """ - Whether the categories have an ordered relationship. - """ - return self.dtype.ordered - @property def dtype(self) -> CategoricalDtype: """ @@ -458,10 +408,6 @@ def _constructor(self) -> Type["Categorical"]: def _from_sequence(cls, scalars, dtype=None, copy=False): return Categorical(scalars, dtype=dtype) - def _formatter(self, boxed=False): - # Defer to CategoricalFormatter's formatter. - return None - def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike: """ Coerce this type to another dtype @@ -640,6 +586,59 @@ def from_codes(cls, codes, categories=None, ordered=None, dtype=None): return cls(codes, dtype=dtype, fastpath=True) + # ------------------------------------------------------------------ + # Categories/Codes/Ordered + + @property + def categories(self): + """ + The categories of this categorical. + + Setting assigns new values to each category (effectively a rename of + each individual category). + + The assigned value has to be a list-like object. All items must be + unique and the number of items in the new categories must be the same + as the number of items in the old categories. + + Assigning to `categories` is a inplace operation! + + Raises + ------ + ValueError + If the new categories do not validate as categories or if the + number of new categories is unequal the number of old categories + + See Also + -------- + rename_categories : Rename categories. + reorder_categories : Reorder categories. + add_categories : Add new categories. + remove_categories : Remove the specified categories. + remove_unused_categories : Remove categories which are not used. + set_categories : Set the categories to the specified ones. + """ + return self.dtype.categories + + @categories.setter + def categories(self, categories): + new_dtype = CategoricalDtype(categories, ordered=self.ordered) + if self.dtype.categories is not None and len(self.dtype.categories) != len( + new_dtype.categories + ): + raise ValueError( + "new categories need to have the same number of " + "items as the old categories!" + ) + self._dtype = new_dtype + + @property + def ordered(self) -> Ordered: + """ + Whether the categories have an ordered relationship. + """ + return self.dtype.ordered + @property def codes(self) -> np.ndarray: """ @@ -1104,6 +1103,8 @@ def remove_unused_categories(self, inplace=False): if not inplace: return cat + # ------------------------------------------------------------------ + def map(self, mapper): """ Map categories using input correspondence (dict, Series, or function). @@ -1192,6 +1193,9 @@ def map(self, mapper): __le__ = _cat_compare_op(operator.le) __ge__ = _cat_compare_op(operator.ge) + # ------------------------------------------------------------- + # Validators; ideally these can be de-duplicated + def _validate_insert_value(self, value) -> int: code = self.categories.get_indexer([value]) if (code == -1) and not (is_scalar(value) and isna(value)): @@ -1241,6 +1245,8 @@ def _validate_fill_value(self, fill_value): ) return fill_value + # ------------------------------------------------------------- + def __array__(self, dtype=None) -> np.ndarray: """ The numpy array interface. @@ -1758,6 +1764,10 @@ def __contains__(self, key) -> bool: # ------------------------------------------------------------------ # Rendering Methods + def _formatter(self, boxed=False): + # Defer to CategoricalFormatter's formatter. 
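+        # Returning None makes the caller fall back to its default
+        # element formatting.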
+ return None + def _tidy_repr(self, max_vals=10, footer=True) -> str: """ a short repr displaying only max_vals and an optional (but default @@ -1987,7 +1997,9 @@ def _reverse_indexer(self) -> Dict[Hashable, np.ndarray]: result = dict(zip(categories, _result)) return result - # reduction ops # + # ------------------------------------------------------------------ + # Reductions + def _reduce(self, name: str, skipna: bool = True, **kwargs): func = getattr(self, name, None) if func is None: @@ -2090,6 +2102,9 @@ def mode(self, dropna=True): codes = sorted(htable.mode_int64(ensure_int64(codes), dropna)) return self._constructor(values=codes, dtype=self.dtype, fastpath=True) + # ------------------------------------------------------------------ + # ExtensionArray Interface + def unique(self): """ Return the ``Categorical`` which ``categories`` and ``codes`` are @@ -2179,6 +2194,18 @@ def equals(self, other: object) -> bool: return np.array_equal(self._codes, other_codes) return False + @property + def _can_hold_na(self): + return True + + @classmethod + def _concat_same_type(self, to_concat): + from pandas.core.dtypes.concat import union_categoricals + + return union_categoricals(to_concat) + + # ------------------------------------------------------------------ + def is_dtype_equal(self, other): """ Returns True if categoricals are the same dtype @@ -2217,17 +2244,6 @@ def describe(self): return result - # Implement the ExtensionArray interface - @property - def _can_hold_na(self): - return True - - @classmethod - def _concat_same_type(self, to_concat): - from pandas.core.dtypes.concat import union_categoricals - - return union_categoricals(to_concat) - def isin(self, values) -> np.ndarray: """ Check whether `values` are contained in Categorical. diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index d38f77aaceb01..7509cb35069e8 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -433,11 +433,6 @@ def _to_safe_for_reshape(self): """ convert to object if we are a categorical """ return self.astype("object") - def _maybe_cast_indexer(self, key): - code = self.categories.get_loc(key) - code = self.codes.dtype.type(code) - return code - @doc(Index.where) def where(self, cond, other=None): # TODO: Investigate an alternative implementation with @@ -537,6 +532,14 @@ def _reindex_non_unique(self, target): return new_target, indexer, new_indexer + # -------------------------------------------------------------------- + # Indexing Methods + + def _maybe_cast_indexer(self, key): + code = self.categories.get_loc(key) + code = self.codes.dtype.type(code) + return code + @Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs) def get_indexer(self, target, method=None, limit=None, tolerance=None): method = missing.clean_reindex_fill_method(method) @@ -619,6 +622,15 @@ def _convert_arr_indexer(self, keyarr): def _convert_index_indexer(self, keyarr): return self._shallow_copy(keyarr) + @doc(Index._maybe_cast_slice_bound) + def _maybe_cast_slice_bound(self, label, side, kind): + if kind == "loc": + return label + + return super()._maybe_cast_slice_bound(label, side, kind) + + # -------------------------------------------------------------------- + def take_nd(self, *args, **kwargs): """Alias for `take`""" warnings.warn( @@ -628,13 +640,6 @@ def take_nd(self, *args, **kwargs): ) return self.take(*args, **kwargs) - @doc(Index._maybe_cast_slice_bound) - def _maybe_cast_slice_bound(self, label, side, kind): - if kind == "loc": - return label 
- - return super()._maybe_cast_slice_bound(label, side, kind) - def map(self, mapper): """ Map values using input correspondence (a dict, Series, or function). diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 3fd93a8159041..f0b80c2852bd5 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -509,6 +509,9 @@ def snap(self, freq="S"): dta = DatetimeArray(snapped, dtype=self.dtype) return DatetimeIndex._simple_new(dta, name=self.name) + # -------------------------------------------------------------------- + # Indexing Methods + def _parsed_string_to_bounds(self, reso: Resolution, parsed: datetime): """ Calculate datetime bounds for parsed time string and its resolution. diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 419ff81a2a478..3f72577c9420e 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -57,7 +57,7 @@ from pandas.core.ops import get_op_result_name if TYPE_CHECKING: - from pandas import CategoricalIndex + from pandas import CategoricalIndex # noqa:F401 _VALID_CLOSED = {"left", "right", "both", "neither"} _index_doc_kwargs = dict(ibase._index_doc_kwargs) @@ -515,28 +515,6 @@ def is_overlapping(self) -> bool: # GH 23309 return self._engine.is_overlapping - def _should_fallback_to_positional(self) -> bool: - # integer lookups in Series.__getitem__ are unambiguously - # positional in this case - return self.dtype.subtype.kind in ["m", "M"] - - def _maybe_cast_slice_bound(self, label, side, kind): - return getattr(self, side)._maybe_cast_slice_bound(label, side, kind) - - @Appender(Index._convert_list_indexer.__doc__) - def _convert_list_indexer(self, keyarr): - """ - we are passed a list-like indexer. Return the - indexer for matching intervals. - """ - locs = self.get_indexer_for(keyarr) - - # we have missing values - if (locs == -1).any(): - raise KeyError - - return locs - def _can_reindex(self, indexer: np.ndarray) -> None: """ Check if we are allowing reindexing with this particular indexer. @@ -668,6 +646,9 @@ def _searchsorted_monotonic(self, label, side, exclude_label=False): return sub_idx._searchsorted_monotonic(label, side) + # -------------------------------------------------------------------- + # Indexing Methods + def get_loc( self, key, method: Optional[str] = None, tolerance=None ) -> Union[int, slice, np.ndarray]: @@ -885,6 +866,30 @@ def _convert_slice_indexer(self, key: slice, kind: str): return super()._convert_slice_indexer(key, kind) + def _should_fallback_to_positional(self) -> bool: + # integer lookups in Series.__getitem__ are unambiguously + # positional in this case + return self.dtype.subtype.kind in ["m", "M"] + + def _maybe_cast_slice_bound(self, label, side, kind): + return getattr(self, side)._maybe_cast_slice_bound(label, side, kind) + + @Appender(Index._convert_list_indexer.__doc__) + def _convert_list_indexer(self, keyarr): + """ + we are passed a list-like indexer. Return the + indexer for matching intervals. 
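+
+        Raises a bare ``KeyError`` if any entry has no matching interval.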
+ """ + locs = self.get_indexer_for(keyarr) + + # we have missing values + if (locs == -1).any(): + raise KeyError + + return locs + + # -------------------------------------------------------------------- + @Appender(Index.where.__doc__) def where(self, cond, other=None): if other is None: @@ -1030,6 +1035,9 @@ def equals(self, other: object) -> bool: and self.closed == other.closed ) + # -------------------------------------------------------------------- + # Set Operations + @Appender(Index.intersection.__doc__) @SetopCheck(op_name="intersection") def intersection( @@ -1115,6 +1123,12 @@ def func(self, other, sort=sort): return func + union = _setop("union") + difference = _setop("difference") + symmetric_difference = _setop("symmetric_difference") + + # -------------------------------------------------------------------- + @property def is_all_dates(self) -> bool: """ @@ -1123,10 +1137,6 @@ def is_all_dates(self) -> bool: """ return False - union = _setop("union") - difference = _setop("difference") - symmetric_difference = _setop("symmetric_difference") - # TODO: arithmetic operations # GH#30817 until IntervalArray implements inequalities, get them from Index diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index e49a23935efbd..9630e154ccd17 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3154,6 +3154,8 @@ def _update_indexer(idxr, indexer=indexer): return indexer._values + # -------------------------------------------------------------------- + def _reorder_indexer( self, seq: Tuple[Union[Scalar, Iterable, AnyArrayLike], ...], diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index cd3f1f51a86d2..079f43cb2c66b 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -97,6 +97,9 @@ def _validate_dtype(cls, dtype: Dtype) -> None: f"Incorrect `dtype` passed: expected {expected}, received {dtype}" ) + # ---------------------------------------------------------------- + # Indexing Methods + @doc(Index._maybe_cast_slice_bound) def _maybe_cast_slice_bound(self, label, side, kind): assert kind in ["loc", "getitem", None] @@ -104,6 +107,8 @@ def _maybe_cast_slice_bound(self, label, side, kind): # we will try to coerce to integers return self._maybe_cast_indexer(label) + # ---------------------------------------------------------------- + @doc(Index._shallow_copy) def _shallow_copy(self, values=None, name: Label = lib.no_default): if values is not None and not self._can_hold_na and values.dtype.kind == "f": @@ -293,6 +298,9 @@ class UInt64Index(IntegerIndex): _engine_type = libindex.UInt64Engine _default_dtype = np.dtype(np.uint64) + # ---------------------------------------------------------------- + # Indexing Methods + @doc(Index._convert_arr_indexer) def _convert_arr_indexer(self, keyarr): # Cast the indexer to uint64 if possible so that the values returned @@ -314,6 +322,8 @@ def _convert_index_indexer(self, keyarr): return keyarr.astype(np.uint64) return keyarr + # ---------------------------------------------------------------- + def _wrap_joined_index(self, joined, other): name = get_op_result_name(self, other) return UInt64Index(joined, name=name) @@ -385,6 +395,22 @@ def _convert_slice_indexer(self, key: slice, kind: str): # translate to locations return self.slice_indexer(key.start, key.stop, key.step, kind=kind) + @doc(Index.get_loc) + def get_loc(self, key, method=None, tolerance=None): + if is_bool(key): + # Catch this to avoid accidentally casting to 1.0 + raise 
KeyError(key) + + if is_float(key) and np.isnan(key): + nan_idxs = self._nan_idxs + if not len(nan_idxs): + raise KeyError(key) + elif len(nan_idxs) == 1: + return nan_idxs[0] + return nan_idxs + + return super().get_loc(key, method=method, tolerance=tolerance) + # ---------------------------------------------------------------- def _format_native_types( @@ -409,22 +435,6 @@ def __contains__(self, other: Any) -> bool: return is_float(other) and np.isnan(other) and self.hasnans - @doc(Index.get_loc) - def get_loc(self, key, method=None, tolerance=None): - if is_bool(key): - # Catch this to avoid accidentally casting to 1.0 - raise KeyError(key) - - if is_float(key) and np.isnan(key): - nan_idxs = self._nan_idxs - if not len(nan_idxs): - raise KeyError(key) - elif len(nan_idxs) == 1: - return nan_idxs[0] - return nan_idxs - - return super().get_loc(key, method=method, tolerance=tolerance) - @cache_readonly def is_unique(self) -> bool: return super().is_unique and self._nan_idxs.size < 2 diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index cdb502199c6f1..5282b6f0154b4 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -433,6 +433,41 @@ def inferred_type(self) -> str: # indexing return "period" + def insert(self, loc, item): + if not isinstance(item, Period) or self.freq != item.freq: + return self.astype(object).insert(loc, item) + + i8result = np.concatenate( + (self[:loc].asi8, np.array([item.ordinal]), self[loc:].asi8) + ) + arr = type(self._data)._simple_new(i8result, dtype=self.dtype) + return type(self)._simple_new(arr, name=self.name) + + def join(self, other, how="left", level=None, return_indexers=False, sort=False): + """ + See Index.join + """ + self._assert_can_do_setop(other) + + if not isinstance(other, PeriodIndex): + return self.astype(object).join( + other, how=how, level=level, return_indexers=return_indexers, sort=sort + ) + + # _assert_can_do_setop ensures we have matching dtype + result = Int64Index.join( + self, + other, + how=how, + level=level, + return_indexers=return_indexers, + sort=sort, + ) + return result + + # ------------------------------------------------------------------------ + # Indexing Methods + @Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs) def get_indexer(self, target, method=None, limit=None, tolerance=None): target = ensure_index(target) @@ -607,38 +642,6 @@ def _get_string_slice(self, key: str, use_lhs: bool = True, use_rhs: bool = True except KeyError as err: raise KeyError(key) from err - def insert(self, loc, item): - if not isinstance(item, Period) or self.freq != item.freq: - return self.astype(object).insert(loc, item) - - i8result = np.concatenate( - (self[:loc].asi8, np.array([item.ordinal]), self[loc:].asi8) - ) - arr = type(self._data)._simple_new(i8result, dtype=self.dtype) - return type(self)._simple_new(arr, name=self.name) - - def join(self, other, how="left", level=None, return_indexers=False, sort=False): - """ - See Index.join - """ - self._assert_can_do_setop(other) - - if not isinstance(other, PeriodIndex): - return self.astype(object).join( - other, how=how, level=level, return_indexers=return_indexers, sort=sort - ) - - # _assert_can_do_setop ensures we have matching dtype - result = Int64Index.join( - self, - other, - how=how, - level=level, - return_indexers=return_indexers, - sort=sort, - ) - return result - # ------------------------------------------------------------------------ # Set Operation Methods diff --git a/pandas/core/indexes/range.py 
b/pandas/core/indexes/range.py index f1457a9aac62b..684691501de5c 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -338,6 +338,9 @@ def __contains__(self, key: Any) -> bool: return False return key in self._range + # -------------------------------------------------------------------- + # Indexing Methods + @doc(Int64Index.get_loc) def get_loc(self, key, method=None, tolerance=None): if method is None and tolerance is None: @@ -379,6 +382,8 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): locs[valid] = len(self) - 1 - locs[valid] return ensure_platform_int(locs) + # -------------------------------------------------------------------- + def tolist(self): return list(self._range) diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 85c8396dfd1fe..df08fda78823d 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -202,6 +202,9 @@ def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: """ return is_timedelta64_dtype(dtype) + # ------------------------------------------------------------------- + # Indexing Methods + def get_loc(self, key, method=None, tolerance=None): """ Get integer location for requested label @@ -248,6 +251,8 @@ def _maybe_cast_slice_bound(self, label, side: str, kind): return label + # ------------------------------------------------------------------- + def is_type_compatible(self, typ) -> bool: return typ == self.inferred_type or typ == "timedelta" From a1559865dcdf2ec1cfbe1418544288558032d026 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 7 Sep 2020 15:41:00 -0700 Subject: [PATCH 58/71] REF: implement Categorical._validate_setitem_value (#36180) --- pandas/core/arrays/categorical.py | 35 +++++++++++++++--------------- pandas/core/arrays/datetimelike.py | 16 +++++++++----- 2 files changed, 28 insertions(+), 23 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 58847528d2183..b732db4c66003 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -9,7 +9,7 @@ from pandas._config import get_option -from pandas._libs import NaT, algos as libalgos, hashtable as htable +from pandas._libs import NaT, algos as libalgos, hashtable as htable, lib from pandas._typing import ArrayLike, Dtype, Ordered, Scalar from pandas.compat.numpy import function as nv from pandas.util._decorators import cache_readonly, deprecate_kwarg, doc @@ -1868,14 +1868,6 @@ def __repr__(self) -> str: # ------------------------------------------------------------------ - def _maybe_coerce_indexer(self, indexer): - """ - return an indexer coerced to the codes dtype - """ - if isinstance(indexer, np.ndarray) and indexer.dtype.kind == "i": - indexer = indexer.astype(self._codes.dtype) - return indexer - def __getitem__(self, key): """ Return an item. 
@@ -1905,6 +1897,11 @@ def __setitem__(self, key, value): If (one or more) Value is not in categories or if a assigned `Categorical` does not have the same categories """ + key = self._validate_setitem_key(key) + value = self._validate_setitem_value(value) + self._ndarray[key] = value + + def _validate_setitem_value(self, value): value = extract_array(value, extract_numpy=True) # require identical categories set @@ -1934,12 +1931,19 @@ def __setitem__(self, key, value): "category, set the categories first" ) - # set by position - if isinstance(key, (int, np.integer)): + lindexer = self.categories.get_indexer(rvalue) + if isinstance(lindexer, np.ndarray) and lindexer.dtype.kind == "i": + lindexer = lindexer.astype(self._ndarray.dtype) + + return lindexer + + def _validate_setitem_key(self, key): + if lib.is_integer(key): + # set by position pass - # tuple of indexers (dataframe) elif isinstance(key, tuple): + # tuple of indexers (dataframe) # only allow 1 dimensional slicing, but can # in a 2-d case be passed (slice(None),....) if len(key) == 2: @@ -1951,17 +1955,14 @@ def __setitem__(self, key, value): else: raise AssertionError("invalid slicing for a 1-ndim categorical") - # slicing in Series or Categorical elif isinstance(key, slice): + # slicing in Series or Categorical pass # else: array of True/False in Series or Categorical - lindexer = self.categories.get_indexer(rvalue) - lindexer = self._maybe_coerce_indexer(lindexer) - key = check_array_indexer(self, key) - self._codes[key] = lindexer + return key def _reverse_indexer(self) -> Dict[Hashable, np.ndarray]: """ diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index a218745db0a44..2626890c2dbe5 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -546,6 +546,15 @@ def __getitem__(self, key): return self._box_func(result) return self._simple_new(result, dtype=self.dtype) + key = self._validate_getitem_key(key) + result = self._ndarray[key] + if lib.is_scalar(result): + return self._box_func(result) + + freq = self._get_getitem_freq(key) + return self._simple_new(result, dtype=self.dtype, freq=freq) + + def _validate_getitem_key(self, key): if com.is_bool_indexer(key): # first convert to boolean, because check_array_indexer doesn't # allow object dtype @@ -560,12 +569,7 @@ def __getitem__(self, key): pass else: key = check_array_indexer(self, key) - - freq = self._get_getitem_freq(key) - result = self._ndarray[key] - if lib.is_scalar(result): - return self._box_func(result) - return self._simple_new(result, dtype=self.dtype, freq=freq) + return key def _get_getitem_freq(self, key): """ From 5aa96ddc103eeb8fd8ce57e82a709f5ea766f674 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 7 Sep 2020 17:15:51 -0700 Subject: [PATCH 59/71] COMPAT: match numpy behavior for searchsorted on dt64/td64 (#36176) --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/core/arrays/datetimelike.py | 7 +++---- pandas/tests/arrays/test_datetimelike.py | 11 ++++++++--- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index ccaae9f996425..2afa1f1a6199e 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -228,6 +228,7 @@ Datetimelike - Bug in :class:`DateOffset` where attributes reconstructed from pickle files differ from original objects when input values exceed normal ranges (e.g months=12) (:issue:`34511`) - Bug in :meth:`DatetimeIndex.get_slice_bound` where ``datetime.date`` 
objects were not accepted or naive :class:`Timestamp` with a tz-aware :class:`DatetimeIndex` (:issue:`35690`) - Bug in :meth:`DatetimeIndex.slice_locs` where ``datetime.date`` objects were not accepted (:issue:`34077`) +- Bug in :meth:`DatetimeIndex.searchsorted`, :meth:`TimedeltaIndex.searchsorted`, and :meth:`Series.searchsorted` with ``datetime64`` or ``timedelta64`` dtype placement of ``NaT`` values being inconsistent with ``NumPy`` (:issue:`36176`) Timedelta ^^^^^^^^^ diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 2626890c2dbe5..6477b94a823ce 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -862,7 +862,8 @@ def _validate_searchsorted_value(self, value): # TODO: cast_str? we accept it for scalar value = self._validate_listlike(value, "searchsorted") - return self._unbox(value) + rv = self._unbox(value) + return self._rebox_native(rv) def _validate_setitem_value(self, value): msg = ( @@ -941,9 +942,7 @@ def searchsorted(self, value, side="left", sorter=None): Array of insertion points with the same shape as `value`. """ value = self._validate_searchsorted_value(value) - - # TODO: Use datetime64 semantics for sorting, xref GH#29844 - return self.asi8.searchsorted(value, side=side, sorter=sorter) + return self._data.searchsorted(value, side=side, sorter=sorter) def value_counts(self, dropna=False): """ diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index b1ab700427c28..292557fc04258 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -241,10 +241,15 @@ def test_searchsorted(self): expected = np.array([2, 3], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) - # Following numpy convention, NaT goes at the beginning - # (unlike NaN which goes at the end) + # GH#29884 match numpy convention on whether NaT goes + # at the end or the beginning result = arr.searchsorted(pd.NaT) - assert result == 0 + if _np_version_under1p18 or self.array_cls is PeriodArray: + # Following numpy convention, NaT goes at the beginning + # (unlike NaN which goes at the end) + assert result == 0 + else: + assert result == 10 def test_getitem_2d(self, arr1d): # 2d slicing on a 1D array From 81c5802e6eb6cf4b441f5c386bd39996c5503425 Mon Sep 17 00:00:00 2001 From: Nidhi Zare Date: Tue, 8 Sep 2020 06:00:19 +0530 Subject: [PATCH 60/71] pandas docs json_normalize example (#36194) Co-authored-by: Nidhi Zare --- pandas/io/json/_normalize.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/io/json/_normalize.py b/pandas/io/json/_normalize.py index 44765dbe74b46..2e1fc57e88ed1 100644 --- a/pandas/io/json/_normalize.py +++ b/pandas/io/json/_normalize.py @@ -176,7 +176,7 @@ def _json_normalize( ... 'fitness': {'height': 130, 'weight': 60}}, ... {'id': 2, 'name': 'Faye Raker', ... 'fitness': {'height': 130, 'weight': 60}}] - >>> json_normalize(data, max_level=0) + >>> pandas.json_normalize(data, max_level=0) fitness id name 0 {'height': 130, 'weight': 60} 1.0 Cole Volk 1 {'height': 130, 'weight': 60} NaN Mose Reg @@ -191,7 +191,7 @@ def _json_normalize( ... 'fitness': {'height': 130, 'weight': 60}}, ... {'id': 2, 'name': 'Faye Raker', ... 'fitness': {'height': 130, 'weight': 60}}] - >>> json_normalize(data, max_level=1) + >>> pandas.json_normalize(data, max_level=1) fitness.height fitness.weight id name 0 130 60 1.0 Cole Volk 1 130 60 NaN Mose Reg @@ -208,7 +208,7 @@ def _json_normalize( ... 
'info': {'governor': 'John Kasich'}, ... 'counties': [{'name': 'Summit', 'population': 1234}, ... {'name': 'Cuyahoga', 'population': 1337}]}] - >>> result = json_normalize(data, 'counties', ['state', 'shortname', + >>> result = pandas.json_normalize(data, 'counties', ['state', 'shortname', ... ['info', 'governor']]) >>> result name population state shortname info.governor @@ -219,7 +219,7 @@ def _json_normalize( 4 Cuyahoga 1337 Ohio OH John Kasich >>> data = {'A': [1, 2]} - >>> json_normalize(data, 'A', record_prefix='Prefix.') + >>> pandas.json_normalize(data, 'A', record_prefix='Prefix.') Prefix.0 0 1 1 2 From a56c6af86c6ddf46f872cfabc79904ece3543441 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Tue, 8 Sep 2020 02:51:20 -0700 Subject: [PATCH 61/71] BUG: GroupbyRolling with an empty frame (#36208) Co-authored-by: Matt Roeschke --- doc/source/whatsnew/v1.1.2.rst | 2 +- pandas/core/window/rolling.py | 10 ++++++---- pandas/tests/window/test_grouper.py | 12 ++++++++++++ 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index 28ce49c11b3f0..f13d38d1f8f76 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -24,7 +24,7 @@ Fixed regressions - Fix regression in pickle roundtrip of the ``closed`` attribute of :class:`IntervalIndex` (:issue:`35658`) - Fixed regression in :meth:`DataFrameGroupBy.agg` where a ``ValueError: buffer source array is read-only`` would be raised when the underlying array is read-only (:issue:`36014`) - Fixed regression in :meth:`Series.groupby.rolling` number of levels of :class:`MultiIndex` in input was compressed to one (:issue:`36018`) -- +- Fixed regression in :class:`DataFrameGroupBy` on an empty :class:`DataFrame` (:issue:`36197`) .. 
--------------------------------------------------------------------------- diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 235bd5364af02..9466ada3f4578 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -2240,10 +2240,12 @@ def _create_blocks(self, obj: FrameOrSeriesUnion): """ # Ensure the object we're rolling over is monotonically sorted relative # to the groups - groupby_order = np.concatenate( - list(self._groupby.grouper.indices.values()) - ).astype(np.int64) - obj = obj.take(groupby_order) + # GH 36197 + if not obj.empty: + groupby_order = np.concatenate( + list(self._groupby.grouper.indices.values()) + ).astype(np.int64) + obj = obj.take(groupby_order) return super()._create_blocks(obj) def _get_cython_func_type(self, func: str) -> Callable: diff --git a/pandas/tests/window/test_grouper.py b/pandas/tests/window/test_grouper.py index cb85ad7584da7..786cf68d28871 100644 --- a/pandas/tests/window/test_grouper.py +++ b/pandas/tests/window/test_grouper.py @@ -393,3 +393,15 @@ def test_groupby_rolling_index_changed(self, func): name="a", ) tm.assert_series_equal(result, expected) + + def test_groupby_rolling_empty_frame(self): + # GH 36197 + expected = pd.DataFrame({"s1": []}) + result = expected.groupby("s1").rolling(window=1).sum() + expected.index = pd.MultiIndex.from_tuples([], names=["s1", None]) + tm.assert_frame_equal(result, expected) + + expected = pd.DataFrame({"s1": [], "s2": []}) + result = expected.groupby(["s1", "s2"]).rolling(window=1).sum() + expected.index = pd.MultiIndex.from_tuples([], names=["s1", "s2", None]) + tm.assert_frame_equal(result, expected) From 4a0152e731c3c34d2f5cde80952d1bee497df80f Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 8 Sep 2020 11:22:20 +0100 Subject: [PATCH 62/71] DOC: doc fix (#36205) --- doc/source/whatsnew/v1.1.2.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index f13d38d1f8f76..0e4a88f3ee56b 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -51,7 +51,7 @@ Bug fixes Other ~~~~~ - :meth:`factorize` now supports ``na_sentinel=None`` to include NaN in the uniques of the values and remove ``dropna`` keyword which was unintentionally exposed to public facing API in 1.1 version from :meth:`factorize` (:issue:`35667`) -- :meth:`DataFrame.plot` and meth:`Series.plot` raise ``UserWarning`` about usage of FixedFormatter and FixedLocator (:issue:`35684` and :issue:`35945`) +- :meth:`DataFrame.plot` and :meth:`Series.plot` raise ``UserWarning`` about usage of ``FixedFormatter`` and ``FixedLocator`` (:issue:`35684` and :issue:`35945`) .. --------------------------------------------------------------------------- From 3aed293f71416a1543c962b3a72c6a31d5c36006 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 8 Sep 2020 12:50:13 +0100 Subject: [PATCH 63/71] DOC: release date for 1.1.2 (#36182) --- doc/source/whatsnew/v1.1.2.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index 0e4a88f3ee56b..a214ad9762733 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -1,7 +1,7 @@ .. _whatsnew_112: -What's new in 1.1.2 (??) ------------------------- +What's new in 1.1.2 (September 8, 2020) +--------------------------------------- These are the changes in pandas 1.1.2. See :ref:`release` for a full changelog including other versions of pandas. 
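
Note on PATCH 61 above: a minimal reproducer for the empty-frame regression, sketched from the new test_groupby_rolling_empty_frame test. Before the fix, the unconditional np.concatenate over grouper.indices.values() raised "ValueError: need at least one array to concatenate" whenever the grouped frame was empty (GH 36197):

    import pandas as pd

    df = pd.DataFrame({"s1": []})
    # Prior to the guard in _create_blocks this raised ValueError from
    # np.concatenate([]); with the fix it returns an empty result whose
    # MultiIndex is named ["s1", None], matching the new test's expectation.
    result = df.groupby("s1").rolling(window=1).sum()
    print(result)

The fix itself is deliberately narrow: the monotonic reordering of the rolled-over object is simply skipped when obj.empty, since an empty frame has no groups to reorder.
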
From 4c9add82fbe587599892c22b6718d467a1d4de9a Mon Sep 17 00:00:00 2001 From: Yanxian Lin Date: Tue, 8 Sep 2020 06:01:43 -0700 Subject: [PATCH 64/71] Fixed pandas.json_normalize doctests errors` (#36207) --- pandas/io/json/_normalize.py | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/pandas/io/json/_normalize.py b/pandas/io/json/_normalize.py index 2e1fc57e88ed1..3ed0b5851b395 100644 --- a/pandas/io/json/_normalize.py +++ b/pandas/io/json/_normalize.py @@ -163,11 +163,11 @@ def _json_normalize( >>> data = [{'id': 1, 'name': {'first': 'Coleen', 'last': 'Volk'}}, ... {'name': {'given': 'Mose', 'family': 'Regner'}}, ... {'id': 2, 'name': 'Faye Raker'}] - >>> pandas.json_normalize(data) - id name name.family name.first name.given name.last - 0 1.0 NaN NaN Coleen NaN Volk - 1 NaN NaN Regner NaN Mose NaN - 2 2.0 Faye Raker NaN NaN NaN NaN + >>> pd.json_normalize(data) + id name.first name.last name.given name.family name + 0 1.0 Coleen Volk NaN NaN NaN + 1 NaN NaN NaN Mose Regner NaN + 2 2.0 NaN NaN NaN NaN Faye Raker >>> data = [{'id': 1, ... 'name': "Cole Volk", @@ -176,11 +176,11 @@ def _json_normalize( ... 'fitness': {'height': 130, 'weight': 60}}, ... {'id': 2, 'name': 'Faye Raker', ... 'fitness': {'height': 130, 'weight': 60}}] - >>> pandas.json_normalize(data, max_level=0) - fitness id name - 0 {'height': 130, 'weight': 60} 1.0 Cole Volk - 1 {'height': 130, 'weight': 60} NaN Mose Reg - 2 {'height': 130, 'weight': 60} 2.0 Faye Raker + >>> pd.json_normalize(data, max_level=0) + id name fitness + 0 1.0 Cole Volk {'height': 130, 'weight': 60} + 1 NaN Mose Reg {'height': 130, 'weight': 60} + 2 2.0 Faye Raker {'height': 130, 'weight': 60} Normalizes nested data up to level 1. @@ -191,11 +191,11 @@ def _json_normalize( ... 'fitness': {'height': 130, 'weight': 60}}, ... {'id': 2, 'name': 'Faye Raker', ... 'fitness': {'height': 130, 'weight': 60}}] - >>> pandas.json_normalize(data, max_level=1) - fitness.height fitness.weight id name - 0 130 60 1.0 Cole Volk - 1 130 60 NaN Mose Reg - 2 130 60 2.0 Faye Raker + >>> pd.json_normalize(data, max_level=1) + id name fitness.height fitness.weight + 0 1.0 Cole Volk 130 60 + 1 NaN Mose Reg 130 60 + 2 2.0 Faye Raker 130 60 >>> data = [{'state': 'Florida', ... 'shortname': 'FL', @@ -208,7 +208,7 @@ def _json_normalize( ... 'info': {'governor': 'John Kasich'}, ... 'counties': [{'name': 'Summit', 'population': 1234}, ... {'name': 'Cuyahoga', 'population': 1337}]}] - >>> result = pandas.json_normalize(data, 'counties', ['state', 'shortname', + >>> result = pd.json_normalize(data, 'counties', ['state', 'shortname', ... 
['info', 'governor']]) >>> result name population state shortname info.governor @@ -219,7 +219,7 @@ def _json_normalize( 4 Cuyahoga 1337 Ohio OH John Kasich >>> data = {'A': [1, 2]} - >>> pandas.json_normalize(data, 'A', record_prefix='Prefix.') + >>> pd.json_normalize(data, 'A', record_prefix='Prefix.') Prefix.0 0 1 1 2 From 11643bc9072d25c49e57339959415771fc7b78fa Mon Sep 17 00:00:00 2001 From: Irv Lustig Date: Tue, 8 Sep 2020 09:45:08 -0400 Subject: [PATCH 65/71] BUG: copying series into empty dataframe does not preserve dataframe index name (#36141) --- doc/source/whatsnew/v1.1.2.rst | 1 + pandas/core/frame.py | 8 +++++--- pandas/tests/indexing/test_partial.py | 12 ++++++++++++ 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index a214ad9762733..c6a08f4fb852a 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -43,6 +43,7 @@ Bug fixes - Bug in :class:`DataFrame` indexing returning an incorrect :class:`Series` in some cases when the series has been altered and a cache not invalidated (:issue:`33675`) - Bug in :meth:`DataFrame.corr` causing subsequent indexing lookups to be incorrect (:issue:`35882`) - Bug in :meth:`import_optional_dependency` returning incorrect package names in cases where package name is different from import name (:issue:`35948`) +- Bug when setting empty :class:`DataFrame` column to a :class:`Series` in preserving name of index in frame (:issue:`31368`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e1a889bf79d95..59cf4c0e2f81d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3206,9 +3206,11 @@ def _ensure_valid_index(self, value): "and a value that cannot be converted to a Series" ) from err - self._mgr = self._mgr.reindex_axis( - value.index.copy(), axis=1, fill_value=np.nan - ) + # GH31368 preserve name of index + index_copy = value.index.copy() + index_copy.name = self.index.name + + self._mgr = self._mgr.reindex_axis(index_copy, axis=1, fill_value=np.nan) def _box_col_values(self, values, loc: int) -> Series: """ diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py index 350f86b4e9fd0..7afbbc2b9ab2b 100644 --- a/pandas/tests/indexing/test_partial.py +++ b/pandas/tests/indexing/test_partial.py @@ -660,3 +660,15 @@ def test_indexing_timeseries_regression(self): expected = Series(rng, index=rng) tm.assert_series_equal(result, expected) + + def test_index_name_empty(self): + # GH 31368 + df = pd.DataFrame({}, index=pd.RangeIndex(0, name="df_index")) + series = pd.Series(1.23, index=pd.RangeIndex(4, name="series_index")) + + df["series"] = series + expected = pd.DataFrame( + {"series": [1.23] * 4}, index=pd.RangeIndex(4, name="df_index") + ) + + tm.assert_frame_equal(df, expected) From edd802f99cbbe9840e7b70d556dd28b2331c326c Mon Sep 17 00:00:00 2001 From: tiagohonorato <61059243+tiagohonorato@users.noreply.github.com> Date: Tue, 8 Sep 2020 12:28:21 -0300 Subject: [PATCH 66/71] CLN remove trailing commas (#36222) --- pandas/tests/io/pytables/test_timezones.py | 4 ++-- pandas/tests/io/test_feather.py | 2 +- pandas/tests/io/test_s3.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/io/pytables/test_timezones.py b/pandas/tests/io/pytables/test_timezones.py index 38d32b0bdc8a3..1c29928991cde 100644 --- a/pandas/tests/io/pytables/test_timezones.py +++ 
b/pandas/tests/io/pytables/test_timezones.py @@ -110,7 +110,7 @@ def test_append_with_timezones_dateutil(setup_path): dti = dti._with_freq(None) # freq doesnt round-trip # GH 4098 example - df = DataFrame(dict(A=Series(range(3), index=dti,))) + df = DataFrame(dict(A=Series(range(3), index=dti))) _maybe_remove(store, "df") store.put("df", df) @@ -197,7 +197,7 @@ def test_append_with_timezones_pytz(setup_path): dti = dti._with_freq(None) # freq doesnt round-trip # GH 4098 example - df = DataFrame(dict(A=Series(range(3), index=dti,))) + df = DataFrame(dict(A=Series(range(3), index=dti))) _maybe_remove(store, "df") store.put("df", df) diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index a8a5c8f00e6bf..c1e63f512b53e 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -76,7 +76,7 @@ def test_basic(self): pd.Timestamp("20130103"), ], "dtns": pd.DatetimeIndex( - list(pd.date_range("20130101", periods=3, freq="ns")), freq=None, + list(pd.date_range("20130101", periods=3, freq="ns")), freq=None ), } ) diff --git a/pandas/tests/io/test_s3.py b/pandas/tests/io/test_s3.py index a137e76b1696b..0ee6cb0796644 100644 --- a/pandas/tests/io/test_s3.py +++ b/pandas/tests/io/test_s3.py @@ -43,6 +43,6 @@ def test_read_with_creds_from_pub_bucket(): os.environ.setdefault("AWS_ACCESS_KEY_ID", "foobar_key") os.environ.setdefault("AWS_SECRET_ACCESS_KEY", "foobar_secret") df = read_csv( - "s3://gdelt-open-data/events/1981.csv", nrows=5, sep="\t", header=None, + "s3://gdelt-open-data/events/1981.csv", nrows=5, sep="\t", header=None ) assert len(df) == 5 From 9339b8059b0777706df253a2001922e7903bcc95 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 8 Sep 2020 08:29:03 -0700 Subject: [PATCH 67/71] CLN: remove unused return value in _create_blocks (#36196) --- pandas/core/window/rolling.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 9466ada3f4578..5a7482076903c 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -234,7 +234,7 @@ def _validate_get_window_bounds_signature(window: BaseIndexer) -> None: f"get_window_bounds" ) - def _create_blocks(self, obj: FrameOrSeriesUnion): + def _create_data(self, obj: FrameOrSeries) -> FrameOrSeries: """ Split data into blocks & return conformed data. 
""" @@ -242,9 +242,8 @@ def _create_blocks(self, obj: FrameOrSeriesUnion): if self.on is not None and not isinstance(self.on, Index): if obj.ndim == 2: obj = obj.reindex(columns=obj.columns.difference([self.on]), copy=False) - blocks = obj._to_dict_of_blocks(copy=False).values() - return blocks, obj + return obj def _gotitem(self, key, ndim, subset=None): """ @@ -333,7 +332,7 @@ def __repr__(self) -> str: def __iter__(self): window = self._get_window(win_type=None) - _, obj = self._create_blocks(self._selected_obj) + obj = self._create_data(self._selected_obj) index = self._get_window_indexer(window=window) start, end = index.get_window_bounds( @@ -469,7 +468,7 @@ def _apply_series(self, homogeneous_func: Callable[..., ArrayLike]) -> "Series": """ Series version of _apply_blockwise """ - _, obj = self._create_blocks(self._selected_obj) + obj = self._create_data(self._selected_obj) try: values = self._prep_values(obj.values) @@ -489,7 +488,7 @@ def _apply_blockwise( if self._selected_obj.ndim == 1: return self._apply_series(homogeneous_func) - _, obj = self._create_blocks(self._selected_obj) + obj = self._create_data(self._selected_obj) mgr = obj._mgr def hfunc(bvalues: ArrayLike) -> ArrayLike: @@ -1268,7 +1267,7 @@ def count(self): # implementations shouldn't end up here assert not isinstance(self.window, BaseIndexer) - _, obj = self._create_blocks(self._selected_obj) + obj = self._create_data(self._selected_obj) def hfunc(values: np.ndarray) -> np.ndarray: result = notna(values) @@ -2234,7 +2233,7 @@ def _apply( def _constructor(self): return Rolling - def _create_blocks(self, obj: FrameOrSeriesUnion): + def _create_data(self, obj: FrameOrSeries) -> FrameOrSeries: """ Split data into blocks & return conformed data. """ @@ -2246,7 +2245,7 @@ def _create_blocks(self, obj: FrameOrSeriesUnion): list(self._groupby.grouper.indices.values()) ).astype(np.int64) obj = obj.take(groupby_order) - return super()._create_blocks(obj) + return super()._create_data(obj) def _get_cython_func_type(self, func: str) -> Callable: """ From 070481c3650af926849eb2a01fecd6db20899a5d Mon Sep 17 00:00:00 2001 From: Irv Lustig Date: Tue, 8 Sep 2020 11:30:36 -0400 Subject: [PATCH 68/71] Make to_numeric default to correct precision (#36149) --- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/_libs/src/parse_helper.h | 4 +- pandas/tests/tools/test_to_numeric.py | 58 +++++++++++++++++++++++++++ 3 files changed, 62 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 2afa1f1a6199e..2aac2596c18cb 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -245,7 +245,7 @@ Timezones Numeric ^^^^^^^ -- +- Bug in :func:`to_numeric` where float precision was incorrect (:issue:`31364`) - Conversion diff --git a/pandas/_libs/src/parse_helper.h b/pandas/_libs/src/parse_helper.h index 2ada0a4bd173d..d161c4e29fe15 100644 --- a/pandas/_libs/src/parse_helper.h +++ b/pandas/_libs/src/parse_helper.h @@ -18,7 +18,9 @@ int to_double(char *item, double *p_value, char sci, char decimal, char *p_end = NULL; int error = 0; - *p_value = xstrtod(item, &p_end, decimal, sci, '\0', 1, &error, maybe_int); + /* Switch to precise xstrtod GH 31364 */ + *p_value = precise_xstrtod(item, &p_end, decimal, sci, '\0', 1, + &error, maybe_int); return (error == 0) && (!*p_end); } diff --git a/pandas/tests/tools/test_to_numeric.py b/pandas/tests/tools/test_to_numeric.py index 263887a8ea36e..450076f2824ad 100644 --- a/pandas/tests/tools/test_to_numeric.py +++ 
b/pandas/tests/tools/test_to_numeric.py @@ -649,3 +649,61 @@ def test_failure_to_convert_uint64_string_to_NaN(): ser = Series([32, 64, np.nan]) result = to_numeric(pd.Series(["32", "64", "uint64"]), errors="coerce") tm.assert_series_equal(result, ser) + + +@pytest.mark.parametrize( + "strrep", + [ + "243.164", + "245.968", + "249.585", + "259.745", + "265.742", + "272.567", + "279.196", + "280.366", + "275.034", + "271.351", + "272.889", + "270.627", + "280.828", + "290.383", + "308.153", + "319.945", + "336.0", + "344.09", + "351.385", + "356.178", + "359.82", + "361.03", + "367.701", + "380.812", + "387.98", + "391.749", + "391.171", + "385.97", + "385.345", + "386.121", + "390.996", + "399.734", + "413.073", + "421.532", + "430.221", + "437.092", + "439.746", + "446.01", + "451.191", + "460.463", + "469.779", + "472.025", + "479.49", + "474.864", + "467.54", + "471.978", + ], +) +def test_precision_float_conversion(strrep): + # GH 31364 + result = to_numeric(strrep) + + assert result == float(strrep) From 4193e0354e10f23ba2bce9fef2078921465ceead Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 14 Sep 2020 20:23:32 -0700 Subject: [PATCH 69/71] post-rebase fixup --- pandas/core/groupby/generic.py | 61 +++++++++++++++++++++++++++------- pandas/core/groupby/ops.py | 4 +-- 2 files changed, 51 insertions(+), 14 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index d4e673d2e538c..e07d434684ee7 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -75,7 +75,14 @@ group_selection_context, ) from pandas.core.groupby.numba_ import generate_numba_func, split_for_numba -from pandas.core.indexes.api import Index, MultiIndex, all_indexes_same +from pandas.core.indexes.api import ( + DatetimeIndex, + Index, + MultiIndex, + PeriodIndex, + TimedeltaIndex, + all_indexes_same, +) import pandas.core.indexes.base as ibase from pandas.core.internals import BlockManager from pandas.core.series import Series @@ -257,17 +264,27 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) if self.grouper.nkeys > 1: return self._python_agg_general(func, *args, **kwargs) - try: - return self._python_agg_general(func, *args, **kwargs) - except (ValueError, KeyError): - # TODO: KeyError is raised in _python_agg_general, - # see see test_groupby.test_basic - result = self._aggregate_named(func, *args, **kwargs) + if isinstance( + self._selected_obj.index, (DatetimeIndex, TimedeltaIndex, PeriodIndex) + ): + # using _python_agg_general would end up incorrectly patching + # _index_data in reduction.pyx + result = self._aggregate_maybe_named(func, *args, **kwargs) + else: + try: + return self._python_agg_general(func, *args, **kwargs) + except (ValueError, KeyError): + # TODO: KeyError is raised in _python_agg_general, + # see see test_groupby.test_basic + result = self._aggregate_maybe_named(func, *args, **kwargs) + + index = self.grouper.result_index + assert index.name == self.grouper.names[0] - index = Index(sorted(result), name=self.grouper.names[0]) ret = create_series_with_explicit_dtype( result, index=index, dtype_if_empty=object ) + ret.name = self._selected_obj.name # test_metadata_propagation_indiv if not self.as_index: # pragma: no cover print("Warning, ignoring as_index=True") @@ -470,14 +487,34 @@ def _get_index() -> Index: ) return self._reindex_output(result) - def _aggregate_named(self, func, *args, **kwargs): + def _aggregate_maybe_named(self, func, *args, **kwargs): + """ + Try the named-aggregator first, then 
unnamed, which better matches + what libreduction does. + """ + try: + return self._aggregate_named(func, *args, named=True, **kwargs) + except KeyError: + return self._aggregate_named(func, *args, named=False, **kwargs) + + def _aggregate_named(self, func, *args, named: bool = True, **kwargs): result = {} - for name, group in self: - group.name = name + for name, group in self: # TODO: could we have duplicate names? + if named: + group.name = name + output = func(group, *args, **kwargs) if isinstance(output, (Series, Index, np.ndarray)): - raise ValueError("Must produce aggregated value") + if ( + isinstance(output, Series) + and len(output) == 1 + and name in output.index + ): + # FIXME: kludge for test_resampler_grouper.test_apply + output = output.iloc[0] + else: + raise ValueError("Must produce aggregated value") result[name] = output return result diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 3ba3c8a0eddc8..955f0463bcccf 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -624,10 +624,10 @@ def agg_series(self, obj: Series, func: F): return self._aggregate_series_pure_python(obj, func) elif obj.index._has_complex_internals or isinstance( - obj.index, (RangeIndex, DatetimeIndex, TimedeltaIndex) + obj.index, (DatetimeIndex, TimedeltaIndex, RangeIndex) ): # Preempt TypeError in _aggregate_series_fast - # exclude RangeIndex because patching it in libreduction would + # exclude RangeIndex/DTI/TDI because patching it in libreduction would # silently be incorrect return self._aggregate_series_pure_python(obj, func) From 816f2fcadb3e7c6ee72ab0693d572530bf7aa203 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 15 Sep 2020 08:05:05 -0700 Subject: [PATCH 70/71] revert whitespace mixup --- doc/source/whatsnew/v1.2.0.rst | 2 -- 1 file changed, 2 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 7afa4eacd9cf0..8b18b56929acd 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -118,7 +118,6 @@ Other enhancements - :class:`Index` with object dtype supports division and multiplication (:issue:`34160`) - :meth:`DataFrame.explode` and :meth:`Series.explode` now support exploding of sets (:issue:`35614`) - `Styler` now allows direct CSS class name addition to individual data cells (:issue:`36159`) -- .. 
_whatsnew_120.api_breaking.python: @@ -324,7 +323,6 @@ Plotting - Bug in :meth:`DataFrame.plot` was rotating xticklabels when ``subplots=True``, even if the x-axis wasn't an irregular time series (:issue:`29460`) - Bug in :meth:`DataFrame.plot` where a marker letter in the ``style`` keyword sometimes causes a ``ValueError`` (:issue:`21003`) - Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ From 865cb8baa221bdcb5ce5b5dd7c5595506e2f243b Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 17 Sep 2020 13:55:41 -0700 Subject: [PATCH 71/71] Implement _can_use_libreduction --- pandas/core/generic.py | 12 ++++++++++++ pandas/core/groupby/generic.py | 7 +++---- pandas/core/groupby/ops.py | 31 +++++------------------------- pandas/core/indexes/base.py | 4 +++- pandas/core/indexes/category.py | 5 ----- pandas/core/indexes/interval.py | 5 ----- pandas/core/indexes/multi.py | 5 ----- pandas/core/indexes/period.py | 5 ----- pandas/tests/groupby/test_apply.py | 1 - 9 files changed, 23 insertions(+), 52 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index cc18b8681200f..85ff334e9c7ef 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -404,6 +404,18 @@ def _data(self): # e.g. fastparquet return self._mgr + @property + def _can_use_libreduction(self) -> bool: + # groupby ops can only use libreduction fast-path if we are all-numpy + if self.index._has_complex_internals: + return False + + is_invalid = lambda x: is_extension_array_dtype(x) or x.kind in ["m", "M"] + if self.ndim == 1: + return not is_invalid(self.dtype) + else: + return not self.dtypes.apply(is_invalid).any() + # ---------------------------------------------------------------------- # Axis _stat_axis_number = 0 diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 3712f5ab9915d..f65a47c9a9e6c 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -277,13 +277,12 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) # see see test_groupby.test_basic result = self._aggregate_maybe_named(func, *args, **kwargs) + # name setting -> test_metadata_propagation_indiv index = self.grouper.result_index - assert index.name == self.grouper.names[0] - + obj = self._selected_obj ret = create_series_with_explicit_dtype( - result, index=index, dtype_if_empty=object + result, index=index, dtype_if_empty=object, name=obj.name ) - ret.name = self._selected_obj.name # test_metadata_propagation_indiv if not self.as_index: # pragma: no cover print("Warning, ignoring as_index=True") diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 955f0463bcccf..af1f02adf5331 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -45,14 +45,7 @@ from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame from pandas.core.groupby import base, grouper -from pandas.core.indexes.api import ( - DatetimeIndex, - Index, - MultiIndex, - RangeIndex, - TimedeltaIndex, - ensure_index, -) +from pandas.core.indexes.api import Index, MultiIndex, ensure_index from pandas.core.series import Series from pandas.core.sorting import ( compress_group_index, @@ -163,18 +156,13 @@ def apply(self, f: F, data: FrameOrSeries, axis: int = 0): result_values = None sdata: FrameOrSeries = splitter._get_sorted_data() - if sdata.ndim == 2 and np.any(sdata.dtypes.apply(is_extension_array_dtype)): - # calling splitter.fast_apply will raise TypeError via apply_frame_axis0 - # if we pass EA instead of ndarray - # TODO: can we 
have a workaround for EAs backed by ndarray? - pass - - elif ( + if ( com.get_callable_name(f) not in base.plotting_methods and isinstance(splitter, FrameSplitter) and axis == 0 # fast_apply/libreduction doesn't allow non-numpy backed indexes - and not sdata.index._has_complex_internals + # or columns + and sdata._can_use_libreduction ): try: result_values, mutated = splitter.fast_apply(f, sdata, group_keys) @@ -616,19 +604,10 @@ def agg_series(self, obj: Series, func: F): # SeriesGrouper would raise if we were to call _aggregate_series_fast return self._aggregate_series_pure_python(obj, func) - elif is_extension_array_dtype(obj.dtype): + elif not obj._can_use_libreduction: # _aggregate_series_fast would raise TypeError when # calling libreduction.Slider # In the datetime64tz case it would incorrectly cast to tz-naive - # TODO: can we get a performant workaround for EAs backed by ndarray? - return self._aggregate_series_pure_python(obj, func) - - elif obj.index._has_complex_internals or isinstance( - obj.index, (DatetimeIndex, TimedeltaIndex, RangeIndex) - ): - # Preempt TypeError in _aggregate_series_fast - # exclude RangeIndex/DTI/TDI because patching it in libreduction would - # silently be incorrect return self._aggregate_series_pure_python(obj, func) try: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 11490e2e0be29..07e4bfba2313e 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4053,7 +4053,9 @@ def _has_complex_internals(self) -> bool: Indicates if an index is not directly backed by a numpy array """ # used to avoid libreduction code paths, which raise or require conversion - return False + return isinstance(self, (ABCMultiIndex, ABCRangeIndex)) or not isinstance( + self._data, np.ndarray + ) def _is_memory_usage_qualified(self) -> bool: """ diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index c798ae0bd4e4d..77a55bced2187 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -343,11 +343,6 @@ def values(self): """ return the underlying data, which is a Categorical """ return self._data - @property - def _has_complex_internals(self) -> bool: - # used to avoid libreduction code paths, which raise or require conversion - return True - @doc(Index.__contains__) def __contains__(self, key: Any) -> bool: # if key is a NaN, check if any NaN is in self. 
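
For context on the _can_use_libreduction property introduced in pandas/core/generic.py above, a rough illustration of its dtype gate (an illustrative sketch only; the actual property also consults index._has_complex_internals, and for a DataFrame applies the check across all column dtypes). Datetime-like and extension dtypes are routed away from the fast path because libreduction's Slider assumes plain numpy ndarrays; the public pandas.api.types accessor is used here just to keep the snippet self-contained:

    import pandas as pd
    from pandas.api.types import is_extension_array_dtype

    def is_invalid(dtype):
        # mirrors the lambda in the patch: extension dtypes and
        # datetime64/timedelta64 (kind "M"/"m") disable the fast path
        return is_extension_array_dtype(dtype) or dtype.kind in ["m", "M"]

    print(is_invalid(pd.Series([1, 2, 3]).dtype))                         # False
    print(is_invalid(pd.Series(pd.date_range("2020", periods=3)).dtype))  # True
    print(is_invalid(pd.Series([1, 2], dtype="Int64").dtype))             # True
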
diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 9ef584f5b7fbc..2176d7419557e 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -394,11 +394,6 @@ def values(self) -> IntervalArray: """ return self._data - @property - def _has_complex_internals(self) -> bool: - # used to avoid libreduction code paths, which raise or require conversion - return True - def __array_wrap__(self, result, context=None): # we don't want the superclass implementation return result diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index a21a54e4a9be3..561402a79fa27 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1508,11 +1508,6 @@ def _get_level_number(self, level) -> int: ) from err return level - @property - def _has_complex_internals(self) -> bool: - # used to avoid libreduction code paths, which raise or require conversion - return True - @cache_readonly def is_monotonic_increasing(self) -> bool: """ diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 42dce1bd53f22..6213aa5b71674 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -247,11 +247,6 @@ def _simple_new(cls, values: PeriodArray, name: Label = None): def values(self): return np.asarray(self) - @property - def _has_complex_internals(self): - # used to avoid libreduction code paths, which raise or require conversion - return True - def _shallow_copy(self, values=None, name: Label = no_default): name = name if name is not no_default else self.name cache = self._cache.copy() if values is None else {} diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index db5c4af9c6f53..93761a186b804 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -1002,7 +1002,6 @@ def test_apply_function_with_indexing_return_column(): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(reason="GH-34998") def test_apply_with_timezones_aware(): # GH: 27212