From 214820d7c6979e10219f432598ea892ee39e9caf Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 23 Nov 2020 20:00:05 -0800 Subject: [PATCH 1/8] BUG: NumericIndex.insert(0, False) casting to int --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/core/indexes/numeric.py | 17 ++++++++++------- pandas/tests/frame/indexing/test_setitem.py | 14 ++++++++++++++ pandas/tests/indexing/test_coercion.py | 4 ++-- 4 files changed, 27 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 766c418741ada..80d13d185b426 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -619,6 +619,7 @@ Indexing - Bug in indexing on a :class:`Series` or :class:`DataFrame` with a :class:`CategoricalIndex` using listlike indexer that contains elements that are in the index's ``categories`` but not in the index itself failing to raise ``KeyError`` (:issue:`37901`) - Bug in :meth:`DataFrame.iloc` and :meth:`Series.iloc` aligning objects in ``__setitem__`` (:issue:`22046`) - Bug in :meth:`DataFrame.loc` did not raise ``KeyError`` when missing combination was given with ``slice(None)`` for remaining levels (:issue:`19556`) +- Bug on inserting a boolean label into a :class:`DataFrame` with a numeric :class:`Index` columns incorrectly casting to integer (:issue:`36319`) Missing ^^^^^^^ diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index 24aaf5885fe0e..e486d393ed6bf 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -124,6 +124,12 @@ def _validate_fill_value(self, value): raise TypeError elif isinstance(value, str) or lib.is_complex(value): raise TypeError + elif is_scalar(value) and isna(value): + if is_valid_nat_for_dtype(value, self.dtype): + value = self._na_value + else: + # NaT, np.datetime64("NaT"), np.timedelta64("NaT") + raise TypeError return value @@ -162,13 +168,10 @@ def _is_all_dates(self) -> bool: @doc(Index.insert) def insert(self, loc: int, item): - # treat NA values as nans: - if is_scalar(item) and isna(item): - if is_valid_nat_for_dtype(item, self.dtype): - item = self._na_value - else: - # NaT, np.datetime64("NaT"), np.timedelta64("NaT") - return self.astype(object).insert(loc, item) + try: + item = self._validate_fill_value(item) + except TypeError: + return self.astype(object).insert(loc, item) return super().insert(loc, item) diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index cd3102836422f..e4a66ea9133dd 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -304,6 +304,20 @@ def test_setitem_complete_column_with_array(self): ) tm.assert_frame_equal(df, expected) + @pytest.mark.parametrize("dtype", ["f8", "i8", "u8"]) + def test_setitem_bool_with_numeric_index(self, dtype): + # GH#36319 + cols = Index([1, 2, 3], dtype=dtype) + df = DataFrame(np.random.randn(3, 3), columns=cols) + + df[False] = ["a", "b", "c"] + + expected_cols = Index([1, 2, 3, False], dtype=object) + if dtype == "f8": + expected_cols = Index([1.0, 2.0, 3.0, False], dtype=object) + + tm.assert_index_equal(df.columns, expected_cols) + class TestDataFrameSetItemSlicing: def test_setitem_slice_position(self): diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index fd6f6fbc6a4ba..bde7e9991bbed 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -393,7 +393,7 @@ def test_insert_index_object(self, insert, coerced_val, coerced_dtype): [ (1, 1, np.int64), (1.1, 1.1, np.float64), - (False, 0, np.int64), + (False, False, object), # GH#36319 ("x", "x", object), ], ) @@ -409,7 +409,7 @@ def test_insert_index_int64(self, insert, coerced_val, coerced_dtype): [ (1, 1.0, np.float64), (1.1, 1.1, np.float64), - (False, 0.0, np.float64), + (False, False, object), # GH#36319 ("x", "x", object), ], ) From 6d03c7553300a41cad42d9409f7f9a9467c98bd7 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 25 Nov 2020 16:30:57 -0800 Subject: [PATCH 2/8] CLN: remove ABCIndex (#38055) --- pandas/core/common.py | 13 +++++-------- pandas/core/dtypes/generic.py | 1 - pandas/core/indexes/datetimelike.py | 14 ++++++++++---- pandas/core/ops/array_ops.py | 8 +++----- pandas/tests/dtypes/test_generic.py | 1 - 5 files changed, 18 insertions(+), 19 deletions(-) diff --git a/pandas/core/common.py b/pandas/core/common.py index 24680fc855b0d..cdcbc43055052 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -24,12 +24,7 @@ is_extension_array_dtype, is_integer, ) -from pandas.core.dtypes.generic import ( - ABCExtensionArray, - ABCIndex, - ABCIndexClass, - ABCSeries, -) +from pandas.core.dtypes.generic import ABCExtensionArray, ABCIndexClass, ABCSeries from pandas.core.dtypes.inference import iterable_not_string from pandas.core.dtypes.missing import isna, isnull, notnull # noqa @@ -105,7 +100,7 @@ def is_bool_indexer(key: Any) -> bool: check_array_indexer : Check that `key` is a valid array to index, and convert to an ndarray. """ - if isinstance(key, (ABCSeries, np.ndarray, ABCIndex)) or ( + if isinstance(key, (ABCSeries, np.ndarray, ABCIndexClass)) or ( is_array_like(key) and is_extension_array_dtype(key.dtype) ): if key.dtype == np.object_: @@ -471,7 +466,9 @@ def convert_to_list_like( Convert list-like or scalar input to list-like. List, numpy and pandas array-like inputs are returned unmodified whereas others are converted to list. """ - if isinstance(values, (list, np.ndarray, ABCIndex, ABCSeries, ABCExtensionArray)): + if isinstance( + values, (list, np.ndarray, ABCIndexClass, ABCSeries, ABCExtensionArray) + ): # np.ndarray resolving as Any gives a false positive return values # type: ignore[return-value] elif isinstance(values, abc.Iterable) and not isinstance(values, str): diff --git a/pandas/core/dtypes/generic.py b/pandas/core/dtypes/generic.py index 34891180906bb..0e5867809fe52 100644 --- a/pandas/core/dtypes/generic.py +++ b/pandas/core/dtypes/generic.py @@ -23,7 +23,6 @@ def _check(cls, inst) -> bool: return meta(name, tuple(), dct) -ABCIndex = create_pandas_abc_type("ABCIndex", "_typ", ("index",)) ABCInt64Index = create_pandas_abc_type("ABCInt64Index", "_typ", ("int64index",)) ABCUInt64Index = create_pandas_abc_type("ABCUInt64Index", "_typ", ("uint64index",)) ABCRangeIndex = create_pandas_abc_type("ABCRangeIndex", "_typ", ("rangeindex",)) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 57f6a8ea0cca5..1b18f04ba603d 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -22,7 +22,7 @@ is_scalar, ) from pandas.core.dtypes.concat import concat_compat -from pandas.core.dtypes.generic import ABCIndex, ABCSeries +from pandas.core.dtypes.generic import ABCSeries from pandas.core.arrays import DatetimeArray, PeriodArray, TimedeltaArray from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin @@ -53,16 +53,22 @@ def _join_i8_wrapper(joinf, with_indexers: bool = True): # error: 'staticmethod' used with a non-method @staticmethod # type: ignore[misc] def wrapper(left, right): - if isinstance(left, (np.ndarray, ABCIndex, ABCSeries, DatetimeLikeArrayMixin)): + # Note: these only get called with left.dtype == right.dtype + if isinstance( + left, (np.ndarray, DatetimeIndexOpsMixin, ABCSeries, DatetimeLikeArrayMixin) + ): left = left.view("i8") - if isinstance(right, (np.ndarray, ABCIndex, ABCSeries, DatetimeLikeArrayMixin)): + if isinstance( + right, + (np.ndarray, DatetimeIndexOpsMixin, ABCSeries, DatetimeLikeArrayMixin), + ): right = right.view("i8") results = joinf(left, right) if with_indexers: # dtype should be timedelta64[ns] for TimedeltaIndex # and datetime64[ns] for DatetimeIndex - dtype = left.dtype.base + dtype = cast(np.dtype, left.dtype).base join_index, left_indexer, right_indexer = results join_index = join_index.view(dtype) diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index 8142fc3e695a3..c855687552e82 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -27,7 +27,7 @@ is_object_dtype, is_scalar, ) -from pandas.core.dtypes.generic import ABCExtensionArray, ABCIndex, ABCSeries +from pandas.core.dtypes.generic import ABCExtensionArray, ABCIndexClass, ABCSeries from pandas.core.dtypes.missing import isna, notna from pandas.core.ops import missing @@ -40,13 +40,11 @@ def comp_method_OBJECT_ARRAY(op, x, y): if isinstance(y, list): y = construct_1d_object_array_from_listlike(y) - if isinstance(y, (np.ndarray, ABCSeries, ABCIndex)): - # Note: these checks can be for ABCIndex and not ABCIndexClass - # because that is the only object-dtype class. + if isinstance(y, (np.ndarray, ABCSeries, ABCIndexClass)): if not is_object_dtype(y.dtype): y = y.astype(np.object_) - if isinstance(y, (ABCSeries, ABCIndex)): + if isinstance(y, (ABCSeries, ABCIndexClass)): y = y._values if x.shape != y.shape: diff --git a/pandas/tests/dtypes/test_generic.py b/pandas/tests/dtypes/test_generic.py index e51b0546b0cee..847daa1e6b263 100644 --- a/pandas/tests/dtypes/test_generic.py +++ b/pandas/tests/dtypes/test_generic.py @@ -22,7 +22,6 @@ class TestABCClasses: timedelta_array = pd.core.arrays.TimedeltaArray(timedelta_index) def test_abc_types(self): - assert isinstance(pd.Index(["a", "b", "c"]), gt.ABCIndex) assert isinstance(pd.Int64Index([1, 2, 3]), gt.ABCInt64Index) assert isinstance(pd.UInt64Index([1, 2, 3]), gt.ABCUInt64Index) assert isinstance(pd.Float64Index([1, 2, 3]), gt.ABCFloat64Index) From 47fea62b5289fe66433891b4d60228a3a2b227ce Mon Sep 17 00:00:00 2001 From: patrick <61934744+phofl@users.noreply.github.com> Date: Thu, 26 Nov 2020 01:31:46 +0100 Subject: [PATCH 3/8] ENH: Implement cross method for Merge Operations (#37864) --- asv_bench/benchmarks/join_merge.py | 6 ++ doc/source/whatsnew/v1.2.0.rst | 1 + pandas/core/frame.py | 56 ++++++++++- pandas/core/reshape/merge.py | 63 +++++++++++- pandas/tests/reshape/merge/test_join.py | 12 +++ .../tests/reshape/merge/test_merge_cross.py | 95 +++++++++++++++++++ 6 files changed, 230 insertions(+), 3 deletions(-) create mode 100644 pandas/tests/reshape/merge/test_merge_cross.py diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py index 1333b3a0f0560..a572b8a70a680 100644 --- a/asv_bench/benchmarks/join_merge.py +++ b/asv_bench/benchmarks/join_merge.py @@ -132,6 +132,9 @@ def time_join_dataframe_index_single_key_small(self, sort): def time_join_dataframe_index_shuffle_key_bigger_sort(self, sort): self.df_shuf.join(self.df_key2, on="key2", sort=sort) + def time_join_dataframes_cross(self, sort): + self.df.loc[:2000].join(self.df_key1, how="cross", sort=sort) + class JoinIndex: def setup(self): @@ -205,6 +208,9 @@ def time_merge_dataframe_integer_2key(self, sort): def time_merge_dataframe_integer_key(self, sort): merge(self.df, self.df2, on="key1", sort=sort) + def time_merge_dataframes_cross(self, sort): + merge(self.left.loc[:2000], self.right.loc[:2000], how="cross", sort=sort) + class I8Merge: diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index d0a00e286aad5..2c02757155728 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -255,6 +255,7 @@ Other enhancements - Improve error reporting for :meth:`DataFrame.merge` when invalid merge column definitions were given (:issue:`16228`) - Improve numerical stability for :meth:`.Rolling.skew`, :meth:`.Rolling.kurt`, :meth:`Expanding.skew` and :meth:`Expanding.kurt` through implementation of Kahan summation (:issue:`6929`) - Improved error reporting for subsetting columns of a :class:`.DataFrameGroupBy` with ``axis=1`` (:issue:`37725`) +- Implement method ``cross`` for :meth:`DataFrame.merge` and :meth:`DataFrame.join` (:issue:`5401`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a5ba803897fc6..bca6e255d7a2b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -205,12 +205,14 @@ The join is done on columns or indexes. If joining columns on columns, the DataFrame indexes *will be ignored*. Otherwise if joining indexes on indexes or indexes on a column or columns, the index will be passed on. +When performing a cross merge, no column specifications to merge on are +allowed. Parameters ----------%s right : DataFrame or named Series Object to merge with. -how : {'left', 'right', 'outer', 'inner'}, default 'inner' +how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner' Type of merge to be performed. * left: use only keys from left frame, similar to a SQL left outer join; @@ -221,6 +223,11 @@ join; sort keys lexicographically. * inner: use intersection of keys from both frames, similar to a SQL inner join; preserve the order of the left keys. + * cross: creates the cartesian product from both frames, preserves the order + of the left keys. + + .. versionadded:: 1.2.0 + on : label or list Column or index level names to join on. These must be found in both DataFrames. If `on` is None and not merging on indexes then this defaults @@ -341,6 +348,44 @@ ... ValueError: columns overlap but no suffix specified: Index(['value'], dtype='object') + +>>> df1 = pd.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2]}) +>>> df2 = pd.DataFrame({'a': ['foo', 'baz'], 'c': [3, 4]}) +>>> df1 + a b +0 foo 1 +1 bar 2 +>>> df2 + a c +0 foo 3 +1 baz 4 + +>>> df1.merge(df2, how='inner', on='a') + a b c +0 foo 1 3 + +>>> df1.merge(df2, how='left', on='a') + a b c +0 foo 1 3.0 +1 bar 2 NaN + +>>> df1 = pd.DataFrame({'left': ['foo', 'bar']}) +>>> df2 = pd.DataFrame({'right': [7, 8]}) +>>> df1 + left +0 foo +1 bar +>>> df2 + right +0 7 +1 8 + +>>> df1.merge(df2, how='cross') + left right +0 foo 7 +1 foo 8 +2 bar 7 +3 bar 8 """ @@ -8083,6 +8128,15 @@ def _join_compat( other = DataFrame({other.name: other}) if isinstance(other, DataFrame): + if how == "cross": + return merge( + self, + other, + how=how, + on=on, + suffixes=(lsuffix, rsuffix), + sort=sort, + ) return merge( self, other, diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index cdcd6b19704c4..3b755c40721fb 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -5,6 +5,7 @@ import copy import datetime from functools import partial +import hashlib import string from typing import TYPE_CHECKING, Optional, Tuple, cast import warnings @@ -643,6 +644,17 @@ def __init__( self._validate_specification() + cross_col = None + if self.how == "cross": + ( + self.left, + self.right, + self.how, + cross_col, + ) = self._create_cross_configuration(self.left, self.right) + self.left_on = self.right_on = [cross_col] + self._cross = cross_col + # note this function has side effects ( self.left_join_keys, @@ -690,8 +702,14 @@ def get_result(self): self._maybe_restore_index_levels(result) + self._maybe_drop_cross_column(result, self._cross) + return result.__finalize__(self, method="merge") + def _maybe_drop_cross_column(self, result: "DataFrame", cross_col: Optional[str]): + if cross_col is not None: + result.drop(columns=cross_col, inplace=True) + def _indicator_pre_merge( self, left: "DataFrame", right: "DataFrame" ) -> Tuple["DataFrame", "DataFrame"]: @@ -1200,9 +1218,50 @@ def _maybe_coerce_merge_keys(self): typ = rk.categories.dtype if rk_is_cat else object self.right = self.right.assign(**{name: self.right[name].astype(typ)}) + def _create_cross_configuration( + self, left, right + ) -> Tuple["DataFrame", "DataFrame", str, str]: + """ + Creates the configuration to dispatch the cross operation to inner join, + e.g. adding a join column and resetting parameters. Join column is added + to a new object, no inplace modification + + Parameters + ---------- + left: DataFrame + right DataFrame + + Returns + ------- + a tuple (left, right, how, cross_col) representing the adjusted + DataFrames with cross_col, the merge operation set to inner and the column + to join over. + """ + cross_col = f"_cross_{hashlib.md5().hexdigest()}" + how = "inner" + return ( + left.assign(**{cross_col: 1}), + right.assign(**{cross_col: 1}), + how, + cross_col, + ) + def _validate_specification(self): + if self.how == "cross": + if ( + self.left_index + or self.right_index + or self.right_on is not None + or self.left_on is not None + or self.on is not None + ): + raise MergeError( + "Can not pass on, right_on, left_on or set right_index=True or " + "left_index=True" + ) + return # Hm, any way to make this logic less complicated?? - if self.on is None and self.left_on is None and self.right_on is None: + elif self.on is None and self.left_on is None and self.right_on is None: if self.left_index and self.right_index: self.left_on, self.right_on = (), () @@ -1266,7 +1325,7 @@ def _validate_specification(self): 'of levels in the index of "left"' ) self.left_on = [None] * n - if len(self.right_on) != len(self.left_on): + if self.how != "cross" and len(self.right_on) != len(self.left_on): raise ValueError("len(right_on) must equal len(left_on)") def _validate(self, validate: str): diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 7db92eb55fa0b..00ef7a05f5902 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -803,3 +803,15 @@ def test_join_inner_multiindex_deterministic_order(): index=MultiIndex.from_tuples([(2, 1, 4, 3)], names=("b", "a", "d", "c")), ) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + ("input_col", "output_cols"), [("b", ["a", "b"]), ("a", ["a_x", "a_y"])] +) +def test_join_cross(input_col, output_cols): + # GH#5401 + left = DataFrame({"a": [1, 3]}) + right = DataFrame({input_col: [3, 4]}) + result = left.join(right, how="cross", lsuffix="_x", rsuffix="_y") + expected = DataFrame({output_cols[0]: [1, 1, 3, 3], output_cols[1]: [3, 4, 3, 4]}) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/merge/test_merge_cross.py b/pandas/tests/reshape/merge/test_merge_cross.py new file mode 100644 index 0000000000000..d6c29ea129027 --- /dev/null +++ b/pandas/tests/reshape/merge/test_merge_cross.py @@ -0,0 +1,95 @@ +import pytest + +from pandas import DataFrame +import pandas._testing as tm +from pandas.core.reshape.merge import MergeError, merge + + +@pytest.mark.parametrize( + ("input_col", "output_cols"), [("b", ["a", "b"]), ("a", ["a_x", "a_y"])] +) +def test_merge_cross(input_col, output_cols): + # GH#5401 + left = DataFrame({"a": [1, 3]}) + right = DataFrame({input_col: [3, 4]}) + left_copy = left.copy() + right_copy = right.copy() + result = merge(left, right, how="cross") + expected = DataFrame({output_cols[0]: [1, 1, 3, 3], output_cols[1]: [3, 4, 3, 4]}) + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(left, left_copy) + tm.assert_frame_equal(right, right_copy) + + +@pytest.mark.parametrize( + "kwargs", + [ + {"left_index": True}, + {"right_index": True}, + {"on": "a"}, + {"left_on": "a"}, + {"right_on": "b"}, + ], +) +def test_merge_cross_error_reporting(kwargs): + # GH#5401 + left = DataFrame({"a": [1, 3]}) + right = DataFrame({"b": [3, 4]}) + msg = ( + "Can not pass on, right_on, left_on or set right_index=True or " + "left_index=True" + ) + with pytest.raises(MergeError, match=msg): + merge(left, right, how="cross", **kwargs) + + +def test_merge_cross_mixed_dtypes(): + # GH#5401 + left = DataFrame(["a", "b", "c"], columns=["A"]) + right = DataFrame(range(2), columns=["B"]) + result = merge(left, right, how="cross") + expected = DataFrame({"A": ["a", "a", "b", "b", "c", "c"], "B": [0, 1, 0, 1, 0, 1]}) + tm.assert_frame_equal(result, expected) + + +def test_merge_cross_more_than_one_column(): + # GH#5401 + left = DataFrame({"A": list("ab"), "B": [2, 1]}) + right = DataFrame({"C": range(2), "D": range(4, 6)}) + result = merge(left, right, how="cross") + expected = DataFrame( + { + "A": ["a", "a", "b", "b"], + "B": [2, 2, 1, 1], + "C": [0, 1, 0, 1], + "D": [4, 5, 4, 5], + } + ) + tm.assert_frame_equal(result, expected) + + +def test_merge_cross_null_values(nulls_fixture): + # GH#5401 + left = DataFrame({"a": [1, nulls_fixture]}) + right = DataFrame({"b": ["a", "b"], "c": [1.0, 2.0]}) + result = merge(left, right, how="cross") + expected = DataFrame( + { + "a": [1, 1, nulls_fixture, nulls_fixture], + "b": ["a", "b", "a", "b"], + "c": [1.0, 2.0, 1.0, 2.0], + } + ) + tm.assert_frame_equal(result, expected) + + +def test_join_cross_error_reporting(): + # GH#5401 + left = DataFrame({"a": [1, 3]}) + right = DataFrame({"a": [3, 4]}) + msg = ( + "Can not pass on, right_on, left_on or set right_index=True or " + "left_index=True" + ) + with pytest.raises(MergeError, match=msg): + left.join(right, how="cross", on="a") From d05a12c392e551b2310a4cced566bdb7282ddc9b Mon Sep 17 00:00:00 2001 From: patrick <61934744+phofl@users.noreply.github.com> Date: Thu, 26 Nov 2020 01:40:35 +0100 Subject: [PATCH 4/8] BUG: __getitem__ raise blank KeyError for IntervalIndex and missing keys (#37873) --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/core/indexes/interval.py | 2 +- pandas/tests/indexing/interval/test_interval.py | 8 ++++---- pandas/tests/indexing/interval/test_interval_new.py | 12 ++++++++++-- 4 files changed, 16 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 2c02757155728..0770426bc5020 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -628,6 +628,7 @@ Indexing - Bug in :meth:`DataFrame.loc` raising ``TypeError`` when non-integer slice was given to select values from :class:`MultiIndex` (:issue:`25165`, :issue:`24263`) - Bug in :meth:`DataFrame.loc` returning and assigning elements in wrong order when indexer is differently ordered than the :class:`MultiIndex` to filter (:issue:`31330`, :issue:`34603`) - Bug in :meth:`DataFrame.loc` and :meth:`DataFrame.__getitem__` raising ``KeyError`` when columns were :class:`MultiIndex` with only one level (:issue:`29749`) +- Bug in :meth:`Series.__getitem__` and :meth:`DataFrame.__getitem__` raising blank ``KeyError`` without missing keys for :class:`IntervalIndex` (:issue:`27365`) Missing ^^^^^^^ diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 98752a21e44a2..986c4d2c59723 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -806,7 +806,7 @@ def _convert_list_indexer(self, keyarr): # we have missing values if (locs == -1).any(): - raise KeyError + raise KeyError(keyarr[locs == -1].tolist()) return locs diff --git a/pandas/tests/indexing/interval/test_interval.py b/pandas/tests/indexing/interval/test_interval.py index 0fa6abb27cb61..f4e7296598d54 100644 --- a/pandas/tests/indexing/interval/test_interval.py +++ b/pandas/tests/indexing/interval/test_interval.py @@ -65,10 +65,10 @@ def test_non_matching(self): # this is a departure from our current # indexing scheme, but simpler - with pytest.raises(KeyError, match="^$"): + with pytest.raises(KeyError, match=r"^\[-1\]$"): s.loc[[-1, 3, 4, 5]] - with pytest.raises(KeyError, match="^$"): + with pytest.raises(KeyError, match=r"^\[-1\]$"): s.loc[[-1, 3]] @pytest.mark.arm_slow @@ -107,11 +107,11 @@ def test_loc_getitem_frame(self): expected = df.take([4, 5, 4, 5]) tm.assert_frame_equal(result, expected) - with pytest.raises(KeyError, match="^$"): + with pytest.raises(KeyError, match=r"^\[10\]$"): df.loc[[10]] # partial missing - with pytest.raises(KeyError, match="^$"): + with pytest.raises(KeyError, match=r"^\[10\]$"): df.loc[[10, 4]] diff --git a/pandas/tests/indexing/interval/test_interval_new.py b/pandas/tests/indexing/interval/test_interval_new.py index 03c3034772bc6..a9512bc97d9de 100644 --- a/pandas/tests/indexing/interval/test_interval_new.py +++ b/pandas/tests/indexing/interval/test_interval_new.py @@ -204,13 +204,13 @@ def test_loc_with_overlap(self): with pytest.raises(KeyError, match=re.escape("Interval(3, 5, closed='right')")): s.loc[Interval(3, 5)] - with pytest.raises(KeyError, match="^$"): + with pytest.raises(KeyError, match=r"^\[Interval\(3, 5, closed='right'\)\]$"): s.loc[[Interval(3, 5)]] with pytest.raises(KeyError, match=re.escape("Interval(3, 5, closed='right')")): s[Interval(3, 5)] - with pytest.raises(KeyError, match="^$"): + with pytest.raises(KeyError, match=r"^\[Interval\(3, 5, closed='right'\)\]$"): s[[Interval(3, 5)]] # slices with interval (only exact matches) @@ -266,3 +266,11 @@ def test_non_unique_moar(self): expected = s.iloc[[0, 1]] result = s[[Interval(1, 3)]] tm.assert_series_equal(expected, result) + + def test_missing_key_error_message(self, frame_or_series): + # GH#27365 + obj = frame_or_series( + np.arange(5), index=IntervalIndex.from_breaks(np.arange(6)) + ) + with pytest.raises(KeyError, match=r"\[6\]"): + obj.loc[[4, 5, 6]] From c28fb2ba37d46b3667199e5f7fd33266b2070ca4 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 26 Nov 2020 17:37:28 -0800 Subject: [PATCH 5/8] REF: use validate_fill_value idiom in Index.insert --- pandas/core/dtypes/cast.py | 13 +++++++-- pandas/core/indexes/base.py | 35 +++++++++++------------- pandas/core/indexes/numeric.py | 29 ++++++++++++-------- pandas/tests/dtypes/cast/test_promote.py | 9 ++++-- 4 files changed, 51 insertions(+), 35 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 0f0e82f4ad4e2..1735684a05822 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -20,7 +20,7 @@ import numpy as np -from pandas._libs import lib, tslib, tslibs +from pandas._libs import lib, missing as libmissing, tslib, tslibs from pandas._libs.tslibs import ( NaT, OutOfBoundsDatetime, @@ -584,6 +584,9 @@ def maybe_promote(dtype, fill_value=np.nan): dtype = np.dtype(np.object_) elif is_integer(fill_value) or (is_float(fill_value) and not isna(fill_value)): dtype = np.dtype(np.object_) + elif is_valid_nat_for_dtype(fill_value, dtype): + # e.g. pd.NA, which is not accepted by Timestamp constructor + fill_value = np.datetime64("NaT", "ns") else: try: fill_value = Timestamp(fill_value).to_datetime64() @@ -597,6 +600,9 @@ def maybe_promote(dtype, fill_value=np.nan): ): # TODO: What about str that can be a timedelta? dtype = np.dtype(np.object_) + elif is_valid_nat_for_dtype(fill_value, dtype): + # e.g pd.NA, which is not accepted by the Timedelta constructor + fill_value = np.timedelta64("NaT", "ns") else: try: fv = Timedelta(fill_value) @@ -670,7 +676,7 @@ def maybe_promote(dtype, fill_value=np.nan): # e.g. mst is np.complex128 and dtype is np.complex64 dtype = mst - elif fill_value is None: + elif fill_value is None or fill_value is libmissing.NA: if is_float_dtype(dtype) or is_complex_dtype(dtype): fill_value = np.nan elif is_integer_dtype(dtype): @@ -680,7 +686,8 @@ def maybe_promote(dtype, fill_value=np.nan): fill_value = dtype.type("NaT", "ns") else: dtype = np.dtype(np.object_) - fill_value = np.nan + if fill_value is not libmissing.NA: + fill_value = np.nan else: dtype = np.dtype(np.object_) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index c074ae2c066f6..2f3213315b11d 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -34,6 +34,7 @@ from pandas.core.dtypes.cast import ( maybe_cast_to_integer_array, + maybe_promote, validate_numeric_casting, ) from pandas.core.dtypes.common import ( @@ -4162,24 +4163,6 @@ def _string_data_error(cls, data): "to explicitly cast to a numeric type" ) - @final - def _coerce_scalar_to_index(self, item): - """ - We need to coerce a scalar to a compat for our index type. - - Parameters - ---------- - item : scalar item to coerce - """ - dtype = self.dtype - - if self._is_numeric_dtype and isna(item): - # We can't coerce to the numeric dtype of "self" (unless - # it's float) if there are NaN values in our output. - dtype = None - - return Index([item], dtype=dtype, **self._get_attributes_dict()) - def _to_safe_for_reshape(self): """ Convert to object if we are a categorical. @@ -5490,8 +5473,22 @@ def insert(self, loc: int, item): """ # Note: this method is overridden by all ExtensionIndex subclasses, # so self is never backed by an EA. + + try: + item = self._validate_fill_value(item) + except TypeError: + if is_scalar(item): + dtype, item = maybe_promote(self.dtype, item) + else: + # maybe_promote would raise ValueError + dtype = np.dtype(object) + + return self.astype(dtype).insert(loc, item) + arr = np.asarray(self) - item = self._coerce_scalar_to_index(item)._values + + # Use Index constructor to ensure we get tuples cast correctly. + item = Index([item], dtype=self.dtype)._values idx = np.concatenate((arr[:loc], item, arr[loc:])) return Index(idx, name=self.name) diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index 12f61fc44582d..d88e9fd76367c 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -16,6 +16,7 @@ is_float, is_float_dtype, is_integer_dtype, + is_number, is_numeric_dtype, is_scalar, is_signed_integer_dtype, @@ -120,15 +121,30 @@ def _validate_fill_value(self, value): # force conversion to object # so we don't lose the bools raise TypeError - elif isinstance(value, str) or lib.is_complex(value): - raise TypeError elif is_scalar(value) and isna(value): if is_valid_nat_for_dtype(value, self.dtype): value = self._na_value + if self.dtype.kind != "f": + # raise so that caller can cast + raise TypeError else: # NaT, np.datetime64("NaT"), np.timedelta64("NaT") raise TypeError + elif is_scalar(value): + if not is_number(value): + # e.g. datetime64, timedelta64, datetime, ... + raise TypeError + + elif lib.is_complex(value): + # at least until we have a ComplexIndx + raise TypeError + + elif is_float(value) and self.dtype.kind != "f": + if not value.is_integer(): + raise TypeError + value = int(value) + return value def _convert_tolerance(self, tolerance, target): @@ -164,15 +180,6 @@ def _is_all_dates(self) -> bool: """ return False - @doc(Index.insert) - def insert(self, loc: int, item): - try: - item = self._validate_fill_value(item) - except TypeError: - return self.astype(object).insert(loc, item) - - return super().insert(loc, item) - def _union(self, other, sort): # Right now, we treat union(int, float) a bit special. # See https://github.com/pandas-dev/pandas/issues/26778 for discussion diff --git a/pandas/tests/dtypes/cast/test_promote.py b/pandas/tests/dtypes/cast/test_promote.py index 74a11c9f33195..5e9143c5ab4d9 100644 --- a/pandas/tests/dtypes/cast/test_promote.py +++ b/pandas/tests/dtypes/cast/test_promote.py @@ -110,6 +110,8 @@ def _assert_match(result_fill_value, expected_fill_value): assert res_type == ex_type or res_type.__name__ == ex_type.__name__ match_value = result_fill_value == expected_fill_value + if match_value is pd.NA: + match_value = False # Note: type check above ensures that we have the _same_ NA value # for missing values, None == None (which is checked @@ -569,7 +571,7 @@ def test_maybe_promote_any_with_object(any_numpy_dtype_reduced, object_dtype): _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) -@pytest.mark.parametrize("fill_value", [None, np.nan, NaT]) +@pytest.mark.parametrize("fill_value", [None, np.nan, NaT, pd.NA]) def test_maybe_promote_any_numpy_dtype_with_na(any_numpy_dtype_reduced, fill_value): dtype = np.dtype(any_numpy_dtype_reduced) @@ -597,7 +599,10 @@ def test_maybe_promote_any_numpy_dtype_with_na(any_numpy_dtype_reduced, fill_val else: # all other cases cast to object, and use np.nan as missing value expected_dtype = np.dtype(object) - exp_val_for_scalar = np.nan + if fill_value is pd.NA: + exp_val_for_scalar = pd.NA + else: + exp_val_for_scalar = np.nan _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) From 6e2c0aa3e77c2faa615d419cc23fc8464e418be0 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 28 Nov 2020 10:48:11 -0800 Subject: [PATCH 6/8] use fixture, docstring --- pandas/core/indexes/base.py | 4 ++-- pandas/core/indexes/numeric.py | 3 ++- pandas/tests/dtypes/cast/test_promote.py | 4 ++-- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index f21194c71adf5..cbc12431f83f8 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4171,8 +4171,8 @@ def _to_safe_for_reshape(self): def _validate_fill_value(self, value): """ - Check if the value can be inserted into our array, and convert - it to an appropriate native type if necessary. + Check if the value can be inserted into our array without casting, + and convert it to an appropriate native type if necessary. """ return value diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index d88e9fd76367c..cf7747499c63f 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -115,7 +115,8 @@ def _shallow_copy(self, values=None, name: Label = lib.no_default): def _validate_fill_value(self, value): """ - Convert value to be insertable to ndarray. + Check if the value can be inserted into our array without casting, + and convert it to an appropriate native type if necessary. """ if is_bool(value) or is_bool_dtype(value): # force conversion to object diff --git a/pandas/tests/dtypes/cast/test_promote.py b/pandas/tests/dtypes/cast/test_promote.py index 5e9143c5ab4d9..294abafa86812 100644 --- a/pandas/tests/dtypes/cast/test_promote.py +++ b/pandas/tests/dtypes/cast/test_promote.py @@ -571,8 +571,8 @@ def test_maybe_promote_any_with_object(any_numpy_dtype_reduced, object_dtype): _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) -@pytest.mark.parametrize("fill_value", [None, np.nan, NaT, pd.NA]) -def test_maybe_promote_any_numpy_dtype_with_na(any_numpy_dtype_reduced, fill_value): +def test_maybe_promote_any_numpy_dtype_with_na(any_numpy_dtype_reduced, nulls_fixture): + fill_value = nulls_fixture dtype = np.dtype(any_numpy_dtype_reduced) if is_integer_dtype(dtype) and fill_value is not NaT: From de857e505aa359ad3d14ada4b1ed9336d4191137 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 29 Nov 2020 12:02:21 -0800 Subject: [PATCH 7/8] flesh out docstring --- pandas/core/indexes/base.py | 5 +++++ pandas/core/indexes/numeric.py | 5 +---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index a01b40d820c5f..dd6b72a657dde 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4172,6 +4172,11 @@ def _validate_fill_value(self, value): """ Check if the value can be inserted into our array without casting, and convert it to an appropriate native type if necessary. + + Raises + ------ + TypeError + If the value cannot be inserted into an array of this dtype. """ return value diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index cf7747499c63f..8dce3d89aaa92 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -113,11 +113,8 @@ def _shallow_copy(self, values=None, name: Label = lib.no_default): return Float64Index._simple_new(values, name=name) return super()._shallow_copy(values=values, name=name) + @doc(Index._validate_fill_value) def _validate_fill_value(self, value): - """ - Check if the value can be inserted into our array without casting, - and convert it to an appropriate native type if necessary. - """ if is_bool(value) or is_bool_dtype(value): # force conversion to object # so we don't lose the bools From f121087c3f997a3a61907ed6e7ac5606758bcbd1 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 29 Nov 2020 13:06:24 -0800 Subject: [PATCH 8/8] raises section for maybe_promote docstring --- pandas/core/dtypes/cast.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index e3f78883b45b3..69a21536795dc 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -548,6 +548,11 @@ def maybe_promote(dtype, fill_value=np.nan): Upcasted from dtype argument if necessary. fill_value Upcasted from fill_value argument if necessary. + + Raises + ------ + ValueError + If fill_value is a non-scalar and dtype is not object. """ if not is_scalar(fill_value) and not is_object_dtype(dtype): # with object dtype there is nothing to promote, and the user can