
Commit afa6da3

Merge remote-tracking branch 'upstream/master' into styler_format_index
2 parents 1a32d17 + 5018d20 commit afa6da3

File tree: 13 files changed (+150, -71 lines)

doc/source/whatsnew/v1.3.3.rst

+1

@@ -22,6 +22,7 @@ Fixed regressions
 - Fixed regression in :meth:`merge` where ``on`` columns with ``ExtensionDtype`` or ``bool`` data types were cast to ``object`` in ``right`` and ``outer`` merge (:issue:`40073`)
 - Fixed regression in :meth:`RangeIndex.where` and :meth:`RangeIndex.putmask` raising ``AssertionError`` when result did not represent a :class:`RangeIndex` (:issue:`43240`)
 - Fixed regression in :meth:`read_parquet` where the ``fastparquet`` engine would not work properly with fastparquet 0.7.0 (:issue:`43075`)
+- Fixed regression in :meth:`DataFrame.loc.__setitem__` raising ``ValueError`` when setting array as cell value (:issue:`43422`)
 - Fixed regression in :func:`is_list_like` where objects with ``__iter__`` set to ``None`` would be identified as iterable (:issue:`43373`)
 - Fixed regression in :meth:`.Resampler.aggregate` when used after column selection would raise if ``func`` is a list of aggregation functions (:issue:`42905`)
 - Fixed regression in :meth:`DataFrame.corr` where Kendall correlation would produce incorrect results for columns with repeated values (:issue:`43401`)

doc/source/whatsnew/v1.4.0.rst

+2 -1

@@ -361,7 +361,7 @@ Indexing
 - Bug in :meth:`Index.get_indexer_non_unique` when index contains multiple ``np.nan`` (:issue:`35392`)
 - Bug in :meth:`DataFrame.query` did not handle the degree sign in a backticked column name, such as \`Temp(°C)\`, used in an expression to query a dataframe (:issue:`42826`)
 - Bug in :meth:`DataFrame.drop` where the error message did not show missing labels with commas when raising ``KeyError`` (:issue:`42881`)
--
+- Bug in :meth:`DataFrame.nlargest` and :meth:`Series.nlargest` where sorted result did not count indexes containing ``np.nan`` (:issue:`28984`)
 
 
 Missing
@@ -419,6 +419,7 @@ Reshaping
 - :func:`concat` creating :class:`MultiIndex` with duplicate level entries when concatenating a :class:`DataFrame` with duplicates in :class:`Index` and multiple keys (:issue:`42651`)
 - Bug in :meth:`pandas.cut` on :class:`Series` with duplicate indices (:issue:`42185`) and non-exact :meth:`pandas.CategoricalIndex` (:issue:`42425`)
 - Bug in :meth:`DataFrame.append` failing to retain dtypes when appended columns do not match (:issue:`43392`)
+- Bug in :func:`concat` of ``bool`` and ``boolean`` dtypes resulting in ``object`` dtype instead of ``boolean`` dtype (:issue:`42800`)
 -
 
 Sparse

pandas/core/algorithms.py

+8 -2

@@ -1252,6 +1252,8 @@ class SelectNSeries(SelectN):
 
     def compute(self, method: str) -> Series:
 
+        from pandas.core.reshape.concat import concat
+
         n = self.n
         dtype = self.obj.dtype
         if not self.is_valid_dtype_n_method(dtype):
@@ -1261,6 +1263,7 @@ def compute(self, method: str) -> Series:
             return self.obj[[]]
 
         dropped = self.obj.dropna()
+        nan_index = self.obj.drop(dropped.index)
 
         if is_extension_array_dtype(dropped.dtype):
             # GH#41816 bc we have dropped NAs above, MaskedArrays can use the
@@ -1277,7 +1280,7 @@ def compute(self, method: str) -> Series:
         # slow method
         if n >= len(self.obj):
             ascending = method == "nsmallest"
-            return dropped.sort_values(ascending=ascending).head(n)
+            return self.obj.sort_values(ascending=ascending).head(n)
 
         # fast method
         new_dtype = dropped.dtype
@@ -1295,6 +1298,8 @@ def compute(self, method: str) -> Series:
         if self.keep == "last":
             arr = arr[::-1]
 
+        nbase = n
+        findex = len(self.obj)
         narr = len(arr)
         n = min(n, narr)
 
@@ -1306,12 +1311,13 @@ def compute(self, method: str) -> Series:
 
         if self.keep != "all":
            inds = inds[:n]
+           findex = nbase
 
         if self.keep == "last":
             # reverse indices
             inds = narr - 1 - inds
 
-        return dropped.iloc[inds]
+        return concat([dropped.iloc[inds], nan_index]).iloc[:findex]
 
 
 class SelectNFrame(SelectN):
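
For illustration, a small usage sketch of the behavior this change produces (mirroring the updated Series test further below; assumes a pandas build that includes this commit):

import numpy as np
import pandas as pd

ser = pd.Series([3.0, np.nan, 1, 2, 5])

# When n reaches or exceeds the number of non-NA values, the NaN entries are
# now kept at the end of the result instead of being dropped, so the default
# nlargest()/nsmallest() return all five positions.
ser.nlargest()   # values in order: 5.0, 3.0, 2.0, 1.0, NaN
ser.nsmallest()  # values in order: 1.0, 2.0, 3.0, 5.0, NaN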

pandas/core/arrays/boolean.py

+13

@@ -17,6 +17,7 @@
     ArrayLike,
     AstypeArg,
     Dtype,
+    DtypeObj,
     npt,
     type_t,
 )
@@ -153,6 +154,18 @@ def __from_arrow__(
         else:
             return BooleanArray._concat_same_type(results)
 
+    def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None:
+        # Handle only boolean + np.bool_ -> boolean, since other cases like
+        # Int64 + boolean -> Int64 will be handled by the other type
+        if all(
+            isinstance(t, BooleanDtype)
+            or (isinstance(t, np.dtype) and (np.issubdtype(t, np.bool_)))
+            for t in dtypes
+        ):
+            return BooleanDtype()
+        else:
+            return None
+
 
 def coerce_to_array(
     values, mask=None, copy: bool = False
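
A quick illustration of what the new _get_common_dtype hook enables, mirroring the concat test added below (assumes a pandas build that includes this commit):

import pandas as pd

ser1 = pd.Series([True, False], dtype="boolean")  # nullable BooleanDtype
ser2 = pd.Series([False, True], dtype="bool")     # plain numpy bool

# bool + boolean now resolves to the nullable "boolean" dtype instead of
# falling back to object.
out = pd.concat([ser1, ser2], ignore_index=True)
print(out.dtype)  # boolean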

pandas/core/dtypes/cast.py

+5 -5

@@ -2185,6 +2185,11 @@ def can_hold_element(arr: ArrayLike, element: Any) -> bool:
         # ExtensionBlock._can_hold_element
         return True
 
+    # error: Non-overlapping equality check (left operand type: "dtype[Any]", right
+    # operand type: "Type[object]")
+    if dtype == object:  # type: ignore[comparison-overlap]
+        return True
+
     tipo = maybe_infer_dtype_type(element)
 
     if dtype.kind in ["i", "u"]:
@@ -2232,11 +2237,6 @@ def can_hold_element(arr: ArrayLike, element: Any) -> bool:
             return tipo.kind == "b"
         return lib.is_bool(element)
 
-    # error: Non-overlapping equality check (left operand type: "dtype[Any]", right
-    # operand type: "Type[object]")
-    elif dtype == object:  # type: ignore[comparison-overlap]
-        return True
-
     elif dtype.kind == "S":
         # TODO: test tests.frame.methods.test_replace tests get here,
         # need more targeted tests. xref phofl has a PR about this
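
For context, the user-facing effect of hoisting the object-dtype check ahead of the element inspection, mirroring the new indexing test below (assumes a pandas build that includes this fix):

import numpy as np
import pandas as pd

df = pd.DataFrame(columns=["a", "b"], dtype=object)

# Object-dtype columns can hold arbitrary objects, so assigning array-valued
# cells via .loc no longer raises ValueError (GH#43422).
df.loc[0] = {"a": np.zeros((2,)), "b": np.zeros((2, 2))}
print(df.loc[0, "a"])  # [0. 0.]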

pandas/core/groupby/generic.py

+15 -60

@@ -354,35 +354,17 @@ def array_func(values: ArrayLike) -> ArrayLike:
             )
         return self._reindex_output(ser)
 
-    def _wrap_aggregated_output(
-        self,
-        output: Mapping[base.OutputKey, Series | ArrayLike],
+    def _indexed_output_to_ndframe(
+        self, output: Mapping[base.OutputKey, ArrayLike]
     ) -> Series:
         """
-        Wraps the output of a SeriesGroupBy aggregation into the expected result.
-
-        Parameters
-        ----------
-        output : Mapping[base.OutputKey, Union[Series, ArrayLike]]
-            Data to wrap.
-
-        Returns
-        -------
-        Series
-
-        Notes
-        -----
-        In the vast majority of cases output will only contain one element.
-        The exception is operations that expand dimensions, like ohlc.
+        Wrap the dict result of a GroupBy aggregation into a Series.
         """
         assert len(output) == 1
-
-        name = self.obj.name
-        index = self.grouper.result_index
         values = next(iter(output.values()))
-
-        result = self.obj._constructor(values, index=index, name=name)
-        return self._reindex_output(result)
+        result = self.obj._constructor(values)
+        result.name = self.obj.name
+        return result
 
     def _wrap_transformed_output(
         self, output: Mapping[base.OutputKey, Series | ArrayLike]
@@ -1614,46 +1596,19 @@ def _insert_inaxis_grouper_inplace(self, result: DataFrame) -> None:
             if in_axis and name not in columns:
                 result.insert(0, name, lev)
 
-    def _wrap_aggregated_output(
-        self,
-        output: Mapping[base.OutputKey, Series | ArrayLike],
+    def _indexed_output_to_ndframe(
+        self, output: Mapping[base.OutputKey, ArrayLike]
     ) -> DataFrame:
         """
-        Wraps the output of DataFrameGroupBy aggregations into the expected result.
-
-        Parameters
-        ----------
-        output : Mapping[base.OutputKey, Union[Series, np.ndarray]]
-            Data to wrap.
-
-        Returns
-        -------
-        DataFrame
+        Wrap the dict result of a GroupBy aggregation into a DataFrame.
         """
-        if isinstance(output, DataFrame):
-            result = output
-        else:
-            indexed_output = {key.position: val for key, val in output.items()}
-            columns = Index([key.label for key in output])
-            columns._set_names(self._obj_with_exclusions._get_axis(1 - self.axis).names)
-
-            result = self.obj._constructor(indexed_output)
-            result.columns = columns
-
-        if not self.as_index:
-            self._insert_inaxis_grouper_inplace(result)
-            result = result._consolidate()
-        else:
-            result.index = self.grouper.result_index
-
-        if self.axis == 1:
-            result = result.T
-            if result.index.equals(self.obj.index):
-                # Retain e.g. DatetimeIndex/TimedeltaIndex freq
-                result.index = self.obj.index.copy()
-                # TODO: Do this more systematically
+        indexed_output = {key.position: val for key, val in output.items()}
+        columns = Index([key.label for key in output])
+        columns._set_names(self._obj_with_exclusions._get_axis(1 - self.axis).names)
 
-        return self._reindex_output(result)
+        result = self.obj._constructor(indexed_output)
+        result.columns = columns
+        return result
 
     def _wrap_transformed_output(
         self, output: Mapping[base.OutputKey, Series | ArrayLike]
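
The refactor above is behavior-preserving: SeriesGroupBy and DataFrameGroupBy now only build the bare Series/DataFrame, and the shared index/axis/as_index handling moves into GroupBy._wrap_aggregated_output (next file). For reference, a minimal aggregation that exercises this wrapping path:

import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1, 2, 3]})

# Wrapping the per-column aggregation results into the final DataFrame is what
# _indexed_output_to_ndframe / _wrap_aggregated_output are responsible for.
print(df.groupby("key").sum())
#      val
# key
# a      3
# b      3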

pandas/core/groupby/groupby.py

+46 -1

@@ -1095,9 +1095,54 @@ def _set_result_index_ordered(
 
         return result
 
-    def _wrap_aggregated_output(self, output: Mapping[base.OutputKey, ArrayLike]):
+    def _indexed_output_to_ndframe(
+        self, result: Mapping[base.OutputKey, ArrayLike]
+    ) -> Series | DataFrame:
         raise AbstractMethodError(self)
 
+    def _wrap_aggregated_output(
+        self, output: Series | DataFrame | Mapping[base.OutputKey, ArrayLike]
+    ):
+        """
+        Wraps the output of GroupBy aggregations into the expected result.
+
+        Parameters
+        ----------
+        output : Series, DataFrame, or Mapping[base.OutputKey, ArrayLike]
+            Data to wrap.
+
+        Returns
+        -------
+        Series or DataFrame
+        """
+
+        if isinstance(output, (Series, DataFrame)):
+            # We get here (for DataFrameGroupBy) if we used Manager.grouped_reduce,
+            # in which case our columns are already set correctly.
+            # ATM we do not get here for SeriesGroupBy; when we do, we will
+            # need to require that result.name already match self.obj.name
+            result = output
+        else:
+            result = self._indexed_output_to_ndframe(output)
+
+        if not self.as_index:
+            # `not self.as_index` is only relevant for DataFrameGroupBy,
+            # enforced in __init__
+            self._insert_inaxis_grouper_inplace(result)
+            result = result._consolidate()
+        else:
+            result.index = self.grouper.result_index
+
+        if self.axis == 1:
+            # Only relevant for DataFrameGroupBy, no-op for SeriesGroupBy
+            result = result.T
+            if result.index.equals(self.obj.index):
+                # Retain e.g. DatetimeIndex/TimedeltaIndex freq
+                result.index = self.obj.index.copy()
+                # TODO: Do this more systematically
+
+        return self._reindex_output(result)
+
     def _wrap_transformed_output(self, output: Mapping[base.OutputKey, ArrayLike]):
         raise AbstractMethodError(self)
 
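
In design terms, the change above hoists the common wrapping logic into the GroupBy base class and leaves only the type-specific construction abstract. A minimal, pandas-free sketch of that template-method shape (illustrative only; the names mirror the methods above but nothing else is taken from pandas internals):

from abc import ABC, abstractmethod
from typing import Any, Mapping


class GroupByLike(ABC):
    @abstractmethod
    def _indexed_output_to_ndframe(self, output: Mapping[Any, Any]) -> Any:
        """Subclass hook: build the bare result object from the indexed dict."""

    def _wrap_aggregated_output(self, output: Mapping[Any, Any]) -> Any:
        # Shared template: build the bare object, then apply the common
        # index/axis post-processing in one place.
        result = self._indexed_output_to_ndframe(output)
        # ... set the result index, handle as_index/axis, reindex output ...
        return result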

pandas/tests/frame/indexing/test_indexing.py

+7

@@ -1206,6 +1206,13 @@ def test_getitem_interval_index_partial_indexing(self):
         res = df.loc[:, 0.5]
         tm.assert_series_equal(res, expected)
 
+    def test_setitem_array_as_cell_value(self):
+        # GH#43422
+        df = DataFrame(columns=["a", "b"], dtype=object)
+        df.loc[0] = {"a": np.zeros((2,)), "b": np.zeros((2, 2))}
+        expected = DataFrame({"a": [np.zeros((2,))], "b": [np.zeros((2, 2))]})
+        tm.assert_frame_equal(df, expected)
+
 
 class TestDataFrameIndexingUInt64:
     def test_setitem(self, uint64_frame):

pandas/tests/frame/methods/test_nlargest.py

+7

@@ -209,3 +209,10 @@ def test_nlargest_multiindex_column_lookup(self):
         result = df.nlargest(3, ("x", "b"))
         expected = df.iloc[[3, 2, 1]]
         tm.assert_frame_equal(result, expected)
+
+    def test_nlargest_nan(self):
+        # GH#43060
+        df = pd.DataFrame([np.nan, np.nan, 0, 1, 2, 3])
+        result = df.nlargest(5, 0)
+        expected = df.sort_values(0, ascending=False).head(5)
+        tm.assert_frame_equal(result, expected)

pandas/tests/groupby/test_apply.py

+12

@@ -1145,3 +1145,15 @@ def test_doctest_example2():
         {"B": [1.0, 0.0], "C": [2.0, 0.0]}, index=Index(["a", "b"], name="A")
     )
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("dropna", [True, False])
+def test_apply_na(dropna):
+    # GH#28984
+    df = DataFrame(
+        {"grp": [1, 1, 2, 2], "y": [1, 0, 2, 5], "z": [1, 2, np.nan, np.nan]}
+    )
+    dfgrp = df.groupby("grp", dropna=dropna)
+    result = dfgrp.apply(lambda grp_df: grp_df.nlargest(1, "z"))
+    expected = dfgrp.apply(lambda x: x.sort_values("z", ascending=False).head(1))
+    tm.assert_frame_equal(result, expected)

pandas/tests/indexing/multiindex/test_multiindex.py

+10

@@ -119,3 +119,13 @@ def test_multiindex_complex(self):
             ),
         )
         tm.assert_frame_equal(result, expected)
+
+    def test_rename_multiindex_with_duplicates(self):
+        # GH 38015
+        mi = MultiIndex.from_tuples([("A", "cat"), ("B", "cat"), ("B", "cat")])
+        df = DataFrame(index=mi)
+        df = df.rename(index={"A": "Apple"}, level=0)
+
+        mi2 = MultiIndex.from_tuples([("Apple", "cat"), ("B", "cat"), ("B", "cat")])
+        expected = DataFrame(index=mi2)
+        tm.assert_frame_equal(df, expected)
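
For reference, the same rename expressed as a standalone snippet (assumes a pandas version in which GH 38015 is resolved):

import pandas as pd

mi = pd.MultiIndex.from_tuples([("A", "cat"), ("B", "cat"), ("B", "cat")])
df = pd.DataFrame(index=mi)

# Renaming a label on level 0 works even though the remaining level
# combinations contain duplicates.
renamed = df.rename(index={"A": "Apple"}, level=0)
print(renamed.index.tolist())
# [('Apple', 'cat'), ('B', 'cat'), ('B', 'cat')]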

pandas/tests/reshape/concat/test_concat.py

+18

@@ -597,6 +597,24 @@ def test_concat_preserves_extension_int64_dtype():
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.parametrize(
+    "dtype1,dtype2,expected_dtype",
+    [
+        ("bool", "bool", "bool"),
+        ("boolean", "bool", "boolean"),
+        ("bool", "boolean", "boolean"),
+        ("boolean", "boolean", "boolean"),
+    ],
+)
+def test_concat_bool_types(dtype1, dtype2, expected_dtype):
+    # GH 42800
+    ser1 = Series([True, False], dtype=dtype1)
+    ser2 = Series([False, True], dtype=dtype2)
+    result = concat([ser1, ser2], ignore_index=True)
+    expected = Series([True, False, False, True], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+
 @pytest.mark.parametrize(
     ("keys", "integrity"),
     [

pandas/tests/series/methods/test_nlargest.py

+6 -2

@@ -127,8 +127,12 @@ def test_nsmallest_nlargest(self, s_main_dtypes_split):
     def test_nlargest_misc(self):
 
         ser = Series([3.0, np.nan, 1, 2, 5])
-        tm.assert_series_equal(ser.nlargest(), ser.iloc[[4, 0, 3, 2]])
-        tm.assert_series_equal(ser.nsmallest(), ser.iloc[[2, 3, 0, 4]])
+        result = ser.nlargest()
+        expected = ser.iloc[[4, 0, 3, 2, 1]]
+        tm.assert_series_equal(result, expected)
+        result = ser.nsmallest()
+        expected = ser.iloc[[2, 3, 0, 4, 1]]
+        tm.assert_series_equal(result, expected)
 
         msg = 'keep must be either "first", "last"'
         with pytest.raises(ValueError, match=msg):
