Merge branch 'main' into deprecation

phofl · web-flow · commit c56f00e031ea · 2023-08-28T20:55:56.000+02:00
diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
@@ -718,6 +718,7 @@ Conversion
 Strings
 ^^^^^^^
 - Bug in :meth:`Series.str` that did not raise a  ``TypeError`` when iterated (:issue:`54173`)
+- Bug in ``repr`` for :class:`DataFrame` with string-dtype columns (:issue:`54797`)
 
 Interval
 ^^^^^^^^
diff --git a/meson.build b/meson.build
@@ -6,20 +6,15 @@ project(
     license: 'BSD-3',
     meson_version: '>=1.0.1',
     default_options: [
-        # TODO: investigate, does meson try to compile against debug Python
-        # when buildtype = debug, this seems to be causing problems on CI
-        # where provided Python is not compiled in debug mode
         'buildtype=release',
         # TODO: Reactivate werror, some warnings on Windows
         #'werror=true',
         'c_std=c99'
     ]
 )
 
-py_mod = import('python')
 fs = import('fs')
-py = py_mod.find_installation('python')
-py_dep = py.dependency()
+py = import('python').find_installation()
 tempita = files('generate_pxi.py')
 versioneer = files('generate_version.py')
 
diff --git a/pandas/_libs/window/meson.build b/pandas/_libs/window/meson.build
@@ -3,7 +3,6 @@ py.extension_module(
     ['aggregations.pyx'],
     cython_args: ['-X always_allow_keywords=true'],
     include_directories: [inc_np, inc_pd],
-    dependencies: [py_dep],
     subdir: 'pandas/_libs/window',
     override_options : ['cython_language=cpp'],
     install: true
@@ -14,7 +13,6 @@ py.extension_module(
     ['indexers.pyx'],
     cython_args: ['-X always_allow_keywords=true'],
     include_directories: [inc_np, inc_pd],
-    dependencies: [py_dep],
     subdir: 'pandas/_libs/window',
     install: true
 )
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -7099,6 +7099,8 @@ def fillna(
 
         See Also
         --------
+        ffill : Fill values by propagating the last valid observation to next valid.
+        bfill : Fill values by using the next valid observation to fill the gap.
         interpolate : Fill NaN values using interpolation.
         reindex : Conform object to new index.
         asfreq : Convert TimeSeries to specified frequency.
@@ -7358,7 +7360,10 @@ def ffill(
         ...
 
     @final
-    @doc(klass=_shared_doc_kwargs["klass"])
+    @doc(
+        klass=_shared_doc_kwargs["klass"],
+        axes_single_arg=_shared_doc_kwargs["axes_single_arg"],
+    )
     def ffill(
         self,
         *,
@@ -7370,6 +7375,27 @@ def ffill(
         """
         Fill NA/NaN values by propagating the last valid observation to next valid.
 
+        Parameters
+        ----------
+        axis : {axes_single_arg}
+            Axis along which to fill missing values. For `Series`
+            this parameter is unused and defaults to 0.
+        inplace : bool, default False
+            If True, fill in-place. Note: this will modify any
+            other views on this object (e.g., a no-copy slice for a column in a
+            DataFrame).
+        limit : int, default None
+            The maximum number of consecutive NaN values to forward fill.
+            In other words, if there is a gap with more than this number of
+            consecutive NaNs, it will only be partially filled: the NaNs
+            closest to the last valid observation are filled and the
+            remainder of the gap is left as NaN.
+            Must be greater than 0 if not None.
+        downcast : dict, default is None
+            A dict of item->dtype of what to downcast if possible,
+            or the string 'infer' which will try to downcast to an appropriate
+            equal type (e.g. float64 to int64 if possible).
+
         Returns
         -------
         {klass} or None
@@ -7437,7 +7463,7 @@ def pad(
         downcast: dict | None | lib.NoDefault = lib.no_default,
     ) -> Self | None:
         """
-        Synonym for :meth:`DataFrame.fillna` with ``method='ffill'``.
+        Fill NA/NaN values by propagating the last valid observation to next valid.
 
         .. deprecated:: 2.0
 
@@ -7494,7 +7520,10 @@ def bfill(
         ...
 
     @final
-    @doc(klass=_shared_doc_kwargs["klass"])
+    @doc(
+        klass=_shared_doc_kwargs["klass"],
+        axes_single_arg=_shared_doc_kwargs["axes_single_arg"],
+    )
     def bfill(
         self,
         *,
@@ -7504,7 +7533,28 @@ def bfill(
         downcast: dict | None | lib.NoDefault = lib.no_default,
     ) -> Self | None:
         """
-        Synonym for :meth:`DataFrame.fillna` with ``method='bfill'``.
+        Fill NA/NaN values by using the next valid observation to fill the gap.
+
+        Parameters
+        ----------
+        axis : {axes_single_arg}
+            Axis along which to fill missing values. For `Series`
+            this parameter is unused and defaults to 0.
+        inplace : bool, default False
+            If True, fill in-place. Note: this will modify any
+            other views on this object (e.g., a no-copy slice for a column in a
+            DataFrame).
+        limit : int, default None
+            The maximum number of consecutive NaN values to backward fill.
+            In other words, if there is a gap with more than this number of
+            consecutive NaNs, it will only be partially filled: the NaNs
+            closest to the next valid observation are filled and the
+            remainder of the gap is left as NaN.
+            Must be greater than 0 if not None.
+        downcast : dict, default is None
+            A dict of item->dtype of what to downcast if possible,
+            or the string 'infer' which will try to downcast to an appropriate
+            equal type (e.g. float64 to int64 if possible).
 
         Returns
         -------
@@ -7583,7 +7633,7 @@ def backfill(
         downcast: dict | None | lib.NoDefault = lib.no_default,
     ) -> Self | None:
         """
-        Synonym for :meth:`DataFrame.fillna` with ``method='bfill'``.
+        Fill NA/NaN values by using the next valid observation to fill the gap.
 
         .. deprecated:: 2.0
 
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -124,6 +124,7 @@
 from pandas.core.dtypes.generic import (
     ABCDataFrame,
     ABCDatetimeIndex,
+    ABCIntervalIndex,
     ABCMultiIndex,
     ABCPeriodIndex,
     ABCSeries,
@@ -1396,8 +1397,8 @@ def _format_with_header(self, header: list[str_t], na_rep: str_t) -> list[str_t]
 
         values = self._values
 
-        if is_object_dtype(values.dtype):
-            values = cast(np.ndarray, values)
+        if is_object_dtype(values.dtype) or is_string_dtype(values.dtype):
+            values = np.asarray(values)
             values = lib.maybe_convert_objects(values, safe=True)
 
             result = [pprint_thing(x, escape_chars=("\t", "\r", "\n")) for x in values]
@@ -3492,8 +3493,6 @@ def _intersection(self, other: Index, sort: bool = False):
             and other.is_monotonic_increasing
             and self._can_use_libjoin
             and other._can_use_libjoin
-            and not isinstance(self, ABCMultiIndex)
-            and not isinstance(other, ABCMultiIndex)
         ):
             try:
                 res_indexer, indexer, _ = self._inner_indexer(other)
@@ -4632,28 +4631,13 @@ def join(
 
         _validate_join_method(how)
 
-        if not self.is_unique and not other.is_unique:
-            return self._join_non_unique(other, how=how, sort=sort)
-        elif not self.is_unique or not other.is_unique:
-            if self.is_monotonic_increasing and other.is_monotonic_increasing:
-                # Note: 2023-08-15 we *do* have tests that get here with
-                #  Categorical, string[python] (can use libjoin)
-                #  and Interval (cannot)
-                if self._can_use_libjoin and other._can_use_libjoin:
-                    # otherwise we will fall through to _join_via_get_indexer
-                    # GH#39133
-                    # go through object dtype for ea till engine is supported properly
-                    return self._join_monotonic(other, how=how)
-            else:
-                return self._join_non_unique(other, how=how, sort=sort)
-        elif (
-            # GH48504: exclude MultiIndex to avoid going through MultiIndex._values
-            self.is_monotonic_increasing
+        if (
+            not isinstance(self.dtype, CategoricalDtype)
+            and self.is_monotonic_increasing
             and other.is_monotonic_increasing
             and self._can_use_libjoin
             and other._can_use_libjoin
-            and not isinstance(self, ABCMultiIndex)
-            and not isinstance(self.dtype, CategoricalDtype)
+            and (self.is_unique or other.is_unique)
         ):
             # Categorical is monotonic if data are ordered as categories, but join can
             #  not handle this in case of not lexicographically monotonic GH#38502
@@ -4662,6 +4646,8 @@ def join(
             except TypeError:
                 # object dtype; non-comparable objects
                 pass
+        elif not self.is_unique or not other.is_unique:
+            return self._join_non_unique(other, how=how, sort=sort)
 
         return self._join_via_get_indexer(other, how, sort)
 
@@ -4797,6 +4783,9 @@ def _join_non_unique(
         join_idx = self.take(left_idx)
         right = other.take(right_idx)
         join_index = join_idx.putmask(mask, right)
+        if isinstance(join_index, ABCMultiIndex) and how == "outer":
+            # test_join_index_levels
+            join_index = join_index._sort_levels_monotonic()
         return join_index, left_idx, right_idx
 
     @final
@@ -5042,10 +5031,10 @@ def _can_use_libjoin(self) -> bool:
                 or isinstance(self._values, (ArrowExtensionArray, BaseMaskedArray))
                 or self.dtype == "string[python]"
             )
-        # For IntervalIndex, the conversion to numpy converts
-        #  to object dtype, which negates the performance benefit of libjoin
-        # TODO: exclude RangeIndex and MultiIndex as these also make copies?
-        return not isinstance(self.dtype, IntervalDtype)
+        # Exclude index types where the conversion to numpy converts to object dtype,
+        #  which negates the performance benefit of libjoin
+        # TODO: exclude RangeIndex? Seems to break test_concat_datetime_timezone
+        return not isinstance(self, (ABCIntervalIndex, ABCMultiIndex))
 
     # --------------------------------------------------------------------
     # Uncategorized Methods
@@ -5180,8 +5169,7 @@ def _get_join_target(self) -> np.ndarray:
             # present
             return self._values.to_numpy()
 
-        # TODO: exclude ABCRangeIndex, ABCMultiIndex cases here as those create
-        #  copies.
+        # TODO: exclude ABCRangeIndex case here as it copies
         target = self._get_engine_target()
         if not isinstance(target, np.ndarray):
             raise ValueError("_can_use_libjoin should return False.")
@@ -5194,7 +5182,7 @@ def _from_join_target(self, result: np.ndarray) -> ArrayLike:
         """
         if isinstance(self.values, BaseMaskedArray):
             return type(self.values)(result, np.zeros(result.shape, dtype=np.bool_))
-        elif isinstance(self.values, ArrowExtensionArray):
+        elif isinstance(self.values, (ArrowExtensionArray, StringArray)):
             return type(self.values)._from_sequence(result)
         return result
 
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
@@ -604,12 +604,12 @@ def at(self) -> _AtIndexer:
         Raises
         ------
         KeyError
-            * If getting a value and 'label' does not exist in a DataFrame or
-                Series.
+            If getting a value and 'label' does not exist in a DataFrame or Series.
+
         ValueError
-            * If row/column label pair is not a tuple or if any label from
-                the pair is not a scalar for DataFrame.
-            * If label is list-like (*excluding* NamedTuple) for Series.
+            If row/column label pair is not a tuple or if any label
+            from the pair is not a scalar for DataFrame.
+            If label is list-like (*excluding* NamedTuple) for Series.
 
         See Also
         --------
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
@@ -2008,7 +2008,7 @@ def fillna(
                     "need to implement this keyword or an exception will be "
                     "raised. In the interim, the keyword is ignored by "
                     f"{type(self.values).__name__}.",
-                    FutureWarning,
+                    DeprecationWarning,
                     stacklevel=find_stack_level(),
                 )
 
diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py
@@ -133,7 +133,7 @@ def test_arith_series_with_array(self, data, all_arithmetic_operators):
     def test_fillna_frame(self, data_missing):
         msg = "ExtensionArray.fillna added a 'copy' keyword"
         with tm.assert_produces_warning(
-            FutureWarning, match=msg, check_stacklevel=False
+            DeprecationWarning, match=msg, check_stacklevel=False
         ):
             super().test_fillna_frame(data_missing)
 
@@ -166,7 +166,7 @@ def test_fillna_no_op_returns_copy(self, data):
     def test_fillna_series(self, data_missing):
         msg = "ExtensionArray.fillna added a 'copy' keyword"
         with tm.assert_produces_warning(
-            FutureWarning, match=msg, check_stacklevel=False
+            DeprecationWarning, match=msg, check_stacklevel=False
         ):
             super().test_fillna_series(data_missing)
 
@@ -178,13 +178,13 @@ def test_fillna_series_method(self, data_missing, fillna_method):
             super().test_fillna_series_method(data_missing, fillna_method)
 
     def test_fillna_copy_frame(self, data_missing, using_copy_on_write):
-        warn = FutureWarning if not using_copy_on_write else None
+        warn = DeprecationWarning if not using_copy_on_write else None
         msg = "ExtensionArray.fillna added a 'copy' keyword"
         with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False):
             super().test_fillna_copy_frame(data_missing)
 
     def test_fillna_copy_series(self, data_missing, using_copy_on_write):
-        warn = FutureWarning if not using_copy_on_write else None
+        warn = DeprecationWarning if not using_copy_on_write else None
         msg = "ExtensionArray.fillna added a 'copy' keyword"
         with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False):
             super().test_fillna_copy_series(data_missing)
diff --git a/pandas/tests/frame/test_repr_info.py b/pandas/tests/frame/test_repr_info.py
@@ -455,3 +455,14 @@ def test_masked_ea_with_formatter(self):
 0  0.12  1.00
 1  1.12  2.00"""
         assert result == expected
+
+    def test_repr_ea_columns(self, any_string_dtype):
+        # GH#54797
+        pytest.importorskip("pyarrow")
+        df = DataFrame({"long_column_name": [1, 2, 3], "col2": [4, 5, 6]})
+        df.columns = df.columns.astype(any_string_dtype)
+        expected = """   long_column_name  col2
+0                 1     4
+1                 2     5
+2                 3     6"""
+        assert repr(df) == expected
diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py
@@ -899,3 +899,10 @@ def test_union_ea_dtypes(self, any_numeric_ea_and_arrow_dtype):
         result = idx.union(idx2)
         expected = Index([1, 2, 3, 4, 5], dtype=any_numeric_ea_and_arrow_dtype)
         tm.assert_index_equal(result, expected)
+
+    def test_union_string_array(self, any_string_dtype):
+        idx1 = Index(["a"], dtype=any_string_dtype)
+        idx2 = Index(["b"], dtype=any_string_dtype)
+        result = idx1.union(idx2)
+        expected = Index(["a", "b"], dtype=any_string_dtype)
+        tm.assert_index_equal(result, expected)

Original file line number	Diff line number	Diff line change
`@@ -2008,7 +2008,7 @@ def fillna(`
`2008`	`2008`	`"need to implement this keyword or an exception will be "`
`2009`	`2009`	`"raised. In the interim, the keyword is ignored by "`
`2010`	`2010`	`f"{type(self.values).__name__}.",`
`2011`		`- FutureWarning,`
	`2011`	`+ DeprecationWarning,`
`2012`	`2012`	`stacklevel=find_stack_level(),`
`2013`	`2013`	`)`
`2014`	`2014`