Merge branch 'pandas-dev:main' into raise-on-parse-int-overflow

SandroCasagrande · web-flow · commit 3d72cf265f89 · 2022-08-13T23:01:23.000+02:00
diff --git a/doc/source/conf.py b/doc/source/conf.py
@@ -653,12 +653,20 @@ def linkcode_resolve(domain, info):
     try:
         fn = inspect.getsourcefile(inspect.unwrap(obj))
     except TypeError:
-        fn = None
+        try:  # property
+            fn = inspect.getsourcefile(inspect.unwrap(obj.fget))
+        except (AttributeError, TypeError):
+            fn = None
     if not fn:
         return None
 
     try:
         source, lineno = inspect.getsourcelines(obj)
+    except TypeError:
+        try:  # property
+            source, lineno = inspect.getsourcelines(obj.fget)
+        except (AttributeError, TypeError):
+            lineno = None
     except OSError:
         lineno = None
 
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
@@ -293,6 +293,7 @@ Other enhancements
 - :meth:`RangeIndex.union` now can return a :class:`RangeIndex` instead of a :class:`Int64Index` if the resulting values are equally spaced (:issue:`47557`, :issue:`43885`)
 - :meth:`DataFrame.compare` now accepts an argument ``result_names`` to allow the user to specify the result's names of both left and right DataFrame which are being compared. This is by default ``'self'`` and ``'other'`` (:issue:`44354`)
 - :meth:`Series.add_suffix`, :meth:`DataFrame.add_suffix`, :meth:`Series.add_prefix` and :meth:`DataFrame.add_prefix` support a ``copy`` argument. If ``False``, the underlying data is not copied in the returned object (:issue:`47934`)
+- :meth:`DataFrame.set_index` now supports a ``copy`` keyword. If ``False``, the underlying data is not copied when a new :class:`DataFrame` is returned (:issue:`48043`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_150.notable_bug_fixes:
diff --git a/pandas/_libs/algos.pyi b/pandas/_libs/algos.pyi
@@ -132,6 +132,7 @@ def ensure_int8(arr: object, copy=...) -> npt.NDArray[np.int8]: ...
 def ensure_int16(arr: object, copy=...) -> npt.NDArray[np.int16]: ...
 def ensure_int32(arr: object, copy=...) -> npt.NDArray[np.int32]: ...
 def ensure_int64(arr: object, copy=...) -> npt.NDArray[np.int64]: ...
+def ensure_uint64(arr: object, copy=...) -> npt.NDArray[np.uint64]: ...
 def take_1d_int8_int8(
     values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
 ) -> None: ...
diff --git a/pandas/_libs/algos_common_helper.pxi.in b/pandas/_libs/algos_common_helper.pxi.in
@@ -41,12 +41,12 @@ dtypes = [('float64', 'FLOAT64', 'float64'),
           ('int16', 'INT16', 'int16'),
           ('int32', 'INT32', 'int32'),
           ('int64', 'INT64', 'int64'),
+          ('uint64', 'UINT64', 'uint64'),
           # Disabling uint and complex dtypes because we do not use them
-          #  (and compiling them increases wheel size)
+          #  (and compiling them increases wheel size) (except uint64)
           # ('uint8', 'UINT8', 'uint8'),
           # ('uint16', 'UINT16', 'uint16'),
           # ('uint32', 'UINT32', 'uint32'),
-          # ('uint64', 'UINT64', 'uint64'),
           # ('complex64', 'COMPLEX64', 'complex64'),
           # ('complex128', 'COMPLEX128', 'complex128')
 ]
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
@@ -513,14 +513,7 @@ ctypedef fused mean_t:
 
 ctypedef fused sum_t:
     mean_t
-    int8_t
-    int16_t
-    int32_t
     int64_t
-
-    uint8_t
-    uint16_t
-    uint32_t
     uint64_t
     object
 
diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py
@@ -100,6 +100,7 @@ def ensure_float(arr):
 ensure_int8 = algos.ensure_int8
 ensure_platform_int = algos.ensure_platform_int
 ensure_object = algos.ensure_object
+ensure_uint64 = algos.ensure_uint64
 
 
 def ensure_str(value: bytes | Any) -> str:
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -5824,6 +5824,7 @@ def set_index(
         append: bool = ...,
         inplace: Literal[False] = ...,
         verify_integrity: bool = ...,
+        copy: bool | lib.NoDefault = ...,
     ) -> DataFrame:
         ...
 
@@ -5836,6 +5837,7 @@ def set_index(
         append: bool = ...,
         inplace: Literal[True],
         verify_integrity: bool = ...,
+        copy: bool | lib.NoDefault = ...,
     ) -> None:
         ...
 
@@ -5847,6 +5849,7 @@ def set_index(
         append: bool = False,
         inplace: bool = False,
         verify_integrity: bool = False,
+        copy: bool | lib.NoDefault = lib.no_default,
     ) -> DataFrame | None:
         """
         Set the DataFrame index using existing columns.
@@ -5873,6 +5876,11 @@ def set_index(
             Check the new index for duplicates. Otherwise defer the check until
             necessary. Setting to False will improve the performance of this
             method.
+        copy : bool, default True
+            Whether to make a copy of the underlying data when returning a new
+            DataFrame.
+
+            .. versionadded:: 1.5.0
 
         Returns
         -------
@@ -5938,6 +5946,13 @@ def set_index(
         4 16     10  2014    31
         """
         inplace = validate_bool_kwarg(inplace, "inplace")
+        if inplace:
+            if copy is not lib.no_default:
+                raise ValueError("Cannot specify copy when inplace=True")
+            copy = False
+        elif copy is lib.no_default:
+            copy = True
+
         self._check_inplace_and_allows_duplicate_labels(inplace)
         if not isinstance(keys, list):
             keys = [keys]
@@ -5973,7 +5988,7 @@ def set_index(
         if inplace:
             frame = self
         else:
-            frame = self.copy()
+            frame = self.copy(deep=copy)
 
         arrays = []
         names: list[Hashable] = []
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
@@ -46,6 +46,7 @@
     ensure_float64,
     ensure_int64,
     ensure_platform_int,
+    ensure_uint64,
     is_1d_only_ea_dtype,
     is_bool_dtype,
     is_complex_dtype,
@@ -224,6 +225,13 @@ def _get_cython_vals(self, values: np.ndarray) -> np.ndarray:
                 # result may still include NaN, so we have to cast
                 values = ensure_float64(values)
 
+            elif how == "sum":
+                # Avoid overflow during group op
+                if values.dtype.kind == "i":
+                    values = ensure_int64(values)
+                else:
+                    values = ensure_uint64(values)
+
         return values
 
     # TODO: general case implementation overridable by EAs.
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
@@ -35,6 +35,7 @@
 from pandas.util._decorators import (
     Appender,
     Substitution,
+    cache_readonly,
 )
 from pandas.util._exceptions import find_stack_level
 
@@ -651,16 +652,6 @@ def __init__(
 
         self.indicator = indicator
 
-        self.indicator_name: str | None
-        if isinstance(self.indicator, str):
-            self.indicator_name = self.indicator
-        elif isinstance(self.indicator, bool):
-            self.indicator_name = "_merge" if self.indicator else None
-        else:
-            raise ValueError(
-                "indicator option can only accept boolean or string arguments"
-            )
-
         if not is_bool(left_index):
             raise ValueError(
                 f"left_index parameter must be of type bool, not {type(left_index)}"
@@ -753,6 +744,17 @@ def _maybe_drop_cross_column(
         if cross_col is not None:
             del result[cross_col]
 
+    @cache_readonly
+    def _indicator_name(self) -> str | None:
+        if isinstance(self.indicator, str):
+            return self.indicator
+        elif isinstance(self.indicator, bool):
+            return "_merge" if self.indicator else None
+        else:
+            raise ValueError(
+                "indicator option can only accept boolean or string arguments"
+            )
+
     def _indicator_pre_merge(
         self, left: DataFrame, right: DataFrame
     ) -> tuple[DataFrame, DataFrame]:
@@ -765,7 +767,7 @@ def _indicator_pre_merge(
                     "Cannot use `indicator=True` option when "
                     f"data contains a column named {i}"
                 )
-        if self.indicator_name in columns:
+        if self._indicator_name in columns:
             raise ValueError(
                 "Cannot use name of an existing column for indicator column"
             )
@@ -786,13 +788,13 @@ def _indicator_post_merge(self, result: DataFrame) -> DataFrame:
         result["_left_indicator"] = result["_left_indicator"].fillna(0)
         result["_right_indicator"] = result["_right_indicator"].fillna(0)
 
-        result[self.indicator_name] = Categorical(
+        result[self._indicator_name] = Categorical(
             (result["_left_indicator"] + result["_right_indicator"]),
             categories=[1, 2, 3],
         )
-        result[self.indicator_name] = result[self.indicator_name].cat.rename_categories(
-            ["left_only", "right_only", "both"]
-        )
+        result[self._indicator_name] = result[
+            self._indicator_name
+        ].cat.rename_categories(["left_only", "right_only", "both"])
 
         result = result.drop(labels=["_left_indicator", "_right_indicator"], axis=1)
         return result
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -4862,17 +4862,23 @@ def rename(
 
         Parameters
         ----------
-        axis : {0 or 'index'}
-            Unused. Parameter needed for compatibility with DataFrame.
-        index : scalar, hashable sequence, dict-like or function, optional
+        index : scalar, hashable sequence, dict-like or function, optional
             Functions or dict-like are transformations to apply to
             the index.
             Scalar or hashable sequence-like will alter the ``Series.name``
             attribute.
-
-        **kwargs
-            Additional keyword arguments passed to the function. Only the
-            "inplace" keyword is used.
+        axis : {0 or 'index'}
+            Unused. Parameter needed for compatibility with DataFrame.
+        copy : bool, default True
+            Also copy underlying data.
+        inplace : bool, default False
+            Whether to modify the Series in place. If True, ``copy`` is ignored.
+        level : int or level name, default None
+            In case of MultiIndex, only rename labels in the specified level.
+        errors : {'ignore', 'raise'}, default 'ignore'
+            If 'raise', raise `KeyError` when a `dict-like mapper` or
+            `index` contains labels that are not present in the index being transformed.
+            If 'ignore', existing keys will be renamed and extra keys will be ignored.
 
         Returns
         -------
diff --git a/pandas/tests/frame/methods/test_set_index.py b/pandas/tests/frame/methods/test_set_index.py
@@ -25,6 +25,25 @@
 
 
 class TestSetIndex:
+    def test_set_index_copy(self):
+        # GH#48043
+        df = DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]})
+        expected = DataFrame({"B": [3, 4], "C": [5, 6]}, index=Index([1, 2], name="A"))
+
+        res = df.set_index("A", copy=True)
+        tm.assert_frame_equal(res, expected)
+        assert not any(tm.shares_memory(df[col], res[col]) for col in res.columns)
+
+        res = df.set_index("A", copy=False)
+        tm.assert_frame_equal(res, expected)
+        assert all(tm.shares_memory(df[col], res[col]) for col in res.columns)
+
+        msg = "Cannot specify copy when inplace=True"
+        with pytest.raises(ValueError, match=msg):
+            df.set_index("A", inplace=True, copy=True)
+        with pytest.raises(ValueError, match=msg):
+            df.set_index("A", inplace=True, copy=False)
+
     def test_set_index_multiindex(self):
         # segfault in GH#3308
         d = {"t1": [2, 2.5, 3], "t2": [4, 5, 6]}
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
@@ -2829,3 +2829,16 @@ def test_groupby_sum_support_mask(any_numeric_ea_dtype):
         dtype=any_numeric_ea_dtype,
     )
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("val, dtype", [(111, "int"), (222, "uint")])
+def test_groupby_sum_overflow(val, dtype):
+    # GH#37493
+    df = DataFrame({"a": 1, "b": [val, val]}, dtype=f"{dtype}8")
+    result = df.groupby("a").sum()
+    expected = DataFrame(
+        {"b": [val * 2]},
+        index=Index([1], name="a", dtype=f"{dtype}64"),
+        dtype=f"{dtype}64",
+    )
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/series/methods/test_rename.py b/pandas/tests/series/methods/test_rename.py
@@ -1,4 +1,5 @@
 from datetime import datetime
+import re
 
 import numpy as np
 import pytest
@@ -134,3 +135,18 @@ def test_rename_series_with_multiindex(self):
         series_expected = Series(np.ones(5), index=index_expected)
 
         tm.assert_series_equal(result, series_expected)
+
+    def test_rename_error_arg(self):
+        # GH 46889
+        ser = Series(["foo", "bar"])
+        match = re.escape("[2] not found in axis")
+        with pytest.raises(KeyError, match=match):
+            ser.rename({2: 9}, errors="raise")
+
+    def test_rename_copy_false(self):
+        # GH 46889
+        ser = Series(["foo", "bar"])
+        shallow_copy = ser.rename({1: 9}, copy=False)
+        ser[0] = "foobar"
+        assert ser[0] == shallow_copy[0]
+        assert ser[1] == shallow_copy[9]
diff --git a/web/pandas/about/team.md b/web/pandas/about/team.md
@@ -6,10 +6,10 @@ _pandas_ is made with love by more than [2,000 volunteer contributors](https://g
 
 If you want to support pandas development, you can find information in the [donations page](../donate.html).
 
-## Maintainers
+## Active maintainers
 
 <div class="card-group maintainers">
-    {% for person in maintainers.people %}
+    {% for person in maintainers.active_with_github_info %}
         <div class="card">
             <img class="card-img-top" alt="" src="{{ person.avatar_url }}"/>
             <div class="card-body">
@@ -60,10 +60,14 @@ The project governance is available in the [project governance page](governance.
     {% endfor %}
 </ul>
 
-## Emeritus maintainers
+## Inactive maintainers
 
 <ul>
-    {% for person in maintainers.emeritus %}
-        <li>{{ person }}</li>
+    {% for person in maintainers.inactive_with_github_info %}
+        <li>
+            <a href="{{ person.blog or person.html_url }}">
+                {{ person.name or person.login }}
+            </a>
+        </li>
     {% endfor %}
 </ul>
diff --git a/web/pandas/config.yml b/web/pandas/config.yml
@@ -68,13 +68,10 @@ maintainers:
   - wesm
   - jorisvandenbossche
   - TomAugspurger
-  - shoyer
   - jreback
-  - chris-b1
   - sinhrks
   - cpcloud
   - gfyoung
-  - toobaz
   - WillAyd
   - mroeschke
   - jschendel
@@ -93,10 +90,15 @@ maintainers:
   - attack68
   - fangchenli
   - twoertwein
-  emeritus:
-  - Wouter Overmeire
-  - Skipper Seabold
-  - Jeff Tratner
+  - lithomas1
+  - mzeitlin11
+  inactive:
+  - lodagro
+  - jseabold
+  - jtratner
+  - shoyer
+  - chris-b1
+  - toobaz
   coc:
   - Safia Abdalla
   - Tom Augspurger
diff --git a/web/pandas_web.py b/web/pandas_web.py