diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 124ec8f4ab92c..a2537a20058d4 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -93,6 +93,34 @@ Other API Changes Deprecations ~~~~~~~~~~~~ +**Lists as arrays in :meth:`DataFrame.set_index`** + +Currently, :meth:`DataFrame.set_index` interprets lists in two different ways - as a list of labels, and as an array-like collection of values. +This ambiguity is resolved in favor of the list of labels, but nested lists are interpreted as arrays: + +.. ipython:: python + :okwarning: + + df = pd.DataFrame(np.reshape(np.arange(12), (3, 4)), + columns=['a', 'b', 'c', 'd']) + df.set_index(['a', 'b', 'c']) + df.set_index([['a', 'b', 'c']]) + +The latter case has now been deprecated and will be removed in a future version. As a replacement, +it is suggested to wrap the list in a :class:`Series`, :class:`Index`, ``np.array`` or an iterator. + +.. ipython:: python + + df.set_index(pd.Series(['a', 'b', 'c'])) + +It remains possible to use lists to collect several column keys or arrays to create multiple levels of a :class:`MultiIndex`. + +.. ipython:: python + + df.set_index(['a', pd.Series(['a', 'b', 'c'])]) + +**Other deprecations** + - Deprecated the `M (months)` and `Y (year)` `units` parameter of :func: `pandas.to_timedelta`, :func: `pandas.Timedelta` and :func: `pandas.TimedeltaIndex` (:issue:`16344`) .. _whatsnew_0250.prior_deprecations: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6b4d95055d06d..e97b347220072 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4033,6 +4033,8 @@ def set_index(self, keys, drop=True, append=False, inplace=False, arbitrary combination of column keys and arrays. Here, "array" encompasses :class:`Series`, :class:`Index`, ``np.ndarray``, and instances of :class:`abc.Iterator`. 
+ Lists (in the sense of a sequence of values, not column labels) + have been deprecated, and will be removed in a future version. drop : bool, default True Delete columns to be used as the new index. append : bool, default False @@ -4116,13 +4118,16 @@ def set_index(self, keys, drop=True, append=False, inplace=False, 'one-dimensional arrays.') missing = [] + depr_warn = False for col in keys: if isinstance(col, (ABCIndexClass, ABCSeries, np.ndarray, - list, Iterator)): + Iterator)): # arrays are fine as long as they are one-dimensional # iterators get converted to list below if getattr(col, 'ndim', 1) != 1: raise ValueError(err_msg) + elif isinstance(col, list): + depr_warn = True else: # everything else gets tried as a key; see GH 24969 try: @@ -4136,6 +4141,13 @@ def set_index(self, keys, drop=True, append=False, inplace=False, if missing: raise KeyError('None of {} are in the columns'.format(missing)) + if depr_warn: + msg = ('Passing lists within a list to the parameter "keys" is ' + 'deprecated and will be removed in a future version. To ' + 'silence this warning, wrap the lists in a Series / Index ' + 'or np.ndarray. E.g. 
df.set_index(["A", [1, 2, 3]]) should ' + 'be passed as df.set_index(["A", pd.Series([1, 2, 3])]).') + warnings.warn(msg, FutureWarning, stacklevel=2) if inplace: frame = self diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index a25e893e08900..9ea2d4258bc9c 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -115,10 +115,8 @@ def test_set_index_after_mutation(self): tm.assert_frame_equal(result, expected) # MultiIndex constructor does not work directly on Series -> lambda - # Add list-of-list constructor because list is ambiguous -> lambda # also test index name if append=True (name is duplicate here for B) - @pytest.mark.parametrize('box', [Series, Index, np.array, - list, lambda x: [list(x)], + @pytest.mark.parametrize('box', [Series, Index, np.array, list, lambda x: MultiIndex.from_arrays([x])]) @pytest.mark.parametrize('append, index_name', [(True, None), (True, 'B'), (True, 'test'), (False, None)]) @@ -135,7 +133,7 @@ def test_set_index_pass_single_array(self, frame_of_index_cols, with pytest.raises(KeyError, match=msg): df.set_index(key, drop=drop, append=append) else: - # np.array/list-of-list "forget" the name of B + # np.array "forgets" the name of B name_mi = getattr(key, 'names', None) name = [getattr(key, 'name', None)] if name_mi is None else name_mi @@ -163,9 +161,13 @@ def test_set_index_pass_arrays(self, frame_of_index_cols, keys = ['A', box(df['B'])] # np.array/list "forget" the name of B - names = ['A', None if box in [np.array, list, tuple, iter] else 'B'] + names = ['A', None if box in [np.array, list] else 'B'] - result = df.set_index(keys, drop=drop, append=append) + if box == list: + with tm.assert_produces_warning(FutureWarning): + result = df.set_index(keys, drop=drop, append=append) + else: + result = df.set_index(keys, drop=drop, append=append) # only valid column keys are dropped # since B is always passed as array above, only A is dropped, if at 
all @@ -193,7 +195,12 @@ def test_set_index_pass_arrays_duplicate(self, frame_of_index_cols, drop, df.index.name = index_name keys = [box1(df['A']), box2(df['A'])] - result = df.set_index(keys, drop=drop, append=append) + + if box1 == list or box2 == list: + with tm.assert_produces_warning(FutureWarning): + result = df.set_index(keys, drop=drop, append=append) + else: + result = df.set_index(keys, drop=drop, append=append) # if either box is iter, it has been consumed; re-read keys = [box1(df['A']), box2(df['A'])] @@ -206,8 +213,16 @@ def test_set_index_pass_arrays_duplicate(self, frame_of_index_cols, drop, # to test against already-tested behaviour, we add sequentially, # hence second append always True; must wrap keys in list, otherwise # box = list would be interpreted as keys - expected = df.set_index([keys[0]], drop=first_drop, append=append) - expected = expected.set_index([keys[1]], drop=drop, append=True) + if box1 == list or box2 == list: + with tm.assert_produces_warning(FutureWarning): + expected = df.set_index([keys[0]], drop=first_drop, + append=append) + expected = expected.set_index([keys[1]], drop=drop, + append=True) + else: + expected = df.set_index([keys[0]], drop=first_drop, append=append) + expected = expected.set_index([keys[1]], drop=drop, append=True) + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize('append', [True, False])