From bf0eee049b6d64fc18875f04eb93da295dd3e177 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Thu, 10 Jan 2019 08:51:31 +0100 Subject: [PATCH 01/14] DEPR/API: disallow lists within list for set_index --- pandas/core/frame.py | 56 +++++++++++++++++---------- pandas/tests/frame/test_alter_axes.py | 48 ++++++++--------------- 2 files changed, 52 insertions(+), 52 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 7bbbdd70e062e..de52fd95f7b56 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4041,12 +4041,15 @@ def set_index(self, keys, drop=True, append=False, inplace=False, Set the DataFrame index using existing columns. Set the DataFrame index (row labels) using one or more existing - columns. The index can replace the existing index or expand on it. + columns or arrays (of the correct length). The index can replace the + existing index or expand on it. Parameters ---------- - keys : label or list of label - Name or names of the columns that will be used as the index. + keys : label or array-like or list-like of labels/arrays + This parameter can be either a single column key, a single array of + the same length as the calling DataFrame, or a list-like containing + an arbitrary combination of column keys and arrays. drop : bool, default True Delete columns to be used as the new index. 
append : bool, default False @@ -4091,7 +4094,7 @@ def set_index(self, keys, drop=True, append=False, inplace=False, 7 2013 84 10 2014 31 - Create a multi-index using columns 'year' and 'month': + Create a MultiIndex using columns 'year' and 'month': >>> df.set_index(['year', 'month']) sale @@ -4101,35 +4104,52 @@ def set_index(self, keys, drop=True, append=False, inplace=False, 2013 7 84 2014 10 31 - Create a multi-index using a set of values and a column: + Create a MultiIndex using a set of values and a column: - >>> df.set_index([[1, 2, 3, 4], 'year']) + >>> df.set_index([pd.Index([1, 2, 3, 4]), 'year']) month sale year 1 2012 1 55 2 2014 4 40 3 2013 7 84 4 2014 10 31 + + Create a MultiIndex using a set of values and a column: + + >>> s = pd.Series([1, 2, 3, 4]) + >>> df.set_index([s, s**2]) + month year sale + 1 1 1 2012 55 + 2 4 4 2014 40 + 3 9 7 2013 84 + 4 16 10 2014 31 """ inplace = validate_bool_kwarg(inplace, 'inplace') - if not isinstance(keys, list): + + err_msg = ('The parameter "keys" may be a column key, one-dimensional ' + 'array, or a list-like containing only valid column keys ' + 'and one-dimensional arrays') + + if (is_scalar(keys) or isinstance(keys, tuple) + or isinstance(keys, (ABCIndexClass, ABCSeries, np.ndarray))): + # make sure we have a container of keys/arrays we can iterate over + # tuples can appear as valid column keys! 
keys = [keys] + elif not isinstance(keys, list): + raise ValueError(err_msg) missing = [] for col in keys: if (is_scalar(col) or isinstance(col, tuple)) and col in self: - # tuples can be both column keys or list-likes - # if they are valid column keys, everything is fine + # if col is a valid column key, everything is fine continue elif is_scalar(col) and col not in self: - # tuples that are not column keys are considered list-like, - # not considered missing + # tuples that are not keys will be are excluded here; + # will be considered list-like, not missing missing.append(col) - elif (not is_list_like(col, allow_sets=False) + elif (not isinstance(col, (ABCIndexClass, ABCSeries, np.ndarray)) or getattr(col, 'ndim', 1) > 1): - raise TypeError('The parameter "keys" may only contain a ' - 'combination of valid column keys and ' - 'one-dimensional list-likes') + raise ValueError(err_msg) if missing: raise KeyError('{}'.format(missing)) @@ -4162,12 +4182,6 @@ def set_index(self, keys, drop=True, append=False, inplace=False, elif isinstance(col, (list, np.ndarray)): arrays.append(col) names.append(None) - elif (is_list_like(col) - and not (isinstance(col, tuple) and col in self)): - # all other list-likes (but avoid valid column keys) - col = list(col) # ensure iterator do not get read twice etc. 
- arrays.append(col) - names.append(None) # from here, col can only be a column label else: arrays.append(frame[col]._values) diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index b63151dfb459e..46316c429f554 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -118,7 +118,6 @@ def test_set_index_after_mutation(self): # Add list-of-list constructor because list is ambiguous -> lambda # also test index name if append=True (name is duplicate here for B) @pytest.mark.parametrize('box', [Series, Index, np.array, - list, tuple, iter, lambda x: [list(x)], lambda x: MultiIndex.from_arrays([x])]) @pytest.mark.parametrize('append, index_name', [(True, None), (True, 'B'), (True, 'test'), (False, None)]) @@ -129,29 +128,22 @@ def test_set_index_pass_single_array(self, frame_of_index_cols, df.index.name = index_name key = box(df['B']) - if box == list: - # list of strings gets interpreted as list of keys - msg = "['one', 'two', 'three', 'one', 'two']" - with pytest.raises(KeyError, match=msg): - df.set_index(key, drop=drop, append=append) - else: - # np.array/tuple/iter/list-of-list "forget" the name of B - name_mi = getattr(key, 'names', None) - name = [getattr(key, 'name', None)] if name_mi is None else name_mi + # np.array "forgets" the name of B + name_mi = getattr(key, 'names', None) + name = [getattr(key, 'name', None)] if name_mi is None else name_mi - result = df.set_index(key, drop=drop, append=append) + result = df.set_index(key, drop=drop, append=append) - # only valid column keys are dropped - # since B is always passed as array above, nothing is dropped - expected = df.set_index(['B'], drop=False, append=append) - expected.index.names = [index_name] + name if append else name + # only valid column keys are dropped + # since B is always passed as array above, nothing is dropped + expected = df.set_index(['B'], drop=False, append=append) + expected.index.names = [index_name] + 
name if append else name - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # MultiIndex constructor does not work directly on Series -> lambda # also test index name if append=True (name is duplicate here for A & B) @pytest.mark.parametrize('box', [Series, Index, np.array, - list, tuple, iter, lambda x: MultiIndex.from_arrays([x])]) @pytest.mark.parametrize('append, index_name', [(True, None), (True, 'A'), (True, 'B'), @@ -163,8 +155,8 @@ def test_set_index_pass_arrays(self, frame_of_index_cols, df.index.name = index_name keys = ['A', box(df['B'])] - # np.array/list/tuple/iter "forget" the name of B - names = ['A', None if box in [np.array, list, tuple, iter] else 'B'] + # np.array "forgets" the name of B + names = ['A', None if box in [np.array] else 'B'] result = df.set_index(keys, drop=drop, append=append) @@ -180,11 +172,9 @@ def test_set_index_pass_arrays(self, frame_of_index_cols, # We also emulate a "constructor" for the label -> lambda # also test index name if append=True (name is duplicate here for A) @pytest.mark.parametrize('box2', [Series, Index, np.array, - list, tuple, iter, lambda x: MultiIndex.from_arrays([x]), lambda x: x.name]) @pytest.mark.parametrize('box1', [Series, Index, np.array, - list, tuple, iter, lambda x: MultiIndex.from_arrays([x]), lambda x: x.name]) @pytest.mark.parametrize('append, index_name', [(True, None), @@ -198,19 +188,15 @@ def test_set_index_pass_arrays_duplicate(self, frame_of_index_cols, drop, keys = [box1(df['A']), box2(df['A'])] result = df.set_index(keys, drop=drop, append=append) - # if either box was iter, the content has been consumed; re-read it - keys = [box1(df['A']), box2(df['A'])] - # need to adapt first drop for case that both keys are 'A' -- # cannot drop the same column twice; # use "is" because == would give ambiguous Boolean error for containers first_drop = False if (keys[0] is 'A' and keys[1] is 'A') else drop # to test against already-tested behaviour, we add 
sequentially, - # hence second append always True; must wrap keys in list, otherwise - # box = list would be illegal - expected = df.set_index([keys[0]], drop=first_drop, append=append) - expected = expected.set_index([keys[1]], drop=drop, append=True) + # hence second append always True + expected = df.set_index(keys[0], drop=first_drop, append=append) + expected = expected.set_index(keys[1], drop=drop, append=True) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize('append', [True, False]) @@ -249,13 +235,13 @@ def test_set_index_raise(self, frame_of_index_cols, drop, append): with pytest.raises(KeyError, match='X'): df.set_index([df['A'], df['B'], 'X'], drop=drop, append=append) - msg = 'The parameter "keys" may only contain a combination of.*' + msg = 'The parameter "keys" may be a column key, .*' # forbidden type, e.g. set - with pytest.raises(TypeError, match=msg): + with pytest.raises(ValueError, match=msg): df.set_index(set(df['A']), drop=drop, append=append) # forbidden type in list, e.g. set - with pytest.raises(TypeError, match=msg): + with pytest.raises(ValueError, match=msg): df.set_index(['A', df['A'], set(df['A'])], drop=drop, append=append) From ed0de1f3756e2f90f8ba0d32c1223375bd9fe9a3 Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Thu, 10 Jan 2019 09:39:58 +0100 Subject: [PATCH 02/14] Add deprecation and whatsnew --- doc/source/whatsnew/v0.24.0.rst | 3 ++- pandas/core/frame.py | 11 ++++++-- pandas/tests/frame/test_alter_axes.py | 36 ++++++++++++++++++++------- 3 files changed, 38 insertions(+), 12 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index aee3d78243d2e..1bd926a4d4406 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1251,7 +1251,7 @@ Other API Changes - :class:`pandas.io.formats.style.Styler` supports a ``number-format`` property when using :meth:`~pandas.io.formats.style.Styler.to_excel` (:issue:`22015`) - :meth:`DataFrame.corr` and :meth:`Series.corr` now raise a ``ValueError`` along with a helpful error message instead of a ``KeyError`` when supplied with an invalid method (:issue:`22298`) - :meth:`shift` will now always return a copy, instead of the previous behaviour of returning self when shifting by 0 (:issue:`22397`) -- :meth:`DataFrame.set_index` now allows all one-dimensional list-likes, raises a ``TypeError`` for incorrect types, +- :meth:`DataFrame.set_index` now gives a better (and less frequent) KeyError, and raises a ``ValueError`` for incorrect types, has an improved ``KeyError`` message, and will not fail on duplicate column names with ``drop=True``. (:issue:`22484`) - Slicing a single row of a DataFrame with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`) - :class:`DateOffset` attribute `_cacheable` and method `_should_cache` have been removed (:issue:`23118`) @@ -1309,6 +1309,7 @@ Deprecations - In :meth:`Series.where` with Categorical data, providing an ``other`` that is not present in the categories is deprecated. Convert the categorical to a different dtype or add the ``other`` to the categories first (:issue:`24077`). 
- :meth:`Series.clip_lower`, :meth:`Series.clip_upper`, :meth:`DataFrame.clip_lower` and :meth:`DataFrame.clip_upper` are deprecated and will be removed in a future version. Use ``Series.clip(lower=threshold)``, ``Series.clip(upper=threshold)`` and the equivalent ``DataFrame`` methods (:issue:`24203`) - :meth:`Series.nonzero` is deprecated and will be removed in a future version (:issue:`18262`) +- :meth:`DataFrame.set_index` has deprecated using lists of values *within* lists. It remains possible to pass array-likes, both directly and within a list. .. _whatsnew_0240.deprecations.datetimelike_int_ops: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index de52fd95f7b56..42dd7874c6348 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4139,20 +4139,27 @@ def set_index(self, keys, drop=True, append=False, inplace=False, raise ValueError(err_msg) missing = [] + depr_warn = False for col in keys: if (is_scalar(col) or isinstance(col, tuple)) and col in self: # if col is a valid column key, everything is fine continue elif is_scalar(col) and col not in self: - # tuples that are not keys will be are excluded here; - # will be considered list-like, not missing + # tuples that are not keys are not considered missing, + # but as an illegal list-like missing.append(col) + elif isinstance(col, list): + depr_warn = True elif (not isinstance(col, (ABCIndexClass, ABCSeries, np.ndarray)) or getattr(col, 'ndim', 1) > 1): raise ValueError(err_msg) if missing: raise KeyError('{}'.format(missing)) + if depr_warn: + msg = ('passing lists within a list to the parameter "keys" is ' + 'deprecated and will be removed in a future version.') + warnings.warn(msg, FutureWarning, stacklevel=2) if inplace: frame = self diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index 46316c429f554..2670da5ca028e 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -143,7 +143,7 @@ def 
test_set_index_pass_single_array(self, frame_of_index_cols, # MultiIndex constructor does not work directly on Series -> lambda # also test index name if append=True (name is duplicate here for A & B) - @pytest.mark.parametrize('box', [Series, Index, np.array, + @pytest.mark.parametrize('box', [Series, Index, np.array, list, lambda x: MultiIndex.from_arrays([x])]) @pytest.mark.parametrize('append, index_name', [(True, None), (True, 'A'), (True, 'B'), @@ -156,9 +156,13 @@ def test_set_index_pass_arrays(self, frame_of_index_cols, keys = ['A', box(df['B'])] # np.array "forgets" the name of B - names = ['A', None if box in [np.array] else 'B'] + names = ['A', None if box in [list, np.array] else 'B'] - result = df.set_index(keys, drop=drop, append=append) + if box == list: + with tm.assert_produces_warning(FutureWarning): + result = df.set_index(keys, drop=drop, append=append) + else: + result = df.set_index(keys, drop=drop, append=append) # only valid column keys are dropped # since B is always passed as array above, only A is dropped, if at all @@ -171,10 +175,10 @@ def test_set_index_pass_arrays(self, frame_of_index_cols, # MultiIndex constructor does not work directly on Series -> lambda # We also emulate a "constructor" for the label -> lambda # also test index name if append=True (name is duplicate here for A) - @pytest.mark.parametrize('box2', [Series, Index, np.array, + @pytest.mark.parametrize('box2', [Series, Index, np.array, list, lambda x: MultiIndex.from_arrays([x]), lambda x: x.name]) - @pytest.mark.parametrize('box1', [Series, Index, np.array, + @pytest.mark.parametrize('box1', [Series, Index, np.array, list, lambda x: MultiIndex.from_arrays([x]), lambda x: x.name]) @pytest.mark.parametrize('append, index_name', [(True, None), @@ -186,7 +190,12 @@ def test_set_index_pass_arrays_duplicate(self, frame_of_index_cols, drop, df.index.name = index_name keys = [box1(df['A']), box2(df['A'])] - result = df.set_index(keys, drop=drop, append=append) + + if box1 == 
list or box2 == list: + with tm.assert_produces_warning(FutureWarning): + result = df.set_index(keys, drop=drop, append=append) + else: + result = df.set_index(keys, drop=drop, append=append) # need to adapt first drop for case that both keys are 'A' -- # cannot drop the same column twice; @@ -194,9 +203,18 @@ def test_set_index_pass_arrays_duplicate(self, frame_of_index_cols, drop, first_drop = False if (keys[0] is 'A' and keys[1] is 'A') else drop # to test against already-tested behaviour, we add sequentially, - # hence second append always True - expected = df.set_index(keys[0], drop=first_drop, append=append) - expected = expected.set_index(keys[1], drop=drop, append=True) + # hence second append always True; must wrap keys in list, otherwise + # box = list would be interpreted as keys + if box1 == list or box2 == list: + with tm.assert_produces_warning(FutureWarning): + expected = df.set_index([keys[0]], drop=first_drop, + append=append) + expected = expected.set_index([keys[1]], drop=drop, + append=True) + else: + expected = df.set_index([keys[0]], drop=first_drop, append=append) + expected = expected.set_index([keys[1]], drop=drop, append=True) + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize('append', [True, False]) From dc274e3926747f4eabd9ccdada39803ca1abbe72 Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Thu, 10 Jan 2019 09:44:50 +0100 Subject: [PATCH 03/14] restore test for list-of-scalars interpreted as keys --- pandas/tests/frame/test_alter_axes.py | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index 2670da5ca028e..f3db8d09a1278 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -117,7 +117,7 @@ def test_set_index_after_mutation(self): # MultiIndex constructor does not work directly on Series -> lambda # Add list-of-list constructor because list is ambiguous -> lambda # also test index name if append=True (name is duplicate here for B) - @pytest.mark.parametrize('box', [Series, Index, np.array, + @pytest.mark.parametrize('box', [Series, Index, np.array, list, lambda x: MultiIndex.from_arrays([x])]) @pytest.mark.parametrize('append, index_name', [(True, None), (True, 'B'), (True, 'test'), (False, None)]) @@ -128,18 +128,24 @@ def test_set_index_pass_single_array(self, frame_of_index_cols, df.index.name = index_name key = box(df['B']) - # np.array "forgets" the name of B - name_mi = getattr(key, 'names', None) - name = [getattr(key, 'name', None)] if name_mi is None else name_mi + if box == list: + # list of strings gets interpreted as list of keys + msg = "['one', 'two', 'three', 'one', 'two']" + with pytest.raises(KeyError, match=msg): + df.set_index(key, drop=drop, append=append) + else: + # np.array "forgets" the name of B + name_mi = getattr(key, 'names', None) + name = [getattr(key, 'name', None)] if name_mi is None else name_mi - result = df.set_index(key, drop=drop, append=append) + result = df.set_index(key, drop=drop, append=append) - # only valid column keys are dropped - # since B is always passed as array above, nothing is dropped - expected = df.set_index(['B'], drop=False, append=append) - expected.index.names = [index_name] + name if append else name + # only 
valid column keys are dropped + # since B is always passed as array above, nothing is dropped + expected = df.set_index(['B'], drop=False, append=append) + expected.index.names = [index_name] + name if append else name - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # MultiIndex constructor does not work directly on Series -> lambda # also test index name if append=True (name is duplicate here for A & B) From 623fc9ae6d9a7136851035d1c77ff0974f7db888 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Thu, 10 Jan 2019 13:26:27 +0100 Subject: [PATCH 04/14] Small doc fixes --- doc/source/whatsnew/v0.24.0.rst | 4 ++-- pandas/core/frame.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 1bd926a4d4406..98f299ca66e6a 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1251,8 +1251,8 @@ Other API Changes - :class:`pandas.io.formats.style.Styler` supports a ``number-format`` property when using :meth:`~pandas.io.formats.style.Styler.to_excel` (:issue:`22015`) - :meth:`DataFrame.corr` and :meth:`Series.corr` now raise a ``ValueError`` along with a helpful error message instead of a ``KeyError`` when supplied with an invalid method (:issue:`22298`) - :meth:`shift` will now always return a copy, instead of the previous behaviour of returning self when shifting by 0 (:issue:`22397`) -- :meth:`DataFrame.set_index` now gives a better (and less frequent) KeyError, and raises a ``ValueError`` for incorrect types, - has an improved ``KeyError`` message, and will not fail on duplicate column names with ``drop=True``. (:issue:`22484`) +- :meth:`DataFrame.set_index` now gives a better (and less frequent) KeyError, raises a ``ValueError`` for incorrect types, + and will not fail on duplicate column names with ``drop=True``. 
(:issue:`22484`) - Slicing a single row of a DataFrame with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`) - :class:`DateOffset` attribute `_cacheable` and method `_should_cache` have been removed (:issue:`23118`) - :meth:`Series.searchsorted`, when supplied a scalar value to search for, now returns a scalar instead of an array (:issue:`23801`). diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 42dd7874c6348..6db2755d1c7c2 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4104,7 +4104,7 @@ def set_index(self, keys, drop=True, append=False, inplace=False, 2013 7 84 2014 10 31 - Create a MultiIndex using a set of values and a column: + Create a MultiIndex using an Index and a column: >>> df.set_index([pd.Index([1, 2, 3, 4]), 'year']) month sale @@ -4114,7 +4114,7 @@ def set_index(self, keys, drop=True, append=False, inplace=False, 3 2013 7 84 4 2014 10 31 - Create a MultiIndex using a set of values and a column: + Create a MultiIndex using two Series: >>> s = pd.Series([1, 2, 3, 4]) >>> df.set_index([s, s**2]) From 5f6e3033f9ac7b419e0f452a71023f12ff5281f3 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Thu, 10 Jan 2019 20:05:56 +0100 Subject: [PATCH 05/14] Improve docstring; small fixes --- pandas/core/frame.py | 13 ++++++++----- pandas/tests/frame/test_alter_axes.py | 4 ++-- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6db2755d1c7c2..a2539a6760772 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4048,8 +4048,11 @@ def set_index(self, keys, drop=True, append=False, inplace=False, ---------- keys : label or array-like or list-like of labels/arrays This parameter can be either a single column key, a single array of - the same length as the calling DataFrame, or a list-like containing - an arbitrary combination of column keys and arrays. 
+ the same length as the calling DataFrame, or a list containing an + arbitrary combination of column keys and arrays. Here, "array" + encompasses :class:`Series`, :class:`Index` and ``np.ndarray``. + Lists (in the sense of a sequence of values, not column labels) + have been deprecated, and will be removed in a future version. drop : bool, default True Delete columns to be used as the new index. append : bool, default False @@ -4127,8 +4130,8 @@ def set_index(self, keys, drop=True, append=False, inplace=False, inplace = validate_bool_kwarg(inplace, 'inplace') err_msg = ('The parameter "keys" may be a column key, one-dimensional ' - 'array, or a list-like containing only valid column keys ' - 'and one-dimensional arrays') + 'array, or a list containing only valid column keys and ' + 'one-dimensional arrays.') if (is_scalar(keys) or isinstance(keys, tuple) or isinstance(keys, (ABCIndexClass, ABCSeries, np.ndarray))): @@ -4146,7 +4149,7 @@ def set_index(self, keys, drop=True, append=False, inplace=False, continue elif is_scalar(col) and col not in self: # tuples that are not keys are not considered missing, - # but as an illegal list-like + # but as an illegal list-like (see below) missing.append(col) elif isinstance(col, list): depr_warn = True diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index f3db8d09a1278..6d05b7f7413d2 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -161,8 +161,8 @@ def test_set_index_pass_arrays(self, frame_of_index_cols, df.index.name = index_name keys = ['A', box(df['B'])] - # np.array "forgets" the name of B - names = ['A', None if box in [list, np.array] else 'B'] + # np.array/list "forget" the name of B + names = ['A', None if box in [np.array, list] else 'B'] if box == list: with tm.assert_produces_warning(FutureWarning): From 813b4fcbcaf01659b77d3f905c5bc614e5f2e00e Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Thu, 10 Jan 2019 20:08:47 +0100 Subject: [PATCH 06/14] Remove last mention of "list-like" --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a2539a6760772..49404c2036e8b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4046,7 +4046,7 @@ def set_index(self, keys, drop=True, append=False, inplace=False, Parameters ---------- - keys : label or array-like or list-like of labels/arrays + keys : label or array-like or list of labels/arrays This parameter can be either a single column key, a single array of the same length as the calling DataFrame, or a list containing an arbitrary combination of column keys and arrays. Here, "array" From 4c130ee01fc82c7272a49b5cbd21786492fdb397 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Fri, 11 Jan 2019 00:28:32 +0100 Subject: [PATCH 07/14] rephrase "illegal" --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 49404c2036e8b..70e755608c386 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4149,7 +4149,7 @@ def set_index(self, keys, drop=True, append=False, inplace=False, continue elif is_scalar(col) and col not in self: # tuples that are not keys are not considered missing, - # but as an illegal list-like (see below) + # but illegal (see below) missing.append(col) elif isinstance(col, list): depr_warn = True From e1d999b8f15adc355257dd09382ec88a10df8b06 Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Mon, 14 Jan 2019 17:31:16 +0100 Subject: [PATCH 08/14] Improve warning message (review TomAugspurger) --- pandas/core/frame.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 70e755608c386..a8204aab9db2b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4160,8 +4160,11 @@ def set_index(self, keys, drop=True, append=False, inplace=False, if missing: raise KeyError('{}'.format(missing)) if depr_warn: - msg = ('passing lists within a list to the parameter "keys" is ' - 'deprecated and will be removed in a future version.') + msg = ('Passing lists within a list to the parameter "keys" is ' + 'deprecated and will be removed in a future version. To ' + 'silence this warning, wrap the lists in a Series / Index ' + 'or np.ndarray. E.g. df.set_index(["A", [1, 2, 3]]) should ' + 'be passed as df.set_index(["A", pd.Series([1, 2, 3])).') warnings.warn(msg, FutureWarning, stacklevel=2) if inplace: From 726ef1c427ce39cd4dcca7275ed44658c1340c75 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Mon, 14 Jan 2019 17:37:55 +0100 Subject: [PATCH 09/14] typo --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a8204aab9db2b..50d798172e388 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4164,7 +4164,7 @@ def set_index(self, keys, drop=True, append=False, inplace=False, 'deprecated and will be removed in a future version. To ' 'silence this warning, wrap the lists in a Series / Index ' 'or np.ndarray. E.g. df.set_index(["A", [1, 2, 3]]) should ' - 'be passed as df.set_index(["A", pd.Series([1, 2, 3])).') + 'be passed as df.set_index(["A", pd.Series([1, 2, 3])]).') warnings.warn(msg, FutureWarning, stacklevel=2) if inplace: From b0b326fae4cce6424ab629f31b4857cb2300b550 Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Wed, 16 Jan 2019 20:19:20 +0100 Subject: [PATCH 10/14] Tuples always considered keys; KeyError, not ValueError if missing --- pandas/core/frame.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a4dc821a65789..b9d185ace388f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4145,13 +4145,11 @@ def set_index(self, keys, drop=True, append=False, inplace=False, missing = [] depr_warn = False for col in keys: - if (is_scalar(col) or isinstance(col, tuple)) and col in self: + if (is_scalar(col) or isinstance(col, tuple)): # if col is a valid column key, everything is fine - continue - elif is_scalar(col) and col not in self: - # tuples that are not keys are not considered missing, - # but illegal (see below) - missing.append(col) + # tuples are always considered keys, never as list-likes + if col not in self: + missing.append(col) elif isinstance(col, list): depr_warn = True elif (not isinstance(col, (ABCIndexClass, ABCSeries, np.ndarray)) From 02148015563e99d396712b03a5b834a0c99525b3 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Sun, 24 Feb 2019 11:25:37 +0100 Subject: [PATCH 11/14] Actually commit fix for conflict, duh --- pandas/core/frame.py | 42 ++++++++------------------- pandas/tests/frame/test_alter_axes.py | 9 ------ 2 files changed, 12 insertions(+), 39 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 8aedfb97d75c2..79233addc77e2 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4025,14 +4025,10 @@ def set_index(self, keys, drop=True, append=False, inplace=False, This parameter can be either a single column key, a single array of the same length as the calling DataFrame, or a list containing an arbitrary combination of column keys and arrays. Here, "array" -<<<<<<< HEAD - encompasses :class:`Series`, :class:`Index` and ``np.ndarray``. 
- Lists (in the sense of a sequence of values, not column labels) - have been deprecated, and will be removed in a future version. -======= encompasses :class:`Series`, :class:`Index`, ``np.ndarray``, and instances of :class:`abc.Iterator`. ->>>>>>> upstream/master + Lists (in the sense of a sequence of values, not column labels) + have been deprecated, and will be removed in a future version. drop : bool, default True Delete columns to be used as the new index. append : bool, default False @@ -4118,34 +4114,14 @@ def set_index(self, keys, drop=True, append=False, inplace=False, missing = [] depr_warn = False for col in keys: -<<<<<<< HEAD - if (is_scalar(col) or isinstance(col, tuple)): - # if col is a valid column key, everything is fine - # tuples are always considered keys, never as list-likes - if col not in self: - missing.append(col) - elif isinstance(col, list): - depr_warn = True - elif (not isinstance(col, (ABCIndexClass, ABCSeries, np.ndarray)) - or getattr(col, 'ndim', 1) > 1): - raise ValueError(err_msg) - - if missing: - raise KeyError('{}'.format(missing)) - if depr_warn: - msg = ('Passing lists within a list to the parameter "keys" is ' - 'deprecated and will be removed in a future version. To ' - 'silence this warning, wrap the lists in a Series / Index ' - 'or np.ndarray. E.g. 
df.set_index(["A", [1, 2, 3]]) should ' - 'be passed as df.set_index(["A", pd.Series([1, 2, 3])]).') - warnings.warn(msg, FutureWarning, stacklevel=2) -======= if isinstance(col, (ABCIndexClass, ABCSeries, np.ndarray, - list, Iterator)): + Iterator)): # arrays are fine as long as they are one-dimensional # iterators get converted to list below if getattr(col, 'ndim', 1) != 1: raise ValueError(err_msg) + elif isinstance(col, list): + depr_warn = True else: # everything else gets tried as a key; see GH 24969 try: @@ -4159,7 +4135,13 @@ def set_index(self, keys, drop=True, append=False, inplace=False, if missing: raise KeyError('None of {} are in the columns'.format(missing)) ->>>>>>> upstream/master + if depr_warn: + msg = ('Passing lists within a list to the parameter "keys" is ' + 'deprecated and will be removed in a future version. To ' + 'silence this warning, wrap the lists in a Series / Index ' + 'or np.ndarray. E.g. df.set_index(["A", [1, 2, 3]]) should ' + 'be passed as df.set_index(["A", pd.Series([1, 2, 3])]).') + warnings.warn(msg, FutureWarning, stacklevel=2) if inplace: frame = self diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index 51f83b870c0d8..9ea2d4258bc9c 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -279,21 +279,12 @@ def test_set_index_raise_on_type(self, frame_of_index_cols, box, df = frame_of_index_cols msg = 'The parameter "keys" may be a column key, .*' -<<<<<<< HEAD - # forbidden type, e.g. set/iter - with pytest.raises(ValueError, match=msg): - df.set_index(box(df['A']), drop=drop, append=append) - - # forbidden type in list, e.g. set/iter - with pytest.raises(ValueError, match=msg): -======= # forbidden type, e.g. set with pytest.raises(TypeError, match=msg): df.set_index(box(df['A']), drop=drop, append=append) # forbidden type in list, e.g. 
set with pytest.raises(TypeError, match=msg): ->>>>>>> upstream/master df.set_index(['A', df['A'], box(df['A'])], drop=drop, append=append) From 61c511d7a5aa92815a9159ba1423bfd0a4dc61a3 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Sun, 24 Feb 2019 11:25:48 +0100 Subject: [PATCH 12/14] Move whatsnew to 0.25 --- doc/source/whatsnew/v0.24.0.rst | 1 - doc/source/whatsnew/v0.25.0.rst | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 0592e44ca9893..a49ea2cf493a6 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1323,7 +1323,6 @@ Deprecations - In :meth:`Series.where` with Categorical data, providing an ``other`` that is not present in the categories is deprecated. Convert the categorical to a different dtype or add the ``other`` to the categories first (:issue:`24077`). - :meth:`Series.clip_lower`, :meth:`Series.clip_upper`, :meth:`DataFrame.clip_lower` and :meth:`DataFrame.clip_upper` are deprecated and will be removed in a future version. Use ``Series.clip(lower=threshold)``, ``Series.clip(upper=threshold)`` and the equivalent ``DataFrame`` methods (:issue:`24203`) - :meth:`Series.nonzero` is deprecated and will be removed in a future version (:issue:`18262`) -- :meth:`DataFrame.set_index` has deprecated using lists of values *within* lists. It remains possible to pass array-likes, both directly and within a list. - Passing an integer to :meth:`Series.fillna` and :meth:`DataFrame.fillna` with ``timedelta64[ns]`` dtypes is deprecated, will raise ``TypeError`` in a future version. Use ``obj.fillna(pd.Timedelta(...))`` instead (:issue:`24694`) - ``Series.cat.categorical``, ``Series.cat.name`` and ``Sersies.cat.index`` have been deprecated. Use the attributes on ``Series.cat`` or ``Series`` directly. (:issue:`24751`). 
- Passing a dtype without a precision like ``np.dtype('datetime64')`` or ``timedelta64`` to :class:`Index`, :class:`DatetimeIndex` and :class:`TimedeltaIndex` is now deprecated. Use the nanosecond-precision dtype instead (:issue:`24753`). diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 170e7f14da397..4141d7f67b35a 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -79,6 +79,7 @@ Deprecations ~~~~~~~~~~~~ - Deprecated the `M (months)` and `Y (year)` `units` parameter of :func: `pandas.to_timedelta`, :func: `pandas.Timedelta` and :func: `pandas.TimedeltaIndex` (:issue:`16344`) +- :meth:`DataFrame.set_index` has deprecated using lists of values *within* lists. It remains possible to pass array-likes, both directly and within a list. .. _whatsnew_0250.prior_deprecations: From 5fa544c5858725b792332a958db2f518f8fa8d12 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Fri, 1 Mar 2019 19:18:23 +0100 Subject: [PATCH 13/14] Add deprecation-section (review jreback) --- doc/source/whatsnew/v0.25.0.rst | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 0ee3f0809f212..c0e0d7908e7e6 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -93,8 +93,34 @@ Other API Changes Deprecations ~~~~~~~~~~~~ +**Lists as arrays in :meth:`DataFrame.set_index`** + +Currently, :meth:`DataFrame.set_index` accepts lists as meaning two different things - as a list of labels, and as an array-like collection of values. +This ambiguity decides in favor of the list of labels, but nested lists are interpreted as arrays: + +.. 
ipython:: ipython + :okwarning: + + df = pd.DataFrame(np.reshape(np.arange(12), (3, 4)), columns=['a', 'b', 'c', 'd']) + df.set_index(['a', 'b', 'c']) + df.set_index([['a', 'b', 'c']]) + +The latter case has now been deprecated and will be removed in a future version. As a replacement, +it is suggested to wrap the list in a :class:`Series`, :class:`Index`, ``np.array`` or an iterator. + +.. ipython:: ipython + + df.set_index(pd.Series(['a', 'b', 'c'])) + +It remains possible to use a list to collect several column keys or arrays, which become the levels of a :class:`MultiIndex`. + +.. ipython:: ipython + + df.set_index(['a', pd.Series(['a', 'b', 'c'])]) + +**Other deprecations** + - Deprecated the `M (months)` and `Y (year)` `units` parameter of :func: `pandas.to_timedelta`, :func: `pandas.Timedelta` and :func: `pandas.TimedeltaIndex` (:issue:`16344`) -- :meth:`DataFrame.set_index` has deprecated using lists of values *within* lists. It remains possible to pass array-likes, both directly and within a list. .. _whatsnew_0250.prior_deprecations: From 0c65876b7818329d7cd602ab95ab87648cdf0122 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Sun, 3 Mar 2019 21:39:55 +0100 Subject: [PATCH 14/14] Fix doc fails --- doc/source/whatsnew/v0.25.0.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 560f841ec30b1..a2537a20058d4 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -98,17 +98,18 @@ Deprecations Currently, :meth:`DataFrame.set_index` accepts lists as meaning two different things - as a list of labels, and as an array-like collection of values. This ambiguity decides in favor of the list of labels, but nested lists are interpreted as arrays: -.. ipython:: ipython +.. 
ipython:: python :okwarning: - df = pd.DataFrame(np.reshape(np.arange(12), (3, 4)), columns=['a', 'b', 'c', 'd']) + df = pd.DataFrame(np.reshape(np.arange(12), (3, 4)), + columns=['a', 'b', 'c', 'd']) df.set_index(['a', 'b', 'c']) df.set_index([['a', 'b', 'c']]) The latter case has now been deprecated and will be removed in a future version. As a replacement, it is suggested to wrap the list in a :class:`Series`, :class:`Index`, ``np.array`` or an iterator. -.. ipython:: ipython +.. ipython:: python df.set_index(pd.Series(['a', 'b', 'c']))