Skip to content

Commit 4027825

Browse files
committed
API: better warnings for df.set_index
1 parent 824e96b commit 4027825

File tree

3 files changed

+48
-33
lines changed

3 files changed

+48
-33
lines changed

doc/source/whatsnew/v0.24.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -500,6 +500,7 @@ Other API Changes
500500
- :class:`Index` subtraction will attempt to operate element-wise instead of raising ``TypeError`` (:issue:`19369`)
501501
- :class:`pandas.io.formats.style.Styler` supports a ``number-format`` property when using :meth:`~pandas.io.formats.style.Styler.to_excel` (:issue:`22015`)
502502
- :meth:`DataFrame.corr` and :meth:`Series.corr` now raise a ``ValueError`` along with a helpful error message instead of a ``KeyError`` when supplied with an invalid method (:issue:`22298`)
503+
- :meth:`DataFrame.set_index` now raises a ``TypeError`` for incorrect types, has an improved ``KeyError`` message, and will not fail on duplicate column names with ``drop=True`` (:issue:`22484`)
503504

504505
.. _whatsnew_0240.deprecations:
505506

pandas/core/frame.py

+17-17
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@
6060
is_sequence,
6161
is_named_tuple)
6262
from pandas.core.dtypes.concat import _get_sliced_frame_result_type
63+
from pandas.core.dtypes.generic import ABCSeries, ABCIndexClass, ABCMultiIndex
6364
from pandas.core.dtypes.missing import isna, notna
6465

6566

@@ -3862,26 +3863,24 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
38623863
-------
38633864
dataframe : DataFrame
38643865
"""
3865-
from pandas import Series
3866-
38673866
if not isinstance(keys, list):
38683867
keys = [keys]
38693868

3870-
# collect elements from "keys" that are not allowed array types
3871-
col_labels = [x for x in keys
3872-
if not isinstance(x, (Series, Index, MultiIndex,
3873-
list, np.ndarray))]
3869+
# collect elements from keys that are scalar (or tuples for MultiIndex)
3870+
col_labels = [x for x in keys if is_scalar(x) or isinstance(x, tuple)]
38743871
if any(x not in self for x in col_labels):
38753872
# if there are any labels that are invalid, we raise a KeyError
38763873
missing = [x for x in col_labels if x not in self]
38773874
raise KeyError('{}'.format(missing))
38783875

3879-
elif len(set(col_labels)) < len(col_labels):
3880-
# if all are valid labels, but there are duplicates
3881-
dup = Series(col_labels)
3882-
dup = list(dup.loc[dup.duplicated()])
3883-
raise ValueError('Passed duplicate column names '
3884-
'to keys: {dup}'.format(dup=dup))
3876+
# raise error for elements of keys that are neither valid column keys
3877+
# nor an array of allowed type
3878+
if any(not (is_scalar(x) or isinstance(x, tuple)) and
3879+
not isinstance(x, (ABCSeries, ABCIndexClass, ABCMultiIndex,
3880+
list, np.ndarray)) for x in keys):
3881+
raise TypeError('keys may only contain a combination of the '
3882+
'following: valid column keys, Series, Index, '
3883+
'MultiIndex, list or np.ndarray')
38853884

38863885
inplace = validate_bool_kwarg(inplace, 'inplace')
38873886

@@ -3902,21 +3901,21 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
39023901

39033902
to_remove = []
39043903
for col in keys:
3905-
if isinstance(col, MultiIndex):
3904+
if isinstance(col, ABCMultiIndex):
39063905
# append all but the last column so we don't have to modify
39073906
# the end of this loop
39083907
for n in range(col.nlevels - 1):
39093908
arrays.append(col._get_level_values(n))
39103909

39113910
level = col._get_level_values(col.nlevels - 1)
39123911
names.extend(col.names)
3913-
elif isinstance(col, Series):
3912+
elif isinstance(col, ABCSeries):
39143913
level = col._values
39153914
names.append(col.name)
3916-
elif isinstance(col, Index):
3915+
elif isinstance(col, ABCIndexClass):
39173916
level = col
39183917
names.append(col.name)
3919-
elif isinstance(col, (list, np.ndarray, Index)):
3918+
elif isinstance(col, (list, np.ndarray)):
39203919
level = col
39213920
names.append(None)
39223921
else:
@@ -3933,7 +3932,8 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
39333932
raise ValueError('Index has duplicate keys: {dup}'.format(
39343933
dup=duplicates))
39353934

3936-
for c in to_remove:
3935+
# use set to handle duplicate column names gracefully in case of drop
3936+
for c in set(to_remove):
39373937
del frame[c]
39383938

39393939
# clear up memory usage

pandas/tests/frame/test_alter_axes.py

+30-16
Original file line numberDiff line numberDiff line change
@@ -185,20 +185,21 @@ def test_set_index_pass_arrays_duplicate(self, frame_of_index_cols, drop,
185185
keys = [box1(df['A']), box2(df['A'])]
186186

187187
# == gives ambiguous Boolean for Series
188-
if keys[0] is 'A' and keys[1] is 'A':
189-
with tm.assert_raises_regex(ValueError,
190-
'Passed duplicate column names.*'):
191-
df.set_index(keys, drop=drop, append=append)
188+
if drop and keys[0] is 'A' and keys[1] is 'A':
189+
# can't drop same column twice
190+
first_drop = False
192191
else:
193-
result = df.set_index(keys, drop=drop, append=append)
192+
first_drop = drop
194193

195-
# to test against already-tested behavior, we add sequentially,
196-
# hence second append always True; must wrap in list, otherwise
197-
# list-box will be illegal
198-
expected = df.set_index([keys[0]], drop=drop, append=append)
199-
expected = expected.set_index([keys[1]], drop=drop, append=True)
194+
print(keys)
195+
# to test against already-tested behavior, we add sequentially,
196+
# hence second append always True; must wrap in list, otherwise
197+
# list-box will be illegal
198+
expected = df.set_index([keys[0]], drop=first_drop, append=append)
199+
expected = expected.set_index([keys[1]], drop=drop, append=True)
200200

201-
tm.assert_frame_equal(result, expected)
201+
result = df.set_index(keys, drop=drop, append=append)
202+
tm.assert_frame_equal(result, expected)
202203

203204
@pytest.mark.parametrize('append', [True, False])
204205
@pytest.mark.parametrize('drop', [True, False])
@@ -225,15 +226,28 @@ def test_set_index_verify_integrity(self, frame_of_index_cols):
225226
'Index has duplicate keys'):
226227
df.set_index([df['A'], df['A']], verify_integrity=True)
227228

228-
def test_set_index_raise(self, frame_of_index_cols):
229+
@pytest.mark.parametrize('append', [True, False])
230+
@pytest.mark.parametrize('drop', [True, False])
231+
def test_set_index_raise(self, frame_of_index_cols, drop, append):
229232
df = frame_of_index_cols
230233

231-
with tm.assert_raises_regex(KeyError, '.*'): # column names are A-E
232-
df.set_index(['foo', 'bar', 'baz'], verify_integrity=True)
234+
with tm.assert_raises_regex(KeyError, "['foo', 'bar', 'baz']"):
235+
# column names are A-E
236+
df.set_index(['foo', 'bar', 'baz'], drop=drop, append=append)
233237

234238
# non-existent key in list with arrays
235-
with tm.assert_raises_regex(KeyError, '.*'):
236-
df.set_index([df['A'], df['B'], 'X'], verify_integrity=True)
239+
with tm.assert_raises_regex(KeyError, 'X'):
240+
df.set_index([df['A'], df['B'], 'X'], drop=drop, append=append)
241+
242+
rgx = 'keys may only contain a combination of the following:.*'
243+
# forbidden type, e.g. iterator
244+
with tm.assert_raises_regex(TypeError, rgx):
245+
df.set_index(map(str, df['A']), drop=drop, append=append)
246+
247+
# forbidden type in list, e.g. iterator
248+
with tm.assert_raises_regex(TypeError, rgx):
249+
df.set_index(['A', df['A'], map(str, df['A'])],
250+
drop=drop, append=append)
237251

238252
def test_construction_with_categorical_index(self):
239253
ci = tm.makeCategoricalIndex(10)

0 commit comments

Comments
 (0)