API: better warnings for df.set_index

h-vetinari · h-vetinari · commit 40278255dfe2 · 2018-08-23T17:24:29.000+02:00
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
@@ -500,6 +500,7 @@ Other API Changes
 - :class:`Index` subtraction will attempt to operate element-wise instead of raising ``TypeError`` (:issue:`19369`)
 - :class:`pandas.io.formats.style.Styler` supports a ``number-format`` property when using :meth:`~pandas.io.formats.style.Styler.to_excel` (:issue:`22015`)
 - :meth:`DataFrame.corr` and :meth:`Series.corr` now raise a ``ValueError`` along with a helpful error message instead of a ``KeyError`` when supplied with an invalid method (:issue:`22298`)
+- :meth:`DataFrame.set_index` now raises a ``TypeError`` for incorrect types, has an improved ``KeyError`` message, and will not fail on duplicate column names with ``drop=True`` (:issue:`22484`)
 
 .. _whatsnew_0240.deprecations:
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -60,6 +60,7 @@
     is_sequence,
     is_named_tuple)
 from pandas.core.dtypes.concat import _get_sliced_frame_result_type
+from pandas.core.dtypes.generic import ABCSeries, ABCIndexClass, ABCMultiIndex
 from pandas.core.dtypes.missing import isna, notna
 
 
@@ -3862,26 +3863,24 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
         -------
         dataframe : DataFrame
         """
-        from pandas import Series
-
         if not isinstance(keys, list):
             keys = [keys]
 
-        # collect elements from "keys" that are not allowed array types
-        col_labels = [x for x in keys
-                      if not isinstance(x, (Series, Index, MultiIndex,
-                                            list, np.ndarray))]
+        # collect elements from keys that are scalar (or tuples for MultiIndex)
+        col_labels = [x for x in keys if is_scalar(x) or isinstance(x, tuple)]
         if any(x not in self for x in col_labels):
             # if there are any labels that are invalid, we raise a KeyError
             missing = [x for x in col_labels if x not in self]
             raise KeyError('{}'.format(missing))
 
-        elif len(set(col_labels)) < len(col_labels):
-            # if all are valid labels, but there are duplicates
-            dup = Series(col_labels)
-            dup = list(dup.loc[dup.duplicated()])
-            raise ValueError('Passed duplicate column names '
-                             'to keys: {dup}'.format(dup=dup))
+        # raise error for elements of keys that are neither valid column keys
+        # nor an array of allowed type
+        if any(not (is_scalar(x) or isinstance(x, tuple)) and
+               not isinstance(x, (ABCSeries, ABCIndexClass, ABCMultiIndex,
+                                  list, np.ndarray)) for x in keys):
+            raise TypeError('keys may only contain a combination of the '
+                            'following: valid column keys, Series, Index, '
+                            'MultiIndex, list or np.ndarray')
 
         inplace = validate_bool_kwarg(inplace, 'inplace')
 
@@ -3902,21 +3901,21 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
 
         to_remove = []
         for col in keys:
-            if isinstance(col, MultiIndex):
+            if isinstance(col, ABCMultiIndex):
                 # append all but the last column so we don't have to modify
                 # the end of this loop
                 for n in range(col.nlevels - 1):
                     arrays.append(col._get_level_values(n))
 
                 level = col._get_level_values(col.nlevels - 1)
                 names.extend(col.names)
-            elif isinstance(col, Series):
+            elif isinstance(col, ABCSeries):
                 level = col._values
                 names.append(col.name)
-            elif isinstance(col, Index):
+            elif isinstance(col, ABCIndexClass):
                 level = col
                 names.append(col.name)
-            elif isinstance(col, (list, np.ndarray, Index)):
+            elif isinstance(col, (list, np.ndarray)):
                 level = col
                 names.append(None)
             else:
@@ -3933,7 +3932,8 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
             raise ValueError('Index has duplicate keys: {dup}'.format(
                 dup=duplicates))
 
-        for c in to_remove:
+        # use set to handle duplicate column names gracefully in case of drop
+        for c in set(to_remove):
             del frame[c]
 
         # clear up memory usage
diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py
@@ -185,20 +185,21 @@ def test_set_index_pass_arrays_duplicate(self, frame_of_index_cols, drop,
         keys = [box1(df['A']), box2(df['A'])]
 
         # == gives ambiguous Boolean for Series
-        if keys[0] is 'A' and keys[1] is 'A':
-            with tm.assert_raises_regex(ValueError,
-                                        'Passed duplicate column names.*'):
-                df.set_index(keys, drop=drop, append=append)
+        if drop and keys[0] is 'A' and keys[1] is 'A':
+            # can't drop same column twice
+            first_drop = False
         else:
-            result = df.set_index(keys, drop=drop, append=append)
+            first_drop = drop
 
-            # to test against already-tested behavior, we add sequentially,
-            # hence second append always True; must wrap in list, otherwise
-            # list-box will be illegal
-            expected = df.set_index([keys[0]], drop=drop, append=append)
-            expected = expected.set_index([keys[1]], drop=drop, append=True)
+        print(keys)
+        # to test against already-tested behavior, we add sequentially,
+        # hence second append always True; must wrap in list, otherwise
+        # list-box will be illegal
+        expected = df.set_index([keys[0]], drop=first_drop, append=append)
+        expected = expected.set_index([keys[1]], drop=drop, append=True)
 
-            tm.assert_frame_equal(result, expected)
+        result = df.set_index(keys, drop=drop, append=append)
+        tm.assert_frame_equal(result, expected)
 
     @pytest.mark.parametrize('append', [True, False])
     @pytest.mark.parametrize('drop', [True, False])
@@ -225,15 +226,28 @@ def test_set_index_verify_integrity(self, frame_of_index_cols):
                                     'Index has duplicate keys'):
             df.set_index([df['A'], df['A']], verify_integrity=True)
 
-    def test_set_index_raise(self, frame_of_index_cols):
+    @pytest.mark.parametrize('append', [True, False])
+    @pytest.mark.parametrize('drop', [True, False])
+    def test_set_index_raise(self, frame_of_index_cols, drop, append):
         df = frame_of_index_cols
 
-        with tm.assert_raises_regex(KeyError, '.*'):  # column names are A-E
-            df.set_index(['foo', 'bar', 'baz'], verify_integrity=True)
+        with tm.assert_raises_regex(KeyError, "['foo', 'bar', 'baz']"):
+            # column names are A-E
+            df.set_index(['foo', 'bar', 'baz'], drop=drop, append=append)
 
         # non-existent key in list with arrays
-        with tm.assert_raises_regex(KeyError, '.*'):
-            df.set_index([df['A'], df['B'], 'X'], verify_integrity=True)
+        with tm.assert_raises_regex(KeyError, 'X'):
+            df.set_index([df['A'], df['B'], 'X'], drop=drop, append=append)
+
+        rgx = 'keys may only contain a combination of the following:.*'
+        # forbidden type, e.g. iterator
+        with tm.assert_raises_regex(TypeError, rgx):
+            df.set_index(map(str, df['A']), drop=drop, append=append)
+
+        # forbidden type in list, e.g. iterator
+        with tm.assert_raises_regex(TypeError, rgx):
+            df.set_index(['A', df['A'], map(str, df['A'])],
+                         drop=drop, append=append)
 
     def test_construction_with_categorical_index(self):
         ci = tm.makeCategoricalIndex(10)