diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 5e55efb4e21fb..df50624a9fb2f 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -184,6 +184,7 @@ Other API Changes - A :class:`Series` of ``dtype=category`` constructed from an empty ``dict`` will now have categories of ``dtype=object`` rather than ``dtype=float64``, consistently with the case in which an empty list is passed (:issue:`18515`) - ``NaT`` division with :class:`datetime.timedelta` will now return ``NaN`` instead of raising (:issue:`17876`) - All-NaN levels in a ``MultiIndex`` are now assigned ``float`` rather than ``object`` dtype, promoting consistency with ``Index`` (:issue:`17929`). +- Levels names of a ``MultiIndex`` (when not None) are now required to be unique: trying to create a ``MultiIndex`` with repeated names will raise a ``ValueError`` (:issue:`18872`) - :class:`Timestamp` will no longer silently ignore unused or invalid ``tz`` or ``tzinfo`` keyword arguments (:issue:`17690`) - :class:`Timestamp` will no longer silently ignore invalid ``freq`` arguments (:issue:`5168`) - :class:`CacheableOffset` and :class:`WeekDay` are no longer available in the ``pandas.tseries.offsets`` module (:issue:`17830`) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index c20c6e1f75a24..f4c4f91d2cc57 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -579,23 +579,24 @@ def _set_names(self, names, level=None, validate=True): if level is None: level = range(self.nlevels) + used = {} else: level = [self._get_level_number(l) for l in level] + used = {self.levels[l].name: l + for l in set(range(self.nlevels)) - set(level)} # set the name for l, name in zip(level, names): + if name is not None and name in used: + raise ValueError('Duplicated level name: "{}", assigned to ' + 'level {}, is already used for level ' + '{}.'.format(name, l, used[name])) self.levels[l].rename(name, inplace=True) + used[name] = l names = property(fset=_set_names, fget=_get_names, doc="Names of levels in MultiIndex") - def _reference_duplicate_name(self, name): - """ - Returns True if the name refered to in self.names is duplicated. - """ - # count the times name equals an element in self.names. - return sum(name == n for n in self.names) > 1 - def _format_native_types(self, na_rep='nan', **kwargs): new_levels = [] new_labels = [] diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 320ad109f01ba..1ca014baa9ec8 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -91,12 +91,6 @@ def __init__(self, values, index, level=-1, value_columns=None, self.index = index - if isinstance(self.index, MultiIndex): - if index._reference_duplicate_name(level): - msg = ("Ambiguous reference to {level}. The index " - "names are not unique.".format(level=level)) - raise ValueError(msg) - self.level = self.index._get_level_number(level) # when index includes `nan`, need to lift levels/strides by 1 @@ -502,11 +496,6 @@ def factorize(index): return categories, codes N, K = frame.shape - if isinstance(frame.columns, MultiIndex): - if frame.columns._reference_duplicate_name(level): - msg = ("Ambiguous reference to {level}. The column " - "names are not unique.".format(level=level)) - raise ValueError(msg) # Will also convert negative level numbers and check if out of bounds. level_num = frame.columns._get_level_number(level) diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index e7ea3f9c62540..6e3b7a059fd49 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -130,6 +130,20 @@ def test_set_index2(self): result = df.set_index(df.C) assert result.index.name == 'C' + @pytest.mark.parametrize('level', ['a', pd.Series(range(3), name='a')]) + def test_set_index_duplicate_names(self, level): + # GH18872 + df = pd.DataFrame(np.arange(8).reshape(4, 2), columns=['a', 'b']) + + # Pass an existing level name: + df.index.name = 'a' + pytest.raises(ValueError, df.set_index, level, append=True) + pytest.raises(ValueError, df.set_index, [level], append=True) + + # Pass twice the same level name: + df.index.name = 'c' + pytest.raises(ValueError, df.set_index, [level, level]) + def test_set_index_nonuniq(self): df = DataFrame({'A': ['foo', 'foo', 'foo', 'bar', 'bar'], 'B': ['one', 'two', 'three', 'one', 'two'], @@ -591,19 +605,6 @@ def test_reorder_levels(self): index=e_idx) assert_frame_equal(result, expected) - result = df.reorder_levels([0, 0, 0]) - e_idx = MultiIndex(levels=[['bar'], ['bar'], ['bar']], - labels=[[0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0]], - names=['L0', 'L0', 'L0']) - expected = DataFrame({'A': np.arange(6), 'B': np.arange(6)}, - index=e_idx) - assert_frame_equal(result, expected) - - result = df.reorder_levels(['L0', 'L0', 'L0']) - assert_frame_equal(result, expected) - def test_reset_index(self): stacked = self.frame.stack()[::2] stacked = DataFrame({'foo': stacked, 'bar': stacked}) @@ -831,7 +832,7 @@ def test_set_index_names(self): mi = MultiIndex.from_arrays(df[['A', 'B']].T.values, names=['A', 'B']) mi2 = MultiIndex.from_arrays(df[['A', 'B', 'A', 'B']].T.values, - names=['A', 'B', 'A', 'B']) + names=['A', 'B', 'C', 'D']) df = df.set_index(['A', 'B']) @@ -843,13 +844,14 @@ def test_set_index_names(self): # Check actual equality tm.assert_index_equal(df.set_index(df.index).index, mi) + idx2 = df.index.rename(['C', 'D']) + # Check that [MultiIndex, MultiIndex] yields a MultiIndex rather # than a pair of tuples - assert isinstance(df.set_index( - [df.index, df.index]).index, MultiIndex) + assert isinstance(df.set_index([df.index, idx2]).index, MultiIndex) # Check equality - tm.assert_index_equal(df.set_index([df.index, df.index]).index, mi2) + tm.assert_index_equal(df.set_index([df.index, idx2]).index, mi2) def test_rename_objects(self): renamed = self.mixed_frame.rename(columns=str.upper) diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py index f34d25142a057..5ff4f58774322 100644 --- a/pandas/tests/frame/test_reshape.py +++ b/pandas/tests/frame/test_reshape.py @@ -560,16 +560,6 @@ def test_unstack_dtypes(self): assert left.shape == (3, 2) tm.assert_frame_equal(left, right) - def test_unstack_non_unique_index_names(self): - idx = MultiIndex.from_tuples([('a', 'b'), ('c', 'd')], - names=['c1', 'c1']) - df = DataFrame([1, 2], index=idx) - with pytest.raises(ValueError): - df.unstack('c1') - - with pytest.raises(ValueError): - df.T.stack('c1') - def test_unstack_nan_index(self): # GH7466 cast = lambda val: '{0:1}'.format('' if val != val else val) nan = np.nan diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 5e3d2bb9cf091..12f5b98fb64f4 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -388,8 +388,8 @@ def test_groupby_multi_categorical_as_index(self): columns=['cat', 'A', 'B']) tm.assert_frame_equal(result, expected) - # another not in-axis grouper (conflicting names in index) - s = Series(['a', 'b', 'b'], name='cat') + # another not in-axis grouper + s = Series(['a', 'b', 'b'], name='cat2') result = df.groupby(['cat', s], as_index=False).sum() expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]), 'A': [10.0, nan, nan, 22.0, nan, nan], @@ -397,6 +397,10 @@ def test_groupby_multi_categorical_as_index(self): columns=['cat', 'A', 'B']) tm.assert_frame_equal(result, expected) + # GH18872: conflicting names in desired index + pytest.raises(ValueError, lambda: df.groupby(['cat', + s.rename('cat')]).sum()) + # is original index dropped? expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]), 'A': [10, 11, 10, 11, 10, 11], diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 4d6e543851d4f..2a7c020f4c9e9 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -536,15 +536,6 @@ def test_names(self): level_names = [level.name for level in index.levels] assert ind_names == level_names - def test_reference_duplicate_name(self): - idx = MultiIndex.from_tuples( - [('a', 'b'), ('c', 'd')], names=['x', 'x']) - assert idx._reference_duplicate_name('x') - - idx = MultiIndex.from_tuples( - [('a', 'b'), ('c', 'd')], names=['x', 'y']) - assert not idx._reference_duplicate_name('x') - def test_astype(self): expected = self.index.copy() actual = self.index.astype('O') @@ -609,6 +600,23 @@ def test_constructor_mismatched_label_levels(self): with tm.assert_raises_regex(ValueError, label_error): self.index.copy().set_labels([[0, 0, 0, 0], [0, 0]]) + @pytest.mark.parametrize('names', [['a', 'b', 'a'], [1, 1, 2], + [1, 'a', 1]]) + def test_duplicate_level_names(self, names): + # GH18872 + pytest.raises(ValueError, pd.MultiIndex.from_product, + [[0, 1]] * 3, names=names) + + # With .rename() + mi = pd.MultiIndex.from_product([[0, 1]] * 3) + tm.assert_raises_regex(ValueError, "Duplicated level name:", + mi.rename, names) + + # With .rename(., level=) + mi.rename(names[0], level=1, inplace=True) + tm.assert_raises_regex(ValueError, "Duplicated level name:", + mi.rename, names[:2], level=[0, 2]) + def assert_multiindex_copied(self, copy, original): # Levels should be (at least, shallow copied) tm.assert_copy(copy.levels, original.levels) @@ -667,11 +675,6 @@ def test_changing_names(self): shallow_copy.names = [name + "c" for name in shallow_copy.names] self.check_level_names(self.index, new_names) - def test_duplicate_names(self): - self.index.names = ['foo', 'foo'] - tm.assert_raises_regex(KeyError, 'Level foo not found', - self.index._get_level_number, 'foo') - def test_get_level_number_integer(self): self.index.names = [1, 0] assert self.index._get_level_number(1) == 0 diff --git a/pandas/tests/io/formats/test_to_latex.py b/pandas/tests/io/formats/test_to_latex.py index bedafccca5798..2f8ef32722051 100644 --- a/pandas/tests/io/formats/test_to_latex.py +++ b/pandas/tests/io/formats/test_to_latex.py @@ -589,8 +589,8 @@ def test_to_latex_no_bold_rows(self): """ assert observed == expected - @pytest.mark.parametrize('name0', [None, 'named']) - @pytest.mark.parametrize('name1', [None, 'named']) + @pytest.mark.parametrize('name0', [None, 'named0']) + @pytest.mark.parametrize('name1', [None, 'named1']) @pytest.mark.parametrize('axes', [[0], [1], [0, 1]]) def test_to_latex_multiindex_names(self, name0, name1, axes): # GH 18667 diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py index 47be8d115a07e..305c1ebcedc6f 100644 --- a/pandas/tests/io/test_pytables.py +++ b/pandas/tests/io/test_pytables.py @@ -1908,12 +1908,6 @@ def make_index(names=None): 'a', 'b'], index=make_index(['date', 'a', 't'])) pytest.raises(ValueError, store.append, 'df', df) - # dup within level - _maybe_remove(store, 'df') - df = DataFrame(np.zeros((12, 2)), columns=['a', 'b'], - index=make_index(['date', 'date', 'date'])) - pytest.raises(ValueError, store.append, 'df', df) - # fully names _maybe_remove(store, 'df') df = DataFrame(np.zeros((12, 2)), columns=[ diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 5b64f62527da4..786c57a4a82df 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1623,14 +1623,9 @@ def test_crosstab_with_numpy_size(self): tm.assert_frame_equal(result, expected) def test_crosstab_dup_index_names(self): - # GH 13279 + # GH 13279, GH 18872 s = pd.Series(range(3), name='foo') - result = pd.crosstab(s, s) - expected_index = pd.Index(range(3), name='foo') - expected = pd.DataFrame(np.eye(3, dtype=np.int64), - index=expected_index, - columns=expected_index) - tm.assert_frame_equal(result, expected) + pytest.raises(ValueError, pd.crosstab, s, s) @pytest.mark.parametrize("names", [['a', ('b', 'c')], [('a', 'b'), 'c']]) diff --git a/pandas/tests/series/test_alter_axes.py b/pandas/tests/series/test_alter_axes.py index f3be7bb9905f4..714e43a4af1f8 100644 --- a/pandas/tests/series/test_alter_axes.py +++ b/pandas/tests/series/test_alter_axes.py @@ -214,17 +214,6 @@ def test_reorder_levels(self): expected = Series(np.arange(6), index=e_idx) assert_series_equal(result, expected) - result = s.reorder_levels([0, 0, 0]) - e_idx = MultiIndex(levels=[['bar'], ['bar'], ['bar']], - labels=[[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0]], - names=['L0', 'L0', 'L0']) - expected = Series(np.arange(6), index=e_idx) - assert_series_equal(result, expected) - - result = s.reorder_levels(['L0', 'L0', 'L0']) - assert_series_equal(result, expected) - def test_rename_axis_inplace(self): # GH 15704 series = self.ts.copy()