Skip to content

API: disallow duplicate level names #18882

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Dec 29, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.23.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,7 @@ Other API Changes
- A :class:`Series` of ``dtype=category`` constructed from an empty ``dict`` will now have categories of ``dtype=object`` rather than ``dtype=float64``, consistently with the case in which an empty list is passed (:issue:`18515`)
- ``NaT`` division with :class:`datetime.timedelta` will now return ``NaN`` instead of raising (:issue:`17876`)
- All-NaN levels in a ``MultiIndex`` are now assigned ``float`` rather than ``object`` dtype, promoting consistency with ``Index`` (:issue:`17929`).
- Levels names of a ``MultiIndex`` (when not None) are now required to be unique: trying to create a ``MultiIndex`` with repeated names will raise a ``ValueError`` (:issue:`18872`)
- :class:`Timestamp` will no longer silently ignore unused or invalid ``tz`` or ``tzinfo`` keyword arguments (:issue:`17690`)
- :class:`Timestamp` will no longer silently ignore invalid ``freq`` arguments (:issue:`5168`)
- :class:`CacheableOffset` and :class:`WeekDay` are no longer available in the ``pandas.tseries.offsets`` module (:issue:`17830`)
Expand Down
15 changes: 8 additions & 7 deletions pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -579,23 +579,24 @@ def _set_names(self, names, level=None, validate=True):

if level is None:
level = range(self.nlevels)
used = {}
else:
level = [self._get_level_number(l) for l in level]
used = {self.levels[l].name: l
for l in set(range(self.nlevels)) - set(level)}

# set the name
for l, name in zip(level, names):
if name is not None and name in used:
raise ValueError('Duplicated level name: "{}", assigned to '
'level {}, is already used for level '
'{}.'.format(name, l, used[name]))
self.levels[l].rename(name, inplace=True)
used[name] = l

names = property(fset=_set_names, fget=_get_names,
doc="Names of levels in MultiIndex")

def _reference_duplicate_name(self, name):
"""
Returns True if the name refered to in self.names is duplicated.
"""
# count the times name equals an element in self.names.
return sum(name == n for n in self.names) > 1

def _format_native_types(self, na_rep='nan', **kwargs):
new_levels = []
new_labels = []
Expand Down
11 changes: 0 additions & 11 deletions pandas/core/reshape/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,12 +91,6 @@ def __init__(self, values, index, level=-1, value_columns=None,

self.index = index

if isinstance(self.index, MultiIndex):
if index._reference_duplicate_name(level):
msg = ("Ambiguous reference to {level}. The index "
"names are not unique.".format(level=level))
raise ValueError(msg)

self.level = self.index._get_level_number(level)

# when index includes `nan`, need to lift levels/strides by 1
Expand Down Expand Up @@ -502,11 +496,6 @@ def factorize(index):
return categories, codes

N, K = frame.shape
if isinstance(frame.columns, MultiIndex):
if frame.columns._reference_duplicate_name(level):
msg = ("Ambiguous reference to {level}. The column "
"names are not unique.".format(level=level))
raise ValueError(msg)

# Will also convert negative level numbers and check if out of bounds.
level_num = frame.columns._get_level_number(level)
Expand Down
36 changes: 19 additions & 17 deletions pandas/tests/frame/test_alter_axes.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,20 @@ def test_set_index2(self):
result = df.set_index(df.C)
assert result.index.name == 'C'

@pytest.mark.parametrize('level', ['a', pd.Series(range(3), name='a')])
def test_set_index_duplicate_names(self, level):
# GH18872
df = pd.DataFrame(np.arange(8).reshape(4, 2), columns=['a', 'b'])

# Pass an existing level name:
df.index.name = 'a'
pytest.raises(ValueError, df.set_index, level, append=True)
pytest.raises(ValueError, df.set_index, [level], append=True)

# Pass twice the same level name:
df.index.name = 'c'
pytest.raises(ValueError, df.set_index, [level, level])

def test_set_index_nonuniq(self):
df = DataFrame({'A': ['foo', 'foo', 'foo', 'bar', 'bar'],
'B': ['one', 'two', 'three', 'one', 'two'],
Expand Down Expand Up @@ -591,19 +605,6 @@ def test_reorder_levels(self):
index=e_idx)
assert_frame_equal(result, expected)

result = df.reorder_levels([0, 0, 0])
e_idx = MultiIndex(levels=[['bar'], ['bar'], ['bar']],
labels=[[0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0]],
names=['L0', 'L0', 'L0'])
expected = DataFrame({'A': np.arange(6), 'B': np.arange(6)},
index=e_idx)
assert_frame_equal(result, expected)

result = df.reorder_levels(['L0', 'L0', 'L0'])
assert_frame_equal(result, expected)

def test_reset_index(self):
stacked = self.frame.stack()[::2]
stacked = DataFrame({'foo': stacked, 'bar': stacked})
Expand Down Expand Up @@ -831,7 +832,7 @@ def test_set_index_names(self):

mi = MultiIndex.from_arrays(df[['A', 'B']].T.values, names=['A', 'B'])
mi2 = MultiIndex.from_arrays(df[['A', 'B', 'A', 'B']].T.values,
names=['A', 'B', 'A', 'B'])
names=['A', 'B', 'C', 'D'])

df = df.set_index(['A', 'B'])

Expand All @@ -843,13 +844,14 @@ def test_set_index_names(self):
# Check actual equality
tm.assert_index_equal(df.set_index(df.index).index, mi)

idx2 = df.index.rename(['C', 'D'])

# Check that [MultiIndex, MultiIndex] yields a MultiIndex rather
# than a pair of tuples
assert isinstance(df.set_index(
[df.index, df.index]).index, MultiIndex)
assert isinstance(df.set_index([df.index, idx2]).index, MultiIndex)

# Check equality
tm.assert_index_equal(df.set_index([df.index, df.index]).index, mi2)
tm.assert_index_equal(df.set_index([df.index, idx2]).index, mi2)

def test_rename_objects(self):
renamed = self.mixed_frame.rename(columns=str.upper)
Expand Down
10 changes: 0 additions & 10 deletions pandas/tests/frame/test_reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -560,16 +560,6 @@ def test_unstack_dtypes(self):
assert left.shape == (3, 2)
tm.assert_frame_equal(left, right)

def test_unstack_non_unique_index_names(self):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what does this do now?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It raises when you try to create the MultiIndex with duplicated name.

idx = MultiIndex.from_tuples([('a', 'b'), ('c', 'd')],
names=['c1', 'c1'])
df = DataFrame([1, 2], index=idx)
with pytest.raises(ValueError):
df.unstack('c1')

with pytest.raises(ValueError):
df.T.stack('c1')

def test_unstack_nan_index(self): # GH7466
cast = lambda val: '{0:1}'.format('' if val != val else val)
nan = np.nan
Expand Down
8 changes: 6 additions & 2 deletions pandas/tests/groupby/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -388,15 +388,19 @@ def test_groupby_multi_categorical_as_index(self):
columns=['cat', 'A', 'B'])
tm.assert_frame_equal(result, expected)

# another not in-axis grouper (conflicting names in index)
s = Series(['a', 'b', 'b'], name='cat')
# another not in-axis grouper
s = Series(['a', 'b', 'b'], name='cat2')
result = df.groupby(['cat', s], as_index=False).sum()
expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]),
'A': [10.0, nan, nan, 22.0, nan, nan],
'B': [101.0, nan, nan, 205.0, nan, nan]},
columns=['cat', 'A', 'B'])
tm.assert_frame_equal(result, expected)

# GH18872: conflicting names in desired index
pytest.raises(ValueError, lambda: df.groupby(['cat',
s.rename('cat')]).sum())

# is original index dropped?
expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]),
'A': [10, 11, 10, 11, 10, 11],
Expand Down
31 changes: 17 additions & 14 deletions pandas/tests/indexes/test_multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -536,15 +536,6 @@ def test_names(self):
level_names = [level.name for level in index.levels]
assert ind_names == level_names

def test_reference_duplicate_name(self):
idx = MultiIndex.from_tuples(
[('a', 'b'), ('c', 'd')], names=['x', 'x'])
assert idx._reference_duplicate_name('x')

idx = MultiIndex.from_tuples(
[('a', 'b'), ('c', 'd')], names=['x', 'y'])
assert not idx._reference_duplicate_name('x')

def test_astype(self):
expected = self.index.copy()
actual = self.index.astype('O')
Expand Down Expand Up @@ -609,6 +600,23 @@ def test_constructor_mismatched_label_levels(self):
with tm.assert_raises_regex(ValueError, label_error):
self.index.copy().set_labels([[0, 0, 0, 0], [0, 0]])

@pytest.mark.parametrize('names', [['a', 'b', 'a'], [1, 1, 2],
[1, 'a', 1]])
def test_duplicate_level_names(self, names):
# GH18872
pytest.raises(ValueError, pd.MultiIndex.from_product,
[[0, 1]] * 3, names=names)

# With .rename()
mi = pd.MultiIndex.from_product([[0, 1]] * 3)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you check the error messages here, use tm.assert_raises_regex

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(done)

tm.assert_raises_regex(ValueError, "Duplicated level name:",
mi.rename, names)

# With .rename(., level=)
mi.rename(names[0], level=1, inplace=True)
tm.assert_raises_regex(ValueError, "Duplicated level name:",
mi.rename, names[:2], level=[0, 2])

def assert_multiindex_copied(self, copy, original):
# Levels should be (at least, shallow copied)
tm.assert_copy(copy.levels, original.levels)
Expand Down Expand Up @@ -667,11 +675,6 @@ def test_changing_names(self):
shallow_copy.names = [name + "c" for name in shallow_copy.names]
self.check_level_names(self.index, new_names)

def test_duplicate_names(self):
self.index.names = ['foo', 'foo']
tm.assert_raises_regex(KeyError, 'Level foo not found',
self.index._get_level_number, 'foo')

def test_get_level_number_integer(self):
self.index.names = [1, 0]
assert self.index._get_level_number(1) == 0
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/io/formats/test_to_latex.py
Original file line number Diff line number Diff line change
Expand Up @@ -589,8 +589,8 @@ def test_to_latex_no_bold_rows(self):
"""
assert observed == expected

@pytest.mark.parametrize('name0', [None, 'named'])
@pytest.mark.parametrize('name1', [None, 'named'])
@pytest.mark.parametrize('name0', [None, 'named0'])
@pytest.mark.parametrize('name1', [None, 'named1'])
@pytest.mark.parametrize('axes', [[0], [1], [0, 1]])
def test_to_latex_multiindex_names(self, name0, name1, axes):
# GH 18667
Expand Down
6 changes: 0 additions & 6 deletions pandas/tests/io/test_pytables.py
Original file line number Diff line number Diff line change
Expand Up @@ -1908,12 +1908,6 @@ def make_index(names=None):
'a', 'b'], index=make_index(['date', 'a', 't']))
pytest.raises(ValueError, store.append, 'df', df)

# dup within level
_maybe_remove(store, 'df')
df = DataFrame(np.zeros((12, 2)), columns=['a', 'b'],
index=make_index(['date', 'date', 'date']))
pytest.raises(ValueError, store.append, 'df', df)

# fully names
_maybe_remove(store, 'df')
df = DataFrame(np.zeros((12, 2)), columns=[
Expand Down
9 changes: 2 additions & 7 deletions pandas/tests/reshape/test_pivot.py
Original file line number Diff line number Diff line change
Expand Up @@ -1623,14 +1623,9 @@ def test_crosstab_with_numpy_size(self):
tm.assert_frame_equal(result, expected)

def test_crosstab_dup_index_names(self):
# GH 13279
# GH 13279, GH 18872
s = pd.Series(range(3), name='foo')
result = pd.crosstab(s, s)
expected_index = pd.Index(range(3), name='foo')
expected = pd.DataFrame(np.eye(3, dtype=np.int64),
index=expected_index,
columns=expected_index)
tm.assert_frame_equal(result, expected)
pytest.raises(ValueError, pd.crosstab, s, s)

@pytest.mark.parametrize("names", [['a', ('b', 'c')],
[('a', 'b'), 'c']])
Expand Down
11 changes: 0 additions & 11 deletions pandas/tests/series/test_alter_axes.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,17 +214,6 @@ def test_reorder_levels(self):
expected = Series(np.arange(6), index=e_idx)
assert_series_equal(result, expected)

result = s.reorder_levels([0, 0, 0])
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add a test that raises

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is the new test_set_index_duplicate_names (the constructor itself raises).

e_idx = MultiIndex(levels=[['bar'], ['bar'], ['bar']],
labels=[[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0]],
names=['L0', 'L0', 'L0'])
expected = Series(np.arange(6), index=e_idx)
assert_series_equal(result, expected)

result = s.reorder_levels(['L0', 'L0', 'L0'])
assert_series_equal(result, expected)

def test_rename_axis_inplace(self):
# GH 15704
series = self.ts.copy()
Expand Down