Skip to content

BUG: bug in left join on multi-index with sort=True or nulls #9210

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jan 10, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.16.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ Bug Fixes

- Bug in ``MultiIndex.has_duplicates`` when having many levels causes an indexer overflow (:issue:`9075`, :issue:`5873`)
- Bug in ``pivot`` and ``unstack`` where ``nan`` values would break index alignment (:issue:`7466`)
- Bug in left ``join`` on multi-index with ``sort=True`` or null values (:issue:`9210`).



Expand Down
47 changes: 29 additions & 18 deletions pandas/tools/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
from pandas.core.categorical import Categorical
from pandas.core.frame import DataFrame, _merge_doc
from pandas.core.generic import NDFrame
from pandas.core.groupby import get_group_index
from pandas.core.series import Series
from pandas.core.index import (Index, MultiIndex, _get_combined_index,
_ensure_index, _get_consensus_names,
Expand Down Expand Up @@ -525,27 +524,39 @@ def get_result(self):
return result


def _get_multiindex_indexer(join_keys, index, sort=False):
shape = []
labels = []
for level, key in zip(index.levels, join_keys):
llab, rlab, count = _factorize_keys(level, key, sort=False)
labels.append(rlab)
shape.append(count)
def _get_multiindex_indexer(join_keys, index, sort):
from functools import partial

left_group_key = get_group_index(labels, shape)
right_group_key = get_group_index(index.labels, shape)
# bind `sort` argument
fkeys = partial(_factorize_keys, sort=sort)

left_group_key, right_group_key, max_groups = \
_factorize_keys(left_group_key, right_group_key,
sort=False)
# left & right join labels and num. of levels at each location
rlab, llab, shape = map(list, zip( * map(fkeys, index.levels, join_keys)))
if sort:
rlab = list(map(np.take, rlab, index.labels))
else:
i8copy = lambda a: a.astype('i8', subok=False, copy=True)
rlab = list(map(i8copy, index.labels))

left_indexer, right_indexer = \
algos.left_outer_join(com._ensure_int64(left_group_key),
com._ensure_int64(right_group_key),
max_groups, sort=False)
# fix right labels if there were any nulls
for i in range(len(join_keys)):
mask = index.labels[i] == -1
if mask.any():
# check if there were already any nulls at this location
# if there were, they are factorized to `shape[i] - 1`
a = join_keys[i][llab[i] == shape[i] - 1]
if a.size == 0 or not a[0] != a[0]:
shape[i] += 1

return left_indexer, right_indexer
rlab[i][mask] = shape[i] - 1

# get flat i8 join keys
lkey, rkey = _get_join_keys(llab, rlab, shape, sort)

# factorize keys to a dense i8 space
lkey, rkey, count = fkeys(lkey, rkey)

return algos.left_outer_join(lkey, rkey, count, sort=sort)


def _get_single_indexer(join_key, index, sort=False):
Expand Down
93 changes: 87 additions & 6 deletions pandas/tools/tests/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -901,14 +901,78 @@ def test_merge_on_multikey(self):
# TODO: columns aren't in the same order yet
assert_frame_equal(joined, expected.ix[:, joined.columns])

left = self.data.join(self.to_join, on=['key1', 'key2'], sort=True)
right = expected.ix[:, joined.columns].sort(['key1', 'key2'],
kind='mergesort')
assert_frame_equal(left, right)

# Checks a left join of a frame against a frame indexed by a three-level
# MultiIndex, for both sort=False and sort=True, first with fully populated
# keys and then with nulls injected into each key column.
def test_left_join_multi_index(self):
# names of the three key columns / index levels used for the join
icols = ['1st', '2nd', '3rd']

# Fold the three key columns into one numeric value per row. The left
# frame stores it as '4th' and the right frame stores its negation as
# '5th', so a correctly matched row satisfies res['4th'] == -res['5th'].
def bind_cols(df):
# `a != a` is only true for NaN, so nulls are mapped to 0 before the
# ord('a') offset is subtracted
iord = lambda a: 0 if a != a else ord(a)
f = lambda ts: ts.map(iord) - ord('a')
return f(df['1st']) + f(df['3rd'])* 1e2 + df['2nd'].fillna(0) * 1e4

# Run the join with and without sorting and verify the result.
def run_asserts(left, right):
for sort in [False, True]:
res = left.join(right, on=icols, how='left', sort=sort)

# the result must have at least as many rows as the left frame
self.assertTrue(len(left) < len(res) + 1)
# `right` is built from the rows of `left`, so every left key is
# expected to find a match (this is also asserted for null keys)
self.assertFalse(res['4th'].isnull().any())
self.assertFalse(res['5th'].isnull().any())

# matched rows carry consistent values from both sides ...
tm.assert_series_equal(res['4th'], - res['5th'])
# ... and '4th' still agrees with the key columns of the result
# (iloc[:, :-2] drops the trailing '4th' and '5th' columns)
tm.assert_series_equal(res['4th'], bind_cols(res.iloc[:, :-2]))

if sort:
# a sorted join must already be in stable key order
tm.assert_frame_equal(res,
res.sort(icols, kind='mergesort'))

# the equivalent merge on columns must give the same rows
out = merge(left, right.reset_index(), on=icols,
sort=sort, how='left')

# replace the join result's index with 0..n-1 before comparing it
# with the merge output
res.index = np.arange(len(res))
tm.assert_frame_equal(out, res)

# lower-case letters 'a'..'z' used as values for the string key columns
lc = list(map(chr, np.arange(ord('a'), ord('z') + 1)))
left = DataFrame(np.random.choice(lc, (5000, 2)),
columns=['1st', '3rd'])
# integer key column placed between the two string key columns
left.insert(1, '2nd', np.random.randint(0, 1000, len(left)))

# right frame: the rows of left in a random order
i = np.random.permutation(len(left))
right = left.iloc[i].copy()

left['4th'] = bind_cols(left)
right['5th'] = - bind_cols(right)
right.set_index(icols, inplace=True)

run_asserts(left, right)

# inject some nulls
left.loc[1::23, '1st'] = np.nan
left.loc[2::37, '2nd'] = np.nan
left.loc[3::43, '3rd'] = np.nan
# recompute '4th' so it reflects the keys after null injection
left['4th'] = bind_cols(left)

i = np.random.permutation(len(left))
# `:-1` drops the trailing '4th' column so right holds only the key columns
# NOTE(review): unlike the first half of the test there is no `.copy()`
# here; assigning '5th' below may trigger SettingWithCopyWarning -- confirm
right = left.iloc[i, :-1]
right['5th'] = - bind_cols(right)
right.set_index(icols, inplace=True)

run_asserts(left, right)

def test_merge_right_vs_left(self):
# compare left vs right merge with multikey
merged1 = self.data.merge(self.to_join, left_on=['key1', 'key2'],
right_index=True, how='left')
merged2 = self.to_join.merge(self.data, right_on=['key1', 'key2'],
left_index=True, how='right')
merged2 = merged2.ix[:, merged1.columns]
assert_frame_equal(merged1, merged2)
for sort in [False, True]:
merged1 = self.data.merge(self.to_join, left_on=['key1', 'key2'],
right_index=True, how='left', sort=sort)

merged2 = self.to_join.merge(self.data, right_on=['key1', 'key2'],
left_index=True, how='right', sort=sort)

merged2 = merged2.ix[:, merged1.columns]
assert_frame_equal(merged1, merged2)

def test_compress_group_combinations(self):

Expand Down Expand Up @@ -943,6 +1007,8 @@ def test_left_join_index_preserve_order(self):
expected.loc[(expected.k1 == 1) & (expected.k2 == 'foo'),'v2'] = 7

tm.assert_frame_equal(result, expected)
tm.assert_frame_equal(result.sort(['k1', 'k2'], kind='mergesort'),
left.join(right, on=['k1', 'k2'], sort=True))

# test join with multi dtypes blocks
left = DataFrame({'k1': [0, 1, 2] * 8,
Expand All @@ -961,6 +1027,8 @@ def test_left_join_index_preserve_order(self):
expected.loc[(expected.k1 == 1) & (expected.k2 == 'foo'),'v2'] = 7

tm.assert_frame_equal(result, expected)
tm.assert_frame_equal(result.sort(['k1', 'k2'], kind='mergesort'),
left.join(right, on=['k1', 'k2'], sort=True))

# do a right join for an extra test
joined = merge(right, left, left_index=True,
Expand Down Expand Up @@ -1022,6 +1090,12 @@ def test_left_join_index_multi_match_multiindex(self):

tm.assert_frame_equal(result, expected)

result = left.join(right, on=['cola', 'colb', 'colc'],
how='left', sort=True)

tm.assert_frame_equal(result,
expected.sort(['cola', 'colb', 'colc'], kind='mergesort'))

# GH7331 - maintain left frame order in left merge
right.reset_index(inplace=True)
right.columns = left.columns[:3].tolist() + right.columns[-1:].tolist()
Expand Down Expand Up @@ -1066,6 +1140,9 @@ def test_left_join_index_multi_match(self):

tm.assert_frame_equal(result, expected)

result = left.join(right, on='tag', how='left', sort=True)
tm.assert_frame_equal(result, expected.sort('tag', kind='mergesort'))

# GH7331 - maintain left frame order in left merge
result = merge(left, right.reset_index(), how='left', on='tag')
expected.index = np.arange(len(expected))
Expand Down Expand Up @@ -1094,6 +1171,10 @@ def _test(dtype1,dtype2):

tm.assert_frame_equal(result, expected)

result = left.join(right, on=['k1', 'k2'], sort=True)
expected.sort(['k1', 'k2'], kind='mergesort', inplace=True)
tm.assert_frame_equal(result, expected)

for d1 in [np.int64,np.int32,np.int16,np.int8,np.uint8]:
for d2 in [np.int64,np.float64,np.float32,np.float16]:
_test(np.dtype(d1),np.dtype(d2))
Expand Down