From 6ca893f5d071a82bd7e2d90f32b3b6ecf50bcaea Mon Sep 17 00:00:00 2001 From: behzad nouri Date: Wed, 7 Jan 2015 19:52:11 -0500 Subject: [PATCH] bug in left join on multi-index with sort=True or nulls --- doc/source/whatsnew/v0.16.0.txt | 1 + pandas/tools/merge.py | 47 +++++++++------- pandas/tools/tests/test_merge.py | 93 +++++++++++++++++++++++++++++--- 3 files changed, 117 insertions(+), 24 deletions(-) diff --git a/doc/source/whatsnew/v0.16.0.txt b/doc/source/whatsnew/v0.16.0.txt index 839a055bf2a63..e878851233be1 100644 --- a/doc/source/whatsnew/v0.16.0.txt +++ b/doc/source/whatsnew/v0.16.0.txt @@ -99,6 +99,7 @@ Bug Fixes - Bug in ``MultiIndex.has_duplicates`` when having many levels causes an indexer overflow (:issue:`9075`, :issue:`5873`) - Bug in ``pivot`` and `unstack`` where ``nan`` values would break index alignment (:issue:`7466`) +- Bug in left ``join`` on multi-index with ``sort=True`` or null values (:issue:`9210`). diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index 56eb8c68ad275..27e4845e3faee 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -9,7 +9,6 @@ from pandas.core.categorical import Categorical from pandas.core.frame import DataFrame, _merge_doc from pandas.core.generic import NDFrame -from pandas.core.groupby import get_group_index from pandas.core.series import Series from pandas.core.index import (Index, MultiIndex, _get_combined_index, _ensure_index, _get_consensus_names, @@ -525,27 +524,39 @@ def get_result(self): return result -def _get_multiindex_indexer(join_keys, index, sort=False): - shape = [] - labels = [] - for level, key in zip(index.levels, join_keys): - llab, rlab, count = _factorize_keys(level, key, sort=False) - labels.append(rlab) - shape.append(count) +def _get_multiindex_indexer(join_keys, index, sort): + from functools import partial - left_group_key = get_group_index(labels, shape) - right_group_key = get_group_index(index.labels, shape) + # bind `sort` argument + fkeys = partial(_factorize_keys, sort=sort) - left_group_key, right_group_key, max_groups = \ - _factorize_keys(left_group_key, right_group_key, - sort=False) + # left & right join labels and num. of levels at each location + rlab, llab, shape = map(list, zip( * map(fkeys, index.levels, join_keys))) + if sort: + rlab = list(map(np.take, rlab, index.labels)) + else: + i8copy = lambda a: a.astype('i8', subok=False, copy=True) + rlab = list(map(i8copy, index.labels)) - left_indexer, right_indexer = \ - algos.left_outer_join(com._ensure_int64(left_group_key), - com._ensure_int64(right_group_key), - max_groups, sort=False) + # fix right labels if there were any nulls + for i in range(len(join_keys)): + mask = index.labels[i] == -1 + if mask.any(): + # check if there already was any nulls at this location + # if there was, it is factorized to `shape[i] - 1` + a = join_keys[i][llab[i] == shape[i] - 1] + if a.size == 0 or not a[0] != a[0]: + shape[i] += 1 - return left_indexer, right_indexer + rlab[i][mask] = shape[i] - 1 + + # get flat i8 join keys + lkey, rkey = _get_join_keys(llab, rlab, shape, sort) + + # factorize keys to a dense i8 space + lkey, rkey, count = fkeys(lkey, rkey) + + return algos.left_outer_join(lkey, rkey, count, sort=sort) def _get_single_indexer(join_key, index, sort=False): diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py index 58d14154f0190..cf5cc4661ec52 100644 --- a/pandas/tools/tests/test_merge.py +++ b/pandas/tools/tests/test_merge.py @@ -901,14 +901,78 @@ def test_merge_on_multikey(self): # TODO: columns aren't in the same order yet assert_frame_equal(joined, expected.ix[:, joined.columns]) + left = self.data.join(self.to_join, on=['key1', 'key2'], sort=True) + right = expected.ix[:, joined.columns].sort(['key1', 'key2'], + kind='mergesort') + assert_frame_equal(left, right) + + def test_left_join_multi_index(self): + icols = ['1st', '2nd', '3rd'] + + def bind_cols(df): + iord = lambda a: 0 if a != a else ord(a) + f = lambda ts: ts.map(iord) - ord('a') + return f(df['1st']) + f(df['3rd'])* 1e2 + df['2nd'].fillna(0) * 1e4 + + def run_asserts(left, right): + for sort in [False, True]: + res = left.join(right, on=icols, how='left', sort=sort) + + self.assertTrue(len(left) < len(res) + 1) + self.assertFalse(res['4th'].isnull().any()) + self.assertFalse(res['5th'].isnull().any()) + + tm.assert_series_equal(res['4th'], - res['5th']) + tm.assert_series_equal(res['4th'], bind_cols(res.iloc[:, :-2])) + + if sort: + tm.assert_frame_equal(res, + res.sort(icols, kind='mergesort')) + + out = merge(left, right.reset_index(), on=icols, + sort=sort, how='left') + + res.index = np.arange(len(res)) + tm.assert_frame_equal(out, res) + + lc = list(map(chr, np.arange(ord('a'), ord('z') + 1))) + left = DataFrame(np.random.choice(lc, (5000, 2)), + columns=['1st', '3rd']) + left.insert(1, '2nd', np.random.randint(0, 1000, len(left))) + + i = np.random.permutation(len(left)) + right = left.iloc[i].copy() + + left['4th'] = bind_cols(left) + right['5th'] = - bind_cols(right) + right.set_index(icols, inplace=True) + + run_asserts(left, right) + + # inject some nulls + left.loc[1::23, '1st'] = np.nan + left.loc[2::37, '2nd'] = np.nan + left.loc[3::43, '3rd'] = np.nan + left['4th'] = bind_cols(left) + + i = np.random.permutation(len(left)) + right = left.iloc[i, :-1] + right['5th'] = - bind_cols(right) + right.set_index(icols, inplace=True) + + run_asserts(left, right) + def test_merge_right_vs_left(self): # compare left vs right merge with multikey - merged1 = self.data.merge(self.to_join, left_on=['key1', 'key2'], - right_index=True, how='left') - merged2 = self.to_join.merge(self.data, right_on=['key1', 'key2'], - left_index=True, how='right') - merged2 = merged2.ix[:, merged1.columns] - assert_frame_equal(merged1, merged2) + for sort in [False, True]: + merged1 = self.data.merge(self.to_join, left_on=['key1', 'key2'], + right_index=True, how='left', sort=sort) + + merged2 = self.to_join.merge(self.data, right_on=['key1', 'key2'], + left_index=True, how='right', sort=sort) + + merged2 = merged2.ix[:, merged1.columns] + assert_frame_equal(merged1, merged2) def test_compress_group_combinations(self): @@ -943,6 +1007,8 @@ def test_left_join_index_preserve_order(self): expected.loc[(expected.k1 == 1) & (expected.k2 == 'foo'),'v2'] = 7 tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result.sort(['k1', 'k2'], kind='mergesort'), + left.join(right, on=['k1', 'k2'], sort=True)) # test join with multi dtypes blocks left = DataFrame({'k1': [0, 1, 2] * 8, @@ -961,6 +1027,8 @@ def test_left_join_index_preserve_order(self): expected.loc[(expected.k1 == 1) & (expected.k2 == 'foo'),'v2'] = 7 tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result.sort(['k1', 'k2'], kind='mergesort'), + left.join(right, on=['k1', 'k2'], sort=True)) # do a right join for an extra test joined = merge(right, left, left_index=True, @@ -1022,6 +1090,12 @@ def test_left_join_index_multi_match_multiindex(self): tm.assert_frame_equal(result, expected) + result = left.join(right, on=['cola', 'colb', 'colc'], + how='left', sort=True) + + tm.assert_frame_equal(result, + expected.sort(['cola', 'colb', 'colc'], kind='mergesort')) + # GH7331 - maintain left frame order in left merge right.reset_index(inplace=True) right.columns = left.columns[:3].tolist() + right.columns[-1:].tolist() @@ -1066,6 +1140,9 @@ def test_left_join_index_multi_match(self): tm.assert_frame_equal(result, expected) + result = left.join(right, on='tag', how='left', sort=True) + tm.assert_frame_equal(result, expected.sort('tag', kind='mergesort')) + # GH7331 - maintain left frame order in left merge result = merge(left, right.reset_index(), how='left', on='tag') expected.index = np.arange(len(expected)) @@ -1094,6 +1171,10 @@ def _test(dtype1,dtype2): tm.assert_frame_equal(result, expected) + result = left.join(right, on=['k1', 'k2'], sort=True) + expected.sort(['k1', 'k2'], kind='mergesort', inplace=True) + tm.assert_frame_equal(result, expected) + for d1 in [np.int64,np.int32,np.int16,np.int8,np.uint8]: for d2 in [np.int64,np.float64,np.float32,np.float16]: _test(np.dtype(d1),np.dtype(d2))