Skip to content

Commit cb04ef7

Browse files
committed
ENH: inner_join_indexer/outer_join_indexer functions
1 parent 9a320e8 commit cb04ef7

File tree

3 files changed

+156
-22
lines changed

3 files changed

+156
-22
lines changed

pandas/src/reindex.pyx

Lines changed: 110 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -409,36 +409,131 @@ def ordered_left_join_put(ndarray[int64_t] left, ndarray[int64_t] right,
409409
for k from kleft <= k < kleft + kright:
410410
out[i, k] = NaN
411411

412-
413-
def ordered_outer_join(ndarray[int64] left, ndarray[int64] right):
412+
def inner_join_indexer(ndarray[int64_t] left, ndarray[int64_t] right):
    '''
    Intersect two int64 arrays with a single merge-style pass.

    Both arrays are walked front to back with one cursor each, so the
    result is only meaningful when ``left`` and ``right`` are sorted in
    ascending order (assumed, not checked here).

    Parameters
    ----------
    left : ndarray[int64_t]
    right : ndarray[int64_t]

    Returns
    -------
    result : ndarray (int64)
        Values found in both ``left`` and ``right``.
    lindexer : ndarray (int32)
        Position in ``left`` of each element of ``result``.
    rindexer : ndarray (int32)
        Position in ``right`` of each element of ``result``.
    '''
    cdef:
        Py_ssize_t i, j, nright, nleft, count
        int64_t lval, rval
        ndarray[int32_t] lindexer, rindexer
        ndarray[int64_t] result

    nleft = len(left)
    nright = len(right)

    # Every match consumes one element from each side, so the number of
    # matches can never exceed the length of the shorter input.
    lindexer = np.empty(min(nleft, nright), dtype=np.int32)
    rindexer = np.empty(min(nleft, nright), dtype=np.int32)
    result = np.empty(min(nleft, nright), dtype=np.int64)

    i = 0
    j = 0
    count = 0
    # Checking both cursors on every step guarantees that neither array is
    # ever indexed past its end, including when the tail of one side is
    # entirely smaller than the current element of the other.
    while i < nleft and j < nright:
        lval = left[i]
        rval = right[j]
        if lval == rval:
            lindexer[count] = i
            rindexer[count] = j
            result[count] = lval
            count += 1
            i += 1
            j += 1
        elif lval < rval:
            i += 1
        else:
            j += 1

    # The buffers were over-allocated; return trimmed, independent copies.
    return (result[:count].copy(),
            lindexer[:count].copy(),
            rindexer[:count].copy())
458+
459+
def _inner_join_count(ndarray[int64_t] left, ndarray[int64_t] right):
    '''
    Unimplemented placeholder: accepts two int64 arrays, does no work and
    returns None.
    '''
    return None
461+
462+
def outer_join_indexer(ndarray[int64_t] left, ndarray[int64_t] right):
    '''
    Union two int64 arrays with a single merge-style pass.

    Both arrays are walked front to back with one cursor each, so the
    result is only meaningful when ``left`` and ``right`` are sorted in
    ascending order (assumed, not checked here).

    Parameters
    ----------
    left : ndarray[int64_t]
    right : ndarray[int64_t]

    Returns
    -------
    result : ndarray (int64)
        Merged values; a value present on both sides appears once per
        matched pair.
    lindexer : ndarray (int32)
        Position in ``left`` of each element of ``result``, or -1 when the
        element comes only from ``right``.
    rindexer : ndarray (int32)
        Position in ``right`` of each element of ``result``, or -1 when the
        element comes only from ``left``.
    '''
    cdef:
        Py_ssize_t i, j, nright, nleft, tot, count
        int64_t lval, rval
        ndarray[int32_t] lindexer, rindexer
        ndarray[int64_t] result

    nleft = len(left)
    nright = len(right)
    # Worst case: no value is shared, so every element gets its own slot.
    tot = nleft + nright

    lindexer = np.empty(tot, dtype=np.int32)
    rindexer = np.empty(tot, dtype=np.int32)
    result = np.empty(tot, dtype=np.int64)

    i = 0
    j = 0
    count = 0

    # Merge while both sides still have elements.
    while i < nleft and j < nright:
        lval = left[i]
        rval = right[j]
        if lval == rval:
            lindexer[count] = i
            rindexer[count] = j
            result[count] = lval
            i += 1
            j += 1
        elif lval < rval:
            lindexer[count] = i
            rindexer[count] = -1
            result[count] = lval
            i += 1
        else:
            lindexer[count] = -1
            rindexer[count] = j
            result[count] = rval
            j += 1
        count += 1

    # At most one of the two drain loops below does any work.
    while i < nleft:
        lindexer[count] = i
        rindexer[count] = -1
        # Must read left[i]: indexing with the exhausted right-hand cursor
        # would copy the wrong value or run off the end of ``left``.
        result[count] = left[i]
        i += 1
        count += 1

    while j < nright:
        lindexer[count] = -1
        rindexer[count] = j
        result[count] = right[j]
        j += 1
        count += 1

    # The buffers were over-allocated; return trimmed, independent copies.
    return (result[:count].copy(),
            lindexer[:count].copy(),
            rindexer[:count].copy())
525+
526+
def ordered_put_indexer(ndarray[int64_t] left,
                        ndarray[int64_t] right,
                        ndarray[float64_t, ndim=2] lvalues,
                        ndarray[float64_t, ndim=2] rvalues,
                        ndarray[float64_t, ndim=2] out):
    '''
    Unimplemented placeholder: accepts two int64 index arrays and three
    2-D float64 arrays, does no work and returns None.
    '''
    return None
531+
532+
def ordered_outer_join(ndarray[int64_t] left, ndarray[int64_t] right):
    '''
    Unimplemented placeholder: declares its working variables but does no
    work and returns None.
    '''
    cdef Py_ssize_t i, j, k
    cdef Py_ssize_t nright, nleft
    cdef Py_ssize_t kright, kleft
    cdef int64_t val
    return None
442537

443538

444539
def ordered_inner_join(ndarray[object] left, ndarray[object] right):

pandas/tests/test_tseries.py

Lines changed: 42 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,9 @@
22

33
import numpy as np
44
from pandas import Index
5+
from pandas.util.testing import assert_almost_equal
56
import pandas.util.testing as common
6-
import pandas._tseries as tseries
7+
import pandas._tseries as lib
78

89
class TestTseriesUtil(unittest.TestCase):
910

@@ -26,7 +27,7 @@ def test_getMergeVec(self):
2627
old = Index([1, 5, 10])
2728
new = Index(range(12))
2829

29-
filler, mask = tseries.getFillVec(old, new, old.indexMap,
30+
filler, mask = lib.getFillVec(old, new, old.indexMap,
3031
new.indexMap, None)
3132

3233
expect_filler = [-1, 0, -1, -1, -1, 1, -1, -1, -1, -1, 2, -1]
@@ -39,7 +40,7 @@ def test_getMergeVec(self):
3940
# corner case
4041
old = Index([1, 4])
4142
new = Index(range(5, 10))
42-
filler, mask = tseries.getFillVec(old, new, old.indexMap,
43+
filler, mask = lib.getFillVec(old, new, old.indexMap,
4344
new.indexMap, None)
4445

4546
expect_filler = [-1, -1, -1, -1, -1]
@@ -51,7 +52,7 @@ def test_backfill(self):
5152
old = Index([1, 5, 10])
5253
new = Index(range(12))
5354

54-
filler, mask = tseries.getFillVec(old, new, old.indexMap,
55+
filler, mask = lib.getFillVec(old, new, old.indexMap,
5556
new.indexMap, 'BACKFILL')
5657

5758
expect_filler = [0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, -1]
@@ -64,7 +65,7 @@ def test_backfill(self):
6465
# corner case
6566
old = Index([1, 4])
6667
new = Index(range(5, 10))
67-
filler, mask = tseries.getFillVec(old, new, old.indexMap,
68+
filler, mask = lib.getFillVec(old, new, old.indexMap,
6869
new.indexMap, 'BACKFILL')
6970

7071
expect_filler = [-1, -1, -1, -1, -1]
@@ -76,7 +77,7 @@ def test_pad(self):
7677
old = Index([1, 5, 10])
7778
new = Index(range(12))
7879

79-
filler, mask = tseries.getFillVec(old, new, old.indexMap,
80+
filler, mask = lib.getFillVec(old, new, old.indexMap,
8081
new.indexMap, 'PAD')
8182

8283
expect_filler = [-1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2]
@@ -89,13 +90,47 @@ def test_pad(self):
8990
# corner case
9091
old = Index([5, 10])
9192
new = Index(range(5))
92-
filler, mask = tseries.getFillVec(old, new, old.indexMap,
93+
filler, mask = lib.getFillVec(old, new, old.indexMap,
9394
new.indexMap, 'PAD')
9495

9596
expect_filler = [-1, -1, -1, -1, -1]
9697
expect_mask = np.zeros(5, dtype=bool)
9798
self.assert_(np.array_equal(filler, expect_filler))
9899
self.assert_(np.array_equal(mask, expect_mask))
99100

101+
def test_inner_join_indexer():
    """Check the values and positions returned by lib.inner_join_indexer."""
    # The indexer takes int64 buffers; spell the dtype out so the test does
    # not depend on the platform's default integer width.
    a = np.array([1, 2, 3, 4, 5], dtype=np.int64)
    b = np.array([0, 3, 5, 7, 9], dtype=np.int64)

    index, ares, bres = lib.inner_join_indexer(a, b)

    index_exp = np.array([3, 5], dtype=np.int64)
    assert_almost_equal(index, index_exp)

    # positions of the shared values 3 and 5 within a and b respectively
    aexp = np.array([2, 4], dtype=np.int32)
    bexp = np.array([1, 2], dtype=np.int32)
    assert_almost_equal(ares, aexp)
    assert_almost_equal(bres, bexp)
114+
115+
def test_outer_join_indexer():
    """Check the values and positions returned by lib.outer_join_indexer."""
    # The indexer takes int64 buffers; spell the dtype out so the test does
    # not depend on the platform's default integer width.
    a = np.array([1, 2, 3, 4, 5], dtype=np.int64)
    b = np.array([0, 3, 5, 7, 9], dtype=np.int64)

    index, ares, bres = lib.outer_join_indexer(a, b)

    index_exp = np.array([0, 1, 2, 3, 4, 5, 7, 9], dtype=np.int64)
    assert_almost_equal(index, index_exp)

    # -1 marks a merged value that is absent from that side
    aexp = np.array([-1, 0, 1, 2, 3, 4, -1, -1], dtype=np.int32)
    bexp = np.array([0, -1, -1, 1, -1, 2, 3, 4], dtype=np.int32)
    assert_almost_equal(ares, aexp)
    assert_almost_equal(bres, bexp)
128+
100129
class TestMoments(unittest.TestCase):
    """Empty placeholder test case; it defines no tests of its own."""
131+
132+
if __name__ == '__main__':
    # Run this module's tests through nose when executed as a script.
    # exit=False keeps nose from calling sys.exit once the run finishes.
    import nose
    nose_argv = [__file__, '-vvs', '-x', '--pdb', '--pdb-failure']
    nose.runmodule(argv=nose_argv, exit=False)
136+

scripts/bench_join.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,3 +78,7 @@ def do_left_join_frame(a, b):
7878
a.index._indexMap = None
7979
b.index._indexMap = None
8080
return a.join(b, how='left')
81+
82+
# a = np.array([1, 2, 3, 4, 5], dtype=np.int64)
83+
# b = np.array([0, 3, 5, 7, 9], dtype=np.int64)
84+
# print lib.inner_join_indexer(a, b)

0 commit comments

Comments
 (0)