Skip to content

Commit d2484b9

Browse files
committed
ENH: sped up inner/outer_join_indexer cython functions
1 parent b8b6fe2 commit d2484b9

File tree

4 files changed

+93
-34
lines changed

4 files changed

+93
-34
lines changed

RELEASE.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ This is a bug fix release
2121
- `read_csv` can read multiple columns into a `MultiIndex`. DataFrame's
2222
`to_csv` method will properly write out a `MultiIndex` which can be read
2323
back (GH #151, thanks to Skipper Seabold)
24+
- Wrote fast time series merging / joining methods in Cython. These will be
25+
integrated later into `DataFrame.join` and related functions
2426

2527
**Bug fixes**
2628

pandas/src/reindex.pyx

Lines changed: 66 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -455,9 +455,34 @@ def inner_join_indexer(ndarray[int64_t] left, ndarray[int64_t] right):
455455
nleft = len(left)
456456
nright = len(right)
457457

458-
lindexer = np.empty(min(nleft, nright), dtype=np.int32)
459-
rindexer = np.empty(min(nleft, nright), dtype=np.int32)
460-
result = np.empty(min(nleft, nright), dtype=np.int64)
458+
i = 0
459+
j = 0
460+
count = 0
461+
while True:
462+
if i == nleft:
463+
break
464+
465+
val = left[i]
466+
467+
while j < nright and right[j] < val:
468+
j += 1
469+
470+
if j == nright:
471+
break
472+
473+
if val == right[j]:
474+
count += 1
475+
i += 1
476+
j += 1
477+
else:
478+
while left[i] < right[j]:
479+
i += 1
480+
481+
# do it again now that result size is known
482+
483+
lindexer = np.empty(count, dtype=np.int32)
484+
rindexer = np.empty(count, dtype=np.int32)
485+
result = np.empty(count, dtype=np.int64)
461486

462487
i = 0
463488
j = 0
@@ -485,9 +510,7 @@ def inner_join_indexer(ndarray[int64_t] left, ndarray[int64_t] right):
485510
while left[i] < right[j]:
486511
i += 1
487512

488-
return (result[:count].copy(),
489-
lindexer[:count].copy(),
490-
rindexer[:count].copy())
513+
return result, lindexer, rindexer
491514

492515
def _inner_join_count(ndarray[int64_t] left, ndarray[int64_t] right):
493516
pass
@@ -496,18 +519,48 @@ def _inner_join_count(ndarray[int64_t] left, ndarray[int64_t] right):
496519
@cython.boundscheck(False)
497520
def outer_join_indexer(ndarray[int64_t] left, ndarray[int64_t] right):
498521
cdef:
499-
Py_ssize_t i, j, nright, nleft, tot, count
522+
Py_ssize_t i, j, nright, nleft, count
500523
int64_t lval, rval
501524
ndarray[int32_t] lindexer, rindexer
502525
ndarray[int64_t] result
503526

504527
nleft = len(left)
505528
nright = len(right)
506-
tot = nleft + nright
507529

508-
lindexer = np.empty(tot, dtype=np.int32)
509-
rindexer = np.empty(tot, dtype=np.int32)
510-
result = np.empty(tot, dtype=np.int64)
530+
i = 0
531+
j = 0
532+
count = 0
533+
while True:
534+
if i == nleft:
535+
if j == nright:
536+
# we are done
537+
break
538+
else:
539+
while j < nright:
540+
j += 1
541+
count += 1
542+
break
543+
elif j == nright:
544+
while i < nleft:
545+
i += 1
546+
count += 1
547+
break
548+
else:
549+
if left[i] == right[j]:
550+
i += 1
551+
j += 1
552+
elif left[i] < right[j]:
553+
i += 1
554+
else:
555+
j += 1
556+
557+
count += 1
558+
559+
lindexer = np.empty(count, dtype=np.int32)
560+
rindexer = np.empty(count, dtype=np.int32)
561+
result = np.empty(count, dtype=np.int64)
562+
563+
# do it again, but populate the indexers / result
511564

512565
i = 0
513566
j = 0
@@ -542,7 +595,7 @@ def outer_join_indexer(ndarray[int64_t] left, ndarray[int64_t] right):
542595
result[count] = lval
543596
i += 1
544597
j += 1
545-
elif left[i] < right[j]:
598+
elif lval < rval:
546599
lindexer[count] = i
547600
rindexer[count] = -1
548601
result[count] = lval
@@ -555,9 +608,7 @@ def outer_join_indexer(ndarray[int64_t] left, ndarray[int64_t] right):
555608

556609
count += 1
557610

558-
return (result[:count].copy(),
559-
lindexer[:count].copy(),
560-
rindexer[:count].copy())
611+
return result, lindexer, rindexer
561612

562613
@cython.wraparound(False)
563614
@cython.boundscheck(False)

scripts/bench_join.R

Lines changed: 23 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
library(xts)
22

3-
iterations <- 100
3+
iterations <- 50
44

55
ns = c(100, 1000, 10000, 100000, 1000000)
66
kinds = c("outer", "left", "inner")
@@ -9,24 +9,30 @@ result = matrix(0, nrow=3, ncol=length(ns))
99
n <- 100000
1010
pct.overlap <- 0.2
1111

12-
k <- 5
12+
k <- 1
1313

1414
for (ni in 1:length(ns)){
15-
n <- ns[ni]
16-
rng1 <- 1:n
17-
offset <- as.integer(n * pct.overlap)
18-
rng2 <- rng1 + offset
19-
x <- xts(matrix(rnorm(n * k), nrow=n, ncol=k),
20-
as.POSIXct(Sys.Date()) + rng1)
21-
y <- xts(matrix(rnorm(n * k), nrow=n, ncol=k),
22-
as.POSIXct(Sys.Date()) + rng2)
23-
for (i in 1:3) {
24-
kind = kinds[i]
25-
timing <- system.time(for (j in 1:iterations) merge(x, y, join=kind),
26-
gcFirst=F)
27-
timing <- as.list(timing)
28-
result[i, ni] = (timing$elapsed / iterations) * 1000
29-
}
15+
n <- ns[ni]
16+
rng1 <- 1:n
17+
offset <- as.integer(n * pct.overlap)
18+
rng2 <- rng1 + offset
19+
x <- xts(matrix(rnorm(n * k), nrow=n, ncol=k),
20+
as.POSIXct(Sys.Date()) + rng1)
21+
y <- xts(matrix(rnorm(n * k), nrow=n, ncol=k),
22+
as.POSIXct(Sys.Date()) + rng2)
23+
timing <- numeric()
24+
for (i in 1:3) {
25+
kind = kinds[i]
26+
for(j in 1:iterations) {
27+
gc() # just to be sure
28+
timing[j] <- system.time(merge(x,y,join=kind))[3]
29+
}
30+
#timing <- system.time(for (j in 1:iterations) merge.xts(x, y, join=kind),
31+
# gcFirst=F)
32+
#timing <- as.list(timing)
33+
result[i, ni] <- mean(timing) * 1000
34+
#result[i, ni] = (timing$elapsed / iterations) * 1000
35+
}
3036
}
3137

3238
rownames(result) <- kinds

scripts/bench_join.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from copy import deepcopy
55
import time
66

7-
n = 100000
7+
n = 1000000
88
K = 1
99
pct_overlap = 0.2
1010

@@ -135,7 +135,7 @@ def bench_python(n=100000, pct_overlap=0.20):
135135
import gc
136136
ns = [2, 3, 4, 5, 6]
137137
iterations = 50
138-
K = 1
138+
K = 5
139139
pct_overlap = 0.2
140140
kinds = ['outer', 'left', 'inner']
141141

0 commit comments

Comments
 (0)