Skip to content

Commit 2c2e8e6

Browse files
committed
ENH: bug fixes, speed enh, benchmark suite to compare with xts
1 parent 952c6e7 commit 2c2e8e6

File tree

4 files changed

+317
-21
lines changed

4 files changed

+317
-21
lines changed

bench/bench_take_indexing.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
import numpy as np
2+
3+
from pandas import *
4+
import pandas._tseries as lib
5+
6+
from pandas import DataFrame
7+
import timeit
8+
9+
# Setup source passed to timeit.Timer; the two %d placeholders are filled
# with the number of rows (n) and the number of columns (k) of the test array.
# The indexer built here is a reversed arange, i.e. a full row permutation.
setup = """
from pandas import Series
import pandas._tseries as lib
import random
import numpy as np

import random
n = %d
k = %d
arr = np.random.randn(n, k)
indexer = np.arange(n, dtype=np.int32)
indexer = indexer[::-1]
"""

# Problem sizes to benchmark, paired element-wise with `iters` by the
# driver loop (zip(sizes, iters)).
sizes = [100, 1000, 10000, 100000]
iters = [1000, 1000, 100, 1]

# Per-size timings collected for each row-selection strategy.
fancy_2d = []
take_2d = []
cython_2d = []

n = 1000
31+
32+
def _timeit(stmt, size, k=5, iters=1000):
33+
timer = timeit.Timer(stmt=stmt, setup=setup % (sz, k))
34+
return timer.timeit(n) / n
35+
36+
# Time three ways of selecting rows of `arr` with `indexer` at every problem
# size: NumPy fancy indexing, ndarray.take along axis 0, and lib.take_axis0.
# The matching entry of `iters` is passed to _timeit as `iters`.
for sz, its in zip(sizes, iters):
    print sz
    fancy_2d.append(_timeit('arr[indexer]', sz, iters=its))
    take_2d.append(_timeit('arr.take(indexer, axis=0)', sz, iters=its))
    cython_2d.append(_timeit('lib.take_axis0(arr, indexer)', sz, iters=its))

# One column per strategy, one row per entry of `sizes`.
df = DataFrame({'fancy' : fancy_2d,
                'take' : take_2d,
                'cython' : cython_2d})

print df
47+
48+
# Run the equivalent row permutation in R (through the rpy bridge) for
# comparison with the NumPy/Cython timings above.
from pandas.rpy.common import r
r('mat <- matrix(rnorm(50000), nrow=10000, ncol=5)')
# The original snippet was 'set.seed(12345' -- the missing closing
# parenthesis made the R expression unparseable.
r('set.seed(12345)')
r('indexer <- sample(1:10000)')
r('mat[indexer,]')

pandas/src/reindex.pyx

Lines changed: 112 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -376,10 +376,10 @@ def ordered_left_join_int64(ndarray[int64_t] left, ndarray[int64_t] right):
376376

377377
@cython.wraparound(False)
378378
@cython.boundscheck(False)
379-
def ordered_left_join_put(ndarray[int64_t] left, ndarray[int64_t] right,
380-
ndarray[float64_t, ndim=2] lvalues,
381-
ndarray[float64_t, ndim=2] rvalues,
382-
ndarray[float64_t, ndim=2] out):
379+
def left_join_2d(ndarray[int64_t] left, ndarray[int64_t] right,
380+
ndarray[float64_t, ndim=2] lvalues,
381+
ndarray[float64_t, ndim=2] rvalues,
382+
ndarray[float64_t, ndim=2] out):
383383
cdef:
384384
Py_ssize_t i, j, k, nright, nleft, kright, kleft
385385
int64_t val
@@ -409,6 +409,37 @@ def ordered_left_join_put(ndarray[int64_t] left, ndarray[int64_t] right,
409409
for k from kleft <= k < kleft + kright:
410410
out[i, k] = NaN
411411

412+
@cython.wraparound(False)
@cython.boundscheck(False)
def left_join_1d(ndarray[int64_t] left, ndarray[int64_t] right,
                 ndarray[float64_t] lvalues,
                 ndarray[float64_t] rvalues,
                 ndarray[float64_t, ndim=2] out):
    """
    Left-join two 1-d value arrays on their int64 keys, writing into ``out``.

    For every position ``row`` of ``lvalues``, column 0 of ``out`` receives
    ``lvalues[row]`` and column 1 receives the value from ``rvalues`` whose
    key in ``right`` equals ``left[row]``, or NaN when there is no such key.

    The scan position in ``right`` only ever moves forward, so the routine
    presumably expects both key arrays to be sorted ascending -- confirm
    against callers.  Nothing is returned; ``out`` is filled in place.
    """
    cdef:
        Py_ssize_t row, pos, n_left, n_right
        int64_t key

    n_left = len(lvalues)
    n_right = len(rvalues)
    pos = 0

    for row in range(n_left):
        key = left[row]
        out[row, 0] = lvalues[row]

        # Skip right-hand keys that are smaller than the current left key.
        while pos < n_right and right[pos] < key:
            pos += 1

        # A match exists only if the scan stopped on an equal key.
        if pos < n_right and right[pos] == key:
            out[row, 1] = rvalues[pos]
        else:
            out[row, 1] = NaN
442+
412443
@cython.wraparound(False)
413444
@cython.boundscheck(False)
414445
def inner_join_indexer(ndarray[int64_t] left, ndarray[int64_t] right):
@@ -528,6 +559,83 @@ def outer_join_indexer(ndarray[int64_t] left, ndarray[int64_t] right):
528559
lindexer[:count].copy(),
529560
rindexer[:count].copy())
530561

562+
# NOTE(review): the wraparound/boundscheck decorators were left disabled in
# the original and are kept that way, so index checks stay at Cython defaults.
# @cython.wraparound(False)
# @cython.boundscheck(False)
def take_axis0(ndarray[float64_t, ndim=2] values,
               ndarray[int32_t] indexer,
               out=None):
    """
    Select rows of ``values`` according to ``indexer``.

    Parameters
    ----------
    values : 2-d float64 ndarray
    indexer : 1-d int32 ndarray
        Row positions to take; an entry of -1 yields a row of NaN.
    out : 2-d float64 ndarray, optional
        Buffer of shape ``(len(indexer), values.shape[1])`` to fill.  A new
        array is allocated when omitted.

    Returns
    -------
    ndarray
        The filled buffer (``out`` itself when one was supplied).  The
        original returned nothing, which discarded the freshly allocated
        result whenever ``out`` was None.
    """
    cdef:
        Py_ssize_t i, j, k, n, idx
        ndarray[float64_t, ndim=2] outbuf

    n = len(indexer)
    k = values.shape[1]

    if out is None:
        outbuf = np.empty((n, k), dtype=values.dtype)
    else:
        outbuf = out

    for i from 0 <= i < n:
        idx = indexer[i]

        if idx == -1:
            # -1 marks a missing row
            for j from 0 <= j < k:
                outbuf[i, j] = NaN
        else:
            for j from 0 <= j < k:
                outbuf[i, j] = values[idx, j]

    return outbuf
588+
589+
590+
@cython.wraparound(False)
@cython.boundscheck(False)
def take_axis1(ndarray[float64_t, ndim=2] values,
               ndarray[int32_t] indexer,
               out=None):
    """
    Select columns of ``values`` according to ``indexer``.

    Parameters
    ----------
    values : 2-d float64 ndarray
    indexer : 1-d int32 ndarray
        Column positions to take; an entry of -1 yields a column of NaN.
    out : 2-d float64 ndarray, optional
        Buffer of shape ``(values.shape[0], len(indexer))`` to fill.  A new
        array is allocated when omitted.

    Returns
    -------
    ndarray
        The filled buffer (``out`` itself when one was supplied).

    Notes
    -----
    The original sized the row dimension by ``len(indexer)`` and the column
    dimension by ``values.shape[1]`` (copied from the axis-0 variant).  With
    bounds checking disabled that reads and writes out of range whenever the
    indexer length differs from the array dimensions, so the two extents are
    taken from the correct sources here.
    """
    cdef:
        Py_ssize_t i, j, k, n, idx
        ndarray[float64_t, ndim=2] outbuf

    n = values.shape[0]   # rows are carried over unchanged
    k = len(indexer)      # one output column per indexer entry

    if out is None:
        outbuf = np.empty((n, k), dtype=values.dtype)
    else:
        outbuf = out

    for j from 0 <= j < k:
        idx = indexer[j]

        if idx == -1:
            # -1 marks a missing column
            for i from 0 <= i < n:
                outbuf[i, j] = NaN
        else:
            for i from 0 <= i < n:
                outbuf[i, j] = values[i, idx]

    return outbuf
616+
617+
@cython.wraparound(False)
@cython.boundscheck(False)
def take_1d(ndarray[float64_t] values, ndarray[int32_t] indexer,
            out=None):
    """
    Select elements of ``values`` according to ``indexer``.

    Parameters
    ----------
    values : 1-d float64 ndarray
    indexer : 1-d int32 ndarray
        Positions to take; an entry of -1 yields NaN.
    out : 1-d float64 ndarray, optional
        Buffer of length ``len(indexer)`` to fill.  A new array is allocated
        when omitted.

    Returns
    -------
    ndarray
        The filled buffer (``out`` itself when one was supplied).  The
        original returned nothing, which discarded the freshly allocated
        result whenever ``out`` was None.
    """
    cdef:
        Py_ssize_t i, n, idx
        ndarray[float64_t] outbuf

    n = len(indexer)

    if out is None:
        outbuf = np.empty(n, dtype=values.dtype)
    else:
        outbuf = out

    for i from 0 <= i < n:
        idx = indexer[i]
        if idx == -1:
            # -1 marks a missing element
            outbuf[i] = NaN
        else:
            outbuf[i] = values[idx]

    return outbuf
638+
531639
def ordered_put_indexer(ndarray[int64_t] left, ndarray[int64_t] right,
532640
ndarray[float64_t, ndim=2] lvalues,
533641
ndarray[float64_t, ndim=2] rvalues,

scripts/bench_join.R

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# Benchmark xts::merge for outer, left and inner joins over a range of series
# lengths.  `result` holds the mean time per merge in milliseconds, with one
# row per join kind and one column per series length (labelled by log10(n)).
library(xts)

iterations <- 100

ns <- c(100, 1000, 10000, 100000, 1000000)
kinds <- c("outer", "left", "inner")

# Size the result from the vectors it is indexed by instead of a literal 3.
result <- matrix(0, nrow=length(kinds), ncol=length(ns))
pct.overlap <- 0.2

k <- 5

# seq_along() is safe for empty vectors, unlike 1:length(x).
for (ni in seq_along(ns)) {
  n <- ns[ni]
  rng1 <- 1:n
  # Shift the second index by a fraction of n relative to the first.
  offset <- as.integer(n * pct.overlap)
  rng2 <- rng1 + offset
  x <- xts(matrix(rnorm(n * k), nrow=n, ncol=k),
           as.POSIXct(Sys.Date()) + rng1)
  y <- xts(matrix(rnorm(n * k), nrow=n, ncol=k),
           as.POSIXct(Sys.Date()) + rng2)
  for (i in seq_along(kinds)) {
    kind <- kinds[i]
    # FALSE rather than F: F is an ordinary variable and can be reassigned.
    timing <- system.time(for (j in 1:iterations) merge(x, y, join=kind),
                          gcFirst=FALSE)
    # Elapsed wall-clock seconds -> milliseconds per merge.
    result[i, ni] <- (timing[["elapsed"]] / iterations) * 1000
  }
}

rownames(result) <- kinds
colnames(result) <- log10(ns)

0 commit comments

Comments
 (0)