Skip to content

CLN: ASV reindex #18938

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Dec 26, 2017
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
203 changes: 86 additions & 117 deletions asv_bench/benchmarks/reindex.py
Original file line number Diff line number Diff line change
@@ -1,89 +1,77 @@
from .pandas_vb_common import *
from random import shuffle
import numpy as np
import pandas.util.testing as tm
from pandas import (DataFrame, Series, DatetimeIndex, MultiIndex, Index,
date_range)
from .pandas_vb_common import setup, lib # noqa


class Reindexing(object):
class Reindex(object):

goal_time = 0.2

def setup(self):
self.rng = DatetimeIndex(start='1/1/1970', periods=10000, freq='1min')
self.df = DataFrame(np.random.rand(10000, 10), index=self.rng,
rng = DatetimeIndex(start='1/1/1970', periods=10000, freq='1min')
self.df = DataFrame(np.random.rand(10000, 10), index=rng,
columns=range(10))
self.df['foo'] = 'bar'
self.rng2 = Index(self.rng[::2])

self.rng_subset = Index(rng[::2])
self.df2 = DataFrame(index=range(10000),
data=np.random.rand(10000, 30), columns=range(30))

# multi-index
N = 5000
K = 200
level1 = tm.makeStringIndex(N).values.repeat(K)
level2 = np.tile(tm.makeStringIndex(K).values, N)
index = MultiIndex.from_arrays([level1, level2])
self.s1 = Series(np.random.randn((N * K)), index=index)
self.s2 = self.s1[::2]
self.s = Series(np.random.randn(N * K), index=index)
self.s_subset = self.s[::2]

def time_reindex_dates(self):
self.df.reindex(self.rng2)
self.df.reindex(self.rng_subset)

def time_reindex_columns(self):
self.df2.reindex(columns=self.df.columns[1:5])

def time_reindex_multiindex(self):
self.s1.reindex(self.s2.index)
self.s.reindex(self.s_subset.index)


#----------------------------------------------------------------------
# Pad / backfill
class ReindexMethod(object):


class FillMethod(object):
goal_time = 0.2
params = ['pad', 'backfill']
param_names = ['method']

def setup(self):
self.rng = date_range('1/1/2000', periods=100000, freq='1min')
self.ts = Series(np.random.randn(len(self.rng)), index=self.rng)
self.ts2 = self.ts[::2]
self.ts3 = self.ts2.reindex(self.ts.index)
self.ts4 = self.ts3.astype('float32')

def pad(self, source_series, target_index):
try:
source_series.reindex(target_index, method='pad')
except:
source_series.reindex(target_index, fillMethod='pad')

def backfill(self, source_series, target_index):
try:
source_series.reindex(target_index, method='backfill')
except:
source_series.reindex(target_index, fillMethod='backfill')

def time_backfill_dates(self):
self.backfill(self.ts2, self.ts.index)
def setup(self, method):
N = 100000
self.idx = date_range('1/1/2000', periods=N, freq='1min')
self.ts = Series(np.random.randn(N), index=self.idx)[::2]

def time_pad_daterange(self):
self.pad(self.ts2, self.ts.index)
def time_reindex_method(self, method):
self.ts.reindex(self.idx, method=method)

def time_backfill(self):
self.ts3.fillna(method='backfill')

def time_backfill_float32(self):
self.ts4.fillna(method='backfill')
class Fillna(object):

def time_pad(self):
self.ts3.fillna(method='pad')
goal_time = 0.2
params = ['pad', 'backfill']
param_names = ['method']

def time_pad_float32(self):
self.ts4.fillna(method='pad')
def setup(self, method):
N = 100000
self.idx = date_range('1/1/2000', periods=N, freq='1min')
ts = Series(np.random.randn(N), index=self.idx)[::2]
self.ts_reindexed = ts.reindex(self.idx)
self.ts_float32 = self.ts_reindexed.astype('float32')

def time_reindexed(self, method):
self.ts_reindexed.fillna(method=method)

#----------------------------------------------------------------------
# align on level
def time_float_32(self, method):
self.ts_float32.fillna(method=method)


class LevelAlign(object):

goal_time = 0.2

def setup(self):
Expand All @@ -92,7 +80,6 @@ def setup(self):
labels=[np.arange(10).repeat(10000),
np.tile(np.arange(100).repeat(100), 10),
np.tile(np.tile(np.arange(100), 100), 10)])
random.shuffle(self.index.values)
self.df = DataFrame(np.random.randn(len(self.index), 4),
index=self.index)
self.df_level = DataFrame(np.random.randn(100, 4),
Expand All @@ -102,103 +89,85 @@ def time_align_level(self):
self.df.align(self.df_level, level=1, copy=False)

def time_reindex_level(self):
self.df_level.reindex(self.df.index, level=1)
self.df_level.reindex(self.index, level=1)


#----------------------------------------------------------------------
# drop_duplicates
class DropDuplicates(object):


class Duplicates(object):
goal_time = 0.2

def setup(self):
self.N = 10000
self.K = 10
self.key1 = tm.makeStringIndex(self.N).values.repeat(self.K)
self.key2 = tm.makeStringIndex(self.N).values.repeat(self.K)
self.df = DataFrame({'key1': self.key1, 'key2': self.key2,
'value': np.random.randn((self.N * self.K)),})
self.col_array_list = list(self.df.values.T)

self.df2 = self.df.copy()
self.df2.ix[:10000, :] = np.nan
params = [True, False]
param_names = ['inplace']

def setup(self, inplace):
N = 10000
K = 10
key1 = tm.makeStringIndex(N).values.repeat(K)
key2 = tm.makeStringIndex(N).values.repeat(K)
self.df = DataFrame({'key1': key1, 'key2': key2,
'value': np.random.randn(N * K)})
self.df_nan = self.df.copy()
self.df_nan.iloc[:10000, :] = np.nan

self.s = Series(np.random.randint(0, 1000, size=10000))
self.s2 = Series(np.tile(tm.makeStringIndex(1000).values, 10))

np.random.seed(1234)
self.N = 1000000
self.K = 10000
self.key1 = np.random.randint(0, self.K, size=self.N)
self.df_int = DataFrame({'key1': self.key1})
self.df_bool = DataFrame({i: np.random.randint(0, 2, size=self.K,
dtype=bool)
for i in range(10)})
self.s_str = Series(np.tile(tm.makeStringIndex(1000).values, 10))

def time_frame_drop_dups(self):
self.df.drop_duplicates(['key1', 'key2'])
N = 1000000
K = 10000
key1 = np.random.randint(0, K, size=N)
self.df_int = DataFrame({'key1': key1})
self.df_bool = DataFrame(np.random.randint(0, 2, size=(K, 10),
dtype=bool))

def time_frame_drop_dups_inplace(self):
self.df.drop_duplicates(['key1', 'key2'], inplace=True)
def time_frame_drop_dups(self, inplace):
self.df.drop_duplicates(['key1', 'key2'], inplace=inplace)

def time_frame_drop_dups_na(self):
self.df2.drop_duplicates(['key1', 'key2'])
def time_frame_drop_dups_na(self, inplace):
self.df_nan.drop_duplicates(['key1', 'key2'], inplace=inplace)

def time_frame_drop_dups_na_inplace(self):
self.df2.drop_duplicates(['key1', 'key2'], inplace=True)
def time_series_drop_dups_int(self, inplace):
self.s.drop_duplicates(inplace=inplace)

def time_series_drop_dups_int(self):
self.s.drop_duplicates()
def time_series_drop_dups_string(self, inplace):
self.s_str.drop_duplicates(inplace=inplace)

def time_series_drop_dups_string(self):
self.s2.drop_duplicates()
def time_frame_drop_dups_int(self, inplace):
self.df_int.drop_duplicates(inplace=inplace)

def time_frame_drop_dups_int(self):
self.df_int.drop_duplicates()

def time_frame_drop_dups_bool(self):
self.df_bool.drop_duplicates()

#----------------------------------------------------------------------
# blog "pandas escaped the zoo"
def time_frame_drop_dups_bool(self, inplace):
self.df_bool.drop_duplicates(inplace=inplace)


class Align(object):
# blog "pandas escaped the zoo"
goal_time = 0.2

def setup(self):
n = 50000
indices = tm.makeStringIndex(n)
subsample_size = 40000

def sample(values, k):
sampler = np.arange(len(values))
shuffle(sampler)
return values.take(sampler[:k])

self.x = Series(np.random.randn(50000), indices)
self.x = Series(np.random.randn(n), indices)
self.y = Series(np.random.randn(subsample_size),
index=sample(indices, subsample_size))
index=np.random.choice(indices, subsample_size,
replace=False))

def time_align_series_irregular_string(self):
(self.x + self.y)
self.x + self.y


class LibFastZip(object):

goal_time = 0.2

def setup(self):
self.N = 10000
self.K = 10
self.key1 = tm.makeStringIndex(self.N).values.repeat(self.K)
self.key2 = tm.makeStringIndex(self.N).values.repeat(self.K)
self.df = DataFrame({'key1': self.key1, 'key2': self.key2, 'value': np.random.randn((self.N * self.K)), })
self.col_array_list = list(self.df.values.T)

self.df2 = self.df.copy()
self.df2.ix[:10000, :] = np.nan
self.col_array_list2 = list(self.df2.values.T)
N = 10000
K = 10
key1 = tm.makeStringIndex(N).values.repeat(K)
key2 = tm.makeStringIndex(N).values.repeat(K)
col_array = np.vstack([key1, key2, np.random.randn(N * K)])
col_array2 = col_array.copy()
col_array2[:, :10000] = np.nan
self.col_array_list = list(col_array)
self.col_array_list2 = list(col_array2)

def time_lib_fast_zip(self):
lib.fast_zip(self.col_array_list)
Expand Down