diff --git a/asv_bench/benchmarks/reindex.py b/asv_bench/benchmarks/reindex.py index 537d275e7c727..69a1a604b1ccc 100644 --- a/asv_bench/benchmarks/reindex.py +++ b/asv_bench/benchmarks/reindex.py @@ -1,89 +1,77 @@ -from .pandas_vb_common import * -from random import shuffle +import numpy as np +import pandas.util.testing as tm +from pandas import (DataFrame, Series, DatetimeIndex, MultiIndex, Index, + date_range) +from .pandas_vb_common import setup, lib # noqa -class Reindexing(object): +class Reindex(object): + goal_time = 0.2 def setup(self): - self.rng = DatetimeIndex(start='1/1/1970', periods=10000, freq='1min') - self.df = DataFrame(np.random.rand(10000, 10), index=self.rng, + rng = DatetimeIndex(start='1/1/1970', periods=10000, freq='1min') + self.df = DataFrame(np.random.rand(10000, 10), index=rng, columns=range(10)) self.df['foo'] = 'bar' - self.rng2 = Index(self.rng[::2]) - + self.rng_subset = Index(rng[::2]) self.df2 = DataFrame(index=range(10000), data=np.random.rand(10000, 30), columns=range(30)) - - # multi-index N = 5000 K = 200 level1 = tm.makeStringIndex(N).values.repeat(K) level2 = np.tile(tm.makeStringIndex(K).values, N) index = MultiIndex.from_arrays([level1, level2]) - self.s1 = Series(np.random.randn((N * K)), index=index) - self.s2 = self.s1[::2] + self.s = Series(np.random.randn(N * K), index=index) + self.s_subset = self.s[::2] def time_reindex_dates(self): - self.df.reindex(self.rng2) + self.df.reindex(self.rng_subset) def time_reindex_columns(self): self.df2.reindex(columns=self.df.columns[1:5]) def time_reindex_multiindex(self): - self.s1.reindex(self.s2.index) + self.s.reindex(self.s_subset.index) -#---------------------------------------------------------------------- -# Pad / backfill +class ReindexMethod(object): - -class FillMethod(object): goal_time = 0.2 + params = ['pad', 'backfill'] + param_names = ['method'] - def setup(self): - self.rng = date_range('1/1/2000', periods=100000, freq='1min') - self.ts = Series(np.random.randn(len(self.rng)), index=self.rng) - self.ts2 = self.ts[::2] - self.ts3 = self.ts2.reindex(self.ts.index) - self.ts4 = self.ts3.astype('float32') - - def pad(self, source_series, target_index): - try: - source_series.reindex(target_index, method='pad') - except: - source_series.reindex(target_index, fillMethod='pad') - - def backfill(self, source_series, target_index): - try: - source_series.reindex(target_index, method='backfill') - except: - source_series.reindex(target_index, fillMethod='backfill') - - def time_backfill_dates(self): - self.backfill(self.ts2, self.ts.index) + def setup(self, method): + N = 100000 + self.idx = date_range('1/1/2000', periods=N, freq='1min') + self.ts = Series(np.random.randn(N), index=self.idx)[::2] - def time_pad_daterange(self): - self.pad(self.ts2, self.ts.index) + def time_reindex_method(self, method): + self.ts.reindex(self.idx, method=method) - def time_backfill(self): - self.ts3.fillna(method='backfill') - def time_backfill_float32(self): - self.ts4.fillna(method='backfill') +class Fillna(object): - def time_pad(self): - self.ts3.fillna(method='pad') + goal_time = 0.2 + params = ['pad', 'backfill'] + param_names = ['method'] - def time_pad_float32(self): - self.ts4.fillna(method='pad') + def setup(self, method): + N = 100000 + self.idx = date_range('1/1/2000', periods=N, freq='1min') + ts = Series(np.random.randn(N), index=self.idx)[::2] + self.ts_reindexed = ts.reindex(self.idx) + self.ts_float32 = self.ts_reindexed.astype('float32') + def time_reindexed(self, method): + self.ts_reindexed.fillna(method=method) -#---------------------------------------------------------------------- -# align on level + def time_float_32(self, method): + self.ts_float32.fillna(method=method) class LevelAlign(object): + goal_time = 0.2 def setup(self): @@ -92,7 +80,6 @@ def setup(self): labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)]) - random.shuffle(self.index.values) self.df = DataFrame(np.random.randn(len(self.index), 4), index=self.index) self.df_level = DataFrame(np.random.randn(100, 4), @@ -102,103 +89,85 @@ def time_align_level(self): self.df.align(self.df_level, level=1, copy=False) def time_reindex_level(self): - self.df_level.reindex(self.df.index, level=1) + self.df_level.reindex(self.index, level=1) -#---------------------------------------------------------------------- -# drop_duplicates +class DropDuplicates(object): - -class Duplicates(object): goal_time = 0.2 - - def setup(self): - self.N = 10000 - self.K = 10 - self.key1 = tm.makeStringIndex(self.N).values.repeat(self.K) - self.key2 = tm.makeStringIndex(self.N).values.repeat(self.K) - self.df = DataFrame({'key1': self.key1, 'key2': self.key2, - 'value': np.random.randn((self.N * self.K)),}) - self.col_array_list = list(self.df.values.T) - - self.df2 = self.df.copy() - self.df2.ix[:10000, :] = np.nan + params = [True, False] + param_names = ['inplace'] + + def setup(self, inplace): + N = 10000 + K = 10 + key1 = tm.makeStringIndex(N).values.repeat(K) + key2 = tm.makeStringIndex(N).values.repeat(K) + self.df = DataFrame({'key1': key1, 'key2': key2, + 'value': np.random.randn(N * K)}) + self.df_nan = self.df.copy() + self.df_nan.iloc[:10000, :] = np.nan self.s = Series(np.random.randint(0, 1000, size=10000)) - self.s2 = Series(np.tile(tm.makeStringIndex(1000).values, 10)) - - np.random.seed(1234) - self.N = 1000000 - self.K = 10000 - self.key1 = np.random.randint(0, self.K, size=self.N) - self.df_int = DataFrame({'key1': self.key1}) - self.df_bool = DataFrame({i: np.random.randint(0, 2, size=self.K, - dtype=bool) - for i in range(10)}) + self.s_str = Series(np.tile(tm.makeStringIndex(1000).values, 10)) - def time_frame_drop_dups(self): - self.df.drop_duplicates(['key1', 'key2']) + N = 1000000 + K = 10000 + key1 = np.random.randint(0, K, size=N) + self.df_int = DataFrame({'key1': key1}) + self.df_bool = DataFrame(np.random.randint(0, 2, size=(K, 10), + dtype=bool)) - def time_frame_drop_dups_inplace(self): - self.df.drop_duplicates(['key1', 'key2'], inplace=True) + def time_frame_drop_dups(self, inplace): + self.df.drop_duplicates(['key1', 'key2'], inplace=inplace) - def time_frame_drop_dups_na(self): - self.df2.drop_duplicates(['key1', 'key2']) + def time_frame_drop_dups_na(self, inplace): + self.df_nan.drop_duplicates(['key1', 'key2'], inplace=inplace) - def time_frame_drop_dups_na_inplace(self): - self.df2.drop_duplicates(['key1', 'key2'], inplace=True) + def time_series_drop_dups_int(self, inplace): + self.s.drop_duplicates(inplace=inplace) - def time_series_drop_dups_int(self): - self.s.drop_duplicates() + def time_series_drop_dups_string(self, inplace): + self.s_str.drop_duplicates(inplace=inplace) - def time_series_drop_dups_string(self): - self.s2.drop_duplicates() + def time_frame_drop_dups_int(self, inplace): + self.df_int.drop_duplicates(inplace=inplace) - def time_frame_drop_dups_int(self): - self.df_int.drop_duplicates() - - def time_frame_drop_dups_bool(self): - self.df_bool.drop_duplicates() - -#---------------------------------------------------------------------- -# blog "pandas escaped the zoo" + def time_frame_drop_dups_bool(self, inplace): + self.df_bool.drop_duplicates(inplace=inplace) class Align(object): + # blog "pandas escaped the zoo" goal_time = 0.2 def setup(self): n = 50000 indices = tm.makeStringIndex(n) subsample_size = 40000 - - def sample(values, k): - sampler = np.arange(len(values)) - shuffle(sampler) - return values.take(sampler[:k]) - - self.x = Series(np.random.randn(50000), indices) + self.x = Series(np.random.randn(n), indices) self.y = Series(np.random.randn(subsample_size), - index=sample(indices, subsample_size)) + index=np.random.choice(indices, subsample_size, + replace=False)) def time_align_series_irregular_string(self): - (self.x + self.y) + self.x + self.y class LibFastZip(object): + goal_time = 0.2 def setup(self): - self.N = 10000 - self.K = 10 - self.key1 = tm.makeStringIndex(self.N).values.repeat(self.K) - self.key2 = tm.makeStringIndex(self.N).values.repeat(self.K) - self.df = DataFrame({'key1': self.key1, 'key2': self.key2, 'value': np.random.randn((self.N * self.K)), }) - self.col_array_list = list(self.df.values.T) - - self.df2 = self.df.copy() - self.df2.ix[:10000, :] = np.nan - self.col_array_list2 = list(self.df2.values.T) + N = 10000 + K = 10 + key1 = tm.makeStringIndex(N).values.repeat(K) + key2 = tm.makeStringIndex(N).values.repeat(K) + col_array = np.vstack([key1, key2, np.random.randn(N * K)]) + col_array2 = col_array.copy() + col_array2[:, :10000] = np.nan + self.col_array_list = list(col_array) + self.col_array_list2 = list(col_array2) def time_lib_fast_zip(self): lib.fast_zip(self.col_array_list)