Skip to content

Commit 82315d5

Browse files
committed
CLN: ASV indexing
1 parent 04dcc72 commit 82315d5

File tree

3 files changed

+152
-129
lines changed

3 files changed

+152
-129
lines changed

asv_bench/benchmarks/index_object.py

Lines changed: 89 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
1+
import string
2+
13
import numpy as np
24
import pandas.util.testing as tm
35
from pandas import (Series, date_range, DatetimeIndex, Index, MultiIndex,
4-
RangeIndex)
6+
RangeIndex, Float64Index)
57

68
from .pandas_vb_common import setup # noqa
79

@@ -222,3 +224,89 @@ def time_slice(self, dtype):
222224

223225
def time_slice_step(self, dtype):
224226
self.idx[::2]
227+
228+
229+
class Float64IndexMethod(object):
230+
# GH 13166
231+
goal_time = 0.2
232+
233+
def setup(self):
234+
N = 100000
235+
a = np.arange(N)
236+
self.ind = Float64Index(a * 4.8000000418824129e-08)
237+
238+
def time_get_loc(self):
239+
self.ind.get_loc(0)
240+
241+
242+
class MultiIndexGet(object):
243+
244+
goal_time = 0.2
245+
246+
def setup(self):
247+
self.mi_large = MultiIndex.from_product(
248+
[np.arange(1000), np.arange(20), list(string.ascii_letters)],
249+
names=['one', 'two', 'three'])
250+
self.mi_med = MultiIndex.from_product(
251+
[np.arange(1000), np.arange(10), list('A')],
252+
names=['one', 'two', 'three'])
253+
self.mi_small = MultiIndex.from_product(
254+
[np.arange(100), list('A'), list('A')],
255+
names=['one', 'two', 'three'])
256+
257+
def time_multiindex_large_get_loc(self):
258+
self.mi_large.get_loc((999, 19, 'Z'))
259+
260+
def time_multiindex_large_get_loc_warm(self):
261+
for _ in range(1000):
262+
self.mi_large.get_loc((999, 19, 'Z'))
263+
264+
def time_multiindex_med_get_loc(self):
265+
self.mi_med.get_loc((999, 9, 'A'))
266+
267+
def time_multiindex_med_get_loc_warm(self):
268+
for _ in range(1000):
269+
self.mi_med.get_loc((999, 9, 'A'))
270+
271+
def time_multiindex_string_get_loc(self):
272+
self.mi_small.get_loc((99, 'A', 'A'))
273+
274+
def time_multiindex_small_get_loc_warm(self):
275+
for _ in range(1000):
276+
self.mi_small.get_loc((99, 'A', 'A'))
277+
278+
279+
class MultiIndexDuplicates(object):
280+
281+
goal_time = 0.2
282+
283+
def setup(self):
284+
size = 65536
285+
arrays = [np.random.randint(0, 8192, size),
286+
np.random.randint(0, 1024, size)]
287+
mask = np.random.rand(size) < 0.1
288+
self.mi_unused_levels = MultiIndex.from_arrays(arrays)
289+
self.mi_unused_levels = self.mi_unused_levels[mask]
290+
291+
def time_remove_unused_levels(self):
292+
self.mi_unused_levels.remove_unused_levels()
293+
294+
295+
class MultiIndexInteger(object):
296+
297+
goal_time = 0.2
298+
299+
def setup(self):
300+
self.mi_int = MultiIndex.from_product([np.arange(1000),
301+
np.arange(1000)],
302+
names=['one', 'two'])
303+
self.obj_index = np.array([(0, 10), (0, 11), (0, 12),
304+
(0, 13), (0, 14), (0, 15),
305+
(0, 16), (0, 17), (0, 18),
306+
(0, 19)], dtype=object)
307+
308+
def time_get_indexer(self):
309+
self.mi_int.get_indexer(self.obj_index)
310+
311+
def time_is_monotonic(self):
312+
self.mi_int.is_monotonic

asv_bench/benchmarks/indexing.py

Lines changed: 62 additions & 127 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,7 @@
1-
import string
2-
31
import numpy as np
42
import pandas.util.testing as tm
53
from pandas import (Series, DataFrame, MultiIndex, Int64Index, Float64Index,
6-
IntervalIndex, IndexSlice)
4+
IntervalIndex, IndexSlice, concat, date_range)
75
from .pandas_vb_common import setup, Panel # noqa
86

97

@@ -79,27 +77,27 @@ class NonNumericSeriesIndexing(object):
7977
param_names = ['index']
8078

8179
def setup(self, index):
82-
N = 10**6
80+
N = 10**5
8381
indexes = {'string': tm.makeStringIndex(N),
84-
'datetime': tm.makeTimeSeries(N)}
82+
'datetime': date_range('1900', periods=N, freq='s')}
8583
index = indexes[index]
8684
self.s = Series(np.random.rand(N), index=index)
87-
self.lbl = index[800000]
85+
self.lbl = index[80000]
8886

89-
def time_getitem_label_slice(self):
87+
def time_getitem_label_slice(self, index):
9088
self.s[:self.lbl]
9189

92-
def time_getitem_pos_slice(self):
93-
self.s[:800000]
90+
def time_getitem_pos_slice(self, index):
91+
self.s[:80000]
9492

95-
def time_get_value(self):
93+
def time_get_value(self, index):
9694
self.s.get_value(self.lbl)
9795

9896
def time_getitem_scalar(self, index):
9997
self.s[self.lbl]
10098

10199

102-
class DataFrameIndexing(object):
100+
class DataFrameStringIndexing(object):
103101

104102
goal_time = 0.2
105103

@@ -108,67 +106,71 @@ def setup(self):
108106
columns = tm.makeStringIndex(30)
109107
self.df = DataFrame(np.random.randn(1000, 30), index=index,
110108
columns=columns)
111-
self.idx = index[100]
112-
self.col = columns[10]
113-
114-
self.df2 = DataFrame(np.random.randn(10000, 4),
115-
columns=['A', 'B', 'C', 'D'])
116-
self.indexer = self.df2['B'] > 0
117-
self.obj_indexer = self.indexer.astype('O')
118-
119-
# dupes
120-
self.idx_dupe = np.array(range(30)) * 99
121-
self.df3 = DataFrame({'A': [0.1] * 1000, 'B': [1] * 1000})
122-
self.df3 = concat([self.df3, 2 * self.df3, 3 * self.df3])
123-
124-
self.df_big = DataFrame(dict(A=['foo'] * 1000000))
109+
self.idx_scalar = index[100]
110+
self.col_scalar = columns[10]
111+
self.bool_indexer = self.df[self.col_scalar] > 0
112+
self.bool_obj_indexer = self.bool_indexer.astype(object)
125113

126114
def time_get_value(self):
127-
self.df.get_value(self.idx, self.col)
115+
self.df.get_value(self.idx_scalar, self.col_scalar)
116+
117+
def time_ix(self):
118+
self.df.ix[self.idx_scalar, self.col_scalar]
128119

129-
def time_get_value_ix(self):
130-
self.df.ix[(self.idx, self.col)]
120+
def time_loc(self):
121+
self.df.loc[self.idx_scalar, self.col_scalar]
131122

132123
def time_getitem_scalar(self):
133-
self.df[self.col][self.idx]
124+
self.df[self.col_scalar][self.idx_scalar]
134125

135126
def time_boolean_rows(self):
136-
self.df2[self.indexer]
127+
self.df[self.bool_indexer]
137128

138129
def time_boolean_rows_object(self):
139-
self.df2[self.obj_indexer]
130+
self.df[self.bool_obj_indexer]
131+
132+
133+
class DataFrameNumericIndexing(object):
134+
135+
goal_time = 0.2
136+
137+
def setup(self):
138+
self.idx_dupe = np.array(range(30)) * 99
139+
self.df = DataFrame(np.random.randn(10000, 5))
140+
self.df_dup = concat([self.df, 2 * self.df, 3 * self.df])
141+
self.bool_indexer = [True] * 5000 + [False] * 5000
140142

141143
def time_iloc_dups(self):
142-
self.df3.iloc[self.idx_dupe]
144+
self.df_dup.iloc[self.idx_dupe]
143145

144146
def time_loc_dups(self):
145-
self.df3.loc[self.idx_dupe]
147+
self.df_dup.loc[self.idx_dupe]
146148

147-
def time_iloc_big(self):
148-
self.df_big.iloc[:100, 0]
149+
def time_iloc(self):
150+
self.df.iloc[:100, 0]
149151

152+
def time_loc(self):
153+
self.df.loc[:100, 0]
150154

151-
class IndexingMethods(object):
152-
# GH 13166
153-
goal_time = 0.2
155+
def time_bool_indexer(self):
156+
self.df[self.bool_indexer]
154157

155-
def setup(self):
156-
N = 100000
157-
a = np.arange(N)
158-
self.ind = Float64Index(a * 4.8000000418824129e-08)
159158

160-
self.s = Series(np.random.rand(N))
161-
self.ts = Series(np.random.rand(N),
162-
index=date_range('2011-01-01', freq='S', periods=N))
163-
self.indexer = [True, False, True, True, False] * 20000
159+
class Take(object):
164160

165-
def time_get_loc_float(self):
166-
self.ind.get_loc(0)
161+
goal_time = 0.2
162+
params = ['int', 'datetime']
163+
param_names = ['index']
167164

168-
def time_take_dtindex(self):
169-
self.ts.take(self.indexer)
165+
def setup(self, index):
166+
N = 100000
167+
indexes = {'int': Int64Index(np.arange(N)),
168+
'datetime': date_range('2011-01-01', freq='S', periods=N)}
169+
index = indexes[index]
170+
self.s = Series(np.random.rand(N), index=index)
171+
self.indexer = [True, False, True, True, False] * 20000
170172

171-
def time_take_intindex(self):
173+
def time_take(self, index):
172174
self.s.take(self.indexer)
173175

174176

@@ -177,11 +179,10 @@ class MultiIndexing(object):
177179
goal_time = 0.2
178180

179181
def setup(self):
180-
self.mi = MultiIndex.from_product([range(1000), range(1000)])
181-
self.s = Series(np.random.randn(1000000), index=self.mi)
182+
mi = MultiIndex.from_product([range(1000), range(1000)])
183+
self.s = Series(np.random.randn(1000000), index=mi)
182184
self.df = DataFrame(self.s)
183185

184-
# slicers
185186
n = 100000
186187
self.mdt = DataFrame({'A': np.random.choice(range(10000, 45000, 1000),
187188
n),
@@ -191,68 +192,16 @@ def setup(self):
191192
'x': np.random.choice(range(400), n),
192193
'y': np.random.choice(range(25), n)})
193194
self.idx = IndexSlice[20000:30000, 20:30, 35:45, 30000:40000]
194-
self.mdt2 = self.mdt.set_index(['A', 'B', 'C', 'D']).sortlevel()
195-
self.miint = MultiIndex.from_product([np.arange(1000),
196-
np.arange(1000)],
197-
names=['one', 'two'])
198-
self.obj_index = np.array([(0, 10), (0, 11), (0, 12),
199-
(0, 13), (0, 14), (0, 15),
200-
(0, 16), (0, 17), (0, 18),
201-
(0, 19)], dtype=object)
202-
203-
self.mi_large = MultiIndex.from_product(
204-
[np.arange(1000), np.arange(20), list(string.ascii_letters)],
205-
names=['one', 'two', 'three'])
206-
self.mi_med = MultiIndex.from_product(
207-
[np.arange(1000), np.arange(10), list('A')],
208-
names=['one', 'two', 'three'])
209-
self.mi_small = MultiIndex.from_product(
210-
[np.arange(100), list('A'), list('A')],
211-
names=['one', 'two', 'three'])
212-
213-
size = 65536
214-
self.mi_unused_levels = pd.MultiIndex.from_arrays([
215-
rng.randint(0, 8192, size),
216-
rng.randint(0, 1024, size)])[rng.random.rand(size) < 0.1]
217-
218-
def time_series_xs_mi_ix(self):
195+
self.mdt = self.mdt.set_index(['A', 'B', 'C', 'D']).sort_index()
196+
197+
def time_series_ix(self):
219198
self.s.ix[999]
220199

221-
def time_frame_xs_mi_ix(self):
200+
def time_frame_ix(self):
222201
self.df.ix[999]
223202

224-
def time_multiindex_slicers(self):
225-
self.mdt2.loc[self.idx, :]
226-
227-
def time_multiindex_get_indexer(self):
228-
self.miint.get_indexer(self.obj_index)
229-
230-
def time_multiindex_large_get_loc(self):
231-
self.mi_large.get_loc((999, 19, 'Z'))
232-
233-
def time_multiindex_large_get_loc_warm(self):
234-
for _ in range(1000):
235-
self.mi_large.get_loc((999, 19, 'Z'))
236-
237-
def time_multiindex_med_get_loc(self):
238-
self.mi_med.get_loc((999, 9, 'A'))
239-
240-
def time_multiindex_med_get_loc_warm(self):
241-
for _ in range(1000):
242-
self.mi_med.get_loc((999, 9, 'A'))
243-
244-
def time_multiindex_string_get_loc(self):
245-
self.mi_small.get_loc((99, 'A', 'A'))
246-
247-
def time_multiindex_small_get_loc_warm(self):
248-
for _ in range(1000):
249-
self.mi_small.get_loc((99, 'A', 'A'))
250-
251-
def time_is_monotonic(self):
252-
self.miint.is_monotonic
253-
254-
def time_remove_unused_levels(self):
255-
self.mi_unused_levels.remove_unused_levels()
203+
def time_index_slice(self):
204+
self.mdt.loc[self.idx, :]
256205

257206

258207
class IntervalIndexing(object):
@@ -307,20 +256,6 @@ def time_lookup_loc(self, s):
307256
s.loc
308257

309258

310-
class BooleanRowSelect(object):
311-
312-
goal_time = 0.2
313-
314-
def setup(self):
315-
N = 10000
316-
self.df = DataFrame(np.random.randn(N, 100))
317-
self.bool_arr = np.zeros(N, dtype=bool)
318-
self.bool_arr[:1000] = True
319-
320-
def time_frame_boolean_row_select(self):
321-
self.df[self.bool_arr]
322-
323-
324259
class GetItemSingleColumn(object):
325260

326261
goal_time = 0.2
@@ -342,7 +277,7 @@ class AssignTimeseriesIndex(object):
342277

343278
def setup(self):
344279
N = 100000
345-
dx = date_range('1/1/2000', periods=N, freq='H')
280+
idx = date_range('1/1/2000', periods=N, freq='H')
346281
self.df = DataFrame(np.random.randn(N, 1), columns=['A'], index=idx)
347282

348283
def time_frame_assign_timeseries_index(self):

ci/lint.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ if [ "$LINT" ]; then
2424
echo "Linting setup.py DONE"
2525

2626
echo "Linting asv_bench/benchmarks/"
27-
flake8 asv_bench/benchmarks/ --exclude=asv_bench/benchmarks/[ips]*.py --ignore=F811
27+
flake8 asv_bench/benchmarks/ --exclude=asv_bench/benchmarks/[ps]*.py --ignore=F811
2828
if [ $? -ne "0" ]; then
2929
RET=1
3030
fi

0 commit comments

Comments
 (0)