Skip to content

Commit 45f60fe

Browse files
committed
Merge branch 'master' into rank_desc
2 parents f6d0fd0 + 3783ccc commit 45f60fe

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

69 files changed

+6574
-1767
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -109,3 +109,4 @@ doc/tmp.sv
109109
doc/source/styled.xlsx
110110
doc/source/templates/
111111
env/
112+
doc/source/savefig/

asv_bench/benchmarks/groupby.py

+58-79
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,10 @@
1414
method_blacklist = {
1515
'object': {'median', 'prod', 'sem', 'cumsum', 'sum', 'cummin', 'mean',
1616
'max', 'skew', 'cumprod', 'cummax', 'rank', 'pct_change', 'min',
17-
'var', 'mad', 'describe', 'std'}
17+
'var', 'mad', 'describe', 'std'},
18+
'datetime': {'median', 'prod', 'sem', 'cumsum', 'sum', 'mean', 'skew',
19+
'cumprod', 'cummax', 'pct_change', 'var', 'mad', 'describe',
20+
'std'}
1821
}
1922

2023

@@ -90,45 +93,6 @@ def time_series_groups(self, data, key):
9093
self.ser.groupby(self.ser).groups
9194

9295

93-
class FirstLast(object):
94-
95-
goal_time = 0.2
96-
97-
param_names = ['dtype']
98-
params = ['float32', 'float64', 'datetime', 'object']
99-
100-
def setup(self, dtype):
101-
N = 10**5
102-
# with datetimes (GH7555)
103-
if dtype == 'datetime':
104-
self.df = DataFrame({'values': date_range('1/1/2011',
105-
periods=N,
106-
freq='s'),
107-
'key': range(N)})
108-
elif dtype == 'object':
109-
self.df = DataFrame({'values': ['foo'] * N,
110-
'key': range(N)})
111-
else:
112-
labels = np.arange(N / 10).repeat(10)
113-
data = Series(np.random.randn(len(labels)), dtype=dtype)
114-
data[::3] = np.nan
115-
data[1::3] = np.nan
116-
labels = labels.take(np.random.permutation(len(labels)))
117-
self.df = DataFrame({'values': data, 'key': labels})
118-
119-
def time_groupby_first(self, dtype):
120-
self.df.groupby('key').first()
121-
122-
def time_groupby_last(self, dtype):
123-
self.df.groupby('key').last()
124-
125-
def time_groupby_nth_all(self, dtype):
126-
self.df.groupby('key').nth(0, dropna='all')
127-
128-
def time_groupby_nth_none(self, dtype):
129-
self.df.groupby('key').nth(0)
130-
131-
13296
class GroupManyLabels(object):
13397

13498
goal_time = 0.2
@@ -149,39 +113,40 @@ class Nth(object):
149113

150114
goal_time = 0.2
151115

152-
def setup_cache(self):
153-
df = DataFrame(np.random.randint(1, 100, (10000, 2)))
154-
df.iloc[1, 1] = np.nan
155-
return df
156-
157-
def time_frame_nth_any(self, df):
158-
df.groupby(0).nth(0, dropna='any')
159-
160-
def time_frame_nth(self, df):
161-
df.groupby(0).nth(0)
162-
116+
param_names = ['dtype']
117+
params = ['float32', 'float64', 'datetime', 'object']
163118

164-
def time_series_nth_any(self, df):
165-
df[1].groupby(df[0]).nth(0, dropna='any')
119+
def setup(self, dtype):
120+
N = 10**5
121+
# with datetimes (GH7555)
122+
if dtype == 'datetime':
123+
values = date_range('1/1/2011', periods=N, freq='s')
124+
elif dtype == 'object':
125+
values = ['foo'] * N
126+
else:
127+
values = np.arange(N).astype(dtype)
166128

167-
def time_series_nth(self, df):
168-
df[1].groupby(df[0]).nth(0)
129+
key = np.arange(N)
130+
self.df = DataFrame({'key': key, 'values': values})
131+
self.df.iloc[1, 1] = np.nan # insert missing data
169132

133+
def time_frame_nth_any(self, dtype):
134+
self.df.groupby('key').nth(0, dropna='any')
170135

171-
class NthObject(object):
136+
def time_groupby_nth_all(self, dtype):
137+
self.df.groupby('key').nth(0, dropna='all')
172138

173-
goal_time = 0.2
139+
def time_frame_nth(self, dtype):
140+
self.df.groupby('key').nth(0)
174141

175-
def setup_cache(self):
176-
df = DataFrame(np.random.randint(1, 100, (10000,)), columns=['g'])
177-
df['obj'] = ['a'] * 5000 + ['b'] * 5000
178-
return df
142+
def time_series_nth_any(self, dtype):
143+
self.df['values'].groupby(self.df['key']).nth(0, dropna='any')
179144

180-
def time_nth(self, df):
181-
df.groupby('g').nth(5)
145+
def time_groupby_nth_all(self, dtype):
146+
self.df['values'].groupby(self.df['key']).nth(0, dropna='all')
182147

183-
def time_nth_last(self, df):
184-
df.groupby('g').last()
148+
def time_series_nth(self, dtype):
149+
self.df['values'].groupby(self.df['key']).nth(0)
185150

186151

187152
class DateAttributes(object):
@@ -243,7 +208,7 @@ def time_multi_count(self, df):
243208
df.groupby(['key1', 'key2']).count()
244209

245210

246-
class CountInt(object):
211+
class CountMultiInt(object):
247212

248213
goal_time = 0.2
249214

@@ -255,18 +220,18 @@ def setup_cache(self):
255220
'ints2': np.random.randint(0, 1000, size=n)})
256221
return df
257222

258-
def time_int_count(self, df):
223+
def time_multi_int_count(self, df):
259224
df.groupby(['key1', 'key2']).count()
260225

261-
def time_int_nunique(self, df):
226+
def time_multi_int_nunique(self, df):
262227
df.groupby(['key1', 'key2']).nunique()
263228

264229

265230
class AggFunctions(object):
266231

267232
goal_time = 0.2
268233

269-
def setup_cache(self):
234+
def setup_cache():
270235
N = 10**5
271236
fac1 = np.array(['A', 'B', 'C'], dtype='O')
272237
fac2 = np.array(['one', 'two'], dtype='O')
@@ -361,9 +326,6 @@ def setup(self):
361326
def time_multi_size(self):
362327
self.df.groupby(['key1', 'key2']).size()
363328

364-
def time_dt_size(self):
365-
self.df.groupby(['dates']).size()
366-
367329
def time_dt_timegrouper_size(self):
368330
with warnings.catch_warnings(record=True):
369331
self.df.groupby(TimeGrouper(key='dates', freq='M')).size()
@@ -376,15 +338,16 @@ class GroupByMethods(object):
376338

377339
goal_time = 0.2
378340

379-
param_names = ['dtype', 'method']
380-
params = [['int', 'float', 'object'],
341+
param_names = ['dtype', 'method', 'application']
342+
params = [['int', 'float', 'object', 'datetime'],
381343
['all', 'any', 'bfill', 'count', 'cumcount', 'cummax', 'cummin',
382344
'cumprod', 'cumsum', 'describe', 'ffill', 'first', 'head',
383345
'last', 'mad', 'max', 'min', 'median', 'mean', 'nunique',
384346
'pct_change', 'prod', 'rank', 'sem', 'shift', 'size', 'skew',
385-
'std', 'sum', 'tail', 'unique', 'value_counts', 'var']]
347+
'std', 'sum', 'tail', 'unique', 'value_counts', 'var'],
348+
['direct', 'transformation']]
386349

387-
def setup(self, dtype, method):
350+
def setup(self, dtype, method, application):
388351
if method in method_blacklist.get(dtype, {}):
389352
raise NotImplementedError # skip benchmark
390353
ngroups = 1000
@@ -398,12 +361,28 @@ def setup(self, dtype, method):
398361
np.random.random(ngroups) * 10.0])
399362
elif dtype == 'object':
400363
key = ['foo'] * size
364+
elif dtype == 'datetime':
365+
key = date_range('1/1/2011', periods=size, freq='s')
401366

402367
df = DataFrame({'values': values, 'key': key})
403-
self.df_groupby_method = getattr(df.groupby('key')['values'], method)
404368

405-
def time_method(self, dtype, method):
406-
self.df_groupby_method()
369+
if application == 'transform':
370+
if method == 'describe':
371+
raise NotImplementedError
372+
373+
self.as_group_method = lambda: df.groupby(
374+
'key')['values'].transform(method)
375+
self.as_field_method = lambda: df.groupby(
376+
'values')['key'].transform(method)
377+
else:
378+
self.as_group_method = getattr(df.groupby('key')['values'], method)
379+
self.as_field_method = getattr(df.groupby('values')['key'], method)
380+
381+
def time_dtype_as_group(self, dtype, method, application):
382+
self.as_group_method()
383+
384+
def time_dtype_as_field(self, dtype, method, application):
385+
self.as_field_method()
407386

408387

409388
class Float32(object):

ci/requirements-3.6_NUMPY_DEV.build.sh

+1-2
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,7 @@ PRE_WHEELS="https://7933911d6844c6c53a7d-47bd50c35cd79bd838daf386af554a83.ssl.cf
1212
pip install --pre --upgrade --timeout=60 -f $PRE_WHEELS numpy scipy
1313

1414
# install dateutil from master
15-
# pip install -U git+git://github.com/dateutil/dateutil.git
16-
pip install dateutil
15+
pip install -U git+git://github.com/dateutil/dateutil.git
1716

1817
# cython via pip
1918
pip install cython

doc/source/contributing.rst

+6-8
Original file line numberDiff line numberDiff line change
@@ -262,8 +262,9 @@ after updating.
262262
Contributing to the documentation
263263
=================================
264264

265-
If you're not the developer type, contributing to the documentation is still of
266-
huge value. You don't even have to be an expert on *pandas* to do so! In fact,
265+
Contributing to the documentation benefits everyone who uses *pandas*.
266+
We encourage you to help us improve the documentation, and
267+
you don't have to be an expert on *pandas* to do so! In fact,
267268
there are sections of the docs that are worse off after being written by
268269
experts. If something in the docs doesn't make sense to you, updating the
269270
relevant section after you figure it out is a great way to ensure it will help
@@ -292,12 +293,9 @@ Some other important things to know about the docs:
292293
overviews per topic together with some other information (what's new,
293294
installation, etc).
294295

295-
- The docstrings follow the **Numpy Docstring Standard**, which is used widely
296-
in the Scientific Python community. This standard specifies the format of
297-
the different sections of the docstring. See `this document
298-
<https://github.com/numpy/numpy/blob/master/doc/HOWTO_DOCUMENT.rst.txt>`_
299-
for a detailed explanation, or look at some of the existing functions to
300-
extend it in a similar manner.
296+
- The docstrings follow a pandas convention, based on the **Numpy Docstring
297+
Standard**. Follow the :ref:`pandas docstring guide <docstring>` for detailed
298+
instructions on how to write a correct docstring.
301299

302300
- The tutorials make heavy use of the `ipython directive
303301
<http://matplotlib.org/sampledoc/ipython_directive.html>`_ sphinx extension.

0 commit comments

Comments
 (0)