Skip to content

Commit a609f48

Browse files
committed
Merge remote-tracking branch 'upstream/master' into fu1+fillna
2 parents b342efe + 0815c43 commit a609f48

File tree

96 files changed

+6498
-4054
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

96 files changed

+6498
-4054
lines changed

.github/PULL_REQUEST_TEMPLATE.md

+24
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,27 @@
1+
Checklist for the pandas documentation sprint (ignore this if you are doing
2+
an unrelated PR):
3+
4+
- [ ] PR title is "DOC: update the <your-function-or-method> docstring"
5+
- [ ] The validation script passes: `scripts/validate_docstrings.py <your-function-or-method>`
6+
- [ ] The PEP8 style check passes: `git diff upstream/master -u -- "*.py" | flake8 --diff`
7+
- [ ] The html version looks good: `python doc/make.py --single <your-function-or-method>`
8+
- [ ] It has been proofread on language by another sprint participant
9+
10+
Please include the output of the validation script below between the "```" ticks:
11+
12+
```
13+
# paste output of "scripts/validate_docstrings.py <your-function-or-method>" here
14+
# between the "```" (remove this comment, but keep the "```")
15+
16+
```
17+
18+
If the validation script still gives errors, but you think there is a good reason
19+
to deviate in this case (and there are certainly such cases), please state this
20+
explicitly.
21+
22+
23+
Checklist for other PRs (remove this part if you are doing a PR for the pandas documentation sprint):
24+
125
- [ ] closes #xxxx
226
- [ ] tests added / passed
327
- [ ] passes `git diff upstream/master -u -- "*.py" | flake8 --diff`

.gitignore

+3-2
Original file line numberDiff line numberDiff line change
@@ -88,8 +88,9 @@ scikits
8888
*.c
8989
*.cpp
9090

91-
# Performance Testing #
92-
#######################
91+
# Unit / Performance Testing #
92+
##############################
93+
.pytest_cache/
9394
asv_bench/env/
9495
asv_bench/html/
9596
asv_bench/results/

asv_bench/benchmarks/groupby.py

+58-79
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,10 @@
1414
method_blacklist = {
1515
'object': {'median', 'prod', 'sem', 'cumsum', 'sum', 'cummin', 'mean',
1616
'max', 'skew', 'cumprod', 'cummax', 'rank', 'pct_change', 'min',
17-
'var', 'mad', 'describe', 'std'}
17+
'var', 'mad', 'describe', 'std'},
18+
'datetime': {'median', 'prod', 'sem', 'cumsum', 'sum', 'mean', 'skew',
19+
'cumprod', 'cummax', 'pct_change', 'var', 'mad', 'describe',
20+
'std'}
1821
}
1922

2023

@@ -90,45 +93,6 @@ def time_series_groups(self, data, key):
9093
self.ser.groupby(self.ser).groups
9194

9295

93-
class FirstLast(object):
94-
95-
goal_time = 0.2
96-
97-
param_names = ['dtype']
98-
params = ['float32', 'float64', 'datetime', 'object']
99-
100-
def setup(self, dtype):
101-
N = 10**5
102-
# with datetimes (GH7555)
103-
if dtype == 'datetime':
104-
self.df = DataFrame({'values': date_range('1/1/2011',
105-
periods=N,
106-
freq='s'),
107-
'key': range(N)})
108-
elif dtype == 'object':
109-
self.df = DataFrame({'values': ['foo'] * N,
110-
'key': range(N)})
111-
else:
112-
labels = np.arange(N / 10).repeat(10)
113-
data = Series(np.random.randn(len(labels)), dtype=dtype)
114-
data[::3] = np.nan
115-
data[1::3] = np.nan
116-
labels = labels.take(np.random.permutation(len(labels)))
117-
self.df = DataFrame({'values': data, 'key': labels})
118-
119-
def time_groupby_first(self, dtype):
120-
self.df.groupby('key').first()
121-
122-
def time_groupby_last(self, dtype):
123-
self.df.groupby('key').last()
124-
125-
def time_groupby_nth_all(self, dtype):
126-
self.df.groupby('key').nth(0, dropna='all')
127-
128-
def time_groupby_nth_none(self, dtype):
129-
self.df.groupby('key').nth(0)
130-
131-
13296
class GroupManyLabels(object):
13397

13498
goal_time = 0.2
@@ -149,39 +113,40 @@ class Nth(object):
149113

150114
goal_time = 0.2
151115

152-
def setup_cache(self):
153-
df = DataFrame(np.random.randint(1, 100, (10000, 2)))
154-
df.iloc[1, 1] = np.nan
155-
return df
156-
157-
def time_frame_nth_any(self, df):
158-
df.groupby(0).nth(0, dropna='any')
159-
160-
def time_frame_nth(self, df):
161-
df.groupby(0).nth(0)
162-
116+
param_names = ['dtype']
117+
params = ['float32', 'float64', 'datetime', 'object']
163118

164-
def time_series_nth_any(self, df):
165-
df[1].groupby(df[0]).nth(0, dropna='any')
119+
def setup(self, dtype):
120+
N = 10**5
121+
# with datetimes (GH7555)
122+
if dtype == 'datetime':
123+
values = date_range('1/1/2011', periods=N, freq='s')
124+
elif dtype == 'object':
125+
values = ['foo'] * N
126+
else:
127+
values = np.arange(N).astype(dtype)
166128

167-
def time_series_nth(self, df):
168-
df[1].groupby(df[0]).nth(0)
129+
key = np.arange(N)
130+
self.df = DataFrame({'key': key, 'values': values})
131+
self.df.iloc[1, 1] = np.nan # insert missing data
169132

133+
def time_frame_nth_any(self, dtype):
134+
self.df.groupby('key').nth(0, dropna='any')
170135

171-
class NthObject(object):
136+
def time_groupby_nth_all(self, dtype):
137+
self.df.groupby('key').nth(0, dropna='all')
172138

173-
goal_time = 0.2
139+
def time_frame_nth(self, dtype):
140+
self.df.groupby('key').nth(0)
174141

175-
def setup_cache(self):
176-
df = DataFrame(np.random.randint(1, 100, (10000,)), columns=['g'])
177-
df['obj'] = ['a'] * 5000 + ['b'] * 5000
178-
return df
142+
def time_series_nth_any(self, dtype):
143+
self.df['values'].groupby(self.df['key']).nth(0, dropna='any')
179144

180-
def time_nth(self, df):
181-
df.groupby('g').nth(5)
145+
def time_groupby_nth_all(self, dtype):
146+
self.df['values'].groupby(self.df['key']).nth(0, dropna='all')
182147

183-
def time_nth_last(self, df):
184-
df.groupby('g').last()
148+
def time_series_nth(self, dtype):
149+
self.df['values'].groupby(self.df['key']).nth(0)
185150

186151

187152
class DateAttributes(object):
@@ -243,7 +208,7 @@ def time_multi_count(self, df):
243208
df.groupby(['key1', 'key2']).count()
244209

245210

246-
class CountInt(object):
211+
class CountMultiInt(object):
247212

248213
goal_time = 0.2
249214

@@ -255,18 +220,18 @@ def setup_cache(self):
255220
'ints2': np.random.randint(0, 1000, size=n)})
256221
return df
257222

258-
def time_int_count(self, df):
223+
def time_multi_int_count(self, df):
259224
df.groupby(['key1', 'key2']).count()
260225

261-
def time_int_nunique(self, df):
226+
def time_multi_int_nunique(self, df):
262227
df.groupby(['key1', 'key2']).nunique()
263228

264229

265230
class AggFunctions(object):
266231

267232
goal_time = 0.2
268233

269-
def setup_cache(self):
234+
def setup_cache():
270235
N = 10**5
271236
fac1 = np.array(['A', 'B', 'C'], dtype='O')
272237
fac2 = np.array(['one', 'two'], dtype='O')
@@ -361,9 +326,6 @@ def setup(self):
361326
def time_multi_size(self):
362327
self.df.groupby(['key1', 'key2']).size()
363328

364-
def time_dt_size(self):
365-
self.df.groupby(['dates']).size()
366-
367329
def time_dt_timegrouper_size(self):
368330
with warnings.catch_warnings(record=True):
369331
self.df.groupby(TimeGrouper(key='dates', freq='M')).size()
@@ -376,15 +338,16 @@ class GroupByMethods(object):
376338

377339
goal_time = 0.2
378340

379-
param_names = ['dtype', 'method']
380-
params = [['int', 'float', 'object'],
341+
param_names = ['dtype', 'method', 'application']
342+
params = [['int', 'float', 'object', 'datetime'],
381343
['all', 'any', 'bfill', 'count', 'cumcount', 'cummax', 'cummin',
382344
'cumprod', 'cumsum', 'describe', 'ffill', 'first', 'head',
383345
'last', 'mad', 'max', 'min', 'median', 'mean', 'nunique',
384346
'pct_change', 'prod', 'rank', 'sem', 'shift', 'size', 'skew',
385-
'std', 'sum', 'tail', 'unique', 'value_counts', 'var']]
347+
'std', 'sum', 'tail', 'unique', 'value_counts', 'var'],
348+
['direct', 'transformation']]
386349

387-
def setup(self, dtype, method):
350+
def setup(self, dtype, method, application):
388351
if method in method_blacklist.get(dtype, {}):
389352
raise NotImplementedError # skip benchmark
390353
ngroups = 1000
@@ -398,12 +361,28 @@ def setup(self, dtype, method):
398361
np.random.random(ngroups) * 10.0])
399362
elif dtype == 'object':
400363
key = ['foo'] * size
364+
elif dtype == 'datetime':
365+
key = date_range('1/1/2011', periods=size, freq='s')
401366

402367
df = DataFrame({'values': values, 'key': key})
403-
self.df_groupby_method = getattr(df.groupby('key')['values'], method)
404368

405-
def time_method(self, dtype, method):
406-
self.df_groupby_method()
369+
if application == 'transform':
370+
if method == 'describe':
371+
raise NotImplementedError
372+
373+
self.as_group_method = lambda: df.groupby(
374+
'key')['values'].transform(method)
375+
self.as_field_method = lambda: df.groupby(
376+
'values')['key'].transform(method)
377+
else:
378+
self.as_group_method = getattr(df.groupby('key')['values'], method)
379+
self.as_field_method = getattr(df.groupby('values')['key'], method)
380+
381+
def time_dtype_as_group(self, dtype, method, application):
382+
self.as_group_method()
383+
384+
def time_dtype_as_field(self, dtype, method, application):
385+
self.as_field_method()
407386

408387

409388
class Float32(object):

ci/environment-dev.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ channels:
55
dependencies:
66
- Cython
77
- NumPy
8+
- flake8
89
- moto
910
- pytest>=3.1
1011
- python-dateutil>=2.5.0

ci/requirements-3.6_DOC.run

+1-1
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ sphinx
55
nbconvert
66
nbformat
77
notebook
8-
matplotlib
8+
matplotlib=2.1*
99
seaborn
1010
scipy
1111
lxml

ci/requirements-3.6_NUMPY_DEV.build.sh

+1-2
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,7 @@ PRE_WHEELS="https://7933911d6844c6c53a7d-47bd50c35cd79bd838daf386af554a83.ssl.cf
1212
pip install --pre --upgrade --timeout=60 -f $PRE_WHEELS numpy scipy
1313

1414
# install dateutil from master
15-
# pip install -U git+git://github.com/dateutil/dateutil.git
16-
pip install dateutil
15+
pip install -U git+git://github.com/dateutil/dateutil.git
1716

1817
# cython via pip
1918
pip install cython

ci/requirements_dev.txt

+2-1
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,10 @@
22
# Do not modify directly
33
Cython
44
NumPy
5+
flake8
56
moto
67
pytest>=3.1
78
python-dateutil>=2.5.0
89
pytz
910
setuptools>=3.3
10-
sphinx
11+
sphinx

doc/make.py

+9-4
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
$ python make.py html
1212
$ python make.py latex
1313
"""
14+
import importlib
1415
import sys
1516
import os
1617
import shutil
@@ -20,8 +21,6 @@
2021
import webbrowser
2122
import jinja2
2223

23-
import pandas
24-
2524

2625
DOC_PATH = os.path.dirname(os.path.abspath(__file__))
2726
SOURCE_PATH = os.path.join(DOC_PATH, 'source')
@@ -134,7 +133,7 @@ def _process_single_doc(self, single_doc):
134133
self.single_doc = single_doc
135134
elif single_doc is not None:
136135
try:
137-
obj = pandas
136+
obj = pandas # noqa: F821
138137
for name in single_doc.split('.'):
139138
obj = getattr(obj, name)
140139
except AttributeError:
@@ -332,7 +331,7 @@ def main():
332331
'compile, e.g. "indexing", "DataFrame.join"'))
333332
argparser.add_argument('--python-path',
334333
type=str,
335-
default=os.path.join(DOC_PATH, '..'),
334+
default=os.path.dirname(DOC_PATH),
336335
help='path')
337336
argparser.add_argument('-v', action='count', dest='verbosity', default=0,
338337
help=('increase verbosity (can be repeated), '
@@ -343,7 +342,13 @@ def main():
343342
raise ValueError('Unknown command {}. Available options: {}'.format(
344343
args.command, ', '.join(cmds)))
345344

345+
# Below we update both os.environ and sys.path. The former is used by
346+
# external libraries (namely Sphinx) to compile this module and resolve
347+
# the import of `python_path` correctly. The latter is used to resolve
348+
# the import within the module, injecting it into the global namespace
346349
os.environ['PYTHONPATH'] = args.python_path
350+
sys.path.append(args.python_path)
351+
globals()['pandas'] = importlib.import_module('pandas')
347352

348353
builder = DocBuilder(args.num_jobs, not args.no_api, args.single,
349354
args.verbosity)

doc/source/categorical.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -177,7 +177,7 @@ are consistent among all columns.
177177
.. note::
178178

179179
To perform table-wise conversion, where all labels in the entire ``DataFrame`` are used as
180-
categories for each column, the ``categories`` parameter can be determined programatically by
180+
categories for each column, the ``categories`` parameter can be determined programmatically by
181181
``categories = pd.unique(df.values.ravel())``.
182182

183183
If you already have ``codes`` and ``categories``, you can use the

0 commit comments

Comments
 (0)