-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
ENH: more flexible describe() + tests #8164
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -490,6 +490,24 @@ number of unique values and most frequently occurring values: | |
s = Series(['a', 'a', 'b', 'b', 'a', 'a', np.nan, 'c', 'd', 'a']) | ||
s.describe() | ||
|
||
Note that on a mixed-type DataFrame object, `describe` will restrict the summary to | ||
include only numerical columns or, if none are, only categorical columns: | ||
|
||
.. ipython:: python | ||
|
||
frame = DataFrame({'a': ['Yes', 'Yes', 'No', 'No'], 'b': range(4)}) | ||
frame.describe() | ||
|
||
This behaviour can be controlled by providing a list of types as ``include``/``exclude`` | ||
arguments. The special value ``all`` can also be used: | ||
|
||
.. ipython:: python | ||
|
||
frame.describe(include=['object']) | ||
frame.describe(include=['number']) | ||
frame.describe(include='all') | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you refer to |
||
That feature relies on :ref:`select_dtypes <basics.selectdtypes>`. Refer to that section for details about accepted inputs. | ||
|
||
There is also a utility function, ``value_range``, which takes a DataFrame and | ||
returns a series with the minimum/maximum values in the DataFrame. | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3658,27 +3658,51 @@ def abs(self): | |
The percentiles to include in the output. Should all | ||
be in the interval [0, 1]. By default `percentiles` is | ||
[.25, .5, .75], returning the 25th, 50th, and 75th percentiles. | ||
include, exclude : list-like, 'all', or None (default) | ||
Specify the form of the returned result. Either: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You have to leave an empty line after this line so that the list is rendered as a list (in the online HTML docstring pages). The other option is to remove this line and keep just the list items; then no empty line is needed. |
||
|
||
- None to both (default). The result will include only numeric-typed | ||
columns or, if none are, only categorical columns. | ||
- A list of dtypes or strings to be included/excluded. | ||
To select all numeric types use numpy.number. To select | ||
categorical objects use type object. See also the select_dtypes | ||
documentation. E.g. df.describe(include=['O']) | ||
- If include is the string 'all', the output column-set will | ||
match the input one. | ||
|
||
Returns | ||
------- | ||
summary: %(klass)s of summary statistics | ||
|
||
Notes | ||
----- | ||
For numeric dtypes the index includes: count, mean, std, min, | ||
The output DataFrame index depends on the requested dtypes: | ||
|
||
For numeric dtypes, it will include: count, mean, std, min, | ||
max, and lower, 50, and upper percentiles. | ||
|
||
If self is of object dtypes (e.g. timestamps or strings), the output | ||
For object dtypes (e.g. timestamps or strings), the index | ||
will include the count, unique, most common, and frequency of the | ||
most common. Timestamps also include the first and last items. | ||
|
||
For mixed dtypes, the index will be the union of the corresponding | ||
output types. Non-applicable entries will be filled with NaN. | ||
Note that mixed-dtype outputs can only be returned from mixed-dtype | ||
inputs and appropriate use of the include/exclude arguments. | ||
|
||
If multiple values have the highest count, then the | ||
`count` and `most common` pair will be arbitrarily chosen from | ||
among those with the highest count. | ||
|
||
The include, exclude arguments are ignored for Series. | ||
|
||
See also | ||
-------- | ||
DataFrame.select_dtypes | ||
""" | ||
|
||
@Appender(_shared_docs['describe'] % _shared_doc_kwargs) | ||
def describe(self, percentile_width=None, percentiles=None): | ||
def describe(self, percentile_width=None, percentiles=None, include=None, exclude=None ): | ||
if self.ndim >= 3: | ||
msg = "describe is not implemented on on Panel or PanelND objects." | ||
raise NotImplementedError(msg) | ||
|
@@ -3715,16 +3739,6 @@ def describe(self, percentile_width=None, percentiles=None): | |
uh = percentiles[percentiles > .5] | ||
percentiles = np.hstack([lh, 0.5, uh]) | ||
|
||
# dtypes: numeric only, numeric mixed, objects only | ||
data = self._get_numeric_data() | ||
if self.ndim > 1: | ||
if len(data._info_axis) == 0: | ||
is_object = True | ||
else: | ||
is_object = False | ||
else: | ||
is_object = not self._is_numeric_mixed_type | ||
|
||
def pretty_name(x): | ||
x *= 100 | ||
if x == int(x): | ||
|
@@ -3733,10 +3747,12 @@ def pretty_name(x): | |
return '%.1f%%' % x | ||
|
||
def describe_numeric_1d(series, percentiles): | ||
return ([series.count(), series.mean(), series.std(), | ||
series.min()] + | ||
[series.quantile(x) for x in percentiles] + | ||
[series.max()]) | ||
stat_index = (['count', 'mean', 'std', 'min'] + | ||
[pretty_name(x) for x in percentiles] + ['max']) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
d = ([series.count(), series.mean(), series.std(), series.min()] + | ||
[series.quantile(x) for x in percentiles] + [series.max()]) | ||
return pd.Series(d, index=stat_index, name=series.name) | ||
|
||
|
||
def describe_categorical_1d(data): | ||
names = ['count', 'unique'] | ||
|
@@ -3749,44 +3765,49 @@ def describe_categorical_1d(data): | |
names += ['top', 'freq'] | ||
result += [top, freq] | ||
|
||
elif issubclass(data.dtype.type, np.datetime64): | ||
elif com.is_datetime64_dtype(data): | ||
asint = data.dropna().values.view('i8') | ||
names += ['first', 'last', 'top', 'freq'] | ||
result += [lib.Timestamp(asint.min()), | ||
lib.Timestamp(asint.max()), | ||
lib.Timestamp(top), freq] | ||
|
||
return pd.Series(result, index=names) | ||
|
||
if is_object: | ||
if data.ndim == 1: | ||
return describe_categorical_1d(self) | ||
names += ['top', 'freq', 'first', 'last'] | ||
result += [lib.Timestamp(top), freq, | ||
lib.Timestamp(asint.min()), | ||
lib.Timestamp(asint.max())] | ||
|
||
return pd.Series(result, index=names, name=data.name) | ||
|
||
def describe_1d(data, percentiles): | ||
if com.is_numeric_dtype(data): | ||
return describe_numeric_1d(data, percentiles) | ||
elif com.is_timedelta64_dtype(data): | ||
return describe_numeric_1d(data, percentiles) | ||
else: | ||
result = pd.DataFrame(dict((k, describe_categorical_1d(v)) | ||
for k, v in compat.iteritems(self)), | ||
columns=self._info_axis, | ||
index=['count', 'unique', 'first', 'last', | ||
'top', 'freq']) | ||
# just objects, no datetime | ||
if pd.isnull(result.loc['first']).all(): | ||
result = result.drop(['first', 'last'], axis=0) | ||
return result | ||
else: | ||
stat_index = (['count', 'mean', 'std', 'min'] + | ||
[pretty_name(x) for x in percentiles] + | ||
['max']) | ||
if data.ndim == 1: | ||
return pd.Series(describe_numeric_1d(data, percentiles), | ||
index=stat_index) | ||
return describe_categorical_1d(data) | ||
|
||
if self.ndim == 1: | ||
return describe_1d(self, percentiles) | ||
elif (include is None) and (exclude is None): | ||
if len(self._get_numeric_data()._info_axis) > 0: | ||
# when some numerics are found, keep only numerics | ||
data = self.select_dtypes(include=[np.number, np.bool]) | ||
else: | ||
destat = [] | ||
for i in range(len(data._info_axis)): # BAD | ||
series = data.iloc[:, i] | ||
destat.append(describe_numeric_1d(series, percentiles)) | ||
|
||
return self._constructor(lmap(list, zip(*destat)), | ||
index=stat_index, | ||
columns=data._info_axis) | ||
data = self | ||
elif include == 'all': | ||
if exclude != None: | ||
msg = "exclude must be None when include is 'all'" | ||
raise ValueError(msg) | ||
data = self | ||
else: | ||
data = self.select_dtypes(include=include, exclude=exclude) | ||
|
||
ldesc = [describe_1d(s, percentiles) for _, s in data.iteritems()] | ||
# set a convenient order for rows | ||
names = [] | ||
ldesc_indexes = sorted([x.index for x in ldesc], key=len) | ||
for idxnames in ldesc_indexes: | ||
for name in idxnames: | ||
if name not in names: | ||
names.append(name) | ||
d = pd.concat(ldesc, join_axes=pd.Index([names]), axis=1) | ||
return d | ||
|
||
_shared_docs['pct_change'] = """ | ||
Percent change over given number of periods. | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1005,18 +1005,17 @@ def test_describe_objects(self): | |
df = DataFrame({"C1": pd.date_range('2010-01-01', periods=4, freq='D')}) | ||
df.loc[4] = pd.Timestamp('2010-01-04') | ||
result = df.describe() | ||
expected = DataFrame({"C1": [5, 4, pd.Timestamp('2010-01-01'), | ||
pd.Timestamp('2010-01-04'), | ||
pd.Timestamp('2010-01-04'), 2]}, | ||
index=['count', 'unique', 'first', 'last', 'top', | ||
'freq']) | ||
expected = DataFrame({"C1": [5, 4, pd.Timestamp('2010-01-04'), 2, | ||
pd.Timestamp('2010-01-01'), | ||
pd.Timestamp('2010-01-04')]}, | ||
index=['count', 'unique', 'top', 'freq', | ||
'first', 'last']) | ||
assert_frame_equal(result, expected) | ||
|
||
# mix time and str | ||
df['C2'] = ['a', 'a', 'b', 'c', 'a'] | ||
result = df.describe() | ||
# when mix of datetime / obj the index gets reordered. | ||
expected['C2'] = [5, 3, np.nan, np.nan, 'a', 3] | ||
expected['C2'] = [5, 3, 'a', 3, np.nan, np.nan] | ||
assert_frame_equal(result, expected) | ||
|
||
# just str | ||
|
@@ -1036,6 +1035,112 @@ def test_describe_objects(self): | |
assert_frame_equal(df[['C1', 'C3']].describe(), df[['C3']].describe()) | ||
assert_frame_equal(df[['C2', 'C3']].describe(), df[['C3']].describe()) | ||
|
||
def test_describe_typefiltering(self): | ||
df = DataFrame({'catA': ['foo', 'foo', 'bar'] * 8, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. add the issue number as a comment here |
||
'catB': ['a', 'b', 'c', 'd'] * 6, | ||
'numC': np.arange(24, dtype='int64'), | ||
'numD': np.arange(24.) + .5, | ||
'ts': tm.makeTimeSeries()[:24].index}) | ||
|
||
descN = df.describe() | ||
expected_cols = ['numC', 'numD',] | ||
expected = DataFrame(dict((k, df[k].describe()) | ||
for k in expected_cols), | ||
columns=expected_cols) | ||
assert_frame_equal(descN, expected) | ||
|
||
desc = df.describe(include=['number']) | ||
assert_frame_equal(desc, descN) | ||
desc = df.describe(exclude=['object', 'datetime']) | ||
assert_frame_equal(desc, descN) | ||
desc = df.describe(include=['float']) | ||
assert_frame_equal(desc, descN.drop('numC',1)) | ||
|
||
descC = df.describe(include=['O']) | ||
expected_cols = ['catA', 'catB'] | ||
expected = DataFrame(dict((k, df[k].describe()) | ||
for k in expected_cols), | ||
columns=expected_cols) | ||
assert_frame_equal(descC, expected) | ||
|
||
descD = df.describe(include=['datetime']) | ||
assert_series_equal( descD.ts, df.ts.describe()) | ||
|
||
desc = df.describe(include=['object','number', 'datetime']) | ||
assert_frame_equal(desc.loc[:,["numC","numD"]].dropna(), descN) | ||
assert_frame_equal(desc.loc[:,["catA","catB"]].dropna(), descC) | ||
descDs = descD.sort_index() # the index order change for mixed-types | ||
assert_frame_equal(desc.loc[:,"ts":].dropna().sort_index(), descDs) | ||
|
||
desc = df.loc[:,'catA':'catB'].describe(include='all') | ||
assert_frame_equal(desc, descC) | ||
desc = df.loc[:,'numC':'numD'].describe(include='all') | ||
assert_frame_equal(desc, descN) | ||
|
||
desc = df.describe(percentiles = [], include='all') | ||
cnt = Series(data=[4,4,6,6,6], index=['catA','catB','numC','numD','ts']) | ||
assert_series_equal( desc.count(), cnt) | ||
self.assertTrue('count' in desc.index) | ||
self.assertTrue('unique' in desc.index) | ||
self.assertTrue('50%' in desc.index) | ||
self.assertTrue('first' in desc.index) | ||
|
||
desc = df.drop("ts", 1).describe(percentiles = [], include='all') | ||
assert_series_equal( desc.count(), cnt.drop("ts")) | ||
self.assertTrue('first' not in desc.index) | ||
desc = df.drop(["numC","numD"], 1).describe(percentiles = [], include='all') | ||
assert_series_equal( desc.count(), cnt.drop(["numC","numD"])) | ||
self.assertTrue('50%' not in desc.index) | ||
|
||
def test_describe_typefiltering_category_bool(self): | ||
df = DataFrame({'A_cat': pd.Categorical(['foo', 'foo', 'bar'] * 8), | ||
'B_str': ['a', 'b', 'c', 'd'] * 6, | ||
'C_bool': [True] * 12 + [False] * 12, | ||
'D_num': np.arange(24.) + .5, | ||
'E_ts': tm.makeTimeSeries()[:24].index}) | ||
|
||
# bool is considered numeric in describe, although not an np.number | ||
desc = df.describe() | ||
expected_cols = ['C_bool', 'D_num'] | ||
expected = DataFrame(dict((k, df[k].describe()) | ||
for k in expected_cols), | ||
columns=expected_cols) | ||
assert_frame_equal(desc, expected) | ||
|
||
desc = df.describe(include=["category"]) | ||
self.assertTrue(desc.columns.tolist() == ["A_cat"]) | ||
|
||
# 'all' includes numpy-dtypes + category | ||
desc1 = df.describe(include="all") | ||
desc2 = df.describe(include=[np.generic, "category"]) | ||
assert_frame_equal(desc1, desc2) | ||
|
||
def test_describe_timedelta(self): | ||
df = DataFrame({"td": pd.to_timedelta(np.arange(24)%20,"D")}) | ||
self.assertTrue(df.describe().loc["mean"][0] == pd.to_timedelta("8d4h")) | ||
|
||
def test_describe_typefiltering_dupcol(self): | ||
df = DataFrame({'catA': ['foo', 'foo', 'bar'] * 8, | ||
'catB': ['a', 'b', 'c', 'd'] * 6, | ||
'numC': np.arange(24), | ||
'numD': np.arange(24.) + .5, | ||
'ts': tm.makeTimeSeries()[:24].index}) | ||
s = df.describe(include='all').shape[1] | ||
df = pd.concat([df, df], axis=1) | ||
s2 = df.describe(include='all').shape[1] | ||
self.assertTrue(s2 == 2 * s) | ||
|
||
def test_describe_typefiltering_groupby(self): | ||
df = DataFrame({'catA': ['foo', 'foo', 'bar'] * 8, | ||
'catB': ['a', 'b', 'c', 'd'] * 6, | ||
'numC': np.arange(24), | ||
'numD': np.arange(24.) + .5, | ||
'ts': tm.makeTimeSeries()[:24].index}) | ||
G = df.groupby('catA') | ||
self.assertTrue(G.describe(include=['number']).shape == (16, 2)) | ||
self.assertTrue(G.describe(include=['number', 'object']).shape == (22, 3)) | ||
self.assertTrue(G.describe(include='all').shape == (26, 4)) | ||
|
||
def test_no_order(self): | ||
tm._skip_if_no_scipy() | ||
s = Series([0, 1, np.nan, 3]) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please add a similar note to v0.15.0.txt (and include this PR number as a reference). Put it in the API section, and include a reference to this doc section there.