More flexible describe() via include/exclude type filtering

bthyreau · bthyreau · commit c2a1e188a720 · 2014-10-06T08:56:34.000+09:00
This enhances describe()'s output via new include/exclude list arguments,
letting the user specify the dtypes to be summarized as output.
This provides a simple way to overcome the automatic type-filtering done
by default; it's also convenient with groupby().
Also includes documentation and changelog entries.
diff --git a/doc/source/basics.rst b/doc/source/basics.rst
@@ -490,6 +490,24 @@ number of unique values and most frequently occurring values:
    s = Series(['a', 'a', 'b', 'b', 'a', 'a', np.nan, 'c', 'd', 'a'])
    s.describe()
 
+Note that on a mixed-type DataFrame object, `describe` will restrict the summary to
+include only numerical columns or, if none are, only categorical columns:
+
+.. ipython:: python
+
+    frame = DataFrame({'a': ['Yes', 'Yes', 'No', 'No'], 'b': range(4)})
+    frame.describe()
+
+This behaviour can be controlled by providing a list of types as ``include``/``exclude``
+arguments. The special value ``all`` can also be used:
+
+.. ipython:: python
+
+    frame.describe(include=['object'])
+    frame.describe(include=['number'])
+    frame.describe(include='all')
+
+This feature relies on :ref:`select_dtypes <basics.selectdtypes>`. Refer to that section for details about accepted inputs.
 
 There also is a utility function, ``value_range`` which takes a DataFrame and
 returns a series with the minimum/maximum values in the DataFrame.
diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt
@@ -56,6 +56,24 @@ users upgrade to this version.
 
 API changes
 ~~~~~~~~~~~
+- :func:`describe` on mixed-type DataFrames is more flexible. Type-based column filtering is now possible via the ``include``/``exclude`` arguments (:issue:`8164`).
+
+  .. ipython:: python
+
+    df = DataFrame({'catA': ['foo', 'foo', 'bar'] * 8,
+                    'catB': ['a', 'b', 'c', 'd'] * 6,
+                    'numC': np.arange(24),
+                    'numD': np.arange(24.) + .5})
+    df.describe(include=["object"])
+    df.describe(include=["number", "object"], exclude=["float"])
+
+  Requesting all columns is possible with the shorthand ``'all'``:
+
+  .. ipython:: python
+
+    df.describe(include='all')
+
+  Without those arguments, ``describe`` will behave as before, including only numerical columns or, if none are, only categorical columns. See also the :ref:`docs <basics.describe>`.
 
 - Passing multiple levels to `DataFrame.stack()` will now work when multiple level
   numbers are passed (:issue:`7660`), and will raise a ``ValueError`` when the
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -3658,27 +3658,51 @@ def abs(self):
             The percentiles to include in the output. Should all
             be in the interval [0, 1]. By default `percentiles` is
             [.25, .5, .75], returning the 25th, 50th, and 75th percentiles.
+        include, exclude : list-like, 'all', or None (default)
+            Specify the form of the returned result. Either:
+
+            - None to both (default). The result will include only numeric-typed
+              columns or, if none are, only categorical columns.
+            - A list of dtypes or strings to be included/excluded.
+              To select all numeric types use numpy.number. To select
+              categorical objects use type object. See also the select_dtypes
+              documentation. Example: df.describe(include=['O'])
+            - If include is the string 'all', the output column-set will
+              match the input one.
 
         Returns
         -------
         summary: %(klass)s of summary statistics
 
         Notes
         -----
-        For numeric dtypes the index includes: count, mean, std, min,
+        The output DataFrame index depends on the requested dtypes:
+
+        For numeric dtypes, it will include: count, mean, std, min,
         max, and lower, 50, and upper percentiles.
 
-        If self is of object dtypes (e.g. timestamps or strings), the output
+        For object dtypes (e.g. timestamps or strings), the index
         will include the count, unique, most common, and frequency of the
         most common. Timestamps also include the first and last items.
 
+        For mixed dtypes, the index will be the union of the corresponding
+        output types. Non-applicable entries will be filled with NaN.
+        Note that mixed-dtype outputs can only be returned from mixed-dtype
+        inputs and appropriate use of the include/exclude arguments.
+
         If multiple values have the highest count, then the
         `count` and `most common` pair will be arbitrarily chosen from
         among those with the highest count.
+
+        The include, exclude arguments are ignored for Series.
+
+        See also
+        --------
+        DataFrame.select_dtypes
         """
 
     @Appender(_shared_docs['describe'] % _shared_doc_kwargs)
-    def describe(self, percentile_width=None, percentiles=None):
+    def describe(self, percentile_width=None, percentiles=None, include=None, exclude=None ):
         if self.ndim >= 3:
             msg = "describe is not implemented on on Panel or PanelND objects."
             raise NotImplementedError(msg)
@@ -3715,16 +3739,6 @@ def describe(self, percentile_width=None, percentiles=None):
             uh = percentiles[percentiles > .5]
             percentiles = np.hstack([lh, 0.5, uh])
 
-        # dtypes: numeric only, numeric mixed, objects only
-        data = self._get_numeric_data()
-        if self.ndim > 1:
-            if len(data._info_axis) == 0:
-                is_object = True
-            else:
-                is_object = False
-        else:
-            is_object = not self._is_numeric_mixed_type
-
         def pretty_name(x):
             x *= 100
             if x == int(x):
@@ -3733,10 +3747,12 @@ def pretty_name(x):
                 return '%.1f%%' % x
 
         def describe_numeric_1d(series, percentiles):
-            return ([series.count(), series.mean(), series.std(),
-                     series.min()] +
-                    [series.quantile(x) for x in percentiles] +
-                    [series.max()])
+            stat_index = (['count', 'mean', 'std', 'min'] +
+                  [pretty_name(x) for x in percentiles] + ['max'])
+            d = ([series.count(), series.mean(), series.std(), series.min()] +
+                 [series.quantile(x) for x in percentiles] + [series.max()])
+            return pd.Series(d, index=stat_index, name=series.name)
+
 
         def describe_categorical_1d(data):
             names = ['count', 'unique']
@@ -3749,44 +3765,49 @@ def describe_categorical_1d(data):
                     names += ['top', 'freq']
                     result += [top, freq]
 
-                elif issubclass(data.dtype.type, np.datetime64):
+                elif com.is_datetime64_dtype(data):
                     asint = data.dropna().values.view('i8')
-                    names += ['first', 'last', 'top', 'freq']
-                    result += [lib.Timestamp(asint.min()),
-                               lib.Timestamp(asint.max()),
-                               lib.Timestamp(top), freq]
-
-            return pd.Series(result, index=names)
-
-        if is_object:
-            if data.ndim == 1:
-                return describe_categorical_1d(self)
+                    names += ['top', 'freq', 'first', 'last']
+                    result += [lib.Timestamp(top), freq,
+                               lib.Timestamp(asint.min()),
+                               lib.Timestamp(asint.max())]
+
+            return pd.Series(result, index=names, name=data.name)
+
+        def describe_1d(data, percentiles):
+            if com.is_numeric_dtype(data):
+                return describe_numeric_1d(data, percentiles)
+            elif com.is_timedelta64_dtype(data):
+                return describe_numeric_1d(data, percentiles)
             else:
-                result = pd.DataFrame(dict((k, describe_categorical_1d(v))
-                                           for k, v in compat.iteritems(self)),
-                                      columns=self._info_axis,
-                                      index=['count', 'unique', 'first', 'last',
-                                             'top', 'freq'])
-                # just objects, no datime
-                if pd.isnull(result.loc['first']).all():
-                    result = result.drop(['first', 'last'], axis=0)
-                return result
-        else:
-            stat_index = (['count', 'mean', 'std', 'min'] +
-                          [pretty_name(x) for x in percentiles] +
-                          ['max'])
-            if data.ndim == 1:
-                return pd.Series(describe_numeric_1d(data, percentiles),
-                                 index=stat_index)
+                return describe_categorical_1d(data)
+
+        if self.ndim == 1:
+            return describe_1d(self, percentiles)
+        elif (include is None) and (exclude is None):
+            if len(self._get_numeric_data()._info_axis) > 0:
+                # when some numerics are found, keep only numerics
+                data = self.select_dtypes(include=[np.number, np.bool])
             else:
-                destat = []
-                for i in range(len(data._info_axis)):  # BAD
-                    series = data.iloc[:, i]
-                    destat.append(describe_numeric_1d(series, percentiles))
-
-                return self._constructor(lmap(list, zip(*destat)),
-                                         index=stat_index,
-                                         columns=data._info_axis)
+                data = self
+        elif include == 'all':
+            if exclude != None:
+                msg = "exclude must be None when include is 'all'"
+                raise ValueError(msg)
+            data = self
+        else:
+            data = self.select_dtypes(include=include, exclude=exclude)
+
+        ldesc = [describe_1d(s, percentiles) for _, s in data.iteritems()]
+        # set a convenient order for rows
+        names = []
+        ldesc_indexes = sorted([x.index for x in ldesc], key=len)
+        for idxnames in ldesc_indexes:
+            for name in idxnames:
+                if name not in names:
+                    names.append(name)
+        d = pd.concat(ldesc, join_axes=pd.Index([names]), axis=1)
+        return d
 
     _shared_docs['pct_change'] = """
         Percent change over given number of periods.
diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py
@@ -1005,18 +1005,17 @@ def test_describe_objects(self):
         df = DataFrame({"C1": pd.date_range('2010-01-01', periods=4, freq='D')})
         df.loc[4] = pd.Timestamp('2010-01-04')
         result = df.describe()
-        expected = DataFrame({"C1": [5, 4, pd.Timestamp('2010-01-01'),
-                                     pd.Timestamp('2010-01-04'),
-                                     pd.Timestamp('2010-01-04'), 2]},
-                             index=['count', 'unique', 'first', 'last', 'top',
-                                    'freq'])
+        expected = DataFrame({"C1": [5, 4, pd.Timestamp('2010-01-04'), 2,
+                                     pd.Timestamp('2010-01-01'),
+                                     pd.Timestamp('2010-01-04')]},
+                             index=['count', 'unique', 'top', 'freq',
+                                    'first', 'last'])
         assert_frame_equal(result, expected)
 
         # mix time and str
         df['C2'] = ['a', 'a', 'b', 'c', 'a']
         result = df.describe()
-        # when mix of dateimte / obj the index gets reordered.
-        expected['C2'] = [5, 3, np.nan, np.nan, 'a', 3]
+        expected['C2'] = [5, 3, 'a', 3, np.nan, np.nan]
         assert_frame_equal(result, expected)
 
         # just str
@@ -1036,6 +1035,112 @@ def test_describe_objects(self):
         assert_frame_equal(df[['C1', 'C3']].describe(), df[['C3']].describe())
         assert_frame_equal(df[['C2', 'C3']].describe(), df[['C3']].describe())
 
+    def test_describe_typefiltering(self):
+        df = DataFrame({'catA': ['foo', 'foo', 'bar'] * 8,
+                        'catB': ['a', 'b', 'c', 'd'] * 6,
+                        'numC': np.arange(24, dtype='int64'),
+                        'numD': np.arange(24.) + .5,
+                        'ts': tm.makeTimeSeries()[:24].index})
+
+        descN = df.describe()
+        expected_cols = ['numC', 'numD',]
+        expected = DataFrame(dict((k, df[k].describe())
+                                  for k in expected_cols),
+                             columns=expected_cols)
+        assert_frame_equal(descN, expected)
+
+        desc = df.describe(include=['number'])
+        assert_frame_equal(desc, descN)
+        desc = df.describe(exclude=['object', 'datetime'])
+        assert_frame_equal(desc, descN)
+        desc = df.describe(include=['float'])
+        assert_frame_equal(desc, descN.drop('numC',1))
+
+        descC = df.describe(include=['O'])
+        expected_cols = ['catA', 'catB']
+        expected = DataFrame(dict((k, df[k].describe())
+                                  for k in expected_cols),
+                             columns=expected_cols)
+        assert_frame_equal(descC, expected)
+
+        descD = df.describe(include=['datetime'])
+        assert_series_equal( descD.ts, df.ts.describe())
+
+        desc = df.describe(include=['object','number', 'datetime'])
+        assert_frame_equal(desc.loc[:,["numC","numD"]].dropna(), descN)
+        assert_frame_equal(desc.loc[:,["catA","catB"]].dropna(), descC)
+        descDs = descD.sort_index() # the index order change for mixed-types
+        assert_frame_equal(desc.loc[:,"ts":].dropna().sort_index(), descDs)
+
+        desc = df.loc[:,'catA':'catB'].describe(include='all')
+        assert_frame_equal(desc, descC)
+        desc = df.loc[:,'numC':'numD'].describe(include='all')
+        assert_frame_equal(desc, descN)
+
+        desc = df.describe(percentiles = [], include='all')
+        cnt = Series(data=[4,4,6,6,6], index=['catA','catB','numC','numD','ts'])
+        assert_series_equal( desc.count(), cnt)
+        self.assertTrue('count' in desc.index)
+        self.assertTrue('unique' in desc.index)
+        self.assertTrue('50%' in desc.index)
+        self.assertTrue('first' in desc.index)
+
+        desc = df.drop("ts", 1).describe(percentiles = [], include='all')
+        assert_series_equal( desc.count(), cnt.drop("ts"))
+        self.assertTrue('first' not in desc.index)
+        desc = df.drop(["numC","numD"], 1).describe(percentiles = [], include='all')
+        assert_series_equal( desc.count(), cnt.drop(["numC","numD"]))
+        self.assertTrue('50%' not in desc.index)
+
+    def test_describe_typefiltering_category_bool(self):
+        df = DataFrame({'A_cat': pd.Categorical(['foo', 'foo', 'bar'] * 8),
+                        'B_str': ['a', 'b', 'c', 'd'] * 6,
+                        'C_bool': [True] * 12 + [False] * 12,
+                        'D_num': np.arange(24.) + .5,
+                        'E_ts': tm.makeTimeSeries()[:24].index})
+
+        # bool is considered numeric in describe, although not an np.number
+        desc = df.describe()
+        expected_cols = ['C_bool', 'D_num']
+        expected = DataFrame(dict((k, df[k].describe())
+                                  for k in expected_cols),
+                             columns=expected_cols)
+        assert_frame_equal(desc, expected)
+
+        desc = df.describe(include=["category"])
+        self.assertTrue(desc.columns.tolist() == ["A_cat"])
+
+        # 'all' includes numpy-dtypes + category
+        desc1 = df.describe(include="all")
+        desc2 = df.describe(include=[np.generic, "category"])
+        assert_frame_equal(desc1, desc2)
+
+    def test_describe_timedelta(self):
+        df = DataFrame({"td": pd.to_timedelta(np.arange(24)%20,"D")})
+        self.assertTrue(df.describe().loc["mean"][0] == pd.to_timedelta("8d4h"))
+
+    def test_describe_typefiltering_dupcol(self):
+        df = DataFrame({'catA': ['foo', 'foo', 'bar'] * 8,
+                        'catB': ['a', 'b', 'c', 'd'] * 6,
+                        'numC': np.arange(24),
+                        'numD': np.arange(24.) + .5,
+                        'ts': tm.makeTimeSeries()[:24].index})
+        s = df.describe(include='all').shape[1]
+        df = pd.concat([df, df], axis=1)
+        s2 = df.describe(include='all').shape[1]
+        self.assertTrue(s2 == 2 * s)
+
+    def test_describe_typefiltering_groupby(self):
+        df = DataFrame({'catA': ['foo', 'foo', 'bar'] * 8,
+                'catB': ['a', 'b', 'c', 'd'] * 6,
+                'numC': np.arange(24),
+                'numD': np.arange(24.) + .5,
+                'ts': tm.makeTimeSeries()[:24].index})
+        G = df.groupby('catA')
+        self.assertTrue(G.describe(include=['number']).shape == (16, 2))
+        self.assertTrue(G.describe(include=['number', 'object']).shape == (22, 3))
+        self.assertTrue(G.describe(include='all').shape == (26, 4))
+
     def test_no_order(self):
         tm._skip_if_no_scipy()
         s = Series([0, 1, np.nan, 3])