|
26 | 26 | def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
|
27 | 27 | include_lowest=False):
|
28 | 28 | """
|
29 |
| - Return indices of half-open bins to which each value of `x` belongs. |
| 29 | + Bin values into discrete intervals. |
| 30 | +
|
| 31 | + Use `cut` when you need to segment and sort data values into bins. This |
| 32 | + function is also useful for going from a continuous variable to a |
| 33 | + categorical variable. For example, `cut` could convert ages to groups of |
| 34 | + age ranges. Supports binning into an equal number of bins, or a |
| 35 | + pre-specified array of bins. |
30 | 36 |
|
31 | 37 | Parameters
|
32 | 38 | ----------
|
33 | 39 | x : array-like
|
34 |
| - Input array to be binned. It has to be 1-dimensional. |
35 |
| - bins : int, sequence of scalars, or IntervalIndex |
36 |
| - If `bins` is an int, it defines the number of equal-width bins in the |
37 |
| - range of `x`. However, in this case, the range of `x` is extended |
38 |
| - by .1% on each side to include the min or max values of `x`. If |
39 |
| - `bins` is a sequence it defines the bin edges allowing for |
40 |
| - non-uniform bin width. No extension of the range of `x` is done in |
41 |
| - this case. |
42 |
| - right : bool, optional |
43 |
| - Indicates whether the bins include the rightmost edge or not. If |
44 |
| - right == True (the default), then the bins [1,2,3,4] indicate |
45 |
| - (1,2], (2,3], (3,4]. |
46 |
| - labels : array or boolean, default None |
47 |
| - Used as labels for the resulting bins. Must be of the same length as |
48 |
| - the resulting bins. If False, return only integer indicators of the |
49 |
| - bins. |
50 |
| - retbins : bool, optional |
51 |
| - Whether to return the bins or not. Can be useful if bins is given |
| 40 | + The input array to be binned. Must be 1-dimensional. |
| 41 | + bins : int, sequence of scalars, or pandas.IntervalIndex |
| 42 | + The criteria to bin by. |
| 43 | +
|
| 44 | + * int : Defines the number of equal-width bins in the range of `x`. The |
| 45 | + range of `x` is extended by .1% on each side to include the minimum |
| 46 | + and maximum values of `x`. |
| 47 | + * sequence of scalars : Defines the bin edges allowing for non-uniform |
| 48 | + width. No extension of the range of `x` is done. |
| 49 | + * IntervalIndex : Defines the exact bins to be used. |
| 50 | +
|
| 51 | + right : bool, default True |
| 52 | + Indicates whether `bins` includes the rightmost edge or not. If |
| 53 | + ``right == True`` (the default), then the `bins` ``[1, 2, 3, 4]`` |
| 54 | + indicate (1,2], (2,3], (3,4]. This argument is ignored when |
| 55 | + `bins` is an IntervalIndex. |
| 56 | + labels : array or bool, optional |
| 57 | + Specifies the labels for the returned bins. Must be the same length as |
| 58 | + the resulting bins. If False, returns only integer indicators of the |
| 59 | + bins. This affects the type of the output container (see below). |
| 60 | + This argument is ignored when `bins` is an IntervalIndex. |
| 61 | + retbins : bool, default False |
| 62 | + Whether to return the bins or not. Useful when bins is provided |
52 | 63 | as a scalar.
|
53 |
| - precision : int, optional |
54 |
| - The precision at which to store and display the bins labels |
55 |
| - include_lowest : bool, optional |
| 64 | + precision : int, default 3 |
| 65 | + The precision at which to store and display the bin labels. |
| 66 | + include_lowest : bool, default False |
56 | 67 | Whether the first interval should be left-inclusive or not.
|
57 | 68 |
|
58 | 69 | Returns
|
59 | 70 | -------
|
60 |
| - out : Categorical or Series or array of integers if labels is False |
61 |
| - The return type (Categorical or Series) depends on the input: a Series |
62 |
| - of type category if input is a Series else Categorical. Bins are |
63 |
| - represented as categories when categorical data is returned. |
64 |
| - bins : ndarray of floats |
65 |
| - Returned only if `retbins` is True. |
| 71 | + out : pandas.Categorical, Series, or ndarray |
| 72 | + An array-like object representing the respective bin for each value |
| 73 | + of `x`. The type depends on the value of `labels`. |
66 | 74 |
|
67 |
| - Notes |
68 |
| - ----- |
69 |
| - The `cut` function can be useful for going from a continuous variable to |
70 |
| - a categorical variable. For example, `cut` could convert ages to groups |
71 |
| - of age ranges. |
| 75 | + * None (default) : returns a Series for Series `x` or a |
| 76 | + pandas.Categorical for all other inputs. The values stored within |
| 77 | + are Interval dtype. |
72 | 78 |
|
73 |
| - Any NA values will be NA in the result. Out of bounds values will be NA in |
74 |
| - the resulting Categorical object |
| 79 | + * sequence of scalars : returns a Series for Series `x` or a |
| 80 | + pandas.Categorical for all other inputs. The values stored within |
| 81 | + are whatever the type in the sequence is. |
75 | 82 |
|
| 83 | + * False : returns an ndarray of integers. |
| 84 | +
|
| 85 | + bins : numpy.ndarray or IntervalIndex |
| 86 | + The computed or specified bins. Only returned when `retbins=True`. |
| 87 | + For scalar or sequence `bins`, this is an ndarray with the computed |
| 88 | + bins. For an IntervalIndex `bins`, this is equal to `bins`. |
| 89 | +
|
| 90 | + See Also |
| 91 | + -------- |
| 92 | + qcut : Discretize variable into equal-sized buckets based on rank |
| 93 | + or based on sample quantiles. |
| 94 | + pandas.Categorical : Array type for storing data that come from a |
| 95 | + fixed set of values. |
| 96 | + Series : One-dimensional array with axis labels (including time series). |
| 97 | + pandas.IntervalIndex : Immutable Index implementing an ordered, |
| 98 | + sliceable set. |
| 99 | +
|
| 100 | + Notes |
| 101 | + ----- |
| 102 | + Any NA values will be NA in the result. Out of bounds values will be NA in |
| 103 | + the resulting Series or pandas.Categorical object. |
76 | 104 |
|
77 | 105 | Examples
|
78 | 106 | --------
|
79 |
| - >>> pd.cut(np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]), 3, retbins=True) |
| 107 | + Discretize into three equal-width bins. |
| 108 | +
|
| 109 | + >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3) |
80 | 110 | ... # doctest: +ELLIPSIS
|
81 |
| - ([(0.19, 3.367], (0.19, 3.367], (0.19, 3.367], (3.367, 6.533], ... |
82 |
| - Categories (3, interval[float64]): [(0.19, 3.367] < (3.367, 6.533] ... |
| 111 | + [(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], ... |
| 112 | + Categories (3, interval[float64]): [(0.994, 3.0] < (3.0, 5.0] ... |
83 | 113 |
|
84 |
| - >>> pd.cut(np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]), |
85 |
| - ... 3, labels=["good", "medium", "bad"]) |
86 |
| - ... # doctest: +SKIP |
87 |
| - [good, good, good, medium, bad, good] |
88 |
| - Categories (3, object): [good < medium < bad] |
| 114 | + >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3, retbins=True) |
| 115 | + ... # doctest: +ELLIPSIS |
| 116 | + ([(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], ... |
| 117 | + Categories (3, interval[float64]): [(0.994, 3.0] < (3.0, 5.0] ... |
| 118 | + array([0.994, 3. , 5. , 7. ])) |
| 119 | +
|
| 120 | + Discover the same bins, but assign them specific labels. Notice that |
| 121 | + the returned Categorical's categories are `labels` and that it is ordered. |
| 122 | +
|
| 123 | + >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), |
| 124 | + ... 3, labels=["bad", "medium", "good"]) |
| 125 | + [bad, good, medium, medium, good, bad] |
| 126 | + Categories (3, object): [bad < medium < good] |
89 | 127 |
|
90 |
| - >>> pd.cut(np.ones(5), 4, labels=False) |
91 |
| - array([1, 1, 1, 1, 1]) |
| 128 | + ``labels=False`` returns only the integer indicators of the bins. |
| 129 | +
|
| 130 | + >>> pd.cut([0, 1, 1, 2], bins=4, labels=False) |
| 131 | + array([0, 1, 1, 3]) |
| 132 | +
|
| 133 | + Passing a Series as an input returns a Series with categorical dtype: |
| 134 | +
|
| 135 | + >>> s = pd.Series(np.array([2, 4, 6, 8, 10]), |
| 136 | + ... index=['a', 'b', 'c', 'd', 'e']) |
| 137 | + >>> pd.cut(s, 3) |
| 138 | + ... # doctest: +ELLIPSIS |
| 139 | + a (1.992, 4.667] |
| 140 | + b (1.992, 4.667] |
| 141 | + c (4.667, 7.333] |
| 142 | + d (7.333, 10.0] |
| 143 | + e (7.333, 10.0] |
| 144 | + dtype: category |
| 145 | + Categories (3, interval[float64]): [(1.992, 4.667] < (4.667, ... |
| 146 | +
|
| 147 | + Passing an IntervalIndex for `bins` results in those categories exactly. |
| 148 | + Notice that values not covered by the IntervalIndex are set to NaN. 0 |
| 149 | + is to the left of the first bin (which is closed on the right), and 1.5 |
| 150 | + falls between two bins. |
| 151 | +
|
| 152 | + >>> bins = pd.IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)]) |
| 153 | + >>> pd.cut([0, 0.5, 1.5, 2.5, 4.5], bins) |
| 154 | + [NaN, (0, 1], NaN, (2, 3], (4, 5]] |
| 155 | + Categories (3, interval[int64]): [(0, 1] < (2, 3] < (4, 5]] |
92 | 156 | """
|
93 | 157 | # NOTE: this binning code is changed a bit from histogram for var(x) == 0
|
94 | 158 |
|
|
0 commit comments