From bee981ceb9d723a4aa83ed19ecfa4c67f146720d Mon Sep 17 00:00:00 2001 From: Ashish Date: Tue, 27 Dec 2016 17:51:20 -0500 Subject: [PATCH 01/12] adding 'duplicates' option to qcut --- pandas/tools/tests/test_tile.py | 7 +++++++ pandas/tools/tile.py | 19 ++++++++++++++++--- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/pandas/tools/tests/test_tile.py b/pandas/tools/tests/test_tile.py index c9a96d80f35ba..9fa672613ec3b 100644 --- a/pandas/tools/tests/test_tile.py +++ b/pandas/tools/tests/test_tile.py @@ -272,6 +272,13 @@ def test_series_retbins(self): np.array([0, 0, 1, 1], dtype=np.int8)) tm.assert_numpy_array_equal(bins, np.array([0, 1.5, 3])) + def test_qcut_duplicates_drop(self): + # GH 7751
 + values = [0, 0, 0, 0, 1, 2, 3] + cats = qcut(values, 3, duplicates='drop') + ex_levels = ['[0, 1]', '(1, 3]'] + self.assertTrue((cats.categories == ex_levels).all()) + def test_single_bin(self): # issue 14652 expected = Series([0, 0]) diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py index a372e113f1d7e..46c2ba9f52f48 100644 --- a/pandas/tools/tile.py +++ b/pandas/tools/tile.py @@ -129,7 +129,7 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, series_index, name) -def qcut(x, q, labels=None, retbins=False, precision=3): +def qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise'): """ Quantile-based discretization function. Discretize variable into equal-sized buckets based on rank or based on sample quantiles. For example @@ -151,6 +151,8 @@ def qcut(x, q, labels=None, retbins=False, precision=3): as a scalar. precision : int The precision at which to store and display the bins labels + duplicates : {'raise', 'drop'}, optional
 + If binned edges are not unique, raise ValueError or drop non-
uniques. Returns ------- @@ -187,7 +189,7 @@ def qcut(x, q, labels=None, retbins=False, precision=3): bins = algos.quantile(x, quantiles) fac, bins = _bins_to_cuts(x, bins, labels=labels, precision=precision, include_lowest=True, - dtype=dtype) + dtype=dtype, duplicates=duplicates) return _postprocess_for_cut(fac, bins, retbins, x_is_series, series_index, name) @@ -195,7 +197,18 @@ def qcut(x, q, labels=None, retbins=False, precision=3): def _bins_to_cuts(x, bins, right=True, labels=None, precision=3, include_lowest=False, - dtype=None): + dtype=None, duplicates='raise'): + + if duplicates not in ['raise', 'drop']: + raise ValueError("invalid value for 'duplicates' parameter, " + + "valid options are: raise, drop") + + if duplicates == 'raise': + raise ValueError('Bin edges must be unique: %s' % repr(bins) + + ' You can drop duplicate edges ' + + 'by setting \'duplicates\' param') + else: + bins = algos.unique(bins) side = 'left' if right else 'right' ids = bins.searchsorted(x, side=side) From a2dd8cea0b15c2e7b65d6e307238c3d84502cf1b Mon Sep 17 00:00:00 2001 From: Ashish Date: Tue, 27 Dec 2016 20:34:27 -0500 Subject: [PATCH 02/12] fixing duplicates check --- pandas/tools/tile.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py index 46c2ba9f52f48..a75eb313f1d95 100644 --- a/pandas/tools/tile.py +++ b/pandas/tools/tile.py @@ -203,11 +203,12 @@ def _bins_to_cuts(x, bins, right=True, labels=None, raise ValueError("invalid value for 'duplicates' parameter, " + "valid options are: raise, drop") - if duplicates == 'raise': + if len(algos.unique(bins)) < len(bins): + if duplicates == 'raise': raise ValueError('Bin edges must be unique: %s' % repr(bins) + ' You can drop duplicate edges ' + 'by setting \'duplicates\' param') - else: + else: bins = algos.unique(bins) side = 'left' if right else 'right' From 0b8efeb57351383f58b67adf6cc574e7c8e9220e Mon Sep 17 00:00:00 2001 From: Ashish Singal Date: Wed, 28 Dec 
2016 08:30:39 -0500 Subject: [PATCH 03/12] Update tile.py --- pandas/tools/tile.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py index a75eb313f1d95..99af13e383e17 100644 --- a/pandas/tools/tile.py +++ b/pandas/tools/tile.py @@ -151,8 +151,8 @@ def qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise'): as a scalar. precision : int The precision at which to store and display the bins labels - duplicates : {'raise', 'drop'}, optional
 - If binned edges are not unique, raise ValueError or drop non-
uniques. + duplicates : {'raise', 'drop'}, optional + If binned edges are not unique, raise ValueError or drop non-uniques. Returns ------- From 3f98abcf72949e689efe2b931c38cedafcfba6e2 Mon Sep 17 00:00:00 2001 From: Ashish Singal Date: Wed, 28 Dec 2016 08:49:57 -0500 Subject: [PATCH 04/12] Update tile.py --- pandas/tools/tile.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py index 99af13e383e17..370a5170043f2 100644 --- a/pandas/tools/tile.py +++ b/pandas/tools/tile.py @@ -153,6 +153,7 @@ def qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise'): The precision at which to store and display the bins labels duplicates : {'raise', 'drop'}, optional If binned edges are not unique, raise ValueError or drop non-uniques. + .. versionadded:: 0.20.0 Returns ------- From 1ce77d0f27fd5fdde863f8ceab70de4953bf4c7b Mon Sep 17 00:00:00 2001 From: Ashish Singal Date: Wed, 28 Dec 2016 10:12:57 -0500 Subject: [PATCH 05/12] Update test_tile.py --- pandas/tools/tests/test_tile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tools/tests/test_tile.py b/pandas/tools/tests/test_tile.py index 9fa672613ec3b..4aee47e8a5d79 100644 --- a/pandas/tools/tests/test_tile.py +++ b/pandas/tools/tests/test_tile.py @@ -273,7 +273,7 @@ def test_series_retbins(self): tm.assert_numpy_array_equal(bins, np.array([0, 1.5, 3])) def test_qcut_duplicates_drop(self): - # GH 7751
 + # GH 7751 values = [0, 0, 0, 0, 1, 2, 3] cats = qcut(values, 3, duplicates='drop') ex_levels = ['[0, 1]', '(1, 3]'] From 2161518da1101d8d1a37fc433fd4243296da6eec Mon Sep 17 00:00:00 2001 From: Ashish Singal Date: Wed, 28 Dec 2016 14:24:58 -0500 Subject: [PATCH 06/12] Update tile.py Add unique_bins for performance --- pandas/tools/tile.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py index 370a5170043f2..b40a389f65a37 100644 --- a/pandas/tools/tile.py +++ b/pandas/tools/tile.py @@ -204,13 +204,14 @@ def _bins_to_cuts(x, bins, right=True, labels=None, raise ValueError("invalid value for 'duplicates' parameter, " + "valid options are: raise, drop") - if len(algos.unique(bins)) < len(bins): + unique_bins = algos.unique(bins) + if len(unique_bins) < len(bins): if duplicates == 'raise': raise ValueError('Bin edges must be unique: %s' % repr(bins) + ' You can drop duplicate edges ' + 'by setting \'duplicates\' param') else: - bins = algos.unique(bins) + bins = unique_bins side = 'left' if right else 'right' ids = bins.searchsorted(x, side=side) From 3dbc416dfc2557b511c2152ec9cdbf3f39513f75 Mon Sep 17 00:00:00 2001 From: Ashish Singal Date: Wed, 28 Dec 2016 14:29:12 -0500 Subject: [PATCH 07/12] Update v0.20.0.txt --- doc/source/whatsnew/v0.20.0.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 0873e4b34b0b1..073fa38787ee0 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -105,6 +105,7 @@ Other enhancements of sorting or an incorrect key. 
See :ref:`here <advanced.unsorted>` - ``pd.cut`` and ``pd.qcut`` now support datetime64 and timedelta64 dtypes (:issue:`14714`, :issue:`14798`) +- ``pd.qcut`` can optionally remove duplicate edges instead of throwing an error (:issue:`7751`) - ``Series`` provides a ``to_excel`` method to output Excel files (:issue:`8825`) - The ``usecols`` argument in ``pd.read_csv`` now accepts a callable function as a value (:issue:`14154`) - ``pd.DataFrame.plot`` now prints a title above each subplot if ``suplots=True`` and ``title`` is a list of strings (:issue:`14753`) From 2c5bc35429b298445c7846379a8651d8ad7324f6 Mon Sep 17 00:00:00 2001 From: Ashish Date: Wed, 28 Dec 2016 21:25:40 -0500 Subject: [PATCH 08/12] added duplicates='raise' test. other fixes to qcut for duplicates='raise' --- pandas/tools/tests/test_tile.py | 5 +++++ pandas/tools/tile.py | 13 +++++-------- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/pandas/tools/tests/test_tile.py b/pandas/tools/tests/test_tile.py index 4aee47e8a5d79..8b180957801f9 100644 --- a/pandas/tools/tests/test_tile.py +++ b/pandas/tools/tests/test_tile.py @@ -279,6 +279,11 @@ def test_qcut_duplicates_drop(self): ex_levels = ['[0, 1]', '(1, 3]'] self.assertTrue((cats.categories == ex_levels).all()) + def test_qcut_duplicates_raise(self): + # GH 7751 + values = [0, 0, 0, 0, 1, 2, 3] + self.assertRaises(ValueError, qcut, values, 3, duplicates='raise') + def test_single_bin(self): # issue 14652 expected = Series([0, 0]) diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py index b40a389f65a37..f7ad0c2931e69 100644 --- a/pandas/tools/tile.py +++ b/pandas/tools/tile.py @@ -152,7 +152,7 @@ def qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise'): precision : int The precision at which to store and display the bins labels duplicates : {'raise', 'drop'}, optional - If binned edges are not unique, raise ValueError or drop non-uniques. + If bin edges are not unique, raise ValueError or drop non-uniques. .. 
versionadded:: 0.20.0 Returns @@ -202,23 +202,20 @@ def _bins_to_cuts(x, bins, right=True, labels=None, if duplicates not in ['raise', 'drop']: raise ValueError("invalid value for 'duplicates' parameter, " - + "valid options are: raise, drop") + "valid options are: raise, drop") unique_bins = algos.unique(bins) if len(unique_bins) < len(bins): if duplicates == 'raise': - raise ValueError('Bin edges must be unique: %s' % repr(bins) + - ' You can drop duplicate edges ' + - 'by setting \'duplicates\' param') + raise ValueError("Bin edges must be unique: {} " + "You can drop duplicate edges " + "by setting 'duplicates' param".format(repr(bins))) else: bins = unique_bins side = 'left' if right else 'right' ids = bins.searchsorted(x, side=side) - if len(algos.unique(bins)) < len(bins): - raise ValueError('Bin edges must be unique: %s' % repr(bins)) - if include_lowest: ids[x == bins[0]] = 1 From 221c0b30d198934732a437922ebce1a2b19da34e Mon Sep 17 00:00:00 2001 From: Ashish Singal Date: Thu, 29 Dec 2016 06:54:53 -0500 Subject: [PATCH 09/12] Update tile.py Line too long code formatting fix. --- pandas/tools/tile.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py index f7ad0c2931e69..cbcfcc93d4f5c 100644 --- a/pandas/tools/tile.py +++ b/pandas/tools/tile.py @@ -207,9 +207,9 @@ def _bins_to_cuts(x, bins, right=True, labels=None, unique_bins = algos.unique(bins) if len(unique_bins) < len(bins): if duplicates == 'raise': - raise ValueError("Bin edges must be unique: {} " - "You can drop duplicate edges " - "by setting 'duplicates' param".format(repr(bins))) + raise ValueError("Bin edges must be unique: {}. 
You " + "can drop duplicate edges by setting " + "'duplicates' param".format(repr(bins))) else: bins = unique_bins From 42bf4820152c26720ccf9053409a2f6a594eac9b Mon Sep 17 00:00:00 2001 From: Ashish Singal Date: Thu, 29 Dec 2016 07:17:06 -0500 Subject: [PATCH 10/12] Update tile.py Docstring update --- pandas/tools/tile.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py index cbcfcc93d4f5c..3100ea38b2af6 100644 --- a/pandas/tools/tile.py +++ b/pandas/tools/tile.py @@ -151,8 +151,9 @@ def qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise'): as a scalar. precision : int The precision at which to store and display the bins labels - duplicates : {'raise', 'drop'}, optional + duplicates : {default 'raise', 'drop'}, optional If bin edges are not unique, raise ValueError or drop non-uniques. + .. versionadded:: 0.20.0 Returns From b6bf4016da8e4fb4334784ef432615a628dfc8fb Mon Sep 17 00:00:00 2001 From: Ashish Singal Date: Thu, 29 Dec 2016 09:14:17 -0500 Subject: [PATCH 11/12] Update v0.20.0.txt --- doc/source/whatsnew/v0.20.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 073fa38787ee0..64c5de6cb100a 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -105,7 +105,7 @@ Other enhancements of sorting or an incorrect key. 
See :ref:`here <advanced.unsorted>` - ``pd.cut`` and ``pd.qcut`` now support datetime64 and timedelta64 dtypes (:issue:`14714`, :issue:`14798`) -- ``pd.qcut`` can optionally remove duplicate edges instead of throwing an error (:issue:`7751`) +- ``pd.qcut`` has gained the ``duplicates='raise'|'drop'`` option to control whether to raise on duplicated edges (:issue:`7751`) - ``Series`` provides a ``to_excel`` method to output Excel files (:issue:`8825`) - The ``usecols`` argument in ``pd.read_csv`` now accepts a callable function as a value (:issue:`14154`) - ``pd.DataFrame.plot`` now prints a title above each subplot if ``suplots=True`` and ``title`` is a list of strings (:issue:`14753`) From 698b4ecf284283604673ee393bf006de9d38983d Mon Sep 17 00:00:00 2001 From: Ashish Singal Date: Thu, 29 Dec 2016 12:08:33 -0500 Subject: [PATCH 12/12] Update tile.py Deleting whitespace. --- pandas/tools/tile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py index 3100ea38b2af6..2875d9c14dc47 100644 --- a/pandas/tools/tile.py +++ b/pandas/tools/tile.py @@ -153,7 +153,7 @@ def qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise'): The precision at which to store and display the bins labels duplicates : {default 'raise', 'drop'}, optional If bin edges are not unique, raise ValueError or drop non-uniques. - + .. versionadded:: 0.20.0 Returns