From 8e1e97e7d38005d7e209227525afe6597f7c7356 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Tue, 30 Oct 2018 08:15:36 +0100 Subject: [PATCH 1/8] API: fix corner case of lib.infer_dtype --- doc/source/whatsnew/v0.24.0.txt | 2 ++ pandas/_libs/lib.pyx | 17 +++++++++++------ pandas/tests/dtypes/test_inference.py | 15 ++++++++++++++- 3 files changed, 27 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index de111072bef02..02f47e8d1c96e 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -244,6 +244,8 @@ Backwards incompatible API changes - A newly constructed empty :class:`DataFrame` with integer as the ``dtype`` will now only be cast to ``float64`` if ``index`` is specified (:issue:`22858`) - :meth:`Series.str.cat` will now raise if `others` is a `set` (:issue:`23009`) +- The method `pandas._libs.lib.infer_dtype` now returns `'empty'` rather than (sometimes) the dtype of the array in case the array has length zero, + or if it only consists of missing values in case of `skipna=True` (:issue:`23421`) .. _whatsnew_0240.api_breaking.deps: diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index c5d5a431e8139..abb88e2f8b585 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -59,6 +59,7 @@ from tslibs.timezones cimport get_timezone, tz_compare from missing cimport (checknull, is_null_datetime64, is_null_timedelta64, is_null_period) +from missing import isnaobj # constants that will be compared to potentially arbitrarily large @@ -1171,20 +1172,24 @@ def infer_dtype(object value, bint skipna=False): values = construct_1d_object_array_from_listlike(value) values = getattr(values, 'values', values) - val = _try_infer_map(values) - if val is not None: - return val - - if values.dtype != np.object_: - values = values.astype('O') + if skipna: + values = values[~isnaobj(values)] # make contiguous values = values.ravel() n = len(values) if n == 0: + # length check comes before _try_infer_map return 'empty' + val = _try_infer_map(values) + if val is not None: + return val + + if values.dtype != np.object_: + values = values.astype('O') + # try to use a valid value for i in range(n): val = values[i] diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index d0dd03d6eb8df..e14d697274206 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -497,7 +497,7 @@ class Dummy(): def test_length_zero(self): result = lib.infer_dtype(np.array([], dtype='i4')) - assert result == 'integer' + assert result == 'empty' result = lib.infer_dtype([]) assert result == 'empty' @@ -591,6 +591,19 @@ def test_unicode(self): expected = 'unicode' if PY2 else 'string' assert result == expected + @pytest.mark.parametrize('dtype, skipna, expected', [ + (float, False, 'floating'), + (float, True, 'empty'), + (object, False, 'floating'), + (object, True, 'empty') + ]) + def test_object_empty(self, dtype, skipna, expected): + # GH 23421 + arr = pd.Series([np.nan, np.nan], dtype=dtype) + + result = lib.infer_dtype(arr, skipna=skipna) + assert result == expected + def test_datetime(self): dates = [datetime(2012, 1, x) for x in range(1, 20)] From 3f988e8b37af3a21a2632003a7728e1cb38ef222 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Tue, 30 Oct 2018 21:48:27 +0100 Subject: [PATCH 2/8] Do not prioritize 'empty' over dtype --- pandas/_libs/lib.pyx | 15 +++++++-------- pandas/tests/dtypes/test_inference.py | 4 ++-- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index abb88e2f8b585..c07a0a8822139 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1175,14 +1175,6 @@ def infer_dtype(object value, bint skipna=False): if skipna: values = values[~isnaobj(values)] - # make contiguous - values = values.ravel() - - n = len(values) - if n == 0: - # length check comes before _try_infer_map - return 'empty' - val = _try_infer_map(values) if val is not None: return val @@ -1190,6 +1182,13 @@ def infer_dtype(object value, bint skipna=False): if values.dtype != np.object_: values = values.astype('O') + # make contiguous + values = values.ravel() + + n = len(values) + if n == 0: + return 'empty' + # try to use a valid value for i in range(n): val = values[i] diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index e14d697274206..5aa665efe007e 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -497,7 +497,7 @@ class Dummy(): def test_length_zero(self): result = lib.infer_dtype(np.array([], dtype='i4')) - assert result == 'empty' + assert result == 'integer' result = lib.infer_dtype([]) assert result == 'empty' @@ -593,7 +593,7 @@ def test_unicode(self): @pytest.mark.parametrize('dtype, skipna, expected', [ (float, False, 'floating'), - (float, True, 'empty'), + (float, True, 'floating'), (object, False, 'floating'), (object, True, 'empty') ]) From 08d67e88fd56298835e0c7d893a1e20a8b821e69 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Tue, 30 Oct 2018 21:53:23 +0100 Subject: [PATCH 3/8] Update whatsnew --- doc/source/whatsnew/v0.24.0.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 02f47e8d1c96e..6271cdaa0a3f5 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -244,8 +244,8 @@ Backwards incompatible API changes - A newly constructed empty :class:`DataFrame` with integer as the ``dtype`` will now only be cast to ``float64`` if ``index`` is specified (:issue:`22858`) - :meth:`Series.str.cat` will now raise if `others` is a `set` (:issue:`23009`) -- The method `pandas._libs.lib.infer_dtype` now returns `'empty'` rather than (sometimes) the dtype of the array in case the array has length zero, - or if it only consists of missing values in case of `skipna=True` (:issue:`23421`) +- The method `pandas._libs.lib.infer_dtype` now returns `'empty'` rather than (sometimes) the dtype of the array, + in case the array only consists of missing values and `skipna=True` (:issue:`23421`) .. _whatsnew_0240.api_breaking.deps: From bcc481b728a5d4c45e95782ae5dd90572cace683 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Wed, 31 Oct 2018 00:34:47 +0100 Subject: [PATCH 4/8] Retrigger CI due to hypothesis timeout From c0ded963bac0aa72bb2c2ca10c4a80560b3b4fa3 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Wed, 31 Oct 2018 17:14:35 +0100 Subject: [PATCH 5/8] Review (jreback) --- doc/source/whatsnew/v0.24.0.txt | 2 -- pandas/tests/dtypes/test_inference.py | 17 ++++++++++------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 6271cdaa0a3f5..de111072bef02 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -244,8 +244,6 @@ Backwards incompatible API changes - A newly constructed empty :class:`DataFrame` with integer as the ``dtype`` will now only be cast to ``float64`` if ``index`` is specified (:issue:`22858`) - :meth:`Series.str.cat` will now raise if `others` is a `set` (:issue:`23009`) -- The method `pandas._libs.lib.infer_dtype` now returns `'empty'` rather than (sometimes) the dtype of the array, - in case the array only consists of missing values and `skipna=True` (:issue:`23421`) .. _whatsnew_0240.api_breaking.deps: diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 5aa665efe007e..c5911da1666d2 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -591,15 +591,18 @@ def test_unicode(self): expected = 'unicode' if PY2 else 'string' assert result == expected - @pytest.mark.parametrize('dtype, skipna, expected', [ - (float, False, 'floating'), - (float, True, 'floating'), - (object, False, 'floating'), - (object, True, 'empty') + @pytest.mark.parametrize('dtype, missing, skipna, expected', [ + (float, np.nan, False, 'floating'), + (float, np.nan, True, 'floating'), + (object, np.nan, False, 'floating'), + (object, np.nan, True, 'empty'), + (object, None, False, 'mixed'), + (object, None, True, 'empty') ]) - def test_object_empty(self, dtype, skipna, expected): + @pytest.mark.parametrize('box', [pd.Series, np.array]) + def test_object_empty(self, box, missing, dtype, skipna, expected): # GH 23421 - arr = pd.Series([np.nan, np.nan], dtype=dtype) + arr = box([missing, missing], dtype=dtype) result = lib.infer_dtype(arr, skipna=skipna) assert result == expected From 7e5d453550beda033c1393e4d1037626a363aeef Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Thu, 1 Nov 2018 08:12:57 +0100 Subject: [PATCH 6/8] Review (jreback) --- pandas/_libs/lib.pyx | 3 +-- pandas/_libs/missing.pxd | 4 ++++ pandas/_libs/missing.pyx | 2 +- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index c07a0a8822139..3b28e1f74249a 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -57,9 +57,8 @@ from tslibs.conversion cimport convert_to_tsobject from tslibs.timedeltas cimport convert_to_timedelta64 from tslibs.timezones cimport get_timezone, tz_compare -from missing cimport (checknull, +from missing cimport (checknull, isnaobj, is_null_datetime64, is_null_timedelta64, is_null_period) -from missing import isnaobj # constants that will be compared to potentially arbitrarily large diff --git a/pandas/_libs/missing.pxd b/pandas/_libs/missing.pxd index 2c1f13eeb5dff..e171dc5f2c962 100644 --- a/pandas/_libs/missing.pxd +++ b/pandas/_libs/missing.pxd @@ -1,10 +1,14 @@ # -*- coding: utf-8 -*- +from numpy cimport ndarray, uint8_t + from tslibs.nattype cimport is_null_datetimelike cpdef bint checknull(object val) cpdef bint checknull_old(object val) +cpdef ndarray[uint8_t] isnaobj(ndarray arr) + cdef bint is_null_datetime64(v) cdef bint is_null_timedelta64(v) cdef bint is_null_period(v) diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index 2590a30c57f33..6776a4b6d7f7e 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -124,7 +124,7 @@ cdef inline bint _check_none_nan_inf_neginf(object val): @cython.wraparound(False) @cython.boundscheck(False) -def isnaobj(ndarray arr): +cpdef ndarray[uint8_t] isnaobj(ndarray arr): """ Return boolean mask denoting which elements of a 1-D array are na-like, according to the criteria defined in `_check_all_nulls`: From a533ed80766e9671f7ec911d98f8f48268bad3ad Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Sat, 3 Nov 2018 16:42:14 +0100 Subject: [PATCH 7/8] Fix rebase oversight --- pandas/_libs/missing.pxd | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandas/_libs/missing.pxd b/pandas/_libs/missing.pxd index e171dc5f2c962..d0dd306680ae8 100644 --- a/pandas/_libs/missing.pxd +++ b/pandas/_libs/missing.pxd @@ -2,11 +2,8 @@ from numpy cimport ndarray, uint8_t -from tslibs.nattype cimport is_null_datetimelike - cpdef bint checknull(object val) cpdef bint checknull_old(object val) - cpdef ndarray[uint8_t] isnaobj(ndarray arr) cdef bint is_null_datetime64(v) From 19cf2dd8e66f33de7b2b49d9a1d4d67f4d649a04 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Sat, 3 Nov 2018 16:59:37 +0100 Subject: [PATCH 8/8] Retrigger CircleCI