Skip to content

Commit 2f842f2

Browse files
h-vetinari authored and jreback committed
DEPR: deprecate default of skipna=False in infer_dtype (#24050)
1 parent c5166b6 commit 2f842f2

File tree

6 files changed

+73
-58
lines changed

6 files changed

+73
-58
lines changed

doc/source/whatsnew/v0.24.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -1220,6 +1220,7 @@ Deprecations
12201220
- :func:`pandas.api.types.is_datetimetz` is deprecated in favor of `pandas.api.types.is_datetime64tz` (:issue:`23917`)
12211221
- Creating a :class:`TimedeltaIndex`, :class:`DatetimeIndex`, or :class:`PeriodIndex` by passing range arguments `start`, `end`, and `periods` is deprecated in favor of :func:`timedelta_range`, :func:`date_range`, or :func:`period_range` (:issue:`23919`)
12221222
- Passing a string alias like ``'datetime64[ns, UTC]'`` as the ``unit`` parameter to :class:`DatetimeTZDtype` is deprecated. Use :class:`DatetimeTZDtype.construct_from_string` instead (:issue:`23990`).
1223+
- The default value of the ``skipna`` parameter of :func:`~pandas.api.types.infer_dtype` will switch from ``False`` to ``True`` in a future version of pandas (:issue:`17066`, :issue:`24050`)
12231224
- In :meth:`Series.where` with Categorical data, providing an ``other`` that is not present in the categories is deprecated. Convert the categorical to a different dtype or add the ``other`` to the categories first (:issue:`24077`).
12241225
- :meth:`Series.clip_lower`, :meth:`Series.clip_upper`, :meth:`DataFrame.clip_lower` and :meth:`DataFrame.clip_upper` are deprecated and will be removed in a future version. Use ``Series.clip(lower=threshold)``, ``Series.clip(upper=threshold)`` and the equivalent ``DataFrame`` methods (:issue:`24203`)
12251226

pandas/_libs/lib.pyx

+9-3
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ from fractions import Fraction
44
from numbers import Number
55

66
import sys
7+
import warnings
78

89
import cython
910
from cython import Py_ssize_t
@@ -1079,7 +1080,7 @@ cdef _try_infer_map(v):
10791080
return None
10801081

10811082

1082-
def infer_dtype(value: object, skipna: bool=False) -> str:
1083+
def infer_dtype(value: object, skipna: object=None) -> str:
10831084
"""
10841085
Efficiently infer the type of a passed val, or list-like
10851086
array of values. Return a string describing the type.
@@ -1088,8 +1089,7 @@ def infer_dtype(value: object, skipna: bool=False) -> str:
10881089
----------
10891090
value : scalar, list, ndarray, or pandas type
10901091
skipna : bool, default False
1091-
Ignore NaN values when inferring the type. The default of ``False``
1092-
will be deprecated in a later version of pandas.
1092+
Ignore NaN values when inferring the type.
10931093

10941094
.. versionadded:: 0.21.0
10951095

@@ -1186,6 +1186,12 @@ def infer_dtype(value: object, skipna: bool=False) -> str:
11861186
bint seen_pdnat = False
11871187
bint seen_val = False
11881188

1189+
if skipna is None:
1190+
msg = ('A future version of pandas will default to `skipna=True`. To '
1191+
'silence this warning, pass `skipna=True|False` explicitly.')
1192+
warnings.warn(msg, FutureWarning, stacklevel=2)
1193+
skipna = False
1194+
11891195
if util.is_array(value):
11901196
values = value
11911197
elif hasattr(value, 'dtype'):

pandas/core/arrays/array_.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,7 @@ def array(data, # type: Sequence[object]
209209
return cls._from_sequence(data, dtype=dtype, copy=copy)
210210

211211
if dtype is None:
212-
inferred_dtype = lib.infer_dtype(data)
212+
inferred_dtype = lib.infer_dtype(data, skipna=False)
213213
if inferred_dtype == 'period':
214214
try:
215215
return period_array(data, copy=copy)

pandas/core/reshape/merge.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -962,8 +962,8 @@ def _maybe_coerce_merge_keys(self):
962962
# object values are allowed to be merged
963963
elif ((lk_is_object and is_numeric_dtype(rk)) or
964964
(is_numeric_dtype(lk) and rk_is_object)):
965-
inferred_left = lib.infer_dtype(lk)
966-
inferred_right = lib.infer_dtype(rk)
965+
inferred_left = lib.infer_dtype(lk, skipna=False)
966+
inferred_right = lib.infer_dtype(rk, skipna=False)
967967
bool_types = ['integer', 'mixed-integer', 'boolean', 'empty']
968968
string_types = ['string', 'unicode', 'mixed', 'bytes', 'empty']
969969

pandas/tests/dtypes/test_inference.py

+58-50
Original file line numberDiff line numberDiff line change
@@ -334,11 +334,11 @@ def test_infer_dtype_bytes(self):
334334

335335
# string array of bytes
336336
arr = np.array(list('abc'), dtype='S1')
337-
assert lib.infer_dtype(arr, skipna=False) == compare
337+
assert lib.infer_dtype(arr, skipna=True) == compare
338338

339339
# object array of bytes
340340
arr = arr.astype(object)
341-
assert lib.infer_dtype(arr, skipna=False) == compare
341+
assert lib.infer_dtype(arr, skipna=True) == compare
342342

343343
# object array of bytes with missing values
344344
assert lib.infer_dtype([b'a', np.nan, b'c'], skipna=True) == compare
@@ -538,32 +538,40 @@ def test_length_zero(self, skipna):
538538

539539
def test_integers(self):
540540
arr = np.array([1, 2, 3, np.int64(4), np.int32(5)], dtype='O')
541-
result = lib.infer_dtype(arr, skipna=False)
541+
result = lib.infer_dtype(arr, skipna=True)
542542
assert result == 'integer'
543543

544544
arr = np.array([1, 2, 3, np.int64(4), np.int32(5), 'foo'], dtype='O')
545-
result = lib.infer_dtype(arr, skipna=False)
545+
result = lib.infer_dtype(arr, skipna=True)
546546
assert result == 'mixed-integer'
547547

548548
arr = np.array([1, 2, 3, 4, 5], dtype='i4')
549-
result = lib.infer_dtype(arr, skipna=False)
549+
result = lib.infer_dtype(arr, skipna=True)
550550
assert result == 'integer'
551551

552+
def test_deprecation(self):
553+
# GH 24050
554+
arr = np.array([1, 2, 3], dtype=object)
555+
556+
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
557+
result = lib.infer_dtype(arr) # default: skipna=None -> warn
558+
assert result == 'integer'
559+
552560
def test_bools(self):
553561
arr = np.array([True, False, True, True, True], dtype='O')
554-
result = lib.infer_dtype(arr, skipna=False)
562+
result = lib.infer_dtype(arr, skipna=True)
555563
assert result == 'boolean'
556564

557565
arr = np.array([np.bool_(True), np.bool_(False)], dtype='O')
558-
result = lib.infer_dtype(arr, skipna=False)
566+
result = lib.infer_dtype(arr, skipna=True)
559567
assert result == 'boolean'
560568

561569
arr = np.array([True, False, True, 'foo'], dtype='O')
562-
result = lib.infer_dtype(arr, skipna=False)
570+
result = lib.infer_dtype(arr, skipna=True)
563571
assert result == 'mixed'
564572

565573
arr = np.array([True, False, True], dtype=bool)
566-
result = lib.infer_dtype(arr, skipna=False)
574+
result = lib.infer_dtype(arr, skipna=True)
567575
assert result == 'boolean'
568576

569577
arr = np.array([True, np.nan, False], dtype='O')
@@ -575,38 +583,38 @@ def test_bools(self):
575583

576584
def test_floats(self):
577585
arr = np.array([1., 2., 3., np.float64(4), np.float32(5)], dtype='O')
578-
result = lib.infer_dtype(arr, skipna=False)
586+
result = lib.infer_dtype(arr, skipna=True)
579587
assert result == 'floating'
580588

581589
arr = np.array([1, 2, 3, np.float64(4), np.float32(5), 'foo'],
582590
dtype='O')
583-
result = lib.infer_dtype(arr, skipna=False)
591+
result = lib.infer_dtype(arr, skipna=True)
584592
assert result == 'mixed-integer'
585593

586594
arr = np.array([1, 2, 3, 4, 5], dtype='f4')
587-
result = lib.infer_dtype(arr, skipna=False)
595+
result = lib.infer_dtype(arr, skipna=True)
588596
assert result == 'floating'
589597

590598
arr = np.array([1, 2, 3, 4, 5], dtype='f8')
591-
result = lib.infer_dtype(arr, skipna=False)
599+
result = lib.infer_dtype(arr, skipna=True)
592600
assert result == 'floating'
593601

594602
def test_decimals(self):
595603
# GH15690
596604
arr = np.array([Decimal(1), Decimal(2), Decimal(3)])
597-
result = lib.infer_dtype(arr, skipna=False)
605+
result = lib.infer_dtype(arr, skipna=True)
598606
assert result == 'decimal'
599607

600608
arr = np.array([1.0, 2.0, Decimal(3)])
601-
result = lib.infer_dtype(arr, skipna=False)
609+
result = lib.infer_dtype(arr, skipna=True)
602610
assert result == 'mixed'
603611

604612
arr = np.array([Decimal(1), Decimal('NaN'), Decimal(3)])
605-
result = lib.infer_dtype(arr, skipna=False)
613+
result = lib.infer_dtype(arr, skipna=True)
606614
assert result == 'decimal'
607615

608616
arr = np.array([Decimal(1), np.nan, Decimal(3)], dtype='O')
609-
result = lib.infer_dtype(arr, skipna=False)
617+
result = lib.infer_dtype(arr, skipna=True)
610618
assert result == 'decimal'
611619

612620
def test_string(self):
@@ -648,34 +656,34 @@ def test_infer_dtype_datetime(self):
648656

649657
arr = np.array([Timestamp('2011-01-01'),
650658
Timestamp('2011-01-02')])
651-
assert lib.infer_dtype(arr, skipna=False) == 'datetime'
659+
assert lib.infer_dtype(arr, skipna=True) == 'datetime'
652660

653661
arr = np.array([np.datetime64('2011-01-01'),
654662
np.datetime64('2011-01-01')], dtype=object)
655-
assert lib.infer_dtype(arr, skipna=False) == 'datetime64'
663+
assert lib.infer_dtype(arr, skipna=True) == 'datetime64'
656664

657665
arr = np.array([datetime(2011, 1, 1), datetime(2012, 2, 1)])
658-
assert lib.infer_dtype(arr, skipna=False) == 'datetime'
666+
assert lib.infer_dtype(arr, skipna=True) == 'datetime'
659667

660668
# starts with nan
661669
for n in [pd.NaT, np.nan]:
662670
arr = np.array([n, pd.Timestamp('2011-01-02')])
663-
assert lib.infer_dtype(arr, skipna=False) == 'datetime'
671+
assert lib.infer_dtype(arr, skipna=True) == 'datetime'
664672

665673
arr = np.array([n, np.datetime64('2011-01-02')])
666-
assert lib.infer_dtype(arr, skipna=False) == 'datetime64'
674+
assert lib.infer_dtype(arr, skipna=True) == 'datetime64'
667675

668676
arr = np.array([n, datetime(2011, 1, 1)])
669-
assert lib.infer_dtype(arr, skipna=False) == 'datetime'
677+
assert lib.infer_dtype(arr, skipna=True) == 'datetime'
670678

671679
arr = np.array([n, pd.Timestamp('2011-01-02'), n])
672-
assert lib.infer_dtype(arr, skipna=False) == 'datetime'
680+
assert lib.infer_dtype(arr, skipna=True) == 'datetime'
673681

674682
arr = np.array([n, np.datetime64('2011-01-02'), n])
675-
assert lib.infer_dtype(arr, skipna=False) == 'datetime64'
683+
assert lib.infer_dtype(arr, skipna=True) == 'datetime64'
676684

677685
arr = np.array([n, datetime(2011, 1, 1), n])
678-
assert lib.infer_dtype(arr, skipna=False) == 'datetime'
686+
assert lib.infer_dtype(arr, skipna=True) == 'datetime'
679687

680688
# different type of nat
681689
arr = np.array([np.timedelta64('nat'),
@@ -689,58 +697,58 @@ def test_infer_dtype_datetime(self):
689697
# mixed datetime
690698
arr = np.array([datetime(2011, 1, 1),
691699
pd.Timestamp('2011-01-02')])
692-
assert lib.infer_dtype(arr, skipna=False) == 'datetime'
700+
assert lib.infer_dtype(arr, skipna=True) == 'datetime'
693701

694702
# should be datetime?
695703
arr = np.array([np.datetime64('2011-01-01'),
696704
pd.Timestamp('2011-01-02')])
697-
assert lib.infer_dtype(arr, skipna=False) == 'mixed'
705+
assert lib.infer_dtype(arr, skipna=True) == 'mixed'
698706

699707
arr = np.array([pd.Timestamp('2011-01-02'),
700708
np.datetime64('2011-01-01')])
701-
assert lib.infer_dtype(arr, skipna=False) == 'mixed'
709+
assert lib.infer_dtype(arr, skipna=True) == 'mixed'
702710

703711
arr = np.array([np.nan, pd.Timestamp('2011-01-02'), 1])
704-
assert lib.infer_dtype(arr, skipna=False) == 'mixed-integer'
712+
assert lib.infer_dtype(arr, skipna=True) == 'mixed-integer'
705713

706714
arr = np.array([np.nan, pd.Timestamp('2011-01-02'), 1.1])
707-
assert lib.infer_dtype(arr, skipna=False) == 'mixed'
715+
assert lib.infer_dtype(arr, skipna=True) == 'mixed'
708716

709717
arr = np.array([np.nan, '2011-01-01', pd.Timestamp('2011-01-02')])
710-
assert lib.infer_dtype(arr, skipna=False) == 'mixed'
718+
assert lib.infer_dtype(arr, skipna=True) == 'mixed'
711719

712720
def test_infer_dtype_timedelta(self):
713721

714722
arr = np.array([pd.Timedelta('1 days'),
715723
pd.Timedelta('2 days')])
716-
assert lib.infer_dtype(arr, skipna=False) == 'timedelta'
724+
assert lib.infer_dtype(arr, skipna=True) == 'timedelta'
717725

718726
arr = np.array([np.timedelta64(1, 'D'),
719727
np.timedelta64(2, 'D')], dtype=object)
720-
assert lib.infer_dtype(arr, skipna=False) == 'timedelta'
728+
assert lib.infer_dtype(arr, skipna=True) == 'timedelta'
721729

722730
arr = np.array([timedelta(1), timedelta(2)])
723-
assert lib.infer_dtype(arr, skipna=False) == 'timedelta'
731+
assert lib.infer_dtype(arr, skipna=True) == 'timedelta'
724732

725733
# starts with nan
726734
for n in [pd.NaT, np.nan]:
727735
arr = np.array([n, Timedelta('1 days')])
728-
assert lib.infer_dtype(arr, skipna=False) == 'timedelta'
736+
assert lib.infer_dtype(arr, skipna=True) == 'timedelta'
729737

730738
arr = np.array([n, np.timedelta64(1, 'D')])
731-
assert lib.infer_dtype(arr, skipna=False) == 'timedelta'
739+
assert lib.infer_dtype(arr, skipna=True) == 'timedelta'
732740

733741
arr = np.array([n, timedelta(1)])
734-
assert lib.infer_dtype(arr, skipna=False) == 'timedelta'
742+
assert lib.infer_dtype(arr, skipna=True) == 'timedelta'
735743

736744
arr = np.array([n, pd.Timedelta('1 days'), n])
737-
assert lib.infer_dtype(arr, skipna=False) == 'timedelta'
745+
assert lib.infer_dtype(arr, skipna=True) == 'timedelta'
738746

739747
arr = np.array([n, np.timedelta64(1, 'D'), n])
740-
assert lib.infer_dtype(arr, skipna=False) == 'timedelta'
748+
assert lib.infer_dtype(arr, skipna=True) == 'timedelta'
741749

742750
arr = np.array([n, timedelta(1), n])
743-
assert lib.infer_dtype(arr, skipna=False) == 'timedelta'
751+
assert lib.infer_dtype(arr, skipna=True) == 'timedelta'
744752

745753
# different type of nat
746754
arr = np.array([np.datetime64('nat'), np.timedelta64(1, 'D')],
@@ -755,19 +763,19 @@ def test_infer_dtype_period(self):
755763
# GH 13664
756764
arr = np.array([pd.Period('2011-01', freq='D'),
757765
pd.Period('2011-02', freq='D')])
758-
assert lib.infer_dtype(arr, skipna=False) == 'period'
766+
assert lib.infer_dtype(arr, skipna=True) == 'period'
759767

760768
arr = np.array([pd.Period('2011-01', freq='D'),
761769
pd.Period('2011-02', freq='M')])
762-
assert lib.infer_dtype(arr, skipna=False) == 'period'
770+
assert lib.infer_dtype(arr, skipna=True) == 'period'
763771

764772
# starts with nan
765773
for n in [pd.NaT, np.nan]:
766774
arr = np.array([n, pd.Period('2011-01', freq='D')])
767-
assert lib.infer_dtype(arr, skipna=False) == 'period'
775+
assert lib.infer_dtype(arr, skipna=True) == 'period'
768776

769777
arr = np.array([n, pd.Period('2011-01', freq='D'), n])
770-
assert lib.infer_dtype(arr, skipna=False) == 'period'
778+
assert lib.infer_dtype(arr, skipna=True) == 'period'
771779

772780
# different type of nat
773781
arr = np.array([np.datetime64('nat'), pd.Period('2011-01', freq='M')],
@@ -846,7 +854,7 @@ def test_infer_datetimelike_array_nan_nat_like(self, first, second,
846854

847855
def test_infer_dtype_all_nan_nat_like(self):
848856
arr = np.array([np.nan, np.nan])
849-
assert lib.infer_dtype(arr, skipna=False) == 'floating'
857+
assert lib.infer_dtype(arr, skipna=True) == 'floating'
850858

851859
# nan and None mix are result in mixed
852860
arr = np.array([np.nan, np.nan, None])
@@ -1043,17 +1051,17 @@ def test_categorical(self):
10431051
# GH 8974
10441052
from pandas import Categorical, Series
10451053
arr = Categorical(list('abc'))
1046-
result = lib.infer_dtype(arr, skipna=False)
1054+
result = lib.infer_dtype(arr, skipna=True)
10471055
assert result == 'categorical'
10481056

1049-
result = lib.infer_dtype(Series(arr), skipna=False)
1057+
result = lib.infer_dtype(Series(arr), skipna=True)
10501058
assert result == 'categorical'
10511059

10521060
arr = Categorical(list('abc'), categories=['cegfab'], ordered=True)
1053-
result = lib.infer_dtype(arr, skipna=False)
1061+
result = lib.infer_dtype(arr, skipna=True)
10541062
assert result == 'categorical'
10551063

1056-
result = lib.infer_dtype(Series(arr), skipna=False)
1064+
result = lib.infer_dtype(Series(arr), skipna=True)
10571065
assert result == 'categorical'
10581066

10591067

pandas/tests/series/test_constructors.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -813,12 +813,12 @@ def test_constructor_with_datetime_tz(self):
813813
s = Series([pd.Timestamp('2013-01-01 13:00:00-0800', tz='US/Pacific'),
814814
pd.Timestamp('2013-01-02 14:00:00-0800', tz='US/Pacific')])
815815
assert s.dtype == 'datetime64[ns, US/Pacific]'
816-
assert lib.infer_dtype(s, skipna=False) == 'datetime64'
816+
assert lib.infer_dtype(s, skipna=True) == 'datetime64'
817817

818818
s = Series([pd.Timestamp('2013-01-01 13:00:00-0800', tz='US/Pacific'),
819819
pd.Timestamp('2013-01-02 14:00:00-0800', tz='US/Eastern')])
820820
assert s.dtype == 'object'
821-
assert lib.infer_dtype(s, skipna=False) == 'datetime'
821+
assert lib.infer_dtype(s, skipna=True) == 'datetime'
822822

823823
# with all NaT
824824
s = Series(pd.NaT, index=[0, 1], dtype='datetime64[ns, US/Eastern]')

0 commit comments

Comments
 (0)