From cd9ddf16b260ca42d5129b00efc287d536fde580 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 29 May 2021 22:27:06 -0700 Subject: [PATCH 1/2] DEPR: datetimelike inference with strings --- doc/source/whatsnew/v1.3.0.rst | 1 + pandas/_libs/lib.pyi | 2 +- pandas/_libs/lib.pyx | 22 +++++---- pandas/core/dtypes/cast.py | 11 ++++- pandas/tests/apply/test_series_apply.py | 4 +- pandas/tests/arithmetic/test_datetime64.py | 2 +- pandas/tests/dtypes/test_inference.py | 10 ++-- pandas/tests/resample/test_time_grouper.py | 45 +++++++++-------- .../series/accessors/test_dt_accessor.py | 1 + .../series/methods/test_combine_first.py | 6 ++- pandas/tests/series/methods/test_fillna.py | 7 ++- pandas/tests/series/test_constructors.py | 49 +++++++++++++------ pandas/tests/tools/test_to_timedelta.py | 3 +- 13 files changed, 103 insertions(+), 60 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index ea9017da8a2f9..e18bee5c56114 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -696,6 +696,7 @@ Deprecations - Deprecated passing arguments (apart from ``value``) as positional in :meth:`DataFrame.fillna` and :meth:`Series.fillna` (:issue:`41485`) - Deprecated passing arguments as positional in :meth:`DataFrame.reset_index` (other than ``"level"``) and :meth:`Series.reset_index` (:issue:`41485`) - Deprecated construction of :class:`Series` or :class:`DataFrame` with ``DatetimeTZDtype`` data and ``datetime64[ns]`` dtype. 
Use ``Series(data).dt.tz_localize(None)`` instead (:issue:`41555`,:issue:`33401`) +- Deprecated inference of ``timedelta64[ns]``, ``datetime64[ns]``, or ``DatetimeTZDtype`` dtypes in :class:`Series` construction when data containing strings is passed and no ``dtype`` is passed (:issue:`33558`) - Deprecated passing arguments as positional in :meth:`DataFrame.set_axis` and :meth:`Series.set_axis` (other than ``"labels"``) (:issue:`41485`) - Deprecated passing arguments as positional in :meth:`DataFrame.where` and :meth:`Series.where` (other than ``"cond"`` and ``"other"``) (:issue:`41485`) - Deprecated passing arguments as positional (other than ``filepath_or_buffer``) in :func:`read_csv` (:issue:`41485`) diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index 5e1cc612bed57..06620c2ad0dca 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -153,7 +153,7 @@ def ensure_string_array( def infer_datetimelike_array( arr: np.ndarray # np.ndarray[object] -) -> str: ... +) -> tuple[str, bool]: ... def astype_intsafe( arr: np.ndarray, # np.ndarray[object] diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index d7e15bb2ad197..6a270c0a55638 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1558,7 +1558,7 @@ def infer_dtype(value: object, skipna: bool = True) -> str: return "mixed" -def infer_datetimelike_array(arr: ndarray[object]) -> str: +def infer_datetimelike_array(arr: ndarray[object]) -> tuple[str, bool]: """ Infer if we have a datetime or timedelta array. 
- date: we have *only* date and maybe strings, nulls @@ -1576,12 +1576,13 @@ def infer_datetimelike_array(arr: ndarray[object]) -> str: Returns ------- str: {datetime, timedelta, date, nat, mixed} + bool """ cdef: Py_ssize_t i, n = len(arr) bint seen_timedelta = False, seen_date = False, seen_datetime = False bint seen_tz_aware = False, seen_tz_naive = False - bint seen_nat = False + bint seen_nat = False, seen_str = False list objs = [] object v @@ -1589,6 +1590,7 @@ def infer_datetimelike_array(arr: ndarray[object]) -> str: v = arr[i] if isinstance(v, str): objs.append(v) + seen_str = True if len(objs) == 3: break @@ -1609,7 +1611,7 @@ def infer_datetimelike_array(arr: ndarray[object]) -> str: seen_tz_aware = True if seen_tz_naive and seen_tz_aware: - return "mixed" + return "mixed", seen_str elif util.is_datetime64_object(v): # np.datetime64 seen_datetime = True @@ -1619,16 +1621,16 @@ def infer_datetimelike_array(arr: ndarray[object]) -> str: # timedelta, or timedelta64 seen_timedelta = True else: - return "mixed" + return "mixed", seen_str if seen_date and not (seen_datetime or seen_timedelta): - return "date" + return "date", seen_str elif seen_datetime and not seen_timedelta: - return "datetime" + return "datetime", seen_str elif seen_timedelta and not seen_datetime: - return "timedelta" + return "timedelta", seen_str elif seen_nat: - return "nat" + return "nat", seen_str # short-circuit by trying to # actually convert these strings @@ -1637,14 +1639,14 @@ def infer_datetimelike_array(arr: ndarray[object]) -> str: if len(objs): try: array_to_datetime(objs, errors="raise") - return "datetime" + return "datetime", seen_str except (ValueError, TypeError): pass # we are *not* going to infer from strings # for timedelta as too much ambiguity - return 'mixed' + return "mixed", seen_str cdef inline bint is_timedelta(object o): diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 40883dd8f747b..c4ed81f95ce70 100644 --- 
a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1545,7 +1545,7 @@ def try_timedelta(v: np.ndarray) -> np.ndarray: else: return td_values.reshape(shape) - inferred_type = lib.infer_datetimelike_array(ensure_object(v)) + inferred_type, seen_str = lib.infer_datetimelike_array(ensure_object(v)) if inferred_type == "datetime": # error: Incompatible types in assignment (expression has type "ExtensionArray", @@ -1574,6 +1574,15 @@ def try_timedelta(v: np.ndarray) -> np.ndarray: # "ExtensionArray", variable has type "Union[ndarray, List[Any]]") value = try_datetime(v) # type: ignore[assignment] + if value.dtype.kind in ["m", "M"] and seen_str: + warnings.warn( + f"Inferring {value.dtype} from data containing strings is deprecated " + "and will be removed in a future version. To retain the old behavior " + f"explicitly pass Series(data, dtype={value.dtype})", + FutureWarning, + stacklevel=find_stack_level(), + ) + # return v.reshape(shape) return value diff --git a/pandas/tests/apply/test_series_apply.py b/pandas/tests/apply/test_series_apply.py index 88c3ad228f8c3..7e8dbea07709f 100644 --- a/pandas/tests/apply/test_series_apply.py +++ b/pandas/tests/apply/test_series_apply.py @@ -859,7 +859,9 @@ def test_apply_to_timedelta(): list_of_strings = ["00:00:01", np.nan, pd.NaT, pd.NaT] a = pd.to_timedelta(list_of_strings) # noqa - b = Series(list_of_strings).apply(pd.to_timedelta) # noqa + with tm.assert_produces_warning(FutureWarning, match="Inferring timedelta64"): + ser = Series(list_of_strings) + b = ser.apply(pd.to_timedelta) # noqa # Can't compare until apply on a Series gives the correct dtype # assert_series_equal(a, b) diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index 215b51dd88ef4..6b3309ba8ea1b 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -328,7 +328,7 @@ def test_dt64arr_timestamp_equality(self, box_with_array): box_with_array if 
box_with_array not in [pd.Index, pd.array] else np.ndarray ) - ser = Series([Timestamp("2000-01-29 01:59:00"), Timestamp("2000-01-30"), "NaT"]) + ser = Series([Timestamp("2000-01-29 01:59:00"), Timestamp("2000-01-30"), NaT]) ser = tm.box_expected(ser, box_with_array) result = ser != ser diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 09efa97871fae..31903c559d8df 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -1169,7 +1169,7 @@ def test_infer_dtype_period_with_na(self, na_value): ], ) def test_infer_datetimelike_array_datetime(self, data): - assert lib.infer_datetimelike_array(data) == "datetime" + assert lib.infer_datetimelike_array(data) == ("datetime", False) @pytest.mark.parametrize( "data", @@ -1181,11 +1181,11 @@ def test_infer_datetimelike_array_datetime(self, data): ], ) def test_infer_datetimelike_array_timedelta(self, data): - assert lib.infer_datetimelike_array(data) == "timedelta" + assert lib.infer_datetimelike_array(data) == ("timedelta", False) def test_infer_datetimelike_array_date(self): arr = [date(2017, 6, 12), date(2017, 3, 11)] - assert lib.infer_datetimelike_array(arr) == "date" + assert lib.infer_datetimelike_array(arr) == ("date", False) @pytest.mark.parametrize( "data", @@ -1200,7 +1200,7 @@ def test_infer_datetimelike_array_date(self): ], ) def test_infer_datetimelike_array_mixed(self, data): - assert lib.infer_datetimelike_array(data) == "mixed" + assert lib.infer_datetimelike_array(data)[0] == "mixed" @pytest.mark.parametrize( "first, expected", @@ -1218,7 +1218,7 @@ def test_infer_datetimelike_array_mixed(self, data): @pytest.mark.parametrize("second", [None, np.nan]) def test_infer_datetimelike_array_nan_nat_like(self, first, second, expected): first.append(second) - assert lib.infer_datetimelike_array(first) == expected + assert lib.infer_datetimelike_array(first) == (expected, False) def test_infer_dtype_all_nan_nat_like(self): arr = 
np.array([np.nan, np.nan]) diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py index 7cc2b7f72fb69..82e6c4daf9515 100644 --- a/pandas/tests/resample/test_time_grouper.py +++ b/pandas/tests/resample/test_time_grouper.py @@ -305,27 +305,30 @@ def test_groupby_resample_interpolate(): .resample("1D") .interpolate(method="linear") ) - expected_ind = pd.MultiIndex.from_tuples( - [ - (50, "2018-01-07"), - (50, Timestamp("2018-01-08")), - (50, Timestamp("2018-01-09")), - (50, Timestamp("2018-01-10")), - (50, Timestamp("2018-01-11")), - (50, Timestamp("2018-01-12")), - (50, Timestamp("2018-01-13")), - (50, Timestamp("2018-01-14")), - (50, Timestamp("2018-01-15")), - (50, Timestamp("2018-01-16")), - (50, Timestamp("2018-01-17")), - (50, Timestamp("2018-01-18")), - (50, Timestamp("2018-01-19")), - (50, Timestamp("2018-01-20")), - (50, Timestamp("2018-01-21")), - (60, Timestamp("2018-01-14")), - ], - names=["volume", "week_starting"], - ) + + msg = "containing strings is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + expected_ind = pd.MultiIndex.from_tuples( + [ + (50, "2018-01-07"), + (50, Timestamp("2018-01-08")), + (50, Timestamp("2018-01-09")), + (50, Timestamp("2018-01-10")), + (50, Timestamp("2018-01-11")), + (50, Timestamp("2018-01-12")), + (50, Timestamp("2018-01-13")), + (50, Timestamp("2018-01-14")), + (50, Timestamp("2018-01-15")), + (50, Timestamp("2018-01-16")), + (50, Timestamp("2018-01-17")), + (50, Timestamp("2018-01-18")), + (50, Timestamp("2018-01-19")), + (50, Timestamp("2018-01-20")), + (50, Timestamp("2018-01-21")), + (60, Timestamp("2018-01-14")), + ], + names=["volume", "week_starting"], + ) expected = DataFrame( data={ "price": [ diff --git a/pandas/tests/series/accessors/test_dt_accessor.py b/pandas/tests/series/accessors/test_dt_accessor.py index dcdee01bd4df8..62a9099fab1ad 100644 --- a/pandas/tests/series/accessors/test_dt_accessor.py +++ 
b/pandas/tests/series/accessors/test_dt_accessor.py @@ -679,6 +679,7 @@ def test_dt_timetz_accessor(self, tz_naive_fixture): [["2016-01-07", "2016-01-01"], [[2016, 1, 4], [2015, 53, 5]]], ], ) + @pytest.mark.filterwarnings("ignore:Inferring datetime64:FutureWarning") def test_isocalendar(self, input_series, expected_output): result = pd.to_datetime(Series(input_series)).dt.isocalendar() expected_frame = DataFrame( diff --git a/pandas/tests/series/methods/test_combine_first.py b/pandas/tests/series/methods/test_combine_first.py index 4c254c6db2a70..b838797b5f9b9 100644 --- a/pandas/tests/series/methods/test_combine_first.py +++ b/pandas/tests/series/methods/test_combine_first.py @@ -78,7 +78,11 @@ def test_combine_first_dt64(self): s0 = to_datetime(Series(["2010", np.NaN])) s1 = Series([np.NaN, "2011"]) rs = s0.combine_first(s1) - xp = Series([datetime(2010, 1, 1), "2011"]) + + msg = "containing strings is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + xp = Series([datetime(2010, 1, 1), "2011"]) + tm.assert_series_equal(rs, xp) def test_combine_first_dt_tz_values(self, tz_naive_fixture): diff --git a/pandas/tests/series/methods/test_fillna.py b/pandas/tests/series/methods/test_fillna.py index 82c52bdaa29d7..1aec2a5e5d726 100644 --- a/pandas/tests/series/methods/test_fillna.py +++ b/pandas/tests/series/methods/test_fillna.py @@ -319,8 +319,11 @@ def test_datetime64_fillna(self): # GH#6587 # make sure that we are treating as integer when filling - # this also tests inference of a datetime-like with NaT's - ser = Series([NaT, NaT, "2013-08-05 15:30:00.000001"]) + msg = "containing strings is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + # this also tests inference of a datetime-like with NaT's + ser = Series([NaT, NaT, "2013-08-05 15:30:00.000001"]) + expected = Series( [ "2013-08-05 15:30:00.000001", diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 
af730bf299336..b11ce68ccc033 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -900,14 +900,23 @@ def test_constructor_dtype_datetime64_7(self): def test_constructor_dtype_datetime64_6(self): # these will correctly infer a datetime - s = Series([None, NaT, "2013-08-05 15:30:00.000001"]) - assert s.dtype == "datetime64[ns]" - s = Series([np.nan, NaT, "2013-08-05 15:30:00.000001"]) - assert s.dtype == "datetime64[ns]" - s = Series([NaT, None, "2013-08-05 15:30:00.000001"]) - assert s.dtype == "datetime64[ns]" - s = Series([NaT, np.nan, "2013-08-05 15:30:00.000001"]) - assert s.dtype == "datetime64[ns]" + msg = "containing strings is deprecated" + + with tm.assert_produces_warning(FutureWarning, match=msg): + ser = Series([None, NaT, "2013-08-05 15:30:00.000001"]) + assert ser.dtype == "datetime64[ns]" + + with tm.assert_produces_warning(FutureWarning, match=msg): + ser = Series([np.nan, NaT, "2013-08-05 15:30:00.000001"]) + assert ser.dtype == "datetime64[ns]" + + with tm.assert_produces_warning(FutureWarning, match=msg): + ser = Series([NaT, None, "2013-08-05 15:30:00.000001"]) + assert ser.dtype == "datetime64[ns]" + + with tm.assert_produces_warning(FutureWarning, match=msg): + ser = Series([NaT, np.nan, "2013-08-05 15:30:00.000001"]) + assert ser.dtype == "datetime64[ns]" def test_constructor_dtype_datetime64_5(self): # tz-aware (UTC and other tz's) @@ -1365,14 +1374,22 @@ def test_constructor_dtype_timedelta64(self): assert td.dtype == "object" # these will correctly infer a timedelta - s = Series([None, NaT, "1 Day"]) - assert s.dtype == "timedelta64[ns]" - s = Series([np.nan, NaT, "1 Day"]) - assert s.dtype == "timedelta64[ns]" - s = Series([NaT, None, "1 Day"]) - assert s.dtype == "timedelta64[ns]" - s = Series([NaT, np.nan, "1 Day"]) - assert s.dtype == "timedelta64[ns]" + msg = "containing strings is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + ser = Series([None, NaT, "1 
Day"]) + assert ser.dtype == "timedelta64[ns]" + + with tm.assert_produces_warning(FutureWarning, match=msg): + ser = Series([np.nan, NaT, "1 Day"]) + assert ser.dtype == "timedelta64[ns]" + + with tm.assert_produces_warning(FutureWarning, match=msg): + ser = Series([NaT, None, "1 Day"]) + assert ser.dtype == "timedelta64[ns]" + + with tm.assert_produces_warning(FutureWarning, match=msg): + ser = Series([NaT, np.nan, "1 Day"]) + assert ser.dtype == "timedelta64[ns]" # GH 16406 def test_constructor_mixed_tz(self): diff --git a/pandas/tests/tools/test_to_timedelta.py b/pandas/tests/tools/test_to_timedelta.py index 1fc383521d31f..095b59241a836 100644 --- a/pandas/tests/tools/test_to_timedelta.py +++ b/pandas/tests/tools/test_to_timedelta.py @@ -197,7 +197,8 @@ def test_to_timedelta_on_missing_values(self): ) tm.assert_series_equal(actual, expected) - actual = to_timedelta(Series(["00:00:01", pd.NaT])) + with tm.assert_produces_warning(FutureWarning, match="Inferring timedelta64"): + actual = to_timedelta(Series(["00:00:01", pd.NaT])) tm.assert_series_equal(actual, expected) actual = to_timedelta(np.nan) From f349ae7d443d72b195b8334d885d84eca5ea2b77 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 31 May 2021 19:06:43 -0700 Subject: [PATCH 2/2] TST: no warning on to_timedelta --- pandas/tests/tools/test_to_timedelta.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/tests/tools/test_to_timedelta.py b/pandas/tests/tools/test_to_timedelta.py index 095b59241a836..eb26ae688f00e 100644 --- a/pandas/tests/tools/test_to_timedelta.py +++ b/pandas/tests/tools/test_to_timedelta.py @@ -187,6 +187,16 @@ def test_to_timedelta_via_apply(self): result = Series([to_timedelta("00:00:01")]) tm.assert_series_equal(result, expected) + def test_to_timedelta_inference_without_warning(self): + # GH#41731 inference produces a warning in the Series constructor, + # but _not_ in to_timedelta + vals = ["00:00:01", pd.NaT] + with tm.assert_produces_warning(None): + result = 
to_timedelta(vals) + + expected = TimedeltaIndex([pd.Timedelta(seconds=1), pd.NaT]) + tm.assert_index_equal(result, expected) + def test_to_timedelta_on_missing_values(self): # GH5438 timedelta_NaT = np.timedelta64("NaT")