diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index d114d72633012..4577d20a509ce 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -644,6 +644,7 @@ Conversion - Bug in :meth:`Series.convert_dtypes` not converting dtype to nullable dtype when :class:`Series` contains ``NA`` and has dtype ``object`` (:issue:`48791`) - Bug where any :class:`ExtensionDtype` subclass with ``kind="M"`` would be interpreted as a timezone type (:issue:`34986`) - Bug in :class:`.arrays.ArrowExtensionArray` that would raise ``NotImplementedError`` when passed a sequence of strings or binary (:issue:`49172`) +- Bug in :func:`to_datetime` where the ``exact`` argument was not respected when ``format`` was an ISO8601 format (:issue:`12649`) Strings ^^^^^^^ diff --git a/pandas/_libs/tslib.pyi b/pandas/_libs/tslib.pyi index f3a24a707c530..cc08b17e0ff5d 100644 --- a/pandas/_libs/tslib.pyi +++ b/pandas/_libs/tslib.pyi @@ -24,6 +24,8 @@ def array_to_datetime( yearfirst: bool = ..., utc: bool = ..., require_iso8601: bool = ..., + format: str | None = ..., + exact: bool = ..., ) -> tuple[np.ndarray, tzinfo | None]: ... 
# returned ndarray may be object dtype or datetime64[ns] diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 3104ecbc8bdb8..e01de6b70470e 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -446,6 +446,8 @@ cpdef array_to_datetime( bint yearfirst=False, bint utc=False, bint require_iso8601=False, + format: str | None=None, + bint exact=True, ): """ Converts a 1D array of date-like values to a numpy array of either: @@ -563,6 +565,16 @@ cpdef array_to_datetime( iresult[i] = get_datetime64_nanos(val, NPY_FR_ns) elif is_integer_object(val) or is_float_object(val): + if require_iso8601: + if is_coerce: + iresult[i] = NPY_NAT + continue + elif is_raise: + raise ValueError( + f"time data \"{val}\" at position {i} doesn't " + f"match format \"{format}\"" + ) + return values, tz_out # these must be ns unit by-definition seen_integer = True @@ -593,7 +605,7 @@ cpdef array_to_datetime( string_to_dts_failed = string_to_dts( val, &dts, &out_bestunit, &out_local, - &out_tzoffset, False + &out_tzoffset, False, format, exact ) if string_to_dts_failed: # An error at this point is a _parsing_ error @@ -609,7 +621,7 @@ cpdef array_to_datetime( elif is_raise: raise ValueError( f"time data \"{val}\" at position {i} doesn't " - "match format specified" + f"match format \"{format}\"" ) return values, tz_out diff --git a/pandas/_libs/tslibs/np_datetime.pxd b/pandas/_libs/tslibs/np_datetime.pxd index e51bbd4e074e1..de81c611c9ee9 100644 --- a/pandas/_libs/tslibs/np_datetime.pxd +++ b/pandas/_libs/tslibs/np_datetime.pxd @@ -95,6 +95,8 @@ cdef int string_to_dts( int* out_local, int* out_tzoffset, bint want_exc, + format: str | None = *, + bint exact = * ) except? 
-1 cdef NPY_DATETIMEUNIT get_unit_from_dtype(cnp.dtype dtype) diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index b1ff456c84a70..d49c41e54764f 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -52,7 +52,8 @@ cdef extern from "src/datetime/np_datetime_strings.h": int parse_iso_8601_datetime(const char *str, int len, int want_exc, npy_datetimestruct *out, NPY_DATETIMEUNIT *out_bestunit, - int *out_local, int *out_tzoffset) + int *out_local, int *out_tzoffset, + const char *format, int format_len, int exact) # ---------------------------------------------------------------------- @@ -277,14 +278,25 @@ cdef inline int string_to_dts( int* out_local, int* out_tzoffset, bint want_exc, + format: str | None=None, + bint exact=True, ) except? -1: cdef: Py_ssize_t length const char* buf + Py_ssize_t format_length + const char* format_buf buf = get_c_string_buf_and_size(val, &length) + if format is None: + format_buf = b'' + format_length = 0 + exact = False + else: + format_buf = get_c_string_buf_and_size(format, &format_length) return parse_iso_8601_datetime(buf, length, want_exc, - dts, out_bestunit, out_local, out_tzoffset) + dts, out_bestunit, out_local, out_tzoffset, + format_buf, format_length, exact) cpdef ndarray astype_overflowsafe( diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c index cfbaed01b57c9..597a2aae7a2a3 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c +++ b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c @@ -66,10 +66,45 @@ This file implements string parsing and creation for NumPy datetime. * * Returns 0 on success, -1 on failure. 
*/ + +// This function will advance the pointer on format +// and decrement characters_remaining by n on success +// On failure will return -1 without incrementing +static int compare_format(const char **format, int *characters_remaining, + const char *compare_to, int n, const int exact) { + if (*characters_remaining < n) { + if (exact) { + // TODO(pandas-dev): in the future we should set a PyErr here + // to be very clear about what went wrong + return -1; + } else if (*characters_remaining) { + // TODO(pandas-dev): same return value in this function as + // above branch, but stub out a future where + // we have a better error message + return -1; + } else { + return 0; + } + } else { + if (strncmp(*format, compare_to, n)) { + // TODO(pandas-dev): PyErr to differentiate what went wrong + return -1; + } else { + *format += n; + *characters_remaining -= n; + return 0; + } + } + return 0; +} + int parse_iso_8601_datetime(const char *str, int len, int want_exc, npy_datetimestruct *out, NPY_DATETIMEUNIT *out_bestunit, - int *out_local, int *out_tzoffset) { + int *out_local, int *out_tzoffset, + const char* format, int format_len, int exact) { + if (len < 0 || format_len < 0) + goto parse_error; int year_leap = 0; int i, numdigits; const char *substr; @@ -104,6 +139,9 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, while (sublen > 0 && isspace(*substr)) { ++substr; --sublen; + if (compare_format(&format, &format_len, " ", 1, exact)) { + goto parse_error; + } } /* Leading '-' sign for negative year */ @@ -117,6 +155,10 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } /* PARSE THE YEAR (4 digits) */ + if (compare_format(&format, &format_len, "%Y", 2, exact)) { + goto parse_error; + } + out->year = 0; if (sublen >= 4 && isdigit(substr[0]) && isdigit(substr[1]) && isdigit(substr[2]) && isdigit(substr[3])) { @@ -139,6 +181,9 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, if (out_local != NULL) { 
*out_local = 0; } + if (format_len) { + goto parse_error; + } bestunit = NPY_FR_Y; goto finish; } @@ -156,6 +201,10 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, ymd_sep = valid_ymd_sep[i]; ++substr; --sublen; + + if (compare_format(&format, &format_len, &ymd_sep, 1, exact)) { + goto parse_error; + } /* Cannot have trailing separator */ if (sublen == 0 || !isdigit(*substr)) { goto parse_error; @@ -163,6 +212,9 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } /* PARSE THE MONTH */ + if (compare_format(&format, &format_len, "%m", 2, exact)) { + goto parse_error; + } /* First digit required */ out->month = (*substr - '0'); ++substr; @@ -190,6 +242,9 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, if (!has_ymd_sep) { goto parse_error; } + if (format_len) { + goto parse_error; + } if (out_local != NULL) { *out_local = 0; } @@ -203,9 +258,15 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } ++substr; --sublen; + if (compare_format(&format, &format_len, &ymd_sep, 1, exact)) { + goto parse_error; + } } /* PARSE THE DAY */ + if (compare_format(&format, &format_len, "%d", 2, exact)) { + goto parse_error; + } /* First digit required */ if (!isdigit(*substr)) { goto parse_error; @@ -235,6 +296,9 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, if (out_local != NULL) { *out_local = 0; } + if (format_len) { + goto parse_error; + } bestunit = NPY_FR_D; goto finish; } @@ -242,10 +306,16 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, if ((*substr != 'T' && *substr != ' ') || sublen == 1) { goto parse_error; } + if (compare_format(&format, &format_len, substr, 1, exact)) { + goto parse_error; + } ++substr; --sublen; /* PARSE THE HOURS */ + if (compare_format(&format, &format_len, "%H", 2, exact)) { + goto parse_error; + } /* First digit required */ if (!isdigit(*substr)) { goto parse_error; @@ -274,6 +344,9 @@ int parse_iso_8601_datetime(const 
char *str, int len, int want_exc, if (!hour_was_2_digits) { goto parse_error; } + if (format_len) { + goto parse_error; + } bestunit = NPY_FR_h; goto finish; } @@ -286,6 +359,9 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, if (sublen == 0 || !isdigit(*substr)) { goto parse_error; } + if (compare_format(&format, &format_len, ":", 1, exact)) { + goto parse_error; + } } else if (!isdigit(*substr)) { if (!hour_was_2_digits) { goto parse_error; @@ -294,6 +370,9 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } /* PARSE THE MINUTES */ + if (compare_format(&format, &format_len, "%M", 2, exact)) { + goto parse_error; + } /* First digit required */ out->min = (*substr - '0'); ++substr; @@ -317,12 +396,18 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, if (sublen == 0) { bestunit = NPY_FR_m; + if (format_len) { + goto parse_error; + } goto finish; } /* If we make it through this condition block, then the next * character is a digit. 
*/ if (has_hms_sep && *substr == ':') { + if (compare_format(&format, &format_len, ":", 1, exact)) { + goto parse_error; + } ++substr; --sublen; /* Cannot have a trailing ':' */ @@ -335,6 +420,9 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } /* PARSE THE SECONDS */ + if (compare_format(&format, &format_len, "%S", 2, exact)) { + goto parse_error; + } /* First digit required */ out->sec = (*substr - '0'); ++substr; @@ -360,12 +448,18 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, if (sublen > 0 && *substr == '.') { ++substr; --sublen; + if (compare_format(&format, &format_len, ".", 1, exact)) { + goto parse_error; + } } else { bestunit = NPY_FR_s; goto parse_timezone; } /* PARSE THE MICROSECONDS (0 to 6 digits) */ + if (compare_format(&format, &format_len, "%f", 2, exact)) { + goto parse_error; + } numdigits = 0; for (i = 0; i < 6; ++i) { out->us *= 10; @@ -430,15 +524,24 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, while (sublen > 0 && isspace(*substr)) { ++substr; --sublen; + if (compare_format(&format, &format_len, " ", 1, exact)) { + goto parse_error; + } } if (sublen == 0) { // Unlike NumPy, treating no time zone as naive + if (format_len > 0) { + goto parse_error; + } goto finish; } /* UTC specifier */ if (*substr == 'Z') { + if (compare_format(&format, &format_len, "%Z", 2, exact)) { + goto parse_error; + } /* "Z" should be equivalent to tz offset "+00:00" */ if (out_local != NULL) { *out_local = 1; @@ -449,12 +552,18 @@ int parse_iso_8601_datetime(const char *str, int len, int want_exc, } if (sublen == 1) { + if (format_len > 0) { + goto parse_error; + } goto finish; } else { ++substr; --sublen; } } else if (*substr == '-' || *substr == '+') { + if (compare_format(&format, &format_len, "%z", 2, exact)) { + goto parse_error; + } /* Time zone offset */ int offset_neg = 0, offset_hour = 0, offset_minute = 0; @@ -538,9 +647,12 @@ int parse_iso_8601_datetime(const char *str, int len, int 
want_exc, while (sublen > 0 && isspace(*substr)) { ++substr; --sublen; + if (compare_format(&format, &format_len, " ", 1, exact)) { + goto parse_error; + } } - if (sublen != 0) { + if ((sublen != 0) || (format_len != 0)) { goto parse_error; } diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h index 511d9a401fed2..734f7daceba05 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h +++ b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h @@ -58,7 +58,10 @@ parse_iso_8601_datetime(const char *str, int len, int want_exc, npy_datetimestruct *out, NPY_DATETIMEUNIT *out_bestunit, int *out_local, - int *out_tzoffset); + int *out_tzoffset, + const char* format, + int format_len, + int exact); /* * Provides a string length to use for converting datetime diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 64deba8a9d3ce..ed0a7df41c28d 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -2102,6 +2102,8 @@ def objects_to_datetime64ns( errors: DateTimeErrorChoices = "raise", require_iso8601: bool = False, allow_object: bool = False, + format: str | None = None, + exact: bool = True, ): """ Convert data to array of timestamps. 
@@ -2146,6 +2148,8 @@ def objects_to_datetime64ns( dayfirst=dayfirst, yearfirst=yearfirst, require_iso8601=require_iso8601, + format=format, + exact=exact, ) result = result.reshape(data.shape, order=order) except OverflowError as err: diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 4cd89d25995bb..9b59c9ea43e45 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -424,16 +424,14 @@ def _convert_listlike_datetimes( format_is_iso8601 = format_is_iso(format) if format_is_iso8601: require_iso8601 = not infer_datetime_format - format = None - if format is not None: + if format is not None and not require_iso8601: res = _to_datetime_with_format( arg, orig_arg, name, tz, format, exact, errors, infer_datetime_format ) if res is not None: return res - assert format is None or infer_datetime_format utc = tz == "utc" result, tz_parsed = objects_to_datetime64ns( arg, @@ -443,6 +441,8 @@ def _convert_listlike_datetimes( errors=errors, require_iso8601=require_iso8601, allow_object=True, + format=format, + exact=exact, ) if tz_parsed is not None: diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 27fe4e2d5e0b6..22dbec337c8b1 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -1746,6 +1746,130 @@ def test_to_datetime_iso8601(self, cache, arg, exp_str): exp = Timestamp(exp_str) assert result[0] == exp + @pytest.mark.parametrize( + "input, format", + [ + ("2012", "%Y-%m"), + ("2012-01", "%Y-%m-%d"), + ("2012-01-01", "%Y-%m-%d %H"), + ("2012-01-01 10", "%Y-%m-%d %H:%M"), + ("2012-01-01 10:00", "%Y-%m-%d %H:%M:%S"), + ("2012-01-01 10:00:00", "%Y-%m-%d %H:%M:%S.%f"), + ("2012-01-01 10:00:00.123", "%Y-%m-%d %H:%M:%S.%f%Z"), + ("2012-01-01 10:00:00.123", "%Y-%m-%d %H:%M:%S.%f%z"), + (0, "%Y-%m-%d"), + ], + ) + @pytest.mark.parametrize("exact", [True, False]) + def test_to_datetime_iso8601_fails(self, input, format, exact): + # 
https://github.com/pandas-dev/pandas/issues/12649 + # `format` is longer than the string, so this fails regardless of `exact` + with pytest.raises( + ValueError, + match=( + rf"time data \"{input}\" at position 0 doesn't match format " + rf"\"{format}\"" + ), + ): + to_datetime(input, format=format, exact=exact) + + @pytest.mark.parametrize( + "input, format", + [ + ("2012-01-01", "%Y-%m"), + ("2012-01-01 10", "%Y-%m-%d"), + ("2012-01-01 10:00", "%Y-%m-%d %H"), + ("2012-01-01 10:00:00", "%Y-%m-%d %H:%M"), + (0, "%Y-%m-%d"), + ], + ) + def test_to_datetime_iso8601_exact_fails(self, input, format): + # https://github.com/pandas-dev/pandas/issues/12649 + # `format` is shorter than the date string, so only fails with `exact=True` + with pytest.raises( + ValueError, + match=( + rf"time data \"{input}\" at position 0 doesn't match format " + rf"\"{format}\"" + ), + ): + to_datetime(input, format=format) + + @pytest.mark.parametrize( + "input, format", + [ + ("2012-01-01", "%Y-%m"), + ("2012-01-01 00", "%Y-%m-%d"), + ("2012-01-01 00:00", "%Y-%m-%d %H"), + ("2012-01-01 00:00:00", "%Y-%m-%d %H:%M"), + ], + ) + def test_to_datetime_iso8601_non_exact(self, input, format): + # https://github.com/pandas-dev/pandas/issues/12649 + expected = Timestamp(2012, 1, 1) + result = to_datetime(input, format=format, exact=False) + assert result == expected + + @pytest.mark.parametrize( + "input, format", + [ + ("2020-01", "%Y/%m"), + ("2020-01-01", "%Y/%m/%d"), + ("2020-01-01 00", "%Y/%m/%dT%H"), + ("2020-01-01T00", "%Y/%m/%d %H"), + ("2020-01-01 00:00", "%Y/%m/%dT%H:%M"), + ("2020-01-01T00:00", "%Y/%m/%d %H:%M"), + ("2020-01-01 00:00:00", "%Y/%m/%dT%H:%M:%S"), + ("2020-01-01T00:00:00", "%Y/%m/%d %H:%M:%S"), + ], + ) + def test_to_datetime_iso8601_separator(self, input, format): + # https://github.com/pandas-dev/pandas/issues/12649 + with pytest.raises( + ValueError, + match=( + rf"time data \"{input}\" at position 0 doesn\'t match format " + rf"\"{format}\"" + ), + ): + 
to_datetime(input, format=format) + + @pytest.mark.parametrize( + "input, format", + [ + ("2020-01", "%Y-%m"), + ("2020-01-01", "%Y-%m-%d"), + ("2020-01-01 00", "%Y-%m-%d %H"), + ("2020-01-01T00", "%Y-%m-%dT%H"), + ("2020-01-01 00:00", "%Y-%m-%d %H:%M"), + ("2020-01-01T00:00", "%Y-%m-%dT%H:%M"), + ("2020-01-01 00:00:00", "%Y-%m-%d %H:%M:%S"), + ("2020-01-01T00:00:00", "%Y-%m-%dT%H:%M:%S"), + ("2020-01-01T00:00:00.000", "%Y-%m-%dT%H:%M:%S.%f"), + ("2020-01-01T00:00:00.000000", "%Y-%m-%dT%H:%M:%S.%f"), + ("2020-01-01T00:00:00.000000000", "%Y-%m-%dT%H:%M:%S.%f"), + ], + ) + def test_to_datetime_iso8601_valid(self, input, format): + # https://github.com/pandas-dev/pandas/issues/12649 + expected = Timestamp(2020, 1, 1) + result = to_datetime(input, format=format) + assert result == expected + + @pytest.mark.parametrize( + "input, format", + [ + ("2020-01-01T00:00:00.000000000+00:00", "%Y-%m-%dT%H:%M:%S.%f%z"), + ("2020-01-01T00:00:00+00:00", "%Y-%m-%dT%H:%M:%S%z"), + ("2020-01-01T00:00:00Z", "%Y-%m-%dT%H:%M:%S%Z"), + ], + ) + def test_to_datetime_iso8601_with_timezone_valid(self, input, format): + # https://github.com/pandas-dev/pandas/issues/12649 + expected = Timestamp(2020, 1, 1, tzinfo=pytz.UTC) + result = to_datetime(input, format=format) + assert result == expected + def test_to_datetime_default(self, cache): rs = to_datetime("2001", cache=cache) xp = datetime(2001, 1, 1) @@ -2259,7 +2383,7 @@ def test_day_not_in_month_raise(self, cache): @pytest.mark.parametrize("arg", ["2015-02-29", "2015-02-32", "2015-04-31"]) def test_day_not_in_month_raise_value(self, cache, arg): - msg = f'time data "{arg}" at position 0 doesn\'t match format specified' + msg = f'time data "{arg}" at position 0 doesn\'t match format "%Y-%m-%d"' with pytest.raises(ValueError, match=msg): to_datetime(arg, errors="raise", format="%Y-%m-%d", cache=cache)