From eedf0e68f70ae154ebda15bbaabef324f02f89d9 Mon Sep 17 00:00:00 2001 From: Ruijing Li Date: Mon, 4 Mar 2019 23:59:24 -0800 Subject: [PATCH] Implementing iso_week_year support for to_datetime --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/_libs/tslibs/strptime.pyx | 107 +++++++++++++++++-- pandas/core/tools/datetimes.py | 2 + pandas/tests/indexes/datetimes/test_tools.py | 57 ++++++++++ 4 files changed, 158 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index ddc5e543c6165..6b31678b09a65 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -19,6 +19,7 @@ including other versions of pandas. Other Enhancements ^^^^^^^^^^^^^^^^^^ +- Added support for ISO week year format ('%G-%V-%u') when parsing datetimes using :meth: `to_datetime` (:issue:`16607`) - Indexing of ``DataFrame`` and ``Series`` now accepts zerodim ``np.ndarray`` (:issue:`24919`) - :meth:`Timestamp.replace` now supports the ``fold`` argument to disambiguate DST transition times (:issue:`25017`) - :meth:`DataFrame.at_time` and :meth:`Series.at_time` now support :meth:`datetime.time` objects with timezones (:issue:`24043`) diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index 87658ae92175e..d3461dada0fa5 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -54,7 +54,10 @@ cdef dict _parse_code_table = {'y': 0, 'W': 16, 'Z': 17, 'p': 18, # an additional key, only with I - 'z': 19} + 'z': 19, + 'G': 20, + 'V': 21, + 'u': 22} def array_strptime(object[:] values, object fmt, @@ -77,6 +80,7 @@ def array_strptime(object[:] values, object fmt, object[:] result_timezone int year, month, day, minute, hour, second, weekday, julian int week_of_year, week_of_year_start, parse_code, ordinal + int iso_week, iso_year int64_t us, ns object val, group_key, ampm, found, timezone dict found_key @@ -169,13 +173,14 @@ def array_strptime(object[:] values, object fmt, raise ValueError("time data %r does not match format " "%r (search)" % (values[i], fmt)) + iso_year = -1 year = 1900 month = day = 1 hour = minute = second = ns = us = 0 timezone = None # Default to -1 to signify that values not known; not critical to have, # though - week_of_year = -1 + iso_week = week_of_year = -1 week_of_year_start = -1 # weekday and julian defaulted to -1 so as to signal need to calculate # values @@ -265,13 +270,44 @@ def array_strptime(object[:] values, object fmt, timezone = pytz.timezone(found_dict['Z']) elif parse_code == 19: timezone = parse_timezone_directive(found_dict['z']) + elif parse_code == 20: + iso_year = int(found_dict['G']) + elif parse_code == 21: + iso_week = int(found_dict['V']) + elif parse_code == 22: + weekday = int(found_dict['u']) + weekday -= 1 + + # don't assume default values for ISO week/year + if iso_year != -1: + if iso_week == -1 or weekday == -1: + raise ValueError("ISO year directive '%G' must be used with " + "the ISO week directive '%V' and a weekday " + "directive '%A', '%a', '%w', or '%u'.") + if julian != -1: + raise ValueError("Day of the year directive '%j' is not " + "compatible with ISO year directive '%G'. " + "Use '%Y' instead.") + elif year != -1 and week_of_year == -1 and iso_week != -1: + if weekday == -1: + raise ValueError("ISO week directive '%V' must be used with " + "the ISO year directive '%G' and a weekday " + "directive '%A', '%a', '%w', or '%u'.") + else: + raise ValueError("ISO week directive '%V' is incompatible with" + " the year directive '%Y'. Use the ISO year " + "'%G' instead.") # If we know the wk of the year and what day of that wk, we can figure # out the Julian day of the year. - if julian == -1 and week_of_year != -1 and weekday != -1: - week_starts_Mon = True if week_of_year_start == 0 else False - julian = _calc_julian_from_U_or_W(year, week_of_year, weekday, - week_starts_Mon) + if julian == -1 and weekday != -1: + if week_of_year != -1: + week_starts_Mon = week_of_year_start == 0 + julian = _calc_julian_from_U_or_W(year, week_of_year, weekday, + week_starts_Mon) + elif iso_year != -1 and iso_week != -1: + year, julian = _calc_julian_from_V(iso_year, iso_week, + weekday + 1) # Cannot pre-calculate datetime_date() since can change in Julian # calculation and thus could have different value for the day of the wk # calculation. @@ -511,6 +547,7 @@ class TimeRE(dict): # The " \d" part of the regex is to make %c from ANSI C work 'd': r"(?P3[0-1]|[1-2]\d|0[1-9]|[1-9]| [1-9])", 'f': r"(?P[0-9]{1,9})", + 'G': r"(?P\d\d\d\d)", 'H': r"(?P2[0-3]|[0-1]\d|\d)", 'I': r"(?P1[0-2]|0[1-9]|[1-9])", 'j': (r"(?P36[0-6]|3[0-5]\d|[1-2]\d\d|0[1-9]\d|00[1-9]|" @@ -518,7 +555,9 @@ class TimeRE(dict): 'm': r"(?P1[0-2]|0[1-9]|[1-9])", 'M': r"(?P[0-5]\d|\d)", 'S': r"(?P6[0-1]|[0-5]\d|\d)", + 'u': r"(?P[1-7])", 'U': r"(?P5[0-3]|[0-4]\d|\d)", + 'V': r"(?P5[0-3]|0[1-9]|[1-4]\d|\d)", 'w': r"(?P[0-6])", # W is set below by using 'U' 'y': r"(?P\d\d)", @@ -593,11 +632,27 @@ _CACHE_MAX_SIZE = 5 # Max number of regexes stored in _regex_cache _regex_cache = {} -cdef _calc_julian_from_U_or_W(int year, int week_of_year, - int day_of_week, int week_starts_Mon): +cdef int _calc_julian_from_U_or_W(int year, int week_of_year, + int day_of_week, int week_starts_Mon): """Calculate the Julian day based on the year, week of the year, and day of the week, with week_start_day representing whether the week of the year - assumes the week starts on Sunday or Monday (6 or 0).""" + assumes the week starts on Sunday or Monday (6 or 0). + + Parameters + ---------- + year : int + the year + week_of_year : int + week taken from format U or W + week_starts_Mon : int + represents whether the week of the year + assumes the week starts on Sunday or Monday (6 or 0) + + Returns + ------- + int + converted julian day + """ cdef: int first_weekday, week_0_length, days_to_week @@ -620,6 +675,40 @@ cdef _calc_julian_from_U_or_W(int year, int week_of_year, return 1 + days_to_week + day_of_week +cdef object _calc_julian_from_V(int iso_year, int iso_week, int iso_weekday): + """Calculate the Julian day based on the ISO 8601 year, week, and weekday. + ISO weeks start on Mondays, with week 01 being the week containing 4 Jan. + ISO week days range from 1 (Monday) to 7 (Sunday). + + Parameters + ---------- + iso_year : int + the year taken from format %G + iso_week : int + the week taken from format %V + iso_weekday : int + weekday taken from format %u + + Returns + ------- + (int, int) + the iso year and the Gregorian ordinal date / julian date + """ + + cdef: + int correction, ordinal + + correction = datetime_date(iso_year, 1, 4).isoweekday() + 3 + ordinal = (iso_week * 7) + iso_weekday - correction + # ordinal may be negative or 0 now, which means the date is in the previous + # calendar year + if ordinal < 1: + ordinal += datetime_date(iso_year, 1, 1).toordinal() + iso_year -= 1 + ordinal -= datetime_date(iso_year, 1, 1).toordinal() + return iso_year, ordinal + + cdef parse_timezone_directive(object z): """ Parse the '%z' directive and return a pytz.FixedOffset diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 64e06787db6fe..80a7deecdffbe 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -455,6 +455,8 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, format : string, default None strftime to parse time, eg "%d/%m/%Y", note that "%f" will parse all the way up to nanoseconds. + See strftime documentation for more information on choices: + https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior exact : boolean, True by default - If True, require an exact format match. diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index 1a1e33bd508fc..22e589beb8ba1 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -244,6 +244,63 @@ def test_to_datetime_parse_timezone_keeps_name(self): class TestToDatetime(object): + @pytest.mark.parametrize("s, _format, dt", [ + ['2015-1-1', '%G-%V-%u', datetime(2014, 12, 29, 0, 0)], + ['2015-1-4', '%G-%V-%u', datetime(2015, 1, 1, 0, 0)], + ['2015-1-7', '%G-%V-%u', datetime(2015, 1, 4, 0, 0)] + ]) + def test_to_datetime_iso_week_year_format(self, s, _format, dt): + # See GH#16607 + assert to_datetime(s, format=_format) == dt + + @pytest.mark.parametrize("msg, s, _format", [ + ["ISO week directive '%V' must be used with the ISO year directive " + "'%G' and a weekday directive '%A', '%a', '%w', or '%u'.", "1999 50", + "%Y %V"], + ["ISO year directive '%G' must be used with the ISO week directive " + "'%V' and a weekday directive '%A', '%a', '%w', or '%u'.", "1999 51", + "%G %V"], + ["ISO year directive '%G' must be used with the ISO week directive " + "'%V' and a weekday directive '%A', '%a', '%w', or '%u'.", "1999 " + "Monday", "%G %A"], + ["ISO year directive '%G' must be used with the ISO week directive " + "'%V' and a weekday directive '%A', '%a', '%w', or '%u'.", "1999 Mon", + "%G %a"], + ["ISO year directive '%G' must be used with the ISO week directive " + "'%V' and a weekday directive '%A', '%a', '%w', or '%u'.", "1999 6", + "%G %w"], + ["ISO year directive '%G' must be used with the ISO week directive " + "'%V' and a weekday directive '%A', '%a', '%w', or '%u'.", "1999 6", + "%G %u"], + ["ISO year directive '%G' must be used with the ISO week directive " + "'%V' and a weekday directive '%A', '%a', '%w', or '%u'.", "2051", + "%G"], + ["Day of the year directive '%j' is not compatible with ISO year " + "directive '%G'. Use '%Y' instead.", "1999 51 6 256", "%G %V %u %j"], + ["ISO week directive '%V' is incompatible with the year directive " + "'%Y'. Use the ISO year '%G' instead.", "1999 51 Sunday", "%Y %V %A"], + ["ISO week directive '%V' is incompatible with the year directive " + "'%Y'. Use the ISO year '%G' instead.", "1999 51 Sun", "%Y %V %a"], + ["ISO week directive '%V' is incompatible with the year directive " + "'%Y'. Use the ISO year '%G' instead.", "1999 51 1", "%Y %V %w"], + ["ISO week directive '%V' is incompatible with the year directive " + "'%Y'. Use the ISO year '%G' instead.", "1999 51 1", "%Y %V %u"], + ["ISO week directive '%V' must be used with the ISO year directive " + "'%G' and a weekday directive '%A', '%a', '%w', or '%u'.", "20", "%V"] + ]) + def test_error_iso_week_year(self, msg, s, _format): + # See GH#16607 + # This test checks for errors thrown when giving the wrong format + # However, as discussed on PR#25541, overriding the locale + # causes a different error to be thrown due to the format being + # locale specific, but the test data is in english. + # Therefore, the tests only run when locale is not overwritten, + # as a sort of solution to this problem. + if (locale.getlocale() != ('zh_CN', 'UTF-8') and + locale.getlocale() != ('it_IT', 'UTF-8')): + with pytest.raises(ValueError, match=msg): + to_datetime(s, format=_format) + @pytest.mark.parametrize('tz', [None, 'US/Central']) def test_to_datetime_dtarr(self, tz): # DatetimeArray