From b0428ec3fd74b0e078372708003e048bc5ea03eb Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Mon, 28 Aug 2017 14:30:18 -0700 Subject: [PATCH 01/15] Separate parsing functions out from tslib Move parsing functions from _libs/src/inference and core.tools.datetimes --- pandas/_libs/period.pyx | 5 +- pandas/_libs/src/inference.pyx | 166 +------- pandas/_libs/tslib.pyx | 279 +------------ pandas/_libs/tslibs/__init__.py | 2 + pandas/_libs/tslibs/parsing.pyx | 688 ++++++++++++++++++++++++++++++++ pandas/core/tools/datetimes.py | 158 +------- setup.py | 2 + 7 files changed, 720 insertions(+), 580 deletions(-) create mode 100644 pandas/_libs/tslibs/__init__.py create mode 100644 pandas/_libs/tslibs/parsing.pyx diff --git a/pandas/_libs/period.pyx b/pandas/_libs/period.pyx index 816b7ebfff86d..be8f9da1b7849 100644 --- a/pandas/_libs/period.pyx +++ b/pandas/_libs/period.pyx @@ -40,8 +40,9 @@ from tslib cimport ( _get_dst_info, _nat_scalar_rules) +from tslibs.parsing import parse_time_string, NAT_SENTINEL + from pandas.tseries import offsets -from pandas.core.tools.datetimes import parse_time_string from pandas.tseries import frequencies cdef int64_t NPY_NAT = util.get_nat() @@ -1178,6 +1179,8 @@ class Period(_Period): value = str(value) value = value.upper() dt, _, reso = parse_time_string(value, freq) + if dt is NAT_SENTINEL: + ordinal = iNaT if freq is None: try: diff --git a/pandas/_libs/src/inference.pyx b/pandas/_libs/src/inference.pyx index 6b5a8f20f0067..d71516d309bb9 100644 --- a/pandas/_libs/src/inference.pyx +++ b/pandas/_libs/src/inference.pyx @@ -8,6 +8,13 @@ iNaT = util.get_nat() cdef bint PY2 = sys.version_info[0] == 2 +from pandas._libs.tslibs.parsing import ( + try_parse_dates, + try_parse_date_and_time, + try_parse_year_month_day, + try_parse_datetime_components) + + from util cimport (UINT8_MAX, UINT16_MAX, UINT32_MAX, UINT64_MAX, INT8_MIN, INT8_MAX, INT16_MIN, INT16_MAX, INT32_MAX, INT32_MIN, INT64_MAX, INT64_MIN) @@ -1383,165 +1390,6 @@ def convert_sql_column(x): return maybe_convert_objects(x, try_float=1) -def try_parse_dates(ndarray[object] values, parser=None, - dayfirst=False, default=None): - cdef: - Py_ssize_t i, n - ndarray[object] result - - n = len(values) - result = np.empty(n, dtype='O') - - if parser is None: - if default is None: # GH2618 - date=datetime.now() - default=datetime(date.year, date.month, 1) - - try: - from dateutil.parser import parse - parse_date = lambda x: parse(x, dayfirst=dayfirst, default=default) - except ImportError: # pragma: no cover - def parse_date(s): - try: - return datetime.strptime(s, '%m/%d/%Y') - except Exception: - return s - # EAFP here - try: - for i from 0 <= i < n: - if values[i] == '': - result[i] = np.nan - else: - result[i] = parse_date(values[i]) - except Exception: - # failed - return values - else: - parse_date = parser - - try: - for i from 0 <= i < n: - if values[i] == '': - result[i] = np.nan - else: - result[i] = parse_date(values[i]) - except Exception: - # raise if passed parser and it failed - raise - - return result - - -def try_parse_date_and_time(ndarray[object] dates, ndarray[object] times, - date_parser=None, time_parser=None, - dayfirst=False, default=None): - cdef: - Py_ssize_t i, n - ndarray[object] result - - from datetime import date, time, datetime, timedelta - - n = len(dates) - if len(times) != n: - raise ValueError('Length of dates and times must be equal') - result = np.empty(n, dtype='O') - - if date_parser is None: - if default is None: # GH2618 - date=datetime.now() - default=datetime(date.year, date.month, 1) - - try: - from dateutil.parser import parse - parse_date = lambda x: parse(x, dayfirst=dayfirst, default=default) - except ImportError: # pragma: no cover - def parse_date(s): - try: - return date.strptime(s, '%m/%d/%Y') - except Exception: - return s - else: - parse_date = date_parser - - if time_parser is None: - try: - from dateutil.parser import parse - parse_time = lambda x: parse(x) - except ImportError: # pragma: no cover - def parse_time(s): - try: - return time.strptime(s, '%H:%M:%S') - except Exception: - return s - - else: - parse_time = time_parser - - for i from 0 <= i < n: - d = parse_date(str(dates[i])) - t = parse_time(str(times[i])) - result[i] = datetime(d.year, d.month, d.day, - t.hour, t.minute, t.second) - - return result - - -def try_parse_year_month_day(ndarray[object] years, ndarray[object] months, - ndarray[object] days): - cdef: - Py_ssize_t i, n - ndarray[object] result - - from datetime import datetime - - n = len(years) - if len(months) != n or len(days) != n: - raise ValueError('Length of years/months/days must all be equal') - result = np.empty(n, dtype='O') - - for i from 0 <= i < n: - result[i] = datetime(int(years[i]), int(months[i]), int(days[i])) - - return result - - -def try_parse_datetime_components(ndarray[object] years, - ndarray[object] months, - ndarray[object] days, - ndarray[object] hours, - ndarray[object] minutes, - ndarray[object] seconds): - - cdef: - Py_ssize_t i, n - ndarray[object] result - int secs - double float_secs - double micros - - from datetime import datetime - - n = len(years) - if (len(months) != n or len(days) != n or len(hours) != n or - len(minutes) != n or len(seconds) != n): - raise ValueError('Length of all datetime components must be equal') - result = np.empty(n, dtype='O') - - for i from 0 <= i < n: - float_secs = float(seconds[i]) - secs = int(float_secs) - - micros = float_secs - secs - if micros > 0: - micros = micros * 1000000 - - result[i] = datetime(int(years[i]), int(months[i]), int(days[i]), - int(hours[i]), int(minutes[i]), secs, - int(micros)) - - return result - - def sanitize_objects(ndarray[object] values, set na_values, convert_empty=True): cdef: diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index b5aca2e3ec309..acd3dfb3d3ed1 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -66,6 +66,15 @@ from khash cimport ( kh_init_int64, kh_int64_t, kh_resize_int64, kh_get_int64) +from tslibs.parsing import parse_time_string +from tslibs import parsing # noqa +from tslibs.parsing import ( # noqa + DateParseError, + NAT_SENTINEL, + parse_datetime_string, + _does_string_look_like_datetime, + parse_datetime_string_with_reso) + cimport cython import re @@ -1822,25 +1831,6 @@ def datetime_to_datetime64(ndarray[object] values): return result, inferred_tz -cdef: - set _not_datelike_strings = set(['a', 'A', 'm', 'M', 'p', 'P', 't', 'T']) - -cpdef bint _does_string_look_like_datetime(object date_string): - if date_string.startswith('0'): - # Strings starting with 0 are more consistent with a - # date-like string than a number - return True - - try: - if float(date_string) < 1000: - return False - except ValueError: - pass - - if date_string in _not_datelike_strings: - return False - - return True def format_array_from_datetime(ndarray[int64_t] values, object tz=None, @@ -1926,257 +1916,6 @@ def format_array_from_datetime(ndarray[int64_t] values, object tz=None, return result -class DateParseError(ValueError): - pass - - -cdef object _TIMEPAT = re.compile(r'^([01]?[0-9]|2[0-3]):([0-5][0-9])') - - -def parse_datetime_string(object date_string, object freq=None, - dayfirst=False, yearfirst=False, **kwargs): - """parse datetime string, only returns datetime. - Also cares special handling matching time patterns. - - Returns - ------- - datetime - """ - - cdef: - object dt - - if not _does_string_look_like_datetime(date_string): - raise ValueError('Given date string not likely a datetime.') - - if _TIMEPAT.match(date_string): - # use current datetime as default, not pass _DEFAULT_DATETIME - dt = parse_date(date_string, dayfirst=dayfirst, - yearfirst=yearfirst, **kwargs) - return dt - try: - dt, _, _ = _parse_dateabbr_string(date_string, _DEFAULT_DATETIME, freq) - return dt - except DateParseError: - raise - except ValueError: - pass - - try: - dt = parse_date(date_string, default=_DEFAULT_DATETIME, - dayfirst=dayfirst, yearfirst=yearfirst, **kwargs) - except TypeError: - # following may be raised from dateutil - # TypeError: 'NoneType' object is not iterable - raise ValueError('Given date string not likely a datetime.') - - return dt - - -def parse_datetime_string_with_reso(object date_string, object freq=None, - dayfirst=False, yearfirst=False, **kwargs): - """parse datetime string, only returns datetime - - Returns - ------- - datetime - """ - - cdef: - object parsed, reso - - if not _does_string_look_like_datetime(date_string): - raise ValueError('Given date string not likely a datetime.') - - try: - return _parse_dateabbr_string(date_string, _DEFAULT_DATETIME, freq) - except DateParseError: - raise - except ValueError: - pass - - try: - parsed, reso = dateutil_parse(date_string, _DEFAULT_DATETIME, - dayfirst=dayfirst, yearfirst=yearfirst) - except Exception as e: - # TODO: allow raise of errors within instead - raise DateParseError(e) - if parsed is None: - raise DateParseError("Could not parse %s" % date_string) - return parsed, parsed, reso - - -cdef inline object _parse_dateabbr_string(object date_string, object default, - object freq): - cdef: - object ret - int year, quarter = -1, month, mnum, date_len - - # special handling for possibilities eg, 2Q2005, 2Q05, 2005Q1, 05Q1 - assert util.is_string_object(date_string) - - # len(date_string) == 0 - # should be NaT??? - - if date_string in _nat_strings: - return NaT, NaT, '' - - date_string = date_string.upper() - date_len = len(date_string) - - if date_len == 4: - # parse year only like 2000 - try: - ret = default.replace(year=int(date_string)) - return ret, ret, 'year' - except ValueError: - pass - - try: - if 4 <= date_len <= 7: - i = date_string.index('Q', 1, 6) - if i == 1: - quarter = int(date_string[0]) - if date_len == 4 or (date_len == 5 - and date_string[i + 1] == '-'): - # r'(\d)Q-?(\d\d)') - year = 2000 + int(date_string[-2:]) - elif date_len == 6 or (date_len == 7 - and date_string[i + 1] == '-'): - # r'(\d)Q-?(\d\d\d\d)') - year = int(date_string[-4:]) - else: - raise ValueError - elif i == 2 or i == 3: - # r'(\d\d)-?Q(\d)' - if date_len == 4 or (date_len == 5 - and date_string[i - 1] == '-'): - quarter = int(date_string[-1]) - year = 2000 + int(date_string[:2]) - else: - raise ValueError - elif i == 4 or i == 5: - if date_len == 6 or (date_len == 7 - and date_string[i - 1] == '-'): - # r'(\d\d\d\d)-?Q(\d)' - quarter = int(date_string[-1]) - year = int(date_string[:4]) - else: - raise ValueError - - if not (1 <= quarter <= 4): - msg = ('Incorrect quarterly string is given, quarter must be ' - 'between 1 and 4: {0}') - raise DateParseError(msg.format(date_string)) - - if freq is not None: - # hack attack, #1228 - try: - mnum = _MONTH_NUMBERS[_get_rule_month(freq)] + 1 - except (KeyError, ValueError): - msg = ('Unable to retrieve month information from given ' - 'freq: {0}').format(freq) - raise DateParseError(msg) - - month = (mnum + (quarter - 1) * 3) % 12 + 1 - if month > mnum: - year -= 1 - else: - month = (quarter - 1) * 3 + 1 - - ret = default.replace(year=year, month=month) - return ret, ret, 'quarter' - - except DateParseError: - raise - except ValueError: - pass - - if date_len == 6 and (freq == 'M' or getattr( - freq, 'rule_code', None) == 'M'): - year = int(date_string[:4]) - month = int(date_string[4:6]) - try: - ret = default.replace(year=year, month=month) - return ret, ret, 'month' - except ValueError: - pass - - for pat in ['%Y-%m', '%m-%Y', '%b %Y', '%b-%Y']: - try: - ret = datetime.strptime(date_string, pat) - return ret, ret, 'month' - except ValueError: - pass - - raise ValueError('Unable to parse {0}'.format(date_string)) - - -def dateutil_parse(object timestr, object default, ignoretz=False, - tzinfos=None, **kwargs): - """ lifted from dateutil to get resolution""" - - cdef: - object fobj, res, attr, ret, tzdata - object reso = None - dict repl = {} - - fobj = StringIO(str(timestr)) - res = DEFAULTPARSER._parse(fobj, **kwargs) - - # dateutil 2.2 compat - if isinstance(res, tuple): - res, _ = res - - if res is None: - msg = "Unknown datetime string format, unable to parse: {0}" - raise ValueError(msg.format(timestr)) - - for attr in ["year", "month", "day", "hour", - "minute", "second", "microsecond"]: - value = getattr(res, attr) - if value is not None: - repl[attr] = value - reso = attr - - if reso is None: - msg = "Unable to parse datetime string: {0}" - raise ValueError(msg.format(timestr)) - - if reso == 'microsecond': - if repl['microsecond'] == 0: - reso = 'second' - elif repl['microsecond'] % 1000 == 0: - reso = 'millisecond' - - ret = default.replace(**repl) - if res.weekday is not None and not res.day: - ret = ret + relativedelta.relativedelta(weekday=res.weekday) - if not ignoretz: - if callable(tzinfos) or tzinfos and res.tzname in tzinfos: - if callable(tzinfos): - tzdata = tzinfos(res.tzname, res.tzoffset) - else: - tzdata = tzinfos.get(res.tzname) - if isinstance(tzdata, datetime.tzinfo): - tzinfo = tzdata - elif isinstance(tzdata, string_types): - tzinfo = _dateutil_tzstr(tzdata) - elif isinstance(tzdata, int): - tzinfo = tzoffset(res.tzname, tzdata) - else: - raise ValueError("offset must be tzinfo subclass, " - "tz string, or int offset") - ret = ret.replace(tzinfo=tzinfo) - elif res.tzname and res.tzname in time.tzname: - ret = ret.replace(tzinfo=_dateutil_tzlocal()) - elif res.tzoffset == 0: - ret = ret.replace(tzinfo=_dateutil_tzutc()) - elif res.tzoffset: - ret = ret.replace(tzinfo=tzoffset(res.tzname, res.tzoffset)) - return ret, reso - - # const for parsers _DEFAULT_DATETIME = datetime(1, 1, 1).replace( diff --git a/pandas/_libs/tslibs/__init__.py b/pandas/_libs/tslibs/__init__.py new file mode 100644 index 0000000000000..faa18be5bbf7d --- /dev/null +++ b/pandas/_libs/tslibs/__init__.py @@ -0,0 +1,2 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx new file mode 100644 index 0000000000000..5653d05436f36 --- /dev/null +++ b/pandas/_libs/tslibs/parsing.pyx @@ -0,0 +1,688 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# cython: profile=False +# cython: linetrace=False +# distutils: define_macros=CYTHON_TRACE=0 +# distutils: define_macros=CYTHON_TRACE_NOGIL=0 +import sys +import re + +from cpython cimport PyString_Check, PyUnicode_Check + +from libc.stdlib cimport free + +cimport cython +from cython cimport Py_ssize_t + + +from datetime import datetime +import time + +import numpy as np +cimport numpy as np +from numpy cimport int64_t, ndarray +np.import_array() + +# Avoid import from outside _libs +if sys.version_info.major == 2: + string_types = basestring + from StringIO import StringIO +else: + string_types = str + from io import StringIO + + +# dateutil compat +from dateutil.tz import (tzoffset, + tzlocal as _dateutil_tzlocal, + tzfile as _dateutil_tzfile, + tzutc as _dateutil_tzutc, + tzstr as _dateutil_tzstr) +from dateutil.relativedelta import relativedelta +from dateutil.parser import DEFAULTPARSER +from dateutil.parser import parse as du_parse + + +class DateParseError(ValueError): + pass + +_nat_strings = set(['NaT', 'nat', 'NAT', 'nan', 'NaN', 'NAN']) + +_DEFAULT_DATETIME = datetime(1, 1, 1).replace(hour=0, minute=0, + second=0, microsecond=0) +_MONTHS = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', + 'AUG', 'SEP', 'OCT', 'NOV', 'DEC'] +_MONTH_NUMBERS = {k: i for i, k in enumerate(_MONTHS)} +_MONTH_ALIASES = {(k + 1): v for k, v in enumerate(_MONTHS)} + +cdef object _TIMEPAT = re.compile(r'^([01]?[0-9]|2[0-3]):([0-5][0-9])') + +cdef set _not_datelike_strings = set(['a', 'A', 'm', 'M', 'p', 'P', 't', 'T']) + +NAT_SENTINEL = object() +# This allows us to reference NaT without having to import it + + +@cython.locals(date_string=object, freq=object) +def parse_datetime_string(date_string, freq=None, dayfirst=False, + yearfirst=False, **kwargs): + """parse datetime string, only returns datetime. + Also cares special handling matching time patterns. + + Returns + ------- + datetime + """ + + cdef: + object dt + + if not _does_string_look_like_datetime(date_string): + raise ValueError('Given date string not likely a datetime.') + + if _TIMEPAT.match(date_string): + # use current datetime as default, not pass _DEFAULT_DATETIME + dt = du_parse(date_string, dayfirst=dayfirst, + yearfirst=yearfirst, **kwargs) + return dt + try: + dt, _, _ = _parse_dateabbr_string(date_string, _DEFAULT_DATETIME, freq) + return dt + except DateParseError: + raise + except ValueError: + pass + + try: + dt = du_parse(date_string, default=_DEFAULT_DATETIME, + dayfirst=dayfirst, yearfirst=yearfirst, **kwargs) + except TypeError: + # following may be raised from dateutil + # TypeError: 'NoneType' object is not iterable + raise ValueError('Given date string not likely a datetime.') + + return dt + + +# Moved from pandas.core.tools.datetimes +def parse_time_string(arg, freq=None, dayfirst=None, yearfirst=None): + """ + Try hard to parse datetime string, leveraging dateutil plus some extra + goodies like quarter recognition. + + Parameters + ---------- + arg : compat.string_types + freq : str or DateOffset, default None + Helps with interpreting time string if supplied + dayfirst : bool, default None + If None uses default from print_config + yearfirst : bool, default None + If None uses default from print_config + + Returns + ------- + datetime, datetime/dateutil.parser._result, str + """ + + if not isinstance(arg, string_types): + return arg + + if getattr(freq, "_typ", None) == "dateoffset": + freq = freq.rule_code + + if dayfirst is None: + from pandas.core.config import get_option + dayfirst = get_option("display.date_dayfirst") + if yearfirst is None: + from pandas.core.config import get_option + yearfirst = get_option("display.date_yearfirst") + + return parse_datetime_string_with_reso(arg, freq=freq, + dayfirst=dayfirst, + yearfirst=yearfirst) + + +@cython.locals(date_string=object, freq=object) +def parse_datetime_string_with_reso(date_string, freq=None, dayfirst=False, + yearfirst=False, **kwargs): + """parse datetime string, only returns datetime + + Returns + ------- + datetime + """ + + cdef: + object parsed, reso + + if not _does_string_look_like_datetime(date_string): + raise ValueError('Given date string not likely a datetime.') + + try: + return _parse_dateabbr_string(date_string, _DEFAULT_DATETIME, freq) + except DateParseError: + raise + except ValueError: + pass + + try: + parsed, reso = dateutil_parse(date_string, _DEFAULT_DATETIME, + dayfirst=dayfirst, yearfirst=yearfirst) + except Exception as e: + # TODO: allow raise of errors within instead + raise DateParseError(e) + if parsed is None: + raise DateParseError("Could not parse %s" % date_string) + return parsed, parsed, reso + + +@cython.returns(cython.bint) +@cython.locals(date_string=object) +@cython.ccall +def _does_string_look_like_datetime(date_string): + if date_string.startswith('0'): + # Strings starting with 0 are more consistent with a + # date-like string than a number + return True + + try: + if float(date_string) < 1000: + return False + except ValueError: + pass + + if date_string in _not_datelike_strings: + return False + + return True + + +@cython.returns(object) +@cython.locals(date_string=object, default=object, freq=object) +@cython.inline +@cython.cfunc +def _parse_dateabbr_string(date_string, default, freq): + cdef: + object ret + int year, quarter = -1, month, mnum, date_len + + # special handling for possibilities eg, 2Q2005, 2Q05, 2005Q1, 05Q1 + assert isinstance(date_string, string_types) + + # len(date_string) == 0 + # should be NaT??? + + if date_string in _nat_strings: + return NAT_SENTINEL, NAT_SENTINEL, '' + + date_string = date_string.upper() + date_len = len(date_string) + + if date_len == 4: + # parse year only like 2000 + try: + ret = default.replace(year=int(date_string)) + return ret, ret, 'year' + except ValueError: + pass + + try: + if 4 <= date_len <= 7: + i = date_string.index('Q', 1, 6) + if i == 1: + quarter = int(date_string[0]) + if date_len == 4 or (date_len == 5 + and date_string[i + 1] == '-'): + # r'(\d)Q-?(\d\d)') + year = 2000 + int(date_string[-2:]) + elif date_len == 6 or (date_len == 7 + and date_string[i + 1] == '-'): + # r'(\d)Q-?(\d\d\d\d)') + year = int(date_string[-4:]) + else: + raise ValueError + elif i == 2 or i == 3: + # r'(\d\d)-?Q(\d)' + if date_len == 4 or (date_len == 5 + and date_string[i - 1] == '-'): + quarter = int(date_string[-1]) + year = 2000 + int(date_string[:2]) + else: + raise ValueError + elif i == 4 or i == 5: + if date_len == 6 or (date_len == 7 + and date_string[i - 1] == '-'): + # r'(\d\d\d\d)-?Q(\d)' + quarter = int(date_string[-1]) + year = int(date_string[:4]) + else: + raise ValueError + + if not (1 <= quarter <= 4): + msg = ('Incorrect quarterly string is given, quarter must be ' + 'between 1 and 4: {0}') + raise DateParseError(msg.format(date_string)) + + if freq is not None: + # hack attack, #1228 + try: + mnum = _MONTH_NUMBERS[_get_rule_month(freq)] + 1 + except (KeyError, ValueError): + msg = ('Unable to retrieve month information from given ' + 'freq: {0}').format(freq) + raise DateParseError(msg) + + month = (mnum + (quarter - 1) * 3) % 12 + 1 + if month > mnum: + year -= 1 + else: + month = (quarter - 1) * 3 + 1 + + ret = default.replace(year=year, month=month) + return ret, ret, 'quarter' + + except DateParseError: + raise + except ValueError: + pass + + if date_len == 6 and (freq == 'M' or + getattr(freq, 'rule_code', None) == 'M'): + year = int(date_string[:4]) + month = int(date_string[4:6]) + try: + ret = default.replace(year=year, month=month) + return ret, ret, 'month' + except ValueError: + pass + + for pat in ['%Y-%m', '%m-%Y', '%b %Y', '%b-%Y']: + try: + ret = datetime.strptime(date_string, pat) + return ret, ret, 'month' + except ValueError: + pass + + raise ValueError('Unable to parse {0}'.format(date_string)) + + +@cython.locals(timestr=object, default=object) +def dateutil_parse(timestr, default, ignoretz=False, tzinfos=None, **kwargs): + """ lifted from dateutil to get resolution""" + + cdef: + object fobj, res, attr, ret, tzdata + object reso = None + dict repl = {} + + fobj = StringIO(str(timestr)) + res = DEFAULTPARSER._parse(fobj, **kwargs) + + # dateutil 2.2 compat + if isinstance(res, tuple): # PyTuple_Check + res, _ = res + + if res is None: + msg = "Unknown datetime string format, unable to parse: {0}" + raise ValueError(msg.format(timestr)) + + for attr in ["year", "month", "day", "hour", + "minute", "second", "microsecond"]: + value = getattr(res, attr) + if value is not None: + repl[attr] = value + reso = attr + + if reso is None: + msg = "Unable to parse datetime string: {0}" + raise ValueError(msg.format(timestr)) + + if reso == 'microsecond': + if repl['microsecond'] == 0: + reso = 'second' + elif repl['microsecond'] % 1000 == 0: + reso = 'millisecond' + + ret = default.replace(**repl) + if res.weekday is not None and not res.day: + ret = ret + relativedelta.relativedelta(weekday=res.weekday) + if not ignoretz: + if callable(tzinfos) or tzinfos and res.tzname in tzinfos: + if callable(tzinfos): + tzdata = tzinfos(res.tzname, res.tzoffset) + else: + tzdata = tzinfos.get(res.tzname) + if isinstance(tzdata, datetime.tzinfo): + tzinfo = tzdata + elif isinstance(tzdata, string_types): + tzinfo = _dateutil_tzstr(tzdata) + elif isinstance(tzdata, int): + tzinfo = tzoffset(res.tzname, tzdata) + else: + raise ValueError("offset must be tzinfo subclass, " + "tz string, or int offset") + ret = ret.replace(tzinfo=tzinfo) + elif res.tzname and res.tzname in time.tzname: + ret = ret.replace(tzinfo=_dateutil_tzlocal()) + elif res.tzoffset == 0: + ret = ret.replace(tzinfo=_dateutil_tzutc()) + elif res.tzoffset: + ret = ret.replace(tzinfo=tzoffset(res.tzname, res.tzoffset)) + return ret, reso + + +# The canonical place for this appears to be in frequencies.pyx. +@cython.returns(object) +@cython.locals(source=object, default=object) +@cython.ccall +def _get_rule_month(source, default='DEC'): + """ + Return starting month of given freq, default is December. + + Example + ------- + >>> _get_rule_month('D') + 'DEC' + + >>> _get_rule_month('A-JAN') + 'JAN' + """ + if hasattr(source, 'freqstr'): + source = source.freqstr + source = source.upper() + if '-' not in source: + return default + else: + return source.split('-')[1] + + +#---------------------------------------------------------------------- +# Parsing for type-inference + + +def try_parse_dates(ndarray[object] values, parser=None, + dayfirst=False, default=None): + cdef: + Py_ssize_t i, n + ndarray[object] result + + n = len(values) + result = np.empty(n, dtype='O') + + if parser is None: + if default is None: # GH2618 + date = datetime.now() + default = datetime(date.year, date.month, 1) + + parse_date = lambda x: du_parse(x, dayfirst=dayfirst, default=default) + + # EAFP here + try: + for i from 0 <= i < n: + if values[i] == '': + result[i] = np.nan + else: + result[i] = parse_date(values[i]) + except Exception: + # failed + return values + else: + parse_date = parser + + try: + for i from 0 <= i < n: + if values[i] == '': + result[i] = np.nan + else: + result[i] = parse_date(values[i]) + except Exception: + # raise if passed parser and it failed + raise + + return result + + +def try_parse_date_and_time(ndarray[object] dates, ndarray[object] times, + date_parser=None, time_parser=None, + dayfirst=False, default=None): + cdef: + Py_ssize_t i, n + ndarray[object] result + + n = len(dates) + if len(times) != n: + raise ValueError('Length of dates and times must be equal') + result = np.empty(n, dtype='O') + + if date_parser is None: + if default is None: # GH2618 + date = datetime.now() + default = datetime(date.year, date.month, 1) + + parse_date = lambda x: du_parse(x, dayfirst=dayfirst, default=default) + + else: + parse_date = date_parser + + if time_parser is None: + parse_time = lambda x: du_parse(x) + + else: + parse_time = time_parser + + for i from 0 <= i < n: + d = parse_date(str(dates[i])) + t = parse_time(str(times[i])) + result[i] = datetime(d.year, d.month, d.day, + t.hour, t.minute, t.second) + + return result + + +def try_parse_year_month_day(ndarray[object] years, ndarray[object] months, + ndarray[object] days): + cdef: + Py_ssize_t i, n + ndarray[object] result + + n = len(years) + if len(months) != n or len(days) != n: + raise ValueError('Length of years/months/days must all be equal') + result = np.empty(n, dtype='O') + + for i from 0 <= i < n: + result[i] = datetime(int(years[i]), int(months[i]), int(days[i])) + + return result + + +def try_parse_datetime_components(ndarray[object] years, + ndarray[object] months, + ndarray[object] days, + ndarray[object] hours, + ndarray[object] minutes, + ndarray[object] seconds): + + cdef: + Py_ssize_t i, n + ndarray[object] result + int secs + double float_secs + double micros + + n = len(years) + if (len(months) != n or len(days) != n or len(hours) != n or + len(minutes) != n or len(seconds) != n): + raise ValueError('Length of all datetime components must be equal') + result = np.empty(n, dtype='O') + + for i from 0 <= i < n: + float_secs = float(seconds[i]) + secs = int(float_secs) + + micros = float_secs - secs + if micros > 0: + micros = micros * 1000000 + + result[i] = datetime(int(years[i]), int(months[i]), int(days[i]), + int(hours[i]), int(minutes[i]), secs, + int(micros)) + + return result + + +#---------------------------------------------------------------------- +# Miscellaneous functions moved from core.tools.datetimes + +_DATEUTIL_LEXER_SPLIT = None +try: + # Since these are private methods from dateutil, it is safely imported + # here so in case this interface changes, pandas will just fallback + # to not using the functionality + from dateutil.parser import _timelex + + if hasattr(_timelex, 'split'): + def _lexer_split_from_str(dt_str): + # The StringIO(str(_)) is for dateutil 2.2 compatibility + return _timelex.split(StringIO(str(dt_str))) + + _DATEUTIL_LEXER_SPLIT = _lexer_split_from_str +except (ImportError, AttributeError): + pass + + +def _format_is_iso(f): + """ + Does format match the iso8601 set that can be handled by the C parser? + Generally of form YYYY-MM-DDTHH:MM:SS - date separator can be different + but must be consistent. Leading 0s in dates and times are optional. + """ + iso_template = '%Y{date_sep}%m{date_sep}%d{time_sep}%H:%M:%S.%f'.format + excluded_formats = ['%Y%m%d', '%Y%m', '%Y'] + + for date_sep in [' ', '/', '\\', '-', '.', '']: + for time_sep in [' ', 'T']: + if (iso_template(date_sep=date_sep, + time_sep=time_sep + ).startswith(f) and f not in excluded_formats): + return True + return False + + +def _guess_datetime_format(dt_str, dayfirst=False, dt_str_parse=du_parse, + dt_str_split=_DATEUTIL_LEXER_SPLIT): + """ + Guess the datetime format of a given datetime string. + + Parameters + ---------- + dt_str : string, datetime string to guess the format of + dayfirst : boolean, default False + If True parses dates with the day first, eg 20/01/2005 + Warning: dayfirst=True is not strict, but will prefer to parse + with day first (this is a known bug). + dt_str_parse : function, defaults to `compat.parse_date` (dateutil) + This function should take in a datetime string and return + a `datetime.datetime` guess that the datetime string represents + dt_str_split : function, defaults to `_DATEUTIL_LEXER_SPLIT` (dateutil) + This function should take in a datetime string and return + a list of strings, the guess of the various specific parts + e.g. '2011/12/30' -> ['2011', '/', '12', '/', '30'] + + Returns + ------- + ret : datetime format string (for `strftime` or `strptime`) + """ + if dt_str_parse is None or dt_str_split is None: + return None + + if not isinstance(dt_str, string_types): + return None + + day_attribute_and_format = (('day',), '%d', 2) + + # attr name, format, padding (if any) + datetime_attrs_to_format = [ + (('year', 'month', 'day'), '%Y%m%d', 0), + (('year',), '%Y', 0), + (('month',), '%B', 0), + (('month',), '%b', 0), + (('month',), '%m', 2), + day_attribute_and_format, + (('hour',), '%H', 2), + (('minute',), '%M', 2), + (('second',), '%S', 2), + (('microsecond',), '%f', 6), + (('second', 'microsecond'), '%S.%f', 0), + ] + + if dayfirst: + datetime_attrs_to_format.remove(day_attribute_and_format) + datetime_attrs_to_format.insert(0, day_attribute_and_format) + + try: + parsed_datetime = dt_str_parse(dt_str, dayfirst=dayfirst) + except: + # In case the datetime can't be parsed, its format cannot be guessed + return None + + if parsed_datetime is None: + return None + + try: + tokens = dt_str_split(dt_str) + except: + # In case the datetime string can't be split, its format cannot + # be guessed + return None + + format_guess = [None] * len(tokens) + found_attrs = set() + + for attrs, attr_format, padding in datetime_attrs_to_format: + # If a given attribute has been placed in the format string, skip + # over other formats for that same underlying attribute (IE, month + # can be represented in multiple different ways) + if set(attrs) & found_attrs: + continue + + if all(getattr(parsed_datetime, attr) is not None for attr in attrs): + for i, token_format in enumerate(format_guess): + token_filled = tokens[i].zfill(padding) + if (token_format is None and + token_filled == parsed_datetime.strftime(attr_format)): + format_guess[i] = attr_format + tokens[i] = token_filled + found_attrs.update(attrs) + break + + # Only consider it a valid guess if we have a year, month and day + if len(set(['year', 'month', 'day']) & found_attrs) != 3: + return None + + output_format = [] + for i, guess in enumerate(format_guess): + if guess is not None: + # Either fill in the format placeholder (like %Y) + output_format.append(guess) + else: + # Or just the token separate (IE, the dashes in "01-01-2013") + try: + # If the token is numeric, then we likely didn't parse it + # properly, so our guess is wrong + float(tokens[i]) + return None + except ValueError: + pass + + output_format.append(tokens[i]) + + guessed_format = ''.join(output_format) + + # rebuild string, capturing any inferred padding + dt_str = ''.join(tokens) + if parsed_datetime.strftime(guessed_format) == dt_str: + return guessed_format + else: + return None diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index c0f234a36803d..1bd3ea05aed2d 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -23,21 +23,9 @@ import pandas.compat as compat -_DATEUTIL_LEXER_SPLIT = None -try: - # Since these are private methods from dateutil, it is safely imported - # here so in case this interface changes, pandas will just fallback - # to not using the functionality - from dateutil.parser import _timelex - - if hasattr(_timelex, 'split'): - def _lexer_split_from_str(dt_str): - # The StringIO(str(_)) is for dateutil 2.2 compatibility - return _timelex.split(compat.StringIO(str(dt_str))) - - _DATEUTIL_LEXER_SPLIT = _lexer_split_from_str -except (ImportError, AttributeError): - pass +_format_is_iso = tslib.parsing._format_is_iso +_DATEUTIL_LEXER_SPLIT = tslib.parsing._DATEUTIL_LEXER_SPLIT +_guess_datetime_format = tslib.parsing._guess_datetime_format def _infer_tzinfo(start, end): @@ -58,123 +46,6 @@ def _infer(a, b): return tz -def _guess_datetime_format(dt_str, dayfirst=False, - dt_str_parse=compat.parse_date, - dt_str_split=_DATEUTIL_LEXER_SPLIT): - """ - Guess the datetime format of a given datetime string. - - Parameters - ---------- - dt_str : string, datetime string to guess the format of - dayfirst : boolean, default False - If True parses dates with the day first, eg 20/01/2005 - Warning: dayfirst=True is not strict, but will prefer to parse - with day first (this is a known bug). - dt_str_parse : function, defaults to `compat.parse_date` (dateutil) - This function should take in a datetime string and return - a `datetime.datetime` guess that the datetime string represents - dt_str_split : function, defaults to `_DATEUTIL_LEXER_SPLIT` (dateutil) - This function should take in a datetime string and return - a list of strings, the guess of the various specific parts - e.g. '2011/12/30' -> ['2011', '/', '12', '/', '30'] - - Returns - ------- - ret : datetime format string (for `strftime` or `strptime`) - """ - if dt_str_parse is None or dt_str_split is None: - return None - - if not isinstance(dt_str, compat.string_types): - return None - - day_attribute_and_format = (('day',), '%d', 2) - - # attr name, format, padding (if any) - datetime_attrs_to_format = [ - (('year', 'month', 'day'), '%Y%m%d', 0), - (('year',), '%Y', 0), - (('month',), '%B', 0), - (('month',), '%b', 0), - (('month',), '%m', 2), - day_attribute_and_format, - (('hour',), '%H', 2), - (('minute',), '%M', 2), - (('second',), '%S', 2), - (('microsecond',), '%f', 6), - (('second', 'microsecond'), '%S.%f', 0), - ] - - if dayfirst: - datetime_attrs_to_format.remove(day_attribute_and_format) - datetime_attrs_to_format.insert(0, day_attribute_and_format) - - try: - parsed_datetime = dt_str_parse(dt_str, dayfirst=dayfirst) - except: - # In case the datetime can't be parsed, its format cannot be guessed - return None - - if parsed_datetime is None: - return None - - try: - tokens = dt_str_split(dt_str) - except: - # In case the datetime string can't be split, its format cannot - # be guessed - return None - - format_guess = [None] * len(tokens) - found_attrs = set() - - for attrs, attr_format, padding in datetime_attrs_to_format: - # If a given attribute has been placed in the format string, skip - # over other formats for that same underlying attribute (IE, month - # can be represented in multiple different ways) - if set(attrs) & found_attrs: - continue - - if all(getattr(parsed_datetime, attr) is not None for attr in attrs): - for i, token_format in enumerate(format_guess): - token_filled = tokens[i].zfill(padding) - if (token_format is None and - token_filled == parsed_datetime.strftime(attr_format)): - format_guess[i] = attr_format - tokens[i] = token_filled - found_attrs.update(attrs) - break - - # Only consider it a valid guess if we have a year, month and day - if len(set(['year', 'month', 'day']) & found_attrs) != 3: - return None - - output_format = [] - for i, guess in enumerate(format_guess): - if guess is not None: - # Either fill in the format placeholder (like %Y) - output_format.append(guess) - else: - # Or just the token separate (IE, the dashes in "01-01-2013") - try: - # If the token is numeric, then we likely didn't parse it - # properly, so our guess is wrong - float(tokens[i]) - return None - except ValueError: - pass - - output_format.append(tokens[i]) - - guessed_format = ''.join(output_format) - - # rebuild string, capturing any inferred padding - dt_str = ''.join(tokens) - if parsed_datetime.strftime(guessed_format) == dt_str: - return guessed_format - - def _guess_datetime_format_for_array(arr, **kwargs): # Try to guess the format based on the first non-NaN element non_nan_elements = notna(arr).nonzero()[0] @@ -689,24 +560,6 @@ def calc_with_mask(carg, mask): return None -def _format_is_iso(f): - """ - Does format match the iso8601 set that can be handled by the C parser? - Generally of form YYYY-MM-DDTHH:MM:SS - date separator can be different - but must be consistent. Leading 0s in dates and times are optional. - """ - iso_template = '%Y{date_sep}%m{date_sep}%d{time_sep}%H:%M:%S.%f'.format - excluded_formats = ['%Y%m%d', '%Y%m', '%Y'] - - for date_sep in [' ', '/', '\\', '-', '.', '']: - for time_sep in [' ', 'T']: - if (iso_template(date_sep=date_sep, - time_sep=time_sep - ).startswith(f) and f not in excluded_formats): - return True - return False - - def parse_time_string(arg, freq=None, dayfirst=None, yearfirst=None): """ Try hard to parse datetime string, leveraging dateutil plus some extra @@ -726,6 +579,11 @@ def parse_time_string(arg, freq=None, dayfirst=None, yearfirst=None): ------- datetime, datetime/dateutil.parser._result, str """ + res = tslib.parse_time_string(arg, freq, dayfirst, yearfirst) + if isinstance(res, tuple) and res[0] is tslib.NAT_SENTINEL: + res = (tslib.NaT,) + res[1:] + return res + from pandas.core.config import get_option if not isinstance(arg, compat.string_types): return arg diff --git a/setup.py b/setup.py index 444db5bc4d275..0ffa8d9960df6 100755 --- a/setup.py +++ b/setup.py @@ -331,6 +331,7 @@ class CheckSDist(sdist_class): _pyxfiles = ['pandas/_libs/lib.pyx', 'pandas/_libs/hashtable.pyx', 'pandas/_libs/tslib.pyx', + 'pandas/_libs/tslibs/parsing.pyx', 'pandas/_libs/period.pyx', 'pandas/_libs/index.pyx', 'pandas/_libs/algos.pyx', @@ -481,6 +482,7 @@ def pxd(name): 'pxdfiles': ['_libs/hashtable'], 'depends': (['pandas/_libs/src/klib/khash_python.h'] + _pxi_dep['hashtable'])}, + '_libs.tslibs.parsing': {'pyxfile': '_libs/tslibs/parsing'}, '_libs.tslib': {'pyxfile': '_libs/tslib', 'pxdfiles': ['_libs/src/util', '_libs/lib'], 'depends': tseries_depends, From b19a31e6ed30984391f50babe1e087a6cb78c340 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Mon, 28 Aug 2017 15:54:19 -0700 Subject: [PATCH 02/15] flake8 whitespace fixup --- pandas/_libs/tslib.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index acd3dfb3d3ed1..e58e792048108 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -1832,7 +1832,6 @@ def datetime_to_datetime64(ndarray[object] values): return result, inferred_tz - def format_array_from_datetime(ndarray[int64_t] values, object tz=None, object format=None, object na_rep=None): """ From fa6def5c33c830bca3afb4b6893bf3f606be1011 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Tue, 29 Aug 2017 08:15:38 -0700 Subject: [PATCH 03/15] Address reviewer comments Remove cython decorators Move wrapping of parse_time_string to tslib --- pandas/_libs/tslib.pyx | 17 ++++++++++- pandas/_libs/tslibs/parsing.pyx | 24 ++++++---------- pandas/core/tools/datetimes.py | 50 ++++----------------------------- 3 files changed, 30 insertions(+), 61 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index e58e792048108..d5322b4db6f90 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -66,10 +66,13 @@ from khash cimport ( kh_init_int64, kh_int64_t, kh_resize_int64, kh_get_int64) -from tslibs.parsing import parse_time_string +from tslibs.parsing import parse_time_string as _parse_time_string from tslibs import parsing # noqa from tslibs.parsing import ( # noqa DateParseError, + _format_is_iso, + _DATEUTIL_LEXER_SPLIT, + _guess_datetime_format, NAT_SENTINEL, parse_datetime_string, _does_string_look_like_datetime, @@ -5451,3 +5454,15 @@ cdef _calc_julian_from_U_or_W(int year, int week_of_year, # def _strptime_time(data_string, format="%a %b %d %H:%M:%S %Y"): # return _strptime(data_string, format)[0] + +#---------------------------------------------------------------------- +# Parsing +# Wrap tslibs.parsing functions to return `NaT` instead of `NAT_SENTINEL` + + +def parse_time_string(arg, freq=None, dayfirst=None, yearfirst=None): + res = _parse_time_string(arg, freq, dayfirst, yearfirst) + if isinstance(res, tuple) and res[0] is NAT_SENTINEL: + res = (NaT,) + res[1:] + return res +parse_time_string.__doc__ = _parse_time_string.__doc__ diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 5653d05436f36..2372e00ae7616 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -1,9 +1,11 @@ -#!/usr/bin/env python # -*- coding: utf-8 -*- # cython: profile=False # cython: linetrace=False # distutils: define_macros=CYTHON_TRACE=0 # distutils: define_macros=CYTHON_TRACE_NOGIL=0 +""" +Parsing functions for datetime and datetime-like strings. +""" import sys import re @@ -63,7 +65,6 @@ NAT_SENTINEL = object() # This allows us to reference NaT without having to import it -@cython.locals(date_string=object, freq=object) def parse_datetime_string(date_string, freq=None, dayfirst=False, yearfirst=False, **kwargs): """parse datetime string, only returns datetime. @@ -85,6 +86,7 @@ def parse_datetime_string(date_string, freq=None, dayfirst=False, dt = du_parse(date_string, dayfirst=dayfirst, yearfirst=yearfirst, **kwargs) return dt + try: dt, _, _ = _parse_dateabbr_string(date_string, _DEFAULT_DATETIME, freq) return dt @@ -104,7 +106,6 @@ def parse_datetime_string(date_string, freq=None, dayfirst=False, return dt -# Moved from pandas.core.tools.datetimes def parse_time_string(arg, freq=None, dayfirst=None, yearfirst=None): """ Try hard to parse datetime string, leveraging dateutil plus some extra @@ -143,7 +144,6 @@ def parse_time_string(arg, freq=None, dayfirst=None, yearfirst=None): yearfirst=yearfirst) -@cython.locals(date_string=object, freq=object) def parse_datetime_string_with_reso(date_string, freq=None, dayfirst=False, yearfirst=False, **kwargs): """parse datetime string, only returns datetime @@ -177,10 +177,7 @@ def parse_datetime_string_with_reso(date_string, freq=None, dayfirst=False, return parsed, parsed, reso -@cython.returns(cython.bint) -@cython.locals(date_string=object) -@cython.ccall -def _does_string_look_like_datetime(date_string): +cpdef bint _does_string_look_like_datetime(object date_string): if date_string.startswith('0'): # Strings starting with 0 are more consistent with a # date-like string than a number @@ -198,11 +195,8 @@ def _does_string_look_like_datetime(date_string): return True -@cython.returns(object) -@cython.locals(date_string=object, default=object, freq=object) -@cython.inline -@cython.cfunc -def _parse_dateabbr_string(date_string, default, freq): +cdef inline object _parse_dateabbr_string(object date_string, object default, + object freq): cdef: object ret int year, quarter = -1, month, mnum, date_len @@ -307,8 +301,8 @@ def _parse_dateabbr_string(date_string, default, freq): raise ValueError('Unable to parse {0}'.format(date_string)) -@cython.locals(timestr=object, default=object) -def dateutil_parse(timestr, default, ignoretz=False, tzinfos=None, **kwargs): +def dateutil_parse(object timestr, object default, ignoretz=False, + tzinfos=None, **kwargs): """ lifted from dateutil to get resolution""" cdef: diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 1bd3ea05aed2d..2bbe2dfb8bb5c 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -3,6 +3,11 @@ from collections import MutableMapping from pandas._libs import lib, tslib +from pandas._libs.tslib import ( + parse_time_string, + _format_is_iso, + _DATEUTIL_LEXER_SPLIT, + _guess_datetime_format) from pandas.core.dtypes.common import ( _ensure_object, @@ -23,10 +28,6 @@ import pandas.compat as compat -_format_is_iso = tslib.parsing._format_is_iso -_DATEUTIL_LEXER_SPLIT = tslib.parsing._DATEUTIL_LEXER_SPLIT -_guess_datetime_format = tslib.parsing._guess_datetime_format - def _infer_tzinfo(start, end): def _infer(a, b): @@ -560,47 +561,6 @@ def calc_with_mask(carg, mask): return None -def parse_time_string(arg, freq=None, dayfirst=None, yearfirst=None): - """ - Try hard to parse datetime string, leveraging dateutil plus some extra - goodies like quarter recognition. - - Parameters - ---------- - arg : compat.string_types - freq : str or DateOffset, default None - Helps with interpreting time string if supplied - dayfirst : bool, default None - If None uses default from print_config - yearfirst : bool, default None - If None uses default from print_config - - Returns - ------- - datetime, datetime/dateutil.parser._result, str - """ - res = tslib.parse_time_string(arg, freq, dayfirst, yearfirst) - if isinstance(res, tuple) and res[0] is tslib.NAT_SENTINEL: - res = (tslib.NaT,) + res[1:] - return res - - from pandas.core.config import get_option - if not isinstance(arg, compat.string_types): - return arg - - if isinstance(freq, ABCDateOffset): - freq = freq.rule_code - - if dayfirst is None: - dayfirst = get_option("display.date_dayfirst") - if yearfirst is None: - yearfirst = get_option("display.date_yearfirst") - - return tslib.parse_datetime_string_with_reso(arg, freq=freq, - dayfirst=dayfirst, - yearfirst=yearfirst) - - DateParseError = tslib.DateParseError normalize_date = tslib.normalize_date From 2cd2ab63fe1931c3c8177fca20f91820b20a7109 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Tue, 29 Aug 2017 11:50:57 -0700 Subject: [PATCH 04/15] Fix __doc__ cython complaint --- pandas/_libs/tslib.pyx | 19 ++++++++++++++++++- pandas/_libs/tslibs/parsing.pyx | 20 +------------------- 2 files changed, 19 insertions(+), 20 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index d5322b4db6f90..8fef9a28a0f75 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -5461,8 +5461,25 @@ cdef _calc_julian_from_U_or_W(int year, int week_of_year, def parse_time_string(arg, freq=None, dayfirst=None, yearfirst=None): + """ + Try hard to parse datetime string, leveraging dateutil plus some extra + goodies like quarter recognition. + + Parameters + ---------- + arg : compat.string_types + freq : str or DateOffset, default None + Helps with interpreting time string if supplied + dayfirst : bool, default None + If None uses default from print_config + yearfirst : bool, default None + If None uses default from print_config + + Returns + ------- + datetime, datetime/dateutil.parser._result, str + """ res = _parse_time_string(arg, freq, dayfirst, yearfirst) if isinstance(res, tuple) and res[0] is NAT_SENTINEL: res = (NaT,) + res[1:] return res -parse_time_string.__doc__ = _parse_time_string.__doc__ diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 2372e00ae7616..690bf937dc92a 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -107,25 +107,7 @@ def parse_datetime_string(date_string, freq=None, dayfirst=False, def parse_time_string(arg, freq=None, dayfirst=None, yearfirst=None): - """ - Try hard to parse datetime string, leveraging dateutil plus some extra - goodies like quarter recognition. - - Parameters - ---------- - arg : compat.string_types - freq : str or DateOffset, default None - Helps with interpreting time string if supplied - dayfirst : bool, default None - If None uses default from print_config - yearfirst : bool, default None - If None uses default from print_config - - Returns - ------- - datetime, datetime/dateutil.parser._result, str - """ - + """See tslib.parse_time_string.__doc__""" if not isinstance(arg, string_types): return arg From c765e26c6fd30e07e987b404ec0d8a3e23d770f7 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Wed, 30 Aug 2017 08:42:15 -0700 Subject: [PATCH 05/15] lint fixup --- pandas/core/tools/datetimes.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 2bbe2dfb8bb5c..eb044002dfb42 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -3,7 +3,7 @@ from collections import MutableMapping from pandas._libs import lib, tslib -from pandas._libs.tslib import ( +from pandas._libs.tslib import ( # noqa parse_time_string, _format_is_iso, _DATEUTIL_LEXER_SPLIT, @@ -22,12 +22,10 @@ is_numeric_dtype) from pandas.core.dtypes.generic import ( ABCIndexClass, ABCSeries, - ABCDataFrame, ABCDateOffset) + ABCDataFrame) from pandas.core.dtypes.missing import notna from pandas.core import algorithms -import pandas.compat as compat - def _infer_tzinfo(start, end): def _infer(a, b): From b3ca3c05e4134b24b11ccafc1e0fd76da4955519 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Wed, 6 Sep 2017 08:08:39 -0700 Subject: [PATCH 06/15] Reviewer comments; remove cython decorators Reviewer request to import NaT in parse_time_string --- pandas/_libs/tslib.pyx | 31 +---------------------------- pandas/_libs/tslibs/parsing.pyx | 35 +++++++++++++++++++++++++-------- 2 files changed, 28 insertions(+), 38 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 8fef9a28a0f75..ce0b7000a0089 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -66,7 +66,6 @@ from khash cimport ( kh_init_int64, kh_int64_t, kh_resize_int64, kh_get_int64) -from tslibs.parsing import parse_time_string as _parse_time_string from tslibs import parsing # noqa from tslibs.parsing import ( # noqa DateParseError, @@ -75,6 +74,7 @@ from tslibs.parsing import ( # noqa _guess_datetime_format, NAT_SENTINEL, parse_datetime_string, + parse_time_string, _does_string_look_like_datetime, parse_datetime_string_with_reso) @@ -5454,32 +5454,3 @@ cdef _calc_julian_from_U_or_W(int year, int week_of_year, # def _strptime_time(data_string, format="%a %b %d %H:%M:%S %Y"): # return _strptime(data_string, format)[0] - -#---------------------------------------------------------------------- -# Parsing -# Wrap tslibs.parsing functions to return `NaT` instead of `NAT_SENTINEL` - - -def parse_time_string(arg, freq=None, dayfirst=None, yearfirst=None): - """ - Try hard to parse datetime string, leveraging dateutil plus some extra - goodies like quarter recognition. - - Parameters - ---------- - arg : compat.string_types - freq : str or DateOffset, default None - Helps with interpreting time string if supplied - dayfirst : bool, default None - If None uses default from print_config - yearfirst : bool, default None - If None uses default from print_config - - Returns - ------- - datetime, datetime/dateutil.parser._result, str - """ - res = _parse_time_string(arg, freq, dayfirst, yearfirst) - if isinstance(res, tuple) and res[0] is NAT_SENTINEL: - res = (NaT,) + res[1:] - return res diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 690bf937dc92a..53a9b74278fb6 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -107,7 +107,24 @@ def parse_datetime_string(date_string, freq=None, dayfirst=False, def parse_time_string(arg, freq=None, dayfirst=None, yearfirst=None): - """See tslib.parse_time_string.__doc__""" + """ + Try hard to parse datetime string, leveraging dateutil plus some extra + goodies like quarter recognition. + + Parameters + ---------- + arg : compat.string_types + freq : str or DateOffset, default None + Helps with interpreting time string if supplied + dayfirst : bool, default None + If None uses default from print_config + yearfirst : bool, default None + If None uses default from print_config + + Returns + ------- + datetime, datetime/dateutil.parser._result, str + """ if not isinstance(arg, string_types): return arg @@ -121,9 +138,14 @@ def parse_time_string(arg, freq=None, dayfirst=None, yearfirst=None): from pandas.core.config import get_option yearfirst = get_option("display.date_yearfirst") - return parse_datetime_string_with_reso(arg, freq=freq, - dayfirst=dayfirst, - yearfirst=yearfirst) + res = parse_datetime_string_with_reso(arg, freq=freq, + dayfirst=dayfirst, + yearfirst=yearfirst) + if res[0] is NAT_SENTINEL: + from pandas._libs.tslib import NaT + res = (NaT,) + res[1:] + return res + def parse_datetime_string_with_reso(date_string, freq=None, dayfirst=False, @@ -349,10 +371,7 @@ def dateutil_parse(object timestr, object default, ignoretz=False, # The canonical place for this appears to be in frequencies.pyx. -@cython.returns(object) -@cython.locals(source=object, default=object) -@cython.ccall -def _get_rule_month(source, default='DEC'): +cpdef object _get_rule_month(object source, object default='DEC'): """ Return starting month of given freq, default is December. From e36f8be13ec9aacdf23cff72aa79915d5ea6647e Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Wed, 6 Sep 2017 09:40:10 -0700 Subject: [PATCH 07/15] kludge to move tslibs/parsing to make asv work --- pandas/_libs/period.pyx | 2 +- pandas/_libs/src/inference.pyx | 2 +- pandas/_libs/tslib.pyx | 4 ++-- setup.py | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/period.pyx b/pandas/_libs/period.pyx index be8f9da1b7849..cd95340f683e7 100644 --- a/pandas/_libs/period.pyx +++ b/pandas/_libs/period.pyx @@ -40,7 +40,7 @@ from tslib cimport ( _get_dst_info, _nat_scalar_rules) -from tslibs.parsing import parse_time_string, NAT_SENTINEL +from .parsing import parse_time_string, NAT_SENTINEL from pandas.tseries import offsets from pandas.tseries import frequencies diff --git a/pandas/_libs/src/inference.pyx b/pandas/_libs/src/inference.pyx index d71516d309bb9..f611b49131883 100644 --- a/pandas/_libs/src/inference.pyx +++ b/pandas/_libs/src/inference.pyx @@ -8,7 +8,7 @@ iNaT = util.get_nat() cdef bint PY2 = sys.version_info[0] == 2 -from pandas._libs.tslibs.parsing import ( +from pandas._libs.parsing import ( try_parse_dates, try_parse_date_and_time, try_parse_year_month_day, diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index ce0b7000a0089..c6ea78f89abc7 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -66,8 +66,8 @@ from khash cimport ( kh_init_int64, kh_int64_t, kh_resize_int64, kh_get_int64) -from tslibs import parsing # noqa -from tslibs.parsing import ( # noqa +from . import parsing # noqa +from .parsing import ( # noqa DateParseError, _format_is_iso, _DATEUTIL_LEXER_SPLIT, diff --git a/setup.py b/setup.py index 0ffa8d9960df6..27898aef5cd4e 100755 --- a/setup.py +++ b/setup.py @@ -331,7 +331,7 @@ class CheckSDist(sdist_class): _pyxfiles = ['pandas/_libs/lib.pyx', 'pandas/_libs/hashtable.pyx', 'pandas/_libs/tslib.pyx', - 'pandas/_libs/tslibs/parsing.pyx', + 'pandas/_libs/parsing.pyx', 'pandas/_libs/period.pyx', 'pandas/_libs/index.pyx', 'pandas/_libs/algos.pyx', @@ -482,7 +482,7 @@ def pxd(name): 'pxdfiles': ['_libs/hashtable'], 'depends': (['pandas/_libs/src/klib/khash_python.h'] + _pxi_dep['hashtable'])}, - '_libs.tslibs.parsing': {'pyxfile': '_libs/tslibs/parsing'}, + '_libs.parsing': {'pyxfile': '_libs/parsing'}, '_libs.tslib': {'pyxfile': '_libs/tslib', 'pxdfiles': ['_libs/src/util', '_libs/lib'], 'depends': tseries_depends, From c52c7968ee06b0864e7310721f80f0ac357a1a49 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Wed, 6 Sep 2017 09:40:52 -0700 Subject: [PATCH 08/15] kludge to move tslibs/parsing to make asv work --- pandas/_libs/parsing.pyx | 683 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 683 insertions(+) create mode 100644 pandas/_libs/parsing.pyx diff --git a/pandas/_libs/parsing.pyx b/pandas/_libs/parsing.pyx new file mode 100644 index 0000000000000..53a9b74278fb6 --- /dev/null +++ b/pandas/_libs/parsing.pyx @@ -0,0 +1,683 @@ +# -*- coding: utf-8 -*- +# cython: profile=False +# cython: linetrace=False +# distutils: define_macros=CYTHON_TRACE=0 +# distutils: define_macros=CYTHON_TRACE_NOGIL=0 +""" +Parsing functions for datetime and datetime-like strings. +""" +import sys +import re + +from cpython cimport PyString_Check, PyUnicode_Check + +from libc.stdlib cimport free + +cimport cython +from cython cimport Py_ssize_t + + +from datetime import datetime +import time + +import numpy as np +cimport numpy as np +from numpy cimport int64_t, ndarray +np.import_array() + +# Avoid import from outside _libs +if sys.version_info.major == 2: + string_types = basestring + from StringIO import StringIO +else: + string_types = str + from io import StringIO + + +# dateutil compat +from dateutil.tz import (tzoffset, + tzlocal as _dateutil_tzlocal, + tzfile as _dateutil_tzfile, + tzutc as _dateutil_tzutc, + tzstr as _dateutil_tzstr) +from dateutil.relativedelta import relativedelta +from dateutil.parser import DEFAULTPARSER +from dateutil.parser import parse as du_parse + + +class DateParseError(ValueError): + pass + +_nat_strings = set(['NaT', 'nat', 'NAT', 'nan', 'NaN', 'NAN']) + +_DEFAULT_DATETIME = datetime(1, 1, 1).replace(hour=0, minute=0, + second=0, microsecond=0) +_MONTHS = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', + 'AUG', 'SEP', 'OCT', 'NOV', 'DEC'] +_MONTH_NUMBERS = {k: i for i, k in enumerate(_MONTHS)} +_MONTH_ALIASES = {(k + 1): v for k, v in enumerate(_MONTHS)} + +cdef object _TIMEPAT = re.compile(r'^([01]?[0-9]|2[0-3]):([0-5][0-9])') + +cdef set _not_datelike_strings = set(['a', 'A', 'm', 'M', 'p', 'P', 't', 'T']) + +NAT_SENTINEL = object() +# This allows us to reference NaT without having to import it + + +def parse_datetime_string(date_string, freq=None, dayfirst=False, + yearfirst=False, **kwargs): + """parse datetime string, only returns datetime. + Also cares special handling matching time patterns. + + Returns + ------- + datetime + """ + + cdef: + object dt + + if not _does_string_look_like_datetime(date_string): + raise ValueError('Given date string not likely a datetime.') + + if _TIMEPAT.match(date_string): + # use current datetime as default, not pass _DEFAULT_DATETIME + dt = du_parse(date_string, dayfirst=dayfirst, + yearfirst=yearfirst, **kwargs) + return dt + + try: + dt, _, _ = _parse_dateabbr_string(date_string, _DEFAULT_DATETIME, freq) + return dt + except DateParseError: + raise + except ValueError: + pass + + try: + dt = du_parse(date_string, default=_DEFAULT_DATETIME, + dayfirst=dayfirst, yearfirst=yearfirst, **kwargs) + except TypeError: + # following may be raised from dateutil + # TypeError: 'NoneType' object is not iterable + raise ValueError('Given date string not likely a datetime.') + + return dt + + +def parse_time_string(arg, freq=None, dayfirst=None, yearfirst=None): + """ + Try hard to parse datetime string, leveraging dateutil plus some extra + goodies like quarter recognition. + + Parameters + ---------- + arg : compat.string_types + freq : str or DateOffset, default None + Helps with interpreting time string if supplied + dayfirst : bool, default None + If None uses default from print_config + yearfirst : bool, default None + If None uses default from print_config + + Returns + ------- + datetime, datetime/dateutil.parser._result, str + """ + if not isinstance(arg, string_types): + return arg + + if getattr(freq, "_typ", None) == "dateoffset": + freq = freq.rule_code + + if dayfirst is None: + from pandas.core.config import get_option + dayfirst = get_option("display.date_dayfirst") + if yearfirst is None: + from pandas.core.config import get_option + yearfirst = get_option("display.date_yearfirst") + + res = parse_datetime_string_with_reso(arg, freq=freq, + dayfirst=dayfirst, + yearfirst=yearfirst) + if res[0] is NAT_SENTINEL: + from pandas._libs.tslib import NaT + res = (NaT,) + res[1:] + return res + + + +def parse_datetime_string_with_reso(date_string, freq=None, dayfirst=False, + yearfirst=False, **kwargs): + """parse datetime string, only returns datetime + + Returns + ------- + datetime + """ + + cdef: + object parsed, reso + + if not _does_string_look_like_datetime(date_string): + raise ValueError('Given date string not likely a datetime.') + + try: + return _parse_dateabbr_string(date_string, _DEFAULT_DATETIME, freq) + except DateParseError: + raise + except ValueError: + pass + + try: + parsed, reso = dateutil_parse(date_string, _DEFAULT_DATETIME, + dayfirst=dayfirst, yearfirst=yearfirst) + except Exception as e: + # TODO: allow raise of errors within instead + raise DateParseError(e) + if parsed is None: + raise DateParseError("Could not parse %s" % date_string) + return parsed, parsed, reso + + +cpdef bint _does_string_look_like_datetime(object date_string): + if date_string.startswith('0'): + # Strings starting with 0 are more consistent with a + # date-like string than a number + return True + + try: + if float(date_string) < 1000: + return False + except ValueError: + pass + + if date_string in _not_datelike_strings: + return False + + return True + + +cdef inline object _parse_dateabbr_string(object date_string, object default, + object freq): + cdef: + object ret + int year, quarter = -1, month, mnum, date_len + + # special handling for possibilities eg, 2Q2005, 2Q05, 2005Q1, 05Q1 + assert isinstance(date_string, string_types) + + # len(date_string) == 0 + # should be NaT??? + + if date_string in _nat_strings: + return NAT_SENTINEL, NAT_SENTINEL, '' + + date_string = date_string.upper() + date_len = len(date_string) + + if date_len == 4: + # parse year only like 2000 + try: + ret = default.replace(year=int(date_string)) + return ret, ret, 'year' + except ValueError: + pass + + try: + if 4 <= date_len <= 7: + i = date_string.index('Q', 1, 6) + if i == 1: + quarter = int(date_string[0]) + if date_len == 4 or (date_len == 5 + and date_string[i + 1] == '-'): + # r'(\d)Q-?(\d\d)') + year = 2000 + int(date_string[-2:]) + elif date_len == 6 or (date_len == 7 + and date_string[i + 1] == '-'): + # r'(\d)Q-?(\d\d\d\d)') + year = int(date_string[-4:]) + else: + raise ValueError + elif i == 2 or i == 3: + # r'(\d\d)-?Q(\d)' + if date_len == 4 or (date_len == 5 + and date_string[i - 1] == '-'): + quarter = int(date_string[-1]) + year = 2000 + int(date_string[:2]) + else: + raise ValueError + elif i == 4 or i == 5: + if date_len == 6 or (date_len == 7 + and date_string[i - 1] == '-'): + # r'(\d\d\d\d)-?Q(\d)' + quarter = int(date_string[-1]) + year = int(date_string[:4]) + else: + raise ValueError + + if not (1 <= quarter <= 4): + msg = ('Incorrect quarterly string is given, quarter must be ' + 'between 1 and 4: {0}') + raise DateParseError(msg.format(date_string)) + + if freq is not None: + # hack attack, #1228 + try: + mnum = _MONTH_NUMBERS[_get_rule_month(freq)] + 1 + except (KeyError, ValueError): + msg = ('Unable to retrieve month information from given ' + 'freq: {0}').format(freq) + raise DateParseError(msg) + + month = (mnum + (quarter - 1) * 3) % 12 + 1 + if month > mnum: + year -= 1 + else: + month = (quarter - 1) * 3 + 1 + + ret = default.replace(year=year, month=month) + return ret, ret, 'quarter' + + except DateParseError: + raise + except ValueError: + pass + + if date_len == 6 and (freq == 'M' or + getattr(freq, 'rule_code', None) == 'M'): + year = int(date_string[:4]) + month = int(date_string[4:6]) + try: + ret = default.replace(year=year, month=month) + return ret, ret, 'month' + except ValueError: + pass + + for pat in ['%Y-%m', '%m-%Y', '%b %Y', '%b-%Y']: + try: + ret = datetime.strptime(date_string, pat) + return ret, ret, 'month' + except ValueError: + pass + + raise ValueError('Unable to parse {0}'.format(date_string)) + + +def dateutil_parse(object timestr, object default, ignoretz=False, + tzinfos=None, **kwargs): + """ lifted from dateutil to get resolution""" + + cdef: + object fobj, res, attr, ret, tzdata + object reso = None + dict repl = {} + + fobj = StringIO(str(timestr)) + res = DEFAULTPARSER._parse(fobj, **kwargs) + + # dateutil 2.2 compat + if isinstance(res, tuple): # PyTuple_Check + res, _ = res + + if res is None: + msg = "Unknown datetime string format, unable to parse: {0}" + raise ValueError(msg.format(timestr)) + + for attr in ["year", "month", "day", "hour", + "minute", "second", "microsecond"]: + value = getattr(res, attr) + if value is not None: + repl[attr] = value + reso = attr + + if reso is None: + msg = "Unable to parse datetime string: {0}" + raise ValueError(msg.format(timestr)) + + if reso == 'microsecond': + if repl['microsecond'] == 0: + reso = 'second' + elif repl['microsecond'] % 1000 == 0: + reso = 'millisecond' + + ret = default.replace(**repl) + if res.weekday is not None and not res.day: + ret = ret + relativedelta.relativedelta(weekday=res.weekday) + if not ignoretz: + if callable(tzinfos) or tzinfos and res.tzname in tzinfos: + if callable(tzinfos): + tzdata = tzinfos(res.tzname, res.tzoffset) + else: + tzdata = tzinfos.get(res.tzname) + if isinstance(tzdata, datetime.tzinfo): + tzinfo = tzdata + elif isinstance(tzdata, string_types): + tzinfo = _dateutil_tzstr(tzdata) + elif isinstance(tzdata, int): + tzinfo = tzoffset(res.tzname, tzdata) + else: + raise ValueError("offset must be tzinfo subclass, " + "tz string, or int offset") + ret = ret.replace(tzinfo=tzinfo) + elif res.tzname and res.tzname in time.tzname: + ret = ret.replace(tzinfo=_dateutil_tzlocal()) + elif res.tzoffset == 0: + ret = ret.replace(tzinfo=_dateutil_tzutc()) + elif res.tzoffset: + ret = ret.replace(tzinfo=tzoffset(res.tzname, res.tzoffset)) + return ret, reso + + +# The canonical place for this appears to be in frequencies.pyx. +cpdef object _get_rule_month(object source, object default='DEC'): + """ + Return starting month of given freq, default is December. + + Example + ------- + >>> _get_rule_month('D') + 'DEC' + + >>> _get_rule_month('A-JAN') + 'JAN' + """ + if hasattr(source, 'freqstr'): + source = source.freqstr + source = source.upper() + if '-' not in source: + return default + else: + return source.split('-')[1] + + +#---------------------------------------------------------------------- +# Parsing for type-inference + + +def try_parse_dates(ndarray[object] values, parser=None, + dayfirst=False, default=None): + cdef: + Py_ssize_t i, n + ndarray[object] result + + n = len(values) + result = np.empty(n, dtype='O') + + if parser is None: + if default is None: # GH2618 + date = datetime.now() + default = datetime(date.year, date.month, 1) + + parse_date = lambda x: du_parse(x, dayfirst=dayfirst, default=default) + + # EAFP here + try: + for i from 0 <= i < n: + if values[i] == '': + result[i] = np.nan + else: + result[i] = parse_date(values[i]) + except Exception: + # failed + return values + else: + parse_date = parser + + try: + for i from 0 <= i < n: + if values[i] == '': + result[i] = np.nan + else: + result[i] = parse_date(values[i]) + except Exception: + # raise if passed parser and it failed + raise + + return result + + +def try_parse_date_and_time(ndarray[object] dates, ndarray[object] times, + date_parser=None, time_parser=None, + dayfirst=False, default=None): + cdef: + Py_ssize_t i, n + ndarray[object] result + + n = len(dates) + if len(times) != n: + raise ValueError('Length of dates and times must be equal') + result = np.empty(n, dtype='O') + + if date_parser is None: + if default is None: # GH2618 + date = datetime.now() + default = datetime(date.year, date.month, 1) + + parse_date = lambda x: du_parse(x, dayfirst=dayfirst, default=default) + + else: + parse_date = date_parser + + if time_parser is None: + parse_time = lambda x: du_parse(x) + + else: + parse_time = time_parser + + for i from 0 <= i < n: + d = parse_date(str(dates[i])) + t = parse_time(str(times[i])) + result[i] = datetime(d.year, d.month, d.day, + t.hour, t.minute, t.second) + + return result + + +def try_parse_year_month_day(ndarray[object] years, ndarray[object] months, + ndarray[object] days): + cdef: + Py_ssize_t i, n + ndarray[object] result + + n = len(years) + if len(months) != n or len(days) != n: + raise ValueError('Length of years/months/days must all be equal') + result = np.empty(n, dtype='O') + + for i from 0 <= i < n: + result[i] = datetime(int(years[i]), int(months[i]), int(days[i])) + + return result + + +def try_parse_datetime_components(ndarray[object] years, + ndarray[object] months, + ndarray[object] days, + ndarray[object] hours, + ndarray[object] minutes, + ndarray[object] seconds): + + cdef: + Py_ssize_t i, n + ndarray[object] result + int secs + double float_secs + double micros + + n = len(years) + if (len(months) != n or len(days) != n or len(hours) != n or + len(minutes) != n or len(seconds) != n): + raise ValueError('Length of all datetime components must be equal') + result = np.empty(n, dtype='O') + + for i from 0 <= i < n: + float_secs = float(seconds[i]) + secs = int(float_secs) + + micros = float_secs - secs + if micros > 0: + micros = micros * 1000000 + + result[i] = datetime(int(years[i]), int(months[i]), int(days[i]), + int(hours[i]), int(minutes[i]), secs, + int(micros)) + + return result + + +#---------------------------------------------------------------------- +# Miscellaneous functions moved from core.tools.datetimes + +_DATEUTIL_LEXER_SPLIT = None +try: + # Since these are private methods from dateutil, it is safely imported + # here so in case this interface changes, pandas will just fallback + # to not using the functionality + from dateutil.parser import _timelex + + if hasattr(_timelex, 'split'): + def _lexer_split_from_str(dt_str): + # The StringIO(str(_)) is for dateutil 2.2 compatibility + return _timelex.split(StringIO(str(dt_str))) + + _DATEUTIL_LEXER_SPLIT = _lexer_split_from_str +except (ImportError, AttributeError): + pass + + +def _format_is_iso(f): + """ + Does format match the iso8601 set that can be handled by the C parser? + Generally of form YYYY-MM-DDTHH:MM:SS - date separator can be different + but must be consistent. Leading 0s in dates and times are optional. + """ + iso_template = '%Y{date_sep}%m{date_sep}%d{time_sep}%H:%M:%S.%f'.format + excluded_formats = ['%Y%m%d', '%Y%m', '%Y'] + + for date_sep in [' ', '/', '\\', '-', '.', '']: + for time_sep in [' ', 'T']: + if (iso_template(date_sep=date_sep, + time_sep=time_sep + ).startswith(f) and f not in excluded_formats): + return True + return False + + +def _guess_datetime_format(dt_str, dayfirst=False, dt_str_parse=du_parse, + dt_str_split=_DATEUTIL_LEXER_SPLIT): + """ + Guess the datetime format of a given datetime string. + + Parameters + ---------- + dt_str : string, datetime string to guess the format of + dayfirst : boolean, default False + If True parses dates with the day first, eg 20/01/2005 + Warning: dayfirst=True is not strict, but will prefer to parse + with day first (this is a known bug). + dt_str_parse : function, defaults to `compat.parse_date` (dateutil) + This function should take in a datetime string and return + a `datetime.datetime` guess that the datetime string represents + dt_str_split : function, defaults to `_DATEUTIL_LEXER_SPLIT` (dateutil) + This function should take in a datetime string and return + a list of strings, the guess of the various specific parts + e.g. '2011/12/30' -> ['2011', '/', '12', '/', '30'] + + Returns + ------- + ret : datetime format string (for `strftime` or `strptime`) + """ + if dt_str_parse is None or dt_str_split is None: + return None + + if not isinstance(dt_str, string_types): + return None + + day_attribute_and_format = (('day',), '%d', 2) + + # attr name, format, padding (if any) + datetime_attrs_to_format = [ + (('year', 'month', 'day'), '%Y%m%d', 0), + (('year',), '%Y', 0), + (('month',), '%B', 0), + (('month',), '%b', 0), + (('month',), '%m', 2), + day_attribute_and_format, + (('hour',), '%H', 2), + (('minute',), '%M', 2), + (('second',), '%S', 2), + (('microsecond',), '%f', 6), + (('second', 'microsecond'), '%S.%f', 0), + ] + + if dayfirst: + datetime_attrs_to_format.remove(day_attribute_and_format) + datetime_attrs_to_format.insert(0, day_attribute_and_format) + + try: + parsed_datetime = dt_str_parse(dt_str, dayfirst=dayfirst) + except: + # In case the datetime can't be parsed, its format cannot be guessed + return None + + if parsed_datetime is None: + return None + + try: + tokens = dt_str_split(dt_str) + except: + # In case the datetime string can't be split, its format cannot + # be guessed + return None + + format_guess = [None] * len(tokens) + found_attrs = set() + + for attrs, attr_format, padding in datetime_attrs_to_format: + # If a given attribute has been placed in the format string, skip + # over other formats for that same underlying attribute (IE, month + # can be represented in multiple different ways) + if set(attrs) & found_attrs: + continue + + if all(getattr(parsed_datetime, attr) is not None for attr in attrs): + for i, token_format in enumerate(format_guess): + token_filled = tokens[i].zfill(padding) + if (token_format is None and + token_filled == parsed_datetime.strftime(attr_format)): + format_guess[i] = attr_format + tokens[i] = token_filled + found_attrs.update(attrs) + break + + # Only consider it a valid guess if we have a year, month and day + if len(set(['year', 'month', 'day']) & found_attrs) != 3: + return None + + output_format = [] + for i, guess in enumerate(format_guess): + if guess is not None: + # Either fill in the format placeholder (like %Y) + output_format.append(guess) + else: + # Or just the token separate (IE, the dashes in "01-01-2013") + try: + # If the token is numeric, then we likely didn't parse it + # properly, so our guess is wrong + float(tokens[i]) + return None + except ValueError: + pass + + output_format.append(tokens[i]) + + guessed_format = ''.join(output_format) + + # rebuild string, capturing any inferred padding + dt_str = ''.join(tokens) + if parsed_datetime.strftime(guessed_format) == dt_str: + return guessed_format + else: + return None From f5259eabf918cb1ef19a43bced93f7cbce47a5df Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Wed, 6 Sep 2017 20:08:00 -0700 Subject: [PATCH 09/15] flake8 whitespace fixup --- pandas/_libs/tslibs/parsing.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 53a9b74278fb6..452649b0e2abc 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -147,7 +147,6 @@ def parse_time_string(arg, freq=None, dayfirst=None, yearfirst=None): return res - def parse_datetime_string_with_reso(date_string, freq=None, dayfirst=False, yearfirst=False, **kwargs): """parse datetime string, only returns datetime From d1765ab62f12db48611067384693ad1f92002f37 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Thu, 7 Sep 2017 09:40:01 -0700 Subject: [PATCH 10/15] Dummy commit to force CI --- pandas/_libs/tslibs/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/_libs/tslibs/__init__.py b/pandas/_libs/tslibs/__init__.py index faa18be5bbf7d..40a96afc6ff09 100644 --- a/pandas/_libs/tslibs/__init__.py +++ b/pandas/_libs/tslibs/__init__.py @@ -1,2 +1 @@ -#!/usr/bin/env python # -*- coding: utf-8 -*- From e3995beadf87a02172d979ebdb40398825b0a935 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 7 Sep 2017 20:39:46 -0700 Subject: [PATCH 11/15] Whitespace fixup Made this locally long ago; for some reason it is not getting reflected on GH when I push. It's a mystery. --- pandas/_libs/parsing.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/_libs/parsing.pyx b/pandas/_libs/parsing.pyx index 53a9b74278fb6..452649b0e2abc 100644 --- a/pandas/_libs/parsing.pyx +++ b/pandas/_libs/parsing.pyx @@ -147,7 +147,6 @@ def parse_time_string(arg, freq=None, dayfirst=None, yearfirst=None): return res - def parse_datetime_string_with_reso(date_string, freq=None, dayfirst=False, yearfirst=False, **kwargs): """parse datetime string, only returns datetime From ff578617310d361df28a0de93f50d223e6a6578b Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Fri, 15 Sep 2017 08:02:36 -0700 Subject: [PATCH 12/15] Remove duplicate file --- pandas/_libs/parsing.pyx | 682 --------------------------------------- 1 file changed, 682 deletions(-) delete mode 100644 pandas/_libs/parsing.pyx diff --git a/pandas/_libs/parsing.pyx b/pandas/_libs/parsing.pyx deleted file mode 100644 index 452649b0e2abc..0000000000000 --- a/pandas/_libs/parsing.pyx +++ /dev/null @@ -1,682 +0,0 @@ -# -*- coding: utf-8 -*- -# cython: profile=False -# cython: linetrace=False -# distutils: define_macros=CYTHON_TRACE=0 -# distutils: define_macros=CYTHON_TRACE_NOGIL=0 -""" -Parsing functions for datetime and datetime-like strings. -""" -import sys -import re - -from cpython cimport PyString_Check, PyUnicode_Check - -from libc.stdlib cimport free - -cimport cython -from cython cimport Py_ssize_t - - -from datetime import datetime -import time - -import numpy as np -cimport numpy as np -from numpy cimport int64_t, ndarray -np.import_array() - -# Avoid import from outside _libs -if sys.version_info.major == 2: - string_types = basestring - from StringIO import StringIO -else: - string_types = str - from io import StringIO - - -# dateutil compat -from dateutil.tz import (tzoffset, - tzlocal as _dateutil_tzlocal, - tzfile as _dateutil_tzfile, - tzutc as _dateutil_tzutc, - tzstr as _dateutil_tzstr) -from dateutil.relativedelta import relativedelta -from dateutil.parser import DEFAULTPARSER -from dateutil.parser import parse as du_parse - - -class DateParseError(ValueError): - pass - -_nat_strings = set(['NaT', 'nat', 'NAT', 'nan', 'NaN', 'NAN']) - -_DEFAULT_DATETIME = datetime(1, 1, 1).replace(hour=0, minute=0, - second=0, microsecond=0) -_MONTHS = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', - 'AUG', 'SEP', 'OCT', 'NOV', 'DEC'] -_MONTH_NUMBERS = {k: i for i, k in enumerate(_MONTHS)} -_MONTH_ALIASES = {(k + 1): v for k, v in enumerate(_MONTHS)} - -cdef object _TIMEPAT = re.compile(r'^([01]?[0-9]|2[0-3]):([0-5][0-9])') - -cdef set _not_datelike_strings = set(['a', 'A', 'm', 'M', 'p', 'P', 't', 'T']) - -NAT_SENTINEL = object() -# This allows us to reference NaT without having to import it - - -def parse_datetime_string(date_string, freq=None, dayfirst=False, - yearfirst=False, **kwargs): - """parse datetime string, only returns datetime. - Also cares special handling matching time patterns. - - Returns - ------- - datetime - """ - - cdef: - object dt - - if not _does_string_look_like_datetime(date_string): - raise ValueError('Given date string not likely a datetime.') - - if _TIMEPAT.match(date_string): - # use current datetime as default, not pass _DEFAULT_DATETIME - dt = du_parse(date_string, dayfirst=dayfirst, - yearfirst=yearfirst, **kwargs) - return dt - - try: - dt, _, _ = _parse_dateabbr_string(date_string, _DEFAULT_DATETIME, freq) - return dt - except DateParseError: - raise - except ValueError: - pass - - try: - dt = du_parse(date_string, default=_DEFAULT_DATETIME, - dayfirst=dayfirst, yearfirst=yearfirst, **kwargs) - except TypeError: - # following may be raised from dateutil - # TypeError: 'NoneType' object is not iterable - raise ValueError('Given date string not likely a datetime.') - - return dt - - -def parse_time_string(arg, freq=None, dayfirst=None, yearfirst=None): - """ - Try hard to parse datetime string, leveraging dateutil plus some extra - goodies like quarter recognition. - - Parameters - ---------- - arg : compat.string_types - freq : str or DateOffset, default None - Helps with interpreting time string if supplied - dayfirst : bool, default None - If None uses default from print_config - yearfirst : bool, default None - If None uses default from print_config - - Returns - ------- - datetime, datetime/dateutil.parser._result, str - """ - if not isinstance(arg, string_types): - return arg - - if getattr(freq, "_typ", None) == "dateoffset": - freq = freq.rule_code - - if dayfirst is None: - from pandas.core.config import get_option - dayfirst = get_option("display.date_dayfirst") - if yearfirst is None: - from pandas.core.config import get_option - yearfirst = get_option("display.date_yearfirst") - - res = parse_datetime_string_with_reso(arg, freq=freq, - dayfirst=dayfirst, - yearfirst=yearfirst) - if res[0] is NAT_SENTINEL: - from pandas._libs.tslib import NaT - res = (NaT,) + res[1:] - return res - - -def parse_datetime_string_with_reso(date_string, freq=None, dayfirst=False, - yearfirst=False, **kwargs): - """parse datetime string, only returns datetime - - Returns - ------- - datetime - """ - - cdef: - object parsed, reso - - if not _does_string_look_like_datetime(date_string): - raise ValueError('Given date string not likely a datetime.') - - try: - return _parse_dateabbr_string(date_string, _DEFAULT_DATETIME, freq) - except DateParseError: - raise - except ValueError: - pass - - try: - parsed, reso = dateutil_parse(date_string, _DEFAULT_DATETIME, - dayfirst=dayfirst, yearfirst=yearfirst) - except Exception as e: - # TODO: allow raise of errors within instead - raise DateParseError(e) - if parsed is None: - raise DateParseError("Could not parse %s" % date_string) - return parsed, parsed, reso - - -cpdef bint _does_string_look_like_datetime(object date_string): - if date_string.startswith('0'): - # Strings starting with 0 are more consistent with a - # date-like string than a number - return True - - try: - if float(date_string) < 1000: - return False - except ValueError: - pass - - if date_string in _not_datelike_strings: - return False - - return True - - -cdef inline object _parse_dateabbr_string(object date_string, object default, - object freq): - cdef: - object ret - int year, quarter = -1, month, mnum, date_len - - # special handling for possibilities eg, 2Q2005, 2Q05, 2005Q1, 05Q1 - assert isinstance(date_string, string_types) - - # len(date_string) == 0 - # should be NaT??? - - if date_string in _nat_strings: - return NAT_SENTINEL, NAT_SENTINEL, '' - - date_string = date_string.upper() - date_len = len(date_string) - - if date_len == 4: - # parse year only like 2000 - try: - ret = default.replace(year=int(date_string)) - return ret, ret, 'year' - except ValueError: - pass - - try: - if 4 <= date_len <= 7: - i = date_string.index('Q', 1, 6) - if i == 1: - quarter = int(date_string[0]) - if date_len == 4 or (date_len == 5 - and date_string[i + 1] == '-'): - # r'(\d)Q-?(\d\d)') - year = 2000 + int(date_string[-2:]) - elif date_len == 6 or (date_len == 7 - and date_string[i + 1] == '-'): - # r'(\d)Q-?(\d\d\d\d)') - year = int(date_string[-4:]) - else: - raise ValueError - elif i == 2 or i == 3: - # r'(\d\d)-?Q(\d)' - if date_len == 4 or (date_len == 5 - and date_string[i - 1] == '-'): - quarter = int(date_string[-1]) - year = 2000 + int(date_string[:2]) - else: - raise ValueError - elif i == 4 or i == 5: - if date_len == 6 or (date_len == 7 - and date_string[i - 1] == '-'): - # r'(\d\d\d\d)-?Q(\d)' - quarter = int(date_string[-1]) - year = int(date_string[:4]) - else: - raise ValueError - - if not (1 <= quarter <= 4): - msg = ('Incorrect quarterly string is given, quarter must be ' - 'between 1 and 4: {0}') - raise DateParseError(msg.format(date_string)) - - if freq is not None: - # hack attack, #1228 - try: - mnum = _MONTH_NUMBERS[_get_rule_month(freq)] + 1 - except (KeyError, ValueError): - msg = ('Unable to retrieve month information from given ' - 'freq: {0}').format(freq) - raise DateParseError(msg) - - month = (mnum + (quarter - 1) * 3) % 12 + 1 - if month > mnum: - year -= 1 - else: - month = (quarter - 1) * 3 + 1 - - ret = default.replace(year=year, month=month) - return ret, ret, 'quarter' - - except DateParseError: - raise - except ValueError: - pass - - if date_len == 6 and (freq == 'M' or - getattr(freq, 'rule_code', None) == 'M'): - year = int(date_string[:4]) - month = int(date_string[4:6]) - try: - ret = default.replace(year=year, month=month) - return ret, ret, 'month' - except ValueError: - pass - - for pat in ['%Y-%m', '%m-%Y', '%b %Y', '%b-%Y']: - try: - ret = datetime.strptime(date_string, pat) - return ret, ret, 'month' - except ValueError: - pass - - raise ValueError('Unable to parse {0}'.format(date_string)) - - -def dateutil_parse(object timestr, object default, ignoretz=False, - tzinfos=None, **kwargs): - """ lifted from dateutil to get resolution""" - - cdef: - object fobj, res, attr, ret, tzdata - object reso = None - dict repl = {} - - fobj = StringIO(str(timestr)) - res = DEFAULTPARSER._parse(fobj, **kwargs) - - # dateutil 2.2 compat - if isinstance(res, tuple): # PyTuple_Check - res, _ = res - - if res is None: - msg = "Unknown datetime string format, unable to parse: {0}" - raise ValueError(msg.format(timestr)) - - for attr in ["year", "month", "day", "hour", - "minute", "second", "microsecond"]: - value = getattr(res, attr) - if value is not None: - repl[attr] = value - reso = attr - - if reso is None: - msg = "Unable to parse datetime string: {0}" - raise ValueError(msg.format(timestr)) - - if reso == 'microsecond': - if repl['microsecond'] == 0: - reso = 'second' - elif repl['microsecond'] % 1000 == 0: - reso = 'millisecond' - - ret = default.replace(**repl) - if res.weekday is not None and not res.day: - ret = ret + relativedelta.relativedelta(weekday=res.weekday) - if not ignoretz: - if callable(tzinfos) or tzinfos and res.tzname in tzinfos: - if callable(tzinfos): - tzdata = tzinfos(res.tzname, res.tzoffset) - else: - tzdata = tzinfos.get(res.tzname) - if isinstance(tzdata, datetime.tzinfo): - tzinfo = tzdata - elif isinstance(tzdata, string_types): - tzinfo = _dateutil_tzstr(tzdata) - elif isinstance(tzdata, int): - tzinfo = tzoffset(res.tzname, tzdata) - else: - raise ValueError("offset must be tzinfo subclass, " - "tz string, or int offset") - ret = ret.replace(tzinfo=tzinfo) - elif res.tzname and res.tzname in time.tzname: - ret = ret.replace(tzinfo=_dateutil_tzlocal()) - elif res.tzoffset == 0: - ret = ret.replace(tzinfo=_dateutil_tzutc()) - elif res.tzoffset: - ret = ret.replace(tzinfo=tzoffset(res.tzname, res.tzoffset)) - return ret, reso - - -# The canonical place for this appears to be in frequencies.pyx. -cpdef object _get_rule_month(object source, object default='DEC'): - """ - Return starting month of given freq, default is December. - - Example - ------- - >>> _get_rule_month('D') - 'DEC' - - >>> _get_rule_month('A-JAN') - 'JAN' - """ - if hasattr(source, 'freqstr'): - source = source.freqstr - source = source.upper() - if '-' not in source: - return default - else: - return source.split('-')[1] - - -#---------------------------------------------------------------------- -# Parsing for type-inference - - -def try_parse_dates(ndarray[object] values, parser=None, - dayfirst=False, default=None): - cdef: - Py_ssize_t i, n - ndarray[object] result - - n = len(values) - result = np.empty(n, dtype='O') - - if parser is None: - if default is None: # GH2618 - date = datetime.now() - default = datetime(date.year, date.month, 1) - - parse_date = lambda x: du_parse(x, dayfirst=dayfirst, default=default) - - # EAFP here - try: - for i from 0 <= i < n: - if values[i] == '': - result[i] = np.nan - else: - result[i] = parse_date(values[i]) - except Exception: - # failed - return values - else: - parse_date = parser - - try: - for i from 0 <= i < n: - if values[i] == '': - result[i] = np.nan - else: - result[i] = parse_date(values[i]) - except Exception: - # raise if passed parser and it failed - raise - - return result - - -def try_parse_date_and_time(ndarray[object] dates, ndarray[object] times, - date_parser=None, time_parser=None, - dayfirst=False, default=None): - cdef: - Py_ssize_t i, n - ndarray[object] result - - n = len(dates) - if len(times) != n: - raise ValueError('Length of dates and times must be equal') - result = np.empty(n, dtype='O') - - if date_parser is None: - if default is None: # GH2618 - date = datetime.now() - default = datetime(date.year, date.month, 1) - - parse_date = lambda x: du_parse(x, dayfirst=dayfirst, default=default) - - else: - parse_date = date_parser - - if time_parser is None: - parse_time = lambda x: du_parse(x) - - else: - parse_time = time_parser - - for i from 0 <= i < n: - d = parse_date(str(dates[i])) - t = parse_time(str(times[i])) - result[i] = datetime(d.year, d.month, d.day, - t.hour, t.minute, t.second) - - return result - - -def try_parse_year_month_day(ndarray[object] years, ndarray[object] months, - ndarray[object] days): - cdef: - Py_ssize_t i, n - ndarray[object] result - - n = len(years) - if len(months) != n or len(days) != n: - raise ValueError('Length of years/months/days must all be equal') - result = np.empty(n, dtype='O') - - for i from 0 <= i < n: - result[i] = datetime(int(years[i]), int(months[i]), int(days[i])) - - return result - - -def try_parse_datetime_components(ndarray[object] years, - ndarray[object] months, - ndarray[object] days, - ndarray[object] hours, - ndarray[object] minutes, - ndarray[object] seconds): - - cdef: - Py_ssize_t i, n - ndarray[object] result - int secs - double float_secs - double micros - - n = len(years) - if (len(months) != n or len(days) != n or len(hours) != n or - len(minutes) != n or len(seconds) != n): - raise ValueError('Length of all datetime components must be equal') - result = np.empty(n, dtype='O') - - for i from 0 <= i < n: - float_secs = float(seconds[i]) - secs = int(float_secs) - - micros = float_secs - secs - if micros > 0: - micros = micros * 1000000 - - result[i] = datetime(int(years[i]), int(months[i]), int(days[i]), - int(hours[i]), int(minutes[i]), secs, - int(micros)) - - return result - - -#---------------------------------------------------------------------- -# Miscellaneous functions moved from core.tools.datetimes - -_DATEUTIL_LEXER_SPLIT = None -try: - # Since these are private methods from dateutil, it is safely imported - # here so in case this interface changes, pandas will just fallback - # to not using the functionality - from dateutil.parser import _timelex - - if hasattr(_timelex, 'split'): - def _lexer_split_from_str(dt_str): - # The StringIO(str(_)) is for dateutil 2.2 compatibility - return _timelex.split(StringIO(str(dt_str))) - - _DATEUTIL_LEXER_SPLIT = _lexer_split_from_str -except (ImportError, AttributeError): - pass - - -def _format_is_iso(f): - """ - Does format match the iso8601 set that can be handled by the C parser? - Generally of form YYYY-MM-DDTHH:MM:SS - date separator can be different - but must be consistent. Leading 0s in dates and times are optional. - """ - iso_template = '%Y{date_sep}%m{date_sep}%d{time_sep}%H:%M:%S.%f'.format - excluded_formats = ['%Y%m%d', '%Y%m', '%Y'] - - for date_sep in [' ', '/', '\\', '-', '.', '']: - for time_sep in [' ', 'T']: - if (iso_template(date_sep=date_sep, - time_sep=time_sep - ).startswith(f) and f not in excluded_formats): - return True - return False - - -def _guess_datetime_format(dt_str, dayfirst=False, dt_str_parse=du_parse, - dt_str_split=_DATEUTIL_LEXER_SPLIT): - """ - Guess the datetime format of a given datetime string. - - Parameters - ---------- - dt_str : string, datetime string to guess the format of - dayfirst : boolean, default False - If True parses dates with the day first, eg 20/01/2005 - Warning: dayfirst=True is not strict, but will prefer to parse - with day first (this is a known bug). - dt_str_parse : function, defaults to `compat.parse_date` (dateutil) - This function should take in a datetime string and return - a `datetime.datetime` guess that the datetime string represents - dt_str_split : function, defaults to `_DATEUTIL_LEXER_SPLIT` (dateutil) - This function should take in a datetime string and return - a list of strings, the guess of the various specific parts - e.g. '2011/12/30' -> ['2011', '/', '12', '/', '30'] - - Returns - ------- - ret : datetime format string (for `strftime` or `strptime`) - """ - if dt_str_parse is None or dt_str_split is None: - return None - - if not isinstance(dt_str, string_types): - return None - - day_attribute_and_format = (('day',), '%d', 2) - - # attr name, format, padding (if any) - datetime_attrs_to_format = [ - (('year', 'month', 'day'), '%Y%m%d', 0), - (('year',), '%Y', 0), - (('month',), '%B', 0), - (('month',), '%b', 0), - (('month',), '%m', 2), - day_attribute_and_format, - (('hour',), '%H', 2), - (('minute',), '%M', 2), - (('second',), '%S', 2), - (('microsecond',), '%f', 6), - (('second', 'microsecond'), '%S.%f', 0), - ] - - if dayfirst: - datetime_attrs_to_format.remove(day_attribute_and_format) - datetime_attrs_to_format.insert(0, day_attribute_and_format) - - try: - parsed_datetime = dt_str_parse(dt_str, dayfirst=dayfirst) - except: - # In case the datetime can't be parsed, its format cannot be guessed - return None - - if parsed_datetime is None: - return None - - try: - tokens = dt_str_split(dt_str) - except: - # In case the datetime string can't be split, its format cannot - # be guessed - return None - - format_guess = [None] * len(tokens) - found_attrs = set() - - for attrs, attr_format, padding in datetime_attrs_to_format: - # If a given attribute has been placed in the format string, skip - # over other formats for that same underlying attribute (IE, month - # can be represented in multiple different ways) - if set(attrs) & found_attrs: - continue - - if all(getattr(parsed_datetime, attr) is not None for attr in attrs): - for i, token_format in enumerate(format_guess): - token_filled = tokens[i].zfill(padding) - if (token_format is None and - token_filled == parsed_datetime.strftime(attr_format)): - format_guess[i] = attr_format - tokens[i] = token_filled - found_attrs.update(attrs) - break - - # Only consider it a valid guess if we have a year, month and day - if len(set(['year', 'month', 'day']) & found_attrs) != 3: - return None - - output_format = [] - for i, guess in enumerate(format_guess): - if guess is not None: - # Either fill in the format placeholder (like %Y) - output_format.append(guess) - else: - # Or just the token separate (IE, the dashes in "01-01-2013") - try: - # If the token is numeric, then we likely didn't parse it - # properly, so our guess is wrong - float(tokens[i]) - return None - except ValueError: - pass - - output_format.append(tokens[i]) - - guessed_format = ''.join(output_format) - - # rebuild string, capturing any inferred padding - dt_str = ''.join(tokens) - if parsed_datetime.strftime(guessed_format) == dt_str: - return guessed_format - else: - return None From 62cc7b0e9afe45d14056ddcfd5c5b495fd774e91 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Sun, 17 Sep 2017 15:19:58 -0700 Subject: [PATCH 13/15] Reviewer comments; import try_parse_xyz directly from tslibs.parsing --- pandas/_libs/src/inference.pyx | 7 ------- pandas/_libs/tslibs/parsing.pyx | 3 +-- pandas/core/indexes/base.py | 3 ++- pandas/core/tools/datetimes.py | 7 ++++--- pandas/io/date_converters.py | 12 ++++++------ pandas/io/parsers.py | 10 +++++----- pandas/tests/indexes/datetimes/test_tools.py | 5 +++-- pandas/tests/io/parser/parse_dates.py | 5 +++-- setup.py | 3 ++- 9 files changed, 26 insertions(+), 29 deletions(-) diff --git a/pandas/_libs/src/inference.pyx b/pandas/_libs/src/inference.pyx index 8d2897c9ae4a2..a0b43a084632f 100644 --- a/pandas/_libs/src/inference.pyx +++ b/pandas/_libs/src/inference.pyx @@ -9,13 +9,6 @@ iNaT = util.get_nat() cdef bint PY2 = sys.version_info[0] == 2 -from pandas._libs.tslibs.parsing import ( - try_parse_dates, - try_parse_date_and_time, - try_parse_year_month_day, - try_parse_datetime_components) - - from util cimport (UINT8_MAX, UINT16_MAX, UINT32_MAX, UINT64_MAX, INT8_MIN, INT8_MAX, INT16_MIN, INT16_MAX, INT32_MAX, INT32_MIN, INT64_MAX, INT64_MIN) diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 452649b0e2abc..845d1b8dcabba 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -369,7 +369,6 @@ def dateutil_parse(object timestr, object default, ignoretz=False, return ret, reso -# The canonical place for this appears to be in frequencies.pyx. cpdef object _get_rule_month(object source, object default='DEC'): """ Return starting month of given freq, default is December. @@ -527,7 +526,7 @@ def try_parse_datetime_components(ndarray[object] years, #---------------------------------------------------------------------- -# Miscellaneous functions moved from core.tools.datetimes +# Miscellaneous _DATEUTIL_LEXER_SPLIT = None try: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 008828cf4f309..0071efb5ed939 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -7,6 +7,7 @@ algos as libalgos, join as libjoin, Timestamp, Timedelta, ) from pandas._libs.lib import is_datetime_array +from pandas._libs.tslibs import parsing from pandas.compat import range, u from pandas.compat.numpy import function as nv @@ -1034,7 +1035,7 @@ def to_datetime(self, dayfirst=False): if self.inferred_type == 'string': from dateutil.parser import parse parser = lambda x: parse(x, dayfirst=dayfirst) - parsed = lib.try_parse_dates(self.values, parser=parser) + parsed = parsing.try_parse_dates(self.values, parser=parser) return DatetimeIndex(parsed) else: return DatetimeIndex(self.values) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index be9a25a0c9917..9f360b7dde336 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -3,6 +3,7 @@ from collections import MutableMapping from pandas._libs import lib, tslib +from pandas._libs.tslibs import parsing from pandas._libs.tslib import ( # noqa parse_time_string, _format_is_iso, @@ -523,9 +524,9 @@ def _attempt_YYYYMMDD(arg, errors): def calc(carg): # calculate the actual result carg = carg.astype(object) - parsed = lib.try_parse_year_month_day(carg / 10000, - carg / 100 % 100, - carg % 100) + parsed = parsing.try_parse_year_month_day(carg / 10000, + carg / 100 % 100, + carg % 100) return tslib.array_to_datetime(parsed, errors=errors) def calc_with_mask(carg, mask): diff --git a/pandas/io/date_converters.py b/pandas/io/date_converters.py index 080d6c3e273a3..f475122188ccd 100644 --- a/pandas/io/date_converters.py +++ b/pandas/io/date_converters.py @@ -1,20 +1,19 @@ """This module is designed for community supported date conversion functions""" from pandas.compat import range, map import numpy as np -import pandas._libs.lib as lib - +from pandas._libs.tslibs import parsing def parse_date_time(date_col, time_col): date_col = _maybe_cast(date_col) time_col = _maybe_cast(time_col) - return lib.try_parse_date_and_time(date_col, time_col) + return parsing.try_parse_date_and_time(date_col, time_col) def parse_date_fields(year_col, month_col, day_col): year_col = _maybe_cast(year_col) month_col = _maybe_cast(month_col) day_col = _maybe_cast(day_col) - return lib.try_parse_year_month_day(year_col, month_col, day_col) + return parsing.try_parse_year_month_day(year_col, month_col, day_col) def parse_all_fields(year_col, month_col, day_col, hour_col, minute_col, @@ -25,8 +24,9 @@ def parse_all_fields(year_col, month_col, day_col, hour_col, minute_col, hour_col = _maybe_cast(hour_col) minute_col = _maybe_cast(minute_col) second_col = _maybe_cast(second_col) - return lib.try_parse_datetime_components(year_col, month_col, day_col, - hour_col, minute_col, second_col) + return parsing.try_parse_datetime_components(year_col, month_col, day_col, + hour_col, minute_col, + second_col) def generic_parser(parse_func, *cols): diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index d9e83176d0d6e..8101732265457 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -42,7 +42,7 @@ import pandas._libs.lib as lib import pandas._libs.parsers as parsers - +from pandas._libs.tslibs import parsing # BOM character (byte order mark) # This exists at the beginning of a file to indicate endianness @@ -2952,7 +2952,7 @@ def converter(*date_cols): ) except: return tools.to_datetime( - lib.try_parse_dates(strs, dayfirst=dayfirst)) + parsing.try_parse_dates(strs, dayfirst=dayfirst)) else: try: result = tools.to_datetime( @@ -2963,9 +2963,9 @@ def converter(*date_cols): except Exception: try: return tools.to_datetime( - lib.try_parse_dates(_concat_date_cols(date_cols), - parser=date_parser, - dayfirst=dayfirst), + parsing.try_parse_dates(_concat_date_cols(date_cols), + parser=date_parser, + dayfirst=dayfirst), errors='ignore') except Exception: return generic_parser(date_parser, *date_cols) diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index be27334384f6b..6a3fcde54c0c4 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -12,7 +12,8 @@ from distutils.version import LooseVersion import pandas as pd -from pandas._libs import tslib, lib +from pandas._libs import tslib +from pandas._libs.tslibs import parsing from pandas.core.tools import datetimes as tools from pandas.core.tools.datetimes import normalize_date from pandas.compat import lmap @@ -1405,7 +1406,7 @@ class TestArrayToDatetime(object): def test_try_parse_dates(self): arr = np.array(['5/1/2000', '6/1/2000', '7/1/2000'], dtype=object) - result = lib.try_parse_dates(arr, dayfirst=True) + result = parsing.try_parse_dates(arr, dayfirst=True) expected = [parse(d, dayfirst=True) for d in arr] assert np.array_equal(result, expected) diff --git a/pandas/tests/io/parser/parse_dates.py b/pandas/tests/io/parser/parse_dates.py index e1ae1b577ea29..90103e7bf26b0 100644 --- a/pandas/tests/io/parser/parse_dates.py +++ b/pandas/tests/io/parser/parse_dates.py @@ -10,7 +10,7 @@ import pytest import numpy as np -import pandas._libs.lib as lib +from pandas._libs.tslibs import parsing from pandas._libs.lib import Timestamp import pandas as pd @@ -53,7 +53,8 @@ def test_multiple_date_col(self): """ def func(*date_cols): - return lib.try_parse_dates(parsers._concat_date_cols(date_cols)) + res = parsing.try_parse_dates(parsers._concat_date_cols(date_cols)) + return res df = self.read_csv(StringIO(data), header=None, date_parser=func, diff --git a/setup.py b/setup.py index 952b081c0214d..2f9aa970d739d 100755 --- a/setup.py +++ b/setup.py @@ -486,7 +486,8 @@ def pxd(name): 'sources': ['pandas/_libs/src/datetime/np_datetime.c', 'pandas/_libs/src/datetime/np_datetime_strings.c', 'pandas/_libs/src/period_helper.c']}, - '_libs.tslibs.parsing': {'pyxfile': '_libs/tslibs/parsing'}, + '_libs.tslibs.parsing': {'pyxfile': '_libs/tslibs/parsing', + 'pxdfiles': ['_libs/src/util']}, '_libs.tslibs.frequencies': {'pyxfile': '_libs/tslibs/frequencies', 'pxdfiles': ['_libs/src/util']}, '_libs.index': {'pyxfile': '_libs/index', From 21046b3687ad45d692b3c3bc1cc27d2114291bda Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Sun, 17 Sep 2017 17:47:42 -0700 Subject: [PATCH 14/15] flake8 fixup --- pandas/core/tools/datetimes.py | 2 +- pandas/io/date_converters.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 9f360b7dde336..729ca72c787f5 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -2,7 +2,7 @@ import numpy as np from collections import MutableMapping -from pandas._libs import lib, tslib +from pandas._libs import tslib from pandas._libs.tslibs import parsing from pandas._libs.tslib import ( # noqa parse_time_string, diff --git a/pandas/io/date_converters.py b/pandas/io/date_converters.py index f475122188ccd..377373f8a0135 100644 --- a/pandas/io/date_converters.py +++ b/pandas/io/date_converters.py @@ -3,6 +3,7 @@ import numpy as np from pandas._libs.tslibs import parsing + def parse_date_time(date_col, time_col): date_col = _maybe_cast(date_col) time_col = _maybe_cast(time_col) From 364a6b0fa65985bf367d9577ff0f38ac27c4274d Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Sun, 24 Sep 2017 10:09:25 -0700 Subject: [PATCH 15/15] keep _DATEUTIL_LEXER_SPLIT private to tslibs.parsing --- pandas/core/tools/datetimes.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 5578d4af3f89e..0cf42d855ca4c 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -8,7 +8,6 @@ from pandas._libs.tslibs.parsing import ( # noqa parse_time_string, _format_is_iso, - _DATEUTIL_LEXER_SPLIT, _guess_datetime_format) from pandas.core.dtypes.common import (