Skip to content

PERF: Parse certain dates in Cython instead of falling back to dateutil.parse #25922

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 32 commits into from
Apr 20, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
4222fd5
Add new benchmarks for parsing datetime strings
anmyachev Mar 26, 2019
78254a4
Implement parsing dd/mm/yyyy and mm/dd/yyyy in Cython
vnlitvinov Mar 28, 2019
1608090
fix code style
anmyachev Mar 29, 2019
eec3beb
using DEF statement for compile-time constant
anmyachev Mar 29, 2019
d322b8d
parse_slashed_date simplification
anmyachev Mar 29, 2019
0546e0a
removed micro-bench
anmyachev Mar 29, 2019
4a673ff
Support mm-yyyy along with mm-dd-yyyy
vnlitvinov Mar 29, 2019
23df426
Rename parse_slashed_date to parse_delimited_date
vnlitvinov Mar 29, 2019
3538566
Speed up parse_datetime_string_with_reso
vnlitvinov Mar 29, 2019
4d4df11
fix code style
anmyachev Mar 29, 2019
504de84
Move to datetime_new, add docstring to _parse_delimited_date
vnlitvinov Apr 1, 2019
0613e66
Add whatsnew entry
vnlitvinov Apr 1, 2019
b985e37
fix parsing MM/YYYY for MM > 12
anmyachev Apr 2, 2019
f2843e1
added tests for parse_delimited_date
anmyachev Apr 2, 2019
4f66004
fix flake8 bugs in test_parse_dates.py
anmyachev Apr 2, 2019
ac6e348
Fix date parsing for Python <= 3.6.0
vnlitvinov Apr 3, 2019
5384ebe
removed parsing MM.YYYY format, because, for example, 10.2019 interpr…
anmyachev Apr 3, 2019
889ef7a
Remove whatsnew entry for the change
vnlitvinov Apr 4, 2019
a6926e7
Remove duplicate parsing of MM-YYYY in _parse_dateabbr_string
vnlitvinov Apr 4, 2019
b7cd6b1
added some comments in _parse_delimited_date
anmyachev Apr 5, 2019
4a2929d
fix docstring in _parse_delimited_date
anmyachev Apr 5, 2019
4bc1821
fix bug when parsing 01/12/2019 with dayfirst==True
anmyachev Apr 8, 2019
a43fa7b
first attemp to use hypothesis in tests
anmyachev Apr 8, 2019
710a287
apply isort on pandas/tests/io/parser/test_parse_dates.py
anmyachev Apr 8, 2019
859e312
added new '%Y %m %d' format and 2 @pytest.mark.parametrize for test_h…
anmyachev Apr 8, 2019
b41ea63
removed test_parse_delimited_date; added next formats: '%y %m %d', '%…
anmyachev Apr 9, 2019
6fad4f4
added message for pytest.skip(); more complete docstring in _parse_de…
anmyachev Apr 9, 2019
7113c75
removed \ delimiter
anmyachev Apr 9, 2019
d0bfd91
using is_platform_windows() in date_strategy definition; changed date…
anmyachev Apr 9, 2019
da845ed
fixed import order; using @settings(deadline=None) now; dates with ye…
anmyachev Apr 9, 2019
13717ec
removed extra 'parse' import
anmyachev Apr 18, 2019
2cd971a
_is_not_delimiter is inline now
anmyachev Apr 19, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions asv_bench/benchmarks/io/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,4 +251,22 @@ def mem_parser_chunks(self):
pass


class ReadCSVParseSpecialDate(StringIORewind):
    """
    Benchmark ``read_csv`` date parsing on short delimited date strings.

    The benchmark is parametrized over ``value``, a key of ``objects``:

    * ``'mY'``  - month/year strings such as ``01-2019`` and ``02/2000``
    * ``'mdY'`` - three-field dates such as ``12/02/2010``
    """
    params = (['mY', 'mdY'],)
    # asv reads the parameter labels from ``param_names``.
    param_names = ['value']
    objects = {
        'mY': '01-2019\n10-2019\n02/2000\n',
        'mdY': '12/02/2010\n'
    }

    def setup(self, value):
        """Build an in-memory CSV by repeating the selected sample rows."""
        count_elem = 10000
        data = self.objects[value] * count_elem
        self.StringIO_input = StringIO(data)

    def time_read_special_date(self, value):
        """Time parsing the single ``Date`` column of the generated CSV."""
        # ``self.data`` is inherited from StringIORewind
        # NOTE(review): presumably it rewinds the buffer before each
        # timed read -- confirm against the base class.
        read_csv(self.data(self.StringIO_input), sep=',', header=None,
                 names=['Date'], parse_dates=['Date'])


from ..pandas_vb_common import setup # noqa: F401
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.25.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,7 @@ Performance Improvements
- Improved performance of :meth:`read_csv` by faster tokenizing and faster parsing of small float numbers (:issue:`25784`)
- Improved performance of :meth:`read_csv` by faster parsing of N/A and boolean values (:issue:`25804`)
- Improved performance of :meth:`DataFrame.to_csv` when writing datetime dtype data (:issue:`25708`)
- Improved performance of :meth:`read_csv` by much faster parsing of MM/YYYY and DD/MM/YYYY datetime formats (:issue:`25922`)

.. _whatsnew_0250.bug_fixes:

Expand Down
1 change: 1 addition & 0 deletions pandas/_libs/src/headers/portable.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
// GH-23516 - works around locale perf issues
// from MUSL libc, MIT Licensed - see LICENSES
#define isdigit_ascii(c) (((unsigned)(c) - '0') < 10u)
// Decimal value of ASCII digit `c`, or `default` when `c` is not a digit.
// The fallback is parenthesised so any expression can be passed safely.
#define getdigit_ascii(c, default) (isdigit_ascii(c) ? ((int)((c) - '0')) : (default))
#define isspace_ascii(c) (((c) == ' ') || (((unsigned)(c) - '\t') < 5))
#define toupper_ascii(c) ((((unsigned)(c) - 'a') < 26) ? ((c) & 0x5f) : (c))
#define tolower_ascii(c) ((((unsigned)(c) - 'A') < 26) ? ((c) | 0x20) : (c))
Expand Down
112 changes: 110 additions & 2 deletions pandas/_libs/tslibs/parsing.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,11 @@ import re
import time
from io import StringIO

from cpython.datetime cimport datetime
from libc.string cimport strchr

from cpython.datetime cimport datetime, datetime_new, import_datetime
from cpython.version cimport PY_VERSION_HEX
import_datetime()

import numpy as np

Expand All @@ -24,6 +27,10 @@ from pandas._config import get_option

from pandas._libs.tslibs.ccalendar import MONTH_NUMBERS
from pandas._libs.tslibs.nattype import nat_strings, NaT
from pandas._libs.tslibs.util cimport get_c_string_buf_and_size

cdef extern from "../src/headers/portable.h":
int getdigit_ascii(char c, int default) nogil

# ----------------------------------------------------------------------
# Constants
Expand All @@ -42,6 +49,99 @@ cdef:
set _not_datelike_strings = {'a', 'A', 'm', 'M', 'p', 'P', 't', 'T'}

# ----------------------------------------------------------------------
cdef:
    # Characters accepted as separators between the date components.
    const char* delimiters = " /-."
    # Upper bounds used to sanity-check the parsed day and month fields.
    int MAX_DAYS_IN_MONTH = 31, MAX_MONTH = 12


cdef inline bint _is_not_delimiter(const char ch):
    """
    Return True when `ch` is not one of the characters in `delimiters`.
    """
    # strchr() considers the terminating NUL part of the searched string, so
    # strchr(delimiters, 0) is non-NULL. Reject NUL explicitly so that an
    # embedded '\0' in the input is never accepted as a date separator.
    return ch == 0 or strchr(delimiters, ch) == NULL


cdef inline int _parse_2digit(const char* s):
    """
    Convert the two ASCII digits at `s[0]` and `s[1]` to an integer.

    If either character is not a digit, the fallback passed to
    `getdigit_ascii` outweighs the largest possible contribution of the
    other position, so the returned value is negative.
    """
    cdef:
        int tens = getdigit_ascii(s[0], -10)
        int ones = getdigit_ascii(s[1], -100)
    return tens * 10 + ones


cdef inline int _parse_4digit(const char* s):
    """
    Convert the four ASCII digits at `s[0]`..`s[3]` to an integer.

    If any character is not a digit, its fallback contributes -10000,
    which exceeds the at most 9990 the remaining positions can add, so
    the returned value is negative.
    """
    cdef:
        int thousands = getdigit_ascii(s[0], -10)
        int hundreds = getdigit_ascii(s[1], -100)
        int tens = getdigit_ascii(s[2], -1000)
        int ones = getdigit_ascii(s[3], -10000)
    return thousands * 1000 + hundreds * 100 + tens * 10 + ones


cdef inline object _parse_delimited_date(object date_string, bint dayfirst):
    """
    Parse special cases of dates: MM/DD/YYYY, DD/MM/YYYY, MM/YYYY.

    With `dayfirst == False` the function first tries to read the string
    as MM/DD/YYYY and, if the first field is greater than 12, reads it as
    DD/MM/YYYY instead.
    With `dayfirst == True` the function first tries to read the string
    as DD/MM/YYYY and, if the second field is greater than 12, reads it
    as MM/DD/YYYY instead.

    Parameters
    ----------
    date_string : str
    dayfirst : bint

    Returns
    -------
    datetime, resolution
        The parsed datetime and 'day' or 'month'. If `date_string` does
        not have one of the supported shapes, None, None is returned.

    Raises
    ------
    DateParseError
        If the fields are numeric but cannot be a day/month pair
        (e.g. both greater than 12, or either outside 1..31).

    Notes
    -----
    For MM/DD/YYYY, DD/MM/YYYY: delimiter can be a space or one of /-.
    For MM/YYYY: delimiter can be a space or one of /-
    Dates that pass the field checks but do not exist in the calendar
    (e.g. 02/30/2019) are left to the datetime constructor to reject.
    """
    cdef:
        const char* buf
        Py_ssize_t length
        int day = 1, month = 1, year
        bint can_swap = 0

    buf = get_c_string_buf_and_size(date_string, &length)
    if length == 10:
        # parsing MM?DD?YYYY and DD?MM?YYYY dates
        if _is_not_delimiter(buf[2]) or _is_not_delimiter(buf[5]):
            return None, None
        month = _parse_2digit(buf)
        day = _parse_2digit(buf + 3)
        year = _parse_4digit(buf + 6)
        reso = 'day'
        can_swap = 1
    elif length == 7:
        # parsing MM?YYYY dates
        if buf[2] == b'.' or _is_not_delimiter(buf[2]):
            # we cannot reliably tell whether e.g. 10.2010 is a float
            # or a date, thus we refuse to parse it here
            return None, None
        month = _parse_2digit(buf)
        year = _parse_4digit(buf + 3)
        reso = 'month'
    else:
        return None, None

    if month < 0 or day < 0 or year < 1000:
        # The digit helpers yield a negative value when a character is not
        # a digit; such strings (and years below 1000) are not handled
        # here, so the caller can fall back to another parser.
        return None, None

    if not (1 <= month <= MAX_DAYS_IN_MONTH
            and 1 <= day <= MAX_DAYS_IN_MONTH
            and (month <= MAX_MONTH or day <= MAX_MONTH)):
        # Neither field ordering can form a valid day/month pair.
        raise DateParseError(
            "Invalid date specified ({}/{})".format(month, day))

    # Swap when the first field cannot be a month, or when the caller asked
    # for day-first and the second field is a plausible month. MM/YYYY
    # strings have no day field in the input and are never swapped.
    if (month > MAX_MONTH or (day <= MAX_MONTH and dayfirst)) and can_swap:
        day, month = month, day
    if PY_VERSION_HEX >= 0x03060100:
        # In Python <= 3.6.0 there is no range checking for invalid dates
        # in C api, thus we call faster C version for 3.6.1 or newer
        return datetime_new(year, month, day, 0, 0, 0, 0, None), reso
    return datetime(year, month, day, 0, 0, 0, 0, None), reso


def parse_datetime_string(date_string, freq=None, dayfirst=False,
Expand All @@ -66,6 +166,10 @@ def parse_datetime_string(date_string, freq=None, dayfirst=False,
yearfirst=yearfirst, **kwargs)
return dt

dt, _ = _parse_delimited_date(date_string, dayfirst)
if dt is not None:
return dt

try:
dt, _, _ = _parse_dateabbr_string(date_string, _DEFAULT_DATETIME, freq)
return dt
Expand Down Expand Up @@ -146,6 +250,10 @@ cdef parse_datetime_string_with_reso(date_string, freq=None, dayfirst=False,
if not _does_string_look_like_datetime(date_string):
raise ValueError('Given date string not likely a datetime.')

parsed, reso = _parse_delimited_date(date_string, dayfirst)
if parsed is not None:
return parsed, parsed, reso

try:
return _parse_dateabbr_string(date_string, _DEFAULT_DATETIME, freq)
except DateParseError:
Expand Down Expand Up @@ -279,7 +387,7 @@ cdef inline object _parse_dateabbr_string(object date_string, object default,
except ValueError:
pass

for pat in ['%Y-%m', '%m-%Y', '%b %Y', '%b-%Y']:
for pat in ['%Y-%m', '%b %Y', '%b-%Y']:
try:
ret = datetime.strptime(date_string, pat)
return ret, ret, 'month'
Expand Down
98 changes: 94 additions & 4 deletions pandas/tests/io/parser/test_parse_dates.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,16 @@
from datetime import date, datetime
from io import StringIO

from dateutil.parser import parse
from dateutil.parser import parse as du_parse
from hypothesis import given, settings, strategies as st
import numpy as np
import pytest
import pytz

from pandas._libs.tslib import Timestamp
from pandas._libs.tslibs import parsing
from pandas.compat import lrange
from pandas._libs.tslibs.parsing import parse_datetime_string
from pandas.compat import is_platform_windows, lrange
from pandas.compat.numpy import np_array_datetime64_compat

import pandas as pd
Expand All @@ -26,6 +28,15 @@
import pandas.io.date_converters as conv
import pandas.io.parsers as parsers

# Value passed as ``default`` to dateutil's parser in the comparison test.
_DEFAULT_DATETIME = datetime(1, 1, 1)

# Hypothesis strategy producing the datetimes that get formatted and
# re-parsed. On Windows generation starts at year 1900.
# NOTE(review): presumably a workaround for strftime limits there -- confirm.
date_strategy = (
    st.datetimes(min_value=datetime(1900, 1, 1))
    if is_platform_windows()
    else st.datetimes()
)


def test_separator_date_conflict(all_parsers):
# Regression test for gh-4678
Expand Down Expand Up @@ -439,7 +450,7 @@ def test_parse_dates_custom_euro_format(all_parsers, kwargs):
"""
if "dayfirst" in kwargs:
df = parser.read_csv(StringIO(data), names=["time", "Q", "NTU"],
date_parser=lambda d: parse(d, **kwargs),
date_parser=lambda d: du_parse(d, **kwargs),
header=0, index_col=0, parse_dates=True,
na_values=["NA"])
exp_index = Index([datetime(2010, 1, 31), datetime(2010, 2, 1),
Expand All @@ -451,7 +462,7 @@ def test_parse_dates_custom_euro_format(all_parsers, kwargs):
msg = "got an unexpected keyword argument 'day_first'"
with pytest.raises(TypeError, match=msg):
parser.read_csv(StringIO(data), names=["time", "Q", "NTU"],
date_parser=lambda d: parse(d, **kwargs),
date_parser=lambda d: du_parse(d, **kwargs),
skiprows=[0], index_col=0, parse_dates=True,
na_values=["NA"])

Expand Down Expand Up @@ -849,3 +860,82 @@ def test_parse_timezone(all_parsers):

expected = DataFrame(expected_data)
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("date_string", [
    "32/32/2019",
    "02/30/2019",
    "13/13/2019",
    "13/2019",
    "a3/11/2018",
    "10/11/2o17"
])
def test_invalid_parse_delimited_date(all_parsers, date_string):
    """Strings that are not valid delimited dates stay unparsed objects."""
    result = all_parsers.read_csv(
        StringIO(date_string), header=None, parse_dates=[0])

    # The column must come back holding the raw input string.
    expected = DataFrame({0: [date_string]}, dtype="object")
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("date_string,dayfirst,expected", [
    # %d/%m/%Y; first field > 12, so it can only be the day
    ("13/02/2019", False, datetime(2019, 2, 13)),
    ("13/02/2019", True, datetime(2019, 2, 13)),
    # %m/%d/%Y; second field > 12, so the fields are never swapped
    ("02/13/2019", False, datetime(2019, 2, 13)),
    ("02/13/2019", True, datetime(2019, 2, 13)),
    # %d/%m/%Y; ambiguous fields, dayfirst==True decides
    ("04/02/2019", True, datetime(2019, 2, 4))
])
def test_parse_delimited_date_swap(all_parsers, date_string,
                                   dayfirst, expected):
    """Check day/month ordering for unambiguous and ambiguous dates."""
    result = all_parsers.read_csv(
        StringIO(date_string), header=None,
        dayfirst=dayfirst, parse_dates=[0])

    expected_frame = DataFrame({0: [expected]}, dtype="datetime64[ns]")
    tm.assert_frame_equal(result, expected_frame)


def _helper_hypothesis_delimited_date(call, date_string, **kwargs):
msg, result = None, None
try:
result = call(date_string, **kwargs)
except ValueError as er:
msg = str(er)
pass
return msg, result


@given(date_strategy)
@settings(deadline=None)
@pytest.mark.parametrize("delimiter", list(" -./"))
@pytest.mark.parametrize("dayfirst", [True, False])
@pytest.mark.parametrize("date_format", [
    "%d %m %Y",
    "%m %d %Y",
    "%m %Y",
    "%Y %m %d",
    "%y %m %d",
    "%Y%m%d",
    "%y%m%d",
])
def test_hypothesis_delimited_date(date_format, dayfirst,
                                   delimiter, test_datetime):
    """
    Check that parse_datetime_string agrees with dateutil's parser.

    A generated datetime is formatted with `date_format` (spaces replaced
    by `delimiter`) and parsed by both implementations; they must return
    the same value or fail with the same ValueError message.
    """
    if date_format == "%m %Y" and delimiter == ".":
        # Adjacent literals instead of a backslash continuation, so the
        # source indentation cannot leak into the message.
        pytest.skip("parse_datetime_string cannot reliably tell whether "
                    "e.g. %m.%Y is a float or a date, thus we skip it")
    date_string = test_datetime.strftime(date_format.replace(' ', delimiter))

    except_out_dateutil, result = _helper_hypothesis_delimited_date(
        parse_datetime_string, date_string,
        dayfirst=dayfirst)
    except_in_dateutil, expected = _helper_hypothesis_delimited_date(
        du_parse, date_string,
        default=_DEFAULT_DATETIME,
        dayfirst=dayfirst, yearfirst=False)

    assert except_out_dateutil == except_in_dateutil
    assert result == expected