Skip to content

Commit 30dd412

Browse files
committed
ENH: enable to_datetime to be vectorized, handle NAs, close #858
1 parent 7d2603d commit 30dd412

File tree

4 files changed, with 135 additions and 53 deletions.

pandas/core/datetools.py

+54-21
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,8 @@
55
import pandas._tseries as lib
66
import re
77

8-
from pandas._tseries import Timestamp, monthrange
8+
from pandas._tseries import Timestamp
9+
import pandas.core.common as com
910

1011
try:
1112
import dateutil
@@ -65,15 +66,33 @@ def parser(x):
6566
data = p_ufunc(arr)
6667
return np.array(data, dtype='M8[us]')
6768

68-
def to_datetime(arg):
69-
"""Attempts to convert arg to datetime"""
69+
def to_datetime(arg, errors='ignore'):
70+
"""
71+
Convert argument to datetime
72+
73+
Parameters
74+
----------
75+
arg : string, datetime, array of strings (with possible NAs)
76+
errors : {'ignore', 'raise'}, default 'ignore'
77+
With 'ignore', values that fail to parse are left untouched; with 'raise', the parsing exception propagates
78+
79+
Returns
80+
-------
81+
ret : datetime if parsing succeeded, otherwise arg unchanged; ndarray input returns an object ndarray
82+
"""
7083
if arg is None:
7184
return arg
7285
elif isinstance(arg, datetime):
7386
return arg
87+
elif isinstance(arg, np.ndarray):
88+
return lib.string_to_datetime(com._ensure_object(arg),
89+
raise_=errors == 'raise')
90+
7491
try:
7592
return parser.parse(arg)
7693
except Exception:
94+
if errors == 'raise':
95+
raise
7796
return arg
7897

7998

@@ -1151,7 +1170,7 @@ class MonthEnd(DateOffset, CacheableOffset):
11511170

11521171
def apply(self, other):
11531172
n = self.n
1154-
_, days_in_month = monthrange(other.year, other.month)
1173+
_, days_in_month = lib.monthrange(other.year, other.month)
11551174
if other.day != days_in_month:
11561175
other = other + relativedelta(months=-1, day=31)
11571176
if n <= 0:
@@ -1161,8 +1180,8 @@ def apply(self, other):
11611180

11621181
@classmethod
11631182
def onOffset(cls, someDate):
1164-
__junk, days_in_month = monthrange(someDate.year,
1165-
someDate.month)
1183+
__junk, days_in_month = lib.monthrange(someDate.year,
1184+
someDate.month)
11661185
return someDate.day == days_in_month
11671186

11681187
def rule_code(self):
@@ -1184,7 +1203,7 @@ def apply(self, other):
11841203

11851204
@classmethod
11861205
def onOffset(cls, someDate):
1187-
firstDay, _ = monthrange(someDate.year, someDate.month)
1206+
firstDay, _ = lib.monthrange(someDate.year, someDate.month)
11881207
return someDate.day == (firstDay + 1)
11891208

11901209
def rule_code(self):
@@ -1202,7 +1221,7 @@ def isAnchored(self):
12021221
def apply(self, other):
12031222
n = self.n
12041223

1205-
wkday, days_in_month = monthrange(other.year, other.month)
1224+
wkday, days_in_month = lib.monthrange(other.year, other.month)
12061225
lastBDay = days_in_month - max(((wkday + days_in_month - 1) % 7) - 4, 0)
12071226

12081227
if n > 0 and not other.day >= lastBDay:
@@ -1227,15 +1246,15 @@ class BMonthBegin(DateOffset, CacheableOffset):
12271246
def apply(self, other):
12281247
n = self.n
12291248

1230-
wkday, _ = monthrange(other.year, other.month)
1249+
wkday, _ = lib.monthrange(other.year, other.month)
12311250
firstBDay = _get_firstbday(wkday)
12321251

12331252
if other.day > firstBDay and n<=0:
12341253
# as if rolled forward already
12351254
n += 1
12361255

12371256
other = other + relativedelta(months=n)
1238-
wkday, _ = monthrange(other.year, other.month)
1257+
wkday, _ = lib.monthrange(other.year, other.month)
12391258
firstBDay = _get_firstbday(wkday)
12401259
result = datetime(other.year, other.month, firstBDay)
12411260
return result
@@ -1403,7 +1422,7 @@ def isAnchored(self):
14031422
def apply(self, other):
14041423
n = self.n
14051424

1406-
wkday, days_in_month = monthrange(other.year, other.month)
1425+
wkday, days_in_month = lib.monthrange(other.year, other.month)
14071426
lastBDay = days_in_month - max(((wkday + days_in_month - 1) % 7) - 4, 0)
14081427

14091428
monthsToGo = 3 - ((other.month - self.startingMonth) % 3)
@@ -1465,7 +1484,7 @@ def apply(self, other):
14651484
if self._normalizeFirst:
14661485
other = normalize_date(other)
14671486

1468-
wkday, _ = monthrange(other.year, other.month)
1487+
wkday, _ = lib.monthrange(other.year, other.month)
14691488

14701489
firstBDay = _get_firstbday(wkday)
14711490

@@ -1485,7 +1504,7 @@ def apply(self, other):
14851504

14861505
# get the first bday for result
14871506
other = other + relativedelta(months=3*n - monthsSince)
1488-
wkday, _ = monthrange(other.year, other.month)
1507+
wkday, _ = lib.monthrange(other.year, other.month)
14891508
firstBDay = _get_firstbday(wkday)
14901509
result = datetime(other.year, other.month, firstBDay)
14911510
return result
@@ -1517,7 +1536,7 @@ def isAnchored(self):
15171536
def apply(self, other):
15181537
n = self.n
15191538

1520-
wkday, days_in_month = monthrange(other.year, other.month)
1539+
wkday, days_in_month = lib.monthrange(other.year, other.month)
15211540

15221541
monthsToGo = 3 - ((other.month - self.startingMonth) % 3)
15231542
if monthsToGo == 3:
@@ -1556,7 +1575,7 @@ def isAnchored(self):
15561575
def apply(self, other):
15571576
n = self.n
15581577

1559-
wkday, days_in_month = monthrange(other.year, other.month)
1578+
wkday, days_in_month = lib.monthrange(other.year, other.month)
15601579

15611580
monthsSince = (other.month - self.startingMonth) % 3
15621581

@@ -1598,7 +1617,7 @@ def apply(self, other):
15981617
if self._normalizeFirst:
15991618
other = normalize_date(other)
16001619

1601-
wkday, days_in_month = monthrange(other.year, self.month)
1620+
wkday, days_in_month = lib.monthrange(other.year, self.month)
16021621
lastBDay = (days_in_month -
16031622
max(((wkday + days_in_month - 1) % 7) - 4, 0))
16041623

@@ -1614,7 +1633,7 @@ def apply(self, other):
16141633

16151634
other = other + relativedelta(years=years)
16161635

1617-
_, days_in_month = monthrange(other.year, self.month)
1636+
_, days_in_month = lib.monthrange(other.year, self.month)
16181637
result = datetime(other.year, self.month, days_in_month)
16191638

16201639
if result.weekday() > 4:
@@ -1646,7 +1665,7 @@ def apply(self, other):
16461665
if self._normalizeFirst:
16471666
other = normalize_date(other)
16481667

1649-
wkday, days_in_month = monthrange(other.year, self.month)
1668+
wkday, days_in_month = lib.monthrange(other.year, self.month)
16501669

16511670
firstBDay = _get_firstbday(wkday)
16521671

@@ -1664,7 +1683,7 @@ def apply(self, other):
16641683

16651684
# set first bday for result
16661685
other = other + relativedelta(years = years)
1667-
wkday, days_in_month = monthrange(other.year, self.month)
1686+
wkday, days_in_month = lib.monthrange(other.year, self.month)
16681687
firstBDay = _get_firstbday(wkday)
16691688
result = datetime(other.year, self.month, firstBDay)
16701689
return result
@@ -1688,7 +1707,7 @@ def __init__(self, n=1, **kwds):
16881707

16891708
def apply(self, other):
16901709
n = self.n
1691-
wkday, days_in_month = monthrange(other.year, self.month)
1710+
wkday, days_in_month = lib.monthrange(other.year, self.month)
16921711
if other.month != self.month or other.day != days_in_month:
16931712
other = datetime(other.year - 1, self.month, days_in_month)
16941713
if n <= 0:
@@ -1697,7 +1716,7 @@ def apply(self, other):
16971716
return other
16981717

16991718
def onOffset(self, someDate):
1700-
wkday, days_in_month = monthrange(someDate.year, self.month)
1719+
wkday, days_in_month = lib.monthrange(someDate.year, self.month)
17011720
return self.month == someDate.month and someDate.day == days_in_month
17021721

17031722
def rule_code(self):
@@ -1826,6 +1845,20 @@ def rule_code(self):
18261845
_offset_map = {
18271846
"WEEKDAY" : BDay(1),
18281847

1848+
# Annual - Calendar ("A-MON" aliases)
1849+
"A-JAN" : YearEnd(month=1),
1850+
"A-FEB" : YearEnd(month=2),
1851+
"A-MAR" : YearEnd(month=3),
1852+
"A-APR" : YearEnd(month=4),
1853+
"A-MAY" : YearEnd(month=5),
1854+
"A-JUN" : YearEnd(month=6),
1855+
"A-JUL" : YearEnd(month=7),
1856+
"A-AUG" : YearEnd(month=8),
1857+
"A-SEP" : YearEnd(month=9),
1858+
"A-OCT" : YearEnd(month=10),
1859+
"A-NOV" : YearEnd(month=11),
1860+
"A-DEC" : YearEnd(month=12),
1861+
18291862
# Annual - Calendar ("A@MON" aliases)
18301863
"A@JAN" : YearEnd(month=1),
18311864
"A@FEB" : YearEnd(month=2),

pandas/core/index.py

+2-2
Original file line number | Diff line number | Diff line change
@@ -1162,9 +1162,9 @@ class DatetimeIndex(Int64Index):
11621162
_outer_indexer = _join_i8_wrapper(lib.outer_join_indexer_int64)
11631163
_left_indexer = _join_i8_wrapper(lib.left_join_indexer_int64,
11641164
with_indexers=False)
1165-
_groupby = lib.groupby_arrays # _wrap_i8_function(lib.groupby_int64)
1165+
_groupby = lib.groupby_arrays # _wrap_i8_function(lib.groupby_int64)
11661166

1167-
_arrmap = _wrap_dt_function(lib.arrmap_object)
1167+
_arrmap = _wrap_dt_function(lib.arrmap_object)
11681168

11691169
__eq__ = _dt_index_cmp('__eq__')
11701170
__ne__ = _dt_index_cmp('__ne__')

pandas/src/tseries.pyx

+21
Original file line number | Diff line number | Diff line change
@@ -568,6 +568,27 @@ def vec_binop(ndarray[object] left, ndarray[object] right, object op):
568568

569569
return maybe_convert_bool(result)
570570

571+
def string_to_datetime(ndarray[object] strings, raise_=False):
572+
cdef:
573+
Py_ssize_t i, n = len(strings)
574+
object val
575+
from dateutil.parser import parse
576+
577+
result = np.empty(n, dtype=object)
578+
579+
for i in range(n):
580+
val = strings[i]
581+
if util._checknull(val):
582+
result[i] = val
583+
else:
584+
try:
585+
result[i] = parse(val)
586+
except Exception:
587+
if raise_:
588+
raise
589+
result[i] = val
590+
591+
return result
571592

572593
def value_count_int64(ndarray[int64_t] values):
573594
cdef:

pandas/tests/test_timeseries.py

+58-30
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,6 @@
11
# pylint: disable-msg=E1101,W0612
22

3-
from cStringIO import StringIO
4-
from datetime import datetime, timedelta
5-
import os
6-
import operator
3+
from datetime import datetime
74
import unittest
85

96
import nose
@@ -26,6 +23,35 @@
2623
import pandas.util.testing as tm
2724
import pandas
2825

26+
27+
import pandas._tseries as lib
28+
from datetime import datetime
29+
30+
import cPickle as pickle
31+
32+
import pandas.core.datetools as dt
33+
from pandas.core.index import Index, DatetimeIndex, Int64Index
34+
from pandas.core.frame import DataFrame
35+
36+
import unittest
37+
import numpy as np
38+
39+
from pandas import Series
40+
41+
from numpy.random import rand
42+
43+
from pandas.util.testing import assert_series_equal, assert_frame_equal
44+
45+
from pandas.core.groupby import Tinterval
46+
from pandas.core.datetools import Minute, BDay, Timestamp
47+
48+
import pandas.core.common as com
49+
50+
try:
51+
import pytz
52+
except ImportError:
53+
pass
54+
2955
class TestTimeSeriesDuplicates(unittest.TestCase):
3056

3157
def setUp(self):
@@ -86,6 +112,33 @@ def assert_range_equal(left, right):
86112

87113
class TestTimeSeries(unittest.TestCase):
88114

115+
def test_string_na_conversion(self):
116+
from dateutil.parser import parse
117+
from pandas.core.datetools import to_datetime
118+
119+
strings = np.array(['1/1/2000', '1/2/2000', np.nan,
120+
'1/4/2000, 12:34:56'], dtype=object)
121+
122+
expected = []
123+
for val in strings:
124+
if com.isnull(val):
125+
expected.append(val)
126+
else:
127+
expected.append(parse(val))
128+
129+
result = lib.string_to_datetime(strings)
130+
assert_almost_equal(result, expected)
131+
132+
result2 = to_datetime(strings)
133+
assert_almost_equal(result, result2)
134+
135+
malformed = np.array(['1/100/2000', np.nan], dtype=object)
136+
result = to_datetime(malformed)
137+
assert_almost_equal(result, malformed)
138+
139+
self.assertRaises(ValueError, to_datetime, malformed,
140+
errors='raise')
141+
89142
def test_dti_slicing(self):
90143
dti = DatetimeIndex(start='1/1/2005', end='12/1/2005', freq='M')
91144
dti2 = dti[[1,3,5]]
@@ -201,32 +254,6 @@ def test_pad_require_monotonicity(self):
201254
self.assertRaises(AssertionError, rng2.get_indexer, rng,
202255
method='pad')
203256

204-
import pandas._tseries as lib
205-
from datetime import datetime
206-
207-
import cPickle as pickle
208-
209-
import pandas.core.datetools as dt
210-
from pandas.core.index import Index, DatetimeIndex, Int64Index
211-
from pandas.core.frame import DataFrame
212-
213-
import unittest
214-
import numpy as np
215-
216-
from pandas import Series
217-
218-
from numpy.random import rand
219-
220-
from pandas.util.testing import assert_series_equal, assert_frame_equal
221-
222-
from pandas.core.groupby import Tinterval
223-
from pandas.core.datetools import Minute, BDay, Timestamp
224-
225-
try:
226-
import pytz
227-
except ImportError:
228-
pass
229-
230257
def _skip_if_no_pytz():
231258
try:
232259
import pytz
@@ -900,6 +927,7 @@ def test_dayofmonthoffset(self):
900927
self.assert_(t.weekday() == day)
901928

902929

930+
903931
if __name__ == '__main__':
904932
nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'],
905933
exit=False)

0 commit comments

Comments
 (0)