Skip to content

Commit 49f8516

Browse files
icexelloss authored and HyukjinKwon committed
Add dt functions to koalas Series
1 parent 314aedc commit 49f8516

File tree

7 files changed

+285
-4
lines changed

7 files changed

+285
-4
lines changed

databricks/koalas/base.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
to_arrow_type
3131

3232
from databricks import koalas as ks # For running doctests and reference resolution in PyCharm.
33+
from databricks.koalas.typedef import pandas_wraps
3334

3435

3536
def _column_op(f):
@@ -74,6 +75,25 @@ def wrapper(self, *args):
7475
return wrapper
7576

7677

78+
def _wrap_accessor_spark(accessor, fn, return_type=None):
    """
    Wrap an accessor property or method, e.g., Series.dt.date, with a Spark function.

    :param accessor: the accessor object holding the underlying Series in `_data`.
    :param fn: a function mapping a Spark column to a Spark column.
    :param return_type: optional Spark type to cast the result column to.
    """
    # No cast requested: apply the column function directly.
    if not return_type:
        return _column_op(fn)(accessor._data)
    return _column_op(lambda col: fn(col).cast(return_type))(accessor._data)
88+
89+
90+
def _wrap_accessor_pandas(accessor, fn, return_type):
    """
    Wrap an accessor property or method, e.g., Series.dt.date, with a pandas function.

    :param accessor: the accessor object holding the underlying Series in `_data`.
    :param fn: a function mapping a pandas Series to a pandas Series.
    :param return_type: Spark type of the resulting column.
    """
    wrapped = pandas_wraps(fn, return_col=return_type)
    return wrapped(accessor._data)
95+
96+
7797
class IndexOpsMixin(object):
7898
"""common ops mixin to support a unified interface / docs for Series / Index
7999

databricks/koalas/datetime.py

Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
#
2+
# Copyright (C) 2019 Databricks, Inc.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
#
16+
17+
"""
18+
Date/Time related functions on Koalas Series
19+
"""
20+
import pyspark.sql.functions as F
21+
22+
from pyspark.sql.types import DateType, TimestampType, LongType, StringType
23+
from databricks.koalas.base import (
24+
_wrap_accessor_pandas,
25+
_wrap_accessor_spark
26+
)
27+
28+
import databricks.koalas as ks
29+
30+
from databricks.koalas.utils import lazy_property
31+
32+
33+
class DatetimeMethods(object):
    """Date/Time methods for Koalas Series.

    Accessed through ``Series.dt``. The wrapped Series must be backed by a
    Spark ``DateType`` or ``TimestampType`` column; any other type raises
    ``ValueError`` at construction time.
    """

    def __init__(self, series):
        if not isinstance(series.spark_type, (DateType, TimestampType)):
            raise ValueError(
                "Cannot call DatetimeMethods on type {}"
                .format(series.spark_type))
        self._data = series

    # Properties
    @lazy_property
    def date(self) -> ks.Series:
        """
        The date part of the datetime.
        """
        # TODO: Hit a weird exception
        # syntax error in attribute name: `to_date(`start_date`)` with alias
        return _wrap_accessor_spark(
            self, lambda col: F.to_date(col).alias('date')
        )

    @lazy_property
    def time(self) -> ks.Series:
        """The time part of the datetime. Not implemented yet."""
        raise NotImplementedError()

    @lazy_property
    def timetz(self) -> ks.Series:
        """The time part of the datetime with timezone. Not implemented yet."""
        raise NotImplementedError()

    @lazy_property
    def year(self) -> ks.Series:
        """
        The year of the datetime.
        """
        return _wrap_accessor_spark(self, F.year, LongType())

    @lazy_property
    def month(self) -> ks.Series:
        """
        The month of the timestamp as January = 1 December = 12.
        """
        return _wrap_accessor_spark(self, F.month, LongType())

    @lazy_property
    def day(self) -> ks.Series:
        """
        The days of the datetime.
        """
        return _wrap_accessor_spark(self, F.dayofmonth, LongType())

    @lazy_property
    def hour(self) -> ks.Series:
        """
        The hours of the datetime.
        """
        return _wrap_accessor_spark(self, F.hour, LongType())

    @lazy_property
    def minute(self) -> ks.Series:
        """
        The minutes of the datetime.
        """
        return _wrap_accessor_spark(self, F.minute, LongType())

    @lazy_property
    def second(self) -> ks.Series:
        """
        The seconds of the datetime.
        """
        return _wrap_accessor_spark(self, F.second, LongType())

    @lazy_property
    def millisecond(self) -> ks.Series:
        """
        The milliseconds of the datetime.
        """
        # NOTE(review): this relies on `Series.dt.millisecond`, which recent
        # pandas versions do not appear to provide, and no test covers this
        # property — confirm against the supported pandas version.
        return _wrap_accessor_pandas(
            self, lambda x: x.dt.millisecond, LongType())

    @lazy_property
    def microsecond(self) -> ks.Series:
        """
        The microseconds of the datetime.
        """
        return _wrap_accessor_pandas(
            self, lambda x: x.dt.microsecond, LongType())

    @lazy_property
    def nanosecond(self) -> ks.Series:
        """The nanoseconds of the datetime. Not implemented yet."""
        raise NotImplementedError()

    @lazy_property
    def week(self) -> ks.Series:
        """
        The week ordinal of the year.
        """
        return _wrap_accessor_spark(self, F.weekofyear, LongType())

    @lazy_property
    def weekofyear(self) -> ks.Series:
        """
        The week ordinal of the year.
        """
        return _wrap_accessor_spark(self, F.weekofyear, LongType())

    @lazy_property
    def dayofweek(self) -> ks.Series:
        """
        The day of the week with Monday=0, Sunday=6.
        """
        return _wrap_accessor_pandas(self, lambda s: s.dt.dayofweek, LongType())

    @lazy_property
    def dayofyear(self) -> ks.Series:
        """
        The day ordinal of the year.
        """
        return _wrap_accessor_pandas(self, lambda s: s.dt.dayofyear, LongType())

    # Methods
    def strftime(self, date_format) -> ks.Series:
        """
        Convert to a String Series using specified date_format.
        """
        return _wrap_accessor_pandas(
            self,
            lambda x: x.dt.strftime(date_format),
            StringType()
        )

databricks/koalas/series.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,11 @@ def _with_new_scol(self, scol: spark.Column) -> 'Series':
133133
"""
134134
return Series(scol, anchor=self._kdf, index=self._index_map)
135135

136+
@property
def dt(self):
    """Accessor for datetime-like properties of the Series values."""
    # Imported lazily to avoid a circular import with databricks.koalas.datetime.
    from databricks.koalas.datetime import DatetimeMethods
    return DatetimeMethods(self)
140+
136141
@property
137142
def dtypes(self):
138143
"""Return the dtype object of the underlying data.

databricks/koalas/tests/test_series_datetime.py

Lines changed: 68 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,19 +18,37 @@
1818

1919
import numpy as np
2020
import pandas as pd
21+
import pandas.testing as mt
2122

2223
from databricks import koalas
23-
from databricks.koalas.testing.utils import ReusedSQLTestCase, TestUtils
24+
from databricks.koalas.testing.utils import ReusedSQLTestCase, SQLTestUtils
2425

2526

26-
class SeriesDatetimeTest(ReusedSQLTestCase, TestUtils):
27+
class SeriesDateTimeTest(ReusedSQLTestCase, SQLTestUtils):
2728

2829
@property
def pdf1(self):
    """A pandas DataFrame with two datetime columns used as the test fixture."""
    starts = pd.Series(pd.date_range('2012-1-1 12:00:00', periods=3, freq='M'))
    ends = pd.Series(pd.date_range('2013-3-11 21:45:00', periods=3, freq='W'))
    return pd.DataFrame(dict(start_date=starts, end_date=ends))
3334

35+
@property
def pd_start_date(self):
    """The 'start_date' column of the pandas fixture frame."""
    return self.pdf1['start_date']
38+
39+
@property
def ks_start_date(self):
    """The pandas start-date fixture converted to a Koalas Series."""
    return koalas.from_pandas(self.pd_start_date)
42+
43+
def check_func(self, func):
    """Assert that *func* gives equal results on the Koalas and pandas Series."""
    koalas_result = func(self.ks_start_date).to_pandas()
    pandas_result = func(self.pd_start_date)
    mt.assert_series_equal(koalas_result, pandas_result, check_names=False)
51+
3452
@unittest.skip(
3553
"It fails in certain OSs presumably due to different "
3654
"timezone behaviours inherited from C library.")
@@ -49,3 +67,51 @@ def test_div(self):
4967
self.assert_eq(
5068
(kdf['end_date'] - kdf['start_date']) / duration,
5169
(pdf['end_date'] - pdf['start_date']) / duration)
70+
71+
def test_date(self):
    self.check_func(lambda s: s.dt.date)
73+
74+
def test_time(self):
    with self.assertRaises(NotImplementedError):
        self.check_func(lambda s: s.dt.time)
77+
78+
def test_timetz(self):
    with self.assertRaises(NotImplementedError):
        self.check_func(lambda s: s.dt.timetz)
81+
82+
def test_year(self):
    self.check_func(lambda s: s.dt.year)
84+
85+
def test_month(self):
    self.check_func(lambda s: s.dt.month)
87+
88+
def test_day(self):
    self.check_func(lambda s: s.dt.day)
90+
91+
def test_hour(self):
    self.check_func(lambda s: s.dt.hour)
93+
94+
def test_minute(self):
    self.check_func(lambda s: s.dt.minute)
96+
97+
def test_second(self):
    self.check_func(lambda s: s.dt.second)
99+
100+
def test_microsecond(self):
    self.check_func(lambda s: s.dt.microsecond)
102+
103+
def test_nanosecond(self):
    with self.assertRaises(NotImplementedError):
        self.check_func(lambda s: s.dt.nanosecond)
106+
107+
def test_week(self):
    self.check_func(lambda s: s.dt.week)
109+
110+
def test_weekofyear(self):
    self.check_func(lambda s: s.dt.weekofyear)
112+
113+
def test_dayofweek(self):
    self.check_func(lambda s: s.dt.dayofweek)
115+
116+
def test_strftime(self):
    self.check_func(lambda s: s.dt.strftime('%Y-%m-%d'))

databricks/koalas/typedef.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
Utilities to deal with types. This is mostly focused on python3.
1919
"""
2020
import typing
21+
import datetime
2122
from inspect import getfullargspec
2223
from functools import wraps
2324

@@ -108,6 +109,7 @@ def _to_stype(tpe) -> X:
108109
types.FloatType(): [float, 'float', np.float],
109110
types.DoubleType(): [np.float64, 'float64', 'double'],
110111
types.TimestampType(): [np.datetime64],
112+
types.DateType(): [datetime.date],
111113
types.BooleanType(): [bool, 'boolean', 'bool', np.bool],
112114
}
113115

@@ -361,9 +363,10 @@ def wrapper(*args, **kwargs):
361363
spark_return_type = sig_return.inner
362364
return _make_fun(f, spark_return_type, *args, **kwargs)
363365
return wrapper
364-
if return_col is not None or return_scalar is not None:
366+
if callable(function):
367+
return function_wrapper(function)
368+
else:
365369
return function_wrapper
366-
return function_wrapper(function)
367370

368371

369372
def _infer_return_type(f, return_col_hint=None, return_scalar_hint=None) -> X:

databricks/koalas/utils.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
Commonly used utils in Koalas.
1818
"""
1919

20+
import functools
2021
from typing import Callable, Dict, Union
2122

2223
from pyspark import sql as spark
@@ -90,6 +91,7 @@ def lazy_property(fn):
9091
attr_name = '_lazy_' + fn.__name__
9192

9293
@property
94+
@functools.wraps(fn)
9395
def _lazy_property(self):
9496
if not hasattr(self, attr_name):
9597
setattr(self, attr_name, fn(self))

docs/source/reference/series.rst

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,3 +114,27 @@ Serialization / IO / Conversion
114114
Series.to_json
115115
Series.to_csv
116116
Series.to_excel
117+
118+
Datetime Methods
119+
----------------
120+
Methods accessible through `Series.dt`
121+
122+
.. currentmodule:: databricks.koalas.datetime
123+
.. autosummary::
124+
:toctree: api/
125+
126+
DatetimeMethods.date
127+
DatetimeMethods.year
128+
DatetimeMethods.month
129+
DatetimeMethods.week
130+
DatetimeMethods.weekofyear
131+
DatetimeMethods.day
132+
DatetimeMethods.dayofweek
133+
DatetimeMethods.dayofyear
134+
DatetimeMethods.hour
135+
DatetimeMethods.minute
136+
DatetimeMethods.second
137+
DatetimeMethods.millisecond
138+
DatetimeMethods.microsecond
139+
140+
DatetimeMethods.strftime

0 commit comments

Comments
 (0)