Skip to content

Commit e6cdd46

Browse files
committed
Merge pull request #3595 from jreback/combine_first_timestamp
BUG: (GH3593) fixed a bug in the incorrect conversion of datetime64[ns] in combine_first
2 parents 5e9db38 + 6b5ca31 commit e6cdd46

File tree

7 files changed

+119
-14
lines changed

7 files changed

+119
-14
lines changed

RELEASE.rst

+2
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@ pandas 0.11.1
104104
- ``combine_first`` not returning the same dtype in cases where it can (GH3552_)
105105
- Fixed bug with ``Panel.transpose`` argument aliases (GH3556_)
106106
- Fixed platform bug in ``PeriodIndex.take`` (GH3579_)
107+
- Fixed bug in incorrect conversion of datetime64[ns] in ``combine_first`` (GH3593_)
107108
- Fixed bug in reset_index with ``NaN`` in a multi-index (GH3586_)
108109

109110
.. _GH3164: https://github.com/pydata/pandas/issues/3164
@@ -145,6 +146,7 @@ pandas 0.11.1
145146
.. _GH3586: https://github.com/pydata/pandas/issues/3586
146147
.. _GH3493: https://github.com/pydata/pandas/issues/3493
147148
.. _GH3579: https://github.com/pydata/pandas/issues/3579
149+
.. _GH3593: https://github.com/pydata/pandas/issues/3593
148150
.. _GH3556: https://github.com/pydata/pandas/issues/3556
149151

150152

pandas/core/common.py

+34
Original file line numberDiff line numberDiff line change
@@ -921,6 +921,33 @@ def _possibly_downcast_to_dtype(result, dtype):
921921

922922
return result
923923

924+
def _lcd_dtypes(a_dtype, b_dtype):
    """
    Return the lowest-common-denominator dtype able to hold both dtypes.

    Parameters
    ----------
    a_dtype, b_dtype : dtype
        The two dtypes to reconcile.  ``itemsize`` is read from both when
        they are both integer or both float.

    Returns
    -------
    dtype or type
        Resolved in this order:

        * either side datetime64  -> ``_NS_DTYPE``
        * either side timedelta64 -> ``_TD_DTYPE``
        * complex with complex    -> ``a_dtype``; complex with anything
          else -> ``np.float64``
        * integer with integer    -> ``a_dtype`` when the item sizes match,
          otherwise ``np.int64``; integer with anything else -> ``np.float64``
        * float with float        -> ``a_dtype`` when the item sizes match,
          otherwise ``np.float64``; float with integer -> ``np.float64``
        * everything else         -> ``np.object_``
    """
    if is_datetime64_dtype(a_dtype) or is_datetime64_dtype(b_dtype):
        return _NS_DTYPE

    if is_timedelta64_dtype(a_dtype) or is_timedelta64_dtype(b_dtype):
        return _TD_DTYPE

    if is_complex_dtype(a_dtype):
        if is_complex_dtype(b_dtype):
            return a_dtype
        # NOTE(review): float64 cannot hold an imaginary part; behaviour is
        # kept as originally written -- confirm this is intended.
        return np.float64

    if is_integer_dtype(a_dtype):
        if is_integer_dtype(b_dtype):
            if a_dtype.itemsize == b_dtype.itemsize:
                return a_dtype
            return np.int64
        return np.float64

    if is_float_dtype(a_dtype):
        if is_float_dtype(b_dtype):
            if a_dtype.itemsize == b_dtype.itemsize:
                return a_dtype
            return np.float64
        # A dtype check is required here: the scalar check ``is_integer``
        # is never true for a dtype object, which made float/integer pairs
        # fall through to object.
        if is_integer_dtype(b_dtype):
            return np.float64

    # ``np.object_`` rather than the ``np.object`` alias, which newer NumPy
    # releases no longer provide.
    return np.object_
950+
924951
def _interp_wrapper(f, wrap_dtype, na_override=None):
925952
def wrapper(arr, mask, limit=None):
926953
view = arr.view(wrap_dtype)
@@ -1524,6 +1551,13 @@ def is_float_dtype(arr_or_dtype):
15241551
tipo = arr_or_dtype.dtype.type
15251552
return issubclass(tipo, np.floating)
15261553

1554+
def is_complex_dtype(arr_or_dtype):
    """
    Tell whether a dtype, or an object carrying a ``dtype`` attribute,
    has a scalar type that subclasses ``np.complexfloating``.
    """
    # a bare np.dtype is used directly; anything else must expose ``.dtype``
    dtype = arr_or_dtype if isinstance(arr_or_dtype, np.dtype) else arr_or_dtype.dtype
    return issubclass(dtype.type, np.complexfloating)
1560+
15271561

15281562
def is_list_like(arg):
15291563
return hasattr(arg, '__iter__') and not isinstance(arg, basestring) or hasattr(arg,'len')

pandas/core/frame.py

+41-6
Original file line numberDiff line numberDiff line change
@@ -3738,8 +3738,11 @@ def combine(self, other, func, fill_value=None, overwrite=True):
37383738

37393739
result = {}
37403740
for col in new_columns:
3741-
series = this[col].values
3742-
otherSeries = other[col].values
3741+
series = this[col]
3742+
otherSeries = other[col]
3743+
3744+
this_dtype = series.dtype
3745+
other_dtype = otherSeries.dtype
37433746

37443747
this_mask = isnull(series)
37453748
other_mask = isnull(otherSeries)
@@ -3756,18 +3759,40 @@ def combine(self, other, func, fill_value=None, overwrite=True):
37563759
series[this_mask] = fill_value
37573760
otherSeries[other_mask] = fill_value
37583761

3759-
arr = func(series, otherSeries)
3762+
# if we have different dtypes, possibly promote
3763+
new_dtype = this_dtype
3764+
if this_dtype != other_dtype:
3765+
new_dtype = com._lcd_dtypes(this_dtype,other_dtype)
3766+
series = series.astype(new_dtype)
3767+
otherSeries = otherSeries.astype(new_dtype)
3768+
3769+
# see if we need to be represented as i8 (datetimelike)
3770+
# try to keep us at this dtype
3771+
needs_i8_conversion = com.needs_i8_conversion(new_dtype)
3772+
if needs_i8_conversion:
3773+
this_dtype = new_dtype
3774+
arr = func(series, otherSeries, True)
3775+
else:
3776+
arr = func(series, otherSeries)
37603777

37613778
if do_fill:
37623779
arr = com.ensure_float(arr)
37633780
arr[this_mask & other_mask] = NA
37643781

3782+
# try to downcast back to the original dtype
3783+
if needs_i8_conversion:
3784+
arr = com._possibly_cast_to_datetime(arr, this_dtype)
3785+
else:
3786+
arr = com._possibly_downcast_to_dtype(arr, this_dtype)
3787+
37653788
result[col] = arr
37663789

37673790
# convert_objects just in case
37683791
return self._constructor(result,
37693792
index=new_index,
3770-
columns=new_columns).convert_objects(copy=False)
3793+
columns=new_columns).convert_objects(
3794+
convert_dates=True,
3795+
copy=False)
37713796

37723797
def combine_first(self, other):
37733798
"""
@@ -3788,8 +3813,18 @@ def combine_first(self, other):
37883813
-------
37893814
combined : DataFrame
37903815
"""
3791-
def combiner(x, y):
3792-
return expressions.where(isnull(x), y, x, raise_on_error=True)
3816+
def combiner(x, y, needs_i8_conversion=False):
    """
    Take values from ``y`` wherever ``x`` is null, otherwise from ``x``.

    Each argument is unwrapped through its ``values`` attribute when it has
    one.  With ``needs_i8_conversion`` set, the null mask is taken from
    ``x`` itself and both sides are handed over as ``'i8'`` views; otherwise
    the mask is taken from the unwrapped values of ``x``.
    """
    left = x.values if hasattr(x, 'values') else x
    right = y.values if hasattr(y, 'values') else y

    if not needs_i8_conversion:
        missing = isnull(left)
        return expressions.where(missing, right, left, raise_on_error=True)

    # mask from ``x`` as passed in, before reinterpreting the data as int64
    missing = isnull(x)
    left = left.view('i8')
    right = right.view('i8')
    return expressions.where(missing, right, left, raise_on_error=True)
3827+
37933828
return self.combine(other, combiner, overwrite=False)
37943829

37953830
def update(self, other, join='left', overwrite=True, filter_func=None,

pandas/core/internals.py

+13-3
Original file line numberDiff line numberDiff line change
@@ -258,14 +258,15 @@ def downcast(self, dtypes = None):
258258

259259
return blocks
260260

261-
def astype(self, dtype, copy = True, raise_on_error = True):
261+
def astype(self, dtype, copy = True, raise_on_error = True, values = None):
262262
"""
263263
Coerce to the new type (if copy=True, return a new copy)
264264
raise on an exception if raise_on_error == True
265265
"""
266266
try:
267-
newb = make_block(com._astype_nansafe(self.values, dtype, copy = copy),
268-
self.items, self.ref_items, fastpath=True)
267+
if values is None:
268+
values = com._astype_nansafe(self.values, dtype, copy = copy)
269+
newb = make_block(values, self.items, self.ref_items, fastpath=True)
269270
except:
270271
if raise_on_error is True:
271272
raise
@@ -708,6 +709,15 @@ def is_bool(self):
708709
""" we can be a bool if we have only bool values but are of type object """
709710
return lib.is_bool_array(self.values.ravel())
710711

712+
def astype(self, dtype, copy=True, raise_on_error=True, values=None):
    """
    Allow astype to datetime64[ns] / timedelta64[ns] with coercion.

    Parameters
    ----------
    dtype : anything accepted by ``np.dtype``
    copy, raise_on_error : passed through to the parent ``astype``
    values : overwritten in both branches below, so a caller-supplied
        value has no effect here

    Returns
    -------
    Whatever the parent class ``astype`` returns.
    """
    dtype = np.dtype(dtype)
    if dtype == _NS_DTYPE or dtype == _TD_DTYPE:
        # Coerce the object values up front and hand them to the parent.
        # Uses ``_possibly_cast_to_datetime``, the helper the other
        # datetime64/timedelta64 conversion paths call.
        values = com._possibly_cast_to_datetime(self.values, dtype)
    else:
        values = None
    return super(ObjectBlock, self).astype(dtype=dtype, copy=copy,
                                           raise_on_error=raise_on_error,
                                           values=values)
720+
711721
def convert(self, convert_dates = True, convert_numeric = True, copy = True):
712722
""" attempt to coerce any object types to better types
713723
return a copy of the block (if copy = True)

pandas/core/series.py

+9-4
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,8 @@
1717
from pandas.core.common import (isnull, notnull, _is_bool_indexer,
1818
_default_index, _maybe_promote, _maybe_upcast,
1919
_asarray_tuplesafe, is_integer_dtype,
20-
_infer_dtype_from_scalar, is_list_like)
20+
_infer_dtype_from_scalar, is_list_like,
21+
_NS_DTYPE, _TD_DTYPE)
2122
from pandas.core.index import (Index, MultiIndex, InvalidIndexError,
2223
_ensure_index, _handle_legacy_indexes)
2324
from pandas.core.indexing import _SeriesIndexer, _check_bool_indexer, _check_slice_bounds
@@ -929,9 +930,13 @@ def astype(self, dtype):
929930
"""
930931
See numpy.ndarray.astype
931932
"""
932-
casted = com._astype_nansafe(self.values, dtype)
933-
return self._constructor(casted, index=self.index, name=self.name,
934-
dtype=casted.dtype)
933+
dtype = np.dtype(dtype)
934+
if dtype == _NS_DTYPE or dtype == _TD_DTYPE:
935+
values = com._possibly_cast_to_datetime(self.values,dtype)
936+
else:
937+
values = com._astype_nansafe(self.values, dtype)
938+
return self._constructor(values, index=self.index, name=self.name,
939+
dtype=values.dtype)
935940

936941
def convert_objects(self, convert_dates=True, convert_numeric=True, copy=True):
937942
"""

pandas/tests/test_frame.py

+19
Original file line numberDiff line numberDiff line change
@@ -7907,6 +7907,25 @@ def test_combine_first_mixed_bug(self):
79077907
expected = Series([True,True,False])
79087908
assert_series_equal(result,expected)
79097909

7910+
# GH 3593, converting datetime64[ns] incorrectly
7911+
df0 = DataFrame({"a":[datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)]})
7912+
df1 = DataFrame({"a":[None, None, None]})
7913+
df2 = df1.combine_first(df0)
7914+
assert_frame_equal(df2,df0)
7915+
7916+
df2 = df0.combine_first(df1)
7917+
assert_frame_equal(df2,df0)
7918+
7919+
df0 = DataFrame({"a":[datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)]})
7920+
df1 = DataFrame({"a":[datetime(2000, 1, 2), None, None]})
7921+
df2 = df1.combine_first(df0)
7922+
result = df0.copy()
7923+
result.iloc[0,:] = df1.iloc[0,:]
7924+
assert_frame_equal(df2,result)
7925+
7926+
df2 = df0.combine_first(df1)
7927+
assert_frame_equal(df2,df0)
7928+
79107929
def test_update(self):
79117930
df = DataFrame([[1.5, nan, 3.],
79127931
[1.5, nan, 3.],

pandas/tests/test_series.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1856,7 +1856,7 @@ def test_operators_timedelta64(self):
18561856
v1 = date_range('2012-1-1', periods=3, freq='D')
18571857
v2 = date_range('2012-1-2', periods=3, freq='D')
18581858
rs = Series(v2) - Series(v1)
1859-
xp = Series(1e9 * 3600 * 24, rs.index).astype('timedelta64[ns]')
1859+
xp = Series(1e9 * 3600 * 24, rs.index).astype('int64').astype('timedelta64[ns]')
18601860
assert_series_equal(rs, xp)
18611861
self.assert_(rs.dtype=='timedelta64[ns]')
18621862

0 commit comments

Comments
 (0)