Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions databricks/koalas/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,14 @@ def wrapper(self, *args):
args = [arg._scol if isinstance(arg, IndexOpsMixin) else arg for arg in args]
scol = f(self._scol, *args)

# If f is a logistic operator, fill NULL with False
log_ops = ['eq', 'ne', 'lt', 'le', 'ge', 'gt']
is_log_op = any(f == getattr(spark.Column, '__{}__'.format(log_op))
for log_op in log_ops)
if is_log_op:
filler = f == spark.Column.__ne__
scol = F.when(scol.isNull(), filler).otherwise(scol)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@HyukjinKwon @ueshin (cc @itholic @charlesdong1991 )
Not sure if this is the right implementation ...

Copy link
Contributor

@itholic itholic Nov 8, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i'm not pretty sure, but maybe is scol = F.when(scol.isNull(), False).otherwise(True) correct?

Because when type of scol is not boolean, it may raises AnalysisException due to data type mismatch between False and scol

Copy link
Contributor

@itholic itholic Nov 8, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, and i think maybe it should be better to make test of this if you available 😄
ah it's already tested :)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@itholic Thanks! I haven't fixed the tests yet because I want to make sure first if this is the right implementation.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually the problem is not that simple. E.g.,:

>>> pser == 0
0     True
1    False
2    False
3    False
dtype: bool
>>> pser != 0
0    False
1     True
2     True
3     True
dtype: bool

and

>>> ks.from_pandas(pser) == 0
0     True
1    False
2    False
3     None
Name: 0, dtype: object
>>> ks.from_pandas(pser) != 0
0    False
1     True
2     True
3     None
Name: 0, dtype: object

If we simply convert None to False, the != case is not the same as pandas'.

We might need to add some logic one-by-one.

Copy link
Contributor Author

@harupy harupy Nov 8, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ueshin @charlesdong1991

Looks like this is how pandas should behave.

https://github.com/pandas-dev/pandas/blob/master/pandas/tests/series/test_operators.py#L447-L452

# True | np.nan => True
exp_or1 = pd.Series([True, True, True, False], index=list("ABCD"), name="x")
tm.assert_series_equal(s1 | s2, exp_or1)
# np.nan | True => np.nan, filled with False
exp_or = pd.Series([True, True, False, False], index=list("ABCD"), name="x")
tm.assert_series_equal(s2 | s1, exp_or)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ahh, then it behaves this way on purpose in pandas. thanks for looking at source code to clear things up @harupy

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's discuss & and | and address if needed in a separate issue/PR.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think & and | should be addressed in a separate PR.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seems like pandas tries to follow the boolean operations rules in Python (or and and) - https://docs.python.org/3/library/stdtypes.html#boolean-operations-and-or-not

# True | np.nan => True
# np.nan | True => np.nan

Then, ^ could make sense.

However, I am still kind of lost about "np.nan, filled with False" because bool(np.nan) is coerced to True, e.g.)

>>> pd.Series([float("nan")]).astype("bool")
0    True
dtype: bool

return self._with_new_scol(scol)
else:
# Different DataFrame anchors
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seems like we should apply the updates for this branch as well?

>>> ks.options.compute.ops_on_diff_frames = True
>>> s1 = pd.Series([True, False, True], index=list("ABC"), name="x")
>>> s2 = pd.Series([True, True, False], index=list("ABD"), name="x")
>>> s1
A     True
B    False
C     True
Name: x, dtype: bool
>>> s2
A     True
B     True
D    False
Name: x, dtype: bool
>>> s1 | s2
A     True
B     True
C     True
D    False
Name: x, dtype: bool
>>> (ks.from_pandas(s1) | ks.from_pandas(s2)).sort_index()
A    True
B    True
C    True
D    None
Name: x, dtype: object

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ueshin Thank you for catching this. I'm working on this.

Expand Down
42 changes: 21 additions & 21 deletions databricks/koalas/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -749,11 +749,11 @@ def eq(self, other):
... index=['a', 'b', 'c', 'd'], columns=['a', 'b'])

>>> df.eq(1)
a b
a True True
b False None
c False True
d False None
a b
a True True
b False False
c False True
d False False
"""
return self == other

Expand All @@ -770,9 +770,9 @@ def gt(self, other):
>>> df.gt(2)
a b
a False False
b False None
b False False
c True False
d True None
d True False
"""
return self > other

Expand All @@ -785,11 +785,11 @@ def ge(self, other):
... index=['a', 'b', 'c', 'd'], columns=['a', 'b'])

>>> df.ge(1)
a b
a True True
b True None
c True True
d True None
a b
a True True
b True False
c True True
d True False
"""
return self >= other

Expand All @@ -804,9 +804,9 @@ def lt(self, other):
>>> df.lt(1)
a b
a False False
b False None
b False False
c False False
d False None
d False False
"""
return self < other

Expand All @@ -819,11 +819,11 @@ def le(self, other):
... index=['a', 'b', 'c', 'd'], columns=['a', 'b'])

>>> df.le(2)
a b
a True True
b True None
c False True
d False None
a b
a True True
b True False
c False True
d False False
"""
return self <= other

Expand All @@ -838,9 +838,9 @@ def ne(self, other):
>>> df.ne(1)
a b
a False False
b True None
b True True
c True False
d True None
d True True
"""
return self != other

Expand Down
48 changes: 24 additions & 24 deletions databricks/koalas/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -539,11 +539,11 @@ def eq(self, other):
Name: a, dtype: bool

>>> df.b.eq(1)
a True
b None
c True
d None
Name: b, dtype: object
a True
b False
c True
d False
Name: b, dtype: bool
"""
return (self == other).rename(self.name)

Expand All @@ -566,10 +566,10 @@ def gt(self, other):

>>> df.b.gt(1)
a False
b None
b False
c False
d None
Name: b, dtype: object
d False
Name: b, dtype: bool
"""
return (self > other).rename(self.name)

Expand All @@ -590,10 +590,10 @@ def ge(self, other):

>>> df.b.ge(2)
a False
b None
b False
c False
d None
Name: b, dtype: object
d False
Name: b, dtype: bool
"""
return (self >= other).rename(self.name)

Expand All @@ -613,11 +613,11 @@ def lt(self, other):
Name: a, dtype: bool

>>> df.b.lt(2)
a True
b None
c True
d None
Name: b, dtype: object
a True
b False
c True
d False
Name: b, dtype: bool
"""
return (self < other).rename(self.name)

Expand All @@ -637,11 +637,11 @@ def le(self, other):
Name: a, dtype: bool

>>> df.b.le(2)
a True
b None
c True
d None
Name: b, dtype: object
a True
b False
c True
d False
Name: b, dtype: bool
"""
return (self <= other).rename(self.name)

Expand All @@ -662,10 +662,10 @@ def ne(self, other):

>>> df.b.ne(1)
a False
b None
b True
c False
d None
Name: b, dtype: object
d True
Name: b, dtype: bool
"""
return (self != other).rename(self.name)

Expand Down