Commit b5676f3
Reduce the number of joins for mod/rmod. (#1409)
This is a follow-up of #1399. When performing mod/rmod, if the operands are Series from different DataFrames, we previously needed three joins.

```py
>>> kser = ks.Series([100, None, -300, None, 500, -700], name="Koalas")
>>> (kser % ks.Series([150] * 6)).to_frame().explain()
== Physical Plan ==
*(9) Project [CASE WHEN isnotnull(__index_level_0__#317L) THEN __index_level_0__#317L ELSE __index_level_0__#228L END AS __index_level_0__#378L, (Koalas#364 % cast(0#229L as double)) AS Koalas#425]
+- SortMergeJoin [__index_level_0__#317L], [__index_level_0__#228L], FullOuter
   :- *(7) Sort [__index_level_0__#317L ASC NULLS FIRST], false, 0
   :  +- Exchange hashpartitioning(__index_level_0__#317L, 200)
   :     +- *(6) Project [CASE WHEN isnotnull(__index_level_0__#254L) THEN __index_level_0__#254L ELSE __index_level_0__#228L END AS __index_level_0__#317L, (Koalas#303 + cast(0#229L as double)) AS Koalas#364]
   :        +- SortMergeJoin [__index_level_0__#254L], [__index_level_0__#228L], FullOuter
   :           :- *(4) Sort [__index_level_0__#254L ASC NULLS FIRST], false, 0
   :           :  +- Exchange hashpartitioning(__index_level_0__#254L, 200)
   :           :     +- *(3) Project [CASE WHEN isnotnull(__index_level_0__#0L) THEN __index_level_0__#0L ELSE __index_level_0__#228L END AS __index_level_0__#254L, (Koalas#1 % cast(0#229L as double)) AS Koalas#303]
   :           :        +- SortMergeJoin [__index_level_0__#0L], [__index_level_0__#228L], FullOuter
   :           :           :- *(1) Sort [__index_level_0__#0L ASC NULLS FIRST], false, 0
   :           :           :  +- Exchange hashpartitioning(__index_level_0__#0L, 200)
   :           :           :     +- Scan ExistingRDD[__index_level_0__#0L,Koalas#1]
   :           :           +- *(2) Sort [__index_level_0__#228L ASC NULLS FIRST], false, 0
   :           :              +- Exchange hashpartitioning(__index_level_0__#228L, 200)
   :           :                 +- Scan ExistingRDD[__index_level_0__#228L,0#229L]
   :           +- *(5) Sort [__index_level_0__#228L ASC NULLS FIRST], false, 0
   :              +- ReusedExchange [__index_level_0__#228L, 0#229L], Exchange hashpartitioning(__index_level_0__#228L, 200)
   +- *(8) Sort [__index_level_0__#228L ASC NULLS FIRST], false, 0
      +- ReusedExchange [__index_level_0__#228L, 0#229L], Exchange hashpartitioning(__index_level_0__#228L, 200)
```

We can reduce the number of joins to one.

```py
>>> (kser % ks.Series([150] * 6)).to_frame().explain()
== Physical Plan ==
*(3) Project [CASE WHEN isnotnull(__index_level_0__#0L) THEN __index_level_0__#0L ELSE __index_level_0__#98L END AS __index_level_0__#118L, (((Koalas#1 % cast(0#99L as double)) + cast(0#99L as double)) % cast(0#99L as double)) AS Koalas#165]
+- SortMergeJoin [__index_level_0__#0L], [__index_level_0__#98L], FullOuter
   :- *(1) Sort [__index_level_0__#0L ASC NULLS FIRST], false, 0
   :  +- Exchange hashpartitioning(__index_level_0__#0L, 200)
   :     +- Scan ExistingRDD[__index_level_0__#0L,Koalas#1]
   +- *(2) Sort [__index_level_0__#98L ASC NULLS FIRST], false, 0
      +- Exchange hashpartitioning(__index_level_0__#98L, 200)
         +- Scan ExistingRDD[__index_level_0__#98L,0#99L]
```
1 parent 2554970 commit b5676f3
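
For context, the `((left % right) + right) % right` expression exists because Spark's `%` keeps the sign of the dividend (Java semantics), while pandas follows Python's convention of keeping the sign of the divisor. Below is a minimal plain-Python sketch of that equivalence, using `math.fmod` as a stand-in for Spark's `%` (the stand-in and the helper names are assumptions for illustration, not koalas code):

```py
import math

def spark_style_mod(a, b):
    # Spark SQL's % keeps the sign of the dividend, as Java does;
    # math.fmod behaves the same way for these inputs, so it stands in here.
    return math.fmod(a, b)

def pandas_style_mod(a, b):
    # The rewrite used by __mod__: ((left % right) + right) % right,
    # evaluated entirely with the Spark-style operator.
    return spark_style_mod(spark_style_mod(a, b) + b, b)

# pandas/Python keep the sign of the divisor; the rewrite reproduces that.
for a, b in [(100, -150), (-300, 150), (500, -150), (-700, 150)]:
    assert pandas_style_mod(a, b) == a % b
```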

File tree

3 files changed (+49, -11 lines)

databricks/koalas/base.py

Lines changed: 9 additions & 11 deletions
```diff
@@ -23,7 +23,7 @@
 
 import numpy as np
 import pandas as pd
-from pandas.api.types import is_list_like, is_scalar
+from pandas.api.types import is_list_like
 from pyspark import sql as spark
 from pyspark.sql import functions as F, Window
 from pyspark.sql.types import DoubleType, FloatType, LongType, StringType, TimestampType
@@ -188,11 +188,10 @@ def __sub__(self, other):
     __truediv__ = _numpy_column_op(spark.Column.__truediv__)
 
     def __mod__(self, other):
-        if is_scalar(other):
-            return self._with_new_scol((self._scol % other + other) % other)
-        else:
-            result_spark = _column_op(spark.Column.__mod__)(self, other)
-            return _column_op(spark.Column.__mod__)(result_spark + other, other)
+        def mod(left, right):
+            return ((left % right) + right) % right
+
+        return _column_op(mod)(self, other)
 
     def __radd__(self, other):
         # Handle 'literal' + df['col']
@@ -217,11 +216,10 @@ def __rfloordiv__(self, other):
         )
 
     def __rmod__(self, other):
-        if is_scalar(other):
-            return self._with_new_scol((other % self._scol + self._scol) % self._scol)
-        else:
-            result_spark = _column_op(spark.Column.__mod__)(other, self)
-            return _column_op(spark.Column.__mod__)(result_spark + self, self)
+        def rmod(left, right):
+            return ((right % left) + left) % left
+
+        return _column_op(rmod)(self, other)
 
     __pow__ = _column_op(spark.Column.__pow__)
     __rpow__ = _column_op(spark.Column.__rpow__)
```
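
Roughly, the win comes from composing the whole `((left % right) + right) % right` rewrite into a single Spark column expression before alignment, so the two Series are joined on their index only once instead of once per chained `%` operation. A minimal PySpark sketch of the resulting plan shape (the DataFrames, column names, and explicit join below are illustrative assumptions, not koalas internals):

```py
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master("local[2]").appName("mod-sketch").getOrCreate()

left = spark.createDataFrame([(0, 100.0), (1, None), (2, -300.0)], ["idx", "Koalas"])
right = spark.createDataFrame([(0, 150), (1, 150), (2, 150)], ["idx", "val"])

# Align the two "Series" on the index once, then compute the whole
# ((l % r) + r) % r expression as a single projected column.
joined = left.join(right, "idx", "full_outer")
l, r = F.col("Koalas"), F.col("val")
result = joined.select("idx", (((l % r) + r) % r).alias("Koalas"))
result.explain()  # one join in the plan, instead of one per chained % operation
```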

databricks/koalas/tests/test_ops_on_diff_frames.py

Lines changed: 32 additions & 0 deletions
```diff
@@ -310,6 +310,38 @@ def test_arithmetic_chain(self):
             (kser1 + kser2 * kser3).sort_index(), (pser1 + pser2 * pser3).sort_index(), almost=True
         )
 
+    def test_mod(self):
+        pser = pd.Series([100, None, -300, None, 500, -700], name="Koalas")
+        pser_other = pd.Series([-150] * 6)
+        kser = ks.from_pandas(pser)
+        kser_other = ks.from_pandas(pser_other)
+
+        self.assert_eq(
+            repr(kser.mod(kser_other).sort_index()), repr(pser.mod(pser_other).rename("Koalas"))
+        )
+        self.assert_eq(
+            repr(kser.mod(kser_other).sort_index()), repr(pser.mod(pser_other).rename("Koalas"))
+        )
+        self.assert_eq(
+            repr(kser.mod(kser_other).sort_index()), repr(pser.mod(pser_other).rename("Koalas"))
+        )
+
+    def test_rmod(self):
+        pser = pd.Series([100, None, -300, None, 500, -700], name="Koalas")
+        pser_other = pd.Series([-150] * 6)
+        kser = ks.from_pandas(pser)
+        kser_other = ks.from_pandas(pser_other)
+
+        self.assert_eq(
+            repr(kser.rmod(kser_other).sort_index()), repr(pser.rmod(pser_other).rename("Koalas"))
+        )
+        self.assert_eq(
+            repr(kser.rmod(kser_other).sort_index()), repr(pser.rmod(pser_other).rename("Koalas"))
+        )
+        self.assert_eq(
+            repr(kser.rmod(kser_other).sort_index()), repr(pser.rmod(pser_other).rename("Koalas"))
+        )
+
     def test_getitem_boolean_series(self):
         pdf1 = pd.DataFrame(
             {"A": [0, 1, 2, 3, 4], "B": [100, 200, 300, 400, 500]}, index=[20, 10, 30, 0, 50]
```

databricks/koalas/tests/test_series.py

Lines changed: 8 additions & 0 deletions
```diff
@@ -1381,6 +1381,10 @@ def test_mod(self):
         self.assert_eq(repr(kser.mod(0)), repr(pser.mod(0)))
         self.assert_eq(repr(kser.mod(150)), repr(pser.mod(150)))
 
+        pdf = pd.DataFrame({"a": [100, None, -300, None, 500, -700], "b": [150] * 6})
+        kdf = ks.from_pandas(pdf)
+        self.assert_eq(repr(kdf.a.mod(kdf.b)), repr(pdf.a.mod(pdf.b).rename("a")))
+
     def test_rmod(self):
         pser = pd.Series([100, None, -300, None, 500, -700], name="Koalas")
         kser = ks.from_pandas(pser)
@@ -1389,6 +1393,10 @@ def test_rmod(self):
         self.assert_eq(repr(kser.rmod(0)), repr(pser.rmod(0)))
         self.assert_eq(repr(kser.rmod(150)), repr(pser.rmod(150)))
 
+        pdf = pd.DataFrame({"a": [100, None, -300, None, 500, -700], "b": [150] * 6})
+        kdf = ks.from_pandas(pdf)
+        self.assert_eq(repr(kdf.a.rmod(kdf.b)), repr(pdf.a.rmod(pdf.b).rename("a")))
+
     def test_asof(self):
         pser = pd.Series([1, 2, np.nan, 4], index=[10, 20, 30, 40], name="Koalas")
         kser = ks.from_pandas(pser)
```
