Support DataFrame parameter in Series.dot (#1931)

xinrong-meng · web-flow · commit 901a6f0721a4 · 2020-12-04T11:06:11.000-08:00
diff --git a/databricks/koalas/series.py b/databricks/koalas/series.py
@@ -4721,9 +4721,9 @@ def combine_first(self, other) -> "Series":
             *index_scols, cond.alias(self._internal.data_spark_column_names[0])
         ).distinct()
         internal = self._internal.with_new_sdf(sdf)
-        return first_series(ks.DataFrame(internal))
+        return first_series(DataFrame(internal))
 
-    def dot(self, other) -> Union[Scalar, "Series"]:
+    def dot(self, other: Union["Series", DataFrame]) -> Union[Scalar, "Series"]:
         """
         Compute the dot product between the Series and the columns of other.
 
@@ -4732,7 +4732,7 @@ def dot(self, other) -> Union[Scalar, "Series"]:
 
         It can also be called using `self @ other` in Python >= 3.5.
 
-        .. note:: This API is slightly different from pandas when indexes from both
+        .. note:: This API is slightly different from pandas when indexes from both Series
             are not aligned. To match with pandas', it requires to read the whole data for,
             for example, counting. pandas raises an exception; however, Koalas just proceeds
             and performs by ignoring mismatches with NaN permissively.
@@ -4774,20 +4774,48 @@ def dot(self, other) -> Union[Scalar, "Series"]:
 
         >>> s @ s
         14
+
+        >>> kdf = ks.DataFrame({'x': [0, 1, 2, 3], 'y': [0, -1, -2, -3]})
+        >>> kdf
+           x  y
+        0  0  0
+        1  1 -1
+        2  2 -2
+        3  3 -3
+
+        >>> with ks.option_context("compute.ops_on_diff_frames", True):
+        ...     s.dot(kdf)
+        ...
+        x    14
+        y   -14
+        dtype: int64
         """
         if isinstance(other, DataFrame):
-            raise ValueError(
-                "Series.dot() is currently not supported with DataFrame since "
-                "it will cause expansive calculation as many as the number "
-                "of columns of DataFrame"
-            )
-        if self._kdf is not other._kdf:
-            if len(self.index) != len(other.index):
-                raise ValueError("matrices are not aligned")
-        if isinstance(other, Series):
-            result = (self * other).sum()
+            if not same_anchor(self, other):
+                if not self.index.sort_values().equals(other.index.sort_values()):
+                    raise ValueError("matrices are not aligned")
 
-        return result
+            other = other.copy()
+            column_labels = other._internal.column_labels
+
+            self_column_label = verify_temp_column_name(other, "__self_column__")
+            other[self_column_label] = self
+            self_kser = other._kser_for(self_column_label)
+
+            product_ksers = [other._kser_for(label) * self_kser for label in column_labels]
+
+            dot_product_kser = DataFrame(
+                other._internal.with_new_columns(product_ksers, column_labels)
+            ).sum()
+
+            return cast(Series, dot_product_kser).rename(self.name)
+
+        else:
+            assert isinstance(other, Series)
+            if not same_anchor(self, other):
+                if len(self.index) != len(other.index):
+                    raise ValueError("matrices are not aligned")
+            return (self * other).sum()
 
     def __matmul__(self, other):
         """
@@ -4945,7 +4973,7 @@ def asof(self, where) -> Union[Scalar, "Series"]:
         should_return_series = True
         if isinstance(self.index, ks.MultiIndex):
             raise ValueError("asof is not supported for a MultiIndex")
-        if isinstance(where, (ks.Index, ks.Series, ks.DataFrame)):
+        if isinstance(where, (ks.Index, ks.Series, DataFrame)):
             raise ValueError("where cannot be an Index, Series or a DataFrame")
         if not self.index.is_monotonic_increasing:
             raise ValueError("asof requires a sorted index")
diff --git a/databricks/koalas/tests/test_ops_on_diff_frames.py b/databricks/koalas/tests/test_ops_on_diff_frames.py
@@ -870,13 +870,6 @@ def test_dot(self):
         with self.assertRaisesRegex(ValueError, "matrices are not aligned"):
             kser.dot(kser_other)
 
-        # with DataFram is not supported for now since performance issue,
-        # now we raise ValueError with proper message instead.
-        kdf = ks.DataFrame([[0, 1], [-2, 3], [4, -5]], index=[2, 4, 1])
-
-        with self.assertRaisesRegex(ValueError, r"Series\.dot\(\) is currently not supported*"):
-            kser.dot(kdf)
-
         # for MultiIndex
         midx = pd.MultiIndex(
             [["lama", "cow", "falcon"], ["speed", "weight", "length"]],
@@ -886,9 +879,44 @@ def test_dot(self):
         kser = ks.from_pandas(pser)
         pser_other = pd.Series([-450, 20, 12, -30, -250, 15, -320, 100, 3], index=midx)
         kser_other = ks.from_pandas(pser_other)
-
         self.assert_eq(kser.dot(kser_other), pser.dot(pser_other))
 
+        pser = pd.Series([0, 1, 2, 3])
+        kser = ks.from_pandas(pser)
+
+        # DataFrame "other" without Index/MultiIndex as columns
+        pdf = pd.DataFrame([[0, 1], [-2, 3], [4, -5], [6, 7]])
+        kdf = ks.from_pandas(pdf)
+        self.assert_eq(kser.dot(kdf), pser.dot(pdf))
+
+        # DataFrame "other" with Index as columns
+        pdf.columns = pd.Index(["x", "y"])
+        kdf = ks.from_pandas(pdf)
+        self.assert_eq(kser.dot(kdf), pser.dot(pdf))
+        pdf.columns = pd.Index(["x", "y"], name="cols_name")
+        kdf = ks.from_pandas(pdf)
+        self.assert_eq(kser.dot(kdf), pser.dot(pdf))
+
+        pdf = pdf.reindex([1, 0, 2, 3])
+        kdf = ks.from_pandas(pdf)
+        self.assert_eq(kser.dot(kdf), pser.dot(pdf))
+
+        # DataFrame "other" with MultiIndex as columns
+        pdf.columns = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y")])
+        kdf = ks.from_pandas(pdf)
+        self.assert_eq(kser.dot(kdf), pser.dot(pdf))
+        pdf.columns = pd.MultiIndex.from_tuples(
+            [("a", "x"), ("b", "y")], names=["cols_name1", "cols_name2"]
+        )
+        kdf = ks.from_pandas(pdf)
+        self.assert_eq(kser.dot(kdf), pser.dot(pdf))
+
+        kser = ks.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}).b
+        pser = kser.to_pandas()
+        kdf = ks.DataFrame({"c": [7, 8, 9]})
+        pdf = kdf.to_pandas()
+        self.assert_eq(kser.dot(kdf), pser.dot(pdf))
+
     def test_to_series_comparison(self):
         kidx1 = ks.Index([1, 2, 3, 4, 5])
         kidx2 = ks.Index([1, 2, 3, 4, 5])
diff --git a/databricks/koalas/tests/test_series.py b/databricks/koalas/tests/test_series.py
@@ -2123,6 +2123,14 @@ def test_droplevel(self):
                 pser.droplevel([("a", "1"), ("c", "3")]), kser.droplevel([("a", "1"), ("c", "3")])
             )
 
+    def test_dot(self):
+        pdf = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+        kdf = ks.from_pandas(pdf)
+
+        self.assert_eq((kdf["b"] * 10).dot(kdf["a"]), (pdf["b"] * 10).dot(pdf["a"]))
+        self.assert_eq((kdf["b"] * 10).dot(kdf), (pdf["b"] * 10).dot(pdf))
+        self.assert_eq((kdf["b"] * 10).dot(kdf + 1), (pdf["b"] * 10).dot(pdf + 1))
+
     @unittest.skipIf(
         LooseVersion(pyspark.__version__) < LooseVersion("3.0"),
         "tail won't work properly with PySpark<3.0",