
Commit 688a54a

Add schema inference support at DataFrame.transform
1 parent 5a6bbb9 commit 688a54a

2 files changed (+57, -9 lines)

databricks/koalas/frame.py

Lines changed: 46 additions & 9 deletions
@@ -43,7 +43,7 @@
     IntegerType, LongType, NumericType, ShortType, StructType)
 from pyspark.sql.utils import AnalysisException
 from pyspark.sql.window import Window
-from pyspark.sql.functions import pandas_udf
+from pyspark.sql.functions import pandas_udf, PandasUDFType
 
 from databricks import koalas as ks  # For running doctests and reference resolution in PyCharm.
 from databricks.koalas.utils import validate_arguments_and_invoke_function, align_diff_frames
@@ -1539,7 +1539,16 @@ def transform(self, func):
         Call ``func`` on self producing a Series with transformed values
         and that has the same length as its input.
 
-        .. note:: unlike pandas, it is required for ``func`` to specify return type hint.
+        .. note:: this API executes the function once to infer the type which is
+             potentially expensive, for instance, when the dataset is created after
+             aggregations or sorting.
+
+             To avoid this, specify return type in ``func``, for instance, as below:
+
+             >>> def square(x) -> ks.Series[np.int32]:
+             ...     return x ** 2
+
+             Koalas uses return type hint and does not try to infer the type.
 
         .. note:: the series within ``func`` is actually a pandas series, and
             the length of each series is not guaranteed.
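To make the note above concrete, here is a minimal usage sketch of the two calling styles it describes; the example frame is illustrative, while the ks.Series type hint and DataFrame.transform come straight from the diff:

    import numpy as np
    import databricks.koalas as ks

    df = ks.DataFrame({'A': [0, 1, 2], 'B': [1, 2, 3]})

    # With a return type hint: Koalas trusts the annotation and skips inference.
    def square(x) -> ks.Series[np.int32]:
        return x ** 2

    df.transform(square)

    # Without a hint: Koalas first runs the function on a sample to infer the schema.
    df.transform(lambda x: x ** 2)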
@@ -1575,20 +1584,48 @@ def transform(self, func):
         0  0  1
         1  1  4
         2  4  9
+
+        You can omit the type hint and let Koalas infer its type.
+
+        >>> df.transform(lambda x: x ** 2)
+           A  B
+        0  0  1
+        1  1  4
+        2  4  9
+
         """
         assert callable(func), "the first argument should be a callable function."
         spec = inspect.getfullargspec(func)
         return_sig = spec.annotations.get("return", None)
-        if return_sig is None:
-            raise ValueError("Given function must have return type hint; however, not found.")
+        should_infer_schema = return_sig is None
 
-        wrapped = ks.pandas_wraps(func)
-        applied = []
-        for column in self._internal.data_columns:
-            applied.append(wrapped(self[column]).rename(column))
+        if should_infer_schema:
+            # Here we execute with the first 1000 to get the return type.
+            # If the records were less than 1000, it uses pandas API directly for a shortcut.
+            limit = 1000
+            pdf = self.head(limit + 1)._to_internal_pandas()
+            transformed = pdf.transform(func)
+            kdf = DataFrame(transformed)
+            return_schema = kdf._sdf.schema
+            if len(pdf) <= limit:
+                return kdf
+
+            applied = []
+            for input_column, output_column in zip(
+                    self._internal.data_columns, kdf._internal.data_columns):
+                pandas_func = pandas_udf(
+                    func,
+                    returnType=return_schema[output_column].dataType,
+                    functionType=PandasUDFType.SCALAR)
+                applied.append(pandas_func(self[input_column]._scol).alias(output_column))
+        else:
+            wrapped = ks.pandas_wraps(func)
+            applied = []
+            for column in self._internal.data_columns:
+                applied.append(wrapped(self[column]).rename(column)._scol)
 
         sdf = self._sdf.select(
-            self._internal.index_scols + [c._scol for c in applied])
+            self._internal.index_scols + [c for c in applied])
         internal = self._internal.copy(sdf=sdf)
 
         return DataFrame(internal)
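For readers outside the Koalas internals, the following is a simplified, self-contained sketch of the "run on a sample, infer the type, then apply a scalar pandas UDF" pattern that the new should_infer_schema branch implements. The helper name transform_column_with_inferred_type and the use of a plain Spark DataFrame are assumptions for illustration only; the real code operates on Koalas' _internal frame and transforms all columns at once.

    from pyspark.sql import SparkSession
    from pyspark.sql.functions import pandas_udf, PandasUDFType

    spark = SparkSession.builder.getOrCreate()

    def transform_column_with_inferred_type(sdf, column, func, limit=1000):
        # Execute func with plain pandas on the first `limit` + 1 rows to learn its return type.
        sample = sdf.select(column).limit(limit + 1).toPandas()
        transformed = sample[column].transform(func)
        inferred = spark.createDataFrame(transformed.to_frame()).schema[column].dataType

        # Shortcut: if the whole column fit into the sample, the pandas result is already exact.
        if len(sample) <= limit:
            return spark.createDataFrame(transformed.to_frame())

        # Otherwise re-run func over the full data as a scalar pandas UDF,
        # using the return type inferred from the sample.
        udf = pandas_udf(func, returnType=inferred, functionType=PandasUDFType.SCALAR)
        return sdf.select(udf(sdf[column]).alias(column))

    # No type hint is needed on the lambda; 1,500 rows exceeds the sampling limit,
    # so the pandas UDF branch is exercised.
    sdf = spark.range(1500).withColumnRenamed("id", "x")
    transform_column_with_inferred_type(sdf, "x", lambda s: s ** 2).show()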

databricks/koalas/tests/test_dataframe.py

Lines changed: 11 additions & 0 deletions
@@ -1554,3 +1554,14 @@ def test_pipe(self):
             "arg is both the pipe target and a keyword argument",
             lambda: kdf.pipe((lambda x: x, 'arg'), arg='1')
         )
+
+    def test_transform(self):
+        # Data is intentionally big to test when schema inference is on.
+        pdf = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6] * 300,
+                            'b': [1., 1., 2., 3., 5., 8.] * 300,
+                            'c': [1, 4, 9, 16, 25, 36] * 300}, columns=['a', 'b', 'c'])
+        kdf = ks.DataFrame(pdf)
+        self.assert_eq(kdf.transform(lambda x: x + 1).sort_index(),
+                       pdf.transform(lambda x: x + 1).sort_index())
+        with self.assertRaisesRegex(AssertionError, "the first argument should be a callable"):
+            kdf.transform(1)
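A note on the test data: 6 distinct values repeated 300 times yields 1,800 rows, which exceeds the 1,000-row sampling limit in the new code, so kdf.transform(lambda x: x + 1) here exercises the pandas_udf path rather than the small-data shortcut. A smaller frame, for instance

    ks.DataFrame({'a': [1, 2, 3]}).transform(lambda x: x + 1)

would be handled entirely by the shortcut, since the sampled head already covers the whole frame.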
