Add fillna (#78)

garawalid · HyukjinKwon · commit 21cffccef974 · 2019-05-02T09:42:49.000+09:00
+ Only axis = 0 (or 'index') is supported in this method
+ DataFrame is not supported as value; a pandas Series is accepted and handled like a dict
diff --git a/databricks/koalas/frame.py b/databricks/koalas/frame.py
@@ -17,11 +17,11 @@
 """
 A wrapper class for Spark DataFrame to behave similar to pandas DataFrame.
 """
-from decorator import dispatch_on
 from functools import partial, reduce
 
 import numpy as np
 import pandas as pd
+from decorator import dispatch_on
 from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype
 from pyspark import sql as spark
 from pyspark.sql import functions as F, Column
@@ -37,7 +37,7 @@
 from databricks.koalas.missing.frame import _MissingPandasLikeDataFrame
 from databricks.koalas.ml import corr
 from databricks.koalas.selection import SparkDataFrameLocator
-from databricks.koalas.typedef import infer_pd_series_spark_type
+from databricks.koalas.typedef import infer_pd_series_spark_type, dict_sanitizer
 
 
 class DataFrame(_Frame):
@@ -731,6 +731,79 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None, inplace=False):
         else:
             raise NotImplementedError("dropna currently only works for axis=0 or axis='index'")
 
+    def fillna(self, value=None, axis=None, inplace=False):
+        """Fill NA/NaN values.
+
+        :param value: scalar, dict, Series
+                    Value to use to fill holes. Alternatively, a dict/pandas Series of
+                    values specifying which value to use for each column.
+                    DataFrame is not supported.
+        :param axis: {0 or `index`}
+                    1 and `columns` are not supported.
+        :param inplace: boolean, default False
+                    Fill in place (do not create a new object)
+        :return: :class:`DataFrame`, or None when `inplace` is True
+        :raises ValueError: if `value` is None
+        :raises NotImplementedError: for a DataFrame `value` or an `axis` other than 0/`index`
+        :raises TypeError: if `value`, or an entry of a dict/Series, has an unsupported type
+
+        Examples
+        --------
+        >>> df = ks.DataFrame({
+        ...     'A': [None, 3, None, None],
+        ...     'B': [2, 4, None, 3],
+        ...     'C': [None, None, None, 1],
+        ...     'D': [0, 1, 5, 4]
+        ...     })
+        >>> df
+             A    B    C  D
+        0  NaN  2.0  NaN  0
+        1  3.0  4.0  NaN  1
+        2  NaN  NaN  NaN  5
+        3  NaN  3.0  1.0  4
+
+        Replace all NaN elements with 0s.
+
+        >>> df.fillna(0)
+             A    B    C  D
+        0  0.0  2.0  0.0  0
+        1  3.0  4.0  0.0  1
+        2  0.0  0.0  0.0  5
+        3  0.0  3.0  1.0  4
+
+        Replace all NaN elements in column 'A', 'B', 'C', and 'D', with 0, 1,
+        2, and 3 respectively.
+
+        >>> values = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
+        >>> df.fillna(value=values)
+             A    B    C  D
+        0  0.0  2.0  2.0  0
+        1  3.0  4.0  2.0  1
+        2  0.0  1.0  2.0  5
+        3  0.0  3.0  1.0  4
+        """
+
+        if value is None:
+            raise ValueError('Must specify value')
+        # `axis=None` selects the default, i.e. axis 0.
+        if axis not in (None, 0, "index"):
+            raise NotImplementedError("fillna currently only works for axis=0 or axis='index'")
+        if isinstance(value, pd.DataFrame):
+            raise NotImplementedError("Dataframe value is not supported")
+        if isinstance(value, pd.Series):
+            value = value.to_dict()
+        if isinstance(value, dict):
+            dict_sanitizer(value)
+        elif not isinstance(value, (float, int, str, bool)):
+            # Reject other types explicitly so that `sdf` is never left unbound.
+            raise TypeError("Unsupported type {}".format(type(value)))
+        sdf = self._sdf.fillna(value)
+
+        if inplace:
+            self._sdf = sdf
+        else:
+            return DataFrame(sdf, self._metadata.copy())
+
     def head(self, n=5):
         """
         Return the first `n` rows.
diff --git a/databricks/koalas/missing/frame.py b/databricks/koalas/missing/frame.py
@@ -69,7 +69,6 @@ class _MissingPandasLikeDataFrame(object):
     ewm = unsupported_function('ewm')
     expanding = unsupported_function('expanding')
     ffill = unsupported_function('ffill')
-    fillna = unsupported_function('fillna')
     filter = unsupported_function('filter')
     first = unsupported_function('first')
     first_valid_index = unsupported_function('first_valid_index')
diff --git a/databricks/koalas/tests/test_dataframe.py b/databricks/koalas/tests/test_dataframe.py
@@ -346,6 +346,46 @@ def test_value_counts(self):
         ds.name = 'index'
         self.assertPandasAlmostEqual(ds.value_counts().toPandas(), s.value_counts())
 
+    def test_fillna(self):
+        pdf = pd.DataFrame({'x': [np.nan, 2, 3, 4, np.nan, 6],
+                            'y': [1, 2, np.nan, 4, np.nan, np.nan],
+                            'z': [1, 2, 3, 4, np.nan, np.nan]},
+                           index=[10, 20, 30, 40, 50, 60])
+
+        kdf = koalas.from_pandas(pdf)
+
+        self.assert_eq(kdf, pdf)
+        self.assert_eq(kdf.fillna(-1), pdf.fillna(-1))
+        self.assert_eq(kdf.fillna({'x': -1, 'y': -2, 'z': -5}),
+                       pdf.fillna({'x': -1, 'y': -2, 'z': -5}))
+
+        pdf.fillna({'x': -1, 'y': -2, 'z': -5}, inplace=True)
+        kdf.fillna({'x': -1, 'y': -2, 'z': -5}, inplace=True)
+        self.assert_eq(kdf, pdf)
+        # A pandas Series is converted to a dict keyed by column name.
+        s_nan = pd.Series([-1, -2, -5], index=['x', 'y', 'z'], dtype=int)
+        self.assert_eq(kdf.fillna(s_nan),
+                       pdf.fillna(s_nan))
+        # Unsupported arguments must fail with the documented errors.
+        msg = "fillna currently only works for axis=0 or axis='index'"
+        with self.assertRaisesRegex(NotImplementedError, msg):
+            kdf.fillna(-1, axis=1)
+        with self.assertRaisesRegex(NotImplementedError, msg):
+            kdf.fillna(-1, axis='columns')
+        msg = 'Must specify value'
+        with self.assertRaisesRegex(ValueError, msg):
+            kdf.fillna()
+        df_nan = pd.DataFrame({'x': [-1], 'y': [-1], 'z': [-1]})
+        msg = "Dataframe value is not supported"
+        with self.assertRaisesRegex(NotImplementedError, msg):
+            kdf.fillna(df_nan)
+
+        # Test dict sanitizer
+        value_dict = {'x': np.int64(-6), 'y': np.int64(-4), 'z': -5}
+        msg = "Dict contains unsupported type <class 'numpy.int64'>"
+        with self.assertRaisesRegex(TypeError, msg):
+            kdf.fillna(value_dict)
+
     def test_isnull(self):
         pdf = pd.DataFrame({'x': [1, 2, 3, 4, None, 6], 'y': list('abdabd')},
                            index=[10, 20, 30, 40, 50, 60])
diff --git a/databricks/koalas/typedef.py b/databricks/koalas/typedef.py
@@ -29,7 +29,6 @@
 from pyspark.sql.functions import pandas_udf
 import pyspark.sql.types as types
 
-
 T = typing.TypeVar("T")
 
 
@@ -113,7 +112,6 @@ def _build_py_type_dict():
 
 _known_types = _build_type_dict()
 
-
 _py_conversions = _build_py_type_dict()
 
 
@@ -172,6 +170,18 @@ def _check_compatible(arg, sig_arg: X):
     assert False, (arg, sig_arg)
 
 
+def dict_sanitizer(input_dict):
+    """
+    Check that every value of the given dict has a type supported by Spark.
+    :param input_dict: dict whose values are validated (keys are not inspected)
+    :raises TypeError: if a value's exact type is not int, str, float, np.float64 or bool
+    """
+    supported_types = (int, str, float, np.float64, bool)  # np.float was an alias of float
+    for e in input_dict.values():
+        if type(e) not in supported_types:
+            raise TypeError("Dict contains unsupported type {}".format(type(e)))
+
+
 def make_fun(f, *args, **kwargs):
     """
     This function calls the function f while taking into account some of the
@@ -225,7 +235,7 @@ def make_fun(f, *args, **kwargs):
     all_indexes = col_indexes + col_keys  # type: typing.Union[str, int]
 
     def clean_fun(*args2):
-        assert len(args2) == len(all_indexes),\
+        assert len(args2) == len(all_indexes), \
             "Missing some inputs:{}!={}".format(all_indexes, [str(c) for c in args2])
         full_args = list(frozen_args)
         full_kwargs = dict(frozen_kwargs)
@@ -236,6 +246,7 @@ def clean_fun(*args2):
                 assert isinstance(idx, str), str(idx)
                 full_kwargs[idx] = arg
         return f(*full_args, **full_kwargs)
+
     udf = pandas_udf(clean_fun, returnType=spark_ret_type)
     wrapped_udf = udf  # udf #_wrap_callable(udf)
     col_args = []
@@ -254,6 +265,7 @@ def _wrap_callable(obj):
 
     def f(*args, **kwargs):
         return f0(*args, **kwargs)
+
     obj.__call__ = f
     return obj
 
diff --git a/docs/source/reference/frame.rst b/docs/source/reference/frame.rst
@@ -93,6 +93,7 @@ Missing data handling
    :toctree: api/
 
    DataFrame.dropna
+   DataFrame.fillna
 
 Reshaping, sorting, transposing
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~