Skip to content

Commit 21cffcc

Browse files
garawalidHyukjinKwon
authored and committed
Add fillna (#78)
+ Only axis = 0 is supported in this method + Dataframe and Series are not supported as value
1 parent ee493f2 commit 21cffcc

File tree

5 files changed

+131
-6
lines changed

5 files changed

+131
-6
lines changed

databricks/koalas/frame.py

Lines changed: 75 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,11 @@
1717
"""
1818
A wrapper class for Spark DataFrame to behave similar to pandas DataFrame.
1919
"""
20-
from decorator import dispatch_on
2120
from functools import partial, reduce
2221

2322
import numpy as np
2423
import pandas as pd
24+
from decorator import dispatch_on
2525
from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype
2626
from pyspark import sql as spark
2727
from pyspark.sql import functions as F, Column
@@ -37,7 +37,7 @@
3737
from databricks.koalas.missing.frame import _MissingPandasLikeDataFrame
3838
from databricks.koalas.ml import corr
3939
from databricks.koalas.selection import SparkDataFrameLocator
40-
from databricks.koalas.typedef import infer_pd_series_spark_type
40+
from databricks.koalas.typedef import infer_pd_series_spark_type, dict_sanitizer
4141

4242

4343
class DataFrame(_Frame):
@@ -731,6 +731,79 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None, inplace=False):
731731
else:
732732
raise NotImplementedError("dropna currently only works for axis=0 or axis='index'")
733733

734+
def fillna(self, value=None, axis=None, inplace=False):
    """Fill NA/NaN values.

    :param value: scalar, dict, Series
        Value to use to fill holes. alternately a dict/Series of values
        specifying which value to use for each column.
        DataFrame is not supported.
    :param axis: {0 or `index`}
        1 and `columns` are not supported.
    :param inplace: boolean, default False
        Fill in place (do not create a new object)
    :return: :class:`DataFrame`, or None when ``inplace`` is True
    :raises ValueError: if ``value`` is None
    :raises NotImplementedError: if ``axis`` is anything other than 0 or `index`,
        or if ``value`` is a pandas DataFrame
    :raises TypeError: if ``value`` is of any other unsupported type, or if a
        dict/Series contains an element rejected by ``dict_sanitizer``

    Examples
    --------
    >>> df = ks.DataFrame({
    ...     'A': [None, 3, None, None],
    ...     'B': [2, 4, None, 3],
    ...     'C': [None, None, None, 1],
    ...     'D': [0, 1, 5, 4]
    ... })
    >>> df
         A    B    C  D
    0  NaN  2.0  NaN  0
    1  3.0  4.0  NaN  1
    2  NaN  NaN  NaN  5
    3  NaN  3.0  1.0  4

    Replace all NaN elements with 0s.

    >>> df.fillna(0)
         A    B    C  D
    0  0.0  2.0  0.0  0
    1  3.0  4.0  0.0  1
    2  0.0  0.0  0.0  5
    3  0.0  3.0  1.0  4

    Replace all NaN elements in column 'A', 'B', 'C', and 'D', with 0, 1,
    2, and 3 respectively.

    >>> values = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
    >>> df.fillna(value=values)
         A    B    C  D
    0  0.0  2.0  2.0  0
    1  3.0  4.0  2.0  1
    2  0.0  1.0  2.0  5
    3  0.0  3.0  1.0  4
    """
    if axis is None:
        axis = 0
    # A missing value is reported before an unsupported axis, so callers see the
    # same error precedence as before.
    if value is None:
        raise ValueError('Must specify value')
    if not (axis == 0 or axis == "index"):
        raise NotImplementedError("fillna currently only works for axis=0 or axis='index'")

    if isinstance(value, pd.DataFrame):
        raise NotImplementedError("Dataframe value is not supported")
    if isinstance(value, pd.Series):
        # A Series is handled as a {label: fill value} mapping; convert it once.
        value = value.to_dict()
    if isinstance(value, dict):
        dict_sanitizer(value)
    elif not isinstance(value, (float, int, str, bool)):
        # Without this guard an unsupported value (e.g. a list) would leave `sdf`
        # unassigned and surface later as an unrelated UnboundLocalError.
        raise TypeError("Unsupported type %s" % type(value))

    sdf = self._sdf.fillna(value)

    if inplace:
        self._sdf = sdf
    else:
        return DataFrame(sdf, self._metadata.copy())
806+
734807
def head(self, n=5):
735808
"""
736809
Return the first `n` rows.

databricks/koalas/missing/frame.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,6 @@ class _MissingPandasLikeDataFrame(object):
6969
ewm = unsupported_function('ewm')
7070
expanding = unsupported_function('expanding')
7171
ffill = unsupported_function('ffill')
72-
fillna = unsupported_function('fillna')
7372
filter = unsupported_function('filter')
7473
first = unsupported_function('first')
7574
first_valid_index = unsupported_function('first_valid_index')

databricks/koalas/tests/test_dataframe.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -346,6 +346,46 @@ def test_value_counts(self):
346346
ds.name = 'index'
347347
self.assertPandasAlmostEqual(ds.value_counts().toPandas(), s.value_counts())
348348

349+
def test_fillna(self):
    """Compare Koalas ``fillna`` with pandas and check its error cases.

    Covers scalar, dict and Series fill values, the in-place variant, and
    the errors raised for an unsupported axis, a missing value, a pandas
    DataFrame value and a dict holding a NumPy integer.
    """
    pandas_frame = pd.DataFrame(
        {'x': [np.nan, 2, 3, 4, np.nan, 6],
         'y': [1, 2, np.nan, 4, np.nan, np.nan],
         'z': [1, 2, 3, 4, np.nan, np.nan]},
        index=[10, 20, 30, 40, 50, 60])
    koalas_frame = koalas.from_pandas(pandas_frame)
    per_column_fill = {'x': -1, 'y': -2, 'z': -5}

    # Round trip, scalar fill and per-column dict fill.
    self.assert_eq(koalas_frame, pandas_frame)
    self.assert_eq(koalas_frame.fillna(-1), pandas_frame.fillna(-1))
    self.assert_eq(koalas_frame.fillna(per_column_fill),
                   pandas_frame.fillna(per_column_fill))

    # In-place fill mutates both frames; later checks run on the filled data.
    pandas_frame.fillna(per_column_fill, inplace=True)
    koalas_frame.fillna(per_column_fill, inplace=True)
    self.assert_eq(koalas_frame, pandas_frame)

    series_fill = pd.Series([-1, -2, -5], index=['x', 'y', 'z'], dtype=int)
    self.assert_eq(koalas_frame.fillna(series_fill), pandas_frame.fillna(series_fill))

    axis_message = "fillna currently only works for axis=0 or axis='index'"
    for unsupported_axis in (1, 'column'):
        with self.assertRaisesRegex(NotImplementedError, axis_message):
            koalas_frame.fillna(-1, axis=unsupported_axis)

    with self.assertRaisesRegex(ValueError, 'Must specify value'):
        koalas_frame.fillna()

    frame_fill = pd.DataFrame({'x': [-1], 'y': [-1], 'z': [-1]})
    with self.assertRaisesRegex(NotImplementedError, "Dataframe value is not supported"):
        koalas_frame.fillna(frame_fill)

    # Test dict sanitizer
    numpy_valued_fill = {'x': np.int64(-6), 'y': np.int64(-4), 'z': -5}
    with self.assertRaisesRegex(TypeError,
                                "Dict contains unsupported type <class 'numpy.int64'>"):
        koalas_frame.fillna(numpy_valued_fill)
388+
349389
def test_isnull(self):
350390
pdf = pd.DataFrame({'x': [1, 2, 3, 4, None, 6], 'y': list('abdabd')},
351391
index=[10, 20, 30, 40, 50, 60])

databricks/koalas/typedef.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@
2929
from pyspark.sql.functions import pandas_udf
3030
import pyspark.sql.types as types
3131

32-
3332
T = typing.TypeVar("T")
3433

3534

@@ -113,7 +112,6 @@ def _build_py_type_dict():
113112

114113
_known_types = _build_type_dict()
115114

116-
117115
_py_conversions = _build_py_type_dict()
118116

119117

@@ -172,6 +170,18 @@ def _check_compatible(arg, sig_arg: X):
172170
assert False, (arg, sig_arg)
173171

174172

173+
def dict_sanitizer(input_dict):
    """
    This function checks if elements inside a given dict are supported by Spark.

    Only the dict's values are inspected; its keys are ignored.

    :param input_dict: dict
    :raises TypeError: if any element is not supported by Spark
    """
    # `np.float` used to be listed here as well. It was only an alias of the builtin
    # `float` and has been removed from NumPy (1.24), where referencing it raises
    # AttributeError, so dropping it changes no accepted type.
    supported_types = (int, str, float, np.float64, bool)
    for element in input_dict.values():
        # Exact type match on purpose: NumPy scalars such as np.int64 must be rejected.
        if type(element) not in supported_types:
            raise TypeError("Dict contains unsupported type {}".format(type(element)))
183+
184+
175185
def make_fun(f, *args, **kwargs):
176186
"""
177187
This function calls the function f while taking into account some of the
@@ -225,7 +235,7 @@ def make_fun(f, *args, **kwargs):
225235
all_indexes = col_indexes + col_keys # type: typing.Union[str, int]
226236

227237
def clean_fun(*args2):
228-
assert len(args2) == len(all_indexes),\
238+
assert len(args2) == len(all_indexes), \
229239
"Missing some inputs:{}!={}".format(all_indexes, [str(c) for c in args2])
230240
full_args = list(frozen_args)
231241
full_kwargs = dict(frozen_kwargs)
@@ -236,6 +246,7 @@ def clean_fun(*args2):
236246
assert isinstance(idx, str), str(idx)
237247
full_kwargs[idx] = arg
238248
return f(*full_args, **full_kwargs)
249+
239250
udf = pandas_udf(clean_fun, returnType=spark_ret_type)
240251
wrapped_udf = udf # udf #_wrap_callable(udf)
241252
col_args = []
@@ -254,6 +265,7 @@ def _wrap_callable(obj):
254265

255266
def f(*args, **kwargs):
256267
return f0(*args, **kwargs)
268+
257269
obj.__call__ = f
258270
return obj
259271

docs/source/reference/frame.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,7 @@ Missing data handling
9393
:toctree: api/
9494

9595
DataFrame.dropna
96+
DataFrame.fillna
9697

9798
Reshaping, sorting, transposing
9899
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

0 commit comments

Comments
 (0)