|
17 | 17 | """ |
18 | 18 | A wrapper class for Spark DataFrame to behave similarly to pandas DataFrame. |
19 | 19 | """ |
20 | | -from decorator import dispatch_on |
21 | 20 | from functools import partial, reduce |
22 | 21 |
|
23 | 22 | import numpy as np |
24 | 23 | import pandas as pd |
| 24 | +from decorator import dispatch_on |
25 | 25 | from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype |
26 | 26 | from pyspark import sql as spark |
27 | 27 | from pyspark.sql import functions as F, Column |
|
37 | 37 | from databricks.koalas.missing.frame import _MissingPandasLikeDataFrame |
38 | 38 | from databricks.koalas.ml import corr |
39 | 39 | from databricks.koalas.selection import SparkDataFrameLocator |
40 | | -from databricks.koalas.typedef import infer_pd_series_spark_type |
| 40 | +from databricks.koalas.typedef import infer_pd_series_spark_type, dict_sanitizer |
41 | 41 |
|
42 | 42 |
|
43 | 43 | class DataFrame(_Frame): |
@@ -731,6 +731,79 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None, inplace=False): |
731 | 731 | else: |
732 | 732 | raise NotImplementedError("dropna currently only works for axis=0 or axis='index'") |
733 | 733 |
|
def fillna(self, value=None, axis=None, inplace=False):
    """Fill NA/NaN values.

    :param value: scalar, dict, Series
        Value to use to fill holes. Alternately a dict/Series of values
        specifying which value to use for each column.
        DataFrame is not supported.
    :param axis: {0 or `index`}
        1 and `columns` are not supported. ``None`` is treated as 0.
    :param inplace: boolean, default False
        Fill in place (do not create a new object)
    :return: :class:`DataFrame`, or ``None`` when ``inplace`` is true
    :raises ValueError: if ``value`` is ``None``
    :raises NotImplementedError: if ``axis`` is not 0 / 'index', or if
        ``value`` is a pandas DataFrame
    :raises TypeError: if ``value`` is of any other unsupported type

    Examples
    --------
    >>> df = ks.DataFrame({
    ...     'A': [None, 3, None, None],
    ...     'B': [2, 4, None, 3],
    ...     'C': [None, None, None, 1],
    ...     'D': [0, 1, 5, 4]
    ... })
    >>> df
         A    B    C  D
    0  NaN  2.0  NaN  0
    1  3.0  4.0  NaN  1
    2  NaN  NaN  NaN  5
    3  NaN  3.0  1.0  4

    Replace all NaN elements with 0s.

    >>> df.fillna(0)
         A    B    C  D
    0  0.0  2.0  0.0  0
    1  3.0  4.0  0.0  1
    2  0.0  0.0  0.0  5
    3  0.0  3.0  1.0  4

    Replace all NaN elements in column 'A', 'B', 'C', and 'D', with 0, 1,
    2, and 3 respectively.

    >>> values = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
    >>> df.fillna(value=values)
         A    B    C  D
    0  0.0  2.0  2.0  0
    1  3.0  4.0  2.0  1
    2  0.0  1.0  2.0  5
    3  0.0  3.0  1.0  4
    """
    # A missing value is reported before the axis is looked at, so callers
    # that pass neither a value nor a supported axis still get ValueError.
    if value is None:
        raise ValueError('Must specify value')

    if axis is None:
        axis = 0
    if axis not in (0, "index"):
        raise NotImplementedError("fillna currently only works for axis=0 or axis='index'")

    if isinstance(value, pd.DataFrame):
        raise NotImplementedError("Dataframe value is not supported")

    # Convert a Series once so the mapping that gets sanitized is the very
    # same object handed to Spark.
    if isinstance(value, pd.Series):
        value = value.to_dict()

    if isinstance(value, dict):
        # NOTE(review): dict_sanitizer is defined in koalas.typedef and is
        # called only for its side effects here; presumably it validates
        # the per-column fill values -- confirm against its definition.
        dict_sanitizer(value)
    elif not isinstance(value, (float, int, str, bool)):
        # Without this branch an unsupported type (list, tuple, ...) left
        # the result unbound and surfaced later as an UnboundLocalError.
        raise TypeError(
            '"value" parameter must be a scalar, dict or Series, but you passed a "{}"'
            .format(type(value).__name__))

    sdf = self._sdf.fillna(value)

    if inplace:
        self._sdf = sdf
    else:
        return DataFrame(sdf, self._metadata.copy())
734 | 807 | def head(self, n=5): |
735 | 808 | """ |
736 | 809 | Return the first `n` rows. |
|
0 commit comments