Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 102 additions & 0 deletions databricks/koalas/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -950,6 +950,108 @@ def assign(self, **kwargs):
[name for name, _ in pairs if name not in self._metadata.column_fields]))
return DataFrame(sdf, metadata)

def to_excel(self, excel_writer, sheet_name="Sheet1", na_rep="", float_format=None,
             columns=None, header=True, index=True, index_label=None, startrow=0,
             startcol=0, engine=None, merge_cells=True, encoding=None, inf_rep="inf",
             verbose=True, freeze_panes=None):
    """
    Write object to an Excel sheet.

    To write a single object to an Excel .xlsx file it is only necessary to
    specify a target file name. To write to multiple sheets it is necessary to
    create an `ExcelWriter` object with a target file name, and specify a sheet
    in the file to write to.

    Multiple sheets may be written to by specifying unique `sheet_name`.
    With all data written to the file it is necessary to save the changes.
    Note that creating an `ExcelWriter` object with a file name that already
    exists will result in the contents of the existing file being erased.

    Parameters
    ----------
    excel_writer : str or ExcelWriter object
        File path or existing ExcelWriter.
    sheet_name : str, default 'Sheet1'
        Name of sheet which will contain DataFrame.
    na_rep : str, default ''
        Missing data representation.
    float_format : str, optional
        Format string for floating point numbers. For example
        ``float_format="%.2f"`` will format 0.1234 to 0.12.
    columns : sequence or list of str, optional
        Columns to write.
    header : bool or list of str, default True
        Write out the column names. If a list of strings is given it is
        assumed to be aliases for the column names.
    index : bool, default True
        Write row names (index).
    index_label : str or sequence, optional
        Column label for index column(s) if desired. If not specified, and
        `header` and `index` are True, then the index names are used. A
        sequence should be given if the DataFrame uses MultiIndex.
    startrow : int, default 0
        Upper left cell row to dump data frame.
    startcol : int, default 0
        Upper left cell column to dump data frame.
    engine : str, optional
        Write engine to use, 'openpyxl' or 'xlsxwriter'. You can also set this
        via the options ``io.excel.xlsx.writer``, ``io.excel.xls.writer``, and
        ``io.excel.xlsm.writer``.
    merge_cells : bool, default True
        Write MultiIndex and Hierarchical Rows as merged cells.
    encoding : str, optional
        Encoding of the resulting excel file. Only necessary for xlwt,
        other writers support unicode natively.
    inf_rep : str, default 'inf'
        Representation for infinity (there is no native representation for
        infinity in Excel).
    verbose : bool, default True
        Display more information in the error logs.
    freeze_panes : tuple of int (length 2), optional
        Specifies the one-based bottommost row and rightmost column that
        is to be frozen.

    Notes
    -----
    Once a workbook has been saved it is not possible to write further data
    without rewriting the whole workbook.

    This method converts the DataFrame with ``to_pandas()`` and delegates to
    ``pandas.DataFrame.to_excel``, so the data is expected to be small enough
    to be held as a pandas DataFrame.

    Examples
    --------
    Create, write to and save a workbook:

    >>> df1 = ks.DataFrame([['a', 'b'], ['c', 'd']],
    ...                    index=['row 1', 'row 2'],
    ...                    columns=['col 1', 'col 2'])
    >>> df1.to_excel("output.xlsx")  # doctest: +SKIP

    To specify the sheet name:

    >>> df1.to_excel("output.xlsx",
    ...              sheet_name='Sheet_name_1')  # doctest: +SKIP

    If you wish to write to more than one sheet in the workbook, it is
    necessary to specify an ExcelWriter object:

    >>> df2 = ks.DataFrame([['e', 'f'], ['g', 'h']])  # doctest: +SKIP
    >>> with pd.ExcelWriter('output.xlsx') as writer:  # doctest: +SKIP
    ...     df1.to_excel(writer, sheet_name='Sheet_name_1')
    ...     df2.to_excel(writer, sheet_name='Sheet_name_2')

    To set the library that is used to write the Excel file,
    you can pass the `engine` keyword (the default engine is
    automatically chosen depending on the file extension):

    >>> df1.to_excel('output1.xlsx', engine='xlsxwriter')  # doctest: +SKIP
    """
    # Make sure locals() call is at the top of the function so we don't capture local variables.
    args = locals()

    # Hand the converted pandas frame, this method, the pandas counterpart and
    # the captured arguments to the shared validate-and-invoke helper.
    return validate_arguments_and_invoke_function(
        self.to_pandas(), self.to_excel, pd.DataFrame.to_excel, args)

@property
def loc(self):
    """Return a ``SparkDataFrameLocator`` wrapping this DataFrame."""
    locator = SparkDataFrameLocator(self)
    return locator
Expand Down
1 change: 0 additions & 1 deletion databricks/koalas/missing/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,6 @@ class _MissingPandasLikeDataFrame(object):
to_clipboard = unsupported_function('to_clipboard')
to_csv = unsupported_function('to_csv')
to_dense = unsupported_function('to_dense')
to_excel = unsupported_function('to_excel')
to_feather = unsupported_function('to_feather')
to_gbq = unsupported_function('to_gbq')
to_hdf = unsupported_function('to_hdf')
Expand Down
31 changes: 31 additions & 0 deletions databricks/koalas/tests/test_dataframe_conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,3 +70,34 @@ def test_to_html(self):
""")
got = self.strip_all_whitespace(self.kdf.to_html(max_rows=2))
self.assert_eq(got, expected)

def test_to_excel(self):
    """Check that ``to_excel`` on a Koalas frame writes the same workbook as pandas."""
    import os
    import tempfile

    pdf = self.pdf
    kdf = self.kdf
    # Still referenced by the later checks in this test.
    excel_writer = "output.xlsx"

    # Comparing the return values of the two ``to_excel`` calls says nothing
    # about what was written.  Write each frame to its own file in a unique
    # temporary directory and compare the contents read back from disk.
    with tempfile.TemporaryDirectory() as tmp_dir:
        koalas_path = os.path.join(tmp_dir, "koalas.xlsx")
        pandas_path = os.path.join(tmp_dir, "pandas.xlsx")
        kdf.to_excel(koalas_path)
        pdf.to_excel(pandas_path)
        # NOTE(review): pd.read_excel needs an xlsx reader engine installed
        # in the test environment -- confirm against requirements-dev.txt.
        self.assert_eq(pd.read_excel(koalas_path), pd.read_excel(pandas_path))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we assert that the contents of output.xlsx are the same, rather than asserting on the return value?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@rxin, should I load the values of the generated excel files in 2 dataframes and check the values?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes that'd work.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Make sure you write it to a temporary location that's unique too.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@rxin, I have made the changes you asked for.
Please have a look.


import os
import tempfile

def _assert_same_workbook(kdf, pdf, **to_excel_kwargs):
    """Write both frames with identical options and compare the files' contents."""
    # A fresh directory per check gives every workbook a unique location, and
    # reading the files back verifies what was written rather than the
    # return value of ``to_excel``.
    with tempfile.TemporaryDirectory() as tmp_dir:
        koalas_path = os.path.join(tmp_dir, "koalas.xlsx")
        pandas_path = os.path.join(tmp_dir, "pandas.xlsx")
        kdf.to_excel(koalas_path, **to_excel_kwargs)
        pdf.to_excel(pandas_path, **to_excel_kwargs)
        # NOTE(review): pd.read_excel needs an xlsx reader engine installed
        # in the test environment -- confirm against requirements-dev.txt.
        self.assert_eq(pd.read_excel(koalas_path), pd.read_excel(pandas_path))

# Missing values in both a numeric and a string column, written with na_rep.
pdf = pd.DataFrame({
    'a': [1, None, 3],
    'b': ["one", "two", None],
}, index=[0, 1, 3])
kdf = koalas.from_pandas(pdf)
_assert_same_workbook(kdf, pdf, na_rep='null')

# All-float frame for the formatting, header and index options.
pdf = pd.DataFrame({
    'a': [1.0, 2.0, 3.0],
    'b': [4.0, 5.0, 6.0],
}, index=[0, 1, 3])
kdf = koalas.from_pandas(pdf)
_assert_same_workbook(kdf, pdf, float_format='%.1f')
_assert_same_workbook(kdf, pdf, header=False)
_assert_same_workbook(kdf, pdf, index=False)
1 change: 1 addition & 0 deletions docs/source/reference/frame.rst
Original file line number Diff line number Diff line change
Expand Up @@ -124,3 +124,4 @@ Serialization / IO / Conversion
DataFrame.to_spark
DataFrame.to_string
DataFrame.to_dict
DataFrame.to_excel
1 change: 1 addition & 0 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
pandas>=0.23
pyarrow>=0.10,<0.11
numpy>=1.14
openpyxl>=2.6.2

# Documentation build
sphinx
Expand Down