diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 0301bf0a23dd5..0f6660d2f4125 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -205,6 +205,7 @@ Other API Changes - Rearranged the order of keyword arguments in :func:`read_excel()` to align with :func:`read_csv()` (:issue:`16672`) - :func:`pandas.merge` now raises a ``ValueError`` when trying to merge on incompatible data types (:issue:`9780`) - :func:`wide_to_long` previously kept numeric-like suffixes as ``object`` dtype. Now they are cast to numeric if possible (:issue:`17627`) +- In :func:`read_excel`, the ``comment`` argument is now exposed as a named parameter (:issue:`18735`) .. _whatsnew_0230.deprecations: diff --git a/pandas/io/excel.py b/pandas/io/excel.py index 97a739b349a98..4f0655cff9b57 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -137,7 +137,7 @@ na_values : scalar, str, list-like, or dict, default None Additional strings to recognize as NA/NaN. If dict passed, specific per-column NA values. By default the following values are interpreted - as NaN: '""" + fill("', '".join(sorted(_NA_VALUES)), 70) + """'. + as NaN: '""" + fill("', '".join(sorted(_NA_VALUES)), 70, subsequent_indent=" ") + """'. keep_default_na : bool, default True If na_values are specified and keep_default_na is False the default NaN values are overridden, otherwise they're appended to. @@ -148,6 +148,10 @@ this parameter is only necessary for columns stored as TEXT in Excel, any numeric columns will automatically be parsed, regardless of display format. +comment : str, default None + Comments out remainder of line. Pass a character or characters to this + argument to indicate comments in the input file. Any data between the + comment string and the end of the current line is ignored. skip_footer : int, default 0 ..
deprecated:: 0.23.0 @@ -164,6 +168,77 @@ parsed : DataFrame or Dict of DataFrames DataFrame from the passed in Excel file. See notes in sheet_name argument for more information on when a Dict of Dataframes is returned. + +Examples +-------- + +An example DataFrame written to a local file + +>>> df_out = pd.DataFrame([('string1', 1), +... ('string2', 2), +... ('string3', 3)], +... columns=['Name', 'Value']) +>>> df_out + Name Value +0 string1 1 +1 string2 2 +2 string3 3 +>>> df_out.to_excel('tmp.xlsx') + +The file can be read using the file name as string or an open file object: + +>>> pd.read_excel('tmp.xlsx', index_col=0) + Name Value +0 string1 1 +1 string2 2 +2 string3 3 + +>>> pd.read_excel(open('tmp.xlsx', 'rb'), index_col=0) + Name Value +0 string1 1 +1 string2 2 +2 string3 3 + +Index and header can be specified via the `index_col` and `header` arguments + +>>> pd.read_excel('tmp.xlsx', index_col=None, header=None) + 0 1 2 +0 NaN Name Value +1 0.0 string1 1 +2 1.0 string2 2 +3 2.0 string3 3 + +Column types are inferred but can be explicitly specified + +>>> pd.read_excel('tmp.xlsx', index_col=0, dtype={'Name':str, 'Value':float}) + Name Value +0 string1 1.0 +1 string2 2.0 +2 string3 3.0 + +True, False, and NA values, and thousands separators have defaults, +but can be explicitly specified, too. Supply the values you would like +as strings or lists of strings! + +>>> pd.read_excel('tmp.xlsx', index_col=0, +...
na_values=['string1', 'string2']) + Name Value +0 NaN 1 +1 NaN 2 +2 string3 3 + +Comment lines in the Excel input file can be skipped using the `comment` kwarg + +>>> df = pd.DataFrame({'a': ['1', '#2'], 'b': ['2', '3']}) +>>> df.to_excel('tmp.xlsx', index=False) +>>> pd.read_excel('tmp.xlsx') + a b +0 1 2 +1 #2 3 + +>>> pd.read_excel('tmp.xlsx', comment='#') + a b +0 1 2 """ @@ -223,6 +298,7 @@ def read_excel(io, parse_dates=False, date_parser=None, thousands=None, + comment=None, skipfooter=0, convert_float=True, **kwds): @@ -256,6 +332,7 @@ def read_excel(io, parse_dates=parse_dates, date_parser=date_parser, thousands=thousands, + comment=comment, skipfooter=skipfooter, convert_float=convert_float, **kwds) @@ -338,6 +415,7 @@ def parse(self, parse_dates=False, date_parser=None, thousands=None, + comment=None, skipfooter=0, convert_float=True, **kwds): @@ -363,6 +441,7 @@ def parse(self, parse_dates=parse_dates, date_parser=date_parser, thousands=thousands, + comment=comment, skipfooter=skipfooter, convert_float=convert_float, **kwds) @@ -417,6 +496,7 @@ def _parse_excel(self, parse_dates=False, date_parser=None, thousands=None, + comment=None, skipfooter=0, convert_float=True, **kwds): @@ -591,6 +671,7 @@ def _parse_cell(cell_contents, cell_typ): parse_dates=parse_dates, date_parser=date_parser, thousands=thousands, + comment=comment, skipfooter=skipfooter, **kwds) diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index 71677322329f5..168144d78b3be 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -1858,6 +1858,68 @@ def test_invalid_columns(self): with pytest.raises(KeyError): write_frame.to_excel(path, 'test1', columns=['C', 'D']) + def test_comment_arg(self): + # Re issue #18735 + # Test the comment argument functionality to read_excel + with ensure_clean(self.ext) as path: + + # Create file to read in + df = DataFrame({'A': ['one', '#one', 'one'], + 'B': ['two', 'two', '#two']}) + df.to_excel(path,
'test_c') + + # Read file without comment arg + result1 = read_excel(path, 'test_c') + result1.iloc[1, 0] = None + result1.iloc[1, 1] = None + result1.iloc[2, 1] = None + result2 = read_excel(path, 'test_c', comment='#') + tm.assert_frame_equal(result1, result2) + + def test_comment_default(self): + # Re issue #18735 + # Test the comment argument default to read_excel + with ensure_clean(self.ext) as path: + + # Create file to read in + df = DataFrame({'A': ['one', '#one', 'one'], + 'B': ['two', 'two', '#two']}) + df.to_excel(path, 'test_c') + + # Read file with default and explicit comment=None + result1 = read_excel(path, 'test_c') + result2 = read_excel(path, 'test_c', comment=None) + tm.assert_frame_equal(result1, result2) + + def test_comment_used(self): + # Re issue #18735 + # Test the comment argument is working as expected when used + with ensure_clean(self.ext) as path: + + # Create file to read in + df = DataFrame({'A': ['one', '#one', 'one'], + 'B': ['two', 'two', '#two']}) + df.to_excel(path, 'test_c') + + # Compare the result against a manually produced expected frame + expected = DataFrame({'A': ['one', None, 'one'], + 'B': ['two', None, None]}) + result = read_excel(path, 'test_c', comment='#') + tm.assert_frame_equal(result, expected) + + def test_comment_emptyline(self): + # Re issue #18735 + # Test that read_excel ignores commented lines at the end of file + with ensure_clean(self.ext) as path: + + df = DataFrame({'a': ['1', '#2'], 'b': ['2', '3']}) + df.to_excel(path, index=False) + + # Test that all-comment lines at EOF are ignored + expected = DataFrame({'a': [1], 'b': [2]}) + result = read_excel(path, comment='#') + tm.assert_frame_equal(result, expected) + def test_datetimes(self): # Test writing and reading datetimes. For issue #9139. (xref #9185)