From 1157d72f4bfc0033bb497c892a0cb0fffed90dce Mon Sep 17 00:00:00 2001 From: JanLauGe Date: Tue, 12 Dec 2017 23:34:30 +0000 Subject: [PATCH 01/11] DOC: read_excel - added examples and fixed formatting bug --- pandas/io/excel.py | 76 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 75 insertions(+), 1 deletion(-) diff --git a/pandas/io/excel.py b/pandas/io/excel.py index 97a739b349a98..46753ef5b1c1d 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -137,7 +137,7 @@ na_values : scalar, str, list-like, or dict, default None Additional strings to recognize as NA/NaN. If dict passed, specific per-column NA values. By default the following values are interpreted - as NaN: '""" + fill("', '".join(sorted(_NA_VALUES)), 70) + """'. + as NaN: '""" + fill("', '".join(sorted(_NA_VALUES)), 999) + """'. keep_default_na : bool, default True If na_values are specified and keep_default_na is False the default NaN values are overridden, otherwise they're appended to. @@ -164,6 +164,80 @@ parsed : DataFrame or Dict of DataFrames DataFrame from the passed in Excel file. See notes in sheet_name argument for more information on when a Dict of Dataframes is returned. + +Examples +-------- + +An example DataFrame written to a local file + +>>> df_out = pd.DataFrame([('string1', 1), +... ('string2', 2), +... ('string3', 3)], +... 
columns=('Name', 'Value')) +>>> df_out + Name Value +0 string1 1 +1 string2 2 +2 string3 3 +>>> df_out.to_excel('tmp.xlsx') + +The file can be read using the file name as string or an open file object: + +>>> pd.read_excel('tmp.xlsx') + Name Value +0 string1 1 +1 string2 2 +2 string3 3 + +>>> pd.read_excel(open('tmp.xlsx','rb')) + Name Value +0 string1 1 +1 string2 2 +2 string3 3 + +Index and header can be specified via the `index_col` and `header` arguments + +>>> pd.read_excel(open('tmp.xlsx','rb'), index_col=None, header=None) + 0 1 2 +0 NaN Name Value +1 0.0 string1 1 +2 1.0 string2 2 +3 2.0 string3 3 + +Column types are inferred but can be explicitly specified + +>>> pd.read_excel(open('tmp.xlsx','rb'), dtype={'Name':str, 'Value':float}) + Name Value +0 string1 1.0 +1 string2 2.0 +2 string3 3.0 + +True, False, and NA values, and thousands separators have defaults, +but can be explicitly specified, too. Supply the values you would like +as strings or lists of strings! + +>>> pd.read_excel(open('tmp.xlsx','rb'), +... true_values='2', +... false_values='3', +... na_values=['string1', 'string2'], +... 
thousands=',') + Name Value +0 NaN 1 +1 NaN 2 +2 string3 3 + +Comment lines in the excel input file can be skipped using the `comment` kwarg + +>>> df = pd.DataFrame({'a': ['1', '#2'], 'b': ['2', '3']}) +>>> df.to_excel('tmp.xlsx', index=False) +>>> pd.read_excel('tmp.xlsx') + a b +0 1 2 +1 #2 3 + +>>> pd.read_excel('tmp.xlsx', comment='#') + a b +0 1 2 """ From 53a61db0433a98ca6b91a59f932ca27f0f09509b Mon Sep 17 00:00:00 2001 From: JanLauGe Date: Wed, 13 Dec 2017 12:40:20 +0000 Subject: [PATCH 02/11] read_excel - added comment as named argument comment and test_comment_* tests --- pandas/io/excel.py | 8 +++++ pandas/tests/io/test_excel.py | 56 +++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+) diff --git a/pandas/io/excel.py b/pandas/io/excel.py index 46753ef5b1c1d..96817a895a139 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -158,6 +158,9 @@ convert integral floats to int (i.e., 1.0 --> 1). If False, all numeric data will be read in as floats: Excel stores all numbers as floats internally +comment : str, default None + Comment out remainder of line. Character or characters to indicate comments + in the input file. Any data between comment and line end is ignored. 
Returns ------- @@ -299,6 +302,7 @@ def read_excel(io, thousands=None, skipfooter=0, convert_float=True, + comment=None, **kwds): # Can't use _deprecate_kwarg since sheetname=None has a special meaning @@ -332,6 +336,7 @@ def read_excel(io, thousands=thousands, skipfooter=skipfooter, convert_float=convert_float, + comment=comment, **kwds) @@ -414,6 +419,7 @@ def parse(self, thousands=None, skipfooter=0, convert_float=True, + comment=None, **kwds): """ Parse specified sheet(s) into a DataFrame @@ -439,6 +445,7 @@ def parse(self, thousands=thousands, skipfooter=skipfooter, convert_float=convert_float, + comment=comment, **kwds) def _should_parse(self, i, usecols): @@ -493,6 +500,7 @@ def _parse_excel(self, thousands=None, skipfooter=0, convert_float=True, + comment=None, **kwds): _validate_header_arg(header) diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index 71677322329f5..266ccb011c57b 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -1858,6 +1858,62 @@ def test_invalid_columns(self): with pytest.raises(KeyError): write_frame.to_excel(path, 'test1', columns=['C', 'D']) + def test_comment_arg(self): + # Test the comment argument functionality to read_excel + with ensure_clean(self.ext) as path: + + # Create file to read in + write_frame = DataFrame({'A': ['one', '#one', 'one'], + 'B': ['two', 'two', '#two']}) + write_frame.to_excel(path, 'test_c') + + # Read file without comment arg + read_frame = read_excel(path, 'test_c') + read_frame_commented = read_excel(path, 'test_c', comment='#') + tm.assert_class_equal(read_frame, read_frame_commented) + + def test_comment_default(self): + # Test the comment argument default to read_excel + with ensure_clean(self.ext) as path: + + # Create file to read in + write_frame = DataFrame({'A': ['one', '#one', 'one'], + 'B': ['two', 'two', '#two']}) + write_frame.to_excel(path, 'test_c') + + # Read file with default and explicit comment=None + read_frame = 
read_excel(path, 'test_c') + read_frame_uncommented = read_excel(path, 'test_c', comment=None) + tm.assert_frame_equal(read_frame, read_frame_uncommented) + + def test_comment_used(self): + # Test the comment argument is working as expected when used + with ensure_clean(self.ext) as path: + + # Create file to read in + write_frame = DataFrame({'A': ['one', '#one', 'one'], + 'B': ['two', 'two', '#two']}) + write_frame.to_excel(path, 'test_c') + + # Test read_frame_comment against manually produced expected output + read_frame_commented = read_excel(path, 'test_c', comment='#') + expected = read_excel(path, 'test_c') + expected.iloc[1, 0] = None + expected.iloc[1, 1] = None + expected.iloc[2, 1] = None + tm.assert_frame_equal(read_frame_commented, expected) + + def test_comment_emptyline(self): + # Test that read_excel ignores commented lines at the end of file + with ensure_clean(self.ext) as path: + + write_frame = DataFrame({'a': ['1', '#2'], 'b': ['2', '3']}) + write_frame.to_excel(path, index=False) + + # Test that all-comment lines at EoF are ignored + read_frame_short = read_excel(path, comment='#') + assert (read_frame_short.shape == write_frame.iloc[0:1, :].shape) + def test_datetimes(self): # Test writing and reading datetimes. For issue #9139. (xref #9185) From d2f31237e24db35a17f6bfd9169591c483f32737 Mon Sep 17 00:00:00 2001 From: JanLauGe Date: Wed, 13 Dec 2017 14:36:42 +0000 Subject: [PATCH 03/11] added whatsnew entry --- doc/source/whatsnew/v0.22.0.txt | 195 ++++++++++++++++++++++++++++++++ 1 file changed, 195 insertions(+) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 2d30e00142846..1fa2c17869d34 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -12,3 +12,198 @@ version. Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
_whatsnew_0220.api_breaking.deps: + +Dependencies have increased minimum versions +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +We have updated our minimum supported versions of dependencies (:issue:`15184`). +If installed, we now require: + + +-----------------+-----------------+----------+ + | Package | Minimum Version | Required | + +=================+=================+==========+ + | python-dateutil | 2.5.0 | X | + +-----------------+-----------------+----------+ + | openpyxl | 2.4.0 | | + +-----------------+-----------------+----------+ + + +- Building pandas for development now requires ``cython >= 0.24`` (:issue:`18613`) +- Building from source now explicitly requires ``setuptools`` in ``setup.py`` (:issue:`18113`) + +.. _whatsnew_0220.api: + +Other API Changes +^^^^^^^^^^^^^^^^^ + +- :func:`Series.astype` and :func:`Index.astype` with an incompatible dtype will now raise a ``TypeError`` rather than a ``ValueError`` (:issue:`18231`) +- ``Series`` construction with an ``object`` dtyped tz-aware datetime and ``dtype=object`` specified, will now return an ``object`` dtyped ``Series``, previously this would infer the datetime dtype (:issue:`18231`) +- A :class:`Series` of ``dtype=category`` constructed from an empty ``dict`` will now have categories of ``dtype=object`` rather than ``dtype=float64``, consistently with the case in which an empty list is passed (:issue:`18515`) +- ``NaT`` division with :class:`datetime.timedelta` will now return ``NaN`` instead of raising (:issue:`17876`) +- All-NaN levels in a ``MultiIndex`` are now assigned ``float`` rather than ``object`` dtype, promoting consistency with ``Index`` (:issue:`17929`). 
+- :class:`Timestamp` will no longer silently ignore unused or invalid ``tz`` or ``tzinfo`` keyword arguments (:issue:`17690`) +- :class:`Timestamp` will no longer silently ignore invalid ``freq`` arguments (:issue:`5168`) +- :class:`CacheableOffset` and :class:`WeekDay` are no longer available in the ``pandas.tseries.offsets`` module (:issue:`17830`) +- ``pandas.tseries.frequencies.get_freq_group()`` and ``pandas.tseries.frequencies.DAYS`` are removed from the public API (:issue:`18034`) +- :func:`Series.truncate` and :func:`DataFrame.truncate` will raise a ``ValueError`` if the index is not sorted instead of an unhelpful ``KeyError`` (:issue:`17935`) +- :func:`Index.map` can now accept ``Series`` and dictionary input objects (:issue:`12756`, :issue:`18482`, :issue:`18509`). +- :func:`DataFrame.unstack` will now default to filling with ``np.nan`` for ``object`` columns. (:issue:`12815`) +- :class:`IntervalIndex` constructor will raise if the ``closed`` parameter conflicts with how the input data is inferred to be closed (:issue:`18421`) +- Inserting missing values into indexes will work for all types of indexes and automatically insert the correct type of missing value (``NaN``, ``NaT``, etc.) regardless of the type passed in (:issue:`18295`) +- Restricted ``DateOffset`` keyword arguments. Previously, ``DateOffset`` subclasses allowed arbitrary keyword arguments which could lead to unexpected behavior. Now, only valid arguments will be accepted. (:issue:`17176`, :issue:`18226`). +- :func:`DataFrame.from_items` provides a more informative error message when passed scalar values (:issue:`17312`) +- When created with duplicate labels, ``MultiIndex`` now raises a ``ValueError``. 
(:issue:`17464`) +- :func:`Series.fillna` now raises a ``TypeError`` instead of a ``ValueError`` when passed a list, tuple or DataFrame as a ``value`` (:issue:`18293`) +- :func:`pandas.DataFrame.merge` no longer casts a ``float`` column to ``object`` when merging on ``int`` and ``float`` columns (:issue:`16572`) +- The default NA value for :class:`UInt64Index` has changed from 0 to ``NaN``, which impacts methods that mask with NA, such as ``UInt64Index.where()`` (:issue:`18398`) +- Refactored ``setup.py`` to use ``find_packages`` instead of explicitly listing out all subpackages (:issue:`18535`) +- Rearranged the order of keyword arguments in :func:`read_excel()` to align with :func:`read_csv()` (:issue:`16672`) +- :func:`pandas.merge` now raises a ``ValueError`` when trying to merge on incompatible data types (:issue:`9780`) +- :func:`wide_to_long` previously kept numeric-like suffixes as ``object`` dtype. Now they are cast to numeric if possible (:issue:`17627`) +- comment arg is exposed as a named parameter in :func:`read_excel` + +.. _whatsnew_0220.deprecations: + +Deprecations +~~~~~~~~~~~~ + +- ``Series.from_array`` and ``SparseSeries.from_array`` are deprecated. Use the normal constructor ``Series(..)`` and ``SparseSeries(..)`` instead (:issue:`18213`). +- ``DataFrame.as_matrix`` is deprecated. Use ``DataFrame.values`` instead (:issue:`18458`). +- ``Series.asobject``, ``DatetimeIndex.asobject``, ``PeriodIndex.asobject`` and ``TimeDeltaIndex.asobject`` have been deprecated. Use ``.astype(object)`` instead (:issue:`18572`) + +.. 
_whatsnew_0220.prior_deprecations: + +Removal of prior version deprecations/changes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +- Warnings against the obsolete usage ``Categorical(codes, categories)``, which were emitted for instance when the first two arguments to ``Categorical()`` had different dtypes, and recommended the use of ``Categorical.from_codes``, have now been removed (:issue:`8074`) +- The ``levels`` and ``labels`` attributes of a ``MultiIndex`` can no longer be set directly (:issue:`4039`). +- ``pd.tseries.util.pivot_annual`` has been removed (deprecated since v0.19). Use ``pivot_table`` instead (:issue:`18370`) +- ``pd.tseries.util.isleapyear`` has been removed (deprecated since v0.19). Use ``.is_leap_year`` property in Datetime-likes instead (:issue:`18370`) +- ``pd.ordered_merge`` has been removed (deprecated since v0.19). Use ``pd.merge_ordered`` instead (:issue:`18459`) +- The ``SparseList`` class has been removed (:issue:`14007`) +- The ``pandas.io.wb`` and ``pandas.io.data`` stub modules have been removed (:issue:`13735`) +- ``Categorical.from_array`` has been removed (:issue:`13854`) +- The ``freq`` and ``how`` parameters have been removed from the ``rolling``/``expanding``/``ewm`` methods of DataFrame + and Series (deprecated since v0.18). Instead, resample before calling the methods. (:issue:`18601`, :issue:`18668`) +- ``DatetimeIndex.to_datetime``, ``Timestamp.to_datetime``, ``PeriodIndex.to_datetime``, and ``Index.to_datetime`` have been removed (:issue:`8254`, :issue:`14096`, :issue:`14113`) +- :func:`read_csv` has dropped the ``skip_footer`` parameter (:issue:`13386`) + +.. 
_whatsnew_0220.performance: + +Performance Improvements +~~~~~~~~~~~~~~~~~~~~~~~~ + +- Indexers on ``Series`` or ``DataFrame`` no longer create a reference cycle (:issue:`17956`) +- Added a keyword argument, ``cache``, to :func:`to_datetime` that improved the performance of converting duplicate datetime arguments (:issue:`11665`) +- :class:`DateOffset` arithmetic performance is improved (:issue:`18218`) +- Converting a ``Series`` of ``Timedelta`` objects to days, seconds, etc... sped up through vectorization of underlying methods (:issue:`18092`) +- Improved performance of ``.map()`` with a ``Series/dict`` input (:issue:`15081`) +- The overridden ``Timedelta`` properties of days, seconds and microseconds have been removed, leveraging their built-in Python versions instead (:issue:`18242`) +- ``Series`` construction will reduce the number of copies made of the input data in certain cases (:issue:`17449`) +- Improved performance of :func:`Series.dt.date` and :func:`DatetimeIndex.date` (:issue:`18058`) +- Improved performance of :func:`Series.dt.time` and :func:`DatetimeIndex.time` (:issue:`18461`) +- Improved performance of :func:`IntervalIndex.symmetric_difference()` (:issue:`18475`) +- Improved performance of ``DatetimeIndex`` and ``Series`` arithmetic operations with Business-Month and Business-Quarter frequencies (:issue:`18489`) +- :func:`Series` / :func:`DataFrame` tab completion limits to 100 values, for better performance. (:issue:`18587`) + +.. _whatsnew_0220.docs: + +Documentation Changes +~~~~~~~~~~~~~~~~~~~~~ + +- +- +- + +.. 
_whatsnew_0220.bug_fixes: + +Bug Fixes +~~~~~~~~~ + + +Conversion +^^^^^^^^^^ + +- Bug in :class:`Index` constructor with ``dtype='uint64'`` where int-like floats were not coerced to :class:`UInt64Index` (:issue:`18400`) +- Bug in the :class:`DataFrame` constructor in which data containing very large positive or very large negative numbers was causing ``OverflowError`` (:issue:`18584`) +- Fixed a bug where creating a Series from an array that contains both tz-naive and tz-aware values will result in a Series whose dtype is tz-aware instead of object (:issue:`16406`) +- Adding a ``Period`` object to a ``datetime`` or ``Timestamp`` object will now correctly raise a ``TypeError`` (:issue:`17983`) +- Fixed a bug where ``FY5253`` date offsets could incorrectly raise an ``AssertionError`` in arithmetic operations (:issue:`14774`) +- Bug in :meth:`Index.astype` with a categorical dtype where the resultant index is not converted to a :class:`CategoricalIndex` for all types of index (:issue:`18630`) + + +Indexing +^^^^^^^^ + +- Bug in :func:`Series.truncate` which raises ``TypeError`` with a monotonic ``PeriodIndex`` (:issue:`17717`) +- Bug in :func:`DataFrame.groupby` where tuples were interpreted as lists of keys rather than as keys (:issue:`17979`, :issue:`18249`) +- Bug in :func:`MultiIndex.get_level_values` which would return an invalid index on level of ints with missing values (:issue:`17924`) +- Bug in :func:`MultiIndex.remove_unused_levels` which would fill nan values (:issue:`18417`) +- Bug in :func:`MultiIndex.from_tuples` which would fail to take zipped tuples in python3 (:issue:`18434`) +- Bug in :class:`Index` construction from list of mixed type tuples (:issue:`18505`) +- Bug in :class:`IntervalIndex` where empty and purely NA data was constructed inconsistently depending on the construction method (:issue:`18421`) +- Bug in :func:`IntervalIndex.symmetric_difference` where the symmetric difference with a non-``IntervalIndex`` did not raise (:issue:`18475`) +- 
Bug in indexing a datetimelike ``Index`` that raised ``ValueError`` instead of ``IndexError`` (:issue:`18386`). +- Bug in tz-aware :class:`DatetimeIndex` where addition/subtraction with a :class:`TimedeltaIndex` or array with ``dtype='timedelta64[ns]'`` was incorrect (:issue:`17558`) + +I/O +^^^ + +- :func:`read_html` now rewinds seekable IO objects after parse failure, before attempting to parse with a new parser. If a parser errors and the object is non-seekable, an informative error is raised suggesting the use of a different parser (:issue:`17975`) +- Bug in :func:`read_msgpack` with a non existent file is passed in Python 2 (:issue:`15296`) +- Bug in :func:`read_csv` where a ``MultiIndex`` with duplicate columns was not being mangled appropriately (:issue:`18062`) +- Bug in :func:`read_sas` where a file with 0 variables gave an ``AttributeError`` incorrectly. Now it gives an ``EmptyDataError`` (:issue:`18184`) +- +- + +Plotting +^^^^^^^^ + +- :func:`DataFrame.plot` now raises a ``ValueError`` when the ``x`` or ``y`` argument is improperly formed (:issue:`18671`) +- +- + +Groupby/Resample/Rolling +^^^^^^^^^^^^^^^^^^^^^^^^ + +- Bug when grouping by a single column and aggregating with a class like ``list`` or ``tuple`` (:issue:`18079`) +- +- + +Sparse +^^^^^^ + +- +- +- + +Reshaping +^^^^^^^^^ + +- Bug in :func:`DataFrame.stack` which fails trying to sort mixed type levels under Python 3 (:issue:`18310`) +- Fixed construction of a :class:`Series` from a ``dict`` containing ``NaN`` as key (:issue:`18480`) + +- + +Numeric +^^^^^^^ + +- +- +- + +Categorical +^^^^^^^^^^^ + +- +- +- + +Other +^^^^^ + +- Improved error message when attempting to use a Python keyword as an identifier in a ``numexpr`` backed query (:issue:`18221`) +- From cc8a5c2681bfc6e209968ff9eb801e55454dfead Mon Sep 17 00:00:00 2001 From: JanLauGe Date: Wed, 13 Dec 2017 17:15:36 +0000 Subject: [PATCH 04/11] modified tests as requested --- pandas/tests/io/test_excel.py | 56 
+++++++++++++++++++---------------- 1 file changed, 31 insertions(+), 25 deletions(-) diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index 266ccb011c57b..168144d78b3be 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -1859,60 +1859,66 @@ def test_invalid_columns(self): write_frame.to_excel(path, 'test1', columns=['C', 'D']) def test_comment_arg(self): + # Re issue #18735 # Test the comment argument functionality to read_excel with ensure_clean(self.ext) as path: # Create file to read in - write_frame = DataFrame({'A': ['one', '#one', 'one'], - 'B': ['two', 'two', '#two']}) - write_frame.to_excel(path, 'test_c') + df = DataFrame({'A': ['one', '#one', 'one'], + 'B': ['two', 'two', '#two']}) + df.to_excel(path, 'test_c') # Read file without comment arg - read_frame = read_excel(path, 'test_c') - read_frame_commented = read_excel(path, 'test_c', comment='#') - tm.assert_class_equal(read_frame, read_frame_commented) + result1 = read_excel(path, 'test_c') + result1.iloc[1, 0] = None + result1.iloc[1, 1] = None + result1.iloc[2, 1] = None + result2 = read_excel(path, 'test_c', comment='#') + tm.assert_frame_equal(result1, result2) def test_comment_default(self): + # Re issue #18735 # Test the comment argument default to read_excel with ensure_clean(self.ext) as path: # Create file to read in - write_frame = DataFrame({'A': ['one', '#one', 'one'], - 'B': ['two', 'two', '#two']}) - write_frame.to_excel(path, 'test_c') + df = DataFrame({'A': ['one', '#one', 'one'], + 'B': ['two', 'two', '#two']}) + df.to_excel(path, 'test_c') # Read file with default and explicit comment=None - read_frame = read_excel(path, 'test_c') - read_frame_uncommented = read_excel(path, 'test_c', comment=None) - tm.assert_frame_equal(read_frame, read_frame_uncommented) + result1 = read_excel(path, 'test_c') + result2 = read_excel(path, 'test_c', comment=None) + tm.assert_frame_equal(result1, result2) def test_comment_used(self): + # Re issue 
#18735 # Test the comment argument is working as expected when used with ensure_clean(self.ext) as path: # Create file to read in - write_frame = DataFrame({'A': ['one', '#one', 'one'], - 'B': ['two', 'two', '#two']}) - write_frame.to_excel(path, 'test_c') + df = DataFrame({'A': ['one', '#one', 'one'], + 'B': ['two', 'two', '#two']}) + df.to_excel(path, 'test_c') # Test read_frame_comment against manually produced expected output - read_frame_commented = read_excel(path, 'test_c', comment='#') - expected = read_excel(path, 'test_c') - expected.iloc[1, 0] = None - expected.iloc[1, 1] = None - expected.iloc[2, 1] = None - tm.assert_frame_equal(read_frame_commented, expected) + expected = DataFrame({'A': ['one', None, 'one'], + 'B': ['two', None, None]}) + result = read_excel(path, 'test_c', comment='#') + tm.assert_frame_equal(result, expected) def test_comment_emptyline(self): + # Re issue #18735 # Test that read_excel ignores commented lines at the end of file with ensure_clean(self.ext) as path: - write_frame = DataFrame({'a': ['1', '#2'], 'b': ['2', '3']}) - write_frame.to_excel(path, index=False) + df = DataFrame({'a': ['1', '#2'], 'b': ['2', '3']}) + df.to_excel(path, index=False) # Test that all-comment lines at EoF are ignored - read_frame_short = read_excel(path, comment='#') - assert (read_frame_short.shape == write_frame.iloc[0:1, :].shape) + expected = DataFrame({'a': [1], 'b': [2]}) + result = read_excel(path, comment='#') + tm.assert_frame_equal(result, expected) def test_datetimes(self): From 5d4be77f6987759ba11a5ff365b90b3a4ddab691 Mon Sep 17 00:00:00 2001 From: JanLauGe Date: Thu, 14 Dec 2017 14:17:53 +0000 Subject: [PATCH 05/11] changed order of arguments --- pandas/io/excel.py | 37 +++++++++++++++++++++++++++++-------- 1 file changed, 29 insertions(+), 8 deletions(-) diff --git a/pandas/io/excel.py b/pandas/io/excel.py index 96817a895a139..cc06a6c0fb859 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -148,6 +148,10 @@ this parameter is 
only necessary for columns stored as TEXT in Excel, any numeric columns will automatically be parsed, regardless of display format. +comment : str, default None + Comment out remainder of line. Pass a character or characters to this + argument to indicate comments in the input file. Any data between the + comment string and the end of the current line is ignored. skip_footer : int, default 0 .. deprecated:: 0.23.0 @@ -158,9 +162,6 @@ convert integral floats to int (i.e., 1.0 --> 1). If False, all numeric data will be read in as floats: Excel stores all numbers as floats internally -comment : str, default None - Comment out remainder of line. Character or characters to indicate comments - in the input file. Any data between comment and line end is ignored. Returns ------- @@ -302,7 +303,6 @@ def read_excel(io, thousands=None, skipfooter=0, convert_float=True, - comment=None, **kwds): # Can't use _deprecate_kwarg since sheetname=None has a special meaning @@ -334,9 +334,13 @@ def read_excel(io, parse_dates=parse_dates, date_parser=date_parser, thousands=thousands, +<<<<<<< cc8a5c2681bfc6e209968ff9eb801e55454dfead skipfooter=skipfooter, - convert_float=convert_float, +======= comment=comment, + skip_footer=skip_footer, +>>>>>>> changed order of arguments + convert_float=convert_float, **kwds) @@ -417,9 +421,13 @@ def parse(self, parse_dates=False, date_parser=None, thousands=None, +<<<<<<< cc8a5c2681bfc6e209968ff9eb801e55454dfead skipfooter=0, - convert_float=True, +======= comment=None, + skip_footer=0, +>>>>>>> changed order of arguments + convert_float=True, **kwds): """ Parse specified sheet(s) into a DataFrame @@ -443,9 +451,13 @@ def parse(self, parse_dates=parse_dates, date_parser=date_parser, thousands=thousands, +<<<<<<< cc8a5c2681bfc6e209968ff9eb801e55454dfead skipfooter=skipfooter, - convert_float=convert_float, +======= comment=comment, + skip_footer=skip_footer, +>>>>>>> changed order of arguments + convert_float=convert_float, **kwds) def 
_should_parse(self, i, usecols): @@ -498,9 +510,13 @@ def _parse_excel(self, parse_dates=False, date_parser=None, thousands=None, +<<<<<<< cc8a5c2681bfc6e209968ff9eb801e55454dfead skipfooter=0, - convert_float=True, +======= comment=None, + skip_footer=0, +>>>>>>> changed order of arguments + convert_float=True, **kwds): _validate_header_arg(header) @@ -673,7 +689,12 @@ def _parse_cell(cell_contents, cell_typ): parse_dates=parse_dates, date_parser=date_parser, thousands=thousands, +<<<<<<< cc8a5c2681bfc6e209968ff9eb801e55454dfead skipfooter=skipfooter, +======= + comment=comment, + skipfooter=skip_footer, +>>>>>>> changed order of arguments **kwds) output[asheetname] = parser.read(nrows=nrows) From e7ca7e661f6609af0e9df369b4dc9ce4ce4cc8f3 Mon Sep 17 00:00:00 2001 From: JanLauGe Date: Mon, 18 Dec 2017 10:25:06 +0000 Subject: [PATCH 06/11] trigger travisCI build --- pandas/io/excel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/excel.py b/pandas/io/excel.py index cc06a6c0fb859..135f82d5880a9 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -149,7 +149,7 @@ any numeric columns will automatically be parsed, regardless of display format. comment : str, default None - Comment out remainder of line. Pass a character or characters to this + Comments out remainder of line. Pass a character or characters to this argument to indicate comments in the input file. Any data between the comment string and the end of the current line is ignored. 
skip_footer : int, default 0 From 128e1486caa77b669e316c37fe8e169455533aa1 Mon Sep 17 00:00:00 2001 From: JanLauGe Date: Wed, 20 Dec 2017 08:26:31 +0000 Subject: [PATCH 07/11] modified whatsnew entry --- doc/source/whatsnew/v0.22.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 1fa2c17869d34..7bd8222c8c26f 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -62,7 +62,7 @@ Other API Changes - Rearranged the order of keyword arguments in :func:`read_excel()` to align with :func:`read_csv()` (:issue:`16672`) - :func:`pandas.merge` now raises a ``ValueError`` when trying to merge on incompatible data types (:issue:`9780`) - :func:`wide_to_long` previously kept numeric-like suffixes as ``object`` dtype. Now they are cast to numeric if possible (:issue:`17627`) -- comment arg is exposed as a named parameter in :func:`read_excel` +- in :func:`read_excel`, the ``comment`` argument is now exposed as a named parameter (:issue:`18735`) .. _whatsnew_0220.deprecations: From c56da898a3e0bf78e3f31044f1083398e078145c Mon Sep 17 00:00:00 2001 From: JanLauGe Date: Fri, 29 Dec 2017 08:07:07 +0000 Subject: [PATCH 08/11] rebase on master --- pandas/io/excel.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/io/excel.py b/pandas/io/excel.py index 135f82d5880a9..68b68df8d4431 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -221,10 +221,7 @@ as strings or lists of strings! >>> pd.read_excel(open('tmp.xlsx','rb'), -... true_values='2', -... false_values='3', -... na_values=['string1', 'string2'], -... thousands=',') +... 
na_values=['string1', 'string2']) Name Value 0 NaN 1 1 NaN 2 From fda8fa2aacfac867d7266f540222f4a31cb19a6f Mon Sep 17 00:00:00 2001 From: JanLauGe Date: Fri, 29 Dec 2017 08:40:18 +0000 Subject: [PATCH 09/11] DOC: read_excel doc - fixed formatting and added examples --- doc/source/whatsnew/v0.22.0.txt | 2 -- doc/source/whatsnew/v0.23.0.txt | 2 ++ pandas/io/excel.py | 46 ++++++++++----------------------- 3 files changed, 16 insertions(+), 34 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 7bd8222c8c26f..caa94f4003764 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -59,10 +59,8 @@ Other API Changes - :func:`pandas.DataFrame.merge` no longer casts a ``float`` column to ``object`` when merging on ``int`` and ``float`` columns (:issue:`16572`) - The default NA value for :class:`UInt64Index` has changed from 0 to ``NaN``, which impacts methods that mask with NA, such as ``UInt64Index.where()`` (:issue:`18398`) - Refactored ``setup.py`` to use ``find_packages`` instead of explicitly listing out all subpackages (:issue:`18535`) -- Rearranged the order of keyword arguments in :func:`read_excel()` to align with :func:`read_csv()` (:issue:`16672`) - :func:`pandas.merge` now raises a ``ValueError`` when trying to merge on incompatible data types (:issue:`9780`) - :func:`wide_to_long` previously kept numeric-like suffixes as ``object`` dtype. Now they are cast to numeric if possible (:issue:`17627`) -- in :func:`read_excel`, the ``comment`` argument is now exposed as a named parameter (:issue:`18735`) .. 
_whatsnew_0220.deprecations: diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 5e55efb4e21fb..4b16c1e86840c 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -203,6 +203,8 @@ Other API Changes - Rearranged the order of keyword arguments in :func:`read_excel()` to align with :func:`read_csv()` (:issue:`16672`) - :func:`pandas.merge` now raises a ``ValueError`` when trying to merge on incompatible data types (:issue:`9780`) - :func:`wide_to_long` previously kept numeric-like suffixes as ``object`` dtype. Now they are cast to numeric if possible (:issue:`17627`) +- In :func:`read_excel`, the ``comment`` argument is now exposed as a named parameter (:issue:`18735`) +- Rearranged the order of keyword arguments in :func:`read_excel()` to align with :func:`read_csv()` (:issue:`16672`) .. _whatsnew_0230.deprecations: diff --git a/pandas/io/excel.py b/pandas/io/excel.py index 68b68df8d4431..3f3dff2742d52 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -132,12 +132,13 @@ nrows : int, default None Number of rows to parse - .. versionadded:: 0.23.0 + .. versionadded:: 0.22.0 na_values : scalar, str, list-like, or dict, default None Additional strings to recognize as NA/NaN. If dict passed, specific per-column NA values. By default the following values are interpreted - as NaN: '""" + fill("', '".join(sorted(_NA_VALUES)), 999) + """'. + as NaN: '""" + fill("', '".join(sorted(_NA_VALUES)), 70, + subsequent_indent=" ") + """'. keep_default_na : bool, default True If na_values are specified and keep_default_na is False the default NaN values are overridden, otherwise they're appended to. @@ -154,7 +155,7 @@ comment string and the end of the current line is ignored. skip_footer : int, default 0 - .. deprecated:: 0.23.0 + .. deprecated:: 0.22.0 Pass in `skipfooter` instead. 
skipfooter : int, default 0 Rows at the end to skip (0-indexed) @@ -177,7 +178,7 @@ >>> df_out = pd.DataFrame([('string1', 1), ... ('string2', 2), ... ('string3', 3)], -... columns=('Name', 'Value')) +... columns=['Name', 'Value']) >>> df_out Name Value 0 string1 1 @@ -201,7 +202,7 @@ Index and header can be specified via the `index_col` and `header` arguments ->>> pd.read_excel(open('tmp.xlsx','rb'), index_col=None, header=None) +>>> pd.read_excel('tmp.xlsx', index_col=None, header=None) 0 1 2 0 NaN Name Value 1 0.0 string1 1 @@ -210,7 +211,7 @@ Column types are inferred but can be explicitly specified ->>> pd.read_excel(open('tmp.xlsx','rb'), dtype={'Name':str, 'Value':float}) +>>> pd.read_excel('tmp.xlsx', dtype={'Name':str, 'Value':float}) Name Value 0 string1 1.0 1 string2 2.0 @@ -220,7 +221,7 @@ but can be explicitly specified, too. Supply the values you would like as strings or lists of strings! ->>> pd.read_excel(open('tmp.xlsx','rb'), +>>> pd.read_excel('tmp.xlsx', ... na_values=['string1', 'string2']) Name Value 0 NaN 1 @@ -298,6 +299,7 @@ def read_excel(io, parse_dates=False, date_parser=None, thousands=None, + comment=None, skipfooter=0, convert_float=True, **kwds): @@ -331,12 +333,8 @@ def read_excel(io, parse_dates=parse_dates, date_parser=date_parser, thousands=thousands, -<<<<<<< cc8a5c2681bfc6e209968ff9eb801e55454dfead - skipfooter=skipfooter, -======= comment=comment, - skip_footer=skip_footer, ->>>>>>> changed order of arguments + skipfooter=skipfooter, convert_float=convert_float, **kwds) @@ -418,12 +416,8 @@ def parse(self, parse_dates=False, date_parser=None, thousands=None, -<<<<<<< cc8a5c2681bfc6e209968ff9eb801e55454dfead - skipfooter=0, -======= comment=None, - skip_footer=0, ->>>>>>> changed order of arguments + skipfooter=0, convert_float=True, **kwds): """ @@ -448,12 +442,8 @@ def parse(self, parse_dates=parse_dates, date_parser=date_parser, thousands=thousands, -<<<<<<< cc8a5c2681bfc6e209968ff9eb801e55454dfead - skipfooter=skipfooter, 
-======= comment=comment, - skip_footer=skip_footer, ->>>>>>> changed order of arguments + skipfooter=skipfooter, convert_float=convert_float, **kwds) @@ -507,12 +497,8 @@ def _parse_excel(self, parse_dates=False, date_parser=None, thousands=None, -<<<<<<< cc8a5c2681bfc6e209968ff9eb801e55454dfead - skipfooter=0, -======= comment=None, - skip_footer=0, ->>>>>>> changed order of arguments + skipfooter=0, convert_float=True, **kwds): @@ -686,12 +672,8 @@ def _parse_cell(cell_contents, cell_typ): parse_dates=parse_dates, date_parser=date_parser, thousands=thousands, -<<<<<<< cc8a5c2681bfc6e209968ff9eb801e55454dfead - skipfooter=skipfooter, -======= comment=comment, - skipfooter=skip_footer, ->>>>>>> changed order of arguments + skipfooter=skipfooter, **kwds) output[asheetname] = parser.read(nrows=nrows) From 10966555b672eb81d9f4508946192bf028db5391 Mon Sep 17 00:00:00 2001 From: JanLauGe Date: Sat, 30 Dec 2017 02:35:45 +0000 Subject: [PATCH 10/11] DOC: read_excel doc - fixed formatting and added examples --- pandas/io/excel.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/excel.py b/pandas/io/excel.py index 3f3dff2742d52..1703c0dcf0cf1 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -132,7 +132,7 @@ nrows : int, default None Number of rows to parse - .. versionadded:: 0.22.0 + .. versionadded:: 0.23.0 na_values : scalar, str, list-like, or dict, default None Additional strings to recognize as NA/NaN. If dict passed, specific @@ -155,7 +155,7 @@ comment string and the end of the current line is ignored. skip_footer : int, default 0 - .. deprecated:: 0.22.0 + .. deprecated:: 0.23.0 Pass in `skipfooter` instead. 
skipfooter : int, default 0 Rows at the end to skip (0-indexed) From 6afed0692a6fddfb161aeeb312924db311cc9252 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 30 Dec 2017 07:34:58 -0500 Subject: [PATCH 11/11] lint --- pandas/io/excel.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/io/excel.py b/pandas/io/excel.py index 1703c0dcf0cf1..4f0655cff9b57 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -137,8 +137,7 @@ na_values : scalar, str, list-like, or dict, default None Additional strings to recognize as NA/NaN. If dict passed, specific per-column NA values. By default the following values are interpreted - as NaN: '""" + fill("', '".join(sorted(_NA_VALUES)), 70, - subsequent_indent=" ") + """'. + as NaN: '""" + fill("', '".join(sorted(_NA_VALUES)), 70, subsequent_indent=" ") + """'. keep_default_na : bool, default True If na_values are specified and keep_default_na is False the default NaN values are overridden, otherwise they're appended to.