From afc7b2e42e6a6d35d27600a3fb786de9ca8effe9 Mon Sep 17 00:00:00 2001 From: Bob Date: Thu, 7 Jul 2016 01:29:50 -0500 Subject: [PATCH 1/9] Adding and arguments to for #13461 and #10534 --- pandas/io/html.py | 28 +++++++++++++++---- pandas/io/tests/test_html.py | 53 ++++++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+), 6 deletions(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index 48caaa39dd711..b0d04507899bc 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -613,7 +613,7 @@ def _expand_elements(body): def _data_to_frame(data, header, index_col, skiprows, parse_dates, tupleize_cols, thousands, - decimal): + decimal, converters, na_values): head, body, foot = data if head: @@ -631,7 +631,8 @@ def _data_to_frame(data, header, index_col, skiprows, tp = TextParser(body, header=header, index_col=index_col, skiprows=_get_skiprows(skiprows), parse_dates=parse_dates, tupleize_cols=tupleize_cols, - thousands=thousands, decimal=decimal) + thousands=thousands, decimal=decimal, + converters=converters, na_values=na_values) df = tp.read() return df @@ -718,7 +719,7 @@ def _validate_flavor(flavor): def _parse(flavor, io, match, header, index_col, skiprows, parse_dates, tupleize_cols, thousands, attrs, encoding, - decimal): + decimal, converters, na_values): flavor = _validate_flavor(flavor) compiled_match = re.compile(match) # you can pass a compiled regex here @@ -747,7 +748,9 @@ def _parse(flavor, io, match, header, index_col, skiprows, parse_dates=parse_dates, tupleize_cols=tupleize_cols, thousands=thousands, - decimal=decimal + decimal=decimal, + converters=converters, + na_values=na_values )) except EmptyDataError: # empty table continue @@ -757,7 +760,7 @@ def _parse(flavor, io, match, header, index_col, skiprows, def read_html(io, match='.+', flavor=None, header=None, index_col=None, skiprows=None, attrs=None, parse_dates=False, tupleize_cols=False, thousands=',', encoding=None, - decimal='.'): + decimal='.', converters=None, na_values=None): r"""Read HTML tables into a ``list`` of ``DataFrame`` objects. Parameters @@ -839,6 +842,19 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, .. versionadded:: 0.18.2 + converters : dict, default None + Dict of functions for converting values in certain columns. Keys can + either be integers or column labels, values are functions that take one + input argument, the cell (not column) content, and return the + transformed content. + + .. versionadded:: 0.19.0 + + na_values : iterable, default None + Custom NA values + + .. versionadded:: 0.19.0 + Returns ------- dfs : list of DataFrames @@ -883,4 +899,4 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, _validate_header_arg(header) return _parse(flavor, io, match, header, index_col, skiprows, parse_dates, tupleize_cols, thousands, attrs, encoding, - decimal) + decimal, converters, na_values) diff --git a/pandas/io/tests/test_html.py b/pandas/io/tests/test_html.py index 5a95fe7727df0..1ad6114adf691 100644 --- a/pandas/io/tests/test_html.py +++ b/pandas/io/tests/test_html.py @@ -694,6 +694,59 @@ def test_bool_header_arg(self): with tm.assertRaises(TypeError): read_html(self.spam_data, header=arg) + def test_converters(self): + # GH 13461 + html_data = """ + + + + + + + + + + + + + + + + + + +
NamesC_l0_g0C_l0_g1
R_l0_g0 0.763 0.233
R_l0_g1 0.244 0.285
""" + raw_data = np.array([[u'R_l0_g0', '0.763', 0.233], + [u'R_l0_g1', '0.244', 0.285]], dtype=object) + html_df = read_html(html_data, converters={'C_l0_g0': str})[0] + tm.assert_numpy_array_equal(raw_data, html_df.values) + + def test_na_values(self): + # GH 13461 + html_data = """ + + + + + + + + + + + + + + + + + + +
NamesC_l0_g0C_l0_g1
R_l0_g0 0.763 0.233
R_l0_g1 0.244 0.285
""" + raw_data = np.array([[u'R_l0_g0', 0.763, 0.233], + [u'R_l0_g1', 0.244, np.nan]], dtype=object) + html_df = read_html(html_data, na_values=[0.285])[0] + tm.assert_numpy_array_equal(raw_data, html_df.values) def _lang_enc(filename): return os.path.splitext(os.path.basename(filename))[0].split('_') From b7fa8a7220be3178745fa78f273ed71062e574fd Mon Sep 17 00:00:00 2001 From: Bob Date: Thu, 7 Jul 2016 01:35:53 -0500 Subject: [PATCH 2/9] fixing lint --- pandas/io/tests/test_html.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/io/tests/test_html.py b/pandas/io/tests/test_html.py index 1ad6114adf691..fb69a90e4bfdf 100644 --- a/pandas/io/tests/test_html.py +++ b/pandas/io/tests/test_html.py @@ -748,6 +748,7 @@ def test_na_values(self): html_df = read_html(html_data, na_values=[0.285])[0] tm.assert_numpy_array_equal(raw_data, html_df.values) + def _lang_enc(filename): return os.path.splitext(os.path.basename(filename))[0].split('_') From cfe786c7018ac4b7e322898c0e5c31fda56e6973 Mon Sep 17 00:00:00 2001 From: Bob Date: Thu, 7 Jul 2016 01:46:06 -0500 Subject: [PATCH 3/9] adding whatsnew --- doc/source/whatsnew/v0.18.2.txt | 4 ++++ pandas/io/html.py | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index b9afa7fcb7959..226019d25e807 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -207,6 +207,10 @@ Other enhancements - The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``na_filter`` option (:issue:`13321`) - The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``memory_map`` option (:issue:`13381`) +- The ``pd.read_html()`` has gained support for the ``na_values`` option (:issue:`13461`) +- The ``pd.read_html()`` has gained support for the ``converters`` option (:issue:`13461`) + + - ``Index.astype()`` now accepts an optional boolean argument ``copy``, which allows optional copying if the requirements on dtype are satisfied (:issue:`13209`) - ``Index`` now supports the ``.where()`` function for same shape indexing (:issue:`13170`) diff --git a/pandas/io/html.py b/pandas/io/html.py index b0d04507899bc..93303cc910f5d 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -848,12 +848,12 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, input argument, the cell (not column) content, and return the transformed content. - .. versionadded:: 0.19.0 + .. versionadded:: 0.18.2 na_values : iterable, default None Custom NA values - .. versionadded:: 0.19.0 + .. versionadded:: 0.18.2 Returns ------- From 3f828e29c80bf973e5ac238ca9f1e88164eec6b8 Mon Sep 17 00:00:00 2001 From: Bob Date: Fri, 8 Jul 2016 14:04:26 -0500 Subject: [PATCH 4/9] removing old whatsnew --- doc/source/whatsnew/v0.18.2.txt | 534 -------------------------------- 1 file changed, 534 deletions(-) delete mode 100644 doc/source/whatsnew/v0.18.2.txt diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt deleted file mode 100644 index 226019d25e807..0000000000000 --- a/doc/source/whatsnew/v0.18.2.txt +++ /dev/null @@ -1,534 +0,0 @@ -.. _whatsnew_0182: - -v0.18.2 (July ??, 2016) ------------------------ - -This is a minor bug-fix release from 0.18.1 and includes a large number of -bug fixes along with several new features, enhancements, and performance improvements. -We recommend that all users upgrade to this version. - -Highlights include: - -- :func:`merge_asof` for asof-style time-series joining, see :ref:`here ` - -.. contents:: What's new in v0.18.2 - :local: - :backlinks: none - -.. _whatsnew_0182.new_features: - -New features -~~~~~~~~~~~~ - -.. _whatsnew_0182.enhancements.asof_merge: - -:func:`merge_asof` for asof-style time-series joining -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -A long-time requested feature has been added through the :func:`merge_asof` function, to -support asof style joining of time-series. (:issue:`1870`). Full documentation is -:ref:`here ` - -The :func:`merge_asof` performs an asof merge, which is similar to a left-join -except that we match on nearest key rather than equal keys. - -.. ipython:: python - - left = pd.DataFrame({'a': [1, 5, 10], - 'left_val': ['a', 'b', 'c']}) - right = pd.DataFrame({'a': [1, 2, 3, 6, 7], - 'right_val': [1, 2, 3, 6, 7]}) - - left - right - -We typically want to match exactly when possible, and use the most -recent value otherwise. - -.. ipython:: python - - pd.merge_asof(left, right, on='a') - -We can also match rows ONLY with prior data, and not an exact match. - -.. ipython:: python - - pd.merge_asof(left, right, on='a', allow_exact_matches=False) - - -In a typical time-series example, we have ``trades`` and ``quotes`` and we want to ``asof-join`` them. -This also illustrates using the ``by`` parameter to group data before merging. - -.. ipython:: python - - trades = pd.DataFrame({ - 'time': pd.to_datetime(['20160525 13:30:00.023', - '20160525 13:30:00.038', - '20160525 13:30:00.048', - '20160525 13:30:00.048', - '20160525 13:30:00.048']), - 'ticker': ['MSFT', 'MSFT', - 'GOOG', 'GOOG', 'AAPL'], - 'price': [51.95, 51.95, - 720.77, 720.92, 98.00], - 'quantity': [75, 155, - 100, 100, 100]}, - columns=['time', 'ticker', 'price', 'quantity']) - - quotes = pd.DataFrame({ - 'time': pd.to_datetime(['20160525 13:30:00.023', - '20160525 13:30:00.023', - '20160525 13:30:00.030', - '20160525 13:30:00.041', - '20160525 13:30:00.048', - '20160525 13:30:00.049', - '20160525 13:30:00.072', - '20160525 13:30:00.075']), - 'ticker': ['GOOG', 'MSFT', 'MSFT', - 'MSFT', 'GOOG', 'AAPL', 'GOOG', - 'MSFT'], - 'bid': [720.50, 51.95, 51.97, 51.99, - 720.50, 97.99, 720.50, 52.01], - 'ask': [720.93, 51.96, 51.98, 52.00, - 720.93, 98.01, 720.88, 52.03]}, - columns=['time', 'ticker', 'bid', 'ask']) - -.. ipython:: python - - trades - quotes - -An asof merge joins on the ``on``, typically a datetimelike field, which is ordered, and -in this case we are using a grouper in the ``by`` field. This is like a left-outer join, except -that forward filling happens automatically taking the most recent non-NaN value. - -.. ipython:: python - - pd.merge_asof(trades, quotes, - on='time', - by='ticker') - -This returns a merged DataFrame with the entries in the same order as the original left -passed DataFrame (``trades`` in this case), with the fields of the ``quotes`` merged. - -.. _whatsnew_0182.enhancements.read_csv_dupe_col_names_support: - -:func:`read_csv` has improved support for duplicate column names -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -:ref:`Duplicate column names ` are now supported in :func:`read_csv` whether -they are in the file or passed in as the ``names`` parameter (:issue:`7160`, :issue:`9424`) - -.. ipython :: python - - data = '0,1,2\n3,4,5' - names = ['a', 'b', 'a'] - -Previous behaviour: - -.. code-block:: ipython - - In [2]: pd.read_csv(StringIO(data), names=names) - Out[2]: - a b a - 0 2 1 2 - 1 5 4 5 - -The first 'a' column contains the same data as the second 'a' column, when it should have -contained the array ``[0, 3]``. - -New behaviour: - -.. ipython :: python - - In [2]: pd.read_csv(StringIO(data), names=names) - -.. _whatsnew_0182.enhancements.semi_month_offsets: - -Semi-Month Offsets -^^^^^^^^^^^^^^^^^^ - -Pandas has gained new frequency offsets, ``SemiMonthEnd`` ('SM') and ``SemiMonthBegin`` ('SMS'). -These provide date offsets anchored (by default) to the 15th and end of month, and 15th and 1st of month respectively. -(:issue:`1543`) - -.. ipython:: python - - from pandas.tseries.offsets import SemiMonthEnd, SemiMonthBegin - -SemiMonthEnd: - -.. ipython:: python - - Timestamp('2016-01-01') + SemiMonthEnd() - - pd.date_range('2015-01-01', freq='SM', periods=4) - -SemiMonthBegin: - -.. ipython:: python - - Timestamp('2016-01-01') + SemiMonthBegin() - - pd.date_range('2015-01-01', freq='SMS', periods=4) - -Using the anchoring suffix, you can also specify the day of month to use instead of the 15th. - -.. ipython:: python - - pd.date_range('2015-01-01', freq='SMS-16', periods=4) - - pd.date_range('2015-01-01', freq='SM-14', periods=4) - -.. _whatsnew_0182.enhancements.other: - -Other enhancements -^^^^^^^^^^^^^^^^^^ - -- The ``.tz_localize()`` method of ``DatetimeIndex`` and ``Timestamp`` has gained the ``errors`` keyword, so you can potentially coerce nonexistent timestamps to ``NaT``. The default behaviour remains to raising a ``NonExistentTimeError`` (:issue:`13057`) - -- ``Index`` now supports ``.str.extractall()`` which returns a ``DataFrame``, see :ref:`documentation here ` (:issue:`10008`, :issue:`13156`) -- ``.to_hdf/read_hdf()`` now accept path objects (e.g. ``pathlib.Path``, ``py.path.local``) for the file path (:issue:`11773`) - - .. ipython:: python - - idx = pd.Index(["a1a2", "b1", "c1"]) - idx.str.extractall("[ab](?P\d)") - -- ``Timestamp`` s can now accept positional and keyword parameters like :func:`datetime.datetime` (:issue:`10758`, :issue:`11630`) - - .. ipython:: python - - pd.Timestamp(2012, 1, 1) - - pd.Timestamp(year=2012, month=1, day=1, hour=8, minute=30) - -- The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``decimal`` option (:issue:`12933`) -- The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``na_filter`` option (:issue:`13321`) -- The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``memory_map`` option (:issue:`13381`) - -- The ``pd.read_html()`` has gained support for the ``na_values`` option (:issue:`13461`) -- The ``pd.read_html()`` has gained support for the ``converters`` option (:issue:`13461`) - - -- ``Index.astype()`` now accepts an optional boolean argument ``copy``, which allows optional copying if the requirements on dtype are satisfied (:issue:`13209`) -- ``Index`` now supports the ``.where()`` function for same shape indexing (:issue:`13170`) - - .. ipython:: python - - idx = pd.Index(['a', 'b', 'c']) - idx.where([True, False, True]) - -- ``Categorical.astype()`` now accepts an optional boolean argument ``copy``, effective when dtype is categorical (:issue:`13209`) -- ``DataFrame`` has gained the ``.asof()`` method to return the last non-NaN values according to the selected subset (:issue:`13358`) -- Consistent with the Python API, ``pd.read_csv()`` will now interpret ``+inf`` as positive infinity (:issue:`13274`) -- The ``DataFrame`` constructor will now respect key ordering if a list of ``OrderedDict`` objects are passed in (:issue:`13304`) -- ``pd.read_html()`` has gained support for the ``decimal`` option (:issue:`12907`) -- A ``union_categorical`` function has been added for combining categoricals, see :ref:`Unioning Categoricals` (:issue:`13361`) -- ``eval``'s upcasting rules for ``float32`` types have been updated to be more consistent with NumPy's rules. New behavior will not upcast to ``float64`` if you multiply a pandas ``float32`` object by a scalar float64. (:issue:`12388`) -- ``Series`` has gained the properties ``.is_monotonic``, ``.is_monotonic_increasing``, ``.is_monotonic_decreasing``, similar to ``Index`` (:issue:`13336`) - -.. _whatsnew_0182.api: - -API changes -~~~~~~~~~~~ - - -- Non-convertible dates in an excel date column will be returned without conversion and the column will be ``object`` dtype, rather than raising an exception (:issue:`10001`) -- An ``UnsupportedFunctionCall`` error is now raised if NumPy ufuncs like ``np.mean`` are called on groupby or resample objects (:issue:`12811`) -- Calls to ``.sample()`` will respect the random seed set via ``numpy.random.seed(n)`` (:issue:`13161`) -- ``Styler.apply`` is now more strict about the outputs your function must return. For ``axis=0`` or ``axis=1``, the output shape must be identical. For ``axis=None``, the output must be a DataFrame with identical columns and index labels. (:issue:`13222`) - -.. _whatsnew_0182.api.tolist: - -``Series.tolist()`` will now return Python types -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -``Series.tolist()`` will now return Python types in the output, mimicking NumPy ``.tolist()`` behaviour (:issue:`10904`) - - -.. ipython:: python - - s = pd.Series([1,2,3]) - type(s.tolist()[0]) - -Previous Behavior: - -.. code-block:: ipython - - In [7]: type(s.tolist()[0]) - Out[7]: - - -New Behavior: - -.. ipython:: python - - type(s.tolist()[0]) - -.. _whatsnew_0182.api.promote: - -``Series`` type promotion on assignment -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -A ``Series`` will now correctly promote its dtype for assignment with incompat values to the current dtype (:issue:`13234`) - - -.. ipython:: python - - s = pd.Series() - -Previous Behavior: - -.. code-block:: ipython - - In [2]: s["a"] = pd.Timestamp("2016-01-01") - - In [3]: s["b"] = 3.0 - TypeError: invalid type promotion - -New Behavior: - -.. ipython:: python - - s["a"] = pd.Timestamp("2016-01-01") - s["b"] = 3.0 - s - s.dtype - -.. _whatsnew_0182.api.to_datetime_coerce: - -``.to_datetime()`` when coercing -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -A bug is fixed in ``.to_datetime()`` when passing integers or floats, and no ``unit`` and ``errors='coerce'`` (:issue:`13180`). -Previously if ``.to_datetime()`` encountered mixed integers/floats and strings, but no datetimes with ``errors='coerce'`` it would convert all to ``NaT``. - -Previous Behavior: - -.. code-block:: ipython - - In [2]: pd.to_datetime([1, 'foo'], errors='coerce') - Out[2]: DatetimeIndex(['NaT', 'NaT'], dtype='datetime64[ns]', freq=None) - -This will now convert integers/floats with the default unit of ``ns``. - -.. ipython:: python - - pd.to_datetime([1, 'foo'], errors='coerce') - -.. _whatsnew_0182.api.merging: - -Merging changes -^^^^^^^^^^^^^^^ - -Merging will now preserve the dtype of the join keys (:issue:`8596`) - -.. ipython:: python - - df1 = pd.DataFrame({'key': [1], 'v1': [10]}) - df1 - df2 = pd.DataFrame({'key': [1, 2], 'v1': [20, 30]}) - df2 - -Previous Behavior: - -.. code-block:: ipython - - In [5]: pd.merge(df1, df2, how='outer') - Out[5]: - key v1 - 0 1.0 10.0 - 1 1.0 20.0 - 2 2.0 30.0 - - In [6]: pd.merge(df1, df2, how='outer').dtypes - Out[6]: - key float64 - v1 float64 - dtype: object - -New Behavior: - -We are able to preserve the join keys - -.. ipython:: python - - pd.merge(df1, df2, how='outer') - pd.merge(df1, df2, how='outer').dtypes - -Of course if you have missing values that are introduced, then the -resulting dtype will be upcast (unchanged from previous). - -.. ipython:: python - - pd.merge(df1, df2, how='outer', on='key') - pd.merge(df1, df2, how='outer', on='key').dtypes - -.. _whatsnew_0182.describe: - -``.describe()`` changes -^^^^^^^^^^^^^^^^^^^^^^^ - -Percentile identifiers in the index of a ``.describe()`` output will now be rounded to the least precision that keeps them distinct (:issue:`13104`) - -.. ipython:: python - - s = pd.Series([0, 1, 2, 3, 4]) - df = pd.DataFrame([0, 1, 2, 3, 4]) - -Previous Behavior: - -The percentiles were rounded to at most one decimal place, which could raise ``ValueError`` for a data frame if the percentiles were duplicated. - -.. code-block:: ipython - - In [3]: s.describe(percentiles=[0.0001, 0.0005, 0.001, 0.999, 0.9995, 0.9999]) - Out[3]: - count 5.000000 - mean 2.000000 - std 1.581139 - min 0.000000 - 0.0% 0.000400 - 0.1% 0.002000 - 0.1% 0.004000 - 50% 2.000000 - 99.9% 3.996000 - 100.0% 3.998000 - 100.0% 3.999600 - max 4.000000 - dtype: float64 - - In [4]: df.describe(percentiles=[0.0001, 0.0005, 0.001, 0.999, 0.9995, 0.9999]) - Out[4]: - ... - ValueError: cannot reindex from a duplicate axis - -New Behavior: - -.. ipython:: python - - s.describe(percentiles=[0.0001, 0.0005, 0.001, 0.999, 0.9995, 0.9999]) - df.describe(percentiles=[0.0001, 0.0005, 0.001, 0.999, 0.9995, 0.9999]) - -Furthermore: - -- Passing duplicated ``percentiles`` will now raise a ``ValueError``. -- Bug in ``.describe()`` on a DataFrame with a mixed-dtype column index, which would previously raise a ``TypeError`` (:issue:`13288`) - -.. _whatsnew_0182.api.other: - -Other API changes -^^^^^^^^^^^^^^^^^ - -- ``Float64Index.astype(int)`` will now raise ``ValueError`` if ``Float64Index`` contains ``NaN`` values (:issue:`13149`) -- ``TimedeltaIndex.astype(int)`` and ``DatetimeIndex.astype(int)`` will now return ``Int64Index`` instead of ``np.array`` (:issue:`13209`) -- ``.filter()`` enforces mutual exclusion of the keyword arguments. (:issue:`12399`) -- ``PeridIndex`` can now accept ``list`` and ``array`` which contains ``pd.NaT`` (:issue:`13430`) -- ``__setitem__`` will no longer apply a callable rhs as a function instead of storing it. Call ``where`` directly to get the previous behavior. (:issue:`13299`) - -.. _whatsnew_0182.deprecations: - -Deprecations -^^^^^^^^^^^^ - -- ``compact_ints`` and ``use_unsigned`` have been deprecated in ``pd.read_csv()`` and will be removed in a future version (:issue:`13320`) -- ``buffer_lines`` has been deprecated in ``pd.read_csv()`` and will be removed in a future version (:issue:`13360`) -- ``as_recarray`` has been deprecated in ``pd.read_csv()`` and will be removed in a future version (:issue:`13373`) -- top-level ``pd.ordered_merge()`` has been renamed to ``pd.merge_ordered()`` and the original name will be removed in a future version (:issue:`13358`) - -.. _whatsnew_0182.performance: - -Performance Improvements -~~~~~~~~~~~~~~~~~~~~~~~~ - -- Improved performance of sparse ``IntIndex.intersect`` (:issue:`13082`) -- Improved performance of sparse arithmetic with ``BlockIndex`` when the number of blocks are large, though recommended to use ``IntIndex`` in such cases (:issue:`13082`) -- increased performance of ``DataFrame.quantile()`` as it now operates per-block (:issue:`11623`) - -- Improved performance of float64 hash table operations, fixing some very slow indexing and groupby operations in python 3 (:issue:`13166`, :issue:`13334`) -- Improved performance of ``DataFrameGroupBy.transform`` (:issue:`12737`) - - -.. _whatsnew_0182.bug_fixes: - -Bug Fixes -~~~~~~~~~ - -- Bug in ``io.json.json_normalize()``, where non-ascii keys raised an exception (:issue:`13213`) -- Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing may raise ``IndexError`` (:issue:`13144`) -- Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing result may have normal ``Index`` (:issue:`13144`) -- Bug in ``SparseDataFrame`` in which ``axis=None`` did not default to ``axis=0`` (:issue:`13048`) -- Bug in ``SparseSeries`` and ``SparseDataFrame`` creation with ``object`` dtype may raise ``TypeError`` (:issue:`11633`) -- Bug when passing a not-default-indexed ``Series`` as ``xerr`` or ``yerr`` in ``.plot()`` (:issue:`11858`) -- Bug in matplotlib ``AutoDataFormatter``; this restores the second scaled formatting and re-adds micro-second scaled formatting (:issue:`13131`) -- Bug in selection from a ``HDFStore`` with a fixed format and ``start`` and/or ``stop`` specified will now return the selected range (:issue:`8287`) - - -- Bug in ``.groupby(..).resample(..)`` when the same object is called multiple times (:issue:`13174`) -- Bug in ``.to_records()`` when index name is a unicode string (:issue:`13172`) - -- Bug in calling ``.memory_usage()`` on object which doesn't implement (:issue:`12924`) - -- Regression in ``Series.quantile`` with nans (also shows up in ``.median()`` and ``.describe()`` ); furthermore now names the ``Series`` with the quantile (:issue:`13098`, :issue:`13146`) - -- Bug in ``SeriesGroupBy.transform`` with datetime values and missing groups (:issue:`13191`) - -- Bug in ``Series.str.extractall()`` with ``str`` index raises ``ValueError`` (:issue:`13156`) -- Bug in ``Series.str.extractall()`` with single group and quantifier (:issue:`13382`) - - -- Bug in ``PeriodIndex`` and ``Period`` subtraction raises ``AttributeError`` (:issue:`13071`) -- Bug in ``PeriodIndex`` construction returning a ``float64`` index in some circumstances (:issue:`13067`) -- Bug in ``.resample(..)`` with a ``PeriodIndex`` not changing its ``freq`` appropriately when empty (:issue:`13067`) -- Bug in ``.resample(..)`` with a ``PeriodIndex`` not retaining its type or name with an empty ``DataFrame`` appropriately when empty (:issue:`13212`) -- Bug in ``groupby(..).resample(..)`` where passing some keywords would raise an exception (:issue:`13235`) -- Bug in ``.tz_convert`` on a tz-aware ``DateTimeIndex`` that relied on index being sorted for correct results (:issue:`13306`) -- Bug in ``pd.read_hdf()`` where attempting to load an HDF file with a single dataset, that had one or more categorical columns, failed unless the key argument was set to the name of the dataset. (:issue:`13231`) -- Bug in ``.rolling()`` that allowed a negative integer window in contruction of the ``Rolling()`` object, but would later fail on aggregation (:issue:`13383`) - -- Bug in various index types, which did not propagate the name of passed index (:issue:`12309`) -- Bug in ``DatetimeIndex``, which did not honour the ``copy=True`` (:issue:`13205`) -- Bug in ``DatetimeIndex.is_normalized`` returns incorrectly for normalized date_range in case of local timezones (:issue:`13459`) - -- Bug in ``DataFrame.to_csv()`` in which float values were being quoted even though quotations were specified for non-numeric values only (:issue:`12922`, :issue:`13259`) -- Bug in ``MultiIndex`` slicing where extra elements were returned when level is non-unique (:issue:`12896`) -- Bug in ``.str.replace`` does not raise ``TypeError`` for invalid replacement (:issue:`13438`) - - -- Bug in ``pd.read_csv()`` with ``engine='python'`` in which ``NaN`` values weren't being detected after data was converted to numeric values (:issue:`13314`) -- Bug in ``pd.read_csv()`` in which the ``nrows`` argument was not properly validated for both engines (:issue:`10476`) -- Bug in ``pd.read_csv()`` with ``engine='python'`` in which infinities of mixed-case forms were not being interpreted properly (:issue:`13274`) -- Bug in ``pd.read_csv()`` with ``engine='python'`` in which trailing ``NaN`` values were not being parsed (:issue:`13320`) -- Bug in ``pd.read_csv()`` with ``engine='python'`` when reading from a tempfile.TemporaryFile on Windows with Python 3 (:issue:`13398`) -- Bug in ``pd.read_csv()`` that prevents ``usecols`` kwarg from accepting single-byte unicode strings (:issue:`13219`) -- Bug in ``pd.read_csv()`` that prevents ``usecols`` from being an empty set (:issue:`13402`) -- Bug in ``pd.read_csv()`` with ``engine=='c'`` in which null ``quotechar`` was not accepted even though ``quoting`` was specified as ``None`` (:issue:`13411`) -- Bug in ``pd.read_csv()`` with ``engine=='c'`` in which fields were not properly cast to float when quoting was specified as non-numeric (:issue:`13411`) -- Bug in ``pd.pivot_table()`` where ``margins_name`` is ignored when ``aggfunc`` is a list (:issue:`13354`) - - - -- Bug in ``Series`` arithmetic raises ``TypeError`` if it contains datetime-like as ``object`` dtype (:issue:`13043`) - - -- Bug in ``pd.to_datetime()`` when passing invalid datatypes (e.g. bool); will now respect the ``errors`` keyword (:issue:`13176`) -- Bug in ``pd.to_datetime()`` which overflowed on ``int8``, `int16`` dtypes (:issue:`13451`) -- Bug in extension dtype creation where the created types were not is/identical (:issue:`13285`) - -- Bug in ``NaT`` - ``Period`` raises ``AttributeError`` (:issue:`13071`) -- Bug in ``Period`` addition raises ``TypeError`` if ``Period`` is on right hand side (:issue:`13069`) -- Bug in ``Peirod`` and ``Series`` or ``Index`` comparison raises ``TypeError`` (:issue:`13200`) -- Bug in ``pd.set_eng_float_format()`` that would prevent NaN's from formatting (:issue:`11981`) -- Bug in ``.unstack`` with ``Categorical`` dtype resets ``.ordered`` to ``True`` (:issue:`13249`) - - -- Bug in ``Series`` comparison operators when dealing with zero dim NumPy arrays (:issue:`13006`) -- Bug in ``groupby`` where ``apply`` returns different result depending on whether first result is ``None`` or not (:issue:`12824`) -- Bug in ``groupby(..).nth()`` where the group key is included inconsistently if called after ``.head()/.tail()`` (:issue:`12839`) - -- Bug in ``pd.to_numeric`` when ``errors='coerce'`` and input contains non-hashable objects (:issue:`13324`) - - -- Bug in ``Categorical.remove_unused_categories()`` changes ``.codes`` dtype to platform int (:issue:`13261`) -- Bug in ``groupby`` with ``as_index=False`` returns all NaN's when grouping on multiple columns including a categorical one (:issue:`13204`) From ef371d0c5be1043c0610ac32d6bb2577a073a3ff Mon Sep 17 00:00:00 2001 From: Bob Date: Fri, 8 Jul 2016 15:50:55 -0500 Subject: [PATCH 5/9] adding support for more read_html arguments --- doc/source/whatsnew/v0.19.0.txt | 6 ++ pandas/io/html.py | 64 +++++++++++--------- pandas/io/tests/test_html.py | 101 +++++++++++++++++++++++++------- 3 files changed, 120 insertions(+), 51 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 657de7ec26efc..b9bce9c75dc0d 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -207,6 +207,12 @@ Other enhancements - The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``na_filter`` option (:issue:`13321`) - The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``memory_map`` option (:issue:`13381`) +- The ``pd.read_html()`` has gained support for the ``na_values`` option (:issue:`13461`) +- The ``pd.read_html()`` has gained support for the ``converters`` option (:issue:`13461`) +- The ``pd.read_html()`` has gained support for the ``keep_default_na`` option (:issue:`13461`) +- The ``pd.read_html()`` has gained support for the ``squeeze`` option (:issue:`13461`) +- The ``pd.read_html()`` has gained support for the ``date_parser`` option (:issue:`13461`) + - ``Index.astype()`` now accepts an optional boolean argument ``copy``, which allows optional copying if the requirements on dtype are satisfied (:issue:`13209`) - ``Index`` now supports the ``.where()`` function for same shape indexing (:issue:`13170`) diff --git a/pandas/io/html.py b/pandas/io/html.py index 86dd92a80cf00..3c7553f66f983 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -611,10 +611,10 @@ def _expand_elements(body): body[ind] += empty * (lens_max - length) -def _data_to_frame(data, header, index_col, skiprows, - parse_dates, tupleize_cols, thousands, - decimal, converters, na_values): - head, body, foot = data +def _data_to_frame(**kwargs): + head, body, foot = kwargs.pop('data') + header = kwargs.pop('header') + kwargs['skiprows'] = _get_skiprows(kwargs['skiprows']) if head: body = [head] + body @@ -628,11 +628,7 @@ def _data_to_frame(data, header, index_col, skiprows, # fill out elements of body that are "ragged" _expand_elements(body) - tp = TextParser(body, header=header, index_col=index_col, - skiprows=_get_skiprows(skiprows), - parse_dates=parse_dates, tupleize_cols=tupleize_cols, - thousands=thousands, decimal=decimal, - converters=converters, na_values=na_values) + tp = TextParser(body, header=header, **kwargs) df = tp.read() return df @@ -717,9 +713,9 @@ def _validate_flavor(flavor): return flavor -def _parse(flavor, io, match, header, index_col, skiprows, - parse_dates, tupleize_cols, thousands, attrs, encoding, - decimal, converters, na_values): +def _parse(flavor, io, match, + attrs, encoding, + **kwargs): flavor = _validate_flavor(flavor) compiled_match = re.compile(match) # you can pass a compiled regex here @@ -741,17 +737,7 @@ def _parse(flavor, io, match, header, index_col, skiprows, ret = [] for table in tables: try: - ret.append(_data_to_frame(data=table, - header=header, - index_col=index_col, - skiprows=skiprows, - parse_dates=parse_dates, - tupleize_cols=tupleize_cols, - thousands=thousands, - decimal=decimal, - converters=converters, - na_values=na_values - )) + ret.append(_data_to_frame(data=table, **kwargs)) except EmptyDataError: # empty table continue return ret @@ -760,7 +746,8 @@ def _parse(flavor, io, match, header, index_col, skiprows, def read_html(io, match='.+', flavor=None, header=None, index_col=None, skiprows=None, attrs=None, parse_dates=False, tupleize_cols=False, thousands=',', encoding=None, - decimal='.', converters=None, na_values=None): + decimal='.', converters=None, na_values=None, + keep_default_na=True, squeeze=False, date_parser=None): r"""Read HTML tables into a ``list`` of ``DataFrame`` objects. Parameters @@ -848,12 +835,27 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, input argument, the cell (not column) content, and return the transformed content. - .. versionadded:: 0.18.2 + .. versionadded:: 0.19.0 na_values : iterable, default None Custom NA values - .. versionadded:: 0.18.2 + .. versionadded:: 0.19.0 + + keep_default_na : bool, default True + If na_values are specified and keep_default_na is False the default NaN + values are overridden, otherwise they're appended to + + .. versionadded:: 0.19.0 + + squeeze : boolean, default False + If the parsed data only contains one column then return a Series + + .. versionadded:: 0.19.0 + + date_parser : function, default None + + .. versionadded:: 0.19.0 Returns ------- @@ -897,6 +899,10 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, raise ValueError('cannot skip rows starting from the end of the ' 'data (you passed a negative value)') _validate_header_arg(header) - return _parse(flavor, io, match, header, index_col, skiprows, - parse_dates, tupleize_cols, thousands, attrs, encoding, - decimal, converters, na_values) + return _parse(flavor=flavor, io=io, match=match, header=header, + index_col=index_col, skiprows=skiprows, + parse_dates=parse_dates, tupleize_cols=tupleize_cols, + thousands=thousands, attrs=attrs, encoding=encoding, + decimal=decimal, converters=converters, na_values=na_values, + keep_default_na=keep_default_na, squeeze=squeeze, + date_parser=date_parser) diff --git a/pandas/io/tests/test_html.py b/pandas/io/tests/test_html.py index fb69a90e4bfdf..f5100185670af 100644 --- a/pandas/io/tests/test_html.py +++ b/pandas/io/tests/test_html.py @@ -4,6 +4,7 @@ import os import re import warnings +from datetime import datetime try: from importlib import import_module @@ -24,6 +25,7 @@ from pandas.io.common import URLError, urlopen, file_path_to_url from pandas.io.html import read_html from pandas.parser import CParserError +from pandas.io.date_converters import parse_date_time import pandas.util.testing as tm from pandas.util.testing import makeCustomDataframe as mkdf, network @@ -698,55 +700,110 @@ def test_converters(self): # GH 13461 html_data = """ - - - + - - - -
NamesC_l0_g0C_l0_g1a
R_l0_g0 0.763 0.233
R_l0_g1 0.244 0.285
""" - raw_data = np.array([[u'R_l0_g0', '0.763', 0.233], - [u'R_l0_g1', '0.244', 0.285]], dtype=object) - html_df = read_html(html_data, converters={'C_l0_g0': str})[0] - tm.assert_numpy_array_equal(raw_data, html_df.values) + + expected_df = DataFrame({'a': ['0.763', '0.244']}) + html_df = read_html(html_data, converters={'a': str})[0] + tm.assert_frame_equal(expected_df, html_df) def test_na_values(self): # GH 13461 html_data = """ - - - + - - - -
NamesC_l0_g0C_l0_g1a
R_l0_g0 0.763 0.233
R_l0_g1 0.244 0.285
""" - raw_data = np.array([[u'R_l0_g0', 0.763, 0.233], - [u'R_l0_g1', 0.244, np.nan]], dtype=object) - html_df = read_html(html_data, na_values=[0.285])[0] - tm.assert_numpy_array_equal(raw_data, html_df.values) + + expected_df = DataFrame({'a': [0.763, np.nan]}) + html_df = read_html(html_data, na_values=[0.244])[0] + tm.assert_frame_equal(expected_df, html_df) + + def test_keep_default_na(self): + html_data = """ + + + + + + + + + + + + +
a
N/A
NA
""" + + expected_df = DataFrame({'a': ['N/A', 'NA']}) + html_df = read_html(html_data, keep_default_na=False)[0] + tm.assert_frame_equal(expected_df, html_df) + + expected_df = DataFrame({'a': [np.nan, np.nan]}) + html_df = read_html(html_data, keep_default_na=True)[0] + tm.assert_frame_equal(expected_df, html_df) + + def test_squeeze(self): + html_data = """ + + + + + + + + + +
a
0.763
""" + + expected_s = Series({0: 0.763}, name='a') + html_s = read_html(html_data, squeeze=True)[0] + tm.assert_series_equal(expected_s, html_s) + + def test_date_parser(self): + html_data = """ + + + + + + + + + + + + + + + +
datetime
2001-01-05 10:00:00
2001-01-05 00:00:00
""" + + datecols = {'date_time': [0, 1]} + df = read_html(html_data, header=0, + parse_dates=datecols, + date_parser=parse_date_time)[0] + self.assertIn('date_time', df) + self.assertEqual(df.date_time.ix[0], datetime(2001, 1, 5, 10, 0, 0)) def _lang_enc(filename): From 7e6b5fe77a974811bc1e6e7fcca23e33057a023f Mon Sep 17 00:00:00 2001 From: Bob Date: Mon, 11 Jul 2016 11:06:30 -0400 Subject: [PATCH 6/9] consolidating args list and whatsnew list --- doc/source/whatsnew/v0.19.0.txt | 6 +----- pandas/io/html.py | 4 +--- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index b9bce9c75dc0d..96344896980f0 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -207,11 +207,7 @@ Other enhancements - The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``na_filter`` option (:issue:`13321`) - The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``memory_map`` option (:issue:`13381`) -- The ``pd.read_html()`` has gained support for the ``na_values`` option (:issue:`13461`) -- The ``pd.read_html()`` has gained support for the ``converters`` option (:issue:`13461`) -- The ``pd.read_html()`` has gained support for the ``keep_default_na`` option (:issue:`13461`) -- The ``pd.read_html()`` has gained support for the ``squeeze`` option (:issue:`13461`) -- The ``pd.read_html()`` has gained support for the ``date_parser`` option (:issue:`13461`) +- The ``pd.read_html()`` has gained support for the ``na_values``, ``converters``, ``keep_default_na``, ``squeeze``, ``date_parser`` options (:issue:`13461`) - ``Index.astype()`` now accepts an optional boolean argument ``copy``, which allows optional copying if the requirements on dtype are satisfied (:issue:`13209`) - ``Index`` now supports the ``.where()`` function for same shape indexing (:issue:`13170`) diff --git a/pandas/io/html.py b/pandas/io/html.py index 3c7553f66f983..4827d7318fca6 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -713,9 +713,7 @@ def _validate_flavor(flavor): return flavor -def _parse(flavor, io, match, - attrs, encoding, - **kwargs): +def _parse(flavor, io, match, attrs, encoding, **kwargs): flavor = _validate_flavor(flavor) compiled_match = re.compile(match) # you can pass a compiled regex here From dac660ab8e72368a73f3f5cf6746a5215a5a36ba Mon Sep 17 00:00:00 2001 From: Bob Date: Mon, 18 Jul 2016 23:05:56 -0400 Subject: [PATCH 7/9] removing squeeze and date_parser --- doc/source/whatsnew/v0.19.0.txt | 2 +- pandas/io/html.py | 14 ++-------- pandas/io/tests/test_html.py | 45 --------------------------------- 3 files changed, 3 insertions(+), 58 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 96344896980f0..351b0ba9b2906 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -207,7 +207,7 @@ Other enhancements - The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``na_filter`` option (:issue:`13321`) - The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``memory_map`` option (:issue:`13381`) -- The ``pd.read_html()`` has gained support for the ``na_values``, ``converters``, ``keep_default_na``, ``squeeze``, ``date_parser`` options (:issue:`13461`) +- The ``pd.read_html()`` has gained support for the ``na_values``, ``converters``, ``keep_default_na`` options (:issue:`13461`) - ``Index.astype()`` now accepts an optional boolean argument ``copy``, which allows optional copying if the requirements on dtype are satisfied (:issue:`13209`) - ``Index`` now supports the ``.where()`` function for same shape indexing (:issue:`13170`) diff --git a/pandas/io/html.py b/pandas/io/html.py index 4827d7318fca6..79f0f326c4dd7 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -745,7 +745,7 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, skiprows=None, attrs=None, parse_dates=False, tupleize_cols=False, thousands=',', encoding=None, decimal='.', converters=None, na_values=None, - keep_default_na=True, squeeze=False, date_parser=None): + keep_default_na=True): r"""Read HTML tables into a ``list`` of ``DataFrame`` objects. Parameters @@ -846,15 +846,6 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, .. versionadded:: 0.19.0 - squeeze : boolean, default False - If the parsed data only contains one column then return a Series - - .. versionadded:: 0.19.0 - - date_parser : function, default None - - .. versionadded:: 0.19.0 - Returns ------- dfs : list of DataFrames @@ -902,5 +893,4 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, parse_dates=parse_dates, tupleize_cols=tupleize_cols, thousands=thousands, attrs=attrs, encoding=encoding, decimal=decimal, converters=converters, na_values=na_values, - keep_default_na=keep_default_na, squeeze=squeeze, - date_parser=date_parser) + keep_default_na=keep_default_na) diff --git a/pandas/io/tests/test_html.py b/pandas/io/tests/test_html.py index f5100185670af..7b4e775db9476 100644 --- a/pandas/io/tests/test_html.py +++ b/pandas/io/tests/test_html.py @@ -4,7 +4,6 @@ import os import re import warnings -from datetime import datetime try: from importlib import import_module @@ -25,7 +24,6 @@ from pandas.io.common import URLError, urlopen, file_path_to_url from pandas.io.html import read_html from pandas.parser import CParserError -from pandas.io.date_converters import parse_date_time import pandas.util.testing as tm from pandas.util.testing import makeCustomDataframe as mkdf, network @@ -762,49 +760,6 @@ def test_keep_default_na(self): html_df = read_html(html_data, keep_default_na=True)[0] tm.assert_frame_equal(expected_df, html_df) - def test_squeeze(self): - html_data = """ - - - - - - - - - -
a
0.763
""" - - expected_s = Series({0: 0.763}, name='a') - html_s = read_html(html_data, squeeze=True)[0] - tm.assert_series_equal(expected_s, html_s) - - def test_date_parser(self): - html_data = """ - - - - - - - - - - - - - - - -
datetime
2001-01-05 10:00:00
2001-01-05 00:00:00
""" - - datecols = {'date_time': [0, 1]} - df = read_html(html_data, header=0, - parse_dates=datecols, - date_parser=parse_date_time)[0] - self.assertIn('date_time', df) - self.assertEqual(df.date_time.ix[0], datetime(2001, 1, 5, 10, 0, 0)) - def _lang_enc(filename): return os.path.splitext(os.path.basename(filename))[0].split('_') From 2abb473a6728b9d7037afa8e78afc39115144d53 Mon Sep 17 00:00:00 2001 From: Bob Date: Tue, 19 Jul 2016 10:48:50 -0400 Subject: [PATCH 8/9] adding new read_html functions to docs --- doc/source/io.rst | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/doc/source/io.rst b/doc/source/io.rst index da0444a8b8df9..8bff027467415 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1959,6 +1959,29 @@ Specify an HTML attribute dfs2 = read_html(url, attrs={'class': 'sortable'}) print(np.array_equal(dfs1[0], dfs2[0])) # Should be True +Specify values that should be converted to NaN + +.. code-block:: python + + dfs = read_html(url, na_values=['No Acquirer']) + +Specify whether to keep the default set of NaN values + +.. code-block:: python + + dfs = read_html(url, keep_default_na=False) + +Specify converters for columns. This is useful for numerical text data that has +leading zeros. By default columns that are numerical are cast to numeric +types and the leading zeros are lost. To avoid this, we can convert these +columns to strings. + +.. code-block:: python + + url_mcc = 'https://en.wikipedia.org/wiki/Mobile_country_code' + dfs = read_html(url_mcc, match='Telekom Albania', header=0, converters={'MNC': + str}) + Use some combination of the above .. code-block:: python From 5cb8243f2dd31cc2155627f29cfc89bbf6d4b84b Mon Sep 17 00:00:00 2001 From: Bob Date: Thu, 21 Jul 2016 08:50:09 -0400 Subject: [PATCH 9/9] adding versionadded to docs --- doc/source/io.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/doc/source/io.rst b/doc/source/io.rst index 8bff027467415..113afa32d182e 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1965,12 +1965,16 @@ Specify values that should be converted to NaN dfs = read_html(url, na_values=['No Acquirer']) +.. versionadded:: 0.19 + Specify whether to keep the default set of NaN values .. code-block:: python dfs = read_html(url, keep_default_na=False) +.. versionadded:: 0.19 + Specify converters for columns. This is useful for numerical text data that has leading zeros. By default columns that are numerical are cast to numeric types and the leading zeros are lost. To avoid this, we can convert these @@ -1982,6 +1986,8 @@ columns to strings. dfs = read_html(url_mcc, match='Telekom Albania', header=0, converters={'MNC': str}) +.. versionadded:: 0.19 + Use some combination of the above .. code-block:: python