From d63ac05a71dee525f51091a0d7be110f178d587b Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Wed, 28 Aug 2013 12:06:19 -0400 Subject: [PATCH 1/5] CLN/ENH: add parse_dates arg and use TextReader --- pandas/io/html.py | 150 +++++++++++------------------------ pandas/io/parsers.py | 13 +-- pandas/io/tests/test_html.py | 94 ++++++++++++---------- 3 files changed, 102 insertions(+), 155 deletions(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index df94e0ffa2e79..e2b3eca9c923b 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -7,15 +7,18 @@ import re import numbers import collections +import warnings +from itertools import repeat from distutils.version import LooseVersion import numpy as np -from pandas import DataFrame, MultiIndex, isnull from pandas.io.common import _is_url, urlopen, parse_url -from pandas.compat import range, lrange, lmap, u, map -from pandas import compat +from pandas.io.parsers import TextParser +from pandas.compat import lrange, lmap, u +from pandas.core import common as com +from pandas import compat, Series try: @@ -67,7 +70,7 @@ def _remove_whitespace(s, regex=_RE_WHITESPACE): return regex.sub(' ', s.strip()) -def _get_skiprows_iter(skiprows): +def _get_skiprows(skiprows): """Get an iterator given an integer, slice or container. Parameters @@ -92,10 +95,10 @@ def _get_skiprows_iter(skiprows): """ if isinstance(skiprows, slice): return lrange(skiprows.start or 0, skiprows.stop, skiprows.step or 1) - elif isinstance(skiprows, numbers.Integral): - return lrange(skiprows) - elif isinstance(skiprows, collections.Container): + elif isinstance(skiprows, numbers.Integral) or com.is_list_like(skiprows): return skiprows + elif skiprows is None: + return 0 else: raise TypeError('{0} is not a valid type for skipping' ' rows'.format(type(skiprows))) @@ -583,101 +586,34 @@ def _parse_raw_tfoot(self, table): table.xpath(expr)] -def _data_to_frame(data, header, index_col, infer_types, skiprows): - """Parse a BeautifulSoup table into a DataFrame. +def _nan_list(n): + return list(repeat(np.nan, n)) - Parameters - ---------- - data : tuple of lists - The raw data to be placed into a DataFrame. This is a list of lists of - strings or unicode. If it helps, it can be thought of as a matrix of - strings instead. - - header : int or None - An integer indicating the row to use for the column header or None - indicating no header will be used. - - index_col : int or None - An integer indicating the column to use for the index or None - indicating no column will be used. - - infer_types : bool - Whether to convert numbers and dates. - - skiprows : collections.Container or int or slice - Iterable used to skip rows. - - Returns - ------- - df : DataFrame - A DataFrame containing the data from `data` - - Raises - ------ - ValueError - * If `skiprows` is not found in the rows of the parsed DataFrame. - Raises - ------ - ValueError - * If `skiprows` is not found in the rows of the parsed DataFrame. +def _expand_elements(body): + lens = Series(lmap(len, body)) + lens_max = lens.max() + not_max = lens[lens != lens_max] - See Also - -------- - read_html + for ind, length in not_max.iteritems(): + body[ind] += _nan_list(lens_max - length) - Notes - ----- - The `data` parameter is guaranteed not to be a list of empty lists. 
- """ - thead, tbody, tfoot = data - columns = thead or None - df = DataFrame(tbody, columns=columns) - if skiprows is not None: - it = _get_skiprows_iter(skiprows) +def _data_to_frame(data, header, index_col, skiprows, infer_types, + parse_dates): + head, body, _ = data # _ is footer which is rarely used: ignore for now + _expand_elements(body) + body = [head] + body + import ipdb; ipdb.set_trace() + tp = TextParser(body, header=header, index_col=index_col, + skiprows=_get_skiprows(skiprows), + parse_dates=parse_dates, tupleize_cols=False) + df = tp.read() - try: - df = df.drop(it) - except ValueError: - raise ValueError('Labels {0} not found when trying to skip' - ' rows'.format(it)) - - # convert to numbers/dates where possible - # must be sequential since dates trump numbers if both args are given - if infer_types: - df = df.convert_objects(convert_numeric=True) + if infer_types: # remove in 0.14 df = df.convert_objects(convert_dates='coerce') - - if header is not None: - header_rows = df.iloc[header] - - if header_rows.ndim == 2: - names = header_rows.index - df.columns = MultiIndex.from_arrays(header_rows.values, - names=names) - else: - df.columns = header_rows - - df = df.drop(df.index[header]) - - if index_col is not None: - cols = df.columns[index_col] - - try: - cols = cols.tolist() - except AttributeError: - pass - - # drop by default - df.set_index(cols, inplace=True) - if df.index.nlevels == 1: - if isnull(df.index.name) or not df.index.name: - df.index.name = None - else: - names = [name or None for name in df.index.names] - df.index = MultiIndex.from_tuples(df.index.values, names=names) - + else: + df = df.applymap(compat.text_type) return df @@ -750,7 +686,8 @@ def _validate_parser_flavor(flavor): return flavor -def _parse(flavor, io, match, header, index_col, skiprows, infer_types, attrs): +def _parse(flavor, io, match, header, index_col, skiprows, infer_types, + parse_dates, attrs): # bonus: re.compile is idempotent under function iteration so you can pass # a compiled regex to it and it will return itself flavor = _validate_parser_flavor(flavor) @@ -771,12 +708,12 @@ def _parse(flavor, io, match, header, index_col, skiprows, infer_types, attrs): else: raise retained - return [_data_to_frame(table, header, index_col, infer_types, skiprows) - for table in tables] + return [_data_to_frame(table, header, index_col, skiprows, infer_types, + parse_dates) for table in tables] -def read_html(io, match='.+', flavor=None, header=None, index_col=None, - skiprows=None, infer_types=True, attrs=None): +def read_html(io, match='.+', flavor=None, header=0, index_col=None, + skiprows=None, infer_types=None, attrs=None, parse_dates=False): r"""Read an HTML table into a DataFrame. Parameters @@ -801,7 +738,7 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, compatibility. The default of ``None`` tries to use ``lxml`` to parse and if that fails it falls back on ``bs4`` + ``html5lib``. - header : int or array-like or None, optional, default ``None`` + header : int or array-like, optional, default ``0`` The row (or rows for a MultiIndex) to use to make the columns headers. Note that this row will be removed from the data. @@ -828,9 +765,7 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, it is treated as "skip :math:`n` rows", *not* as "skip the :math:`n^\textrm{th}` row". - infer_types : bool, optional, default ``True`` - Whether to convert numeric types and date-appearing strings to numbers - and dates, respectively. 
+    infer_types : bool or None, optional, default ``None``, deprecated since 0.13, removed in 0.14
 
     attrs : dict or None, optional, default ``None``
         This is a dictionary of attributes that you can pass to use to identify
         the table in the HTML. These are not checked for validity before being
         passed to lxml or Beautiful Soup. However, these attributes must be
         valid HTML table attributes to work correctly. For example,
 
         .. code-block:: python
 
            attrs = {'id': 'table'}
 
         is a valid attribute dictionary because the 'id' HTML tag attribute is
         a valid HTML attribute for *any* HTML tag as per `this document
         <http://www.w3.org/TR/html-markup/global-attributes.html>`__.
 
         .. code-block:: python
 
            attrs = {'asdf': 'table'}
 
         is *not* a valid attribute dictionary because 'asdf' is not a valid
         HTML attribute even if it is a valid XML attribute. Valid HTML 4.01
         table attributes can be found `here
         <http://www.w3.org/TR/REC-html40/struct/tables.html#h-11.2>`__. A
         working draft of the HTML 5 spec can be found `here
         <http://www.w3.org/TR/html-markup/table.html>`__. It contains the
         latest information on table attributes for the modern web.
 
     Returns
     -------
     dfs : list of DataFrames
         A list of DataFrames, each of which is the parsed data from each of the
         tables on the page.
 
     Notes
     -----
     Before using this function you should probably read the :ref:`gotchas about
     the parser libraries that this function uses <html-gotchas>`.
 
     There's as little cleaning of the data as possible due to the heterogeneity
     and general disorder of HTML on the web.
 
     Expect some cleanup after you call this function. For example,
     you might need to pass `infer_types=False` and perform manual conversion if
     the column names are converted to NaN when you pass the `header=0`
     argument. We try to assume as little as possible about the structure of the
     table and push the idiosyncrasies of the HTML contained in the table to
     you, the user.
 
     This function only searches for <table> elements and only for <tr> and <th>
     rows and <td> elements within those rows. This could be extended by
     subclassing one of the parser classes contained in :mod:`pandas.io.html`.
 
     Similar to :func:`read_csv` the `header` argument is applied **after**
     `skiprows` is applied.
 
     This function will *always* return a list of :class:`DataFrame` *or*
     it will fail, e.g., it will *not* return an empty list.
 
     Examples
     --------
     See the :ref:`read_html documentation in the IO section of the docs
     <io.read_html>` for many examples of reading HTML.
     """
     # Type check here. We don't want to parse only to fail because of an
     # invalid value of an integer skiprows.
+    if infer_types is not None:
+        warnings.warn("infer_types will be removed in 0.14", UserWarning)
+    else:
+        infer_types = True  # remove in 0.14
+
     if isinstance(skiprows, numbers.Integral) and skiprows < 0:
         raise AssertionError('cannot skip rows starting from the end of the '
                              'data (you passed a negative value)')
     return _parse(flavor, io, match, header, index_col, skiprows, infer_types,
-                  attrs)
+                  parse_dates, attrs)
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 3ef3cbf856fef..0e1ffee42e655 100644
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -1468,22 +1468,23 @@ def _convert_data(self, data):
                 col = self.orig_names[col]
             clean_conv[col] = f
 
-        return self._convert_to_ndarrays(data, self.na_values, self.na_fvalues, self.verbose,
-                                         clean_conv)
+        return self._convert_to_ndarrays(data, self.na_values, self.na_fvalues,
+                                         self.verbose, clean_conv)
 
     def _infer_columns(self):
+        #import ipdb; ipdb.set_trace()
         names = self.names
 
         if self.header is not None:
             header = self.header
 
             # we have a mi columns, so read and extra line
-            if isinstance(header,(list,tuple,np.ndarray)):
+            if isinstance(header, (list, tuple, np.ndarray)):
                 have_mi_columns = True
-                header = list(header) + [header[-1]+1]
+                header = list(header) + [header[-1] + 1]
             else:
                 have_mi_columns = False
-                header = [ header ]
+                header = [header]
 
         columns = []
         for level, hr in enumerate(header):
@@ -1498,7 +1499,7 @@ def _infer_columns(self):
 
                 this_columns = []
                 for i, c in enumerate(line):
-                    if c == '':
+                    if not c:
                         if have_mi_columns:
                             this_columns.append('Unnamed: %d_level_%d' % (i,level))
                         else:
diff --git a/pandas/io/tests/test_html.py b/pandas/io/tests/test_html.py
index 44e4b5cfda7b6..6ce0855c1db7b 100644
--- a/pandas/io/tests/test_html.py
+++ b/pandas/io/tests/test_html.py
@@ -1,33 +1,30 @@
 from __future__ import print_function
+
 import os
 import re
-from unittest import TestCase
 import warnings
+
+try:
+    from importlib import import_module
+except ImportError:
+    import_module = __import__
+
+from unittest import TestCase
 from distutils.version import LooseVersion
 
-from pandas.io.common import URLError
 import nose
-from nose.tools import assert_raises
 
 import numpy as np
 from numpy.random import rand
 from numpy.testing.decorators import slow
 
-from pandas.compat import map, zip, StringIO
-import pandas.compat as compat
-
-try:
-    from importlib import import_module
-except ImportError:
-    import_module = __import__
-
-from pandas.io.html import read_html
-from pandas.io.common import urlopen
-
 from pandas import DataFrame, MultiIndex, read_csv, Timestamp
-from pandas.util.testing import (assert_frame_equal, network,
-                                 get_data_path)
+from pandas.compat import map, zip, StringIO, string_types
+from pandas.io.common import URLError, urlopen
+from pandas.io.html import read_html
 
-from pandas.util.testing import makeCustomDataframe as mkdf
+import pandas.util.testing as tm
+from pandas.util.testing import makeCustomDataframe as mkdf, network
 
 
 def _have_module(module_name):
@@ -40,11 +37,11 @@ def _have_module(module_name):
 
 def _skip_if_no(module_name):
     if not _have_module(module_name):
-        raise nose.SkipTest("{0} not
found".format(module_name)) + raise nose.SkipTest("{0!r} not found".format(module_name)) def _skip_if_none_of(module_names): - if isinstance(module_names, compat.string_types): + if isinstance(module_names, string_types): _skip_if_no(module_names) if module_names == 'bs4': import bs4 @@ -54,17 +51,14 @@ def _skip_if_none_of(module_names): not_found = [module_name for module_name in module_names if not _have_module(module_name)] if set(not_found) & set(module_names): - raise nose.SkipTest("{0} not found".format(not_found)) + raise nose.SkipTest("{0!r} not found".format(not_found)) if 'bs4' in module_names: import bs4 if bs4.__version__ == LooseVersion('4.2.0'): raise nose.SkipTest("Bad version of bs4: 4.2.0") -DATA_PATH = get_data_path() - -def isframe(x): - return isinstance(x, DataFrame) +DATA_PATH = tm.get_data_path() def assert_framelist_equal(list1, list2, *args, **kwargs): @@ -72,10 +66,12 @@ def assert_framelist_equal(list1, list2, *args, **kwargs): 'len(list1) == {0}, ' 'len(list2) == {1}'.format(len(list1), len(list2))) - assert all(map(lambda x, y: isframe(x) and isframe(y), list1, list2)), \ - 'not all list elements are DataFrames' + msg = 'not all list elements are DataFrames' + both_frames = all(map(lambda x, y: isinstance(x, DataFrame) and + isinstance(y, DataFrame), list1, list2)) + assert both_frames, msg for frame_i, frame_j in zip(list1, list2): - assert_frame_equal(frame_i, frame_j, *args, **kwargs) + tm.assert_frame_equal(frame_i, frame_j, *args, **kwargs) assert not frame_i.empty, 'frames are both empty' @@ -83,9 +79,9 @@ def test_bs4_version_fails(): _skip_if_none_of(('bs4', 'html5lib')) import bs4 if bs4.__version__ == LooseVersion('4.2.0'): - assert_raises(AssertionError, read_html, os.path.join(DATA_PATH, - "spam.html"), - flavor='bs4') + tm.assert_raises(AssertionError, read_html, os.path.join(DATA_PATH, + "spam.html"), + flavor='bs4') class TestReadHtmlBase(TestCase): @@ -116,7 +112,7 @@ def test_to_html_compat(self): index_col=0)[0] print(df.dtypes) print(res.dtypes) - assert_frame_equal(res, df) + tm.assert_frame_equal(res, df) @network def test_banklist_url(self): @@ -145,13 +141,20 @@ def test_banklist(self): assert_framelist_equal(df1, df2) - def test_spam(self): + def test_spam_no_types(self): df1 = self.run_read_html(self.spam_data, '.*Water.*', infer_types=False) df2 = self.run_read_html(self.spam_data, 'Unit', infer_types=False) assert_framelist_equal(df1, df2) - print(df1[0]) + + self.assertEqual(df1[0].ix[0, 0], 'Proximates') + self.assertEqual(df1[0].columns[0], 'Nutrient') + + def test_spam_with_types(self): + df1 = self.run_read_html(self.spam_data, '.*Water.*') + df2 = self.run_read_html(self.spam_data, 'Unit') + assert_framelist_equal(df1, df2) self.assertEqual(df1[0].ix[0, 0], 'Proximates') self.assertEqual(df1[0].columns[0], 'Nutrient') @@ -167,9 +170,8 @@ def test_banklist_no_match(self): self.assert_(isinstance(df, DataFrame)) def test_spam_header(self): - df = self.run_read_html(self.spam_data, '.*Water.*', header=0) df = self.run_read_html(self.spam_data, '.*Water.*', header=1)[0] - self.assertEqual(df.columns[0], 'Water') + self.assertEqual(df.columns[0], 'Proximates') self.assertFalse(df.empty) def test_skiprows_int(self): @@ -179,10 +181,10 @@ def test_skiprows_int(self): assert_framelist_equal(df1, df2) def test_skiprows_xrange(self): - df1 = [self.run_read_html(self.spam_data, '.*Water.*').pop()[2:]] - df2 = self.run_read_html(self.spam_data, 'Unit', skiprows=range(2)) - - assert_framelist_equal(df1, df2) + df1 = 
self.run_read_html(self.spam_data, '.*Water.*', + skiprows=range(2))[0] + df2 = self.run_read_html(self.spam_data, 'Unit', skiprows=range(2))[0] + tm.assert_frame_equal(df1, df2) def test_skiprows_list(self): df1 = self.run_read_html(self.spam_data, '.*Water.*', skiprows=[1, 2]) @@ -226,7 +228,7 @@ def test_skiprows_ndarray(self): assert_framelist_equal(df1, df2) def test_skiprows_invalid(self): - self.assertRaises(ValueError, self.run_read_html, self.spam_data, + self.assertRaises(TypeError, self.run_read_html, self.spam_data, '.*Water.*', skiprows='asdf') def test_index(self): @@ -237,8 +239,8 @@ def test_index(self): def test_header_and_index_no_types(self): df1 = self.run_read_html(self.spam_data, '.*Water.*', header=1, index_col=0, infer_types=False) - df2 = self.run_read_html(self.spam_data, 'Unit', header=1, index_col=0, - infer_types=False) + df2 = self.run_read_html(self.spam_data, 'Unit', header=1, + index_col=0, infer_types=False) assert_framelist_equal(df1, df2) def test_header_and_index_with_types(self): @@ -336,6 +338,7 @@ def test_multiindex_header_index(self): @slow def test_multiindex_header_skiprows(self): + import ipdb; ipdb.set_trace() df = self._bank_data(header=[0, 1], skiprows=1)[0] self.assert_(isinstance(df.columns, MultiIndex)) @@ -343,6 +346,7 @@ def test_multiindex_header_skiprows(self): def test_multiindex_header_index_skiprows(self): df = self._bank_data(header=[0, 1], index_col=[0, 1], skiprows=1)[0] self.assert_(isinstance(df.index, MultiIndex)) + self.assert_(isinstance(df.columns, MultiIndex)) @slow def test_regex_idempotency(self): @@ -382,6 +386,7 @@ def test_pythonxy_plugins_table(self): @slow def test_banklist_header(self): from pandas.io.html import _remove_whitespace + def try_remove_ws(x): try: return _remove_whitespace(x) @@ -412,8 +417,8 @@ def try_remove_ws(x): dfnew = df.applymap(try_remove_ws).replace(old, new) gtnew = ground_truth.applymap(try_remove_ws) converted = dfnew.convert_objects(convert_numeric=True) - assert_frame_equal(converted.convert_objects(convert_dates='coerce'), - gtnew) + tm.assert_frame_equal(converted.convert_objects(convert_dates='coerce'), + gtnew) @slow def test_gold_canyon(self): @@ -446,7 +451,8 @@ def test_spam_data_fail(self): def test_banklist_data_fail(self): from lxml.etree import XMLSyntaxError banklist_data = os.path.join(DATA_PATH, 'banklist.html') - self.assertRaises(XMLSyntaxError, self.run_read_html, banklist_data, flavor=['lxml']) + self.assertRaises(XMLSyntaxError, self.run_read_html, banklist_data, + flavor=['lxml']) def test_works_on_valid_markup(self): filename = os.path.join(DATA_PATH, 'valid_markup.html') From 92aa2774ca9aa89d0a2cca011707b80db995df92 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Fri, 6 Sep 2013 22:54:37 -0400 Subject: [PATCH 2/5] ENH: add tupleize_cols, thousands args tupleize_cols=False by default, for back compat thousands=',' by default, because we're not parsing CSV --- pandas/io/html.py | 146 +- pandas/io/parsers.py | 13 +- pandas/io/tests/data/macau.html | 3691 +++++++++++++++++++++++++++++++ pandas/io/tests/test_html.py | 168 +- 4 files changed, 3905 insertions(+), 113 deletions(-) create mode 100644 pandas/io/tests/data/macau.html diff --git a/pandas/io/html.py b/pandas/io/html.py index e2b3eca9c923b..57c3d26b7937c 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -8,8 +8,8 @@ import numbers import collections import warnings +import itertools -from itertools import repeat from distutils.version import LooseVersion import numpy as np @@ -48,7 +48,7 @@ 
 #############
 # READ HTML #
 #############
-_RE_WHITESPACE = re.compile(r'([\r\n]+|\s{2,})')
+_RE_WHITESPACE = re.compile(r'[\r\n]+|\s{2,}')
 
 
 def _remove_whitespace(s, regex=_RE_WHITESPACE):
@@ -100,8 +100,8 @@ def _get_skiprows(skiprows):
     elif skiprows is None:
         return 0
     else:
-        raise TypeError('{0} is not a valid type for skipping'
-                        ' rows'.format(type(skiprows)))
+        raise TypeError('{0!r} is not a valid type for skipping'
+                        ' rows'.format(type(skiprows).__name__))
 
 
 def _read(io):
@@ -127,7 +127,7 @@ def _read(io):
         raw_text = io
     else:
         raise TypeError("Cannot read object of type "
-                        "'{0.__class__.__name__!r}'".format(io))
+                        "{0!r}".format(type(io).__name__))
     return raw_text
 
 
@@ -587,7 +587,7 @@ def _parse_raw_tfoot(self, table):
 
 
 def _nan_list(n):
-    return list(repeat(np.nan, n))
+    return list(itertools.repeat(np.nan, n))
 
 
 def _expand_elements(body):
@@ -595,22 +595,30 @@ def _expand_elements(body):
     lens_max = lens.max()
     not_max = lens[lens != lens_max]
 
-    for ind, length in not_max.iteritems():
+    for ind, length in iteritems(not_max):
         body[ind] += _nan_list(lens_max - length)
 
 
 def _data_to_frame(data, header, index_col, skiprows, infer_types,
-                   parse_dates):
+                   parse_dates, tupleize_cols, thousands):
     head, body, _ = data  # _ is footer which is rarely used: ignore for now
+
+    if head:
+        body = [head] + body
+
+    if header is None:  # special case when a table has <th> elements
+        header = 0
+
+    # fill out elements of body that are "ragged"
     _expand_elements(body)
-    body = [head] + body
-    import ipdb; ipdb.set_trace()
+
     tp = TextParser(body, header=header, index_col=index_col,
                     skiprows=_get_skiprows(skiprows),
-                    parse_dates=parse_dates, tupleize_cols=False)
+                    parse_dates=parse_dates, tupleize_cols=tupleize_cols,
+                    thousands=thousands)
     df = tp.read()
 
-    if infer_types:  # remove in 0.14
+    if infer_types:  # TODO: remove in 0.14
         df = df.convert_objects(convert_dates='coerce')
     else:
-        df = df.applymap(compat.text_type)
+        df = df.applymap(text_type)
@@ -687,7 +695,7 @@ def _validate_parser_flavor(flavor):
     return flavor
 
 
 def _parse(flavor, io, match, header, index_col, skiprows, infer_types,
-           parse_dates, attrs):
+           parse_dates, tupleize_cols, thousands, attrs):
     # bonus: re.compile is idempotent under function iteration so you can pass
     # a compiled regex to it and it will return itself
     flavor = _validate_parser_flavor(flavor)
@@ -709,22 +717,23 @@ def _parse(flavor, io, match, header, index_col, skiprows, infer_types,
         raise retained
 
     return [_data_to_frame(table, header, index_col, skiprows, infer_types,
-                           parse_dates) for table in tables]
+                           parse_dates, tupleize_cols, thousands)
+            for table in tables]
 
 
-def read_html(io, match='.+', flavor=None, header=0, index_col=None,
-              skiprows=None, infer_types=None, attrs=None, parse_dates=False):
-    r"""Read an HTML table into a DataFrame.
+def read_html(io, match='.+', flavor=None, header=None, index_col=None,
+              skiprows=None, infer_types=None, attrs=None, parse_dates=False,
+              tupleize_cols=False, thousands=','):
+    r"""Read HTML tables into a ``list`` of DataFrames.
 
     Parameters
     ----------
     io : str or file-like
-        A string or file like object that can be either a url, a file-like
-        object, or a raw string containing HTML. Note that lxml only accepts
-        the http, ftp and file url protocols. If you have a URI that starts
-        with ``'https'`` you might removing the ``'s'``.
+        A URL, a file-like object, or a raw string containing HTML. Note that
+        lxml only accepts the http, ftp and file url protocols. If you have a
+        URL that starts with ``'https'`` you might try removing the ``'s'``.
 
-    match : str or regex, optional, default '.+'
+    match : str or compiled regular expression, optional
         The set of tables containing text matching this regex or string will be
         returned. Unless the HTML is extremely simple you will probably need to
         pass a non-empty string here. Defaults to '.+' (match any non-empty
         string). This value is converted to a regular expression so that there
         is consistent behavior between Beautiful Soup and lxml.
 
-    flavor : str, container of strings, default ``None``
-        The parsing engine to use under the hood. 'bs4' and 'html5lib' are
-        synonymous with each other, they are both there for backwards
-        compatibility. The default of ``None`` tries to use ``lxml`` to parse
-        and if that fails it falls back on ``bs4`` + ``html5lib``.
+    flavor : str or None, container of strings
+        The parsing engine to use. 'bs4' and 'html5lib' are synonymous with
+        each other, they are both there for backwards compatibility. The
+        default of ``None`` tries to use ``lxml`` to parse and if that fails it
+        falls back on ``bs4`` + ``html5lib``.
 
-    header : int or array-like, optional, default ``0``
-        The row (or rows for a MultiIndex) to use to make the columns headers.
-        Note that this row will be removed from the data.
+    header : int or list-like or None, optional
+        The row (or list of rows for a :class:`~pandas.MultiIndex`) to use to
+        make the columns headers.
 
-    index_col : int or array-like or None, optional, default ``None``
-        The column to use to make the index. Note that this column will be
-        removed from the data.
+    index_col : int or list-like or None, optional
+        The column (or list of columns) to use to create the index.
 
-    skiprows : int or collections.Container or slice or None, optional, default ``None``
+    skiprows : int or list-like or slice or None, optional
         If an integer is given then skip this many rows after parsing the
         column header. If a sequence of integers is given skip those specific
         rows (0-based). Note that
 
         .. code-block:: python
 
-           skiprows == 0
+           pandas.read_html(..., skiprows=0)
 
         yields the same result as
 
         .. code-block:: python
 
-           skiprows is None
+           pandas.read_html(..., skiprows=None)
 
         If `skiprows` is a positive integer, say :math:`n`, then
         it is treated as "skip :math:`n` rows", *not* as "skip the
         :math:`n^\textrm{th}` row".
 
-    infer_types : bool or None, optional, default ``None``, deprecated since 0.13, removed in 0.14
+    infer_types : bool, optional, deprecated since 0.13, removed in 0.14
 
-    attrs : dict or None, optional, default ``None``
+    attrs : dict or None, optional
         This is a dictionary of attributes that you can pass to use to identify
         the table in the HTML. These are not checked for validity before being
         passed to lxml or Beautiful Soup. However, these attributes must be
@@ -793,33 +801,43 @@ def read_html(io, match='.+', flavor=None, header=0, index_col=None,
         <http://www.w3.org/TR/html-markup/table.html>`__. It contains the
         latest information on table attributes for the modern web.
 
+    parse_dates : bool, optional
+        See :func:`~pandas.read_csv` for details.
+
+    tupleize_cols : bool, optional
+        If ``False`` try to parse multiple header rows into a
+        :class:`~pandas.MultiIndex`. See :func:`~pandas.read_csv` for more
+        details. Defaults to ``False`` for backwards compatibility. This is in
+        contrast to other IO functions which default to ``True``.
+
+    thousands : str, optional
+        Separator to use to parse thousands. Defaults to ``','``. Note that
+        this is different from :func:`~pandas.read_csv`, where the default
+        *field* separator is also ``','`` and a comma thousands separator
+        would therefore be ambiguous; :func:`~pandas.read_html` has no field
+        separator to parse, so it can safely default to ``','``.
 
     Returns
     -------
     dfs : list of DataFrames
-        A list of DataFrames, each of which is the parsed data from each of the
-        tables on the page.
 
     Notes
     -----
-    Before using this function you should probably read the :ref:`gotchas about
-    the parser libraries that this function uses <html-gotchas>`.
-
-    There's as little cleaning of the data as possible due to the heterogeneity
-    and general disorder of HTML on the web.
+    Before using this function you should read the :ref:`gotchas about the
+    HTML parsing libraries <html-gotchas>`.
 
-    Expect some cleanup after you call this function. For example,
-    you might need to pass `infer_types=False` and perform manual conversion if
-    the column names are converted to NaN when you pass the `header=0`
-    argument. We try to assume as little as possible about the structure of the
-    table and push the idiosyncrasies of the HTML contained in the table to
-    you, the user.
+    Expect to do some cleanup after you call this function. For example, you
+    might need to manually assign column names if the column names are
+    converted to NaN when you pass the `header=0` argument. We try to assume as
+    little as possible about the structure of the table and push the
+    idiosyncrasies of the HTML contained in the table to the user.
 
-    This function only searches for <table> elements and only for <tr> and <th>
-    rows and <td> elements within those rows. This could be extended by
-    subclassing one of the parser classes contained in :mod:`pandas.io.html`.
+    This function searches for ``<table>`` elements and only for ``<tr>``
+    and ``<th>`` rows and ``<td>`` elements within each ``<tr>`` or ``<th>``
+    element in the table. ``<td>`` stands for "table data".
 
-    Similar to :func:`read_csv` the `header` argument is applied **after**
-    `skiprows` is applied.
+    Similar to :func:`~pandas.read_csv` the `header` argument is applied
+    **after** `skiprows` is applied.
 
     This function will *always* return a list of :class:`DataFrame` *or*
     it will fail, e.g., it will *not* return an empty list.
 
     Examples
     --------
     See the :ref:`read_html documentation in the IO section of the docs
-    <io.read_html>` for many examples of reading HTML.
+    <io.read_html>` for some examples of reading in HTML tables.
+
+    See Also
+    --------
+    pandas.read_csv
     """
-    # Type check here. We don't want to parse only to fail because of an
-    # invalid value of an integer skiprows.
     if infer_types is not None:
-        warnings.warn("infer_types will be removed in 0.14", UserWarning)
+        warnings.warn("infer_types will be removed in 0.14")
     else:
-        infer_types = True  # remove in 0.14
+        infer_types = True  # TODO: remove in 0.14
 
+    # Type check here. We don't want to parse only to fail because of an
+    # invalid value of an integer skiprows.
     if isinstance(skiprows, numbers.Integral) and skiprows < 0:
         raise AssertionError('cannot skip rows starting from the end of the '
                              'data (you passed a negative value)')
     return _parse(flavor, io, match, header, index_col, skiprows, infer_types,
-                  parse_dates, attrs)
+                  parse_dates, tupleize_cols, thousands, attrs)
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 0e1ffee42e655..3ef3cbf856fef 100644
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -1468,23 +1468,22 @@ def _convert_data(self, data):
                 col = self.orig_names[col]
             clean_conv[col] = f
 
-        return self._convert_to_ndarrays(data, self.na_values, self.na_fvalues,
-                                         self.verbose, clean_conv)
+        return self._convert_to_ndarrays(data, self.na_values, self.na_fvalues, self.verbose,
+                                         clean_conv)
 
     def _infer_columns(self):
-        #import ipdb; ipdb.set_trace()
         names = self.names
 
         if self.header is not None:
             header = self.header
 
             # we have a mi columns, so read and extra line
-            if isinstance(header, (list, tuple, np.ndarray)):
+            if isinstance(header,(list,tuple,np.ndarray)):
                 have_mi_columns = True
-                header = list(header) + [header[-1] + 1]
+                header = list(header) + [header[-1]+1]
             else:
                 have_mi_columns = False
-                header = [header]
+                header = [ header ]
 
         columns = []
         for level, hr in enumerate(header):
@@ -1499,7 +1498,7 @@ def _infer_columns(self):
 
                 this_columns = []
                 for i, c in enumerate(line):
-                    if not c:
+                    if c == '':
                         if have_mi_columns:
                             this_columns.append('Unnamed: %d_level_%d' % (i,level))
                         else:
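As a quick illustration of the new keyword arguments (a sketch against this
branch, not code from the patch; the HTML string is invented for the example):

    from pandas.io.html import read_html

    html = """<table>
        <tr><th>Date</th><th>Passengers</th></tr>
        <tr><td>2013-01-01</td><td>374,917</td></tr>
        <tr><td>2013-02-01</td><td>393,152</td></tr>
    </table>"""

    # read_html always returns a list of DataFrames, one per matched table.
    # thousands=',' strips the separators so Passengers parses as integers,
    # and parse_dates is forwarded to TextParser, which converts column 0
    # to datetimes.
    dfs = read_html(html, parse_dates=[0], thousands=',')
    df = dfs[0]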
diff --git a/pandas/io/tests/data/macau.html b/pandas/io/tests/data/macau.html
new file mode 100644
index 0000000000000..be62b3221518d
--- /dev/null
+++ b/pandas/io/tests/data/macau.html
@@ -0,0 +1,3691 @@
+[... 3,691 lines of fixture HTML: a Macau International Airport "Traffic
+Statistics" page with month-by-month "Passengers Figure" and "Movement
+Statistics" tables covering 1995-2013; every figure uses a comma as the
+thousands separator (e.g. 374,917) ...]
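The fixture above is exercised by test_thousands_macau_stats below. What it
checks can be sketched directly at the TextParser level, which is where
_data_to_frame now hands the parsed rows (the list-of-lists form below is the
shape the HTML parsers produce; the figures are borrowed from the fixture):

    from pandas.io.parsers import TextParser

    rows = [['Month', '2013', '2012'],
            ['January', '374,917', '362,379'],
            ['February', '393,152', '312,405']]

    # thousands=',' makes the year columns parse as integers rather than
    # strings, so no column of the resulting DataFrame is all-NaN, which is
    # exactly what the test asserts.
    df = TextParser(rows, header=0, index_col=0, thousands=',').read()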
+ + \ No newline at end of file diff --git a/pandas/io/tests/test_html.py b/pandas/io/tests/test_html.py index 6ce0855c1db7b..a412c3e398462 100644 --- a/pandas/io/tests/test_html.py +++ b/pandas/io/tests/test_html.py @@ -3,13 +3,13 @@ import os import re import warnings +import unittest try: from importlib import import_module except ImportError: import_module = __import__ -from unittest import TestCase from distutils.version import LooseVersion import nose @@ -84,7 +84,7 @@ def test_bs4_version_fails(): flavor='bs4') -class TestReadHtmlBase(TestCase): +class TestReadHtml(unittest.TestCase): def run_read_html(self, *args, **kwargs): kwargs['flavor'] = kwargs.get('flavor', self.flavor) return read_html(*args, **kwargs) @@ -110,8 +110,6 @@ def test_to_html_compat(self): out = df.to_html() res = self.run_read_html(out, attrs={'class': 'dataframe'}, index_col=0)[0] - print(df.dtypes) - print(res.dtypes) tm.assert_frame_equal(res, df) @network @@ -142,9 +140,11 @@ def test_banklist(self): assert_framelist_equal(df1, df2) def test_spam_no_types(self): - df1 = self.run_read_html(self.spam_data, '.*Water.*', - infer_types=False) - df2 = self.run_read_html(self.spam_data, 'Unit', infer_types=False) + with tm.assert_produces_warning(): + df1 = self.run_read_html(self.spam_data, '.*Water.*', + infer_types=False) + with tm.assert_produces_warning(): + df2 = self.run_read_html(self.spam_data, 'Unit', infer_types=False) assert_framelist_equal(df1, df2) @@ -162,12 +162,12 @@ def test_spam_with_types(self): def test_spam_no_match(self): dfs = self.run_read_html(self.spam_data) for df in dfs: - self.assert_(isinstance(df, DataFrame)) + tm.assert_isinstance(df, DataFrame) def test_banklist_no_match(self): dfs = self.run_read_html(self.banklist_data, attrs={'id': 'table'}) for df in dfs: - self.assert_(isinstance(df, DataFrame)) + tm.assert_isinstance(df, DataFrame) def test_spam_header(self): df = self.run_read_html(self.spam_data, '.*Water.*', header=1)[0] @@ -237,10 +237,12 @@ def test_index(self): assert_framelist_equal(df1, df2) def test_header_and_index_no_types(self): - df1 = self.run_read_html(self.spam_data, '.*Water.*', header=1, - index_col=0, infer_types=False) - df2 = self.run_read_html(self.spam_data, 'Unit', header=1, - index_col=0, infer_types=False) + with tm.assert_produces_warning(): + df1 = self.run_read_html(self.spam_data, '.*Water.*', header=1, + index_col=0, infer_types=False) + with tm.assert_produces_warning(): + df2 = self.run_read_html(self.spam_data, 'Unit', header=1, + index_col=0, infer_types=False) assert_framelist_equal(df1, df2) def test_header_and_index_with_types(self): @@ -250,14 +252,17 @@ def test_header_and_index_with_types(self): assert_framelist_equal(df1, df2) def test_infer_types(self): - df1 = self.run_read_html(self.spam_data, '.*Water.*', index_col=0, - infer_types=False) - df2 = self.run_read_html(self.spam_data, 'Unit', index_col=0, - infer_types=False) + with tm.assert_produces_warning(): + df1 = self.run_read_html(self.spam_data, '.*Water.*', index_col=0, + infer_types=False) + with tm.assert_produces_warning(): + df2 = self.run_read_html(self.spam_data, 'Unit', index_col=0, + infer_types=False) assert_framelist_equal(df1, df2) - df2 = self.run_read_html(self.spam_data, 'Unit', index_col=0, - infer_types=True) + with tm.assert_produces_warning(): + df2 = self.run_read_html(self.spam_data, 'Unit', index_col=0, + infer_types=True) self.assertRaises(AssertionError, assert_framelist_equal, df1, df2) @@ -268,25 +273,25 @@ def test_string_io(self): with 
open(self.spam_data) as f: data2 = StringIO(f.read()) - df1 = self.run_read_html(data1, '.*Water.*', infer_types=False) - df2 = self.run_read_html(data2, 'Unit', infer_types=False) + df1 = self.run_read_html(data1, '.*Water.*') + df2 = self.run_read_html(data2, 'Unit') assert_framelist_equal(df1, df2) def test_string(self): with open(self.spam_data) as f: data = f.read() - df1 = self.run_read_html(data, '.*Water.*', infer_types=False) - df2 = self.run_read_html(data, 'Unit', infer_types=False) + df1 = self.run_read_html(data, '.*Water.*') + df2 = self.run_read_html(data, 'Unit') assert_framelist_equal(df1, df2) def test_file_like(self): with open(self.spam_data) as f: - df1 = self.run_read_html(f, '.*Water.*', infer_types=False) + df1 = self.run_read_html(f, '.*Water.*') with open(self.spam_data) as f: - df2 = self.run_read_html(f, 'Unit', infer_types=False) + df2 = self.run_read_html(f, 'Unit') assert_framelist_equal(df1, df2) @@ -305,9 +310,9 @@ def test_file_url(self): url = self.banklist_data dfs = self.run_read_html('file://' + url, 'First', attrs={'id': 'table'}) - self.assert_(isinstance(dfs, list)) + tm.assert_isinstance(dfs, list) for df in dfs: - self.assert_(isinstance(df, DataFrame)) + tm.assert_isinstance(df, DataFrame) @slow def test_invalid_table_attrs(self): @@ -323,30 +328,31 @@ def _bank_data(self, *args, **kwargs): @slow def test_multiindex_header(self): df = self._bank_data(header=[0, 1])[0] - self.assert_(isinstance(df.columns, MultiIndex)) + tm.assert_isinstance(df.columns, MultiIndex) @slow def test_multiindex_index(self): df = self._bank_data(index_col=[0, 1])[0] - self.assert_(isinstance(df.index, MultiIndex)) + tm.assert_isinstance(df.index, MultiIndex) @slow def test_multiindex_header_index(self): df = self._bank_data(header=[0, 1], index_col=[0, 1])[0] - self.assert_(isinstance(df.columns, MultiIndex)) - self.assert_(isinstance(df.index, MultiIndex)) + tm.assert_isinstance(df.columns, MultiIndex) + tm.assert_isinstance(df.index, MultiIndex) @slow def test_multiindex_header_skiprows(self): - import ipdb; ipdb.set_trace() + # wtf does skiprows=1 fail here?!? df = self._bank_data(header=[0, 1], skiprows=1)[0] - self.assert_(isinstance(df.columns, MultiIndex)) + tm.assert_isinstance(df.columns, MultiIndex) @slow def test_multiindex_header_index_skiprows(self): + # wtf does skiprows=1 fail here?!? 
df = self._bank_data(header=[0, 1], index_col=[0, 1], skiprows=1)[0] - self.assert_(isinstance(df.index, MultiIndex)) - self.assert_(isinstance(df.columns, MultiIndex)) + tm.assert_isinstance(df.index, MultiIndex) + tm.assert_isinstance(df.columns, MultiIndex) @slow def test_regex_idempotency(self): @@ -354,9 +360,9 @@ def test_regex_idempotency(self): dfs = self.run_read_html('file://' + url, match=re.compile(re.compile('Florida')), attrs={'id': 'table'}) - self.assert_(isinstance(dfs, list)) + tm.assert_isinstance(dfs, list) for df in dfs: - self.assert_(isinstance(df, DataFrame)) + tm.assert_isinstance(df, DataFrame) def test_negative_skiprows_spam(self): url = self.spam_data @@ -383,6 +389,16 @@ def test_pythonxy_plugins_table(self): zz = [df.iloc[0, 0] for df in dfs] self.assertEqual(sorted(zz), sorted(['Python', 'SciTE'])) + @network + def test_thousands_macau_stats(self): + macau_data = os.path.join(DATA_PATH, 'macau.html') + dfs = self.run_read_html(macau_data, index_col=0, + attrs={'class': 'style1'}) + + # no columns should have all nans + res = any((df.count() == 0).any() for df in dfs) + self.assertEqual(res, False) + @slow def test_banklist_header(self): from pandas.io.html import _remove_whitespace @@ -428,11 +444,78 @@ def test_gold_canyon(self): self.assert_(gc in raw_text) df = self.run_read_html(self.banklist_data, 'Gold Canyon', - attrs={'id': 'table'}, infer_types=False)[0] + attrs={'id': 'table'})[0] self.assert_(gc in df.to_string()) + def test_different_number_of_rows(self): + expected = """ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+            <tr><th></th><th>C_l0_g0</th><th>C_l0_g1</th><th>C_l0_g2</th><th>C_l0_g3</th><th>C_l0_g4</th></tr>
+            <tr><th>R_l0_g0</th><td> 0.763</td><td> 0.233</td><td> nan</td><td> nan</td><td> nan</td></tr>
+            <tr><th>R_l0_g1</th><td> 0.244</td><td> 0.285</td><td> 0.392</td><td> 0.137</td><td> 0.222</td></tr>
""" + out = """ + + + + + + + + + + + + + + + + + + + + + + + + + +
+            <tr><th></th><th>C_l0_g0</th><th>C_l0_g1</th><th>C_l0_g2</th><th>C_l0_g3</th><th>C_l0_g4</th></tr>
+            <tr><th>R_l0_g0</th><td> 0.763</td><td> 0.233</td></tr>
+            <tr><th>R_l0_g1</th><td> 0.244</td><td> 0.285</td><td> 0.392</td><td> 0.137</td><td> 0.222</td></tr>
""" + expected = self.run_read_html(out, attrs={'class': 'dataframe'}, + index_col=0)[0] + res = self.run_read_html(out, attrs={'class': 'dataframe'}, + index_col=0)[0] + tm.assert_frame_equal(expected, res) + + +class TestReadHtmlLxml(unittest.TestCase): + def setUp(self): + self.try_skip() -class TestReadHtmlLxml(TestCase): def run_read_html(self, *args, **kwargs): self.flavor = ['lxml'] self.try_skip() @@ -457,11 +540,8 @@ def test_banklist_data_fail(self): def test_works_on_valid_markup(self): filename = os.path.join(DATA_PATH, 'valid_markup.html') dfs = self.run_read_html(filename, index_col=0, flavor=['lxml']) - self.assert_(isinstance(dfs, list)) - self.assert_(isinstance(dfs[0], DataFrame)) - - def setUp(self): - self.try_skip() + tm.assert_isinstance(dfs, list) + tm.assert_isinstance(dfs[0], DataFrame) @slow def test_fallback_success(self): From 584fbf81419216763e2ceb5fc02c929295d1de69 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Sun, 29 Sep 2013 19:42:03 -0400 Subject: [PATCH 3/5] TST: add test case for GH5048 Parsing headers of non-string columns --- pandas/io/tests/test_html.py | 173 +++++++++++++++++++++-------------- 1 file changed, 105 insertions(+), 68 deletions(-) diff --git a/pandas/io/tests/test_html.py b/pandas/io/tests/test_html.py index a412c3e398462..c266a6c900e90 100644 --- a/pandas/io/tests/test_html.py +++ b/pandas/io/tests/test_html.py @@ -85,7 +85,7 @@ def test_bs4_version_fails(): class TestReadHtml(unittest.TestCase): - def run_read_html(self, *args, **kwargs): + def read_html(self, *args, **kwargs): kwargs['flavor'] = kwargs.get('flavor', self.flavor) return read_html(*args, **kwargs) @@ -108,16 +108,16 @@ def test_to_html_compat(self): df = mkdf(4, 3, data_gen_f=lambda *args: rand(), c_idx_names=False, r_idx_names=False).applymap('{0:.3f}'.format).astype(float) out = df.to_html() - res = self.run_read_html(out, attrs={'class': 'dataframe'}, + res = self.read_html(out, attrs={'class': 'dataframe'}, index_col=0)[0] tm.assert_frame_equal(res, df) @network def test_banklist_url(self): url = 'http://www.fdic.gov/bank/individual/failed/banklist.html' - df1 = self.run_read_html(url, 'First Federal Bank of Florida', + df1 = self.read_html(url, 'First Federal Bank of Florida', attrs={"id": 'table'}) - df2 = self.run_read_html(url, 'Metcalf Bank', attrs={'id': 'table'}) + df2 = self.read_html(url, 'Metcalf Bank', attrs={'id': 'table'}) assert_framelist_equal(df1, df2) @@ -125,26 +125,26 @@ def test_banklist_url(self): def test_spam_url(self): url = ('http://ndb.nal.usda.gov/ndb/foods/show/1732?fg=&man=&' 'lfacet=&format=&count=&max=25&offset=&sort=&qlookup=spam') - df1 = self.run_read_html(url, '.*Water.*') - df2 = self.run_read_html(url, 'Unit') + df1 = self.read_html(url, '.*Water.*') + df2 = self.read_html(url, 'Unit') assert_framelist_equal(df1, df2) @slow def test_banklist(self): - df1 = self.run_read_html(self.banklist_data, '.*Florida.*', + df1 = self.read_html(self.banklist_data, '.*Florida.*', attrs={'id': 'table'}) - df2 = self.run_read_html(self.banklist_data, 'Metcalf Bank', + df2 = self.read_html(self.banklist_data, 'Metcalf Bank', attrs={'id': 'table'}) assert_framelist_equal(df1, df2) def test_spam_no_types(self): with tm.assert_produces_warning(): - df1 = self.run_read_html(self.spam_data, '.*Water.*', + df1 = self.read_html(self.spam_data, '.*Water.*', infer_types=False) with tm.assert_produces_warning(): - df2 = self.run_read_html(self.spam_data, 'Unit', infer_types=False) + df2 = self.read_html(self.spam_data, 'Unit', infer_types=False) 
assert_framelist_equal(df1, df2) @@ -152,116 +152,116 @@ def test_spam_no_types(self): self.assertEqual(df1[0].columns[0], 'Nutrient') def test_spam_with_types(self): - df1 = self.run_read_html(self.spam_data, '.*Water.*') - df2 = self.run_read_html(self.spam_data, 'Unit') + df1 = self.read_html(self.spam_data, '.*Water.*') + df2 = self.read_html(self.spam_data, 'Unit') assert_framelist_equal(df1, df2) self.assertEqual(df1[0].ix[0, 0], 'Proximates') self.assertEqual(df1[0].columns[0], 'Nutrient') def test_spam_no_match(self): - dfs = self.run_read_html(self.spam_data) + dfs = self.read_html(self.spam_data) for df in dfs: tm.assert_isinstance(df, DataFrame) def test_banklist_no_match(self): - dfs = self.run_read_html(self.banklist_data, attrs={'id': 'table'}) + dfs = self.read_html(self.banklist_data, attrs={'id': 'table'}) for df in dfs: tm.assert_isinstance(df, DataFrame) def test_spam_header(self): - df = self.run_read_html(self.spam_data, '.*Water.*', header=1)[0] + df = self.read_html(self.spam_data, '.*Water.*', header=1)[0] self.assertEqual(df.columns[0], 'Proximates') self.assertFalse(df.empty) def test_skiprows_int(self): - df1 = self.run_read_html(self.spam_data, '.*Water.*', skiprows=1) - df2 = self.run_read_html(self.spam_data, 'Unit', skiprows=1) + df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=1) + df2 = self.read_html(self.spam_data, 'Unit', skiprows=1) assert_framelist_equal(df1, df2) def test_skiprows_xrange(self): - df1 = self.run_read_html(self.spam_data, '.*Water.*', + df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=range(2))[0] - df2 = self.run_read_html(self.spam_data, 'Unit', skiprows=range(2))[0] + df2 = self.read_html(self.spam_data, 'Unit', skiprows=range(2))[0] tm.assert_frame_equal(df1, df2) def test_skiprows_list(self): - df1 = self.run_read_html(self.spam_data, '.*Water.*', skiprows=[1, 2]) - df2 = self.run_read_html(self.spam_data, 'Unit', skiprows=[2, 1]) + df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=[1, 2]) + df2 = self.read_html(self.spam_data, 'Unit', skiprows=[2, 1]) assert_framelist_equal(df1, df2) def test_skiprows_set(self): - df1 = self.run_read_html(self.spam_data, '.*Water.*', + df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=set([1, 2])) - df2 = self.run_read_html(self.spam_data, 'Unit', skiprows=set([2, 1])) + df2 = self.read_html(self.spam_data, 'Unit', skiprows=set([2, 1])) assert_framelist_equal(df1, df2) def test_skiprows_slice(self): - df1 = self.run_read_html(self.spam_data, '.*Water.*', skiprows=1) - df2 = self.run_read_html(self.spam_data, 'Unit', skiprows=1) + df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=1) + df2 = self.read_html(self.spam_data, 'Unit', skiprows=1) assert_framelist_equal(df1, df2) def test_skiprows_slice_short(self): - df1 = self.run_read_html(self.spam_data, '.*Water.*', + df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=slice(2)) - df2 = self.run_read_html(self.spam_data, 'Unit', skiprows=slice(2)) + df2 = self.read_html(self.spam_data, 'Unit', skiprows=slice(2)) assert_framelist_equal(df1, df2) def test_skiprows_slice_long(self): - df1 = self.run_read_html(self.spam_data, '.*Water.*', + df1 = self.read_html(self.spam_data, '.*Water.*', skiprows=slice(2, 5)) - df2 = self.run_read_html(self.spam_data, 'Unit', + df2 = self.read_html(self.spam_data, 'Unit', skiprows=slice(4, 1, -1)) assert_framelist_equal(df1, df2) def test_skiprows_ndarray(self): - df1 = self.run_read_html(self.spam_data, '.*Water.*', + df1 = self.read_html(self.spam_data, 
'.*Water.*', skiprows=np.arange(2)) - df2 = self.run_read_html(self.spam_data, 'Unit', skiprows=np.arange(2)) + df2 = self.read_html(self.spam_data, 'Unit', skiprows=np.arange(2)) assert_framelist_equal(df1, df2) def test_skiprows_invalid(self): - self.assertRaises(TypeError, self.run_read_html, self.spam_data, + self.assertRaises(TypeError, self.read_html, self.spam_data, '.*Water.*', skiprows='asdf') def test_index(self): - df1 = self.run_read_html(self.spam_data, '.*Water.*', index_col=0) - df2 = self.run_read_html(self.spam_data, 'Unit', index_col=0) + df1 = self.read_html(self.spam_data, '.*Water.*', index_col=0) + df2 = self.read_html(self.spam_data, 'Unit', index_col=0) assert_framelist_equal(df1, df2) def test_header_and_index_no_types(self): with tm.assert_produces_warning(): - df1 = self.run_read_html(self.spam_data, '.*Water.*', header=1, + df1 = self.read_html(self.spam_data, '.*Water.*', header=1, index_col=0, infer_types=False) with tm.assert_produces_warning(): - df2 = self.run_read_html(self.spam_data, 'Unit', header=1, + df2 = self.read_html(self.spam_data, 'Unit', header=1, index_col=0, infer_types=False) assert_framelist_equal(df1, df2) def test_header_and_index_with_types(self): - df1 = self.run_read_html(self.spam_data, '.*Water.*', header=1, + df1 = self.read_html(self.spam_data, '.*Water.*', header=1, index_col=0) - df2 = self.run_read_html(self.spam_data, 'Unit', header=1, index_col=0) + df2 = self.read_html(self.spam_data, 'Unit', header=1, index_col=0) assert_framelist_equal(df1, df2) def test_infer_types(self): with tm.assert_produces_warning(): - df1 = self.run_read_html(self.spam_data, '.*Water.*', index_col=0, + df1 = self.read_html(self.spam_data, '.*Water.*', index_col=0, infer_types=False) with tm.assert_produces_warning(): - df2 = self.run_read_html(self.spam_data, 'Unit', index_col=0, + df2 = self.read_html(self.spam_data, 'Unit', index_col=0, infer_types=False) assert_framelist_equal(df1, df2) with tm.assert_produces_warning(): - df2 = self.run_read_html(self.spam_data, 'Unit', index_col=0, + df2 = self.read_html(self.spam_data, 'Unit', index_col=0, infer_types=True) self.assertRaises(AssertionError, assert_framelist_equal, df1, df2) @@ -273,42 +273,42 @@ def test_string_io(self): with open(self.spam_data) as f: data2 = StringIO(f.read()) - df1 = self.run_read_html(data1, '.*Water.*') - df2 = self.run_read_html(data2, 'Unit') + df1 = self.read_html(data1, '.*Water.*') + df2 = self.read_html(data2, 'Unit') assert_framelist_equal(df1, df2) def test_string(self): with open(self.spam_data) as f: data = f.read() - df1 = self.run_read_html(data, '.*Water.*') - df2 = self.run_read_html(data, 'Unit') + df1 = self.read_html(data, '.*Water.*') + df2 = self.read_html(data, 'Unit') assert_framelist_equal(df1, df2) def test_file_like(self): with open(self.spam_data) as f: - df1 = self.run_read_html(f, '.*Water.*') + df1 = self.read_html(f, '.*Water.*') with open(self.spam_data) as f: - df2 = self.run_read_html(f, 'Unit') + df2 = self.read_html(f, 'Unit') assert_framelist_equal(df1, df2) @network def test_bad_url_protocol(self): - self.assertRaises(URLError, self.run_read_html, + self.assertRaises(URLError, self.read_html, 'git://github.com', '.*Water.*') @network def test_invalid_url(self): - self.assertRaises(URLError, self.run_read_html, + self.assertRaises(URLError, self.read_html, 'http://www.a23950sdfa908sd.com') @slow def test_file_url(self): url = self.banklist_data - dfs = self.run_read_html('file://' + url, 'First', + dfs = 
self.read_html('file://' + url, 'First', attrs={'id': 'table'}) tm.assert_isinstance(dfs, list) for df in dfs: @@ -317,12 +317,12 @@ def test_file_url(self): @slow def test_invalid_table_attrs(self): url = self.banklist_data - self.assertRaises(AssertionError, self.run_read_html, url, + self.assertRaises(AssertionError, self.read_html, url, 'First Federal Bank of Florida', attrs={'id': 'tasdfable'}) def _bank_data(self, *args, **kwargs): - return self.run_read_html(self.banklist_data, 'Metcalf', + return self.read_html(self.banklist_data, 'Metcalf', attrs={'id': 'table'}, *args, **kwargs) @slow @@ -357,7 +357,7 @@ def test_multiindex_header_index_skiprows(self): @slow def test_regex_idempotency(self): url = self.banklist_data - dfs = self.run_read_html('file://' + url, + dfs = self.read_html('file://' + url, match=re.compile(re.compile('Florida')), attrs={'id': 'table'}) tm.assert_isinstance(dfs, list) @@ -366,39 +366,76 @@ def test_regex_idempotency(self): def test_negative_skiprows_spam(self): url = self.spam_data - self.assertRaises(AssertionError, self.run_read_html, url, 'Water', + self.assertRaises(AssertionError, self.read_html, url, 'Water', skiprows=-1) def test_negative_skiprows_banklist(self): url = self.banklist_data - self.assertRaises(AssertionError, self.run_read_html, url, 'Florida', + self.assertRaises(AssertionError, self.read_html, url, 'Florida', skiprows=-1) @network def test_multiple_matches(self): url = 'http://code.google.com/p/pythonxy/wiki/StandardPlugins' - dfs = self.run_read_html(url, match='Python', + dfs = self.read_html(url, match='Python', attrs={'class': 'wikitable'}) self.assert_(len(dfs) > 1) @network def test_pythonxy_plugins_table(self): url = 'http://code.google.com/p/pythonxy/wiki/StandardPlugins' - dfs = self.run_read_html(url, match='Python', + dfs = self.read_html(url, match='Python', attrs={'class': 'wikitable'}) zz = [df.iloc[0, 0] for df in dfs] self.assertEqual(sorted(zz), sorted(['Python', 'SciTE'])) - @network + @slow def test_thousands_macau_stats(self): macau_data = os.path.join(DATA_PATH, 'macau.html') - dfs = self.run_read_html(macau_data, index_col=0, + dfs = self.read_html(macau_data, index_col=0, attrs={'class': 'style1'}) # no columns should have all nans res = any((df.count() == 0).any() for df in dfs) self.assertEqual(res, False) + def test_countries_municipalities(self): + # GH5048 + data1 = StringIO(u''' + + + + + + + + + + + + + + +
+            <td>Country</td>
+            <td>Municipality</td>
+            <td>Year</td>
+            <td>Ukraine</td>
+            <td>Odessa</td>
+            <td>1944</td>
''') + data2 = StringIO(u''' + + + + + + + + + + + + + +
+            <th>Country</th>
+            <th>Municipality</th>
+            <th>Year</th>
+            <td>Ukraine</td>
+            <td>Odessa</td>
+            <td>1944</td>
''') + res1 = self.read_html(data1) + res2 = self.read_html(data2, header=0) + assert_framelist_equal(res1, res2) + @slow def test_banklist_header(self): from pandas.io.html import _remove_whitespace @@ -409,7 +446,7 @@ def try_remove_ws(x): except AttributeError: return x - df = self.run_read_html(self.banklist_data, 'Metcalf', + df = self.read_html(self.banklist_data, 'Metcalf', attrs={'id': 'table'})[0] ground_truth = read_csv(os.path.join(DATA_PATH, 'banklist.csv'), converters={'Updated Date': Timestamp, @@ -443,7 +480,7 @@ def test_gold_canyon(self): raw_text = f.read() self.assert_(gc in raw_text) - df = self.run_read_html(self.banklist_data, 'Gold Canyon', + df = self.read_html(self.banklist_data, 'Gold Canyon', attrs={'id': 'table'})[0] self.assert_(gc in df.to_string()) @@ -505,9 +542,9 @@ def test_different_number_of_rows(self):
""" - expected = self.run_read_html(out, attrs={'class': 'dataframe'}, + expected = self.read_html(out, attrs={'class': 'dataframe'}, index_col=0)[0] - res = self.run_read_html(out, attrs={'class': 'dataframe'}, + res = self.read_html(out, attrs={'class': 'dataframe'}, index_col=0)[0] tm.assert_frame_equal(expected, res) @@ -516,7 +553,7 @@ class TestReadHtmlLxml(unittest.TestCase): def setUp(self): self.try_skip() - def run_read_html(self, *args, **kwargs): + def read_html(self, *args, **kwargs): self.flavor = ['lxml'] self.try_skip() kwargs['flavor'] = kwargs.get('flavor', self.flavor) @@ -528,18 +565,18 @@ def try_skip(self): def test_spam_data_fail(self): from lxml.etree import XMLSyntaxError spam_data = os.path.join(DATA_PATH, 'spam.html') - self.assertRaises(XMLSyntaxError, self.run_read_html, spam_data, + self.assertRaises(XMLSyntaxError, self.read_html, spam_data, flavor=['lxml']) def test_banklist_data_fail(self): from lxml.etree import XMLSyntaxError banklist_data = os.path.join(DATA_PATH, 'banklist.html') - self.assertRaises(XMLSyntaxError, self.run_read_html, banklist_data, + self.assertRaises(XMLSyntaxError, self.read_html, banklist_data, flavor=['lxml']) def test_works_on_valid_markup(self): filename = os.path.join(DATA_PATH, 'valid_markup.html') - dfs = self.run_read_html(filename, index_col=0, flavor=['lxml']) + dfs = self.read_html(filename, index_col=0, flavor=['lxml']) tm.assert_isinstance(dfs, list) tm.assert_isinstance(dfs[0], DataFrame) @@ -547,7 +584,7 @@ def test_works_on_valid_markup(self): def test_fallback_success(self): _skip_if_none_of(('bs4', 'html5lib')) banklist_data = os.path.join(DATA_PATH, 'banklist.html') - self.run_read_html(banklist_data, '.*Water.*', flavor=['lxml', + self.read_html(banklist_data, '.*Water.*', flavor=['lxml', 'html5lib']) From e22fe1b25d1ff9d5c880f6c35bc91dfab7aa0b84 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Sun, 29 Sep 2013 20:29:07 -0400 Subject: [PATCH 4/5] TST: add nyse wsj test --- pandas/io/html.py | 206 ++-- pandas/io/parsers.py | 35 +- pandas/io/tests/data/nyse_wsj.html | 1207 ++++++++++++++++++++++++ pandas/io/tests/data/valid_markup.html | 37 +- pandas/io/tests/test_html.py | 124 ++- 5 files changed, 1399 insertions(+), 210 deletions(-) create mode 100644 pandas/io/tests/data/nyse_wsj.html diff --git a/pandas/io/html.py b/pandas/io/html.py index 57c3d26b7937c..ac3e3ad096392 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -8,7 +8,6 @@ import numbers import collections import warnings -import itertools from distutils.version import LooseVersion @@ -16,9 +15,10 @@ from pandas.io.common import _is_url, urlopen, parse_url from pandas.io.parsers import TextParser -from pandas.compat import lrange, lmap, u +from pandas.compat import (lrange, lmap, u, string_types, iteritems, text_type, + raise_with_traceback, OrderedDict) from pandas.core import common as com -from pandas import compat, Series +from pandas import Series try: @@ -83,11 +83,6 @@ def _get_skiprows(skiprows): TypeError * If `skiprows` is not a slice, integer, or Container - Raises - ------ - TypeError - * If `skiprows` is not a slice, integer, or Container - Returns ------- it : iterable @@ -99,9 +94,8 @@ def _get_skiprows(skiprows): return skiprows elif skiprows is None: return 0 - else: - raise TypeError('{0!r} is not a valid type for skipping' - ' rows'.format(type(skiprows).__name__)) + raise TypeError('%r is not a valid type for skipping rows' % + type(skiprows).__name__) def _read(io): @@ -123,11 +117,10 @@ def _read(io): elif 
os.path.isfile(io): with open(io) as f: raw_text = f.read() - elif isinstance(io, compat.string_types): + elif isinstance(io, string_types): raw_text = io else: - raise TypeError("Cannot read object of type " - "{0!r}".format(type(io).__name__)) + raise TypeError("Cannot read object of type %r" % type(io).__name__) return raw_text @@ -197,12 +190,6 @@ def _parse_raw_data(self, rows): A callable that takes a row node as input and returns a list of the column node in that row. This must be defined by subclasses. - Raises - ------ - AssertionError - * If `text_getter` is not callable - * If `column_finder` is not callable - Returns ------- data : list of list of strings @@ -257,7 +244,7 @@ def _parse_tables(self, doc, match, attrs): Raises ------ - AssertionError + ValueError * If `match` does not match any text in the document. Returns @@ -409,25 +396,28 @@ def _parse_tfoot(self, table): def _parse_tables(self, doc, match, attrs): element_name = self._strainer.name tables = doc.find_all(element_name, attrs=attrs) + if not tables: - # known sporadically working release - raise AssertionError('No tables found') + raise ValueError('No tables found') - mts = [table.find(text=match) for table in tables] - matched_tables = [mt for mt in mts if mt is not None] - tables = list(set(mt.find_parent(element_name) - for mt in matched_tables)) + result = [] + unique_tables = set() - if not tables: - raise AssertionError("No tables found matching " - "'{0}'".format(match.pattern)) - return tables + for table in tables: + if (table not in unique_tables and + table.find(text=match) is not None): + result.append(table) + unique_tables.add(table) + + if not result: + raise ValueError("No tables found matching pattern %r" % + match.pattern) + return result def _setup_build_doc(self): raw_text = _read(self.io) if not raw_text: - raise AssertionError('No text parsed from document: ' - '{0}'.format(self.io)) + raise ValueError('No text parsed from document: %s' % self.io) return raw_text def _build_doc(self): @@ -435,7 +425,7 @@ def _build_doc(self): return BeautifulSoup(self._setup_build_doc(), features='html5lib') -def _build_node_xpath_expr(attrs): +def _build_xpath_expr(attrs): """Build an xpath expression to simulate bs4's ability to pass in kwargs to search for attributes when using the lxml parser. @@ -453,8 +443,8 @@ def _build_node_xpath_expr(attrs): if 'class_' in attrs: attrs['class'] = attrs.pop('class_') - s = (u("@{k}='{v}'").format(k=k, v=v) for k, v in compat.iteritems(attrs)) - return u('[{0}]').format(' and '.join(s)) + s = [u("@%s=%r") % (k, v) for k, v in iteritems(attrs)] + return u('[%s]') % ' and '.join(s) _re_namespace = {'re': 'http://exslt.org/regular-expressions'} @@ -494,23 +484,20 @@ def _parse_tr(self, table): def _parse_tables(self, doc, match, kwargs): pattern = match.pattern - # check all descendants for the given pattern - check_all_expr = u('//*') - if pattern: - check_all_expr += u("[re:test(text(), '{0}')]").format(pattern) - - # go up the tree until we find a table - check_table_expr = '/ancestor::table' - xpath_expr = check_all_expr + check_table_expr + # 1. check all descendants for the given pattern and only search tables + # 2. 
go up the tree until we find a table or if we are a table use that + query = '//table/*[re:test(text(), %r)]/ancestor-or-self::table' + xpath_expr = u(query) % pattern # if any table attributes were given build an xpath expression to # search for them if kwargs: - xpath_expr += _build_node_xpath_expr(kwargs) + xpath_expr += _build_xpath_expr(kwargs) + tables = doc.xpath(xpath_expr, namespaces=_re_namespace) + if not tables: - raise AssertionError("No tables found matching regex " - "'{0}'".format(pattern)) + raise ValueError("No tables found matching regex %r" % pattern) return tables def _build_doc(self): @@ -531,6 +518,7 @@ def _build_doc(self): """ from lxml.html import parse, fromstring, HTMLParser from lxml.etree import XMLSyntaxError + parser = HTMLParser(recover=False) try: @@ -555,8 +543,8 @@ def _build_doc(self): scheme = parse_url(self.io).scheme if scheme not in _valid_schemes: # lxml can't parse it - msg = ('{0} is not a valid url scheme, valid schemes are ' - '{1}').format(scheme, _valid_schemes) + msg = ('%r is not a valid url scheme, valid schemes are ' + '%s') % (scheme, _valid_schemes) raise ValueError(msg) else: # something else happened: maybe a faulty connection @@ -586,17 +574,13 @@ def _parse_raw_tfoot(self, table): table.xpath(expr)] -def _nan_list(n): - return list(itertools.repeat(np.nan, n)) - - def _expand_elements(body): lens = Series(lmap(len, body)) lens_max = lens.max() not_max = lens[lens != lens_max] - for ind, length in compat.iteritems(not_max): - body[ind] += _nan_list(lens_max - length) + for ind, length in iteritems(not_max): + body[ind] += [np.nan] * (lens_max - length) def _data_to_frame(data, header, index_col, skiprows, infer_types, @@ -618,10 +602,10 @@ def _data_to_frame(data, header, index_col, skiprows, infer_types, thousands=thousands) df = tp.read() - if infer_types: # TODO: remove in 0.14 + if infer_types: # TODO: rm this code so infer_types has no effect in 0.14 df = df.convert_objects(convert_dates='coerce') else: - df = df.applymap(compat.text_type) + df = df.applymap(text_type) return df @@ -645,15 +629,15 @@ def _parser_dispatch(flavor): Raises ------ - AssertionError + ValueError * If `flavor` is not a valid backend. ImportError * If you do not have the requested `flavor` """ valid_parsers = list(_valid_parsers.keys()) if flavor not in valid_parsers: - raise AssertionError('"{0!r}" is not a valid flavor, valid flavors are' - ' {1}'.format(flavor, valid_parsers)) + raise ValueError('%r is not a valid flavor, valid flavors are %s' % + (flavor, valid_parsers)) if flavor in ('bs4', 'html5lib'): if not _HAS_HTML5LIB: @@ -661,47 +645,54 @@ def _parser_dispatch(flavor): if not _HAS_BS4: raise ImportError("bs4 not found please install it") if bs4.__version__ == LooseVersion('4.2.0'): - raise AssertionError("You're using a version" - " of BeautifulSoup4 (4.2.0) that has been" - " known to cause problems on certain" - " operating systems such as Debian. " - "Please install a version of" - " BeautifulSoup4 != 4.2.0, both earlier" - " and later releases will work.") + raise ValueError("You're using a version" + " of BeautifulSoup4 (4.2.0) that has been" + " known to cause problems on certain" + " operating systems such as Debian. 
" + "Please install a version of" + " BeautifulSoup4 != 4.2.0, both earlier" + " and later releases will work.") else: if not _HAS_LXML: raise ImportError("lxml not found please install it") return _valid_parsers[flavor] -def _validate_parser_flavor(flavor): +def _print_as_set(s): + return '{%s}' % ', '.join([com.pprint_thing(el) for el in s]) + + +def _validate_flavor(flavor): if flavor is None: - flavor = ['lxml', 'bs4'] - elif isinstance(flavor, compat.string_types): - flavor = [flavor] + flavor = 'lxml', 'bs4' + elif isinstance(flavor, string_types): + flavor = flavor, elif isinstance(flavor, collections.Iterable): - if not all(isinstance(flav, compat.string_types) for flav in flavor): - raise TypeError('{0} is not an iterable of strings'.format(flavor)) + if not all(isinstance(flav, string_types) for flav in flavor): + raise TypeError('Object of type %r is not an iterable of strings' % + type(flavor).__name__) else: - raise TypeError('{0} is not a valid "flavor"'.format(flavor)) - - flavor = list(flavor) - valid_flavors = list(_valid_parsers.keys()) - - if not set(flavor) & set(valid_flavors): - raise ValueError('{0} is not a valid set of flavors, valid flavors are' - ' {1}'.format(flavor, valid_flavors)) + fmt = '{0!r}' if isinstance(flavor, string_types) else '{0}' + fmt += ' is not a valid flavor' + raise ValueError(fmt.format(flavor)) + + flavor = tuple(flavor) + valid_flavors = set(_valid_parsers) + flavor_set = set(flavor) + + if not flavor_set & valid_flavors: + raise ValueError('%s is not a valid set of flavors, valid flavors are ' + '%s' % (_print_as_set(flavor_set), + _print_as_set(valid_flavors))) return flavor def _parse(flavor, io, match, header, index_col, skiprows, infer_types, parse_dates, tupleize_cols, thousands, attrs): - # bonus: re.compile is idempotent under function iteration so you can pass - # a compiled regex to it and it will return itself - flavor = _validate_parser_flavor(flavor) - compiled_match = re.compile(match) + flavor = _validate_flavor(flavor) + compiled_match = re.compile(match) # you can pass a compiled regex here - # ugly hack because python 3 DELETES the exception variable! + # hack around python 3 deleting the exception variable retained = None for flav in flavor: parser = _parser_dispatch(flav) @@ -714,7 +705,7 @@ def _parse(flavor, io, match, header, index_col, skiprows, infer_types, else: break else: - raise retained + raise_with_traceback(retained) return [_data_to_frame(table, header, index_col, skiprows, infer_types, parse_dates, tupleize_cols, thousands) @@ -724,14 +715,14 @@ def _parse(flavor, io, match, header, index_col, skiprows, infer_types, def read_html(io, match='.+', flavor=None, header=None, index_col=None, skiprows=None, infer_types=None, attrs=None, parse_dates=False, tupleize_cols=False, thousands=','): - r"""Read HTML tables into a ``list`` of DataFrames. + r"""Read HTML tables into a ``list`` of ``DataFrame`` objects. Parameters ---------- io : str or file-like A URL, a file-like object, or a raw string containing HTML. Note that lxml only accepts the http, ftp and file url protocols. If you have a - URL that starts with ``'https'`` you might removing the ``'s'``. + URL that starts with ``'https'`` you might try removing the ``'s'``. match : str or compiled regular expression, optional The set of tables containing text matching this regex or string will be @@ -755,25 +746,14 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, The column (or list of columns) to use to create the index. 
     skiprows : int or list-like or slice or None, optional
-        If an integer is given then skip this many rows after parsing the
-        column header. If a sequence of integers is given skip those specific
-        rows (0-based). Note that
-
-        .. code-block:: python
-
-           pandas.read_html(..., skiprows=0)
-
-        yields the same result as
-
-        .. code-block:: python
-
-           pandas.read_html(..., skiprows=None)
-
-        If `skiprows` is a positive integer, say :math:`n`, then
-        it is treated as "skip :math:`n` rows", *not* as "skip the
-        :math:`n^\textrm{th}` row".
+        0-based. Number of rows to skip after parsing the column header. If a
+        sequence of integers or a slice is given, will skip the rows indexed by
+        that sequence. Note that a single element sequence means 'skip the nth
+        row' whereas an integer means 'skip n rows'.

-    infer_types : bool, optional, deprecated since 0.13, removed in 0.14
+    infer_types : bool, optional
+        This option is deprecated in 0.13, and will have no effect in 0.14. It
+        defaults to ``True``.

     attrs : dict or None, optional
         This is a dictionary of attributes that you can pass to use to identify
@@ -811,11 +791,7 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
         contrast to other IO functions which default to ``True``.

     thousands : str, optional
-        Separator to use to parse thousands. Defaults to ``','``. Note that
-        this is different from :func:`~pandas.read_csv` because
-        :func:`~pandas.read_csv` must be able to parse different separators,
-        and the default separator is ``','``. :func:`~pandas.read_html` does
-        not need to do this, so it defaults to ``','``.
+        Separator to use to parse thousands. Defaults to ``','``.

     Returns
     -------
@@ -852,14 +828,14 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
     pandas.read_csv
     """
     if infer_types is not None:
-        warnings.warn("infer_types will be removed in 0.14")
+        warnings.warn("infer_types will have no effect in 0.14", FutureWarning)
     else:
         infer_types = True  # TODO: remove in 0.14

     # Type check here. We don't want to parse only to fail because of an
     # invalid value of an integer skiprows.
if isinstance(skiprows, numbers.Integral) and skiprows < 0: - raise AssertionError('cannot skip rows starting from the end of the ' - 'data (you passed a negative value)') + raise ValueError('cannot skip rows starting from the end of the ' + 'data (you passed a negative value)') return _parse(flavor, io, match, header, index_col, skiprows, infer_types, parse_dates, tupleize_cols, thousands, attrs) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 3ef3cbf856fef..8a2f249f6af06 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -606,16 +606,10 @@ def _failover_to_python(self): raise NotImplementedError def read(self, nrows=None): - suppressed_warnings = False if nrows is not None: if self.options.get('skip_footer'): raise ValueError('skip_footer not supported for iteration') - # # XXX hack - # if isinstance(self._engine, CParserWrapper): - # suppressed_warnings = True - # self._engine.set_error_bad_lines(False) - ret = self._engine.read(nrows) if self.options.get('as_recarray'): @@ -710,7 +704,6 @@ def _should_parse_dates(self, i): else: return (j in self.parse_dates) or (name in self.parse_dates) - def _extract_multi_indexer_columns(self, header, index_names, col_names, passed_names=False): """ extract and return the names, index_names, col_names header is a list-of-lists returned from the parsers """ @@ -728,12 +721,10 @@ def _extract_multi_indexer_columns(self, header, index_names, col_names, passed_ ic = [ ic ] sic = set(ic) - orig_header = list(header) - # clean the index_names index_names = header.pop(-1) - (index_names, names, - index_col) = _clean_index_names(index_names, self.index_col) + index_names, names, index_col = _clean_index_names(index_names, + self.index_col) # extract the columns field_count = len(header[0]) @@ -766,7 +757,7 @@ def _maybe_make_multi_index_columns(self, columns, col_names=None): return columns def _make_index(self, data, alldata, columns, indexnamerow=False): - if not _is_index_col(self.index_col) or len(self.index_col) == 0: + if not _is_index_col(self.index_col) or not self.index_col: index = None elif not self._has_complex_date_col: @@ -1430,7 +1421,7 @@ def read(self, rows=None): self._first_chunk = False columns = list(self.orig_names) - if len(content) == 0: # pragma: no cover + if not len(content): # pragma: no cover # DataFrame with the right metadata, even though it's length 0 return _get_empty_meta(self.orig_names, self.index_col, @@ -1468,8 +1459,8 @@ def _convert_data(self, data): col = self.orig_names[col] clean_conv[col] = f - return self._convert_to_ndarrays(data, self.na_values, self.na_fvalues, self.verbose, - clean_conv) + return self._convert_to_ndarrays(data, self.na_values, self.na_fvalues, + self.verbose, clean_conv) def _infer_columns(self): names = self.names @@ -1478,16 +1469,15 @@ def _infer_columns(self): header = self.header # we have a mi columns, so read and extra line - if isinstance(header,(list,tuple,np.ndarray)): + if isinstance(header, (list, tuple, np.ndarray)): have_mi_columns = True - header = list(header) + [header[-1]+1] + header = list(header) + [header[-1] + 1] else: have_mi_columns = False - header = [ header ] + header = [header] columns = [] for level, hr in enumerate(header): - if len(self.buf) > 0: line = self.buf[0] else: @@ -1521,10 +1511,11 @@ def _infer_columns(self): if names is not None: if len(names) != len(columns[0]): - raise Exception('Number of passed names did not match ' - 'number of header fields in the file') + raise ValueError('Number of passed names did not match ' + 
'number of header fields in the file') if len(columns) > 1: - raise Exception('Cannot pass names with multi-index columns') + raise TypeError('Cannot pass names with multi-index ' + 'columns') columns = [ names ] else: diff --git a/pandas/io/tests/data/nyse_wsj.html b/pandas/io/tests/data/nyse_wsj.html new file mode 100644 index 0000000000000..aa3d470a5fbc6 --- /dev/null +++ b/pandas/io/tests/data/nyse_wsj.html @@ -0,0 +1,1207 @@ + + + + + + +
+    <th>&nbsp;</th>
+    <th>Issue(Roll over for charts and headlines)</th>
+    <th>Volume</th>
+    <th>Price</th>
+    <th>Chg</th>
+    <th>% Chg</th>
1 + J.C. Penney (JCP) + + 250,697,455$9.05-1.37-13.15
2 + Bank of America (BAC) + + 77,162,10313.90-0.18-1.28
3 + Rite Aid (RAD) + + 52,140,3824.70-0.08-1.67
4 + Ford Motor (F) + + 33,745,28717.05-0.22-1.27
5 + Pfizer (PFE) + + 27,801,85328.880.361.26
6 + Hertz Global Hldgs (HTZ) + + 25,821,26422.320.693.19
7 + General Electric (GE) + + 25,142,06424.05-0.20-0.82
8 + Elan ADS (ELN) + + 24,725,20915.590.080.52
9 + JPMorgan Chase (JPM) + + 22,402,75652.240.350.67
10 + Regions Financial (RF) + + 20,790,5329.300.121.31
11 + Violin Memory (VMEM) + + 20,669,8467.02-1.98-22.00
12 + Citigroup (C) + + 19,979,93248.89-0.04-0.08
13 + Nokia ADS (NOK) + + 19,585,0756.660.020.30
14 + Wells Fargo (WFC) + + 19,478,59041.59-0.02-0.05
15 + Vale ADS (VALE) + + 18,781,98715.60-0.52-3.23
16 + Delta Air Lines (DAL) + + 16,013,95623.57-0.44-1.83
17 + EMC (EMC) + + 15,771,25226.07-0.11-0.42
18 + Nike Cl B (NKE) + + 15,514,71773.643.304.69
19 + Alcoa (AA) + + 14,061,0738.20-0.07-0.85
20 + General Motors (GM) + + 13,984,00436.37-0.58-1.57
21 + Oracle (ORCL) + + 13,856,67133.78-0.03-0.09
22 + AT&T (T) + + 13,736,94833.98-0.25-0.73
23 + Trina Solar ADS (TSL) + + 13,284,20214.831.9915.50
24 + Yingli Green Energy Holding ADS (YGE) + + 12,978,3786.730.6310.33
25 + Petroleo Brasileiro ADS (PBR) + + 12,833,66015.40-0.21-1.35
26 + United Continental Holdings (UAL) + + 12,603,22530.91-3.16-9.28
27 + Coca-Cola (KO) + + 12,343,45238.40-0.34-0.88
28 + Arch Coal (ACI) + + 12,261,1384.25-0.28-6.18
29 + Morgan Stanley (MS) + + 11,956,34527.08-0.07-0.26
30 + Pandora Media (P) + + 11,829,96325.520.130.51
31 + Barrick Gold (ABX) + + 11,775,58518.530.000.00
32 + Abbott Laboratories (ABT) + + 11,755,71833.14-0.52-1.54
33 + Banco Santander Brasil ADS (BSBR) + + 11,587,3107.010.467.02
34 + Advanced Micro Devices (AMD) + + 11,337,6093.86-0.03-0.77
35 + Annaly Capital Management (NLY) + + 11,004,44011.63-0.07-0.60
36 + Alpha Natural Resources (ANR) + + 10,941,0746.08-0.19-3.03
37 + Exxon Mobil (XOM) + + 10,668,11586.90-0.17-0.20
38 + Itau Unibanco Holding ADS (ITUB) + + 10,638,80314.300.231.63
39 + Merck&Co (MRK) + + 10,388,15247.790.110.23
40 + Alcatel-Lucent ADS (ALU) + + 10,181,8333.650.010.27
41 + Verizon Communications (VZ) + + 10,139,32147.00-0.67-1.41
42 + Magnum Hunter Resources (MHR) + + 10,004,3036.330.467.84
43 + Hewlett-Packard (HPQ) + + 9,948,93521.17-0.13-0.61
44 + PulteGroup (PHM) + + 9,899,14116.57-0.41-2.41
45 + ReneSola ADS (SOL) + + 9,667,4384.840.398.76
46 + Corning (GLW) + + 9,547,26514.73-0.21-1.41
47 + Cole Real Estate Investments (COLE) + + 9,544,02112.210.010.08
48 + Dow Chemical (DOW) + + 9,150,47939.02-0.97-2.43
49 + International Game Technology (IGT) + + 9,129,12319.23-1.44-6.97
50 + Accenture Cl A (ACN) + + 8,773,26074.09-1.78-2.35
51 + KeyCorp (KEY) + + 8,599,33311.360.020.18
52 + Bristol-Myers Squibb (BMY) + + 8,440,70946.20-0.73-1.56
53 + Companhia Siderurgica Nacional ADS (SID) + + 8,437,6364.36-0.05-1.13
54 + H&R Block (HRB) + + 8,240,98426.360.311.19
55 + MGIC Investment (MTG) + + 8,135,0377.26-0.10-1.36
56 + RingCentral Cl A (RNG) + + 8,117,46918.205.2040.00
57 + United States Steel (X) + + 8,107,89920.44-0.66-3.13
58 + Cliffs Natural Resources (CLF) + + 8,041,57221.00-0.83-3.80
59 + Newmont Mining (NEM) + + 8,014,25027.98-0.19-0.67
60 + Altria Group (MO) + + 7,786,04834.71-0.29-0.83
61 + SandRidge Energy (SD) + + 7,782,7455.93-0.06-1.00
62 + Molycorp (MCP) + + 7,735,8316.73-0.45-6.27
63 + Halliburton (HAL) + + 7,728,73548.39-0.32-0.66
64 + Taiwan Semiconductor Manufacturing ADS (TSM) + + 7,661,39717.07-0.25-1.44
65 + Freeport-McMoRan Copper&Gold (FCX) + + 7,622,80333.42-0.45-1.33
66 + Kodiak Oil&Gas (KOG) + + 7,543,80611.940.161.36
67 + Xerox (XRX) + + 7,440,68910.37-0.01-0.10
68 + Sprint (S) + + 7,291,3516.16-0.14-2.22
69 + Two Harbors Investment (TWO) + + 7,153,8039.790.050.51
70 + Walter Energy (WLT) + + 7,152,19214.19-0.36-2.47
71 + International Paper (IP) + + 7,123,72245.44-1.85-3.91
72 + PPL (PPL) + + 7,026,29230.34-0.13-0.43
73 + Goldcorp (GG) + + 6,857,44725.760.080.31
74 + Time Warner (TWX) + + 6,807,23766.201.332.05
75 + Synovus Financial (SNV) + + 6,764,8053.290.020.61
76 + AK Steel Holding (AKS) + + 6,662,5993.83-0.11-2.79
77 + Boston Scientific (BSX) + + 6,629,08411.52-0.15-1.29
78 + Eldorado Gold (EGO) + + 6,596,9026.65-0.03-0.45
79 + Newpark Resources (NR) + + 6,552,45312.560.090.72
80 + AbbVie (ABBV) + + 6,525,52444.33-0.67-1.49
81 + MBIA (MBI) + + 6,416,58710.38-0.43-3.98
82 + SAIC (SAI) + + 6,404,58716.030.130.82
83 + Procter&Gamble (PG) + + 6,389,14377.21-0.84-1.08
84 + IAMGOLD (IAG) + + 6,293,0014.77-0.06-1.24
85 + Safeway (SWY) + + 6,268,18432.25-0.29-0.89
86 + Kinross Gold (KGC) + + 6,112,6584.99-0.03-0.60
87 + MGM Resorts International (MGM) + + 5,986,14320.22-0.05-0.25
88 + Cemex ADS (CX) + + 5,907,04011.27-0.06-0.53
89 + American International Group (AIG) + + 5,900,13349.15-0.30-0.61
90 + Chesapeake Energy (CHK) + + 5,848,01626.21-0.20-0.76
91 + RadioShack (RSH) + + 5,837,8333.44-0.43-11.11
92 + U.S. Bancorp (USB) + + 5,814,37336.50-0.04-0.11
93 + Eli Lilly (LLY) + + 5,776,99150.50-0.54-1.06
94 + MetLife (MET) + + 5,774,99647.21-0.37-0.78
95 + Yamana Gold (AUY) + + 5,742,42610.370.030.29
96 + CBS Cl B (CBS) + + 5,718,85855.50-0.06-0.11
97 + CSX (CSX) + + 5,710,06625.85-0.13-0.50
98 + Carnival (CCL) + + 5,661,32532.88-0.05-0.15
99 + Mosaic (MOS) + + 5,595,59243.43-0.76-1.72
100 + Walgreen (WAG) + + 5,568,31054.51-0.22-0.40
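For anyone who wants to exercise the new fixture by hand, here is a minimal
sketch (assuming this branch is installed and the interpreter is started from
the repo root; the file path and the ``mdcTable`` class come from the fixture
above, everything else is illustrative):

    import pandas as pd

    # Volumes in the WSJ table look like '250,697,455'; with the default
    # thousands=',' the TextParser-backed read_html parses them as numbers
    # rather than leaving them as strings.
    dfs = pd.read_html('pandas/io/tests/data/nyse_wsj.html', index_col=0,
                       header=0, attrs={'class': 'mdcTable'})
    df = dfs[0]
    print(df.shape)             # expected per the new test: (100, 5)
    print(df['Volume'].head())  # numeric, commas stripped
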
diff --git a/pandas/io/tests/data/valid_markup.html b/pandas/io/tests/data/valid_markup.html index 5db90da3baec4..0130e9ed9d5f3 100644 --- a/pandas/io/tests/data/valid_markup.html +++ b/pandas/io/tests/data/valid_markup.html @@ -35,35 +35,26 @@
7 0
443
554
645
714
+ + + + + + + + - + - - - + + +
ab
80 6 7
985140
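The reshuffled valid_markup.html above also backs the new test_same_ordering
check added at the end of this series. A hand-rolled version of the same
comparison, assuming both lxml and bs4 + html5lib are installed:

    import pandas.util.testing as tm
    from pandas.io.html import read_html

    filename = 'pandas/io/tests/data/valid_markup.html'
    # Matched tables are now returned in document order for both backends,
    # so the two lists should line up frame-for-frame.
    dfs_lxml = read_html(filename, index_col=0, flavor=['lxml'])
    dfs_bs4 = read_html(filename, index_col=0, flavor=['bs4'])
    assert len(dfs_lxml) == len(dfs_bs4)
    for left, right in zip(dfs_lxml, dfs_bs4):
        tm.assert_frame_equal(left, right)
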
diff --git a/pandas/io/tests/test_html.py b/pandas/io/tests/test_html.py index c266a6c900e90..762cd24af7be9 100644 --- a/pandas/io/tests/test_html.py +++ b/pandas/io/tests/test_html.py @@ -18,7 +18,7 @@ from numpy.random import rand from numpy.testing.decorators import slow -from pandas import DataFrame, MultiIndex, read_csv, Timestamp +from pandas import DataFrame, MultiIndex, read_csv, Timestamp, Index from pandas.compat import map, zip, StringIO, string_types from pandas.io.common import URLError, urlopen from pandas.io.html import read_html @@ -140,10 +140,10 @@ def test_banklist(self): assert_framelist_equal(df1, df2) def test_spam_no_types(self): - with tm.assert_produces_warning(): + with tm.assert_produces_warning(FutureWarning): df1 = self.read_html(self.spam_data, '.*Water.*', infer_types=False) - with tm.assert_produces_warning(): + with tm.assert_produces_warning(FutureWarning): df2 = self.read_html(self.spam_data, 'Unit', infer_types=False) assert_framelist_equal(df1, df2) @@ -228,8 +228,9 @@ def test_skiprows_ndarray(self): assert_framelist_equal(df1, df2) def test_skiprows_invalid(self): - self.assertRaises(TypeError, self.read_html, self.spam_data, - '.*Water.*', skiprows='asdf') + with tm.assertRaisesRegexp(TypeError, + 'is not a valid type for skipping rows'): + self.read_html(self.spam_data, '.*Water.*', skiprows='asdf') def test_index(self): df1 = self.read_html(self.spam_data, '.*Water.*', index_col=0) @@ -237,10 +238,10 @@ def test_index(self): assert_framelist_equal(df1, df2) def test_header_and_index_no_types(self): - with tm.assert_produces_warning(): + with tm.assert_produces_warning(FutureWarning): df1 = self.read_html(self.spam_data, '.*Water.*', header=1, index_col=0, infer_types=False) - with tm.assert_produces_warning(): + with tm.assert_produces_warning(FutureWarning): df2 = self.read_html(self.spam_data, 'Unit', header=1, index_col=0, infer_types=False) assert_framelist_equal(df1, df2) @@ -252,19 +253,20 @@ def test_header_and_index_with_types(self): assert_framelist_equal(df1, df2) def test_infer_types(self): - with tm.assert_produces_warning(): + with tm.assert_produces_warning(FutureWarning): df1 = self.read_html(self.spam_data, '.*Water.*', index_col=0, infer_types=False) - with tm.assert_produces_warning(): + with tm.assert_produces_warning(FutureWarning): df2 = self.read_html(self.spam_data, 'Unit', index_col=0, infer_types=False) assert_framelist_equal(df1, df2) - with tm.assert_produces_warning(): + with tm.assert_produces_warning(FutureWarning): df2 = self.read_html(self.spam_data, 'Unit', index_col=0, infer_types=True) - self.assertRaises(AssertionError, assert_framelist_equal, df1, df2) + with tm.assertRaises(AssertionError): + assert_framelist_equal(df1, df2) def test_string_io(self): with open(self.spam_data) as f: @@ -297,19 +299,18 @@ def test_file_like(self): @network def test_bad_url_protocol(self): - self.assertRaises(URLError, self.read_html, - 'git://github.com', '.*Water.*') + with tm.assertRaises(URLError): + self.read_html('git://github.com', match='.*Water.*') @network def test_invalid_url(self): - self.assertRaises(URLError, self.read_html, - 'http://www.a23950sdfa908sd.com') + with tm.assertRaises(URLError): + self.read_html('http://www.a23950sdfa908sd.com', match='.*Water.*') @slow def test_file_url(self): url = self.banklist_data - dfs = self.read_html('file://' + url, 'First', - attrs={'id': 'table'}) + dfs = self.read_html('file://' + url, 'First', attrs={'id': 'table'}) 
tm.assert_isinstance(dfs, list) for df in dfs: tm.assert_isinstance(df, DataFrame) @@ -317,13 +318,13 @@ def test_file_url(self): @slow def test_invalid_table_attrs(self): url = self.banklist_data - self.assertRaises(AssertionError, self.read_html, url, - 'First Federal Bank of Florida', - attrs={'id': 'tasdfable'}) + with tm.assertRaisesRegexp(ValueError, 'No tables found'): + self.read_html(url, 'First Federal Bank of Florida', + attrs={'id': 'tasdfable'}) def _bank_data(self, *args, **kwargs): return self.read_html(self.banklist_data, 'Metcalf', - attrs={'id': 'table'}, *args, **kwargs) + attrs={'id': 'table'}, *args, **kwargs) @slow def test_multiindex_header(self): @@ -341,15 +342,18 @@ def test_multiindex_header_index(self): tm.assert_isinstance(df.columns, MultiIndex) tm.assert_isinstance(df.index, MultiIndex) + @slow + def test_multiindex_header_skiprows_tuples(self): + df = self._bank_data(header=[0, 1], skiprows=1, tupleize_cols=True)[0] + tm.assert_isinstance(df.columns, Index) + @slow def test_multiindex_header_skiprows(self): - # wtf does skiprows=1 fail here?!? df = self._bank_data(header=[0, 1], skiprows=1)[0] tm.assert_isinstance(df.columns, MultiIndex) @slow def test_multiindex_header_index_skiprows(self): - # wtf does skiprows=1 fail here?!? df = self._bank_data(header=[0, 1], index_col=[0, 1], skiprows=1)[0] tm.assert_isinstance(df.index, MultiIndex) tm.assert_isinstance(df.columns, MultiIndex) @@ -364,15 +368,10 @@ def test_regex_idempotency(self): for df in dfs: tm.assert_isinstance(df, DataFrame) - def test_negative_skiprows_spam(self): - url = self.spam_data - self.assertRaises(AssertionError, self.read_html, url, 'Water', - skiprows=-1) - - def test_negative_skiprows_banklist(self): - url = self.banklist_data - self.assertRaises(AssertionError, self.read_html, url, 'Florida', - skiprows=-1) + def test_negative_skiprows(self): + with tm.assertRaisesRegexp(ValueError, + '\(you passed a negative value\)'): + self.read_html(self.spam_data, 'Water', skiprows=-1) @network def test_multiple_matches(self): @@ -391,17 +390,26 @@ def test_pythonxy_plugins_table(self): @slow def test_thousands_macau_stats(self): + all_non_nan_table_index = -2 macau_data = os.path.join(DATA_PATH, 'macau.html') dfs = self.read_html(macau_data, index_col=0, - attrs={'class': 'style1'}) + attrs={'class': 'style1'}) + df = dfs[all_non_nan_table_index] - # no columns should have all nans - res = any((df.count() == 0).any() for df in dfs) - self.assertEqual(res, False) + self.assertFalse(any(s.isnull().any() for _, s in df.iteritems())) + + @slow + def test_thousands_macau_index_col(self): + all_non_nan_table_index = -2 + macau_data = os.path.join(DATA_PATH, 'macau.html') + dfs = self.read_html(macau_data, index_col=0, header=0) + df = dfs[all_non_nan_table_index] + + self.assertFalse(any(s.isnull().any() for _, s in df.iteritems())) def test_countries_municipalities(self): # GH5048 - data1 = StringIO(u''' + data1 = StringIO('''
@@ -417,7 +425,7 @@ def test_countries_municipalities(self):
Country
''') - data2 = StringIO(u''' + data2 = StringIO(''' @@ -436,6 +444,17 @@ def test_countries_municipalities(self): res2 = self.read_html(data2, header=0) assert_framelist_equal(res1, res2) + def test_nyse_wsj_commas_table(self): + data = os.path.join(DATA_PATH, 'nyse_wsj.html') + df = self.read_html(data, index_col=0, header=0, + attrs={'class': 'mdcTable'})[0] + + columns = Index(['Issue(Roll over for charts and headlines)', + 'Volume', 'Price', 'Chg', '% Chg']) + nrows = 100 + self.assertEqual(df.shape[0], nrows) + self.assertTrue(df.columns.equals(columns)) + @slow def test_banklist_header(self): from pandas.io.html import _remove_whitespace @@ -542,10 +561,8 @@ def test_different_number_of_rows(self):
""" - expected = self.read_html(out, attrs={'class': 'dataframe'}, - index_col=0)[0] - res = self.read_html(out, attrs={'class': 'dataframe'}, - index_col=0)[0] + expected = self.read_html(expected, index_col=0)[0] + res = self.read_html(out, index_col=0)[0] tm.assert_frame_equal(expected, res) @@ -562,17 +579,16 @@ def read_html(self, *args, **kwargs): def try_skip(self): _skip_if_no('lxml') - def test_spam_data_fail(self): + def test_data_fail(self): from lxml.etree import XMLSyntaxError spam_data = os.path.join(DATA_PATH, 'spam.html') - self.assertRaises(XMLSyntaxError, self.read_html, spam_data, - flavor=['lxml']) - - def test_banklist_data_fail(self): - from lxml.etree import XMLSyntaxError banklist_data = os.path.join(DATA_PATH, 'banklist.html') - self.assertRaises(XMLSyntaxError, self.read_html, banklist_data, - flavor=['lxml']) + + with tm.assertRaises(XMLSyntaxError): + self.read_html(spam_data, flavor=['lxml']) + + with tm.assertRaises(XMLSyntaxError): + self.read_html(banklist_data, flavor=['lxml']) def test_works_on_valid_markup(self): filename = os.path.join(DATA_PATH, 'valid_markup.html') @@ -628,3 +644,11 @@ def test_lxml_finds_tables(): def test_lxml_finds_tbody(): filepath = os.path.join(DATA_PATH, "spam.html") assert get_lxml_elements(filepath, 'tbody') + + +def test_same_ordering(): + _skip_if_none_of(['bs4', 'lxml', 'html5lib']) + filename = os.path.join(DATA_PATH, 'valid_markup.html') + dfs_lxml = read_html(filename, index_col=0, flavor=['lxml']) + dfs_bs4 = read_html(filename, index_col=0, flavor=['bs4']) + assert_framelist_equal(dfs_lxml, dfs_bs4) From bea34eb10fa5536caf6227dfbbc4744937d5e98f Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Wed, 2 Oct 2013 20:50:10 -0400 Subject: [PATCH 5/5] CLN: proper parse_dates support --- doc/source/release.rst | 13 +++++++++++++ pandas/io/html.py | 11 +++++------ pandas/io/tests/test_html.py | 18 +++++++++++++++++- 3 files changed, 35 insertions(+), 7 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index 4f4681b112664..78236bbf821dd 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -167,6 +167,8 @@ Improvements to existing features - Improve support for converting R datasets to pandas objects (more informative index for timeseries and numeric, support for factors, dist, and high-dimensional arrays). + - :func:`~pandas.read_html` now supports the ``parse_dates``, + ``tupleize_cols`` and ``thousands`` parameters (:issue:`4770`). API Changes ~~~~~~~~~~~ @@ -373,6 +375,8 @@ See :ref:`Internal Refactoring` ``core/generic.py`` (:issue:`4435`). - Refactor cum objects to core/generic.py (:issue:`4435`), note that these have a more numpy-like function signature. + - :func:`~pandas.read_html` now uses ``TextParser`` to parse HTML data from + bs4/lxml (:issue:`4770`). .. _release.bug_fixes-0.13.0: @@ -538,6 +542,15 @@ Bug Fixes - Make sure series-series boolean comparions are label based (:issue:`4947`) - Bug in multi-level indexing with a Timestamp partial indexer (:issue:`4294`) - Tests/fix for multi-index construction of an all-nan frame (:isue:`4078`) + - Fixed a bug where :func:`~pandas.read_html` wasn't correctly inferring + values of tables with commas (:issue:`5029`) + - Fixed a bug where :func:`~pandas.read_html` wasn't providing a stable + ordering of returned tables (:issue:`4770`, :issue:`5029`). + - Fixed a bug where :func:`~pandas.read_html` was incorrectly parsing when + passed ``index_col=0`` (:issue:`5066`). 
+  - Fixed a bug where :func:`~pandas.read_html` was incorrectly inferring the
+    type of headers (:issue:`5048`).
+

 pandas 0.12.0
 -------------

diff --git a/pandas/io/html.py b/pandas/io/html.py
index ac3e3ad096392..96bedbf390af6 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -16,7 +16,7 @@
 from pandas.io.common import _is_url, urlopen, parse_url
 from pandas.io.parsers import TextParser
 from pandas.compat import (lrange, lmap, u, string_types, iteritems, text_type,
-                           raise_with_traceback, OrderedDict)
+                           raise_with_traceback)
 from pandas.core import common as com
 from pandas import Series

@@ -485,8 +485,8 @@ def _parse_tables(self, doc, match, kwargs):
         pattern = match.pattern

         # 1. check all descendants for the given pattern and only search tables
-        # 2. go up the tree until we find a table or if we are a table use that
-        query = '//table/*[re:test(text(), %r)]/ancestor-or-self::table'
+        # 2. go up the tree until we find a table
+        query = '//table//*[re:test(text(), %r)]/ancestor::table'
         xpath_expr = u(query) % pattern

         # if any table attributes were given build an xpath expression to
@@ -786,9 +786,8 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,

     tupleize_cols : bool, optional
         If ``False`` try to parse multiple header rows into a
-        :class:`~pandas.MultiIndex`. See :func:`~pandas.read_csv` for more
-        details. Defaults to ``False`` for backwards compatibility. This is in
-        contrast to other IO functions which default to ``True``.
+        :class:`~pandas.MultiIndex`, otherwise return raw tuples. Defaults to
+        ``False``.

     thousands : str, optional
         Separator to use to parse thousands. Defaults to ``','``.
diff --git a/pandas/io/tests/test_html.py b/pandas/io/tests/test_html.py
index 762cd24af7be9..9b0fb1cacfb65 100644
--- a/pandas/io/tests/test_html.py
+++ b/pandas/io/tests/test_html.py
@@ -18,7 +18,8 @@
 from numpy.random import rand
 from numpy.testing.decorators import slow

-from pandas import DataFrame, MultiIndex, read_csv, Timestamp, Index
+from pandas import (DataFrame, MultiIndex, read_csv, Timestamp, Index,
+                    date_range, Series)
 from pandas.compat import map, zip, StringIO, string_types
 from pandas.io.common import URLError, urlopen
 from pandas.io.html import read_html
@@ -565,6 +566,21 @@ def test_different_number_of_rows(self):
         res = self.read_html(out, index_col=0)[0]
         tm.assert_frame_equal(expected, res)

+    def test_parse_dates_list(self):
+        df = DataFrame({'date': date_range('1/1/2001', periods=10)})
+        expected = df.to_html()
+        res = read_html(expected, parse_dates=[0], index_col=0)
+        tm.assert_frame_equal(df, res[0])
+
+    def test_parse_dates_combine(self):
+        raw_dates = Series(date_range('1/1/2001', periods=10))
+        df = DataFrame({'date': raw_dates.map(lambda x: str(x.date())),
+                        'time': raw_dates.map(lambda x: str(x.time()))})
+        res = read_html(df.to_html(), parse_dates={'datetime': [1, 2]},
+                        index_col=1)
+        newdf = DataFrame({'datetime': raw_dates})
+        tm.assert_frame_equal(newdf, res[0])
+

 class TestReadHtmlLxml(unittest.TestCase):
     def setUp(self):