
Commit cabc05f

WillAyd authored and jreback committed
Clean / Consolidate pandas/tests/io/test_html.py (#20293)
1 parent: 6126ad6

9 files changed, +159 -343 lines

ci/requirements-2.7_COMPAT.pip (+1 -1)

@@ -1,4 +1,4 @@
 html5lib==1.0b2
-beautifulsoup4==4.2.0
+beautifulsoup4==4.2.1
 openpyxl
 argparse

ci/requirements-optional-conda.txt (+1 -1)

@@ -1,4 +1,4 @@
-beautifulsoup4
+beautifulsoup4>=4.2.1
 blosc
 bottleneck
 fastparquet

ci/requirements-optional-pip.txt (+1 -1)

@@ -1,6 +1,6 @@
 # This file was autogenerated by scripts/convert_deps.py
 # Do not modify directly
-beautifulsoup4
+beautifulsoup4>=4.2.1
 blosc
 bottleneck
 fastparquet

doc/source/install.rst (+6 -3)

@@ -266,6 +266,12 @@ Optional Dependencies
 * One of the following combinations of libraries is needed to use the
   top-level :func:`~pandas.read_html` function:
 
+  .. versionchanged:: 0.23.0
+
+  .. note::
+
+     If using BeautifulSoup4 a minimum version of 4.2.1 is required
+
 * `BeautifulSoup4`_ and `html5lib`_ (Any recent version of `html5lib`_ is
   okay.)
 * `BeautifulSoup4`_ and `lxml`_
@@ -282,9 +288,6 @@ Optional Dependencies
 * You are highly encouraged to read :ref:`HTML Table Parsing gotchas <io.html.gotchas>`.
   It explains issues surrounding the installation and
   usage of the above three libraries.
-* You may need to install an older version of `BeautifulSoup4`_:
-  Versions 4.2.1, 4.1.3 and 4.0.2 have been confirmed for 64 and 32-bit
-  Ubuntu/Debian
 
 .. note::
 

doc/source/whatsnew/v0.23.0.txt (+9 -7)

@@ -358,13 +358,15 @@ Dependencies have increased minimum versions
 We have updated our minimum supported versions of dependencies (:issue:`15184`).
 If installed, we now require:
 
-+-----------------+-----------------+----------+
-| Package         | Minimum Version | Required |
-+=================+=================+==========+
-| python-dateutil | 2.5.0           | X        |
-+-----------------+-----------------+----------+
-| openpyxl        | 2.4.0           |          |
-+-----------------+-----------------+----------+
++-----------------+-----------------+----------+---------------+
+| Package         | Minimum Version | Required | Issue         |
++=================+=================+==========+===============+
+| python-dateutil | 2.5.0           | X        | :issue:`15184`|
++-----------------+-----------------+----------+---------------+
+| openpyxl        | 2.4.0           |          | :issue:`15184`|
++-----------------+-----------------+----------+---------------+
+| beautifulsoup4  | 4.2.1           |          | :issue:`20082`|
++-----------------+-----------------+----------+---------------+
 
 .. _whatsnew_0230.api_breaking.dict_insertion_order:
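For context, the new beautifulsoup4 floor can be checked in an environment with a LooseVersion comparison in the same style as the one this commit adds to pandas.io.html._parser_dispatch. The snippet below is an illustrative sketch only; the error message and the print line are not taken from the commit.

# Illustrative sketch: verify an installed bs4 against the new 4.2.1
# minimum using distutils' LooseVersion, mirroring the comparison in
# _parser_dispatch.
from distutils.version import LooseVersion

import bs4

if LooseVersion(bs4.__version__) <= LooseVersion('4.2.0'):
    raise ImportError("pandas 0.23.0 needs beautifulsoup4 >= 4.2.1 "
                      "for read_html(flavor='bs4')")
print("beautifulsoup4", bs4.__version__, "satisfies the new minimum")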

pandas/compat/__init__.py (+4)

@@ -131,6 +131,9 @@ def lmap(*args, **kwargs):
     def lfilter(*args, **kwargs):
         return list(filter(*args, **kwargs))
 
+    from importlib import reload
+    reload = reload
+
 else:
     # Python 2
     import re
@@ -184,6 +187,7 @@ def get_range_parameters(data):
     lmap = builtins.map
     lfilter = builtins.filter
 
+    reload = builtins.reload
 
 if PY2:
     def iteritems(obj, **kw):
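A brief usage sketch of the new compat shim: the same name resolves to importlib.reload on Python 3 and the builtin reload on Python 2. The module being reloaded here is chosen for illustration; the diff does not show the actual call sites in the test suite.

# Hypothetical usage of the cross-version reload shim added above.
from pandas.compat import reload

import pandas.io.html

reload(pandas.io.html)  # re-executes module-level code, e.g. optional-dependency probes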

pandas/io/html.py (+28 -33)

@@ -14,8 +14,7 @@
 
 from pandas.core.dtypes.common import is_list_like
 from pandas.errors import EmptyDataError
-from pandas.io.common import (_is_url, urlopen,
-                              parse_url, _validate_header_arg)
+from pandas.io.common import _is_url, urlopen, _validate_header_arg
 from pandas.io.parsers import TextParser
 from pandas.compat import (lrange, lmap, u, string_types, iteritems,
                            raise_with_traceback, binary_type)
@@ -554,8 +553,7 @@ def _parse_td(self, row):
         return row.xpath('.//td|.//th')
 
     def _parse_tr(self, table):
-        expr = './/tr[normalize-space()]'
-        return table.xpath(expr)
+        return table.xpath('.//tr')
 
     def _parse_tables(self, doc, match, kwargs):
         pattern = match.pattern
@@ -606,18 +604,20 @@ def _build_doc(self):
         """
         from lxml.html import parse, fromstring, HTMLParser
         from lxml.etree import XMLSyntaxError
-
-        parser = HTMLParser(recover=False, encoding=self.encoding)
+        parser = HTMLParser(recover=True, encoding=self.encoding)
 
         try:
-            # try to parse the input in the simplest way
-            r = parse(self.io, parser=parser)
-
+            if _is_url(self.io):
+                with urlopen(self.io) as f:
+                    r = parse(f, parser=parser)
+            else:
+                # try to parse the input in the simplest way
+                r = parse(self.io, parser=parser)
             try:
                 r = r.getroot()
             except AttributeError:
                 pass
-        except (UnicodeDecodeError, IOError):
+        except (UnicodeDecodeError, IOError) as e:
             # if the input is a blob of html goop
             if not _is_url(self.io):
                 r = fromstring(self.io, parser=parser)
@@ -627,17 +627,7 @@ def _build_doc(self):
                 except AttributeError:
                     pass
             else:
-                # not a url
-                scheme = parse_url(self.io).scheme
-                if scheme not in _valid_schemes:
-                    # lxml can't parse it
-                    msg = (('{invalid!r} is not a valid url scheme, valid '
-                            'schemes are {valid}')
-                           .format(invalid=scheme, valid=_valid_schemes))
-                    raise ValueError(msg)
-                else:
-                    # something else happened: maybe a faulty connection
-                    raise
+                raise e
         else:
             if not hasattr(r, 'text_content'):
                 raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
@@ -657,12 +647,21 @@ def _parse_raw_thead(self, table):
         thead = table.xpath(expr)
         res = []
         if thead:
-            trs = self._parse_tr(thead[0])
-            for tr in trs:
-                cols = [_remove_whitespace(x.text_content()) for x in
-                        self._parse_td(tr)]
+            # Grab any directly descending table headers first
+            ths = thead[0].xpath('./th')
+            if ths:
+                cols = [_remove_whitespace(x.text_content()) for x in ths]
                 if any(col != '' for col in cols):
                     res.append(cols)
+            else:
+                trs = self._parse_tr(thead[0])
+
+                for tr in trs:
+                    cols = [_remove_whitespace(x.text_content()) for x in
+                            self._parse_td(tr)]
+
+                    if any(col != '' for col in cols):
+                        res.append(cols)
         return res
 
     def _parse_raw_tfoot(self, table):
@@ -739,14 +738,10 @@ def _parser_dispatch(flavor):
             raise ImportError(
                 "BeautifulSoup4 (bs4) not found, please install it")
         import bs4
-        if LooseVersion(bs4.__version__) == LooseVersion('4.2.0'):
-            raise ValueError("You're using a version"
-                             " of BeautifulSoup4 (4.2.0) that has been"
-                             " known to cause problems on certain"
-                             " operating systems such as Debian. "
-                             "Please install a version of"
-                             " BeautifulSoup4 != 4.2.0, both earlier"
-                             " and later releases will work.")
+        if LooseVersion(bs4.__version__) <= LooseVersion('4.2.0'):
+            raise ValueError("A minimum version of BeautifulSoup 4.2.1 "
+                             "is required")
+
     else:
         if not _HAS_LXML:
            raise ImportError("lxml not found, please install it")
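To make the thead change concrete, here is a small, hedged example of the kind of markup the reworked lxml _parse_raw_thead targets: a <thead> whose <th> cells are direct children rather than wrapped in a <tr>. The expected header is an inference from the new logic, not quoted from the commit's tests.

# Illustrative only: a <thead> with direct <th> children (no <tr>).
# The reworked _parse_raw_thead looks for './th' first and falls back
# to rows otherwise; ['A', 'B'] as columns is inferred from that logic.
import pandas as pd

html = """
<table>
  <thead><th>A</th><th>B</th></thead>
  <tbody>
    <tr><td>1</td><td>2</td></tr>
  </tbody>
</table>
"""

df = pd.read_html(html, flavor='lxml')[0]
print(list(df.columns))  # expected: ['A', 'B']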

pandas/tests/io/data/banklist.html (+1)

@@ -340,6 +340,7 @@ <h1 class="page_title">Failed Bank List</h1>
       <td class="closing">April 19, 2013</td>
       <td class="updated">April 23, 2013</td>
     </tr>
+    <tr>
       <td class="institution"><a href="goldcanyon.html">Gold Canyon Bank</a></td>
       <td class="city">Gold Canyon</td>
       <td class="state">AZ</td>

0 commit comments
