
Commit cabc05f

WillAyd authored and jreback committed
Clean / Consolidate pandas/tests/io/test_html.py (#20293)
1 parent: 6126ad6

9 files changed, +159 -343 lines

ci/requirements-2.7_COMPAT.pip (+1 -1)

@@ -1,4 +1,4 @@
 html5lib==1.0b2
-beautifulsoup4==4.2.0
+beautifulsoup4==4.2.1
 openpyxl
 argparse

ci/requirements-optional-conda.txt (+1 -1)

@@ -1,4 +1,4 @@
-beautifulsoup4
+beautifulsoup4>=4.2.1
 blosc
 bottleneck
 fastparquet

ci/requirements-optional-pip.txt (+1 -1)

@@ -1,6 +1,6 @@
 # This file was autogenerated by scripts/convert_deps.py
 # Do not modify directly
-beautifulsoup4
+beautifulsoup4>=4.2.1
 blosc
 bottleneck
 fastparquet

doc/source/install.rst (+6 -3)

@@ -266,6 +266,12 @@ Optional Dependencies
 * One of the following combinations of libraries is needed to use the
   top-level :func:`~pandas.read_html` function:
 
+  .. versionchanged:: 0.23.0
+
+  .. note::
+
+     If using BeautifulSoup4 a minimum version of 4.2.1 is required
+
 * `BeautifulSoup4`_ and `html5lib`_ (Any recent version of `html5lib`_ is
   okay.)
 * `BeautifulSoup4`_ and `lxml`_
@@ -282,9 +288,6 @@ Optional Dependencies
 * You are highly encouraged to read :ref:`HTML Table Parsing gotchas <io.html.gotchas>`.
   It explains issues surrounding the installation and
   usage of the above three libraries.
-* You may need to install an older version of `BeautifulSoup4`_:
-  Versions 4.2.1, 4.1.3 and 4.0.2 have been confirmed for 64 and 32-bit
-  Ubuntu/Debian
 
 .. note::
 

doc/source/whatsnew/v0.23.0.txt (+9 -7)

@@ -358,13 +358,15 @@ Dependencies have increased minimum versions
 We have updated our minimum supported versions of dependencies (:issue:`15184`).
 If installed, we now require:
 
-+-----------------+-----------------+----------+
-| Package         | Minimum Version | Required |
-+=================+=================+==========+
-| python-dateutil | 2.5.0           | X        |
-+-----------------+-----------------+----------+
-| openpyxl        | 2.4.0           |          |
-+-----------------+-----------------+----------+
++-----------------+-----------------+----------+---------------+
+| Package         | Minimum Version | Required | Issue         |
++=================+=================+==========+===============+
+| python-dateutil | 2.5.0           | X        | :issue:`15184`|
++-----------------+-----------------+----------+---------------+
+| openpyxl        | 2.4.0           |          | :issue:`15184`|
++-----------------+-----------------+----------+---------------+
+| beautifulsoup4  | 4.2.1           |          | :issue:`20082`|
++-----------------+-----------------+----------+---------------+
 
 .. _whatsnew_0230.api_breaking.dict_insertion_order:
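For context, the new beautifulsoup4 floor can be checked in an environment with a LooseVersion comparison in the same style as the one this commit adds to pandas.io.html._parser_dispatch. The snippet below is an illustrative sketch only; the error message and the print line are not taken from the commit.

# Illustrative sketch: verify an installed bs4 against the new 4.2.1
# minimum using distutils' LooseVersion, mirroring the comparison in
# _parser_dispatch.
from distutils.version import LooseVersion

import bs4

if LooseVersion(bs4.__version__) <= LooseVersion('4.2.0'):
    raise ImportError("pandas 0.23.0 needs beautifulsoup4 >= 4.2.1 "
                      "for read_html(flavor='bs4')")
print("beautifulsoup4", bs4.__version__, "satisfies the new minimum")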

pandas/compat/__init__.py (+4)

@@ -131,6 +131,9 @@ def lmap(*args, **kwargs):
     def lfilter(*args, **kwargs):
         return list(filter(*args, **kwargs))
 
+    from importlib import reload
+    reload = reload
+
 else:
     # Python 2
     import re
@@ -184,6 +187,7 @@ def get_range_parameters(data):
     lmap = builtins.map
     lfilter = builtins.filter
 
+    reload = builtins.reload
 
 if PY2:
     def iteritems(obj, **kw):
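A brief usage sketch of the new compat shim: the same name resolves to importlib.reload on Python 3 and the builtin reload on Python 2. The module being reloaded here is chosen for illustration; the diff does not show the actual call sites in the test suite.

# Hypothetical usage of the cross-version reload shim added above.
from pandas.compat import reload

import pandas.io.html

reload(pandas.io.html)  # re-executes module-level code, e.g. optional-dependency probes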

pandas/io/html.py (+28 -33)

@@ -14,8 +14,7 @@
 
 from pandas.core.dtypes.common import is_list_like
 from pandas.errors import EmptyDataError
-from pandas.io.common import (_is_url, urlopen,
-                              parse_url, _validate_header_arg)
+from pandas.io.common import _is_url, urlopen, _validate_header_arg
 from pandas.io.parsers import TextParser
 from pandas.compat import (lrange, lmap, u, string_types, iteritems,
                            raise_with_traceback, binary_type)
@@ -554,8 +553,7 @@ def _parse_td(self, row):
         return row.xpath('.//td|.//th')
 
     def _parse_tr(self, table):
-        expr = './/tr[normalize-space()]'
-        return table.xpath(expr)
+        return table.xpath('.//tr')
 
     def _parse_tables(self, doc, match, kwargs):
         pattern = match.pattern
@@ -606,18 +604,20 @@ def _build_doc(self):
         """
         from lxml.html import parse, fromstring, HTMLParser
         from lxml.etree import XMLSyntaxError
-
-        parser = HTMLParser(recover=False, encoding=self.encoding)
+        parser = HTMLParser(recover=True, encoding=self.encoding)
 
         try:
-            # try to parse the input in the simplest way
-            r = parse(self.io, parser=parser)
-
+            if _is_url(self.io):
+                with urlopen(self.io) as f:
+                    r = parse(f, parser=parser)
+            else:
+                # try to parse the input in the simplest way
+                r = parse(self.io, parser=parser)
             try:
                 r = r.getroot()
             except AttributeError:
                 pass
-        except (UnicodeDecodeError, IOError):
+        except (UnicodeDecodeError, IOError) as e:
             # if the input is a blob of html goop
             if not _is_url(self.io):
                 r = fromstring(self.io, parser=parser)
@@ -627,17 +627,7 @@ def _build_doc(self):
                 except AttributeError:
                     pass
             else:
-                # not a url
-                scheme = parse_url(self.io).scheme
-                if scheme not in _valid_schemes:
-                    # lxml can't parse it
-                    msg = (('{invalid!r} is not a valid url scheme, valid '
-                            'schemes are {valid}')
-                           .format(invalid=scheme, valid=_valid_schemes))
-                    raise ValueError(msg)
-                else:
-                    # something else happened: maybe a faulty connection
-                    raise
+                raise e
         else:
             if not hasattr(r, 'text_content'):
                 raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
@@ -657,12 +647,21 @@ def _parse_raw_thead(self, table):
         thead = table.xpath(expr)
         res = []
         if thead:
-            trs = self._parse_tr(thead[0])
-            for tr in trs:
-                cols = [_remove_whitespace(x.text_content()) for x in
-                        self._parse_td(tr)]
+            # Grab any directly descending table headers first
+            ths = thead[0].xpath('./th')
+            if ths:
+                cols = [_remove_whitespace(x.text_content()) for x in ths]
                 if any(col != '' for col in cols):
                     res.append(cols)
+            else:
+                trs = self._parse_tr(thead[0])
+
+                for tr in trs:
+                    cols = [_remove_whitespace(x.text_content()) for x in
+                            self._parse_td(tr)]
+
+                    if any(col != '' for col in cols):
+                        res.append(cols)
         return res
 
     def _parse_raw_tfoot(self, table):
@@ -739,14 +738,10 @@ def _parser_dispatch(flavor):
             raise ImportError(
                 "BeautifulSoup4 (bs4) not found, please install it")
         import bs4
-        if LooseVersion(bs4.__version__) == LooseVersion('4.2.0'):
-            raise ValueError("You're using a version"
-                             " of BeautifulSoup4 (4.2.0) that has been"
-                             " known to cause problems on certain"
-                             " operating systems such as Debian. "
-                             "Please install a version of"
-                             " BeautifulSoup4 != 4.2.0, both earlier"
-                             " and later releases will work.")
+        if LooseVersion(bs4.__version__) <= LooseVersion('4.2.0'):
+            raise ValueError("A minimum version of BeautifulSoup 4.2.1 "
+                             "is required")
+
     else:
         if not _HAS_LXML:
            raise ImportError("lxml not found, please install it")
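To make the thead change concrete, here is a small, hedged example of the kind of markup the reworked lxml _parse_raw_thead targets: a <thead> whose <th> cells are direct children rather than wrapped in a <tr>. The expected header is an inference from the new logic, not quoted from the commit's tests.

# Illustrative only: a <thead> with direct <th> children (no <tr>).
# The reworked _parse_raw_thead looks for './th' first and falls back
# to rows otherwise; ['A', 'B'] as columns is inferred from that logic.
import pandas as pd

html = """
<table>
  <thead><th>A</th><th>B</th></thead>
  <tbody>
    <tr><td>1</td><td>2</td></tr>
  </tbody>
</table>
"""

df = pd.read_html(html, flavor='lxml')[0]
print(list(df.columns))  # expected: ['A', 'B']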

pandas/tests/io/data/banklist.html (+1)

@@ -340,6 +340,7 @@ <h1 class="page_title">Failed Bank List</h1>
       <td class="closing">April 19, 2013</td>
       <td class="updated">April 23, 2013</td>
     </tr>
+    <tr>
       <td class="institution"><a href="goldcanyon.html">Gold Canyon Bank</a></td>
       <td class="city">Gold Canyon</td>
       <td class="state">AZ</td>

0 commit comments
