 from pandas.core.dtypes.common import is_list_like
 from pandas.errors import EmptyDataError
-from pandas.io.common import (_is_url, urlopen,
-                              parse_url, _validate_header_arg)
+from pandas.io.common import _is_url, urlopen, _validate_header_arg
 from pandas.io.parsers import TextParser
 from pandas.compat import (lrange, lmap, u, string_types, iteritems,
                            raise_with_traceback, binary_type)
@@ -554,8 +553,7 @@ def _parse_td(self, row):
         return row.xpath('.//td|.//th')
 
     def _parse_tr(self, table):
-        expr = './/tr[normalize-space()]'
-        return table.xpath(expr)
+        return table.xpath('.//tr')
 
     def _parse_tables(self, doc, match, kwargs):
         pattern = match.pattern
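The new expression drops the `normalize-space()` filter, so `_parse_tr` now returns every `<tr>` under the table, including rows whose cells contain only whitespace; empty rows are weeded out later when the cell text is inspected. A minimal sketch of the difference, assuming only that lxml is installed (the markup and variable names are illustrative, not from the patch):

```python
# Illustrative only: compare the old and new row-selection XPath on a table
# that contains a whitespace-only row.
from lxml.html import fromstring

table = fromstring(
    "<table>"
    "<tr><td>a</td></tr>"
    "<tr><td>   </td></tr>"
    "<tr><td>b</td></tr>"
    "</table>"
)

print(len(table.xpath('.//tr[normalize-space()]')))  # 2 -- old expression skips the blank row
print(len(table.xpath('.//tr')))                     # 3 -- new expression keeps it
```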
@@ -606,18 +604,20 @@ def _build_doc(self):
         """
         from lxml.html import parse, fromstring, HTMLParser
         from lxml.etree import XMLSyntaxError
-
-        parser = HTMLParser(recover=False, encoding=self.encoding)
+        parser = HTMLParser(recover=True, encoding=self.encoding)
 
         try:
-            # try to parse the input in the simplest way
-            r = parse(self.io, parser=parser)
-
+            if _is_url(self.io):
+                with urlopen(self.io) as f:
+                    r = parse(f, parser=parser)
+            else:
+                # try to parse the input in the simplest way
+                r = parse(self.io, parser=parser)
             try:
                 r = r.getroot()
             except AttributeError:
                 pass
-        except (UnicodeDecodeError, IOError):
+        except (UnicodeDecodeError, IOError) as e:
             # if the input is a blob of html goop
             if not _is_url(self.io):
                 r = fromstring(self.io, parser=parser)
@@ -627,17 +627,7 @@ def _build_doc(self):
                 except AttributeError:
                     pass
             else:
-                # not a url
-                scheme = parse_url(self.io).scheme
-                if scheme not in _valid_schemes:
-                    # lxml can't parse it
-                    msg = (('{invalid!r} is not a valid url scheme, valid '
-                            'schemes are {valid}')
-                           .format(invalid=scheme, valid=_valid_schemes))
-                    raise ValueError(msg)
-                else:
-                    # something else happened: maybe a faulty connection
-                    raise
+                raise e
         else:
             if not hasattr(r, 'text_content'):
                 raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
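With the rewrite, `_build_doc` opens URLs itself via `urlopen` and hands the response to lxml instead of letting lxml resolve the URL, which is why the scheme check against `_valid_schemes` above could be dropped. The property relied on is that `lxml.html.parse` accepts any file-like object. A standalone sketch of that, assuming only that lxml is installed; `BytesIO` stands in for the `urlopen` response:

```python
# Illustrative only: lxml.html.parse() accepts any file-like object, which is
# what lets the URL branch pass the urlopen() response straight to the parser.
from io import BytesIO
from lxml.html import parse, HTMLParser

buf = BytesIO(b"<table><tr><td>1</td></tr></table>")  # stand-in for urlopen(url)
doc = parse(buf, parser=HTMLParser(recover=True)).getroot()
print(doc.xpath('.//td')[0].text)  # '1'
```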
@@ -657,12 +647,21 @@ def _parse_raw_thead(self, table):
         thead = table.xpath(expr)
         res = []
         if thead:
-            trs = self._parse_tr(thead[0])
-            for tr in trs:
-                cols = [_remove_whitespace(x.text_content()) for x in
-                        self._parse_td(tr)]
+            # Grab any directly descending table headers first
+            ths = thead[0].xpath('./th')
+            if ths:
+                cols = [_remove_whitespace(x.text_content()) for x in ths]
                 if any(col != '' for col in cols):
                     res.append(cols)
+            else:
+                trs = self._parse_tr(thead[0])
+
+                for tr in trs:
+                    cols = [_remove_whitespace(x.text_content()) for x in
+                            self._parse_td(tr)]
+
+                    if any(col != '' for col in cols):
+                        res.append(cols)
         return res
 
     def _parse_raw_tfoot(self, table):
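The new `./th` branch covers tables whose `<thead>` holds `<th>` cells directly, with no wrapping `<tr>`; lxml preserves that structure, so the row-based iteration finds no rows under such a `<thead>` and would produce no header. A minimal sketch of the markup in question, assuming only that lxml is installed:

```python
# Illustrative only: a <thead> whose header cells are not wrapped in a <tr>.
from lxml.html import fromstring

table = fromstring(
    "<table>"
    "<thead><th>a</th><th>b</th></thead>"
    "<tr><td>1</td><td>2</td></tr>"
    "</table>"
)
thead = table.xpath('.//thead')[0]
print(thead.xpath('.//tr'))                     # [] -- nothing for the row-based loop to walk
print([th.text for th in thead.xpath('./th')])  # ['a', 'b'] -- picked up by the new branch
```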
@@ -739,14 +738,10 @@ def _parser_dispatch(flavor):
             raise ImportError(
                 "BeautifulSoup4 (bs4) not found, please install it")
         import bs4
-        if LooseVersion(bs4.__version__) == LooseVersion('4.2.0'):
-            raise ValueError("You're using a version"
-                             " of BeautifulSoup4 (4.2.0) that has been"
-                             " known to cause problems on certain"
-                             " operating systems such as Debian. "
-                             "Please install a version of"
-                             " BeautifulSoup4 != 4.2.0, both earlier"
-                             " and later releases will work.")
+        if LooseVersion(bs4.__version__) <= LooseVersion('4.2.0'):
+            raise ValueError("A minimum version of BeautifulSoup 4.2.1 "
+                             "is required")
+
     else:
         if not _HAS_LXML:
             raise ImportError("lxml not found, please install it")
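The version gate changes from excluding exactly 4.2.0 to requiring at least 4.2.1, which also rejects older releases. A quick check of the new comparison, nothing pandas-specific:

```python
# Illustrative only: behaviour of the <= comparison used in the new gate.
from distutils.version import LooseVersion

for v in ('4.1.3', '4.2.0', '4.2.1'):
    rejected = LooseVersion(v) <= LooseVersion('4.2.0')
    print(v, 'rejected' if rejected else 'accepted')
# 4.1.3 rejected
# 4.2.0 rejected
# 4.2.1 accepted
```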