From a43a88856027c375ea31d12f4c4c0eedbf6aed12 Mon Sep 17 00:00:00 2001 From: tp Date: Thu, 21 May 2020 14:17:36 +0100 Subject: [PATCH 1/7] TYP: Add type hints to read_html --- pandas/io/html.py | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index c4ffe332e3020..d2a44663cced8 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -8,7 +8,9 @@ import numbers import os import re +from typing import Dict, List, Optional, Pattern, Union +from pandas._typing import FilePathOrBuffer from pandas.compat._optional import import_optional_dependency from pandas.errors import AbstractMethodError, EmptyDataError from pandas.util._decorators import deprecate_nonkeyword_arguments @@ -16,6 +18,7 @@ from pandas.core.dtypes.common import is_list_like from pandas.core.construction import create_series_with_explicit_dtype +from pandas.core.frame import DataFrame from pandas.io.common import is_url, urlopen, validate_header_arg from pandas.io.formats.printing import pprint_thing @@ -924,22 +927,22 @@ def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs): @deprecate_nonkeyword_arguments(version="2.0") def read_html( - io, - match=".+", - flavor=None, + io: FilePathOrBuffer, + match: Union[str, Pattern] = ".+", + flavor: Optional[str] = None, header=None, index_col=None, skiprows=None, - attrs=None, - parse_dates=False, - thousands=",", - encoding=None, - decimal=".", - converters=None, + attrs: Optional[Dict[str, str]] = None, + parse_dates: Optional[bool] = False, + thousands: str = ",", + encoding: Optional[str] = None, + decimal: str = ".", + converters: Optional[dict] = None, na_values=None, - keep_default_na=True, - displayed_only=True, -): + keep_default_na: bool = True, + displayed_only: bool = True, +) -> List[DataFrame]: r""" Read HTML tables into a ``list`` of ``DataFrame`` objects. From c125f2f5c7e3eb0b1e66342aca4d8e39d0b50542 Mon Sep 17 00:00:00 2001 From: tp Date: Thu, 21 May 2020 20:06:21 +0100 Subject: [PATCH 2/7] update type hints for parameters --- pandas/io/html.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index d2a44663cced8..9a80fe128dfcf 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -8,7 +8,7 @@ import numbers import os import re -from typing import Dict, List, Optional, Pattern, Union +from typing import Dict, List, Optional, Pattern, Sequence, Union from pandas._typing import FilePathOrBuffer from pandas.compat._optional import import_optional_dependency @@ -930,9 +930,9 @@ def read_html( io: FilePathOrBuffer, match: Union[str, Pattern] = ".+", flavor: Optional[str] = None, - header=None, - index_col=None, - skiprows=None, + header: Union[int, Sequence[int], None] = None, + index_col: Union[int, Sequence[int], None] = None, + skiprows: Union[int, Sequence[int], slice, None] = None, attrs: Optional[Dict[str, str]] = None, parse_dates: Optional[bool] = False, thousands: str = ",", @@ -967,14 +967,14 @@ def read_html( default of ``None`` tries to use ``lxml`` to parse and if that fails it falls back on ``bs4`` + ``html5lib``. - header : int or list-like or None, optional + header : int or sequence of ints, optional The row (or list of rows for a :class:`~pandas.MultiIndex`) to use to make the columns headers. - index_col : int or list-like or None, optional + index_col : int or sequence of ints, optional The column (or list of columns) to use to create the index. - skiprows : int or list-like or slice or None, optional + skiprows : int, sequence of ints or slice, optional Number of rows to skip after parsing the column integer. 0-based. If a sequence of integers or a slice is given, will skip the rows indexed by that sequence. Note that a single element sequence means 'skip the nth From 43e424f81be79ac1311c28ae26358208a1c60bdf Mon Sep 17 00:00:00 2001 From: tp Date: Wed, 17 Jun 2020 21:58:24 +0100 Subject: [PATCH 3/7] change parse_dates type hint --- pandas/io/html.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index 9a80fe128dfcf..276395e3606fb 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -934,7 +934,7 @@ def read_html( index_col: Union[int, Sequence[int], None] = None, skiprows: Union[int, Sequence[int], slice, None] = None, attrs: Optional[Dict[str, str]] = None, - parse_dates: Optional[bool] = False, + parse_dates: bool = False, thousands: str = ",", encoding: Optional[str] = None, decimal: str = ".", From 2659973d78e1f5c56f55768c25893fcd66e83a25 Mon Sep 17 00:00:00 2001 From: tp Date: Tue, 7 Jul 2020 06:51:22 +0100 Subject: [PATCH 4/7] Change doc strings --- pandas/io/html.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index 276395e3606fb..29145ef4cfce0 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -967,14 +967,14 @@ def read_html( default of ``None`` tries to use ``lxml`` to parse and if that fails it falls back on ``bs4`` + ``html5lib``. - header : int or sequence of ints, optional + header : int or list-like or None, optional The row (or list of rows for a :class:`~pandas.MultiIndex`) to use to make the columns headers. - index_col : int or sequence of ints, optional + index_col : int or list-like or None, optional The column (or list of columns) to use to create the index. - skiprows : int, sequence of ints or slice, optional + skiprows : int or list-like or None, optional Number of rows to skip after parsing the column integer. 0-based. If a sequence of integers or a slice is given, will skip the rows indexed by that sequence. Note that a single element sequence means 'skip the nth From 1150f9f7ea5b2eb24b51132e5ab5dc840ce49b65 Mon Sep 17 00:00:00 2001 From: tp Date: Tue, 7 Jul 2020 06:58:56 +0100 Subject: [PATCH 5/7] doc string improvements --- pandas/io/html.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index 29145ef4cfce0..87f9bcad50c3e 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -935,7 +935,7 @@ def read_html( skiprows: Union[int, Sequence[int], slice, None] = None, attrs: Optional[Dict[str, str]] = None, parse_dates: bool = False, - thousands: str = ",", + thousands: Optional[str] = ",", encoding: Optional[str] = None, decimal: str = ".", converters: Optional[dict] = None, @@ -961,26 +961,26 @@ def read_html( This value is converted to a regular expression so that there is consistent behavior between Beautiful Soup and lxml. - flavor : str or None + flavor : str, optional The parsing engine to use. 'bs4' and 'html5lib' are synonymous with each other, they are both there for backwards compatibility. The default of ``None`` tries to use ``lxml`` to parse and if that fails it falls back on ``bs4`` + ``html5lib``. - header : int or list-like or None, optional + header : int or list-like, optional The row (or list of rows for a :class:`~pandas.MultiIndex`) to use to make the columns headers. - index_col : int or list-like or None, optional + index_col : int or list-like, optional The column (or list of columns) to use to create the index. - skiprows : int or list-like or None, optional + skiprows : int, list-like or slice, optional Number of rows to skip after parsing the column integer. 0-based. If a sequence of integers or a slice is given, will skip the rows indexed by that sequence. Note that a single element sequence means 'skip the nth row' whereas an integer means 'skip n rows'. - attrs : dict or None, optional + attrs : dict, optional This is a dictionary of attributes that you can pass to use to identify the table in the HTML. These are not checked for validity before being passed to lxml or Beautiful Soup. However, these attributes must be @@ -1008,7 +1008,7 @@ def read_html( thousands : str, optional Separator to use to parse thousands. Defaults to ``','``. - encoding : str or None, optional + encoding : str, optional The encoding used to decode the web page. Defaults to ``None``.``None`` preserves the previous encoding behavior, which depends on the underlying parser library (e.g., the parser library will try to use From 360d86fa1916464717102e3c2198ff19584e4791 Mon Sep 17 00:00:00 2001 From: tp Date: Tue, 7 Jul 2020 23:13:11 +0100 Subject: [PATCH 6/7] update --- pandas/io/html.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index 87f9bcad50c3e..d41a9b81a17cd 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -930,7 +930,7 @@ def read_html( io: FilePathOrBuffer, match: Union[str, Pattern] = ".+", flavor: Optional[str] = None, - header: Union[int, Sequence[int], None] = None, + header: Optional[Union[int, Sequence[int]]] = None, index_col: Union[int, Sequence[int], None] = None, skiprows: Union[int, Sequence[int], slice, None] = None, attrs: Optional[Dict[str, str]] = None, From e56897bb16d3092bc5be8576c25459546f06c797 Mon Sep 17 00:00:00 2001 From: tp Date: Sat, 11 Jul 2020 17:57:18 +0100 Subject: [PATCH 7/7] Update type hints --- pandas/io/html.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index d41a9b81a17cd..3193f52d239f1 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -931,14 +931,14 @@ def read_html( match: Union[str, Pattern] = ".+", flavor: Optional[str] = None, header: Optional[Union[int, Sequence[int]]] = None, - index_col: Union[int, Sequence[int], None] = None, - skiprows: Union[int, Sequence[int], slice, None] = None, + index_col: Optional[Union[int, Sequence[int]]] = None, + skiprows: Optional[Union[int, Sequence[int], slice]] = None, attrs: Optional[Dict[str, str]] = None, parse_dates: bool = False, thousands: Optional[str] = ",", encoding: Optional[str] = None, decimal: str = ".", - converters: Optional[dict] = None, + converters: Optional[Dict] = None, na_values=None, keep_default_na: bool = True, displayed_only: bool = True,