Skip to content

API: Make most arguments for read_html and read_json keyword-only #27573

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Apr 7, 2020
13 changes: 13 additions & 0 deletions doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -248,13 +248,26 @@ Assignment to multiple columns of a :class:`DataFrame` when some of the columns

Deprecations
~~~~~~~~~~~~

- Lookups on a :class:`Series` with a single-item list containing a slice (e.g. ``ser[[slice(0, 4)]]``) are deprecated and will raise in a future version. Either convert the list to a tuple, or pass the slice directly instead (:issue:`31333`)

- :meth:`DataFrame.mean` and :meth:`DataFrame.median` with ``numeric_only=None`` will include datetime64 and datetime64tz columns in a future version (:issue:`29941`)
- Setting values with ``.loc`` using a positional slice is deprecated and will raise in a future version. Use ``.loc`` with labels or ``.iloc`` with positions instead (:issue:`31840`)
- Passing short names for ``orient`` to :meth:`DataFrame.to_dict` is deprecated and will no longer be accepted in a future version (:issue:`32515`)
- :meth:`Categorical.to_dense` is deprecated and will be removed in a future version, use ``np.asarray(cat)`` instead (:issue:`32639`)
- The ``fastpath`` keyword in the ``SingleBlockManager`` constructor is deprecated and will be removed in a future version (:issue:`33092`)

- Passing any arguments but the first one to :func:`read_html` as
positional arguments is deprecated since version 1.1. All other
arguments should be given as keyword arguments (:issue:`27573`).

- Passing any arguments but ``path_or_buf`` (the first one) to
:func:`read_json` as positional arguments is deprecated since
version 1.1. All other arguments should be given as keyword
arguments (:issue:`27573`).

-

.. ---------------------------------------------------------------------------


Expand Down
2 changes: 2 additions & 0 deletions pandas/io/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

from pandas.compat._optional import import_optional_dependency
from pandas.errors import AbstractMethodError, EmptyDataError
from pandas.util._decorators import deprecate_nonkeyword_arguments

from pandas.core.dtypes.common import is_list_like

Expand Down Expand Up @@ -921,6 +922,7 @@ def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs):
return ret


@deprecate_nonkeyword_arguments(version="2.0")
def read_html(
io,
match=".+",
Expand Down
5 changes: 4 additions & 1 deletion pandas/io/json/_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from pandas._libs.tslibs import iNaT
from pandas._typing import JSONSerializable
from pandas.errors import AbstractMethodError
from pandas.util._decorators import deprecate_kwarg
from pandas.util._decorators import deprecate_kwarg, deprecate_nonkeyword_arguments

from pandas.core.dtypes.common import ensure_str, is_period_dtype

Expand Down Expand Up @@ -345,6 +345,9 @@ def _write(


@deprecate_kwarg(old_arg_name="numpy", new_arg_name=None)
@deprecate_nonkeyword_arguments(
version="2.0", allowed_args=["path_or_buf"], stacklevel=3
)
def read_json(
path_or_buf=None,
orient=None,
Expand Down
31 changes: 31 additions & 0 deletions pandas/tests/io/json/test_deprecated_kwargs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
"""
Tests for the deprecated keyword arguments for `read_json`.
"""

import pandas as pd
import pandas._testing as tm

from pandas.io.json import read_json


def test_deprecated_kwargs():
    """Check that a positional ``orient`` passed to ``read_json`` warns.

    For each of the "split", "columns" and "index" orients the frame is
    serialised with ``to_json`` and read back; the call must emit a
    ``FutureWarning`` and still round-trip to an equal frame.
    """
    expected = pd.DataFrame({"A": [2, 4, 6], "B": [3, 6, 9]}, index=[0, 1, 2])
    for orient in ("split", "columns", "index"):
        serialized = expected.to_json(orient=orient)
        # ``orient`` is deliberately passed positionally here: that is the
        # deprecated call style this test exercises.
        with tm.assert_produces_warning(FutureWarning):
            tm.assert_frame_equal(expected, read_json(serialized, orient))


def test_good_kwargs():
    """Check that a keyword ``orient`` passed to ``read_json`` does not warn.

    All three orients ("split", "columns", "index") are round-tripped
    inside a single context that asserts no warning is produced.
    """
    expected = pd.DataFrame({"A": [2, 4, 6], "B": [3, 6, 9]}, index=[0, 1, 2])
    with tm.assert_produces_warning(None):
        for orient in ("split", "columns", "index"):
            serialized = expected.to_json(orient=orient)
            tm.assert_frame_equal(expected, read_json(serialized, orient=orient))
116 changes: 68 additions & 48 deletions pandas/tests/io/test_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def test_invalid_flavor():
msg = r"\{" + flavor + r"\} is not a valid set of flavors"

with pytest.raises(ValueError, match=msg):
read_html(url, "google", flavor=flavor)
read_html(url, match="google", flavor=flavor)


@td.skip_if_no("bs4")
Expand Down Expand Up @@ -121,13 +121,26 @@ def test_to_html_compat(self):
res = self.read_html(out, attrs={"class": "dataframe"}, index_col=0)[0]
tm.assert_frame_equal(res, df)

@tm.network
def test_banklist_url_positional_match(self):
    """Check that passing ``match`` positionally to ``read_html`` warns.

    Two different match strings are looked up in the same remote page;
    each call must emit a ``FutureWarning`` and both must yield equal
    lists of frames.
    """
    banklist_url = "http://www.fdic.gov/bank/individual/failed/banklist.html"

    # Passing match argument as positional should cause a FutureWarning.
    with tm.assert_produces_warning(FutureWarning):
        florida_tables = self.read_html(
            banklist_url, "First Federal Bank of Florida", attrs={"id": "table"}
        )
    with tm.assert_produces_warning(FutureWarning):
        metcalf_tables = self.read_html(
            banklist_url, "Metcalf Bank", attrs={"id": "table"}
        )

    assert_framelist_equal(florida_tables, metcalf_tables)

@tm.network
def test_banklist_url(self):
url = "http://www.fdic.gov/bank/individual/failed/banklist.html"
df1 = self.read_html(
url, "First Federal Bank of Florida", attrs={"id": "table"}
url, match="First Federal Bank of Florida", attrs={"id": "table"}
)
df2 = self.read_html(url, "Metcalf Bank", attrs={"id": "table"})
df2 = self.read_html(url, match="Metcalf Bank", attrs={"id": "table"})

assert_framelist_equal(df1, df2)

Expand All @@ -137,21 +150,25 @@ def test_spam_url(self):
"https://raw.githubusercontent.com/pandas-dev/pandas/master/"
"pandas/tests/io/data/html/spam.html"
)
df1 = self.read_html(url, ".*Water.*")
df2 = self.read_html(url, "Unit")
df1 = self.read_html(url, match=".*Water.*")
df2 = self.read_html(url, match="Unit")

assert_framelist_equal(df1, df2)

@pytest.mark.slow
def test_banklist(self):
df1 = self.read_html(self.banklist_data, ".*Florida.*", attrs={"id": "table"})
df2 = self.read_html(self.banklist_data, "Metcalf Bank", attrs={"id": "table"})
df1 = self.read_html(
self.banklist_data, match=".*Florida.*", attrs={"id": "table"}
)
df2 = self.read_html(
self.banklist_data, match="Metcalf Bank", attrs={"id": "table"}
)

assert_framelist_equal(df1, df2)

def test_spam(self):
df1 = self.read_html(self.spam_data, ".*Water.*")
df2 = self.read_html(self.spam_data, "Unit")
df1 = self.read_html(self.spam_data, match=".*Water.*")
df2 = self.read_html(self.spam_data, match="Unit")
assert_framelist_equal(df1, df2)

assert df1[0].iloc[0, 0] == "Proximates"
Expand All @@ -168,81 +185,82 @@ def test_banklist_no_match(self):
assert isinstance(df, DataFrame)

def test_spam_header(self):
df = self.read_html(self.spam_data, ".*Water.*", header=2)[0]
df = self.read_html(self.spam_data, match=".*Water.*", header=2)[0]
assert df.columns[0] == "Proximates"
assert not df.empty

def test_skiprows_int(self):
df1 = self.read_html(self.spam_data, ".*Water.*", skiprows=1)
df2 = self.read_html(self.spam_data, "Unit", skiprows=1)
df1 = self.read_html(self.spam_data, match=".*Water.*", skiprows=1)
df2 = self.read_html(self.spam_data, match="Unit", skiprows=1)

assert_framelist_equal(df1, df2)

def test_skiprows_range(self):
df1 = self.read_html(self.spam_data, ".*Water.*", skiprows=range(2))[0]
df2 = self.read_html(self.spam_data, "Unit", skiprows=range(2))[0]
tm.assert_frame_equal(df1, df2)
df1 = self.read_html(self.spam_data, match=".*Water.*", skiprows=range(2))
df2 = self.read_html(self.spam_data, match="Unit", skiprows=range(2))

assert_framelist_equal(df1, df2)

def test_skiprows_list(self):
df1 = self.read_html(self.spam_data, ".*Water.*", skiprows=[1, 2])
df2 = self.read_html(self.spam_data, "Unit", skiprows=[2, 1])
df1 = self.read_html(self.spam_data, match=".*Water.*", skiprows=[1, 2])
df2 = self.read_html(self.spam_data, match="Unit", skiprows=[2, 1])

assert_framelist_equal(df1, df2)

def test_skiprows_set(self):
df1 = self.read_html(self.spam_data, ".*Water.*", skiprows={1, 2})
df2 = self.read_html(self.spam_data, "Unit", skiprows={2, 1})
df1 = self.read_html(self.spam_data, match=".*Water.*", skiprows={1, 2})
df2 = self.read_html(self.spam_data, match="Unit", skiprows={2, 1})

assert_framelist_equal(df1, df2)

def test_skiprows_slice(self):
df1 = self.read_html(self.spam_data, ".*Water.*", skiprows=1)
df2 = self.read_html(self.spam_data, "Unit", skiprows=1)
df1 = self.read_html(self.spam_data, match=".*Water.*", skiprows=1)
df2 = self.read_html(self.spam_data, match="Unit", skiprows=1)

assert_framelist_equal(df1, df2)

def test_skiprows_slice_short(self):
df1 = self.read_html(self.spam_data, ".*Water.*", skiprows=slice(2))
df2 = self.read_html(self.spam_data, "Unit", skiprows=slice(2))
df1 = self.read_html(self.spam_data, match=".*Water.*", skiprows=slice(2))
df2 = self.read_html(self.spam_data, match="Unit", skiprows=slice(2))

assert_framelist_equal(df1, df2)

def test_skiprows_slice_long(self):
df1 = self.read_html(self.spam_data, ".*Water.*", skiprows=slice(2, 5))
df2 = self.read_html(self.spam_data, "Unit", skiprows=slice(4, 1, -1))
df1 = self.read_html(self.spam_data, match=".*Water.*", skiprows=slice(2, 5))
df2 = self.read_html(self.spam_data, match="Unit", skiprows=slice(4, 1, -1))

assert_framelist_equal(df1, df2)

def test_skiprows_ndarray(self):
df1 = self.read_html(self.spam_data, ".*Water.*", skiprows=np.arange(2))
df2 = self.read_html(self.spam_data, "Unit", skiprows=np.arange(2))
df1 = self.read_html(self.spam_data, match=".*Water.*", skiprows=np.arange(2))
df2 = self.read_html(self.spam_data, match="Unit", skiprows=np.arange(2))

assert_framelist_equal(df1, df2)

def test_skiprows_invalid(self):
with pytest.raises(TypeError, match=("is not a valid type for skipping rows")):
self.read_html(self.spam_data, ".*Water.*", skiprows="asdf")
self.read_html(self.spam_data, match=".*Water.*", skiprows="asdf")

def test_index(self):
df1 = self.read_html(self.spam_data, ".*Water.*", index_col=0)
df2 = self.read_html(self.spam_data, "Unit", index_col=0)
df1 = self.read_html(self.spam_data, match=".*Water.*", index_col=0)
df2 = self.read_html(self.spam_data, match="Unit", index_col=0)
assert_framelist_equal(df1, df2)

def test_header_and_index_no_types(self):
df1 = self.read_html(self.spam_data, ".*Water.*", header=1, index_col=0)
df2 = self.read_html(self.spam_data, "Unit", header=1, index_col=0)
df1 = self.read_html(self.spam_data, match=".*Water.*", header=1, index_col=0)
df2 = self.read_html(self.spam_data, match="Unit", header=1, index_col=0)
assert_framelist_equal(df1, df2)

def test_header_and_index_with_types(self):
df1 = self.read_html(self.spam_data, ".*Water.*", header=1, index_col=0)
df2 = self.read_html(self.spam_data, "Unit", header=1, index_col=0)
df1 = self.read_html(self.spam_data, match=".*Water.*", header=1, index_col=0)
df2 = self.read_html(self.spam_data, match="Unit", header=1, index_col=0)
assert_framelist_equal(df1, df2)

def test_infer_types(self):

# 10892 infer_types removed
df1 = self.read_html(self.spam_data, ".*Water.*", index_col=0)
df2 = self.read_html(self.spam_data, "Unit", index_col=0)
df1 = self.read_html(self.spam_data, match=".*Water.*", index_col=0)
df2 = self.read_html(self.spam_data, match="Unit", index_col=0)
assert_framelist_equal(df1, df2)

def test_string_io(self):
Expand All @@ -252,25 +270,25 @@ def test_string_io(self):
with open(self.spam_data, **self.spam_data_kwargs) as f:
data2 = StringIO(f.read())

df1 = self.read_html(data1, ".*Water.*")
df2 = self.read_html(data2, "Unit")
df1 = self.read_html(data1, match=".*Water.*")
df2 = self.read_html(data2, match="Unit")
assert_framelist_equal(df1, df2)

def test_string(self):
with open(self.spam_data, **self.spam_data_kwargs) as f:
data = f.read()

df1 = self.read_html(data, ".*Water.*")
df2 = self.read_html(data, "Unit")
df1 = self.read_html(data, match=".*Water.*")
df2 = self.read_html(data, match="Unit")

assert_framelist_equal(df1, df2)

def test_file_like(self):
with open(self.spam_data, **self.spam_data_kwargs) as f:
df1 = self.read_html(f, ".*Water.*")
df1 = self.read_html(f, match=".*Water.*")

with open(self.spam_data, **self.spam_data_kwargs) as f:
df2 = self.read_html(f, "Unit")
df2 = self.read_html(f, match="Unit")

assert_framelist_equal(df1, df2)

Expand All @@ -292,7 +310,7 @@ def test_invalid_url(self):
def test_file_url(self):
url = self.banklist_data
dfs = self.read_html(
file_path_to_url(os.path.abspath(url)), "First", attrs={"id": "table"}
file_path_to_url(os.path.abspath(url)), match="First", attrs={"id": "table"}
)
assert isinstance(dfs, list)
for df in dfs:
Expand All @@ -308,7 +326,7 @@ def test_invalid_table_attrs(self):

def _bank_data(self, *args, **kwargs):
return self.read_html(
self.banklist_data, "Metcalf", attrs={"id": "table"}, *args, **kwargs
self.banklist_data, match="Metcalf", attrs={"id": "table"}, *args, **kwargs
)

@pytest.mark.slow
Expand Down Expand Up @@ -358,7 +376,7 @@ def test_regex_idempotency(self):
def test_negative_skiprows(self):
msg = r"\(you passed a negative value\)"
with pytest.raises(ValueError, match=msg):
self.read_html(self.spam_data, "Water", skiprows=-1)
self.read_html(self.spam_data, match="Water", skiprows=-1)

@tm.network
def test_multiple_matches(self):
Expand Down Expand Up @@ -600,7 +618,9 @@ def test_gold_canyon(self):
raw_text = f.read()

assert gc in raw_text
df = self.read_html(self.banklist_data, "Gold Canyon", attrs={"id": "table"})[0]
df = self.read_html(
self.banklist_data, match="Gold Canyon", attrs={"id": "table"}
)[0]
assert gc in df.to_string()

def test_different_number_of_cols(self):
Expand Down Expand Up @@ -855,7 +875,7 @@ def test_wikipedia_states_table(self, datapath):
data = datapath("io", "data", "html", "wikipedia_states.html")
assert os.path.isfile(data), f"{repr(data)} is not a file"
assert os.path.getsize(data), f"{repr(data)} is an empty file"
result = self.read_html(data, "Arizona", header=1)[0]
result = self.read_html(data, match="Arizona", header=1)[0]
assert result.shape == (60, 12)
assert "Unnamed" in result.columns[-1]
assert result["sq mi"].dtype == np.dtype("float64")
Expand Down Expand Up @@ -1065,7 +1085,7 @@ def test_works_on_valid_markup(self, datapath):
@pytest.mark.slow
def test_fallback_success(self, datapath):
banklist_data = datapath("io", "data", "html", "banklist.html")
self.read_html(banklist_data, ".*Water.*", flavor=["lxml", "html5lib"])
self.read_html(banklist_data, match=".*Water.*", flavor=["lxml", "html5lib"])

def test_to_html_timestamp(self):
rng = date_range("2000-01-01", periods=10)
Expand Down
Loading