Skip to content

Commit 7186719

Browse files
committed
Updating code_checks.sh.
2 parents e1c2879 + e2ed477 commit 7186719

File tree

14 files changed

+131
-240
lines changed

14 files changed

+131
-240
lines changed

ci/code_checks.sh

-5
Original file line numberDiff line numberDiff line change
@@ -158,15 +158,10 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
158158
-i "pandas.Series.sparse.sp_values SA01" \
159159
-i "pandas.Series.sparse.to_coo PR07,RT03,SA01" \
160160
-i "pandas.Series.std PR01,RT03,SA01" \
161-
-i "pandas.Series.str.lstrip RT03" \
162161
-i "pandas.Series.str.match RT03" \
163162
-i "pandas.Series.str.normalize RT03,SA01" \
164-
-i "pandas.Series.str.partition RT03" \
165163
-i "pandas.Series.str.repeat SA01" \
166164
-i "pandas.Series.str.replace SA01" \
167-
-i "pandas.Series.str.rpartition RT03" \
168-
-i "pandas.Series.str.rstrip RT03" \
169-
-i "pandas.Series.str.strip RT03" \
170165
-i "pandas.Series.str.wrap RT03,SA01" \
171166
-i "pandas.Series.str.zfill RT03" \
172167
-i "pandas.Series.struct.dtypes SA01" \

doc/source/whatsnew/v3.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ Other enhancements
5353
- :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`)
5454
- Multiplying two :class:`DateOffset` objects will now raise a ``TypeError`` instead of a ``RecursionError`` (:issue:`59442`)
5555
- Restore support for reading Stata 104-format and enable reading 103-format dta files (:issue:`58554`)
56+
- Support passing an :class:`Iterable[Hashable]` input to :meth:`DataFrame.drop_duplicates` (:issue:`59237`)
5657
- Support reading Stata 102-format (Stata 1) dta files (:issue:`58978`)
5758
- Support reading Stata 110-format (Stata 7) dta files (:issue:`47176`)
5859

pandas/core/frame.py

+10-13
Original file line numberDiff line numberDiff line change
@@ -6406,7 +6406,7 @@ def dropna(
64066406
64076407
thresh : int, optional
64086408
Require that many non-NA values. Cannot be combined with how.
6409-
subset : column label or sequence of labels, optional
6409+
subset : column label or iterable of labels, optional
64106410
Labels along other axis to consider, e.g. if you are dropping rows
64116411
these would be a list of columns to include.
64126412
inplace : bool, default False
@@ -6536,7 +6536,7 @@ def dropna(
65366536
@overload
65376537
def drop_duplicates(
65386538
self,
6539-
subset: Hashable | Sequence[Hashable] | None = ...,
6539+
subset: Hashable | Iterable[Hashable] | None = ...,
65406540
*,
65416541
keep: DropKeep = ...,
65426542
inplace: Literal[True],
@@ -6546,7 +6546,7 @@ def drop_duplicates(
65466546
@overload
65476547
def drop_duplicates(
65486548
self,
6549-
subset: Hashable | Sequence[Hashable] | None = ...,
6549+
subset: Hashable | Iterable[Hashable] | None = ...,
65506550
*,
65516551
keep: DropKeep = ...,
65526552
inplace: Literal[False] = ...,
@@ -6556,7 +6556,7 @@ def drop_duplicates(
65566556
@overload
65576557
def drop_duplicates(
65586558
self,
6559-
subset: Hashable | Sequence[Hashable] | None = ...,
6559+
subset: Hashable | Iterable[Hashable] | None = ...,
65606560
*,
65616561
keep: DropKeep = ...,
65626562
inplace: bool = ...,
@@ -6565,7 +6565,7 @@ def drop_duplicates(
65656565

65666566
def drop_duplicates(
65676567
self,
6568-
subset: Hashable | Sequence[Hashable] | None = None,
6568+
subset: Hashable | Iterable[Hashable] | None = None,
65696569
*,
65706570
keep: DropKeep = "first",
65716571
inplace: bool = False,
@@ -6579,7 +6579,7 @@ def drop_duplicates(
65796579
65806580
Parameters
65816581
----------
6582-
subset : column label or sequence of labels, optional
6582+
subset : column label or iterable of labels, optional
65836583
Only consider certain columns for identifying duplicates, by
65846584
default use all of the columns.
65856585
keep : {'first', 'last', ``False``}, default 'first'
@@ -6669,7 +6669,7 @@ def drop_duplicates(
66696669

66706670
def duplicated(
66716671
self,
6672-
subset: Hashable | Sequence[Hashable] | None = None,
6672+
subset: Hashable | Iterable[Hashable] | None = None,
66736673
keep: DropKeep = "first",
66746674
) -> Series:
66756675
"""
@@ -6679,7 +6679,7 @@ def duplicated(
66796679
66806680
Parameters
66816681
----------
6682-
subset : column label or sequence of labels, optional
6682+
subset : column label or iterable of labels, optional
66836683
Only consider certain columns for identifying duplicates, by
66846684
default use all of the columns.
66856685
keep : {'first', 'last', False}, default 'first'
@@ -6771,10 +6771,7 @@ def f(vals) -> tuple[np.ndarray, int]:
67716771
return labels.astype("i8"), len(shape)
67726772

67736773
if subset is None:
6774-
# https://github.com/pandas-dev/pandas/issues/28770
6775-
# Incompatible types in assignment (expression has type "Index", variable
6776-
# has type "Sequence[Any]")
6777-
subset = self.columns # type: ignore[assignment]
6774+
subset = self.columns
67786775
elif (
67796776
not np.iterable(subset)
67806777
or isinstance(subset, str)
@@ -6795,7 +6792,7 @@ def f(vals) -> tuple[np.ndarray, int]:
67956792

67966793
if len(subset) == 1 and self.columns.is_unique:
67976794
# GH#45236 This is faster than get_group_index below
6798-
result = self[subset[0]].duplicated(keep)
6795+
result = self[next(iter(subset))].duplicated(keep)
67996796
result.name = None
68006797
else:
68016798
vals = (col.values for name, col in self.items() if name in subset)

pandas/core/strings/accessor.py

+3
Original file line numberDiff line numberDiff line change
@@ -969,6 +969,8 @@ def rsplit(self, pat=None, *, n=-1, expand: bool = False):
969969
Returns
970970
-------
971971
DataFrame/MultiIndex or Series/Index of objects
972+
Returns appropriate type based on `expand` parameter with strings
973+
split based on the `pat` parameter.
972974
973975
See Also
974976
--------
@@ -2127,6 +2129,7 @@ def encode(self, encoding, errors: str = "strict"):
21272129
Returns
21282130
-------
21292131
Series or Index of object
2132+
Series or Index with the strings being stripped from the %(side)s.
21302133
21312134
See Also
21322135
--------

pandas/tests/frame/methods/test_drop_duplicates.py

+38
Original file line numberDiff line numberDiff line change
@@ -476,3 +476,41 @@ def test_drop_duplicates_non_boolean_ignore_index(arg):
476476
msg = '^For argument "ignore_index" expected type bool, received type .*.$'
477477
with pytest.raises(ValueError, match=msg):
478478
df.drop_duplicates(ignore_index=arg)
479+
480+
481+
def test_drop_duplicates_set():
482+
# GH#59237
483+
df = DataFrame(
484+
{
485+
"AAA": ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"],
486+
"B": ["one", "one", "two", "two", "two", "two", "one", "two"],
487+
"C": [1, 1, 2, 2, 2, 2, 1, 2],
488+
"D": range(8),
489+
}
490+
)
491+
# single column
492+
result = df.drop_duplicates({"AAA"})
493+
expected = df[:2]
494+
tm.assert_frame_equal(result, expected)
495+
496+
result = df.drop_duplicates({"AAA"}, keep="last")
497+
expected = df.loc[[6, 7]]
498+
tm.assert_frame_equal(result, expected)
499+
500+
result = df.drop_duplicates({"AAA"}, keep=False)
501+
expected = df.loc[[]]
502+
tm.assert_frame_equal(result, expected)
503+
assert len(result) == 0
504+
505+
# multi column
506+
expected = df.loc[[0, 1, 2, 3]]
507+
result = df.drop_duplicates({"AAA", "B"})
508+
tm.assert_frame_equal(result, expected)
509+
510+
result = df.drop_duplicates({"AAA", "B"}, keep="last")
511+
expected = df.loc[[0, 5, 6, 7]]
512+
tm.assert_frame_equal(result, expected)
513+
514+
result = df.drop_duplicates({"AAA", "B"}, keep=False)
515+
expected = df.loc[[0]]
516+
tm.assert_frame_equal(result, expected)

pandas/tests/io/excel/test_readers.py

+14-30
Original file line numberDiff line numberDiff line change
@@ -30,10 +30,6 @@
3030
read_csv,
3131
)
3232
import pandas._testing as tm
33-
from pandas.core.arrays import (
34-
ArrowStringArray,
35-
StringArray,
36-
)
3733

3834
read_ext_params = [".xls", ".xlsx", ".xlsm", ".xlsb", ".ods"]
3935
engine_params = [
@@ -692,43 +688,31 @@ def test_dtype_backend_and_dtype(self, read_ext, tmp_excel):
692688
)
693689
tm.assert_frame_equal(result, df)
694690

695-
@pytest.mark.xfail(
696-
using_string_dtype(), reason="infer_string takes precedence", strict=False
697-
)
698691
def test_dtype_backend_string(self, read_ext, string_storage, tmp_excel):
699692
# GH#36712
700693
if read_ext in (".xlsb", ".xls"):
701694
pytest.skip(f"No engine for filetype: '{read_ext}'")
702695

703-
pa = pytest.importorskip("pyarrow")
696+
df = DataFrame(
697+
{
698+
"a": np.array(["a", "b"], dtype=np.object_),
699+
"b": np.array(["x", pd.NA], dtype=np.object_),
700+
}
701+
)
702+
df.to_excel(tmp_excel, sheet_name="test", index=False)
704703

705704
with pd.option_context("mode.string_storage", string_storage):
706-
df = DataFrame(
707-
{
708-
"a": np.array(["a", "b"], dtype=np.object_),
709-
"b": np.array(["x", pd.NA], dtype=np.object_),
710-
}
711-
)
712-
df.to_excel(tmp_excel, sheet_name="test", index=False)
713705
result = pd.read_excel(
714706
tmp_excel, sheet_name="test", dtype_backend="numpy_nullable"
715707
)
716708

717-
if string_storage == "python":
718-
expected = DataFrame(
719-
{
720-
"a": StringArray(np.array(["a", "b"], dtype=np.object_)),
721-
"b": StringArray(np.array(["x", pd.NA], dtype=np.object_)),
722-
}
723-
)
724-
else:
725-
expected = DataFrame(
726-
{
727-
"a": ArrowStringArray(pa.array(["a", "b"])),
728-
"b": ArrowStringArray(pa.array(["x", None])),
729-
}
730-
)
731-
tm.assert_frame_equal(result, expected)
709+
expected = DataFrame(
710+
{
711+
"a": Series(["a", "b"], dtype=pd.StringDtype(string_storage)),
712+
"b": Series(["x", None], dtype=pd.StringDtype(string_storage)),
713+
}
714+
)
715+
tm.assert_frame_equal(result, expected)
732716

733717
@pytest.mark.parametrize("dtypes, exp_value", [({}, 1), ({"a.1": "int64"}, 1)])
734718
def test_dtype_mangle_dup_cols(self, read_ext, dtypes, exp_value):

pandas/tests/io/json/test_pandas.py

+9-27
Original file line numberDiff line numberDiff line change
@@ -28,11 +28,6 @@
2828
read_json,
2929
)
3030
import pandas._testing as tm
31-
from pandas.core.arrays import (
32-
ArrowStringArray,
33-
StringArray,
34-
)
35-
from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics
3631

3732
from pandas.io.json import ujson_dumps
3833

@@ -2143,12 +2138,10 @@ def test_json_uint64(self):
21432138
result = df.to_json(orient="split")
21442139
assert result == expected
21452140

2146-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
21472141
def test_read_json_dtype_backend(
21482142
self, string_storage, dtype_backend, orient, using_infer_string
21492143
):
21502144
# GH#50750
2151-
pa = pytest.importorskip("pyarrow")
21522145
df = DataFrame(
21532146
{
21542147
"a": Series([1, np.nan, 3], dtype="Int64"),
@@ -2162,30 +2155,18 @@ def test_read_json_dtype_backend(
21622155
}
21632156
)
21642157

2165-
if using_infer_string:
2166-
string_array = ArrowStringArrayNumpySemantics(pa.array(["a", "b", "c"]))
2167-
string_array_na = ArrowStringArrayNumpySemantics(pa.array(["a", "b", None]))
2168-
elif string_storage == "python":
2169-
string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_))
2170-
string_array_na = StringArray(np.array(["a", "b", NA], dtype=np.object_))
2171-
2172-
elif dtype_backend == "pyarrow":
2173-
pa = pytest.importorskip("pyarrow")
2174-
from pandas.arrays import ArrowExtensionArray
2175-
2176-
string_array = ArrowExtensionArray(pa.array(["a", "b", "c"]))
2177-
string_array_na = ArrowExtensionArray(pa.array(["a", "b", None]))
2178-
2179-
else:
2180-
string_array = ArrowStringArray(pa.array(["a", "b", "c"]))
2181-
string_array_na = ArrowStringArray(pa.array(["a", "b", None]))
2182-
21832158
out = df.to_json(orient=orient)
21842159
with pd.option_context("mode.string_storage", string_storage):
21852160
result = read_json(
21862161
StringIO(out), dtype_backend=dtype_backend, orient=orient
21872162
)
21882163

2164+
if dtype_backend == "pyarrow":
2165+
pa = pytest.importorskip("pyarrow")
2166+
string_dtype = pd.ArrowDtype(pa.string())
2167+
else:
2168+
string_dtype = pd.StringDtype(string_storage)
2169+
21892170
expected = DataFrame(
21902171
{
21912172
"a": Series([1, np.nan, 3], dtype="Int64"),
@@ -2194,12 +2175,13 @@ def test_read_json_dtype_backend(
21942175
"d": Series([1.5, 2.0, 2.5], dtype="Float64"),
21952176
"e": Series([True, False, NA], dtype="boolean"),
21962177
"f": Series([True, False, True], dtype="boolean"),
2197-
"g": string_array,
2198-
"h": string_array_na,
2178+
"g": Series(["a", "b", "c"], dtype=string_dtype),
2179+
"h": Series(["a", "b", None], dtype=string_dtype),
21992180
}
22002181
)
22012182

22022183
if dtype_backend == "pyarrow":
2184+
pa = pytest.importorskip("pyarrow")
22032185
from pandas.arrays import ArrowExtensionArray
22042186

22052187
expected = DataFrame(

pandas/tests/io/parser/dtypes/test_dtypes_basic.py

+8-22
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,7 @@
1919
Timestamp,
2020
)
2121
import pandas._testing as tm
22-
from pandas.core.arrays import (
23-
ArrowStringArray,
24-
IntegerArray,
25-
StringArray,
26-
)
22+
from pandas.core.arrays import IntegerArray
2723

2824
pytestmark = pytest.mark.filterwarnings(
2925
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
@@ -465,8 +461,6 @@ def test_dtype_backend_and_dtype(all_parsers):
465461

466462
def test_dtype_backend_string(all_parsers, string_storage):
467463
# GH#36712
468-
pa = pytest.importorskip("pyarrow")
469-
470464
with pd.option_context("mode.string_storage", string_storage):
471465
parser = all_parsers
472466

@@ -476,21 +470,13 @@ def test_dtype_backend_string(all_parsers, string_storage):
476470
"""
477471
result = parser.read_csv(StringIO(data), dtype_backend="numpy_nullable")
478472

479-
if string_storage == "python":
480-
expected = DataFrame(
481-
{
482-
"a": StringArray(np.array(["a", "b"], dtype=np.object_)),
483-
"b": StringArray(np.array(["x", pd.NA], dtype=np.object_)),
484-
}
485-
)
486-
else:
487-
expected = DataFrame(
488-
{
489-
"a": ArrowStringArray(pa.array(["a", "b"])),
490-
"b": ArrowStringArray(pa.array(["x", None])),
491-
}
492-
)
493-
tm.assert_frame_equal(result, expected)
473+
expected = DataFrame(
474+
{
475+
"a": pd.array(["a", "b"], dtype=pd.StringDtype(string_storage)),
476+
"b": pd.array(["x", pd.NA], dtype=pd.StringDtype(string_storage)),
477+
}
478+
)
479+
tm.assert_frame_equal(result, expected)
494480

495481

496482
def test_dtype_backend_ea_dtype_specified(all_parsers):

0 commit comments

Comments
 (0)