ENH: Allow Iterable[Hashable] in drop_duplicates
kirill-bash authored Aug 13, 2024
1 parent fb6842d commit 614939a
Showing 3 changed files with 49 additions and 13 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
@@ -53,6 +53,7 @@ Other enhancements
- :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`)
- Multiplying two :class:`DateOffset` objects will now raise a ``TypeError`` instead of a ``RecursionError`` (:issue:`59442`)
- Restore support for reading Stata 104-format and enable reading 103-format dta files (:issue:`58554`)
- Support passing a :class:`Iterable[Hashable]` input to :meth:`DataFrame.drop_duplicates` (:issue:`59237`)
- Support reading Stata 102-format (Stata 1) dta files (:issue:`58978`)
- Support reading Stata 110-format (Stata 7) dta files (:issue:`47176`)

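As a quick illustration of the whatsnew entry above: the snippet below passes a set — an ``Iterable[Hashable]`` that is not a ``Sequence`` — as ``subset``. The DataFrame mirrors the one in the new test added by this commit; before this change, the annotations only advertised ``Hashable | Sequence[Hashable]`` and the single-column fast path indexed ``subset[0]``, which a set does not support.

```python
import pandas as pd

df = pd.DataFrame(
    {
        "AAA": ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"],
        "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
        "C": [1, 1, 2, 2, 2, 2, 1, 2],
        "D": range(8),
    }
)

# A set is an Iterable[Hashable] but not a Sequence; with this change it is
# accepted for `subset`, both for a single column and for several.
print(df.drop_duplicates(subset={"AAA"}))       # keeps rows 0 and 1
print(df.drop_duplicates(subset={"AAA", "B"}))  # keeps rows 0-3
```

Other non-Sequence iterables of labels (for example dict keys views) should behave the same way under this change, though the new test only exercises sets.
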
23 changes: 10 additions & 13 deletions pandas/core/frame.py
@@ -6406,7 +6406,7 @@ def dropna(
thresh : int, optional
Require that many non-NA values. Cannot be combined with how.
-        subset : column label or sequence of labels, optional
+        subset : column label or iterable of labels, optional
Labels along other axis to consider, e.g. if you are dropping rows
these would be a list of columns to include.
inplace : bool, default False
@@ -6536,7 +6536,7 @@ def dropna(
@overload
def drop_duplicates(
self,
-        subset: Hashable | Sequence[Hashable] | None = ...,
+        subset: Hashable | Iterable[Hashable] | None = ...,
*,
keep: DropKeep = ...,
inplace: Literal[True],
@@ -6546,7 +6546,7 @@ def drop_duplicates(
@overload
def drop_duplicates(
self,
-        subset: Hashable | Sequence[Hashable] | None = ...,
+        subset: Hashable | Iterable[Hashable] | None = ...,
*,
keep: DropKeep = ...,
inplace: Literal[False] = ...,
@@ -6556,7 +6556,7 @@ def drop_duplicates(
@overload
def drop_duplicates(
self,
-        subset: Hashable | Sequence[Hashable] | None = ...,
+        subset: Hashable | Iterable[Hashable] | None = ...,
*,
keep: DropKeep = ...,
inplace: bool = ...,
@@ -6565,7 +6565,7 @@ def drop_duplicates(

def drop_duplicates(
self,
-        subset: Hashable | Sequence[Hashable] | None = None,
+        subset: Hashable | Iterable[Hashable] | None = None,
*,
keep: DropKeep = "first",
inplace: bool = False,
@@ -6579,7 +6579,7 @@ def drop_duplicates(
Parameters
----------
-        subset : column label or sequence of labels, optional
+        subset : column label or iterable of labels, optional
Only consider certain columns for identifying duplicates, by
default use all of the columns.
keep : {'first', 'last', ``False``}, default 'first'
@@ -6669,7 +6669,7 @@ def drop_duplicates(

def duplicated(
self,
-        subset: Hashable | Sequence[Hashable] | None = None,
+        subset: Hashable | Iterable[Hashable] | None = None,
keep: DropKeep = "first",
) -> Series:
"""
@@ -6679,7 +6679,7 @@ def duplicated(
Parameters
----------
-        subset : column label or sequence of labels, optional
+        subset : column label or iterable of labels, optional
Only consider certain columns for identifying duplicates, by
default use all of the columns.
keep : {'first', 'last', False}, default 'first'
@@ -6771,10 +6771,7 @@ def f(vals) -> tuple[np.ndarray, int]:
return labels.astype("i8"), len(shape)

if subset is None:
-            # https://github.com/pandas-dev/pandas/issues/28770
-            # Incompatible types in assignment (expression has type "Index", variable
-            # has type "Sequence[Any]")
-            subset = self.columns  # type: ignore[assignment]
+            subset = self.columns
elif (
not np.iterable(subset)
or isinstance(subset, str)
@@ -6795,7 +6792,7 @@ def f(vals) -> tuple[np.ndarray, int]:

if len(subset) == 1 and self.columns.is_unique:
# GH#45236 This is faster than get_group_index below
-            result = self[subset[0]].duplicated(keep)
+            result = self[next(iter(subset))].duplicated(keep)
result.name = None
else:
vals = (col.values for name, col in self.items() if name in subset)
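The hunk above also shows why the single-column fast path now reads ``next(iter(subset))`` instead of ``subset[0]``: once ``subset`` may be any iterable (for example a set), positional indexing is no longer guaranteed. The sketch below is not the pandas source but a minimal, simplified reconstruction of that normalization step; the helper name ``_normalize_subset`` is invented for illustration, and the real method performs further checks (tuple column labels, unknown columns) that are omitted here.

```python
from collections.abc import Hashable, Iterable

import numpy as np
import pandas as pd


def _normalize_subset(df: pd.DataFrame, subset) -> Iterable[Hashable]:
    """Hypothetical helper mirroring the normalization in DataFrame.duplicated."""
    if subset is None:
        # Default: consider every column (self.columns in the real method).
        return df.columns
    if not np.iterable(subset) or isinstance(subset, str):
        # A single label (e.g. "AAA") is wrapped so it can be handled uniformly.
        return [subset]
    return subset


df = pd.DataFrame({"a": [1, 1, 2], "b": [3, 3, 4]})
subset = _normalize_subset(df, {"a"})

# A set supports no positional indexing, so subset[0] would raise TypeError;
# next(iter(subset)) works for any iterable, which is why the diff switches
# the single-column fast path to it.
label = next(iter(subset))
print(df[label].duplicated(keep="first"))
```
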
38 changes: 38 additions & 0 deletions pandas/tests/frame/methods/test_drop_duplicates.py
@@ -476,3 +476,41 @@ def test_drop_duplicates_non_boolean_ignore_index(arg):
msg = '^For argument "ignore_index" expected type bool, received type .*.$'
with pytest.raises(ValueError, match=msg):
df.drop_duplicates(ignore_index=arg)


def test_drop_duplicates_set():
# GH#59237
df = DataFrame(
{
"AAA": ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"],
"B": ["one", "one", "two", "two", "two", "two", "one", "two"],
"C": [1, 1, 2, 2, 2, 2, 1, 2],
"D": range(8),
}
)
# single column
result = df.drop_duplicates({"AAA"})
expected = df[:2]
tm.assert_frame_equal(result, expected)

result = df.drop_duplicates({"AAA"}, keep="last")
expected = df.loc[[6, 7]]
tm.assert_frame_equal(result, expected)

result = df.drop_duplicates({"AAA"}, keep=False)
expected = df.loc[[]]
tm.assert_frame_equal(result, expected)
assert len(result) == 0

# multi column
expected = df.loc[[0, 1, 2, 3]]
result = df.drop_duplicates({"AAA", "B"})
tm.assert_frame_equal(result, expected)

result = df.drop_duplicates({"AAA", "B"}, keep="last")
expected = df.loc[[0, 5, 6, 7]]
tm.assert_frame_equal(result, expected)

result = df.drop_duplicates({"AAA", "B"}, keep=False)
expected = df.loc[[0]]
tm.assert_frame_equal(result, expected)
