diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index c18bedd0cf6eb..8fb738ff7d76d 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -660,6 +660,7 @@ Deprecations - ``Categorical.name`` was deprecated to make ``Categorical`` more ``numpy.ndarray`` like. Use ``Series(cat, name="whatever")`` instead (:issue:`10482`). - ``drop_duplicates`` and ``duplicated``'s ``take_last`` keyword was deprecated in favor of ``keep``. (:issue:`6511`, :issue:`8505`) +- ``Series.nsmallest`` and ``nlargest``'s ``take_last`` keyword was deprecated in favor of ``keep``. (:issue:`10792`) - ``DataFrame.combineAdd`` and ``DataFrame.combineMult`` are deprecated. They can easily be replaced by using the ``add`` and ``mul`` methods: ``DataFrame.add(other, fill_value=0)`` and ``DataFrame.mul(other, fill_value=1.)`` diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 250b4b3e562b8..36d31d493b10d 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -453,24 +453,24 @@ def group_position(*args): _dtype_map = {'datetime64[ns]': 'int64', 'timedelta64[ns]': 'int64'} -def _finalize_nsmallest(arr, kth_val, n, take_last, narr): +def _finalize_nsmallest(arr, kth_val, n, keep, narr): ns, = np.nonzero(arr <= kth_val) inds = ns[arr[ns].argsort(kind='mergesort')][:n] - - if take_last: + if keep == 'last': # reverse indices return narr - 1 - inds - return inds + else: + return inds -def nsmallest(arr, n, take_last=False): +def nsmallest(arr, n, keep='first'): ''' Find the indices of the n smallest values of a numpy array. Note: Fails silently with NaN. 
''' - if take_last: + if keep == 'last': arr = arr[::-1] narr = len(arr) @@ -480,10 +480,10 @@ def nsmallest(arr, n, take_last=False): arr = arr.view(_dtype_map.get(sdtype, sdtype)) kth_val = algos.kth_smallest(arr.copy(), n - 1) - return _finalize_nsmallest(arr, kth_val, n, take_last, narr) + return _finalize_nsmallest(arr, kth_val, n, keep, narr) -def nlargest(arr, n, take_last=False): +def nlargest(arr, n, keep='first'): """ Find the indices of the n largest values of a numpy array. @@ -491,11 +491,11 @@ def nlargest(arr, n, take_last=False): """ sdtype = str(arr.dtype) arr = arr.view(_dtype_map.get(sdtype, sdtype)) - return nsmallest(-arr, n, take_last=take_last) + return nsmallest(-arr, n, keep=keep) -def select_n_slow(dropped, n, take_last, method): - reverse_it = take_last or method == 'nlargest' +def select_n_slow(dropped, n, keep, method): + reverse_it = (keep == 'last' or method == 'nlargest') ascending = method == 'nsmallest' slc = np.s_[::-1] if reverse_it else np.s_[:] return dropped[slc].sort_values(ascending=ascending).head(n) @@ -504,13 +504,13 @@ def select_n_slow(dropped, n, take_last, method): _select_methods = {'nsmallest': nsmallest, 'nlargest': nlargest} -def select_n(series, n, take_last, method): +def select_n(series, n, keep, method): """Implement n largest/smallest. 
Parameters ---------- n : int - take_last : bool + keep : {'first', 'last'}, default 'first' method : str, {'nlargest', 'nsmallest'} Returns @@ -522,15 +522,18 @@ def select_n(series, n, take_last, method): np.timedelta64)): raise TypeError("Cannot use method %r with dtype %s" % (method, dtype)) + if keep not in ('first', 'last'): + raise ValueError('keep must be either "first", "last"') + if n <= 0: return series[[]] dropped = series.dropna() if n >= len(series): - return select_n_slow(dropped, n, take_last, method) + return select_n_slow(dropped, n, keep, method) - inds = _select_methods[method](dropped.values, n, take_last) + inds = _select_methods[method](dropped.values, n, keep) return dropped.iloc[inds] diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 3e908bf9d579b..3abf7c4458854 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3163,16 +3163,16 @@ def sortlevel(self, level=0, axis=0, ascending=True, inplace=inplace, sort_remaining=sort_remaining) - def _nsorted(self, columns, n, method, take_last): + def _nsorted(self, columns, n, method, keep): if not com.is_list_like(columns): columns = [columns] columns = list(columns) - ser = getattr(self[columns[0]], method)(n, take_last=take_last) + ser = getattr(self[columns[0]], method)(n, keep=keep) ascending = dict(nlargest=False, nsmallest=True)[method] return self.loc[ser.index].sort_values(columns, ascending=ascending, kind='mergesort') - def nlargest(self, n, columns, take_last=False): + def nlargest(self, n, columns, keep='first'): """Get the rows of a DataFrame sorted by the `n` largest values of `columns`. @@ -3184,8 +3184,10 @@ def nlargest(self, n, columns, take_last=False): Number of items to retrieve columns : list or str Column name or names to order by - take_last : bool, optional - Where there are duplicate values, take the last duplicate + keep : {'first', 'last'}, default 'first' + Where there are duplicate values: + - ``first`` : take the first occurrence. 
+ - ``last`` : take the last occurrence. Returns ------- @@ -3202,9 +3204,9 @@ def nlargest(self, n, columns, take_last=False): 1 10 b 2 2 8 d NaN """ - return self._nsorted(columns, n, 'nlargest', take_last) + return self._nsorted(columns, n, 'nlargest', keep) - def nsmallest(self, n, columns, take_last=False): + def nsmallest(self, n, columns, keep='first'): """Get the rows of a DataFrame sorted by the `n` smallest values of `columns`. @@ -3216,8 +3218,10 @@ def nsmallest(self, n, columns, take_last=False): Number of items to retrieve columns : list or str Column name or names to order by - take_last : bool, optional - Where there are duplicate values, take the last duplicate + keep : {'first', 'last'}, default 'first' + Where there are duplicate values: + - ``first`` : take the first occurrence. + - ``last`` : take the last occurrence. Returns ------- @@ -3234,7 +3238,7 @@ def nsmallest(self, n, columns, take_last=False): 0 1 a 1 2 8 d NaN """ - return self._nsorted(columns, n, 'nsmallest', take_last) + return self._nsorted(columns, n, 'nsmallest', keep) def swaplevel(self, i, j, axis=0): """ diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 444f149e70e34..8adaf1437c1de 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -19,7 +19,8 @@ from pandas.core.internals import BlockManager, make_block from pandas.core.series import Series from pandas.core.panel import Panel -from pandas.util.decorators import cache_readonly, Appender, make_signature +from pandas.util.decorators import (cache_readonly, Appender, make_signature, + deprecate_kwarg) import pandas.core.algorithms as algos import pandas.core.common as com from pandas.core.common import(_possibly_downcast_to_dtype, isnull, @@ -82,7 +83,7 @@ _series_apply_whitelist = \ (_common_apply_whitelist - set(['boxplot'])) | \ - frozenset(['dtype', 'unique', 'nlargest', 'nsmallest']) + frozenset(['dtype', 'unique']) _dataframe_apply_whitelist = \ _common_apply_whitelist | 
frozenset(['dtypes', 'corrwith']) @@ -2583,6 +2584,19 @@ def nunique(self, dropna=True): index=self.grouper.result_index, name=self.name) + @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'}) + @Appender(Series.nlargest.__doc__) + def nlargest(self, n=5, keep='first'): + # ToDo: When we remove deprecate_kwarg, we can remove these methods + # and include nlargest and nsmallest in _series_apply_whitelist + return self.apply(lambda x: x.nlargest(n=n, keep=keep)) + + + @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'}) + @Appender(Series.nsmallest.__doc__) + def nsmallest(self, n=5, keep='first'): + return self.apply(lambda x: x.nsmallest(n=n, keep=keep)) + def value_counts(self, normalize=False, sort=True, ascending=False, bins=None, dropna=True): diff --git a/pandas/core/series.py b/pandas/core/series.py index b4fc1c9c48f27..2890730956c75 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1817,15 +1817,19 @@ def rank(self, method='average', na_option='keep', ascending=True, ascending=ascending, pct=pct) return self._constructor(ranks, index=self.index).__finalize__(self) - def nlargest(self, n=5, take_last=False): + @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'}) + def nlargest(self, n=5, keep='first'): """Return the largest `n` elements. Parameters ---------- n : int Return this many descending sorted values - take_last : bool - Where there are duplicate values, take the last duplicate + keep : {'first', 'last'}, default 'first' + Where there are duplicate values: + - ``first`` : take the first occurrence. + - ``last`` : take the last occurrence. 
+ take_last : deprecated Returns ------- @@ -1848,17 +1852,21 @@ def nlargest(self, n=5, take_last=False): >>> s = pd.Series(np.random.randn(1e6)) >>> s.nlargest(10) # only sorts up to the N requested """ - return select_n(self, n=n, take_last=take_last, method='nlargest') + return select_n(self, n=n, keep=keep, method='nlargest') - def nsmallest(self, n=5, take_last=False): + @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'}) + def nsmallest(self, n=5, keep='first'): """Return the smallest `n` elements. Parameters ---------- n : int Return this many ascending sorted values - take_last : bool - Where there are duplicate values, take the last duplicate + keep : {'first', 'last'}, default 'first' + Where there are duplicate values: + - ``first`` : take the first occurrence. + - ``last`` : take the last occurrence. + take_last : deprecated Returns ------- @@ -1881,7 +1889,7 @@ def nsmallest(self, n=5, take_last=False): >>> s = pd.Series(np.random.randn(1e6)) >>> s.nsmallest(10) # only sorts up to the N requested """ - return select_n(self, n=n, take_last=take_last, method='nsmallest') + return select_n(self, n=n, keep=keep, method='nsmallest') def sortlevel(self, level=0, ascending=True, sort_remaining=True): """ diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index a8bbc372ebe25..41703b3b5a3b7 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -4997,7 +4997,7 @@ def test_groupby_whitelist(self): 'corr', 'cov', 'diff', 'unique', - 'nlargest', 'nsmallest', + # 'nlargest', 'nsmallest', ]) for obj, whitelist in zip((df, s), @@ -5316,6 +5316,16 @@ def test_nlargest(self): [3, 2, 1, 9, 5, 8]])) tm.assert_series_equal(r, e) + + a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0]) + gb = a.groupby(b) + e = Series([3, 2, 1, 3, 3, 2], + index=MultiIndex.from_arrays([list('aaabbb'), + [2, 3, 1, 6, 5, 7]])) + assert_series_equal(gb.nlargest(3, keep='last'), e) + with 
tm.assert_produces_warning(FutureWarning): + assert_series_equal(gb.nlargest(3, take_last=True), e) + def test_nsmallest(self): a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10]) b = Series(list('a' * 5 + 'b' * 5)) @@ -5326,6 +5336,15 @@ def test_nsmallest(self): [0, 4, 1, 6, 7, 8]])) tm.assert_series_equal(r, e) + a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0]) + gb = a.groupby(b) + e = Series([0, 1, 1, 0, 1, 2], + index=MultiIndex.from_arrays([list('aaabbb'), + [4, 1, 0, 9, 8, 7]])) + assert_series_equal(gb.nsmallest(3, keep='last'), e) + with tm.assert_produces_warning(FutureWarning): + assert_series_equal(gb.nsmallest(3, take_last=True), e) + def test_transform_doesnt_clobber_ints(self): # GH 7972 n = 6 diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index a429059c761d6..34ea674fe10c0 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -5040,11 +5040,16 @@ def test_nsmallest_nlargest(self): for s in s_list: assert_series_equal(s.nsmallest(2), s.iloc[[2, 1]]) - assert_series_equal(s.nsmallest(2, take_last=True), s.iloc[[2, 3]]) + + assert_series_equal(s.nsmallest(2, keep='last'), s.iloc[[2, 3]]) + with tm.assert_produces_warning(FutureWarning): + assert_series_equal(s.nsmallest(2, take_last=True), s.iloc[[2, 3]]) assert_series_equal(s.nlargest(3), s.iloc[[4, 0, 1]]) - assert_series_equal(s.nlargest(3, take_last=True), - s.iloc[[4, 0, 3]]) + + assert_series_equal(s.nlargest(3, keep='last'), s.iloc[[4, 0, 3]]) + with tm.assert_produces_warning(FutureWarning): + assert_series_equal(s.nlargest(3, take_last=True), s.iloc[[4, 0, 3]]) empty = s.iloc[0:0] assert_series_equal(s.nsmallest(0), empty) @@ -5062,6 +5067,12 @@ def test_nsmallest_nlargest(self): assert_series_equal(s.nlargest(), s.iloc[[4, 0, 3, 2]]) assert_series_equal(s.nsmallest(), s.iloc[[2, 3, 0, 4]]) + msg = 'keep must be either "first", "last"' + with tm.assertRaisesRegexp(ValueError, msg): + s.nsmallest(keep='invalid') + with tm.assertRaisesRegexp(ValueError, msg): + 
s.nlargest(keep='invalid') + def test_rank(self): tm._skip_if_no_scipy() from scipy.stats import rankdata