Merge pull request #10920 from sinhrks/nlargest

jreback · jreback · commit 66174e559a02 · 2015-08-28T20:03:43.000-04:00
DEPR: Series.nlargest/nsmallest take_last.
diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt
@@ -645,6 +645,7 @@ Deprecations
 
 - ``Categorical.name`` was deprecated to make ``Categorical`` more ``numpy.ndarray`` like. Use ``Series(cat, name="whatever")`` instead (:issue:`10482`).
 - ``drop_duplicates`` and ``duplicated``'s ``take_last`` keyword was deprecated in favor of ``keep``. (:issue:`6511`, :issue:`8505`)
+- ``Series.nsmallest`` and ``nlargest``'s ``take_last`` keyword was deprecated in favor of ``keep``. (:issue:`10792`)
 - ``DataFrame.combineAdd`` and ``DataFrame.combineMult`` are deprecated. They
   can easily be replaced by using the ``add`` and ``mul`` methods:
   ``DataFrame.add(other, fill_value=0)`` and ``DataFrame.mul(other, fill_value=1.)``
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -453,24 +453,24 @@ def group_position(*args):
 _dtype_map = {'datetime64[ns]': 'int64', 'timedelta64[ns]': 'int64'}
 
 
-def _finalize_nsmallest(arr, kth_val, n, take_last, narr):
+def _finalize_nsmallest(arr, kth_val, n, keep, narr):
     ns, = np.nonzero(arr <= kth_val)
     inds = ns[arr[ns].argsort(kind='mergesort')][:n]
-
-    if take_last:
+    if keep == 'last':
         # reverse indices
         return narr - 1 - inds
-    return inds
+    else:
+        return inds
 
 
-def nsmallest(arr, n, take_last=False):
+def nsmallest(arr, n, keep='first'):
     '''
     Find the indices of the n smallest values of a numpy array.
 
     Note: Fails silently with NaN.
 
     '''
-    if take_last:
+    if keep == 'last':
         arr = arr[::-1]
 
     narr = len(arr)
@@ -480,22 +480,22 @@ def nsmallest(arr, n, take_last=False):
     arr = arr.view(_dtype_map.get(sdtype, sdtype))
 
     kth_val = algos.kth_smallest(arr.copy(), n - 1)
-    return _finalize_nsmallest(arr, kth_val, n, take_last, narr)
+    return _finalize_nsmallest(arr, kth_val, n, keep, narr)
 
 
-def nlargest(arr, n, take_last=False):
+def nlargest(arr, n, keep='first'):
     """
     Find the indices of the n largest values of a numpy array.
 
     Note: Fails silently with NaN.
     """
     sdtype = str(arr.dtype)
     arr = arr.view(_dtype_map.get(sdtype, sdtype))
-    return nsmallest(-arr, n, take_last=take_last)
+    return nsmallest(-arr, n, keep=keep)
 
 
-def select_n_slow(dropped, n, take_last, method):
-    reverse_it = take_last or method == 'nlargest'
+def select_n_slow(dropped, n, keep, method):
+    reverse_it = (keep == 'last' or method == 'nlargest')
     ascending = method == 'nsmallest'
     slc = np.s_[::-1] if reverse_it else np.s_[:]
     return dropped[slc].sort_values(ascending=ascending).head(n)
@@ -504,13 +504,13 @@ def select_n_slow(dropped, n, take_last, method):
 _select_methods = {'nsmallest': nsmallest, 'nlargest': nlargest}
 
 
-def select_n(series, n, take_last, method):
+def select_n(series, n, keep, method):
     """Implement n largest/smallest.
 
     Parameters
     ----------
     n : int
-    take_last : bool
+    keep : {'first', 'last'}, default 'first'
     method : str, {'nlargest', 'nsmallest'}
 
     Returns
@@ -522,15 +522,18 @@ def select_n(series, n, take_last, method):
                                    np.timedelta64)):
         raise TypeError("Cannot use method %r with dtype %s" % (method, dtype))
 
+    if keep not in ('first', 'last'):
+        raise ValueError('keep must be either "first", "last"')
+
     if n <= 0:
         return series[[]]
 
     dropped = series.dropna()
 
     if n >= len(series):
-        return select_n_slow(dropped, n, take_last, method)
+        return select_n_slow(dropped, n, keep, method)
 
-    inds = _select_methods[method](dropped.values, n, take_last)
+    inds = _select_methods[method](dropped.values, n, keep)
     return dropped.iloc[inds]
 
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -3169,16 +3169,16 @@ def sortlevel(self, level=0, axis=0, ascending=True,
                                inplace=inplace, sort_remaining=sort_remaining)
 
 
-    def _nsorted(self, columns, n, method, take_last):
+    def _nsorted(self, columns, n, method, keep):
         if not com.is_list_like(columns):
             columns = [columns]
         columns = list(columns)
-        ser = getattr(self[columns[0]], method)(n, take_last=take_last)
+        ser = getattr(self[columns[0]], method)(n, keep=keep)
         ascending = dict(nlargest=False, nsmallest=True)[method]
         return self.loc[ser.index].sort_values(columns, ascending=ascending,
                                                kind='mergesort')
 
-    def nlargest(self, n, columns, take_last=False):
+    def nlargest(self, n, columns, keep='first'):
         """Get the rows of a DataFrame sorted by the `n` largest
         values of `columns`.
 
@@ -3190,8 +3190,10 @@ def nlargest(self, n, columns, take_last=False):
             Number of items to retrieve
         columns : list or str
             Column name or names to order by
-        take_last : bool, optional
-            Where there are duplicate values, take the last duplicate
+        keep : {'first', 'last'}, default 'first'
+            Where there are duplicate values:
+            - ``first`` : take the first occurrence.
+            - ``last`` : take the last occurrence.
 
         Returns
         -------
@@ -3208,9 +3210,9 @@ def nlargest(self, n, columns, take_last=False):
         1  10  b   2
         2   8  d NaN
         """
-        return self._nsorted(columns, n, 'nlargest', take_last)
+        return self._nsorted(columns, n, 'nlargest', keep)
 
-    def nsmallest(self, n, columns, take_last=False):
+    def nsmallest(self, n, columns, keep='first'):
         """Get the rows of a DataFrame sorted by the `n` smallest
         values of `columns`.
 
@@ -3222,8 +3224,10 @@ def nsmallest(self, n, columns, take_last=False):
             Number of items to retrieve
         columns : list or str
             Column name or names to order by
-        take_last : bool, optional
-            Where there are duplicate values, take the last duplicate
+        keep : {'first', 'last'}, default 'first'
+            Where there are duplicate values:
+            - ``first`` : take the first occurrence.
+            - ``last`` : take the last occurrence.
 
         Returns
         -------
@@ -3240,7 +3244,7 @@ def nsmallest(self, n, columns, take_last=False):
         0  1  a   1
         2  8  d NaN
         """
-        return self._nsorted(columns, n, 'nsmallest', take_last)
+        return self._nsorted(columns, n, 'nsmallest', keep)
 
     def swaplevel(self, i, j, axis=0):
         """
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -19,7 +19,8 @@
 from pandas.core.internals import BlockManager, make_block
 from pandas.core.series import Series
 from pandas.core.panel import Panel
-from pandas.util.decorators import cache_readonly, Appender, make_signature
+from pandas.util.decorators import (cache_readonly, Appender, make_signature,
+                                    deprecate_kwarg)
 import pandas.core.algorithms as algos
 import pandas.core.common as com
 from pandas.core.common import(_possibly_downcast_to_dtype, isnull,
@@ -82,7 +83,7 @@
 
 _series_apply_whitelist = \
     (_common_apply_whitelist - set(['boxplot'])) | \
-    frozenset(['dtype', 'unique', 'nlargest', 'nsmallest'])
+    frozenset(['dtype', 'unique'])
 
 _dataframe_apply_whitelist = \
     _common_apply_whitelist | frozenset(['dtypes', 'corrwith'])
@@ -2585,6 +2586,19 @@ def nunique(self, dropna=True):
                       index=self.grouper.result_index,
                       name=self.name)
 
+    @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
+    @Appender(Series.nlargest.__doc__)
+    def nlargest(self, n=5, keep='first'):
+        # TODO: When we remove deprecate_kwarg, we can remove these methods
+        # and include nlargest and nsmallest in _series_apply_whitelist
+        return self.apply(lambda x: x.nlargest(n=n, keep=keep))
+
+
+    @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
+    @Appender(Series.nsmallest.__doc__)
+    def nsmallest(self, n=5, keep='first'):
+        return self.apply(lambda x: x.nsmallest(n=n, keep=keep))
+
     def value_counts(self, normalize=False, sort=True, ascending=False,
                      bins=None, dropna=True):
 
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -1817,15 +1817,19 @@ def rank(self, method='average', na_option='keep', ascending=True,
                      ascending=ascending, pct=pct)
         return self._constructor(ranks, index=self.index).__finalize__(self)
 
-    def nlargest(self, n=5, take_last=False):
+    @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
+    def nlargest(self, n=5, keep='first'):
         """Return the largest `n` elements.
 
         Parameters
         ----------
         n : int
             Return this many descending sorted values
-        take_last : bool
-            Where there are duplicate values, take the last duplicate
+        keep : {'first', 'last'}, default 'first'
+            Where there are duplicate values:
+            - ``first`` : take the first occurrence.
+            - ``last`` : take the last occurrence.
+        take_last : deprecated
 
         Returns
         -------
@@ -1848,17 +1852,21 @@ def nlargest(self, n=5, take_last=False):
         >>> s = pd.Series(np.random.randn(1e6))
         >>> s.nlargest(10)  # only sorts up to the N requested
         """
-        return select_n(self, n=n, take_last=take_last, method='nlargest')
+        return select_n(self, n=n, keep=keep, method='nlargest')
 
-    def nsmallest(self, n=5, take_last=False):
+    @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
+    def nsmallest(self, n=5, keep='first'):
         """Return the smallest `n` elements.
 
         Parameters
         ----------
         n : int
             Return this many ascending sorted values
-        take_last : bool
-            Where there are duplicate values, take the last duplicate
+        keep : {'first', 'last'}, default 'first'
+            Where there are duplicate values:
+            - ``first`` : take the first occurrence.
+            - ``last`` : take the last occurrence.
+        take_last : deprecated
 
         Returns
         -------
@@ -1881,7 +1889,7 @@ def nsmallest(self, n=5, take_last=False):
         >>> s = pd.Series(np.random.randn(1e6))
         >>> s.nsmallest(10)  # only sorts up to the N requested
         """
-        return select_n(self, n=n, take_last=take_last, method='nsmallest')
+        return select_n(self, n=n, keep=keep, method='nsmallest')
 
     def sortlevel(self, level=0, ascending=True, sort_remaining=True):
         """
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
@@ -4997,7 +4997,7 @@ def test_groupby_whitelist(self):
             'corr', 'cov',
             'diff',
             'unique',
-            'nlargest', 'nsmallest',
+            # 'nlargest', 'nsmallest',
         ])
 
         for obj, whitelist in zip((df, s),
@@ -5316,6 +5316,16 @@ def test_nlargest(self):
                                                  [3, 2, 1, 9, 5, 8]]))
         tm.assert_series_equal(r, e)
 
+
+        a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0])
+        gb = a.groupby(b)
+        e = Series([3, 2, 1, 3, 3, 2],
+                   index=MultiIndex.from_arrays([list('aaabbb'),
+                                                 [2, 3, 1, 6, 5, 7]]))
+        assert_series_equal(gb.nlargest(3, keep='last'), e)
+        with tm.assert_produces_warning(FutureWarning):
+            assert_series_equal(gb.nlargest(3, take_last=True), e)
+
     def test_nsmallest(self):
         a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10])
         b = Series(list('a' * 5 + 'b' * 5))
@@ -5326,6 +5336,15 @@ def test_nsmallest(self):
                                                  [0, 4, 1, 6, 7, 8]]))
         tm.assert_series_equal(r, e)
 
+        a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0])
+        gb = a.groupby(b)
+        e = Series([0, 1, 1, 0, 1, 2],
+                   index=MultiIndex.from_arrays([list('aaabbb'),
+                                                 [4, 1, 0, 9, 8, 7]]))
+        assert_series_equal(gb.nsmallest(3, keep='last'), e)
+        with tm.assert_produces_warning(FutureWarning):
+            assert_series_equal(gb.nsmallest(3, take_last=True), e)
+
     def test_transform_doesnt_clobber_ints(self):
         # GH 7972
         n = 6
diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py
@@ -5040,11 +5040,16 @@ def test_nsmallest_nlargest(self):
         for s in s_list:
 
             assert_series_equal(s.nsmallest(2), s.iloc[[2, 1]])
-            assert_series_equal(s.nsmallest(2, take_last=True), s.iloc[[2, 3]])
+
+            assert_series_equal(s.nsmallest(2, keep='last'), s.iloc[[2, 3]])
+            with tm.assert_produces_warning(FutureWarning):
+                assert_series_equal(s.nsmallest(2, take_last=True), s.iloc[[2, 3]])
 
             assert_series_equal(s.nlargest(3), s.iloc[[4, 0, 1]])
-            assert_series_equal(s.nlargest(3, take_last=True),
-                                s.iloc[[4, 0, 3]])
+
+            assert_series_equal(s.nlargest(3, keep='last'), s.iloc[[4, 0, 3]])
+            with tm.assert_produces_warning(FutureWarning):
+                assert_series_equal(s.nlargest(3, take_last=True), s.iloc[[4, 0, 3]])
 
             empty = s.iloc[0:0]
             assert_series_equal(s.nsmallest(0), empty)
@@ -5062,6 +5067,12 @@ def test_nsmallest_nlargest(self):
         assert_series_equal(s.nlargest(), s.iloc[[4, 0, 3, 2]])
         assert_series_equal(s.nsmallest(), s.iloc[[2, 3, 0, 4]])
 
+        msg = 'keep must be either "first", "last"'
+        with tm.assertRaisesRegexp(ValueError, msg):
+            s.nsmallest(keep='invalid')
+        with tm.assertRaisesRegexp(ValueError, msg):
+            s.nlargest(keep='invalid')
+
     def test_rank(self):
         tm._skip_if_no_scipy()
         from scipy.stats import rankdata