Skip to content

Commit 66174e5

Browse files
committed
Merge pull request #10920 from sinhrks/nlargest
DEPR: Series.nlargest/nsmallest take_last.
2 parents 91f7e42 + 73fc1dd commit 66174e5

File tree

7 files changed

+99
-39
lines changed

7 files changed

+99
-39
lines changed

doc/source/whatsnew/v0.17.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -645,6 +645,7 @@ Deprecations
645645

646646
- ``Categorical.name`` was deprecated to make ``Categorical`` more ``numpy.ndarray`` like. Use ``Series(cat, name="whatever")`` instead (:issue:`10482`).
647647
- ``drop_duplicates`` and ``duplicated``'s ``take_last`` keyword was deprecated in favor of ``keep``. (:issue:`6511`, :issue:`8505`)
648+
- ``Series.nsmallest`` and ``nlargest``'s ``take_last`` keyword was deprecated in favor of ``keep``. (:issue:`10792`)
648649
- ``DataFrame.combineAdd`` and ``DataFrame.combineMult`` are deprecated. They
649650
can easily be replaced by using the ``add`` and ``mul`` methods:
650651
``DataFrame.add(other, fill_value=0)`` and ``DataFrame.mul(other, fill_value=1.)``

pandas/core/algorithms.py

+18-15
Original file line numberDiff line numberDiff line change
@@ -453,24 +453,24 @@ def group_position(*args):
453453
_dtype_map = {'datetime64[ns]': 'int64', 'timedelta64[ns]': 'int64'}
454454

455455

456-
def _finalize_nsmallest(arr, kth_val, n, take_last, narr):
456+
def _finalize_nsmallest(arr, kth_val, n, keep, narr):
457457
ns, = np.nonzero(arr <= kth_val)
458458
inds = ns[arr[ns].argsort(kind='mergesort')][:n]
459-
460-
if take_last:
459+
if keep == 'last':
461460
# reverse indices
462461
return narr - 1 - inds
463-
return inds
462+
else:
463+
return inds
464464

465465

466-
def nsmallest(arr, n, take_last=False):
466+
def nsmallest(arr, n, keep='first'):
467467
'''
468468
Find the indices of the n smallest values of a numpy array.
469469
470470
Note: Fails silently with NaN.
471471
472472
'''
473-
if take_last:
473+
if keep == 'last':
474474
arr = arr[::-1]
475475

476476
narr = len(arr)
@@ -480,22 +480,22 @@ def nsmallest(arr, n, take_last=False):
480480
arr = arr.view(_dtype_map.get(sdtype, sdtype))
481481

482482
kth_val = algos.kth_smallest(arr.copy(), n - 1)
483-
return _finalize_nsmallest(arr, kth_val, n, take_last, narr)
483+
return _finalize_nsmallest(arr, kth_val, n, keep, narr)
484484

485485

486-
def nlargest(arr, n, take_last=False):
486+
def nlargest(arr, n, keep='first'):
487487
"""
488488
Find the indices of the n largest values of a numpy array.
489489
490490
Note: Fails silently with NaN.
491491
"""
492492
sdtype = str(arr.dtype)
493493
arr = arr.view(_dtype_map.get(sdtype, sdtype))
494-
return nsmallest(-arr, n, take_last=take_last)
494+
return nsmallest(-arr, n, keep=keep)
495495

496496

497-
def select_n_slow(dropped, n, take_last, method):
498-
reverse_it = take_last or method == 'nlargest'
497+
def select_n_slow(dropped, n, keep, method):
498+
reverse_it = (keep == 'last' or method == 'nlargest')
499499
ascending = method == 'nsmallest'
500500
slc = np.s_[::-1] if reverse_it else np.s_[:]
501501
return dropped[slc].sort_values(ascending=ascending).head(n)
@@ -504,13 +504,13 @@ def select_n_slow(dropped, n, take_last, method):
504504
_select_methods = {'nsmallest': nsmallest, 'nlargest': nlargest}
505505

506506

507-
def select_n(series, n, take_last, method):
507+
def select_n(series, n, keep, method):
508508
"""Implement n largest/smallest.
509509
510510
Parameters
511511
----------
512512
n : int
513-
take_last : bool
513+
keep : {'first', 'last'}, default 'first'
514514
method : str, {'nlargest', 'nsmallest'}
515515
516516
Returns
@@ -522,15 +522,18 @@ def select_n(series, n, take_last, method):
522522
np.timedelta64)):
523523
raise TypeError("Cannot use method %r with dtype %s" % (method, dtype))
524524

525+
if keep not in ('first', 'last'):
526+
raise ValueError('keep must be either "first", "last"')
527+
525528
if n <= 0:
526529
return series[[]]
527530

528531
dropped = series.dropna()
529532

530533
if n >= len(series):
531-
return select_n_slow(dropped, n, take_last, method)
534+
return select_n_slow(dropped, n, keep, method)
532535

533-
inds = _select_methods[method](dropped.values, n, take_last)
536+
inds = _select_methods[method](dropped.values, n, keep)
534537
return dropped.iloc[inds]
535538

536539

pandas/core/frame.py

+14-10
Original file line numberDiff line numberDiff line change
@@ -3169,16 +3169,16 @@ def sortlevel(self, level=0, axis=0, ascending=True,
31693169
inplace=inplace, sort_remaining=sort_remaining)
31703170

31713171

3172-
def _nsorted(self, columns, n, method, take_last):
3172+
def _nsorted(self, columns, n, method, keep):
31733173
if not com.is_list_like(columns):
31743174
columns = [columns]
31753175
columns = list(columns)
3176-
ser = getattr(self[columns[0]], method)(n, take_last=take_last)
3176+
ser = getattr(self[columns[0]], method)(n, keep=keep)
31773177
ascending = dict(nlargest=False, nsmallest=True)[method]
31783178
return self.loc[ser.index].sort_values(columns, ascending=ascending,
31793179
kind='mergesort')
31803180

3181-
def nlargest(self, n, columns, take_last=False):
3181+
def nlargest(self, n, columns, keep='first'):
31823182
"""Get the rows of a DataFrame sorted by the `n` largest
31833183
values of `columns`.
31843184
@@ -3190,8 +3190,10 @@ def nlargest(self, n, columns, take_last=False):
31903190
Number of items to retrieve
31913191
columns : list or str
31923192
Column name or names to order by
3193-
take_last : bool, optional
3194-
Where there are duplicate values, take the last duplicate
3193+
keep : {'first', 'last'}, default 'first'
3194+
Where there are duplicate values:
3195+
- ``first`` : take the first occurrence.
3196+
- ``last`` : take the last occurrence.
31953197
31963198
Returns
31973199
-------
@@ -3208,9 +3210,9 @@ def nlargest(self, n, columns, take_last=False):
32083210
1 10 b 2
32093211
2 8 d NaN
32103212
"""
3211-
return self._nsorted(columns, n, 'nlargest', take_last)
3213+
return self._nsorted(columns, n, 'nlargest', keep)
32123214

3213-
def nsmallest(self, n, columns, take_last=False):
3215+
def nsmallest(self, n, columns, keep='first'):
32143216
"""Get the rows of a DataFrame sorted by the `n` smallest
32153217
values of `columns`.
32163218
@@ -3222,8 +3224,10 @@ def nsmallest(self, n, columns, take_last=False):
32223224
Number of items to retrieve
32233225
columns : list or str
32243226
Column name or names to order by
3225-
take_last : bool, optional
3226-
Where there are duplicate values, take the last duplicate
3227+
keep : {'first', 'last'}, default 'first'
3228+
Where there are duplicate values:
3229+
- ``first`` : take the first occurrence.
3230+
- ``last`` : take the last occurrence.
32273231
32283232
Returns
32293233
-------
@@ -3240,7 +3244,7 @@ def nsmallest(self, n, columns, take_last=False):
32403244
0 1 a 1
32413245
2 8 d NaN
32423246
"""
3243-
return self._nsorted(columns, n, 'nsmallest', take_last)
3247+
return self._nsorted(columns, n, 'nsmallest', keep)
32443248

32453249
def swaplevel(self, i, j, axis=0):
32463250
"""

pandas/core/groupby.py

+16-2
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@
1919
from pandas.core.internals import BlockManager, make_block
2020
from pandas.core.series import Series
2121
from pandas.core.panel import Panel
22-
from pandas.util.decorators import cache_readonly, Appender, make_signature
22+
from pandas.util.decorators import (cache_readonly, Appender, make_signature,
23+
deprecate_kwarg)
2324
import pandas.core.algorithms as algos
2425
import pandas.core.common as com
2526
from pandas.core.common import(_possibly_downcast_to_dtype, isnull,
@@ -82,7 +83,7 @@
8283

8384
_series_apply_whitelist = \
8485
(_common_apply_whitelist - set(['boxplot'])) | \
85-
frozenset(['dtype', 'unique', 'nlargest', 'nsmallest'])
86+
frozenset(['dtype', 'unique'])
8687

8788
_dataframe_apply_whitelist = \
8889
_common_apply_whitelist | frozenset(['dtypes', 'corrwith'])
@@ -2585,6 +2586,19 @@ def nunique(self, dropna=True):
25852586
index=self.grouper.result_index,
25862587
name=self.name)
25872588

2589+
@deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
2590+
@Appender(Series.nlargest.__doc__)
2591+
def nlargest(self, n=5, keep='first'):
2592+
# ToDo: When we remove deprecate_kwarg, we can remove these methods
2593+
# and include nlargest and nsmallest in _series_apply_whitelist
2594+
return self.apply(lambda x: x.nlargest(n=n, keep=keep))
2595+
2596+
2597+
@deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
2598+
@Appender(Series.nsmallest.__doc__)
2599+
def nsmallest(self, n=5, keep='first'):
2600+
return self.apply(lambda x: x.nsmallest(n=n, keep=keep))
2601+
25882602
def value_counts(self, normalize=False, sort=True, ascending=False,
25892603
bins=None, dropna=True):
25902604

pandas/core/series.py

+16-8
Original file line numberDiff line numberDiff line change
@@ -1817,15 +1817,19 @@ def rank(self, method='average', na_option='keep', ascending=True,
18171817
ascending=ascending, pct=pct)
18181818
return self._constructor(ranks, index=self.index).__finalize__(self)
18191819

1820-
def nlargest(self, n=5, take_last=False):
1820+
@deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
1821+
def nlargest(self, n=5, keep='first'):
18211822
"""Return the largest `n` elements.
18221823
18231824
Parameters
18241825
----------
18251826
n : int
18261827
Return this many descending sorted values
1827-
take_last : bool
1828-
Where there are duplicate values, take the last duplicate
1828+
keep : {'first', 'last'}, default 'first'
1829+
Where there are duplicate values:
1830+
- ``first`` : take the first occurrence.
1831+
- ``last`` : take the last occurrence.
1832+
take_last : deprecated
18291833
18301834
Returns
18311835
-------
@@ -1848,17 +1852,21 @@ def nlargest(self, n=5, take_last=False):
18481852
>>> s = pd.Series(np.random.randn(1e6))
18491853
>>> s.nlargest(10) # only sorts up to the N requested
18501854
"""
1851-
return select_n(self, n=n, take_last=take_last, method='nlargest')
1855+
return select_n(self, n=n, keep=keep, method='nlargest')
18521856

1853-
def nsmallest(self, n=5, take_last=False):
1857+
@deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
1858+
def nsmallest(self, n=5, keep='first'):
18541859
"""Return the smallest `n` elements.
18551860
18561861
Parameters
18571862
----------
18581863
n : int
18591864
Return this many ascending sorted values
1860-
take_last : bool
1861-
Where there are duplicate values, take the last duplicate
1865+
keep : {'first', 'last'}, default 'first'
1866+
Where there are duplicate values:
1867+
- ``first`` : take the first occurrence.
1868+
- ``last`` : take the last occurrence.
1869+
take_last : deprecated
18621870
18631871
Returns
18641872
-------
@@ -1881,7 +1889,7 @@ def nsmallest(self, n=5, take_last=False):
18811889
>>> s = pd.Series(np.random.randn(1e6))
18821890
>>> s.nsmallest(10) # only sorts up to the N requested
18831891
"""
1884-
return select_n(self, n=n, take_last=take_last, method='nsmallest')
1892+
return select_n(self, n=n, keep=keep, method='nsmallest')
18851893

18861894
def sortlevel(self, level=0, ascending=True, sort_remaining=True):
18871895
"""

pandas/tests/test_groupby.py

+20-1
Original file line numberDiff line numberDiff line change
@@ -4997,7 +4997,7 @@ def test_groupby_whitelist(self):
49974997
'corr', 'cov',
49984998
'diff',
49994999
'unique',
5000-
'nlargest', 'nsmallest',
5000+
# 'nlargest', 'nsmallest',
50015001
])
50025002

50035003
for obj, whitelist in zip((df, s),
@@ -5316,6 +5316,16 @@ def test_nlargest(self):
53165316
[3, 2, 1, 9, 5, 8]]))
53175317
tm.assert_series_equal(r, e)
53185318

5319+
5320+
a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0])
5321+
gb = a.groupby(b)
5322+
e = Series([3, 2, 1, 3, 3, 2],
5323+
index=MultiIndex.from_arrays([list('aaabbb'),
5324+
[2, 3, 1, 6, 5, 7]]))
5325+
assert_series_equal(gb.nlargest(3, keep='last'), e)
5326+
with tm.assert_produces_warning(FutureWarning):
5327+
assert_series_equal(gb.nlargest(3, take_last=True), e)
5328+
53195329
def test_nsmallest(self):
53205330
a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10])
53215331
b = Series(list('a' * 5 + 'b' * 5))
@@ -5326,6 +5336,15 @@ def test_nsmallest(self):
53265336
[0, 4, 1, 6, 7, 8]]))
53275337
tm.assert_series_equal(r, e)
53285338

5339+
a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0])
5340+
gb = a.groupby(b)
5341+
e = Series([0, 1, 1, 0, 1, 2],
5342+
index=MultiIndex.from_arrays([list('aaabbb'),
5343+
[4, 1, 0, 9, 8, 7]]))
5344+
assert_series_equal(gb.nsmallest(3, keep='last'), e)
5345+
with tm.assert_produces_warning(FutureWarning):
5346+
assert_series_equal(gb.nsmallest(3, take_last=True), e)
5347+
53295348
def test_transform_doesnt_clobber_ints(self):
53305349
# GH 7972
53315350
n = 6

pandas/tests/test_series.py

+14-3
Original file line numberDiff line numberDiff line change
@@ -5040,11 +5040,16 @@ def test_nsmallest_nlargest(self):
50405040
for s in s_list:
50415041

50425042
assert_series_equal(s.nsmallest(2), s.iloc[[2, 1]])
5043-
assert_series_equal(s.nsmallest(2, take_last=True), s.iloc[[2, 3]])
5043+
5044+
assert_series_equal(s.nsmallest(2, keep='last'), s.iloc[[2, 3]])
5045+
with tm.assert_produces_warning(FutureWarning):
5046+
assert_series_equal(s.nsmallest(2, take_last=True), s.iloc[[2, 3]])
50445047

50455048
assert_series_equal(s.nlargest(3), s.iloc[[4, 0, 1]])
5046-
assert_series_equal(s.nlargest(3, take_last=True),
5047-
s.iloc[[4, 0, 3]])
5049+
5050+
assert_series_equal(s.nlargest(3, keep='last'), s.iloc[[4, 0, 3]])
5051+
with tm.assert_produces_warning(FutureWarning):
5052+
assert_series_equal(s.nlargest(3, take_last=True), s.iloc[[4, 0, 3]])
50485053

50495054
empty = s.iloc[0:0]
50505055
assert_series_equal(s.nsmallest(0), empty)
@@ -5062,6 +5067,12 @@ def test_nsmallest_nlargest(self):
50625067
assert_series_equal(s.nlargest(), s.iloc[[4, 0, 3, 2]])
50635068
assert_series_equal(s.nsmallest(), s.iloc[[2, 3, 0, 4]])
50645069

5070+
msg = 'keep must be either "first", "last"'
5071+
with tm.assertRaisesRegexp(ValueError, msg):
5072+
s.nsmallest(keep='invalid')
5073+
with tm.assertRaisesRegexp(ValueError, msg):
5074+
s.nlargest(keep='invalid')
5075+
50655076
def test_rank(self):
50665077
tm._skip_if_no_scipy()
50675078
from scipy.stats import rankdata

0 commit comments

Comments
 (0)