Accept multiple lambda in groupby list #26430

TomAugspurger · 2019-05-16T19:41:15Z

We currently don't allow duplicate function names in the list passed to .groupby().agg({'col': [aggfuncs]}). This is painful with multiple lambdas, which all have the name <lambda>.

In [1]: import pandas as pd

In [2]: df = pd.DataFrame({"A": ['a', 'a'], 'B': [1, 2], 'C': [3, 4]})

In [3]: df.groupby("A").agg({'B': [lambda x: 0, lambda x: 1]})

---------------------------------------------------------------------------
SpecificationError                        Traceback (most recent call last)
~/sandbox/pandas/pandas/core/base.py in _aggregate(self, arg, *args, **kwargs)
    483                 try:
--> 484                     result = _agg(arg, _agg_1dim)
    485                 except SpecificationError:

~/sandbox/pandas/pandas/core/base.py in _agg(arg, func)
    434                 for fname, agg_how in arg.items():
--> 435                     result[fname] = func(fname, agg_how)
    436                 return result

~/sandbox/pandas/pandas/core/base.py in _agg_1dim(name, how, subset)
    417                                              "in aggregation")
--> 418                 return colg.aggregate(how, _level=(_level or 0) + 1)
    419

~/sandbox/pandas/pandas/core/groupby/generic.py in aggregate(self, func_or_funcs, *args, **kwargs)
    771             ret = self._aggregate_multiple_funcs(func_or_funcs,
--> 772                                                  (_level or 0) + 1)
    773         else:

~/sandbox/pandas/pandas/core/groupby/generic.py in _aggregate_multiple_funcs(self, arg, _level)
    834                     'Function names must be unique, found multiple named '
--> 835                     '{}'.format(name))
    836

SpecificationError: Function names must be unique, found multiple named <lambda>

During handling of the above exception, another exception occurred:

SpecificationError                        Traceback (most recent call last)
<ipython-input-3-2aa02bdc2edd> in <module>
----> 1 df.groupby("A").agg({'B': [lambda x: 0, lambda x: 1]})

~/sandbox/pandas/pandas/core/groupby/generic.py in aggregate(self, arg, *args, **kwargs)
   1344     @Appender(_shared_docs['aggregate'])
   1345     def aggregate(self, arg=None, *args, **kwargs):
-> 1346         return super().aggregate(arg, *args, **kwargs)
   1347
   1348     agg = aggregate

~/sandbox/pandas/pandas/core/groupby/generic.py in aggregate(self, func, *args, **kwargs)
    174                             "'(column, aggfunc).")
    175
--> 176         result, how = self._aggregate(func, _level=_level, *args, **kwargs)
    177         if how is None:
    178             return result

~/sandbox/pandas/pandas/core/base.py in _aggregate(self, arg, *args, **kwargs)
    487                     # we are aggregating expecting all 1d-returns
    488                     # but we have 2d
--> 489                     result = _agg(arg, _agg_2dim)
    490
    491             # combine results

~/sandbox/pandas/pandas/core/base.py in _agg(arg, func)
    433                 result = OrderedDict()
    434                 for fname, agg_how in arg.items():
--> 435                     result[fname] = func(fname, agg_how)
    436                 return result
    437

~/sandbox/pandas/pandas/core/base.py in _agg_2dim(name, how)
    424                 colg = self._gotitem(self._selection, ndim=2,
    425                                      subset=obj)
--> 426                 return colg.aggregate(how, _level=None)
    427
    428             def _agg(arg, func):

~/sandbox/pandas/pandas/core/groupby/generic.py in aggregate(self, arg, *args, **kwargs)
   1344     @Appender(_shared_docs['aggregate'])
   1345     def aggregate(self, arg=None, *args, **kwargs):
-> 1346         return super().aggregate(arg, *args, **kwargs)
   1347
   1348     agg = aggregate

~/sandbox/pandas/pandas/core/groupby/generic.py in aggregate(self, func, *args, **kwargs)
    174                             "'(column, aggfunc).")
    175
--> 176         result, how = self._aggregate(func, _level=_level, *args, **kwargs)
    177         if how is None:
    178             return result

~/sandbox/pandas/pandas/core/base.py in _aggregate(self, arg, *args, **kwargs)
    542             return self._aggregate_multiple_funcs(arg,
    543                                                   _level=_level,
--> 544                                                   _axis=_axis), None
    545         else:
    546             result = None

~/sandbox/pandas/pandas/core/base.py in _aggregate_multiple_funcs(self, arg, _level, _axis)
    588                     colg = self._gotitem(col, ndim=1,
    589                                          subset=obj.iloc[:, index])
--> 590                     results.append(colg.aggregate(arg))
    591                     keys.append(col)
    592                 except (TypeError, DataError):

~/sandbox/pandas/pandas/core/groupby/generic.py in aggregate(self, func_or_funcs, *args, **kwargs)
    770             # but not the class list / tuple itself.
    771             ret = self._aggregate_multiple_funcs(func_or_funcs,
--> 772                                                  (_level or 0) + 1)
    773         else:
    774             cyfunc = self._is_cython_func(func_or_funcs)

~/sandbox/pandas/pandas/core/groupby/generic.py in _aggregate_multiple_funcs(self, arg, _level)
    833                 raise SpecificationError(
    834                     'Function names must be unique, found multiple named '
--> 835                     '{}'.format(name))
    836
    837             # reset the cache so that we

SpecificationError: Function names must be unique, found multiple named <lambda>

I propose that we mangle the names somehow, so that the example above would produce:

In [2]: df = pd.DataFrame({"A": ['a', 'a'], 'B': [1, 2], 'C': [3, 4]})

In [3]: df.groupby("A").agg({'B': [lambda x: 0, lambda x: 1]})
Out[3]:
         B
  <lambda> <lambda 1>
A
a        0          1

That appends 1, 2, ... to the names of all subsequent lambdas in the same MultiIndex (MI) level. It doesn't change the name of the first one. Do we want <lambda 0> for the first?

As a side effect, this enables multiple lambdas per column with the new keyword aggregation:

In [4]: df.groupby("A").agg(b=('B', lambda x: 0), c=('B', lambda x: 1))
Out[4]:
   b  c
A
a  0  0

I have a WIP started. Will do for 0.25.

The text was updated successfully, but these errors were encountered:

TomAugspurger · 2019-08-21T11:40:20Z

There’s an open PR fixing that.

…

On Aug 21, 2019, at 05:17, Florian Wetschoreck ***@***.***> wrote:

Thank you for adding this @TomAugspurger

Everything works great for me except for example [4]:

df.groupby("A").agg(b=('B', lambda x: 0), c=('B', lambda x: 1))

Which results in the following error:

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-149-3ff87fe40b34> in <module>
      1 df = pd.DataFrame({"A": ['a', 'a'], 'B': [1, 2], 'C': [3, 4]})
      2 #df.groupby("A").agg({'B': [lambda x: 0, lambda x: 1]})
----> 3 df.groupby("A").agg(b=('B', lambda x: 0), c=('B', lambda x: 1))
      4 #df.groupby("A").agg(b=('B', lambda x: 0), c=('C', lambda x: 1))

/usr/local/lib/python3.7/site-packages/pandas/core/groupby/generic.py in aggregate(self, arg, *args, **kwargs)
   1453     @Appender(_shared_docs["aggregate"])
   1454     def aggregate(self, arg=None, *args, **kwargs):
-> 1455         return super().aggregate(arg, *args, **kwargs)
   1456
   1457     agg = aggregate

/usr/local/lib/python3.7/site-packages/pandas/core/groupby/generic.py in aggregate(self, func, *args, **kwargs)
    262
    263         if relabeling:
--> 264             result = result[order]
    265             result.columns = columns
    266

/usr/local/lib/python3.7/site-packages/pandas/core/frame.py in __getitem__(self, key)
   2979         if is_iterator(key):
   2980             key = list(key)
-> 2981         indexer = self.loc._convert_to_indexer(key, axis=1, raise_missing=True)
   2982
   2983         # take() does not accept boolean indexers

/usr/local/lib/python3.7/site-packages/pandas/core/indexing.py in _convert_to_indexer(self, obj, axis, is_setter, raise_missing)
   1269                 # When setting, missing keys are not allowed, even with .loc:
   1270                 kwargs = {"raise_missing": True if is_setter else raise_missing}
-> 1271                 return self._get_listlike_indexer(obj, axis, **kwargs)[1]
   1272         else:
   1273             try:

/usr/local/lib/python3.7/site-packages/pandas/core/indexing.py in _get_listlike_indexer(self, key, axis, raise_missing)
   1076
   1077         self._validate_read_indexer(
-> 1078             keyarr, indexer, o._get_axis_number(axis), raise_missing=raise_missing
   1079         )
   1080         return keyarr, indexer

/usr/local/lib/python3.7/site-packages/pandas/core/indexing.py in _validate_read_indexer(self, key, indexer, axis, raise_missing)
   1161                 raise KeyError(
   1162                     "None of [{key}] are in the [{axis}]".format(
-> 1163                         key=key, axis=self.obj._get_axis_name(axis)
   1164                     )
   1165                 )

KeyError: "None of [MultiIndex([('B', '<lambda>'),\n ('B', '<lambda>')],\n )] are in the [columns]"

The new named approach only works for different columns for me. E.g. when I change column B to C in the second aggregation:

df.groupby("A").agg(b=('B', lambda x: 0), c=('C', lambda x: 1))

— You are receiving this because you were mentioned. Reply to this email directly, view it on GitHub, or mute the thread.

TomAugspurger added this to the 0.25.0 milestone May 16, 2019

TomAugspurger added API Design Groupby labels May 16, 2019

TomAugspurger mentioned this issue May 16, 2019

ENH: Support nested renaming / selection #26399

Merged

2 tasks

TomAugspurger mentioned this issue Jun 17, 2019

Allow multiple lambdas in Groupby.aggregate #26905

Merged

jreback closed this as completed in #26905 Jun 27, 2019

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Accept multiple lambda in groupby list #26430

Accept multiple lambda in groupby list #26430

TomAugspurger commented May 16, 2019

TomAugspurger commented Aug 21, 2019 via email

Accept multiple lambda in groupby list #26430

Accept multiple lambda in groupby list #26430

Comments

TomAugspurger commented May 16, 2019

TomAugspurger commented Aug 21, 2019 via email