import numpy as np
from pandas import DataFrame
# Sample frame: two string grouping keys plus two columns of random data
raw = {
    'key1': ['a', 'a', 'b', 'b', 'a'],
    'key2': ['one', 'two', 'one', 'two', 'one'],
    'data1': np.random.randn(5),
    'data2': np.random.randn(5),
}
df = DataFrame(raw)
df
data1 data2 key1 key2
0 0.867596 0.739047 a one
1 -0.271038 -0.621842 a two
2 1.809116 -0.479957 b one
3 -1.204141 -0.176988 b two
4 0.008490 2.401128 a one
# To compute the mean of data1 grouped by key1: select the data1 column, then call groupby with key1
grouped=df['data1'].groupby(df['key1'])
grouped.mean()
key1
a 0.201683
b 0.302487
Name: data1, dtype: float64
means=df['data1'].groupby([df['key1'],df['key2']]).mean()
means
Out[8]:
key1 key2
a one 0.438043
two -0.271038
b one 1.809116
two -1.204141
Name: data1, dtype: float64
#通过两个键对数据进行分组,得到的Series具有一个层次化索引
means.unstack()
Out[9]:
key2 one two
key1
a 0.438043 -0.271038
b 1.809116 -1.204141
states=np.array(['Ohio','California','California','Ohio','Ohio'])
years=np.array([2005,2005,2006,2005,2006])
df['data1'].groupby([states,years]).mean()
Out[12]:
California 2005 -0.271038
2006 1.809116
Ohio 2005 -0.168272
2006 0.008490
Name: data1, dtype: float64
df.groupby('key1').mean()
Out[13]:
data1 data2
key1
a 0.201683 0.839444
b 0.302487 -0.328473
df.groupby(['key1','key2']).mean()
Out[14]:
data1 data2
key1 key2
a one 0.438043 1.570087
two -0.271038 -0.621842
b one 1.809116 -0.479957
two -1.204141 -0.176988
df.groupby(['key1','key2']).size()#size()返回一个有分组大小的列
Out[15]:
key1 key2
a one 2
two 1
b one 1
two 1
dtype: int64
对分组进行迭代:产生一组二元元组,由分组名和数据块组成
# Iterating over a GroupBy yields (group name, sub-DataFrame) pairs.
# Fixed: Python 2 print statements (a syntax error in Python 3) and the
# loop-body indentation that the transcript lost.
for name, group in df.groupby('key1'):
    print(name)
    print(group)
a
data1 data2 key1 key2
0 0.867596 0.739047 a one
1 -0.271038 -0.621842 a two
4 0.008490 2.401128 a one
b
data1 data2 key1 key2
2 1.809116 -0.479957 b one
3 -1.204141 -0.176988 b two
对于多重键的情况,元组的第一个元素将会是由键值组成的元组:
# With multiple keys, the first element of each pair is a tuple of key values.
# Fixed: Python 2 print statements and lost loop-body indentation.
for (k1, k2), group in df.groupby(['key1', 'key2']):
    print(k1, k2)
    print(group)
a one
data1 data2 key1 key2
0 0.867596 0.739047 a one
4 0.008490 2.401128 a one
a two
data1 data2 key1 key2
1 -0.271038 -0.621842 a two
b one
data1 data2 key1 key2
2 1.809116 -0.479957 b one
b two
data1 data2 key1 key2
3 -1.204141 -0.176988 b two
pieces=dict(list(df.groupby('key1')))
pieces['b']
Out[19]:
data1 data2 key1 key2
2 1.809116 -0.479957 b one
3 -1.204141 -0.176988 b two
groupby默认是在axis=0上进行分组的,通过设置也可在任何轴上进行分组,根据dtype对列进行分组:
df.dtypes
Out[20]:
data1 float64
data2 float64
key1 object
key2 object
dtype: object
grouped=df.groupby(df.dtypes,axis=1)
dict(list(grouped))
Out[22]:
{dtype('float64'): data1 data2
0 0.867596 0.739047
1 -0.271038 -0.621842
2 1.809116 -0.479957
3 -1.204141 -0.176988
4 0.008490 2.401128, dtype('O'): key1 key2
0 a one
1 a two
2 b one
3 b two
4 a one}
df.groupby(['key1','key2'])[['data2']].mean()
Out[26]:
data2
key1 key2
a one 1.570087
two -0.621842
b one -0.479957
two -0.176988
这种索引操作所返回的对象是一个已分组的DataFrame或已分组的Series
s_grouped=df.groupby(['key1','key2'])['data2']
s_grouped
Out[28]: <pandas.core.groupby.SeriesGroupBy object at 0x095467F0>
s_grouped.mean()
Out[29]:
key1 key2
a one 1.570087
two -0.621842
b one -0.479957
two -0.176988
Name: data2, dtype: float64
通过字典或Series进行分组:
# 5x5 frame of random data indexed by name
people=DataFrame(np.random.randn(5,5),
                 columns=['a','b','c','d','e'],
                 index=['Joe','Steve','Wes','Jim','Travis'])
# .ix was deprecated and removed from pandas; use .loc with explicit labels.
# Positional slice 2:3 selects the single row 'Wes'.
people.loc[people.index[2:3],['b','c']]=np.nan  # add a few NA values
people
Out[32]:
a b c d e
Joe -1.349415 -0.034864 -0.041473 0.316972 -1.077930
Steve -0.928486 1.348135 0.648762 -1.265573 -1.798529
Wes -0.221656 NaN NaN 0.844571 0.249980
Jim -0.048006 -0.207574 -0.465525 -0.888653 1.646979
Travis -1.190065 0.113572 0.680029 -1.015694 1.728276
# Reconstructed: the transcript dropped the definition of `mapping`
# (its contents are visible in the map_series output later in the session)
mapping = {'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'blue', 'e': 'red', 'f': 'orange'}
# axis=1 groups the COLUMNS by the mapping; the default axis=0 groups the rows
by_column = people.groupby(mapping, axis=1)
by_column.sum()
Out[35]:
blue red
Joe 0.275499 -2.462209
Steve -0.616811 -1.378880
Wes 0.844571 0.028324
Jim -1.354177 1.391399
Travis -0.335665 0.651783
from pandas import Series
map_series=Series(mapping)
map_series
Out[39]:
a red
b red
c blue
d blue
e red
f orange
dtype: object
people.groupby(map_series,axis=1).count()
Out[40]:
blue red
Joe 2 3
Steve 2 3
Wes 1 2
Jim 2 3
Travis 2 3
通过函数进行分组:
索引值为人名,根据人名的长度进行分组,用len函数即可。
people.groupby(len).sum()
Out[41]:
a b c d e
3 -1.619077 -0.242438 -0.506997 0.272891 0.819028
5 -0.928486 1.348135 0.648762 -1.265573 -1.798529
6 -1.190065 0.113572 0.680029 -1.015694 1.728276
将函数与数组、列表、字典、Series混用:
people.groupby([len,key_list]).min()
Out[43]:
a b c d e
3 one -1.349415 -0.034864 -0.041473 0.316972 -1.077930
two -0.048006 -0.207574 -0.465525 -0.888653 1.646979
5 one -0.928486 1.348135 0.648762 -1.265573 -1.798529
6 two -1.190065 0.113572 0.680029 -1.015694 1.728276
columns=pd.MultiIndex.from_arrays([['US','US','US','JP','JP'],[1,3,5,1,3]],
names=['city','tenor'])
hier_df=DataFrame(np.random.randn(4,5),columns=columns)
hier_df
Out[48]:
city US JP
tenor 1 3 5 1 3
0 -0.905468 1.703159 0.239218 1.090464 -0.677526
1 -1.152784 -0.610877 1.476141 0.270902 0.118338
2 0.535665 -0.551337 -1.060250 0.890812 0.923537
3 -0.204244 -0.249316 0.886220 -1.238991 -0.793501
根据索引级别分组:层次化索引数据集最方便的地方在于它能够根据索引级别进行聚合。通过level关键字传入级别编号或名称即可
hier_df.groupby(level='city',axis=1).count()
Out[49]:
city JP US
0 2 3
1 2 3
2 2 3
3 2 3
数据聚合:
df
Out[50]:
data1 data2 key1 key2
0 0.867596 0.739047 a one
1 -0.271038 -0.621842 a two
2 1.809116 -0.479957 b one
3 -1.204141 -0.176988 b two
4 0.008490 2.401128 a one
grouped=df.groupby('key1')
grouped['data1'].quantile(0.9)#quantile计算Series或者dataframe样本分位数
Out[52]:
key1
a 0.695775
b 1.507790
Name: data1, dtype: float64
def peak_to_peak(arr):
    """Return the spread (maximum minus minimum) of the values in *arr*."""
    highest = arr.max()
    lowest = arr.min()
    return highest - lowest
grouped.agg(peak_to_peak)
Out[54]:
data1 data2
key1
a 1.138635 3.022970
b 3.013257 0.302969
tips=pd.read_csv('e:/tips.csv')
tips['tip_pct']=tips['tip']/tips['total_bill']
tips[:6]
Out[57]:
total_bill tip sex smoker day time size tip_pct
0 16.99 1.01 Female No Sun Dinner 2 0.059447
1 10.34 1.66 Male No Sun Dinner 3 0.160542
2 21.01 3.50 Male No Sun Dinner 3 0.166587
3 23.68 3.31 Male No Sun Dinner 2 0.139780
4 24.59 3.61 Female No Sun Dinner 4 0.146808
5 25.29 4.71 Male No Sun Dinner 4 0.186240
面向列的多函数应用:根据sex和smoker对tips进行分组
grouped=tips.groupby(['sex','smoker'])
grouped_pct=grouped['tip_pct']
grouped_pct.agg('mean')
Out[60]:
sex smoker
Female No 0.156921
Yes 0.182150
Male No 0.160669
Yes 0.152771
Name: tip_pct, dtype: float64
如果传入一组函数或函数名,得到的DataFrame的列就会以相应的函数命名:
grouped_pct.agg(['mean','std',peak_to_peak])
Out[61]:
mean std peak_to_peak
sex smoker
Female No 0.156921 0.036421 0.195876
Yes 0.182150 0.071595 0.360233
Male No 0.160669 0.041849 0.220186
Yes 0.152771 0.090588 0.674707
如果传入的是一个由(name,function)元组组成的列表,各元组的第一个元素就会被用作DataFrame的列名
grouped_pct.agg([('foo','mean'),('bar','std')])
Out[62]:
foo bar
sex smoker
Female No 0.156921 0.036421
Yes 0.182150 0.071595
Male No 0.160669 0.041849
Yes 0.152771 0.090588
functions=['count','mean','max']
result=grouped['tip_pct','total_bill'].agg(functions)
result
Out[65]:
tip_pct total_bill
count mean max count mean max
sex smoker
Female No 54 0.156921 0.252672 54 18.105185 35.83
Yes 33 0.182150 0.416667 33 17.977879 44.30
Male No 97 0.160669 0.291990 97 19.791237 48.33
Yes 60 0.152771 0.710345 60 22.284500 50.81
result['tip_pct']
Out[66]:
count mean max
sex smoker
Female No 54 0.156921 0.252672
Yes 33 0.182150 0.416667
Male No 97 0.160669 0.291990
Yes 60 0.152771 0.710345
传入带有自定义名称的元组列表:
ftuples=[('Durchschnitt','mean'),('Abweichung','var')]
grouped['tip_pct','total_bill'].agg(ftuples)
Out[68]:
tip_pct total_bill
Durchschnitt Abweichung Durchschnitt Abweichung
sex smoker
Female No 0.156921 0.001327 18.105185 53.092422
Yes 0.182150 0.005126 17.977879 84.451517
Male No 0.160669 0.001751 19.791237 76.152961
Yes 0.152771 0.008206 22.284500 98.244673
想对不同的列应用不同的函数,向agg传入一个从列名映射到函数的字典:
grouped.agg({'tip':np.max,'size':'sum'})
Out[69]:
tip size
sex smoker
Female No 5.2 140
Yes 6.5 74
Male No 9.0 263
Yes 10.0 150
grouped.agg({'tip_pct':['min','max','mean','std'],'size':'sum'})
Out[70]:
tip_pct size
min max mean std sum
sex smoker
Female No 0.056797 0.252672 0.156921 0.036421 140
Yes 0.056433 0.416667 0.182150 0.071595 74
Male No 0.071804 0.291990 0.160669 0.041849 263
Yes 0.035638 0.710345 0.152771 0.090588 150
向groupby传入as_index=False可以禁用把分组键作为结果索引的行为
分组级运算及转换
想为一个DataFrame添加一个用于存放各索引分组平均值的列,先聚合再合并:
k1_means=df.groupby('key1').mean().add_prefix('mean_')
k1_means
Out[73]:
mean_data1 mean_data2
key1
a 0.201683 0.839444
b 0.302487 -0.328473
pd.merge(df,k1_means,left_on='key1',right_index=True)
Out[74]:
data1 data2 key1 key2 mean_data1 mean_data2
0 0.867596 0.739047 a one 0.201683 0.839444
1 -0.271038 -0.621842 a two 0.201683 0.839444
4 0.008490 2.401128 a one 0.201683 0.839444
2 1.809116 -0.479957 b one 0.302487 -0.328473
3 -1.204141 -0.176988 b two 0.302487 -0.328473
在groupby上使用transform方法:transform会将一个函数应用于各个分组,并将结果返回到适当位置上
people.groupby(key).mean()
Out[76]:
a b c d e
one -0.920379 0.039354 0.319278 0.048616 0.300109
two -0.488246 0.570281 0.091618 -1.077113 -0.075775
people.groupby(key).transform(np.mean)
Out[77]:
a b c d e
Joe -0.920379 0.039354 0.319278 0.048616 0.300109
Steve -0.488246 0.570281 0.091618 -1.077113 -0.075775
Wes -0.920379 0.039354 0.319278 0.048616 0.300109
Jim -0.488246 0.570281 0.091618 -1.077113 -0.075775
Travis -0.920379 0.039354 0.319278 0.048616 0.300109
创建一个距平化函数,传递给transform:
def demean(arr):
    """Center *arr* at zero by subtracting its mean from every value."""
    center = arr.mean()
    return arr - center
demeaned=people.groupby(key).transform(demean)
demeaned
Out[80]:
a b c d e
Joe -0.429036 -0.074218 -0.360751 0.268356 -1.378039
Steve -0.440240 0.777855 0.557143 -0.188460 -1.722754
Wes 0.698722 NaN NaN 0.795955 -0.050129
Jim 0.440240 -0.777855 -0.557143 0.188460 1.722754
Travis -0.269686 0.074218 0.360751 -1.064310 1.428167
demeaned.groupby(key).mean()
Out[81]:
a b c d e
one 7.401487e-17 0.000000e+00 2.775558e-17 -7.401487e-17 7.401487e-17
two 2.775558e-17 -5.551115e-17 0.000000e+00 5.551115e-17 0.000000e+00
apply函数:拆分-应用-合并。根绝分组选出最高的5个tip_pct值,编写一个选取指定列具有最大值的行函数:
def top(df, n=5, column='tip_pct'):
    """Return the n rows of *df* with the largest values in *column*, ascending.

    Uses tail(n) instead of the original [-n:] slice: [-0:] would return
    the ENTIRE frame, whereas tail(0) correctly returns an empty frame.
    """
    return df.sort_values(by=column).tail(n)
top(tips,n=6)
Out[84]:
total_bill tip sex smoker day time size tip_pct
109 14.31 4.00 Female Yes Sat Dinner 2 0.279525
183 23.17 6.50 Male Yes Sun Dinner 4 0.280535
232 11.61 3.39 Male No Sat Dinner 2 0.291990
67 3.07 1.00 Female Yes Sat Dinner 1 0.325733
178 9.60 4.00 Female Yes Sun Dinner 2 0.416667
172 7.25 5.15 Male Yes Sun Dinner 2 0.710345
对smoker分组并用该函数调用apply:
tips.groupby('smoker').apply(top)
Out[85]:
total_bill tip sex smoker day time size tip_pct
smoker
No 88 24.71 5.85 Male No Thur Lunch 2 0.236746
185 20.69 5.00 Male No Sun Dinner 5 0.241663
51 10.29 2.60 Female No Sun Dinner 2 0.252672
149 7.51 2.00 Male No Thur Lunch 2 0.266312
232 11.61 3.39 Male No Sat Dinner 2 0.291990
Yes 109 14.31 4.00 Female Yes Sat Dinner 2 0.279525
183 23.17 6.50 Male Yes Sun Dinner 4 0.280535
67 3.07 1.00 Female Yes Sat Dinner 1 0.325733
178 9.60 4.00 Female Yes Sun Dinner 2 0.416667
172 7.25 5.15 Male Yes Sun Dinner 2 0.710345
tips.groupby(['smoker','day']).apply(top,n=1,column='total_bill')
Out[86]:
total_bill tip sex smoker day time size \
smoker day
No Fri 94 22.75 3.25 Female No Fri Dinner 2
Sat 212 48.33 9.00 Male No Sat Dinner 4
Sun 156 48.17 5.00 Male No Sun Dinner 6
Thur 142 41.19 5.00 Male No Thur Lunch 5
Yes Fri 95 40.17 4.73 Male Yes Fri Dinner 4
Sat 170 50.81 10.00 Male Yes Sat Dinner 3
Sun 182 45.35 3.50 Male Yes Sun Dinner 3
Thur 197 43.11 5.00 Female Yes Thur Lunch 4
tip_pct
smoker day
No Fri 94 0.142857
Sat 212 0.186220
Sun 156 0.103799
Thur 142 0.121389
Yes Fri 95 0.117750
Sat 170 0.196812
Sun 182 0.077178
Thur 197 0.115982
result=tips.groupby('smoker')['tip_pct'].describe()
result
Out[88]:
smoker
No count 151.000000
mean 0.159328
std 0.039910
min 0.056797
25% 0.136906
50% 0.155625
75% 0.185014
max 0.291990
Yes count 93.000000
mean 0.163196
std 0.085119
min 0.035638
25% 0.106771
50% 0.153846
75% 0.195059
max 0.710345
Name: tip_pct, dtype: float64
result.unstack('smoker')
Out[89]:
smoker No Yes
count 151.000000 93.000000
mean 0.159328 0.163196
std 0.039910 0.085119
min 0.056797 0.035638
25% 0.136906 0.106771
50% 0.155625 0.153846
75% 0.185014 0.195059
max 0.291990 0.710345
分位数和桶分析:
frame=DataFrame({'data1':np.random.randn(1000),'data2':np.random.randn(1000)})
factor=pd.cut(frame.data1,4)
factor[:10]
Out[92]:
0 (-1.307, 0.37]
1 (0.37, 2.0461]
2 (-1.307, 0.37]
3 (0.37, 2.0461]
4 (-1.307, 0.37]
5 (-1.307, 0.37]
6 (-1.307, 0.37]
7 (0.37, 2.0461]
8 (-1.307, 0.37]
9 (-1.307, 0.37]
Name: data1, dtype: category
Categories (4, object): [(-2.99, -1.307] < (-1.307, 0.37] < (0.37, 2.0461] < (2.0461, 3.723]]
# The factor object returned by cut can be passed straight to groupby.
# Fixed: the def was fused onto the prose line and its body indentation lost.
def get_stats(group):
    """Return min, max, count and mean of *group* as a dict."""
    return {'min':group.min(),'max':group.max(),'count':group.count(),'mean':group.mean()}
grouped=frame.data2.groupby(factor)
grouped.apply(get_stats).unstack()
Out[95]:
count max mean min
data1
(-2.99, -1.307] 101.0 2.730091 -0.206504 -2.922759
(-1.307, 0.37] 531.0 2.773289 -0.034422 -2.892153
(0.37, 2.0461] 344.0 3.078922 0.046008 -3.127290
(2.0461, 3.723] 24.0 1.671815 -0.020857 -3.019921
grouping=pd.qcut(frame.data1,10,labels=False)
grouped=frame.data2.groupby(grouping)
grouped.apply(get_stats).unstack()
Out[98]:
count max mean min
data1
0 100.0 2.730091 -0.212662 -2.922759
1 100.0 2.230289 -0.076260 -2.521450
2 100.0 2.404481 0.051184 -2.369298
3 100.0 2.773289 0.016575 -2.284056
4 100.0 2.328424 -0.043627 -2.892153
5 100.0 1.996065 -0.104378 -2.032999
6 100.0 2.166334 0.015241 -2.291139
7 100.0 2.687426 -0.057435 -2.409512
8 100.0 2.883604 0.267017 -3.127290
9 100.0 3.078922 -0.093742 -3.019921
#返回分位数编号
grouping=pd.qcut(frame.data1,10,labels=False)
grouped=frame.data2.groupby(grouping)
grouped.apply(get_stats).unstack()
s=Series(np.random.randn(6))#第0、2、4为nan
s[::2]=np.nan
s
a=Series(np.random.randn(9))#第0、3、6为nan
a[::3]=np.nan
a
a.dropna()
s.fillna(s.mean())
缺失值填充:
states=['Ohio','New York','Vermont','Florida','Oregon','Nevada','California','Idaho']
group_key=['East']*4+['West']*4
data=Series(np.random.randn(8),index=states)
data[['Vermont','Nevada','Idaho']]=np.nan
data
Out[117]:
Ohio 0.326110
New York 0.136178
Vermont NaN
Florida 0.392480
Oregon -0.177571
Nevada NaN
California 0.061360
Idaho NaN
dtype: float64
data.groupby(group_key).mean()
Out[118]:
East 0.284923
West -0.058105
dtype: float64
用分组平均值去填充NA
fill_mean=lambda g:g.fillna(g.mean())
data.groupby(group_key).apply(fill_mean)
Out[120]:
Ohio 0.326110
New York 0.136178
Vermont 0.284923
Florida 0.392480
Oregon -0.177571
Nevada -0.058105
California 0.061360
Idaho -0.058105
dtype: float64
fill_values={'East':0.5,'West':-1}
fill_func=lambda g:g.fillna(fill_values[g.name])
data.groupby(group_key).apply(fill_func)
Out[123]:
Ohio 0.326110
New York 0.136178
Vermont 0.500000
Florida 0.392480
Oregon -0.177571
Nevada -1.000000
California 0.061360
Idaho -1.000000
dtype: float64
随机抽牌
# Build a 52-card deck as a Series: value indexed by card name.
# Reconstructed the setup lines (suits, card_val, base_names, cards) that
# the transcript dropped; the loop body indentation was also restored.
suits = ['H', 'S', 'C', 'D']
card_val = (list(range(1, 11)) + [10] * 3) * 4  # A=1 .. 10, face cards worth 10
base_names = ['A'] + list(range(2, 11)) + ['J', 'K', 'Q']
cards = []
for suit in suits:
    cards.extend(str(num) + suit for num in base_names)
# extend() appends every element of its iterable argument to the list
deck = Series(card_val, index=cards)  # length-52 Series indexed by card name
deck[:20]
#从整副牌中抽取5张
def draw(deck, n=5):
    """Randomly draw *n* rows from *deck* without replacement."""
    shuffled = np.random.permutation(len(deck))
    return deck.take(shuffled[:n])
draw(deck)
# Draw two cards from each suit; the suit is the card name's last character, so group with apply
get_suit=lambda card:card[-1]# keep only the last letter (the suit)
deck.groupby(get_suit).apply(draw,n=2)
# group_keys=False drops the suit level from the result's index
deck.groupby(get_suit,group_keys=False).apply(draw,n=2)
分组加权平均数和相关系数
grouped=df.groupby('category')
get_wavg=lambda g:np.average(g['data'],weights=g['weights'])
grouped.apply(get_wavg)
Out[146]:
category
a 0.858361
b -0.098089
dtype: float64