本章內(nèi)容
pandas中的分組聚合
一菩帝、分組
import pandas as pd
import numpy as np
df=pd.DataFrame({
'name':['BOSS','Lilei','Lilei','Han','BOSS','BOSS','Han','BOSS'],
'Year':[2016,2016,2016,2016,2017,2017,2017,2017],
'Salary':[999999,20000,25000,3000,9999999,999999,3500,999999],
'Bonus':[100000,20000,20000,5000,200000,300000,3000,400000]
})
print(df)
# 根據(jù)name這一列進(jìn)行分組
group_by_name=df.groupby('name')
print(type(group_by_name))
# 查看分組
print(group_by_name.groups)
# 分組后的數(shù)量
print(group_by_name.count())
# 查看分組的情況
for name,group in group_by_name:
print(name) # 組的名字
print(group)# 組的數(shù)據(jù)
# 按照某一列分組,將name這一列作為分組的鍵尝哆,對(duì)year進(jìn)行分組
group_by_name = df['Year'].groupby(df['name'])
print(group_by_name.count())
# 按照多列進(jìn)行分組
group_by_name_year=df.groupby(['name','Year'])
for name,group in group_by_name_year:
print(name)
print(group)
# 可以選擇分組
print(group_by_name_year.get_group(('BOSS',2016)))
'''
name Year Salary Bonus
0 BOSS 2016 999999 100000
1 Lilei 2016 20000 20000
2 Lilei 2016 25000 20000
3 Han 2016 3000 5000
4 BOSS 2017 9999999 200000
5 BOSS 2017 999999 300000
6 Han 2017 3500 3000
7 BOSS 2017 999999 400000
<class 'pandas.core.groupby.generic.DataFrameGroupBy'>
{'BOSS': Int64Index([0, 4, 5, 7], dtype='int64'), 'Han': Int64Index([3, 6], dtype='int64'), 'Lilei': Int64Index([1, 2], dtype='int64')}
Year Salary Bonus
name
BOSS 4 4 4
Han 2 2 2
Lilei 2 2 2
BOSS
name Year Salary Bonus
0 BOSS 2016 999999 100000
4 BOSS 2017 9999999 200000
5 BOSS 2017 999999 300000
7 BOSS 2017 999999 400000
Han
name Year Salary Bonus
3 Han 2016 3000 5000
6 Han 2017 3500 3000
Lilei
name Year Salary Bonus
1 Lilei 2016 20000 20000
2 Lilei 2016 25000 20000
name
BOSS 4
Han 2
Lilei 2
Name: Year, dtype: int64
('BOSS', 2016)
name Year Salary Bonus
0 BOSS 2016 999999 100000
('BOSS', 2017)
name Year Salary Bonus
4 BOSS 2017 9999999 200000
5 BOSS 2017 999999 300000
7 BOSS 2017 999999 400000
('Han', 2016)
name Year Salary Bonus
3 Han 2016 3000 5000
('Han', 2017)
name Year Salary Bonus
6 Han 2017 3500 3000
('Lilei', 2016)
name Year Salary Bonus
1 Lilei 2016 20000 20000
2 Lilei 2016 25000 20000
name Year Salary Bonus
0 BOSS 2016 999999 100000
'''
二、聚合
聚合的函數(shù)
mean 計(jì)算分組平均值
count 分組中非NA值的數(shù)量
sum 非NA值的和
median 非NA值的算術(shù)中位數(shù)
std 標(biāo)準(zhǔn)差
var 方差
min 非NA值的最小值
max 非NA值的最大值
prod 非NA值的積
first 第一個(gè)非NA值
last 最后一個(gè)非NA值
mad 平均絕對(duì)偏差
mode 模
abs 絕對(duì)值
sem 平均值的標(biāo)準(zhǔn)誤差
skew 樣品偏斜度(三階矩)
kurt 樣品峰度(四階矩)
quantile 樣本分位數(shù)(百分位上的值)
cumsum 累積總和
cumprod 累積乘積
cummax 累積最大值
cum 累積最小值
示例:
df1=pd.DataFrame({'Data1':np.random.randint(0,10,5),
'Data2':np.random.randint(10,20,5),
'key1':list('aabba'),
'key2':list('xyyxy')})
print(df1)
'''
Data1 Data2 key1 key2
0 5 16 a x
1 5 11 a y
2 9 13 b y
3 4 13 b x
4 3 16 a y
'''
# 按key1分組檐束,進(jìn)行聚合計(jì)算
# 注意:當(dāng)分組后進(jìn)行數(shù)值計(jì)算時(shí)误算,不是數(shù)值類(lèi)的列(即麻煩列)會(huì)被清除
print(df1.groupby('key1').sum())
'''
Data1 Data2
key1
a 9 47
b 6 30
'''
# 只算data1
print(df1['Data1'].groupby(df1['key1']).sum())
'''
key1
a 13
b 5
Name: Data1, dtype: int32
'''
print(df1.groupby('key1')['Data1'].sum())
'''
key1
a 14
b 12
Name: Data1, dtype: int32
'''
print(df1.groupby('key1')['Data1'].mean())
'''
key1
a 3.0
b 3.5
Name: Data1, dtype: float64
'''
# 使用agg()函數(shù)做聚合運(yùn)算
print(df1.groupby('key1').agg('sum'))
# 可以同時(shí)做多個(gè)聚合運(yùn)算
print(df1.groupby('key1').agg(['sum','mean','std']))
'''
Data1 Data2
key1
a 9 48
b 7 30
Data1 Data2
sum mean std sum mean std
key1
a 9 3.0 4.358899 48 16 3.000000
b 7 3.5 4.949747 30 15 4.242641
'''
# 可自定義函數(shù),傳入agg方法中 grouped.agg(func)
def peak_range(df):
"""
返回?cái)?shù)值范圍
"""
return df.max() - df.min()
print(df1.groupby('key1').agg(peak_range))
'''
Data1 Data2
key1
a 8 6
b 7 6
此次:df為
Data1 Data2 key1 key2
0 8 13 a x
1 0 19 a y
2 7 12 b y
3 0 18 b x
4 1 16 a y
'''
#同時(shí)應(yīng)眵個(gè)聚合函數(shù)
print(df1.groupby('key1').agg(['mean','std','count',peak range])) #默認(rèn)列名為函數(shù)名
print(df1.groupby('key1').agg(['mean','std','count',('range', peak_range)])) #通過(guò)元組提供新的列名
拓展apply()函數(shù)
df1=pd.DataFrame({'sex':list('FFMFMMF'),'smoker':list('YNYYNYY'),'age':[21,30,17,37,40,18,26],'weight':[120,100,132,140,94,89,123]})
print(df1)
'''
sex smoker age weight
0 F Y 21 120
1 F N 30 100
2 M Y 17 132
3 F Y 37 140
4 M N 40 94
5 M Y 18 89
6 F Y 26 123
'''
def bin_age(age):
if age >=18:
return 1
else:
return 0
# 抽煙的年齡大于等18的
print(df1['age'].apply(bin_age))
'''
0 1
1 1
2 0
3 1
4 1
5 1
6 1
Name: age, dtype: int64
'''
df1['age'] = df1['age'].apply(bin_age)
print(df1)
'''
sex smoker age weight
0 F Y 1 120
1 F N 1 100
2 M Y 0 132
3 F Y 1 140
4 M N 1 94
5 M Y 1 89
6 F Y 1 123
'''
# 取出抽煙和不抽煙的體重前二
def top(smoker,col,n=5):
return smoker.sort_values(by=col)[-n:]
df1.groupby('smoker').apply(top,col='weight',n=2)
'''
sex smoker age weight
smoker
N 4 M N 1 94
1 F N 1 100
Y 2 M Y 0 132
3 F Y 1 140
'''