pandas Data Aggregation and Grouping
Code
import numpy as np
import pandas as pd
import time
import matplotlib as mpl
import matplotlib.pyplot as plt
# plt.style.use('classic')
plt.style.use('seaborn-whitegrid')  # note: on matplotlib >= 3.6 this style is named 'seaborn-v0_8-whitegrid'
np.random.seed(0)
# Configure pandas display options
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 10)
# Data summarization: computing aggregation metrics such as sum(), mean(),
# median(), min(), and max(), each of which condenses a large dataset
# into a single descriptive value.
rng = np.random.RandomState(42)
ser = pd.Series(rng.rand(5))
print(ser)
print(ser.sum())
print(ser.mean())
df = pd.DataFrame({'A': rng.rand(5),
                   'B': rng.rand(5)})
print(df)
print(df.mean())
print(df.mean(axis='columns'))
print(df.describe())
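# Aside (not part of the original listing; output omitted below): median(),
# min(), and max() mentioned above follow the same per-column convention as
# mean(), returning one value per column.
col_medians = df.median()         # Series with one median per column
col_ranges = df.max() - df.min()  # per-column value range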
# Although the name "group by" is borrowed from a command in the SQL database
# language, the idea is perhaps better understood in the terms coined by
# Hadley Wickham of R fame: split, apply, and combine.
# The value of GroupBy is that it abstracts away these steps:
# the user does not need to know how the computation is done under the hood,
# only to think of the operation as a single whole.
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],
                   'data': range(6)},
                  columns=['key', 'data'])
print(df)
print(df.groupby('key'))
print(df.groupby('key').sum())
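# Aside (not part of the original listing; output omitted below): a GroupBy
# object also supports column selection before aggregation, which restricts
# the computation to a single column and returns a Series indexed by key.
key_totals = df.groupby('key')['data'].sum()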
rng = np.random.RandomState(0)
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],
                   'data1': range(6),
                   'data2': rng.randint(0, 10, 6)},
                  columns=['key', 'data1', 'data2'])
print(df)
print(df.groupby('key').aggregate(['min', np.median, max]))
print(df.groupby('key').aggregate({'data1': 'min',
                                   'data2': 'max'}))
def filter_func(x):
    return x['data2'].std() > 4
print(df.groupby('key').std())
print(df.groupby('key').filter(filter_func))
print(df.groupby('key').transform(lambda x: x - x.mean()))
def norm_by_data(x):
    x['data1'] /= x['data2'].sum()
    return x
print(df.groupby('key').apply(norm_by_data))
L = [0, 1, 0, 1, 2, 0]
print(df.groupby(L).sum())
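# Aside (not part of the original listing; output omitted below): besides a
# list, groupby() also accepts a dict or Series mapping index values to group
# labels; 'parity' below is a hypothetical mapping used only for illustration.
parity = {0: 'even', 1: 'odd', 2: 'even', 3: 'odd', 4: 'even', 5: 'odd'}
parity_sums = df.groupby(parity)[['data1', 'data2']].sum()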
Output
0 0.374540
1 0.950714
2 0.731994
3 0.598658
4 0.156019
dtype: float64
2.811925491708157
0.5623850983416314
A B
0 0.155995 0.020584
1 0.058084 0.969910
2 0.866176 0.832443
3 0.601115 0.212339
4 0.708073 0.181825
A 0.477888
B 0.443420
dtype: float64
0 0.088290
1 0.513997
2 0.849309
3 0.406727
4 0.444949
dtype: float64
A B
count 5.000000 5.000000
mean 0.477888 0.443420
std 0.353125 0.426952
min 0.058084 0.020584
25% 0.155995 0.181825
50% 0.601115 0.212339
75% 0.708073 0.832443
max 0.866176 0.969910
key data
0 A 0
1 B 1
2 C 2
3 A 3
4 B 4
5 C 5
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x11edbb6d8>
data
key
A 3
B 5
C 7
key data1 data2
0 A 0 5
1 B 1 0
2 C 2 3
3 A 3 3
4 B 4 7
5 C 5 9
data1 data2
min median max min median max
key
A 0 1.5 3 3 4.0 5
B 1 2.5 4 0 3.5 7
C 2 3.5 5 3 6.0 9
data1 data2
key
A 0 5
B 1 7
C 2 9
data1 data2
key
A 2.12132 1.414214
B 2.12132 4.949747
C 2.12132 4.242641
key data1 data2
1 B 1 0
2 C 2 3
4 B 4 7
5 C 5 9
data1 data2
0 -1.5 1.0
1 -1.5 -3.5
2 -1.5 -3.0
3 1.5 -1.0
4 1.5 3.5
5 1.5 3.0
key data1 data2
0 A 0.000000 5
1 B 0.142857 0
2 C 0.166667 3
3 A 0.375000 3
4 B 0.571429 7
5 C 0.416667 9
data1 data2
0 7 17
1 4 3
2 4 7