pandas 多級索引，高級知識了。
代碼
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt
# Walk-through of pandas hierarchical (multi-level) indexing: building a
# MultiIndex, selecting and slicing with it, reshaping between stacked and
# unstacked forms, and aggregating over a single index level.

# Fixed seed so the randomly generated demo data is the same on every run.
np.random.seed(0)

# Configure pandas display so printed objects stay compact.
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 10)

# pandas used to provide Panel and Panel4D objects for three- and
# four-dimensional data (Panel was removed in pandas 0.25).
# In practice a more intuitive pattern is hierarchical indexing (also called
# multi-indexing): several index levels are combined inside a single index,
# so higher-dimensional data can be represented in the familiar
# one-dimensional Series and two-dimensional DataFrame objects.

# --- The naive approach: plain Python tuples as index keys -----------------
index = [('California', 2000), ('California', 2010),
         ('New York', 2000), ('New York', 2010),
         ('Texas', 2000), ('Texas', 2010)]
# NOTE(review): the two California figures here (33871468, 37353956) differ
# from the dict literal further down (33871648, 37253956) — confirm which
# pair is intended. Left unchanged so the recorded output still matches.
population = [33871468, 37353956,
              18976457, 19378102,
              20851820, 25145561]
pop = pd.Series(population, index=index)
print(pop)
# Slicing between two tuple keys works on the tuple-based index ...
print(pop[('California', 2010):('Texas', 2000)])
# ... but picking every 2010 entry needs a hand-written Python-level filter.
print(pop[[i for i in pop.index if i[1] == 2010]])

# --- The better approach: a real MultiIndex ---------------------------------
index = pd.MultiIndex.from_tuples(index)
print(index)
pop = pop.reindex(index)
print(pop)
# With a MultiIndex the "all states, year 2010" selection is a plain slice.
print(pop[:, 2010])

# unstack() pivots the inner index level into columns; stack() reverses it.
pop_df = pop.unstack()
print(pop_df)
print(pop_df.stack())

# Additional columns share the same two-level row index.
pop_df = pd.DataFrame({'total': pop,
                       'under18': [9267089, 9284094,
                                   4687374, 4318033,
                                   5906301, 6879014]})
print(pop_df)
f_u18 = pop_df['under18'] / pop_df['total']
print(f_u18.unstack())

# Passing a list of index arrays to the constructor builds a MultiIndex.
df = pd.DataFrame(np.random.rand(4, 2),
                  index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                  columns=['data1', 'data2'])
print(df)

# A dict keyed by tuples also yields a multi-indexed Series.
data = {('California', 2000): 33871648,
        ('California', 2010): 37253956,
        ('Texas', 2000): 20851820,
        ('Texas', 2010): 25145561,
        ('New York', 2000): 18976457,
        ('New York', 2010): 19378102}
print(pd.Series(data))

# --- Explicit MultiIndex constructors ---------------------------------------
# All four calls below print the same index: (a, 1), (a, 2), (b, 1), (b, 2).
print(pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'], [1, 2, 1, 2]]))
print(pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b', 2)]))
print(pd.MultiIndex.from_product([['a', 'b'], [1, 2]]))
print(pd.MultiIndex(levels=[['a', 'b'], [1, 2]],
                    codes=[[0, 0, 1, 1], [0, 1, 0, 1]]))

# Name the index levels so they can be referred to by label later on.
pop.index.names = ['state', 'year']
print(pop)

# --- MultiIndex on the columns ----------------------------------------------
index = pd.MultiIndex.from_product([[2013, 2014], [1, 2]],
                                   names=['year', 'visit'])
columns = pd.MultiIndex.from_product([['Bob', 'Guido', 'Sue'], ['HR', 'Temp']],
                                     names=['subject', 'type'])
# Mock data: every other column (the 'HR' ones) is scaled by 10, then all
# values are shifted by 37.
data = np.round(np.random.randn(4, 6), 1)
data[:, ::2] *= 10
data += 37
health_data = pd.DataFrame(data, index=index, columns=columns)
print(health_data)
print(health_data['Guido'])

# --- Indexing and slicing a MultiIndex --------------------------------------
print(pop)
print(pop['California', 2000])           # single element: both levels given
print(pop['California'])                 # partial indexing on the outer level
print(pop.loc['California': 'New York'])  # label slice on the outer level
print(pop[:, 2000])                      # partial indexing on the inner level
print(pop[pop > 22000000])               # boolean mask
print(pop[['California', 'Texas']])      # list of outer-level labels
print(health_data['Guido', 'HR'])
print(health_data.iloc[:2, :2])
print(health_data.loc[:, ('Bob', 'HR')])
# Slices cannot be written inside a tuple index expression directly;
# pd.IndexSlice builds them for use with .loc.
idx = pd.IndexSlice
print(health_data.loc[idx[:, 1], idx[:, 'HR']])

# --- Rearranging multi-indexed data -----------------------------------------
# The index is created unsorted ('a', 'c', 'b') and sorted before slicing.
index = pd.MultiIndex.from_product([['a', 'c', 'b'], [1, 2]])
data = pd.Series(np.random.rand(6), index=index)
data.index.names = ['char', 'int']
data = data.sort_index()
print(data)
print(data['a': 'b'])
print(pop.unstack(level=0))
print(pop.unstack(level=1))
print(pop.unstack().stack())
# reset_index() turns the index levels into ordinary columns;
# set_index() rebuilds the MultiIndex from those columns.
pop_flat = pop.reset_index(name='population')
print(pop_flat)
print(pop_flat.set_index(['state', 'year']))

# --- Aggregating over one level of a MultiIndex -----------------------------
print(health_data)
# Reductions no longer accept ``level=`` (deprecated in pandas 1.3, removed
# in 2.0); grouping by the level is the supported equivalent.
data_mean = health_data.groupby(level='year').mean()
print(data_mean)
# Same aggregation along the columns: transpose so the 'type' level becomes
# a row level, group by it, then transpose back.
type_mean = data_mean.T.groupby(level='type').mean().T
print(type_mean)
輸出
(California, 2000) 33871468
(California, 2010) 37353956
(New York, 2000) 18976457
(New York, 2010) 19378102
(Texas, 2000) 20851820
(Texas, 2010) 25145561
dtype: int64
(California, 2010) 37353956
(New York, 2000) 18976457
(New York, 2010) 19378102
(Texas, 2000) 20851820
dtype: int64
(California, 2010) 37353956
(New York, 2010) 19378102
(Texas, 2010) 25145561
dtype: int64
MultiIndex([('California', 2000),
('California', 2010),
( 'New York', 2000),
( 'New York', 2010),
( 'Texas', 2000),
( 'Texas', 2010)],
)
California 2000 33871468
2010 37353956
New York 2000 18976457
2010 19378102
Texas 2000 20851820
2010 25145561
dtype: int64
California 37353956
New York 19378102
Texas 25145561
dtype: int64
2000 2010
California 33871468 37353956
New York 18976457 19378102
Texas 20851820 25145561
California 2000 33871468
2010 37353956
New York 2000 18976457
2010 19378102
Texas 2000 20851820
2010 25145561
dtype: int64
total under18
California 2000 33871468 9267089
2010 37353956 9284094
New York 2000 18976457 4687374
2010 19378102 4318033
Texas 2000 20851820 5906301
2010 25145561 6879014
2000 2010
California 0.273596 0.248544
New York 0.247010 0.222831
Texas 0.283251 0.273568
data1 data2
a 1 0.548814 0.715189
2 0.602763 0.544883
b 1 0.423655 0.645894
2 0.437587 0.891773
California 2000 33871648
2010 37253956
Texas 2000 20851820
2010 25145561
New York 2000 18976457
2010 19378102
dtype: int64
MultiIndex([('a', 1),
('a', 2),
('b', 1),
('b', 2)],
)
MultiIndex([('a', 1),
('a', 2),
('b', 1),
('b', 2)],
)
MultiIndex([('a', 1),
('a', 2),
('b', 1),
('b', 2)],
)
MultiIndex([('a', 1),
('a', 2),
('b', 1),
('b', 2)],
)
state year
California 2000 33871468
2010 37353956
New York 2000 18976457
2010 19378102
Texas 2000 20851820
2010 25145561
dtype: int64
subject Bob Guido Sue
type HR Temp HR Temp HR Temp
year visit
2013 1 36.0 37.4 38.0 38.5 45.0 37.1
2 41.0 37.3 52.0 36.8 40.0 36.1
2014 1 11.0 37.7 46.0 36.3 60.0 35.5
2 37.0 36.8 52.0 38.5 39.0 37.4
type HR Temp
year visit
2013 1 38.0 38.5
2 52.0 36.8
2014 1 46.0 36.3
2 52.0 38.5
state year
California 2000 33871468
2010 37353956
New York 2000 18976457
2010 19378102
Texas 2000 20851820
2010 25145561
dtype: int64
33871468
year
2000 33871468
2010 37353956
dtype: int64
state year
California 2000 33871468
2010 37353956
New York 2000 18976457
2010 19378102
dtype: int64
state
California 33871468
New York 18976457
Texas 20851820
dtype: int64
state year
California 2000 33871468
2010 37353956
Texas 2010 25145561
dtype: int64
state year
California 2000 33871468
2010 37353956
Texas 2000 20851820
2010 25145561
dtype: int64
year visit
2013 1 38.0
2 52.0
2014 1 46.0
2 52.0
Name: (Guido, HR), dtype: float64
subject Bob
type HR Temp
year visit
2013 1 36.0 37.4
2 41.0 37.3
year visit
2013 1 36.0
2 41.0
2014 1 11.0
2 37.0
Name: (Bob, HR), dtype: float64
subject Bob Guido Sue
type HR HR HR
year visit
2013 1 36.0 38.0 45.0
2014 1 11.0 46.0 60.0
char int
a 1 0.359508
2 0.437032
b 1 0.666767
2 0.670638
c 1 0.697631
2 0.060225
dtype: float64
char int
a 1 0.359508
2 0.437032
b 1 0.666767
2 0.670638
dtype: float64
state California New York Texas
year
2000 33871468 18976457 20851820
2010 37353956 19378102 25145561
year 2000 2010
state
California 33871468 37353956
New York 18976457 19378102
Texas 20851820 25145561
state year
California 2000 33871468
2010 37353956
New York 2000 18976457
2010 19378102
Texas 2000 20851820
2010 25145561
dtype: int64
state year population
0 California 2000 33871468
1 California 2010 37353956
2 New York 2000 18976457
3 New York 2010 19378102
4 Texas 2000 20851820
5 Texas 2010 25145561
population
state year
California 2000 33871468
2010 37353956
New York 2000 18976457
2010 19378102
Texas 2000 20851820
2010 25145561
subject Bob Guido Sue
type HR Temp HR Temp HR Temp
year visit
2013 1 36.0 37.4 38.0 38.5 45.0 37.1
2 41.0 37.3 52.0 36.8 40.0 36.1
2014 1 11.0 37.7 46.0 36.3 60.0 35.5
2 37.0 36.8 52.0 38.5 39.0 37.4
subject Bob Guido Sue
type HR Temp HR Temp HR Temp
year
2013 38.5 37.35 45.0 37.65 42.5 36.60
2014 24.0 37.25 49.0 37.40 49.5 36.45
type HR Temp
year
2013 42.000000 37.200000
2014 40.833333 37.033333
Process finished with exit code 0