練習書5-《Python數據科學手冊》

pandas 多級索引，高級知識了。

代碼

import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt

np.random.seed(0)
# Limit how many rows/columns pandas prints so the output stays compact.
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 10)


# Older pandas releases offered Panel and Panel4D objects for 3-D and 4-D data.
# In practice a more intuitive approach is hierarchical indexing (also called
# multi-indexing): several index levels are combined inside a single index, so
# higher-dimensional data can be represented compactly in the familiar
# one-dimensional Series and two-dimensional DataFrame objects.

# First attempt: plain Python tuples as index labels.
index = [('California', 2000), ('California', 2010),
         ('New York', 2000), ('New York', 2010),
         ('Texas', 2000), ('Texas', 2010)]
# NOTE(review): the two California values here (33871468, 37353956) differ
# from the ones in the `data` dict further down (33871648, 37253956); one of
# the two is probably a transcription typo -- confirm against the data source.
population = [33871468, 37353956,
              18976457, 19378102,
              20851820, 25145561]
pop = pd.Series(population, index=index)
print(pop)
# Slicing between two tuple labels works on the tuple-labelled Series ...
print(pop[('California', 2010):('Texas', 2000)])
# ... but selecting "all entries for 2010" needs a manual Python-level filter.
print(pop[[i for i in pop.index if i[1] == 2010]])

# Better: build a real MultiIndex from the same tuples and re-index with it.
index = pd.MultiIndex.from_tuples(index)
print(index)
pop = pop.reindex(index)
print(pop)
# With a MultiIndex the year 2010 can be selected directly on the second level.
print(pop[:, 2010])
# unstack() pivots the inner level into columns; stack() reverses it.
pop_df = pop.unstack()
print(pop_df)
print(pop_df.stack())
# Each extra column of the DataFrame shares the same two-level row index.
pop_df = pd.DataFrame({'total': pop,
                       'under18': [9267089, 9284094,
                                   4687374, 4318033,
                                   5906301, 6879014]})
print(pop_df)
# Fraction of the population under 18, shown per state and year.
f_u18 = pop_df['under18'] / pop_df['total']
print(f_u18.unstack())

# Passing a list of index arrays to the constructor creates a MultiIndex.
df = pd.DataFrame(np.random.rand(4, 2),
                  index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                  columns=['data1', 'data2'])
print(df)

# A dict keyed by tuples is likewise converted to a MultiIndex Series.
data = {('California', 2000): 33871648,
        ('California', 2010): 37253956,
        ('Texas', 2000): 20851820,
        ('Texas', 2010): 25145561,
        ('New York', 2000): 18976457,
        ('New York', 2010): 19378102}
print(pd.Series(data))

# Explicit MultiIndex constructors: four ways to build the same index.
print(pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'], [1, 2, 1, 2]]))
print(pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b', 2)]))
print(pd.MultiIndex.from_product([['a', 'b'], [1, 2]]))
print(pd.MultiIndex(levels=[['a', 'b'], [1, 2]],
                    codes=[[0, 0, 1, 1], [0, 1, 0, 1]]))
# Name the levels so they can be referred to by name later on.
pop.index.names = ['state', 'year']
print(pop)

# MultiIndex for columns: hierarchical rows *and* hierarchical columns.
index = pd.MultiIndex.from_product([[2013, 2014], [1, 2]],
                                   names=['year', 'visit'])
columns = pd.MultiIndex.from_product([['Bob', 'Guido', 'Sue'], ['HR', 'Temp']],
                                     names=['subject', 'type'])

# Mock measurements: scale every other column (the 'HR' ones) by 10, then
# shift everything by 37.
data = np.round(np.random.randn(4, 6), 1)
data[:, ::2] *= 10
data += 37
health_data = pd.DataFrame(data, index=index, columns=columns)
print(health_data)
print(health_data['Guido'])

# Indexing and slicing a MultiIndex.
print(pop)
print(pop['California', 2000])           # single element
print(pop['California'])                 # partial indexing on the outer level
print(pop.loc['California': 'New York'])  # label slice (index is sorted)
print(pop[:, 2000])                      # partial indexing on the inner level
print(pop[pop > 22000000])               # boolean mask
print(pop[['California', 'Texas']])      # list of outer-level labels

print(health_data['Guido', 'HR'])
print(health_data.iloc[:2, :2])
print(health_data.loc[:, ('Bob', 'HR')])
# IndexSlice allows slice syntax inside the per-level tuples passed to .loc.
idx = pd.IndexSlice
print(health_data.loc[idx[:, 1], idx[:, 'HR']])

# Rearranging multi-indices: sorting, stacking/unstacking, resetting.
index = pd.MultiIndex.from_product([['a', 'c', 'b'], [1, 2]])
data = pd.Series(np.random.rand(6), index=index)
data.index.names = ['char', 'int']
# Label slicing requires a lexically sorted index, hence the sort first.
data = data.sort_index()
print(data)
print(data['a': 'b'])
print(pop.unstack(level=0))
print(pop.unstack(level=1))
print(pop.unstack().stack())
# reset_index() turns the index levels into ordinary columns;
# set_index() rebuilds the MultiIndex from those columns.
pop_flat = pop.reset_index(name='population')
print(pop_flat)
print(pop_flat.set_index(['state', 'year']))

# Data aggregation on multi-indices.
print(health_data)
# The `level=` argument of DataFrame.mean() was deprecated in pandas 1.3 and
# removed in 2.0, so aggregate per index level via groupby(level=...).
data_mean = health_data.groupby(level='year').mean()
print(data_mean)
# Aggregate over the 'type' column level: transpose so the column level
# becomes a row level, group on it, then transpose back. This avoids
# groupby(axis=1), which newer pandas versions deprecate.
type_mean = data_mean.T.groupby(level='type').mean().T
print(type_mean)

輸出

(California, 2000)    33871468
(California, 2010)    37353956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
(Texas, 2010)         25145561
dtype: int64
(California, 2010)    37353956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
dtype: int64
(California, 2010)    37353956
(New York, 2010)      19378102
(Texas, 2010)         25145561
dtype: int64
MultiIndex([('California', 2000),
            ('California', 2010),
            (  'New York', 2000),
            (  'New York', 2010),
            (     'Texas', 2000),
            (     'Texas', 2010)],
           )
California  2000    33871468
            2010    37353956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64
California    37353956
New York      19378102
Texas         25145561
dtype: int64
                2000      2010
California  33871468  37353956
New York    18976457  19378102
Texas       20851820  25145561
California  2000    33871468
            2010    37353956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64
                    total  under18
California 2000  33871468  9267089
           2010  37353956  9284094
New York   2000  18976457  4687374
           2010  19378102  4318033
Texas      2000  20851820  5906301
           2010  25145561  6879014
                2000      2010
California  0.273596  0.248544
New York    0.247010  0.222831
Texas       0.283251  0.273568
        data1     data2
a 1  0.548814  0.715189
  2  0.602763  0.544883
b 1  0.423655  0.645894
  2  0.437587  0.891773
California  2000    33871648
            2010    37253956
Texas       2000    20851820
            2010    25145561
New York    2000    18976457
            2010    19378102
dtype: int64
MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )
MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )
MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )
MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )
state       year
California  2000    33871468
            2010    37353956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64
subject      Bob       Guido         Sue      
type          HR  Temp    HR  Temp    HR  Temp
year visit                                    
2013 1      36.0  37.4  38.0  38.5  45.0  37.1
     2      41.0  37.3  52.0  36.8  40.0  36.1
2014 1      11.0  37.7  46.0  36.3  60.0  35.5
     2      37.0  36.8  52.0  38.5  39.0  37.4
type          HR  Temp
year visit            
2013 1      38.0  38.5
     2      52.0  36.8
2014 1      46.0  36.3
     2      52.0  38.5
state       year
California  2000    33871468
            2010    37353956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64
33871468
year
2000    33871468
2010    37353956
dtype: int64
state       year
California  2000    33871468
            2010    37353956
New York    2000    18976457
            2010    19378102
dtype: int64
state
California    33871468
New York      18976457
Texas         20851820
dtype: int64
state       year
California  2000    33871468
            2010    37353956
Texas       2010    25145561
dtype: int64
state       year
California  2000    33871468
            2010    37353956
Texas       2000    20851820
            2010    25145561
dtype: int64
year  visit
2013  1        38.0
      2        52.0
2014  1        46.0
      2        52.0
Name: (Guido, HR), dtype: float64
subject      Bob      
type          HR  Temp
year visit            
2013 1      36.0  37.4
     2      41.0  37.3
year  visit
2013  1        36.0
      2        41.0
2014  1        11.0
      2        37.0
Name: (Bob, HR), dtype: float64
subject      Bob Guido   Sue
type          HR    HR    HR
year visit                  
2013 1      36.0  38.0  45.0
2014 1      11.0  46.0  60.0
char  int
a     1      0.359508
      2      0.437032
b     1      0.666767
      2      0.670638
c     1      0.697631
      2      0.060225
dtype: float64
char  int
a     1      0.359508
      2      0.437032
b     1      0.666767
      2      0.670638
dtype: float64
state  California  New York     Texas
year                                 
2000     33871468  18976457  20851820
2010     37353956  19378102  25145561
year            2000      2010
state                         
California  33871468  37353956
New York    18976457  19378102
Texas       20851820  25145561
state       year
California  2000    33871468
            2010    37353956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64
        state  year  population
0  California  2000    33871468
1  California  2010    37353956
2    New York  2000    18976457
3    New York  2010    19378102
4       Texas  2000    20851820
5       Texas  2010    25145561
                 population
state      year            
California 2000    33871468
           2010    37353956
New York   2000    18976457
           2010    19378102
Texas      2000    20851820
           2010    25145561
subject      Bob       Guido         Sue      
type          HR  Temp    HR  Temp    HR  Temp
year visit                                    
2013 1      36.0  37.4  38.0  38.5  45.0  37.1
     2      41.0  37.3  52.0  36.8  40.0  36.1
2014 1      11.0  37.7  46.0  36.3  60.0  35.5
     2      37.0  36.8  52.0  38.5  39.0  37.4
subject   Bob        Guido          Sue       
type       HR   Temp    HR   Temp    HR   Temp
year                                          
2013     38.5  37.35  45.0  37.65  42.5  36.60
2014     24.0  37.25  49.0  37.40  49.5  36.45
type         HR       Temp
year                      
2013  42.000000  37.200000
2014  40.833333  37.033333

Process finished with exit code 0