1.DataFrame的合并
1)未指定基于哪一列合并
>>> import numpy as np
>>> import pandas as pd
>>> frame1=pd.DataFrame({'id':['ball','pencil','pen','mug','ashtray'],'price':[12.33,11.44,33.21,13.23,33.62]})
>>> frame1
????id?price
0???ball?12.33
1??pencil?11.44
2???pen?33.21
3???mug?13.23
4?ashtray?33.62
>>> frame2=pd.DataFrame({'id':['pencil','pencil','ball','pen'],'color':['white','red','red','black']})
>>> frame2
??color???id
0?white?pencil
1??red?pencil
2??red??ball
3?black???pen
>>> pd.merge(frame1,frame2)? #frame1放到前,則依據(jù)frame1去合并经窖,字段順序也是先frame1的 后frame2的呜师,且只保留二者都有的
????id?price?color
0??ball?12.33??red
1?pencil?11.44?white
2?pencil?11.44??red
3???pen?33.21?black
>>> pd.merge(frame2,frame1)
??color???id?price
0?white?pencil?11.44
1??red?pencil?11.44
2??red??ball?12.33
3?black???pen?33.21
2)指定基于哪一列進(jìn)行合并:當(dāng)兩個DataFrame對象有不止一個列名相同的時候竭恬,合并的時候需用指定依據(jù)哪個列進(jìn)行合并
>>> import numpy as np
>>> import pandas as pd
>>> frame1=pd.DataFrame({'id':['ball','pencil','pen','mug','ashtray'],'color':['white','red','red','black','green'],'brand':['OMG','ABC','ABC','POD','POD']})
>>> frame1
?brand?color????id
0??OMG?white???ball
1??ABC??red??pencil
2??ABC??red???pen
3??POD?black???mug
4??POD?green?ashtray
>>> frame2=pd.DataFrame({'id':['pencil','pencil','ball','pen'],'brand':['OMG','POD','ABD','POD']})
>>> frame2
?brand???id
0??OMG?pencil
1??POD?pencil
2??ABD??ball
3??POD???pen
>>> pd.merge(frame1,frame2)
Empty DataFrame
Columns: [brand, color, id]
Index: []
>>> pd.merge(frame1,frame2,on='id')
?brand_x?color???id brand_y
0???OMG?white??ball???ABD
1???ABC??red?pencil???OMG
2???ABC??red?pencil???POD
3???ABC??red???pen???POD
>>> pd.merge(frame1,frame2,on='brand')
?brand?color???id_x??id_y
0??OMG?white???ball?pencil
1??POD?black???mug?pencil
2??POD?black???mug???pen
3??POD?green?ashtray?pencil
4??POD?green?ashtray???pen
>>> pd.merge(frame2,frame1,on='id')
?brand_x???id brand_y?color
0???OMG?pencil???ABC??red
1???POD?pencil???ABC??red
2???ABD??ball???OMG?white
3???POD???pen???ABC??red
>>> pd.merge(frame2,frame1,on='brand')
?brand??id_x?color???id_y
0??OMG?pencil?white???ball
1??POD?pencil?black???mug
2??POD?pencil?green?ashtray
3??POD???pen?black???mug
4??POD???pen?green?ashtray
3)基于左對標(biāo)和基于右對標(biāo):當(dāng)兩個DataFrame的列名不同時候,用left_on和right_on指定第一個dataframe用來對標(biāo)的列和第二個dataframe用來對標(biāo)的列腺办,兩列進(jìn)行對標(biāo)
>>> frame1
?brand?color????id
0??OMG?white???ball
1??ABC??red??pencil
2??ABC??red???pen
3??POD?black???mug
4??POD?green?ashtray
>>> frame2
?brand???id
0??OMG?pencil
1??POD?pencil
2??ABD??ball
3??POD???pen
>>> frame2.columns=['brand','sid']
>>> frame2
?brand???sid
0??OMG?pencil
1??POD?pencil
2??ABD??ball
3??POD???pen
>>> pd.merge(frame1,frame2,left_on='id',right_on='sid')
?brand_x?color???id brand_y???sid
0???OMG?white??ball???ABD??ball
1???ABC??red?pencil???OMG?pencil
2???ABC??red?pencil???POD?pencil
3???ABC??red???pen???POD???pen
4)外鏈接:相當(dāng)于把所有列聚合倒一起粘招,還有左鏈接和右鏈接
>>> frame2
?brand???sid
0??OMG?pencil
1??POD?pencil
2??ABD??ball
3??POD???pen
>>> frame2.columns=['brand','id']
>>> frame2
?brand???id
0??OMG?pencil
1??POD?pencil
2??ABD??ball
3??POD???pen
>>> pd.merge(frame1,frame2,on='id')
?brand_x?color???id brand_y
0???OMG?white??ball???ABD
1???ABC??red?pencil???OMG
2???ABC??red?pencil???POD
3???ABC??red???pen???POD
>>> pd.merge(frame1,frame2,on='id',how='outer')
?brand_x?color????id brand_y
0???OMG?white???ball???ABD
1???ABC??red??pencil???OMG
2???ABC??red??pencil???POD
3???ABC??red???pen???POD
4???POD?black???mug???NaN
5???POD?green?ashtray???NaN
>>> pd.merge(frame1,frame2,on='id',how='left')
?brand_x?color????id brand_y
0???OMG?white???ball???ABD
1???ABC??red??pencil???OMG
2???ABC??red??pencil???POD
3???ABC??red???pen???POD
4???POD?black???mug???NaN
5???POD?green?ashtray???NaN
>>> pd.merge(frame1,frame2,on='id',how='right')
?brand_x?color???id brand_y
0???OMG?white??ball???ABD
1???ABC??red?pencil???OMG
2???ABC??red?pencil???POD
3???ABC??red???pen???POD
>>> pd.merge(frame1,frame2,on=['id','brand'],how='outer')
?brand?color????id
0??OMG?white???ball
1??ABC??red??pencil
2??ABC??red???pen
3??POD?black???mug
4??POD?green?ashtray
5??OMG??NaN??pencil
6??POD??NaN??pencil
7??ABD??NaN???ball
8??POD??NaN???pen
5)根據(jù)索引合并
>>> frame1
?brand?color????id
0??OMG?white???ball
1??ABC??red??pencil
2??ABC??red???pen
3??POD?black???mug
4??POD?green?ashtray
>>> frame2
?brand???id
0??OMG?pencil
1??POD?pencil
2??ABD??ball
3??POD???pen
>>> pd.merge(frame1,frame2,left_index=True,right_index=True)
?brand_x?color??id_x brand_y??id_y
0???OMG?white??ball???OMG?pencil
1???ABC??red?pencil???POD?pencil
2???ABC??red???pen???ABD??ball
3???POD?black???mug???POD???pen
6)join函數(shù)更適合利用所以進(jìn)行合并索引相同列不一致的dataframe對象
列名相同的報錯
>>> frame1
?brand?color????id
0??OMG?white???ball
1??ABC??red??pencil
2??ABC??red???pen
3??POD?black???mug
4??POD?green?ashtray
>>> frame2
?brand???id
0??OMG?pencil
1??POD?pencil
2??ABD??ball
3??POD???pen
>>> frame1.join(frame2)#由于列名有重復(fù)的報錯
Traceback (most recent call last):
?File "", line 1, in
?File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/pandas/core/frame.py", line 5316, in join
??rsuffix=rsuffix, sort=sort)
?File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/pandas/core/frame.py", line 5331, in _join_compat
??suffixes=(lsuffix, rsuffix), sort=sort)
?File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/pandas/core/reshape/merge.py", line 58, in merge
??return op.get_result()
?File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/pandas/core/reshape/merge.py", line 588, in get_result
??rdata.items, rsuf)
?File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/pandas/core/internals.py", line 5026, in items_overlap_with_suffix
??to_rename)
ValueError: columns overlap but no suffix specified: Index(['brand', 'id'], dtype='object')
修改frame2的列名后利用join函數(shù)進(jìn)行相同索引不同列名的
>>> frame1
?brand?color????id
0??OMG?white???ball
1??ABC??red??pencil
2??ABC??red???pen
3??POD?black???mug
4??POD?green?ashtray
>>> frame2
?brand???id
0??OMG?pencil
1??POD?pencil
2??ABD??ball
3??POD???pen
>>> frame2.columns=['brand2','id2']
>>> frame1.join(frame2)
?brand?color????id brand2???id2
0??OMG?white???ball??OMG?pencil
1??ABC??red??pencil??POD?pencil
2??ABC??red???pen??ABD??ball
3??POD?black???mug??POD???pen
4??POD?green?ashtray??NaN???NaN
>>> frame2.join(frame1)
?brand2???id2 brand?color???id
0??OMG?pencil??OMG?white??ball
1??POD?pencil??ABC??red?pencil
2??ABD??ball??ABC??red???pen
3??POD???pen??POD?black???mug
2.拼接
1)numpy的nparray對象進(jìn)行拼接而涉,利用np.concatenate函數(shù)
>>> import numpy as np
>>> array1=([0,1,2],[3,4,5],[6,7,8])
>>> array1
([0, 1, 2], [3, 4, 5], [6, 7, 8])
>>> array2=np.arange(9).reshape(3,3)
>>> array2
array([[0, 1, 2],
????[3, 4, 5],
????[6, 7, 8]])
>>> np.concatenate([array1,array2],axis=1)
array([[0, 1, 2, 0, 1, 2],
????[3, 4, 5, 3, 4, 5],
????[6, 7, 8, 6, 7, 8]])
>>> np.concatenate([array1,array2],axis=0)
array([[0, 1, 2],
????[3, 4, 5],
????[6, 7, 8],
????[0, 1, 2],
????[3, 4, 5],
????[6, 7, 8]])
2)pandas及他的Series和DataFrame等數(shù)據(jù)結(jié)構(gòu)實(shí)現(xiàn)了帶編號的軸,pandas的concat函數(shù)實(shí)現(xiàn)了按照軸進(jìn)行拼接
Series對象拼接
>>> ser1=pd.Series(np.random.rand(4),index=[1,2,3,4])
>>> ser1
1??0.835800
2??0.421464
3??0.859449
4??0.974251
dtype: float64
>>> ser2=pd.Series(np.random.rand(4),index=[5,6,7,8])
>>> ser2
5??0.296453
6??0.692925
7??0.098937
8??0.324647
dtype: float64
>>> pd.concat([ser1,ser2])#默認(rèn)按照0軸(行)進(jìn)行拼接
1??0.835800
2??0.421464
3??0.859449
4??0.974251
5??0.296453
6??0.692925
7??0.098937
8??0.324647
dtype: float64
>>> pd.concat([ser1,ser2],axis=1)
?????0?????1
1?0.835800????NaN
2?0.421464????NaN
3?0.859449????NaN
4?0.974251????NaN
5????NaN?0.296453
6????NaN?0.692925
7????NaN?0.098937
8????NaN?0.324647
>>> ser3=pd.concat([ser1,ser2],axis=1)
>>> ser3
?????0?????1
1?0.835800????NaN
2?0.421464????NaN
3?0.859449????NaN
4?0.974251????NaN
5????NaN?0.296453
6????NaN?0.692925
7????NaN?0.098937
8????NaN?0.324647
>>> ser1
1??0.835800
2??0.421464
3??0.859449
4??0.974251
dtype: float64
>>> pd.concat([ser1,ser3],axis=1,join='inner')
?????0?????0??1
1?0.835800?0.835800 NaN
2?0.421464?0.421464 NaN
3?0.859449?0.859449 NaN
4?0.974251?0.974251 NaN
>>> pd.concat([ser1,ser2],keys=[1,2])#在結(jié)果上創(chuàng)建等級
1?1??0.835800
??2??0.421464
??3??0.859449
??4??0.974251
2?5??0.296453
??6??0.692925
??7??0.098937
??8??0.324647
dtype: float64
>>> pd.concat([ser1,ser2],axis=1,keys=[1,2])#在1軸上拼接瞳秽,生成DDAtaFrame對象
?????1?????2
1?0.835800????NaN
2?0.421464????NaN
3?0.859449????NaN
4?0.974251????NaN
5????NaN?0.296453
6????NaN?0.692925
7????NaN?0.098937
8????NaN?0.324647
DataFrame對象拼接
>>> frame1=pd.DataFrame(np.random.rand(9).reshape(3,3),index=[1,2,3])
>>> frame2=pd.DataFrame(np.random.rand(9).reshape(3,3),index=[4,5,6])
>>> frame1
?????0?????1?????2
1?0.687920?0.176828?0.050595
2?0.174212?0.315637?0.780156
3?0.352718?0.613909?0.447205
>>> frame2
?????0?????1?????2
4?0.216871?0.389312?0.363118
5?0.253215?0.312054?0.760887
6?0.437853?0.847273?0.300629
>>> pd.concat([frame1,frame2],axis=1)
?????0?????1?????2?????0?????1?????2
1?0.687920?0.176828?0.050595????NaN????NaN????NaN
2?0.174212?0.315637?0.780156????NaN????NaN????NaN
3?0.352718?0.613909?0.447205????NaN????NaN????NaN
4????NaN????NaN????NaN?0.216871?0.389312?0.363118
5????NaN????NaN????NaN?0.253215?0.312054?0.760887
6????NaN????NaN????NaN?0.437853?0.847273?0.300629
>>> pd.concat([frame1,frame2])
?????0?????1?????2
1?0.687920?0.176828?0.050595
2?0.174212?0.315637?0.780156
3?0.352718?0.613909?0.447205
4?0.216871?0.389312?0.363118
5?0.253215?0.312054?0.760887
6?0.437853?0.847273?0.300629
3.組合
無法通過合并和拼接的辦法進(jìn)行組合數(shù)據(jù)瓣履,兩個數(shù)據(jù)集索引完全或者部分重合
cmbine_first函數(shù)可以用來組合Series對象,同時對齊數(shù)據(jù)
>>> ser1=pd.Series(np.random.rand(5),index=[1,2,3,4,5])
>>> ser1
1??0.428280
2??0.350504
3??0.832672
4??0.746660
5??0.698320
dtype: float64
>>> ser2=pd.Series(np.random.rand(4),index=[2,4,5,6])
>>> ser2
2??0.173355
4??0.668513
5??0.766825
6??0.683407
dtype: float64
>>> ser1.combine_first(ser2)
1??0.428280
2??0.350504
3??0.832672
4??0.746660
5??0.698320
6??0.683407
dtype: float64
>>> ser2.combine_first(ser1)
1??0.428280
2??0.173355
3??0.832672
4??0.668513
5??0.766825
6??0.683407
dtype: float64
>>> ser1
1??0.428280
2??0.350504
3??0.832672
4??0.746660
5??0.698320
dtype: float64
>>> ser2
2??0.173355
4??0.668513
5??0.766825
6??0.683407
dtype: float64
>>> ser1[:3].combine_first(ser2[:3])
1??0.428280
2??0.350504
3??0.832672
4??0.668513
5??0.766825
dtype: float64
4.按等級索引旋轉(zhuǎn)
1)入棧:旋轉(zhuǎn)數(shù)據(jù)結(jié)構(gòu)把列轉(zhuǎn)換為行
2)出棧:把行轉(zhuǎn)換為列
>>> frame1=pd.DataFrame(np.arange(9).reshape(3,3),index=['white','black','red'],columns=['ball','pen','pencil'])
>>> frame1
????ball?pen?pencil
white???0??1????2
black???3??4????5
red????6??7????8
>>> frame1.stack()
white?ball???0
???? ? ? ?pen????1
???? ? ? ?pencil??2
black?ball???3
????? ? ? pen????4
????pencil??5
red?? ?ball???6
????pen????7
????pencil??8
dtype: int64
>>> ser5=frame1.stack()
>>> ser5
white?ball???0
???? ? ? ?pen????1
???? ? ? ?pencil??2
black?ball???3
????? ? ? pen????4
????pencil??5
red?? ?ball???6
????pen????7
????pencil??8
dtype: int64
>>> ser5.unstack()
????ball?pen?pencil
white???0??1????2
black???3??4????5
red????6??7????8
>>> ser5.unstack(0)
????white?black?red
ball????0???3??6
pen?????1???4??7
pencil???2???5??8
5.從長格式向?qū)捀袷睫D(zhuǎn)化
1)長格式數(shù)據(jù)
>>> longframe=pd.DataFrame({'color':['white','white','white','red','red','red','black','black','black'],'item':['ball','pen','mug','ball','pen','mug','ball','pen','mug'],'value':np.random.rand(9)})
>>> longframe
??color?item???value
0?white?ball?0.825575
1?white??pen?0.595834
2?white??mug?0.228163
3??red?ball?0.543954
4??red??pen?0.150930
5??red??mug?0.285771
6?black?ball?0.092283
7?black??pen?0.961326
8?black??mug?0.027581
2)轉(zhuǎn)換成寬格式
>>> wideframe=longframe.pivot('color','item')
>>> wideframe
?????value
item????ball???? ? ? ? ?mug????? ? pen
color
black?0.092283?0.027581?0.961326
red??0.543954?0.285771?0.150930
white?0.825575?0.228163?0.595834
6.刪除
>>> frame1=pd.DataFrame(np.arange(9).reshape(3,3),index=['white','black','red'],columns=['ball','pen','pencil'])
>>> frame1
??? ? ??ball?pen?pencil
white???0??1????2
black???3??4????5
red????6??7????8
>>> del frame1['ball'] #刪除一列
>>> frame1
????pen?pencil
white??1????2
black??4????5
red???7????8
>>> frame1.drop('white') #刪除行
????pen?pencil
black??4????5
red???7????8
7.刪除重復(fù)元素
>>> import numpy as np
>>> import pandas as pd
>>> dframe=pd.DataFrame({'color':['white','white','red','red','white'],'value':[2,1,3,3,2]})
>>> dframe
??color?value
0?white?????2
1?white?????1
2???red?????3
3???red?????3
4?white?????2
>>> dframe.duplicated()
0???False
1???False
2???False
3????True
4????True
dtype: bool
>>> dframe[dframe.duplicated()]
??color?value
3???red?????3
4?white?????2
>>> dframe.drop_duplicates()
??color?value
0?white?????2
1?white?????1
2???red?????3
8.映射
以下函數(shù)都以dict對象作為參數(shù)练俐,也就是用能夠承載映射的ditct字典對象作為參數(shù)
1)replace函數(shù):替換元素
2)map函數(shù):新建一列
3)rename函數(shù):替換索引
用映射替換元素
>>> import numpy as np
>>> import pandas as pd
>>> frame=pd.DataFrame()
>>> frame=pd.DataFrame({'item':['ball','mug','pen','pencil','ashtray'],'color':['white','rosso','verde','black','yellow'],'price':[5.56,4.20,1.30,0.56,2.75]})
>>> frame
???color????item?price
0??white????ball??5.56
1??rosso?????mug??4.20
2??verde?????pen??1.30
3??black??pencil??0.56
4?yellow?ashtray??2.75
>>> newcolors={'rosso':'red','verde':'green'}
>>> frame.replace(newcolors)
???color????item?price
0??white????ball??5.56
1????red?????mug??4.20
2??green?????pen??1.30
3??black??pencil??0.56
4?yellow?ashtray??2.75
>>> ser=pd.Series([1,3,np.nan,4,6,np.nan,3])
>>> ser
0???1.0
1???3.0
2???NaN
3???4.0
4???6.0
5???NaN
6???3.0
dtype: float64
>>> ser.replace(np.nan,0)
0???1.0
1???3.0
2???0.0
3???4.0
4???6.0
5???0.0
6???3.0
dtype: float64
用映射添加元素
>>> frame=pd.DataFrame({'item':['ball','mug','pen','pencil','ashtray'],'color':['white','red','freen','black','yellow']})
>>> frame
???color????item
0??white????ball
1????red?????mug
2??freen?????pen
3??black??pencil
4?yellow?ashtray
>>> price={'ball':5.56,'mug':4.20,'bottle':1.30,'scissors':3.41,'pen':1.30,'pencil':0.56,'ashtray':2.75}
>>> frame['price']=frame['item'].map(price)
>>> frame
???color????item?price
0??white????ball??5.56
1????red?????mug??4.20
2??freen?????pen??1.30
3??black??pencil??0.56
4?yellow?ashtray??2.75
重命名軸索引
>>> frame
???color????item?price
0??white????ball??5.56
1????red?????mug??4.20
2??freen?????pen??1.30
3??black??pencil??0.56
4?yellow?ashtray??2.75
>>> reindex={0:'frist',1:'second',2:'third',3:'fourth',4:'fifth'}
>>> frame.rename(reindex)
????????color????item?price
frist???white????ball??5.56
second????red?????mug??4.20
third???freen?????pen??1.30
fourth??black??pencil??0.56
fifth??yellow?ashtray??2.75
>>> recolumns={'item':'object','price':'value'}
>>> frame.rename(index=reindex,columns=recolumns)
????????color??object?value
frist???white????ball??5.56
second????red?????mug??4.20
third???freen?????pen??1.30
fourth??black??pencil??0.56
fifth??yellow?ashtray??2.75
>>> frame.rename(columns=recolumns)
???color??object?value
0??white????ball??5.56
1????red?????mug??4.20
2??freen?????pen??1.30
3??black??pencil??0.56
4?yellow?ashtray??2.75