七、Groupby操作與字符串
7.1 Groupby操作
import pandas as pd
s = pd.Series([1,2,3,1,2,3],[8,7,6,6,7,8])
grouped = s.groupby(level=0)
grouped.first()
6 3
7 2
8 1
dtype: int64
grouped.last()
#------------------------------
6 1
7 2
8 3
dtype: int64
grouped.sum()
data = pd.DataFrame({'X':['A','B','A','B'],'Y':[4,3,2,1]})
data.groupby(['X']).get_group('A') #根據X列groupby, 取出A組的值
#------------------------------------
X Y
0 A 4
2 A 2
import numpy as np
data = [['bar','bar','foo','foo','tax','tax','cat','cat'],['one','tow','one','tow','one','tow','one','tow']]
index = pd.MultiIndex.from_arrays(data,names=['first','second'])
s = pd.Series(np.random.randn(8),index = index)
s
first second
bar one 0.047907
tow -0.100240
foo one -0.989266
tow -0.656413
tax one -0.146667
tow -1.458819
cat one -0.034952
tow -1.186470
dtype: float64
grouped = s.groupby(level=0) # level=0 first 列
grouped.sum()
grouped = s.groupby(level=1)
grouped.sum()
grouped = s.groupby(level='first')
grouped.sum()
7.2 字符串操作
import pandas as pd
import numpy as np
s = pd.Series(['A', 'b','CkdkieK','CAT','dog',np.nan])
s.str.lower() #把所有的字母轉小寫
df = pd.DataFrame(np.random.randn(3,2), columns=['A a','B b'], index = range(3))
df.columns = df.columns.str.replace(' ','_') #把列名中的空格替換成_
s = pd.Series(['a-b-c','c-d-e','d-e-f'])
0 a-b-c
1 c-d-e
2 d-e-f
dtype: object
s.str.split('-')
0 [a, b, c]
1 [c, d, e]
2 [d, e, f]
dtype: object
s.str.split('-',expand = True)
0 1 2
0 a b c
1 c d e
2 d e f
s.str.split('-',expand = True, n =1)
0 1
0 a b-c
1 c d-e
2 d e-f
s = pd.Series(['a-b-c','c-d-e','d-e-f'])
s.str.contains('c')
0 True
1 True
2 False
dtype: bool
s.str.get_dummies(sep='-')
a b c d e f
0 1 1 1 0 0 0
1 0 0 1 1 1 0
2 0 0 0 1 1 1
八、pandas 繪圖
畫折線圖
%matplotlib inline
import pandas as pd
import numpy as np
s = pd.Series(np.random.randn(10),index = np.arange(1,100,10))
s.plot()
df = pd.DataFrame(np.random.randn(10,4).cumsum(0),index = np.arange(0,100,10),columns=list('ABCD'))
df.plot()
畫子圖和柱狀圖
import matplotlib.pyplot as plt
fig,axes = plt.subplots(2,1) # 畫兩行1列的兩個子圖
data = pd.Series(np.random.randn(16),index=list('abcdefghijllmnop'))
data.plot(ax=axes[0],kind='bar') #在第0行子圖畫 條形圖
data.plot(ax=axes[1],kind='barh') #在第1行子圖畫 水平條形圖
df = pd.DataFrame(np.random.rand(6,4),index=['one','two','three','fore','five','six'],columns=pd.Index(list('ABCD'),name='Genus'))
df.plot(kind='bar')
畫直方圖
tips = pd.read_csv('tips.csv')
tips.total_bill.plot(kind='hist',bins=50) # 畫直方圖,分50個區間
畫散點圖
macro = pd.read_csv('macrodata.csv')
data = macro[['quarter','realgdp','realcons']]
data.plot.scatter('quarter','realgdp')
pd.scatter_matrix(data,color='g',alpha=0.3) # 廢棄了,用下面的
pd.plotting.scatter_matrix(data,color='g',alpha=0.3)
九、大數據處理技巧
怎麼讓內存占用量小一些
內存占用情況
import pandas as pd
gl = pd.read_csv('game_logs.csv')
gl.shape
(171907, 161) #17萬行,161列
gl.info(memory_usage='deep') #內存使用的詳細情況
#-------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 171907 entries, 0 to 171906
Columns: 161 entries, date to acquisition_info
dtypes: float64(77), int64(6), object(78)
memory usage: 860.5 MB
各種類型占用內存情況
for dtype in ['float64','int64','object']:
    selected_dtype = gl.select_dtypes(include = [dtype])
    mean_usage_b = selected_dtype.memory_usage(deep=True).mean()
    mean_usage_mb = mean_usage_b/1024**2
    print ('平均內存占用',dtype,mean_usage_mb)
#----------------------------------------------
平均內存占用 float64 1.294733194204477
平均內存占用 int64 1.1242000034877233
平均內存占用 object 9.514454648464541
列出各種類型的最小、最大值
import numpy as np
int_types = ['uint8','int8','int16','int32','int64']
for it in int_types:
    print (np.iinfo(it))
減少內存空間使用方法
int64 或 float64 向下轉換為 unsigned 或 float32
object 類型占用空間最多。如果重複值特別多,使用 category 類型,把對象轉換成整型
日期類型如果把"20170405"轉換成日期類型"2017-04-05"會多占用內存