import pandas as pd
import os #導(dǎo)入os模塊
encoding = 'latin1'  # text encoding passed to every read_csv call below
# Read each MovieLens table into its own pandas DataFrame with pd.read_csv.
# pandas has two core data structures: a DataFrame is a table-like structure,
# while a Series is a one-dimensional, array-like object that holds an array
# of data plus an associated array of labels called the index; the simplest
# Series is built from just an array of data.
# The os.path module is mainly used to query file and path properties:
#   os.path.expanduser(path) expands "~" and "~user" in path to the user's home directory
#   os.path.expandvars(path) substitutes "$name" and "${name}" in path from environment variables
upath = os.path.expanduser('ch02/movielens/users.dat')
rpath = os.path.expanduser('ch02/movielens/ratings.dat')
mpath = os.path.expanduser('ch02/movielens/movies.dat')
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']  # column names
rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
mnames = ['movie_id', 'title', 'genres']
# A separator longer than one character is interpreted as a regular
# expression, which the C parser does not support. Requesting the Python
# engine explicitly avoids the ParserWarning about the silent fallback.
users = pd.read_csv(upath, sep='::', header=None, names=unames,
                    encoding=encoding, engine='python')
ratings = pd.read_csv(rpath, sep='::', header=None, names=rnames,
                      encoding=encoding, engine='python')
movies = pd.read_csv(mpath, sep='::', header=None, names=mnames,
                     encoding=encoding, engine='python')
# sep='::' means the fields are separated by "::"; for comma-separated
# records use sep=',' instead.
# header=None tells pandas there is no header row, so the column names come
# from `names`; encoding=encoding selects the text encoding used for decoding.
/Users/zhongyaode/anaconda/lib/python3.6/site-packages/ipykernel/__main__.py:1: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
if __name__ == '__main__':
/Users/zhongyaode/anaconda/lib/python3.6/site-packages/ipykernel/__main__.py:2: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
from ipykernel import kernelapp as app
/Users/zhongyaode/anaconda/lib/python3.6/site-packages/ipykernel/__main__.py:3: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
app.launch_new_instance()
users.head(20)  # look at the first rows of each DataFrame
ratings.head(5)
movies.head(5)
ratings
# Merge ratings with users, then merge that result with movies.
# pandas can infer the join columns from the overlapping column names, but
# the keys are spelled out here so the join stays explicit:
# user_id links ratings to users, movie_id links the result to movies.
# The merge is performed once; repeating it only recomputed the same frame.
data = pd.merge(pd.merge(ratings, users, on='user_id'), movies, on='movie_id')
data
data['rating'].mean()
3.5815644530293169
data.loc[1]  # row with index label 1 (.ix is deprecated and removed in pandas 1.0)
user_id 2
movie_id 1193
rating 5
timestamp 978298413
gender M
age 56
occupation 16
zip 70072
title One Flew Over the Cuckoo's Nest (1975)
genres Drama
Name: 1, dtype: object
按性別計算每部電影的平均得分,可以使用pivot_table:
# Average rating of every title, with one column per gender.
mean_ratings = data.pivot_table(
    values='rating',
    index='title',
    columns='gender',
    aggfunc='mean',
)
mean_ratings.head(7)
# To filter out movies with fewer than 250 ratings, first group by title and
# use size() to get a Series holding the size of each title group.
ratings_by_title = data.groupby(by='title').size()
ratings_by_title.head(10)
title
$1,000,000 Duck (1971) 37
'Night Mother (1986) 70
'Til There Was You (1997) 52
'burbs, The (1989) 303
...And Justice for All (1979) 199
1-900 (1994) 2
10 Things I Hate About You (1999) 700
101 Dalmatians (1961) 565
101 Dalmatians (1996) 364
12 Angry Men (1957) 616
dtype: int64
# Index of the titles that received at least 250 ratings.
active_titles = ratings_by_title[ratings_by_title >= 250].index
active_titles
Index([''burbs, The (1989)', '10 Things I Hate About You (1999)',
'101 Dalmatians (1961)', '101 Dalmatians (1996)', '12 Angry Men (1957)',
'13th Warrior, The (1999)', '2 Days in the Valley (1996)',
'20,000 Leagues Under the Sea (1954)', '2001: A Space Odyssey (1968)',
'2010 (1984)',
...
'X-Men (2000)', 'Year of Living Dangerously (1982)',
'Yellow Submarine (1968)', 'You've Got Mail (1998)',
'Young Frankenstein (1974)', 'Young Guns (1988)',
'Young Guns II (1990)', 'Young Sherlock Holmes (1985)',
'Zero Effect (1998)', 'eXistenZ (1999)'],
dtype='object', name='title', length=1216)
# Keep only the rows of the active titles; this is a label-based selection,
# so .loc replaces .ix (deprecated and removed in pandas 1.0).
mean_ratings = mean_ratings.loc[active_titles]
mean_ratings
為了了解女性最喜歡的電影,對F列進行降序排列:
# sort_index(by=...) is deprecated; sort_values is its replacement.
top_female_ratings = mean_ratings.sort_values('F', ascending=False)
top_female_ratings.head(10)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
data['age'].mean()
29.738313692438279
data['age'].max()
56
data['age'].min()
1
data['age'].var()  # var() is the variance; std() is the standard deviation
138.10909427256377
# Draw a histogram of the age column.
x = data['age']
numBins = 5
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.hist(x, bins=numBins, color='red', alpha=0.8, rwidth=0.5)
plt.title('age')
plt.show()
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format='retina'
def normfun(x, mu, sigma):
    """Return the normal (Gaussian) probability density evaluated at x.

    Args:
        x: Point or points to evaluate; a scalar or a NumPy array.
        mu: Mean of the distribution.
        sigma: Standard deviation of the distribution.

    Returns:
        exp(-(x - mu)**2 / (2 * sigma**2)) / (sigma * sqrt(2 * pi)),
        computed element-wise when x is an array.
    """
    pdf = np.exp(-((x - mu) ** 2) / (2 * sigma ** 2)) / (sigma * np.sqrt(2 * np.pi))
    return pdf
p = data['age']
# Evaluate the normal curve at the integers 1..59 (start 1, stop 60, step 1).
x = np.arange(1, 60, 1)
# Take the curve's mean and standard deviation directly from p so this
# section does not rely on `mean`/`std` variables assigned further down.
y = normfun(x, p.mean(), p.std())
plt.plot(x, y)
# p is the data, bins the number of bars, rwidth the relative bar width.
# density=True scales the histogram to a probability density so it is
# comparable with the curve; it replaces the removed `normed` argument.
plt.hist(p, bins=6, rwidth=0.9, density=True)
plt.title("age")
plt.xlabel("stakes")
plt.ylabel("Probability")
plt.show()
# Evaluate the normal curve at the integers 1..59 (start 1, stop 60, step 1).
x = np.arange(1, 60, 1)
# Take the curve's mean and standard deviation directly from p so this
# section does not rely on `mean`/`std` variables assigned further down.
y = normfun(x, p.mean(), p.std())
plt.plot(x, y)
# p is the data, bins the number of bars, rwidth the relative bar width.
# density=True scales the histogram to a probability density so it is
# comparable with the curve; it replaces the removed `normed` argument.
plt.hist(p, bins=2, rwidth=0.9, density=True)
plt.title("time")
plt.xlabel("stakes")
plt.ylabel("Probability")
plt.show()
# Evaluate the normal curve at the integers 1..59 (start 1, stop 60, step 1).
x = np.arange(1, 60, 1)
# Take the curve's mean and standard deviation directly from p so this
# section does not rely on `mean`/`std` variables assigned further down.
y = normfun(x, p.mean(), p.std())
plt.plot(x, y)
![output_34_0.png](http://upload-images.jianshu.io/upload_images/2007820-408b82f75a63a3f3.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
![](http://upload-images.jianshu.io/upload_images/2007820-cb7ffce4a59d0504.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
![](http://upload-images.jianshu.io/upload_images/2007820-a8fce3d1a47c8184.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
![](http://upload-images.jianshu.io/upload_images/2007820-9271b213266a0748.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
![](http://upload-images.jianshu.io/upload_images/2007820-0554285cc748af49.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
# p is the data, bins the number of bars, rwidth the relative bar width.
# density=True scales the histogram to a probability density; it replaces
# the `normed` argument, which newer matplotlib releases no longer accept.
plt.hist(p, bins=6, rwidth=0.9, density=True)
plt.title("time")
plt.xlabel("stakes")
plt.ylabel("Probability")
plt.show()
len(p)  # number of records
1000209
std=p.std()
std
11.751982567744209
mean=p.mean()
std=p.std()
結果分析:觀影用戶的平均年齡為29.74,大部分人的年紀在20~30之間。
標準差是11.75,波動較小,約有68%的人年紀在29.74-11.75到29.74+11.75之間。
數據顯示10歲以下的人很少,廣告的目標人群應該控制在20~60歲之間。
a = p[:100000]  # analyse the first 100000 records, roughly the first 10% of the data
# Evaluate the normal curve at the integers 1..59 (start 1, stop 60, step 1).
x = np.arange(1, 60, 1)
y = normfun(x, mean, std)
plt.plot(x, y)
# a is the data, bins the number of bars, rwidth the relative bar width.
# density=True scales the histogram to a probability density so it is
# comparable with the curve; it replaces the removed `normed` argument.
plt.hist(a, bins=6, rwidth=0.9, density=True)
plt.title("age")
plt.xlabel("stakes")
plt.ylabel("Probability")
plt.show()
c = p[:10000]  # analyse the first 10000 records, roughly the first 1% of the data
# Evaluate the normal curve at the integers 1..59 (start 1, stop 60, step 1).
x = np.arange(1, 60, 1)
y = normfun(x, mean, std)
plt.plot(x, y)
# c is the data, bins the number of bars, rwidth the relative bar width.
# density=True scales the histogram to a probability density so it is
# comparable with the curve; it replaces the removed `normed` argument.
plt.hist(c, bins=6, rwidth=0.9, density=True)
plt.title("age")
plt.xlabel("stakes")
plt.ylabel("Probability")
plt.show()
符合上面的結論。
## 時間有限,先到這裡,相信隨著深入會體會到更多的樂趣