import pandas as pd  # data analysis
import numpy as np  # scientific computing
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['font.sans-serif']=['SimHei'] # use the SimHei font so any CJK characters in labels render correctly
plt.rcParams['axes.unicode_minus']=False # render the minus sign correctly when a CJK font is active
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'
# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2
data_train = pd.read_csv('../data/train.csv')
print(data_train.columns)
data_train
fig = plt.figure()
fig.set(alpha=0.2)  # set the figure alpha (transparency)

plt.subplot2grid((2,3),(0,0))  # lay out several subplots on one figure
data_train.Survived.value_counts().plot(kind='bar')  # bar chart of those who survived vs. those who did not
plt.title('Survival (1 = survived)')  # puts a title on our graph
plt.ylabel('Count')

plt.subplot2grid((2,3),(0,1))
data_train.Pclass.value_counts().plot(kind='bar')
plt.title('Passenger class distribution')
plt.ylabel('Count')

plt.subplot2grid((2,3),(0,2))
plt.scatter(data_train.Survived, data_train.Age)
plt.grid(True, which='major', axis='y')  # horizontal grid lines only
plt.title('Survival by age (1 = survived)')
plt.ylabel('Age')

plt.subplot2grid((2,3),(1,0), colspan=2)
data_train.Age[data_train.Pclass == 1].plot(kind='kde')  # kernel density estimate of the 1st-class passengers' ages
data_train.Age[data_train.Pclass == 2].plot(kind='kde')
data_train.Age[data_train.Pclass == 3].plot(kind='kde')
plt.legend(('1st class', '2nd class', '3rd class'), loc='best')  # sets the legend for our graph
plt.title('Age distribution by passenger class')
plt.xlabel('Age')
plt.ylabel('Density')

plt.subplot2grid((2,3),(1,2))
data_train.Embarked.value_counts().plot(kind='bar')
plt.title('Passengers per port of embarkation')
plt.ylabel('Count')
plt.show()
# Survival by passenger class
fig = plt.figure()
fig.set(alpha=0.2)  # set the figure alpha (transparency)

Survived_0 = data_train.Pclass[data_train.Survived == 0].value_counts()
Survived_1 = data_train.Pclass[data_train.Survived == 1].value_counts()
df = pd.DataFrame({'Survived': Survived_1, 'Did not survive': Survived_0})
df.plot(kind='bar', stacked=True)
plt.title("Survival by passenger class")
plt.xlabel("Passenger class")
plt.ylabel("Count")
plt.show()
# Survival by port of embarkation
fig = plt.figure()
fig.set(alpha=0.2)  # set the figure alpha (transparency)

Survived_0 = data_train.Embarked[data_train.Survived == 0].value_counts()
Survived_1 = data_train.Embarked[data_train.Survived == 1].value_counts()
df = pd.DataFrame({'Survived': Survived_1, 'Did not survive': Survived_0})
df.plot(kind='bar', stacked=True)
plt.title("Survival by port of embarkation")
plt.xlabel("Port of embarkation")
plt.ylabel("Count")
plt.show()
# Survival by sex
fig = plt.figure()
fig.set(alpha=0.2)  # set the figure alpha (transparency)

Survived_m = data_train.Survived[data_train.Sex == 'male'].value_counts()
Survived_f = data_train.Survived[data_train.Sex == 'female'].value_counts()
df = pd.DataFrame({'Male': Survived_m, 'Female': Survived_f})
df.plot(kind='bar', stacked=True)
plt.title("Survival by sex")
plt.xlabel("1 = survived")
plt.ylabel("Count")
plt.show()
# The same comparison, this time with sex on the x-axis
fig = plt.figure()
fig.set(alpha=0.2)  # set the figure alpha (transparency)

Survived_0 = data_train.Sex[data_train.Survived == 0].value_counts()
Survived_1 = data_train.Sex[data_train.Survived == 1].value_counts()
df = pd.DataFrame({'Survived': Survived_1, 'Did not survive': Survived_0})
df.plot(kind='bar', stacked=True)
plt.title("Survival by sex")
plt.xlabel("Sex")
plt.ylabel("Count")
plt.show()
# Now let's look at survival by cabin class and sex together
fig = plt.figure()
fig.set(alpha=0.65)  # figure alpha; doesn't matter much here
fig.suptitle("Survival by cabin class and sex")

ax1 = fig.add_subplot(141)
data_train.Survived[data_train.Sex == 'female'][data_train.Pclass != 3].value_counts().plot(kind='bar', label="female highclass", color='#FA2479')
ax1.set_xticklabels(["Survived", "Did not survive"], rotation=0)  # NB: hardcoded labels assume the value_counts() ordering
ax1.legend(["Female / high class"], loc='best')

ax2 = fig.add_subplot(142, sharey=ax1)
data_train.Survived[data_train.Sex == 'female'][data_train.Pclass == 3].value_counts().plot(kind='bar', label='female, low class', color='pink')
ax2.set_xticklabels(["Did not survive", "Survived"], rotation=0)
plt.legend(["Female / low class"], loc='best')

ax3 = fig.add_subplot(143, sharey=ax1)
data_train.Survived[data_train.Sex == 'male'][data_train.Pclass != 3].value_counts().plot(kind='bar', label='male, high class', color='lightblue')
ax3.set_xticklabels(["Did not survive", "Survived"], rotation=0)
plt.legend(["Male / high class"], loc='best')

ax4 = fig.add_subplot(144, sharey=ax1)
data_train.Survived[data_train.Sex == 'male'][data_train.Pclass == 3].value_counts().plot(kind='bar', label='male low class', color='steelblue')
ax4.set_xticklabels(["Did not survive", "Survived"], rotation=0)
plt.legend(["Male / low class"], loc='best')
plt.show()
# Survival counts by number of siblings/spouses aboard (SibSp)
g = data_train.groupby(['SibSp','Survived'])
df = pd.DataFrame(g.count()['PassengerId'])
df
# Survival counts by number of parents/children aboard (Parch)
g = data_train.groupby(['Parch','Survived'])
df = pd.DataFrame(g.count()['PassengerId'])
df
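The raw counts above are hard to compare across groups of different sizes, so here is a small supplementary sketch (my addition, not part of the original analysis) that turns them into survival rates:
# Survival rate for each SibSp / Parch value; the mean of a 0/1 column is a rate
for col in ['SibSp', 'Parch']:
    print(data_train.groupby(col)['Survived'].mean())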
# Ticket is the ticket number and should be essentially unique, so it is
# unlikely to relate to the outcome; we leave it out of the feature set.
# Cabin is present for only 204 passengers; let's look at its distribution first.
data_train.Cabin.value_counts()
# The Cabin counts are extremely scattered: most values appear only once,
# so the raw value is unlikely to make a useful categorical feature.
# Instead, let's see how the mere presence or absence of a Cabin value
# relates to the survival distribution.
fig = plt.figure()
fig.set(alpha=0.2)  # set the figure alpha (transparency)

Survived_cabin = data_train.Survived[pd.notnull(data_train.Cabin)].value_counts()
Survived_nocabin = data_train.Survived[pd.isnull(data_train.Cabin)].value_counts()
df = pd.DataFrame({'Has Cabin': Survived_cabin, 'No Cabin': Survived_nocabin}).transpose()
df.plot(kind='bar', stacked=True)
plt.title("Survival by presence of a Cabin value")
plt.xlabel("Cabin recorded?")
plt.ylabel("Count")
plt.show()
Here we use a RandomForest from scikit-learn to fit the missing age data.
from sklearn.ensemble import RandomForestRegressor
### Impute the missing Age values with a RandomForestRegressor
def set_missing_ages(df):
    # Feed the existing numeric features into a random forest regressor
    age_df = df[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
    # Split the passengers into those with known and unknown ages
    known_age = age_df[age_df.Age.notnull()].values
    unknown_age = age_df[age_df.Age.isnull()].values
    # y is the target: Age
    y = known_age[:, 0]
    # X holds the feature values
    X = known_age[:, 1:]
    # Fit the RandomForestRegressor
    rfr = RandomForestRegressor(random_state=0, n_estimators=2000, n_jobs=-1)
    rfr.fit(X, y)
    # Predict the unknown ages with the fitted model
    predictedAges = rfr.predict(unknown_age[:, 1:])
    # Fill the original missing entries with the predictions
    df.loc[(df.Age.isnull()), 'Age'] = predictedAges
    return df, rfr

def set_Cabin_type(df):
    df.loc[(df.Cabin.notnull()), 'Cabin'] = "Yes"
    df.loc[(df.Cabin.isnull()), 'Cabin'] = "No"
    return df
data_train, rfr = set_missing_ages(data_train)
data_train = set_Cabin_type(data_train)
data_train
# Logistic regression needs all of its input features to be numeric,
# so we first discretize / factorize the categorical features.
# Take Cabin as an example: it is a single column whose values are ['Yes', 'No'],
# and we flatten it into two columns, 'Cabin_Yes' and 'Cabin_No'.
# A row whose Cabin was 'Yes' gets 1 under 'Cabin_Yes' and 0 under 'Cabin_No';
# a row whose Cabin was 'No' gets 0 under 'Cabin_Yes' and 1 under 'Cabin_No'.
# pandas' get_dummies does this for us; we then concatenate the new columns onto data_train:
dummies_Cabin = pd.get_dummies(data_train['Cabin'], prefix='Cabin')
dummies_Embarked = pd.get_dummies(data_train['Embarked'], prefix='Embarked')
dummies_Sex = pd.get_dummies(data_train['Sex'], prefix='Sex')
dummies_Pclass = pd.get_dummies(data_train['Pclass'], prefix='Pclass')
df = pd.concat([data_train, dummies_Cabin, dummies_Embarked, dummies_Sex, dummies_Pclass], axis=1)
df.drop(['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], axis=1, inplace=True)
df
# Next, some more preprocessing: scaling. Age and Fare vary over a much wider
# range than the dummy features, so we standardize them (zero mean, unit variance),
# which helps logistic regression converge faster.
import sklearn.preprocessing as preprocessing
df['Age_scaled'] = preprocessing.scale(df['Age'])
df['Fare_scaled'] = preprocessing.scale(df['Fare'])
df
# Take the feature columns we need, convert them to a NumPy array,
# and build a model with scikit-learn's LogisticRegression.
from sklearn import linear_model

train_df = df.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
train_np = train_df.values
# y is the Survived label
y = train_np[:, 0]
# X holds the feature values
X = train_np[:, 1:]
# Fit the LogisticRegression model (the liblinear solver handles the L1 penalty)
clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6, solver='liblinear')
clf.fit(X, y)
clf
data_test = pd.read_csv("../data/test.csv")
data_test.loc[(data_test.Fare.isnull()), 'Fare'] = 0
# Apply the same feature transformations to the test data as to the training data.
# First, fill in the missing ages with the same RandomForestRegressor model.
tmp_df = data_test[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
null_age = tmp_df[data_test.Age.isnull()].values
# Predict the missing ages from the feature columns X and fill them in
X_ = null_age[:, 1:]
predictedAges = rfr.predict(X_)
data_test.loc[(data_test.Age.isnull()), 'Age'] = predictedAges
data_test = set_Cabin_type(data_test)
dummies_Cabin = pd.get_dummies(data_test['Cabin'], prefix='Cabin')
dummies_Embarked = pd.get_dummies(data_test['Embarked'], prefix='Embarked')
dummies_Sex = pd.get_dummies(data_test['Sex'], prefix='Sex')
dummies_Pclass = pd.get_dummies(data_test['Pclass'], prefix='Pclass')
df_test = pd.concat([data_test, dummies_Cabin, dummies_Embarked, dummies_Sex, dummies_Pclass], axis=1)
df_test.drop(['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], axis=1, inplace=True)
df_test['Age_scaled'] = preprocessing.scale(df_test['Age'])
df_test['Fare_scaled'] = preprocessing.scale(df_test['Fare'])
df_test
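One caveat worth flagging: preprocessing.scale standardizes the train and test sets independently, so the two end up on slightly different scales. A safer pattern (a minimal sketch; the scaler variable names are my own) is to fit a StandardScaler on the training data and reuse it for the test data:
from sklearn.preprocessing import StandardScaler

age_scaler = StandardScaler().fit(df[['Age']])  # fit on the training data only
df['Age_scaled'] = age_scaler.transform(df[['Age']]).ravel()
df_test['Age_scaled'] = age_scaler.transform(df_test[['Age']]).ravel()

fare_scaler = StandardScaler().fit(df[['Fare']])
df['Fare_scaled'] = fare_scaler.transform(df[['Fare']]).ravel()
df_test['Fare_scaled'] = fare_scaler.transform(df_test[['Fare']]).ravel()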
test = df_test.filter(regex='Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
predictions = clf.predict(test)
result = pd.DataFrame({'PassengerId': data_test['PassengerId'].values, 'Survived': predictions.astype(np.int32)})
result.to_csv("logistic_regression_predictions.csv", index=False)
pd.read_csv("logistic_regression_predictions.csv")
from sklearn.model_selection import learning_curve

# Use sklearn's learning_curve to get the training and cross-validation scores,
# then draw the learning curve with matplotlib.
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1,
                        train_sizes=np.linspace(.05, 1., 20), verbose=0, plot=True):
    """
    Plot the learning curve of a model on the given data.

    Parameters
    ----------
    estimator : the classifier you are using
    title : title of the plot
    X : input features, as a NumPy array
    y : target vector
    ylim : tuple (ymin, ymax) fixing the lowest and highest points of the y-axis
    cv : number of cross-validation folds; each fold takes a turn as the
         validation set while the others are used for training (default 5)
    n_jobs : number of parallel jobs (default 1)
    """
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, verbose=verbose)

    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    if plot:
        plt.figure()
        plt.title(title)
        if ylim is not None:
            plt.ylim(*ylim)
        plt.xlabel("Number of training samples")
        plt.ylabel("Score")
        plt.gca().invert_yaxis()
        plt.grid()

        plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std,
                         alpha=0.1, color="b")
        plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std,
                         alpha=0.1, color="r")
        plt.plot(train_sizes, train_scores_mean, 'o-', color="b", label="Training score")
        plt.plot(train_sizes, test_scores_mean, 'o-', color="r", label="Cross-validation score")

        plt.legend(loc="best")
        plt.draw()
        plt.gca().invert_yaxis()
        plt.show()

    midpoint = ((train_scores_mean[-1] + train_scores_std[-1]) + (test_scores_mean[-1] - test_scores_std[-1])) / 2
    diff = (train_scores_mean[-1] + train_scores_std[-1]) - (test_scores_mean[-1] - test_scores_std[-1])
    return midpoint, diff

plot_learning_curve(clf, "Learning curve", X, y)
Let's look at the fitted model's coefficients: the sign of a coefficient gives the direction of a feature's effect, and, since our inputs are standardized or 0/1 dummies, a larger magnitude suggests a stronger pull on the decision.
pd.DataFrame({"columns":list(train_df.columns)[1:], "coef":list(clf.coef_.T)})
from sklearn.model_selection import cross_val_score, train_test_split

# A quick look at cross-validated scores
clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6, solver='liblinear')
all_data = df.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
X = all_data.values[:, 1:]
y = all_data.values[:, 0]
print(cross_val_score(clf, X, y, cv=5))
# Split off a validation set
split_train, split_cv = train_test_split(df, test_size=0.3, random_state=0)
train_df = split_train.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
# Build the model
clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6, solver='liblinear')
clf.fit(train_df.values[:, 1:], train_df.values[:, 0])

# Predict on the held-out validation data
cv_df = split_cv.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
predictions = clf.predict(cv_df.values[:, 1:])
split_cv[predictions != cv_df.values[:, 0]]

# Pull out the mispredicted cases and inspect them in the original dataframe
#split_cv['PredictResult'] = predictions
origin_data_train = pd.read_csv("../data/train.csv")
bad_cases = origin_data_train.loc[origin_data_train['PassengerId'].isin(split_cv[predictions != cv_df.values[:, 0]]['PassengerId'].values)]
bad_cases
data_train[data_train['Name'].str.contains("Major")]
data_train = pd.read_csv("../data/train.csv")
data_train['Sex_Pclass'] = data_train.Sex + "_" + data_train.Pclass.map(str)
# Reuse the RandomForestRegressor-based set_missing_ages and the set_Cabin_type
# helpers defined above to impute Age and binarize Cabin on the reloaded data.
data_train, rfr = set_missing_ages(data_train)
data_train = set_Cabin_type(data_train)
dummies_Cabin = pd.get_dummies(data_train['Cabin'], prefix='Cabin')
dummies_Embarked = pd.get_dummies(data_train['Embarked'], prefix='Embarked')
dummies_Sex = pd.get_dummies(data_train['Sex'], prefix='Sex')
dummies_Pclass = pd.get_dummies(data_train['Pclass'], prefix='Pclass')
dummies_Sex_Pclass = pd.get_dummies(data_train['Sex_Pclass'], prefix='Sex_Pclass')
df = pd.concat([data_train, dummies_Cabin, dummies_Embarked, dummies_Sex, dummies_Pclass, dummies_Sex_Pclass], axis=1)
df.drop(['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'Sex_Pclass'], axis=1, inplace=True)
import sklearn.preprocessing as preprocessing
df['Age_scaled'] = preprocessing.scale(df['Age'])
df['Fare_scaled'] = preprocessing.scale(df['Fare'])
from sklearn import linear_model
train_df = df.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass.*')
train_np = train_df.values
# y is the Survived label
y = train_np[:, 0]
# X holds the feature values
X = train_np[:, 1:]
# Fit the LogisticRegression model
clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6, solver='liblinear')
clf.fit(X, y)
clf
data_test = pd.read_csv("../data/test.csv")
data_test.loc[ (data_test.Fare.isnull()), 'Fare' ] = 0
data_test['Sex_Pclass'] = data_test.Sex + "_" + data_test.Pclass.map(str)
# Apply the same feature transformations to the test data as to the training data.
# First, fill in the missing ages with the same RandomForestRegressor model.
tmp_df = data_test[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
null_age = tmp_df[data_test.Age.isnull()].values
# Predict the missing ages from the feature columns X and fill them in
X = null_age[:, 1:]
predictedAges = rfr.predict(X)
data_test.loc[ (data_test.Age.isnull()), 'Age' ] = predictedAges
data_test = set_Cabin_type(data_test)
dummies_Cabin = pd.get_dummies(data_test['Cabin'], prefix='Cabin')
dummies_Embarked = pd.get_dummies(data_test['Embarked'], prefix='Embarked')
dummies_Sex = pd.get_dummies(data_test['Sex'], prefix='Sex')
dummies_Pclass = pd.get_dummies(data_test['Pclass'], prefix='Pclass')
dummies_Sex_Pclass = pd.get_dummies(data_test['Sex_Pclass'], prefix='Sex_Pclass')
df_test = pd.concat([data_test, dummies_Cabin, dummies_Embarked, dummies_Sex, dummies_Pclass, dummies_Sex_Pclass], axis=1)
df_test.drop(['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'Sex_Pclass'], axis=1, inplace=True)
df_test['Age_scaled'] = preprocessing.scale(df_test['Age'])
df_test['Fare_scaled'] = preprocessing.scale(df_test['Fare'])
df_test
test = df_test.filter(regex='Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass.*')
predictions = clf.predict(test)
result = pd.DataFrame({'PassengerId': data_test['PassengerId'].values, 'Survived': predictions.astype(np.int32)})
result.to_csv("logistic_regression_predictions2.csv", index=False)
In the later stages, the usual next step in model optimization is model ensembling. Let's first explain what ensembling means with a couple of intuitive examples.
You have seen the "ask the audience" lifeline on quiz shows: every audience member casts a vote, and the contestant takes the most-voted answer as their own. Each person contributes one judgment, and we trust that the answer lies with the majority.
An even more everyday example: suppose you are close to the one math whiz in your class and "model" all your homework on his. Most of the time he is right, so you are too; but the day his brain fogs over and his hand slips on a digit, well... you are wrong right along with him. Now picture a different setup: you are close to five math whizzes, collect all of their homework, compare, and then "do your own". If one of them slips up some day but the other four agree, you will surely trust the answer the four of them share.
The simplest model ensembling works just like that. Take a classification problem: when we have a pile of classifiers trained on the same dataset (say logistic regression, SVM, KNN, random forest, a neural network), we let each of them make its own prediction and take the majority vote as the final answer.
Bingo, and the problem is solved beautifully.
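For illustration only (this tutorial sticks with a single logistic regression), here is a minimal hard-voting sketch using scikit-learn's VotingClassifier, assuming the X and y training arrays built earlier; the three base models are arbitrary choices:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

voting_clf = VotingClassifier(
    estimators=[
        ('lr', LogisticRegression(C=1.0, penalty='l1', tol=1e-6, solver='liblinear')),
        ('rf', RandomForestClassifier(n_estimators=200, random_state=0)),
        ('knn', KNeighborsClassifier(n_neighbors=5)),
    ],
    voting='hard',  # majority vote over the three predicted labels
)
voting_clf.fit(X, y)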
Model ensembling does a decent job of mitigating the overfitting that creeps in during training, and so tends to help the accuracy of the final result.
Back to our problem, though. So far we have only covered logistic regression; if we still want to use this ensembling idea to improve our result, what can we do?
Since we have no other model types to choose from, let's tinker with the data instead. Think about it: if the model overfits, it must be because it fit our training set too closely.
So rather than use the full training set every time, we train on a different subset of it each round. We use the same learning algorithm throughout, yet obtain different models; and since no single model sees all of the data, any overfitting happens on a sub-training set rather than on the whole. Ensembling these models can therefore help the final result. Yes, this is exactly the widely used Bagging technique.
We can implement this idea with scikit-learn's bagging ensembles; the process is very simple. The code is as follows:
from sklearn.ensemble import BaggingClassifier

train_df = df.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass.*|Mother|Child|Family|Title')
train_np = train_df.values
# y is the Survived label
y = train_np[:, 0]
# X holds the feature values
X = train_np[:, 1:]
# Bag 10 logistic regressions, each trained on a bootstrap sample of 80% of the rows
clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6, solver='liblinear')
bagging_clf = BaggingClassifier(clf, n_estimators=10, max_samples=0.8, max_features=1.0, bootstrap=True, bootstrap_features=False, n_jobs=-1)
bagging_clf.fit(X, y)

test = df_test.filter(regex='Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass.*|Mother|Child|Family|Title')
predictions = bagging_clf.predict(test)
result = pd.DataFrame({'PassengerId': data_test['PassengerId'].values, 'Survived': predictions.astype(np.int32)})
result.to_csv("logistic_regression_predictions22.csv", index=False)