- 導入庫并加載泰坦尼克號數(shù)據(jù)集
# Load the Titanic training data.
import pandas  # ipython notebook

titanic = pandas.read_csv("titanic_train.csv")
# head() alone only renders in a notebook; print() so the preview also
# appears when this file is run as a plain script.
print(titanic.head(5))
- 觀察源數(shù)據(jù)集發(fā)現(xiàn),age屬性中有缺失, 通過計算該屬性的均值將缺失處填補,使得數(shù)據(jù)的數(shù)量一致
# The Age column has missing values; fill them with the column median so
# every row has a usable age.
median_age = titanic["Age"].median()
titanic["Age"] = titanic["Age"].fillna(median_age)
print(titanic.describe())
- 獲取sex的值, 并用0和1代表男性和女性
# Show the raw categories, then encode Sex numerically: male -> 0, female -> 1.
print(titanic["Sex"].unique())
for label, code in (("male", 0), ("female", 1)):
    titanic.loc[titanic["Sex"] == label, "Sex"] = code
- 獲取embarked的值, 用0,1,2分別表示S,C,Q
# Show the raw categories, fill missing ports with 'S' (the most common),
# then encode Embarked numerically: S -> 0, C -> 1, Q -> 2.
print(titanic["Embarked"].unique())
titanic["Embarked"] = titanic["Embarked"].fillna('S')
for port, code in (("S", 0), ("C", 1), ("Q", 2)):
    titanic.loc[titanic["Embarked"] == port, "Embarked"] = code
- 使用交叉驗證并調(diào)用線性回歸算法
# Cross-validated linear regression on the training set.
from sklearn.linear_model import LinearRegression
# NOTE(review): sklearn.cross_validation was removed in scikit-learn >= 0.20;
# modern code uses sklearn.model_selection — confirm the pinned sklearn version.
from sklearn.cross_validation import KFold

# Columns used as input features for the model.
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

alg = LinearRegression()
# Generate cross-validation folds over the row indices; random_state is set
# so every run produces the same splits.
kf = KFold(titanic.shape[0], n_folds=3, random_state=1)

predictions = []
for train, test in kf:
    # Fit on the training fold only.
    fold_features = titanic[predictors].iloc[train, :]
    fold_target = titanic["Survived"].iloc[train]
    alg.fit(fold_features, fold_target)
    # Predict on the held-out fold and collect the results.
    predictions.append(alg.predict(titanic[predictors].iloc[test, :]))
- 將predictions放到一個array中
import numpy as np  # needed here: numpy is not imported earlier in this file

# Stitch the per-fold predictions back into one array (folds are in row order).
predictions = np.concatenate(predictions, axis=0)
# Threshold the regression output into the two classes (survived = 1, died = 0).
predictions[predictions > .5] = 1
predictions[predictions <= .5] = 0
# Accuracy = fraction of rows where the prediction matches the label.
# (The original expression summed only the matching predictions equal to 1,
# so every correctly predicted 0 was left out of the count.)
accuracy = sum(predictions == titanic["Survived"]) / len(predictions)
print(accuracy)
- 再換一種算法預測, 使用邏輯回歸
# Same task with logistic regression; cross_val_score does the folding for us.
from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression

alg = LogisticRegression(random_state=1)
# One accuracy score per cross-validation fold.
scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=3)
# Report the average of the per-fold scores.
print(scores.mean())
- 對真正測試集進行預處理
# Apply the same preprocessing to the real (submission) test set.
titanic_test = pandas.read_csv("test.csv")
# Impute Age with the TRAINING median — a statistic learned from the train set.
titanic_test["Age"] = titanic_test["Age"].fillna(titanic["Age"].median())
titanic_test["Fare"] = titanic_test["Fare"].fillna(titanic_test["Fare"].median())
for label, code in (("male", 0), ("female", 1)):
    titanic_test.loc[titanic_test["Sex"] == label, "Sex"] = code
titanic_test["Embarked"] = titanic_test["Embarked"].fillna("S")
for port, code in (("S", 0), ("C", 1), ("Q", 2)):
    titanic_test.loc[titanic_test["Embarked"] == port, "Embarked"] = code
- 使用隨機森林
# Random forest with near-default parameters.
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier

predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
# n_estimators: number of trees to build;
# min_samples_split: minimum rows required to split an internal node;
# min_samples_leaf: minimum samples allowed at a leaf (bottom of the tree).
alg = RandomForestClassifier(random_state=1, n_estimators=10, min_samples_split=2, min_samples_leaf=1)
# Score with 3-fold cross-validation and report the mean accuracy.
kf = cross_validation.KFold(titanic.shape[0], n_folds=3, random_state=1)
scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=kf)
print(scores.mean())
調(diào)優(yōu),修改算法參數(shù):
# Tuned forest: more trees plus stricter split/leaf sizes to curb overfitting.
alg = RandomForestClassifier(random_state=1, n_estimators=100, min_samples_split=4, min_samples_leaf=2)
# Re-score with the same 3-fold cross-validation scheme.
kf = cross_validation.KFold(titanic.shape[0], 3, random_state=1)
scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=kf)
# Mean accuracy across the folds.
print(scores.mean())
- 自己構造一個特征
# Engineer two new features.
# FamilySize: siblings/spouses plus parents/children aboard.
titanic["FamilySize"] = titanic["SibSp"] + titanic["Parch"]
# NameLength: length of the passenger's name. Pass len directly —
# the original `lambda x: len(x)` wrapper was redundant.
titanic["NameLength"] = titanic["Name"].apply(len)
import re

def get_title(name):
    """Extract the honorific title (e.g. 'Mr', 'Mrs') from a passenger name.

    A title is a run of letters followed by a period, preceded by a space.
    Returns the empty string when no title is found.
    """
    # Raw string: '\.' inside a plain string literal is an invalid escape
    # sequence and raises a SyntaxWarning on modern Python.
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    if title_search:
        return title_search.group(1)
    return ""
# Count how often each title occurs in the training data.
titles = titanic["Name"].apply(get_title)
print(pandas.value_counts(titles))

# Map each title to an integer; rare titles share a code with similar ones.
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5, "Rev": 6, "Major": 7, "Col": 7, "Mlle": 8, "Mme": 8, "Don": 9, "Lady": 10, "Countess": 10, "Jonkheer": 10, "Sir": 9, "Capt": 7, "Ms": 2}
for title, code in title_mapping.items():
    titles[titles == title] = code

# Verify that every title was converted to a number.
print(pandas.value_counts(titles))
# Store the encoded titles as a new column.
titanic["Title"] = titles
- 畫出各個特征對預測結果的影響關系
# Univariate feature selection: score each feature against survival.
import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif
import matplotlib.pyplot as plt

predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked", "FamilySize", "Title", "NameLength"]

selector = SelectKBest(f_classif, k=5)
selector.fit(titanic[predictors], titanic["Survived"])
# Turn each feature's raw p-value into a score (smaller p -> larger score).
scores = -np.log10(selector.pvalues_)

# Bar chart of the scores; "Pclass", "Sex", "Title" and "Fare" stand out.
plt.bar(range(len(predictors)), scores)
plt.xticks(range(len(predictors)), predictors, rotation='vertical')
plt.show()

# Keep only the four strongest features.
predictors = ["Pclass", "Sex", "Fare", "Title"]
alg = RandomForestClassifier(random_state=1, n_estimators=50, min_samples_split=8, min_samples_leaf=4)
- 通過boosting和logistic預測
# Ensemble: average gradient-boosting and logistic-regression probabilities.
from sklearn.ensemble import GradientBoostingClassifier
import numpy as np

# Each entry pairs an algorithm with the feature columns it trains on.
algorithms = [
    [GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3), ["Pclass", "Sex", "Age", "Fare", "Embarked", "FamilySize", "Title",]],
    [LogisticRegression(random_state=1), ["Pclass", "Sex", "Fare", "FamilySize", "Title", "Age", "Embarked"]]
]

# Initialize the cross-validation folds (reproducible via random_state).
kf = KFold(titanic.shape[0], n_folds=3, random_state=1)

predictions = []
for train, test in kf:
    train_target = titanic["Survived"].iloc[train]
    full_test_predictions = []
    # Fit each algorithm on the training fold and predict the test fold.
    for alg, predictors in algorithms:
        alg.fit(titanic[predictors].iloc[train, :], train_target)
        # .astype(float) converts the mixed-dtype frame so sklearn accepts it.
        test_predictions = alg.predict_proba(titanic[predictors].iloc[test, :].astype(float))[:, 1]
        full_test_predictions.append(test_predictions)
    # Simple ensembling scheme: average the two probability estimates.
    test_predictions = (full_test_predictions[0] + full_test_predictions[1]) / 2
    # Threshold at 0.5 into the two class labels.
    test_predictions[test_predictions <= .5] = 0
    test_predictions[test_predictions > .5] = 1
    predictions.append(test_predictions)

# Put all the fold predictions into one array, in row order.
predictions = np.concatenate(predictions, axis=0)
# Accuracy = fraction of rows where the prediction matches the training label.
# (The original expression summed only the matching predictions equal to 1,
# undercounting every correctly predicted 0.)
accuracy = sum(predictions == titanic["Survived"]) / len(predictions)
print(accuracy)
小結
以上分別使用了'線性回歸', '邏輯回歸', '隨機森林',以及'boosting+logistic集成學習方法'對數(shù)據(jù)進行預測