本例使用sklearn進行kaggle案例泰坦尼克Titanic船員獲救預測
環境:python3+Anaconda(Anaconda集成了實驗用到的包)
源碼及語料:https://gitee.com/yqmyqm/Machine_learn
下面為實現代碼:
'''
Created on 2017年12月20日
@author: yqm
'''
import pandas
# Load the Kaggle Titanic training set and turn the columns used as model
# features into numeric values.
file_dir = "G:\\研究生\\實驗\\語料\\titanic_train.csv"
titanic = pandas.read_csv(file_dir)

# Age has missing entries (NaN); fill them with the column *median*.
titanic["Age"] = titanic["Age"].fillna(titanic["Age"].median())

# Encode Sex as integers: male -> 0, female -> 1.
# Series.map builds a new integer column in one step. Writing ints into the
# string column through boolean .loc masks leaves an object-dtype column and
# is rejected when the column uses pandas' dedicated string dtype.
# Any value not listed in the mapping becomes NaN.
titanic["Sex"] = titanic["Sex"].map({"male": 0, "female": 1})

# Embarked contains 'S', 'C', 'Q' and some NaN. Missing ports are filled with
# "S" first, then the ports are encoded as S -> 0, C -> 1, Q -> 2.
titanic["Embarked"] = (
    titanic["Embarked"].fillna("S").map({"S": 0, "C": 1, "Q": 2})
)
# Fit a linear regression on the Titanic features with 3-fold cross-validation
# and collect the out-of-fold predictions.
from sklearn.linear_model import LinearRegression
# sklearn.cross_validation was removed in scikit-learn 0.20; its replacement
# lives in sklearn.model_selection (available since 0.18).
from sklearn.model_selection import KFold

# Feature columns passed to the model.
predictors = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
alg = LinearRegression()

# Split the rows of `titanic` into 3 folds. Each round holds one fold out for
# prediction and trains on the remaining two, so every row is predicted
# exactly once.
# No shuffling is requested, so the folds are consecutive row ranges and a
# random_state would have no effect; current scikit-learn raises ValueError
# if random_state is given while shuffle is False, so it is omitted.
kf = KFold(n_splits=3)

predictions = []
# kf.split yields positional (train, test) index arrays, hence .iloc below.
for train, test in kf.split(titanic):
    # Training features and the matching Survived labels for this round.
    train_predictions = titanic[predictors].iloc[train, :]
    train_target = titanic["Survived"].iloc[train]
    alg.fit(train_predictions, train_target)
    # Predict the held-out fold with the model just fitted.
    test_prediction = alg.predict(titanic[predictors].iloc[test, :])
    predictions.append(test_prediction)

# One array of raw regression outputs per fold, in fold order.
print(predictions)