1. Goal of Model Fusion
Fuse several models whose hyper-parameters have already been tuned.
2. Theory Behind Stacking
Stacking first learns several base learners from the original training data, then uses those learners' predictions as a new training set to learn a second-level learner.
The method used to combine individual learners is called a combination strategy. For classification, a voting scheme can pick the class predicted most often; for regression, the outputs of the individual learners can simply be averaged (a weighted-average sketch appears after the model builders below).
Voting and averaging are both effective combination strategies. Stacking is a combination strategy that uses another machine-learning algorithm to combine the outputs of the individual learners.
In stacking, the individual learners are called first-level learners, and the learner used to combine them is called the second-level learner or meta-learner. The data used to train the meta-learner is called the second-level training set, and it is obtained from the first-level learners' predictions on the training set.
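To make this concrete, below is a minimal sketch using scikit-learn's StackingRegressor, which automates exactly this recipe. It is illustrative only and not part of this tutorial's pipeline; the toy dataset and hyper-parameters are assumptions.

from sklearn.ensemble import StackingRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.datasets import make_regression

# Toy data standing in for any regression task
X, y = make_regression(n_samples=500, n_features=10, random_state=0)

stack = StackingRegressor(
    estimators=[('ridge', Ridge(alpha=0.8)),                        # first-level learners
                ('gbdt', GradientBoostingRegressor(n_estimators=100))],
    final_estimator=LinearRegression(),  # meta-learner
    cv=5)  # second-level training set built from 5-fold out-of-fold predictions
stack.fit(X, y)
print(stack.predict(X[:5]))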
3. Code Implementation
Import packages
import pandas as pd
import numpy as np
import warnings
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
# Model-related imports used in the sections below
from sklearn import linear_model
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import mean_absolute_error
import xgboost as xgb
import lightgbm as lgb
warnings.filterwarnings('ignore')
%matplotlib inline
Read the data
Train_data = pd.read_csv('used_car_train_20200313.csv', sep=' ')
TestB_data = pd.read_csv('used_car_testB_20200421.csv', sep=' ')
print(Train_data.shape)
print(TestB_data.shape)
Train_data.head()
Inspect the data and preprocess
numerical_cols = Train_data.select_dtypes(exclude = 'object').columns
print(numerical_cols)
feature_cols = [col for col in numerical_cols if col not in ['SaleID','name','regDate','price']]
X_data = Train_data[feature_cols]
Y_data = Train_data['price']
X_test = TestB_data[feature_cols]
print('X train shape:',X_data.shape)
print('X test shape:',X_test.shape)
def Sta_inf(data):
    # Print basic statistics of a distribution (here, the label)
    print('_min:', np.min(data))
    print('_max:', np.max(data))
    print('_mean:', np.mean(data))
    print('_ptp:', np.ptp(data))
    print('_std:', np.std(data))
    print('_var:', np.var(data))

print('Sta of label:')
Sta_inf(Y_data)
# Fill missing values with -1 as a simple placeholder
X_data = X_data.fillna(-1)
X_test = X_test.fillna(-1)
def build_model_lr(x_train, y_train):
    # Plain linear regression as the simplest baseline
    reg_model = linear_model.LinearRegression()
    reg_model.fit(x_train, y_train)
    return reg_model

def build_model_ridge(x_train, y_train):
    # L2-regularized linear regression
    reg_model = linear_model.Ridge(alpha=0.8)  # alternative: RidgeCV over alphas=range(1,100,5)
    reg_model.fit(x_train, y_train)
    return reg_model

def build_model_lasso(x_train, y_train):
    # LassoCV picks the regularization strength by cross-validation
    reg_model = linear_model.LassoCV()
    reg_model.fit(x_train, y_train)
    return reg_model
def build_model_gbdt(x_train, y_train):
    # Grid-search the learning rate of a gradient-boosting regressor
    estimator = GradientBoostingRegressor(loss='squared_error',  # 'ls' in scikit-learn < 1.0
                                          subsample=0.85, max_depth=5, n_estimators=100)
    param_grid = {
        'learning_rate': [0.05, 0.08, 0.1, 0.2],
    }
    gbdt = GridSearchCV(estimator, param_grid, cv=3)
    gbdt.fit(x_train, y_train)
    print(gbdt.best_params_)
    # print(gbdt.best_estimator_)
    return gbdt
def build_model_xgb(x_train, y_train):
    model = xgb.XGBRegressor(n_estimators=120, learning_rate=0.08, gamma=0, subsample=0.8,
                             colsample_bytree=0.9, max_depth=5)  # objective='reg:squarederror'
    model.fit(x_train, y_train)
    return model
def build_model_lgb(x_train, y_train):
    estimator = lgb.LGBMRegressor(num_leaves=63, n_estimators=100)
    param_grid = {
        'learning_rate': [0.01, 0.05, 0.1],
    }
    gbm = GridSearchCV(estimator, param_grid)
    gbm.fit(x_train, y_train)
    return gbm
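Before the cross-validation run below, here is a hedged sketch of how these builders might be combined with the simple averaging strategy from Section 2. The 0.2/0.3/0.5 weights are illustrative assumptions, not tuned values.

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

x_train, x_val, y_train, y_val = train_test_split(X_data, Y_data, test_size=0.3, random_state=0)

model_lr = build_model_lr(x_train, y_train)
model_ridge = build_model_ridge(x_train, y_train)
model_lgb = build_model_lgb(x_train, y_train)

# Weighted average of the individual predictions; weights sum to 1
val_pred = (0.2 * model_lr.predict(x_val)
            + 0.3 * model_ridge.predict(x_val)
            + 0.5 * model_lgb.predict(x_val))
print('Weighted-average MAE:', mean_absolute_error(y_val, val_pred))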
## xgb
xgr = xgb.XGBRegressor(n_estimators=120, learning_rate=0.1, subsample=0.8,
                       colsample_bytree=0.9, max_depth=7)  # objective='reg:squarederror'
scores_train = []
scores = []
## 5-fold cross-validation
# KFold, not StratifiedKFold: the target (price) is continuous, so
# stratified splitting does not apply to this regression task.
sk = KFold(n_splits=5, shuffle=True, random_state=0)
for train_ind, val_ind in sk.split(X_data, Y_data):
    train_x = X_data.iloc[train_ind].values
    train_y = Y_data.iloc[train_ind]
    val_x = X_data.iloc[val_ind].values
    val_y = Y_data.iloc[val_ind]
    xgr.fit(train_x, train_y)
    pred_train_xgb = xgr.predict(train_x)
    pred_xgb = xgr.predict(val_x)
    score_train = mean_absolute_error(train_y, pred_train_xgb)
    scores_train.append(score_train)
    score = mean_absolute_error(val_y, pred_xgb)
    scores.append(score)
print('Train mae:', np.mean(scores_train))  # bug fix: was np.mean(score_train), the last fold only
print('Val mae:', np.mean(scores))
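The code above stops at cross-validating a single XGBoost model; the stacking step described in Section 2 still has to be assembled. Below is a minimal sketch of that step using the same variables (X_data, Y_data, X_test): out-of-fold predictions form the second-level training set and a linear model serves as the meta-learner. The choice of base models here is an assumption for illustration.

from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import LinearRegression

# Second-level training set: one column of out-of-fold predictions per base learner
oof_ridge = cross_val_predict(linear_model.Ridge(alpha=0.8), X_data, Y_data, cv=5)
oof_lgb = cross_val_predict(lgb.LGBMRegressor(num_leaves=63, n_estimators=100), X_data, Y_data, cv=5)
stack_X = np.column_stack([oof_ridge, oof_lgb])

# Meta-learner trained on the out-of-fold predictions
meta = LinearRegression()
meta.fit(stack_X, Y_data)

# At prediction time, base learners refit on the full training set feed the meta-learner
ridge_full = build_model_ridge(X_data, Y_data)
lgb_full = build_model_lgb(X_data, Y_data)
test_stack = np.column_stack([ridge_full.predict(X_test), lgb_full.predict(X_test)])
pred_test = meta.predict(test_stack)
print('Sta of stacked prediction:')
Sta_inf(pred_test)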