XGBoost Regression
import xgboost as xgb
from xgboost import plot_importance,plot_tree
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston
import matplotlib.pyplot as plt
# Load the dataset (note: load_boston was removed in scikit-learn 1.2,
# so this example needs an older scikit-learn)
boston = load_boston()
# Extract features and target
x, y = boston.data, boston.target
# Get the feature names
feature_name = boston.feature_names
# Split the dataset
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
# Training parameter settings
params = {
    # General parameters
    'booster': 'gbtree',  # weak learner: 'gbtree' (default, tree-based boosting)
                          # or 'gblinear' (linear-model boosting)
    # Task parameters
    'objective': 'reg:gamma',  # regression loss function: gamma regression
    # Booster parameters
    'gamma': 0.1,  # minimum loss reduction required to split a leaf further
    'max_depth': 5,  # maximum tree depth (XGBoost's default is 6)
    'lambda': 3,  # L2 regularization weight
    'subsample': 0.7,  # fraction of training samples used per tree, against overfitting
    'colsample_bytree': 0.7,  # fraction of features sampled when building each tree
    'min_child_weight': 3,  # minimum sum of instance weights required in a child node
    'eta': 0.1,  # shrinkage step size (learning rate) of the additive model
    'seed': 1000,  # random seed
    'nthread': 4,  # number of threads
}
plst = list(params.items())
# Convert the data into XGBoost's DMatrix format
dtrain = xgb.DMatrix(x_train, y_train, feature_names=feature_name)
dtest = xgb.DMatrix(x_test, feature_names=feature_name)
# Train the model
num_rounds = 30
model = xgb.train(plst, dtrain, num_rounds)
# Predict
y_pred = model.predict(dtest)
# Plot feature importance
plot_importance(model, importance_type="weight")
plt.show()
# Visualize one of the trees; num_trees is the index of the tree to plot
plot_tree(model, num_trees=17)
plt.show()
# Dump the base learners to a text file
model.dump_model('model2.txt')
[Figures: feature-importance plot and plotted tree (original screenshots omitted)]
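The listing predicts on the test split but never scores the predictions. A minimal follow-up sketch, assuming the variables from the listing above are still in scope:

import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
# Score the gamma-regression predictions against the held-out targets
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE: %.3f, R^2: %.3f" % (rmse, r2_score(y_test, y_pred)))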
sklearn-XGBoost Regression
import xgboost as xgb
from xgboost import plot_importance,plot_tree
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston
import matplotlib.pyplot as plt
# Load the dataset
boston = load_boston()
# Extract features and target
x, y = boston.data, boston.target
# Get the feature names
feature_name = boston.feature_names
# Split the dataset
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
# Train the model
# XGBRegressor is the gradient-boosting regressor (XGBRFRegressor is the
# random-forest variant); the deprecated silent flag is dropped
model = xgb.XGBRegressor(max_depth=5, learning_rate=0.1, n_estimators=50, objective='reg:gamma')
model.fit(x_train, y_train)
# Predict
y_pred = model.predict(x_test)
# Plot feature importance
plot_importance(model)
plt.show()
# Visualize one of the trees; num_trees is the index of the tree to plot
plot_tree(model, num_trees=17)
plt.show()
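With the sklearn wrapper, scoring is built in; a small sketch, assuming the fitted model above:

# XGBRegressor.score returns the coefficient of determination (R^2) on the given split
print("R^2 on the test split: %.3f" % model.score(x_test, y_test))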
XGBoost Classification
import xgboost as xgb
from xgboost import plot_importance, plot_tree
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
iris = load_iris()
x, y = iris.data, iris.target
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1234565)
# Training parameter settings
params = {
    # General parameters
    'booster': 'gbtree',  # weak learner: 'gbtree' (default, tree-based boosting)
                          # or 'gblinear' (linear-model boosting)
    'nthread': 4,  # number of threads; defaults to the maximum available
    'silent': 0,  # 0: print runtime messages (default); 1: run silently
    'num_feature': 4,  # number of feature dimensions used during boosting
    'seed': 1000,  # random seed
    # Task parameters
    'objective': 'multi:softmax',  # multi-class softmax; objective sets the learning task and its loss
    'num_class': 3,  # total number of classes
    # Booster parameters
    'gamma': 0.1,  # minimum loss reduction required to split a leaf further
    'max_depth': 6,  # maximum tree depth (the default is 6)
    'lambda': 2,  # L2 regularization weight
    'subsample': 0.7,  # fraction of training samples used per tree, against overfitting
    'colsample_bytree': 0.7,  # fraction of features sampled when building each tree
    'min_child_weight': 3,  # minimum sum of instance weights required in a child node
    'eta': 0.1  # shrinkage step size (learning rate) of the additive model
}
plst = list(params.items())
# Convert the data into XGBoost's DMatrix format
dtrain = xgb.DMatrix(x_train, y_train)
dtest = xgb.DMatrix(x_test)
# Number of boosting rounds; with multi:softmax one tree is grown per class per round,
# so total base learners = rounds * num_class
num_rounds = 50
model = xgb.train(plst, dtrain, num_rounds)  # train the XGBoost model
# Predict on the test set
y_pred = model.predict(dtest)
# Compute accuracy
accuracy = accuracy_score(y_test, y_pred)
print("accuracy: %.2f%%" % (accuracy*100.0))
# Plot feature importance
plot_importance(model)
plt.show()
# Visualize one of the trees; num_trees is the index of the tree to plot
plot_tree(model, num_trees=5)
plt.show()
# Dump the base learners to a text file
model.dump_model('model.txt')
[Figures: accuracy output, feature-importance plot, and plotted tree (original screenshots omitted)]
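The comment before num_rounds claims total base learners = rounds * num_class; a quick check on the trained booster above (here 50 * 3 = 150):

# Each entry of get_dump() is the text dump of one base learner (tree)
print(len(model.get_dump()))  # expected: 150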
sklearn-XGBoost Classification
import xgboost as xgb
from xgboost import plot_importance,plot_tree
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
iris = load_iris()
x, y = iris.data, iris.target
feature_name = iris.feature_names
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=3)
# Train the model; note that XGBClassifier accepts no feature_names argument
# (names are picked up automatically when x is a pandas DataFrame), and the
# silent flag is deprecated, so both are dropped here
model = xgb.XGBClassifier(max_depth=5, n_estimators=50, objective='multi:softmax')
model.fit(x_train, y_train)
# Predict
y_pred = model.predict(x_test)
# Compute accuracy
accuracy = accuracy_score(y_test, y_pred)
print("accuracy: %.2f%%" % (accuracy*100.0))
# Plot feature importance
plot_importance(model)
plt.show()
# Visualize one of the trees; num_trees is the index of the tree to plot
plot_tree(model, num_trees=5)
plt.show()
[Figures: accuracy output, feature-importance plot, and plotted tree (original screenshots omitted)]
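If class probabilities are wanted instead of hard labels, the wrapper also exposes predict_proba; a minimal sketch using the fitted classifier above:

# Probability distribution over the 3 iris classes for each test sample
proba = model.predict_proba(x_test)
print(proba.shape)  # (n_test_samples, 3); each row sums to 1
print(proba[:5])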
LightGBM
import lightgbm as lgb
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
# Load the dataset
breast = load_breast_cancer()
# Extract features and target
x, y = breast.data, breast.target
# Get the feature names
feature_name = breast.feature_names
# Split the dataset
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
# Convert the data into LightGBM's Dataset format
lgb_train = lgb.Dataset(x_train, y_train)
lgb_eval = lgb.Dataset(x_test, y_test, reference=lgb_train)
# Parameter settings
boost_round = 50  # number of boosting rounds
early_stop_rounds = 10  # stop if the validation metric fails to improve for this many rounds
params = {
    'boosting_type': 'gbdt',  # type of boosting
    'objective': 'regression',  # objective function
    'metric': {'l2', 'auc'},  # evaluation metrics
    'num_leaves': 31,  # number of leaves
    'learning_rate': 0.05,  # learning rate
    'feature_fraction': 0.9,  # fraction of features sampled when building each tree
    'bagging_fraction': 0.8,  # fraction of samples used per tree
    'bagging_freq': 5,  # k: perform bagging every k iterations
    'verbose': 1  # <0: fatal only; =0: errors and warnings; >0: info
}
# Train the model with early stopping
# (in LightGBM >= 4.0, pass callbacks=[lgb.early_stopping(early_stop_rounds),
# lgb.record_evaluation(results)] instead of the two keyword arguments below)
results = {}
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=boost_round,
    valid_sets=(lgb_eval, lgb_train),
    valid_names=('validate', 'train'),
    early_stopping_rounds=early_stop_rounds,
    evals_result=results
)
# Predict with the best iteration found by early stopping
y_pred = gbm.predict(x_test, num_iteration=gbm.best_iteration)
print(y_pred)
# Plot the recorded training/validation metric curves
lgb.plot_metric(results)
plt.show()
# Plot feature importance
lgb.plot_importance(gbm, importance_type='split')
plt.show()
[Figures: metric curves during training and feature-importance plot (original screenshots omitted)]
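accuracy_score is imported above but never used: because the model is trained with a regression objective on the 0/1 breast-cancer labels, its continuous predictions must be thresholded before scoring. A minimal follow-up sketch, assuming the variables from the listing:

# Threshold the continuous outputs at 0.5 to recover class labels, then score them
y_label = (y_pred >= 0.5).astype(int)
print("accuracy: %.2f%%" % (accuracy_score(y_test, y_label) * 100.0))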