在lightgbm中對categorical feature有專門的處理,但是需要標明哪些特征是categorical類型;另外在執行config文件時也有相應的參數categorical_feature,可見LightGBM parameters。
如果是python API,是通過pandas標明category,如下:
import pickle
import datetime
import json
import xgboost as xgb
import lightgbm as lgb
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import cross_validation
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
def categorize(X, cols):
    """Cast the given columns of a pandas DataFrame to the ``category`` dtype.

    The conversion is done in place: the columns of ``X`` are reassigned on
    the object that was passed in, and that same object is returned so the
    call can be used in an assignment.

    Args:
        X: pandas DataFrame holding the feature columns.
        cols: iterable of column names to convert.

    Returns:
        The same DataFrame ``X``, with every column in ``cols`` converted to
        the ``category`` dtype. Columns not listed in ``cols`` are untouched.

    Raises:
        KeyError: if a name in ``cols`` is not a column of ``X``.
    """
    for col in cols:
        # Reassigning the column replaces its dtype on the caller's frame.
        X[col] = X[col].astype("category")
    return X
def train_lgb(path_X, path_y):
    """Train a LightGBM binary classifier from CSV features and labels.

    Reads the feature table from ``path_X`` and the labels from ``path_y``,
    marks the example columns as categorical, holds out 20% of the rows for
    evaluation, fits an ``LGBMClassifier``, prints the train and test AUC,
    and saves the underlying booster to ``DIR + "model/" + <model name>``.

    Args:
        path_X: path to the CSV file of extracted features (with a header).
        path_y: path to the CSV file of labels (no header row; the first
            column is used as the label).

    Returns:
        The fitted ``lgb.LGBMClassifier``.
    """
    # Load the pre-extracted features. Per the original author's note, if the
    # CSV does not exist yet, gen_data() has to be run first.
    X = pd.read_csv(path_X)

    cols = ['x1', 'x2']  # example only: the columns to treat as categorical
    X = categorize(X, cols)

    # Temporary workaround kept from the original: the 'pv' column was built
    # from new_resblock_pv and is excluded for now; remove this once the
    # feature is corrected. Guarded so a file without 'pv' does not raise.
    if 'pv' in X.columns:
        del X['pv']

    # header=None: the label file has no header row, so reading it with a
    # header would consume the first label.
    label = pd.read_csv(path_y, header=None)
    # Pass a plain list of labels (the original author notes this form does
    # not trigger a warning).
    label = label[0].tolist()

    train_X, test_X, train_y, test_y = train_test_split(
        X, label, test_size=0.2, random_state=2019)

    # Categorical features must be marked to be used as such; no
    # categorical_feature argument is passed to fit(), so LightGBM's default
    # ('auto') applies -- presumably picking up the pandas category columns.
    clf = lgb.LGBMClassifier(
        objective='binary',
        max_depth=4,
        learning_rate=0.3,
        n_estimators=300,
        verbosity=-1,
        metric='auc',
    )
    clf.fit(train_X, train_y)

    # AUC on the held-out split and on the training split.
    pred = clf.predict_proba(test_X)[:, 1]
    auc = metrics.roc_auc_score(test_y, pred)
    pred_train = clf.predict_proba(train_X)[:, 1]
    auc_train = metrics.roc_auc_score(train_y, pred_train)

    # Single-argument print() produces the same text on Python 2 and 3.
    print("train_auc: %s" % (auc_train,))
    print("test_auc: %s" % (auc,))

    # Save the model under a timestamped file name.
    model_name = 'lgb' + datetime.datetime.now().strftime('%Y-%m-%d-%H_%M') + '.model'
    print(model_name)
    clf.booster_.save_model(DIR + "model/" + model_name)
    return clf