# XGBoost section code (feature screening + model training)

# Read the training-set data.
# NOTE(review): the original comment here said "read test-set data", which
# contradicts the code below — it actually loads train.xlsx.
print(get_print_head('讀取訓(xùn)練集數(shù)據(jù)'))
train = pd.read_excel('train.xlsx')
print(get_print_head("訓(xùn)練集 shape"))
print(train.shape)
print(get_print_head("完成訓(xùn)練集數(shù)據(jù)讀取"))
print('\n')

# print(get_print_head("train set 01 head"))
# print(data1_1.head())
# print(get_print_head("train set 02 head"))
# print(data1_2.head())
# print('\n')

# Read the test-set data.
print(get_print_head('讀取測試集數(shù)據(jù)'))
test = pd.read_excel('test.xlsx')
print(get_print_head("測試集 shape"))
print(test.shape)
print(get_print_head("完成測試集數(shù)據(jù)讀取"))
print('\n')
# Normalize the Chinese column names to ASCII identifiers:
# 姓名 -> name, 回溯時間 -> dt (snapshot date), 身份證號 -> idcard, 手機(jī)號 -> mobile.
train.rename(columns={"姓名":'name','回溯時間':'dt','身份證號':'idcard','手機(jī)號':'mobile'},inplace=True)
test.rename(columns={"姓名":'name','回溯時間':'dt','身份證號':'idcard','手機(jī)號':'mobile'},inplace=True)
# Parse the yyyymmdd date column, then store it back as 'YYYY-MM-DD' strings.
train['dt'] = pd.to_datetime(train['dt'],format='%Y%m%d')
test['dt'] = pd.to_datetime(test['dt'],format='%Y%m%d')
train['dt'] = train['dt'].astype(str)
test['dt'] = test['dt'].astype(str)

# Build the raw feature list — every column after the four ID columns
# (name, dt, idcard, mobile) — and persist it so downstream stages reload
# the exact same column ordering.
print(get_print_head("獲取原始變量列表"))
raw_xlist0 = train.columns.tolist()[4:]
print("原始變量有{}個".format(str(len(raw_xlist0))))
print(get_print_head("保存原始變量列表"))
raw_xlist0_file = 'raw_xlist0.pkl'
print("原始變量文件名為:{}".format(raw_xlist0_file))
with open(raw_xlist0_file, 'wb') as fh:
    pickle.dump(raw_xlist0, fh)
print(get_print_head("完成保存原始變量列表"))
print('\n')

print(get_print_head("訓(xùn)練集匹配率"))
# Match rate = share of rows where at least one raw feature is non-null.
# BUG FIX: divide by the actual row count instead of the hard-coded sample
# sizes (20000 / 70000), so the rate stays correct if the samples change.
# Also use .all(axis=1) directly instead of transposing the whole frame.
train_match_rate = (~train[raw_xlist0].isnull().all(axis=1)).mean()
print("訓(xùn)練集匹配率為{}".format(str(train_match_rate)))
# An optional step to drop unmatched training rows was deliberately disabled
# in the original:
# train = train[~train[raw_xlist0].isnull().all(axis=1)]
print('\n')

print(get_print_head("測試集匹配率"))
test_match_rate = (~test[raw_xlist0].isnull().all(axis=1)).mean()
print("測試集匹配率為{}".format(str(test_match_rate)))
# Same optional filter, disabled in the original:
# test = test[~test[raw_xlist0].isnull().all(axis=1)]
print('\n')

print(get_print_head("檢查字符型變量個數(shù)"))
# Cast each raw feature to float on the training set; features whose cast
# fails are classified as string (categorical) variables.
float_xlist1 = raw_xlist0.copy()
for xvar in raw_xlist0:
    try:
        train[xvar] = train[xvar].astype(float)
    except (ValueError, TypeError):
        # BUG FIX: narrowed from a bare `except:` so unrelated errors
        # (KeyError on a missing column, KeyboardInterrupt, ...) are no
        # longer silently misreported as "string variable".
        float_xlist1.remove(xvar)
print("字符型變量有{}個".format(str(len(raw_xlist0)-len(float_xlist1))))
print(get_print_head("保存非字符變量列表"))
float_xlist1_file = 'float_xlist1.pkl'
print("非字符變量文件名為:{}".format(float_xlist1_file))
with open(float_xlist1_file,'wb') as f:
    pickle.dump(float_xlist1,f)
print(get_print_head("完成保存非字符變量列表"))
print(get_print_head("保存字符變量列表"))
string_xlist1_file = 'string_xlist1.pkl'
# NOTE: set difference loses the original column order; acceptable since the
# list is only pickled for reference.
string_xlist1 = list(set(raw_xlist0) - set(float_xlist1))
print("字符變量文件名為:{}".format(string_xlist1_file))
with open(string_xlist1_file,'wb') as f:
    pickle.dump(string_xlist1,f)
print(get_print_head("完成保存字符變量列表"))
print("\n")

print(get_print_head("檢查缺失率高于95%變量個數(shù)"))
# Per-feature summary stats on the float features; .T puts features on rows.
desc = train[float_xlist1].describe(percentiles=[.01,.05,.95,.99]).T
# describe()'s "count" excludes NaN, so missing rate = 1 - count/len(train).
desc["%missing"] = (len(train) - desc["count"]) / len(train)
# Keep features whose missing rate is at most 95%.
high_missing_remove_xlist2 = desc[desc["%missing"]<=0.95].index.tolist()
# Iterate over a copy so removing from the kept list is safe mid-loop.
temp_xlist = high_missing_remove_xlist2.copy()
# Second filter: also drop near-constant features where one single value
# covers more than 95% of the training rows (groupby drops NaN, so this is
# concentration among non-missing values relative to len(train)).
for xvar in temp_xlist:
    group = train.groupby(xvar).size()
    group = pd.DataFrame(group)
    group.reset_index(inplace=True)
    group.rename(columns={0:"cnt"},inplace=True)
    cnt = group["cnt"].max() 
    if cnt / len(train) > 0.95:
        high_missing_remove_xlist2.remove(xvar)
# NOTE(review): the count printed below lumps the concentration-based
# removals in with the missing-rate removals, so the "high missing" label
# is an over-statement.
print('缺失率高于95%的變量為{}個'.format(str(len(float_xlist1) - len(high_missing_remove_xlist2))))
print(get_print_head("保存缺失率小于等于95%變量列表"))
high_missing_remove_xlist2_file = 'high_missing_remove_xlist2.pkl'
print("缺失率小于等于95%變量文件名為:{}".format(high_missing_remove_xlist2_file))
with open(high_missing_remove_xlist2_file,'wb') as f:
    pickle.dump(high_missing_remove_xlist2,f)
print(get_print_head("完成保存缺失率小于等于95%變量列表"))
print("\n")

import md
import pandas as pd
import numpy as np
import gc
import scorecardpy as sc
from sklearn.metrics import roc_curve
import xgboost as xgb
import pickle
import json
def cal_iv(data, feature_list):
    """Compute WOE bins and total information value (IV) per feature.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain every column in *feature_list* plus a binary 'label'.
    feature_list : list[str]
        Feature columns to bin via scorecardpy's woebin.

    Returns
    -------
    (iv, bins) : (pd.Series, dict)
        *iv* maps feature name -> total IV; *bins* maps feature name -> the
        woebin result frame. Features whose binning raised are printed,
        skipped, and appear in neither output.
    """
    fail = []
    bins = {}
    for x in feature_list:
        try:
            bins_ = sc.woebin(data[[x, 'label']], y="label")
            bins.update(bins_)
        except Exception:
            # Narrowed from a bare `except:`; still best-effort — record
            # and continue with the remaining features.
            print(x)
            fail.append(x)

    # BUG FIX: pd.Series() with no dtype is deprecated for the empty case;
    # build the IV series from a dict in one shot instead.
    iv = pd.Series({k: v['total_iv'].values[0] for k, v in bins.items()}, dtype=float)
    return iv, bins
def get_label(x):
    """Binarize overdue days *x*: 0 -> good (0), >= 30 -> bad (1), and the
    grey zone in between -> NaN (so those rows can be excluded later)."""
    if x >= 30:
        return 1
    return 0 if x == 0 else np.nan
    
def ks(y_predicted, y_true):
    """xgboost-style eval metric: the KS statistic, negated so that a
    better (larger) KS reads as a smaller metric value.

    *y_true* is presumably an xgb.DMatrix — labels are fetched via
    .get_label()."""
    fpr, tpr, _ = roc_curve(y_true.get_label(), y_predicted)
    return 'ks', -np.max(tpr - fpr)

def ks1(y_predicted, y_true):
    """Same KS metric as ks(), but *y_true* is a plain label array rather
    than an xgb.DMatrix."""
    fpr, tpr, _ = roc_curve(y_true, y_predicted)
    return 'ks', -np.max(tpr - fpr)

def psi(data1, data2, var):
    """Population Stability Index of numeric column *var* between two frames.

    *data1* is the base population (its deciles define the bin edges),
    *data2* the comparison population.

    Returns a DataFrame with one row per bin and columns
    [feature, base_cnt, test_cnt, base_rate, test_rate, psi, total_psi]
    (total_psi is the same sum repeated on every row), or None — after
    printing a warning — when binning fails (e.g. empty/non-numeric column).
    """
    try:
        # Decile edges from the base population; duplicate edges dropped so
        # heavily-tied columns still bin.
        _, edges = pd.qcut(data1[var], q=10, retbins=True, duplicates='drop')
        # Open-ended outer bins so no value of either population is dropped.
        edges = np.sort(list(set([-np.inf] + edges.tolist() + [np.inf])))
        base_counts = pd.cut(data1[var], bins=edges).value_counts(dropna=False)
        test_counts = pd.cut(data2[var], bins=edges).value_counts(dropna=False)
        # Stringify the interval index so both sides concat/sort consistently.
        base_counts.index = base_counts.index.astype(str)
        test_counts.index = test_counts.index.astype(str)
        t = pd.concat([base_counts, test_counts], axis=1).sort_index()

        t.columns = ['base_cnt', 'test_cnt']
        t['feature'] = var
        t.loc[t.base_cnt.isnull(), 'base_cnt'] = 0
        t.loc[t.test_cnt.isnull(), 'test_cnt'] = 0
        t['base_rate'] = t['base_cnt'] / t['base_cnt'].sum()
        t['test_rate'] = t['test_cnt'] / t['test_cnt'].sum()
        # The +0.000001 guards the ratio/log against empty bins.
        t['psi'] = (t['test_rate']-t['base_rate'])*np.log((t['test_rate']+0.000001)/(t['base_rate']+0.000001))
        t['total_psi'] = t['psi'].sum()
        columns = ['feature', 'base_cnt', 'test_cnt', 'base_rate', 'test_rate', 'psi', 'total_psi']
        return t.loc[:, columns]
    except Exception:
        # BUG FIX: narrowed from a bare `except:`; still best-effort —
        # warn ("no data") and fall through returning None.
        print('無數(shù)據(jù)疟游!', var)
# group_all = pd.DataFrame()
# for xvar in xlist:
#     print(xvar)
#     group = raw.groupby([xvar,"label"]).size().reset_index()
#     group = pd.DataFrame(group)
#     group.rename(columns={0:"cnt"},inplace=True)
#     group0 = group[group["label"]==0]
#     group0.rename(columns={"cnt":"good"},inplace=True)
#     group1 = group[group["label"]==1]
#     group1.rename(columns={"cnt":"bad"},inplace=True)
#     group0 = group0[[xvar,"good"]]
#     group1 = group1[[xvar,"bad"]]
#     group = pd.merge(group0,group1,how='outer',on=xvar)
#     group["total"] = group["good"]+ group["bad"]
#     group["%total"] = group["total"] / group["total"].sum()
#     group["%bad"] = group["bad"] / group["total"]
#     group.rename(columns={xvar:"value"},inplace=True)
#     group["xvar"] = xvar
#     group = group[["xvar","value","good","bad","total","%total","%bad"]]
#     group_all = pd.concat([group_all,group])
# Load a pre-computed feature list pickled by an earlier stage.
with open(r"br2_features.pkl",'rb') as f:
    br2_feature = pickle.load(f)
# Keep only features with non-zero IV.
# NOTE(review): `iv_table` is not defined anywhere in this file — it is
# presumably a frame with 'xvar'/'iv' columns built from cal_iv() output in
# a prior notebook cell; as a standalone script this line raises NameError.
# TODO confirm where iv_table comes from.
train_xlist3 = list(iv_table[iv_table['iv']!=0]['xvar'])
with open("train_xlist3.pkl","wb") as f:
    pickle.dump(train_xlist3,f)
# Cast the selected features to float before model training.
for xvar in train_xlist3:
    train[xvar] = train[xvar].astype(float)

# Reload the persisted train/test frames and the selected feature list.
train = pd.read_pickle('train.pkl')
test = pd.read_pickle('test.pkl')
with open("train_xlist3.pkl","rb") as f:
    train_xlist3 = pickle.load(f)
# scale_pos_weight=167 suggests roughly 167 negatives per positive —
# TODO confirm against the actual label distribution.
clf1 = xgb.XGBClassifier(learning_rate=0.1,max_depth=3,min_child_weight=250,subsample=0.7,colsample_bytree=0.6,reg_alpha=3,n_estimators=500,n_jobs=30,scale_pos_weight=167)
# NOTE(review): passing eval_metric / early_stopping_rounds to fit() was
# deprecated in xgboost 1.6 and removed in 2.0 — move them to the
# constructor when upgrading.
clf1.fit(train[train_xlist3],train["label"],
        eval_metric=ks,    # custom KS metric (negated: xgboost minimizes)
        eval_set=[(train[train_xlist3],train["label"]),(test[train_xlist3],test['label'])],
        early_stopping_rounds=100)

# Feature-importance table, most important first.
importance = pd.DataFrame()
importance["xvar"] = train_xlist3
importance["importance"] = clf1.feature_importances_
importance["importance"] = importance["importance"].astype(float)
# BUG FIX: sort_values returns a new frame; the original discarded the
# result, leaving the table unsorted.
importance = importance.sort_values('importance', ascending=False)
# Score the training set and hand the (label, score) pairs to the project's
# evaluation helpers.
out = pd.DataFrame()
out['FLAG']=train['label']
out['Y_PRED']=clf1.predict_proba(train[train_xlist3])[:,1]
print(md.PowerX(out,'Y_PRED','FLAG'))
md.x_split_n(out,'Y_PRED','FLAG')
最后編輯于
?著作權(quán)歸作者所有,轉(zhuǎn)載或內(nèi)容合作請聯(lián)系作者
  • 序言:七十年代末,一起剝皮案震驚了整個濱河市儒陨,隨后出現(xiàn)的幾起案子笋籽,更是在濱河造成了極大的恐慌车海,老刑警劉巖,帶你破解...
    沈念sama閱讀 206,723評論 6 481
  • 序言:濱河連續(xù)發(fā)生了三起死亡事件容劳,死亡現(xiàn)場離奇詭異喘沿,居然都是意外死亡闸度,警方通過查閱死者的電腦和手機(jī)竭贩,發(fā)現(xiàn)死者居然都...
    沈念sama閱讀 88,485評論 2 382
  • 文/潘曉璐 我一進(jìn)店門,熙熙樓的掌柜王于貴愁眉苦臉地迎上來莺禁,“玉大人留量,你說我怎么就攤上這事∮炊” “怎么了楼熄?”我有些...
    開封第一講書人閱讀 152,998評論 0 344
  • 文/不壞的土叔 我叫張陵,是天一觀的道長浩峡。 經(jīng)常有香客問我,道長翰灾,這世上最難降的妖魔是什么缕粹? 我笑而不...
    開封第一講書人閱讀 55,323評論 1 279
  • 正文 為了忘掉前任稚茅,我火速辦了婚禮,結(jié)果婚禮上平斩,老公的妹妹穿的比我還像新娘亚享。我一直安慰自己,他們只是感情好绘面,可當(dāng)我...
    茶點(diǎn)故事閱讀 64,355評論 5 374
  • 文/花漫 我一把揭開白布欺税。 她就那樣靜靜地躺著,像睡著了一般揭璃。 火紅的嫁衣襯著肌膚如雪晚凿。 梳的紋絲不亂的頭發(fā)上,一...
    開封第一講書人閱讀 49,079評論 1 285
  • 那天瘦馍,我揣著相機(jī)與錄音晃虫,去河邊找鬼。 笑死扣墩,一個胖子當(dāng)著我的面吹牛哲银,可吹牛的內(nèi)容都是我干的。 我是一名探鬼主播呻惕,決...
    沈念sama閱讀 38,389評論 3 400
  • 文/蒼蘭香墨 我猛地睜開眼荆责,長吁一口氣:“原來是場噩夢啊……” “哼!你這毒婦竟也來了亚脆?” 一聲冷哼從身側(cè)響起做院,我...
    開封第一講書人閱讀 37,019評論 0 259
  • 序言:老撾萬榮一對情侶失蹤,失蹤者是張志新(化名)和其女友劉穎濒持,沒想到半個月后键耕,有當(dāng)?shù)厝嗽跇淞掷锇l(fā)現(xiàn)了一具尸體,經(jīng)...
    沈念sama閱讀 43,519評論 1 300
  • 正文 獨(dú)居荒郊野嶺守林人離奇死亡柑营,尸身上長有42處帶血的膿包…… 初始之章·張勛 以下內(nèi)容為張勛視角 年9月15日...
    茶點(diǎn)故事閱讀 35,971評論 2 325
  • 正文 我和宋清朗相戀三年屈雄,在試婚紗的時候發(fā)現(xiàn)自己被綠了。 大學(xué)時的朋友給我發(fā)了我未婚夫和他白月光在一起吃飯的照片官套。...
    茶點(diǎn)故事閱讀 38,100評論 1 333
  • 序言:一個原本活蹦亂跳的男人離奇死亡酒奶,死狀恐怖,靈堂內(nèi)的尸體忽然破棺而出奶赔,到底是詐尸還是另有隱情惋嚎,我是刑警寧澤,帶...
    沈念sama閱讀 33,738評論 4 324
  • 正文 年R本政府宣布站刑,位于F島的核電站另伍,受9級特大地震影響,放射性物質(zhì)發(fā)生泄漏绞旅。R本人自食惡果不足惜摆尝,卻給世界環(huán)境...
    茶點(diǎn)故事閱讀 39,293評論 3 307
  • 文/蒙蒙 一愕宋、第九天 我趴在偏房一處隱蔽的房頂上張望。 院中可真熱鬧结榄,春花似錦中贝、人聲如沸。這莊子的主人今日做“春日...
    開封第一講書人閱讀 30,289評論 0 19
  • 文/蒼蘭香墨 我抬頭看了看天上的太陽。三九已至视哑,卻和暖如春绣否,著一層夾襖步出監(jiān)牢的瞬間,已是汗流浹背挡毅。 一陣腳步聲響...
    開封第一講書人閱讀 31,517評論 1 262
  • 我被黑心中介騙來泰國打工蒜撮, 沒想到剛下飛機(jī)就差點(diǎn)兒被人妖公主榨干…… 1. 我叫王不留,地道東北人跪呈。 一個月前我還...
    沈念sama閱讀 45,547評論 2 354
  • 正文 我出身青樓段磨,卻偏偏與公主長得像,于是被迫代替她去往敵國和親耗绿。 傳聞我的和親對象是個殘疾皇子苹支,可洞房花燭夜當(dāng)晚...
    茶點(diǎn)故事閱讀 42,834評論 2 345

推薦閱讀更多精彩內(nèi)容