import gc
import json
import pickle

import numpy as np
import pandas as pd
import scorecardpy as sc
import xgboost as xgb
from sklearn.metrics import roc_curve

import md  # local utility module providing PowerX and x_split_n

# NOTE: get_print_head (a section-banner/logging helper used throughout) is
# assumed to be defined or imported elsewhere in the project.

# Read the training set data
print(get_print_head('Reading training set data'))
train = pd.read_excel('train.xlsx')
print(get_print_head("Training set shape"))
print(train.shape)
print(get_print_head("Finished reading training set data"))
print('\n')
# print(get_print_head("訓(xùn)練集01 head"))
# print(data1_1.head())
# print(get_print_head("訓(xùn)練集02 head"))
# print(data1_2.head())
# print('\n')
print(get_print_head('Reading test set data'))
test = pd.read_excel('test.xlsx')
print(get_print_head("Test set shape"))
print(test.shape)
print(get_print_head("Finished reading test set data"))
print('\n')
train.rename(columns={"姓名":'name','回溯時間':'dt','身份證號':'idcard','手機(jī)號':'mobile'},inplace=True)
test.rename(columns={"姓名":'name','回溯時間':'dt','身份證號':'idcard','手機(jī)號':'mobile'},inplace=True)
train['dt'] = pd.to_datetime(train['dt'],format='%Y%m%d')
test['dt'] = pd.to_datetime(test['dt'],format='%Y%m%d')
train['dt'] = train['dt'].astype(str)
test['dt'] = test['dt'].astype(str)
print(get_print_head("獲取原始變量列表"))
raw_xlist0 = train.columns.tolist()
raw_xlist0 = raw_xlist0[4:]
print("原始變量有{}個".format(str(len(raw_xlist0))))
print(get_print_head("保存原始變量列表"))
raw_xlist0_file = 'raw_xlist0.pkl'
print("原始變量文件名為:{}".format(raw_xlist0_file))
with open(raw_xlist0_file,'wb') as f:
pickle.dump(raw_xlist0,f)
print(get_print_head("完成保存原始變量列表"))
print('\n')
print(get_print_head("訓(xùn)練集匹配率"))
print("訓(xùn)練集匹配率為{}".format(str(len(train[~train[raw_xlist0].isnull().T.all()])/20000)))
# print(get_print_head("去除未匹配上的訓(xùn)練數(shù)據(jù)"))
# train = train[~train[raw_xlist0].isnull().T.all()]
# print(get_print_head("完成去除未匹配上的訓(xùn)練數(shù)據(jù)"))
print('\n')
print(get_print_head("測試集匹配率"))
print("測試集匹配率為{}".format(str(len(test[~test[raw_xlist0].isnull().T.all()])/70000)))
# print(get_print_head("去除未匹配上的測試數(shù)據(jù)"))
# test = test[~test[raw_xlist0].isnull().T.all()]
# print(get_print_head("完成去除未匹配上的測試數(shù)據(jù)"))
print('\n')
print(get_print_head("檢查字符型變量個數(shù)"))
float_xlist1 = raw_xlist0.copy()
for xvar in raw_xlist0:
try:
train[xvar] = train[xvar].astype(float)
except:
float_xlist1.remove(xvar)
print("字符型變量有{}個".format(str(len(raw_xlist0)-len(float_xlist1))))
print(get_print_head("保存非字符變量列表"))
float_xlist1_file = 'float_xlist1.pkl'
print("非字符變量文件名為:{}".format(float_xlist1_file))
with open(float_xlist1_file,'wb') as f:
pickle.dump(float_xlist1,f)
print(get_print_head("完成保存非字符變量列表"))
print(get_print_head("保存字符變量列表"))
string_xlist1_file = 'string_xlist1.pkl'
string_xlist1 = list(set(raw_xlist0) - set(float_xlist1))
print("字符變量文件名為:{}".format(string_xlist1_file))
with open(string_xlist1_file,'wb') as f:
pickle.dump(string_xlist1,f)
print(get_print_head("完成保存字符變量列表"))
print("\n")
print(get_print_head("檢查缺失率高于95%變量個數(shù)"))
desc = train[float_xlist1].describe(percentiles=[.01,.05,.95,.99]).T
desc["%missing"] = (len(train) - desc["count"]) / len(train)
high_missing_remove_xlist2 = desc[desc["%missing"]<=0.95].index.tolist()
temp_xlist = high_missing_remove_xlist2.copy()
for xvar in temp_xlist:
group = train.groupby(xvar).size()
group = pd.DataFrame(group)
group.reset_index(inplace=True)
group.rename(columns={0:"cnt"},inplace=True)
cnt = group["cnt"].max()
if cnt / len(train) > 0.95:
high_missing_remove_xlist2.remove(xvar)
print('缺失率高于95%的變量為{}個'.format(str(len(float_xlist1) - len(high_missing_remove_xlist2))))
print(get_print_head("保存缺失率小于等于95%變量列表"))
high_missing_remove_xlist2_file = 'high_missing_remove_xlist2.pkl'
print("缺失率小于等于95%變量文件名為:{}".format(high_missing_remove_xlist2_file))
with open(high_missing_remove_xlist2_file,'wb') as f:
pickle.dump(high_missing_remove_xlist2,f)
print(get_print_head("完成保存缺失率小于等于95%變量列表"))
print("\n")
def cal_iv(data, feature_list):
    """Compute WOE bins and IV per feature via scorecardpy.
    Features that fail binning are printed and skipped. Returns (iv Series, bins dict)."""
    fail = []
    bins = {}
    for x in feature_list:
        try:
            bins_ = sc.woebin(data[[x, 'label']], y="label")
            bins.update(bins_)
        except Exception:
            print(x)
            fail.append(x)
    iv = pd.Series(dtype=float)
    for k, v in bins.items():
        iv[k] = v['total_iv'].values[0]
    return iv, bins
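# Usage sketch (assumes train already carries a binary 'label' column):
# iv, bins = cal_iv(train, high_missing_remove_xlist2)
# iv.sort_values(ascending=False).head(20)  # strongest features by IV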
def get_label(x):
    """Map (presumably) overdue days to a binary label:
    0 days -> good (0), >= 30 days -> bad (1), in between -> indeterminate (NaN)."""
    if x == 0:
        return 0
    elif x >= 30:
        return 1
    else:
        return np.nan
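# The label is assumed to be derived upstream from an overdue-days field,
# e.g. (hypothetical column name):
# train['label'] = train['overdue_days'].apply(get_label)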
def ks(y_predicted, y_true):
    """Custom eval metric for the (older) xgboost sklearn fit() API: y_true is a DMatrix.
    KS is negated so that early stopping, which minimizes the metric, maximizes KS."""
    fpr, tpr, thresholds = roc_curve(y_true.get_label(), y_predicted)
    return 'ks', -np.max(tpr - fpr)

def ks1(y_predicted, y_true):
    """Same negated KS statistic for plain label arrays."""
    fpr, tpr, thresholds = roc_curve(y_true, y_predicted)
    return 'ks', -np.max(tpr - fpr)
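# Example (sketch): ks1(np.array([0.9, 0.2, 0.7, 0.1]), np.array([1, 0, 1, 0]))
# -> ('ks', -1.0) here, since the scores separate the classes perfectly.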
def psi(data1, data2, var):
    """Population Stability Index of `var` between a base sample (data1) and a test sample (data2)."""
    # Numeric features only
    # data1[var] = data1[var].astype(float)
    # data2[var] = data2[var].astype(float)
    try:
        # Decile bins from the base sample, extended to cover the full real line
        _, bins = pd.qcut(data1[var], q=10, retbins=True, duplicates='drop')
        bins = np.sort(list(set([-np.inf] + bins.tolist() + [np.inf])))
        t1 = pd.cut(data1[var], bins=bins)
        t2 = pd.cut(data2[var], bins=bins)
        t11 = t1.value_counts(dropna=False)
        t22 = t2.value_counts(dropna=False)
        t11.index = t11.index.astype(str)
        t22.index = t22.index.astype(str)
        t = pd.concat([t11, t22], axis=1).sort_index()
        t.columns = ['base_cnt', 'test_cnt']
        t['feature'] = var
        t.loc[t.base_cnt.isnull(), 'base_cnt'] = 0
        t.loc[t.test_cnt.isnull(), 'test_cnt'] = 0
        t['base_rate'] = t['base_cnt'] / t['base_cnt'].sum()
        t['test_rate'] = t['test_cnt'] / t['test_cnt'].sum()
        # Small epsilon keeps the log finite when a bucket is empty
        t['psi'] = (t['test_rate'] - t['base_rate']) * np.log((t['test_rate'] + 0.000001) / (t['base_rate'] + 0.000001))
        t['total_psi'] = t['psi'].sum()
        columns = ['feature', 'base_cnt', 'test_cnt', 'base_rate', 'test_rate', 'psi', 'total_psi']
        t = t.loc[:, columns]
        return t
    except Exception:
        print('No data!', var)
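# Usage sketch: PSI of one feature between train (base) and test.
# psi_table = psi(train, test, train_xlist3[0])
# Common rule of thumb for total_psi: < 0.1 stable, 0.1-0.25 moderate shift, > 0.25 significant shift.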
# group_all = pd.DataFrame()
# for xvar in xlist:
# print(xvar)
# group = raw.groupby([xvar,"label"]).size().reset_index()
# group = pd.DataFrame(group)
# group.rename(columns={0:"cnt"},inplace=True)
# group0 = group[group["label"]==0]
# group0.rename(columns={"cnt":"good"},inplace=True)
# group1 = group[group["label"]==1]
# group1.rename(columns={"cnt":"bad"},inplace=True)
# group0 = group0[[xvar,"good"]]
# group1 = group1[[xvar,"bad"]]
# group = pd.merge(group0,group1,how='outer',on=xvar)
# group["total"] = group["good"]+ group["bad"]
# group["%total"] = group["total"] / group["total"].sum()
# group["%bad"] = group["bad"] / group["total"]
# group.rename(columns={xvar:"value"},inplace=True)
# group["xvar"] = xvar
# group = group[["xvar","value","good","bad","total","%total","%bad"]]
# group_all = pd.concat([group_all,group])
with open(r"br2_features.pkl",'rb') as f:
br2_feature = pickle.load(f)
train_xlist3 = list(iv_table[iv_table['iv']!=0]['xvar'])
with open("train_xlist3.pkl","wb") as f:
pickle.dump(train_xlist3,f)
for xvar in train_xlist3:
    train[xvar] = train[xvar].astype(float)
# Reload the preprocessed datasets and the selected feature list (separate run/cell)
train = pd.read_pickle('train.pkl')
test = pd.read_pickle('test.pkl')
with open("train_xlist3.pkl", "rb") as f:
    train_xlist3 = pickle.load(f)
# scale_pos_weight=167 rebalances the classes (roughly the good/bad ratio of this sample)
clf1 = xgb.XGBClassifier(learning_rate=0.1, max_depth=3, min_child_weight=250,
                         subsample=0.7, colsample_bytree=0.6, reg_alpha=3,
                         n_estimators=500, n_jobs=30, scale_pos_weight=167)
# Note: passing eval_metric/early_stopping_rounds to fit() requires an older
# xgboost (< 2.0); newer versions take them in the constructor instead.
clf1.fit(train[train_xlist3], train["label"],
         eval_metric=ks,  # custom KS metric (negated, so lower is better)
         eval_set=[(train[train_xlist3], train["label"]), (test[train_xlist3], test['label'])],
         early_stopping_rounds=100)
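# Post-fit check (sketch): report train/test KS with the fitted model.
# _, neg_train_ks = ks1(clf1.predict_proba(train[train_xlist3])[:, 1], train['label'])
# _, neg_test_ks = ks1(clf1.predict_proba(test[train_xlist3])[:, 1], test['label'])
# print('train KS: {:.4f}, test KS: {:.4f}'.format(-neg_train_ks, -neg_test_ks))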
importance = pd.DataFrame()
importance["xvar"] = train_xlist3
importance["importance"] = clf1.feature_importances_.astype(float)
importance = importance.sort_values('importance', ascending=False)
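# Inspect the strongest features, e.g.:
# print(importance.head(20))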
# Score the training set and evaluate with the local md utilities
out = pd.DataFrame()
out['FLAG'] = train['label']
out['Y_PRED'] = clf1.predict_proba(train[train_xlist3])[:, 1]
print(md.PowerX(out, 'Y_PRED', 'FLAG'))
md.x_split_n(out, 'Y_PRED', 'FLAG')