1. Functions used in this project
# -*- coding: utf-8 -*-
# %%time
import pandas as pd
import numpy as np
import warnings
import os
from pyhive import presto
import matplotlib.pyplot as plt
import sys
from tqdm import tqdm
import joblib  # sklearn.externals.joblib was removed in newer scikit-learn; import joblib directly
from joblib import Parallel, delayed
import scorecardpy as sc
import toad
import datetime, calendar
import time
from datetime import timedelta, date
from time import strftime, localtime
import pydotplus
from IPython.display import Image
from io import StringIO  # sklearn.externals.six was removed in newer scikit-learn; io.StringIO works here
from sklearn import tree
from pandas import DataFrame
from sklearn.tree import _tree
from functools import reduce
%matplotlib inline
# Pivot-table helper: index defines the rows, columns defines the columns, and each cell shows how the
# values column is aggregated; empty cells are filled with 0. aggfunc accepts count, min, max, sum, mean,
# len or a custom function; if aggfunc is omitted, pandas defaults to mean.
# pd.pivot_table(result_with_overdue,index=['RI_CFV32_lvl'],columns=['bins'],values=['mob4'],fill_value=0,aggfunc=['count'])
# Display options: max rows, max columns, column width
pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns',None)
pd.set_option('max_colwidth', 500)
cursor = presto.connect('IP', port, 'jobschedule').cursor()  # fill in the Presto host and port
def read_sql(hql):
    cursor.execute(hql)
    try:
        result = cursor.fetchall()
        num_columns = len(cursor.description)
        columns_names = [i[0] for i in cursor.description]
    except Exception as e:
        print(e)
    data = pd.DataFrame(list(result), columns=columns_names)
    return data
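A minimal usage sketch (the query and table name are hypothetical placeholders):
# Hypothetical query; replace with a real table accessible through this Presto connection
sample_sql = "select num_id, score, overdue from dw.sample_table limit 1000"
df_sample = read_sql(sample_sql)
df_sample.head()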
####################### PlotKS ##########################
def PlotKS(preds, labels, n=20, asc=True):
    # preds is a score: asc=True (higher score = lower risk)
    # preds is a probability of bad: asc=False
    pred = preds   # predicted value
    bad = labels   # 1 = bad, 0 = good
    ksds = pd.DataFrame({'bad': bad, 'pred': pred})
    ksds['good'] = 1 - ksds.bad
    if asc == 1:
        ksds1 = ksds.sort_values(by=['pred', 'bad'], ascending=[True, True])
    elif asc == 0:
        ksds1 = ksds.sort_values(by=['pred', 'bad'], ascending=[False, True])
    ksds1.index = range(len(ksds1.pred))
    ksds1['cumsum_good1'] = 1.0 * ksds1.good.cumsum() / sum(ksds1.good)
    ksds1['cumsum_bad1'] = 1.0 * ksds1.bad.cumsum() / sum(ksds1.bad)
    if asc == 1:
        ksds2 = ksds.sort_values(by=['pred', 'bad'], ascending=[True, False])
    elif asc == 0:
        ksds2 = ksds.sort_values(by=['pred', 'bad'], ascending=[False, False])
    ksds2.index = range(len(ksds2.pred))
    ksds2['cumsum_good2'] = 1.0 * ksds2.good.cumsum() / sum(ksds2.good)
    ksds2['cumsum_bad2'] = 1.0 * ksds2.bad.cumsum() / sum(ksds2.bad)
    # average ksds1 and ksds2 to break ties between records with equal scores
    ksds = ksds1[['cumsum_good1', 'cumsum_bad1']].copy()
    ksds['cumsum_good2'] = ksds2['cumsum_good2']
    ksds['cumsum_bad2'] = ksds2['cumsum_bad2']
    ksds['cumsum_good'] = (ksds['cumsum_good1'] + ksds['cumsum_good2']) / 2
    ksds['cumsum_bad'] = (ksds['cumsum_bad1'] + ksds['cumsum_bad2']) / 2
    # ks
    ksds['ks'] = ksds['cumsum_bad'] - ksds['cumsum_good']
    ksds['tile0'] = range(1, len(ksds.ks) + 1)
    ksds['tile'] = 1.0 * ksds['tile0'] / len(ksds['tile0'])
    qe = list(np.arange(0, 1, 1.0 / n))
    qe.append(1)
    qe = qe[1:]
    ks_index = pd.Series(ksds.index)
    ks_index = ks_index.quantile(q=qe)
    ks_index = np.ceil(ks_index).astype(int)
    ks_index = list(ks_index)
    ksds = ksds.loc[ks_index]
    ksds = ksds[['tile', 'cumsum_good', 'cumsum_bad', 'ks']]
    ksds0 = np.array([[0, 0, 0, 0]])
    ksds = np.concatenate([ksds0, ksds], axis=0)
    ksds = pd.DataFrame(ksds, columns=['tile', 'cumsum_good', 'cumsum_bad', 'ks'])
    ks_value = abs(ksds.ks).max()   # 2020-05-02: take the absolute value
    ks_pop = ksds.tile[ksds.ks.idxmax()]
    print('ks_value is ' + str(np.round(ks_value, 4)) + ' at pop = ' + str(np.round(ks_pop, 4)))
    # chart
    plt.plot(ksds.tile, ksds.cumsum_good, label='cum_good',
             color='blue', linestyle='-', linewidth=2)
    plt.plot(ksds.tile, ksds.cumsum_bad, label='cum_bad',
             color='red', linestyle='-', linewidth=2)
    plt.plot(ksds.tile, ksds.ks, label='ks',
             color='green', linestyle='-', linewidth=2)
    plt.axvline(ks_pop, color='gray', linestyle='--')
    plt.axhline(ks_value, color='green', linestyle='--')
    plt.axhline(ksds.loc[ksds.ks.idxmax(), 'cumsum_good'], color='blue', linestyle='--')
    plt.axhline(ksds.loc[ksds.ks.idxmax(), 'cumsum_bad'], color='red', linestyle='--')
    plt.title('KS=%s ' % np.round(ks_value, 4) +
              'at Pop=%s' % np.round(ks_pop, 4), fontsize=15)
    plt.legend()
    return ksds
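A quick sanity check of PlotKS on synthetic data (the scores and labels below are made up for illustration):
rng = np.random.RandomState(0)
demo_labels = pd.Series(rng.binomial(1, 0.2, 5000))                    # 1 = bad, 0 = good
demo_scores = pd.Series(rng.normal(600, 50, 5000)) - 40 * demo_labels  # bads get lower scores on average
_ = PlotKS(demo_scores, demo_labels, n=20, asc=True)
plt.show()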
# Overdue flag: 1 if days past due >= 30, else 0
def fun(x):
    if x >= 30:
        return 1
    else:
        return 0

# Grey-user flag: 1 if days past due is between 0 and 30 (exclusive), else 0
def Grey(x):
    if (x > 0) & (x < 30):
        return 1
    else:
        return 0

# Return the year-month part of a date, e.g. '2020-01'
def getYearMonth(x):
    return str(x)[0:7]

# Share of positive samples within a group (used with groupby/agg)
def ration(x):
    return x.sum() / x.count()
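Assuming a hypothetical DataFrame with an overdue-days column dpd and an order-date column fina_date, the helpers above are typically applied like this:
# Column names 'dpd' and 'fina_date' are placeholders for illustration
data_new3['overdue'] = data_new3['dpd'].apply(fun)      # 30+ days past due flag
data_new3['grey'] = data_new3['dpd'].apply(Grey)        # 0 < dpd < 30 grey flag
data_new3['order_month'] = data_new3['fina_date'].apply(getYearMonth)
data_new3.groupby('order_month')['overdue'].agg(['count', 'sum', ration])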
class rule_tree(object):
    def __init__(self, datasets, ex_lis, dep='bad_ind', min_samples=0.05, min_samples_leaf=200, min_samples_split=50,
                 max_depth=3):
        '''
        Rule variables currently support numeric features only.
        datasets: modeling data, DataFrame
        ex_lis: features excluded from modeling, e.g. ids and time slices, list
        dep: label column name
        min_samples: minimum share of total samples per bin, numeric
        max_depth: maximum depth of the decision tree, numeric
        min_samples_leaf: minimum number of samples in a leaf node, numeric
        min_samples_split: minimum number of samples in a parent node before splitting, numeric
        '''
        self.datasets = datasets
        self.ex_lis = ex_lis
        self.dep = dep
        self.max_depth = max_depth
        self.min_samples = min_samples
        self.min_samples_leaf = min_samples_leaf
        self.min_samples_split = min_samples_split

    def tree_to_code(self, tree, feature_names):
        # print the fitted tree as pseudo Python code
        tree_ = tree.tree_
        feature_name = [feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!" for i in tree_.feature]
        print("def tree({}):".format(", ".join(feature_names)))
        p_value = [i[0][0] for i in tree_.value]  # leaf values (bad rates); not used further

        def recurse(node, depth):
            indent = " " * depth
            if tree_.feature[node] != _tree.TREE_UNDEFINED:
                name = feature_name[node]
                threshold = tree_.threshold[node]
                print("{}if {} <= {}:".format(indent, name, threshold))
                recurse(tree_.children_left[node], depth + 1)
                print("{}if {} > {}:".format(indent, name, threshold))
                recurse(tree_.children_right[node], depth + 1)
            else:
                print("{}return {}".format(indent, tree_.value[node]))
        recurse(0, 1)

    def get_lineage(self, tree, feature_names):
        # collect, for every leaf, the chain of (parent, split direction, threshold, feature) decisions
        left = tree.tree_.children_left
        right = tree.tree_.children_right
        threshold = tree.tree_.threshold
        features = [feature_names[i] for i in tree.tree_.feature]
        # ids of the leaf nodes
        idx = np.argwhere(left == -1)[:, 0]

        def recurse(left, right, child, lineage=None):
            if lineage is None:
                lineage = [child]
            if child in left:
                parent = np.where(left == child)[0].item()
                split = 'l'
            else:
                parent = np.where(right == child)[0].item()
                split = 'r'
            lineage.append((parent, split, threshold[parent], features[parent]))
            if parent == 0:
                lineage.reverse()
                return lineage
            else:
                return recurse(left, right, parent, lineage)

        node_list = []
        for child in idx:
            for node in recurse(left, right, child):
                node_list.append(node)
        return node_list

    def get_interval_node(self, node_list):
        # group the flat lineage list into one entry per leaf
        node_dict = {}
        node_id = 1
        for id, i in enumerate(node_list[:-1]):
            if id == 0:
                node_dict[node_id] = [i[1:]]
            if id > 0:
                if type(node_list[id - 1]) == tuple:
                    if type(i) == tuple:
                        node_dict[node_id].append(i[1:])
                    else:
                        node_id += 1
                        node_dict[node_id] = []
                else:
                    node_dict[node_id].append(i[1:])
        return node_dict

    def rules(self, tree, feature_names):
        # turn each leaf path into a human-readable rule and a boolean mask over the data
        node_list = self.get_lineage(tree, feature_names)
        node_dict = self.get_interval_node(node_list)
        rules_dict = {}
        con_cal = []
        for k, v in node_dict.items():
            con = []
            con_cal_ = []
            for v_ in v:
                con.append("({} < {})".format(v_[2], v_[1]) if v_[0] == 'l' else "({} >= {})".format(v_[2], v_[1]))
                con_cal_.append((self.datasets[v_[2]] < v_[1]) if v_[0] == 'l' else (self.datasets[v_[2]] >= v_[1]))
            con_cal.append(reduce(lambda x, y: x * y, con_cal_))
            rules_dict[k] = ['&'.join(con), round(self.datasets[con_cal[-1]][self.dep].mean(), 4)]
        rules = pd.DataFrame(rules_dict).T
        rules.columns = ['規(guī)則', 'p']
        return con_cal, rules

    def rule_indicator(self, d, cons):
        # evaluate each rule: samples outside the rule mask are rejected, samples inside are passed
        indicator = {}
        for id, con in enumerate(cons):
            reject = np.sum(~con)                       # rejected samples
            fugai = np.sum(con)                         # passed samples
            total = np.sum(con) + np.sum(~con)          # all samples
            wuju = np.sum((~con) * (d[self.dep] == 0))  # good samples rejected by mistake
            wufang = d[con][self.dep].sum()             # bad samples passed by mistake
            lanjie_hei = total - fugai - wuju           # bad samples rejected
            shengyu_bai = fugai - wufang                # good samples passed
            fugai_lv = round(fugai / total, 4)          # pass rate
            default_rate = round(wufang / np.sum(con), 4)
            precise = round((reject - wuju) / reject, 4)
            recall = round((reject - wuju) / d[self.dep].sum(), 4)
            distrub = round(np.sum((~con) * (d[self.dep] == 0)) / np.sum(d[self.dep] == 0), 4)
            ks = round(recall - distrub, 4)
            # lift = bad rate among passed samples / overall bad rate
            lift = round(default_rate / (d[self.dep].mean()), 4)
            indicator[id + 1] = {'總攔截量': reject, '攔截壞樣本': lanjie_hei, '攔截白樣本': wuju, '通過量': fugai, '誤放量': wufang, '誤放率(逾期率)': default_rate, '總樣本': total, '通過率': fugai_lv,
                                 '準(zhǔn)確率': precise, '召回率': recall, '打擾率': distrub, 'ks': ks, 'lift倍數(shù)': lift}
        indicator = pd.DataFrame(indicator).T
        indicator[['總攔截量', '攔截壞樣本', '攔截白樣本', '通過量', '誤放量', '總樣本']] = indicator[['總攔截量', '攔截壞樣本', '攔截白樣本', '通過量', '誤放量', '總樣本']].astype(int)
        return indicator

    def fit_plot(self):
        # DecisionTreeRegressor is used so that each leaf value equals the bad rate of that leaf;
        # DecisionTreeClassifier would also work if class outputs are preferred
        dtree = tree.DecisionTreeRegressor(max_depth=self.max_depth,
                                           min_samples_leaf=self.min_samples_leaf,
                                           min_samples_split=self.min_samples_split)
        x = self.datasets.drop(self.ex_lis, axis=1)
        y = self.datasets[self.dep]
        dtree = dtree.fit(x, y)
        with open("dt.dot", "w") as f:
            tree.export_graphviz(dtree, out_file=f)
        dot_data = StringIO()
        tree.export_graphviz(dtree, out_file=dot_data,
                             feature_names=x.columns,
                             class_names=[self.dep],
                             filled=True, rounded=True,
                             special_characters=True)
        graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
        # self.tree_to_code(dtree, x.columns)
        print('\n\n')
        con_cal, rules = self.rules(dtree, x.columns)
        indicator = self.rule_indicator(self.datasets, con_cal)
        cols = ['規(guī)則', '總攔截量', '攔截壞樣本', '攔截白樣本', '通過量', '誤放量', '總樣本', '通過率', '誤放率(逾期率)', '準(zhǔn)確率', '召回率', '打擾率', 'ks', 'lift倍數(shù)']
        rule = rules.merge(indicator, left_index=True, right_index=True)[cols]
        rule.sort_values('誤放率(逾期率)', ascending=True, inplace=True)
        return rule, graph.create_png()
def evaluate_func1(df, cols=None, groups=None, label='y', descs=None):
    '''
    Note: non-null and null samples are evaluated separately.
    df: input data, DataFrame
    cols: columns to evaluate, list
    groups: columns to group by, list; group columns should preferably contain no NaN (fill them beforehand)
    label: name of the label column (1 = bad, 0 = good)
    descs: columns whose bins should be accumulated in descending order; by default all columns accumulate
           in ascending order, i.e. low-score bins are accumulated upward into higher-score bins
    return: DataFrame with the columns below
        'group_set': group
        'var_name': variable name
        'flag': interception interval (bin)
        'total': number of samples
        'var_empty': number of null values
        'var_get': variable coverage (non-null count)
        'stp': intercepted samples in the bin
        'pos': bad samples
        'neg': good samples
        'cunsum_pos': cumulative bad samples
        'cunsum_neg': cumulative good samples
        'cunsum_stp': cumulative intercepted samples
        'total_pos': total bad samples
        'total_neg': total good samples
        'total_stp': total samples
        'intercept': interception rate
        'precision': precision
        'recall': recall
        'disturb': disturbance rate
        'cum_precision': cumulative precision
        'avg_precision': average precision
        'cum_recall': cumulative recall
        'cum_disturb': cumulative disturbance rate
        'bks': KS per bin
        'ks': KS of the variable
    '''
    df = df.apply(pd.to_numeric, errors='ignore')
    df[df.select_dtypes('object').columns.tolist()] = df.select_dtypes('object').apply(pd.to_datetime, errors='ignore')

    # compute the metrics for a single group
    def define_mertrics(df, label, descs=None):
        def __num_q(ser, dropna):
            # choose the number of quantile bins from the number of distinct values
            num = ser.nunique(dropna)
            if num > 25:
                q = 10
            elif num > 15:
                q = 8
            elif num >= 10:
                q = 6
            else:
                q = num
            return q
        dts = df.select_dtypes('datetime64[ns]').shape[1]
        objs = df.select_dtypes('object').shape[1]
        nums = len(df.columns.tolist()) - dts - objs
        print('data has {} datetime columns,{} object columns,{} numeric columns'.format(dts, objs, nums))
        res = []
        if not descs:
            descs = []
        for col in tqdm(df.columns.difference([label])):
            na_stp = df[df[col].isna() == True].shape[0]
            na_pos = df[df[col].isna() == True][label].sum()
            na_neg = na_stp - na_pos
            var_get = df.shape[0] - na_stp
            total = df.shape[0]
            ser = df[df[col].isna() == False][col]
            tmp = pd.DataFrame()
            # numeric with at least 10 distinct values: quantile binning
            if ser.dtype != object and ser.dtype != 'datetime64[ns]' and ser.nunique() >= 10:
                q = __num_q(ser, dropna=True)
                _, bins = pd.qcut(ser, q=q, retbins=True, precision=2, duplicates='drop', labels=False)
                bins[0] = bins[0] - 0.01
                cuts = pd.cut(ser, bins=bins)
                tmp['flag'] = cuts          # bin
                tmp['var_name'] = col       # variable
                tmp['label'] = df[label]    # label
            # numeric with fewer than 10 distinct values: use the raw values as bins
            elif ser.dtype != object and ser.dtype != 'datetime64[ns]' and ser.nunique() < 10:
                tmp['flag'] = ser
                tmp['var_name'] = col
                tmp['label'] = df[label]
            # non-numeric (categorical)
            else:
                tmp['flag'] = ser
                tmp['var_name'] = col
                tmp = tmp.astype(str)
                tmp['label'] = df[label]
            # metrics on non-null samples
            result_stp = tmp.groupby(['var_name', 'flag']).count().rename(columns={'label': 'stp'})
            result_pos = tmp.groupby(['var_name', 'flag'])['label'].sum().to_frame().rename(columns={'label': 'pos'}).astype(int)    # bad samples
            result_neg = tmp[tmp['label'] == 0].groupby(['var_name', 'flag'])['label'].count().to_frame().rename(columns={'label': 'neg'}).astype(int)    # good samples
            merge_result = result_stp.merge(result_pos, how='left', on=['var_name', 'flag']).merge(result_neg, how='left', on=['var_name', 'flag'])
            merge_result.reset_index(inplace=True)
            merge_result['rank'] = pd.Series([i + 1 for i in range(merge_result.shape[0])])
            if ser.dtype != object and ser.dtype != 'datetime64[ns]' and ser.nunique() >= 10:
                merge_result['flag'] = merge_result['flag'].cat.add_categories([0])
            merge_result.fillna(0, inplace=True)
            if col in descs:
                merge_result.sort_values(by=['rank'], axis=0, ascending=False, inplace=True, na_position='last')
            merge_result['cunsum_pos'] = merge_result.groupby('var_name')['pos'].cumsum().astype(int)   # cumulative bad samples
            merge_result['cunsum_neg'] = merge_result.groupby('var_name')['neg'].cumsum().astype(int)   # cumulative good samples
            merge_result['cunsum_stp'] = merge_result.groupby('var_name')['stp'].cumsum().astype(int)   # cumulative intercepted samples
            merge_result['total_pos'] = merge_result['cunsum_pos'].max()
            merge_result['total_neg'] = merge_result['cunsum_neg'].max()
            merge_result['total_stp'] = merge_result['cunsum_stp'].max()
            # metrics on null samples
            tmpna = pd.DataFrame()
            columns = ['var_name', 'flag', 'stp', 'pos', 'neg', 'rank', 'cunsum_pos', 'cunsum_neg', 'cunsum_stp', 'total_pos', 'total_neg', 'total_stp']
            tmpna[columns] = pd.DataFrame(np.array([[col, 'empty', na_stp, na_pos, na_neg, 0, na_pos, na_neg, na_stp, na_pos, na_neg, na_stp]]), columns=columns)
            # concatenate the non-null and null parts
            merge_result = pd.concat([merge_result, tmpna], axis=0)
            merge_result['var_get'] = var_get
            merge_result['var_empty'] = na_stp
            merge_result['total'] = total
            res.append(merge_result)
        res = pd.concat(res, axis=0)
        res = res.apply(pd.to_numeric, errors='ignore')
        # derived metrics (the 0.001 term avoids division by zero)
        res['get_rate'] = res['var_get'] / (res['total'] + 0.001)               # variable coverage
        res['intercept'] = res['stp'] / (res['total_stp'] + 0.001)              # interception rate
        res['precision'] = res['pos'] / (res['stp'] + 0.001)                    # precision
        res['recall'] = res['pos'] / (res['total_pos'] + 0.001)                 # recall
        res['disturb'] = res['neg'] / (res['total_neg'] + 0.001)                # disturbance rate
        res['cum_precision'] = res['cunsum_pos'] / (res['cunsum_stp'] + 0.001)  # cumulative precision
        res['avg_precision'] = res['total_pos'] / (res['total_stp'] + 0.001)    # average precision
        res['cum_recall'] = res['cunsum_pos'] / (res['total_pos'] + 0.001)      # cumulative recall
        res['cum_disturb'] = res['cunsum_neg'] / (res['total_neg'] + 0.001)     # cumulative disturbance rate
        res['bks'] = abs(res['cum_recall'] - res['cum_disturb'])                # KS per bin
        res = res.merge(res[res['flag'] != 'empty'].groupby('var_name')['bks'].max().to_frame().reset_index().rename(columns={'bks': 'ks'}), how='left', on=['var_name'])
        rs = res.drop_duplicates(subset=None, keep='first', inplace=False)      # deduplicate
        # columns to return
        retcols = ['var_name', 'flag', 'var_empty', 'var_get', 'total', 'get_rate', 'stp', 'pos', 'neg', 'cunsum_stp', 'cunsum_pos', 'cunsum_neg', 'total_stp', 'total_pos', 'total_neg', 'intercept', 'precision', 'recall', 'disturb', 'cum_precision', 'avg_precision', 'cum_recall', 'cum_disturb', 'bks', 'ks']
        rs = rs[retcols]
        # rename metric columns for reporting
        rs.rename(columns={'var_name': '變量'
                           , 'flag': '攔截區(qū)間'
                           , 'stp': '攔截樣本數(shù)'
                           , 'pos': '黑樣本數(shù)'
                           , 'neg': '白樣本數(shù)'
                           , 'cunsum_pos': '累計(jì)黑樣本數(shù)'
                           , 'cunsum_neg': '累計(jì)白樣本數(shù)'
                           , 'cunsum_stp': '累計(jì)攔截?cái)?shù)'
                           , 'intercept': '攔截率'
                           , 'precision': '準(zhǔn)確率'
                           , 'recall': '召回率'
                           , 'disturb': '打擾率'
                           , 'cum_precision': '累計(jì)準(zhǔn)確率'
                           , 'avg_precision': '平均準(zhǔn)確率'
                           , 'cum_recall': '累計(jì)召回率'
                           , 'cum_disturb': '累計(jì)打擾率'
                           , 'bks': 'ks區(qū)間值'
                           , 'ks': 'ks值'
                           }, inplace=True)
        return rs

    # compute the metrics for every group
    group_name = 'group_set'
    tmp = []
    if not cols:
        cols = df.columns.tolist()
    if not groups:
        groups = ['nan_group_set']
        df[groups[0]] = '&&&'
    df[groups] = df[groups].astype(str).fillna('ng')
    if len(groups) > 1:
        df['group_set'] = df[groups[0]].str.cat([df[g] for g in groups[1:]], sep='_')
    else:
        df['group_set'] = df[groups[0]]
    group_set = df['group_set'].unique()
    for k in group_set:
        df_new = df[df['group_set'] == k][cols + [label]]
        res = define_mertrics(df_new, label=label, descs=descs)
        columns = res.columns.tolist()
        res[group_name] = k
        res = res[[group_name] + columns]
        tmp.append(res)
    outs = pd.concat(tmp, axis=0)
    return outs
print('Loading finished, time:', strftime("%Y-%m-%d %H:%M:%S", localtime()))
2荒适、讀取數(shù)據(jù)
data_changjing=pd.read_csv('*/全域mob3.csv')
#Reading a specific sheet when one file contains multiple sheets
data_Tencent_lfq=pd.read_excel('*.xlsx',sheet_name='測試')
#Join two DataFrames, similar to Excel VLOOKUP; on can take a single key or a list of keys
data_new3=pd.merge(data_changjing,data_y,on= ['num_id'])
#Concatenate two DataFrames by rows (DataFrame.append is deprecated in newer pandas, so use pd.concat)
data_new3=pd.concat([data_new3,data_new2])
#Basic data description
data_new3.describe() #distribution summary of numeric columns
data_new3.info() #dtypes, fields, null counts
data_new3.shape #dimensions (rows, columns)
data_new3.dtypes #column data types
data_new3[''].value_counts()#value distribution of a column
data_new3.rename(columns={'score':'Bscore'},inplace=True) #rename a column
data_new3=data_new3[['A','B','C','D','E']]#keep only the columns you need, e.g. if the data has columns A-G
#groupby with aggregation functions (aggfunc)
#The groupby key can be a single column or a list of columns, e.g. groupby(['bins2']) / groupby(['A','B','C'])
def ration(x):
return x.sum()/x.count()
res2.groupby(['bins2'])['mob4'].agg(['count','sum',ration])
#agg accepts count, min, max, mean, sum, or a user-defined function
#Column selection and preview
data1=data_new3[['num_id','fina_date','zhiye']]
data1.head()
Equal-frequency and equal-width binning, storing the bin assignment in a new column
#Equal-frequency binning
res2['bins']=pd.qcut(res2['score'],10,duplicates='drop',retbins=True)[0]
#Equal-width binning
res2['bins2']= pd.cut(res2['score'],10,duplicates='drop',retbins=True)[0]
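retbins=True also returns the bin edges, which is useful when the same cut points have to be applied to another sample; a small sketch, assuming a hypothetical out-of-time DataFrame res2_oot with the same score column:
# res2_oot is a hypothetical second sample; reuse the edges learned on res2
_, score_bins = pd.qcut(res2['score'], 10, duplicates='drop', retbins=True)
res2_oot['bins'] = pd.cut(res2_oot['score'], bins=score_bins, include_lowest=True)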
Pivot tables
# pd.pivot_table(result_with_overdue,index=['RI_CFV32_lvl'],columns=['bins'],aggfunc=['count'],values=['mob4'],fill_value=0,margins=True, dropna=True)
pd.pivot_table(result_with_overdue,index=['RI_CFV32_lvl'],columns=['bins'],values=['mob4'],fill_value=0,aggfunc=['count']) #pivot table: aggfunc takes count/min/max/sum/mean or a custom function; if omitted, pandas defaults to mean
pd.pivot_table(result_with_overdue,index=['RI_CFV32_lvl'],columns=['mob4'],values=['num_id'],fill_value=0,aggfunc=['count'])
Getting the column names of a DataFrame
data_new3.columns
Result:
Index(['num_id', 'zhiye', 'weiyue', 'gongzhai', 'qingchang', 'zhuxing', 'lvyue', 'shouxin', 'xiaofei', 'xingqu', 'chengzhang', 'overdue'], dtype='object')
3盛霎、評分卡Python包的操作使用-scorecardpy
import scorecardpy as sc
#卡方分箱獲取分箱結(jié)果
bins_new=sc.woebin(data_new3.loc[data_new3.overdue!=-1,['gongzhai','overdue']], y="overdue")
bins_new #分箱結(jié)果
#畫出分箱結(jié)果分布
woebin_plot=sc.woebin_plot(bins_new)
woebin_plot
#分箱結(jié)果顯示為dataframe形式
bins_new['gongzhai']
#對bins_new進(jìn)行統(tǒng)計(jì)操作
bins_new['gongzhai']['count'].sum()
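Once the bins look reasonable they can be applied back to the data to obtain WOE-encoded features; a sketch using scorecardpy's woebin_ply (the selected columns mirror the example above):
# Replace each raw value with the WOE of its bin; 'gongzhai' becomes 'gongzhai_woe'
woe_data = sc.woebin_ply(data_new3.loc[data_new3.overdue != -1, ['gongzhai', 'overdue']], bins_new)
woe_data.head()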
Calling the plotting helpers
plt_data=Score_set_overdue[(Score_set_overdue['order_month_x']=='2019-11')&(Score_set_overdue['class_y']=='lfq')]
ks=PlotKS(plt_data['pred_score'],plt_data['overdue'],n=20,asc=True)
ks
plt.show()
[Figure: KS curve (ks走勢圖.png)]
bins_new=sc.woebin(plt_data.loc[plt_data.overdue!=-1,['pred_score','overdue']], y="overdue")
woebin_plot=sc.woebin_plot(bins_new)
plt.figure(figsize=(5,5))
woebin_plot
[Figure: overdue rate by bin (qujianyuyuqi.png)]
Multi-feature crossing and selection
# Load data
cross_data=Score_set_overdue[(Score_set_overdue['order_month_x']=='2020-01')&(Score_set_overdue['class_y']=='dae')]
# Variables excluded from modeling: every column except the two score columns (the label column is also excluded from the features inside fit_plot)
ex_list=cross_data.drop(['Bscore','Tencent_score'],axis=1).columns.tolist()
# Fit the rule tree
rules,graph = rule_tree(datasets=cross_data, ex_lis=ex_list,max_depth=2,dep='overdue', min_samples=0.01,
min_samples_leaf=200, min_samples_split=200).fit_plot()
Image(graph)
rules #inspect the selected rules
[Figure: 12.png]
Processing scorecard evaluation results
outs_Bscore=evaluate_func1(Bscore_Score_set_overdue[Bscore_Score_set_overdue['class_x']!='None'],cols=['Bscore'],groups=['order_month','class_x'],label='overdue')
outs_Bscore[outs_Bscore['攔截區(qū)間']!='empty']