Python Notes: pandas and Credit-Risk Scorecards

1. Functions used in this project

# -*- coding: utf-8 -*-
import os
import sys
import time
import datetime, calendar
from datetime import timedelta, date
from time import strftime, localtime
from functools import reduce

import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

from pyhive import presto
import joblib  # sklearn.externals.joblib was removed in scikit-learn >= 0.23
from joblib import Parallel, delayed
import scorecardpy as sc
import toad

import pydotplus
from IPython.display import Image
from io import StringIO  # replaces the removed sklearn.externals.six.StringIO
from sklearn import tree
from sklearn.tree import _tree
from pandas import DataFrame


%matplotlib inline

# Pivot tables: aggfunc takes one or more aggregation functions (count, min, max, sum, mean, len,
# or a user-defined function); index defines the rows, columns the columns, and empty cells are filled with 0.
# pd.pivot_table(result_with_overdue,index=['RI_CFV32_lvl'],columns=['bins'],values=['mob4'],fill_value=0,aggfunc=['count'])


# Display options: show all rows and columns, and widen the column display
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 500)
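
If you prefer not to change the display options globally, pandas also offers option_context, which scopes them to a single block; a minimal sketch (the preview frame here is made up for illustration):

df_preview = pd.DataFrame(np.random.rand(100, 5))  # made-up frame, for illustration only
with pd.option_context('display.max_rows', 20, 'display.max_columns', 50):
    print(df_preview)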

# 'IP' and port below are placeholders for the real Presto host and port
cursor = presto.connect('IP', port, 'jobschedule').cursor()

def read_sql(hql):
    """Run a query through the Presto cursor and return the result as a DataFrame."""
    cursor.execute(hql)
    try:
        result = cursor.fetchall()
        num_columns = len(cursor.description)
        columns_names = [i[0] for i in cursor.description]
    except Exception as e:
        print('query failed:', e)
        raise
    data = pd.DataFrame(list(result), columns=columns_names)
    return data
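
A minimal usage sketch; the schema, table and column names below are placeholders, not real tables:

# hypothetical query, for illustration only
df_orders = read_sql("SELECT num_id, overdue FROM some_schema.some_table LIMIT 1000")
print(df_orders.shape)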

####################### PlotKS ##########################
def PlotKS(preds, labels, n=20, asc=True):
    
    # asc=True  when preds is a score (higher score = lower risk)
    # asc=False when preds is a probability of bad (higher = riskier)
    
    pred = preds  # predictions
    bad = labels  # 1 = bad, 0 = good
    ksds = pd.DataFrame({'bad': bad, 'pred': pred})
    ksds['good'] = 1 - ksds.bad
    
    if asc == 1:
        ksds1 = ksds.sort_values(by=['pred', 'bad'], ascending=[True, True])
    elif asc == 0:
        ksds1 = ksds.sort_values(by=['pred', 'bad'], ascending=[False, True])
    ksds1.index = range(len(ksds1.pred))
    ksds1['cumsum_good1'] = 1.0*ksds1.good.cumsum()/sum(ksds1.good)
    ksds1['cumsum_bad1'] = 1.0*ksds1.bad.cumsum()/sum(ksds1.bad)
    
    if asc == 1:
        ksds2 = ksds.sort_values(by=['pred', 'bad'], ascending=[True, False])
    elif asc == 0:
        ksds2 = ksds.sort_values(by=['pred', 'bad'], ascending=[False, False])
    ksds2.index = range(len(ksds2.pred))
    ksds2['cumsum_good2'] = 1.0*ksds2.good.cumsum()/sum(ksds2.good)
    ksds2['cumsum_bad2'] = 1.0*ksds2.bad.cumsum()/sum(ksds2.bad)
    
    # ksds1 ksds2 -> average
    ksds = ksds1[['cumsum_good1', 'cumsum_bad1']]
    ksds['cumsum_good2'] = ksds2['cumsum_good2']
    ksds['cumsum_bad2'] = ksds2['cumsum_bad2']
    ksds['cumsum_good'] = (ksds['cumsum_good1'] + ksds['cumsum_good2'])/2
    ksds['cumsum_bad'] = (ksds['cumsum_bad1'] + ksds['cumsum_bad2'])/2
    
    # ks
    ksds['ks'] = ksds['cumsum_bad'] - ksds['cumsum_good']
    ksds['tile0'] = range(1, len(ksds.ks) + 1)
    ksds['tile'] = 1.0*ksds['tile0']/len(ksds['tile0'])
    
    qe = list(np.arange(0, 1, 1.0/n))
    qe.append(1)
    qe = qe[1:]
    
    ks_index = pd.Series(ksds.index)
    ks_index = ks_index.quantile(q = qe)
    ks_index = np.ceil(ks_index).astype(int)
    ks_index = list(ks_index)
    
    ksds = ksds.loc[ks_index]
    ksds = ksds[['tile', 'cumsum_good', 'cumsum_bad', 'ks']]
    ksds0 = np.array([[0, 0, 0, 0]])
    ksds = np.concatenate([ksds0, ksds], axis=0)
    ksds = pd.DataFrame(ksds, columns=['tile', 'cumsum_good', 'cumsum_bad', 'ks'])
    
    ks_value = abs(ksds.ks).max()  # 2020-05-02: take the absolute value so the sign does not matter
    ks_pop = ksds.tile[ksds.ks.idxmax()]
    print ('ks_value is ' + str(np.round(ks_value, 4)) + ' at pop = ' + str(np.round(ks_pop, 4)))
    
    # chart
    plt.plot(ksds.tile, ksds.cumsum_good, label='cum_good',
                         color='blue', linestyle='-', linewidth=2)
                         
    plt.plot(ksds.tile, ksds.cumsum_bad, label='cum_bad',
                        color='red', linestyle='-', linewidth=2)
                        
    plt.plot(ksds.tile, ksds.ks, label='ks',
                   color='green', linestyle='-', linewidth=2)
                       
    plt.axvline(ks_pop, color='gray',  linestyle='--')
    plt.axhline(ks_value, color='green',  linestyle='--')
    plt.axhline(ksds.loc[ksds.ks.idxmax(), 'cumsum_good'], color='blue', linestyle='--')
    plt.axhline(ksds.loc[ksds.ks.idxmax(),'cumsum_bad'], color='red', linestyle='--')
    plt.title('KS=%s ' %np.round(ks_value, 4) +  
                'at Pop=%s' %np.round(ks_pop, 4), fontsize=15) 
    plt.legend()
    return ksds
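
A quick self-contained check of PlotKS on synthetic data; the score distributions are made up, with bads drawn from a lower-scoring distribution than goods:

# synthetic sanity check for PlotKS
rng = np.random.RandomState(0)
y = np.concatenate([np.ones(2000), np.zeros(8000)])           # 20% bad rate
score = np.concatenate([rng.normal(550, 60, 2000),            # bad accounts: lower scores
                        rng.normal(620, 60, 8000)])           # good accounts: higher scores
ks_table = PlotKS(pd.Series(score), pd.Series(y), n=20, asc=True)
plt.show()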

# Overdue (bad) flag: 1 if days past due >= 30
def fun(x):
    if x >= 30:
        return 1
    else:
        return 0

# "Grey" user flag: past due, but by fewer than 30 days
def Grey(x):
    if (x > 0) and (x < 30):
        return 1
    else:
        return 0

# Year-month part of a date string, e.g. '2020-01'
def getYearMonth(x):  
    return str(x)[0:7]  

# Share of positive samples within a group (for use with groupby/agg)
def ration(x):
    return x.sum()/x.count()
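
Typical usage of these helpers with apply; the column names below ('overdue_days', 'fina_date') are assumptions for illustration:

demo = pd.DataFrame({'overdue_days': [0, 5, 45],
                     'fina_date': ['2020-01-03', '2020-02-15', '2020-03-20']})
demo['bad_flag'] = demo['overdue_days'].apply(fun)    # 1 if >= 30 days past due
demo['grey_flag'] = demo['overdue_days'].apply(Grey)  # 1 if 0 < days past due < 30
demo['ym'] = demo['fina_date'].apply(getYearMonth)    # '2020-01', '2020-02', ...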


class rule_tree(object):
    def __init__(self, datasets, ex_lis, dep='bad_ind', min_samples=0.05, min_samples_leaf=200, min_samples_split=50,
                 max_depth=3 ):
        '''
        Rule variables currently must be numeric.
        datasets: modelling data, DataFrame
        ex_lis: features excluded from modelling (ids, time slices, the label, etc.), list
        dep: label column name
        min_samples: minimum bin share of the total sample when binning, numeric
        max_depth: maximum depth of the decision tree, numeric
        min_samples_leaf: minimum number of samples in a leaf node, numeric
        min_samples_split: minimum number of samples required to split a node, numeric
        '''
        self.datasets = datasets
        self.ex_lis = ex_lis
        self.dep = dep
        self.max_depth = max_depth
        self.min_samples = min_samples
        self.min_samples_leaf = min_samples_leaf
        self.min_samples_split = min_samples_split

    def tree_to_code(self, tree, feature_names):
        tree_ = tree.tree_
        feature_name = [feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!" for i in tree_.feature]
        print ("def tree({}):".format(", ".join(feature_names)))
         
        p_value = [i[0][0] for i in tree_.value]  # mean label value per node
        
        def recurse(node, depth):
            indent = "  " * depth
            if tree_.feature[node] != _tree.TREE_UNDEFINED:
                name = feature_name[node]
                threshold = tree_.threshold[node]
                print ("{}if {} <= {}:".format(indent, name, threshold))
                recurse(tree_.children_left[node], depth + 1)
                print ("{}else:  # {} > {}".format(indent, name, threshold))
                recurse(tree_.children_right[node], depth + 1)
            else:
                print ("{}return {}".format(indent, tree_.value[node]))
                    

        recurse(0, 1)
    
    def get_lineage(self, tree, feature_names):
        left      = tree.tree_.children_left
        right     = tree.tree_.children_right
        threshold = tree.tree_.threshold
        features  = [feature_names[i] for i in tree.tree_.feature]

        # get ids of child nodes
        idx = np.argwhere(left == -1)[:,0]     

        def recurse(left, right, child, lineage=None):          
            if lineage is None:
                lineage = [child]
            if child in left:
                parent = np.where(left == child)[0].item()
                split = 'l'
            else:
                parent = np.where(right == child)[0].item()
                split = 'r'

            lineage.append((parent, split, threshold[parent], features[parent])) 

            if parent == 0:
                lineage.reverse()
                return lineage
            else:
                return recurse(left, right, parent, lineage)
        node_list = []
        for child in idx:
            for node in recurse(left, right, child):
                #print(node)
                node_list.append(node)
        return node_list
    
    def get_interval_node(self, node_list):
        node_dict = {}
        node_id = 1
        for id,i in enumerate(node_list[:-1]):
            if id==0:
                node_dict[node_id] = [i[1:]]
            if id>0:
                if type(node_list[id-1])==tuple:
                    if type(i)==tuple:
                        node_dict[node_id].append(i[1:])
                    else:
                        node_id += 1
                        node_dict[node_id] = []
                else:
                    node_dict[node_id].append(i[1:])
        return node_dict

    def rules(self,tree,feature_names):
        node_list = self.get_lineage(tree,feature_names)
        node_dict = self.get_interval_node(node_list)
        rules_dict = {}
        con_cal = []
        for k,v in node_dict.items():
            con = []
            con_cal_ = []
            for v_ in v:
                con.append("({} < {})".format(v_[2],v_[1]) if v_[0]=='l' else "({} >= {})".format(v_[2],v_[1]))
                con_cal_.append((self.datasets[v_[2]] < v_[1]) if v_[0]=='l' else (self.datasets[v_[2]] >= v_[1]))

            con_cal.append(reduce(lambda x, y: x*y,con_cal_))
            rules_dict[k] = ['&'.join(con),round(self.datasets[con_cal[-1]][self.dep].mean(),4)]

        rules = pd.DataFrame(rules_dict).T
        rules.columns = ['規(guī)則','p']

        return con_cal,rules
    
    def rule_indicator(self, d, cons):
        indicator = {}
        
        for id,con in enumerate(cons):

            reject = np.sum(~con)                          # rejected volume (rule fires)
            fugai = np.sum(con)                            # passed volume
            total = np.sum(con) + np.sum(~con)             # total sample
            wuju = np.sum((~con) * (d[self.dep] == 0))     # good samples rejected (disturbed)
            wufang = d[con][self.dep].sum()                # bad samples passed (misses)
            lanjie_hei = total - fugai - wuju              # bad samples rejected
            shengyu_bai = fugai - wufang                   # good samples passed
            fugai_lv = round(fugai / total, 4)             # pass rate

            default_rate = round(wufang / np.sum(con), 4)  # default rate among passed samples
            precise = round((reject - wuju) / reject, 4)
            recall = round((reject - wuju) / d[self.dep].sum(), 4)
            distrub = round(np.sum((~con) * (d[self.dep] == 0)) / np.sum(d[self.dep] == 0), 4)
            ks = round(recall - distrub, 4)
            # lift = default rate among passed samples / overall average default rate
            lift = round(default_rate / (d[self.dep].mean()), 4)

            
            indicator[id+1] = {'總攔截量':reject,'攔截壞樣本':lanjie_hei,'攔截白樣本':wuju,'通過量':fugai,'誤放量':wufang,'誤放率(逾期率)':default_rate,'總樣本':total,'通過率':fugai_lv,
                           '準(zhǔn)確率':precise,'召回率':recall,'打擾率':distrub,'ks':ks,'lift倍數(shù)':lift}
        indicator = pd.DataFrame(indicator).T
        indicator[['總攔截量','攔截壞樣本','攔截白樣本','通過量','誤放量','總樣本']] = indicator[['總攔截量','攔截壞樣本','攔截白樣本','通過量','誤放量','總樣本']].astype(int)
  
        
        return indicator

    
    def fit_plot(self):
        # tree.DecisionTreeClassifier would also work; the regressor's leaf values are the node bad rates
        dtree = tree.DecisionTreeRegressor(max_depth=self.max_depth,
                                           min_samples_leaf=self.min_samples_leaf,
                                           min_samples_split=self.min_samples_split)
        x = self.datasets.drop(self.ex_lis, axis=1)
        y = self.datasets[self.dep]
        dtree = dtree.fit(x, y)
        with open("dt.dot", "w") as f:
            tree.export_graphviz(dtree, out_file=f)
        dot_data = StringIO()
        tree.export_graphviz(dtree, out_file=dot_data,
                             feature_names=x.columns,
                             class_names=[self.dep],
                             filled=True, rounded=True,
                             special_characters=True)
        graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
        #self.tree_to_code(dtree,x.columns)
        print('\n\n')
        
        con_cal,rules = self.rules(dtree, x.columns)

        indicator = self.rule_indicator(self.datasets, con_cal)
        
        cols = ['規(guī)則','總攔截量','攔截壞樣本','攔截白樣本','通過量','誤放量','總樣本','通過率', '誤放率(逾期率)','準(zhǔn)確率', '召回率', '打擾率', 'ks', 'lift倍數(shù)']
        rule = rules.merge(indicator,left_index=True,right_index=True)[cols]
        
        rule.sort_values('誤放率(逾期率)',ascending=True,inplace=True)
        
        return rule,graph.create_png()

def evaluate_func1(df,cols=None,groups=None,label='y',descs=None):
    '''
    Note: non-null samples and null samples are evaluated separately.
    df:     input data, DataFrame
    cols:   columns to evaluate, list
    groups: columns to group by, list; group columns should not contain NaN (fill them beforehand)
    label:  label column name (1 = bad, 0 = good)
    descs:  columns whose bins should be accumulated in descending order; by default all bins are
            accumulated in ascending order, i.e. low-score bins are cumulated into high-score bins
    return: DataFrame with the following columns
        'group_set':     group
        'var_name':      variable name
        'flag':          interception interval (bin)
        'total':         sample count
        'var_empty':     null count
        'var_get':       variable coverage (non-null count)
        'stp':           intercepted sample count
        'pos':           bad sample count
        'neg':           good sample count
        'cunsum_pos':    cumulative bad samples
        'cunsum_neg':    cumulative good samples
        'cunsum_stp':    cumulative intercepted samples
        'total_pos':     total bad samples
        'total_neg':     total good samples
        'total_stp':     total samples
        'intercept':     interception rate
        'precision':     precision
        'recall':        recall
        'disturb':       disturbance rate
        'cum_precision': cumulative precision
        'avg_precision': average precision
        'cum_recall':    cumulative recall
        'cum_disturb':   cumulative disturbance rate
        'bks':           KS per bin
        'ks':            KS value
    '''
    df=df.apply(pd.to_numeric,errors='ignore')
    df[df.select_dtypes('object').columns.tolist()]=df.select_dtypes('object').apply(pd.to_datetime,errors='ignore')
    # compute the metrics for a single group of data
    def define_mertrics(df, label, descs=None):
        
        def __num_q(ser,dropna):
                num=ser.nunique(dropna)
                if num>25:
                    q=10
                elif num>15:
                    q=8
                elif num>=10:
                    q=6
                else:
                    q=num
                return q
        dts=df.select_dtypes('datetime64[ns]').shape[1]
        objs=df.select_dtypes('object').shape[1]
        nums=len(df.columns.tolist())-dts-objs
        print('data has {} datetime columns,{} object columns,{} numeric columns'.format(dts,objs,nums))
        res=[]
        if not descs:
            descs=[]
        for col in tqdm(df.columns.difference([label])):
            na_stp=df[df[col].isna()==True].shape[0]
            na_pos=df[df[col].isna()==True][label].sum()
            na_neg=na_stp-na_pos
            var_get=df.shape[0]-na_stp
            total=df.shape[0]
            ser=df[df[col].isna()==False][col]
            tmp=pd.DataFrame()
            
            # numeric with at least 10 distinct values: quantile-bin first
            if ser.dtype != object and ser.dtype != 'datetime64[ns]' and ser.nunique() >= 10:
                q = __num_q(ser, dropna=True)
                _, bins = pd.qcut(ser, q=q, retbins=True, precision=2, duplicates='drop', labels=False)
                bins[0] = bins[0] - 0.01
                cuts = pd.cut(ser, bins=bins)
                tmp['flag'] = cuts          # bin
                tmp['var_name'] = col       # variable name
                tmp['label'] = df[label]    # label
            # numeric with fewer than 10 distinct values: use the raw values as bins
            elif ser.dtype != object and ser.dtype != 'datetime64[ns]' and ser.nunique() < 10:
                tmp['flag'] = ser
                tmp['var_name'] = col
                tmp['label'] = df[label]
            # non-numeric (categorical)
            else:
                tmp['flag'] = ser
                tmp['var_name'] = col
                tmp = tmp.astype(str)
                tmp['label'] = df[label]
            # aggregate the non-null samples
            result_stp = tmp.groupby(['var_name','flag']).count().rename(columns={'label':'stp'})
            result_pos = tmp.groupby(['var_name','flag'])['label'].sum().to_frame().rename(columns={'label':'pos'}).astype(int)  # bad samples
            result_neg = tmp[tmp['label']==0].groupby(['var_name','flag'])['label'].count().to_frame().rename(columns={'label':'neg'}).astype(int)  # good samples
            merge_result = result_stp.merge(result_pos,how='left',on=['var_name','flag']).merge(result_neg,how='left',on=['var_name','flag'])
            merge_result.reset_index(inplace=True)
            merge_result['rank'] = pd.Series([i+1 for i in range(merge_result.shape[0])])
            if ser.dtype!=object and ser.dtype!='datetime64[ns]' and ser.nunique()>=10:
                merge_result['flag'] = merge_result['flag'].cat.add_categories([0])
            merge_result.fillna(0,inplace=True)
            if col in descs:
                merge_result.sort_values(by=['rank'],axis=0,ascending=False,inplace=True,na_position='last')    
            merge_result['cunsum_pos'] = merge_result.groupby('var_name')['pos'].cumsum().astype(int)  # cumulative bad samples
            merge_result['cunsum_neg'] = merge_result.groupby('var_name')['neg'].cumsum().astype(int)  # cumulative good samples
            merge_result['cunsum_stp'] = merge_result.groupby('var_name')['stp'].cumsum().astype(int)  # cumulative intercepted samples
            merge_result['total_pos'] = merge_result['cunsum_pos'].max()
            merge_result['total_neg'] = merge_result['cunsum_neg'].max()
            merge_result['total_stp'] = merge_result['cunsum_stp'].max()
            # build the row for the null samples
            tmpna = pd.DataFrame()
            columns = ['var_name','flag','stp','pos','neg','rank','cunsum_pos','cunsum_neg','cunsum_stp','total_pos','total_neg','total_stp']
            tmpna[columns] = pd.DataFrame(np.array([[col,'empty',na_stp,na_pos,na_neg,0,na_pos,na_neg,na_stp,na_pos,na_neg,na_stp]]),columns=columns)
            # append the null-sample row to the non-null results
            merge_result = pd.concat([merge_result,tmpna],axis=0)
            merge_result['var_get'] = var_get
            merge_result['var_empty'] = na_stp
            merge_result['total'] = total
            res.append(merge_result)
        res=pd.concat(res,axis=0)
        res = res.apply(pd.to_numeric, errors='ignore')
        # compute the evaluation metrics (null values are kept in a separate 'empty' bin)
        res['get_rate'] = res['var_get']/(res['total']+0.001)              # variable coverage
        res['intercept'] = res['stp']/(res['total_stp']+0.001)             # interception rate
        res['precision'] = res['pos']/(res['stp']+0.001)                   # precision
        res['recall'] = res['pos']/(res['total_pos']+0.001)                # recall
        res['disturb'] = res['neg']/(res['total_neg']+0.001)               # disturbance rate
        res['cum_precision'] = res['cunsum_pos']/(res['cunsum_stp']+0.001) # cumulative precision
        res['avg_precision'] = res['total_pos']/(res['total_stp']+0.001)   # average precision
        res['cum_recall'] = res['cunsum_pos']/(res['total_pos']+0.001)     # cumulative recall
        res['cum_disturb'] = res['cunsum_neg']/(res['total_neg']+0.001)    # cumulative disturbance rate
        res['bks'] = abs(res['cum_recall']-res['cum_disturb'])             # KS per bin
        res = res.merge(res[res['flag']!='empty'].groupby('var_name')['bks'].max().to_frame().reset_index().rename(columns={'bks':'ks'}),how='left',on=['var_name'])
        rs = res.drop_duplicates(subset=None, keep='first', inplace=False)  # de-duplicate
        # keep the output columns in a fixed order
        retcols = ['var_name','flag','var_empty','var_get','total','get_rate','stp','pos','neg','cunsum_stp','cunsum_pos','cunsum_neg','total_stp','total_pos','total_neg','intercept','precision','recall','disturb','cum_precision','avg_precision','cum_recall','cum_disturb','bks','ks']
        rs = rs[retcols]

        # rename the metrics to their display names (these names are referenced downstream)
        rs.rename(columns={'var_name':'變量'
                           ,'flag':'攔截區(qū)間'
                           ,'stp':'攔截樣本數(shù)'
                           ,'pos':'黑樣本數(shù)'
                           ,'neg':'白樣本數(shù)'
                           ,'cunsum_pos':'累計(jì)黑樣本數(shù)'
                           ,'cunsum_neg':'累計(jì)白樣本數(shù)'
                           ,'cunsum_stp':'累計(jì)攔截?cái)?shù)'
                           ,'intercept':'攔截率'
                           ,'precision':'準(zhǔn)確率'
                           ,'recall':'召回率'
                           ,'disturb':'打擾率'
                           ,'cum_precision':'累計(jì)準(zhǔn)確率'
                           ,'avg_precision':'平均準(zhǔn)確率'
                           ,'cum_recall':'累計(jì)召回率'
                           ,'cum_disturb':'累計(jì)打擾率'
                           ,'bks':'ks區(qū)間值'
                           ,'ks':'ks值'
                          },inplace=True)
        
        return rs
        
    # run the metric computation for every group
    group_name='group_set'
    tmp=[]
    if not cols:
        cols=df.columns.tolist()
    if not groups:
        groups=['nan_group_set']
        df[groups[0]]='&&&'
    df[groups]=df[groups].astype(str)
    df[groups]=df[groups].astype(str).fillna('ng')
    if len(groups)>1:
        df['group_set']=df[groups[0]].str.cat([df[g] for g in groups[1:]],sep='_')
    else:
        df['group_set']=df[groups[0]]
    group_set=df['group_set'].unique()
    for k in group_set:
        df_new=df[df['group_set']==k][cols+[label]]
        res=define_mertrics(df_new,label=label,descs=descs)
        columns=res.columns.tolist()
        res[group_name]=k
        res=res[[group_name]+columns]
        tmp.append(res)       
    outs=pd.concat(tmp,axis=0)
    return outs
print('Loaded at:', strftime("%Y-%m-%d %H:%M:%S", localtime()))

2荒适、讀取數(shù)據(jù)

data_changjing=pd.read_csv('*/全域mob3.csv')

# Reading one sheet from an Excel file that contains several sheets
data_Tencent_lfq=pd.read_excel('*.xlsx',sheet_name='測試')

# Join two DataFrames (similar to Excel VLOOKUP); `on` can take a single key or a list of keys
data_new3=pd.merge(data_changjing,data_y,on=['num_id'])

# Stack two DataFrames row-wise (DataFrame.append is deprecated in recent pandas; pd.concat does the same)
data_new3=pd.concat([data_new3,data_new2])
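
A toy example of the VLOOKUP-style join (the data is made up):

left  = pd.DataFrame({'num_id': [1, 2, 3], 'score':   [620, 580, 700]})
right = pd.DataFrame({'num_id': [1, 2, 4], 'overdue': [0, 1, 0]})
pd.merge(left, right, on=['num_id'])               # inner join on one key (default)
pd.merge(left, right, on=['num_id'], how='left')   # keep every row of the left frame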

# Basic inspection of a DataFrame
data_new3.describe()          # distribution summary of the numeric columns
data_new3.info()              # dtypes, column names, non-null counts
data_new3.shape               # (rows, columns)
data_new3.dtypes              # dtype of each column
data_new3[''].value_counts()  # value distribution of a single column
data_new3.rename(columns={'score':'Bscore'},inplace=True)  # rename a column

data_new3=data_new3[['A','B','C','D','E']]  # keep only the columns you need (e.g. A-E out of A-G)
# groupby with aggregation functions
# groupby accepts a single key or a list of keys, e.g. groupby(['bins2']) or groupby(['A','B','C'])
def ration(x):
    return x.sum()/x.count()

res2.groupby(['bins2'])['mob4'].agg(['count','sum',ration])

# agg accepts count, min, max, mean, sum, or a user-defined function
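
A toy groupby/agg example using the ration helper defined above (the data is made up):

demo = pd.DataFrame({'bins2': ['(0, 10]', '(0, 10]', '(10, 20]', '(10, 20]'],
                     'mob4':  [0, 1, 1, 1]})
demo.groupby(['bins2'])['mob4'].agg(['count', 'sum', ration])   # ration = bad rate per bin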

# Selecting and previewing a subset of columns
data1=data_new3[['num_id','fina_date','zhiye']]
data1.head()

數(shù)據(jù)的等頻與等距分箱,并將分箱結(jié)果存到新列

# Equal-frequency binning
res2['bins']=pd.qcut(res2['score'],10,duplicates='drop',retbins=True)[0]

# Equal-width binning
res2['bins2']= pd.cut(res2['score'],10,duplicates='drop',retbins=True)[0]
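
A toy binning example on random scores showing both methods (the data is made up):

scores = pd.DataFrame({'score': np.random.RandomState(0).randint(300, 900, 1000)})
scores['bins'] = pd.qcut(scores['score'], 10, duplicates='drop')   # equal-frequency bins
scores['bins2'] = pd.cut(scores['score'], 10)                      # equal-width bins
scores['bins'].value_counts().sort_index()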

數(shù)據(jù)透視

# pd.pivot_table(result_with_overdue,index=['RI_CFV32_lvl'],columns=['bins'],aggfunc=['count'],values=['mob4'],fill_value=0,margins=True, dropna=True)
pd.pivot_table(result_with_overdue,index=['RI_CFV32_lvl'],columns=['bins'],values=['mob4'],fill_value=0,aggfunc=['count'])  # aggfunc controls how the values are aggregated

pd.pivot_table(result_with_overdue,index=['RI_CFV32_lvl'],columns=['mob4'],values=['num_id'],fill_value=0,aggfunc=['count'])
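
A toy pivot example with made-up data mirroring the calls above:

demo = pd.DataFrame({'lvl':    ['A', 'A', 'B', 'B', 'B'],
                     'bins':   ['low', 'high', 'low', 'high', 'high'],
                     'mob4':   [0, 1, 0, 0, 1],
                     'num_id': [1, 2, 3, 4, 5]})
pd.pivot_table(demo, index=['lvl'], columns=['bins'], values=['num_id'],
               fill_value=0, aggfunc=['count'])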

Getting the column names of a DataFrame

data_new3.columns

結(jié)果:
Index(['num_id', 'zhiye', 'weiyue', 'gongzhai', 'qingchang', 'zhuxing', 'lvyue', 'shouxin', 'xiaofei', 'xingqu', 'chengzhang', 'overdue'], dtype='object')
3盛霎、評分卡Python包的操作使用-scorecardpy

import scorecardpy as sc
# Automatic optimal binning with woebin (tree-based by default; chi-merge is also supported)
bins_new=sc.woebin(data_new3.loc[data_new3.overdue!=-1,['gongzhai','overdue']], y="overdue")
bins_new # binning result

# Plot the distribution of each binning result
woebin_plot=sc.woebin_plot(bins_new)
woebin_plot

# The binning result for a single variable is a DataFrame
bins_new['gongzhai']

# so normal DataFrame operations work on bins_new
bins_new['gongzhai']['count'].sum()
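
Beyond binning and plotting, a rough end-to-end scorecard flow with scorecardpy might look like the sketch below; the feature selection and the logistic-regression settings are assumptions for illustration, not the model actually used in this post:

from sklearn.linear_model import LogisticRegression

dt = data_new3.loc[data_new3.overdue != -1, ['gongzhai', 'weiyue', 'overdue']]

bins = sc.woebin(dt, y='overdue')            # bin every feature
dt_woe = sc.woebin_ply(dt, bins)             # replace raw values with WOE values
X = dt_woe.drop(columns=['overdue'])         # *_woe feature columns
y = dt_woe['overdue']

lr = LogisticRegression(C=0.1, solver='lbfgs').fit(X, y)

card = sc.scorecard(bins, lr, X.columns)     # points table per variable and bin
score = sc.scorecard_ply(dt, card)           # total score for each sample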

Plot-related function calls

plt_data=Score_set_overdue[(Score_set_overdue['order_month_x']=='2019-11')&(Score_set_overdue['class_y']=='lfq')]
ks=PlotKS(plt_data['pred_score'],plt_data['overdue'],n=20,asc=True)
ks
plt.show()
(figure: KS curve)
bins_new=sc.woebin(plt_data.loc[plt_data.overdue!=-1,['pred_score','overdue']], y="overdue")
woebin_plot=sc.woebin_plot(bins_new)
plt.figure(figsize=(5,5))
woebin_plot
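
As far as I recall, sc.woebin_plot returns a dict of figures keyed by variable name, so one plot can also be shown explicitly:

woebin_plot['pred_score']   # figure for the 'pred_score' variable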
(figure: overdue rate by score bin)

Multi-feature crossing and rule selection

# Load the data
cross_data=Score_set_overdue[(Score_set_overdue['order_month_x']=='2020-01')&(Score_set_overdue['class_y']=='dae')]
# Columns excluded from modelling (everything except the two score features), including the label
ex_list=cross_data.columns.difference(['Bscore','Tencent_score']).tolist()

# Fit the decision tree and extract the rules
rules,graph = rule_tree(datasets=cross_data, ex_lis=ex_list,max_depth=2,dep='overdue', min_samples=0.01,
                        min_samples_leaf=200, min_samples_split=200).fit_plot()

Image(graph)
rules # inspect the selected rules
(figure: decision-tree visualization)

Working with the scorecard evaluation output

outs_Bscore=evaluate_func1(Bscore_Score_set_overdue[Bscore_Score_set_overdue['class_x']!='None'],cols=['Bscore'],groups=['order_month','class_x'],label='overdue')
outs_Bscore[outs_Bscore['攔截區(qū)間']!='empty']
最后編輯于
?著作權(quán)歸作者所有,轉(zhuǎn)載或內(nèi)容合作請聯(lián)系作者
  • 序言:七十年代末愤炸,一起剝皮案震驚了整個(gè)濱河市期揪,隨后出現(xiàn)的幾起案子,更是在濱河造成了極大的恐慌规个,老刑警劉巖凤薛,帶你破解...
    沈念sama閱讀 218,204評論 6 506
  • 序言:濱河連續(xù)發(fā)生了三起死亡事件姓建,死亡現(xiàn)場離奇詭異,居然都是意外死亡缤苫,警方通過查閱死者的電腦和手機(jī)速兔,發(fā)現(xiàn)死者居然都...
    沈念sama閱讀 93,091評論 3 395
  • 文/潘曉璐 我一進(jìn)店門,熙熙樓的掌柜王于貴愁眉苦臉地迎上來活玲,“玉大人涣狗,你說我怎么就攤上這事∫沓妫” “怎么了屑柔?”我有些...
    開封第一講書人閱讀 164,548評論 0 354
  • 文/不壞的土叔 我叫張陵,是天一觀的道長珍剑。 經(jīng)常有香客問我掸宛,道長,這世上最難降的妖魔是什么招拙? 我笑而不...
    開封第一講書人閱讀 58,657評論 1 293
  • 正文 為了忘掉前任唧瘾,我火速辦了婚禮,結(jié)果婚禮上别凤,老公的妹妹穿的比我還像新娘饰序。我一直安慰自己,他們只是感情好规哪,可當(dāng)我...
    茶點(diǎn)故事閱讀 67,689評論 6 392
  • 文/花漫 我一把揭開白布求豫。 她就那樣靜靜地躺著,像睡著了一般诉稍。 火紅的嫁衣襯著肌膚如雪蝠嘉。 梳的紋絲不亂的頭發(fā)上,一...
    開封第一講書人閱讀 51,554評論 1 305
  • 那天杯巨,我揣著相機(jī)與錄音蚤告,去河邊找鬼。 笑死服爷,一個(gè)胖子當(dāng)著我的面吹牛杜恰,可吹牛的內(nèi)容都是我干的。 我是一名探鬼主播仍源,決...
    沈念sama閱讀 40,302評論 3 418
  • 文/蒼蘭香墨 我猛地睜開眼心褐,長吁一口氣:“原來是場噩夢啊……” “哼!你這毒婦竟也來了笼踩?” 一聲冷哼從身側(cè)響起檬寂,我...
    開封第一講書人閱讀 39,216評論 0 276
  • 序言:老撾萬榮一對情侶失蹤,失蹤者是張志新(化名)和其女友劉穎戳表,沒想到半個(gè)月后桶至,有當(dāng)?shù)厝嗽跇淞掷锇l(fā)現(xiàn)了一具尸體,經(jīng)...
    沈念sama閱讀 45,661評論 1 314
  • 正文 獨(dú)居荒郊野嶺守林人離奇死亡匾旭,尸身上長有42處帶血的膿包…… 初始之章·張勛 以下內(nèi)容為張勛視角 年9月15日...
    茶點(diǎn)故事閱讀 37,851評論 3 336
  • 正文 我和宋清朗相戀三年镣屹,在試婚紗的時(shí)候發(fā)現(xiàn)自己被綠了。 大學(xué)時(shí)的朋友給我發(fā)了我未婚夫和他白月光在一起吃飯的照片价涝。...
    茶點(diǎn)故事閱讀 39,977評論 1 348
  • 序言:一個(gè)原本活蹦亂跳的男人離奇死亡女蜈,死狀恐怖,靈堂內(nèi)的尸體忽然破棺而出色瘩,到底是詐尸還是另有隱情伪窖,我是刑警寧澤,帶...
    沈念sama閱讀 35,697評論 5 347
  • 正文 年R本政府宣布居兆,位于F島的核電站覆山,受9級特大地震影響,放射性物質(zhì)發(fā)生泄漏泥栖。R本人自食惡果不足惜簇宽,卻給世界環(huán)境...
    茶點(diǎn)故事閱讀 41,306評論 3 330
  • 文/蒙蒙 一、第九天 我趴在偏房一處隱蔽的房頂上張望吧享。 院中可真熱鬧魏割,春花似錦、人聲如沸钢颂。這莊子的主人今日做“春日...
    開封第一講書人閱讀 31,898評論 0 22
  • 文/蒼蘭香墨 我抬頭看了看天上的太陽殊鞭。三九已至遭垛,卻和暖如春,著一層夾襖步出監(jiān)牢的瞬間钱豁,已是汗流浹背耻卡。 一陣腳步聲響...
    開封第一講書人閱讀 33,019評論 1 270
  • 我被黑心中介騙來泰國打工, 沒想到剛下飛機(jī)就差點(diǎn)兒被人妖公主榨干…… 1. 我叫王不留牲尺,地道東北人卵酪。 一個(gè)月前我還...
    沈念sama閱讀 48,138評論 3 370
  • 正文 我出身青樓,卻偏偏與公主長得像谤碳,于是被迫代替她去往敵國和親溃卡。 傳聞我的和親對象是個(gè)殘疾皇子,可洞房花燭夜當(dāng)晚...
    茶點(diǎn)故事閱讀 44,927評論 2 355