本文章數(shù)據(jù)截取自一家銀行的真實客戶與交易數(shù)據(jù);涉及客戶主記錄、帳號、交易涮拗、業(yè)務(wù)和信用卡數(shù)據(jù);下載地址:https://pan.baidu.com/s/1mjg8OGS
Note:
本文同時用了dplyr包(處理速度很快)和sqldf包(sql語句寫起來比較方便迂苛,但是速度沒有dplyr快)的方法進行數(shù)據(jù)整理
導(dǎo)入數(shù)據(jù)
library(tidyverse)
library(sqldf)
accounts <- read_csv("accounts.csv")
clients <- read_csv("clients.csv")
disp <- read_csv("disp.csv")
loans <- read_csv("loans.csv")
trans <- read_csv("trans.csv")
……………………………………………………
使用“l(fā)oans”數(shù)據(jù)三热,根據(jù)“status”變量生成違約標(biāo)識變量(bad_good),
其中當(dāng)?shù)扔趕tatus=A時三幻,#取值為bad_good=0就漾,
其中當(dāng)?shù)扔趕tatus in (B,D)時,取值為bad_good=1念搬,
等于status=C時抑堡,bad_good為缺失值。*/
A代表合同終止朗徊,沒問題首妖;B代表合同終止,貸款沒有支付荣倾;
C代表合同處于執(zhí)行期悯搔,至今正常;D代表合同處于執(zhí)行期舌仍,
欠債狀態(tài)。
loans <- mutate(loans,bad_good=ifelse(status %in% c("B","D"),1,ifelse(status =="A",0,NA_integer_)))
********************************************************
是否違約與借款人的年齡是否有關(guān)系通危?
借款人的年齡信息在clients表中铸豁,所以要將兩個表連接起來
#dplyr方法
dpl_data <- left_join(loans,disp,by=c("account_id")
) %>% left_join(y=clients,by=c("client_id")
) %>% filter(type=="所有者") %>% select(-one_of(c("disp_id","type")))
#SQL方法
sql_data <- sqldf("select a.*,c.sex,c.birth_date,c.district_id,
c.client_id from loans as a
left join disp as b on a.account_id=b.account_id
left join clients as c on b.client_id=c.client_id
where b.type='所有者'")
setequal(dpl_data,sql_data)
dpl_data <- mutate(dpl_data,age=floor((as.Date(dpl_data$date)-as.Date(dpl_data$birth_date))/365))
aggregate(dpl_data$age,by=list(dpl_data$bad_good),mean)
Group.1 x
1 0 36.60591
2 1 37.55263
從圖中可以簡單看出年齡在不同好壞狀態(tài)下的分布相差不大,相關(guān)性應(yīng)該不大菊碟,不過還要進行假設(shè)檢驗才能得出結(jié)論
ggplot(dpl_data,mapping = aes(x=as.factor(bad_good),y=age))+geom_boxplot()
*****************************************************************
是否違約與借款人的資產(chǎn)否有關(guān)系节芥?
因為賬戶余額在“trans”表中,所以要將loans表和trans表連接
dpl <- left_join(loans,trans,by="account_id",suffix = c("_x", "_y")) %>% arrange(account_id,date_y) %>% select(
-one_of(c("trans_id","type","operation","amount_y","k_symbol","bank","account" ))
) %>% rename(date=date_x,amount=amount_x)
data2<-sqldf("select a.*,b.balance,b.date as t_date
from loans as a
left join trans as b on a.account_id=b.account_id
order by a.account_id,t_date")
dpl$date2 <- as.Date(dpl$date)
dpl$date2_y <- as.Date(dpl$date_y)
dpl$balance2 <- as.numeric(substr(gsub(",","",dpl$balance),2,nchar(dpl$balance)))
dpl <- select(dpl,-one_of("date","date_y","balance"))
只需要貸款前一年內(nèi)的賬戶余額
dpl2 <- filter(dpl,date2>date2_y & date2<=date2_y+365)
dpl4<-select(dpl2,account_id,status,amount,balance2) %>%
group_by(account_id,amount,status) %>%
summarize(avg_balance=mean(balance2,na.rm = T),stdev_balance2=sd(balance2,na.rm = T))%>%
arrange(account_id)
data4<-sqldf("select a.account_id,a.status,a.amount,
avg(balance2) as avg_balance,
stdev(balance2) as stdev_balance2
from dpl2 as a
group by a.account_id
order by a.account_id")
dpl4$bad_good<-ifelse(dpl4$status=="B"
| dpl4$status=="D",1,
ifelse(dpl4$status=="A",0,NA))
dpl4$bad_good <- as.factor(dpl4$bad_good)
資產(chǎn)高低和違約的可能性是否有關(guān)系?
從圖可以看出資產(chǎn)低的客人違約的可能性較高
with(dpl4,{
aggregate(avg_balance,by=list(bad_good),mean)
})
Group.1 x
1 0 42499.40
2 1 34642.31
ggplot(dpl4,mapping = aes(x=bad_good,y=avg_balance))+geom_boxplot()
貸款數(shù)額高的客人違約的可能性較高
with(dpl4,{
tapply(amount,bad_good,mean)
})
0 1
91641.46 205002.00
ggplot(dpl4,mapping = aes(x=bad_good,y=amount))+geom_boxplot()
貸款額度超出資產(chǎn)越高的客人違約的可能性較高
dpl4 <- mutate(dpl4,rate=amount/avg_balance)
with(dpl4,{tapply(rate,bad_good,mean)})
ggplot(dpl4,mapping = aes(x=bad_good,y=rate))+geom_boxplot()
資產(chǎn)波動越高的客人違約的可能性較高,不過不明顯
with(dpl4,{tapply(stdev_balance2,bad_good,mean)})
ggplot(dpl4,mapping = aes(x=bad_good,y=stdev_balance2))+geom_boxplot()