Credit Card Scoring Model Optimization

Original source:

http://blog.csdn.net/csqazwsxedc/article/details/51225156

I have already reposted it on my blog.

Issues with the original:

1. Getting the source data. It has to be downloaded from a site outside China (https://www.kaggle.com/c/GiveMeSomeCredit/data), which requires registering an account; registration uses a Google CAPTCHA, and since Google is blocked in mainland China, the CAPTCHA only displays through a proxy/VPN. A quick workaround is the 360 browser's built-in proxy feature (five-minute trial) or any other VPN tool. Alternatively, I have uploaded the data to my resource space (http://download.csdn.net/detail/abc200941410128/9904440); download it from there.

2. The earlier part of the code was missing from the original post; it is completed here.

3. The binning for the WOE transformation in the original post is entirely manual equal-width binning, which is quite unreasonable. Automatic binning should be used instead (chi-square merging, or the optimal binning in the smbinning package); in any case avoid manual equal-width binning, and use at least equal-frequency binning (a minimal sketch of that baseline follows this list).

4. When fitting the logistic regression, the predictors should preferably be the WOE values corresponding to the binned variables; the model performs better than with the raw values, which is why logistic scoring models generally use WOE binning. That said, using the raw variables directly also works.
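A minimal sketch of the equal-frequency baseline from point 3, combined with the WOE mapping from point 4 (the helper name eqfreq_woe is illustrative and not from the original post; the full pipeline below uses chi-merge and smbinning instead):

eqfreq_woe <- function(x, y, n = 5) {
  # equal-frequency cut points (duplicates removed for heavily tied variables)
  cuts <- unique(quantile(x, probs = seq(0, 1, 1/n), na.rm = TRUE))
  bins <- cut(x, cuts, include.lowest = TRUE)
  tab  <- table(y, bins)                                  # row 1: y == 0, row 2: y == 1
  woe  <- log((tab[2, ]/sum(tab[2, ])) / (tab[1, ]/sum(tab[1, ])))
  as.numeric(woe[as.character(bins)])                     # per-observation WOE
}
# e.g. traindata$x2_woe <- eqfreq_woe(traindata$x2, traindata$y)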

The code follows:

setwd('F:/study/code/R_ompany/credit')
#library(devtools)  
#install_github("riv","tomasgreif") 


traindata0 <- read.csv("cs-training.csv",stringsAsFactors =F)
traindata<-traindata0[,2:12]
#traindata<-traindata0[,3:12]
y<-traindata0[,2]
names(traindata)<-c('y','x1','x2','x3','x4','x5','x6','x7','x8','x9','x10')
# Missing-value analysis
summary(traindata)

library(VIM) 
matrixplot(traindata)

library(mice)
md.pattern(traindata)

# Impute missing values with kNN
library(DMwR)
traindata<-knnImputation(traindata,k=10,meth = "weighAvg")# kNN imputation of missing values is quite slow
traindata1<-traindata
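# kNN on 150k rows is slow; a quicker, cruder alternative (a sketch, left
# commented out like the other options above): fill each column's NAs with its median.
#for (j in names(traindata)) {
#  traindata[[j]][is.na(traindata[[j]])] <- median(traindata[[j]], na.rm = TRUE)
#}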

### Outlier detection and treatment
traindata<-traindata1
names(traindata)<-c('y','x1','x2','x3','x4','x5','x6','x7','x8','x9','x10')

boxplot(traindata)# boxplot; x5 dwarfs the scale of the other variables
unique(traindata$x2)# 0 is an abnormal value
traindata<-traindata[-which(traindata$x2==0),] # drop the outliers

boxplot(traindata[,c(4,8,10)])# boxplots
unique(traindata$x3)# 96 and 98 are abnormal codes
traindata<-traindata[-which(traindata$x3>=96),] # drop the outliers
#which(traindata$x3 %in% c(96,98))

boxplot(traindata$x6)# boxplot
unique(traindata$x6)# 96 and 98 are abnormal codes
boxplot(traindata$x7)# boxplot
unique(traindata$x7)# 96 and 98 are abnormal codes
boxplot(traindata$x8)# boxplot
unique(traindata$x8)# 96 and 98 are abnormal codes
boxplot(traindata$x9)# boxplot
unique(traindata$x9)# 96 and 98 are abnormal codes


########### Quantiles of each numeric variable and counts beyond the upper bound

samp_num<-traindata[,c(2,5,6,11)]# pull out the continuous variables
nn <- nrow(samp_num)

mystats=function(x){
  num_unique=length(unique(x))   # number of distinct values
  mmean=mean(x)
  qq_bin=as.numeric(quantile(x,prob=c(0,0.01,0.05,0.1,0.25,0.5
                                      ,0.75,0.9,0.95,0.99,1),na.rm=T))
  
  b_up=mean(x)+3*sd(x)           # upper bound: mean + 3 standard deviations
  n_max=sum(x>b_up,na.rm=T)      # count of values above the upper bound
  n_max_p=n_max/nn               # share of all rows above the upper bound
  return(c(num_unique,mmean,b_up,n_max,n_max_p,qq_bin))
}
dim(samp_num)
tt <- apply(samp_num,2,mystats)
ttt <- t(as.data.frame(tt))

# Cap extreme values
extre_value=function(x){
  x_limit=mean(x)+3*sd(x)# upper bound
  x[x<0]=0# negative values are replaced with 0
  x[x>x_limit]=x_limit# values above the upper bound are capped at the bound
  rm(x_limit);gc()
  return(x)
}
d_num <- apply(traindata[,c(2,5,6)],2,extre_value)## cap extreme values in each input column
traindata[,c(2,5,6)]<-d_num

### The remaining variables are left untreated.

################## Variable analysis
####### Univariate analysis
library("ggplot2")
ggplot(traindata, aes(x = x2, y = ..density..)) + geom_histogram(fill = "blue", colour = "grey60", size = 0.2, alpha = 0.2) + geom_density()
# The age variable is roughly normally distributed, consistent with the assumptions of the analysis.
ggplot(traindata, aes(x = x5, y = ..density..)) + geom_histogram(fill = "blue", colour = "grey60", size = 0.2, alpha = 0.2) + geom_density() + xlim(1, 20000)
# Monthly income is also roughly normally distributed (with the x-axis truncated), adequate for the analysis.

###################### Binning (optional; this section can be skipped in favour of the smbinning approach later)
# Chi-merge automatic binning function
chimerge=function(data,begin=1000,end=4)    # data has two columns: a numeric variable, then a 0/1 response
  # begin is the number of initial segments; end controls the final number of bins
{
  breaks=seq(min(data[,1]),max(data[,1]),(max(data[,1])-min(data[,1]))/begin)  # initial equal-width breakpoints
  data[,3]=cut(data[,1],breaks)             # discretize the numeric variable
  tj1=table(data[data[,2]==1,3])            # per-bin counts for y == 1
  tj0=table(data[data[,2]==0,3])            # per-bin counts for y == 0
  
  while(length(breaks)>(end+2))
  {  
    kafang=c()
    for (i in 1:(length(breaks)-2))          # chi-square of each interval against the next
    {a=tj1[i]
    b=tj1[i+1]
    c=tj0[i]
    d=tj0[i+1]
    if (a+b==0 || d+c==0 || a+c==0 || b+d==0)
      kafang[i]=0
    else
      kafang[i]=((a*d-b*c)^2*(a+b+c+d)/(a+b)/(d+c)/(a+c)/(b+d))          
    }
    
    index=which(kafang==min(kafang))[1]         # index of the pair with the smallest chi-square
    breaks=breaks[-(index+1)]                   # merge the two intervals
    tj1[index]=tj1[index]+tj1[index+1]            
    tj0[index]=tj0[index]+tj0[index+1]
    tj1=tj1[-(index+1)]
    tj0=tj0[-(index+1)]
    
  }
  breaks
}
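# The loop above uses the 2x2 chi-square shortcut: for adjacent intervals with
# counts a,b (y==1) and c,d (y==0), chi2 = (ad-bc)^2*(a+b+c+d)/((a+b)(c+d)(a+c)(b+d));
# the pair of intervals with the smallest chi2 is the most similar, so it is merged first.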

mydata<-traindata
brks<- chimerge(traindata[,c(2,1)])
mydata$x1<-cut(traindata$x1,brks,include.lowest =T)
ss<-table(mydata[,c(1,2)])
ss[1,]/ss[2,] ### ideally monotone (increasing or decreasing) across bins
pct0_x1<-ss[1,]/sum(ss[1,])
pct1_x1<-ss[2,]/sum(ss[2,])
woe_x1<-log(pct1_x1/pct0_x1)
woe_x1
pct_x1<-pct1_x1-pct0_x1
iv_x1<-sum(woe_x1*pct_x1)
iv_x1 #0.77
mydata$x1_woe<-as.character(mydata$x1) 
mydata$x1_woe[which(mydata$x1==names(woe_x1[1]))]<-woe_x1[[1]]
mydata$x1_woe[which(mydata$x1==names(woe_x1[2]))]<-woe_x1[[2]]
mydata$x1_woe[which(mydata$x1==names(woe_x1[3]))]<-woe_x1[[3]]
mydata$x1_woe[which(mydata$x1==names(woe_x1[4]))]<-woe_x1[[4]]
mydata$x1_woe[which(mydata$x1==names(woe_x1[5]))]<-woe_x1[[5]]
head(mydata$x1_woe)
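# The per-level which() assignments above are repeated for every variable below.
# A compact equivalent (a sketch, not in the original post): woe_x1 is a named
# vector keyed by bin label, so a named lookup does the whole mapping at once.
map_woe <- function(bins, woe) as.numeric(woe[as.character(bins)])
# e.g. mydata$x1_woe <- map_woe(mydata$x1, woe_x1)   # same values as above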

brks<- chimerge(traindata[,c(3,1)],100,5)
mydata$x2<-cut(traindata$x2,brks,include.lowest =T)
ss<-table(mydata[,c(1,3)])
ss[1,]/ss[2,] ### ideally monotone (increasing or decreasing) across bins
pct0_x2<-ss[1,]/sum(ss[1,])
pct1_x2<-ss[2,]/sum(ss[2,])
woe_x2<-log(pct1_x2/pct0_x2)
woe_x2  ## good WOE pattern
pct_x2<-pct1_x2-pct0_x2
iv_x2<-sum(woe_x2*pct_x2)
iv_x2 #0.255
mydata$x2_woe<-as.character(mydata$x2) 
mydata$x2_woe[which(mydata$x2==names(woe_x2[1]))]<-woe_x2[[1]]
mydata$x2_woe[which(mydata$x2==names(woe_x2[2]))]<-woe_x2[[2]]
mydata$x2_woe[which(mydata$x2==names(woe_x2[3]))]<-woe_x2[[3]]
mydata$x2_woe[which(mydata$x2==names(woe_x2[4]))]<-woe_x2[[4]]
mydata$x2_woe[which(mydata$x2==names(woe_x2[5]))]<-woe_x2[[5]]
mydata$x2_woe[which(mydata$x2==names(woe_x2[6]))]<-woe_x2[[6]]
head(mydata$x2_woe)

unique(traindata$x3)
brks<- chimerge(traindata[,c(4,1)],100,5)
mydata$x3<-cut(traindata$x3,brks,include.lowest =T)
ss<-table(mydata[,c(1,4)])
ss[1,]/ss[2,] ### ideally monotone (increasing or decreasing) across bins
pct0_x3<-ss[1,]/sum(ss[1,])
pct1_x3<-ss[2,]/sum(ss[2,])
woe_x3<-log(pct1_x3/pct0_x3)
woe_x3  ## good WOE pattern
pct_x3<-pct1_x3-pct0_x3
iv_x3<-sum(woe_x3*pct_x3)
iv_x3  #0.457
mydata$x3_woe<-as.character(mydata$x3) 
mydata$x3_woe[which(mydata$x3==names(woe_x3[1]))]<-woe_x3[[1]]
mydata$x3_woe[which(mydata$x3==names(woe_x3[2]))]<-woe_x3[[2]]
mydata$x3_woe[which(mydata$x3==names(woe_x3[3]))]<-woe_x3[[3]]
mydata$x3_woe[which(mydata$x3==names(woe_x3[4]))]<-woe_x3[[4]]
mydata$x3_woe[which(mydata$x3==names(woe_x3[5]))]<-woe_x3[[5]]
mydata$x3_woe[which(mydata$x3==names(woe_x3[6]))]<-woe_x3[[6]]
head(mydata$x3_woe)


unique(traindata$x4)
brks<- chimerge(traindata[,c(5,1)],1000,5)
mydata$x4<-cut(traindata$x4,brks,include.lowest =T)
ss<-table(mydata[,c(1,5)])
ss[1,]/ss[2,] ### ideally monotone (increasing or decreasing) across bins
pct0_x4<-ss[1,]/sum(ss[1,])
pct1_x4<-ss[2,]/sum(ss[2,])
woe_x4<-log(pct1_x4/pct0_x4)
woe_x4  ## poor WOE pattern
pct_x4<-pct1_x4-pct0_x4
iv_x4<-sum(woe_x4*pct_x4)
iv_x4  ###0.025
mydata$x4_woe<-as.character(mydata$x4) 
mydata$x4_woe[which(mydata$x4==names(woe_x4[1]))]<-woe_x4[[1]]
mydata$x4_woe[which(mydata$x4==names(woe_x4[2]))]<-woe_x4[[2]]
mydata$x4_woe[which(mydata$x4==names(woe_x4[3]))]<-woe_x4[[3]]
mydata$x4_woe[which(mydata$x4==names(woe_x4[4]))]<-woe_x4[[4]]
mydata$x4_woe[which(mydata$x4==names(woe_x4[5]))]<-woe_x4[[5]]
mydata$x4_woe[which(mydata$x4==names(woe_x4[6]))]<-woe_x4[[6]]
head(mydata$x4_woe)

unique(traindata$x5)
brks<- chimerge(traindata[,c(6,1)],1000,6)
mydata$x5<-cut(traindata$x5,brks,include.lowest =T)
ss<-table(mydata[,c(1,6)])
ss[1,]/ss[2,] ### ideally monotone (increasing or decreasing) across bins
pct0_x5<-ss[1,]/sum(ss[1,])
pct1_x5<-ss[2,]/sum(ss[2,])
woe_x5<-log(pct1_x5/pct0_x5)
woe_x5  ## poor WOE pattern
pct_x5<-pct1_x5-pct0_x5
iv_x5<-sum(woe_x5*pct_x5)
iv_x5  ###0.227
mydata$x5_woe<-as.character(mydata$x5) 
mydata$x5_woe[which(mydata$x5==names(woe_x5[1]))]<-woe_x5[[1]]
mydata$x5_woe[which(mydata$x5==names(woe_x5[2]))]<-woe_x5[[2]]
mydata$x5_woe[which(mydata$x5==names(woe_x5[3]))]<-woe_x5[[3]]
mydata$x5_woe[which(mydata$x5==names(woe_x5[4]))]<-woe_x5[[4]]
mydata$x5_woe[which(mydata$x5==names(woe_x5[5]))]<-woe_x5[[5]]
mydata$x5_woe[which(mydata$x5==names(woe_x5[6]))]<-woe_x5[[6]]
mydata$x5_woe[which(mydata$x5==names(woe_x5[7]))]<-woe_x5[[7]]
head(mydata$x5_woe)


unique(traindata$x6)
brks<- chimerge(traindata[,c(7,1)],100,6)
mydata$x6<-cut(traindata$x6,brks,include.lowest =T)
ss<-table(mydata[,c(1,7)])
ss[1,]/ss[2,] ### ideally monotone (increasing or decreasing) across bins
pct0_x6<-ss[1,]/sum(ss[1,])
pct1_x6<-ss[2,]/sum(ss[2,])
woe_x6<-log(pct1_x6/pct0_x6)
woe_x6  ## fairly good WOE pattern
pct_x6<-pct1_x6-pct0_x6
iv_x6<-sum(woe_x6*pct_x6)
iv_x6  ###0.078
mydata$x6_woe<-as.character(mydata$x6) 
mydata$x6_woe[which(mydata$x6==names(woe_x6[1]))]<-woe_x6[[1]]
mydata$x6_woe[which(mydata$x6==names(woe_x6[2]))]<-woe_x6[[2]]
mydata$x6_woe[which(mydata$x6==names(woe_x6[3]))]<-woe_x6[[3]]
mydata$x6_woe[which(mydata$x6==names(woe_x6[4]))]<-woe_x6[[4]]
mydata$x6_woe[which(mydata$x6==names(woe_x6[5]))]<-woe_x6[[5]]
mydata$x6_woe[which(mydata$x6==names(woe_x6[6]))]<-woe_x6[[6]]
mydata$x6_woe[which(mydata$x6==names(woe_x6[7]))]<-woe_x6[[7]]
head(mydata$x6_woe)

unique(traindata$x7)
brks<- chimerge(traindata[,c(8,1)],100,5)
mydata$x7<-cut(traindata$x7,brks,include.lowest =T)
ss<-table(mydata[,c(1,8)])
ss[1,]/ss[2,] ### ideally monotone (increasing or decreasing) across bins
pct0_x7<-ss[1,]/sum(ss[1,])
pct1_x7<-ss[2,]/sum(ss[2,])
woe_x7<-log(pct1_x7/pct0_x7)
woe_x7  ## fairly good WOE pattern
pct_x7<-pct1_x7-pct0_x7
iv_x7<-sum(woe_x7*pct_x7)
iv_x7  ###0.457
mydata$x7_woe<-as.character(mydata$x7) 
mydata$x7_woe[which(mydata$x7==names(woe_x7[1]))]<-woe_x7[[1]]
mydata$x7_woe[which(mydata$x7==names(woe_x7[2]))]<-woe_x7[[2]]
mydata$x7_woe[which(mydata$x7==names(woe_x7[3]))]<-woe_x7[[3]]
mydata$x7_woe[which(mydata$x7==names(woe_x7[4]))]<-woe_x7[[4]]
mydata$x7_woe[which(mydata$x7==names(woe_x7[5]))]<-woe_x7[[5]]
mydata$x7_woe[which(mydata$x7==names(woe_x7[6]))]<-woe_x7[[6]]
head(mydata$x7_woe)

unique(traindata$x8)
brks<- chimerge(traindata[,c(9,1)],100,5)
mydata$x8<-cut(traindata$x8,brks,include.lowest =T)
ss<-table(mydata[,c(1,9)])
ss[1,]/ss[2,] ### ideally monotone (increasing or decreasing) across bins
pct0_x8<-ss[1,]/sum(ss[1,])
pct1_x8<-ss[2,]/sum(ss[2,])
woe_x8<-log(pct1_x8/pct0_x8)
woe_x8  ## fairly good WOE pattern
pct_x8<-pct1_x8-pct0_x8
iv_x8<-sum(woe_x8*pct_x8)
iv_x8  ###0.0217
mydata$x8_woe<-as.character(mydata$x8) 
mydata$x8_woe[which(mydata$x8==names(woe_x8[1]))]<-woe_x8[[1]]
mydata$x8_woe[which(mydata$x8==names(woe_x8[2]))]<-woe_x8[[2]]
mydata$x8_woe[which(mydata$x8==names(woe_x8[3]))]<-woe_x8[[3]]
mydata$x8_woe[which(mydata$x8==names(woe_x8[4]))]<-woe_x8[[4]]
mydata$x8_woe[which(mydata$x8==names(woe_x8[5]))]<-woe_x8[[5]]
mydata$x8_woe[which(mydata$x8==names(woe_x8[6]))]<-woe_x8[[6]]
head(mydata$x8_woe)

unique(traindata$x9)
brks<- chimerge(traindata[,c(10,1)],100,2)
mydata$x9<-cut(traindata$x9,brks,include.lowest =T)
ss<-table(mydata[,c(1,10)])
ss[1,]/ss[2,] ### ideally monotone (increasing or decreasing) across bins
pct0_x9<-ss[1,]/sum(ss[1,])
pct1_x9<-ss[2,]/sum(ss[2,])
woe_x9<-log(pct1_x9/pct0_x9)
woe_x9  ## fairly good WOE pattern
pct_x9<-pct1_x9-pct0_x9
iv_x9<-sum(woe_x9*pct_x9)
iv_x9  ###0.229
mydata$x9_woe<-as.character(mydata$x9) 
mydata$x9_woe[which(mydata$x9==names(woe_x9[1]))]<-woe_x9[[1]]
mydata$x9_woe[which(mydata$x9==names(woe_x9[2]))]<-woe_x9[[2]]
mydata$x9_woe[which(mydata$x9==names(woe_x9[3]))]<-woe_x9[[3]]
head(mydata$x9_woe)

unique(traindata$x10)
brks<- chimerge(traindata[,c(11,1)],1000,6)
mydata$x10<-cut(traindata$x10,brks,include.lowest =T)
ss<-table(mydata[,c(1,11)])
ss[1,]/ss[2,] ### ideally monotone (increasing or decreasing) across bins
pct0_x10<-ss[1,]/sum(ss[1,])
pct1_x10<-ss[2,]/sum(ss[2,])
woe_x10<-log(pct1_x10/pct0_x10)
woe_x10  ## fairly good WOE pattern
pct_x10<-pct1_x10-pct0_x10
iv_x10<-sum(woe_x10*pct_x10)
iv_x10  ###0.0437
mydata$x10_woe<-as.character(mydata$x10) 
mydata$x10_woe[which(mydata$x10==names(woe_x10[1]))]<-woe_x10[[1]]
mydata$x10_woe[which(mydata$x10==names(woe_x10[2]))]<-woe_x10[[2]]
mydata$x10_woe[which(mydata$x10==names(woe_x10[3]))]<-woe_x10[[3]]
mydata$x10_woe[which(mydata$x10==names(woe_x10[4]))]<-woe_x10[[4]]
mydata$x10_woe[which(mydata$x10==names(woe_x10[5]))]<-woe_x10[[5]]
mydata$x10_woe[which(mydata$x10==names(woe_x10[6]))]<-woe_x10[[6]]
mydata$x10_woe[which(mydata$x10==names(woe_x10[7]))]<-woe_x10[[7]]
head(mydata$x10_woe)

# Compare the IV values across variables
c(iv_x1,iv_x2,iv_x3,iv_x4,iv_x5,iv_x6,iv_x7,iv_x8,iv_x9,iv_x10)
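# A common rule of thumb for reading IV (general scorecard practice, not from the
# original post): IV < 0.02 not predictive; 0.02-0.1 weak; 0.1-0.3 medium; > 0.3 strong.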

#hist(traindata$x1, brks)
#plot(cut(traindata$x1, brks))


####### Correlation check between variables
# Check correlations before modeling; strongly correlated predictors would hurt the model's predictive performance
cor1<-cor(traindata[,1:11])
library("corrplot")
corrplot(cor1)
corrplot(cor1,method = "number")
# The plots show that the correlations between the variables are very small.

####### Split the data
table(traindata$y)
# The table shows a clear class imbalance in the response SeriousDlqin2yrs: only 9,879 observations (about 6.6%) have SeriousDlqin2yrs = 1.
# Imbalanced data could be handled with the SMOTE algorithm, over-sampling the rare class in R (see the sketch after the split below).
# Use the createDataPartition function (data splitting) from the caret package to split the data randomly into two equal halves.
library(caret)
set.seed(1234) 
splitIndex<-createDataPartition(traindata$y,times=1,p=0.5,list=FALSE) 
train<-traindata[splitIndex,] 
test<-traindata[-splitIndex,] 
prop.table(table(train$y)) 
prop.table(table(test$y)) 

# Both halves keep the same class ratio, still about 6.6% positives, so we remain at a good level.
# This split of the data can therefore be used for modeling and prediction.
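# As noted above, SMOTE could be used if the imbalance were more severe. A minimal
# sketch with DMwR::SMOTE (the package was loaded earlier); perc.over/perc.under
# are illustrative, not tuned, and trainSMOTE is not used further in this script.
trainSMOTE <- train
trainSMOTE$y <- as.factor(trainSMOTE$y)   # SMOTE requires a factor response
trainSMOTE <- SMOTE(y ~ ., data = trainSMOTE, perc.over = 100, perc.under = 200)
prop.table(table(trainSMOTE$y))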

########### 5. Logistic regression
# Logistic regression plays the central role in credit scorecard development. Thanks to its properties,
# and with the predictors transformed to weight of evidence (WOE), its results map directly to a summary table: the standard scorecard format.
fit<-glm(y~.,train,family = "binomial")
summary(fit)
# With all variables in the regression the fit is not great: the p-values of x1 and x6, among others, fail the significance test.
# Drop the non-significant variables and regress y on the clearly significant ones (x2, x3, x7, x9).
fit2<-glm(y~x2+x3+x7+x9,train,family = "binomial")
summary(fit2)
# In the second model every variable passes the test and the AIC (Akaike information criterion) is even smaller, so the fit is better.
# fit3 below retains more of the variables for comparison; it is the model used for prediction further down.
fit3<-glm(y~x2+x3+x4+x5+x7+x8+x9+x10,train,family = "binomial")
summary(fit3)

### Model evaluation
# Predict on the test set
pre <- predict(fit3,test)
# In R, the pROC package makes it easy to compare classifiers; it also marks the optimal cutoff automatically and produces a nice plot.
library(pROC)
modelroc <- roc(test$y,pre)
plot(modelroc, print.auc=TRUE, auc.polygon=TRUE, grid=c(0.1, 0.2),
     grid.col=c("green", "red"), max.auc.polygon=TRUE,
     auc.polygon.col="skyblue", print.thres=TRUE)
# At the plotted optimal cutoff the specificity (TNR) is about 0.845 and the sensitivity (TPR) about 0.638, with AUC = 0.8102, indicating the model predicts reasonably well.

###### WOE transformation
# The weight of evidence (WOE) transformation turns a logistic regression model into the standard scorecard format.
# WOE is introduced not to improve model quality, but because some variables should not enter the model, either because
# they add no value or because the errors associated with their coefficients are large. A standard scorecard can in fact
# be built without WOE; the logistic regression then has to handle a larger number of predictors, which complicates the modeling, but the final scorecard is the same.

# Replace each variable x with WOE(x), where for each bin WOE = ln[(bad in bin / total bad) / (good in bin / total good)].
# Following the logistic regression above, drop x1, x4, x5 and x6 and apply the WOE transformation to the remaining variables.
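# Worked example with made-up counts: a bin holding 200 of 10,000 total bads and
# 1,000 of 90,000 total goods has WOE = log((200/10000)/(1000/90000)) = log(1.8):
log((200/10000)/(1000/90000))   # ~0.588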

library(smbinning)
####### Bin each variable and compute its WOE
#### Look at all the IV values
sumivt<-smbinning.sumiv(df=train,y="y") # IV for each variable
sumivt # Display table with IV by characteristic
par(mfrow=c(1,1))
smbinning.sumiv.plot(sumivt,cex=1) # Plot IV summary table
### x1, x8 and x9 get no IV here (no valid split was found); they need special handling

result_x1=smbinning(df=train,y='y',x="x1",p=0.05)
result_x1$ivtable
# Custom binning: equal-frequency (quantile) cuts, since smbinning finds no split for x1
per<-as.vector(quantile(train$x1,probs=seq(0,1,0.2),na.rm=T))
breaks<-per[2:(length(per)-1)]
result_x1=smbinning.custom(df=train,y='y',x="x1",cuts=breaks)
result_x1$ivtable
smbinning.plot(result_x1,option="WoE",sub="x1")# WOE by bin for x1 (custom quantile cuts)
result_x1$iv
#x2
unique(train$x2)
result_x2=smbinning(df=train,y='y',x="x2",p=0.01)
smbinning.plot(result_x2,option="WoE",sub="x2")# inspect the WOE trend
result_x2$iv # IV value
#x3
unique(train$x3)
result_x3=smbinning(df=train,y='y',x="x3",p=0.01)
smbinning.plot(result_x3,option="WoE",sub="x3")# inspect the WOE trend
result_x3$iv # IV value
#x4
unique(train$x4)
result_x4=smbinning(df=train,y='y',x="x4",p=0.001)
smbinning.plot(result_x4,option="WoE",sub="x4")# inspect the WOE trend
result_x4$iv # IV value
#x5
unique(train$x5)
result_x5=smbinning(df=train,y='y',x="x5",p=0.001)
smbinning.plot(result_x5,option="WoE",sub="x5")# inspect the WOE trend
result_x5$iv # IV value
#x6
unique(train$x6)
result_x6=smbinning(df=train,y='y',x="x6",p=0.01)
smbinning.plot(result_x6,option="WoE",sub="x6")# inspect the WOE trend
result_x6$iv # IV value
#x7
unique(train$x7)
result_x7=smbinning(df=train,y='y',x="x7",p=0.001)
smbinning.plot(result_x7,option="WoE",sub="x7")# inspect the WOE trend
result_x7$iv # IV value
result_x7$ivtable
#x8
unique(train$x8)
result_x8=smbinning(df=train,y='y',x="x8",p=0.01)
smbinning.plot(result_x8,option="WoE",sub="x8")# smbinning finds no usable split for x8
# Custom binning: equal-frequency (quantile) cuts
per<-as.vector(quantile(train$x8,probs=seq(0,1,0.25),na.rm=T))
breaks<-per[2:(length(per)-1)]
result_x8=smbinning.custom(df=train,y='y',x="x8",cuts=breaks)
result_x8$ivtable
smbinning.plot(result_x8,option="WoE",sub="x8")# WOE by bin for x8 (custom quantile cuts)
result_x8$iv # IV value
#x9
unique(train$x9)
result_x9=smbinning(df=train,y='y',x="x9",p=0.01)
smbinning.plot(result_x9,option="WoE",sub="x9")# inspect the WOE trend
result_x9$iv # IV value
#x10
unique(train$x10)
result_x10=smbinning(df=train,y='y',x="x10",p=0.0001)
smbinning.plot(result_x10,option="WoE",sub="x10")# inspect the WOE trend
result_x10$iv # IV value
result_x10$ivtable

##### Apply the WOE transformation to the variables
### Modified from the smbinning.gen source: write each band's WoE value instead of a band label
smbinning.wen<-function (df, ivout, chrname = "NewChar") 
{
  df = cbind(df, tmpname = NA)
  ncol = ncol(df)
  col_id = ivout$col_id
  b = ivout$bands
  c=ivout$ivtable[,13]                       # column 13 of ivtable holds each band's WoE
  df[, ncol][is.na(df[, col_id])] = 0        # missing values get WOE 0
  df[, ncol][df[, col_id] <= b[2]] = c[1]
  if (length(b) > 3) {
    for (i in 2:(length(b) - 2)) {
      df[, ncol][df[, col_id] > b[i] & df[, col_id] <= 
                   b[i + 1]] = c[i]
    }
  }
  df[, ncol][df[, col_id] > b[length(b) - 1]] = c[length(b) - 1]
  #df[, ncol] = as.factor(df[, ncol])## kept numeric rather than converted to a factor
  
  names(df)[names(df) == "tmpname"] = chrname
  return(df)
}
train=train[,1:11]# keep only the original y and x1..x10 before appending WOE columns
train=smbinning.wen(train, result_x1, chrname = "wx1")# append a WOE column
head(train$wx1)
table(train$wx1)
## the remaining variables work the same way
train=smbinning.wen(train, result_x2, chrname = "wx2")# append a WOE column
train=smbinning.wen(train, result_x3, chrname = "wx3")
train=smbinning.wen(train, result_x4, chrname = "wx4")
train=smbinning.wen(train, result_x5, chrname = "wx5")
train=smbinning.wen(train, result_x6, chrname = "wx6")
train=smbinning.wen(train, result_x7, chrname = "wx7")
train=smbinning.wen(train, result_x8, chrname = "wx8")
train=smbinning.wen(train, result_x9, chrname = "wx9")
train=smbinning.wen(train, result_x10, chrname = "wx10")


###### Build the WOE data frame:
trainWOE =train[,12:21]

##################### 7. Scorecard creation and implementation
# In the data '1' means default, so modeling directly predicts the probability of default, and log(odds) is the bad:good ratio.
# To match the convention that a higher score means better credit, swap '0' and '1' so the model predicts the probability of NOT defaulting; log(odds) then becomes the good:bad ratio.
trainWOE$y = 1-train$y
glm.fit = glm(y~.,data = trainWOE,family = binomial(link = logit))
summary(glm.fit)
coe = (glm.fit$coefficients)

### With the WOE values the correlations are even lower
cor1<-cor(trainWOE[,1:11])
library("corrplot")
corrplot(cor1)
corrplot(cor1,method = "number")
#### Using all the variables works better, so the scorecard below is built from all of them
fit4<-glm(y~.,trainWOE,family = "binomial")
summary(fit4)
fit5<-glm(y~wx1+wx2+wx3+wx4+wx5+wx7+wx8+wx9,trainWOE,family = "binomial")
summary(fit5)



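# Scorecard scaling (the standard points/odds setup): Score = q + p*log(odds),
# where p = PDO/log(2) with PDO = 20 points to double the odds, and q pins the
# base score to 600 at odds of 15:1, i.e. q = 600 - p*log(15).
# Check: odds 15 -> 600 points; odds 30 -> 620 points.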
p <- 20/log(2)
q <- 600-20*log(15)/log(2)
Score=q + p*as.numeric(coe[1]) + p*as.numeric(coe[2])*trainWOE$wx1 + p*as.numeric(coe[3])*trainWOE$wx2 +
  p*as.numeric(coe[4])*trainWOE$wx3 + p*as.numeric(coe[5])*trainWOE$wx4 + p*as.numeric(coe[6])*trainWOE$wx5 +
  p*as.numeric(coe[7])*trainWOE$wx6 + p*as.numeric(coe[8])*trainWOE$wx7 + p*as.numeric(coe[9])*trainWOE$wx8 +
  p*as.numeric(coe[10])*trainWOE$wx9 + p*as.numeric(coe[11])*trainWOE$wx10
# Note: each line must END with '+' (not begin with it), otherwise R parses only the first line.
# Total score = base points + the points from each variable
# Base points:
base <- q + p*as.numeric(coe[1])
base
# 1. Score each variable
## helper that turns coefficient * WOE into rounded points:
getscore<-function(i,x){
  score = round(p*as.numeric(coe[i])*x,0)
  return(score)
}

# 2. Points for each variable's bins:

x1<-as.data.frame(getscore(2,result_x1$ivtable[1:(length(result_x1$bands)-1),13]))# coe[2] is the wx1 coefficient
rownames(x1) <-result_x1$ivtable[1:(length(result_x1$bands)-1),1]
colnames(x1)<-'x1'
x2<-as.data.frame(getscore(3,result_x2$ivtable[1:(length(result_x2$bands)-1),13]))# coe[3], not coe[2]: each variable must use its own coefficient
rownames(x2) <-result_x2$ivtable[1:(length(result_x2$bands)-1),1]
colnames(x2)<-'x2'
x3<-as.data.frame(getscore(4,result_x3$ivtable[1:(length(result_x3$bands)-1),13]))
rownames(x3) <-result_x3$ivtable[1:(length(result_x3$bands)-1),1]
colnames(x3)<-'x3'
x4<-as.data.frame(getscore(5,result_x4$ivtable[1:(length(result_x4$bands)-1),13]))
rownames(x4) <-result_x4$ivtable[1:(length(result_x4$bands)-1),1]
colnames(x4)<-'x4'
x5<-as.data.frame(getscore(6,result_x5$ivtable[1:(length(result_x5$bands)-1),13]))
rownames(x5) <-result_x5$ivtable[1:(length(result_x5$bands)-1),1]
colnames(x5)<-'x5'
x6<-as.data.frame(getscore(7,result_x6$ivtable[1:(length(result_x6$bands)-1),13]))
rownames(x6) <-result_x6$ivtable[1:(length(result_x6$bands)-1),1]
colnames(x6)<-'x6'
x7<-as.data.frame(getscore(8,result_x7$ivtable[1:(length(result_x7$bands)-1),13]))
rownames(x7) <-result_x7$ivtable[1:(length(result_x7$bands)-1),1]
colnames(x7)<-'x7'
x8<-as.data.frame(getscore(9,result_x8$ivtable[1:(length(result_x8$bands)-1),13]))
rownames(x8) <-result_x8$ivtable[1:(length(result_x8$bands)-1),1]
colnames(x8)<-'x8'
x9<-as.data.frame(getscore(10,result_x9$ivtable[1:(length(result_x9$bands)-1),13]))
rownames(x9) <-result_x9$ivtable[1:(length(result_x9$bands)-1),1]
colnames(x9)<-'x9'
x10<-as.data.frame(getscore(11,result_x10$ivtable[1:(length(result_x10$bands)-1),13]))
rownames(x10) <-result_x10$ivtable[1:(length(result_x10$bands)-1),1]
colnames(x10)<-'x10'

score<-list(x1,x2,x3,x4,x5,x6,x7,x8,x9,x10)### the scorecard, with integer points

##### Non-integer scores
train$score<-q + p*as.numeric(coe[1]) + p*as.numeric(coe[2])*trainWOE$wx1 + p*as.numeric(coe[3])*trainWOE$wx2 +
  p*as.numeric(coe[4])*trainWOE$wx3 + p*as.numeric(coe[5])*trainWOE$wx4 + p*as.numeric(coe[6])*trainWOE$wx5 +
  p*as.numeric(coe[7])*trainWOE$wx6 + p*as.numeric(coe[8])*trainWOE$wx7 + p*as.numeric(coe[9])*trainWOE$wx8 +
  p*as.numeric(coe[10])*trainWOE$wx9 + p*as.numeric(coe[11])*trainWOE$wx10

sort(train$score)
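# A quick look at the training-set score distribution (optional sketch):
hist(train$score, breaks = 50, main = "Training-set scores")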

############################# Evaluating the model built on the WOE-replaced variables
# glm.fit was trained on the WOE columns (wx1..wx10) with the flipped response, so the
# test set must first be transformed with the same bins before predicting.
results<-list(result_x1,result_x2,result_x3,result_x4,result_x5,
              result_x6,result_x7,result_x8,result_x9,result_x10)
for (i in 1:10) test<-smbinning.wen(test, results[[i]], chrname = paste0("wx",i))
testWOE = test[,12:21]
testWOE$y = 1-test$y
pre <- predict(glm.fit,testWOE)
# Again use the pROC package to plot the ROC curve and mark the optimal cutoff.
library(pROC)
modelroc <- roc(testWOE$y,pre)
plot(modelroc, print.auc=TRUE, auc.polygon=TRUE, grid=c(0.1, 0.2),
     grid.col=c("green", "red"), max.auc.polygon=TRUE,
     auc.polygon.col="skyblue", print.thres=TRUE)


Compare this with the code from the original article! (A mind map of the workflow was also uploaded with the original post; the image is not reproduced here.)


