[B8]信用評分卡建模

在家不能太頹廢，跟着視頻裏的老師練習了一個風控建模案例，算用來練練手吧。中間出了一點小bug耽誤了大半天的時間。

第一步：導入數據

#導入科學計算包
import numpy as np
import matplotlib as mlt
import matplotlib.pyplot as plt
plt.rc('font', family='Microsoft YaHei')
%matplotlib inline
import seaborn as sns
import pandas as pd
import os
os.chdir("/Jupyter_working_path/Numpy/data")

# Load the raw training data (path is relative to the current working directory).
df = pd.read_csv(r"cs-training.csv")
df.head(2)

# Rename the columns to Chinese labels for readability.
# pandas labels a column whose CSV header cell is empty "Unnamed: 0" -- with a
# space after the colon -- so the key has to use that exact spelling; without
# the space the rename silently skips the column.
# NOTE(review): assumes the first header cell of the CSV is empty -- confirm.
df0 = df.rename(columns={"Unnamed: 0":"ID",
       "SeriousDlqin2yrs":"好壞客戶",
       "RevolvingUtilizationOfUnsecuredLines":"額度比值",
       "age":"年齡",
       "NumberOfTime30-59DaysPastDueNotWorse":"逾30-59天",
       "DebtRatio":"負債率",
       "MonthlyIncome":"月收入",
       "NumberOfOpenCreditLinesAndLoans":"信貸數",
       "NumberOfTimes90DaysLate":"逾90天",
       "NumberRealEstateLoansOrLines":"固定資產",
       "NumberOfTime60-89DaysPastDueNotWorse":"逾60-89天",
       "NumberOfDependents":"家屬"})
df0.head(3)

這一步主要是把列名換成中文，比較容易讀。

第二步：數據預處理

# Data preprocessing
# Missing values: print the per-column non-null counts and dtypes to spot gaps
df0.info()

可見“月收入”和“家屬”有缺失值，我們把它找出來

def missing_values_table(df0):
    """Summarise the columns of ``df0`` that contain missing values.

    Prints the total number of columns and how many of them have gaps, then
    returns a frame indexed by column name holding the missing count
    ('缺失值') and the missing percentage ('缺失比例'). Only columns with a
    non-zero percentage are kept; rows are sorted by percentage, largest
    first, and the values are rounded to one decimal place.
    """
    null_counts = df0.isnull().sum()
    null_percent = 100 * df0.isnull().sum() / len(df0)

    # Assemble both measures side by side, already under their final names.
    summary = pd.DataFrame({'缺失值': null_counts, '缺失比例': null_percent})

    # Filter first, then sort on the unrounded percentages, then round.
    has_gaps = summary['缺失比例'] != 0
    summary = summary[has_gaps]
    summary = summary.sort_values('缺失比例', ascending=False)
    summary = summary.round(1)

    print(f"數據有{df0.shape[1]}列.\n其中{summary.shape[0]}列含有缺失值")

    return summary

# Report which columns of df0 have missing values and how large the gaps are
missing_values_table(df0)

#填補缺失值的方法
'''
df = df.dropna() #去掉缺失的行
df = df.fillna(num) #補充固定的數
df = df.fillna(df.mean()) #補充均值
df = df.fillna(df.median()) #補充中位數
df = df.interpolate()#插值法
'''
# Outlier handling

# Binning approach: 50-bin histogram of the age column, saved to disk
df0['年齡'].hist(bins=50)
plt.savefig("C:\\Jupyter_working_path\\Numpy\\picture")

def blk(floor, root):
    """Build a function that clamps a value between two bounds.

    The returned callable maps any value below ``floor`` to ``floor``, any
    value above ``root`` to ``root``, and returns everything else unchanged.
    """
    def clamp(value):
        # The lower bound is tested first and wins if it matches.
        if value < floor:
            return floor
        return root if value > root else value
    return clamp


# Use the 1st and 99th percentiles of age as the clamping bounds
q1= df0['年齡'].quantile(0.01)
q99 = df0['年齡'].quantile(0.99)
blk_tot=blk(floor=q1,root=q99)

# Clamp every age into [q1, q99], then plot and save the histogram again
df0['年齡'] = df0['年齡'].map(blk_tot)
df0['年齡'].hist(bins=50)
plt.savefig("C:\\Jupyter_working_path\\Numpy\\picture0")

第三步：探索性分析
# Exploratory analysis
# Univariate analysis: good/bad customers across five equal-width age bands

age_cut = pd.cut(df0['年齡'],5)
# count() is the number of ALL customers in each band; sum() adds up the 0/1
# flag, i.e. the number of bad customers in each band.
age_cut_grouped=df0["好壞客戶"].groupby(age_cut).count()
age_cut_grouped1=df0["好壞客戶"].groupby(age_cut).sum()

# Good customers are the band total minus the bad ones. Labelling the raw
# count() as "好客戶" would plot the total under the "good customers" legend.
df1=pd.DataFrame({"好客戶":age_cut_grouped-age_cut_grouped1,
                  "壞客戶":age_cut_grouped1})
# Bad rate = bad customers / all customers in the band.
df1.insert(2,"壞客戶率",age_cut_grouped1/age_cut_grouped)

# Grouped bar chart of good vs. bad customer counts per age band
ax1 = df1[["好客戶","壞客戶"]].plot.bar()
ax1.set_xticklabels(df1.index,rotation=15)
ax1.set_ylabel("客戶數")
ax1.set_title("年齡於好壞客戶分佈圖")
plt.savefig("C:\\Jupyter_working_path\\Numpy\\picture1")

# Line chart of the bad rate per age band
ax11 = df1["壞客戶率"].plot()
ax11.set_ylabel("壞客戶率")
ax11.set_title("壞客戶率隨年齡變化趨勢圖")
plt.savefig("C:\\Jupyter_working_path\\Numpy\\picture2")

進行多變量分析

# Multivariate analysis: heatmap of the pairwise correlations in df0
plt.rcParams["font.sans-serif"] = 'SimHei'  # presumably chosen so the Chinese labels render -- confirm the font is installed
plt.rcParams['axes.unicode_minus'] = False  # draw minus signs as ASCII hyphens
corr = df0.corr() # correlation coefficients between the variables
xticks = list(corr.index) # x-axis labels
yticks = list(corr.index)  # y-axis labels
fig = plt.figure()
axl2 = fig.add_subplot(1,1,1)
# Annotate every cell with its coefficient
sns.heatmap(corr, annot=True, cmap="rainbow", ax=axl2, linewidths=.5, annot_kws={'size':6,'weight':'bold','color':"blue"})

# Relabel both axes with the variable names, rotating the x labels
axl2.set_xticklabels(xticks, rotation=35, fontsize=9)
axl2.set_yticklabels(yticks, rotation=0, fontsize=9)
plt.savefig("C:\\Jupyter_working_path\\Numpy\\picture3")

第四步：分箱與WOE、IV值計算

# Binning
pinf = float('inf') # positive infinity
ninf = float('-inf') # negative infinity

# qcut splits on quantiles, cut splits on the explicit edges listed in binsN;
# labels=False makes both return integer bin codes instead of interval labels.
cut1=pd.qcut(df0["額度比值"],4, labels=False)
cut2=pd.qcut(df0["年齡"],8, labels=False)
bins3=[ninf,0,1,3,5,pinf]
cut3=pd.cut(df0["逾30-59天"],bins3,labels=False)
cut4=pd.qcut(df0["負債率"],3, labels=False)
cut5=pd.qcut(df0["月收入"],4, labels=False)
cut6=pd.qcut(df0["信貸數"],4, labels=False)
bins7=[ninf,0,1,3,5,pinf]
cut7=pd.cut(df0["逾90天"],bins7, labels=False)
bins8=[ninf,0,1,2,3,pinf]
cut8=pd.cut(df0["固定資產"],bins8, labels=False)
bins9=[ninf,0,1,3,pinf]
cut9=pd.cut(df0["逾60-89天"],bins9, labels=False)
bins10=[ninf,0,1,3,5,pinf]
cut10=pd.cut(df0["家屬"],bins10, labels=False)

# Compute WOE and IV values

# Overall ratio of bad customers (sum of the 0/1 flag) to good customers (the rest)
rate = df0["好壞客戶"].sum()/(df0["好壞客戶"].count() - df0["好壞客戶"].sum())

#定義WOE計算函數
def get_woe_data(cut):
    """Return the WOE value of every bin in ``cut``.

    Counts the values of the module-level ``df0["好壞客戶"]`` inside each bin,
    spreads those counts into one column per label value, and takes the
    natural log of (second column / first column / ``rate``), where ``rate``
    is the module-level overall bad-to-good ratio.

    NOTE(review): the columns are addressed by position; this presumably
    means label 0 first and label 1 second -- confirm for bins that lack one
    of the labels.
    """
    label_counts = df0["好壞客戶"].groupby(cut, as_index = True).value_counts()
    per_bin = pd.DataFrame(label_counts).unstack()
    second_label = per_bin.iloc[:, 1]
    first_label = per_bin.iloc[:, 0]
    # One WOE value per bin, indexed by the bin label.
    return np.log(second_label / first_label / rate)

# WOE values of every bin, one Series per binned variable
cut1_woe=get_woe_data(cut1)
cut2_woe=get_woe_data(cut2)
cut3_woe=get_woe_data(cut3)
cut4_woe=get_woe_data(cut4)
cut5_woe=get_woe_data(cut5)
cut6_woe=get_woe_data(cut6)
cut7_woe=get_woe_data(cut7)
cut8_woe=get_woe_data(cut8)
cut9_woe=get_woe_data(cut9)
cut10_woe=get_woe_data(cut10)

#定義IV值計算函數
def get_IV_data(cut, cut_woe):
    """Return the IV (information value) of the binned variable ``cut``.

    For every bin, takes the bin's share of all flagged (sum of the 0/1 flag)
    customers minus its share of all unflagged customers, multiplies that gap
    by the bin's entry in ``cut_woe``, and sums over the bins. The flag is the
    module-level ``df0["好壞客戶"]``.
    """
    target = df0["好壞客戶"]
    label_counts = target.groupby(cut, as_index=True).value_counts()
    per_bin = pd.DataFrame(label_counts).unstack()

    flagged_total = target.sum()
    unflagged_total = target.count() - target.sum()

    # Columns are taken by position: second column over the flagged total,
    # first column over the unflagged total.
    share_gap = per_bin.iloc[:, 1] / flagged_total - per_bin.iloc[:, 0] / unflagged_total
    return (share_gap * cut_woe).sum()

# Compute the IV value of each binned variable
cut1_IV=get_IV_data(cut1, cut1_woe)
cut2_IV=get_IV_data(cut2, cut2_woe)
cut3_IV=get_IV_data(cut3, cut3_woe)
cut4_IV=get_IV_data(cut4, cut4_woe)
cut5_IV=get_IV_data(cut5, cut5_woe)
cut6_IV=get_IV_data(cut6, cut6_woe)
cut7_IV=get_IV_data(cut7, cut7_woe)
cut8_IV=get_IV_data(cut8, cut8_woe)
cut9_IV=get_IV_data(cut9, cut9_woe)
cut10_IV=get_IV_data(cut10, cut10_woe)

# Visualise the IV values as a bar chart, labelled with df0's column names from position 2 onward
df_IV=pd.DataFrame([cut1_IV,cut2_IV,cut3_IV,cut4_IV,cut5_IV,cut6_IV,cut7_IV,cut8_IV,cut9_IV,cut10_IV], index=df0.columns[2:])
df_IV.plot(kind="bar")
# Disabled attempt to print each value above its bar; it raised
# "TypeError: only size-1 arrays can be converted to Python scalars".
# NOTE(review): it iterates the rows of df1 (the age table) rather than the
# IV values -- presumably the cause; confirm before re-enabling.
#for a,b in zip(range(10),df1.values):
    #plt.text(a,b,'%.2f' % b,ha='center', va='bottom',fontsize=9)
plt.savefig("C:\\Jupyter_working_path\\Numpy\\picture4")

# Drop five variables before modelling (presumably those with low IV -- confirm against the chart)
df_new=df0.drop(["負債率","月收入","信貸數","固定資產","家屬"],axis=1)

第五步：LR建模

#LR建模（邏輯斯蒂迴歸模型）
#替換數據
def replace_data(cut,cut_woe):
    """Replace every bin label in ``cut`` with the WOE value of its bin.

    Labels and WOE values are paired by position: the i-th smallest distinct
    non-missing label in ``cut`` is mapped to ``cut_woe.values[i]``.

    All labels are translated in one pass. Running one ``replace`` per label
    on the same Series lets a WOE value that happens to equal a label that
    has not been processed yet be replaced a second time (e.g. WOE values
    1.0, 2.0, 3.0 for labels 0, 1, 2 collapse everything to 3.0).

    Args:
        cut: Series of bin labels.
        cut_woe: Series holding one WOE value per bin, ordered by bin label.

    Returns:
        A new Series aligned with ``cut`` holding the WOE values; missing
        labels stay missing. ``cut`` itself is not modified.

    Raises:
        IndexError: if ``cut`` has more distinct labels than ``cut_woe`` has
            values.
    """
    # Sort once, and ignore NaN so that it can neither break the ordering
    # nor consume a WOE slot.
    labels = sorted(cut.dropna().unique())
    woe_values = cut_woe.values
    mapping = {label: woe_values[pos] for pos, label in enumerate(labels)}
    return cut.map(mapping)

# Replace the retained variables with the WOE values of their bins.
# Every assignment overwrites the column of the same name. The utilisation
# ratio lives in "額度比值"; writing it to a new "可用額度比值" column would
# leave the raw ratio in df_new as an extra, un-encoded feature.
df_new["額度比值"]=replace_data(cut1, cut1_woe)
df_new["年齡"]=replace_data(cut2, cut2_woe)
df_new["逾30-59天"]=replace_data(cut3, cut3_woe)
df_new["逾90天"]=replace_data(cut7, cut7_woe)
df_new["逾60-89天"]=replace_data(cut9, cut9_woe)

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve,auc

# Pick the target and the features by column name. Selecting by position
# (column 0 as y, everything else as x) uses the CSV's leading row-id column
# as the target and leaks the real label into the features whenever that id
# column is present. The id column is dropped under either of its possible
# names; errors="ignore" keeps this working when it is absent.
y = df_new["好壞客戶"]
x = df_new.drop(columns=["好壞客戶", "ID", "Unnamed: 0"], errors="ignore")

# Hold out 40% of the rows for testing; fixed seed for reproducibility
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.4,random_state=0)

# Train the model
model=LogisticRegression()
clf=model.fit(x_train,y_train)
print("測試成績：{}".format(clf.score(x_test,y_test)))
y_pred=clf.predict(x_test)
# Continuous decision scores are what the ROC curve needs
# (the method is decision_function; "decison_function" raises AttributeError).
y_pred1=clf.decision_function(x_test)

# Plot the ROC curve and compute the AUC
fpr,tpr,threshold = roc_curve(y_test,y_pred1)
roc_auc = auc(fpr,tpr)
plt.plot(fpr,tpr,color='darkorange',
        label='ROC curve(area =%0.2f)' % roc_auc)
# Diagonal reference line of a random classifier (keyword is linestyle, not linestype)
plt.plot([0,1],[0,1],color='navy',linestyle='--')
plt.xlim([0.0,1.0])
plt.ylim([0.0,1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC_curve')
plt.legend(loc='lower right')
plt.show()