在家不能太頹廢,跟着視頻裏的老師練習了一個風控建模案例,算是練練手吧。中間出了一點小bug,耽誤了大半天的時間。
第一步:導入數據
#導入科學計算包
import numpy as np
import matplotlib as mlt
import matplotlib.pyplot as plt
plt.rc('font', family='Microsoft YaHei')
%matplotlib inline
import seaborn as sns
import pandas as pd
import os
# Work from the data directory so the CSV can be opened by its bare file name.
os.chdir("/Jupyter_working_path/Numpy/data")
df = pd.read_csv(r"cs-training.csv")
df.head(2)
# Rename the columns to Chinese labels so the tables are easier to read.
# pandas labels a header-less CSV column "Unnamed: 0" (with a space after the
# colon); rename() silently skips keys that match no column, so the key has
# to be spelled exactly that way for the id column to become "ID".
df0 = df.rename(columns={
    "Unnamed: 0": "ID",
    "SeriousDlqin2yrs": "好壞客戶",
    "RevolvingUtilizationOfUnsecuredLines": "額度比值",
    "age": "年齡",
    "NumberOfTime30-59DaysPastDueNotWorse": "逾30-59天",
    "DebtRatio": "負債率",
    "MonthlyIncome": "月收入",
    "NumberOfOpenCreditLinesAndLoans": "信貸數",
    "NumberOfTimes90DaysLate": "逾90天",
    "NumberRealEstateLoansOrLines": "固定資產",
    "NumberOfTime60-89DaysPastDueNotWorse": "逾60-89天",
    "NumberOfDependents": "家屬",
})
df0.head(3)
這一步主要是把列名換成中文,比較容易讀。
第二步:數據預處理
# Data preprocessing
# Missing values: print the per-column summary of the renamed frame
df0.info()
可見“月收入”和“家屬”有缺失值,我們把它找出來
def missing_values_table(df0):
    """Summarise the columns of ``df0`` that contain missing values.

    Prints how many columns the frame has and how many of them contain
    missing values, then returns the per-column summary.

    Args:
        df0: DataFrame to inspect.

    Returns:
        DataFrame indexed by column name with the columns '缺失值' (number
        of missing entries) and '缺失比例' (percentage of rows that are
        missing, rounded to one decimal place). Only columns with at least
        one missing entry are listed, sorted by '缺失比例' in descending
        order.
    """
    # Count the missing entries once; the percentage reuses the same counts.
    mis_val = df0.isnull().sum()
    mis_val_percent = 100 * mis_val / len(df0)
    # Put both measures side by side; the unnamed Series become columns 0 and 1.
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
    mis_val_table_ren_columns = mis_val_table.rename(
        columns={0: '缺失值', 1: '缺失比例'})
    # Filter on the count rather than the percentage: for a frame with no
    # rows the percentage is NaN, and NaN != 0 would list every column.
    has_missing = mis_val_table_ren_columns['缺失值'] > 0
    mis_val_table_ren_columns = (
        mis_val_table_ren_columns[has_missing]
        .sort_values('缺失比例', ascending=False)
        .round(1)
    )
    print("數據有"+str(df0.shape[1])+"列.\n" "其中" + str(mis_val_table_ren_columns.shape[0]) +"列含有缺失值")
    return mis_val_table_ren_columns
missing_values_table(df0)
# Ways to fill missing values (reference notes only; none is executed here).
# The notes used to sit between typographic quotes, which Python cannot parse
# as a string literal, so they are kept as ordinary comments instead.
#   df = df.dropna()             # drop the rows that contain missing values
#   df = df.fillna(num)          # fill with a fixed number
#   df = df.fillna(df.mean())    # fill with the column mean
#   df = df.fillna(df.median())  # fill with the column median
#   df = df.interpolate()        # fill by interpolation
# Handle outliers
# Binning approach: draw the age column as a 50-bin histogram and save the
# current figure to disk
df0['年齡'].hist(bins=50)
plt.savefig("C:\\Jupyter_working_path\\Numpy\\picture")
def blk(floor, root):
    """Build a function that clamps a value into the range [floor, root].

    Args:
        floor: Lower bound; values below it are replaced by ``floor``.
        root: Upper bound; values above it are replaced by ``root``.

    Returns:
        A one-argument function ``f(x)`` that returns ``floor`` when
        ``x < floor``, ``root`` when ``x > root`` and ``x`` itself otherwise.
    """
    def f(x):
        if x < floor:
            return floor
        if x > root:
            return root
        # Anything that is neither below nor above the bounds passes through.
        return x
    return f
# Cap the ages at their 1st and 99th percentiles, then draw and save the
# histogram of the capped column.
q1, q99 = (df0['年齡'].quantile(level) for level in (0.01, 0.99))
blk_tot = blk(floor=q1, root=q99)
df0['年齡'] = df0['年齡'].map(blk_tot)
df0['年齡'].hist(bins=50)
plt.savefig("C:\\Jupyter_working_path\\Numpy\\picture0")
第三步:探索性分析
# Exploratory analysis
# Univariate analysis: customer counts and bad-customer rate per age band.
age_cut = pd.cut(df0['年齡'], 5)
age_cut_grouped = df0["好壞客戶"].groupby(age_cut).count()   # all customers per band
age_cut_grouped1 = df0["好壞客戶"].groupby(age_cut).sum()    # bad customers per band
# count() is the band's total, not its number of good customers, so the
# "好客戶" column has to be the total minus the bad ones; otherwise the chart
# compares "everyone" against "bad" under a "good" label.
df1 = pd.DataFrame({
    "好客戶": age_cut_grouped - age_cut_grouped1,
    "壞客戶": age_cut_grouped1,
})
# The bad rate is still bad / total, i.e. bad / (good + bad).
df1.insert(2, "壞客戶率", age_cut_grouped1 / age_cut_grouped)
ax1 = df1[["好客戶", "壞客戶"]].plot.bar()
ax1.set_xticklabels(df1.index, rotation=15)
ax1.set_ylabel("客戶數")
ax1.set_title("年齡於好壞客戶分佈圖")
plt.savefig("C:\\Jupyter_working_path\\Numpy\\picture1")
# Series.plot() reuses the current axes when a figure is already open, so
# open a new figure to keep the rate curve off the bar chart.
plt.figure()
ax11 = df1["壞客戶率"].plot()
ax11.set_ylabel("壞客戶率")
ax11.set_title("壞客戶率隨年齡變化趨勢圖")
plt.savefig("C:\\Jupyter_working_path\\Numpy\\picture2")
進行多變量分析
# Multivariate analysis: annotated heat map of the pairwise correlations.
plt.rcParams["font.sans-serif"] = 'SimHei'
plt.rcParams['axes.unicode_minus'] = False
corr = df0.corr()  # correlation coefficient between every pair of columns
xticks = corr.index.tolist()  # x-axis labels
yticks = corr.index.tolist()  # y-axis labels
fig, axl2 = plt.subplots()
sns.heatmap(
    corr,
    annot=True,
    cmap="rainbow",
    ax=axl2,
    linewidths=.5,
    annot_kws={'size': 6, 'weight': 'bold', 'color': "blue"},
)
axl2.set_xticklabels(xticks, rotation=35, fontsize=9)
axl2.set_yticklabels(yticks, rotation=0, fontsize=9)
plt.savefig("C:\\Jupyter_working_path\\Numpy\\picture3")
第四步:分箱、WOE值和IV值計算
# Binning
pinf, ninf = float('inf'), float('-inf')  # positive / negative infinity

# Explicit bin edges for the columns that are cut at fixed thresholds.
bins3 = [ninf, 0, 1, 3, 5, pinf]
bins7 = [ninf, 0, 1, 3, 5, pinf]
bins8 = [ninf, 0, 1, 2, 3, pinf]
bins9 = [ninf, 0, 1, 3, pinf]
bins10 = [ninf, 0, 1, 3, 5, pinf]

# Quantile-based bins; labels=False returns the integer bin number per row.
cut1 = pd.qcut(df0["額度比值"], q=4, labels=False)
cut2 = pd.qcut(df0["年齡"], q=8, labels=False)
cut4 = pd.qcut(df0["負債率"], q=3, labels=False)
cut5 = pd.qcut(df0["月收入"], q=4, labels=False)
cut6 = pd.qcut(df0["信貸數"], q=4, labels=False)

# Fixed-edge bins built from the lists above.
cut3 = pd.cut(df0["逾30-59天"], bins=bins3, labels=False)
cut7 = pd.cut(df0["逾90天"], bins=bins7, labels=False)
cut8 = pd.cut(df0["固定資產"], bins=bins8, labels=False)
cut9 = pd.cut(df0["逾60-89天"], bins=bins9, labels=False)
cut10 = pd.cut(df0["家屬"], bins=bins10, labels=False)
# Compute the WOE and IV values
# Overall bad-to-good ratio: the sum of the flag column divided by the number
# of remaining rows (assumes the flag is 0/1 with 1 meaning "bad").
bad_total = df0["好壞客戶"].sum()
good_total = df0["好壞客戶"].count() - bad_total
rate = bad_total / good_total
#定義WOE計算函數
def get_woe_data(cut):
    """Return the weight of evidence (WOE) of every bin in ``cut``.

    For each bin the WOE is ``ln((second / first) / rate)``, where ``first``
    and ``second`` are the bin's row counts for the two classes of
    ``df0["好壞客戶"]`` (classes 0 and 1 for a 0/1 flag) and ``rate`` is the
    module-level overall ratio.

    Args:
        cut: Bin label of every row of ``df0``, aligned on its index.

    Returns:
        Series of WOE values indexed by bin label.
    """
    grouped = df0["好壞客戶"].groupby(cut, as_index=True).value_counts()
    # One row per bin and one column per class; built once and reused for
    # both the numerator and the denominator.
    class_counts = pd.DataFrame(grouped).unstack()
    first_class = class_counts.iloc[:, 0]
    second_class = class_counts.iloc[:, 1]
    # WOE of each bin
    woe = np.log(second_class / first_class / rate)
    return woe
# WOE table of every binned variable, evaluated in the order cut1 .. cut10.
(
    cut1_woe, cut2_woe, cut3_woe, cut4_woe, cut5_woe,
    cut6_woe, cut7_woe, cut8_woe, cut9_woe, cut10_woe,
) = (
    get_woe_data(binned)
    for binned in (cut1, cut2, cut3, cut4, cut5, cut6, cut7, cut8, cut9, cut10)
)
#定義IV值計算函數
def get_IV_data(cut, cut_woe):
    """Return the information value (IV) of one binned variable.

    The IV is the sum over bins of
    ``(second_i / second_total - first_i / first_total) * woe_i``, where
    ``first`` and ``second`` are the row counts of the two classes of
    ``df0["好壞客戶"]`` (classes 0 and 1 for a 0/1 flag).

    Args:
        cut: Bin label of every row of ``df0``, aligned on its index.
        cut_woe: WOE of each bin, indexed by bin label.

    Returns:
        The IV as a single number.
    """
    grouped = df0["好壞客戶"].groupby(cut, as_index=True).value_counts()
    # One row per bin and one column per class; built once and reused.
    class_counts = pd.DataFrame(grouped).unstack()
    second_total = df0["好壞客戶"].sum()
    first_total = df0["好壞客戶"].count() - second_total
    second_share = class_counts.iloc[:, 1] / second_total
    first_share = class_counts.iloc[:, 0] / first_total
    cut_IV = ((second_share - first_share) * cut_woe).sum()
    return cut_IV
# IV of every binned variable, evaluated in the order cut1 .. cut10 with the
# matching WOE table.
(
    cut1_IV, cut2_IV, cut3_IV, cut4_IV, cut5_IV,
    cut6_IV, cut7_IV, cut8_IV, cut9_IV, cut10_IV,
) = (
    get_IV_data(binned, binned_woe)
    for binned, binned_woe in (
        (cut1, cut1_woe),
        (cut2, cut2_woe),
        (cut3, cut3_woe),
        (cut4, cut4_woe),
        (cut5, cut5_woe),
        (cut6, cut6_woe),
        (cut7, cut7_woe),
        (cut8, cut8_woe),
        (cut9, cut9_woe),
        (cut10, cut10_woe),
    )
)
# Visualise the IV of each variable
df_IV = pd.DataFrame(
    [cut1_IV, cut2_IV, cut3_IV, cut4_IV, cut5_IV,
     cut6_IV, cut7_IV, cut8_IV, cut9_IV, cut10_IV],
    index=df0.columns[2:],
)
ax_iv = df_IV.plot(kind="bar")
# Write each IV above its bar. The earlier attempt looped over df1.values
# (whole rows of the age table), so '%.2f' % b received an array and raised
# "TypeError: only size-1 arrays can be converted to Python scalars";
# iterating the single IV column yields one scalar per bar.
for bar_position, iv_value in enumerate(df_IV.iloc[:, 0]):
    ax_iv.text(bar_position, iv_value, '%.2f' % iv_value,
               ha='center', va='bottom', fontsize=9)
plt.savefig("C:\\Jupyter_working_path\\Numpy\\picture4")
# Drop the variables that are not carried into the model
# (presumably the low-IV ones; confirm against the chart).
df_new = df0.drop(["負債率", "月收入", "信貸數", "固定資產", "家屬"], axis=1)
第五步:LR建模
#LR建模(邏輯斯蒂迴歸模型)
#替換數據
def replace_data(cut, cut_woe):
    """Return ``cut`` with every bin label replaced by that bin's WOE.

    The lookup goes through the index of ``cut_woe``, so each label is
    translated exactly once. Replacing the labels one after another would
    re-replace a value whenever a WOE happened to equal a later label, and
    pairing sorted labels with WOE positions breaks as soon as ``cut``
    contains missing labels.

    Args:
        cut: Series of bin labels.
        cut_woe: Series of WOE values indexed by bin label.

    Returns:
        A new Series with the index of ``cut`` holding the WOE values.
        Labels that are missing or have no entry in ``cut_woe`` become NaN.
        ``cut`` itself is left unchanged.
    """
    return cut.map(cut_woe)
# Replace each retained variable with the WOE of its bin.
# The utilisation column is called "額度比值" in df_new; assigning to
# "可用額度比值" would only append a new column and leave the raw ratio among
# the model inputs, so write the WOE values over the existing column.
df_new["額度比值"] = replace_data(cut1, cut1_woe)
df_new["年齡"] = replace_data(cut2, cut2_woe)
df_new["逾30-59天"] = replace_data(cut3, cut3_woe)
df_new["逾90天"] = replace_data(cut7, cut7_woe)
df_new["逾60-89天"] = replace_data(cut9, cut9_woe)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve,auc
# Pick the label and the features by name instead of by position.
# df_new still starts with the row-id column, so iloc[:, 0] used the id as
# the label (nearly one class per row, most likely why the fit never
# finished) and iloc[:, 1:] passed the real label in as a feature.
label_column = "好壞客戶"
id_columns = [name for name in ("ID", "Unnamed: 0") if name in df_new.columns]
y = df_new[label_column]
x = df_new.drop(columns=[label_column] + id_columns)
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.4, random_state=0)
# Model training
model = LogisticRegression()
clf = model.fit(x_train, y_train)
print("測試成績:{}".format(clf.score(x_test, y_test)))
y_pred = clf.predict(x_test)
# Continuous scores for the ROC curve (the method is decision_function).
y_pred1 = clf.decision_function(x_test)
# Draw the ROC curve and compute the AUC
fpr, tpr, threshold = roc_curve(y_test, y_pred1)
roc_auc = auc(fpr, tpr)
plt.plot(fpr, tpr, color='darkorange',
         label='ROC curve(area =%0.2f)' % roc_auc)
# Diagonal reference line; the keyword is "linestyle".
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC_curve')
plt.legend(loc='lower right')
plt.show()
模型跑了半天都跑不出來,未完待續……