導入數據,將 標籤(Y)和樣本(X) 分開來
import pandas as pd
X = pd.read_csv("data.csv")
# Keep the label as its own Series before removing it from the features.
y = X["status"]
# DataFrame.drop returns a new frame unless inplace=True. The original call
# discarded that result, so "status" stayed in X and the label leaked into
# the feature matrix. Rebind X to the returned frame instead.
# NOTE: the X_train.info() listing captured later in this file predates this
# fix and therefore still lists a "status" column.
X = X.drop(["status"], axis=1)
X.head(5)
將數據分成訓練集和測試集
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows as the test set; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=2018,
)
查看數據的信息
# Print column names, non-null counts and dtypes of the training split.
X_train.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3327 entries, 110 to 1274
Data columns (total 90 columns):
Unnamed: 0 3327 non-null int64
custid 3327 non-null int64
trade_no 3327 non-null object
bank_card_no 3327 non-null object
low_volume_percent 3325 non-null float64
middle_volume_percent 3325 non-null float64
take_amount_in_later_12_month_highest 3327 non-null int64
trans_amount_increase_rate_lately 3325 non-null float64
trans_activity_month 3325 non-null float64
trans_activity_day 3325 non-null float64
transd_mcc 3325 non-null float64
trans_days_interval_filter 3321 non-null float64
trans_days_interval 3325 non-null float64
regional_mobility 3325 non-null float64
student_feature 1208 non-null float64
repayment_capability 3327 non-null int64
is_high_user 3327 non-null int64
number_of_trans_from_2011 3325 non-null float64
first_transaction_time 3325 non-null float64
historical_trans_amount 3327 non-null int64
historical_trans_day 3325 non-null float64
rank_trad_1_month 3325 non-null float64
trans_amount_3_month 3327 non-null int64
avg_consume_less_12_valid_month 3325 non-null float64
abs 3327 non-null int64
top_trans_count_last_1_month 3325 non-null float64
avg_price_last_12_month 3327 non-null int64
avg_price_top_last_12_valid_month 3249 non-null float64
reg_preference_for_trad 3325 non-null object
trans_top_time_last_1_month 3321 non-null float64
trans_top_time_last_6_month 3321 non-null float64
consume_top_time_last_1_month 3321 non-null float64
consume_top_time_last_6_month 3321 non-null float64
cross_consume_count_last_1_month 3037 non-null float64
trans_fail_top_count_enum_last_1_month 3314 non-null float64
trans_fail_top_count_enum_last_6_month 3314 non-null float64
trans_fail_top_count_enum_last_12_month 3314 non-null float64
consume_mini_time_last_1_month 3312 non-null float64
max_cumulative_consume_later_1_month 3327 non-null int64
max_consume_count_later_6_month 3321 non-null float64
railway_consume_count_last_12_month 3319 non-null float64
pawns_auctions_trusts_consume_last_1_month 3327 non-null int64
pawns_auctions_trusts_consume_last_6_month 3327 non-null int64
jewelry_consume_count_last_6_month 3319 non-null float64
status 3327 non-null int64
source 3327 non-null object
first_transaction_day 3325 non-null float64
trans_day_last_12_month 3325 non-null float64
id_name 3125 non-null object
apply_score 3107 non-null float64
apply_credibility 3107 non-null float64
query_org_count 3107 non-null float64
query_finance_count 3107 non-null float64
query_cash_count 3107 non-null float64
query_sum_count 3107 non-null float64
latest_query_time 3107 non-null object
latest_one_month_apply 3107 non-null float64
latest_three_month_apply 3107 non-null float64
latest_six_month_apply 3107 non-null float64
loans_score 3112 non-null float64
loans_credibility_behavior 3112 non-null float64
loans_count 3112 non-null float64
loans_settle_count 3112 non-null float64
loans_overdue_count 3112 non-null float64
loans_org_count_behavior 3112 non-null float64
consfin_org_count_behavior 3112 non-null float64
loans_cash_count 3112 non-null float64
latest_one_month_loan 3112 non-null float64
latest_three_month_loan 3112 non-null float64
latest_six_month_loan 3112 non-null float64
history_suc_fee 3112 non-null float64
history_fail_fee 3112 non-null float64
latest_one_month_suc 3112 non-null float64
latest_one_month_fail 3112 non-null float64
loans_long_time 3112 non-null float64
loans_latest_time 3112 non-null object
loans_credit_limit 3112 non-null float64
loans_credibility_limit 3112 non-null float64
loans_org_count_current 3112 non-null float64
loans_product_count 3112 non-null float64
loans_max_limit 3112 non-null float64
loans_avg_limit 3112 non-null float64
consfin_credit_limit 3112 non-null float64
consfin_credibility 3112 non-null float64
consfin_org_count_current 3112 non-null float64
consfin_product_count 3112 non-null float64
consfin_max_limit 3112 non-null float64
consfin_avg_limit 3112 non-null float64
latest_query_day 3107 non-null float64
loans_latest_day 3112 non-null float64
dtypes: float64(70), int64(13), object(7)
memory usage: 2.3+ MB
將一些沒用的特徵刪除
# Columns removed from the training features, all in one in-place drop.
# The reasons next to the first four are the original author's notes.
unused_columns = [
    "bank_card_no",       # column holds only a single value
    "trade_no",           # ID-card number
    'id_name',            # person's name
    'custid',             # customer id
    'loans_latest_time',  # object dtype in the info() listing
    'latest_query_time',  # object dtype in the info() listing
    'source',             # object dtype in the info() listing
]
X_train.drop(unused_columns, axis=1, inplace=True)
把 reg_preference_for_trad 的類別值(城市等級)轉爲整數編碼
X_train['reg_preference_for_trad'].value_counts()
//
一線城市 2380
三線城市 747
境外 103
二線城市 92
其他城市 3
//
def get_change(dt):
    """Map a ``reg_preference_for_trad`` label to an integer code.

    Args:
        dt: A raw cell value, normally one of the city-tier strings below.

    Returns:
        1, 2, 3 or 4 for the four recognised labels, and 5 for anything
        else (for example '其他城市' or a missing value such as NaN).
    """
    codes = {
        '一線城市': 1,
        '二線城市': 2,
        '三線城市': 3,
        '境外': 4,
    }
    try:
        return codes.get(dt, 5)
    except TypeError:
        # Unhashable input can never equal one of the label strings, so it
        # belongs in the catch-all bucket just like any other unknown value.
        return 5
# Encode the city-tier labels as integers. Series.map calls get_change on
# every cell (NaN included), so missing values land in the catch-all code 5.
# Using map also avoids the `np` name, which the visible part of this file
# never imports.
X_train['reg_preference_for_trad'] = X_train['reg_preference_for_trad'].map(get_change)
X_train['reg_preference_for_trad'].value_counts()
//
1 2380
3 747
4 103
2 92
5 5
Name: reg_preference_for_trad, dtype: int64
處理缺失值,將缺失值填充爲那一列的均值
# Fill every missing value with the mean of its own column, one column at a
# time. Series.mean skips NaN by default, so each mean comes from the observed
# values only.
# NOTE(review): this assumes every remaining column is numeric — confirm that
# all object-dtype columns were dropped or encoded before this point.
cols = X_train.columns
for col in cols:
    X_train[col] = X_train[col].fillna(X_train[col].mean())