數據挖掘實戰

導入數據,將標籤(y)和樣本特徵(X)分開來

import pandas as pd
# Load the raw table, then separate the label column from the features.
X = pd.read_csv("data.csv")
y = X["status"]
# DataFrame.drop returns a new frame unless inplace=True, so the result must
# be assigned back; otherwise the label column stays among the features.
X = X.drop(["status"], axis=1)
# Preview the first five feature rows.
X.head(5)

將數據分成訓練集和測試集

from sklearn.model_selection import train_test_split
# Hold out 30% of the rows as the test set; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.30, random_state=2018)

查看數據的信息

# Print each column's name, non-null count and dtype for the training features.
X_train.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3327 entries, 110 to 1274
Data columns (total 90 columns):
Unnamed: 0                                    3327 non-null int64
custid                                        3327 non-null int64
trade_no                                      3327 non-null object
bank_card_no                                  3327 non-null object
low_volume_percent                            3325 non-null float64
middle_volume_percent                         3325 non-null float64
take_amount_in_later_12_month_highest         3327 non-null int64
trans_amount_increase_rate_lately             3325 non-null float64
trans_activity_month                          3325 non-null float64
trans_activity_day                            3325 non-null float64
transd_mcc                                    3325 non-null float64
trans_days_interval_filter                    3321 non-null float64
trans_days_interval                           3325 non-null float64
regional_mobility                             3325 non-null float64
student_feature                               1208 non-null float64
repayment_capability                          3327 non-null int64
is_high_user                                  3327 non-null int64
number_of_trans_from_2011                     3325 non-null float64
first_transaction_time                        3325 non-null float64
historical_trans_amount                       3327 non-null int64
historical_trans_day                          3325 non-null float64
rank_trad_1_month                             3325 non-null float64
trans_amount_3_month                          3327 non-null int64
avg_consume_less_12_valid_month               3325 non-null float64
abs                                           3327 non-null int64
top_trans_count_last_1_month                  3325 non-null float64
avg_price_last_12_month                       3327 non-null int64
avg_price_top_last_12_valid_month             3249 non-null float64
reg_preference_for_trad                       3325 non-null object
trans_top_time_last_1_month                   3321 non-null float64
trans_top_time_last_6_month                   3321 non-null float64
consume_top_time_last_1_month                 3321 non-null float64
consume_top_time_last_6_month                 3321 non-null float64
cross_consume_count_last_1_month              3037 non-null float64
trans_fail_top_count_enum_last_1_month        3314 non-null float64
trans_fail_top_count_enum_last_6_month        3314 non-null float64
trans_fail_top_count_enum_last_12_month       3314 non-null float64
consume_mini_time_last_1_month                3312 non-null float64
max_cumulative_consume_later_1_month          3327 non-null int64
max_consume_count_later_6_month               3321 non-null float64
railway_consume_count_last_12_month           3319 non-null float64
pawns_auctions_trusts_consume_last_1_month    3327 non-null int64
pawns_auctions_trusts_consume_last_6_month    3327 non-null int64
jewelry_consume_count_last_6_month            3319 non-null float64
status                                        3327 non-null int64
source                                        3327 non-null object
first_transaction_day                         3325 non-null float64
trans_day_last_12_month                       3325 non-null float64
id_name                                       3125 non-null object
apply_score                                   3107 non-null float64
apply_credibility                             3107 non-null float64
query_org_count                               3107 non-null float64
query_finance_count                           3107 non-null float64
query_cash_count                              3107 non-null float64
query_sum_count                               3107 non-null float64
latest_query_time                             3107 non-null object
latest_one_month_apply                        3107 non-null float64
latest_three_month_apply                      3107 non-null float64
latest_six_month_apply                        3107 non-null float64
loans_score                                   3112 non-null float64
loans_credibility_behavior                    3112 non-null float64
loans_count                                   3112 non-null float64
loans_settle_count                            3112 non-null float64
loans_overdue_count                           3112 non-null float64
loans_org_count_behavior                      3112 non-null float64
consfin_org_count_behavior                    3112 non-null float64
loans_cash_count                              3112 non-null float64
latest_one_month_loan                         3112 non-null float64
latest_three_month_loan                       3112 non-null float64
latest_six_month_loan                         3112 non-null float64
history_suc_fee                               3112 non-null float64
history_fail_fee                              3112 non-null float64
latest_one_month_suc                          3112 non-null float64
latest_one_month_fail                         3112 non-null float64
loans_long_time                               3112 non-null float64
loans_latest_time                             3112 non-null object
loans_credit_limit                            3112 non-null float64
loans_credibility_limit                       3112 non-null float64
loans_org_count_current                       3112 non-null float64
loans_product_count                           3112 non-null float64
loans_max_limit                               3112 non-null float64
loans_avg_limit                               3112 non-null float64
consfin_credit_limit                          3112 non-null float64
consfin_credibility                           3112 non-null float64
consfin_org_count_current                     3112 non-null float64
consfin_product_count                         3112 non-null float64
consfin_max_limit                             3112 non-null float64
consfin_avg_limit                             3112 non-null float64
latest_query_day                              3107 non-null float64
loans_latest_day                              3112 non-null float64
dtypes: float64(70), int64(13), object(7)
memory usage: 2.3+ MB

將一些沒用的特徵刪除

# Columns removed before modelling. The reasons are the original author's
# notes and cannot be verified from the data shown here:
#   bank_card_no - said to contain only a single value
#   trade_no     - described as an ID-card number
#   id_name      - customer name
#   custid       - customer id
#   loans_latest_time, latest_query_time, source - no reason given
unused_columns = [
    "bank_card_no",
    "trade_no",
    "id_name",
    "custid",
    "loans_latest_time",
    "latest_query_time",
    "source",
]
X_train.drop(unused_columns, axis=1, inplace=True)

把類別型的值轉爲數值編碼

# Count how many rows fall into each category of this column.
X_train['reg_preference_for_trad'].value_counts()
//
一線城市    2380
三線城市     747
境外       103
二線城市      92
其他城市       3
//
def get_change(dt):
    """Map a region-preference label to its integer code.

    '一線城市' -> 1, '二線城市' -> 2, '三線城市' -> 3, '境外' -> 4.
    Any other value (another label, or a missing value) maps to 5.
    """
    codes = (
        ('一線城市', 1),
        ('二線城市', 2),
        ('三線城市', 3),
        ('境外', 4),
    )
    for label, code in codes:
        if dt == label:
            return code
    # Fallback bucket for everything that matched none of the labels above.
    return 5
# Encode the column with get_change. Series.map applies the function to every
# element (missing values included) and keeps the row index, and it avoids
# relying on numpy, for which no import is visible in this file.
X_train['reg_preference_for_trad'] = X_train['reg_preference_for_trad'].map(get_change)
# Re-count to check the encoding; values that get_change does not recognise
# (including missing ones) are all counted under code 5.
X_train['reg_preference_for_trad'].value_counts()
//
1    2380
3     747
4     103
2      92
5       5
Name: reg_preference_for_trad, dtype: int64

處理缺失值,將缺失值填充爲那一列的均值

# Mean imputation: every missing value is replaced by the mean of its own
# column. The means are computed first, then applied column by column.
cols = X_train.columns
column_means = {col: X_train[col].mean() for col in cols}
for col, mean_value in column_means.items():
    X_train[col] = X_train[col].fillna(mean_value)
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章