這一篇主要是系統地對數據進行機器學習前的預處理。
# -*- coding: utf-8 -*-
"""
Created on Sun Oct 21 14:37:15 2018
@author: Administrator

Step 1-2: import packages and load the Kaggle house-prices data.
"""
# NOTE(review): the original contained the IPython magics `%reset -f` and
# `%clear` fused onto one line — that is not valid Python in a plain script,
# so they are dropped here.

# In[*]
# ########## Step 1: import packages
# In[*]
import os

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# `from scipy.stats.stats import pearsonr` was a deprecated private path;
# the public location is `scipy.stats.pearsonr`.
from scipy.stats import pearsonr, skew
from sklearn import linear_model, metrics
from sklearn.model_selection import cross_val_score

# Work inside the folder that holds train.csv / test.csv.
os.chdir("C:\\Users\\Administrator\\Desktop\\all")

# In[*]
# ########## Step 2: load the data
# In[*]
# header=0: first row holds column names; index_col=0: the Id column becomes
# the index so it is not treated as a feature.
train = pd.read_csv('train.csv', header=0, index_col=0)
test = pd.read_csv('test.csv', header=0, index_col=0)

# Stack the feature columns of train and test (everything from MSSubClass to
# SaleCondition, i.e. excluding the SalePrice target) so later transforms are
# applied consistently to both sets.
all_data = pd.concat((train.loc[:, 'MSSubClass':'SaleCondition'],
                      test.loc[:, 'MSSubClass':'SaleCondition']))
前兩步分別完成導入包和導入數據。
數據大概80列,3000個觀測值,屬性包括有數字列,同時也有字符串列。
# In[*]
# Step 3: inspect the target distribution before/after the log transform
# In[*]
matplotlib.rcParams['figure.figsize'] = (12.0, 6.0)

# Plot the raw sale price next to its log(1 + x) transform; the transformed
# histogram should look much closer to a normal distribution.
sale_price = train["SalePrice"]
prices = pd.DataFrame({"price": sale_price,
                       "log(price + 1)": np.log1p(sale_price)})
prices.hist()
# log transform the target:
# In[*]
# Step 4: log-transform the target and the skewed numeric features
# In[*]
# Replace the target with log(1 + SalePrice); predictions must later be
# mapped back with expm1.
train["SalePrice"] = np.log1p(train["SalePrice"])

# log transform skewed numeric features:
# Numeric columns are the ones whose dtype is not "object".
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index

# Skewness is measured on the training rows only (NaNs dropped); every
# feature whose skew exceeds 0.75 is then log1p-transformed in the combined
# train+test frame.
train_skew = train[numeric_feats].apply(lambda col: skew(col.dropna()))
skewed_feats = train_skew[train_skew > 0.75].index
all_data[skewed_feats] = np.log1p(all_data[skewed_feats])
這一步的主要目的是:在數字類型的屬性中,對偏度較大、不符合正態分佈的特徵做 log 變換,使其分佈更接近正態。
# In[*]
# Step 5: one-hot encode categorical columns and impute missing values
# In[*]
# get_dummies expands every object column into 0/1 indicator columns; the
# remaining numeric NaNs are then filled with each column's mean.
all_data = pd.get_dummies(all_data)
all_data = all_data.fillna(all_data.mean())

# In[*]
# Step 6: split back into train / test design matrices
# In[*]
# creating matrices for sklearn:
# The first train.shape[0] rows of all_data came from `train`, the rest
# from `test` (order preserved by the earlier pd.concat).
n_train = train.shape[0]
X_train = all_data[:n_train]
X_test = all_data[n_train:]
y = train.SalePrice
數據預處理要點: 1. 使用 log(x+1) 來轉換偏斜的數字特徵,這將使數據分佈更接近正態; 2. 爲分類特徵創建虛擬變量(dummy variables); 3. 將數字缺失值(NaN)替換爲各自列的平均值。
全部代碼:
# -*- coding: utf-8 -*-
"""
Created on Sun Oct 21 14:37:15 2018
@author: Administrator

Full preprocessing script for the Kaggle house-prices data:
1. log(x + 1)-transform the target and the skewed numeric features,
2. create dummy variables for the categorical features,
3. replace numeric NaNs with the mean of their respective columns,
then split back into sklearn design matrices.
"""
# NOTE(review): the original contained the IPython magics `%reset -f` and
# `%clear` fused onto one line — invalid Python in a plain script; dropped.

# In[*]
# ########## Step 1: import packages
# In[*]
import os

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# `from scipy.stats.stats import pearsonr` was a deprecated private path;
# the public location is `scipy.stats.pearsonr`.
from scipy.stats import pearsonr, skew
from sklearn import linear_model, metrics
from sklearn.model_selection import cross_val_score

os.chdir("C:\\Users\\Administrator\\Desktop\\all")

# In[*]
# ########## Step 2: load the data
# In[*]
train = pd.read_csv('train.csv', header=0, index_col=0)
test = pd.read_csv('test.csv', header=0, index_col=0)

# Stack the feature columns of train and test (MSSubClass..SaleCondition,
# i.e. excluding the SalePrice target) so transforms hit both sets alike.
all_data = pd.concat((train.loc[:, 'MSSubClass':'SaleCondition'],
                      test.loc[:, 'MSSubClass':'SaleCondition']))

# In[*]
# Data preprocessing:
# We're not going to do anything fancy here:
# First I'll transform the skewed numeric features by taking log(feature + 1) -
# this will make the features more normal
# Create Dummy variables for the categorical features
# Replace the numeric missing values (NaN's) with the mean of their respective columns

# In[*]
# Step 3: inspect the target distribution before/after the log transform
# In[*]
matplotlib.rcParams['figure.figsize'] = (12.0, 6.0)
prices = pd.DataFrame({"price": train["SalePrice"],
                       "log(price + 1)": np.log1p(train["SalePrice"])})
prices.hist()
# log transform the target:

# In[*]
# Step 4: log-transform the target and the skewed numeric features
# In[*]
# Predictions on this target must later be mapped back with expm1.
train["SalePrice"] = np.log1p(train["SalePrice"])

# log transform skewed numeric features:
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index

# Skewness is measured on the training rows only (NaNs dropped); features
# with skew > 0.75 are log1p-transformed in the combined frame.
skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna()))
skewed_feats = skewed_feats[skewed_feats > 0.75]
skewed_feats = skewed_feats.index
all_data[skewed_feats] = np.log1p(all_data[skewed_feats])

# In[*]
# Step 5: one-hot encode categorical columns and fill missing values
# In[*]
all_data = pd.get_dummies(all_data)
all_data = all_data.fillna(all_data.mean())

# In[*]
# Step 6: split back into train / test design matrices
# In[*]
# creating matrices for sklearn:
X_train = all_data[:train.shape[0]]
X_test = all_data[train.shape[0]:]
y = train.SalePrice