昨天晚上的報錯原來是因爲.DS_Store文件
修正這個錯誤之後,跑出來的結果仍然非常不理想
========== data/wlkc120302_mooc.csv ========== ev mae mse r2 BayesianRidge 0.0 8.795455 105.572314 -0.185910 LinearRegression 0.0 8.795455 105.572314 -0.185910 ElasticNet 0.0 8.795455 105.572314 -0.185910 SVR 0.0 8.522727 101.431818 -0.139399 GBR 0.0 8.795455 105.572314 -0.185910 ========== data/wlkc120426_mooc.csv ========== ev mae mse r2 BayesianRidge 0.000118 10.990590 186.242984 -0.077984 LinearRegression -0.592456 12.615322 297.355614 -0.721110 ElasticNet -0.087667 11.551404 203.667208 -0.178837 SVR 0.001605 9.619207 172.972564 -0.001174 GBR -1.167629 13.947161 408.378638 -1.363717 ========== data/0072110_mooc.csv ========== ev mae mse r2 BayesianRidge 0.000005 13.544866 286.463752 -0.184131 LinearRegression -0.003763 13.492413 285.811114 -0.181433 ElasticNet 0.004550 13.494982 285.036195 -0.178230 SVR 0.002413 12.400882 246.585484 -0.019290 GBR -0.385784 13.738933 358.159824 -0.480495 ========== data/xgw1601_mooc.csv ========== ev mae mse r2 BayesianRidge 0.014434 6.197771 50.394648 0.011846 LinearRegression -0.211770 6.144246 63.693061 -0.248913 ElasticNet 0.193198 5.073056 41.401018 0.188196 SVR 0.009616 5.963487 56.876082 -0.115244 GBR -0.033089 5.676433 53.096388 -0.041131 ========== data/sxwl1605_mooc.csv ========== ev mae mse r2 BayesianRidge 0.000645 3.650837 23.849838 -0.002639 LinearRegression -0.150069 3.911446 27.476062 -0.155084 ElasticNet -0.089928 3.827334 25.987708 -0.092515 SVR -0.003928 3.656463 23.942758 -0.006546 GBR -0.490234 4.591850 35.831892 -0.506361 ========== data/0075202_mooc.csv ========== ev mae mse r2 BayesianRidge -0.000388 6.557933 59.181354 -0.015041 LinearRegression -1.409543 9.045977 149.441039 -1.563118 ElasticNet -0.163822 7.088272 67.966959 -0.165726 SVR -0.008628 6.847836 62.619095 -0.074003 GBR -0.231770 7.421227 79.503907 -0.363600
用卡方檢驗選擇特徵:
'''
卡方檢驗選擇特徵
'''
from __future__ import division
import time
import numpy as np
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import metrics
from sklearn.linear_model import BayesianRidge, LinearRegression, ElasticNet # 批量導入要實現的迴歸算法
from sklearn.model_selection import cross_val_score # 交叉檢驗
from sklearn.metrics import explained_variance_score, mean_absolute_error, mean_squared_error, r2_score # 批量導入指標算法
from sklearn.ensemble.gradient_boosting import GradientBoostingRegressor # 集成算法
from sklearn.feature_selection import VarianceThreshold,SelectKBest,chi2,RFE,SelectFromModel
import os
file=[]
for root,dirs,files in os.walk('data'):
for name in files:
file.append(os.path.join(root, name))
for f in file:
if f=='data/.DS_Store':
continue
data=pd.read_csv(f,header=None,index_col=0)
data_=data.iloc[:,3:]
#print(data)
data_=data_.sample(frac=1)
dataset=np.array(data_)
#dataset=np.loadtxt(dir)
index=int(dataset.shape[0]*0.8)
data_x=dataset[:,:-1]
data_y=dataset[:,-1]
X_train=data_x[:index,:]
y_train=data_y[:index]
X_test=data_x[index:,:]
y_test=data_y[index:]
model=SelectKBest(chi2, k=2)
X_train=model.fit_transform(X_train, y_train)
X_test=model.transform(X_test)
model_br = BayesianRidge() # 建立貝葉斯嶺迴歸模型對象
model_lr = LinearRegression() # 建立普通線性迴歸模型對象
model_etc = ElasticNet() # 建立彈性網絡迴歸模型對象
model_svr = SVR() # 建立支持向量機迴歸模型對象
model_gbr = GradientBoostingRegressor() # 建立梯度增強迴歸模型對象
model_names = ['BayesianRidge', 'LinearRegression', 'ElasticNet', 'SVR', 'GBR'] # 不同模型的名稱列表
model_dic = [model_br, model_lr, model_etc, model_svr, model_gbr] # 不同迴歸模型對象的集合
cv_score_list = [] # 交叉檢驗結果列表
pre_y_list = [] # 各個迴歸模型預測的y值列表
for model in model_dic: # 讀出每個迴歸模型對象
scores = cross_val_score(model, X_train, y_train, cv=5) # 將每個迴歸模型導入交叉檢驗模型中做訓練檢驗
cv_score_list.append(scores) # 將交叉檢驗結果存入結果列表
pre_y_list.append(model.fit(X_train, y_train).predict(X_test)) # 將回歸訓練中得到的預測y存入列表
model_metrics_name = [explained_variance_score, mean_absolute_error, mean_squared_error, r2_score] # 迴歸評估指標對象集
model_metrics_list = [] # 迴歸評估指標列表
for i in range(5): # 循環每個模型索引
tmp_list = [] # 每個內循環的臨時結果列表
for m in model_metrics_name: # 循環每個指標對象
tmp_score = m(y_test, pre_y_list[i]) # 計算每個迴歸指標結果
tmp_list.append(tmp_score) # 將結果存入每個內循環的臨時結果列表
model_metrics_list.append(tmp_list) # 將結果存入迴歸評估指標列表
df2 = pd.DataFrame(model_metrics_list, index=model_names, columns=['ev', 'mae', 'mse', 'r2']) # 建立迴歸指標的數據框
print('='*10,f,'='*10)
print (df2)
經過多組實驗發現選擇6個特徵效果會好一些
========== data/wlkc120302_mooc.csv ========== ev mae mse r2 BayesianRidge 4.440892e-16 10.274793 139.302169 -0.013076 LinearRegression 4.440892e-16 10.274793 139.302169 -0.013076 ElasticNet 4.440892e-16 10.274793 139.302169 -0.013076 SVR 4.440892e-16 10.263636 139.646364 -0.015579 GBR 4.440892e-16 10.274793 139.302169 -0.013076 ========== data/wlkc120426_mooc.csv ========== ev mae mse r2 BayesianRidge 0.024023 12.791504 300.188965 -0.055624 LinearRegression 0.034556 12.753884 295.587190 -0.039442 ElasticNet 0.026033 12.794118 298.578349 -0.049960 SVR 0.050969 13.079902 332.549724 -0.169422 GBR 0.215518 11.291786 228.676196 0.195853 ========== data/0072110_mooc.csv ========== ev mae mse r2 BayesianRidge 0.000477 14.637398 470.614522 -0.005295 LinearRegression -0.001352 14.555344 471.901501 -0.008044 ElasticNet 0.000748 14.725919 470.934133 -0.005977 SVR -0.001119 14.558936 494.533772 -0.056389 GBR 0.034091 14.270784 453.639950 0.030965 ========== data/xgw1601_mooc.csv ========== ev mae mse r2 BayesianRidge 0.092998 4.485135 31.235511 0.089491 LinearRegression 0.017339 4.854518 33.727998 0.016836 ElasticNet 0.071581 4.500176 31.943751 0.068846 SVR 0.030234 5.382077 37.695584 -0.098819 GBR -0.355482 5.146727 46.833385 -0.365184 ========== data/sxwl1605_mooc.csv ========== ev mae mse r2 BayesianRidge 0.021166 3.788196 25.477850 0.015230 LinearRegression 0.030038 3.729810 25.226252 0.024955 ElasticNet 0.027092 3.759087 25.312499 0.021621 SVR -0.022665 3.990168 27.159234 -0.049759 GBR -0.148409 4.142123 29.758142 -0.150212 ========== data/0075202_mooc.csv ========== ev mae mse r2 BayesianRidge -0.000339 7.127515 66.264427 -0.000454 LinearRegression -1.124962 10.174639 148.218662 -1.237791 ElasticNet -0.336886 8.546784 89.814588 -0.356012 SVR -0.008095 7.246442 67.476140 -0.018748 GBR -1.251593 10.751833 150.253849 -1.268518
使用遞歸特徵消除選擇的時候是4個特徵效果最好
'''
遞歸特徵消除法
'''
from __future__ import division
import time
import numpy as np
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import metrics
from sklearn.linear_model import BayesianRidge, LinearRegression, ElasticNet # 批量導入要實現的迴歸算法
from sklearn.model_selection import cross_val_score # 交叉檢驗
from sklearn.metrics import explained_variance_score, mean_absolute_error, mean_squared_error, r2_score # 批量導入指標算法
from sklearn.ensemble.gradient_boosting import GradientBoostingRegressor # 集成算法
from sklearn.feature_selection import VarianceThreshold,SelectKBest,chi2,RFE,SelectFromModel
import os
file=[]
for root,dirs,files in os.walk('data'):
for name in files:
file.append(os.path.join(root, name))
for f in file:
if f=='data/.DS_Store':
continue
data=pd.read_csv(f,header=None,index_col=0)
data_=data.iloc[:,3:]
#print(data)
data_=data_.sample(frac=1)
dataset=np.array(data_)
#dataset=np.loadtxt(dir)
index=int(dataset.shape[0]*0.8)
data_x=dataset[:,:-1]
data_y=dataset[:,-1]
#data_x=SelectKBest(chi2, k=8).fit_transform(data_x, data_y)
X_train=data_x[:index,:]
y_train=data_y[:index]
X_test=data_x[index:,:]
y_test=data_y[index:]
model_br = BayesianRidge() # 建立貝葉斯嶺迴歸模型對象
model_lr = LinearRegression() # 建立普通線性迴歸模型對象
model_etc = ElasticNet() # 建立彈性網絡迴歸模型對象
model_svr = SVR() # 建立支持向量機迴歸模型對象
model_gbr = GradientBoostingRegressor() # 建立梯度增強迴歸模型對象
model_names = ['BayesianRidge', 'LinearRegression', 'ElasticNet', 'SVR', 'GBR'] # 不同模型的名稱列表
model_dic = [model_br, model_lr, model_etc, model_svr, model_gbr] # 不同迴歸模型對象的集合
cv_score_list = [] # 交叉檢驗結果列表
pre_y_list = [] # 各個迴歸模型預測的y值列表
for model in model_dic: # 讀出每個迴歸模型對象
scores = cross_val_score(model, X_train, y_train, cv=5) # 將每個迴歸模型導入交叉檢驗模型中做訓練檢驗
cv_score_list.append(scores) # 將交叉檢驗結果存入結果列表
rfe=RFE(estimator=model, n_features_to_select=4)
X_train=rfe.fit_transform(X_train, y_train)
X_test=rfe.transform(X_test)
pre_y_list.append(model.fit(X_train, y_train).predict(X_test)) # 將回歸訓練中得到的預測y存入列表
model_metrics_name = [explained_variance_score, mean_absolute_error, mean_squared_error, r2_score] # 迴歸評估指標對象集
model_metrics_list = [] # 迴歸評估指標列表
for i in range(5): # 循環每個模型索引
tmp_list = [] # 每個內循環的臨時結果列表
for m in model_metrics_name: # 循環每個指標對象
tmp_score = m(y_test, pre_y_list[i]) # 計算每個迴歸指標結果
tmp_list.append(tmp_score) # 將結果存入每個內循環的臨時結果列表
model_metrics_list.append(tmp_list) # 將結果存入迴歸評估指標列表
df2 = pd.DataFrame(model_metrics_list, index=model_names, columns=['ev', 'mae', 'mse', 'r2']) # 建立迴歸指標的數據框
print('='*10,f,'='*10)
print (df2)
========== data/wlkc120302_mooc.csv ========== ev mae mse r2 BayesianRidge 0.0 4.650826 31.855888 -0.013825 LinearRegression 0.0 4.650826 31.855888 -0.013825 ElasticNet 0.0 4.650826 31.855888 -0.013825 SVR 0.0 4.772727 36.795455 -0.171028 GBR 0.0 4.650826 31.855888 -0.013825 ========== data/wlkc120426_mooc.csv ========== ev mae mse r2 BayesianRidge 0.003650 9.047077 149.586735 0.002180 LinearRegression -0.026233 9.377046 155.082482 -0.034480 ElasticNet -0.006369 9.219164 151.466864 -0.010362 SVR 0.015636 8.219336 160.355107 -0.069651 GBR 0.123134 8.018877 133.826141 0.107311 ========== data/0072110_mooc.csv ========== ev mae mse r2 BayesianRidge -0.006005 13.278626 313.951116 -0.023746 LinearRegression -0.051468 13.430929 328.755865 -0.072021 ElasticNet -0.030315 13.376066 322.138874 -0.050445 SVR -0.004068 12.838003 307.922039 -0.004086 GBR -0.335792 14.108979 421.318363 -0.373853 ========== data/xgw1601_mooc.csv ========== ev mae mse r2 BayesianRidge 0.254146 4.730700 30.455919 0.143720 LinearRegression 0.272013 4.602304 28.636587 0.194871 ElasticNet 0.227886 4.965201 33.279970 0.064321 SVR 0.035482 4.172519 37.573933 -0.056406 GBR -0.297793 5.603996 48.573487 -0.365663 ========== data/sxwl1605_mooc.csv ========== ev mae mse r2 BayesianRidge -0.000380 3.608012 20.329233 -0.001796 LinearRegression 0.090503 3.470283 18.461841 0.090227 ElasticNet 0.087249 3.482499 18.522397 0.087243 SVR -0.030353 3.677536 20.909002 -0.030366 GBR -0.080871 3.787335 22.188537 -0.093419 ========== data/0075202_mooc.csv ========== ev mae mse r2 BayesianRidge -0.000010 8.089313 78.805973 -0.272347 LinearRegression -0.129348 8.340741 85.010122 -0.372515 ElasticNet -0.006599 8.110855 78.959876 -0.274832 SVR 0.016822 7.845792 75.608331 -0.220720 GBR 0.337400 7.557406 76.076446 -0.228278
如果是直接使用l2正則化懲罰選擇特徵:
'''
logistic懲罰項選擇特徵
'''
from __future__ import division
import time
import numpy as np
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import metrics
from sklearn.linear_model import BayesianRidge, LinearRegression, ElasticNet # 批量導入要實現的迴歸算法
from sklearn.model_selection import cross_val_score # 交叉檢驗
from sklearn.metrics import explained_variance_score, mean_absolute_error, mean_squared_error, r2_score # 批量導入指標算法
from sklearn.ensemble.gradient_boosting import GradientBoostingRegressor # 集成算法
from sklearn.feature_selection import VarianceThreshold,SelectKBest,chi2,RFE,SelectFromModel
from sklearn.linear_model import LogisticRegression
import os
file=[]
for root,dirs,files in os.walk('data'):
for name in files:
file.append(os.path.join(root, name))
for f in file:
if f=='data/.DS_Store':
continue
data=pd.read_csv(f,header=None,index_col=0)
data_=data.iloc[:,3:]
#print(data)
data_=data_.sample(frac=1)
dataset=np.array(data_)
#dataset=np.loadtxt(dir)
index=int(dataset.shape[0]*0.8)
data_x=dataset[:,:-1]
data_y=dataset[:,-1]
#data_x=SelectKBest(chi2, k=8).fit_transform(data_x, data_y)
X_train=data_x[:index,:]
y_train=data_y[:index]
X_test=data_x[index:,:]
y_test=data_y[index:]
model=SelectFromModel(LogisticRegression(penalty='l2',C=0.1))
X_train=model.fit_transform(X_train,y_train)
X_test=model.transform(X_test)
model_br = BayesianRidge() # 建立貝葉斯嶺迴歸模型對象
model_lr = LinearRegression() # 建立普通線性迴歸模型對象
model_etc = ElasticNet() # 建立彈性網絡迴歸模型對象
model_svr = SVR() # 建立支持向量機迴歸模型對象
model_gbr = GradientBoostingRegressor() # 建立梯度增強迴歸模型對象
model_names = ['BayesianRidge', 'LinearRegression', 'ElasticNet', 'SVR', 'GBR'] # 不同模型的名稱列表
model_dic = [model_br, model_lr, model_etc, model_svr, model_gbr] # 不同迴歸模型對象的集合
cv_score_list = [] # 交叉檢驗結果列表
pre_y_list = [] # 各個迴歸模型預測的y值列表
for model in model_dic: # 讀出每個迴歸模型對象
scores = cross_val_score(model, X_train, y_train, cv=5) # 將每個迴歸模型導入交叉檢驗模型中做訓練檢驗
cv_score_list.append(scores) # 將交叉檢驗結果存入結果列表
pre_y_list.append(model.fit(X_train, y_train).predict(X_test)) # 將回歸訓練中得到的預測y存入列表
model_metrics_name = [explained_variance_score, mean_absolute_error, mean_squared_error, r2_score] # 迴歸評估指標對象集
model_metrics_list = [] # 迴歸評估指標列表
for i in range(5): # 循環每個模型索引
tmp_list = [] # 每個內循環的臨時結果列表
for m in model_metrics_name: # 循環每個指標對象
tmp_score = m(y_test, pre_y_list[i]) # 計算每個迴歸指標結果
tmp_list.append(tmp_score) # 將結果存入每個內循環的臨時結果列表
model_metrics_list.append(tmp_list) # 將結果存入迴歸評估指標列表
df2 = pd.DataFrame(model_metrics_list, index=model_names, columns=['ev', 'mae', 'mse', 'r2']) # 建立迴歸指標的數據框
print('='*10,f,'='*10)
print (df2)
========== data/wlkc120302_mooc.csv ========== ev mae mse r2 BayesianRidge -2.220446e-16 7.497934 89.070764 -0.204197 LinearRegression -2.220446e-16 7.497934 89.070764 -0.204197 ElasticNet -2.220446e-16 7.497934 89.070764 -0.204197 SVR -2.220446e-16 7.454545 88.545455 -0.197095 GBR -2.220446e-16 7.497934 89.070764 -0.204197 ========== data/wlkc120426_mooc.csv ========== ev mae mse r2 BayesianRidge 0.016162 10.211720 228.515831 0.014033 LinearRegression -0.003705 10.796461 232.902592 -0.004895 ElasticNet 0.009242 10.475627 230.060742 0.007367 SVR 0.038126 9.203842 232.811866 -0.004503 GBR 0.281124 9.745502 166.738358 0.280581 ========== data/0072110_mooc.csv ========== ev mae mse r2 BayesianRidge 0.000029 13.960865 340.783442 -0.000100 LinearRegression 0.001445 13.985477 340.436509 0.000918 ElasticNet 0.001863 13.976484 340.260897 0.001434 SVR -0.005961 13.787318 345.787911 -0.014786 GBR -0.643308 15.747902 564.042445 -0.655300 ========== data/xgw1601_mooc.csv ========== ev mae mse r2 BayesianRidge 0.271405 5.396872 48.732612 0.239358 LinearRegression 0.265018 4.962093 49.671277 0.224707 ElasticNet 0.309029 5.229352 46.652425 0.271826 SVR 0.021994 5.722142 62.991804 0.016793 GBR -0.453151 6.865954 96.042096 -0.499072 ========== data/sxwl1605_mooc.csv ========== ev mae mse r2 BayesianRidge 0.002783 4.508833 32.406245 -0.000092 LinearRegression 0.000846 4.520675 32.487420 -0.002597 ElasticNet 0.001174 4.515737 32.449221 -0.001418 SVR 0.015616 4.509390 32.004629 0.012303 GBR -0.044163 4.611927 33.834823 -0.044179 ========== data/0075202_mooc.csv ========== ev mae mse r2 BayesianRidge -0.000019 4.838834 35.842253 -0.004336 LinearRegression 0.023276 4.607107 34.857635 0.023254 ElasticNet -0.003795 4.671598 35.851415 -0.004593 SVR 0.026058 4.856963 35.104422 0.016338 GBR -0.279743 5.799621 45.875594 -0.285481
總結:各種特徵選擇方法的結果都非常不好……我也很迷惑,還需要進一步排查原因。