# Import libraries
# (the original paste had this import fused into the comment above it,
# so it never executed)
from sklearn import datasets
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Allow CJK characters to render in matplotlib figures
plt.rcParams['font.sans-serif'] = ['SimHei']
# Allow the minus sign to render correctly with the CJK font
plt.rcParams['axes.unicode_minus'] = False

# Read the modeling data from Excel
data = pd.read_excel('F:\\Desktop\\建模數據.xlsx')
# Step 2: build the ensemble (majority-vote) classifier
# Implement a majority-vote classifier in Python
# (the original paste had the BaseEstimator import fused into the comment
# above it, so it never executed)
from sklearn.base import BaseEstimator
from sklearn.base import ClassifierMixin
from sklearn.preprocessing import LabelEncoder
# NOTE(review): sklearn.externals.six was removed in scikit-learn >= 0.23;
# on modern scikit-learn replace six.iteritems(d) with d.items().
from sklearn.externals import six
from sklearn.base import clone
from sklearn.pipeline import _name_estimators
import numpy as np
import operator
class MajorityVoteClassifier(BaseEstimator, ClassifierMixin):
    """A majority vote ensemble classifier.

    Parameters
    ----------
    classifiers : array-like, shape = [n_classifiers]
        Different classifiers for the ensemble.
    vote : str, {'classlabel', 'probability'} (default='classlabel')
        If 'classlabel' the prediction is based on the argmax of
        class labels. Else if 'probability', the argmax of
        the sum of probabilities is used to predict the class label
        (recommended for calibrated classifiers).
    weights : array-like, shape = [n_classifiers], optional (default=None)
        If a list of `int` or `float` values are provided, the classifiers
        are weighted by importance; uses uniform weights if `weights=None`.
    """

    def __init__(self, classifiers, vote='classlabel', weights=None):
        self.classifiers = classifiers
        # Map auto-generated names -> estimator, for get_params/GridSearch
        self.named_classifiers = {key: value for key, value
                                  in _name_estimators(classifiers)}
        self.vote = vote
        self.weights = weights

    def fit(self, X, y):
        """Fit all member classifiers on the training data.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Matrix of training samples.
        y : array-like, shape = [n_samples]
            Vector of target class labels.

        Returns
        -------
        self : object
        """
        if self.vote not in ('probability', 'classlabel'):
            raise ValueError("vote must be 'probability' or 'classlabel'"
                             "; got (vote=%r)" % self.vote)
        if self.weights and len(self.weights) != len(self.classifiers):
            raise ValueError('Number of classifiers and weights must be equal'
                             '; got %d weights, %d classifiers'
                             % (len(self.weights), len(self.classifiers)))
        # Use LabelEncoder to ensure class labels start with 0, which
        # is important for the np.argmax call in self.predict
        self.lablenc_ = LabelEncoder()
        self.lablenc_.fit(y)
        self.classes_ = self.lablenc_.classes_
        self.classifiers_ = []
        for clf in self.classifiers:
            # clone() so the originals passed by the caller stay unfitted
            fitted_clf = clone(clf).fit(X, self.lablenc_.transform(y))
            self.classifiers_.append(fitted_clf)
        return self

    def predict(self, X):
        """Predict class labels for X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Matrix of training samples.

        Returns
        -------
        maj_vote : array-like, shape = [n_samples]
            Predicted class labels.
        """
        if self.vote == 'probability':
            maj_vote = np.argmax(self.predict_proba(X), axis=1)
        else:  # 'classlabel' vote
            # Collect results from clf.predict calls: one row per sample
            predictions = np.asarray([clf.predict(X)
                                      for clf in self.classifiers_]).T
            # Per-sample weighted vote count; argmax picks the winning label
            maj_vote = np.apply_along_axis(
                lambda x: np.argmax(np.bincount(x, weights=self.weights)),
                axis=1,
                arr=predictions)
        # Map encoded labels back to the original class labels
        maj_vote = self.lablenc_.inverse_transform(maj_vote)
        return maj_vote

    def predict_proba(self, X):
        """Predict class probabilities for X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features.

        Returns
        -------
        avg_proba : array-like, shape = [n_samples, n_classes]
            Weighted average probability for each class per sample.
        """
        probas = np.asarray([clf.predict_proba(X)
                             for clf in self.classifiers_])
        avg_proba = np.average(probas, axis=0, weights=self.weights)
        return avg_proba

    def get_params(self, deep=True):
        """Get classifier parameter names for GridSearch."""
        if not deep:
            return super(MajorityVoteClassifier, self).get_params(deep=False)
        else:
            out = self.named_classifiers.copy()
            # Was six.iteritems(...); six was removed from sklearn.externals,
            # and plain dict.items() is equivalent on Python 3.
            for name, step in self.named_classifiers.items():
                for key, value in step.get_params(deep=True).items():
                    # GridSearch convention: '<estimator name>__<param>'
                    out['%s__%s' % (name, key)] = value
            return out
# Step 3: data processing
# Set X (features: all columns after the first) and y (label: first column)
X = data.iloc[:, 1:]
y = data.iloc[:, 0]

# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split

# Split into training and test sets (30% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)

# Standardize the data
from sklearn.preprocessing import StandardScaler
stdsc = StandardScaler()
# Fit the scaler on the training data and transform it
X_train_std = stdsc.fit_transform(X_train)
# Transform the test data using the training statistics (no refit)
X_test_std = stdsc.transform(X_test)