



# 文件功能:svm分類鳶尾花數據集
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score

# Load the iris data set and keep only its first two feature columns.
data = load_iris()

x, y = data.data[:, :2], data.target
# Fixed-seed split: 60% of the samples for training, 40% for testing.
train_data, test_data, train_label, test_label = train_test_split(
    x, y, train_size=0.6, test_size=0.4, random_state=1
)

# RBF-kernel support vector classifier; decision_function_shape='ovo'
# requests one-vs-one shaped decision values.
classifier = svm.SVC(kernel='rbf', C=2, gamma=10, decision_function_shape='ovo')
# ravel() flattens the label array (row-major order by default) before fitting.
classifier.fit(train_data, train_label.ravel())

# Accuracy as reported by the estimator's own score() method.
print("訓練集:", classifier.score(train_data, train_label))
print("測試集:", classifier.score(test_data, test_label))

# The same accuracy, recomputed from explicit predictions.
tra_label = classifier.predict(train_data)  # predicted labels for the training set
tes_label = classifier.predict(test_data)   # predicted labels for the test set
print("訓練集:", accuracy_score(train_label, tra_label))
print("測試集:", accuracy_score(test_label, tes_label))

# Raw decision values and the resulting predictions for the training samples.
print('train_decision_function:\n', classifier.decision_function(train_data))
print('predict_result:\n', classifier.predict(train_data))



# 文件功能:knn實現鳶尾花數據集分類
from sklearn import datasets                           # 引入sklearn包含的衆多數據集
from sklearn.model_selection import train_test_split   # 將數據分爲測試集和訓練集
from sklearn.neighbors import KNeighborsClassifier     # 利用knn方式訓練數據

# [1] Load the training data
iris = datasets.load_iris()  # iris data set (four feature variables per sample)
iris_X = iris.data           # feature matrix
iris_y = iris.target         # class labels
# Hold out 30% of the samples for testing and train on the remaining 70%.
# (The split used to be test_size=0.8, which contradicted the documented 30%
# and left only a fifth of the data for training.)
X_train, X_test, y_train, y_test = train_test_split(iris_X, iris_y, test_size=0.3)
print(y_train)               # class labels of the training samples

# [2] Train
knn = KNeighborsClassifier()   # k-nearest-neighbours classifier with default settings
knn.fit(X_train, y_train)      # fit on the training portion

# [3] Predict
y_pred = knn.predict(X_test)   # predict once and reuse the result below
print(y_pred)                  # predicted labels
print(y_test)                  # true labels

# [4] Accuracy can be computed directly with accuracy_score
from sklearn.metrics import accuracy_score
print("測試準確度:", accuracy_score(y_test, y_pred))


# 文件功能:隨機森林分類鳶尾花數據集

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris

# Random forest of 100 trees fitted with n_jobs=4. oob_score=True is requested
# although the out-of-bag score is not read anywhere below.
RF = RandomForestClassifier(n_estimators=100, n_jobs=4, oob_score=True)
iris = load_iris()
x = iris.data[:, :2]   # only the first two feature columns, so the result can be drawn in 2-D
y = iris.target
RF.fit(x, y)

h = .02                # step of the evaluation grid
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])  # background (predicted regions)
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])   # sample points

# Evaluate the classifier on a dense grid that extends one unit beyond the data.
# The previous `for weight in ['uniform', 'distance']` loop never used `weight`
# and drew the identical plot twice, so the drawing code now runs once.
x_min, x_max = x[:, 0].min() - 1, x[:, 0].max() + 1
y_min, y_max = x[:, 1].min() - 1, x[:, 1].max() + 1
xx, yy = np.meshgrid(
    np.arange(x_min, x_max, h),
    np.arange(y_min, y_max, h),
)
z = RF.predict(np.c_[xx.ravel(), yy.ravel()])
z = z.reshape(xx.shape)

plt.pcolormesh(xx, yy, z, cmap=cmap_light)
plt.scatter(x[:, 0], x[:, 1], c=y, cmap=cmap_bold, edgecolors='k', s=20)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())

# Accuracy on the same samples the forest was fitted on.
print('RandomForestClassifier:', RF.score(x, y))
plt.show()  # without this the figure is built but never displayed when run as a script




from sklearn import datasets                         # 導入方法類
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# [1] Load the data set
iris = datasets.load_iris()
iris_feature, iris_target = iris.data, iris.target  # feature matrix, class labels

# [2] Split into training and test portions (33% held out, fixed seed)
feature_train, feature_test, target_train, target_test = train_test_split(
    iris_feature,
    iris_target,
    random_state=42,
    test_size=0.33,
)

# [3] Train a decision tree with every parameter left at its default
dt_model = DecisionTreeClassifier()
dt_model.fit(feature_train, target_train)
predict_results = dt_model.predict(feature_test)  # predictions for the test portion

# [4] Evaluate on the test portion
scores = dt_model.score(feature_test, target_test)



# 文件功能:決策樹分類鳶尾花數據集
# 代碼整體思路:
# 1 . 先處理數據,shuffle函數隨機抽取80%樣本做訓練集。
# 2 . 特徵值離散化
# 3 . 用信息熵來遞歸地構造樹
# 4 . 用構造好的樹來判斷剩下20%的測試集,求算法做分類的正確率

from sklearn import datasets
import math
import numpy as np

# 【1】獲取信息熵
def getInformationEntropy(arr, leng):
    """Return the Shannon entropy (natural logarithm) of a class-count vector.

    Args:
        arr: Sequence of per-class sample counts. Any number of classes is
            accepted (the previous version read exactly ``arr[0..2]``).
        leng: Total number of samples; every count is divided by it to get
            the class probability.

    Returns:
        ``-sum(p * log(p))`` over the classes whose count is positive, with
        ``p = count / leng``. Returns 0.0 when ``leng`` is not positive
        (an empty partition) instead of dividing by zero.
    """
    if leng <= 0:
        return 0.0
    # Classes with a non-positive count contribute nothing to the sum.
    return -sum(
        (count / leng * math.log(count / leng) for count in arr if count > 0),
        0.0,
    )

# 【2】離散化特徵一的值
def discretization(index):
    """Find two cut points that split feature ``index`` into three bins.

    The samples of the module-level ``iris`` data set are sorted by the value
    of the chosen feature. Every pair of positions ``i < j`` (``j`` not being
    the last row) splits the sorted rows into ``[0..i]``, ``[i+1..j]`` and
    ``[j+1..end]``; the pair with the smallest size-weighted sum of
    ``getInformationEntropy`` over the three groups wins.

    Class totals and the sample count are derived from the data itself, so the
    function no longer depends on the module-level ``num`` and ``length``.

    Args:
        index: Column of ``iris.data`` to discretise.

    Returns:
        ``[low, high]``: the feature values found at the two winning positions.
        The winning positions, the entropy sum and the values are also printed.

    Raises:
        ValueError: If there are fewer than three samples, so no split exists.
    """
    feature1 = np.array([iris.data[:, index], iris.target]).T
    feature1 = feature1[feature1[:, 0].argsort()]
    labels = feature1[:, 1].astype(int)
    total = len(labels)

    # Per-class counts of the whole data set (at least three slots).
    class_totals = np.bincount(labels, minlength=3)

    best_entropy = math.inf
    best_cut = None
    head = np.zeros_like(class_totals)  # class counts of rows 0..i
    for i in range(total):
        head[labels[i]] += 1
        upto_j = np.copy(head)          # class counts of rows 0..j
        # j stops before the last row so that the third group is never empty.
        for j in range(i + 1, total - 1):
            upto_j[labels[j]] += 1
            # Exhaustive search for the best pair of cut positions.
            weighted = (
                (i + 1) * getInformationEntropy(head, i + 1)
                + (j - i) * getInformationEntropy(upto_j - head, j - i)
                + (total - j - 1) * getInformationEntropy(class_totals - upto_j, total - j - 1)
            )
            if weighted < best_entropy:
                best_entropy = weighted
                best_cut = (i, j)

    if best_cut is None:
        raise ValueError("discretization needs at least three samples")

    res = np.array(best_cut)
    res_value = [feature1[res[0], 0], feature1[res[1], 0]]
    print(res, best_entropy, res_value)
    return res_value

# 【3】計算合適的分割值
def getRazors():
    """Compute the cut points of every feature of the module-level ``iris``.

    Returns:
        A NumPy array with one row per entry of ``iris.feature_names``;
        row ``k`` holds the pair returned by ``discretization(k)``.
    """
    # The loop body was missing, so no cut point was ever collected.
    return np.array([discretization(i) for i in range(len(iris.feature_names))])

# 【4】隨機抽取80%的訓練集和20%的測試集
def divideData():
    """Randomly split the module-level ``iris`` data into 80% train / 20% test.

    Returns:
        ``[trainData, testData]``: two arrays whose rows are the feature
        columns of ``iris.data`` followed by the class label from
        ``iris.target`` as the last column.
    """
    completeData = np.c_[iris.data, iris.target]
    # Shuffle the rows first: without it the split simply takes the leading
    # 80% of the rows in their stored order, which is not the random sample
    # this function is documented to draw.
    np.random.shuffle(completeData)
    split = int(len(completeData) * 0.8)
    trainData = completeData[:split, :]
    testData = completeData[split:, :]
    return [trainData, testData]

# 【5】Entropy (natural log) of a class-count vector
def getEntropy(counter):
    """Return the Shannon entropy (natural logarithm) of a class-count vector.

    Args:
        counter: Sequence of per-class sample counts.

    Returns:
        ``-sum(p * log(p))`` with ``p = count / sum(counter)`` over the
        classes whose count is positive; 0 when the counts sum to zero.
    """
    denominator = np.sum(counter)
    if denominator <= 0:
        return 0
    res = 0
    for value in counter:
        # Empty classes contribute nothing; skipping them also avoids log(0).
        if value <= 0:
            continue
        probability = value / denominator
        res += probability * math.log(probability)
    return -res

# 【6】尋找最大索引
def findMaxIndex(dataSet):
    """Return the position of the first largest value in ``dataSet``.

    The search starts from a floor of -1, so index 0 is returned when the
    input is empty or when no value is greater than -1.
    """
    best = (0, -1)  # (position, value) of the best candidate so far
    for candidate in enumerate(dataSet):
        # Strict comparison keeps the earliest position on ties.
        if candidate[1] > best[1]:
            best = candidate
    return best[0]

# 【7】遞歸
def recursion(featureSet, dataSet, counterSet):
    """Recursively build a three-way decision tree.

    Reads the module-level ``razors`` (two cut points per feature) and
    ``iris`` (for the class names), and scores candidate splits with
    ``getEntropy``.

    Args:
        featureSet: Indices of the features still available for splitting.
            The list is not modified.
        dataSet: Rows whose last element is the class label.
        counterSet: Per-class row counts of ``dataSet``.

    Returns:
        * an entry of ``iris.target_names`` when only one class is present,
          or when no feature is left (then the most frequent class);
        * ``[]`` when ``dataSet`` is empty;
        * otherwise ``[feature, low_child, mid_child, high_child]``, where the
          children are built from the rows below the first cut point, between
          the two cut points (inclusive), and above the second cut point.
    """
    # Leaf: exactly one class has a non-zero count.
    present = [label for label, count in enumerate(counterSet) if count != 0]
    if len(present) == 1:
        return iris.target_names[present[0]]
    # An empty partition is reported as [] before the feature check, so that it
    # is never labelled with an arbitrary class once the features run out.
    if len(dataSet) == 0:
        return []
    if len(featureSet) == 0:
        return iris.target_names[findMaxIndex(counterSet)]

    n_classes = len(counterSet)
    best = None  # (weighted entropy, feature, subsets, counters) of the best split
    for feature in featureSet:
        low = razors[feature][0]
        high = razors[feature][1]
        subsets = ([], [], [])
        counters = ([0] * n_classes, [0] * n_classes, [0] * n_classes)
        for data in dataSet:
            label = int(data[-1])
            if data[feature] < low:
                branch = 0
            elif data[feature] >= low and data[feature] <= high:
                branch = 1
            else:
                branch = 2
            subsets[branch].append(data)
            counters[branch][label] += 1

        weighted = sum(
            len(subset) * getEntropy(counter)
            for subset, counter in zip(subsets, counters)
        ) / len(dataSet)
        # Strict comparison: on ties the first feature scanned wins.
        if best is None or weighted < best[0]:
            best = (weighted, feature, subsets, counters)

    _, final, subsets, counters = best
    # Each feature is used at most once per path, which guarantees termination.
    # A new list is built so the caller's featureSet stays untouched.
    remaining = [feature for feature in featureSet if feature != final]
    child = [final]
    for subset, counter in zip(subsets, counters):
        child.append(recursion(remaining, subset, counter))
    return child

# 【8】決策
def judge(data, tree):
    """Classify one sample by walking a tree produced by ``recursion``.

    Reads the module-level ``razors`` (two cut points per feature) and
    ``iris`` (for the class names).

    Args:
        data: One sample; ``data[k]`` is the value of feature ``k``.
        tree: A class name, ``[]``, or
            ``[feature, low_child, mid_child, high_child]``.

    Returns:
        The class name of the leaf that is reached. If the walk ends in an
        empty subtree, the first element of the last visited node is returned;
        ``"unknow"`` is returned when ``tree`` is empty from the start.
    """
    def is_empty(node):
        # Explicit check instead of `node != []`, which is unreliable when the
        # node is a NumPy string taken from iris.target_names.
        return isinstance(node, list) and len(node) == 0

    root = "unknow"
    while len(tree) > 0:
        if isinstance(tree, str) and tree in iris.target_names:
            return tree
        root = tree[0]
        if isinstance(root, str):
            return root
        if not isinstance(root, (int, np.integer)):
            # Malformed node: stop instead of looping forever.
            break
        low = razors[root][0]
        high = razors[root][1]
        value = data[root]
        if value < low and not is_empty(tree[1]):
            tree = tree[1]
        elif not is_empty(tree[2]) and (is_empty(tree[1]) or (value >= low and value <= high)):
            tree = tree[2]
        else:
            # This branch was missing: `tree = tree[3]` used to run right after
            # `tree = tree[2]`, and nothing advanced the walk otherwise.
            tree = tree[3]
    return root

# 【9】調用
if __name__ == '__main__':
    iris = datasets.load_iris()
    # Per-class sample counts. They must come from the labels in iris.target;
    # the rows of iris.data hold feature values only, so their last column is
    # not a class label.
    num = [0, 0, 0]
    for label in iris.target:
        num[int(label)] += 1
    length = len(iris.target)

    trainData, testData = divideData()
    razors = getRazors()
    tree = recursion(list(range(len(iris.feature_names))), trainData,
                     [np.sum(trainData[:, -1] == 0), np.sum(trainData[:, -1] == 1), np.sum(trainData[:, -1] == 2)])
    print("本次選取的訓練集構建出的樹: ", tree)

    # Classify every held-out sample and count the correct predictions.
    right = 0
    for sample in testData:
        result = judge(sample, tree)
        truth = iris.target_names[int(sample[-1])]
        print("result is ", result, "  truth is ", truth)
        if result == truth:
            right += 1
    # Guard against an empty test set instead of dividing by zero.
    print("正確率 : ", right / len(testData) if len(testData) else 0.0)







