手寫 CART(hand-written CART decision tree, trained and evaluated on the iris dataset)

from scipy.io import arff
import numpy as np
import pandas as pd

# Load the iris ARFF file and shuffle the rows so the train/test split is random.
iris = arff.loadarff('../dataset/iris.arff')
df = pd.DataFrame(iris[0])
df = df.sample(frac=1)
length = df.shape[0]

# Encode the raw class labels (byte strings from ARFF) as integers 0..k-1.
classes = list(set(df['class']))
classes_length = len(classes)
classes_dict = dict()
for idx, label in enumerate(classes):
    classes_dict[label] = idx
for row in range(length):
    df.iloc[row, 4] = classes_dict[df.iloc[row, 4]]

# First 100 shuffled rows train the tree; the remainder are held out for testing.
train_data = df.iloc[0:100]
test_data = df.iloc[100:]
train_length = train_data.shape[0]
test_length = test_data.shape[0]

attribute_list = df.columns[:-1]
median_dict = {}
attribute_dict = {}
# choose median to split: one fixed threshold per attribute, computed on the
# training rows only, used for every split on that attribute.
x = 0
for attribute in attribute_list:
    attribute_dict[attribute] = x
    x += 1
    median_dict[attribute] = df.iloc[:train_length][attribute].median()


# calculate gini index according to an attribute of a part of data
# calculate gini index according to an attribute of a part of data
def gini(attribute, data, median=None, n_classes=None):
    """Weighted Gini impurity of splitting `data` on `attribute` at a threshold.

    Parameters
    ----------
    attribute : str
        Column name to split on.
    data : pandas.DataFrame
        Rows to evaluate; must contain `attribute` and a 'class' column
        holding integer labels 0..n_classes-1.
    median : float, optional
        Split threshold. Defaults to the precomputed module-level
        median_dict[attribute] (backward compatible with the old signature).
    n_classes : int, optional
        Number of distinct class labels. Defaults to the module-level
        classes_length.

    Returns
    -------
    float
        (n_over/n) * gini(over) + (n_less/n) * gini(less), where rows with
        value >= threshold go to `over` and the rest to `less`.
        Returns 1 when `data` is empty.
    """
    whole_count = data.shape[0]
    if whole_count == 0:
        return 1
    if median is None:
        median = median_dict[attribute]
    if n_classes is None:
        n_classes = classes_length
    impurity = 0.0
    for part in (data[data[attribute] >= median], data[data[attribute] < median]):
        part_count = part.shape[0]
        if part_count == 0:
            # An empty partition carries zero weight; skip to avoid 0/0.
            continue
        sum_squared = 0.0
        for label in range(n_classes):
            # BUG FIX: class proportions must be taken relative to the
            # partition's own size, not the whole subset (`whole_count`),
            # otherwise the value computed is not the Gini impurity.
            p = part[part['class'] == label].shape[0] / part_count
            sum_squared += p * p
        impurity += part_count / whole_count * (1 - sum_squared)
    return impurity


# choose the attribute with the min Gini index
# choose the attribute with the min Gini index
def attribute_choose(data, part_attribute_list):
    """Return the attribute id from part_attribute_list whose median split
    yields the lowest Gini index on `data`."""
    scores = np.array([gini(attribute_list[attr_id], data)
                       for attr_id in part_attribute_list])
    best_position = np.argmin(scores)
    return part_attribute_list[best_position]


def majority_class(data):
    """Return the most frequent label in the 'class' column of `data`."""
    return data['class'].value_counts().idxmax()


def create_tree(data, attributes):
    """Recursively build a CART tree over the attribute ids in `attributes`.

    Returns either a class label (leaf) or a dict node with keys
    'best_feat_id', 'more' (>= median branch) and 'less' (< median branch).
    Split thresholds come from the module-level median_dict, one per attribute.
    """
    label_counts = data['class'].value_counts()
    # Pure node: every remaining sample carries the same class label.
    if label_counts.shape[0] == 1:
        return label_counts.index[0]
    # Down to the last attribute id: stop and take the majority class.
    if len(attributes) == 1:
        return majority_class(data)
    best_id = attribute_choose(data, list(attributes))
    remaining = attributes.copy()
    remaining.remove(best_id)
    best_attr = attribute_list[best_id]
    threshold = median_dict[best_attr]
    return {
        'best_feat_id': best_id,
        'more': create_tree(data[data[best_attr] >= threshold], remaining),
        'less': create_tree(data[data[best_attr] < threshold], remaining),
    }


def classify(tree, x):
    """Predict a class label for one sample by walking the decision tree.

    Parameters
    ----------
    tree : dict or int
        An internal node ({'best_feat_id', 'more', 'less'}) or a leaf label.
    x : pandas.Series-like
        Feature values indexed by attribute name.

    Returns
    -------
    The class label of the leaf reached.
    """
    # BUG FIX: the original checked `type(tree) == np.int64` at the root but
    # `type(...) == int` at the children.  Leaf labels produced during
    # preprocessing are plain Python ints, so a root that is itself a leaf
    # fell through and crashed on `tree['best_feat_id']`.  Internal nodes are
    # always dicts, so "not a dict" reliably identifies a leaf of any
    # integer type.
    if not isinstance(tree, dict):
        return tree
    best_feat = attribute_list[tree['best_feat_id']]
    # Same split rule as training: values >= median follow the 'more' branch.
    if x[best_feat] >= median_dict[best_feat]:
        return classify(tree['more'], x)
    return classify(tree['less'], x)


# Grow the tree on the training split using every attribute id, then measure
# accuracy on the held-out rows (the first four columns are the features).
attribute_list_ids = set(range(len(attribute_list)))
decision_tree = create_tree(train_data, attribute_list_ids)
print(decision_tree)
y_predict_list = []
for row in range(test_length):
    sample = test_data.iloc[row][0:4]
    y_predict_list.append(classify(decision_tree, sample))
y_predict_list = np.array(y_predict_list)
print('acc test %.2f' % (np.mean(y_predict_list == np.array(test_data.iloc[:, 4]))))

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章