from scipy.io import arff
import numpy as np
import pandas as pd
# Load the iris dataset and shuffle the rows. sample(frac=1) is a full random
# permutation; the original integer index is kept, so positional access below
# must go through iloc.
iris = arff.loadarff('../dataset/iris.arff')
df = pd.DataFrame(iris[0])
df = df.sample(frac=1)
length = df.shape[0]

# Map each class label (a bytes object as read from ARFF) to a small integer
# id. NOTE(review): iterating a set makes the label -> id mapping
# nondeterministic between runs; accuracy is unaffected, but reproducible ids
# would require sorting `classes` first.
classes = list(set(df['class']))
classes_length = len(classes)
classes_dict = {label: i for i, label in enumerate(classes)}

# Replace the raw labels in column 4 ('class') with their integer ids in one
# vectorized pass instead of a per-row iloc loop.
df.iloc[:, 4] = df.iloc[:, 4].map(classes_dict)

# First 100 shuffled rows train, the rest test.
train_data = df.iloc[0:100]
test_data = df.iloc[100:]
train_length = train_data.shape[0]
test_length = test_data.shape[0]

# All feature columns (everything except the trailing 'class' column).
attribute_list = df.columns[:-1]

# Per-feature column id, and the median over the TRAINING rows only; the
# median is the fixed binary split threshold used for that feature everywhere
# below.
attribute_dict = {attribute: i for i, attribute in enumerate(attribute_list)}
median_dict = {attribute: df.iloc[:train_length][attribute].median()
               for attribute in attribute_list}
# calculate gini index according to an attribute of a part of data
def gini(attribute, data):
whole_count = data.shape[0]
if whole_count == 0:
return 1
median = median_dict[attribute]
over_median = data[data[attribute] >= median]
less_median = data[data[attribute] < median]
over_median_count = over_median.shape[0]
less_median_count = less_median.shape[0]
over_sum_sigma_classes = 0
less_sum_sigma_classes = 0
for i in range(classes_length):
class_i_count = over_median[over_median['class'] == i].shape[0]
over_sum_sigma_classes += (class_i_count / whole_count) ** 2
class_i_count = less_median[less_median['class'] == i].shape[0]
less_sum_sigma_classes += (class_i_count / whole_count) ** 2
return over_median_count / whole_count * (1 - over_sum_sigma_classes) + less_median_count / whole_count * (
1 - less_sum_sigma_classes)
# choose the attribute with the min Gini index
def attribute_choose(data, part_attribute_list):
ginis = []
for attribute in part_attribute_list:
ginis.append(gini(attribute_list[attribute], data))
ginis = np.array(ginis)
min_attribute_id = np.argmin(ginis)
return part_attribute_list[min_attribute_id]
def majority_class(data):
    """Return the most frequent value in the 'class' column of `data`."""
    # value_counts() already returns counts sorted in descending order, so the
    # extra sort_values(ascending=False) in the original was redundant.
    return data['class'].value_counts().index[0]
def create_tree(data, attributes, default=None):
    """Recursively build a decision tree over `data` using attribute ids
    in `attributes`.

    Internal nodes are dicts {'best_feat_id': id, 'more': subtree,
    'less': subtree}; leaves are bare class labels. Splits are binary at
    each attribute's precomputed training median.

    `default` (backward-compatible addition) is the class returned for an
    empty partition — the parent's majority class. Bug fix: the original
    recursed on empty partitions and eventually crashed in
    majority_class with an IndexError on an empty index.
    """
    if data.shape[0] == 0:
        return default
    class_value_counts = data['class'].value_counts()
    # Pure node: every row has the same class.
    if class_value_counts.shape[0] == 1:
        return class_value_counts.index[0]
    # Out of attributes to split on: fall back to the majority class.
    if len(attributes) == 1:
        return majority_class(data)
    best_id = attribute_choose(data, list(attributes))
    sub_attributes = attributes.copy()
    sub_attributes.remove(best_id)
    best_attr = attribute_list[best_id]
    median = median_dict[best_attr]
    # Children inherit this node's majority class as their empty-partition
    # fallback.
    fallback = majority_class(data)
    mytree = {'best_feat_id': best_id}
    mytree['more'] = create_tree(data[data[best_attr] >= median],
                                 sub_attributes, fallback)
    mytree['less'] = create_tree(data[data[best_attr] < median],
                                 sub_attributes, fallback)
    return mytree
def classify(tree, x):
    """Predict the class of sample `x` (indexable by feature name) by
    walking `tree` down to a leaf.

    Leaves are bare class labels; internal nodes are dicts. Bug fix: the
    original recognized only np.int64 leaves at the top level and plain
    int leaves one level down, so a degenerate single-leaf tree (or a
    leaf of another integer type) crashed. Treating every non-dict node
    as a leaf covers all cases uniformly.
    """
    if not isinstance(tree, dict):
        return tree
    best_feat = attribute_list[tree['best_feat_id']]
    median = median_dict[best_feat]
    # Same threshold rule as training: >= median goes to the 'more' branch.
    branch = tree['more'] if x[best_feat] >= median else tree['less']
    return classify(branch, x)
# Build the tree from the full set of feature ids, then score it on the
# held-out test rows.
attribute_list_ids = set(range(len(attribute_list)))
decision_tree = create_tree(train_data, attribute_list_ids)
print(decision_tree)

# Predict each test row from its four feature columns.
y_predict_list = np.array(
    [classify(decision_tree, test_data.iloc[row][0:4]) for row in range(test_length)]
)
accuracy = np.mean(y_predict_list == np.array(test_data.iloc[:, 4]))
print('acc test %.2f' % (accuracy))