樸素貝葉斯理解
實驗流程
代碼實現
main.py
#coding:utf-8
#樸素貝葉斯算法 貝葉斯估計, λ=1 K=2, S=3; λ=1 拉普拉斯平滑
import pandas as pd
import numpy as np
class NavieBayesB(object):
    """Naive Bayes classifier with Bayesian (Laplace) smoothing.

    Implements the textbook Bayesian-estimation variant with
    lambda = 1 (Laplace smoothing), K = 2 possible class labels and
    S = 3 possible values per feature.
    """

    def __init__(self):
        self.A = 1  # smoothing parameter lambda = 1 (Laplace smoothing)
        self.K = 2  # number of distinct class labels (Y in {-1, 1})
        self.S = 3  # number of distinct values each feature can take

    def getTrainSet(self, path='naivebayes_data.csv'):
        """Load the training set from a CSV file.

        Parameters
        ----------
        path : str, optional
            CSV file whose last column is the class label Y and whose
            remaining columns are the features. Defaults to the original
            hard-coded filename, so existing callers are unaffected.

        Returns
        -------
        (numpy.ndarray, numpy.ndarray)
            Feature matrix (all columns but the last) and label vector
            (the last column).
        """
        trainSet = pd.read_csv(path)
        trainSetNP = np.array(trainSet)  # DataFrame -> plain ndarray
        trainData = trainSetNP[:, 0:trainSetNP.shape[1] - 1]  # features x1, x2
        labels = trainSetNP[:, trainSetNP.shape[1] - 1]       # class label Y
        return trainData, labels

    def classify(self, trainData, labels, features):
        """Return the most probable class label for ``features``.

        Parameters
        ----------
        trainData : numpy.ndarray
            Training feature matrix, one row per sample.
        labels : sequence
            Class label of each training row (same length as trainData).
        features : sequence
            Feature values (x1, x2, ...) of the point to classify.

        Returns
        -------
        The label maximizing the smoothed unnormalized posterior
        P(y) * prod_j P(x_j | y).
        """
        labels = list(labels)
        n = len(labels)

        # Smoothed prior P(Y = y). Iterate over distinct labels only —
        # the original looped over every sample, recomputing the same
        # count once per duplicate (accidental O(n^2)).
        P_y = {}
        for label in set(labels):
            P_y[label] = (labels.count(label) + self.A) / float(n + self.K * self.A)

        # Smoothed conditional P(x_j = features[j] | Y = y) for each
        # query feature value and each class.
        P = {}
        for y in P_y:
            # Row indices belonging to class y.
            y_index = [i for i, label in enumerate(labels) if label == y]
            y_count = len(y_index)
            for j in range(len(features)):
                pkey = str(features[j]) + '|' + str(y)
                # Row indices where feature j matches the query value.
                x_index = [i for i, x in enumerate(trainData[:, j]) if x == features[j]]
                xy_count = len(set(x_index) & set(y_index))  # joint occurrences
                P[pkey] = (xy_count + self.A) / float(y_count + self.S * self.A)

        # Unnormalized posterior for each class: prior times the product
        # of the per-feature conditionals (naive independence assumption).
        F = {}
        for y in P_y:
            F[y] = P_y[y]
            for x in features:
                F[y] = F[y] * P[str(x) + '|' + str(y)]

        # Class whose unnormalized posterior is largest.
        return max(F, key=F.get)
if __name__ == '__main__':
    # Build the classifier, load the training set, and classify a
    # single query point (x1, x2).
    classifier = NavieBayesB()
    train_x, train_y = classifier.getTrainSet()
    query = [2, 'S']  # the feature pair whose class we want
    prediction = classifier.classify(train_x, train_y, query)
    print(query, '屬於', prediction)
naivebayes_data.csv
x1,x2,Y
1,S,-1
1,M,-1
1,M,1
1,S,1
1,S,-1
2,S,-1
2,M,-1
2,M,1
2,L,1
2,L,1
3,L,1
3,M,1
3,M,1
3,L,1
3,L,-1
測試結果
[2, 'S'] 屬於 -1