# 基於Sigmoid函數和Logistic迴歸的分類算法。
# 思想:使用梯度上升找到最優迴歸係數,相當於找到決策邊界。再用數據特徵和Logistic迴歸就能算出分類。
import numpy as np
'''
用Logistic迴歸擬合決策邊界從而進行分類
'''
def sigmoid(x):
    """Return the logistic function 1 / (1 + exp(-x)) without overflowing.

    The naive formula evaluates ``exp(-x)``, which overflows for large
    negative ``x``. Here the exponential is only ever taken of a
    non-positive number, so its value stays within [0, 1] and no overflow
    (and therefore no catch-all ``except``) is needed.

    Args:
        x: A scalar or array-like of real numbers. It is converted to a
            float array internally.

    Returns:
        A NumPy float scalar for scalar input, otherwise an array with the
        same shape as ``x``. Very negative inputs give 0.0 and very
        positive inputs give 1.0.
    """
    z = np.asarray(x, dtype=float)
    # e = exp(-|z|) is always in [0, 1], so neither branch below can overflow.
    e = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + exp(-z));  z < 0: exp(z) / (1 + exp(z)) -- the same
    # function, rewritten so that the exponent is never positive.
    out = np.where(z >= 0, 1.0 / (1.0 + e), e / (1.0 + e))
    # Indexing with () unwraps a 0-d array to a scalar and is a no-op view
    # for arrays of higher rank.
    return out[()]
# Stochastic gradient ascent
def stocGradAscent1(xMat, classLabels, numIter=150):
    """Fit logistic-regression weights with stochastic gradient ascent.

    Every pass visits each sample exactly once, in a random order: an index
    is drawn at random from the not-yet-used indices and removed after use.
    The step size shrinks as the pass number and the position within the
    pass grow, but never drops below 0.01.

    Args:
        xMat: 2-D array of samples, one row per sample (m rows, n columns).
        classLabels: Labels indexable by row number, one per sample.
        numIter: Number of full passes over the data.

    Returns:
        A 1-D NumPy array of n weights, starting from all ones.
    """
    m, n = np.shape(xMat)
    w = np.ones(n)
    for j in range(numIter):
        # Must be a real list: a Python 3 ``range`` object does not support
        # item deletion, so removing used indices from it raises TypeError.
        dataIdx = list(range(m))
        for i in range(m):
            # Decaying step size; the constant keeps it from reaching zero.
            alpha = 4 / (1.0 + j + i) + 0.01
            randIdx = int(np.random.uniform(0, len(dataIdx)))
            # Take the sample out of the pool so it is not reused this pass.
            sample = dataIdx.pop(randIdx)
            h = sigmoid(sum(xMat[sample] * w))
            err = classLabels[sample] - h
            w = w + alpha * err * xMat[sample]
    return w
def classifyVector(x, w):
    """Classify a single sample using logistic-regression weights.

    Args:
        x: Feature vector of the sample.
        w: Weight vector, multiplied element-wise with ``x``.

    Returns:
        1.0 if the sigmoid of the weighted feature sum exceeds 0.5,
        otherwise 0.0.
    """
    weighted_sum = sum(x * w)
    return 1.0 if sigmoid(weighted_sum) > 0.5 else 0.0
def colicTest():
    """Train and evaluate the logistic classifier on 'horseColic.txt'.

    Each line of the file holds space-separated fields: the first 27 are
    features and the 28th is the class label. A '?' feature is treated as
    missing and replaced by 0; label 2 is mapped to 0.0 and any other label
    is kept as a float. The first 300 rows are used for training (500
    passes of stochastic gradient ascent) and the remaining rows for
    testing. Each prediction is printed next to the true label, followed by
    the overall error rate.

    Returns:
        A tuple ``(predLab, errRate)``: the list of predicted labels for
        the test rows and the fraction of them that were misclassified.
    """
    numFeatures = 27  # columns 0..26 are features, column 27 is the label
    numTrain = 300    # leading rows used for training; the rest are tested

    allSet = []
    allLab = []
    # The context manager closes the file even if parsing a row fails.
    with open('horseColic.txt') as allData:
        for line in allData:
            if not line.strip():
                continue  # ignore blank lines instead of failing on them
            currLine = line.strip().split(' ')
            # Compare by value (==): identity checks against literals only
            # work by accident of interpreter caching.
            allSet.append([
                0 if currLine[i] == '?' else float(currLine[i])  # missing -> 0
                for i in range(numFeatures)
            ])
            rawLabel = currLine[numFeatures]
            # Map labels 1 and 2 to 1.0 and 0.0.
            allLab.append(0.0 if int(rawLabel) == 2 else float(rawLabel))

    trainSet = np.array(allSet[:numTrain])
    trainLab = np.array(allLab[:numTrain])
    testSet = np.array(allSet[numTrain:])
    testLab = np.array(allLab[numTrain:])
    trainW = stocGradAscent1(trainSet, trainLab, 500)  # fit the weights

    # ----------------------------- predict test set -----------------------------
    predLab = []
    errCount = 0.0
    numTest = np.shape(testSet)[0]
    for i in range(numTest):
        # Classify once per sample and reuse the result.
        pred = classifyVector(testSet[i], trainW)
        predLab.append(pred)
        print(pred, testLab[i])
        if int(pred) != int(testLab[i]):
            errCount += 1.0
    errRate = float(errCount)/numTest
    print('the error rate is: %f, error count:%f' % (errRate,errCount))
    return predLab, errRate