Logistic迴歸是非常經典的分類算法,它也可以對線性不可分的數據集進行分類(當然會有誤分類的情況)。因爲這本書主要針對初學者,所以只考慮了二分類問題。這章的代碼沒有需要改動的地方,python2.X和python3.X均能跑。
最開始的訓練使用的是梯度上升算法來求解出擬合曲線的參數最優值,此時的計算均是矩陣運算。效果很不錯:
梯度上升算法在數據量小的情況下還比較適用,速度差別沒那麼明顯;但是一旦數據量很大,計算將變得非常緩慢(每次更新迴歸係數都要遍歷一遍整個數據集,假設有10000個數據,迭代10次就已經得做100000次運算了)。因此之後作者考慮了隨機梯度上升算法,其實本質差不多,只不過不再進行矩陣相乘的運算(矩陣相乘就代表着全體數據集參與運算),而是向量之間的運算(就是針對單條數據進行運算):每次從數據集中隨機選出一條數據來更新係數,然後整體迭代n次就可以了。看起來擬合效果沒差很大嘛(迭代了150次):
最後用這段代碼預測了一下馬🐴患疝氣病之後的死亡率問題。
具體代碼如下:
# -*- coding: utf-8 -*-
from numpy import *
import math
def loadDataSet(fileName="testSet.txt"):
    """Load a two-feature, whitespace-separated data set from a text file.

    Each non-blank line must contain two numeric feature fields followed by
    an integer class label in its last field.

    Args:
        fileName: Path of the file to read. Defaults to ``"testSet.txt"``,
            the path the original implementation always used.

    Returns:
        A ``(dataMat, labelMat)`` tuple. ``dataMat`` is a list of
        ``[1.0, x1, x2]`` rows (the constant 1.0 pairs with the first weight,
        i.e. the intercept term); ``labelMat`` is the list of int labels.
    """
    dataMat = []
    labelMat = []
    # The context manager closes the file even if a line fails to parse.
    with open(fileName) as fr:
        for line in fr:
            lineArr = line.strip().split()
            if not lineArr:
                # Skip blank lines instead of failing with IndexError.
                continue
            dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
            labelMat.append(int(lineArr[-1]))
    return dataMat, labelMat
def sigmoid(z):
    """Return the logistic function 1 / (1 + e^-z), evaluated elementwise.

    Uses the mathematically equivalent form ``0.5 * (1 + tanh(z / 2))``.
    The direct ``exp(-z)`` form overflows (RuntimeWarning) for large negative
    ``z``; ``tanh`` saturates at +/-1 instead.

    Args:
        z: A scalar, numpy array or numpy matrix.

    Returns:
        The sigmoid of ``z``, with the same shape as ``z``.
    """
    return 0.5 * (1.0 + tanh(0.5 * z))
#梯度上升法
def gradAscent(dataMat, classLabel, alpha=0.001, maxCycles=500):
    """Fit logistic-regression weights with batch gradient ascent.

    Every cycle uses the whole data set in a single matrix product.

    Args:
        dataMat: m x n sample data (nested list, array or matrix).
        classLabel: Sequence of m class labels.
        alpha: Step size of each update (default 0.001, the original value).
        maxCycles: Number of full-batch updates (default 500, the original
            value).

    Returns:
        An n x 1 numpy matrix of weights.
    """
    # asmatrix replaces the ``mat`` alias, which was removed in NumPy 2.0.
    dataMatrix = asmatrix(dataMat)
    # Turn the 1 x m label row into an m x 1 column.
    labelMat = asmatrix(classLabel).transpose()
    n = shape(dataMatrix)[1]
    # Start from all-ones weights, as a matrix so ``*`` is a matrix product.
    weight = asmatrix(ones((n, 1)))
    for _ in range(maxCycles):
        # h is m x 1: the predicted value for every sample.
        h = sigmoid(dataMatrix * weight)
        error = labelMat - h
        weight = weight + alpha * dataMatrix.transpose() * error
    return weight
#隨機梯度上升法
def stoGradAscent0(dataMatrix, classlabels, alpha=0.1):
    """Fit logistic-regression weights with one pass of stochastic gradient ascent.

    The weights are updated once per sample, visiting the samples in their
    stored order.

    Args:
        dataMatrix: m x n sample data (nested list, array or matrix); it is
            converted to a float ndarray so every row is a 1-D vector.
        classlabels: Sequence of m class labels, indexed in step with the rows.
        alpha: Step size of each update (default 0.1, the original value).

    Returns:
        A 1-D ndarray of n weights.
    """
    samples = asarray(dataMatrix, dtype=float)
    m, n = shape(samples)
    weight = ones(n)
    for i in range(m):
        # Scalar prediction for this single sample.
        h = sigmoid(sum(samples[i] * weight))
        error = classlabels[i] - h
        weight = weight + alpha * error * samples[i]
    return weight
#改進的隨機梯度上升法
def stoGradAscent1(dataMatrix, classlabels, numIter=150):
    """Fit logistic-regression weights with improved stochastic gradient ascent.

    In each of ``numIter`` passes every sample is used exactly once, in a
    freshly shuffled order, with a step size that shrinks as the pass and
    step counters grow but never drops below 0.01.

    The previous version drew a random position in a shrinking index list
    but then used that position directly as the row number. Rows were
    therefore not sampled without replacement, and rows near the end of the
    data set were under-used. Shuffling the row indices fixes this.

    Args:
        dataMatrix: m x n sample data (nested list, array or matrix); it is
            converted to a float ndarray so every row is a 1-D vector.
        classlabels: Sequence of m class labels, indexed in step with the rows.
        numIter: Number of passes over the data set.

    Returns:
        A 1-D ndarray of n weights.
    """
    # Imported locally so the standard-library module is used, whatever the
    # module-level name ``random`` is bound to.
    import random

    samples = asarray(dataMatrix, dtype=float)
    m, n = shape(samples)
    weight = ones(n)
    for j in range(numIter):
        # A new random permutation per pass: sampling without replacement.
        order = list(range(m))
        random.shuffle(order)
        for i, sampleIdx in enumerate(order):
            # Decaying step size; the constant keeps it strictly positive.
            alpha = 4 / (1.0 + j + i) + 0.01
            h = sigmoid(sum(samples[sampleIdx] * weight))
            error = classlabels[sampleIdx] - h
            weight = weight + alpha * error * samples[sampleIdx]
    return weight
#畫出決策邊界
def plotBestFit(weights):
    """Scatter-plot the data from loadDataSet() and draw the decision boundary.

    Args:
        weights: Three weights ``(w0, w1, w2)``. A 1-D sequence, an n x 1
            array or an n x 1 numpy matrix are all accepted; the value is
            flattened to 1-D first. Without flattening, matrix weights make
            ``y`` a 1 x 60 matrix that cannot be plotted against ``x``.
    """
    import matplotlib.pyplot as plt

    w = asarray(weights, dtype=float).ravel()
    dataMat, labelMat = loadDataSet()

    # xcord1/ycord1 hold the class-1 points, xcord2/ycord2 all other points.
    xcord1 = []
    ycord1 = []
    xcord2 = []
    ycord2 = []
    for row, label in zip(dataMat, labelMat):
        if int(label) == 1:
            xcord1.append(row[1])
            ycord1.append(row[2])
        else:
            xcord2.append(row[1])
            ycord2.append(row[2])

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xcord1, ycord1, s=20, c='red', marker='s')
    ax.scatter(xcord2, ycord2, s=20, c='green')

    # Sample 60 x values; the boundary is where w0 + w1*x + w2*y == 0.
    x = arange(-3.0, 3.0, 0.1)
    y = (-w[0] - w[1] * x) / w[2]
    ax.plot(x, y)
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.show()
def classifyVector(inX, weights):
    """Classify one feature vector with the given logistic-regression weights.

    Args:
        inX: Feature vector, multiplied elementwise with ``weights``.
        weights: Weight vector.

    Returns:
        1.0 when the sigmoid of the weighted sum exceeds 0.5, otherwise 0.0.
    """
    probability = sigmoid(sum(inX * weights))
    return 1.0 if probability > 0.5 else 0.0
def _loadColicFile(fileName, numFeatures=21):
    """Read a tab-separated colic file into (feature rows, raw label strings)."""
    featureRows = []
    rawLabels = []
    # The context manager closes the file even if a record fails to parse.
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # Ignore blank lines rather than failing on float('').
                continue
            fields = stripped.split('\t')
            featureRows.append([float(value) for value in fields[:numFeatures]])
            # The label is the field right after the features.
            rawLabels.append(fields[numFeatures])
    return featureRows, rawLabels


def colicTest():
    """Train on horseColicTraining.txt and measure the error on horseColicTest.txt.

    Both files are tab-separated with 21 numeric features followed by the
    class label. Weights are fitted with ``stoGradAscent1`` (500 passes) and
    each test record is classified with ``classifyVector``.

    Returns:
        The fraction of test records that were misclassified (also printed).

    Raises:
        ZeroDivisionError: If the test file contains no records.
    """
    trainingSet, rawTrainingLabels = _loadColicFile('horseColicTraining.txt')
    testSet, rawTestLabels = _loadColicFile('horseColicTest.txt')

    trainingLabels = [float(rawLabel) for rawLabel in rawTrainingLabels]
    trainWeights = stoGradAscent1(array(trainingSet), trainingLabels, 500)

    errorCount = 0
    for features, rawLabel in zip(testSet, rawTestLabels):
        predicted = classifyVector(array(features), trainWeights)
        # int(float(...)) accepts both "1" and "1.000000" style labels.
        if int(predicted) != int(float(rawLabel)):
            errorCount += 1

    errorRate = float(errorCount) / float(len(testSet))
    print("預測的錯誤率是:%f" % errorRate)
    return errorRate
def multTest(numTest=10):
    """Run colicTest() several times and report the mean error rate.

    Training is randomised, so a single run's error rate varies from run to
    run; averaging gives a steadier estimate.

    Args:
        numTest: Number of train/test runs (default 10, the original value).

    Returns:
        The mean error rate over all runs (also printed).

    Raises:
        ValueError: If ``numTest`` is less than 1.
    """
    if numTest < 1:
        raise ValueError("numTest must be at least 1")
    errorSum = 0.0
    for _ in range(numTest):
        errorSum += colicTest()
    meanError = errorSum / float(numTest)
    print('經過 %d 次迭代測試,發現該模型的平均錯誤率是: %f' % (numTest, meanError))
    return meanError