原理:
- 通過改變訓練樣本的權重，學習多個分類器，并將這些分類器進行線性組合，提高分類的性能；
- bagging是通過隨機抽樣的替換方式，得到與原數據集規模一樣的數據；
- boosting在bagging的思路上更進一步，在數據集上順序應用了多個不同的分類器。
優點:
- 泛化錯誤率低，易編碼，可以用在大部分分類器上，無參數調整。
缺點:
- 對離群點敏感
適用數據類型:
- 數值型和標稱型數據
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
def loadSimpData():
    """Return the five-point toy data set used to demonstrate AdaBoost.

    Returns:
        tuple: ``(dataMat, classLabels)`` where ``dataMat`` is a 5x2
        ``numpy.matrix`` of feature values and ``classLabels`` is a list of
        five labels, each ``1.0`` or ``-1.0``.
    """
    # np.asmatrix is used instead of the np.mat alias, which NumPy 2.0 removed.
    dataMat = np.asmatrix([
        [1.0, 2.1],
        [2.0, 1.1],
        [1.3, 1.0],
        [1.0, 1.0],
        [2.0, 1.0],
    ])
    classLabels = [1.0, 1.0, -1.0, -1.0, 1.0]
    return dataMat, classLabels
# Build the toy data set at module level; dataMat and classLabels are reused
# by the later calls to buildStump and adaBoostTrainDS.
dataMat,classLabels=loadSimpData()
# Visualization: scatter the two classes with different colours and markers.
x = pd.DataFrame(dataMat)
y= pd.Series(classLabels)
pos = x[y==1].copy()  # rows whose label is +1
neg = x[y==-1].copy()  # rows whose label is -1
plt.figure()
# Positive samples: blue squares. Negative samples: red default markers.
# NOTE: label= is set on both series but no legend is drawn in this snippet.
plt.scatter(pos.loc[:,0],pos.loc[:,1],c='b',label='positive',marker='s')
plt.scatter(neg.loc[:,0],neg.loc[:,1],c='r',label='negative')
plt.show()
output_1_0.png
分類規則
def stumpClassify(dataMatrix, dimen, threshVal, threshIneq):
    """Classify samples by thresholding one feature (a decision stump).

    Args:
        dataMatrix: Samples-by-features data. A ``numpy.matrix``, a 2-D
            ``numpy.ndarray`` or a nested list are all accepted.
        dimen: Column index of the feature to compare.
        threshVal: Threshold the feature value is compared against.
        threshIneq: ``'lt'`` labels samples with ``value <= threshVal`` as
            -1.0; any other value is handled as ``'gt'`` and labels samples
            with ``value > threshVal`` as -1.0.

    Returns:
        numpy.ndarray: Float array of shape ``(m, 1)`` holding 1.0 or -1.0
        for each of the ``m`` samples.
    """
    # Work on a plain 1-D column so the same code path serves matrix,
    # ndarray and list input.
    featureCol = np.atleast_2d(np.asarray(dataMatrix))[:, dimen]
    if threshIneq == 'lt':
        isNegative = featureCol <= threshVal
    else:  # treated as 'gt'
        isNegative = featureCol > threshVal
    return np.where(isNegative, -1.0, 1.0).reshape(-1, 1)
遍歷所有特征，找到最小誤差對應的特征、閾值和分類規則
def buildStump(dataArr, classLabels, D):
    """Find the decision stump with the lowest weighted error.

    For every feature, 12 candidate thresholds are tried (from one step below
    the feature minimum up to the feature maximum, with a step of one tenth of
    the feature range), each with both inequality directions.

    Args:
        dataArr: Samples-by-features data convertible to a matrix.
        classLabels: Sequence of ``m`` labels, one per sample.
        D: Column of ``m`` sample weights (an ``(m, 1)`` matrix in this file).

    Returns:
        tuple: ``(bestStump, minError, bestClassEst)`` where ``bestStump`` is
        a dict with keys ``'dim'``, ``'thresh'`` and ``'ineq'``, ``minError``
        is the lowest weighted error (a 1x1 matrix once any stump has been
        evaluated) and ``bestClassEst`` is that stump's ``(m, 1)`` predictions.
    """
    # np.asmatrix is used instead of the np.mat alias, which NumPy 2.0 removed.
    dataMatrix = np.asmatrix(dataArr)
    labelMat = np.asmatrix(classLabels).T
    weightsRow = np.asmatrix(D).T  # loop-invariant, so computed once
    m, n = np.shape(dataMatrix)
    numSteps = 10.0
    bestStump = {}
    bestClassEst = np.asmatrix(np.zeros((m, 1)))
    minError = np.inf
    for i in range(n):
        rangeMin = dataMatrix[:, i].min()  # smallest value of feature i
        rangeMax = dataMatrix[:, i].max()  # largest value of feature i
        stepSize = (rangeMax - rangeMin) / numSteps
        # j runs from -1 to 10, so the first threshold lies below the minimum.
        for j in range(-1, int(numSteps) + 1):
            threshVal = rangeMin + float(j) * stepSize
            for inequal in ['lt', 'gt']:  # lt: less than; gt: greater than
                predictedVals = stumpClassify(dataMatrix, i, threshVal, inequal)
                # Error indicator: 1 for a misclassified sample, 0 otherwise.
                errArr = np.asmatrix(np.ones((m, 1)))
                errArr[predictedVals == labelMat] = 0
                # Weighted error rate: sum of the weights of the mistakes.
                weightedError = weightsRow * errArr
                # Strict comparison keeps the first stump found on ties.
                if weightedError < minError:
                    minError = weightedError
                    bestClassEst = predictedVals.copy()
                    bestStump['dim'] = i
                    bestStump['thresh'] = threshVal
                    bestStump['ineq'] = inequal
    return bestStump, minError, bestClassEst
# Uniform weights for the five toy samples, then fit one stump on them.
# np.asmatrix is used instead of the np.mat alias, which NumPy 2.0 removed.
D = np.asmatrix(np.ones((5, 1)) / 5)
print(D)
buildStump(dataMat, classLabels, D)
[[0.2]
[0.2]
[0.2]
[0.2]
[0.2]]
({'dim': 0, 'thresh': 1.3, 'ineq': 'lt'}, matrix([[0.2]]), array([[-1.],
[ 1.],
[-1.],
[-1.],
[ 1.]]))
完整版AdaBoost算法實現
from math import *
def adaBoostTrainDS(dataArr, classLabels, numIt=40):
    """Train an AdaBoost ensemble of decision stumps.

    Args:
        dataArr: Samples-by-features training data.
        classLabels: Sequence of labels, one per sample.
        numIt: Maximum number of boosting rounds.

    Returns:
        tuple: ``(weakClassArr, aggClassEst)`` where ``weakClassArr`` is the
        list of stump dicts (each extended with its ``'alpha'`` weight) and
        ``aggClassEst`` is the ``(m, 1)`` matrix of accumulated, alpha-weighted
        votes for the training samples.

    Side effects:
        Prints the training error rate of the ensemble after every round.
    """
    import math  # local import: do not rely on the file's star import for log

    weakClassArr = []
    m = np.shape(dataArr)[0]
    # np.asmatrix is used instead of the np.mat alias, which NumPy 2.0 removed.
    labelMat = np.asmatrix(classLabels).T  # loop-invariant label column
    D = np.asmatrix(np.ones((m, 1)) / m)  # start from uniform sample weights
    aggClassEst = np.asmatrix(np.zeros((m, 1)))
    for _ in range(numIt):
        # Best single stump under the current sample weights.
        bestStump, error, classEst = buildStump(dataArr, classLabels, D)
        # buildStump hands back a 1x1 matrix; reduce it to a plain float so
        # the scalar maths below does not depend on implicit array conversion.
        error = np.asarray(error, dtype=float).item()
        # The max() guard avoids dividing by zero when the stump is perfect.
        alpha = float(0.5 * math.log((1.0 - error) / max(error, 1e-16)))
        bestStump['alpha'] = alpha
        weakClassArr.append(bestStump)
        # Re-weight: misclassified samples gain weight, correct ones lose it.
        expon = np.multiply(-1 * alpha * labelMat, classEst)
        D = np.multiply(D, np.exp(expon))
        D = D / D.sum()
        # Running weighted vote of all stumps for every sample.
        aggClassEst += alpha * classEst
        # Share of samples whose voted sign disagrees with the label.
        errorRate = (np.sign(aggClassEst) != labelMat).sum() / m
        print('total error:\n', errorRate)
        # Stop early once the ensemble classifies the training set perfectly.
        if errorRate == 0.0:
            break
    return weakClassArr, aggClassEst
# Train on the toy data for at most 9 boosting rounds. The bare name on the
# last line displays the learned stumps when run interactively.
classifierArray,aggClassEst = adaBoostTrainDS(dataMat,classLabels,9)
classifierArray
total error:
0.2
total error:
0.2
total error:
0.0
[{'dim': 0, 'thresh': 1.3, 'ineq': 'lt', 'alpha': 0.6931471805599453},
{'dim': 1, 'thresh': 1.0, 'ineq': 'lt', 'alpha': 0.9729550745276565},
{'dim': 0, 'thresh': 0.9, 'ineq': 'lt', 'alpha': 0.8958797346140273}]
測試算法
def adaClassify(datToclass, classifierArr):
    """Classify samples with a trained list of weighted decision stumps.

    Args:
        datToclass: One sample or samples-by-features data convertible to a
            matrix.
        classifierArr: List of stump dicts with keys ``'dim'``, ``'thresh'``,
            ``'ineq'`` and ``'alpha'``.

    Returns:
        numpy.matrix: ``(m, 1)`` matrix holding the sign of the alpha-weighted
        vote for each sample.
    """
    # np.asmatrix is used instead of the np.mat alias, which NumPy 2.0 removed.
    dataMatrix = np.asmatrix(datToclass)
    m = np.shape(dataMatrix)[0]
    aggClassEst = np.asmatrix(np.zeros((m, 1)))
    # Iterate over the classifiers that were passed in, not over a
    # module-level variable, so any trained ensemble can be used.
    for classifier in classifierArr:
        classEst = stumpClassify(
            dataMatrix,
            classifier['dim'],
            classifier['thresh'],
            classifier['ineq'],
        )  # each stump votes 1 or -1
        aggClassEst += classifier['alpha'] * classEst
    return np.sign(aggClassEst)
# Test: classify the single point (0, 0) with the ensemble trained on the toy data.
adaClassify([0,0],classifierArray)
matrix([[-1.]])
在馬疝病數據集應用AdaBoost分類器
def loadDataSet(fileName):
    """Load a tab-separated data file whose last column is the label.

    The column count is taken from the first non-blank line. That line is
    also loaded as data, so no sample is dropped. Blank lines are skipped.

    Args:
        fileName: Path of the text file to read.

    Returns:
        tuple: ``(dataMat, labelMat)`` where ``dataMat`` is a list of rows,
        each a list of floats for every column but the last, and ``labelMat``
        is the list of last-column values as floats.

    Raises:
        ValueError: If a field cannot be converted to ``float``.
        IndexError: If a row has fewer feature columns than the first row.
    """
    dataMat = []
    labelMat = []
    numFeature = None
    # The context manager closes the file even if parsing fails.
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                continue
            curLine = stripped.split('\t')
            if numFeature is None:
                numFeature = len(curLine)
            dataMat.append([float(curLine[i]) for i in range(numFeature - 1)])
            labelMat.append(float(curLine[-1]))
    return dataMat, labelMat
# Load the training and test files, then train for at most 10 boosting rounds.
# The bare name on the last line displays the learned stumps when run
# interactively.
traindataMat,trainlabelMat = loadDataSet('../../Reference Code/Ch07/horseColicTraining2.txt')
testdataMat,testlabelMat = loadDataSet('../../Reference Code/Ch07/horseColicTest2.txt')
classifierArray,aggClassEst = adaBoostTrainDS(traindataMat,trainlabelMat,10)
classifierArray
total error:
0.28523489932885904
total error:
0.28523489932885904
total error:
0.2483221476510067
total error:
0.2483221476510067
total error:
0.2483221476510067
total error:
0.24161073825503357
total error:
0.24161073825503357
total error:
0.2214765100671141
total error:
0.2483221476510067
total error:
0.2214765100671141
[{'dim': 9, 'thresh': 3.0, 'ineq': 'gt', 'alpha': 0.4593204546095544},
{'dim': 17, 'thresh': 52.5, 'ineq': 'gt', 'alpha': 0.31654488263333286},
{'dim': 3,
'thresh': 55.199999999999996,
'ineq': 'gt',
'alpha': 0.28402835050611847},
{'dim': 18,
'thresh': 62.300000000000004,
'ineq': 'lt',
'alpha': 0.23222873860913737},
{'dim': 10, 'thresh': 0.0, 'ineq': 'lt', 'alpha': 0.19836267426245105},
{'dim': 5, 'thresh': 2.0, 'ineq': 'gt', 'alpha': 0.18642416210017293},
{'dim': 12, 'thresh': 1.2, 'ineq': 'lt', 'alpha': 0.1496988869138094},
{'dim': 7, 'thresh': 1.2, 'ineq': 'gt', 'alpha': 0.15848275395378547},
{'dim': 5, 'thresh': 0.0, 'ineq': 'lt', 'alpha': 0.1370746524177519},
{'dim': 0, 'thresh': 1.0, 'ineq': 'lt', 'alpha': 0.12365372615766472}]
# Evaluate the trained ensemble on the test set.
prediction = adaClassify(testdataMat,classifierArray)
errArr = np.ones((len(prediction),1))
# Count the predictions that differ from the labels.
# np.asmatrix is used instead of the np.mat alias, which NumPy 2.0 removed.
errNum = errArr[prediction!=np.asmatrix(testlabelMat).T].sum()
errRate= errNum/len(prediction)
print('錯(cuò)誤個(gè)數(shù):%d'%errNum)
print('錯(cuò)誤率:%.2f'%errRate)
錯(cuò)誤個(gè)數(shù):15
錯(cuò)誤率:0.23
ROC曲線
def plotROC(predStrengths, classLabels):
    """Plot the ROC curve for a set of prediction scores and print its AUC.

    Samples are visited from the highest score to the lowest. A positive
    sample moves the curve up by one true-positive step; any other sample
    moves it right by one false-positive step.

    Args:
        predStrengths: Prediction scores, one per sample. Any array-like that
            flattens to ``m`` values is accepted (for example a 1 x m matrix).
        classLabels: Sequence of ``m`` labels; ``1.0`` marks a positive
            sample and every other value counts as negative.

    Raises:
        ZeroDivisionError: If the labels contain no positive or no negative
            sample.

    Side effects:
        Shows a matplotlib figure and prints the area under the curve.
    """
    import matplotlib.pyplot as plt

    labels = np.asarray(classLabels).ravel()
    # Flattening lets row vectors, column vectors and 1-D arrays all work.
    scores = np.asarray(predStrengths).ravel()

    numPosClass = int(np.sum(labels == 1.0))  # number of positive samples
    yStep = 1 / float(numPosClass)  # true-positive-rate step
    xStep = 1 / float(len(labels) - numPosClass)  # false-positive-rate step

    cur = (0., 0.)
    ySum = 0.
    plt.figure()
    ax = plt.subplot(111)
    # argsort is ascending; reverse it to walk from the highest score down.
    for index in scores.argsort()[::-1]:
        if labels[index] == 1.0:
            delX, delY = 0, yStep
        else:
            delX, delY = xStep, 0
            # Each step to the right adds a strip of the current height.
            ySum += cur[1]
        ax.plot([cur[0], cur[0] + delX], [cur[1], cur[1] + delY], c='b')
        cur = (cur[0] + delX, cur[1] + delY)
    ax.plot([0, 1], [0, 1], 'b--')  # diagonal reference line
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    ax.axis([0, 1, 0, 1])
    plt.show()
    print('the Area Under the Curve is :', ySum * xStep)
# Reload the training file and retrain for at most 10 rounds; aggClassEst
# holds the accumulated votes that are passed to plotROC afterwards.
traindataMat,trainlabelMat = loadDataSet('../../Reference Code/Ch07/horseColicTraining2.txt')
classifierArray,aggClassEst = adaBoostTrainDS(traindataMat,trainlabelMat,10)
total error:
0.28523489932885904
total error:
0.28523489932885904
total error:
0.2483221476510067
total error:
0.2483221476510067
total error:
0.2483221476510067
total error:
0.24161073825503357
total error:
0.24161073825503357
total error:
0.2214765100671141
total error:
0.2483221476510067
total error:
0.2214765100671141
plotROC(aggClassEst.T,trainlabelMat) # aggClassEst.T transposes the votes into a single row
output_21_0.png
the Area Under the Curve is : 0.8538389513108627