from numpy import *
def loadSimpData():
    """Return a tiny hard-coded two-feature data set for experiments.

    Returns:
        A ``(datMat, classLabels)`` pair: a 5x2 numpy matrix of feature
        values and a list with the five matching labels (+1.0 / -1.0).
    """
    samples = [
        [1., 2.1],
        [2., 1.1],
        [1.3, 1.],
        [1.1, 1.1],
        [2., 1.],
    ]
    classLabels = [1.0, 1.0, -1.0, -1.0, 1.0]
    return matrix(samples), classLabels
def stumpClassify(dataMatrix, dimen, threshVal, threshIneq):
    """Classify every row with a single-level decision tree (a stump).

    Args:
        dataMatrix: two-dimensional data, one sample per row.
        dimen: index of the column the threshold is applied to.
        threshVal: threshold value compared against that column.
        threshIneq: ``'lt'`` labels rows with value <= threshVal as -1;
            any other value labels rows with value > threshVal as -1.

    Returns:
        An (m, 1) float array holding +1.0 or -1.0 for each of the m rows.
    """
    # Start with every sample in the +1 class, then flip the selected side.
    retArray = ones((shape(dataMatrix)[0], 1))
    column = dataMatrix[:, dimen]
    negativeSide = (column <= threshVal) if threshIneq == 'lt' else (column > threshVal)
    retArray[negativeSide] = -1.0
    return retArray
def buildStump(dataArr, classLabels, D):
    """Find the decision stump with the lowest weighted error.

    Every column, a grid of thresholds on that column, and both inequality
    directions are tried; each candidate is scored by the weighted sum of
    its misclassified samples.

    Args:
        dataArr: two-dimensional data (anything ``asmatrix`` accepts),
            one sample per row.
        classLabels: sequence with one label per sample.
        D: (m, 1) column vector of sample weights.

    Returns:
        ``(bestStump, minError, bestClasEst)`` where ``bestStump`` is a dict
        with keys ``'dim'``, ``'thresh'`` and ``'ineq'``, ``minError`` is the
        weighted error of that stump (a 1x1 matrix, or ``inf`` when no
        candidate was evaluated), and ``bestClasEst`` holds its predictions.
    """
    # asmatrix replaces the `mat` alias, which NumPy 2.0 removed.
    dataMatrix = asmatrix(dataArr)
    labelMat = asmatrix(classLabels).T
    m, n = shape(dataMatrix)
    numSteps = 10.0
    bestStump = {}
    bestClasEst = asmatrix(zeros((m, 1)))
    minError = inf
    for i in range(n):  # iterate over every column
        rangeMin = dataMatrix[:, i].min()
        rangeMax = dataMatrix[:, i].max()
        stepSize = (rangeMax - rangeMin) / numSteps
        for j in range(-1, int(numSteps) + 1):
            # Thresholds run from one step below rangeMin up to rangeMax;
            # the value does not depend on the inequality, so compute it once.
            threshVal = rangeMin + float(j) * stepSize
            for inequal in ['lt', 'gt']:
                # Predicted classification for this candidate stump.
                predictedVals = stumpClassify(dataMatrix, i, threshVal, inequal)
                errArr = asmatrix(ones((m, 1)))
                errArr[predictedVals == labelMat] = 0
                weightedError = D.T * errArr
                # Extract the scalar explicitly: implicit conversion of a
                # 1x1 matrix to float is deprecated in recent NumPy.
                print("split: dim %d, thresh %.2f, thresh ineqal: %s, the weighted error is %.3f" % (i, threshVal, inequal, asarray(weightedError).item()))
                if weightedError < minError:
                    # Remember the rule with the smallest weighted error.
                    minError = weightedError
                    bestClasEst = predictedVals.copy()
                    bestStump['dim'] = i
                    bestStump['thresh'] = threshVal
                    bestStump['ineq'] = inequal
    return bestStump, minError, bestClasEst
def adaBoostTrainDS(dataArr, classLables, numIt = 40):
    """Train an AdaBoost ensemble of decision stumps.

    Args:
        dataArr: two-dimensional training data, one sample per row.
        classLables: sequence with one label per sample.
        numIt: maximum number of boosting rounds; training stops earlier
            once the aggregate training error reaches zero.

    Returns:
        ``(weakClassArr, aggClassEst)``: the list of stump dicts (each with
        keys ``'dim'``, ``'thresh'``, ``'ineq'`` and ``'alpha'``) and the
        (m, 1) matrix of alpha-weighted aggregate scores on the training set.
    """
    weakClassArr = []
    m = shape(dataArr)[0]
    # asmatrix replaces the `mat` alias, which NumPy 2.0 removed.
    labelMat = asmatrix(classLables).T  # loop-invariant column of labels
    D = asmatrix(ones((m, 1)) / m)  # start with uniform sample weights
    # The aggregate must start at zero (as adaClassify does); starting at one
    # would add a constant +1 bias to every sample's score.
    aggClassEst = asmatrix(zeros((m, 1)))
    for _ in range(numIt):
        # Best stump for the current weight distribution.
        bestStump, error, classEst = buildStump(dataArr, classLables, D)
        print("D:", D.T)
        # Work on a plain float and clamp by hand: with `from numpy import *`
        # a bare max(error, 1e-16) may resolve to numpy.max, whose second
        # argument is an axis rather than a second value.
        err = float(asarray(error).item())
        # alpha = 0.5 * ln((1 - error) / error); the clamp avoids a division
        # by zero when the stump makes no mistakes.
        alpha = float(0.5 * log((1.0 - err) / (err if err > 1e-16 else 1e-16)))
        bestStump['alpha'] = alpha
        weakClassArr.append(bestStump)
        print("classEst:", classEst.T)
        # label * prediction is +1 for a correct sample and -1 for a wrong
        # one, so correct samples are scaled by exp(-alpha) and wrong ones by
        # exp(alpha) before the weights are renormalised.
        expon = multiply(-1 * alpha * labelMat, classEst)
        D = multiply(D, exp(expon))
        D = D / D.sum()
        aggClassEst += alpha * classEst
        print("aggClassEst:", aggClassEst.T)
        aggErrors = multiply(sign(aggClassEst) != labelMat, ones((m, 1)))
        errorRate = aggErrors.sum() / m
        print("total error:", errorRate, "\n")
        if errorRate == 0.0:
            break
    return weakClassArr, aggClassEst
def adaClassify(dataToClass, classifileArr):
    """Classify samples with a trained list of weighted decision stumps.

    Args:
        dataToClass: two-dimensional data (anything ``asmatrix`` accepts),
            one sample per row.
        classifileArr: iterable of stump dicts, each with keys ``'dim'``,
            ``'thresh'``, ``'ineq'`` and ``'alpha'``.

    Returns:
        An (m, 1) matrix with the sign of the alpha-weighted vote per sample.
    """
    # asmatrix replaces the `mat` alias, which NumPy 2.0 removed.
    dataMatrix = asmatrix(dataToClass)
    m = shape(dataMatrix)[0]
    aggClassEst = asmatrix(zeros((m, 1)))
    for stump in classifileArr:
        classEst = stumpClassify(dataMatrix, stump['dim'], stump['thresh'], stump['ineq'])
        aggClassEst += stump['alpha'] * classEst
        # Running aggregate after each weak classifier has voted.
        print(aggClassEst)
    return sign(aggClassEst)
def loadDataSet(filenName, filePath=r'C:\Users\Administrator\Desktop\ML\machinelearninginaction-master\machinelearninginaction-master\Ch07'):
    """Load a tab-separated data file into feature rows and labels.

    The column count is taken from the first non-blank line. On every line
    all columns but the last of that count are parsed as float features and
    the line's last field is parsed as the float label. Blank lines are
    skipped.

    Args:
        filenName: file name appended verbatim to ``filePath`` (no separator
            is inserted, so it should start with one, e.g. ``'/data.txt'``).
        filePath: directory prefix; defaults to the original hard-coded
            location so existing callers keep working.

    Returns:
        ``(dataMat, labelMat)``: a list of feature lists and a list of labels.
    """
    dataMat = []
    labelMat = []
    numFeat = None
    # A single `with` block reads the file once and guarantees it is closed.
    with open(filePath + filenName) as fr:
        for line in fr:
            curLine = line.strip().split('\t')
            if curLine == ['']:
                continue  # blank line (typically at the end of the file)
            if numFeat is None:
                numFeat = len(curLine)
            dataMat.append([float(curLine[i]) for i in range(numFeat - 1)])
            labelMat.append(float(curLine[-1]))
    return dataMat, labelMat
def plotRoc(predStrengths, classLabels):
    """Draw a ROC curve with matplotlib and print the area under it.

    The curve starts at (1, 1) and is walked towards the origin: samples are
    visited in ascending order of predicted strength, stepping down for a
    sample labelled 1.0 and left for any other label.

    Args:
        predStrengths: prediction strengths; only the first row of its
            ``argsort()`` result is used (presumably a 1 x m row matrix --
            confirm against the caller).
        classLabels: sequence of labels indexable by sample position.
    """
    import matplotlib.pyplot as plt

    # Step sizes: one unit per positive on the y axis, per negative on x.
    positiveCount = sum(array(classLabels) == 1.0)
    yStep = 1 / float(positiveCount)
    xStep = 1 / float(len(classLabels) - positiveCount)
    ranking = predStrengths.argsort()

    fig = plt.figure()
    fig.clf()
    ax = plt.subplot(1, 1, 1)

    x, y = 1.0, 1.0
    heightTotal = 0.0
    for sampleIdx in ranking.tolist()[0]:
        if classLabels[sampleIdx] == 1.0:
            stepX, stepY = 0, yStep
        else:
            stepX, stepY = xStep, 0
            # Each horizontal step adds a rectangle of the current height.
            heightTotal += y
        nextX = x - stepX
        nextY = y - stepY
        ax.plot([x, nextX], [y, nextY], c='b')
        x, y = nextX, nextY

    # Diagonal reference line.
    ax.plot([0, 1], [0, 1], 'b--')
    plt.xlabel('False positive rate')
    plt.ylabel('true positive rate')
    plt.title('ROC curve for adaboost horse colic Detection system')
    ax.axis([0, 1, 0, 1])
    plt.show()
    print('the area under the curve is:', heightTotal * xStep)
def main():
    """Train AdaBoost on the horse-colic training set and plot its ROC curve."""
    # The functions live in this file, so they are called directly rather
    # than through an `AdaBoost` module that is never imported here. The
    # unpacked names now match the ones used on the following lines.
    dataArr, labelArr = loadDataSet('/horseColicTraining2.txt')
    _classifierArray, aggClassEst = adaBoostTrainDS(dataArr, labelArr, 10)
    plotRoc(aggClassEst.T, labelArr)


if __name__ == '__main__':
    main()
# Related articles: