数据背景:
如何在约会网站上寻找适合自己的约会对象?网站会推荐不同的人,但是推荐的人不总是恰当的。
所以将推荐的人标签为:不喜欢、魅力一般、极具魅力
三个特征:每年获得的飞行常客里程数、玩视频游戏所消耗时间百分比、每周消费的冰淇淋公升数
(这三个特征数据可能跟网站数据有关)三个特征加上一个标签,从而构成整个数据为四维数据。
数据链接:
链接: https://pan.baidu.com/s/1drSoyB4wnhDzmJS-dbu-iw 提取码: 6tcj 复制这段内容后打开百度网盘手机App,操作更方便哦
kNN算法简介:
原理:监督学习+分类算法+距离+投票原则(具体步骤百度一下,很简单)
优缺点:优点是精度高、对异常值不敏感;缺点是计算复杂度和空间复杂度高
使用数据范围:数值型和标称型
kNN算法主要模块:
1、数据解析模块(读数据,转格式)
2、数据预处理,标准化模块
3、分类器模块(也是预测模块,knn算法不需要进行模型训练)
4、分类器好坏判断
5、前期一些描述性分析模块(如果需要)
代码:
from numpy import *
import operator
from os import listdir
import pandas as pd
from sklearn.preprocessing import LabelEncoder #变量编码
模块1—数据读入
def file2matrix(filename):
    """Load a tab-separated data file into a feature matrix and label codes.

    Every non-blank line must contain three numeric columns followed by a
    class label in its last column.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(returnMat, classlabelvector)``. ``returnMat`` is a float
        array of shape (rows, 3) built from the first three columns.
        ``classlabelvector`` is an integer array holding, for each row, the
        index of its label among the sorted distinct labels (0, 1, 2, ...).

    Raises:
        ValueError: If a line has fewer than three feature columns or a
            feature cannot be parsed as a float.
    """
    feature_rows = []
    raw_labels = []
    # The context manager closes the file even when parsing raises.
    with open(filename) as handle:
        for line_number, line in enumerate(handle, start=1):
            line = line.strip('\n')
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            fields = line.split('\t')
            values = [float(value) for value in fields[0:3]]
            if len(values) != 3:
                raise ValueError(
                    "line %d: expected 3 feature columns, got %d"
                    % (line_number, len(values))
                )
            feature_rows.append(values)
            raw_labels.append(fields[-1])
    # reshape keeps the (rows, 3) shape even when the file has no data rows.
    returnMat = array(feature_rows, dtype=float).reshape(-1, 3)
    # return_inverse maps each label to its position in the sorted set of
    # distinct labels, i.e. the same 0-based coding a label encoder yields.
    classlabelvector = unique(raw_labels, return_inverse=True)[1]
    return returnMat, classlabelvector
模块2—分类器
def classify0(inX, dataSet, labels, k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Distances are Euclidean, computed between ``inX`` and every row of
    ``dataSet``.

    Args:
        inX: Feature vector of the sample to classify; its length must equal
            the number of columns of ``dataSet``.
        dataSet: 2-D array of training samples, one sample per row.
        labels: Class labels, one per row of ``dataSet``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label that occurs most often among the k nearest neighbours.
        When several labels tie, the one met first while walking the
        neighbours from nearest to farthest wins.

    Raises:
        ValueError: If ``k`` is smaller than 1 or ``dataSet`` has no rows.
    """
    from collections import Counter

    if k < 1:
        raise ValueError("k must be at least 1")
    dataSet = asarray(dataSet, dtype=float)
    if dataSet.shape[0] == 0:
        raise ValueError("dataSet must contain at least one sample")
    labels = asarray(labels)

    # Broadcasting subtracts inX from every row; no tiled copy is needed.
    diffMat = dataSet - asarray(inX, dtype=float)
    # ** is exponentiation; ^ is bitwise XOR and is undefined for floats.
    sqDistances = (diffMat ** 2).sum(axis=1)
    distances = sqrt(sqDistances)

    nearest = distances.argsort()[:k]  # indices of the k closest samples
    votes = Counter(labels[nearest].tolist())
    return votes.most_common(1)[0][0]
模块3-----标准化
def autoNorm(dataSet):
    """Rescale every column of ``dataSet`` with a shifted min-max transform.

    Each value is mapped to
    ``(value - column_min + 1) / (column_max - column_min + 1)``.
    The ``+ 1`` in the denominator keeps a constant column from causing a
    division by zero. A new sample must be transformed with the same
    formula, ``(sample - minVals + 1) / ranges``, to be comparable with the
    returned matrix.

    Args:
        dataSet: 2-D numeric array, one sample per row.

    Returns:
        A tuple ``(normDataSet, ranges, minVals)``: the rescaled matrix, the
        per-column denominators ``max - min + 1``, and the per-column minima.
    """
    minVals = dataSet.min(0)  # column-wise minimum
    maxVals = dataSet.max(0)  # column-wise maximum
    ranges = maxVals - minVals + 1
    # Broadcasting applies the per-column vectors to every row.
    normDataSet = (dataSet - minVals + 1) / ranges
    return normDataSet, ranges, minVals
模块4-----参数选择与分类器好坏判断
def datingClassTest(k, ratio, filename='datingTestSet2.txt'):
    """Estimate the kNN error rate on a random hold-out split of the data.

    The data file is loaded and normalised, a fraction ``ratio`` of the rows
    is drawn at random (fixed seed, so the split is reproducible) as the
    test set, and every test row is classified against the remaining rows.

    Args:
        k: Number of neighbours passed to ``classify0``.
        ratio: Fraction of the rows held out for testing, e.g. 0.3.
        filename: Data file handed to ``file2matrix``.

    Returns:
        The fraction of held-out rows that were misclassified, as a float.

    Raises:
        ValueError: If ``ratio`` leaves no test rows or no training rows.
    """
    # `from numpy import *` binds `random` to numpy.random, whose `sample`
    # has a different signature; import the standard-library module locally.
    import random as stdlib_random

    datingDataMat, datingLabels = file2matrix(filename)  # load data set from file
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    n = int(m * ratio)
    if not 0 < n < m:
        raise ValueError(
            "ratio must leave at least one test row and one training row"
        )

    # A private generator with a fixed seed gives a reproducible split
    # without touching the global random state.
    rng = stdlib_random.Random(1314)
    numTestVecs = rng.sample(range(m), n)
    numTrainVecs = setdiff1d(arange(m), numTestVecs)

    errorCount = 0
    for i in numTestVecs:
        classifierResult = classify0(
            normMat[i, :],
            normMat[numTrainVecs, :],
            datingLabels[numTrainVecs],
            k=k,
        )
        if classifierResult != datingLabels[i]:
            errorCount += 1
    return errorCount / float(n)
如何选择较好的k?------用不同的k调用datingClassTest并比较错误率,可以看到k=10较好
模块5----应用系统中使用:
def classifyperson(filename='datingTestSet2.txt', k=10):
    """Ask for one person's three features and predict how well they match.

    The answers are read from standard input, scaled with the same shifted
    min-max transform that ``autoNorm`` applies to the training data, and
    classified with ``classify0``.

    Args:
        filename: Training data file handed to ``file2matrix``.
        k: Number of neighbours passed to ``classify0``.

    Returns:
        One of the strings in ``resultList`` describing the predicted class.

    Raises:
        ValueError: If an answer cannot be parsed as a float.
    """
    # NOTE(review): indexing below assumes the 0-based label codes sort in
    # the order didn't like / small doses / large doses (e.g. file labels
    # '1', '2', '3') -- confirm against the data file.
    resultList = ["didnt like", " in small doses", "in large doses"]
    percentTags = float(input("percentage of time spent playing video games?"))
    ffmiles = float(input("frequent flier miles earned per year?"))
    icecream = float(input("liters of ice cream consumed per year?"))
    datingDataMat, datingLabels = file2matrix(filename)
    normMat, ranges, minVals = autoNorm(datingDataMat)
    # Column order must match the data file: miles, game time, ice cream.
    inarr = array([ffmiles, percentTags, icecream])
    # Apply the same "+ 1" shift autoNorm uses, otherwise the query sits on
    # a different scale than the training rows.
    normalized = (inarr - minVals + 1) / ranges
    classifierResult = classify0(normalized, normMat, datingLabels, k)
    # Label codes start at 0, so they index resultList directly.
    return resultList[classifierResult]
## Descriptive analysis: scatter plot of video-game time (column 1) against
## ice-cream consumption (column 2), one colour and marker size per class.
from numpy import *
import matplotlib
import matplotlib.pyplot as plt
fig = plt.figure()
ax = fig.add_subplot(111)
datingDataMat, datingLabels = file2matrix('datingTestSet.txt')
# Boolean masks select the rows of each encoded class (0, 1, 2).
# NOTE(review): the legend assumes codes 0/1/2 mean "did not like" /
# "small doses" / "large doses"; verify this against how the labels in
# datingTestSet.txt are encoded.
xcord1 = datingDataMat[datingLabels == 0, 1]
ycord1 = datingDataMat[datingLabels == 0, 2]
xcord2 = datingDataMat[datingLabels == 1, 1]
ycord2 = datingDataMat[datingLabels == 1, 2]
xcord3 = datingDataMat[datingLabels == 2, 1]
ycord3 = datingDataMat[datingLabels == 2, 2]
type1 = ax.scatter(xcord1, ycord1, s=20, c='red')
type2 = ax.scatter(xcord2, ycord2, s=30, c='green')
type3 = ax.scatter(xcord3, ycord3, s=50, c='blue')
ax.legend([type1, type2, type3], ["Did Not Like", "Liked in Small Doses", "Liked in Large Doses"], loc=2)
ax.axis([-2, 25, -0.2, 2.0])
plt.xlabel('Percentage of Time Spent Playing Video Games')
plt.ylabel('Liters of Ice Cream Consumed Per Week')
plt.show()
# Scatter plot of frequent-flyer miles (column 0) against video-game time
# (column 1), one colour and marker size per class.
from numpy import *
import matplotlib
import matplotlib.pyplot as plt
fig = plt.figure()
ax = fig.add_subplot(111)
datingDataMat, datingLabels = file2matrix('datingTestSet.txt')
# Boolean masks select the rows of each encoded class (0, 1, 2).
# NOTE(review): the legend assumes codes 0/1/2 mean "did not like" /
# "small doses" / "large doses"; verify this against how the labels in
# datingTestSet.txt are encoded.
xcord1 = datingDataMat[datingLabels == 0, 0]
ycord1 = datingDataMat[datingLabels == 0, 1]
xcord2 = datingDataMat[datingLabels == 1, 0]
ycord2 = datingDataMat[datingLabels == 1, 1]
xcord3 = datingDataMat[datingLabels == 2, 0]
ycord3 = datingDataMat[datingLabels == 2, 1]
type1 = ax.scatter(xcord1, ycord1, s=20, c='red')
type2 = ax.scatter(xcord2, ycord2, s=30, c='green')
type3 = ax.scatter(xcord3, ycord3, s=50, c='blue')
ax.legend([type1, type2, type3], ["Did Not Like", "Liked in Small Doses", "Liked in Large Doses"], loc=2)
ax.axis([-5000, 100000, -2, 25])
plt.xlabel('Frequent Flyier Miles Earned Per Year')
plt.ylabel('Percentage of Time Spent Playing Video Games')
plt.show()