KNN 算法优缺点:
优点:精度高,对异常值不敏感
缺点:计算复杂度高,空间复杂度高
使用数据范围:数值型和标称型
有标签的分类算法:即输入一个无标签的数据系列,与有标签的现有数据属性进行对比,算法提取样本集中特征最相似的K个分类标签,最后选择K个相似数据中出现次数最多的分类。
import numpy as np
import matplotlib.pyplot as plt
# Build the toy training set.
def create_data():
    """Return (x_train, y_train): a (9, 2) point array and its 9 labels.

    The first five points (label 'a') cluster near (1, 1); the last four
    (label 'b') cluster near (3, 2.5).
    """
    x_train = np.array([
        [1, 1.1],
        [1.3, 0.8],
        [1.4, 1.2],
        [1.1, 0.9],
        [0.8, 1.5],
        [2.5, 2],
        [3.4, 2.5],
        [3.7, 2.5],
        [2, 3],
    ])
    y_train = np.array(['a', 'a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'])
    return x_train, y_train
# Query point to classify (module-level; read by calculate_dis and plot_).
x_test = np.array([2,2])
# Rank training points by distance to the query point.
def calculate_dis(x_train, k=3, test_point=None):
    """Rank training points by Euclidean distance to a query point.

    Parameters
    ----------
    x_train : (n, 2) array of training points.
    k : number of nearest neighbours to keep (default 3).
    test_point : query point; defaults to the module-level ``x_test``
        so existing callers keep their behaviour.

    Returns
    -------
    (order, nearest_k) : ``order`` is the full argsort of the distances
    (training-point indices from closest to farthest); ``nearest_k`` is
    its first ``k`` entries.

    Note: the original reused the name ``dis`` for both the distances and
    the argsort *indices*; the two are now kept distinct for clarity.
    """
    if test_point is None:
        test_point = x_test  # module-level query point (backward compatible)
    # Euclidean distance from every training point to the query.
    dist = np.sqrt(((x_train - test_point) ** 2).sum(axis=1))
    order = dist.argsort()  # indices sorted from nearest to farthest
    return order, order[:k]
# Majority vote among the k nearest neighbours.
def pre_result(small_k, y_train):
    """Return the most common label among the neighbours indexed by ``small_k``.

    Parameters
    ----------
    small_k : indices of the k nearest training points.
    y_train : sequence of labels, indexable by those indices.

    Fixes the original implementation, which returned the first key
    inserted into the dict (i.e. the single nearest neighbour's class)
    instead of the majority class among the k neighbours.
    """
    counts = {}
    for idx in small_k:
        label = y_train[idx]
        counts[label] = counts.get(label, 0) + 1
    # max() returns the first key reaching the highest count, so ties
    # resolve to the label seen earliest (the closest neighbour).
    return max(counts, key=counts.get)
# Select the training points belonging to one class.
def to_array(cla):
    """Return the rows of the training set whose label equals ``cla``.

    Uses a vectorized boolean mask instead of the original element-wise
    Python loop; the result also keeps shape (m, 2) even when no row
    matches (the loop version degraded to shape (0,)).
    """
    x_train, y_train = create_data()
    return x_train[y_train == cla]
# Visualise the training clusters, the query point and its k neighbours.
def plot_(x_train, pre, small_k):
    """Scatter-plot both training classes, the query point coloured like
    its predicted class, and a line from the query point to each of its
    k nearest neighbours.

    Parameters
    ----------
    x_train : (n, 2) training points (used to draw the neighbour links).
    pre : predicted label for the query point ('a' or 'b').
    small_k : indices of the k nearest training points.

    Reads the module-level ``x_test`` query point.
    """
    x_train_a = to_array('a')
    x_train_b = to_array('b')
    plt.scatter(x_train_a[:, 0], x_train_a[:, 1], c='b', marker='o', label='train_class_a')
    plt.scatter(x_train_b[:, 0], x_train_b[:, 1], c='r', marker='o', label='train_class_b')
    # Colour the query point like the class it was assigned to.
    if pre == 'a':
        test_class = 'b'
    elif pre == 'b':
        test_class = 'r'
    else:
        test_class = 'g'  # fallback: the original raised NameError for unknown labels
    plt.scatter(x_test[0], x_test[1], c=test_class, marker='*', label='test_class')
    # Link the query point to each of its k nearest neighbours.
    # (A leftover debug print of the coordinates was removed here.)
    for i in small_k:
        plt.plot([x_test[0], x_train[i, 0]], [x_test[1], x_train[i, 1]], c='c')
    plt.legend(loc='best')
    plt.show()
def main():
    """Run the toy KNN pipeline: build data, rank neighbours, vote, plot."""
    features, labels = create_data()
    _, nearest = calculate_dis(features)
    prediction = pre_result(nearest, labels)
    plot_(features, prediction, nearest)


if __name__ == '__main__':
    main()
画图结果展示:
以下是完整的代码实现过程
''' KNN 算法
优点:精度高,对异常值不敏感,无数据输入假定
缺点:计算复杂度高,空间复杂度高
使用数据范围:数值型和标称型
有标签的分类算法:即输入一个无标签的数据系列,与有标签的现有数据属性进行对比,算法提取样本集中特征最相似的K个分类标签,最后选择K个相似数据中出现次数最多的分类。
'''
import numpy as np
import operator
import os
def create_dataSet():
    """Return a tiny 4-point sample set and its class labels ('A'/'B')."""
    labels = ['A', 'A', 'B', 'B']
    group = np.array([
        [1, 1.1],
        [1, 1],
        [0, 0],
        [0, 0.1],
    ])
    return group, labels
def classify0(in_x, data_set, labels, k):
    """Classify ``in_x`` by majority vote of its k nearest neighbours.

    Parameters
    ----------
    in_x : query feature vector.
    data_set : (n, d) array of training vectors.
    labels : length-n sequence of training labels.
    k : number of neighbours to vote.

    Returns the winning label; ties resolve to the label whose neighbour
    appears first in distance order (same tie-breaking as a stable sort
    on the vote counts).
    """
    n_samples = data_set.shape[0]
    deltas = np.tile(in_x, (n_samples, 1)) - data_set
    distances = np.sqrt((deltas ** 2).sum(axis=1))
    # Indices of the k closest training vectors.
    nearest = distances.argsort()[:k]
    votes = {}
    for idx in nearest:
        label = labels[idx]
        votes[label] = votes.get(label, 0) + 1
    return max(votes, key=votes.get)
# Parse the dating data file into a feature matrix and a label list.
def file2matrix(filename):
    """Read ``filename`` (relative to the hard-coded Ch02 data directory)
    and return (features, labels).

    features : (n, 3) float array, one row per tab-separated line.
    labels   : list of ints — didntLike=1, smallDoses=2, largeDoses=3.

    Fixes the original mapping, which assigned didntLike=2 and
    smallDoses=1, contradicting the 1/2/3 ordering that
    classify_preson's result list expects.
    """
    file_path = r'C:\Users\Administrator\Desktop\ML\machinelearninginaction-master\machinelearninginaction-master\Ch02'
    label_map = {'didntLike': 1, 'smallDoses': 2, 'largeDoses': 3}
    # 'with' guarantees the handle is closed (the original leaked it).
    with open(file_path + filename) as fr:
        ar_lines = fr.readlines()
    return_mat = np.zeros((len(ar_lines), 3))
    class_label = []
    for index, line in enumerate(ar_lines):
        fields = line.strip().split('\t')
        return_mat[index, :] = fields[0:3]
        tag = fields[-1]
        if tag in label_map:
            class_label.append(label_map[tag])
        else:
            # Numeric-labelled variants (e.g. datingTestSet2.txt) store the
            # class as an integer already.
            class_label.append(int(tag))
    return return_mat, class_label
# Min-max feature scaling.
def auto_norm(data_set):
    """Scale each column of ``data_set`` linearly into [0, 1].

    Returns (normalized_data, ranges, min_vals) so callers can apply the
    identical transform to new samples.
    """
    col_min = data_set.min(0)  # per-column minimum (compares along rows)
    col_max = data_set.max(0)
    ranges = col_max - col_min
    # Broadcasting applies the per-column shift/scale to every row,
    # replacing the original np.tile copies.
    normalized = (data_set - col_min) / ranges
    return normalized, ranges, col_min
# Evaluate the classifier on a hold-out slice of the dating data.
def dating_class_test():
    """Classify the first 10% of the dating set against the remaining 90%
    and print the per-sample results and overall error rate.

    Fixes the original, which compared each *normalised* test vector
    against the *raw* (unnormalised) training rows, making the distance
    comparison meaningless.
    """
    ho_ratio = 0.1  # fraction of rows held out as the test slice
    dating_data_mat, dating_labels = file2matrix('/datingTestSet.txt')
    norm_mat, ranges, min_val = auto_norm(dating_data_mat)
    m = norm_mat.shape[0]
    num_test_vect = int(m * ho_ratio)
    error_count = 0.0
    for i in range(num_test_vect):
        # Train on the normalised rows beyond the hold-out slice.
        classify_result = classify0(norm_mat[i, :], norm_mat[num_test_vect:, :],
                                    dating_labels[num_test_vect:], 3)
        print("the classifier come back with: %d, the real answer is : %d"
              % (int(classify_result), int(dating_labels[i])))
        if classify_result != dating_labels[i]:
            error_count += 1
    print("the total error rate is: %.2f%%" % (error_count * 100 / float(num_test_vect)))
# Interactive prediction for a single person.
def classify_preson():
    """Prompt for three features and print the predicted liking level.

    Fixes the original, which passed the *raw* input vector to classify0
    while the training matrix was normalised; the query is now scaled with
    the same min/range as the training data. Also corrects the
    'does' -> 'doses' typos and the flier-miles prompt text.
    """
    result_list = ['not at all', 'in small doses', 'in large doses']
    # NOTE(review): feature order must match the data file's column order
    # (the book's set is miles, game-time, ice cream) — confirm against
    # the actual datingTestSet.txt columns.
    percent_tats = float(input("percentage of time spent playing video games:"))
    ff_miles = float(input("frequent flier miles earned per year:"))
    ice_cream = float(input("liters of ice cream consumed per year:"))
    dating_mat, dating_labels = file2matrix('/datingTestSet.txt')
    norm_mat, ranges, min_val = auto_norm(dating_mat)
    in_x = np.array((percent_tats, ff_miles, ice_cream))
    # Scale the query exactly like the training data before classifying.
    classifier_result = classify0((in_x - min_val) / ranges, norm_mat, dating_labels, 3)
    print("you will probably like this person:", result_list[classifier_result - 1])
# Load one text digit image into a flat 0/1 vector.
def img2vector(filename, base_dir=None):
    """Read a digit file of '0'/'1' characters and return a flat int array.

    Parameters
    ----------
    filename : path fragment appended to ``base_dir`` (existing callers
        pass '/name.txt').
    base_dir : directory containing the file; defaults to the hard-coded
        trainingDigits folder for backward compatibility. The original
        *always* read from trainingDigits, which silently broke lookups of
        test-set files — pass the correct directory explicitly instead.
    """
    if base_dir is None:
        base_dir = r'C:\Users\Administrator\Desktop\ML\machinelearninginaction-master\machinelearninginaction-master\Ch02\digits\trainingDigits'
    # 'with' guarantees the handle is closed (the original leaked it).
    with open(base_dir + filename) as fr:
        return_vect = [int(ch) for line in fr for ch in line.strip()]
    return np.array(return_vect)
def _read_digit_file(path):
    """Read one 32x32 text digit from its full path into a flat 0/1 array
    (private helper for handwriteing_classtest)."""
    with open(path) as fr:
        return np.array([int(ch) for line in fr for ch in line.strip()])


# Train on trainingDigits, evaluate on testDigits, print the error rate.
def handwriteing_classtest():
    """KNN handwriting test over the 32x32 digit files.

    Fixes the original, which loaded *test* files through img2vector's
    hard-coded trainingDigits directory; files are now read from the
    directory they were actually listed in.
    """
    filepath = r'C:\Users\Administrator\Desktop\ML\machinelearninginaction-master\machinelearninginaction-master\Ch02\digits'
    hw_labels = []
    training_filelist = os.listdir(filepath + '/trainingDigits')
    m = len(training_filelist)
    training_mat = np.zeros((m, 1024))
    for i, filename_str in enumerate(training_filelist):
        # Filenames look like '<digit>_<sample>.txt'; the digit is the label.
        hw_labels.append(int(filename_str.split('.')[0].split('_')[0]))
        training_mat[i, :] = _read_digit_file(filepath + '/trainingDigits/' + filename_str)
    test_filelist = os.listdir(filepath + '/testDigits')
    m_t = len(test_filelist)
    error_count = 0.0
    for filename_str in test_filelist:
        class_num = int(filename_str.split('.')[0].split('_')[0])
        vector_undertest = _read_digit_file(filepath + '/testDigits/' + filename_str)
        classifierresult = classify0(vector_undertest, training_mat, hw_labels, 3)
        print("the classifier came back with: %d, the real answer is: %d" % (classifierresult, class_num))
        if classifierresult != class_num:
            error_count += 1.0
    print("\nthe total number of errors is: %d" % error_count)
    print("\nthe total error rate is: %f" % (error_count / float(m_t)))