KNN概念
物以类聚
1.k——超参数(hyper-parameter)
2.k最好为奇数(no even number , better be odd)
3.k大小有学问:
k太小:outliers 对判断的影响加大
k太大:会"冲淡"周边neighbor(高质量、高权重的数据)对最终判断的影响
# Euclidean Distance
from math import sqrt
def calculate_euclidean_distance(row1, row2):
    """Return the Euclidean distance between the feature parts of two rows.

    The last element of ``row1`` is treated as a class label and is left out
    of the calculation, so only the first ``len(row1) - 1`` positions of both
    rows are compared. ``row2`` may carry a trailing label too, or may be a
    bare feature vector of exactly ``len(row1) - 1`` values.

    Args:
        row1: Sequence of numeric features followed by one label element.
        row2: Sequence holding at least ``len(row1) - 1`` numeric features.

    Returns:
        The distance as a float; ``0.0`` when ``row1`` has no feature columns.

    Raises:
        IndexError: If ``row2`` has fewer feature values than ``row1``.
    """
    features1 = row1[:-1]
    if len(row2) < len(features1):
        # zip() would silently truncate to the shorter row; keep failing
        # loudly, exactly as index-based access would.
        raise IndexError("row2 has fewer feature values than row1")

    # Running total of the squared per-feature differences.
    squared_sum = 0.0
    for value1, value2 in zip(features1, row2):
        squared_sum += (value1 - value2) ** 2
    return sqrt(squared_sum)
# Build a small dummy dataset.
# Columns: x coordinate, y coordinate, class label.
dataset = [
    [1.80, 1.91, 0],
    [1.85, 2.11, 0],
    [2.31, 2.88, 0],
    [3.54, -3.21, 0],
    [3.66, 3.12, 0],
    [5.52, 2.13, 1],
    [6.32, 1.46, 1],
    [7.35, 2.34, 1],
    [7.78, 3.26, 1],
    [8.43, -0.34, 1],
]

# Use the first point (1.80, 1.91) as the reference and print its distance
# to every row of the dataset in turn (the reference row itself included).
row0 = dataset[0]
for row in dataset:
    distance = calculate_euclidean_distance(row0, row)
    print(distance)
找思路:1.需要一个输入变量k
2.需要排序(选前面k个)
3.使用元组(tuple)存储每条数据及其对应的距离
# Sorting with a lambda key.
multi_d_list = [
    ('f', 1, 6),
    ('c', 3, 4),
    ('d', 4, 5),
    ('b', 2, 3),
    ('a', 5, 2),
    ('e', 6, 1),
]
# sorted() works on any iterable and returns a new, sorted list.
# Here the tuples are ordered by their element at index 0.
print(sorted(multi_d_list, key=lambda item: item[0]))
# Sorting with a lambda key.
multi_d_list = [
    ('f', 1, 6),
    ('c', 3, 4),
    ('d', 4, 5),
    ('b', 2, 3),
    ('a', 5, 2),
    ('e', 6, 1),
]
# sorted() works on any iterable and returns a new, sorted list.
# Here the tuples are ordered by their element at index 1.
print(sorted(multi_d_list, key=lambda item: item[1]))
编写完整代码
# Training data for the full KNN walk-through.
# Columns: x coordinate, y coordinate, class label.
dataset = [
    [1.80, 1.91, 0],
    [3.66, 3.12, 0],
    [1.85, 2.11, 0],
    [3.54, -3.21, 0],
    [2.31, 2.88, 0],
    [5.52, 2.13, 1],
    [6.32, 1.46, 1],
    [7.35, 2.34, 1],
    [7.78, 3.26, 1],
    [8.43, -0.34, 1],
]
def get_our_neighbors(train, test_row, num_of_neighbors):
    """Return the ``num_of_neighbors`` training rows closest to ``test_row``.

    Each training row is paired with its distance to ``test_row`` as computed
    by ``calculate_euclidean_distance(test_row, train_row)``; the pairs are
    sorted by that distance and the nearest rows are returned. The sort is
    stable, so rows at equal distance keep their order from ``train``.

    No row is excluded from the search: if ``test_row`` is itself part of
    ``train`` it takes part with distance 0.

    Args:
        train: Iterable of training rows.
        test_row: The row whose neighbours are wanted.
        num_of_neighbors: The k value. When it exceeds the number of training
            rows, every row is returned; zero or a negative value yields an
            empty list.

    Returns:
        A list of rows taken from ``train`` (the same objects, not copies),
        ordered from nearest to farthest.
    """
    # (row, distance) tuples, one per training row.
    distances = [
        (train_row, calculate_euclidean_distance(test_row, train_row))
        for train_row in train
    ]
    distances.sort(key=lambda pair: pair[1])

    # Clamp at 0 so a negative k cannot turn into a "drop from the end" slice.
    k = max(num_of_neighbors, 0)
    return [train_row for train_row, _ in distances[:k]]
# Query the dataset with its own first row, asking for k=3 neighbours.
neighbors = get_our_neighbors(dataset, dataset[0], 3)

# Show the selected neighbours, one row per line.
for neighbor in neighbors:
    print(neighbor)
def predict_the_class(train, test_row, num_of_neighbors):
    """Predict the class of ``test_row`` by majority vote of its neighbours.

    The neighbours come from ``get_our_neighbors``; the class label is the
    last element of each neighbour row. The most frequent label wins. When
    several labels are equally frequent, the one that appears first in the
    neighbour list is chosen.

    The collected labels and the chosen label are printed as a side effect.

    Args:
        train: Training rows, each ending with its class label.
        test_row: The row to classify.
        num_of_neighbors: The k value, i.e. how many neighbours may vote.

    Returns:
        The winning class label.

    Raises:
        ValueError: If there are no neighbours to vote.
    """
    # Local import keeps this block usable without touching file-level imports.
    from collections import Counter

    neighbors = get_our_neighbors(train, test_row, num_of_neighbors)
    # The label is stored in the last column of every row.
    the_class_values = [row[-1] for row in neighbors]
    print(the_class_values)
    if not the_class_values:
        raise ValueError("cannot predict a class without any neighbours")

    # most_common() orders equal counts by first occurrence, which makes the
    # tie-break deterministic instead of depending on set iteration order.
    prediction = Counter(the_class_values).most_common(1)[0][0]
    print(prediction)
    return prediction
# Classify the first dataset row with k=3, then print the true label
# followed by the predicted label.
prediction = predict_the_class(dataset, dataset[0], 3)
print('Our expectation(the real class) is class 【%d】' % (dataset[0][-1]))
print('Our prediction(the predicted class) is class 【%d】' % (prediction))
def predict_the_class_V2(train, test_row, num_of_neighbors):
    """Return the mean class label of the nearest neighbours of ``test_row``.

    Version 2 of the predictor: instead of a vote, the labels (last element
    of each neighbour row from ``get_our_neighbors``) are averaged. The
    result is therefore a float, not necessarily one of the labels.

    The collected labels and the mean are printed as a side effect.

    Args:
        train: Training rows, each ending with a numeric class label.
        test_row: The row to predict for.
        num_of_neighbors: The k value, i.e. how many neighbours to average.

    Returns:
        The arithmetic mean of the neighbours' labels as a float.
    """
    nearest_rows = get_our_neighbors(train, test_row, num_of_neighbors)
    labels = [nearest[-1] for nearest in nearest_rows]
    print(labels)

    label_total = sum(labels)
    label_count = float(len(labels))
    average_label = label_total / label_count
    print(average_label)
    return average_label
# Run the averaging predictor (V2) on the row at index 6 with k=3 and print
# the true label followed by the predicted one.
prediction = predict_the_class_V2(dataset, dataset[6], 3)
print('Our expectation(the real class) is class 【%d】' % (dataset[6][-1]))
# predict_the_class_V2 returns a float mean of the labels. '%d' truncates
# toward zero (a mean of 0.67 would be shown as class 0), so round to the
# nearest integer label before formatting.
# NOTE(review): rounding assumes integer labels such as the 0/1 used in
# `dataset` - confirm before reusing with other label schemes.
print('Our prediction(the predicted class) is class 【%d】' % (round(prediction)))