文章目录
总结一下李航老师《统计学习方法》中第三章-K近邻法【K 近邻算法(K-Nearest Neighbor, KNN)】的知识与python代码实现
1.基础理论
1.1 基本定义
1.2 距离度量
1.3 K值的选择
1.4 分类决策规则
1.5 k近邻法的实现——kd树
2. python代码实现
一种监督学习的算法:K近邻
机器学习K近邻算法——python详细代码解析(sklearn)
2.1 k近邻算法的实现、模型评估及分类报告
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix,mean_squared_error
import matplotlib.pyplot as plt
# Load the iris dataset (150 samples, 4 features, 3 classes).
iris = load_iris()
X = iris.data
y = iris.target

# Split the data: test_size=0.4 means a 60% train / 40% test split.
# (NOTE: an earlier comment claimed 80/20 — the code has always used 0.4.)
# random_state fixes the shuffle for reproducible results.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# Standardize features. Fit the scaler on the training set only and reuse
# its statistics on the test set to avoid data leakage.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Build the KNN classifier with K=3.
# metric may be 'minkowski', 'manhattan', 'euclidean' or 'chebyshev';
# with metric='euclidean' the p parameter is ignored.
# algorithm='kd_tree' selects the kd-tree neighbor search described in §1.5.
knn = KNeighborsClassifier(n_neighbors=3, weights='uniform',
                           algorithm='kd_tree', leaf_size=30,
                           metric='euclidean', p=2,
                           metric_params=None, n_jobs=1)

# Train the model.
knn.fit(X_train, y_train)

# Predict on the held-out test set.
y_pred = knn.predict(X_test)

# Evaluate accuracy.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Configure matplotlib for Chinese labels BEFORE drawing, so every artist
# created afterwards picks the font up.
plt.rcParams['font.sans-serif'] = ['SimHei']   # font that contains CJK glyphs
plt.rcParams['axes.unicode_minus'] = False     # render minus signs correctly

# Visualize the confusion matrix.
confusion = confusion_matrix(y_test, y_pred)
plt.matshow(confusion)
# Annotate every cell with its count.
for i in range(confusion.shape[0]):
    for j in range(confusion.shape[1]):
        plt.text(x=j, y=i, s=str(confusion[i, j]), va='center', ha='center', color='white')
plt.colorbar()
plt.ylabel('实际类型')
plt.xlabel('预测类型')
plt.title('混淆矩阵')
plt.show()

# Per-class precision / recall / F1 report.
print(classification_report(y_test, y_pred))
2.2 模型优化——选择K值
## Search for the best K over the range [1, 20].
k_range = range(1, 21)
accuracies = []
mse = []
scores = []

# Train a classifier per K and record accuracy / MSE on the test set.
# NOTE: knn.score() on a classifier returns mean accuracy, so `scores`
# duplicates `accuracies`; both are kept to match the original output.
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    accuracies.append(accuracy_score(y_test, y_pred))
    mse.append(mean_squared_error(y_test, y_pred))
    score = knn.score(X_test, y_test)
    scores.append(score)

print('最优得分:', max(scores))
index_max = np.argmax(scores)  # index of the first K with the best score
print(f'最优K值: {k_range[index_max]}')  # graphical comparison of K values below

# Dual-axis chart: accuracy on the left axis, MSE on the right.
fig, ax1 = plt.subplots()

# Accuracy curve, with a vertical marker at the best K.
ax1.plot(k_range, accuracies, marker='o', label='accuracies')
ax1.set_ylabel('accuracies', size=20)
ax1.axvline(k_range[index_max], linewidth=1, linestyle='--', color='k', lw=3)

# Second y-axis sharing the same x-axis for the MSE curve.
ax2 = ax1.twinx()
ax2.plot(k_range, mse, marker='*', label='mse', c='red')
ax2.set_ylabel('mse', size=20)

# Place both legends outside the axes on the right.
ax1.legend(
    title='accuracies',
    loc='upper left',
    bbox_to_anchor=(1.12, 1),
    fontsize=12,
    title_fontsize=13
)
ax2.legend(
    title='mse',
    loc='upper left',
    bbox_to_anchor=(1.12, 0.8),
    fontsize=12,
    title_fontsize=13
)
plt.title('KNN Classifier Accuracy vs K Value')
plt.show()  # display this section's figure immediately (was missing)
# Visualize how well the best model (K=3 from the search above) fits.
# mean_squared_error is already imported at the top of the file.
model = KNeighborsClassifier(n_neighbors=3)
model.fit(X_train, y_train)
pred = model.predict(X_test)

# Report fit quality — the original computed these values but discarded them.
print('MSE:', mean_squared_error(y_test, pred))
print('Score:', model.score(X_test, y_test))

# Plot true vs predicted class labels per test sample.
t = np.arange(len(y_test))
plt.rcParams['font.sans-serif'] = ['SimHei']  # enable Chinese glyphs in the legend
plt.plot(t, y_test, 'r-', linewidth=2, label=u'原值')
plt.plot(t, pred, 'g-', linewidth=2, label=u'预测值')
plt.legend(loc='upper right')
plt.grid()
plt.show()
(后续内容待补充。)