The Simplest and Most Intuitive Classification Algorithm

Introduction
K-Nearest Neighbors (KNN) is one of the simplest machine-learning algorithms. Its core idea is that birds of a feather flock together: a sample's class is decided by a majority vote among its K nearest neighbors.
Algorithm Principles
Basic Idea
- Compute the distance between the test sample and every training sample
- Select the K nearest neighbors
- Assign the test sample the class that wins the majority vote among those K neighbors
Formal Definition
Given a training set $D = \{(x_1, y_1), \dots, (x_n, y_n)\}$ and a test sample $x$:
- Find the K nearest neighbors of $x$: $N_K(x) = \{x_{(1)}, \dots, x_{(K)}\}$
- Predict the class: $\hat{y} = \arg\max_c \sum_{x_i \in N_K(x)} \mathbb{1}(y_i = c)$
```python
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Generate data
np.random.seed(42)
X, y = make_classification(n_samples=200, n_features=2, n_informative=2,
                           n_redundant=0, n_clusters_per_class=1, random_state=42)

# Visualize the KNN process
test_point = np.array([[0, 0]])
distances = np.sqrt(np.sum((X - test_point) ** 2, axis=1))
k = 5
nearest_indices = np.argsort(distances)[:k]

plt.figure(figsize=(10, 8))
plt.scatter(X[y==0, 0], X[y==0, 1], c='blue', label='Class 0', alpha=0.6)
plt.scatter(X[y==1, 0], X[y==1, 1], c='red', label='Class 1', alpha=0.6)
plt.scatter(test_point[0, 0], test_point[0, 1], c='green', s=200, marker='*', label='Test point')

# Draw lines to the K nearest neighbors
for idx in nearest_indices:
    plt.plot([test_point[0, 0], X[idx, 0]], [test_point[0, 1], X[idx, 1]],
             'k--', alpha=0.5)
    plt.scatter(X[idx, 0], X[idx, 1], s=100, facecolors='none',
                edgecolors='black', linewidths=2)

# Draw the enclosing distance circle
max_dist = distances[nearest_indices[-1]]
circle = plt.Circle((test_point[0, 0], test_point[0, 1]), max_dist,
                    fill=False, linestyle='--', color='gray')
plt.gca().add_patch(circle)
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title(f'KNN (K={k}) illustration')
plt.legend()
plt.axis('equal')
plt.show()

# Tally the votes
neighbor_labels = y[nearest_indices]
print(f"Labels of the K neighbors: {neighbor_labels}")
print(f"Predicted class: {np.bincount(neighbor_labels).argmax()}")
```
Distance Metrics
Common Distance Metrics

| Metric | Formula | Notes |
|---|---|---|
| Euclidean | $\sqrt{\sum_i (x_i - y_i)^2}$ | Most common choice |
| Manhattan | $\sum_i \lvert x_i - y_i \rvert$ | Grid-like paths |
| Chebyshev | $\max_i \lvert x_i - y_i \rvert$ | Largest coordinate difference |
| Minkowski | $\left(\sum_i \lvert x_i - y_i \rvert^p\right)^{1/p}$ | General form ($p=1$: Manhattan, $p=2$: Euclidean) |
| Cosine | $1 - \frac{x \cdot y}{\lVert x \rVert \, \lVert y \rVert}$ | Directional similarity |
```python
from scipy.spatial.distance import cdist

# Note: cosine distance is undefined for the zero vector, so we measure
# from (1, 1) instead of the origin; the coordinate differences are still (3, 4).
x1 = np.array([[1, 1]])
x2 = np.array([[4, 5]])

distances = {
    'Euclidean': cdist(x1, x2, 'euclidean')[0, 0],
    'Manhattan': cdist(x1, x2, 'cityblock')[0, 0],
    'Chebyshev': cdist(x1, x2, 'chebyshev')[0, 0],
    'Cosine': cdist(x1, x2, 'cosine')[0, 0],
}
for name, dist in distances.items():
    print(f"{name}: {dist:.4f}")
```
Visualizing Distance Metrics

```python
# Iso-distance contours for different metrics
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
metrics = ['euclidean', 'cityblock', 'chebyshev']
titles = ['Euclidean (L2)', 'Manhattan (L1)', 'Chebyshev (L∞)']
for ax, metric, title in zip(axes, metrics, titles):
    xx, yy = np.meshgrid(np.linspace(-3, 3, 100), np.linspace(-3, 3, 100))
    points = np.c_[xx.ravel(), yy.ravel()]
    origin = np.array([[0, 0]])
    distances = cdist(points, origin, metric).reshape(xx.shape)
    ax.contour(xx, yy, distances, levels=10, cmap='viridis')
    ax.scatter(0, 0, c='red', s=100, zorder=5)
    ax.set_title(title)
    ax.set_aspect('equal')
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
plt.tight_layout()
plt.show()
```
Implementation from Scratch

```python
class KNNClassifier:
    def __init__(self, k=5, metric='euclidean', weights='uniform'):
        self.k = k
        self.metric = metric
        self.weights = weights  # 'uniform' or 'distance'
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        """KNN is a lazy learner: fit merely stores the data."""
        self.X_train = np.array(X)
        self.y_train = np.array(y)
        self.classes_ = np.unique(y)
        return self

    def _compute_distances(self, X):
        """Compute distances from each test sample to all training samples."""
        if self.metric == 'euclidean':
            # Vectorized via broadcasting:
            # ||a-b||^2 = ||a||^2 + ||b||^2 - 2*a·b
            X_sq = np.sum(X ** 2, axis=1, keepdims=True)
            train_sq = np.sum(self.X_train ** 2, axis=1)
            cross = X @ self.X_train.T
            distances = np.sqrt(np.maximum(X_sq + train_sq - 2 * cross, 0))
        elif self.metric == 'manhattan':
            distances = cdist(X, self.X_train, 'cityblock')
        else:
            distances = cdist(X, self.X_train, self.metric)
        return distances

    def predict(self, X):
        X = np.array(X)
        distances = self._compute_distances(X)
        predictions = []
        for i in range(len(X)):
            # Find the K nearest neighbors
            nearest_indices = np.argsort(distances[i])[:self.k]
            nearest_labels = self.y_train[nearest_indices]
            if self.weights == 'uniform':
                # Majority vote
                pred = np.bincount(nearest_labels).argmax()
            else:
                # Distance-weighted vote
                nearest_distances = distances[i][nearest_indices]
                # Guard against division by zero
                nearest_distances = np.maximum(nearest_distances, 1e-10)
                weights = 1 / nearest_distances
                # Accumulate per-class weights
                class_weights = np.zeros(len(self.classes_))
                for label, weight in zip(nearest_labels, weights):
                    class_weights[label] += weight
                pred = np.argmax(class_weights)
            predictions.append(pred)
        return np.array(predictions)

    def predict_proba(self, X):
        X = np.array(X)
        distances = self._compute_distances(X)
        probas = []
        for i in range(len(X)):
            nearest_indices = np.argsort(distances[i])[:self.k]
            nearest_labels = self.y_train[nearest_indices]
            # Class probabilities = neighbor label frequencies
            proba = np.bincount(nearest_labels, minlength=len(self.classes_)) / self.k
            probas.append(proba)
        return np.array(probas)

    def score(self, X, y):
        return np.mean(self.predict(X) == y)

# Test it
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
knn = KNNClassifier(k=5)
knn.fit(X_train, y_train)
print(f"Training accuracy: {knn.score(X_train, y_train):.4f}")
print(f"Test accuracy: {knn.score(X_test, y_test):.4f}")
```
Choosing K
The Effect of K

| K | Decision boundary | Behavior |
|---|---|---|
| Small K | Complex, jagged | Prone to overfitting |
| Large K | Smooth, simple | Prone to underfitting |
```python
from sklearn.neighbors import KNeighborsClassifier

# Decision boundaries for different values of K
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
k_values = [1, 3, 5, 10, 20, 50]
xx, yy = np.meshgrid(np.linspace(X[:, 0].min()-1, X[:, 0].max()+1, 100),
                     np.linspace(X[:, 1].min()-1, X[:, 1].max()+1, 100))
for ax, k in zip(axes.ravel(), k_values):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    Z = knn.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    ax.contourf(xx, yy, Z, alpha=0.3, cmap='RdYlBu')
    ax.scatter(X_train[y_train==0, 0], X_train[y_train==0, 1], c='blue', label='Class 0')
    ax.scatter(X_train[y_train==1, 0], X_train[y_train==1, 1], c='red', label='Class 1')
    train_score = knn.score(X_train, y_train)
    test_score = knn.score(X_test, y_test)
    ax.set_title(f'K={k}\nTrain: {train_score:.3f}, Test: {test_score:.3f}')
plt.tight_layout()
plt.show()
```
Selecting K via Cross-Validation

```python
from sklearn.model_selection import cross_val_score

k_range = range(1, 31)
cv_scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X, y, cv=5, scoring='accuracy')
    cv_scores.append(scores.mean())

plt.figure(figsize=(10, 6))
plt.plot(k_range, cv_scores, 'bo-')
plt.xlabel('K')
plt.ylabel('Cross-validation accuracy')
plt.title('Choosing K')
plt.axvline(x=k_range[np.argmax(cv_scores)], color='r', linestyle='--',
            label=f'Best K={k_range[np.argmax(cv_scores)]}')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
print(f"Best K: {k_range[np.argmax(cv_scores)]}, accuracy: {max(cv_scores):.4f}")
```
The Importance of Feature Scaling
Why Scaling Matters
KNN is distance-based, so features on larger scales dominate the distance computation and can drown out informative features on smaller scales.
```python
# Create data whose features live on very different scales
np.random.seed(42)
X_unscaled = np.random.randn(200, 2)
X_unscaled[:, 0] *= 1000  # blow up the first feature by 1000x
y_rand = (X_unscaled[:, 1] > 0).astype(int)  # only the second feature is informative

X_train_us, X_test_us, y_train_us, y_test_us = train_test_split(
    X_unscaled, y_rand, test_size=0.2, random_state=42)

# Without scaling
knn_unscaled = KNeighborsClassifier(n_neighbors=5)
knn_unscaled.fit(X_train_us, y_train_us)
print(f"Accuracy without scaling: {knn_unscaled.score(X_test_us, y_test_us):.4f}")

# With scaling
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_us)
X_test_scaled = scaler.transform(X_test_us)
knn_scaled = KNeighborsClassifier(n_neighbors=5)
knn_scaled.fit(X_train_scaled, y_train_us)
print(f"Accuracy with scaling: {knn_scaled.score(X_test_scaled, y_test_us):.4f}")
```
Weighted KNN
Distance Weighting
The closer a neighbor is to the test point, the larger its voting weight.
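Formally, writing $d(x, x_i)$ for the distance to neighbor $x_i$, the distance-weighted rule replaces the plain vote from the formal definition with an inverse-distance weight; this is what `weights='distance'` in scikit-learn implements:

$\hat{y} = \arg\max_c \sum_{x_i \in N_K(x)} \frac{1}{d(x, x_i)} \mathbb{1}(y_i = c)$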
```python
# Compare uniform and distance weighting
knn_uniform = KNeighborsClassifier(n_neighbors=5, weights='uniform')
knn_distance = KNeighborsClassifier(n_neighbors=5, weights='distance')
knn_uniform.fit(X_train, y_train)
knn_distance.fit(X_train, y_train)
print(f"Uniform weighting:  {knn_uniform.score(X_test, y_test):.4f}")
print(f"Distance weighting: {knn_distance.score(X_test, y_test):.4f}")
```
KNN for Regression
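The classification vote carries over directly: for regression, the prediction is the (optionally distance-weighted) average of the K neighbors' target values,

$\hat{y} = \frac{1}{K} \sum_{x_i \in N_K(x)} y_i$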
```python
from sklearn.neighbors import KNeighborsRegressor

# Generate regression data
np.random.seed(42)
X_reg = np.sort(np.random.rand(100) * 10).reshape(-1, 1)
y_reg = np.sin(X_reg).ravel() + np.random.randn(100) * 0.2
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42)

# Fit with different values of K
plt.figure(figsize=(15, 5))
for i, k in enumerate([1, 5, 20]):
    plt.subplot(1, 3, i+1)
    knn_reg = KNeighborsRegressor(n_neighbors=k)
    knn_reg.fit(X_train_r, y_train_r)
    X_plot = np.linspace(0, 10, 100).reshape(-1, 1)
    y_plot = knn_reg.predict(X_plot)
    plt.scatter(X_train_r, y_train_r, c='blue', alpha=0.5, label='Training data')
    plt.scatter(X_test_r, y_test_r, c='red', alpha=0.5, label='Test data')
    plt.plot(X_plot, y_plot, 'g-', linewidth=2, label='KNN prediction')
    plt.plot(X_plot, np.sin(X_plot), 'k--', alpha=0.5, label='True function')
    test_score = knn_reg.score(X_test_r, y_test_r)
    plt.title(f'K={k}, R²={test_score:.3f}')
    plt.legend()
plt.tight_layout()
plt.show()
```
Speeding Up KNN: KD-Trees
The Problem with Brute-Force Search
- Time complexity: $O(DN)$ per query, where $D$ is the number of features and $N$ the number of training samples
- Very slow on large datasets

KD-Tree
A space-partitioning data structure that reduces the average query cost to $O(D \log N)$.
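Before benchmarking sklearn's options below, here is a minimal stand-alone illustration using SciPy's `cKDTree` (an addition for illustration: the tree is built once, and each query then only touches a fraction of the points):

```python
import numpy as np
from scipy.spatial import cKDTree

rng = np.random.default_rng(0)
data = rng.standard_normal((10000, 3))  # low-dimensional: where KD-trees shine
tree = cKDTree(data)                    # one-time O(N log N) build

# Query the 5 nearest neighbors of a single point
dists, idxs = tree.query(rng.standard_normal(3), k=5)
print(f"Neighbor indices: {idxs}")
print(f"Distances: {dists}")
```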
```python
import time

# Generate a larger dataset
X_large, y_large = make_classification(n_samples=10000, n_features=20,
                                       n_informative=10, random_state=42)

# Compare the search algorithms
algorithms = ['brute', 'kd_tree', 'ball_tree']
for algo in algorithms:
    knn = KNeighborsClassifier(n_neighbors=5, algorithm=algo)
    start = time.time()
    knn.fit(X_large, y_large)
    knn.predict(X_large[:100])
    elapsed = time.time() - start
    print(f"{algo:10s}: {elapsed:.4f}s")
```
The Curse of Dimensionality
KD-trees lose their edge on high-dimensional data; beyond roughly 20 dimensions, brute-force search can be just as fast or faster.

Hands-On: Handwritten Digit Recognition
```python
from sklearn.datasets import load_digits
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns

# Load the data
digits = load_digits()
X_digits, y_digits = digits.data, digits.target
print(f"Data shape: {X_digits.shape}")
print(f"Classes: {np.unique(y_digits)}")

# Visualize some samples
fig, axes = plt.subplots(2, 5, figsize=(12, 5))
for ax, img, label in zip(axes.ravel(), digits.images[:10], digits.target[:10]):
    ax.imshow(img, cmap='gray')
    ax.set_title(f'Label: {label}')
    ax.axis('off')
plt.tight_layout()
plt.show()

# Split the data
X_train_d, X_test_d, y_train_d, y_test_d = train_test_split(
    X_digits, y_digits, test_size=0.2, random_state=42)

# Standardize
scaler = StandardScaler()
X_train_d = scaler.fit_transform(X_train_d)
X_test_d = scaler.transform(X_test_d)

# Train KNN
knn_digits = KNeighborsClassifier(n_neighbors=5, weights='distance')
knn_digits.fit(X_train_d, y_train_d)
y_pred_d = knn_digits.predict(X_test_d)
print(f"Test accuracy: {knn_digits.score(X_test_d, y_test_d):.4f}")
print("\nClassification report:")
print(classification_report(y_test_d, y_pred_d))

# Confusion matrix
plt.figure(figsize=(10, 8))
cm = confusion_matrix(y_test_d, y_pred_d)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion matrix for KNN digit recognition')
plt.show()
```
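To see where the model still struggles, it helps to plot a few of the misclassified test digits (a small addition; the variable names follow the code above):

```python
# Show up to 8 misclassified test digits with predicted vs. true labels
wrong = np.where(y_pred_d != y_test_d)[0]
n_show = min(8, len(wrong))
fig, axes = plt.subplots(1, n_show, figsize=(12, 2))
for ax, idx in zip(np.atleast_1d(axes), wrong[:n_show]):
    # Undo the standardization so the 8x8 image is recognizable again
    img = scaler.inverse_transform(X_test_d[idx].reshape(1, -1)).reshape(8, 8)
    ax.imshow(img, cmap='gray')
    ax.set_title(f'{y_pred_d[idx]} (true: {y_test_d[idx]})')
    ax.axis('off')
plt.show()
```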
Common Questions
Q1: What are KNN's strengths and weaknesses?

| Strengths | Weaknesses |
|---|---|
| Simple and intuitive | Slow prediction (must scan all training data) |
| No training phase | Large memory footprint |
| Naturally multi-class | Sensitive to feature scales |
| Non-parametric | Degrades in high dimensions |
Q2: How do you handle high-dimensional data?
- Dimensionality reduction: PCA, t-SNE (see the sketch below)
- Feature selection
- Switch to an algorithm better suited to high dimensions
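For example, PCA can be chained in front of KNN; a sketch on the digits data from the previous section (the value 0.95 asks PCA to keep 95% of the variance):

```python
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

# Reduce dimensionality before the distance computation
pca_knn = Pipeline([
    ('pca', PCA(n_components=0.95)),  # keep 95% of the variance
    ('knn', KNeighborsClassifier(n_neighbors=5)),
])
pca_knn.fit(X_train_d, y_train_d)
print(f"PCA + KNN test accuracy: {pca_knn.score(X_test_d, y_test_d):.4f}")
```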
Q3: What value should K take?
- Rule of thumb: $K \approx \sqrt{N}$
- Choose it by cross-validation
- For binary classification, prefer odd K to avoid ties
Q4: What about class imbalance?
- Use distance weighting
- Resample the data
- Modify the voting rule (see the sketch below)
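For the last point, one simple variant is to weight each neighbor's vote by the inverse frequency of its class, so rare classes are not drowned out; `imbalance_aware_vote` below is a hypothetical helper, not from the original post:

```python
import numpy as np

def imbalance_aware_vote(nearest_labels, class_counts):
    """Weight each neighbor's vote by 1 / (training frequency of its class)."""
    scores = {}
    for label in nearest_labels:
        scores[label] = scores.get(label, 0.0) + 1.0 / class_counts[label]
    return max(scores, key=scores.get)

# Toy example: class 0 has 900 training samples, class 1 only 100.
# A plain majority vote would pick 0 (3 vs 2); the weighted vote picks 1.
print(imbalance_aware_vote(np.array([0, 0, 1, 0, 1]), np.array([900, 100])))
```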
Summary

| Concept | Notes |
|---|---|
| Core idea | A sample takes on the class of its nearest neighbors |
| Classification rule | Majority vote among K neighbors |
| Key parameters | K, distance metric, weighting scheme |
| Preprocessing | Feature scaling is essential |
| Acceleration | KD-tree, Ball tree |