最简单直观的分类算法

前言

K近邻(K-Nearest Neighbors, KNN)是最简单的机器学习算法之一。它的核心思想是:物以类聚——一个样本的类别由它周围最近的K个邻居投票决定。


算法原理

基本思想

  1. 计算测试样本与所有训练样本的距离
  2. 选出距离最近的K个邻居
  3. 根据这K个邻居的标签投票决定测试样本的类别

形式化定义

给定训练集 $D = \{(x_1, y_1), \dots, (x_n, y_n)\}$ 和测试样本 $x$:

  1. 找到 $x$ 的K个最近邻:$N_K(x) = \{x_{(1)}, \dots, x_{(K)}\}$
  2. 预测类别:$\hat{y} = \arg\max_c \sum_{x_i \in N_K(x)} \mathbb{1}(y_i = c)$
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Build a small two-class toy dataset for the illustration.
np.random.seed(42)
X, y = make_classification(n_samples=200, n_features=2, n_informative=2,
                           n_redundant=0, n_clusters_per_class=1, random_state=42)

# Walk through the KNN procedure for a single query point.
query = np.array([[0, 0]])

k = 5
# Euclidean distance from the query to every training sample.
dist_to_query = np.sqrt(((X - query) ** 2).sum(axis=1))
neighbor_ids = np.argsort(dist_to_query)[:k]

plt.figure(figsize=(10, 8))
for cls, color in ((0, 'blue'), (1, 'red')):
    mask = y == cls
    plt.scatter(X[mask, 0], X[mask, 1], c=color, label=f'Class {cls}', alpha=0.6)
plt.scatter(query[0, 0], query[0, 1], c='green', s=200, marker='*', label='测试点')

# Highlight the K nearest neighbors and connect each to the query.
for j in neighbor_ids:
    plt.plot([query[0, 0], X[j, 0]], [query[0, 1], X[j, 1]],
             'k--', alpha=0.5)
    plt.scatter(X[j, 0], X[j, 1], s=100, facecolors='none',
                edgecolors='black', linewidths=2)

# Dashed circle whose radius is the distance to the K-th (farthest) neighbor.
radius = dist_to_query[neighbor_ids[-1]]
plt.gca().add_patch(plt.Circle((query[0, 0], query[0, 1]), radius,
                               fill=False, linestyle='--', color='gray'))

plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title(f'KNN (K={k}) 示意图')
plt.legend()
plt.axis('equal')
plt.show()

# Majority vote among the K neighbors decides the predicted class.
neighbor_labels = y[neighbor_ids]
print(f"K个邻居的标签: {neighbor_labels}")
print(f"预测类别: {np.bincount(neighbor_labels).argmax()}")

距离度量

常用距离

距离 公式 特点
欧氏距离 $\sqrt{\sum(x_i - y_i)^2}$ 最常用
曼哈顿距离 $\sum|x_i - y_i|$ 网格路径
闵可夫斯基距离 $(\sum|x_i - y_i|^p)^{1/p}$ 通用形式
余弦距离 $1 - \frac{x \cdot y}{\|x\|\,\|y\|}$ 方向相似度
from scipy.spatial.distance import cdist

# Two sample points for the metric comparison.
# Fix: the original used x1 = (0, 0); cosine distance divides by the vector
# norms, so a zero vector makes it undefined (prints NaN). The pair
# (1, 1) -> (4, 5) keeps the same clean values as the classic (0,0) -> (3,4)
# example: L2 = 5, L1 = 7, L-inf = 4 — while keeping cosine well-defined.
x1 = np.array([[1, 1]])
x2 = np.array([[4, 5]])

distances = {
    '欧氏距离': cdist(x1, x2, 'euclidean')[0, 0],
    '曼哈顿距离': cdist(x1, x2, 'cityblock')[0, 0],
    '切比雪夫距离': cdist(x1, x2, 'chebyshev')[0, 0],
    '余弦距离': cdist(x1, x2, 'cosine')[0, 0],
}

for name, dist in distances.items():
    print(f"{name}: {dist:.4f}")

距离可视化

# Iso-distance contour lines under three different metrics.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

metric_specs = [
    ('euclidean', '欧氏距离 (L2)'),
    ('cityblock', '曼哈顿距离 (L1)'),
    ('chebyshev', '切比雪夫距离 (L∞)'),
]

# The evaluation grid is identical for every metric, so build it once.
grid_x, grid_y = np.meshgrid(np.linspace(-3, 3, 100), np.linspace(-3, 3, 100))
grid_points = np.c_[grid_x.ravel(), grid_y.ravel()]
origin = np.array([[0, 0]])

for ax, (metric, title) in zip(axes, metric_specs):
    dist_grid = cdist(grid_points, origin, metric).reshape(grid_x.shape)

    ax.contour(grid_x, grid_y, dist_grid, levels=10, cmap='viridis')
    ax.scatter(0, 0, c='red', s=100, zorder=5)
    ax.set_title(title)
    ax.set_aspect('equal')
    ax.set_xlabel('X')
    ax.set_ylabel('Y')

plt.tight_layout()
plt.show()

从零实现

class KNNClassifier:
    """From-scratch K-nearest-neighbors classifier (brute-force search).

    KNN is a lazy learner: ``fit`` only stores the training set; all real
    work happens at prediction time.

    Parameters
    ----------
    k : int
        Number of neighbors consulted per prediction (capped at the
        training-set size).
    metric : str
        'euclidean' (fast vectorized path), 'manhattan', or any metric name
        accepted by ``scipy.spatial.distance.cdist``.
    weights : str
        'uniform' for plain majority vote, 'distance' for
        inverse-distance-weighted voting.
    """

    def __init__(self, k=5, metric='euclidean', weights='uniform'):
        self.k = k
        self.metric = metric
        self.weights = weights  # 'uniform' or 'distance'
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        """Store the training data (lazy learning). Returns self."""
        self.X_train = np.asarray(X)
        self.y_train = np.asarray(y)
        self.classes_ = np.unique(y)
        # Map each training label to its index in classes_ so that voting
        # works for arbitrary label values. Fix: the original indexed vote
        # arrays with the raw labels, which silently breaks (or crashes) for
        # non-contiguous / non-zero-based labels such as {2, 7} or negatives.
        self._y_idx = np.searchsorted(self.classes_, self.y_train)
        return self

    def _compute_distances(self, X):
        """Return the (n_test, n_train) distance matrix to the stored data."""
        if self.metric == 'euclidean':
            # ||a-b||^2 = ||a||^2 + ||b||^2 - 2 a·b via broadcasting;
            # clamp at 0 before sqrt to absorb negative rounding error.
            X_sq = np.sum(X ** 2, axis=1, keepdims=True)
            train_sq = np.sum(self.X_train ** 2, axis=1)
            cross = X @ self.X_train.T
            return np.sqrt(np.maximum(X_sq + train_sq - 2 * cross, 0))
        if self.metric == 'manhattan':
            return cdist(X, self.X_train, 'cityblock')
        return cdist(X, self.X_train, self.metric)

    def _neighbor_indices(self, dist_row):
        """Indices of the k nearest training samples (k capped at n_train)."""
        k = min(self.k, len(self.y_train))
        return np.argsort(dist_row)[:k]

    def predict(self, X):
        """Predict a label (from the original label set) for each row of X."""
        X = np.asarray(X)
        distances = self._compute_distances(X)
        n_classes = len(self.classes_)

        predictions = []
        for i in range(len(X)):
            nearest = self._neighbor_indices(distances[i])
            label_idx = self._y_idx[nearest]

            if self.weights == 'uniform':
                # Plain majority vote over class indices.
                votes = np.bincount(label_idx, minlength=n_classes)
            else:
                # Inverse-distance weighting; the epsilon guards against
                # division by zero when a test point coincides with a
                # training point.
                w = 1.0 / np.maximum(distances[i][nearest], 1e-10)
                votes = np.zeros(n_classes)
                np.add.at(votes, label_idx, w)

            # Translate the winning class index back to the original label.
            predictions.append(self.classes_[np.argmax(votes)])

        return np.array(predictions)

    def predict_proba(self, X):
        """Per-class neighbor frequencies; columns ordered as ``classes_``.

        NOTE: probabilities are unweighted neighbor counts / k even when
        weights='distance' (matching the original behavior).
        """
        X = np.asarray(X)
        distances = self._compute_distances(X)

        probas = []
        for i in range(len(X)):
            nearest = self._neighbor_indices(distances[i])
            counts = np.bincount(self._y_idx[nearest],
                                 minlength=len(self.classes_))
            probas.append(counts / len(nearest))

        return np.array(probas)

    def score(self, X, y):
        """Mean accuracy of ``predict(X)`` against the true labels ``y``."""
        return np.mean(self.predict(X) == y)

# Sanity-check the from-scratch classifier on a held-out split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

knn = KNNClassifier(k=5).fit(X_train, y_train)
print(f"训练准确率: {knn.score(X_train, y_train):.4f}")
print(f"测试准确率: {knn.score(X_test, y_test):.4f}")

K值的选择

K值影响

K值 决策边界 特点
小K 复杂、锯齿 容易过拟合
大K 平滑、简单 容易欠拟合
from sklearn.neighbors import KNeighborsClassifier

# Decision boundaries for a range of K values.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

k_values = [1, 3, 5, 10, 20, 50]

# Dense grid covering the data range; reused for every subplot.
xx, yy = np.meshgrid(np.linspace(X[:, 0].min()-1, X[:, 0].max()+1, 100),
                     np.linspace(X[:, 1].min()-1, X[:, 1].max()+1, 100))
grid = np.c_[xx.ravel(), yy.ravel()]

for ax, k in zip(axes.ravel(), k_values):
    model = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)

    Z = model.predict(grid).reshape(xx.shape)

    ax.contourf(xx, yy, Z, alpha=0.3, cmap='RdYlBu')
    for cls, color in ((0, 'blue'), (1, 'red')):
        pts = X_train[y_train == cls]
        ax.scatter(pts[:, 0], pts[:, 1], c=color, label=f'Class {cls}')

    ax.set_title(f'K={k}\n'
                 f'Train: {model.score(X_train, y_train):.3f}, '
                 f'Test: {model.score(X_test, y_test):.3f}')

plt.tight_layout()
plt.show()

交叉验证选K

from sklearn.model_selection import cross_val_score

k_range = range(1, 31)
# Mean 5-fold cross-validation accuracy for each candidate K.
cv_scores = [cross_val_score(KNeighborsClassifier(n_neighbors=k), X, y,
                             cv=5, scoring='accuracy').mean()
             for k in k_range]

best_k = k_range[np.argmax(cv_scores)]

plt.figure(figsize=(10, 6))
plt.plot(k_range, cv_scores, 'bo-')
plt.xlabel('K')
plt.ylabel('交叉验证准确率')
plt.title('K值选择')
plt.axvline(x=best_k, color='r', linestyle='--',
            label=f'最优K={best_k}')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

print(f"最优K: {best_k}, 准确率: {max(cv_scores):.4f}")

特征缩放的重要性

为什么需要缩放

KNN基于距离,特征尺度差异会导致大尺度特征主导距离计算。

# Two features on wildly different scales; only the second carries signal.
np.random.seed(42)
X_unscaled = np.random.randn(200, 2)
X_unscaled[:, 0] *= 1000  # blow the (useless) first feature up 1000x
y_rand = (X_unscaled[:, 1] > 0).astype(int)

X_train_us, X_test_us, y_train_us, y_test_us = train_test_split(
    X_unscaled, y_rand, test_size=0.2, random_state=42)

# Without scaling, the huge first feature dominates every distance.
knn_unscaled = KNeighborsClassifier(n_neighbors=5).fit(X_train_us, y_train_us)
print(f"未缩放准确率: {knn_unscaled.score(X_test_us, y_test_us):.4f}")

# Standardizing puts both features on an equal footing.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_us)
X_test_scaled = scaler.transform(X_test_us)

knn_scaled = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, y_train_us)
print(f"缩放后准确率: {knn_scaled.score(X_test_scaled, y_test_us):.4f}")

加权KNN

距离加权

离测试点越近的邻居,投票权重越大。

# Uniform majority vote vs. inverse-distance-weighted vote.
for scheme in ('uniform', 'distance'):
    model = KNeighborsClassifier(n_neighbors=5, weights=scheme)
    model.fit(X_train, y_train)
    print(f"{scheme.capitalize()}加权: {model.score(X_test, y_test):.4f}")

KNN用于回归

from sklearn.neighbors import KNeighborsRegressor

# 1-D regression data: a noisy sine curve.
np.random.seed(42)
X_reg = np.sort(np.random.rand(100) * 10).reshape(-1, 1)
y_reg = np.sin(X_reg).ravel() + np.random.randn(100) * 0.2

X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42)

# Fitted curve for several K values; the evaluation grid is shared.
X_plot = np.linspace(0, 10, 100).reshape(-1, 1)

plt.figure(figsize=(15, 5))
for i, k in enumerate([1, 5, 20], start=1):
    plt.subplot(1, 3, i)

    reg = KNeighborsRegressor(n_neighbors=k).fit(X_train_r, y_train_r)
    y_plot = reg.predict(X_plot)

    plt.scatter(X_train_r, y_train_r, c='blue', alpha=0.5, label='训练数据')
    plt.scatter(X_test_r, y_test_r, c='red', alpha=0.5, label='测试数据')
    plt.plot(X_plot, y_plot, 'g-', linewidth=2, label='KNN预测')
    plt.plot(X_plot, np.sin(X_plot), 'k--', alpha=0.5, label='真实函数')

    plt.title(f'K={k}, R²={reg.score(X_test_r, y_test_r):.3f}')
    plt.legend()

plt.tight_layout()
plt.show()

加速KNN:KD树

暴力搜索的问题

  • 时间复杂度:单次查询需 $O(ND)$,D为特征维度,N为样本数
  • 大规模数据时非常慢

KD树

空间划分数据结构,平均可将单次查询的复杂度降到 $O(D \log N)$(高维时会退化,见下文)。

from sklearn.neighbors import KNeighborsClassifier
import time

# A larger dataset where the choice of index structure starts to matter.
X_large, y_large = make_classification(n_samples=10000, n_features=20,
                                        n_informative=10, random_state=42)

# Compare neighbor-search algorithms.
# Fixes vs. the original:
#  - time.perf_counter() replaces time.time(): it is a monotonic,
#    high-resolution clock meant for interval measurement.
#  - fit (index build) and predict (query) are timed separately; lumping
#    them together hid exactly the trade-off this section is about
#    (brute force: trivial fit, slow queries; trees: up-front build cost,
#    fast queries).
for algo in ('brute', 'kd_tree', 'ball_tree'):
    knn = KNeighborsClassifier(n_neighbors=5, algorithm=algo)

    start = time.perf_counter()
    knn.fit(X_large, y_large)
    fit_time = time.perf_counter() - start

    start = time.perf_counter()
    knn.predict(X_large[:100])
    predict_time = time.perf_counter() - start

    print(f"{algo:10s}: fit {fit_time:.4f}s, predict {predict_time:.4f}s")

维度灾难

KD树在高维数据上效率下降,当维度超过20时,暴力搜索可能更快。


实战:手写数字识别

from sklearn.datasets import load_digits
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns

# Load the 8x8 handwritten-digit dataset.
digits = load_digits()
X_digits, y_digits = digits.data, digits.target

print(f"数据形状: {X_digits.shape}")
print(f"类别: {np.unique(y_digits)}")

# Show the first ten samples with their labels.
fig, axes = plt.subplots(2, 5, figsize=(12, 5))
for ax, img, label in zip(axes.ravel(), digits.images[:10], digits.target[:10]):
    ax.imshow(img, cmap='gray')
    ax.set_title(f'Label: {label}')
    ax.axis('off')
plt.tight_layout()
plt.show()

# Hold out a test split, then standardize using training statistics only.
X_train_d, X_test_d, y_train_d, y_test_d = train_test_split(
    X_digits, y_digits, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_d = scaler.fit_transform(X_train_d)
X_test_d = scaler.transform(X_test_d)

# Distance-weighted KNN on the scaled features.
knn_digits = KNeighborsClassifier(n_neighbors=5, weights='distance')
knn_digits.fit(X_train_d, y_train_d)
y_pred_d = knn_digits.predict(X_test_d)

print(f"测试准确率: {knn_digits.score(X_test_d, y_test_d):.4f}")
print("\n分类报告:")
print(classification_report(y_test_d, y_pred_d))

# Confusion matrix as a heatmap.
plt.figure(figsize=(10, 8))
sns.heatmap(confusion_matrix(y_test_d, y_pred_d), annot=True, fmt='d', cmap='Blues')
plt.xlabel('预测')
plt.ylabel('实际')
plt.title('KNN手写数字识别混淆矩阵')
plt.show()

常见问题

Q1: KNN的优缺点?

优点 缺点
简单直观 预测慢(需遍历所有数据)
无需训练 内存占用大
天然支持多分类 对特征尺度敏感
非参数模型 高维数据效果差

Q2: 如何处理高维数据?

  • 降维:PCA、t-SNE
  • 特征选择
  • 使用其他算法

Q3: K一般取多少?

  • 经验法则:$K = \sqrt{N}$
  • 通过交叉验证选择
  • 通常选奇数避免平票

Q4: 类别不平衡怎么办?

  • 使用距离加权
  • 调整采样
  • 修改投票规则

总结

概念 说明
核心思想 近朱者赤,近墨者黑
分类决策 K个邻居投票
关键参数 K值、距离度量、权重方式
预处理 必须特征缩放
加速 KD树、Ball树

参考资料

  • 《统计学习方法》李航 第3章
  • scikit-learn 文档:KNN
  • Cover, T., Hart, P. (1967). “Nearest neighbor pattern classification”

版权声明: 如无特别声明,本文版权归 sshipanoo 所有,转载请注明本文链接。

(采用 CC BY-NC-SA 4.0 许可协议进行授权)

本文标题:《 机器学习基础系列——K近邻算法 》

本文链接:http://localhost:3015/ai/K%E8%BF%91%E9%82%BB.html

本文最后一次更新时间较早,文章中的某些内容可能已过时!