偏差-方差权衡

前言

机器学习的核心挑战是让模型在未见数据上表现良好(泛化)。过拟合和欠拟合是影响泛化的两个主要问题,理解偏差-方差权衡是解决这些问题的关键。


基本概念

训练误差 vs 测试误差

import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

np.random.seed(42)

# Generate noisy samples from y = sin(x) on [0, 10]
n_samples = 50
X = np.sort(np.random.rand(n_samples) * 10).reshape(-1, 1)
y = np.sin(X).ravel() + np.random.randn(n_samples) * 0.3

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit polynomial models of increasing degree and record train/test MSE
degrees = range(1, 15)
train_errors = []
test_errors = []

for degree in degrees:
    model = Pipeline([
        ('poly', PolynomialFeatures(degree=degree)),
        ('linear', LinearRegression())
    ])
    model.fit(X_train, y_train)

    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)

    train_errors.append(mean_squared_error(y_train, train_pred))
    test_errors.append(mean_squared_error(y_test, test_pred))

# Plot error curves against model complexity
plt.figure(figsize=(10, 6))
plt.plot(degrees, train_errors, 'b-o', label='训练误差')
plt.plot(degrees, test_errors, 'r-o', label='测试误差')
plt.xlabel('多项式次数')
plt.ylabel('MSE')
plt.title('模型复杂度与误差')
plt.grid(True, alpha=0.3)

# Shade the under-/over-fitting regions BEFORE calling legend(); the
# original called legend() first, so these labels never appeared in it.
plt.axvspan(0.5, 3.5, alpha=0.2, color='blue', label='欠拟合区')
plt.axvspan(8.5, 14.5, alpha=0.2, color='red', label='过拟合区')
plt.legend()
plt.show()

三种情况对比

# Compare an underfit, a well-fit, and an overfit polynomial side by side.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

X_plot = np.linspace(0, 10, 100).reshape(-1, 1)

# (polynomial degree, panel caption, curve colour) per scenario
scenarios = [
    (1, '欠拟合 (degree=1)', 'blue'),
    (4, '适度拟合 (degree=4)', 'green'),
    (15, '过拟合 (degree=15)', 'red')
]

for panel, (deg, caption, line_color) in zip(axes, scenarios):
    pipe = Pipeline([
        ('poly', PolynomialFeatures(degree=deg)),
        ('linear', LinearRegression())
    ])
    pipe.fit(X_train, y_train)

    fitted_curve = pipe.predict(X_plot)

    panel.scatter(X_train, y_train, c='blue', alpha=0.5, label='训练数据')
    panel.scatter(X_test, y_test, c='red', marker='x', alpha=0.5, label='测试数据')
    panel.plot(X_plot, fitted_curve, color=line_color, linewidth=2, label='模型')
    panel.plot(X_plot, np.sin(X_plot), 'k--', alpha=0.5, label='真实函数')

    mse_tr = mean_squared_error(y_train, pipe.predict(X_train))
    mse_te = mean_squared_error(y_test, pipe.predict(X_test))

    panel.set_title(f'{caption}\nTrain MSE={mse_tr:.3f}, Test MSE={mse_te:.3f}')
    panel.legend()
    panel.set_ylim(-2, 2)

plt.tight_layout()
plt.show()
| 状态 | 训练误差 | 测试误差 | 问题 |
| --- | --- | --- | --- |
| 欠拟合 | 高 | 高 | 模型太简单 |
| 适度拟合 | 低 | 低 | 理想状态 |
| 过拟合 | 很低 | 高 | 模型太复杂 |

偏差-方差分解

理论

对于任意预测点 $x$,期望预测误差可以分解为:

\[E[(y - \hat{f}(x))^2] = \text{Bias}^2 + \text{Variance} + \text{Noise}\]
  • 偏差(Bias):模型假设与真实函数的差距
  • 方差(Variance):模型对训练数据的敏感程度
  • 噪声(Noise):数据本身的随机性(不可约)

可视化偏差-方差

from sklearn.utils import resample

n_models = 100
n_samples = 30
X_full = np.linspace(0, 10, 1000).reshape(-1, 1)

def fit_multiple_models(degree, n_models=100):
    """Fit the same polynomial pipeline on many bootstrap resamples.

    Returns an array of shape (n_models, len(X_full)): one row of
    predictions over the dense grid X_full per bootstrapped model.
    """
    all_preds = []
    for _ in range(n_models):
        # draw a fresh bootstrap sample (with replacement) from X, y
        Xb, yb = resample(X, y, n_samples=n_samples, random_state=None)

        pipe = Pipeline([
            ('poly', PolynomialFeatures(degree=degree)),
            ('linear', LinearRegression())
        ])
        pipe.fit(Xb, yb)
        all_preds.append(pipe.predict(X_full).ravel())

    return np.array(all_preds)

# Visualize bias/variance behaviour for three model complexities.
# Top row: individual bootstrap fits, their average, and the true function.
# Bottom row: pointwise Bias² and Variance of those fits across the grid.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

degrees = [1, 3, 10]

for i, degree in enumerate(degrees):
    # (n_models, len(X_full)) predictions from bootstrap-resampled fits
    preds = fit_multiple_models(degree, n_models=50)
    
    # Top row: overlay a subset of the individual model predictions
    ax = axes[0, i]
    for pred in preds[:20]:
        ax.plot(X_full, pred, 'b-', alpha=0.2)
    ax.plot(X_full, np.sin(X_full), 'r-', linewidth=2, label='真实函数')
    ax.plot(X_full, preds.mean(axis=0), 'g-', linewidth=2, label='平均预测')
    ax.scatter(X, y, c='black', s=20, alpha=0.3, label='数据')
    ax.set_title(f'Degree={degree}')
    ax.legend()
    ax.set_ylim(-2, 2)
    
    # Bottom row: bias and variance distribution across x
    ax = axes[1, i]
    true_values = np.sin(X_full).ravel()
    mean_pred = preds.mean(axis=0)
    
    # Bias² = squared gap between the average prediction and the truth;
    # Variance = spread of individual predictions around their mean.
    bias_sq = (mean_pred - true_values) ** 2
    variance = preds.var(axis=0)
    
    ax.fill_between(X_full.ravel(), 0, bias_sq, alpha=0.5, label=f'Bias²={bias_sq.mean():.3f}')
    ax.fill_between(X_full.ravel(), 0, variance, alpha=0.5, label=f'Var={variance.mean():.3f}')
    ax.set_title(f'Degree={degree}: Bias² + Variance')
    ax.legend()
    ax.set_ylim(0, 1)

plt.tight_layout()
plt.show()

偏差-方差权衡

# Empirically estimate bias and variance at each polynomial degree.
degrees = range(1, 12)
biases = []
variances = []
total_errors = []

# ground truth over the dense grid (loop-invariant, computed once)
truth = np.sin(X_full).ravel()

for d in degrees:
    boot_preds = fit_multiple_models(d, n_models=30)
    avg_pred = boot_preds.mean(axis=0)

    b2 = ((avg_pred - truth) ** 2).mean()
    var = boot_preds.var(axis=0).mean()

    # store on a square-root scale so the curves are easier to read
    biases.append(np.sqrt(b2))
    variances.append(np.sqrt(var))
    total_errors.append(np.sqrt(b2 + var))

plt.figure(figsize=(10, 6))
plt.plot(degrees, biases, 'b-o', label='Bias')
plt.plot(degrees, variances, 'r-o', label='Variance')
plt.plot(degrees, total_errors, 'g-o', label='Total Error')

plt.xlabel('模型复杂度 (多项式次数)')
plt.ylabel('误差')
plt.title('偏差-方差权衡')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

诊断过拟合与欠拟合

学习曲线诊断

from sklearn.model_selection import learning_curve

def plot_learning_curves(estimator, X, y, title):
    """Plot train/validation MSE against training-set size.

    Draws onto the CURRENT pyplot axes — the caller selects the target
    axes beforehand (e.g. via plt.sca).

    Parameters:
        estimator: sklearn estimator/pipeline to evaluate.
        X, y: full dataset; learning_curve does the CV splitting (cv=5).
        title: title for the axes.
    """
    train_sizes, train_scores, val_scores = learning_curve(
        estimator, X, y, train_sizes=np.linspace(0.1, 1.0, 10),
        cv=5, scoring='neg_mean_squared_error'
    )
    
    # scores are NEGATED MSE, so flip the sign to plot actual MSE
    train_mean = -train_scores.mean(axis=1)
    train_std = train_scores.std(axis=1)
    val_mean = -val_scores.mean(axis=1)
    val_std = val_scores.std(axis=1)
    
    # shaded band = ±1 std across the CV folds
    plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.1, color='blue')
    plt.fill_between(train_sizes, val_mean - val_std, val_mean + val_std, alpha=0.1, color='orange')
    plt.plot(train_sizes, train_mean, 'o-', color='blue', label='训练')
    plt.plot(train_sizes, val_mean, 'o-', color='orange', label='验证')
    plt.xlabel('训练样本数')
    plt.ylabel('MSE')
    plt.title(title)
    plt.legend()
    plt.grid(True, alpha=0.3)

fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# one (pipeline, caption) pair per fitting regime
models = [
    (Pipeline([('poly', PolynomialFeatures(1)), ('lr', LinearRegression())]), '欠拟合 (degree=1)'),
    (Pipeline([('poly', PolynomialFeatures(4)), ('lr', LinearRegression())]), '适度拟合 (degree=4)'),
    (Pipeline([('poly', PolynomialFeatures(15)), ('lr', LinearRegression())]), '过拟合 (degree=15)')
]

for ax, (model, title) in zip(axes, models):
    # make `ax` the current axes so plot_learning_curves draws into it
    plt.sca(ax)
    plot_learning_curves(model, X, y, title)

plt.tight_layout()
plt.show()

学习曲线解读

| 模式 | 训练误差 | 验证误差 | 诊断 |
| --- | --- | --- | --- |
| 两者都高且接近 | 高 | 高 | 欠拟合 |
| 两者都低且接近 | 低 | 低 | 理想 |
| 训练低、验证高 | 低 | 高 | 过拟合 |

解决过拟合

方法一:增加数据

# Effect of sample size: the same high-capacity (degree-10) polynomial is
# fitted on progressively larger datasets — more data tames overfitting.
sample_sizes = [20, 50, 100, 200, 500]

fig, axes = plt.subplots(1, len(sample_sizes), figsize=(20, 4))

for panel, size in zip(axes, sample_sizes):
    # fresh noisy sine data with `size` samples
    X_large = np.sort(np.random.rand(size) * 10).reshape(-1, 1)
    y_large = np.sin(X_large).ravel() + np.random.randn(size) * 0.3

    X_tr, X_te, y_tr, y_te = train_test_split(X_large, y_large, test_size=0.3, random_state=42)

    deg10 = Pipeline([
        ('poly', PolynomialFeatures(10)),
        ('linear', LinearRegression())
    ])
    deg10.fit(X_tr, y_tr)

    X_plot = np.linspace(0, 10, 100).reshape(-1, 1)
    fit_curve = deg10.predict(X_plot)

    panel.scatter(X_tr, y_tr, c='blue', alpha=0.3, s=20)
    panel.plot(X_plot, fit_curve, 'r-', linewidth=2)
    panel.plot(X_plot, np.sin(X_plot), 'g--', alpha=0.5)

    test_mse = mean_squared_error(y_te, deg10.predict(X_te))
    panel.set_title(f'N={size}, Test MSE={test_mse:.3f}')
    panel.set_ylim(-2, 2)

plt.tight_layout()
plt.show()

方法二:正则化

from sklearn.linear_model import Ridge, Lasso

# Fit the degree-10 polynomial expansion ONCE on the training data and
# reuse the fitted transformer for the test/plot splits.  The original
# called fit_transform on every split — harmless for PolynomialFeatures,
# but an anti-pattern that silently breaks for stateful transformers
# (e.g. StandardScaler), so fit-on-train / transform-elsewhere is used.
poly10 = PolynomialFeatures(10)
X_train_p = poly10.fit_transform(X_train)
X_test_p = poly10.transform(X_test)
X_plot_p = poly10.transform(X_plot)

fig, axes = plt.subplots(1, 4, figsize=(16, 4))

# (estimator, panel title): increasing L2 regularization strength
models = [
    (LinearRegression(), '无正则化'),
    (Ridge(alpha=0.01), 'Ridge α=0.01'),
    (Ridge(alpha=1.0), 'Ridge α=1.0'),
    (Ridge(alpha=100), 'Ridge α=100')
]

for ax, (model, title) in zip(axes, models):
    model.fit(X_train_p, y_train)

    y_plot = model.predict(X_plot_p)

    ax.scatter(X_train, y_train, c='blue', alpha=0.5)
    ax.plot(X_plot, y_plot, 'r-', linewidth=2)
    ax.plot(X_plot, np.sin(X_plot), 'g--', alpha=0.5)

    train_mse = mean_squared_error(y_train, model.predict(X_train_p))
    test_mse = mean_squared_error(y_test, model.predict(X_test_p))

    ax.set_title(f'{title}\nTrain={train_mse:.3f}, Test={test_mse:.3f}')
    ax.set_ylim(-2, 2)

plt.tight_layout()
plt.show()

方法三:降低模型复杂度

  • 减少特征数量
  • 使用更简单的模型
  • 减少神经网络层数/神经元

方法四:早停(神经网络)

# Simulate a training run: training loss keeps falling while validation
# loss bottoms out and then creeps up — the classic early-stopping picture.
np.random.seed(42)

epochs = 200
train_losses = []
val_losses = []

# Synthesize both curves: shared exponential decay + noise; validation
# additionally drifts upward over time to mimic overfitting.
for epoch in range(epochs):
    decay = 1.0 * np.exp(-epoch/30)
    train_losses.append(decay + 0.1 + np.random.randn() * 0.02)
    val_losses.append(decay + 0.15 + 0.003 * epoch + np.random.randn() * 0.02)

plt.figure(figsize=(10, 6))
plt.plot(train_losses, label='训练损失')
plt.plot(val_losses, label='验证损失')

# mark the epoch with the lowest validation loss
best_epoch = np.argmin(val_losses)
plt.axvline(best_epoch, color='r', linestyle='--', label=f'早停点 (epoch={best_epoch})')

plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('早停法示意')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

方法五:Dropout(神经网络)

# Dropout schematic: left panel a fully-connected net, right panel the
# same net with ~30% of each hidden layer's units dropped.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# -- Without dropout: draw every unit and every connection --------------
ax = axes[0]
layers = [4, 6, 6, 2]  # units per layer: input, hidden, hidden, output
for i, (n1, n2) in enumerate(zip(layers[:-1], layers[1:])):
    for j in range(n1):
        for k in range(n2):
            ax.plot([i, i+1], [j/(n1-1), k/(n2-1)], 'b-', alpha=0.3)

for i, n in enumerate(layers):
    for j in range(n):
        ax.scatter(i, j/(n-1), c='blue', s=200, zorder=5)

ax.set_title('无Dropout')
ax.axis('off')

# -- With dropout: hide edges touching a dropped hidden unit ------------
ax = axes[1]
np.random.seed(42)
# one boolean mask per HIDDEN layer; True = unit kept (drop prob 0.3)
dropout_mask = [np.random.rand(n) > 0.3 for n in layers[1:-1]]  # 30% dropout

for i, (n1, n2) in enumerate(zip(layers[:-1], layers[1:])):
    for j in range(n1):
        for k in range(n2):
            # The original used a nested conditional expression whose
            # meaning hinged on ternary precedence; spelled out plainly:
            # input/output layers are never dropped, hidden units are
            # kept only where their mask entry is True.
            src_kept = i == 0 or dropout_mask[i - 1][j]
            dst_kept = i >= len(dropout_mask) or dropout_mask[i][k]
            if src_kept and dst_kept:
                ax.plot([i, i+1], [j/(n1-1), k/(n2-1)], 'b-', alpha=0.3)

for i, n in enumerate(layers):
    for j in range(n):
        if 0 < i < len(layers) - 1:
            # hidden units: grey out the dropped ones
            kept = dropout_mask[i - 1][j]
            color = 'blue' if kept else 'gray'
            alpha = 1.0 if kept else 0.3
        else:
            color = 'blue'
            alpha = 1.0
        ax.scatter(i, j/(n-1), c=color, s=200, zorder=5, alpha=alpha)

ax.set_title('Dropout (p=0.3)')
ax.axis('off')

plt.tight_layout()
plt.show()

方法六:数据增强

from sklearn.datasets import load_digits
from scipy.ndimage import rotate, shift

# Data-augmentation showcase on a single 8x8 digit image.
digits = load_digits()
original = digits.images[0]

fig, axes = plt.subplots(2, 4, figsize=(12, 6))

# (caption, transformed image) pairs; the noise entries draw random
# numbers, so the list order is deliberately kept as-is.
augmentations = [
    ('原始', original),
    ('旋转10°', rotate(original, 10, reshape=False)),
    ('旋转-10°', rotate(original, -10, reshape=False)),
    ('平移', shift(original, [1, 1])),
    ('噪声', original + np.random.randn(*original.shape) * 0.5),
    ('缩放', original * 1.2),
    ('翻转', np.fliplr(original)),
    ('组合', rotate(original + np.random.randn(*original.shape) * 0.3, 5, reshape=False))
]

for panel, (caption, image) in zip(axes.ravel(), augmentations):
    panel.imshow(image, cmap='gray')
    panel.set_title(caption)
    panel.axis('off')

plt.suptitle('数据增强示例', fontsize=14)
plt.tight_layout()
plt.show()

解决欠拟合

方法一:增加特征

from sklearn.preprocessing import PolynomialFeatures

# Feature-engineering demo: fit a LINEAR model on increasingly rich
# polynomial features of a 1-D input whose true relation is quadratic.
X_1d = np.random.rand(100, 1) * 10
y_1d = X_1d.ravel() ** 2 + np.random.randn(100) * 5

fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# number of polynomial features (equals the degree, as include_bias=False)
features = [1, 2, 3]

for ax, n_features in zip(axes, features):
    if n_features == 1:
        # raw feature only — no transformer needed
        X_feat = X_1d
    else:
        poly = PolynomialFeatures(n_features, include_bias=False)
        X_feat = poly.fit_transform(X_1d)
    
    lr = LinearRegression()
    lr.fit(X_feat, y_1d)
    
    X_plot = np.linspace(0, 10, 100).reshape(-1, 1)
    if n_features == 1:
        X_plot_feat = X_plot
    else:
        # reuse the transformer fitted in the branch above (same iteration)
        X_plot_feat = poly.transform(X_plot)
    
    y_plot = lr.predict(X_plot_feat)
    
    ax.scatter(X_1d, y_1d, alpha=0.5)
    ax.plot(X_plot, y_plot, 'r-', linewidth=2)
    ax.plot(X_plot, X_plot**2, 'g--', alpha=0.5, label='真实函数')
    
    # R² evaluated on the training data itself (no held-out split here)
    r2 = lr.score(X_feat, y_1d)
    ax.set_title(f'特征数={n_features}, R²={r2:.3f}')
    ax.legend()

plt.tight_layout()
plt.show()

方法二:使用更复杂的模型

from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

# Fit models of increasing capacity on the full noisy-sine dataset.
models = [
    ('线性回归', LinearRegression()),
    ('多项式回归', Pipeline([('poly', PolynomialFeatures(5)), ('lr', LinearRegression())])),
    ('随机森林', RandomForestRegressor(n_estimators=100, random_state=42)),
    ('SVR (RBF)', SVR(kernel='rbf', C=100))
]

fig, axes = plt.subplots(1, 4, figsize=(16, 4))

for panel, (label, estimator) in zip(axes, models):
    estimator.fit(X, y)
    fitted_curve = estimator.predict(X_plot)

    panel.scatter(X, y, alpha=0.5)
    panel.plot(X_plot, fitted_curve, 'r-', linewidth=2)
    panel.plot(X_plot, np.sin(X_plot), 'g--', alpha=0.5)

    panel.set_title(label)
    panel.set_ylim(-2, 2)

plt.tight_layout()
plt.show()

方法三:减少正则化

方法四:训练更长时间(神经网络)


实用检查清单

过拟合检查

  • 训练误差远低于验证误差?
  • 增加数据后性能提升?
  • 增加正则化后性能提升?
  • 简化模型后性能提升?

欠拟合检查

  • 训练误差本身就很高?
  • 添加特征后性能提升?
  • 使用更复杂模型后性能提升?
  • 减少正则化后性能提升?

常见问题

Q1: 如何判断是过拟合还是欠拟合?

看训练误差和验证误差:

  • 两者都高 → 欠拟合
  • 训练低、验证高 → 过拟合

Q2: 什么时候应该停止增加模型复杂度?

当验证误差开始上升时停止。

Q3: 正则化参数如何选择?

使用交叉验证,选择使验证误差最小的参数。

Q4: 数据量对过拟合的影响?

更多数据 → 更难过拟合 → 可以使用更复杂的模型


总结

| 问题 | 特征 | 解决方案 |
| --- | --- | --- |
| 过拟合 | 训练好、测试差 | 正则化、更多数据、简化模型 |
| 欠拟合 | 训练差、测试差 | 增加特征、复杂模型、减少正则化 |

| 概念 | 含义 |
| --- | --- |
| 偏差 | 模型假设与真实的差距 |
| 方差 | 对训练数据的敏感度 |
| 权衡 | 降低一个往往会增加另一个 |

参考资料

  • 《机器学习》周志华 第2章
  • Andrew Ng Machine Learning Course
  • Geman, S. et al. (1992). “Neural networks and the bias/variance dilemma”

版权声明: 如无特别声明,本文版权归 sshipanoo 所有,转载请注明本文链接。

(采用 CC BY-NC-SA 4.0 许可协议进行授权)

本文标题:《 机器学习基础系列——过拟合与欠拟合 》

本文链接:http://localhost:3015/ai/%E8%BF%87%E6%8B%9F%E5%90%88%E4%B8%8E%E6%AC%A0%E6%8B%9F%E5%90%88.html

本文最后一次更新时间距今已久,文章中的某些内容可能已过时!