偏差-方差权衡

前言

机器学习的核心挑战是让模型在未见数据上表现良好(泛化)。过拟合和欠拟合是影响泛化的两个主要问题,理解偏差-方差权衡是解决这些问题的关键。


基本概念

训练误差 vs 测试误差

import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

np.random.seed(42)

# Generate noisy samples from y = sin(x) on [0, 10]
n_samples = 50
X = np.sort(np.random.rand(n_samples) * 10).reshape(-1, 1)
y = np.sin(X).ravel() + np.random.randn(n_samples) * 0.3

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit polynomial models of increasing degree and record train/test MSE
degrees = range(1, 15)
train_errors = []
test_errors = []

for degree in degrees:
    model = Pipeline([
        ('poly', PolynomialFeatures(degree=degree)),
        ('linear', LinearRegression())
    ])
    model.fit(X_train, y_train)

    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)

    train_errors.append(mean_squared_error(y_train, train_pred))
    test_errors.append(mean_squared_error(y_test, test_pred))

# Plot error curves against model complexity
plt.figure(figsize=(10, 6))
plt.plot(degrees, train_errors, 'b-o', label='训练误差')
plt.plot(degrees, test_errors, 'r-o', label='测试误差')
plt.xlabel('多项式次数')
plt.ylabel('MSE')
plt.title('模型复杂度与误差')
plt.grid(True, alpha=0.3)

# Shade the under-/over-fitting regions BEFORE calling legend(); the
# original called legend() first, so these labels never appeared in it.
plt.axvspan(0.5, 3.5, alpha=0.2, color='blue', label='欠拟合区')
plt.axvspan(8.5, 14.5, alpha=0.2, color='red', label='过拟合区')
plt.legend()
plt.show()

三种情况对比

# Compare an underfit, a well-fit, and an overfit polynomial side by side.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

X_plot = np.linspace(0, 10, 100).reshape(-1, 1)

# (polynomial degree, panel caption, curve colour) per scenario
scenarios = [
    (1, '欠拟合 (degree=1)', 'blue'),
    (4, '适度拟合 (degree=4)', 'green'),
    (15, '过拟合 (degree=15)', 'red')
]

for panel, (deg, caption, line_color) in zip(axes, scenarios):
    pipe = Pipeline([
        ('poly', PolynomialFeatures(degree=deg)),
        ('linear', LinearRegression())
    ])
    pipe.fit(X_train, y_train)

    fitted_curve = pipe.predict(X_plot)

    panel.scatter(X_train, y_train, c='blue', alpha=0.5, label='训练数据')
    panel.scatter(X_test, y_test, c='red', marker='x', alpha=0.5, label='测试数据')
    panel.plot(X_plot, fitted_curve, color=line_color, linewidth=2, label='模型')
    panel.plot(X_plot, np.sin(X_plot), 'k--', alpha=0.5, label='真实函数')

    mse_tr = mean_squared_error(y_train, pipe.predict(X_train))
    mse_te = mean_squared_error(y_test, pipe.predict(X_test))

    panel.set_title(f'{caption}\nTrain MSE={mse_tr:.3f}, Test MSE={mse_te:.3f}')
    panel.legend()
    panel.set_ylim(-2, 2)

plt.tight_layout()
plt.show()
| 状态 | 训练误差 | 测试误差 | 问题 |
| --- | --- | --- | --- |
| 欠拟合 | 高 | 高 | 模型太简单 |
| 适度拟合 | 低 | 低 | 理想状态 |
| 过拟合 | 很低 | 高 | 模型太复杂 |

偏差-方差分解

理论

对于任意预测点 $x$,期望预测误差可以分解为:

\[E[(y - \hat{f}(x))^2] = \text{Bias}^2 + \text{Variance} + \text{Noise}\]
  • 偏差(Bias):模型假设与真实函数的差距
  • 方差(Variance):模型对训练数据的敏感程度
  • 噪声(Noise):数据本身的随机性(不可约)

可视化偏差-方差

from sklearn.utils import resample

n_models = 100
n_samples = 30
X_full = np.linspace(0, 10, 1000).reshape(-1, 1)

def fit_multiple_models(degree, n_models=100):
    """Fit the same polynomial pipeline on many bootstrap resamples.

    Returns an array of shape (n_models, len(X_full)): one row of
    predictions over the dense grid X_full per bootstrapped model.
    """
    all_preds = []
    for _ in range(n_models):
        # draw a fresh bootstrap sample (with replacement) from X, y
        Xb, yb = resample(X, y, n_samples=n_samples, random_state=None)

        pipe = Pipeline([
            ('poly', PolynomialFeatures(degree=degree)),
            ('linear', LinearRegression())
        ])
        pipe.fit(Xb, yb)
        all_preds.append(pipe.predict(X_full).ravel())

    return np.array(all_preds)

# Visualize bias/variance behaviour for three model complexities.
# Top row: individual bootstrap fits, their average, and the true function.
# Bottom row: pointwise Bias² and Variance of those fits across the grid.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

degrees = [1, 3, 10]

for i, degree in enumerate(degrees):
    # (n_models, len(X_full)) predictions from bootstrap-resampled fits
    preds = fit_multiple_models(degree, n_models=50)
    
    # Top row: overlay a subset of the individual model predictions
    ax = axes[0, i]
    for pred in preds[:20]:
        ax.plot(X_full, pred, 'b-', alpha=0.2)
    ax.plot(X_full, np.sin(X_full), 'r-', linewidth=2, label='真实函数')
    ax.plot(X_full, preds.mean(axis=0), 'g-', linewidth=2, label='平均预测')
    ax.scatter(X, y, c='black', s=20, alpha=0.3, label='数据')
    ax.set_title(f'Degree={degree}')
    ax.legend()
    ax.set_ylim(-2, 2)
    
    # Bottom row: bias and variance distribution across x
    ax = axes[1, i]
    true_values = np.sin(X_full).ravel()
    mean_pred = preds.mean(axis=0)
    
    # Bias² = squared gap between the average prediction and the truth;
    # Variance = spread of individual predictions around their mean.
    bias_sq = (mean_pred - true_values) ** 2
    variance = preds.var(axis=0)
    
    ax.fill_between(X_full.ravel(), 0, bias_sq, alpha=0.5, label=f'Bias²={bias_sq.mean():.3f}')
    ax.fill_between(X_full.ravel(), 0, variance, alpha=0.5, label=f'Var={variance.mean():.3f}')
    ax.set_title(f'Degree={degree}: Bias² + Variance')
    ax.legend()
    ax.set_ylim(0, 1)

plt.tight_layout()
plt.show()

偏差-方差权衡

# Empirically estimate bias and variance at each polynomial degree.
degrees = range(1, 12)
biases = []
variances = []
total_errors = []

# ground truth over the dense grid (loop-invariant, computed once)
truth = np.sin(X_full).ravel()

for d in degrees:
    boot_preds = fit_multiple_models(d, n_models=30)
    avg_pred = boot_preds.mean(axis=0)

    b2 = ((avg_pred - truth) ** 2).mean()
    var = boot_preds.var(axis=0).mean()

    # store on a square-root scale so the curves are easier to read
    biases.append(np.sqrt(b2))
    variances.append(np.sqrt(var))
    total_errors.append(np.sqrt(b2 + var))

plt.figure(figsize=(10, 6))
plt.plot(degrees, biases, 'b-o', label='Bias')
plt.plot(degrees, variances, 'r-o', label='Variance')
plt.plot(degrees, total_errors, 'g-o', label='Total Error')

plt.xlabel('模型复杂度 (多项式次数)')
plt.ylabel('误差')
plt.title('偏差-方差权衡')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

诊断过拟合与欠拟合

学习曲线诊断

from sklearn.model_selection import learning_curve

def plot_learning_curves(estimator, X, y, title):
    """Plot train/validation MSE against training-set size.

    Draws onto the CURRENT pyplot axes — the caller selects the target
    axes beforehand (e.g. via plt.sca).

    Parameters:
        estimator: sklearn estimator/pipeline to evaluate.
        X, y: full dataset; learning_curve does the CV splitting (cv=5).
        title: title for the axes.
    """
    train_sizes, train_scores, val_scores = learning_curve(
        estimator, X, y, train_sizes=np.linspace(0.1, 1.0, 10),
        cv=5, scoring='neg_mean_squared_error'
    )
    
    # scores are NEGATED MSE, so flip the sign to plot actual MSE
    train_mean = -train_scores.mean(axis=1)
    train_std = train_scores.std(axis=1)
    val_mean = -val_scores.mean(axis=1)
    val_std = val_scores.std(axis=1)
    
    # shaded band = ±1 std across the CV folds
    plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.1, color='blue')
    plt.fill_between(train_sizes, val_mean - val_std, val_mean + val_std, alpha=0.1, color='orange')
    plt.plot(train_sizes, train_mean, 'o-', color='blue', label='训练')
    plt.plot(train_sizes, val_mean, 'o-', color='orange', label='验证')
    plt.xlabel('训练样本数')
    plt.ylabel('MSE')
    plt.title(title)
    plt.legend()
    plt.grid(True, alpha=0.3)

fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# one (pipeline, caption) pair per fitting regime
models = [
    (Pipeline([('poly', PolynomialFeatures(1)), ('lr', LinearRegression())]), '欠拟合 (degree=1)'),
    (Pipeline([('poly', PolynomialFeatures(4)), ('lr', LinearRegression())]), '适度拟合 (degree=4)'),
    (Pipeline([('poly', PolynomialFeatures(15)), ('lr', LinearRegression())]), '过拟合 (degree=15)')
]

for ax, (model, title) in zip(axes, models):
    # make `ax` the current axes so plot_learning_curves draws into it
    plt.sca(ax)
    plot_learning_curves(model, X, y, title)

plt.tight_layout()
plt.show()

学习曲线解读

| 模式 | 训练误差 | 验证误差 | 诊断 |
| --- | --- | --- | --- |
| 两者都高且接近 | 高 | 高 | 欠拟合 |
| 两者都低且接近 | 低 | 低 | 理想 |
| 训练低、验证高 | 低 | 高 | 过拟合 |

解决过拟合

方法一:增加数据

# Effect of sample size: the same high-capacity (degree-10) polynomial is
# fitted on progressively larger datasets — more data tames overfitting.
sample_sizes = [20, 50, 100, 200, 500]

fig, axes = plt.subplots(1, len(sample_sizes), figsize=(20, 4))

for panel, size in zip(axes, sample_sizes):
    # fresh noisy sine data with `size` samples
    X_large = np.sort(np.random.rand(size) * 10).reshape(-1, 1)
    y_large = np.sin(X_large).ravel() + np.random.randn(size) * 0.3

    X_tr, X_te, y_tr, y_te = train_test_split(X_large, y_large, test_size=0.3, random_state=42)

    deg10 = Pipeline([
        ('poly', PolynomialFeatures(10)),
        ('linear', LinearRegression())
    ])
    deg10.fit(X_tr, y_tr)

    X_plot = np.linspace(0, 10, 100).reshape(-1, 1)
    fit_curve = deg10.predict(X_plot)

    panel.scatter(X_tr, y_tr, c='blue', alpha=0.3, s=20)
    panel.plot(X_plot, fit_curve, 'r-', linewidth=2)
    panel.plot(X_plot, np.sin(X_plot), 'g--', alpha=0.5)

    test_mse = mean_squared_error(y_te, deg10.predict(X_te))
    panel.set_title(f'N={size}, Test MSE={test_mse:.3f}')
    panel.set_ylim(-2, 2)

plt.tight_layout()
plt.show()

方法二:正则化

from sklearn.linear_model import Ridge, Lasso

# Fit the degree-10 polynomial expansion ONCE on the training data and
# reuse the fitted transformer for the test/plot splits.  The original
# called fit_transform on every split — harmless for PolynomialFeatures,
# but an anti-pattern that silently breaks for stateful transformers
# (e.g. StandardScaler), so fit-on-train / transform-elsewhere is used.
poly10 = PolynomialFeatures(10)
X_train_p = poly10.fit_transform(X_train)
X_test_p = poly10.transform(X_test)
X_plot_p = poly10.transform(X_plot)

fig, axes = plt.subplots(1, 4, figsize=(16, 4))

# (estimator, panel title): increasing L2 regularization strength
models = [
    (LinearRegression(), '无正则化'),
    (Ridge(alpha=0.01), 'Ridge α=0.01'),
    (Ridge(alpha=1.0), 'Ridge α=1.0'),
    (Ridge(alpha=100), 'Ridge α=100')
]

for ax, (model, title) in zip(axes, models):
    model.fit(X_train_p, y_train)

    y_plot = model.predict(X_plot_p)

    ax.scatter(X_train, y_train, c='blue', alpha=0.5)
    ax.plot(X_plot, y_plot, 'r-', linewidth=2)
    ax.plot(X_plot, np.sin(X_plot), 'g--', alpha=0.5)

    train_mse = mean_squared_error(y_train, model.predict(X_train_p))
    test_mse = mean_squared_error(y_test, model.predict(X_test_p))

    ax.set_title(f'{title}\nTrain={train_mse:.3f}, Test={test_mse:.3f}')
    ax.set_ylim(-2, 2)

plt.tight_layout()
plt.show()

方法三:降低模型复杂度

  • 减少特征数量
  • 使用更简单的模型
  • 减少神经网络层数/神经元

方法四:早停(神经网络)

# Simulate a training run: training loss keeps falling while validation
# loss bottoms out and then creeps up — the classic early-stopping picture.
np.random.seed(42)

epochs = 200
train_losses = []
val_losses = []

# Synthesize both curves: shared exponential decay + noise; validation
# additionally drifts upward over time to mimic overfitting.
for epoch in range(epochs):
    decay = 1.0 * np.exp(-epoch/30)
    train_losses.append(decay + 0.1 + np.random.randn() * 0.02)
    val_losses.append(decay + 0.15 + 0.003 * epoch + np.random.randn() * 0.02)

plt.figure(figsize=(10, 6))
plt.plot(train_losses, label='训练损失')
plt.plot(val_losses, label='验证损失')

# mark the epoch with the lowest validation loss
best_epoch = np.argmin(val_losses)
plt.axvline(best_epoch, color='r', linestyle='--', label=f'早停点 (epoch={best_epoch})')

plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('早停法示意')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

方法五:Dropout(神经网络)

# Dropout schematic: left panel a fully-connected net, right panel the
# same net with ~30% of each hidden layer's units dropped.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# -- Without dropout: draw every unit and every connection --------------
ax = axes[0]
layers = [4, 6, 6, 2]  # units per layer: input, hidden, hidden, output
for i, (n1, n2) in enumerate(zip(layers[:-1], layers[1:])):
    for j in range(n1):
        for k in range(n2):
            ax.plot([i, i+1], [j/(n1-1), k/(n2-1)], 'b-', alpha=0.3)

for i, n in enumerate(layers):
    for j in range(n):
        ax.scatter(i, j/(n-1), c='blue', s=200, zorder=5)

ax.set_title('无Dropout')
ax.axis('off')

# -- With dropout: hide edges touching a dropped hidden unit ------------
ax = axes[1]
np.random.seed(42)
# one boolean mask per HIDDEN layer; True = unit kept (drop prob 0.3)
dropout_mask = [np.random.rand(n) > 0.3 for n in layers[1:-1]]  # 30% dropout

for i, (n1, n2) in enumerate(zip(layers[:-1], layers[1:])):
    for j in range(n1):
        for k in range(n2):
            # The original used a nested conditional expression whose
            # meaning hinged on ternary precedence; spelled out plainly:
            # input/output layers are never dropped, hidden units are
            # kept only where their mask entry is True.
            src_kept = i == 0 or dropout_mask[i - 1][j]
            dst_kept = i >= len(dropout_mask) or dropout_mask[i][k]
            if src_kept and dst_kept:
                ax.plot([i, i+1], [j/(n1-1), k/(n2-1)], 'b-', alpha=0.3)

for i, n in enumerate(layers):
    for j in range(n):
        if 0 < i < len(layers) - 1:
            # hidden units: grey out the dropped ones
            kept = dropout_mask[i - 1][j]
            color = 'blue' if kept else 'gray'
            alpha = 1.0 if kept else 0.3
        else:
            color = 'blue'
            alpha = 1.0
        ax.scatter(i, j/(n-1), c=color, s=200, zorder=5, alpha=alpha)

ax.set_title('Dropout (p=0.3)')
ax.axis('off')

plt.tight_layout()
plt.show()

方法六:数据增强

from sklearn.datasets import load_digits
from scipy.ndimage import rotate, shift

# Data-augmentation showcase on a single 8x8 digit image.
digits = load_digits()
original = digits.images[0]

fig, axes = plt.subplots(2, 4, figsize=(12, 6))

# (caption, transformed image) pairs; the noise entries draw random
# numbers, so the list order is deliberately kept as-is.
augmentations = [
    ('原始', original),
    ('旋转10°', rotate(original, 10, reshape=False)),
    ('旋转-10°', rotate(original, -10, reshape=False)),
    ('平移', shift(original, [1, 1])),
    ('噪声', original + np.random.randn(*original.shape) * 0.5),
    ('缩放', original * 1.2),
    ('翻转', np.fliplr(original)),
    ('组合', rotate(original + np.random.randn(*original.shape) * 0.3, 5, reshape=False))
]

for panel, (caption, image) in zip(axes.ravel(), augmentations):
    panel.imshow(image, cmap='gray')
    panel.set_title(caption)
    panel.axis('off')

plt.suptitle('数据增强示例', fontsize=14)
plt.tight_layout()
plt.show()

解决欠拟合

方法一:增加特征

from sklearn.preprocessing import PolynomialFeatures

# Feature-engineering demo: fit a LINEAR model on increasingly rich
# polynomial features of a 1-D input whose true relation is quadratic.
X_1d = np.random.rand(100, 1) * 10
y_1d = X_1d.ravel() ** 2 + np.random.randn(100) * 5

fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# number of polynomial features (equals the degree, as include_bias=False)
features = [1, 2, 3]

for ax, n_features in zip(axes, features):
    if n_features == 1:
        # raw feature only — no transformer needed
        X_feat = X_1d
    else:
        poly = PolynomialFeatures(n_features, include_bias=False)
        X_feat = poly.fit_transform(X_1d)
    
    lr = LinearRegression()
    lr.fit(X_feat, y_1d)
    
    X_plot = np.linspace(0, 10, 100).reshape(-1, 1)
    if n_features == 1:
        X_plot_feat = X_plot
    else:
        # reuse the transformer fitted in the branch above (same iteration)
        X_plot_feat = poly.transform(X_plot)
    
    y_plot = lr.predict(X_plot_feat)
    
    ax.scatter(X_1d, y_1d, alpha=0.5)
    ax.plot(X_plot, y_plot, 'r-', linewidth=2)
    ax.plot(X_plot, X_plot**2, 'g--', alpha=0.5, label='真实函数')
    
    # R² evaluated on the training data itself (no held-out split here)
    r2 = lr.score(X_feat, y_1d)
    ax.set_title(f'特征数={n_features}, R²={r2:.3f}')
    ax.legend()

plt.tight_layout()
plt.show()

方法二:使用更复杂的模型

from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

# Fit models of increasing capacity on the full noisy-sine dataset.
models = [
    ('线性回归', LinearRegression()),
    ('多项式回归', Pipeline([('poly', PolynomialFeatures(5)), ('lr', LinearRegression())])),
    ('随机森林', RandomForestRegressor(n_estimators=100, random_state=42)),
    ('SVR (RBF)', SVR(kernel='rbf', C=100))
]

fig, axes = plt.subplots(1, 4, figsize=(16, 4))

for panel, (label, estimator) in zip(axes, models):
    estimator.fit(X, y)
    fitted_curve = estimator.predict(X_plot)

    panel.scatter(X, y, alpha=0.5)
    panel.plot(X_plot, fitted_curve, 'r-', linewidth=2)
    panel.plot(X_plot, np.sin(X_plot), 'g--', alpha=0.5)

    panel.set_title(label)
    panel.set_ylim(-2, 2)

plt.tight_layout()
plt.show()

方法三:减少正则化

方法四:训练更长时间(神经网络)


实用检查清单

过拟合检查

  • 训练误差远低于验证误差?
  • 增加数据后性能提升?
  • 增加正则化后性能提升?
  • 简化模型后性能提升?

欠拟合检查

  • 训练误差本身就很高?
  • 添加特征后性能提升?
  • 使用更复杂模型后性能提升?
  • 减少正则化后性能提升?

常见问题

Q1: 如何判断是过拟合还是欠拟合?

看训练误差和验证误差:

  • 两者都高 → 欠拟合
  • 训练低、验证高 → 过拟合

Q2: 什么时候应该停止增加模型复杂度?

当验证误差开始上升时停止。

Q3: 正则化参数如何选择?

使用交叉验证,选择使验证误差最小的参数。

Q4: 数据量对过拟合的影响?

更多数据 → 更难过拟合 → 可以使用更复杂的模型


总结

| 问题 | 特征 | 解决方案 |
| --- | --- | --- |
| 过拟合 | 训练好、测试差 | 正则化、更多数据、简化模型 |
| 欠拟合 | 训练差、测试差 | 增加特征、复杂模型、减少正则化 |

| 概念 | 含义 |
| --- | --- |
| 偏差 | 模型假设与真实的差距 |
| 方差 | 对训练数据的敏感度 |
| 权衡 | 降低一个往往会增加另一个 |

参考资料

  • 《机器学习》周志华 第2章
  • Andrew Ng Machine Learning Course
  • Geman, S. et al. (1992). “Neural networks and the bias/variance dilemma”

版权声明: 如无特别声明,本文版权归 sshipanoo 所有,转载请注明本文链接。

(采用 CC BY-NC-SA 4.0 许可协议进行授权)

本文标题:《 机器学习基础系列——过拟合与欠拟合 》

本文链接:http://localhost:3015/ai/%E8%BF%87%E6%8B%9F%E5%90%88%E4%B8%8E%E6%AC%A0%E6%8B%9F%E5%90%88.html

本文最后一次更新时间距今已久,文章中的某些内容可能已过时!