机器学习基础系列——交叉验证

前言

简单的训练集/测试集划分可能导致评估结果的方差较大。交叉验证通过多次划分来获得更稳定可靠的性能估计。

为什么需要交叉验证

简单划分的问题

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression

# Seed NumPy's global RNG so later np.random draws in this script are reproducible.
np.random.seed(42)

# Synthetic classification data: 200 samples, 20 features, 10 of them informative.
X, y = make_classification(
    n_samples=200,
    n_features=20,
    n_informative=10,
    random_state=42,
)

# Repeat a single 80/20 hold-out split under 100 different seeds and record
# the test accuracy obtained from each split.
scores = []
for seed in range(100):
    split = train_test_split(X, y, test_size=0.2, random_state=seed)
    X_train, X_test, y_train, y_test = split

    lr = LogisticRegression(random_state=42, max_iter=1000)
    lr.fit(X_train, y_train)
    scores.append(lr.score(X_test, y_test))

mean_score = np.mean(scores)
std_score = np.std(scores)

# Histogram of the 100 accuracies, with the mean marked by a dashed red line.
plt.figure(figsize=(10, 5))
plt.hist(scores, bins=20, edgecolor='black')
plt.axvline(mean_score, color='r', linestyle='--', label='均值')
plt.xlabel('测试准确率')
plt.ylabel('频数')
plt.title(f'100次随机划分的准确率分布\n均值={mean_score:.3f}, 标准差={std_score:.3f}')
plt.legend()
plt.show()

print(f"准确率范围: {min(scores):.3f} - {max(scores):.3f}")

K折交叉验证

原理

将数据分成K个大小大致相等的子集（折）
每次用K-1折训练，1折验证
重复K次，每折都作为一次验证集
最终性能是K次验证的平均

from sklearn.model_selection import KFold, cross_val_score

# Shuffled 5-fold splitter with a fixed seed.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# One subplot row per fold, showing which samples are held out for validation.
fig, axes = plt.subplots(5, 1, figsize=(12, 8))

fold_axes = zip(axes, kfold.split(X))
for fold_no, (ax, (train_idx, val_idx)) in enumerate(fold_axes, start=1):
    # Indicator vector: 1 marks a validation sample, 0 a training sample.
    indices = np.zeros(len(X))
    indices[val_idx] = 1

    ax.scatter(range(len(X)), indices, c=indices, cmap='coolwarm', s=10)
    ax.set_yticks([0, 1])
    ax.set_yticklabels(['训练', '验证'])
    ax.set_ylabel(f'Fold {fold_no}')

# Only the bottom subplot carries the x-axis label.
axes[-1].set_xlabel('样本索引')

plt.suptitle('5折交叉验证数据划分', fontsize=14)
plt.tight_layout()
plt.show()

使用sklearn

lr = LogisticRegression(random_state=42, max_iter=1000)

# Accuracy on each of 5 cross-validation folds.
cv_scores = cross_val_score(lr, X, y, cv=5, scoring='accuracy')
cv_mean = cv_scores.mean()
cv_std = cv_scores.std()

print("5折交叉验证结果:")
print(f"  各折准确率: {cv_scores}")
print(f"  平均准确率: {cv_mean:.4f} ± {cv_std:.4f}")

K值选择

K值	特点
小K（如2-3）	计算快，偏差大，方差小
大K（如10-20）	计算慢，偏差小，方差大
K=N（LOO）	最小偏差，最大方差

import pandas as pd

# Cross-validate the same estimator for several fold counts, recording the
# mean accuracy and its standard deviation for each K.
k_values = [2, 3, 5, 10, 20]
results = []

for k in k_values:
    scores = cross_val_score(lr, X, y, cv=k, scoring='accuracy')
    mean_acc, std_acc = scores.mean(), scores.std()
    results.append({'K': k, 'Mean': mean_acc, 'Std': std_acc})
    print(f"K={k}: {mean_acc:.4f} ± {std_acc:.4f}")

# Plot mean accuracy against K, with one-standard-deviation error bars.
df = pd.DataFrame(results)

fig, ax = plt.subplots(figsize=(10, 6))
ax.errorbar(df['K'], df['Mean'], yerr=df['Std'], marker='o', capsize=5)
ax.set_title('不同K值的交叉验证结果')
ax.set_xlabel('K值')
ax.set_ylabel('准确率')
ax.grid(True, alpha=0.3)
plt.show()

其他交叉验证方法

留一法（LOO）

from sklearn.model_selection import LeaveOneOut

loo = LeaveOneOut()
n_iterations = loo.get_n_splits(X)
print(f"LOO总共需要{n_iterations}次迭代")

# Small-dataset example: run leave-one-out on the first 50 samples only.
X_small = X[:50]
y_small = y[:50]
loo_scores = cross_val_score(lr, X_small, y_small, cv=loo)
loo_accuracy = loo_scores.mean()
print(f"LOO准确率: {loo_accuracy:.4f}")

分层K折（Stratified K-Fold）

保持每折中类别比例与原数据一致。

from sklearn.model_selection import StratifiedKFold

# Imbalanced data: class weights 0.9 / 0.1.
X_imb, y_imb = make_classification(n_samples=200, weights=[0.9, 0.1], random_state=42)

# Plain and stratified splitters with otherwise identical settings.
kfold_normal = KFold(n_splits=5, shuffle=True, random_state=42)
skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

print("各折中正类比例:")

# (heading, fold generator) pairs; only the stratified splitter receives the labels.
comparisons = [
    ("\n普通K折:", kfold_normal.split(X_imb)),
    ("\n分层K折:", skfold.split(X_imb, y_imb)),
]
for heading, folds in comparisons:
    print(heading)
    for fold_no, (train_idx, val_idx) in enumerate(folds, start=1):
        # Mean of the labels in the validation fold, reported as the positive-class ratio.
        ratio = y_imb[val_idx].mean()
        print(f"  Fold {fold_no}: {ratio:.2%}")

重复K折

from sklearn.model_selection import RepeatedKFold, RepeatedStratifiedKFold

# 5-fold cross-validation repeated 3 times.
rkfold = RepeatedKFold(n_splits=5, n_repeats=3, random_state=42)

scores = cross_val_score(lr, X, y, cv=rkfold)
score_mean = scores.mean()
score_std = scores.std()
print(f"重复K折 (5折×3次): {score_mean:.4f} ± {score_std:.4f}")
print(f"共{len(scores)}个分数")

分组K折

当数据有分组结构时使用（如同一用户的多条记录）。

from sklearn.model_selection import GroupKFold

# Simulated group labels: 20 users with 10 consecutive records each (200 rows).
user_ids = np.arange(20)
groups = np.repeat(user_ids, 10)

gkfold = GroupKFold(n_splits=5)

print("分组K折划分:")
for fold_no, (train_idx, val_idx) in enumerate(gkfold.split(X, y, groups), start=1):
    # Distinct group ids present on each side of the split.
    train_groups = set(groups[train_idx])
    val_groups = set(groups[val_idx])
    n_train, n_val = len(train_groups), len(val_groups)
    print(f"  Fold {fold_no}: 训练组{n_train}个, 验证组{n_val}个")
    print(f"    验证组: {sorted(val_groups)}")

时间序列交叉验证

from sklearn.model_selection import TimeSeriesSplit

# Random placeholder data for the time-series split illustration; only X_ts is
# passed to the splitter below.
n_samples = 100
X_ts = np.random.randn(n_samples, 5)
y_ts = np.random.randn(n_samples)

tscv = TimeSeriesSplit(n_splits=5)

fig, ax = plt.subplots(figsize=(12, 6))

for i, (train_idx, val_idx) in enumerate(tscv.split(X_ts)):
    fold_row = i + 1
    # Label only the first fold's artists so the legend gets exactly one
    # explicit entry per role. Passing a bare label list to legend() would
    # instead bind labels to artists by creation order, which breaks silently
    # if the scatter calls are ever reordered. Labels starting with an
    # underscore are skipped by the automatic legend.
    train_label = '训练' if i == 0 else '_nolegend_'
    val_label = '验证' if i == 0 else '_nolegend_'

    # Blue squares: training indices; red circles: validation indices.
    ax.scatter(train_idx, [fold_row] * len(train_idx), c='blue', marker='s',
               s=30, alpha=0.7, label=train_label)
    ax.scatter(val_idx, [fold_row] * len(val_idx), c='red', marker='o',
               s=30, label=val_label)

ax.set_xlabel('样本索引')
ax.set_ylabel('Fold')
ax.set_title('时间序列交叉验证')
ax.legend(loc='upper left')
plt.show()

交叉验证用于超参数调优

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

# Larger dataset (500 samples) for the search; this rebinds X and y.
X, y = make_classification(n_samples=500, n_features=20, random_state=42)

# Candidate values per hyperparameter: 3 * 4 * 3 = 36 combinations.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 5, 10],
}

rf = RandomForestClassifier(random_state=42)

# Grid search scored by 5-fold cross-validated accuracy, using all CPU cores.
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)
grid_search.fit(X, y)

best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f"\n最佳参数: {best_params}")
print(f"最佳CV分数: {best_score:.4f}")

# Full results table; show the five best-ranked parameter combinations.
results = pd.DataFrame(grid_search.cv_results_)
report_columns = ['params', 'mean_test_score', 'std_test_score', 'rank_test_score']
top_five = results.nsmallest(5, 'rank_test_score')[report_columns]
print("\nTop 5结果:")
print(top_five)

嵌套交叉验证

避免因超参数调优而得到过于乐观（过拟合）的性能估计。

from sklearn.model_selection import cross_val_score

# Outer loop: estimates generalization performance.
# Inner loop: selects hyperparameters.
inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# The grid search is itself the estimator that the outer loop evaluates.
grid_search_nested = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=inner_cv,
    scoring='accuracy',
    n_jobs=-1,
)

nested_scores = cross_val_score(grid_search_nested, X, y, cv=outer_cv, scoring='accuracy')
nested_mean = nested_scores.mean()
nested_std = nested_scores.std()

print(f"嵌套交叉验证分数: {nested_mean:.4f} ± {nested_std:.4f}")

# For comparison: best score of the earlier non-nested search, which may be optimistic.
non_nested_best = grid_search.best_score_
print(f"非嵌套（内层最佳）: {non_nested_best:.4f}")

交叉验证结果可视化

from sklearn.model_selection import cross_validate

# Evaluate several metrics at once with 10-fold cross-validation, keeping the
# training scores so they can be compared with the test scores.
scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

cv_results = cross_validate(
    LogisticRegression(max_iter=1000),
    X, y, cv=10,
    scoring=scoring,
    return_train_score=True
)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: distribution of each test metric across folds.
metrics = [f'test_{m}' for m in scoring]
test_scores = [cv_results[m] for m in metrics]

# Set tick labels on the axes instead of using boxplot(labels=...): that
# keyword was renamed to tick_labels in Matplotlib 3.9 and the old spelling is
# deprecated. set_xticklabels works on both old and new versions.
axes[0].boxplot(test_scores)
axes[0].set_xticklabels(scoring)
axes[0].set_ylabel('分数')
axes[0].set_title('各指标的交叉验证分布')
axes[0].grid(True, alpha=0.3)

# Right panel: per-fold training vs. test accuracy as paired bars.
train_scores = cv_results['train_accuracy']
test_scores_acc = cv_results['test_accuracy']

# Derive the fold count from the results so it cannot drift from the cv=
# argument above.
n_folds = len(train_scores)
x = np.arange(n_folds)
width = 0.35

axes[1].bar(x - width/2, train_scores, width, label='训练')
axes[1].bar(x + width/2, test_scores_acc, width, label='测试')
axes[1].set_xlabel('Fold')
axes[1].set_ylabel('准确率')
axes[1].set_title('训练vs测试准确率')
axes[1].legend()
axes[1].set_xticks(x)
axes[1].set_xticklabels([f'{i+1}' for i in range(n_folds)])

plt.tight_layout()
plt.show()

学习曲线

from sklearn.model_selection import learning_curve

# Cross-validated scores at ten training-set fractions from 10% to 100%.
train_sizes, train_scores, val_scores = learning_curve(
    LogisticRegression(max_iter=1000),
    X,
    y,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=5,
    scoring='accuracy',
)

# Aggregate along axis 1 to get one mean and one std per training size.
train_mean, train_std = train_scores.mean(axis=1), train_scores.std(axis=1)
val_mean, val_std = val_scores.mean(axis=1), val_scores.std(axis=1)

plt.figure(figsize=(10, 6))

# Each curve is a mean line plus a shaded band of one standard deviation.
curves = [
    (train_mean, train_std, 'blue', '训练分数'),
    (val_mean, val_std, 'orange', '验证分数'),
]
for mean, std, color, label in curves:
    plt.fill_between(train_sizes, mean - std, mean + std, alpha=0.1, color=color)
    plt.plot(train_sizes, mean, 'o-', color=color, label=label)

plt.title('学习曲线')
plt.xlabel('训练样本数')
plt.ylabel('准确率')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

验证曲线

from sklearn.model_selection import validation_curve

# Validation curve: how the hyperparameter C affects train and validation scores.
# Ten values of C, log-spaced between 1e-4 and 1e2.
param_range = np.logspace(-4, 2, 10)

train_scores, val_scores = validation_curve(
    LogisticRegression(max_iter=1000),
    X,
    y,
    param_name='C',
    param_range=param_range,
    cv=5,
    scoring='accuracy',
)

# Aggregate along axis 1 to get one mean and one std per value of C.
train_mean, train_std = train_scores.mean(axis=1), train_scores.std(axis=1)
val_mean, val_std = val_scores.mean(axis=1), val_scores.std(axis=1)

plt.figure(figsize=(10, 6))

# Mean line on a log-scaled x-axis, then a one-standard-deviation band.
series = [
    (train_mean, train_std, 'blue', '训练分数'),
    (val_mean, val_std, 'orange', '验证分数'),
]
for mean, std, color, label in series:
    plt.semilogx(param_range, mean, 'o-', color=color, label=label)
    plt.fill_between(param_range, mean - std, mean + std, alpha=0.1, color=color)

plt.title('验证曲线')
plt.xlabel('C (正则化参数)')
plt.ylabel('准确率')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

常见问题

Q1: 应该用多少折？

数据量	推荐K值
<100	LOO或10折
100-1000	10折
>1000	5折
计算昂贵	3-5折

Q2: 什么时候用分层K折？

分类任务（尤其是不平衡数据）
确保每折代表性

Q3: 交叉验证和测试集的关系？

原始数据 → 训练集 + 测试集（hold-out）
         ↓
      交叉验证用于训练集
         ↓
      最终模型在测试集评估

Q4: 如何处理数据泄露？

预处理（如标准化）应在每折内部进行
使用Pipeline确保正确

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Correct approach: bundle the scaler and the classifier into one Pipeline and
# cross-validate the pipeline as a whole, so scaling happens inside each fold.
steps = [
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression(max_iter=1000)),
]
pipe = Pipeline(steps)

cv_scores = cross_val_score(pipe, X, y, cv=5)
mean_accuracy = cv_scores.mean()
print(f"正确方式: {mean_accuracy:.4f}")

总结

方法	适用场景
K-Fold	通用
Stratified K-Fold	分类，不平衡
LOO	小数据集
GroupKFold	分组数据
TimeSeriesSplit	时间序列
嵌套CV	超参数调优评估

参考资料

scikit-learn 文档：Cross-validation
Kohavi, R. (1995). “A study of cross-validation and bootstrap for accuracy estimation and model selection”

（采用 CC BY-NC-SA 4.0 许可协议进行授权）

本文标题：《机器学习基础系列——交叉验证》

本文链接：http://localhost:3015/ai/%E4%BA%A4%E5%8F%89%E9%AA%8C%E8%AF%81.html

本文最后一次更新已是多天前，文章中的某些内容可能已过时！

机器学习基础系列——交叉验证

可靠的模型评估方法

机器学习基础系列

前言

为什么需要交叉验证

简单划分的问题

K折交叉验证

原理

使用sklearn

K值选择

其他交叉验证方法

留一法（LOO）

分层K折（Stratified K-Fold）

重复K折

分组K折

时间序列交叉验证

交叉验证用于超参数调优

嵌套交叉验证

交叉验证结果可视化

学习曲线

验证曲线

常见问题

Q1: 应该用多少折？

Q2: 什么时候用分层K折？

Q3: 交叉验证和测试集的关系？

Q4: 如何处理数据泄露？

总结

参考资料

目录