已经是最新一篇文章了!
已经是最后一篇文章了!
可靠的模型评估方法
前言
简单的训练集/测试集划分可能导致评估结果的方差较大。交叉验证通过多次划分来获得更稳定可靠的性能估计。
为什么需要交叉验证
简单划分的问题
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression

np.random.seed(42)

# Generate a synthetic binary-classification dataset.
X, y = make_classification(n_samples=200, n_features=20, n_informative=10,
                           random_state=42)

# Repeat a simple 80/20 hold-out split with 100 different seeds to show
# how strongly the measured test accuracy depends on the particular split.
scores = []
for seed in range(100):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=seed)
    lr = LogisticRegression(random_state=42, max_iter=1000)
    lr.fit(X_train, y_train)
    scores.append(lr.score(X_test, y_test))

# Histogram of the 100 accuracies with the mean marked.
plt.figure(figsize=(10, 5))
plt.hist(scores, bins=20, edgecolor='black')
plt.xlabel('测试准确率')
plt.ylabel('频数')
plt.title(f'100次随机划分的准确率分布\n均值={np.mean(scores):.3f}, 标准差={np.std(scores):.3f}')
plt.axvline(np.mean(scores), color='r', linestyle='--', label='均值')
plt.legend()
plt.show()
print(f"准确率范围: {min(scores):.3f} - {max(scores):.3f}")
K折交叉验证
原理
- 将数据分成K个大小相等的子集(折)
- 每次用K-1折训练,1折验证
- 重复K次,每折都作为一次验证集
- 最终性能是K次验证的平均
from sklearn.model_selection import KFold, cross_val_score

# 5-fold CV with shuffling; fixed seed makes the fold assignment reproducible.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Visualize which samples fall into the validation fold on each split.
fig, axes = plt.subplots(5, 1, figsize=(12, 8))
for i, (train_idx, val_idx) in enumerate(kfold.split(X)):
    ax = axes[i]
    # Indicator array: 1 marks validation samples, 0 marks training samples.
    indices = np.zeros(len(X))
    indices[val_idx] = 1
    ax.scatter(range(len(X)), indices, c=indices, cmap='coolwarm', s=10)
    ax.set_yticks([0, 1])
    ax.set_yticklabels(['训练', '验证'])
    ax.set_ylabel(f'Fold {i+1}')
    if i == 4:
        ax.set_xlabel('样本索引')
plt.suptitle('5折交叉验证数据划分', fontsize=14)
plt.tight_layout()
plt.show()
使用sklearn
# No manual fitting needed: cross_val_score clones the estimator and
# trains/evaluates it once per fold.
lr = LogisticRegression(max_iter=1000, random_state=42)

# 5-fold cross-validation on accuracy (stratified folds are the sklearn
# default when the estimator is a classifier and cv is an int).
cv_scores = cross_val_score(estimator=lr, X=X, y=y, cv=5, scoring='accuracy')

print("5折交叉验证结果:")
print(f" 各折准确率: {cv_scores}")
print(f" 平均准确率: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")
K值选择
| K值 | 特点 |
|---|---|
| 小K(如2-3) | 计算快,偏差大,方差小 |
| 大K(如10-20) | 计算慢,偏差小,方差大 |
| K=N(LOO) | 最小偏差,最大方差 |
# Compare cross-validation estimates for different fold counts K.
k_values = [2, 3, 5, 10, 20]
results = []
for k in k_values:
    scores = cross_val_score(lr, X, y, cv=k, scoring='accuracy')
    results.append({
        'K': k,
        'Mean': scores.mean(),
        'Std': scores.std()
    })
    print(f"K={k}: {scores.mean():.4f} ± {scores.std():.4f}")

# Visualize mean ± std of CV accuracy as a function of K.
import pandas as pd
df = pd.DataFrame(results)
fig, ax = plt.subplots(figsize=(10, 6))
ax.errorbar(df['K'], df['Mean'], yerr=df['Std'], marker='o', capsize=5)
ax.set_xlabel('K值')
ax.set_ylabel('准确率')
ax.set_title('不同K值的交叉验证结果')
ax.grid(True, alpha=0.3)
plt.show()
其他交叉验证方法
留一法(LOO)
from sklearn.model_selection import LeaveOneOut

# Leave-one-out CV: one validation sample per split, so the number of
# splits equals the number of samples.
loo = LeaveOneOut()
n_iterations = loo.get_n_splits(X)
print(f"LOO总共需要{n_iterations}次迭代")

# Restrict the demo to the first 50 samples to keep the N model fits cheap.
X_small = X[:50]
y_small = y[:50]
loo_scores = cross_val_score(lr, X_small, y_small, cv=loo)
print(f"LOO准确率: {loo_scores.mean():.4f}")
分层K折(Stratified K-Fold)
保持每折中类别比例与原数据一致。
from sklearn.model_selection import StratifiedKFold

# Imbalanced dataset: roughly 90% negative / 10% positive samples.
X_imb, y_imb = make_classification(n_samples=200, weights=[0.9, 0.1], random_state=42)

# Plain K-fold splits without looking at the labels.
kfold_normal = KFold(n_splits=5, shuffle=True, random_state=42)
# Stratified K-fold keeps each fold's class ratio close to the overall ratio.
skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

print("各折中正类比例:")
print("\n普通K折:")
for i, (train_idx, val_idx) in enumerate(kfold_normal.split(X_imb)):
    # Mean of the binary labels = fraction of positives in the fold.
    ratio = y_imb[val_idx].mean()
    print(f" Fold {i+1}: {ratio:.2%}")
print("\n分层K折:")
for i, (train_idx, val_idx) in enumerate(skfold.split(X_imb, y_imb)):
    ratio = y_imb[val_idx].mean()
    print(f" Fold {i+1}: {ratio:.2%}")
重复K折
from sklearn.model_selection import RepeatedKFold, RepeatedStratifiedKFold

# Repeat 5-fold CV three times with different shuffles — 15 scores in total,
# which smooths out the luck of any single fold assignment.
repeated_cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=42)
scores = cross_val_score(lr, X, y, cv=repeated_cv)
print(f"重复K折 (5折×3次): {scores.mean():.4f} ± {scores.std():.4f}")
print(f"共{len(scores)}个分数")
分组K折
当数据有分组结构时使用(如同一用户的多条记录)。
from sklearn.model_selection import GroupKFold

# Simulated grouping: 20 users with 10 records each (200 rows total,
# matching the 200-sample X/y above).
groups = np.repeat(np.arange(20), 10)
gkfold = GroupKFold(n_splits=5)

# GroupKFold guarantees a group never appears in both the training and the
# validation side of the same split.
print("分组K折划分:")
for i, (train_idx, val_idx) in enumerate(gkfold.split(X, y, groups)):
    train_groups = set(groups[train_idx])
    val_groups = set(groups[val_idx])
    print(f" Fold {i+1}: 训练组{len(train_groups)}个, 验证组{len(val_groups)}个")
    print(f" 验证组: {sorted(val_groups)}")
时间序列交叉验证
from sklearn.model_selection import TimeSeriesSplit

# Synthetic time-series data: 100 time steps, 5 features.
n_samples = 100
X_ts = np.random.randn(n_samples, 5)
y_ts = np.random.randn(n_samples)

# TimeSeriesSplit always trains on the past and validates on the future,
# with the training window growing fold by fold.
tscv = TimeSeriesSplit(n_splits=5)

fig, ax = plt.subplots(figsize=(12, 6))
for i, (train_idx, val_idx) in enumerate(tscv.split(X_ts)):
    # Blue squares: training indices; red dots: validation indices.
    ax.scatter(train_idx, [i+1] * len(train_idx), c='blue', marker='s', s=30, alpha=0.7)
    ax.scatter(val_idx, [i+1] * len(val_idx), c='red', marker='o', s=30)
ax.set_xlabel('样本索引')
ax.set_ylabel('Fold')
ax.set_title('时间序列交叉验证')
ax.legend(['训练', '验证'], loc='upper left')
plt.show()
交叉验证用于超参数调优
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

# Fresh, larger dataset for the tuning demo.
X, y = make_classification(n_samples=500, n_features=20, random_state=42)

# Hyperparameter grid: 3 * 4 * 3 = 36 candidate configurations.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 5, 10]
}
rf = RandomForestClassifier(random_state=42)

# Exhaustive search: every candidate is scored by 5-fold CV, in parallel.
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy',
                           n_jobs=-1, verbose=1)
grid_search.fit(X, y)

print(f"\n最佳参数: {grid_search.best_params_}")
print(f"最佳CV分数: {grid_search.best_score_:.4f}")

# Inspect all candidates; rank 1 is best, so the 5 smallest ranks are the
# top-5 configurations.
results = pd.DataFrame(grid_search.cv_results_)
print("\nTop 5结果:")
print(results.nsmallest(5, 'rank_test_score')[['params', 'mean_test_score', 'std_test_score', 'rank_test_score']])
嵌套交叉验证
避免超参数调优导致的过拟合评估。
from sklearn.model_selection import cross_val_score

# Nested CV: the outer loop estimates generalization performance while the
# inner loop (inside GridSearchCV) selects hyperparameters — the reported
# score is never computed on data that influenced the tuning.
outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Inner CV handles hyperparameter selection.
grid_search_nested = GridSearchCV(rf, param_grid, cv=inner_cv,
                                  scoring='accuracy', n_jobs=-1)
# Outer CV evaluates the whole tuned pipeline.
nested_scores = cross_val_score(grid_search_nested, X, y,
                                cv=outer_cv, scoring='accuracy')

print(f"嵌套交叉验证分数: {nested_scores.mean():.4f} ± {nested_scores.std():.4f}")
# The non-nested best CV score is typically optimistically biased.
print(f"非嵌套(内层最佳): {grid_search.best_score_:.4f}")
交叉验证结果可视化
from sklearn.model_selection import cross_validate

# 10-fold CV scored against several metrics in a single pass.
scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
cv_results = cross_validate(LogisticRegression(max_iter=1000), X, y,
                            cv=10, scoring=scoring, return_train_score=True)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: distribution of each test metric over the 10 folds.
metrics = [f'test_{m}' for m in scoring]
test_scores = [cv_results[m] for m in metrics]
axes[0].boxplot(test_scores, labels=scoring)
axes[0].set_ylabel('分数')
axes[0].set_title('各指标的交叉验证分布')
axes[0].grid(True, alpha=0.3)

# Right panel: per-fold training vs. test accuracy, side by side.
train_scores = cv_results['train_accuracy']
test_scores_acc = cv_results['test_accuracy']
fold_pos = np.arange(10)
width = 0.35
axes[1].bar(fold_pos - width/2, train_scores, width, label='训练')
axes[1].bar(fold_pos + width/2, test_scores_acc, width, label='测试')
axes[1].set_xlabel('Fold')
axes[1].set_ylabel('准确率')
axes[1].set_title('训练vs测试准确率')
axes[1].legend()
axes[1].set_xticks(fold_pos)
axes[1].set_xticklabels([str(fold + 1) for fold in range(10)])
plt.tight_layout()
plt.show()
学习曲线
from sklearn.model_selection import learning_curve

# Learning curve: CV accuracy as a function of the training-set size,
# sampled at 10 sizes from 10% to 100% of the training data.
train_sizes, train_scores, val_scores = learning_curve(
    LogisticRegression(max_iter=1000), X, y,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=5, scoring='accuracy'
)

# Collapse the per-fold columns into mean ± std per training size.
mu_train = train_scores.mean(axis=1)
sd_train = train_scores.std(axis=1)
mu_val = val_scores.mean(axis=1)
sd_val = val_scores.std(axis=1)

plt.figure(figsize=(10, 6))
# Shaded bands show ±1 std around each curve.
plt.fill_between(train_sizes, mu_train - sd_train, mu_train + sd_train, alpha=0.1, color='blue')
plt.fill_between(train_sizes, mu_val - sd_val, mu_val + sd_val, alpha=0.1, color='orange')
plt.plot(train_sizes, mu_train, 'o-', color='blue', label='训练分数')
plt.plot(train_sizes, mu_val, 'o-', color='orange', label='验证分数')
plt.xlabel('训练样本数')
plt.ylabel('准确率')
plt.title('学习曲线')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
验证曲线
from sklearn.model_selection import validation_curve

# Validation curve: how accuracy responds to the regularization strength C,
# scanned over 10 log-spaced values from 1e-4 to 1e2.
param_range = np.logspace(-4, 2, 10)
train_scores, val_scores = validation_curve(
    LogisticRegression(max_iter=1000), X, y,
    param_name='C', param_range=param_range,
    cv=5, scoring='accuracy'
)

# Summarize each parameter value as mean ± std over the 5 folds.
mu_train = train_scores.mean(axis=1)
sd_train = train_scores.std(axis=1)
mu_val = val_scores.mean(axis=1)
sd_val = val_scores.std(axis=1)

plt.figure(figsize=(10, 6))
# Log-scaled x-axis to match the log-spaced parameter grid.
plt.semilogx(param_range, mu_train, 'o-', color='blue', label='训练分数')
plt.fill_between(param_range, mu_train - sd_train, mu_train + sd_train, alpha=0.1, color='blue')
plt.semilogx(param_range, mu_val, 'o-', color='orange', label='验证分数')
plt.fill_between(param_range, mu_val - sd_val, mu_val + sd_val, alpha=0.1, color='orange')
plt.xlabel('C (正则化参数)')
plt.ylabel('准确率')
plt.title('验证曲线')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
常见问题
Q1: 应该用多少折?
| 数据量 | 推荐K值 |
|---|---|
| <100 | LOO或10折 |
| 100-1000 | 10折 |
| >1000 | 5折 |
| 计算昂贵 | 3-5折 |
Q2: 什么时候用分层K折?
- 分类任务(尤其是不平衡数据)
- 确保每折代表性
Q3: 交叉验证和测试集的关系?
原始数据 → 训练集 + 测试集(hold-out)
↓
交叉验证用于训练集
↓
最终模型在测试集评估
Q4: 如何处理数据泄露?
- 预处理(如标准化)应在每折内部进行
- 使用Pipeline确保正确
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Correct approach: bundle scaling and the model into one Pipeline so the
# scaler is fit on each training fold only — no statistics leak from the
# validation fold into preprocessing.
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression(max_iter=1000)),
])
cv_scores = cross_val_score(pipe, X, y, cv=5)
print(f"正确方式: {cv_scores.mean():.4f}")
总结
| 方法 | 适用场景 |
|---|---|
| K-Fold | 通用 |
| Stratified K-Fold | 分类,不平衡 |
| LOO | 小数据集 |
| GroupKFold | 分组数据 |
| TimeSeriesSplit | 时间序列 |
| 嵌套CV | 超参数调优评估 |
参考资料
- scikit-learn 文档:Cross-validation
- Kohavi, R. (1995). “A study of cross-validation and bootstrap for accuracy estimation and model selection”
版权声明: 如无特别声明,本文版权归 sshipanoo 所有,转载请注明本文链接。
(采用 CC BY-NC-SA 4.0 许可协议进行授权)
本文标题:《 机器学习基础系列——交叉验证 》
本文链接:http://localhost:3015/ai/%E4%BA%A4%E5%8F%89%E9%AA%8C%E8%AF%81.html
本文距最后一次更新可能已有一段时间,文章中的某些内容可能已过时!