# Bagging与特征随机化

## 前言
随机森林(Random Forest)是最成功的集成学习算法之一,通过组合多棵决策树来提高预测性能和稳定性。本文介绍集成学习基础和随机森林原理。
## 集成学习基础

### 什么是集成学习

集成学习(ensemble learning)将多个基学习器(base learner)组合起来,通常能获得比单一学习器更好的性能。
### 为什么集成有效

根据“群体智慧”原理,多个相互独立的弱分类器组合起来可以成为强分类器。

数学上,假设有T个相互独立的分类器,每个的正确率为p(p > 0.5),采用多数投票,则整体正确率为:
\[P(\text{majority correct}) = \sum_{k=\lfloor T/2 \rfloor + 1}^{T} \binom{T}{k} p^k (1-p)^{T-k}\]

```python
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import binom

def ensemble_accuracy(T, p):
    """计算T个准确率为p的独立分类器投票的整体准确率"""
    k = np.arange(T // 2 + 1, T + 1)
    probs = binom.pmf(k, T, p)
    return np.sum(probs)

# 不同数量和单分类器准确率的组合
T_values = range(1, 51, 2)  # 奇数个分类器
p_values = [0.55, 0.6, 0.7, 0.8]

plt.figure(figsize=(10, 6))
for p in p_values:
    accuracies = [ensemble_accuracy(T, p) for T in T_values]
    plt.plot(T_values, accuracies, label=f'p={p}')
plt.xlabel('分类器数量')
plt.ylabel('集成准确率')
plt.title('集成学习:多个弱分类器组合')
plt.legend()
plt.grid(True, alpha=0.3)
plt.axhline(y=0.5, color='k', linestyle='--', alpha=0.3)
plt.show()
```
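上面的公式假设分类器彼此独立,这个假设在实践中很难完全满足。下面用一个假设性的玩具模拟粗略展示:当分类器的错误相互相关时,集成的收益会明显缩水(这也是后文随机森林要给树“去相关”的动机):

```python
# 玩具模拟(示意):比较独立错误与相关错误下的多数投票准确率
rng = np.random.RandomState(0)
n_samples, T, p = 10000, 25, 0.7

# 情形1:每个分类器独立地以概率p答对
indep_votes = rng.rand(n_samples, T) < p
acc_indep = np.mean(indep_votes.sum(axis=1) > T / 2)

# 情形2:每次判断有一半概率来自一个共享因素,制造正相关的错误
shared = rng.rand(n_samples, 1) < p          # 共享的"对/错"
own = rng.rand(n_samples, T) < p             # 各自独立的"对/错"
use_shared = rng.rand(n_samples, T) < 0.5
corr_votes = np.where(use_shared, shared, own)
acc_corr = np.mean(corr_votes.sum(axis=1) > T / 2)

print(f"独立分类器的集成准确率: {acc_indep:.4f}")   # 接近上面的理论公式
print(f"相关分类器的集成准确率: {acc_corr:.4f}")   # 明显更低
```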
## Bagging

### 原理

Bagging(Bootstrap Aggregating)的基本流程如下(本节末尾附一个手写的极简草图):

- 从训练集有放回地抽样(Bootstrap),生成T个子数据集
- 在每个子集上训练一个基学习器
- 预测时投票(分类)或平均(回归)
```python
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

# 生成数据
np.random.seed(42)
X, y = make_classification(n_samples=1000, n_features=20, n_informative=10,
                           n_redundant=5, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 单棵决策树
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
print(f"单棵决策树准确率: {dt.score(X_test, y_test):.4f}")

# Bagging
bagging = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=100,
    max_samples=0.8,
    max_features=1.0,
    bootstrap=True,
    random_state=42,
    n_jobs=-1
)
bagging.fit(X_train, y_train)
print(f"Bagging准确率: {bagging.score(X_test, y_test):.4f}")
```
### Bootstrap采样

```python
# Bootstrap采样示意
original_data = np.arange(10)
bootstrap_samples = []
for i in range(5):
    sample = np.random.choice(original_data, size=len(original_data), replace=True)
    bootstrap_samples.append(sample)
    print(f"Bootstrap样本 {i+1}: {sample}")
    print(f"  未被选中的样本: {set(original_data) - set(sample)}")
```
### OOB评估

Out-of-Bag(OOB):每个Bootstrap样本平均约有37%的数据未被选中,这部分数据可用于估计泛化误差,无需额外划分验证集。
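37%这个数字来自一个简单的极限:单个样本在一次抽样中不被选中的概率是 1 − 1/n,抽 n 次后仍未被选中的概率为 (1 − 1/n)^n,当 n 趋于无穷时收敛到 1/e ≈ 0.368。可以快速验证一下:

```python
# 验证:Bootstrap中单个样本始终未被选中的概率趋于 1/e ≈ 0.368
for n in [10, 100, 1000, 10000]:
    print(f"n={n:>5}: (1 - 1/n)^n = {(1 - 1/n) ** n:.4f}")
print(f"极限 1/e = {1 / np.e:.4f}")
```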
```python
bagging_oob = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=100,
    oob_score=True,
    random_state=42
)
bagging_oob.fit(X_train, y_train)
print(f"测试集准确率: {bagging_oob.score(X_test, y_test):.4f}")
print(f"OOB准确率: {bagging_oob.oob_score_:.4f}")
```
## 随机森林

### 与Bagging的区别

随机森林在Bagging基础上增加了特征随机化:每次节点分裂时,只在一个随机抽取的特征子集中寻找最优分裂。

\[\text{Random Forest} = \text{Bagging} + \text{特征随机选择}\]

### 实现原理
```python
from sklearn.ensemble import RandomForestClassifier

# 随机森林
rf = RandomForestClassifier(
    n_estimators=100,        # 树的数量
    max_depth=None,          # 树的最大深度
    max_features='sqrt',     # 每次分裂考虑的特征数
    min_samples_split=2,     # 节点分裂最小样本数
    min_samples_leaf=1,      # 叶节点最小样本数
    bootstrap=True,          # 是否Bootstrap采样
    oob_score=True,          # 是否计算OOB分数
    random_state=42,
    n_jobs=-1
)
rf.fit(X_train, y_train)
print(f"随机森林准确率: {rf.score(X_test, y_test):.4f}")
print(f"OOB准确率: {rf.oob_score_:.4f}")
```
### 树数量的影响

```python
# 不同树数量的影响
n_trees = [1, 5, 10, 20, 50, 100, 200, 500]
train_scores = []
test_scores = []
oob_scores = []
for n in n_trees:
    # 树很少时部分样本没有OOB预测,sklearn可能给出警告,OOB估计会不稳定
    rf_temp = RandomForestClassifier(n_estimators=n, oob_score=True, random_state=42, n_jobs=-1)
    rf_temp.fit(X_train, y_train)
    train_scores.append(rf_temp.score(X_train, y_train))
    test_scores.append(rf_temp.score(X_test, y_test))
    oob_scores.append(rf_temp.oob_score_)

plt.figure(figsize=(10, 6))
plt.plot(n_trees, train_scores, 'b-o', label='训练集')
plt.plot(n_trees, test_scores, 'r-s', label='测试集')
plt.plot(n_trees, oob_scores, 'g-^', label='OOB')
plt.xlabel('树的数量')
plt.ylabel('准确率')
plt.title('随机森林:树数量的影响')
plt.legend()
plt.grid(True, alpha=0.3)
plt.xscale('log')
plt.show()
```
### 从零实现随机森林

```python
class RandomForestClassifierScratch:
    def __init__(self, n_estimators=100, max_depth=None, max_features='sqrt',
                 min_samples_split=2, bootstrap=True, random_state=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.max_features = max_features
        self.min_samples_split = min_samples_split
        self.bootstrap = bootstrap
        self.random_state = random_state
        self.trees = []
        self.feature_indices = []

    def _bootstrap_sample(self, X, y):
        """Bootstrap采样"""
        n_samples = X.shape[0]
        indices = np.random.choice(n_samples, size=n_samples, replace=True)
        return X[indices], y[indices]

    def _get_n_features(self, n_features):
        """确定每次分裂使用的特征数"""
        if self.max_features == 'sqrt':
            return int(np.sqrt(n_features))
        elif self.max_features == 'log2':
            return int(np.log2(n_features))
        elif isinstance(self.max_features, int):
            return self.max_features
        elif isinstance(self.max_features, float):
            return int(self.max_features * n_features)
        else:
            return n_features

    def fit(self, X, y):
        np.random.seed(self.random_state)
        X, y = np.array(X), np.array(y)
        n_features = X.shape[1]
        n_select = self._get_n_features(n_features)
        self.trees = []
        self.feature_indices = []
        for _ in range(self.n_estimators):
            # Bootstrap采样
            if self.bootstrap:
                X_sample, y_sample = self._bootstrap_sample(X, y)
            else:
                X_sample, y_sample = X, y
            # 随机选择特征(每棵树使用相同的特征子集)
            # 注意:标准随机森林是在每次分裂时选择特征子集
            # 这里简化为每棵树选择一次
            feature_idx = np.random.choice(n_features, size=n_select, replace=False)
            self.feature_indices.append(feature_idx)
            # 训练决策树
            tree = DecisionTreeClassifier(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                random_state=np.random.randint(10000)
            )
            tree.fit(X_sample[:, feature_idx], y_sample)
            self.trees.append(tree)
        self.classes_ = np.unique(y)
        return self

    def predict(self, X):
        X = np.array(X)
        predictions = np.zeros((len(X), self.n_estimators))
        for i, (tree, feat_idx) in enumerate(zip(self.trees, self.feature_indices)):
            predictions[:, i] = tree.predict(X[:, feat_idx])
        # 多数投票
        final_predictions = []
        for pred in predictions:
            counts = np.bincount(pred.astype(int))
            final_predictions.append(np.argmax(counts))
        return np.array(final_predictions)

    def predict_proba(self, X):
        X = np.array(X)
        all_proba = np.zeros((len(X), len(self.classes_), self.n_estimators))
        for i, (tree, feat_idx) in enumerate(zip(self.trees, self.feature_indices)):
            proba = tree.predict_proba(X[:, feat_idx])
            # 处理树可能没有见过所有类别的情况
            for j, c in enumerate(tree.classes_):
                class_idx = np.where(self.classes_ == c)[0][0]
                all_proba[:, class_idx, i] = proba[:, j]
        return np.mean(all_proba, axis=2)

    def score(self, X, y):
        return np.mean(self.predict(X) == y)

# 测试
rf_scratch = RandomForestClassifierScratch(n_estimators=100, max_depth=10, random_state=42)
rf_scratch.fit(X_train, y_train)
print(f"自实现随机森林准确率: {rf_scratch.score(X_test, y_test):.4f}")
```
## 特征重要性

### 基于不纯度的重要性

基于不纯度的重要性(MDI):对每棵树,统计每个特征在各分裂节点上带来的不纯度下降(按节点样本数加权),再在所有树上取平均。
```python
# 特征重要性
rf_importance = RandomForestClassifier(n_estimators=100, random_state=42)
rf_importance.fit(X_train, y_train)

importances = rf_importance.feature_importances_
indices = np.argsort(importances)[::-1]

plt.figure(figsize=(12, 6))
plt.bar(range(X.shape[1]), importances[indices])
plt.xlabel('特征索引')
plt.ylabel('重要性')
plt.title('随机森林特征重要性(基于不纯度)')
plt.tight_layout()
plt.show()

print("Top 10 特征:")
for i in range(10):
    print(f"  特征 {indices[i]}: {importances[indices[i]]:.4f}")
```
### 基于排列的重要性

排列重要性:把某个特征的取值随机打乱后,观察模型性能下降的幅度;下降越多,说明模型越依赖该特征。它直接在保留数据上度量,对高基数特征的偏倚小于基于不纯度的方法。

```python
from sklearn.inspection import permutation_importance

# 排列重要性
perm_importance = permutation_importance(rf_importance, X_test, y_test,
                                         n_repeats=10, random_state=42)

plt.figure(figsize=(12, 6))
sorted_idx = perm_importance.importances_mean.argsort()[::-1]
plt.boxplot([perm_importance.importances[i] for i in sorted_idx[:10]],
            labels=[f'Feature {i}' for i in sorted_idx[:10]], vert=False)
plt.xlabel('重要性下降')
plt.title('排列特征重要性 (Top 10)')
plt.tight_layout()
plt.show()
```
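排列重要性本质上只是“打乱一列、看分数掉多少”,几行代码就能手动复现其思路(示意;打乱一次会有随机波动,所以 permutation_importance 默认重复多次取平均):

```python
# 手动复现排列重要性的思路(示意)
rng = np.random.RandomState(42)
baseline = rf_importance.score(X_test, y_test)
for feat in sorted_idx[:3]:                      # 取最重要的3个特征试试
    X_perm = X_test.copy()
    X_perm[:, feat] = rng.permutation(X_perm[:, feat])
    drop = baseline - rf_importance.score(X_perm, y_test)
    print(f"打乱特征 {feat}: 准确率下降 {drop:.4f}")
```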
## 超参数调优

```python
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# 参数网格
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 20, None],
    'max_features': ['sqrt', 'log2', 0.5],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# 随机搜索(比网格搜索更高效)
rf_search = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_distributions=param_grid,
    n_iter=50,
    cv=5,
    scoring='accuracy',
    random_state=42,
    n_jobs=-1
)
rf_search.fit(X_train, y_train)
print(f"最佳参数: {rf_search.best_params_}")
print(f"最佳交叉验证分数: {rf_search.best_score_:.4f}")
print(f"测试集分数: {rf_search.score(X_test, y_test):.4f}")
```
## 随机森林 vs 其他方法

```python
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
import time

classifiers = {
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'SVM': SVC(),
    'KNN': KNeighborsClassifier()
}

results = []
for name, clf in classifiers.items():
    start = time.time()
    clf.fit(X_train, y_train)
    train_time = time.time() - start
    train_acc = clf.score(X_train, y_train)
    test_acc = clf.score(X_test, y_test)
    results.append({
        'Model': name,
        'Train Acc': train_acc,
        'Test Acc': test_acc,
        'Train Time': train_time
    })
    print(f"{name}: Train={train_acc:.4f}, Test={test_acc:.4f}, Time={train_time:.3f}s")

# 汇总为表格(脚本环境下需要print才会显示)
print(pd.DataFrame(results))
```
## 实战:手写数字识别

```python
from sklearn.datasets import load_digits
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns

# 加载数据
digits = load_digits()
X_digits, y_digits = digits.data, digits.target
X_train_d, X_test_d, y_train_d, y_test_d = train_test_split(
    X_digits, y_digits, test_size=0.2, random_state=42)

# 训练随机森林
rf_digits = RandomForestClassifier(n_estimators=200, max_depth=20, random_state=42)
rf_digits.fit(X_train_d, y_train_d)
y_pred_d = rf_digits.predict(X_test_d)
print(f"测试准确率: {rf_digits.score(X_test_d, y_test_d):.4f}")
print("\n分类报告:")
print(classification_report(y_test_d, y_pred_d))

# 混淆矩阵
plt.figure(figsize=(10, 8))
cm = confusion_matrix(y_test_d, y_pred_d)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('预测')
plt.ylabel('实际')
plt.title('随机森林手写数字识别')
plt.show()

# 特征重要性可视化
importances_img = rf_digits.feature_importances_.reshape(8, 8)
plt.figure(figsize=(8, 8))
plt.imshow(importances_img, cmap='hot')
plt.colorbar()
plt.title('像素特征重要性')
plt.show()
```
## 随机森林回归

```python
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import mean_squared_error, r2_score

# 加载数据
housing = fetch_california_housing()
X_house, y_house = housing.data, housing.target
X_train_h, X_test_h, y_train_h, y_test_h = train_test_split(
    X_house, y_house, test_size=0.2, random_state=42)

# 随机森林回归
rf_reg = RandomForestRegressor(n_estimators=100, max_depth=15, random_state=42, n_jobs=-1)
rf_reg.fit(X_train_h, y_train_h)
y_pred_h = rf_reg.predict(X_test_h)
print(f"R²: {r2_score(y_test_h, y_pred_h):.4f}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test_h, y_pred_h)):.4f}")

# 预测vs实际
plt.figure(figsize=(10, 5))
plt.subplot(1, 2, 1)
plt.scatter(y_test_h, y_pred_h, alpha=0.3)
plt.plot([y_test_h.min(), y_test_h.max()], [y_test_h.min(), y_test_h.max()], 'r--')
plt.xlabel('实际值')
plt.ylabel('预测值')
plt.title('预测 vs 实际')

plt.subplot(1, 2, 2)
importances = rf_reg.feature_importances_
indices = np.argsort(importances)
plt.barh(range(len(importances)), importances[indices])
plt.yticks(range(len(importances)), np.array(housing.feature_names)[indices])
plt.xlabel('重要性')
plt.title('特征重要性')
plt.tight_layout()
plt.show()
```
## 常见问题

### Q1: 随机森林的优缺点?
| 优点 | 缺点 |
|---|---|
| 准确率高 | 模型较大,内存占用高 |
| 不易过拟合 | 训练时间较长 |
| 能处理高维数据 | 可解释性不如单棵树 |
| 可以评估特征重要性 | 对噪声特征敏感 |
| 容易并行化 | - |
### Q2: 树的数量设多少?

- 通常100-500棵就足够
- 更多的树不会导致过拟合,但收益递减
- 可通过OOB误差确定合适数量(见下方示意)
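一个常用技巧(示意)是配合 warm_start 逐步加树,观察OOB误差何时趋于平稳:

```python
# 用 warm_start 逐步增加树的数量,跟踪OOB误差
rf_grow = RandomForestClassifier(warm_start=True, oob_score=True,
                                 random_state=42, n_jobs=-1)
for n in [20, 50, 100, 200, 400]:
    rf_grow.set_params(n_estimators=n)
    rf_grow.fit(X_train, y_train)       # warm_start下只训练新增的树
    print(f"n_estimators={n:>3}: OOB误差 = {1 - rf_grow.oob_score_:.4f}")
```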
### Q3: max_features如何选择?

- 分类:`sqrt(n_features)`(默认)
- 回归:`n_features`(即使用全部特征)
- 较小值增加树之间的多样性,但可能降低单棵树的性能
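在当前数据集上可以直接做个小对比(示意;None 表示使用全部特征,相当于退化为只做Bagging):

```python
# 对比不同的 max_features 设置
for mf in ['sqrt', 'log2', None]:
    rf_mf = RandomForestClassifier(n_estimators=100, max_features=mf,
                                   random_state=42, n_jobs=-1)
    rf_mf.fit(X_train, y_train)
    print(f"max_features={mf}: 测试准确率 = {rf_mf.score(X_test, y_test):.4f}")
```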
### Q4: 随机森林能处理缺失值吗?

scikit-learn 的经典实现不直接支持缺失值(较新版本开始为树模型加入缺失值支持),常见做法是:

- 预处理时填充缺失值(见下方示意)
- 使用原生支持缺失值的实现(如H2O)
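用填充法的一个最小示例(示意,这里人为在训练数据中制造缺失):

```python
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer

# 人为制造10%的缺失值,再用"中位数填充 + 随机森林"的流水线
rng = np.random.RandomState(0)
X_train_missing = X_train.copy()
X_train_missing[rng.rand(*X_train_missing.shape) < 0.1] = np.nan

pipe = make_pipeline(
    SimpleImputer(strategy='median'),               # 缺失值 -> 列中位数
    RandomForestClassifier(n_estimators=100, random_state=42)
)
pipe.fit(X_train_missing, y_train)
print(f"填充后的测试准确率: {pipe.score(X_test, y_test):.4f}")
```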
## 总结
| 概念 | 说明 |
|---|---|
| Bagging | Bootstrap采样 + 投票/平均 |
| 随机森林 | Bagging + 特征随机选择 |
| 关键参数 | n_estimators, max_depth, max_features |
| OOB | 使用未采样数据估计泛化误差 |
| 特征重要性 | 基于不纯度或排列 |
## 参考资料

- Breiman, L. (2001). "Random Forests". *Machine Learning*, 45(1), 5-32.
- 李航《统计学习方法》第8章
- scikit-learn 文档:Random Forest