Bagging与特征随机化

前言

随机森林(Random Forest)是最成功的集成学习算法之一,通过组合多棵决策树来提高预测性能和稳定性。本文介绍集成学习基础和随机森林原理。


集成学习基础

什么是集成学习

将多个基学习器(base learner)组合起来,通常能获得比单一学习器更好的性能。

为什么集成有效

根据“群体智慧”(wisdom of the crowd)原理,多个相互独立的弱分类器组合起来可以成为一个强分类器。

数学上,假设有T个分类器,每个正确率为p(p>0.5),独立投票,整体正确率:

\[P(\text{majority correct}) = \sum_{k=\lfloor T/2 \rfloor + 1}^{T} \binom{T}{k} p^k (1-p)^{T-k}\]
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import binom

def ensemble_accuracy(T, p):
    """Return the accuracy of a strict-majority vote among T independent
    classifiers that are each correct with probability p.

    Ties (possible when T is even) are counted as incorrect.
    """
    needed = T // 2 + 1  # smallest number of correct votes that wins
    winning_counts = np.arange(needed, T + 1)
    return np.sum(binom.pmf(winning_counts, T, p))

# Plot voting accuracy for several ensemble sizes and base accuracies
T_values = range(1, 51, 2)  # odd ensemble sizes, so votes cannot tie
p_values = [0.55, 0.6, 0.7, 0.8]

plt.figure(figsize=(10, 6))

for p in p_values:
    curve = [ensemble_accuracy(T, p) for T in T_values]
    plt.plot(T_values, curve, label=f'p={p}')

# Reference line at chance level
plt.axhline(y=0.5, color='k', linestyle='--', alpha=0.3)
plt.xlabel('分类器数量')
plt.ylabel('集成准确率')
plt.title('集成学习:多个弱分类器组合')
plt.grid(True, alpha=0.3)
plt.legend()
plt.show()

Bagging

原理

Bootstrap Aggregating(Bagging):

  1. 从训练集有放回地抽样(Bootstrap),生成T个子数据集
  2. 在每个子集上训练一个基学习器
  3. 预测时投票(分类)或平均(回归)
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

# Build a synthetic binary classification problem
np.random.seed(42)
X, y = make_classification(n_samples=1000, n_features=20, n_informative=10,
                           n_redundant=5, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Baseline: one fully grown decision tree
single_tree = DecisionTreeClassifier(random_state=42)
single_tree.fit(X_train, y_train)
print(f"单棵决策树准确率: {single_tree.score(X_test, y_test):.4f}")

# Bagging: 100 trees, each fit on an 80% bootstrap sample of the rows
bagged_trees = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=100,
    max_samples=0.8,   # fraction of rows drawn for each tree
    max_features=1.0,  # every tree sees all features
    bootstrap=True,    # sample rows with replacement
    random_state=42,
    n_jobs=-1
)
bagged_trees.fit(X_train, y_train)
print(f"Bagging准确率: {bagged_trees.score(X_test, y_test):.4f}")

Bootstrap采样

# Illustrate bootstrap sampling: draw with replacement from 10 items,
# then show which items each draw happened to miss
original_data = np.arange(10)

bootstrap_samples = []
for i in range(5):
    drawn = np.random.choice(original_data, size=len(original_data), replace=True)
    bootstrap_samples.append(drawn)
    print(f"Bootstrap样本 {i+1}: {drawn}")
    print(f"  未被选中的样本: {set(original_data) - set(drawn)}")

OOB评估

Out-of-Bag(OOB):每个Bootstrap样本约有37%的数据未被选中,可用于估计泛化误差。

# OOB evaluation: each tree is scored on the rows its bootstrap sample missed
bagging_oob = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=100,
    oob_score=True,  # compute the out-of-bag estimate during fit
    random_state=42
)
bagging_oob.fit(X_train, y_train)

# The OOB score should track the held-out test score closely
test_acc = bagging_oob.score(X_test, y_test)
print(f"测试集准确率: {test_acc:.4f}")
print(f"OOB准确率: {bagging_oob.oob_score_:.4f}")

随机森林

与Bagging的区别

随机森林在Bagging基础上增加了特征随机化:每次分裂只考虑随机选择的特征子集。

\[\text{Random Forest} = \text{Bagging} + \text{特征随机选择}\]

实现原理

from sklearn.ensemble import RandomForestClassifier

# Random forest = bagged trees + per-split feature subsampling
rf = RandomForestClassifier(
    n_estimators=100,        # number of trees in the forest
    max_depth=None,          # grow each tree until its leaves are pure
    max_features='sqrt',     # features considered at each split
    min_samples_split=2,     # minimum samples required to split a node
    min_samples_leaf=1,      # minimum samples required at a leaf
    bootstrap=True,          # draw a bootstrap sample for each tree
    oob_score=True,          # also keep an out-of-bag estimate
    random_state=42,
    n_jobs=-1
)

rf.fit(X_train, y_train)

print(f"随机森林准确率: {rf.score(X_test, y_test):.4f}")
print(f"OOB准确率: {rf.oob_score_:.4f}")

树数量的影响

# Sweep the number of trees and record train / test / OOB accuracy
n_trees = [1, 5, 10, 20, 50, 100, 200, 500]
train_scores, test_scores, oob_scores = [], [], []

for n in n_trees:
    forest = RandomForestClassifier(n_estimators=n, oob_score=True, random_state=42, n_jobs=-1)
    forest.fit(X_train, y_train)
    train_scores.append(forest.score(X_train, y_train))
    test_scores.append(forest.score(X_test, y_test))
    oob_scores.append(forest.oob_score_)

plt.figure(figsize=(10, 6))
plt.plot(n_trees, train_scores, 'b-o', label='训练集')
plt.plot(n_trees, test_scores, 'r-s', label='测试集')
plt.plot(n_trees, oob_scores, 'g-^', label='OOB')
plt.xscale('log')  # tree counts span two orders of magnitude
plt.xlabel('树的数量')
plt.ylabel('准确率')
plt.title('随机森林:树数量的影响')
plt.grid(True, alpha=0.3)
plt.legend()
plt.show()

从零实现随机森林

class RandomForestClassifierScratch:
    """A random forest classifier implemented from scratch.

    Each tree is trained on a bootstrap sample of the rows and a random
    subset of the columns.  NOTE: the standard algorithm re-draws the
    feature subset at every split; this implementation draws it once per
    tree, which is a deliberate simplification.
    """

    def __init__(self, n_estimators=100, max_depth=None, max_features='sqrt',
                 min_samples_split=2, bootstrap=True, random_state=None):
        self.n_estimators = n_estimators            # number of trees
        self.max_depth = max_depth                  # depth limit for each tree
        self.max_features = max_features            # features per tree: 'sqrt', 'log2', int or float
        self.min_samples_split = min_samples_split  # min samples to split a node
        self.bootstrap = bootstrap                  # row sampling on/off
        self.random_state = random_state            # seed for reproducibility
        self.trees = []                             # fitted DecisionTreeClassifier objects
        self.feature_indices = []                   # column subset used by each tree

    def _bootstrap_sample(self, X, y):
        """Draw len(X) rows with replacement (a bootstrap sample)."""
        n_samples = X.shape[0]
        indices = np.random.choice(n_samples, size=n_samples, replace=True)
        return X[indices], y[indices]

    def _get_n_features(self, n_features):
        """Resolve ``max_features`` into a concrete count in [1, n_features].

        Fix: the original could return 0 (e.g. 'log2' with n_features == 1,
        or a very small float) or exceed n_features (a too-large int); the
        result is now clamped into the valid range.
        """
        if self.max_features == 'sqrt':
            n = int(np.sqrt(n_features))
        elif self.max_features == 'log2':
            n = int(np.log2(n_features))
        elif isinstance(self.max_features, int):
            n = self.max_features
        elif isinstance(self.max_features, float):
            n = int(self.max_features * n_features)
        else:
            n = n_features
        return min(max(n, 1), n_features)

    def fit(self, X, y):
        """Fit ``n_estimators`` trees on bootstrap samples of (X, y)."""
        np.random.seed(self.random_state)
        X, y = np.asarray(X), np.asarray(y)
        n_features = X.shape[1]
        n_select = self._get_n_features(n_features)

        self.classes_ = np.unique(y)  # sorted label set; predict relies on this order
        self.trees = []
        self.feature_indices = []

        for _ in range(self.n_estimators):
            # Row randomization: bootstrap sample (or the full data).
            if self.bootstrap:
                X_sample, y_sample = self._bootstrap_sample(X, y)
            else:
                X_sample, y_sample = X, y

            # Column randomization, drawn once per tree (see class docstring).
            feature_idx = np.random.choice(n_features, size=n_select, replace=False)
            self.feature_indices.append(feature_idx)

            tree = DecisionTreeClassifier(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                random_state=np.random.randint(10000)
            )
            tree.fit(X_sample[:, feature_idx], y_sample)
            self.trees.append(tree)

        return self

    def predict(self, X):
        """Predict class labels by majority vote across the trees.

        Fix: the original tallied votes with ``np.bincount(pred.astype(int))``,
        which silently assumes labels are small non-negative integers.  Votes
        are now counted per position in ``self.classes_``, so non-contiguous
        or non-integer labels work too.
        """
        X = np.asarray(X)
        votes = np.zeros((len(X), len(self.classes_)), dtype=int)

        for tree, feat_idx in zip(self.trees, self.feature_indices):
            preds = tree.predict(X[:, feat_idx])
            # classes_ is sorted (np.unique), so searchsorted maps each
            # predicted label to its column in the vote matrix.
            cols = np.searchsorted(self.classes_, preds)
            votes[np.arange(len(X)), cols] += 1

        return self.classes_[np.argmax(votes, axis=1)]

    def predict_proba(self, X):
        """Average the per-tree class-probability estimates."""
        X = np.asarray(X)
        all_proba = np.zeros((len(X), len(self.classes_), self.n_estimators))

        for i, (tree, feat_idx) in enumerate(zip(self.trees, self.feature_indices)):
            proba = tree.predict_proba(X[:, feat_idx])
            # A tree trained on a bootstrap sample may not have seen every
            # class; align its probability columns with the global order.
            for j, c in enumerate(tree.classes_):
                class_idx = np.where(self.classes_ == c)[0][0]
                all_proba[:, class_idx, i] = proba[:, j]

        return np.mean(all_proba, axis=2)

    def score(self, X, y):
        """Mean accuracy of ``predict(X)`` against ``y``."""
        return np.mean(self.predict(X) == y)

# Sanity-check the from-scratch implementation on the same split
rf_scratch = RandomForestClassifierScratch(n_estimators=100, max_depth=10, random_state=42)
rf_scratch.fit(X_train, y_train)

scratch_acc = rf_scratch.score(X_test, y_test)
print(f"自实现随机森林准确率: {scratch_acc:.4f}")

特征重要性

基于不纯度的重要性

# Impurity-based feature importances from a fitted forest
rf_importance = RandomForestClassifier(n_estimators=100, random_state=42)
rf_importance.fit(X_train, y_train)

importances = rf_importance.feature_importances_
indices = np.argsort(importances)[::-1]  # most important feature first

plt.figure(figsize=(12, 6))
plt.bar(range(X.shape[1]), importances[indices])
plt.xlabel('特征索引')
plt.ylabel('重要性')
plt.title('随机森林特征重要性(基于不纯度)')
plt.tight_layout()
plt.show()

print("Top 10 特征:")
for rank in range(10):
    feat = indices[rank]
    print(f"  特征 {feat}: {importances[feat]:.4f}")

基于排列的重要性

from sklearn.inspection import permutation_importance

# Permutation importance: accuracy drop when one feature is shuffled,
# measured on the held-out test set
perm_importance = permutation_importance(rf_importance, X_test, y_test, n_repeats=10, random_state=42)

sorted_idx = perm_importance.importances_mean.argsort()[::-1]
top10 = sorted_idx[:10]

plt.figure(figsize=(12, 6))
plt.boxplot([perm_importance.importances[i] for i in top10],
            labels=[f'Feature {i}' for i in top10], vert=False)
plt.xlabel('重要性下降')
plt.title('排列特征重要性 (Top 10)')
plt.tight_layout()
plt.show()

超参数调优

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Search space covering the forest's main knobs
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 20, None],
    'max_features': ['sqrt', 'log2', 0.5],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Randomized search samples 50 configurations — far cheaper than the
# exhaustive grid while usually finding a comparable optimum
rf_search = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_distributions=param_grid,
    n_iter=50,
    cv=5,
    scoring='accuracy',
    random_state=42,
    n_jobs=-1
)

rf_search.fit(X_train, y_train)

print(f"最佳参数: {rf_search.best_params_}")
print(f"最佳交叉验证分数: {rf_search.best_score_:.4f}")
print(f"测试集分数: {rf_search.score(X_test, y_test):.4f}")

随机森林 vs 其他方法

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import time

# Benchmark the forest against several standard classifiers
classifiers = {
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'SVM': SVC(),
    'KNN': KNeighborsClassifier()
}

results = []
for name, clf in classifiers.items():
    t0 = time.time()
    clf.fit(X_train, y_train)
    elapsed = time.time() - t0

    train_acc = clf.score(X_train, y_train)
    test_acc = clf.score(X_test, y_test)

    results.append({
        'Model': name,
        'Train Acc': train_acc,
        'Test Acc': test_acc,
        'Train Time': elapsed
    })
    print(f"{name}: Train={train_acc:.4f}, Test={test_acc:.4f}, Time={elapsed:.3f}s")

import pandas as pd
pd.DataFrame(results)

实战:手写数字识别

from sklearn.datasets import load_digits
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns

# Load the 8x8 handwritten-digit images (flattened to 64 features)
digits = load_digits()
X_digits, y_digits = digits.data, digits.target

X_train_d, X_test_d, y_train_d, y_test_d = train_test_split(
    X_digits, y_digits, test_size=0.2, random_state=42)

# Fit a forest on the digit data
rf_digits = RandomForestClassifier(n_estimators=200, max_depth=20, random_state=42)
rf_digits.fit(X_train_d, y_train_d)

y_pred_d = rf_digits.predict(X_test_d)

print(f"测试准确率: {rf_digits.score(X_test_d, y_test_d):.4f}")
print("\n分类报告:")
print(classification_report(y_test_d, y_pred_d))

# Visualize the confusion matrix as a heatmap
cm = confusion_matrix(y_test_d, y_pred_d)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('预测')
plt.ylabel('实际')
plt.title('随机森林手写数字识别')
plt.show()

# Reshape per-pixel importances back into the 8x8 image grid
importances_img = rf_digits.feature_importances_.reshape(8, 8)

plt.figure(figsize=(8, 8))
plt.imshow(importances_img, cmap='hot')
plt.colorbar()
plt.title('像素特征重要性')
plt.show()

随机森林回归

from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import mean_squared_error, r2_score

# California housing: a standard regression benchmark
housing = fetch_california_housing()
X_house, y_house = housing.data, housing.target

X_train_h, X_test_h, y_train_h, y_test_h = train_test_split(
    X_house, y_house, test_size=0.2, random_state=42)

# Forest of regression trees; the prediction is the per-tree average
rf_reg = RandomForestRegressor(n_estimators=100, max_depth=15, random_state=42, n_jobs=-1)
rf_reg.fit(X_train_h, y_train_h)

y_pred_h = rf_reg.predict(X_test_h)

print(f"R²: {r2_score(y_test_h, y_pred_h):.4f}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test_h, y_pred_h)):.4f}")

plt.figure(figsize=(10, 5))

# Left panel: predicted vs. actual, with the ideal y = x line
plt.subplot(1, 2, 1)
plt.scatter(y_test_h, y_pred_h, alpha=0.3)
plt.plot([y_test_h.min(), y_test_h.max()], [y_test_h.min(), y_test_h.max()], 'r--')
plt.xlabel('实际值')
plt.ylabel('预测值')
plt.title('预测 vs 实际')

# Right panel: feature importances, sorted ascending for a horizontal bar chart
plt.subplot(1, 2, 2)
importances = rf_reg.feature_importances_
indices = np.argsort(importances)
plt.barh(range(len(importances)), importances[indices])
plt.yticks(range(len(importances)), np.array(housing.feature_names)[indices])
plt.xlabel('重要性')
plt.title('特征重要性')

plt.tight_layout()
plt.show()

常见问题

Q1: 随机森林的优缺点?

优点 缺点
准确率高 模型较大,内存占用高
不易过拟合 训练时间较长
能处理高维数据 可解释性不如单棵树
可以评估特征重要性 对噪声特征敏感
容易并行化 -

Q2: 树的数量设多少?

  • 通常100-500足够
  • 更多树不会过拟合,但收益递减
  • 可通过OOB误差确定合适数量

Q3: max_features如何选择?

  • 分类:sqrt(n_features)(默认)
  • 回归:n_features
  • 较小值增加多样性,但可能降低单树性能

Q4: 随机森林能处理缺失值吗?

scikit-learn的实现不直接支持,但可以:

  • 预处理时填充缺失值
  • 使用支持缺失值的实现(如H2O)

总结

概念 说明
Bagging Bootstrap采样 + 投票/平均
随机森林 Bagging + 特征随机选择
关键参数 n_estimators, max_depth, max_features
OOB 使用未采样数据估计泛化误差
特征重要性 基于不纯度或排列

参考资料

  • Breiman, L. (2001). “Random Forests”. Machine Learning.
  • 《统计学习方法》李航 第8章
  • scikit-learn 文档:Random Forest

版权声明: 如无特别声明,本文版权归 sshipanoo 所有,转载请注明本文链接。

(采用 CC BY-NC-SA 4.0 许可协议进行授权)

本文标题:《 机器学习基础系列——随机森林 》

本文链接:http://localhost:3015/ai/%E9%9A%8F%E6%9C%BA%E6%A3%AE%E6%9E%97.html

本文发布至今已有一段时间,文章中的某些内容可能已过时!