现代梯度提升框架详解

前言

XGBoost和LightGBM是目前最流行的梯度提升框架,在Kaggle竞赛和工业界广泛应用。本文详细介绍它们的原理、特性和使用方法。


XGBoost原理

目标函数

XGBoost的目标函数包含损失函数和正则化项:

\[Obj = \sum_{i=1}^{n} L(y_i, \hat{y}_i) + \sum_{k=1}^{K} \Omega(f_k)\]

其中正则化项: \(\Omega(f) = \gamma T + \frac{1}{2}\lambda \sum_{j=1}^{T} w_j^2\)

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification, make_regression
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score

np.random.seed(42)

# Build a synthetic binary-classification problem (20 features, 10 of them
# informative) and hold out 20% of the rows for testing.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=10,
    random_state=42,
)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("数据集准备完成")
print(f"训练集: {X_train.shape}")
print(f"测试集: {X_test.shape}")

XGBoost基本使用

try:
    import xgboost as xgb

    # The native XGBoost interface trains from DMatrix objects.
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)

    # Booster hyperparameters for binary classification with log-loss.
    booster_params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'max_depth': 6,
        'learning_rate': 0.1,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'seed': 42,
    }

    # Train for up to 100 rounds, stopping early when the held-out
    # log-loss has not improved for 10 rounds; log every 20 rounds.
    watchlist = [(dtrain, 'train'), (dtest, 'eval')]
    model = xgb.train(
        booster_params,
        dtrain,
        num_boost_round=100,
        evals=watchlist,
        early_stopping_rounds=10,
        verbose_eval=20,
    )

    # Threshold predicted probabilities at 0.5 to obtain class labels.
    y_pred_prob = model.predict(dtest)
    y_pred = np.where(y_pred_prob > 0.5, 1, 0)

    print(f"\nXGBoost准确率: {accuracy_score(y_test, y_pred):.4f}")

except ImportError:
    print("XGBoost未安装,请运行: pip install xgboost")

Sklearn API

try:
    from xgboost import XGBClassifier, XGBRegressor

    # Scikit-learn style wrapper around the XGBoost booster.
    # NOTE: `use_label_encoder` was removed and `early_stopping_rounds`
    # moved from fit() into the constructor in modern XGBoost
    # (deprecated in 1.6, removed in 2.0), so both are configured here
    # to keep this cell working on current releases.
    xgb_clf = XGBClassifier(
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        eval_metric='logloss',
        early_stopping_rounds=10
    )

    # Early stopping is evaluated against the held-out set.
    xgb_clf.fit(X_train, y_train,
                eval_set=[(X_test, y_test)],
                verbose=False)

    print(f"XGBClassifier准确率: {xgb_clf.score(X_test, y_test):.4f}")
    print(f"最佳迭代次数: {xgb_clf.best_iteration}")

except (ImportError, NameError):
    # `from xgboost import ...` raises ImportError when the package is
    # missing; the original handler caught only NameError and crashed.
    print("需要先导入xgboost")

LightGBM原理

核心优化

技术 说明
GOSS 基于梯度的单边采样
EFB 互斥特征捆绑
Histogram 直方图加速
Leaf-wise 叶子优先生长
try:
    import lightgbm as lgb

    # LightGBM's native interface consumes Dataset objects; the validation
    # set references the training set so both share histogram bin edges.
    dset_train = lgb.Dataset(X_train, label=y_train)
    dset_valid = lgb.Dataset(X_test, label=y_test, reference=dset_train)

    # GBDT hyperparameters for binary classification.
    lgb_params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting_type': 'gbdt',
        'num_leaves': 31,
        'learning_rate': 0.1,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'seed': 42,
        'verbose': -1,
    }

    # Early stopping and periodic logging are supplied as callbacks.
    model_lgb = lgb.train(
        lgb_params,
        dset_train,
        num_boost_round=100,
        valid_sets=[dset_train, dset_valid],
        valid_names=['train', 'valid'],
        callbacks=[lgb.early_stopping(10), lgb.log_evaluation(20)],
    )

    # Threshold the predicted probabilities at 0.5 for class labels.
    y_pred_prob = model_lgb.predict(X_test)
    y_pred = (y_pred_prob > 0.5).astype(int)

    print(f"\nLightGBM准确率: {accuracy_score(y_test, y_pred):.4f}")

except ImportError:
    print("LightGBM未安装,请运行: pip install lightgbm")

Sklearn API

try:
    # Import the early_stopping callback directly so this cell no longer
    # depends on the `lgb` alias defined in a previous (possibly skipped)
    # cell.
    from lightgbm import LGBMClassifier, LGBMRegressor, early_stopping

    lgb_clf = LGBMClassifier(
        n_estimators=100,
        num_leaves=31,
        learning_rate=0.1,
        feature_fraction=0.8,
        bagging_fraction=0.8,
        bagging_freq=5,
        random_state=42,
        verbose=-1
    )

    # Early stopping is passed as a callback (the fit() keyword argument
    # was removed from the sklearn wrapper in LightGBM 4.x).
    lgb_clf.fit(X_train, y_train,
                eval_set=[(X_test, y_test)],
                callbacks=[early_stopping(10, verbose=False)])

    print(f"LGBMClassifier准确率: {lgb_clf.score(X_test, y_test):.4f}")

except (ImportError, NameError):
    # `from lightgbm import ...` raises ImportError when the package is
    # missing; the original handler caught only NameError and crashed.
    print("需要先导入lightgbm")

CatBoost简介

try:
    from catboost import CatBoostClassifier, Pool

    # CatBoost: gradient boosting with built-in categorical-feature
    # handling (not exercised here since all features are numeric).
    cat_clf = CatBoostClassifier(
        iterations=100,
        depth=6,
        learning_rate=0.1,
        random_seed=42,
        verbose=False,
    )

    # Stop early when the eval-set metric stalls for 10 iterations.
    cat_clf.fit(
        X_train,
        y_train,
        eval_set=(X_test, y_test),
        early_stopping_rounds=10,
    )

    print(f"CatBoost准确率: {cat_clf.score(X_test, y_test):.4f}")

except ImportError:
    print("CatBoost未安装,请运行: pip install catboost")

特征重要性

可视化特征重要性

def plot_feature_importance(model, feature_names, model_name, top_n=15):
    """Plot the top-N feature importances of a fitted boosting model.

    Supports both sklearn-style wrappers (``feature_importances_``) and the
    native XGBoost Booster (``get_score``).  Silently returns when the
    model exposes neither interface.

    Args:
        model: fitted model object.
        feature_names: sequence of names, one per feature column.
        model_name: label used in the plot title.
        top_n: number of top features to display; clamped to the number of
            available features (the original code raised a shape-mismatch
            error whenever the model had fewer than ``top_n`` features).
    """

    if hasattr(model, 'feature_importances_'):
        importance = model.feature_importances_
    elif hasattr(model, 'get_score'):
        # Native boosters name features f0, f1, ...; features never used
        # by a split are absent from the dict, so default them to 0.
        importance_dict = model.get_score(importance_type='gain')
        importance = np.array([importance_dict.get(f'f{i}', 0) for i in range(len(feature_names))])
    else:
        return

    # Clamp so the bar count always matches the tick count.
    top_n = min(top_n, len(importance))

    # Indices of the top-N features, most important first.
    indices = np.argsort(importance)[::-1][:top_n]

    fig, ax = plt.subplots(figsize=(10, 6))
    # Reverse so the most important feature lands at the top of the chart.
    ax.barh(range(top_n), importance[indices][::-1], color='steelblue')
    ax.set_yticks(range(top_n))
    ax.set_yticklabels([feature_names[i] for i in indices[::-1]])
    ax.set_xlabel('重要性')
    ax.set_title(f'{model_name} 特征重要性')

    plt.tight_layout()
    plt.show()

# Example: visualize importances of the sklearn-API XGBoost model, if it
# was trained above.
feature_names = [f'feature_{i}' for i in range(X.shape[1])]
try:
    plot_feature_importance(xgb_clf, feature_names, 'XGBoost')
except NameError:
    # xgb_clf only exists when the XGBoost cell ran successfully.  The
    # original bare `except:` also hid genuine plotting errors.
    print("需要先训练模型")

参数调优

重要参数

参数 XGBoost LightGBM 作用
树深度 max_depth max_depth 控制过拟合
叶子数 - num_leaves 模型复杂度
学习率 learning_rate learning_rate 收敛速度
采样比例 subsample bagging_fraction 正则化
特征采样 colsample_bytree feature_fraction 正则化
L1正则 reg_alpha reg_alpha 稀疏性
L2正则 reg_lambda reg_lambda 平滑性

网格搜索调优

from sklearn.model_selection import GridSearchCV

def tune_xgboost():
    """Grid-search the main XGBoost hyperparameters with 3-fold CV.

    Uses the module-level X_train/y_train split.

    Returns:
        The best estimator, refit on the full training data.
    """

    # 3 * 3 * 3 * 2 * 2 = 108 candidate configurations.
    param_grid = {
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1, 0.2],
        'n_estimators': [50, 100, 200],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0]
    }

    # `use_label_encoder` was removed in XGBoost 2.x, so it is no longer
    # passed here.
    xgb_clf = XGBClassifier(
        random_state=42,
        eval_metric='logloss'
    )

    grid_search = GridSearchCV(
        xgb_clf, param_grid, cv=3, scoring='accuracy', n_jobs=-1, verbose=1
    )

    grid_search.fit(X_train, y_train)

    print(f"最佳参数: {grid_search.best_params_}")
    print(f"最佳分数: {grid_search.best_score_:.4f}")

    return grid_search.best_estimator_

# Demo only: the full grid search above is too slow to run in-line.
print("参数调优示例(简化版)")

贝叶斯优化

try:
    from skopt import BayesSearchCV
    from skopt.space import Real, Integer

    # Search space for Bayesian optimization of the XGBoost
    # hyperparameters; the learning rate is sampled on a log scale.
    search_spaces = {
        'max_depth': Integer(3, 10),
        'learning_rate': Real(0.01, 0.3, prior='log-uniform'),
        'n_estimators': Integer(50, 300),
        'subsample': Real(0.6, 1.0),
        'colsample_bytree': Real(0.6, 1.0),
    }

    print("贝叶斯优化配置已准备")

except ImportError:
    print("scikit-optimize未安装")

处理不平衡数据

# Create an imbalanced (roughly 90/10) binary problem to demonstrate
# class weighting.
X_imb, y_imb = make_classification(n_samples=1000, n_features=20,
                                    weights=[0.9, 0.1], random_state=42)
X_train_imb, X_test_imb, y_train_imb, y_test_imb = train_test_split(
    X_imb, y_imb, test_size=0.2, random_state=42
)

print(f"类别分布: {np.bincount(y_train_imb)}")

try:
    # XGBoost counters class imbalance via scale_pos_weight: the ratio of
    # negative to positive training samples.
    scale_pos_weight = np.sum(y_train_imb == 0) / np.sum(y_train_imb == 1)

    # `use_label_encoder` was removed in XGBoost 2.x and is dropped here.
    xgb_imb = XGBClassifier(
        scale_pos_weight=scale_pos_weight,
        random_state=42,
        eval_metric='logloss'
    )
    xgb_imb.fit(X_train_imb, y_train_imb)

    from sklearn.metrics import classification_report
    y_pred_imb = xgb_imb.predict(X_test_imb)
    print("\nXGBoost (scale_pos_weight):")
    print(classification_report(y_test_imb, y_pred_imb))

except NameError:
    # XGBClassifier only exists when the earlier import succeeded; the
    # original bare `except:` also swallowed every other error.
    print("需要先导入XGBoost")

速度对比

import time

def benchmark_models():
    """Compare training/prediction wall-clock time of XGBoost vs LightGBM.

    Returns:
        A list of dicts with the model name and formatted timing strings.
    """

    # Use a larger dataset so the timing differences are visible.
    X_large, y_large = make_classification(n_samples=10000, n_features=50,
                                           random_state=42)

    models = {}

    # Only benchmark libraries whose classes were imported earlier.
    # Catching NameError (instead of a bare `except:`) keeps genuine
    # construction errors visible.  `use_label_encoder` (removed in
    # XGBoost 2.x) is no longer passed.
    try:
        models['XGBoost'] = XGBClassifier(n_estimators=100, random_state=42,
                                          eval_metric='logloss')
    except NameError:
        pass

    try:
        models['LightGBM'] = LGBMClassifier(n_estimators=100, random_state=42, verbose=-1)
    except NameError:
        pass

    results = []
    for name, model in models.items():
        # perf_counter is the appropriate monotonic clock for benchmarks.
        start = time.perf_counter()
        model.fit(X_large, y_large)
        train_time = time.perf_counter() - start

        start = time.perf_counter()
        _ = model.predict(X_large)
        pred_time = time.perf_counter() - start

        results.append({
            'Model': name,
            'Train Time': f'{train_time:.3f}s',
            'Predict Time': f'{pred_time:.3f}s'
        })
        print(f"{name}: 训练={train_time:.3f}s, 预测={pred_time:.3f}s")

    return results

benchmark_models()

常见问题

Q1: XGBoost和LightGBM如何选择?

场景 推荐
大数据集 LightGBM(更快)
需要精确控制 XGBoost
类别特征多 CatBoost
Kaggle竞赛 都尝试

Q2: 如何防止过拟合?

  • 增加正则化参数
  • 减少max_depth/num_leaves
  • 使用early_stopping
  • 增加subsample和colsample

Q3: 特征重要性类型的区别?

  • gain:特征带来的增益
  • split:特征被使用的次数
  • cover:特征覆盖的样本数

Q4: GPU训练如何开启?

# XGBoost GPU
# tree_method='gpu_hist', gpu_id=0

# LightGBM GPU
# device='gpu', gpu_platform_id=0

总结

框架 优点 缺点
XGBoost 稳定、精确、特征多 相对较慢
LightGBM 速度快、内存效率高 可能过拟合
CatBoost 类别特征处理好 训练较慢

参考资料

  • Chen, T. & Guestrin, C. (2016). “XGBoost: A Scalable Tree Boosting System”
  • Ke, G. et al. (2017). “LightGBM: A Highly Efficient Gradient Boosting Decision Tree”
  • Prokhorenkova, L. et al. (2018). “CatBoost: Unbiased Boosting with Categorical Features”
  • XGBoost/LightGBM/CatBoost官方文档

版权声明: 如无特别声明,本文版权归 sshipanoo 所有,转载请注明本文链接。

(采用 CC BY-NC-SA 4.0 许可协议进行授权)

本文标题:《 机器学习基础系列——XGBoost与LightGBM 》

本文链接:http://localhost:3015/ai/XGBoost%E4%B8%8ELightGBM.html

本文最后一次更新距今已有一段时间,文章中的某些内容可能已过时!