已经是最新一篇文章了!
已经是最后一篇文章了!
现代梯度提升框架详解
前言
XGBoost和LightGBM是目前最流行的梯度提升框架,在Kaggle竞赛和工业界广泛应用。本文详细介绍它们的原理、特性和使用方法。
XGBoost原理
目标函数
XGBoost的目标函数包含损失函数和正则化项:
\[Obj = \sum_{i=1}^{n} L(y_i, \hat{y}_i) + \sum_{k=1}^{K} \Omega(f_k)\]其中正则化项: \(\Omega(f) = \gamma T + \frac{1}{2}\lambda \sum_{j=1}^{T} w_j^2\)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification, make_regression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, mean_squared_error
np.random.seed(42)
# Build a synthetic binary-classification dataset: 1000 samples, 20 features
# of which 10 are informative; fixed seed for reproducibility.
X, y = make_classification(n_samples=1000, n_features=20, n_informative=10,
random_state=42)
# Hold out 20% for evaluation. X_train/X_test/y_train/y_test are reused by
# every demo block below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("数据集准备完成")
print(f"训练集: {X_train.shape}")
print(f"测试集: {X_test.shape}")
XGBoost基本使用
try:
    import xgboost as xgb

    # DMatrix is XGBoost's optimized internal data container.
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)

    # Booster configuration for binary classification with log-loss.
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'max_depth': 6,
        'learning_rate': 0.1,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'seed': 42,
    }

    # Train up to 100 rounds; stop early if the eval metric does not
    # improve for 10 consecutive rounds, logging every 20 rounds.
    watchlist = [(dtrain, 'train'), (dtest, 'eval')]
    model = xgb.train(params, dtrain,
                      num_boost_round=100,
                      evals=watchlist,
                      early_stopping_rounds=10,
                      verbose_eval=20)

    # predict() returns P(y=1); threshold at 0.5 for hard labels.
    y_pred_prob = model.predict(dtest)
    y_pred = (y_pred_prob > 0.5).astype(int)
    print(f"\nXGBoost准确率: {accuracy_score(y_test, y_pred):.4f}")
except ImportError:
    print("XGBoost未安装,请运行: pip install xgboost")
Sklearn API
try:
    from xgboost import XGBClassifier, XGBRegressor

    # Classification via the sklearn-compatible wrapper.
    # NOTE(fix): use_label_encoder and fit(early_stopping_rounds=...) were
    # removed in xgboost 2.0 -- early stopping now goes to the constructor.
    xgb_clf = XGBClassifier(
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        eval_metric='logloss',
        early_stopping_rounds=10
    )
    xgb_clf.fit(X_train, y_train,
                eval_set=[(X_test, y_test)],
                verbose=False)
    print(f"XGBClassifier准确率: {xgb_clf.score(X_test, y_test):.4f}")
    print(f"最佳迭代次数: {xgb_clf.best_iteration}")
except ImportError:
    # FIX: a failed `from xgboost import ...` raises ImportError, not
    # NameError -- the original except clause could never fire.
    print("需要先导入xgboost")
LightGBM原理
核心优化
| 技术 | 说明 |
|---|---|
| GOSS | 基于梯度的单边采样 |
| EFB | 互斥特征捆绑 |
| Histogram | 直方图加速 |
| Leaf-wise | 叶子优先生长 |
try:
    import lightgbm as lgb

    # lgb.Dataset is LightGBM's internal format; the validation set
    # references the training set so histogram bin boundaries are shared.
    train_data = lgb.Dataset(X_train, label=y_train)
    test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

    # GBDT boosting, binary objective, with feature/row subsampling.
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting_type': 'gbdt',
        'num_leaves': 31,
        'learning_rate': 0.1,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'seed': 42,
        'verbose': -1,
    }

    # Modern LightGBM takes early stopping and logging as callbacks.
    model_lgb = lgb.train(params, train_data,
                          num_boost_round=100,
                          valid_sets=[train_data, test_data],
                          valid_names=['train', 'valid'],
                          callbacks=[lgb.early_stopping(10), lgb.log_evaluation(20)])

    # predict() yields P(y=1); threshold at 0.5 for class labels.
    y_pred_prob = model_lgb.predict(X_test)
    y_pred = (y_pred_prob > 0.5).astype(int)
    print(f"\nLightGBM准确率: {accuracy_score(y_test, y_pred):.4f}")
except ImportError:
    print("LightGBM未安装,请运行: pip install lightgbm")
Sklearn API
try:
    # FIX: import early_stopping here instead of relying on the `lgb`
    # module alias from a previous block -- if that block's import failed,
    # `lgb.early_stopping` would raise an uncaught NameError.
    from lightgbm import LGBMClassifier, LGBMRegressor, early_stopping

    lgb_clf = LGBMClassifier(
        n_estimators=100,
        num_leaves=31,
        learning_rate=0.1,
        feature_fraction=0.8,
        bagging_fraction=0.8,
        bagging_freq=5,
        random_state=42,
        verbose=-1
    )
    lgb_clf.fit(X_train, y_train,
                eval_set=[(X_test, y_test)],
                callbacks=[early_stopping(10, verbose=False)])
    print(f"LGBMClassifier准确率: {lgb_clf.score(X_test, y_test):.4f}")
except ImportError:
    # FIX: a failed `from lightgbm import ...` raises ImportError, not
    # NameError -- the original except clause could never fire.
    print("需要先导入lightgbm")
CatBoost简介
try:
    from catboost import CatBoostClassifier, Pool

    # CatBoost's headline feature is native handling of categorical
    # features; this demo uses purely numeric data, so defaults suffice.
    cat_params = dict(
        iterations=100,
        depth=6,
        learning_rate=0.1,
        random_seed=42,
        verbose=False,
    )
    cat_clf = CatBoostClassifier(**cat_params)
    cat_clf.fit(X_train, y_train, eval_set=(X_test, y_test), early_stopping_rounds=10)
    print(f"CatBoost准确率: {cat_clf.score(X_test, y_test):.4f}")
except ImportError:
    print("CatBoost未安装,请运行: pip install catboost")
特征重要性
可视化特征重要性
def plot_feature_importance(model, feature_names, model_name, top_n=15):
    """Plot the top-N feature importances of a trained boosting model.

    Supports sklearn-style estimators (via ``feature_importances_``) and
    native XGBoost Boosters (via ``get_score``). Silently returns for
    models exposing neither interface.
    """
    if hasattr(model, 'feature_importances_'):
        importance = model.feature_importances_
    elif hasattr(model, 'get_score'):
        # Booster.get_score returns {'f0': gain, ...} and omits features
        # that were never used in a split, hence the .get(..., 0) default.
        importance_dict = model.get_score(importance_type='gain')
        importance = np.array([importance_dict.get(f'f{i}', 0) for i in range(len(feature_names))])
    else:
        return
    # FIX: clamp top_n so the bar count always matches the tick count even
    # when the model has fewer than top_n features.
    top_n = min(top_n, len(importance))
    # Sort descending by importance, keep the top_n indices.
    indices = np.argsort(importance)[::-1][:top_n]
    fig, ax = plt.subplots(figsize=(10, 6))
    # Reverse so the most important feature ends up at the top of the chart.
    ax.barh(range(top_n), importance[indices][::-1], color='steelblue')
    ax.set_yticks(range(top_n))
    ax.set_yticklabels([feature_names[i] for i in indices[::-1]])
    ax.set_xlabel('重要性')
    ax.set_title(f'{model_name} 特征重要性')
    plt.tight_layout()
    plt.show()

# Example usage with the classifier trained earlier.
feature_names = [f'feature_{i}' for i in range(X.shape[1])]
try:
    plot_feature_importance(xgb_clf, feature_names, 'XGBoost')
except NameError:
    # FIX: was a bare `except:` that hid every error; only the "model not
    # trained yet" case (undefined xgb_clf) should be tolerated.
    print("需要先训练模型")
参数调优
重要参数
| 参数 | XGBoost | LightGBM | 作用 |
|---|---|---|---|
| 树深度 | max_depth | max_depth | 控制过拟合 |
| 叶子数 | - | num_leaves | 模型复杂度 |
| 学习率 | learning_rate | learning_rate | 收敛速度 |
| 采样比例 | subsample | bagging_fraction | 正则化 |
| 特征采样 | colsample_bytree | feature_fraction | 正则化 |
| L1正则 | reg_alpha | reg_alpha | 稀疏性 |
| L2正则 | reg_lambda | reg_lambda | 平滑性 |
网格搜索调优
from sklearn.model_selection import GridSearchCV

def tune_xgboost():
    """Exhaustive grid search over key XGBoost hyper-parameters.

    Runs 3-fold CV on the training split (3*3*3*2*2 = 108 candidates,
    i.e. 324 fits -- expensive). Returns the refit best estimator.
    """
    param_grid = {
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1, 0.2],
        'n_estimators': [50, 100, 200],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0]
    }
    # NOTE(fix): use_label_encoder was deprecated and then removed in
    # xgboost 2.0, so it is no longer passed here.
    xgb_clf = XGBClassifier(
        random_state=42,
        eval_metric='logloss'
    )
    grid_search = GridSearchCV(
        xgb_clf, param_grid, cv=3, scoring='accuracy', n_jobs=-1, verbose=1
    )
    grid_search.fit(X_train, y_train)
    print(f"最佳参数: {grid_search.best_params_}")
    print(f"最佳分数: {grid_search.best_score_:.4f}")
    return grid_search.best_estimator_

# Demo note only -- the full grid search above takes long to run.
print("参数调优示例(简化版)")
贝叶斯优化
try:
    from skopt import BayesSearchCV
    from skopt.space import Real, Integer

    # Search space for Bayesian optimization. A log-uniform prior suits
    # the learning rate, whose effect scales multiplicatively.
    search_spaces = dict(
        max_depth=Integer(3, 10),
        learning_rate=Real(0.01, 0.3, prior='log-uniform'),
        n_estimators=Integer(50, 300),
        subsample=Real(0.6, 1.0),
        colsample_bytree=Real(0.6, 1.0),
    )
    print("贝叶斯优化配置已准备")
except ImportError:
    print("scikit-optimize未安装")
处理不平衡数据
# Build a 90/10 imbalanced binary dataset to demonstrate class weighting.
X_imb, y_imb = make_classification(n_samples=1000, n_features=20,
weights=[0.9, 0.1], random_state=42)
X_train_imb, X_test_imb, y_train_imb, y_test_imb = train_test_split(
X_imb, y_imb, test_size=0.2, random_state=42
)
# bincount shows the per-class sample counts after the split.
print(f"类别分布: {np.bincount(y_train_imb)}")
try:
    # Weight the positive class by the negative/positive ratio so the
    # booster compensates for the 90/10 imbalance.
    scale_pos_weight = np.sum(y_train_imb == 0) / np.sum(y_train_imb == 1)
    # NOTE(fix): deprecated use_label_encoder (removed in xgboost 2.0)
    # is no longer passed.
    xgb_imb = XGBClassifier(
        scale_pos_weight=scale_pos_weight,
        random_state=42,
        eval_metric='logloss'
    )
    xgb_imb.fit(X_train_imb, y_train_imb)
    from sklearn.metrics import classification_report
    y_pred_imb = xgb_imb.predict(X_test_imb)
    print("\nXGBoost (scale_pos_weight):")
    print(classification_report(y_test_imb, y_pred_imb))
except NameError:
    # FIX: was a bare `except:` hiding all errors; only an undefined
    # XGBClassifier (xgboost import block failed earlier) is expected here.
    print("需要先导入XGBoost")
速度对比
import time

def benchmark_models():
    """Compare training/prediction wall time of the available frameworks.

    Builds a 10k x 50 classification dataset, fits each importable model
    once, and returns a list of per-model timing dicts (also printed).
    """
    X_large, y_large = make_classification(n_samples=10000, n_features=50,
                                           random_state=42)
    models = {}
    # NOTE(fix): the bare `except:` clauses were narrowed to NameError --
    # the only expected failure is a classifier class never having been
    # imported because its library is missing. The deprecated
    # use_label_encoder flag (removed in xgboost 2.0) was dropped.
    try:
        models['XGBoost'] = XGBClassifier(n_estimators=100, random_state=42,
                                          eval_metric='logloss')
    except NameError:
        pass
    try:
        models['LightGBM'] = LGBMClassifier(n_estimators=100, random_state=42, verbose=-1)
    except NameError:
        pass
    results = []
    for name, model in models.items():
        # perf_counter is monotonic and made for measuring intervals;
        # time.time can jump with wall-clock adjustments.
        start = time.perf_counter()
        model.fit(X_large, y_large)
        train_time = time.perf_counter() - start
        start = time.perf_counter()
        _ = model.predict(X_large)
        pred_time = time.perf_counter() - start
        results.append({
            'Model': name,
            'Train Time': f'{train_time:.3f}s',
            'Predict Time': f'{pred_time:.3f}s'
        })
        print(f"{name}: 训练={train_time:.3f}s, 预测={pred_time:.3f}s")
    return results

benchmark_models()
常见问题
Q1: XGBoost和LightGBM如何选择?
| 场景 | 推荐 |
|---|---|
| 大数据集 | LightGBM(更快) |
| 需要精确控制 | XGBoost |
| 类别特征多 | CatBoost |
| Kaggle竞赛 | 都尝试 |
Q2: 如何防止过拟合?
- 增加正则化参数
- 减少max_depth/num_leaves
- 使用early_stopping
- 增加subsample和colsample
Q3: 特征重要性类型的区别?
- gain:特征带来的增益
- split:特征被使用的次数
- cover:特征覆盖的样本数
Q4: GPU训练如何开启?
# XGBoost GPU
# tree_method='gpu_hist', gpu_id=0
# LightGBM GPU
# device='gpu', gpu_platform_id=0
总结
| 框架 | 优点 | 缺点 |
|---|---|---|
| XGBoost | 稳定、精确、特征多 | 相对较慢 |
| LightGBM | 速度快、内存效率高 | 可能过拟合 |
| CatBoost | 类别特征处理好 | 训练较慢 |
参考资料
- Chen, T. & Guestrin, C. (2016). “XGBoost: A Scalable Tree Boosting System”
- Ke, G. et al. (2017). “LightGBM: A Highly Efficient Gradient Boosting Decision Tree”
- Prokhorenkova, L. et al. (2018). “CatBoost: Unbiased Boosting with Categorical Features”
- XGBoost/LightGBM/CatBoost官方文档
版权声明: 如无特别声明,本文版权归 sshipanoo 所有,转载请注明本文链接。
(采用 CC BY-NC-SA 4.0 许可协议进行授权)
本文标题:《 机器学习基础系列——XGBoost与LightGBM 》
本文链接:http://localhost:3015/ai/XGBoost%E4%B8%8ELightGBM.html
本文最后一次更新时间较早,文章中的某些内容可能已过时!