SGD、Momentum、Adam及学习率调度
前言
优化算法决定了神经网络如何更新参数以最小化损失函数。从基础的SGD到自适应方法Adam,不同算法各有特点。
梯度下降回顾
批量梯度下降(BGD)
\[\theta = \theta - \eta \nabla_\theta J(\theta)\]

import numpy as np
import matplotlib.pyplot as plt

# Fix the RNG seed so every run reproduces the same data and trajectories.
np.random.seed(42)
# Synthetic linear-regression data: y = X @ w_true + Gaussian noise.
n_samples = 1000
X = np.random.randn(n_samples, 2)
w_true = np.array([2.0, -3.0])  # ground-truth weights the optimisers should recover
y = X @ w_true + np.random.randn(n_samples) * 0.5
# Mean-squared-error loss of the linear model y ≈ X @ w.
def compute_loss(X, y, w):
    """Return the MSE between the predictions ``X @ w`` and the targets ``y``."""
    residual = X @ w - y
    return np.mean(residual ** 2)
def compute_gradient(X, y, w):
    """Gradient of the MSE loss with respect to ``w``: (2/n) * X^T (X w - y)."""
    residual = X @ w - y
    return 2 * X.T @ residual / len(y)
# Batch gradient descent: every update uses the whole data set.
def batch_gd(X, y, learning_rate=0.1, n_iterations=100):
    """Minimise the MSE with full-batch gradient descent.

    Returns the final weights, the per-iteration loss values, and the
    full weight trajectory (including the all-zero starting point).
    """
    w = np.zeros(X.shape[1])
    losses = []
    weights_history = [w.copy()]
    for _ in range(n_iterations):
        grad = compute_gradient(X, y, w)
        w = w - learning_rate * grad
        losses.append(compute_loss(X, y, w))
        weights_history.append(w.copy())
    return w, losses, weights_history
# Run batch GD and compare the recovered weights with the ground truth.
w_bgd, losses_bgd, _ = batch_gd(X, y)
print(f"BGD结果: w = {w_bgd}")
print(f"真实值: w = {w_true}")
随机梯度下降(SGD)
每次只用一个样本更新:
\[\theta = \theta - \eta \nabla_\theta J(\theta; x^{(i)}, y^{(i)})\]def sgd(X, y, learning_rate=0.01, n_epochs=10):
w = np.zeros(X.shape[1])
losses = []
for epoch in range(n_epochs):
# 打乱数据
indices = np.random.permutation(len(y))
for i in indices:
xi = X[i:i+1]
yi = y[i:i+1]
grad = compute_gradient(xi, yi, w)
w = w - learning_rate * grad
losses.append(compute_loss(X, y, w))
return w, losses
# Twenty epochs of plain SGD on the same data.
w_sgd, losses_sgd = sgd(X, y, learning_rate=0.01, n_epochs=20)
print(f"SGD结果: w = {w_sgd}")
小批量梯度下降(Mini-batch GD)
def mini_batch_gd(X, y, batch_size=32, learning_rate=0.1, n_epochs=20):
    """Mini-batch gradient descent.

    Each epoch shuffles the data and walks over it in batches of
    ``batch_size``. A trailing partial batch is dropped, mirroring the
    common ``drop_last`` behaviour of deep-learning data loaders.
    The full-data loss is recorded once per epoch.
    """
    w = np.zeros(X.shape[1])
    losses = []
    n_samples = len(y)
    n_batches = n_samples // batch_size
    for _ in range(n_epochs):
        order = np.random.permutation(n_samples)
        for b in range(n_batches):
            sel = order[b * batch_size:(b + 1) * batch_size]
            grad = compute_gradient(X[sel], y[sel], w)
            w = w - learning_rate * grad
        losses.append(compute_loss(X, y, w))
    return w, losses
# Mini-batch GD with the default batch size of 32.
w_mini, losses_mini = mini_batch_gd(X, y, batch_size=32)
print(f"Mini-batch GD结果: w = {w_mini}")
比较
# Convergence comparison. NOTE(review): the BGD losses are recorded per
# *iteration* while SGD/mini-batch losses are per *epoch*, so the shared
# 'Epoch' x-axis is only approximate for the BGD curve.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(losses_bgd[:20], 'b-', label='Batch GD', linewidth=2)
ax.plot(losses_sgd, 'g-', label='SGD', linewidth=2)
ax.plot(losses_mini, 'r-', label='Mini-batch GD', linewidth=2)
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('不同梯度下降方法比较')
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()
| 方法 | 优点 | 缺点 |
|---|---|---|
| BGD | 稳定收敛 | 计算慢 |
| SGD | 快速、可能跳出局部最优 | 噪声大 |
| Mini-batch | 平衡效率和稳定性 | 需要选择batch size |
Momentum(动量)
原理
累积历史梯度方向,加速收敛:
\(v_t = \gamma v_{t-1} + \eta \nabla_\theta J(\theta)\) \(\theta = \theta - v_t\)
def sgd_momentum(X, y, batch_size=32, learning_rate=0.1, momentum=0.9, n_epochs=20):
    """Mini-batch SGD with classical (heavy-ball) momentum.

    Update rule: v <- momentum * v + lr * grad;  w <- w - v.
    Trailing partial batches are dropped each epoch; the full-data loss
    is recorded once per epoch.
    """
    w = np.zeros(X.shape[1])
    v = np.zeros(X.shape[1])  # velocity: exponentially decayed gradient sum
    losses = []
    n_samples = len(y)
    n_batches = n_samples // batch_size
    for _ in range(n_epochs):
        order = np.random.permutation(n_samples)
        for b in range(n_batches):
            sel = order[b * batch_size:(b + 1) * batch_size]
            grad = compute_gradient(X[sel], y[sel], w)
            v = momentum * v + learning_rate * grad
            w = w - v
        losses.append(compute_loss(X, y, w))
    return w, losses
# Momentum with the default gamma = 0.9.
w_momentum, losses_momentum = sgd_momentum(X, y)
print(f"SGD + Momentum结果: w = {w_momentum}")
可视化动量效果
# An elongated quadratic bowl (10:1 curvature ratio) that makes the
# zig-zagging of plain gradient descent easy to see.
def loss_surface(w):
    """Quadratic demo loss: 0.5 * (10 * w[0]^2 + w[1]^2)."""
    return 0.5 * (10 * w[0]**2 + w[1]**2)
def grad_surface(w):
    """Analytic gradient of ``loss_surface`` at ``w``: (10*w0, w1)."""
    g0, g1 = 10 * w[0], w[1]
    return np.array([g0, g1])
# Trace optimiser iterates on the demo surface for plotting.
def optimize_trajectory(method, n_steps=50):
    """Return the (n_steps + 1, 2) array of points visited by 'sgd' or
    'momentum' (gamma = 0.9) on the demo quadratic, from (1, 1), lr = 0.1.

    An unrecognised ``method`` performs no update and the start point is
    simply repeated.
    """
    lr = 0.1
    point = np.array([1.0, 1.0])
    velocity = np.zeros(2)
    path = [point.copy()]
    for _ in range(n_steps):
        g = grad_surface(point)
        if method == 'sgd':
            point = point - lr * g
        elif method == 'momentum':
            velocity = 0.9 * velocity + lr * g
            point = point - velocity
        path.append(point.copy())
    return np.array(path)
traj_sgd = optimize_trajectory('sgd')
traj_momentum = optimize_trajectory('momentum')
# Visualise both trajectories over the loss contours.
fig, ax = plt.subplots(figsize=(10, 8))
# Contour lines of the quadratic demo surface.
x_range = np.linspace(-1.5, 1.5, 100)
y_range = np.linspace(-1.5, 1.5, 100)
X_mesh, Y_mesh = np.meshgrid(x_range, y_range)
Z = 0.5 * (10 * X_mesh**2 + Y_mesh**2)
ax.contour(X_mesh, Y_mesh, Z, levels=20, cmap='viridis', alpha=0.6)
ax.plot(traj_sgd[:, 0], traj_sgd[:, 1], 'b-o', label='SGD', markersize=3)
ax.plot(traj_momentum[:, 0], traj_momentum[:, 1], 'r-s', label='Momentum', markersize=3)
ax.scatter([0], [0], c='green', s=200, marker='*', label='最优点')
ax.set_xlabel('w1')
ax.set_ylabel('w2')
ax.set_title('SGD vs Momentum 优化轨迹')
ax.legend()
ax.set_xlim(-1.5, 1.5)
ax.set_ylim(-1.5, 1.5)
plt.show()
Nesterov Accelerated Gradient (NAG)
原理
先“展望”，再计算梯度：
\(v_t = \gamma v_{t-1} + \eta \nabla_\theta J(\theta - \gamma v_{t-1})\) \(\theta = \theta - v_t\)
def nag(X, y, batch_size=32, learning_rate=0.1, momentum=0.9, n_epochs=20):
    """Nesterov accelerated gradient on mini-batches.

    The gradient is evaluated at the look-ahead point ``w - momentum * v``
    before the momentum step is applied. Trailing partial batches are
    dropped; the full-data loss is recorded once per epoch.
    """
    w = np.zeros(X.shape[1])
    v = np.zeros(X.shape[1])
    losses = []
    n_samples = len(y)
    n_batches = n_samples // batch_size
    for _ in range(n_epochs):
        order = np.random.permutation(n_samples)
        for b in range(n_batches):
            sel = order[b * batch_size:(b + 1) * batch_size]
            # Peek ahead along the current velocity, then take the gradient there.
            lookahead = w - momentum * v
            grad = compute_gradient(X[sel], y[sel], lookahead)
            v = momentum * v + learning_rate * grad
            w = w - v
        losses.append(compute_loss(X, y, w))
    return w, losses
# NAG with the default look-ahead momentum of 0.9.
w_nag, losses_nag = nag(X, y)
print(f"NAG结果: w = {w_nag}")
AdaGrad
原理
自适应学习率,对频繁更新的参数降低学习率:
\(G_t = G_{t-1} + g_t^2\) \(\theta = \theta - \frac{\eta}{\sqrt{G_t + \epsilon}} g_t\)
def adagrad(X, y, batch_size=32, learning_rate=0.5, n_epochs=20, epsilon=1e-8):
    """AdaGrad: per-parameter step sizes scaled by 1/sqrt of the
    accumulated squared gradients.

    Trailing partial batches are dropped; the full-data loss is recorded
    once per epoch.
    """
    w = np.zeros(X.shape[1])
    G = np.zeros(X.shape[1])  # running sum of squared gradients, per parameter
    losses = []
    n_samples = len(y)
    n_batches = n_samples // batch_size
    for _ in range(n_epochs):
        order = np.random.permutation(n_samples)
        for b in range(n_batches):
            sel = order[b * batch_size:(b + 1) * batch_size]
            grad = compute_gradient(X[sel], y[sel], w)
            G += grad ** 2
            # epsilon sits outside the sqrt here (a common variant of the
            # textbook sqrt(G + eps)); it only guards against division by zero.
            w = w - learning_rate * grad / (np.sqrt(G) + epsilon)
        losses.append(compute_loss(X, y, w))
    return w, losses
# AdaGrad tolerates the larger base learning rate (0.5) thanks to scaling.
w_adagrad, losses_adagrad = adagrad(X, y)
print(f"AdaGrad结果: w = {w_adagrad}")
问题
学习率单调下降,后期可能过小。
RMSprop
原理
使用指数加权移动平均:
\(E[g^2]_t = \gamma E[g^2]_{t-1} + (1-\gamma) g_t^2\) \(\theta = \theta - \frac{\eta}{\sqrt{E[g^2]_t + \epsilon}} g_t\)
def rmsprop(X, y, batch_size=32, learning_rate=0.01, decay_rate=0.9, n_epochs=20, epsilon=1e-8):
    """RMSprop: like AdaGrad, but the squared-gradient statistic is an
    exponential moving average, so the effective step size does not decay
    monotonically.

    Trailing partial batches are dropped; the full-data loss is recorded
    once per epoch.
    """
    w = np.zeros(X.shape[1])
    E_g2 = np.zeros(X.shape[1])  # EMA of squared gradients
    losses = []
    n_samples = len(y)
    n_batches = n_samples // batch_size
    for _ in range(n_epochs):
        order = np.random.permutation(n_samples)
        for b in range(n_batches):
            sel = order[b * batch_size:(b + 1) * batch_size]
            grad = compute_gradient(X[sel], y[sel], w)
            E_g2 = decay_rate * E_g2 + (1 - decay_rate) * grad ** 2
            w = w - learning_rate * grad / (np.sqrt(E_g2) + epsilon)
        losses.append(compute_loss(X, y, w))
    return w, losses
# RMSprop with the default decay rate of 0.9.
w_rmsprop, losses_rmsprop = rmsprop(X, y)
print(f"RMSprop结果: w = {w_rmsprop}")
Adam
原理
结合Momentum和RMSprop:
\(m_t = \beta_1 m_{t-1} + (1-\beta_1) g_t\) \(v_t = \beta_2 v_{t-1} + (1-\beta_2) g_t^2\) \(\hat{m}_t = \frac{m_t}{1-\beta_1^t}, \quad \hat{v}_t = \frac{v_t}{1-\beta_2^t}\) \(\theta = \theta - \frac{\eta}{\sqrt{\hat{v}_t} + \epsilon} \hat{m}_t\)
def adam(X, y, batch_size=32, learning_rate=0.001, beta1=0.9, beta2=0.999,
         n_epochs=20, epsilon=1e-8):
    """Adam (Kingma & Ba, 2014): momentum on the first moment combined with
    RMSprop-style scaling by the second moment, both bias-corrected.

    Trailing partial batches are dropped; the full-data loss is recorded
    once per epoch.
    """
    w = np.zeros(X.shape[1])
    m = np.zeros(X.shape[1])  # biased first-moment (mean) estimate
    v = np.zeros(X.shape[1])  # biased second-moment (uncentered variance) estimate
    losses = []
    n_samples = len(y)
    n_batches = n_samples // batch_size
    t = 0  # global step counter, drives the bias correction
    for _ in range(n_epochs):
        order = np.random.permutation(n_samples)
        for b in range(n_batches):
            t += 1
            sel = order[b * batch_size:(b + 1) * batch_size]
            grad = compute_gradient(X[sel], y[sel], w)
            # Exponentially decayed moment estimates.
            m = beta1 * m + (1 - beta1) * grad
            v = beta2 * v + (1 - beta2) * grad ** 2
            # Correct the zero-initialisation bias of the fresh estimates.
            m_hat = m / (1 - beta1 ** t)
            v_hat = v / (1 - beta2 ** t)
            # Parameter update.
            w = w - learning_rate * m_hat / (np.sqrt(v_hat) + epsilon)
        losses.append(compute_loss(X, y, w))
    return w, losses
# Adam with the canonical defaults (lr=1e-3, beta1=0.9, beta2=0.999).
w_adam, losses_adam = adam(X, y)
print(f"Adam结果: w = {w_adam}")
Adam变体
def adamw(X, y, batch_size=32, learning_rate=0.001, beta1=0.9, beta2=0.999,
          weight_decay=0.01, n_epochs=20, epsilon=1e-8):
    """AdamW (Loshchilov & Hutter, 2017): Adam with *decoupled* weight decay.

    The decay term ``weight_decay * w`` is added to the update directly,
    rather than to the gradient, so it is not rescaled by the adaptive
    second-moment statistics. Trailing partial batches are dropped; the
    full-data loss is recorded once per epoch.
    """
    w = np.zeros(X.shape[1])
    m = np.zeros(X.shape[1])  # biased first-moment estimate
    v = np.zeros(X.shape[1])  # biased second-moment estimate
    losses = []
    n_samples = len(y)
    n_batches = n_samples // batch_size
    t = 0  # global step counter for bias correction
    for _ in range(n_epochs):
        order = np.random.permutation(n_samples)
        for b in range(n_batches):
            t += 1
            sel = order[b * batch_size:(b + 1) * batch_size]
            grad = compute_gradient(X[sel], y[sel], w)
            m = beta1 * m + (1 - beta1) * grad
            v = beta2 * v + (1 - beta2) * grad ** 2
            m_hat = m / (1 - beta1 ** t)
            v_hat = v / (1 - beta2 ** t)
            # Decoupled weight decay: applied to w, not folded into grad.
            w = w - learning_rate * (m_hat / (np.sqrt(v_hat) + epsilon) + weight_decay * w)
        losses.append(compute_loss(X, y, w))
    return w, losses
优化器比较
# Compare all optimisers on a log-scale loss axis. Each curve is the
# per-epoch full-data loss; base learning rates differ per optimiser,
# so the comparison is qualitative rather than like-for-like.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(losses_mini, label='SGD', linewidth=2)
ax.plot(losses_momentum, label='Momentum', linewidth=2)
ax.plot(losses_nag, label='NAG', linewidth=2)
ax.plot(losses_adagrad, label='AdaGrad', linewidth=2)
ax.plot(losses_rmsprop, label='RMSprop', linewidth=2)
ax.plot(losses_adam, label='Adam', linewidth=2)
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('优化器比较')
ax.legend()
ax.grid(True, alpha=0.3)
ax.set_yscale('log')
plt.show()
| 优化器 | 特点 | 适用场景 |
|---|---|---|
| SGD | 简单、可能找到更平坦的极小值 | 需要精细调参 |
| Momentum | 加速收敛 | 通用 |
| AdaGrad | 自适应学习率 | 稀疏数据 |
| RMSprop | 解决AdaGrad学习率衰减问题 | RNN |
| Adam | 自适应+动量 | 默认首选 |
学习率调度
常用策略
# Learning-rate schedules: each maps an epoch index to a learning rate.
def step_decay(epoch, initial_lr=0.1, drop=0.5, epochs_drop=10):
    """Multiply the learning rate by ``drop`` once every ``epochs_drop`` epochs."""
    n_drops = epoch // epochs_drop
    return initial_lr * (drop ** n_drops)
def exponential_decay(epoch, initial_lr=0.1, decay_rate=0.96):
    """Smooth exponential decay: lr = initial_lr * decay_rate ** epoch."""
    return initial_lr * (decay_rate ** epoch)
def cosine_annealing(epoch, initial_lr=0.1, T_max=100):
    """Cosine annealing: initial_lr at epoch 0, decaying to 0 at ``T_max``."""
    cosine = np.cos(np.pi * epoch / T_max)
    return initial_lr * (1 + cosine) / 2
def warmup_cosine(epoch, initial_lr=0.1, warmup_epochs=10, total_epochs=100):
    """Linear warmup for ``warmup_epochs``, then cosine decay to 0 at ``total_epochs``.

    NOTE(review): at epoch 0 the returned rate is exactly 0, so the very
    first epoch performs no update under this schedule.
    """
    if epoch < warmup_epochs:
        # Linear ramp from 0 up to initial_lr over the warmup phase.
        return initial_lr * epoch / warmup_epochs
    progress = np.pi * (epoch - warmup_epochs) / (total_epochs - warmup_epochs)
    return initial_lr * (1 + np.cos(progress)) / 2
# Visualise the four schedules side by side over 100 epochs.
epochs = np.arange(100)
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
ax = axes[0, 0]
ax.plot(epochs, [step_decay(e) for e in epochs])
ax.set_title('Step Decay')
ax.set_xlabel('Epoch')
ax.set_ylabel('Learning Rate')
ax.grid(True, alpha=0.3)
ax = axes[0, 1]
ax.plot(epochs, [exponential_decay(e) for e in epochs])
ax.set_title('Exponential Decay')
ax.set_xlabel('Epoch')
ax.set_ylabel('Learning Rate')
ax.grid(True, alpha=0.3)
ax = axes[1, 0]
ax.plot(epochs, [cosine_annealing(e, T_max=100) for e in epochs])
ax.set_title('Cosine Annealing')
ax.set_xlabel('Epoch')
ax.set_ylabel('Learning Rate')
ax.grid(True, alpha=0.3)
ax = axes[1, 1]
ax.plot(epochs, [warmup_cosine(e, total_epochs=100) for e in epochs])
ax.set_title('Warmup + Cosine')
ax.set_xlabel('Epoch')
ax.set_ylabel('Learning Rate')
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
PyTorch实现
# Optional PyTorch demo: the optimisers and schedulers built into torch.optim.
try:
    import torch
    import torch.optim as optim

    # A single tensor standing in for the model parameters.
    w = torch.randn(2, requires_grad=True)
    # The hand-rolled optimisers above, as provided by PyTorch.
    optimizers = {
        'SGD': optim.SGD([w], lr=0.01),
        'SGD+Momentum': optim.SGD([w], lr=0.01, momentum=0.9),
        'Adam': optim.Adam([w], lr=0.001),
        'AdamW': optim.AdamW([w], lr=0.001, weight_decay=0.01),
        'RMSprop': optim.RMSprop([w], lr=0.01)
    }
    # A scheduler wraps an optimiser and adjusts its learning rate per step.
    scheduler = optim.lr_scheduler.CosineAnnealingLR(
        optim.Adam([w], lr=0.001),
        T_max=100
    )
    print("PyTorch优化器:")
    for name in optimizers:
        print(f" {name}")
    print("\n学习率调度器:")
    print(" CosineAnnealingLR")
    print(" StepLR")
    print(" ExponentialLR")
    print(" ReduceLROnPlateau")
except ImportError:
    print("PyTorch未安装")
常见问题
Q1: 如何选择优化器?
- 默认首选:Adam
- 追求泛化:SGD + Momentum
- 稀疏数据:AdaGrad
Q2: 如何选择学习率?
- 从大到小尝试:0.1, 0.01, 0.001, 0.0001
- 使用学习率范围测试
- 使用学习率调度
Q3: Batch size如何影响训练?
| Batch Size | 优点 | 缺点 |
|---|---|---|
| 小 | 泛化好 | 训练慢、不稳定 |
| 大 | 训练快、稳定 | 可能泛化差 |
Q4: Adam的默认参数是什么?
- $\beta_1 = 0.9$
- $\beta_2 = 0.999$
- $\epsilon = 10^{-8}$
总结
| 优化器 | 更新规则 | 关键超参数 |
|---|---|---|
| SGD | $w - \eta g$ | $\eta$ |
| Momentum | $w - (\gamma v + \eta g)$ | $\eta, \gamma$ |
| Adam | $w - \eta \hat{m}/\sqrt{\hat{v}}$ | $\eta, \beta_1, \beta_2$ |
参考资料
- Kingma, D. & Ba, J. (2014). “Adam: A Method for Stochastic Optimization”
- Loshchilov, I. & Hutter, F. (2017). “Decoupled Weight Decay Regularization”
- Ruder, S. (2016). “An overview of gradient descent optimization algorithms”
- CS231n: Optimization
版权声明: 如无特别声明,本文版权归 sshipanoo 所有,转载请注明本文链接。
(采用 CC BY-NC-SA 4.0 许可协议进行授权)
本文标题:《 机器学习基础系列——优化算法详解 》
本文链接:http://localhost:3015/ai/%E4%BC%98%E5%8C%96%E7%AE%97%E6%B3%95%E8%AF%A6%E8%A7%A3.html
本文最后一次更新距今已有一段时间,文章中的某些内容可能已过时!