SGD、Momentum、Adam及学习率调度

前言

优化算法决定了神经网络如何更新参数以最小化损失函数。从基础的SGD到自适应方法Adam,不同算法各有特点。


梯度下降回顾

批量梯度下降(BGD)

\[\theta = \theta - \eta \nabla_\theta J(\theta)\]
import numpy as np
import matplotlib.pyplot as plt

np.random.seed(42)  # fix the RNG so every run reproduces the same data and trajectories

# Generate a synthetic linear-regression dataset:
# y = X @ w_true + Gaussian noise (std 0.5); the optimizers below try to recover w_true.
n_samples = 1000
X = np.random.randn(n_samples, 2)
w_true = np.array([2.0, -3.0])  # ground-truth weights
y = X @ w_true + np.random.randn(n_samples) * 0.5

# 损失函数(MSE)
def compute_loss(X, y, w):
    """Mean squared error of the linear model X @ w against targets y."""
    residual = X @ w - y
    return np.mean(residual ** 2)

def compute_gradient(X, y, w):
    """Gradient of the MSE loss w.r.t. w: (2/n) * X^T (X w - y)."""
    n = len(y)
    error = X @ w - y
    return (2.0 * (X.T @ error)) / n

# 批量梯度下降
def batch_gd(X, y, learning_rate=0.1, n_iterations=100):
    """Full-batch gradient descent on the MSE objective.

    Returns (final weights, per-iteration losses, weight trajectory).
    The trajectory includes the initial all-zero weight vector.
    """
    w = np.zeros(X.shape[1])
    losses = []
    weights_history = [w.copy()]

    for _ in range(n_iterations):
        # One step using the gradient over the whole dataset.
        w = w - learning_rate * compute_gradient(X, y, w)
        losses.append(compute_loss(X, y, w))
        weights_history.append(w.copy())

    return w, losses, weights_history

# Fit with full-batch GD and compare the estimate against the generating weights.
w_bgd, losses_bgd, _ = batch_gd(X, y)
print(f"BGD结果: w = {w_bgd}")
print(f"真实值: w = {w_true}")

随机梯度下降(SGD)

每次只用一个样本更新:

\[\theta = \theta - \eta \nabla_\theta J(\theta; x^{(i)}, y^{(i)})\]
def sgd(X, y, learning_rate=0.01, n_epochs=10):
    """Plain stochastic gradient descent: one sample per update.

    Records the full-dataset loss once per epoch.
    Returns (final weights, per-epoch losses).
    """
    w = np.zeros(X.shape[1])
    losses = []

    for _ in range(n_epochs):
        # Visit the samples in a fresh random order each epoch.
        for idx in np.random.permutation(len(y)):
            grad = compute_gradient(X[idx:idx+1], y[idx:idx+1], w)
            w = w - learning_rate * grad

        losses.append(compute_loss(X, y, w))

    return w, losses

# Train with per-sample SGD (smaller LR to tame the gradient noise).
w_sgd, losses_sgd = sgd(X, y, learning_rate=0.01, n_epochs=20)
print(f"SGD结果: w = {w_sgd}")

小批量梯度下降(Mini-batch GD)

def mini_batch_gd(X, y, batch_size=32, learning_rate=0.1, n_epochs=20):
    """Mini-batch gradient descent; the middle ground between BGD and SGD.

    Shuffles indices each epoch and steps once per batch.
    Returns (final weights, per-epoch losses on the full dataset).
    """
    n_samples = len(y)
    n_batches = n_samples // batch_size  # a trailing partial batch is dropped

    w = np.zeros(X.shape[1])
    losses = []

    for _ in range(n_epochs):
        order = np.random.permutation(n_samples)

        for b in range(n_batches):
            sel = order[b * batch_size:(b + 1) * batch_size]
            w = w - learning_rate * compute_gradient(X[sel], y[sel], w)

        losses.append(compute_loss(X, y, w))

    return w, losses

# Train with mini-batch GD (batch of 32).
w_mini, losses_mini = mini_batch_gd(X, y, batch_size=32)
print(f"Mini-batch GD结果: w = {w_mini}")

比较

# Plot convergence of the three variants (first 20 BGD iterations vs 20 epochs
# of SGD / mini-batch, so the x-axes are comparable).
fig, ax = plt.subplots(figsize=(10, 6))

ax.plot(losses_bgd[:20], 'b-', label='Batch GD', linewidth=2)
ax.plot(losses_sgd, 'g-', label='SGD', linewidth=2)
ax.plot(losses_mini, 'r-', label='Mini-batch GD', linewidth=2)

ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('不同梯度下降方法比较')
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()
| 方法 | 优点 | 缺点 |
| --- | --- | --- |
| BGD | 稳定收敛 | 计算慢 |
| SGD | 快速、可能跳出局部最优 | 噪声大 |
| Mini-batch | 平衡效率和稳定性 | 需要选择batch size |

Momentum(动量)

原理

累积历史梯度方向,加速收敛:

\(v_t = \gamma v_{t-1} + \eta \nabla_\theta J(\theta)\) \(\theta = \theta - v_t\)

def sgd_momentum(X, y, batch_size=32, learning_rate=0.1, momentum=0.9, n_epochs=20):
    """Mini-batch SGD with classical (heavy-ball) momentum.

    Update rule: v <- momentum * v + lr * grad;  w <- w - v.
    Returns (final weights, per-epoch losses on the full dataset).
    """
    n_samples = len(y)
    n_batches = n_samples // batch_size

    w = np.zeros(X.shape[1])
    v = np.zeros(X.shape[1])  # velocity: exponentially accumulated gradients
    losses = []

    for _ in range(n_epochs):
        order = np.random.permutation(n_samples)

        for b in range(n_batches):
            sel = order[b * batch_size:(b + 1) * batch_size]
            grad = compute_gradient(X[sel], y[sel], w)
            v = momentum * v + learning_rate * grad
            w = w - v

        losses.append(compute_loss(X, y, w))

    return w, losses

# Train with momentum SGD using the default hyperparameters.
w_momentum, losses_momentum = sgd_momentum(X, y)
print(f"SGD + Momentum结果: w = {w_momentum}")

可视化动量效果

# 构造一个椭圆形等高线的损失函数来演示
def loss_surface(w):
    """Anisotropic quadratic bowl 0.5*(10*w0^2 + w1^2); minimum at the origin.

    The 10:1 curvature ratio makes SGD zig-zag, which the demo visualizes.
    """
    w0, w1 = w[0], w[1]
    return 0.5 * (10 * w0 ** 2 + w1 ** 2)

def grad_surface(w):
    """Analytic gradient of loss_surface: [10*w0, w1]."""
    return np.array([10 * w[0], w[1]])

# 不同方法的轨迹
def optimize_trajectory(method, n_steps=50):
    w = np.array([1.0, 1.0])
    trajectory = [w.copy()]
    v = np.zeros(2)
    
    lr = 0.1
    
    for _ in range(n_steps):
        grad = grad_surface(w)
        
        if method == 'sgd':
            w = w - lr * grad
        elif method == 'momentum':
            v = 0.9 * v + lr * grad
            w = w - v
        
        trajectory.append(w.copy())
    
    return np.array(trajectory)

traj_sgd = optimize_trajectory('sgd')
traj_momentum = optimize_trajectory('momentum')

# Visualize both optimization paths on the contour plot of the quadratic surface.
fig, ax = plt.subplots(figsize=(10, 8))

# Contour lines of 0.5*(10*x^2 + y^2), matching loss_surface above.
x_range = np.linspace(-1.5, 1.5, 100)
y_range = np.linspace(-1.5, 1.5, 100)
X_mesh, Y_mesh = np.meshgrid(x_range, y_range)
Z = 0.5 * (10 * X_mesh**2 + Y_mesh**2)

ax.contour(X_mesh, Y_mesh, Z, levels=20, cmap='viridis', alpha=0.6)
ax.plot(traj_sgd[:, 0], traj_sgd[:, 1], 'b-o', label='SGD', markersize=3)
ax.plot(traj_momentum[:, 0], traj_momentum[:, 1], 'r-s', label='Momentum', markersize=3)
ax.scatter([0], [0], c='green', s=200, marker='*', label='最优点')
ax.set_xlabel('w1')
ax.set_ylabel('w2')
ax.set_title('SGD vs Momentum 优化轨迹')
ax.legend()
ax.set_xlim(-1.5, 1.5)
ax.set_ylim(-1.5, 1.5)

plt.show()

Nesterov Accelerated Gradient (NAG)

原理

先“展望”,再计算梯度:

\(v_t = \gamma v_{t-1} + \eta \nabla_\theta J(\theta - \gamma v_{t-1})\) \(\theta = \theta - v_t\)

def nag(X, y, batch_size=32, learning_rate=0.1, momentum=0.9, n_epochs=20):
    """Nesterov Accelerated Gradient (mini-batch).

    Evaluates the gradient at the lookahead point w - momentum * v
    before applying the momentum update.
    Returns (final weights, per-epoch losses on the full dataset).
    """
    n_samples = len(y)
    n_batches = n_samples // batch_size

    w = np.zeros(X.shape[1])
    v = np.zeros(X.shape[1])  # velocity
    losses = []

    for _ in range(n_epochs):
        order = np.random.permutation(n_samples)

        for b in range(n_batches):
            sel = order[b * batch_size:(b + 1) * batch_size]

            # "Look ahead" along the current velocity, then take the gradient there.
            lookahead = w - momentum * v
            grad = compute_gradient(X[sel], y[sel], lookahead)

            v = momentum * v + learning_rate * grad
            w = w - v

        losses.append(compute_loss(X, y, w))

    return w, losses

# Train with NAG using the default hyperparameters.
w_nag, losses_nag = nag(X, y)
print(f"NAG结果: w = {w_nag}")

AdaGrad

原理

自适应学习率,对频繁更新的参数降低学习率:

\(G_t = G_{t-1} + g_t^2\) \(\theta = \theta - \frac{\eta}{\sqrt{G_t + \epsilon}} g_t\)

def adagrad(X, y, batch_size=32, learning_rate=0.5, n_epochs=20, epsilon=1e-8):
    """AdaGrad: per-coordinate learning rates scaled by accumulated squared gradients.

    NOTE(review): epsilon is added outside the square root here
    (g / (sqrt(G) + eps)), while the formula in the text uses sqrt(G + eps);
    both variants appear in practice, behavior kept as written.
    Returns (final weights, per-epoch losses on the full dataset).
    """
    n_samples = len(y)
    n_batches = n_samples // batch_size

    w = np.zeros(X.shape[1])
    G = np.zeros(X.shape[1])  # running sum of squared gradients (monotonically grows)
    losses = []

    for _ in range(n_epochs):
        order = np.random.permutation(n_samples)

        for b in range(n_batches):
            sel = order[b * batch_size:(b + 1) * batch_size]
            grad = compute_gradient(X[sel], y[sel], w)
            G += grad ** 2
            w = w - learning_rate * grad / (np.sqrt(G) + epsilon)

        losses.append(compute_loss(X, y, w))

    return w, losses

# Train with AdaGrad using the default hyperparameters.
w_adagrad, losses_adagrad = adagrad(X, y)
print(f"AdaGrad结果: w = {w_adagrad}")

问题

学习率单调下降,后期可能过小。


RMSprop

原理

使用指数加权移动平均:

\(E[g^2]_t = \gamma E[g^2]_{t-1} + (1-\gamma) g_t^2\) \(\theta = \theta - \frac{\eta}{\sqrt{E[g^2]_t + \epsilon}} g_t\)

def rmsprop(X, y, batch_size=32, learning_rate=0.01, decay_rate=0.9, n_epochs=20, epsilon=1e-8):
    """RMSprop: like AdaGrad but with an exponential moving average of g^2,
    so the effective learning rate does not decay monotonically.

    Returns (final weights, per-epoch losses on the full dataset).
    """
    n_samples = len(y)
    n_batches = n_samples // batch_size

    w = np.zeros(X.shape[1])
    E_g2 = np.zeros(X.shape[1])  # EMA of squared gradients
    losses = []

    for _ in range(n_epochs):
        order = np.random.permutation(n_samples)

        for b in range(n_batches):
            sel = order[b * batch_size:(b + 1) * batch_size]
            grad = compute_gradient(X[sel], y[sel], w)
            E_g2 = decay_rate * E_g2 + (1 - decay_rate) * grad ** 2
            w = w - learning_rate * grad / (np.sqrt(E_g2) + epsilon)

        losses.append(compute_loss(X, y, w))

    return w, losses

# Train with RMSprop using the default hyperparameters.
w_rmsprop, losses_rmsprop = rmsprop(X, y)
print(f"RMSprop结果: w = {w_rmsprop}")

Adam

原理

结合Momentum和RMSprop:

\(m_t = \beta_1 m_{t-1} + (1-\beta_1) g_t\) \(v_t = \beta_2 v_{t-1} + (1-\beta_2) g_t^2\) \(\hat{m}_t = \frac{m_t}{1-\beta_1^t}, \quad \hat{v}_t = \frac{v_t}{1-\beta_2^t}\) \(\theta = \theta - \frac{\eta}{\sqrt{\hat{v}_t} + \epsilon} \hat{m}_t\)

def adam(X, y, batch_size=32, learning_rate=0.001, beta1=0.9, beta2=0.999, 
         n_epochs=20, epsilon=1e-8):
    """Adam: momentum on the gradient (first moment) combined with
    RMSprop-style scaling (second moment), both bias-corrected.

    Returns (final weights, per-epoch losses on the full dataset).
    """
    n_samples = len(y)
    n_batches = n_samples // batch_size

    w = np.zeros(X.shape[1])
    m = np.zeros(X.shape[1])  # first-moment (mean) estimate
    v = np.zeros(X.shape[1])  # second-moment (uncentered variance) estimate
    losses = []
    step = 0  # global update counter, used for bias correction

    for _ in range(n_epochs):
        order = np.random.permutation(n_samples)

        for b in range(n_batches):
            step += 1
            sel = order[b * batch_size:(b + 1) * batch_size]
            grad = compute_gradient(X[sel], y[sel], w)

            # Exponential moving averages of the gradient and its square.
            m = beta1 * m + (1 - beta1) * grad
            v = beta2 * v + (1 - beta2) * grad ** 2

            # Bias correction counteracts the zero initialization of m and v.
            m_hat = m / (1 - beta1 ** step)
            v_hat = v / (1 - beta2 ** step)

            w = w - learning_rate * m_hat / (np.sqrt(v_hat) + epsilon)

        losses.append(compute_loss(X, y, w))

    return w, losses

# Train with Adam using the default hyperparameters.
w_adam, losses_adam = adam(X, y)
print(f"Adam结果: w = {w_adam}")

Adam变体

def adamw(X, y, batch_size=32, learning_rate=0.001, beta1=0.9, beta2=0.999,
          weight_decay=0.01, n_epochs=20, epsilon=1e-8):
    """AdamW: Adam with decoupled weight decay.

    The decay term (weight_decay * w) is applied directly to the parameters
    instead of being folded into the gradient, as in Loshchilov & Hutter (2017).
    Returns (final weights, per-epoch losses on the full dataset).
    """
    n_samples = len(y)
    n_batches = n_samples // batch_size

    w = np.zeros(X.shape[1])
    m = np.zeros(X.shape[1])  # first-moment estimate
    v = np.zeros(X.shape[1])  # second-moment estimate
    losses = []
    step = 0  # global update counter for bias correction

    for _ in range(n_epochs):
        order = np.random.permutation(n_samples)

        for b in range(n_batches):
            step += 1
            sel = order[b * batch_size:(b + 1) * batch_size]
            grad = compute_gradient(X[sel], y[sel], w)

            m = beta1 * m + (1 - beta1) * grad
            v = beta2 * v + (1 - beta2) * grad ** 2

            m_hat = m / (1 - beta1 ** step)
            v_hat = v / (1 - beta2 ** step)

            # Decoupled weight decay: decay is added alongside the Adam step.
            w = w - learning_rate * (m_hat / (np.sqrt(v_hat) + epsilon) + weight_decay * w)

        losses.append(compute_loss(X, y, w))

    return w, losses

优化器比较

# Compare all optimizers' per-epoch losses on a log scale.
fig, ax = plt.subplots(figsize=(12, 6))

ax.plot(losses_mini, label='SGD', linewidth=2)
ax.plot(losses_momentum, label='Momentum', linewidth=2)
ax.plot(losses_nag, label='NAG', linewidth=2)
ax.plot(losses_adagrad, label='AdaGrad', linewidth=2)
ax.plot(losses_rmsprop, label='RMSprop', linewidth=2)
ax.plot(losses_adam, label='Adam', linewidth=2)

ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('优化器比较')
ax.legend()
ax.grid(True, alpha=0.3)
ax.set_yscale('log')

plt.show()
| 优化器 | 特点 | 适用场景 |
| --- | --- | --- |
| SGD | 简单、可能找到更平坦的极小值 | 需要精细调参 |
| Momentum | 加速收敛 | 通用 |
| AdaGrad | 自适应学习率 | 稀疏数据 |
| RMSprop | 解决AdaGrad学习率衰减问题 | RNN |
| Adam | 自适应+动量 | 默认首选 |

学习率调度

常用策略

# 学习率调度策略
def step_decay(epoch, initial_lr=0.1, drop=0.5, epochs_drop=10):
    """Step schedule: multiply the LR by `drop` once every `epochs_drop` epochs."""
    n_drops = epoch // epochs_drop
    return initial_lr * drop ** n_drops

def exponential_decay(epoch, initial_lr=0.1, decay_rate=0.96):
    """Exponential schedule: LR shrinks by a factor of `decay_rate` per epoch."""
    factor = decay_rate ** epoch
    return initial_lr * factor

def cosine_annealing(epoch, initial_lr=0.1, T_max=100):
    """Cosine schedule: LR glides from initial_lr to 0 over T_max epochs."""
    cosine = np.cos(np.pi * epoch / T_max)
    return initial_lr * (1 + cosine) / 2

def warmup_cosine(epoch, initial_lr=0.1, warmup_epochs=10, total_epochs=100):
    """Linear warmup from 0 to initial_lr, then cosine decay to 0.

    The warmup phase covers epochs [0, warmup_epochs); the cosine phase
    spans the remaining total_epochs - warmup_epochs epochs.
    """
    if epoch < warmup_epochs:
        # Linear ramp: 0 at epoch 0, initial_lr at epoch == warmup_epochs.
        return initial_lr * epoch / warmup_epochs
    return initial_lr * (1 + np.cos(np.pi * (epoch - warmup_epochs) / (total_epochs - warmup_epochs))) / 2

# Visualize the four learning-rate schedules side by side over 100 epochs.
epochs = np.arange(100)

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

ax = axes[0, 0]
ax.plot(epochs, [step_decay(e) for e in epochs])
ax.set_title('Step Decay')
ax.set_xlabel('Epoch')
ax.set_ylabel('Learning Rate')
ax.grid(True, alpha=0.3)

ax = axes[0, 1]
ax.plot(epochs, [exponential_decay(e) for e in epochs])
ax.set_title('Exponential Decay')
ax.set_xlabel('Epoch')
ax.set_ylabel('Learning Rate')
ax.grid(True, alpha=0.3)

ax = axes[1, 0]
ax.plot(epochs, [cosine_annealing(e, T_max=100) for e in epochs])
ax.set_title('Cosine Annealing')
ax.set_xlabel('Epoch')
ax.set_ylabel('Learning Rate')
ax.grid(True, alpha=0.3)

ax = axes[1, 1]
ax.plot(epochs, [warmup_cosine(e, total_epochs=100) for e in epochs])
ax.set_title('Warmup + Cosine')
ax.set_xlabel('Epoch')
ax.set_ylabel('Learning Rate')
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

PyTorch实现

try:
    import torch
    import torch.optim as optim
    
    # A single learnable parameter tensor used to construct the optimizers below.
    w = torch.randn(2, requires_grad=True)
    
    # Instantiate the common built-in optimizers for comparison.
    optimizers = {
        'SGD': optim.SGD([w], lr=0.01),
        'SGD+Momentum': optim.SGD([w], lr=0.01, momentum=0.9),
        'Adam': optim.Adam([w], lr=0.001),
        'AdamW': optim.AdamW([w], lr=0.001, weight_decay=0.01),
        'RMSprop': optim.RMSprop([w], lr=0.01)
    }
    
    # LR scheduler example. NOTE: it wraps a freshly created throwaway Adam
    # instance here purely for demonstration; in real code the scheduler must
    # wrap the same optimizer instance you call .step() on.
    scheduler = optim.lr_scheduler.CosineAnnealingLR(
        optim.Adam([w], lr=0.001), 
        T_max=100
    )
    
    print("PyTorch优化器:")
    for name in optimizers:
        print(f"  {name}")
    
    print("\n学习率调度器:")
    print("  CosineAnnealingLR")
    print("  StepLR")
    print("  ExponentialLR")
    print("  ReduceLROnPlateau")
    
except ImportError:
    # PyTorch is optional for this article; skip the demo if it's absent.
    print("PyTorch未安装")

常见问题

Q1: 如何选择优化器?

  • 默认首选:Adam
  • 追求泛化:SGD + Momentum
  • 稀疏数据:AdaGrad

Q2: 如何选择学习率?

  • 从大到小尝试:0.1, 0.01, 0.001, 0.0001
  • 使用学习率范围测试
  • 使用学习率调度

Q3: Batch size如何影响训练?

| Batch Size | 优点 | 缺点 |
| --- | --- | --- |
| 小 | 泛化好 | 训练慢、不稳定 |
| 大 | 训练快、稳定 | 可能泛化差 |

Q4: Adam的默认参数是什么?

  • $\beta_1 = 0.9$
  • $\beta_2 = 0.999$
  • $\epsilon = 10^{-8}$

总结

| 优化器 | 更新规则 | 关键超参数 |
| --- | --- | --- |
| SGD | $w - \eta g$ | $\eta$ |
| Momentum | $w - (\gamma v + \eta g)$ | $\eta, \gamma$ |
| Adam | $w - \eta \hat{m}/\sqrt{\hat{v}}$ | $\eta, \beta_1, \beta_2$ |

参考资料

  • Kingma, D. & Ba, J. (2014). “Adam: A Method for Stochastic Optimization”
  • Loshchilov, I. & Hutter, F. (2017). “Decoupled Weight Decay Regularization”
  • Ruder, S. (2016). “An overview of gradient descent optimization algorithms”
  • CS231n: Optimization

版权声明: 如无特别声明,本文版权归 sshipanoo 所有,转载请注明本文链接。

(采用 CC BY-NC-SA 4.0 许可协议进行授权)

本文标题:《 机器学习基础系列——优化算法详解 》

本文链接:http://localhost:3015/ai/%E4%BC%98%E5%8C%96%E7%AE%97%E6%B3%95%E8%AF%A6%E8%A7%A3.html

本文距最后一次更新已有一段时间,文章中的某些内容可能已过时!