SGD、Momentum、Adam及学习率调度

前言

优化算法决定了神经网络如何更新参数以最小化损失函数。从基础的SGD到自适应方法Adam,不同算法各有特点。


梯度下降回顾

批量梯度下降(BGD)

\[\theta = \theta - \eta \nabla_\theta J(\theta)\]
import numpy as np
import matplotlib.pyplot as plt

np.random.seed(42)  # fix the RNG so every run reproduces the same data and trajectories

# Generate a synthetic linear-regression dataset:
# y = X @ w_true + Gaussian noise (std 0.5); the optimizers below try to recover w_true.
n_samples = 1000
X = np.random.randn(n_samples, 2)
w_true = np.array([2.0, -3.0])  # ground-truth weights
y = X @ w_true + np.random.randn(n_samples) * 0.5

# 损失函数(MSE)
def compute_loss(X, y, w):
    """Mean squared error of the linear model X @ w against targets y."""
    residual = X @ w - y
    return np.mean(residual ** 2)

def compute_gradient(X, y, w):
    """Gradient of the MSE loss w.r.t. w: (2/n) * X^T (X w - y)."""
    n = len(y)
    error = X @ w - y
    return (2.0 * (X.T @ error)) / n

# 批量梯度下降
def batch_gd(X, y, learning_rate=0.1, n_iterations=100):
    """Full-batch gradient descent on the MSE objective.

    Returns (final weights, per-iteration losses, weight trajectory).
    The trajectory includes the initial all-zero weight vector.
    """
    w = np.zeros(X.shape[1])
    losses = []
    weights_history = [w.copy()]

    for _ in range(n_iterations):
        # One step using the gradient over the whole dataset.
        w = w - learning_rate * compute_gradient(X, y, w)
        losses.append(compute_loss(X, y, w))
        weights_history.append(w.copy())

    return w, losses, weights_history

# Fit with full-batch GD and compare the estimate against the generating weights.
w_bgd, losses_bgd, _ = batch_gd(X, y)
print(f"BGD结果: w = {w_bgd}")
print(f"真实值: w = {w_true}")

随机梯度下降(SGD)

每次只用一个样本更新:

\[\theta = \theta - \eta \nabla_\theta J(\theta; x^{(i)}, y^{(i)})\]
def sgd(X, y, learning_rate=0.01, n_epochs=10):
    """Plain stochastic gradient descent: one sample per update.

    Records the full-dataset loss once per epoch.
    Returns (final weights, per-epoch losses).
    """
    w = np.zeros(X.shape[1])
    losses = []

    for _ in range(n_epochs):
        # Visit the samples in a fresh random order each epoch.
        for idx in np.random.permutation(len(y)):
            grad = compute_gradient(X[idx:idx+1], y[idx:idx+1], w)
            w = w - learning_rate * grad

        losses.append(compute_loss(X, y, w))

    return w, losses

# Train with per-sample SGD (smaller LR to tame the gradient noise).
w_sgd, losses_sgd = sgd(X, y, learning_rate=0.01, n_epochs=20)
print(f"SGD结果: w = {w_sgd}")

小批量梯度下降(Mini-batch GD)

def mini_batch_gd(X, y, batch_size=32, learning_rate=0.1, n_epochs=20):
    """Mini-batch gradient descent; the middle ground between BGD and SGD.

    Shuffles indices each epoch and steps once per batch.
    Returns (final weights, per-epoch losses on the full dataset).
    """
    n_samples = len(y)
    n_batches = n_samples // batch_size  # a trailing partial batch is dropped

    w = np.zeros(X.shape[1])
    losses = []

    for _ in range(n_epochs):
        order = np.random.permutation(n_samples)

        for b in range(n_batches):
            sel = order[b * batch_size:(b + 1) * batch_size]
            w = w - learning_rate * compute_gradient(X[sel], y[sel], w)

        losses.append(compute_loss(X, y, w))

    return w, losses

# Train with mini-batch GD (batch of 32).
w_mini, losses_mini = mini_batch_gd(X, y, batch_size=32)
print(f"Mini-batch GD结果: w = {w_mini}")

比较

# Plot convergence of the three variants (first 20 BGD iterations vs 20 epochs
# of SGD / mini-batch, so the x-axes are comparable).
fig, ax = plt.subplots(figsize=(10, 6))

ax.plot(losses_bgd[:20], 'b-', label='Batch GD', linewidth=2)
ax.plot(losses_sgd, 'g-', label='SGD', linewidth=2)
ax.plot(losses_mini, 'r-', label='Mini-batch GD', linewidth=2)

ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('不同梯度下降方法比较')
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()
| 方法 | 优点 | 缺点 |
| --- | --- | --- |
| BGD | 稳定收敛 | 计算慢 |
| SGD | 快速、可能跳出局部最优 | 噪声大 |
| Mini-batch | 平衡效率和稳定性 | 需要选择batch size |

Momentum(动量)

原理

累积历史梯度方向,加速收敛:

\(v_t = \gamma v_{t-1} + \eta \nabla_\theta J(\theta)\) \(\theta = \theta - v_t\)

def sgd_momentum(X, y, batch_size=32, learning_rate=0.1, momentum=0.9, n_epochs=20):
    """Mini-batch SGD with classical (heavy-ball) momentum.

    Update rule: v <- momentum * v + lr * grad;  w <- w - v.
    Returns (final weights, per-epoch losses on the full dataset).
    """
    n_samples = len(y)
    n_batches = n_samples // batch_size

    w = np.zeros(X.shape[1])
    v = np.zeros(X.shape[1])  # velocity: exponentially accumulated gradients
    losses = []

    for _ in range(n_epochs):
        order = np.random.permutation(n_samples)

        for b in range(n_batches):
            sel = order[b * batch_size:(b + 1) * batch_size]
            grad = compute_gradient(X[sel], y[sel], w)
            v = momentum * v + learning_rate * grad
            w = w - v

        losses.append(compute_loss(X, y, w))

    return w, losses

# Train with momentum SGD using the default hyperparameters.
w_momentum, losses_momentum = sgd_momentum(X, y)
print(f"SGD + Momentum结果: w = {w_momentum}")

可视化动量效果

# 构造一个椭圆形等高线的损失函数来演示
def loss_surface(w):
    """Anisotropic quadratic bowl 0.5*(10*w0^2 + w1^2); minimum at the origin.

    The 10:1 curvature ratio makes SGD zig-zag, which the demo visualizes.
    """
    w0, w1 = w[0], w[1]
    return 0.5 * (10 * w0 ** 2 + w1 ** 2)

def grad_surface(w):
    """Analytic gradient of loss_surface: [10*w0, w1]."""
    return np.array([10 * w[0], w[1]])

# 不同方法的轨迹
def optimize_trajectory(method, n_steps=50):
    w = np.array([1.0, 1.0])
    trajectory = [w.copy()]
    v = np.zeros(2)
    
    lr = 0.1
    
    for _ in range(n_steps):
        grad = grad_surface(w)
        
        if method == 'sgd':
            w = w - lr * grad
        elif method == 'momentum':
            v = 0.9 * v + lr * grad
            w = w - v
        
        trajectory.append(w.copy())
    
    return np.array(trajectory)

traj_sgd = optimize_trajectory('sgd')
traj_momentum = optimize_trajectory('momentum')

# Visualize both optimization paths on the contour plot of the quadratic surface.
fig, ax = plt.subplots(figsize=(10, 8))

# Contour lines of 0.5*(10*x^2 + y^2), matching loss_surface above.
x_range = np.linspace(-1.5, 1.5, 100)
y_range = np.linspace(-1.5, 1.5, 100)
X_mesh, Y_mesh = np.meshgrid(x_range, y_range)
Z = 0.5 * (10 * X_mesh**2 + Y_mesh**2)

ax.contour(X_mesh, Y_mesh, Z, levels=20, cmap='viridis', alpha=0.6)
ax.plot(traj_sgd[:, 0], traj_sgd[:, 1], 'b-o', label='SGD', markersize=3)
ax.plot(traj_momentum[:, 0], traj_momentum[:, 1], 'r-s', label='Momentum', markersize=3)
ax.scatter([0], [0], c='green', s=200, marker='*', label='最优点')
ax.set_xlabel('w1')
ax.set_ylabel('w2')
ax.set_title('SGD vs Momentum 优化轨迹')
ax.legend()
ax.set_xlim(-1.5, 1.5)
ax.set_ylim(-1.5, 1.5)

plt.show()

Nesterov Accelerated Gradient (NAG)

原理

先“展望”,再计算梯度:

\(v_t = \gamma v_{t-1} + \eta \nabla_\theta J(\theta - \gamma v_{t-1})\) \(\theta = \theta - v_t\)

def nag(X, y, batch_size=32, learning_rate=0.1, momentum=0.9, n_epochs=20):
    """Nesterov Accelerated Gradient (mini-batch).

    Evaluates the gradient at the lookahead point w - momentum * v
    before applying the momentum update.
    Returns (final weights, per-epoch losses on the full dataset).
    """
    n_samples = len(y)
    n_batches = n_samples // batch_size

    w = np.zeros(X.shape[1])
    v = np.zeros(X.shape[1])  # velocity
    losses = []

    for _ in range(n_epochs):
        order = np.random.permutation(n_samples)

        for b in range(n_batches):
            sel = order[b * batch_size:(b + 1) * batch_size]

            # "Look ahead" along the current velocity, then take the gradient there.
            lookahead = w - momentum * v
            grad = compute_gradient(X[sel], y[sel], lookahead)

            v = momentum * v + learning_rate * grad
            w = w - v

        losses.append(compute_loss(X, y, w))

    return w, losses

# Train with NAG using the default hyperparameters.
w_nag, losses_nag = nag(X, y)
print(f"NAG结果: w = {w_nag}")

AdaGrad

原理

自适应学习率,对频繁更新的参数降低学习率:

\(G_t = G_{t-1} + g_t^2\) \(\theta = \theta - \frac{\eta}{\sqrt{G_t + \epsilon}} g_t\)

def adagrad(X, y, batch_size=32, learning_rate=0.5, n_epochs=20, epsilon=1e-8):
    """AdaGrad: per-coordinate learning rates scaled by accumulated squared gradients.

    NOTE(review): epsilon is added outside the square root here
    (g / (sqrt(G) + eps)), while the formula in the text uses sqrt(G + eps);
    both variants appear in practice, behavior kept as written.
    Returns (final weights, per-epoch losses on the full dataset).
    """
    n_samples = len(y)
    n_batches = n_samples // batch_size

    w = np.zeros(X.shape[1])
    G = np.zeros(X.shape[1])  # running sum of squared gradients (monotonically grows)
    losses = []

    for _ in range(n_epochs):
        order = np.random.permutation(n_samples)

        for b in range(n_batches):
            sel = order[b * batch_size:(b + 1) * batch_size]
            grad = compute_gradient(X[sel], y[sel], w)
            G += grad ** 2
            w = w - learning_rate * grad / (np.sqrt(G) + epsilon)

        losses.append(compute_loss(X, y, w))

    return w, losses

# Train with AdaGrad using the default hyperparameters.
w_adagrad, losses_adagrad = adagrad(X, y)
print(f"AdaGrad结果: w = {w_adagrad}")

问题

学习率单调下降,后期可能过小。


RMSprop

原理

使用指数加权移动平均:

\(E[g^2]_t = \gamma E[g^2]_{t-1} + (1-\gamma) g_t^2\) \(\theta = \theta - \frac{\eta}{\sqrt{E[g^2]_t + \epsilon}} g_t\)

def rmsprop(X, y, batch_size=32, learning_rate=0.01, decay_rate=0.9, n_epochs=20, epsilon=1e-8):
    """RMSprop: like AdaGrad but with an exponential moving average of g^2,
    so the effective learning rate does not decay monotonically.

    Returns (final weights, per-epoch losses on the full dataset).
    """
    n_samples = len(y)
    n_batches = n_samples // batch_size

    w = np.zeros(X.shape[1])
    E_g2 = np.zeros(X.shape[1])  # EMA of squared gradients
    losses = []

    for _ in range(n_epochs):
        order = np.random.permutation(n_samples)

        for b in range(n_batches):
            sel = order[b * batch_size:(b + 1) * batch_size]
            grad = compute_gradient(X[sel], y[sel], w)
            E_g2 = decay_rate * E_g2 + (1 - decay_rate) * grad ** 2
            w = w - learning_rate * grad / (np.sqrt(E_g2) + epsilon)

        losses.append(compute_loss(X, y, w))

    return w, losses

# Train with RMSprop using the default hyperparameters.
w_rmsprop, losses_rmsprop = rmsprop(X, y)
print(f"RMSprop结果: w = {w_rmsprop}")

Adam

原理

结合Momentum和RMSprop:

\(m_t = \beta_1 m_{t-1} + (1-\beta_1) g_t\) \(v_t = \beta_2 v_{t-1} + (1-\beta_2) g_t^2\) \(\hat{m}_t = \frac{m_t}{1-\beta_1^t}, \quad \hat{v}_t = \frac{v_t}{1-\beta_2^t}\) \(\theta = \theta - \frac{\eta}{\sqrt{\hat{v}_t} + \epsilon} \hat{m}_t\)

def adam(X, y, batch_size=32, learning_rate=0.001, beta1=0.9, beta2=0.999, 
         n_epochs=20, epsilon=1e-8):
    """Adam: momentum on the gradient (first moment) combined with
    RMSprop-style scaling (second moment), both bias-corrected.

    Returns (final weights, per-epoch losses on the full dataset).
    """
    n_samples = len(y)
    n_batches = n_samples // batch_size

    w = np.zeros(X.shape[1])
    m = np.zeros(X.shape[1])  # first-moment (mean) estimate
    v = np.zeros(X.shape[1])  # second-moment (uncentered variance) estimate
    losses = []
    step = 0  # global update counter, used for bias correction

    for _ in range(n_epochs):
        order = np.random.permutation(n_samples)

        for b in range(n_batches):
            step += 1
            sel = order[b * batch_size:(b + 1) * batch_size]
            grad = compute_gradient(X[sel], y[sel], w)

            # Exponential moving averages of the gradient and its square.
            m = beta1 * m + (1 - beta1) * grad
            v = beta2 * v + (1 - beta2) * grad ** 2

            # Bias correction counteracts the zero initialization of m and v.
            m_hat = m / (1 - beta1 ** step)
            v_hat = v / (1 - beta2 ** step)

            w = w - learning_rate * m_hat / (np.sqrt(v_hat) + epsilon)

        losses.append(compute_loss(X, y, w))

    return w, losses

# Train with Adam using the default hyperparameters.
w_adam, losses_adam = adam(X, y)
print(f"Adam结果: w = {w_adam}")

Adam变体

def adamw(X, y, batch_size=32, learning_rate=0.001, beta1=0.9, beta2=0.999,
          weight_decay=0.01, n_epochs=20, epsilon=1e-8):
    """AdamW: Adam with decoupled weight decay.

    The decay term (weight_decay * w) is applied directly to the parameters
    instead of being folded into the gradient, as in Loshchilov & Hutter (2017).
    Returns (final weights, per-epoch losses on the full dataset).
    """
    n_samples = len(y)
    n_batches = n_samples // batch_size

    w = np.zeros(X.shape[1])
    m = np.zeros(X.shape[1])  # first-moment estimate
    v = np.zeros(X.shape[1])  # second-moment estimate
    losses = []
    step = 0  # global update counter for bias correction

    for _ in range(n_epochs):
        order = np.random.permutation(n_samples)

        for b in range(n_batches):
            step += 1
            sel = order[b * batch_size:(b + 1) * batch_size]
            grad = compute_gradient(X[sel], y[sel], w)

            m = beta1 * m + (1 - beta1) * grad
            v = beta2 * v + (1 - beta2) * grad ** 2

            m_hat = m / (1 - beta1 ** step)
            v_hat = v / (1 - beta2 ** step)

            # Decoupled weight decay: decay is added alongside the Adam step.
            w = w - learning_rate * (m_hat / (np.sqrt(v_hat) + epsilon) + weight_decay * w)

        losses.append(compute_loss(X, y, w))

    return w, losses

优化器比较

# Compare all optimizers' per-epoch losses on a log scale.
fig, ax = plt.subplots(figsize=(12, 6))

ax.plot(losses_mini, label='SGD', linewidth=2)
ax.plot(losses_momentum, label='Momentum', linewidth=2)
ax.plot(losses_nag, label='NAG', linewidth=2)
ax.plot(losses_adagrad, label='AdaGrad', linewidth=2)
ax.plot(losses_rmsprop, label='RMSprop', linewidth=2)
ax.plot(losses_adam, label='Adam', linewidth=2)

ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('优化器比较')
ax.legend()
ax.grid(True, alpha=0.3)
ax.set_yscale('log')

plt.show()
| 优化器 | 特点 | 适用场景 |
| --- | --- | --- |
| SGD | 简单、可能找到更平坦的极小值 | 需要精细调参 |
| Momentum | 加速收敛 | 通用 |
| AdaGrad | 自适应学习率 | 稀疏数据 |
| RMSprop | 解决AdaGrad学习率衰减问题 | RNN |
| Adam | 自适应+动量 | 默认首选 |

学习率调度

常用策略

# 学习率调度策略
def step_decay(epoch, initial_lr=0.1, drop=0.5, epochs_drop=10):
    """Step schedule: multiply the LR by `drop` once every `epochs_drop` epochs."""
    n_drops = epoch // epochs_drop
    return initial_lr * drop ** n_drops

def exponential_decay(epoch, initial_lr=0.1, decay_rate=0.96):
    """Exponential schedule: LR shrinks by a factor of `decay_rate` per epoch."""
    factor = decay_rate ** epoch
    return initial_lr * factor

def cosine_annealing(epoch, initial_lr=0.1, T_max=100):
    """Cosine schedule: LR glides from initial_lr to 0 over T_max epochs."""
    cosine = np.cos(np.pi * epoch / T_max)
    return initial_lr * (1 + cosine) / 2

def warmup_cosine(epoch, initial_lr=0.1, warmup_epochs=10, total_epochs=100):
    """Linear warmup from 0 to initial_lr, then cosine decay to 0.

    The warmup phase covers epochs [0, warmup_epochs); the cosine phase
    spans the remaining total_epochs - warmup_epochs epochs.
    """
    if epoch < warmup_epochs:
        # Linear ramp: 0 at epoch 0, initial_lr at epoch == warmup_epochs.
        return initial_lr * epoch / warmup_epochs
    return initial_lr * (1 + np.cos(np.pi * (epoch - warmup_epochs) / (total_epochs - warmup_epochs))) / 2

# Visualize the four learning-rate schedules side by side over 100 epochs.
epochs = np.arange(100)

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

ax = axes[0, 0]
ax.plot(epochs, [step_decay(e) for e in epochs])
ax.set_title('Step Decay')
ax.set_xlabel('Epoch')
ax.set_ylabel('Learning Rate')
ax.grid(True, alpha=0.3)

ax = axes[0, 1]
ax.plot(epochs, [exponential_decay(e) for e in epochs])
ax.set_title('Exponential Decay')
ax.set_xlabel('Epoch')
ax.set_ylabel('Learning Rate')
ax.grid(True, alpha=0.3)

ax = axes[1, 0]
ax.plot(epochs, [cosine_annealing(e, T_max=100) for e in epochs])
ax.set_title('Cosine Annealing')
ax.set_xlabel('Epoch')
ax.set_ylabel('Learning Rate')
ax.grid(True, alpha=0.3)

ax = axes[1, 1]
ax.plot(epochs, [warmup_cosine(e, total_epochs=100) for e in epochs])
ax.set_title('Warmup + Cosine')
ax.set_xlabel('Epoch')
ax.set_ylabel('Learning Rate')
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

PyTorch实现

try:
    import torch
    import torch.optim as optim
    
    # A single learnable parameter tensor used to construct the optimizers below.
    w = torch.randn(2, requires_grad=True)
    
    # Instantiate the common built-in optimizers for comparison.
    optimizers = {
        'SGD': optim.SGD([w], lr=0.01),
        'SGD+Momentum': optim.SGD([w], lr=0.01, momentum=0.9),
        'Adam': optim.Adam([w], lr=0.001),
        'AdamW': optim.AdamW([w], lr=0.001, weight_decay=0.01),
        'RMSprop': optim.RMSprop([w], lr=0.01)
    }
    
    # LR scheduler example. NOTE: it wraps a freshly created throwaway Adam
    # instance here purely for demonstration; in real code the scheduler must
    # wrap the same optimizer instance you call .step() on.
    scheduler = optim.lr_scheduler.CosineAnnealingLR(
        optim.Adam([w], lr=0.001), 
        T_max=100
    )
    
    print("PyTorch优化器:")
    for name in optimizers:
        print(f"  {name}")
    
    print("\n学习率调度器:")
    print("  CosineAnnealingLR")
    print("  StepLR")
    print("  ExponentialLR")
    print("  ReduceLROnPlateau")
    
except ImportError:
    # PyTorch is optional for this article; skip the demo if it's absent.
    print("PyTorch未安装")

常见问题

Q1: 如何选择优化器?

  • 默认首选:Adam
  • 追求泛化:SGD + Momentum
  • 稀疏数据:AdaGrad

Q2: 如何选择学习率?

  • 从大到小尝试:0.1, 0.01, 0.001, 0.0001
  • 使用学习率范围测试
  • 使用学习率调度

Q3: Batch size如何影响训练?

| Batch Size | 优点 | 缺点 |
| --- | --- | --- |
| 小 | 泛化好 | 训练慢、不稳定 |
| 大 | 训练快、稳定 | 可能泛化差 |

Q4: Adam的默认参数是什么?

  • $\beta_1 = 0.9$
  • $\beta_2 = 0.999$
  • $\epsilon = 10^{-8}$

总结

| 优化器 | 更新规则 | 关键超参数 |
| --- | --- | --- |
| SGD | $w - \eta g$ | $\eta$ |
| Momentum | $w - (\gamma v + \eta g)$ | $\eta, \gamma$ |
| Adam | $w - \eta \hat{m}/\sqrt{\hat{v}}$ | $\eta, \beta_1, \beta_2$ |

参考资料

  • Kingma, D. & Ba, J. (2014). “Adam: A Method for Stochastic Optimization”
  • Loshchilov, I. & Hutter, F. (2017). “Decoupled Weight Decay Regularization”
  • Ruder, S. (2016). “An overview of gradient descent optimization algorithms”
  • CS231n: Optimization

版权声明: 如无特别声明,本文版权归 sshipanoo 所有,转载请注明本文链接。

(采用 CC BY-NC-SA 4.0 许可协议进行授权)

本文标题:《 机器学习基础系列——优化算法详解 》

本文链接:http://localhost:3015/ai/%E4%BC%98%E5%8C%96%E7%AE%97%E6%B3%95%E8%AF%A6%E8%A7%A3.html

本文距最后一次更新已有一段时间,文章中的某些内容可能已过时!