The Chain Rule, Gradient Computation, and Computational Graphs

Preface

Backpropagation is the core algorithm for training neural networks. It uses the chain rule to efficiently compute the gradient of the loss function with respect to every parameter, which is what makes gradient descent optimization possible.
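
As a quick illustration of that last step (a minimal sketch with a made-up one-parameter loss, not code from the article): once the gradient of the loss with respect to a parameter is known, gradient descent repeatedly moves the parameter against it.

# Minimal gradient-descent sketch: minimize L(w) = (w - 3)^2, whose gradient is
# dL/dw = 2 * (w - 3). Backpropagation is what produces such gradients
# automatically for every parameter of a large network.
w = 0.0
learning_rate = 0.1  # illustrative value

for step in range(50):
    grad = 2 * (w - 3)            # gradient of the loss w.r.t. w
    w = w - learning_rate * grad  # gradient-descent update

print(f"w after 50 updates: {w:.4f}  (the minimum is at w = 3)")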


Computational Graphs

Concept

A computational graph represents a computation as a directed acyclic graph (DAG):

  • Nodes: variables or operations
  • Edges: the direction of data flow

import numpy as np
import matplotlib.pyplot as plt

np.random.seed(42)

# A simple computational-graph example
# f(x, y, z) = (x + y) * z

x, y, z = 2, 3, 4

# Forward pass
q = x + y  # intermediate variable
f = q * z  # final output

print(f"x={x}, y={y}, z={z}")
print(f"q = x + y = {q}")
print(f"f = q * z = {f}")

Visualization

   x ─────┐
          ├──→ [+] ──→ q ─┐
   y ─────┘               ├──→ [*] ──→ f
   z ─────────────────────┘

The Chain Rule

Mathematical Definition

For a composite function $f(g(x))$:

\[\frac{df}{dx} = \frac{df}{dg} \cdot \frac{dg}{dx}\]

In the multivariable case, where $x$ influences the output through several intermediate variables $y_i$, the contributions of all paths are summed:

\[\frac{\partial L}{\partial x} = \sum_i \frac{\partial L}{\partial y_i} \cdot \frac{\partial y_i}{\partial x}\]
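
A small sketch of this sum-over-paths rule (the function below is an illustrative example added here, not one from the article): for $L = y_1 + y_2$ with $y_1 = x^2$ and $y_2 = 3x$, the two path contributions $2x$ and $3$ are added.

# Sum-over-paths example: x feeds into two intermediate variables y1 and y2
x = 2.0

# Forward pass
y1 = x ** 2
y2 = 3 * x
L = y1 + y2

# Backward pass: dL/dy1 = dL/dy2 = 1, and the path contributions are summed
dL_dx = 1.0 * (2 * x) + 1.0 * 3
print(f"dL/dx = {dL_dx}")  # 2*2 + 3 = 7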

Worked Example

# Gradient computation for f(x, y, z) = (x + y) * z

x, y, z = 2.0, 3.0, 4.0

# Forward pass
q = x + y
f = q * z

# Backward pass
# df/df = 1
df_df = 1.0

# df/dq = z, df/dz = q
df_dq = z * df_df  # 4
df_dz = q * df_df  # 5

# df/dx = df/dq * dq/dx = df/dq * 1
# df/dy = df/dq * dq/dy = df/dq * 1
df_dx = df_dq * 1  # 4
df_dy = df_dq * 1  # 4

print(f"∂f/∂x = {df_dx}")
print(f"∂f/∂y = {df_dy}")
print(f"∂f/∂z = {df_dz}")

# Numerical verification
epsilon = 1e-5
numerical_dx = ((x + epsilon + y) * z - (x - epsilon + y) * z) / (2 * epsilon)
print(f"\n数值梯度验证: ∂f/∂x ≈ {numerical_dx}")

Backpropagation in Neural Networks

Single-Neuron Example

def sigmoid(z):
    return 1 / (1 + np.exp(-np.clip(z, -500, 500)))

class Neuron:
    """A single neuron"""
    
    def __init__(self, n_inputs):
        self.w = np.random.randn(n_inputs) * 0.5
        self.b = 0.0
    
    def forward(self, x):
        """Forward pass"""
        self.x = x
        self.z = np.dot(x, self.w) + self.b
        self.a = sigmoid(self.z)
        return self.a
    
    def backward(self, da):
        """Backward pass"""
        # Sigmoid derivative: σ'(z) = σ(z)(1 - σ(z))
        dz = da * self.a * (1 - self.a)
        
        # Gradients w.r.t. the parameters and the input
        self.dw = self.x * dz
        self.db = dz
        self.dx = self.w * dz
        
        return self.dx

# Quick test
neuron = Neuron(2)
x = np.array([1.0, 2.0])

# Forward
a = neuron.forward(x)
print(f"Forward pass: input {x} -> output {a:.4f}")

# Assume the upstream gradient of the loss is 1
da = 1.0

# Backward
dx = neuron.backward(da)
print(f"\nGradients:")
print(f"  dw: {neuron.dw}")
print(f"  db: {neuron.db:.4f}")
print(f"  dx: {neuron.dx}")

Numerical Gradient Check

def numerical_gradient(func, x, epsilon=1e-5):
    """数值计算梯度"""
    grad = np.zeros_like(x)
    
    for i in range(len(x)):
        x_plus = x.copy()
        x_minus = x.copy()
        x_plus[i] += epsilon
        x_minus[i] -= epsilon
        
        grad[i] = (func(x_plus) - func(x_minus)) / (2 * epsilon)
    
    return grad

# Check the gradients computed by backpropagation
def forward_for_grad(weights):
    z = np.dot(np.array([1.0, 2.0]), weights) + 0.0
    return sigmoid(z)

numerical_dw = numerical_gradient(forward_for_grad, neuron.w)
print(f"\nNumerical gradient check:")
print(f"  Analytic dw:  {neuron.dw}")
print(f"  Numerical dw: {numerical_dw}")
print(f"  Max absolute difference: {np.abs(neuron.dw - numerical_dw).max():.2e}")

Backpropagation in Multi-Layer Networks

Implementation from Scratch

class Layer:
    """全连接层"""
    
    def __init__(self, input_size, output_size):
        self.W = np.random.randn(input_size, output_size) * np.sqrt(2.0 / input_size)
        self.b = np.zeros((1, output_size))
        
        # Gradients (filled in during the backward pass)
        self.dW = None
        self.db = None
    
    def forward(self, X):
        self.X = X
        self.Z = np.dot(X, self.W) + self.b
        return self.Z
    
    def backward(self, dZ):
        m = self.X.shape[0]
        
        self.dW = np.dot(self.X.T, dZ) / m
        self.db = np.mean(dZ, axis=0, keepdims=True)
        dX = np.dot(dZ, self.W.T)
        
        return dX


class ReLUActivation:
    """ReLU激活函数"""
    
    def forward(self, Z):
        self.Z = Z
        return np.maximum(0, Z)
    
    def backward(self, dA):
        return dA * (self.Z > 0)


class SigmoidActivation:
    """Sigmoid激活函数"""
    
    def forward(self, Z):
        self.A = 1 / (1 + np.exp(-np.clip(Z, -500, 500)))
        return self.A
    
    def backward(self, dA):
        return dA * self.A * (1 - self.A)


class NeuralNetwork:
    """多层神经网络"""
    
    def __init__(self, layer_sizes):
        self.layers = []
        self.activations = []
        
        for i in range(len(layer_sizes) - 1):
            self.layers.append(Layer(layer_sizes[i], layer_sizes[i+1]))
            
            # Sigmoid for the output layer, ReLU for the hidden layers
            if i == len(layer_sizes) - 2:
                self.activations.append(SigmoidActivation())
            else:
                self.activations.append(ReLUActivation())
    
    def forward(self, X):
        A = X
        for layer, activation in zip(self.layers, self.activations):
            Z = layer.forward(A)
            A = activation.forward(Z)
        return A
    
    def backward(self, y):
        # Output-layer gradient: derivative of the per-sample binary
        # cross-entropy loss w.r.t. A (Layer.backward averages over the batch,
        # so the loss gradient must not divide by the batch size again)
        A = self.activations[-1].A
        dA = -(y / (A + 1e-8) - (1 - y) / (1 - A + 1e-8))
        
        # Backpropagate through activations and layers in reverse order
        for i in range(len(self.layers) - 1, -1, -1):
            dZ = self.activations[i].backward(dA)
            dA = self.layers[i].backward(dZ)
    
    def compute_loss(self, y):
        A = self.activations[-1].A
        loss = -np.mean(y * np.log(A + 1e-8) + (1 - y) * np.log(1 - A + 1e-8))
        return loss
    
    def update(self, learning_rate):
        for layer in self.layers:
            layer.W -= learning_rate * layer.dW
            layer.b -= learning_rate * layer.db
    
    def train(self, X, y, epochs, learning_rate=0.1):
        losses = []
        
        for epoch in range(epochs):
            # Forward pass
            self.forward(X)
            
            # Compute the loss
            loss = self.compute_loss(y)
            losses.append(loss)
            
            # Backward pass
            self.backward(y)
            
            # Update the parameters
            self.update(learning_rate)
            
            if epoch % 100 == 0:
                print(f"Epoch {epoch}, Loss: {loss:.6f}")
        
        return losses

Training Example

from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Generate data
X, y = make_moons(n_samples=500, noise=0.2, random_state=42)
y = y.reshape(-1, 1)

# Standardize the features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the network: 2 -> 10 -> 5 -> 1 (named net so it does not shadow torch.nn later)
net = NeuralNetwork([2, 10, 5, 1])

# Train
losses = net.train(X_train, y_train, epochs=1000, learning_rate=0.5)

# Evaluate
train_pred = (net.forward(X_train) > 0.5).astype(int)
test_pred = (net.forward(X_test) > 0.5).astype(int)

train_acc = np.mean(train_pred == y_train)
test_acc = np.mean(test_pred == y_test)

print(f"\nTrain accuracy: {train_acc:.4f}")
print(f"Test accuracy:  {test_acc:.4f}")

# Visualization
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Loss curve
ax = axes[0]
ax.plot(losses)
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Training loss')
ax.grid(True, alpha=0.3)

# Decision boundary
ax = axes[1]
xx, yy = np.meshgrid(np.linspace(X[:, 0].min()-1, X[:, 0].max()+1, 100),
                      np.linspace(X[:, 1].min()-1, X[:, 1].max()+1, 100))
Z = net.forward(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

ax.contourf(xx, yy, Z, levels=20, cmap='coolwarm', alpha=0.6)
ax.scatter(X[:, 0], X[:, 1], c=y.flatten(), cmap='coolwarm', edgecolors='k')
ax.set_xlabel('Feature 1')
ax.set_ylabel('Feature 2')
ax.set_title('Decision boundary')

plt.tight_layout()
plt.show()

Vanishing and Exploding Gradients

Vanishing Gradients

# Demonstrate vanishing gradients in a deep network (with Sigmoid)
np.random.seed(42)

n_layers = 10
layer_size = 50

def forward_with_gradients(n_layers, activation='sigmoid'):
    """追踪各层的梯度范数"""
    
    gradients = []
    x = np.random.randn(1, layer_size)
    
    # Forward pass, saving each layer's activations
    activations_list = [x]
    for i in range(n_layers):
        W = np.random.randn(layer_size, layer_size) * 0.5
        z = np.dot(activations_list[-1], W)
        
        if activation == 'sigmoid':
            a = sigmoid(z)
        else:
            a = np.maximum(0, z)
        
        activations_list.append(a)
    
    # Simulated backward pass (simplified: only the activation derivatives are
    # applied; the weight matrices are ignored)
    grad = np.ones_like(activations_list[-1])
    
    for i in range(n_layers - 1, -1, -1):
        if activation == 'sigmoid':
            grad = grad * activations_list[i+1] * (1 - activations_list[i+1])
        else:
            grad = grad * (activations_list[i+1] > 0)
        
        gradients.append(np.linalg.norm(grad))
    
    return gradients[::-1]

sigmoid_grads = forward_with_gradients(10, 'sigmoid')
relu_grads = forward_with_gradients(10, 'relu')

# Plot
plt.figure(figsize=(10, 6))
plt.plot(range(1, 11), sigmoid_grads, 'b-o', label='Sigmoid')
plt.plot(range(1, 11), relu_grads, 'r-s', label='ReLU')
plt.xlabel('Layer')
plt.ylabel('Gradient Norm')
plt.title('Vanishing gradients: Sigmoid vs ReLU')
plt.legend()
plt.yscale('log')
plt.grid(True, alpha=0.3)
plt.show()
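
The opposite failure mode, exploding gradients, can be sketched in the same spirit. Note that this sketch is an addition: unlike the simplified simulation above, it pushes the gradient through the weight matrices themselves, and the initialization scale of 1.0 is chosen deliberately large for illustration.

# Exploding-gradient sketch: backpropagate an all-ones gradient through
# 10 purely linear layers whose weights are initialized too large.
np.random.seed(0)

size = 50
grad = np.ones((1, size))

for i in range(10):
    W = np.random.randn(size, size) * 1.0  # deliberately large initialization
    grad = np.dot(grad, W.T)               # gradient flowing backward through W
    print(f"After {i + 1} layers: gradient norm = {np.linalg.norm(grad):.2e}")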

Solutions

  • Vanishing gradients: ReLU, residual connections, careful weight initialization
  • Exploding gradients: gradient clipping, BatchNorm, careful weight initialization

Gradient clipping and weight initialization are sketched in the examples below.
# Gradient clipping example
def clip_gradients(gradients, max_norm):
    """Clip gradients by their global norm"""
    total_norm = np.sqrt(sum(np.sum(g**2) for g in gradients))
    clip_coef = max_norm / (total_norm + 1e-6)
    
    if clip_coef < 1:
        gradients = [g * clip_coef for g in gradients]
    
    return gradients

# Test
grads = [np.random.randn(100, 100) * 10 for _ in range(5)]
original_norm = np.sqrt(sum(np.sum(g**2) for g in grads))
clipped = clip_gradients(grads, max_norm=1.0)
clipped_norm = np.sqrt(sum(np.sum(g**2) for g in clipped))

print(f"原始梯度范数: {original_norm:.4f}")
print(f"裁剪后范数: {clipped_norm:.4f}")

Automatic Differentiation

PyTorch Example

try:
    import torch
    import torch.nn as nn
    
    # Use autograd to compute gradients automatically
    x = torch.tensor([2.0], requires_grad=True)
    y = torch.tensor([3.0], requires_grad=True)
    z = torch.tensor([4.0], requires_grad=True)
    
    # Forward computation
    q = x + y
    f = q * z
    
    # Backward pass
    f.backward()
    
    print("PyTorch自动微分:")
    print(f"  ∂f/∂x = {x.grad.item()}")
    print(f"  ∂f/∂y = {y.grad.item()}")
    print(f"  ∂f/∂z = {z.grad.item()}")
    
    # Neural-network example
    model = nn.Sequential(
        nn.Linear(2, 10),
        nn.ReLU(),
        nn.Linear(10, 1),
        nn.Sigmoid()
    )
    
    # Inspect the gradients
    X_torch = torch.FloatTensor(X_train[:10])
    y_torch = torch.FloatTensor(y_train[:10])
    
    output = model(X_torch)
    loss = nn.BCELoss()(output, y_torch)
    loss.backward()
    
    print(f"\n神经网络梯度示例:")
    for name, param in model.named_parameters():
        if param.grad is not None:
            print(f"  {name}: grad norm = {param.grad.norm().item():.4f}")
    
except ImportError:
    print("PyTorch未安装")

FAQ

Q1: Why is it called "backward" propagation?

Because the gradients are computed from the output layer back toward the input layer, the opposite direction of the forward pass.

Q2: What is the time complexity of backpropagation?

The same order as the forward pass: roughly $O(n)$, where $n$ is the number of parameters (more precisely, the number of edges in the computation graph), since each edge is visited a constant number of times.

Q3: How can I verify that a backpropagation implementation is correct?

Use a numerical gradient check:

\[\frac{\partial L}{\partial \theta} \approx \frac{L(\theta + \epsilon) - L(\theta - \epsilon)}{2\epsilon}\]
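
A minimal helper based on this formula (added here as a sketch; it reuses neuron.dw and numerical_dw from the single-neuron example above, and the 1e-7 threshold is a common rule of thumb rather than a value from the article):

def gradient_check(analytic_grad, numerical_grad, tol=1e-7):
    """Relative error between analytic and numerical gradients."""
    diff = np.abs(analytic_grad - numerical_grad).max()
    scale = np.abs(analytic_grad).max() + np.abs(numerical_grad).max() + 1e-12
    rel_error = diff / scale
    return rel_error, rel_error < tol

rel_error, ok = gradient_check(neuron.dw, numerical_dw)
print(f"Relative error: {rel_error:.2e}, check passed: {ok}")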

Q4: Why do the intermediate values from the forward pass need to be stored?

Because the local gradients depend on values computed during the forward pass; the small sketch below illustrates this for a multiply node.
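
A toy illustration of this point (the Multiply class below is a made-up example, not part of the article's network): the backward pass of a multiply node needs the inputs that were seen during the forward pass.

class Multiply:
    """Toy multiply node that caches its forward inputs for the backward pass."""
    def forward(self, a, b):
        self.a, self.b = a, b   # cache the inputs
        return a * b
    def backward(self, dout):
        # d(a*b)/da = b and d(a*b)/db = a -- both require the cached values
        return dout * self.b, dout * self.a

node = Multiply()
print(node.forward(5.0, 4.0))  # 20.0
print(node.backward(1.0))      # (4.0, 5.0)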


Summary

  • Computational graph: represents a computation as a directed graph
  • Chain rule: the foundation for differentiating composite functions
  • Backpropagation: efficiently computes the gradients of all parameters
  • Vanishing/exploding gradients: the main training difficulty of deep networks
  • Automatic differentiation: modern frameworks implement backpropagation automatically

