已经是最新一篇文章了!
已经是最后一篇文章了!
链式法则、梯度计算与计算图
前言
反向传播(Backpropagation)是训练神经网络的核心算法。它利用链式法则高效计算损失函数相对于每个参数的梯度,从而实现梯度下降优化。
计算图
概念
计算图将计算过程表示为有向无环图(DAG):
- 节点:变量或运算
- 边:数据流向
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(42)
# Simple computation-graph example: f(x, y, z) = (x + y) * z
x = 2
y = 3
z = 4
# Forward pass: evaluate the graph node by node
q = x + y  # intermediate node
f = q * z  # output node
print(f"x={x}, y={y}, z={z}")
print(f"q = x + y = {q}")
print(f"f = q * z = {f}")
可视化
x ─────┐
├──→ [+] ──→ q ─┐
y ─────┘ ├──→ [*] ──→ f
z ─────────────────────┘
链式法则
数学定义
对于复合函数 $f(g(x))$:
\[\frac{df}{dx} = \frac{df}{dg} \cdot \frac{dg}{dx}\]
多变量情况:
\[\frac{\partial L}{\partial x} = \sum_i \frac{\partial L}{\partial y_i} \cdot \frac{\partial y_i}{\partial x}\]
计算示例
# Gradients of f(x, y, z) = (x + y) * z, derived by hand via the chain rule
x, y, z = 2.0, 3.0, 4.0

# Forward pass
q = x + y
f = q * z

# Backward pass, seeded with df/df = 1
df_df = 1.0
# Product node: df/dq = z, df/dz = q
df_dq = df_df * z  # 4.0
df_dz = df_df * q  # 5.0
# Sum node passes the gradient through unchanged:
# df/dx = df/dq * dq/dx and df/dy = df/dq * dq/dy, with dq/dx = dq/dy = 1
df_dx = df_dq
df_dy = df_dq
print(f"∂f/∂x = {df_dx}")
print(f"∂f/∂y = {df_dy}")
print(f"∂f/∂z = {df_dz}")

# Central-difference check of ∂f/∂x
epsilon = 1e-5
numerical_dx = ((x + epsilon + y) * z - (x - epsilon + y) * z) / (2 * epsilon)
print(f"\n数值梯度验证: ∂f/∂x ≈ {numerical_dx}")
神经网络中的反向传播
单神经元示例
def sigmoid(z):
    """Numerically stable logistic function 1 / (1 + e^{-z})."""
    clipped = np.clip(z, -500, 500)  # keep exp() from overflowing
    return 1.0 / (1.0 + np.exp(-clipped))
class Neuron:
    """A single sigmoid neuron that caches forward values for backprop."""

    def __init__(self, n_inputs):
        # Small random weights, zero bias
        self.w = np.random.randn(n_inputs) * 0.5
        self.b = 0.0

    def forward(self, x):
        """Forward pass: a = sigmoid(w·x + b). Caches x, z and a."""
        self.x = x
        self.z = np.dot(x, self.w) + self.b
        self.a = sigmoid(self.z)
        return self.a

    def backward(self, da):
        """Backward pass: given dL/da, compute dL/dw, dL/db and return dL/dx."""
        # Local sigmoid derivative: σ'(z) = σ(z)(1 - σ(z)) = a(1 - a)
        dz = da * self.a * (1.0 - self.a)
        self.dw = dz * self.x
        self.db = dz
        self.dx = dz * self.w
        return self.dx
# Smoke test for the single neuron
neuron = Neuron(2)
x = np.array([1.0, 2.0])

# Forward pass
a = neuron.forward(x)
print(f"前向传播: 输入{x} -> 输出{a:.4f}")

# Pretend the loss gradient w.r.t. the activation is exactly 1
da = 1.0
dx = neuron.backward(da)
print(f"\n梯度:")
print(f" dw: {neuron.dw}")
print(f" db: {neuron.db:.4f}")
print(f" dx: {neuron.dx}")
数值梯度验证
def numerical_gradient(func, x, epsilon=1e-5):
    """Central-difference gradient of *func* at *x*, one coordinate at a time."""
    grad = np.zeros_like(x)
    for i in range(len(x)):
        bumped_up = x.copy()
        bumped_down = x.copy()
        bumped_up[i] += epsilon
        bumped_down[i] -= epsilon
        # (f(x + e_i) - f(x - e_i)) / (2 * epsilon)
        grad[i] = (func(bumped_up) - func(bumped_down)) / (2 * epsilon)
    return grad
# Verify the analytic backprop gradient against a numerical one
def forward_for_grad(weights):
    """Neuron forward pass as a function of the weights only
    (input fixed to [1, 2] and bias fixed to 0, matching the neuron above)."""
    z = np.dot(np.array([1.0, 2.0]), weights) + 0.0
    return sigmoid(z)

numerical_dw = numerical_gradient(forward_for_grad, neuron.w)
print(f"\n数值梯度验证:")
print(f" 解析梯度 dw: {neuron.dw}")
print(f" 数值梯度 dw: {numerical_dw}")
# BUG FIX: the original printed the max *absolute* difference while labelling
# it a relative error; normalize by the numerical gradient's magnitude instead
# (tiny epsilon guards against division by zero).
rel_err = np.max(np.abs(neuron.dw - numerical_dw) / (np.abs(numerical_dw) + 1e-12))
print(f" 相对误差: {rel_err:.2e}")
多层网络的反向传播
从零实现
class Layer:
    """Fully-connected (linear) layer Z = X W + b, with He initialization."""

    def __init__(self, input_size, output_size):
        # He init keeps activation variance stable for ReLU networks
        self.W = np.random.randn(input_size, output_size) * np.sqrt(2.0 / input_size)
        self.b = np.zeros((1, output_size))
        # Parameter gradients, populated by backward()
        self.dW = None
        self.db = None

    def forward(self, X):
        """Affine forward pass; caches X for the backward pass."""
        self.X = X
        self.Z = np.dot(X, self.W) + self.b
        return self.Z

    def backward(self, dZ):
        """Given dL/dZ, compute batch-averaged parameter gradients and
        return dL/dX for the previous layer."""
        batch = self.X.shape[0]
        self.dW = self.X.T @ dZ / batch
        self.db = dZ.sum(axis=0, keepdims=True) / batch
        return dZ @ self.W.T
class ReLUActivation:
    """ReLU nonlinearity: max(0, z)."""

    def forward(self, Z):
        # Cache the pre-activation for the backward mask
        self.Z = Z
        return np.maximum(0, Z)

    def backward(self, dA):
        # Gradient flows only where the input was strictly positive
        return np.where(self.Z > 0, dA, 0.0)
class SigmoidActivation:
    """Logistic sigmoid nonlinearity with overflow-safe exp."""

    def forward(self, Z):
        safe = np.clip(Z, -500, 500)  # keep exp() from overflowing
        self.A = 1.0 / (1.0 + np.exp(-safe))
        return self.A

    def backward(self, dA):
        # σ'(z) = σ(z)(1 - σ(z)), expressed via the cached activation
        return dA * self.A * (1.0 - self.A)
class NeuralNetwork:
    """Multi-layer perceptron for binary classification.

    Hidden layers use ReLU, the output layer uses sigmoid, and the loss is
    the mean binary cross-entropy over the batch.
    """

    def __init__(self, layer_sizes):
        # layer_sizes, e.g. [2, 10, 5, 1]: input dim, hidden dims, output dim
        self.layers = []
        self.activations = []
        last = len(layer_sizes) - 2
        for i in range(len(layer_sizes) - 1):
            self.layers.append(Layer(layer_sizes[i], layer_sizes[i + 1]))
            # Sigmoid on the output layer, ReLU everywhere else
            self.activations.append(SigmoidActivation() if i == last else ReLUActivation())

    def forward(self, X):
        """Run X through every layer; each module caches what backward needs."""
        A = X
        for layer, activation in zip(self.layers, self.activations):
            A = activation.forward(layer.forward(A))
        return A

    def backward(self, y):
        """Backpropagate the mean-BCE gradient through all layers.

        BUG FIX: the original divided the loss gradient by the batch size m
        here AND Layer.backward averaged over the batch again, so every
        parameter gradient was scaled by 1/m^2. The 1/m factor now lives
        only in Layer.backward.
        """
        A = self.activations[-1].A  # cached output-layer activations
        # d(BCE)/dA per sample; 1e-8 guards division by zero
        dA = -(y / (A + 1e-8) - (1 - y) / (1 - A + 1e-8))
        for i in range(len(self.layers) - 1, -1, -1):
            dZ = self.activations[i].backward(dA)
            dA = self.layers[i].backward(dZ)

    def compute_loss(self, y):
        """Mean binary cross-entropy against the cached forward output."""
        A = self.activations[-1].A
        # 1e-8 guards log(0)
        return -np.mean(y * np.log(A + 1e-8) + (1 - y) * np.log(1 - A + 1e-8))

    def update(self, learning_rate):
        """One vanilla gradient-descent step on every layer's parameters."""
        for layer in self.layers:
            layer.W -= learning_rate * layer.dW
            layer.b -= learning_rate * layer.db

    def train(self, X, y, epochs, learning_rate=0.1):
        """Full-batch training loop; returns the per-epoch loss history."""
        losses = []
        for epoch in range(epochs):
            self.forward(X)
            loss = self.compute_loss(y)
            losses.append(loss)
            self.backward(y)
            self.update(learning_rate)
            if epoch % 100 == 0:
                print(f"Epoch {epoch}, Loss: {loss:.6f}")
        return losses
训练示例
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Generate a two-class "two moons" toy dataset (500 points with label noise)
X, y = make_moons(n_samples=500, noise=0.2, random_state=42)
y = y.reshape(-1, 1)
# Standardize features to zero mean / unit variance
scaler = StandardScaler()
X = scaler.fit_transform(X)
# Hold out 20% of the data for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Build the network: 2 -> 10 -> 5 -> 1
nn = NeuralNetwork([2, 10, 5, 1])
# Full-batch gradient descent
losses = nn.train(X_train, y_train, epochs=1000, learning_rate=0.5)
# Evaluate by thresholding the sigmoid output at 0.5
train_pred = (nn.forward(X_train) > 0.5).astype(int)
test_pred = (nn.forward(X_test) > 0.5).astype(int)
train_acc = np.mean(train_pred == y_train)
test_acc = np.mean(test_pred == y_test)
print(f"\n训练准确率: {train_acc:.4f}")
print(f"测试准确率: {test_acc:.4f}")
# Visualization: loss curve (left) and decision boundary (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
# Left panel: training loss per epoch
ax = axes[0]
ax.plot(losses)
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('训练损失曲线')
ax.grid(True, alpha=0.3)
# Right panel: network output over a dense grid covering the data
ax = axes[1]
xx, yy = np.meshgrid(np.linspace(X[:, 0].min()-1, X[:, 0].max()+1, 100),
                     np.linspace(X[:, 1].min()-1, X[:, 1].max()+1, 100))
# Predict for every grid point, then reshape back to the grid
Z = nn.forward(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
ax.contourf(xx, yy, Z, levels=20, cmap='coolwarm', alpha=0.6)
ax.scatter(X[:, 0], X[:, 1], c=y.flatten(), cmap='coolwarm', edgecolors='k')
ax.set_xlabel('Feature 1')
ax.set_ylabel('Feature 2')
ax.set_title('决策边界')
plt.tight_layout()
plt.show()
梯度消失与梯度爆炸
梯度消失
# Demonstrate vanishing gradients in a deep random network (sigmoid vs ReLU)
np.random.seed(42)
n_layers = 10
layer_size = 50

def forward_with_gradients(n_layers, activation='sigmoid'):
    """Track the gradient norm reaching each layer of a random deep net."""
    gradients = []
    x = np.random.randn(1, layer_size)

    # Forward pass, keeping every layer's output
    activations_list = [x]
    for _ in range(n_layers):
        W = np.random.randn(layer_size, layer_size) * 0.5
        z = activations_list[-1] @ W
        out = sigmoid(z) if activation == 'sigmoid' else np.maximum(0, z)
        activations_list.append(out)

    # Simulated backward pass: multiply by each activation's local derivative
    grad = np.ones_like(activations_list[-1])
    for idx in range(n_layers - 1, -1, -1):
        a = activations_list[idx + 1]
        if activation == 'sigmoid':
            grad = grad * a * (1 - a)
        else:
            grad = grad * (a > 0)
        gradients.append(np.linalg.norm(grad))

    # Norms were collected from the last layer backwards; reverse to layer order
    return gradients[::-1]
# Compare gradient norms layer by layer for the two activation choices
sigmoid_grads = forward_with_gradients(10, 'sigmoid')
relu_grads = forward_with_gradients(10, 'relu')
# Plot on a log scale so an exponential decay appears as a straight line
plt.figure(figsize=(10, 6))
plt.plot(range(1, 11), sigmoid_grads, 'b-o', label='Sigmoid')
plt.plot(range(1, 11), relu_grads, 'r-s', label='ReLU')
plt.xlabel('Layer')
plt.ylabel('Gradient Norm')
plt.title('梯度消失问题: Sigmoid vs ReLU')
plt.legend()
plt.yscale('log')
plt.grid(True, alpha=0.3)
plt.show()
解决方案
| 问题 | 解决方案 |
|---|---|
| 梯度消失 | ReLU、残差连接、合适的初始化 |
| 梯度爆炸 | 梯度裁剪、BatchNorm、合适的初始化 |
# 梯度裁剪示例
def clip_gradients(gradients, max_norm):
    """Scale a list of gradient arrays so their global L2 norm is at most max_norm."""
    total_norm = np.sqrt(sum(np.sum(g ** 2) for g in gradients))
    scale = max_norm / (total_norm + 1e-6)  # epsilon avoids division by zero
    if scale < 1:
        # Norm exceeds the budget: shrink every gradient by the same factor
        return [g * scale for g in gradients]
    return gradients
# Demonstrate clipping on deliberately oversized random gradients
def _global_norm(gs):
    """Global L2 norm over a list of arrays."""
    return np.sqrt(sum(np.sum(g ** 2) for g in gs))

grads = [np.random.randn(100, 100) * 10 for _ in range(5)]
original_norm = _global_norm(grads)
clipped = clip_gradients(grads, max_norm=1.0)
clipped_norm = _global_norm(clipped)
print(f"原始梯度范数: {original_norm:.4f}")
print(f"裁剪后范数: {clipped_norm:.4f}")
自动微分
PyTorch示例
try:
    import torch
    import torch.nn as nn
    # NOTE(review): this import shadows the module-level `nn` (the
    # NeuralNetwork instance created in the training section above).
    # Autograd recomputes the same gradients we derived by hand earlier
    x = torch.tensor([2.0], requires_grad=True)
    y = torch.tensor([3.0], requires_grad=True)
    z = torch.tensor([4.0], requires_grad=True)
    # Forward computation builds the autograd graph
    q = x + y
    f = q * z
    # Backward pass populates .grad on every leaf tensor
    f.backward()
    print("PyTorch自动微分:")
    print(f" ∂f/∂x = {x.grad.item()}")
    print(f" ∂f/∂y = {y.grad.item()}")
    print(f" ∂f/∂z = {z.grad.item()}")
    # A small network mirroring the NumPy implementation above
    model = nn.Sequential(
        nn.Linear(2, 10),
        nn.ReLU(),
        nn.Linear(10, 1),
        nn.Sigmoid()
    )
    # Inspect per-parameter gradient norms after one BCE backward pass
    # NOTE(review): reuses X_train / y_train from the sklearn section above
    X_torch = torch.FloatTensor(X_train[:10])
    y_torch = torch.FloatTensor(y_train[:10])
    output = model(X_torch)
    loss = nn.BCELoss()(output, y_torch)
    loss.backward()
    print(f"\n神经网络梯度示例:")
    for name, param in model.named_parameters():
        if param.grad is not None:
            print(f" {name}: grad norm = {param.grad.norm().item():.4f}")
except ImportError:
    print("PyTorch未安装")
常见问题
Q1: 为什么叫“反向”传播?
因为梯度从输出层向输入层反向计算,与前向传播方向相反。
Q2: 反向传播的时间复杂度?
与前向传播相同,$O(n)$,其中n是参数数量。
Q3: 如何验证反向传播实现正确?
使用数值梯度检验:
\[\frac{\partial L}{\partial \theta} \approx \frac{L(\theta + \epsilon) - L(\theta - \epsilon)}{2\epsilon}\]
Q4: 为什么需要保存前向传播的中间值?
反向传播需要使用这些值来计算梯度。
总结
| 概念 | 描述 |
|---|---|
| 计算图 | 将计算表示为有向图 |
| 链式法则 | 复合函数求导的基础 |
| 反向传播 | 高效计算所有参数梯度 |
| 梯度消失/爆炸 | 深层网络的训练难题 |
| 自动微分 | 现代框架自动实现反向传播 |
参考资料
- Rumelhart, D. et al. (1986). “Learning representations by back-propagating errors”
- Goodfellow, I. et al. “Deep Learning” - Chapter 6.5
- CS231n: Backpropagation, Intuitions
- PyTorch Autograd Tutorial
版权声明: 如无特别声明,本文版权归 sshipanoo 所有,转载请注明本文链接。
(采用 CC BY-NC-SA 4.0 许可协议进行授权)
本文标题:《 机器学习基础系列——反向传播 》
本文链接:http://localhost:3015/ai/%E5%8F%8D%E5%90%91%E4%BC%A0%E6%92%AD.html
本文最后一次更新为若干天前,文章中的某些内容可能已过时!