BatchNorm、LayerNorm与归一化技术
前言
批归一化(Batch Normalization)是深度学习中最重要的技术之一,它通过规范化层输入来加速训练并提高模型稳定性。
内部协变量偏移
问题描述
训练过程中,每层输入的分布会随着前层参数更新而变化。
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(42)
# 模拟分布偏移
def simulate_distribution_shift():
    """Visualize internal covariate shift as drifting activation histograms.

    Draws three histograms (epochs 1, 50, 100) whose mean and standard
    deviation grow linearly with the epoch index, mimicking how a layer's
    input distribution drifts as upstream parameters are updated.
    """
    fig, axes = plt.subplots(1, 3, figsize=(15, 4))
    # zip() suffices here; the original enumerate() index was never used.
    for ax, epoch in zip(axes, [1, 50, 100]):
        # Simulated activation distribution at this epoch: both the mean
        # and the spread drift with training progress.
        mean_shift = (epoch - 1) * 0.05
        std_change = 1 + (epoch - 1) * 0.02
        data = np.random.randn(1000) * std_change + mean_shift
        ax.hist(data, bins=50, density=True, alpha=0.7)
        ax.axvline(x=0, color='r', linestyle='--')  # reference line at zero
        ax.set_title(f'Epoch {epoch}\n均值={data.mean():.2f}, 标准差={data.std():.2f}')
        ax.set_xlabel('激活值')
        ax.set_ylabel('密度')
    plt.tight_layout()
    plt.suptitle('内部协变量偏移', y=1.02)
    plt.show()

simulate_distribution_shift()
Batch Normalization
算法
对于每个小批量:
\(\hat{x}_i = \frac{x_i - \mu_B}{\sqrt{\sigma_B^2 + \epsilon}}\) \(y_i = \gamma \hat{x}_i + \beta\)
其中 $\gamma$ 和 $\beta$ 是可学习参数。
class BatchNorm1D:
    """Batch Normalization over the batch dimension (Ioffe & Szegedy, 2015).

    During training, each feature is normalized with the current mini-batch
    statistics and exponential running statistics are accumulated; during
    inference the running statistics are used.  A learnable affine transform
    follows: y = gamma * x_hat + beta.
    """

    def __init__(self, num_features, eps=1e-5, momentum=0.1):
        """
        num_features: number of input features (columns of x)
        eps: small constant added to the variance for numerical stability
        momentum: update rate of the running statistics (PyTorch convention:
            running = (1 - momentum) * running + momentum * batch)
        """
        self.eps = eps
        self.momentum = momentum
        # Learnable affine parameters
        self.gamma = np.ones(num_features)
        self.beta = np.zeros(num_features)
        # Running statistics, used only at inference time
        self.running_mean = np.zeros(num_features)
        self.running_var = np.ones(num_features)
        # Parameter gradients, populated by backward()
        self.dgamma = None
        self.dbeta = None

    def forward(self, x, training=True):
        """Normalize x of shape (batch_size, num_features).

        Training mode uses (and accumulates into the running statistics)
        the batch statistics; inference mode uses the running statistics.
        Intermediate values needed by backward() are cached on self.
        """
        if training:
            # Per-feature statistics of this mini-batch
            self.batch_mean = np.mean(x, axis=0)
            self.batch_var = np.var(x, axis=0)
            # Exponential moving average of the batch statistics
            self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * self.batch_mean
            self.running_var = (1 - self.momentum) * self.running_var + self.momentum * self.batch_var
            mean = self.batch_mean
            var = self.batch_var
        else:
            mean = self.running_mean
            var = self.running_var
        # Normalize, then scale and shift.
        # (The original also cached `self.x = x`, which was never read.)
        self.x_centered = x - mean
        self.std = np.sqrt(var + self.eps)
        self.x_norm = self.x_centered / self.std
        return self.gamma * self.x_norm + self.beta

    def backward(self, dout):
        """Backprop through the layer.

        Requires a prior forward(training=True) call (uses the cached batch
        statistics).  Returns dL/dx and stores dL/dgamma and dL/dbeta in
        self.dgamma / self.dbeta.
        """
        N = dout.shape[0]
        # Gradients of the affine parameters
        self.dbeta = np.sum(dout, axis=0)
        self.dgamma = np.sum(dout * self.x_norm, axis=0)
        # Gradient w.r.t. the input: standard BN backward derivation through
        # the normalization, the batch variance, and the batch mean.
        dx_norm = dout * self.gamma
        dvar = np.sum(dx_norm * self.x_centered * -0.5 * (self.batch_var + self.eps) ** (-1.5), axis=0)
        dmean = np.sum(dx_norm * -1 / self.std, axis=0) + dvar * np.mean(-2 * self.x_centered, axis=0)
        dx = dx_norm / self.std + dvar * 2 * self.x_centered / N + dmean / N
        return dx
# Smoke test: push a deliberately off-center batch through BatchNorm and
# confirm the output statistics are re-centered.
test_input = np.random.randn(32, 64) * 5 + 10  # shifted, scaled input
bn = BatchNorm1D(64)
normalized = bn.forward(test_input, training=True)
print(f"输入统计: 均值={test_input.mean():.4f}, 标准差={test_input.std():.4f}")
print(f"输出统计: 均值={normalized.mean():.4f}, 标准差={normalized.std():.4f}")
训练 vs 推理
# Demonstrate how training mode and inference mode differ.
np.random.seed(42)
bn = BatchNorm1D(10)

# Feed several batches whose distributions drift on purpose; training mode
# normalizes with per-batch statistics while accumulating running stats.
print("训练阶段:")
for batch_idx in range(5):
    x_train = np.random.randn(32, 10) * (batch_idx + 1) + batch_idx * 2
    y_train = bn.forward(x_train, training=True)
    print(f" Batch {batch_idx+1}: 输入均值={x_train.mean():.2f}, 输出均值={y_train.mean():.4f}")

print(f"\n运行时统计:")
print(f" running_mean: {bn.running_mean[:3]}...")
print(f" running_var: {bn.running_var[:3]}...")

# Inference mode relies on the accumulated running statistics instead,
# so it works even with a batch of one.
print("\n推理阶段:")
x_test = np.random.randn(1, 10) * 3 + 5
y_test = bn.forward(x_test, training=False)
print(f" 测试输入均值={x_test.mean():.2f}, 输出均值={y_test.mean():.4f}")
Layer Normalization
与BatchNorm的区别
| 特性 | BatchNorm | LayerNorm |
|---|---|---|
| 归一化维度 | 批次维度 | 特征维度 |
| 适用场景 | CNN、batch size大 | RNN、Transformer |
| 对batch size依赖 | 是 | 否 |
class LayerNorm:
    """Layer Normalization: each sample is normalized over its own features.

    Unlike BatchNorm, the statistics never involve other samples, so the
    layer is independent of the batch size.
    """

    def __init__(self, normalized_shape, eps=1e-5):
        self.eps = eps
        # Learnable per-feature scale and shift.
        self.gamma = np.ones(normalized_shape)
        self.beta = np.zeros(normalized_shape)

    def forward(self, x):
        """Normalize x of shape (batch_size, num_features), row by row."""
        # Per-row statistics over the last (feature) axis.
        row_mean = np.mean(x, axis=-1, keepdims=True)
        row_var = np.var(x, axis=-1, keepdims=True)
        # Cache for potential backward use / inspection.
        self.mean = row_mean
        self.var = row_var
        self.x_norm = (x - row_mean) / np.sqrt(row_var + self.eps)
        out = self.gamma * self.x_norm + self.beta
        return out
# Side-by-side comparison: BatchNorm zeroes the per-column (feature) means,
# LayerNorm zeroes the per-row (sample) means.
x = np.random.randn(32, 64) * 5 + 10
bn = BatchNorm1D(64)
ln = LayerNorm(64)
y_bn = bn.forward(x, training=True)
y_ln = ln.forward(x)

col_means = y_bn.mean(axis=0)
print("BatchNorm:")
print(f" 输出均值={y_bn.mean():.4f}, 标准差={y_bn.std():.4f}")
print(f" 每列均值范围: [{col_means.min():.4f}, {col_means.max():.4f}]")

row_means = y_ln.mean(axis=1)
print("\nLayerNorm:")
print(f" 输出均值={y_ln.mean():.4f}, 标准差={y_ln.std():.4f}")
print(f" 每行均值范围: [{row_means.min():.4f}, {row_means.max():.4f}]")
可视化区别
# Visualize which axis each method normalizes along, using the raw data and
# the BatchNorm / LayerNorm outputs computed above.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
panels = [
    (x, '原始数据'),
    (y_bn, 'BatchNorm\n(沿样本维度归一化)'),
    (y_ln, 'LayerNorm\n(沿特征维度归一化)'),
]
for ax, (data, title) in zip(axes, panels):
    # Show only a 10x10 corner so individual cells stay readable.
    ax.imshow(data[:10, :10], cmap='viridis', aspect='auto')
    ax.set_title(title)
    ax.set_xlabel('特征')
    ax.set_ylabel('样本')
    plt.colorbar(ax.images[0], ax=ax)
plt.tight_layout()
plt.show()
其他归一化方法
Instance Normalization
用于风格迁移等任务:
class InstanceNorm:
    """Instance Normalization (Ulyanov et al., 2016), used e.g. in style transfer.

    Each sample — and for 4D input, each (sample, channel) plane — is
    normalized independently of the rest of the batch.
    """

    def __init__(self, num_features, eps=1e-5):
        """
        num_features: number of channels (4D input) / features (2D input)
        eps: small constant added to the variance for numerical stability
        """
        self.eps = eps
        self.gamma = np.ones(num_features)
        self.beta = np.zeros(num_features)

    def forward(self, x):
        """Normalize x.

        x: (batch_size, channels, height, width) or (batch_size, features).
        For 2D input each sample is normalized over its features (simplified
        variant); for 4D input each channel of each sample is normalized
        over its spatial dimensions.
        """
        if x.ndim == 2:
            # Simplified variant: per-sample normalization over the features.
            mean = np.mean(x, axis=1, keepdims=True)
            var = np.var(x, axis=1, keepdims=True)
            gamma, beta = self.gamma, self.beta  # (F,) broadcasts over features
        else:
            # 4D case: statistics over the spatial axes of each channel.
            mean = np.mean(x, axis=(2, 3), keepdims=True)
            var = np.var(x, axis=(2, 3), keepdims=True)
            # BUGFIX: gamma/beta have shape (C,), which NumPy would try to
            # broadcast against the trailing W axis of (N, C, H, W) and fail.
            # Reshape to (1, C, 1, 1) so they scale per channel.
            gamma = self.gamma.reshape(1, -1, 1, 1)
            beta = self.beta.reshape(1, -1, 1, 1)
        x_norm = (x - mean) / np.sqrt(var + self.eps)
        return gamma * x_norm + beta
Group Normalization
class GroupNorm:
    """Group Normalization (Wu & He, 2018): normalize within channel groups.

    Channels are split into `num_groups` groups; each (sample, group) slice
    is normalized independently, making the layer batch-size independent.
    """

    def __init__(self, num_groups, num_channels, eps=1e-5):
        """
        num_groups: number of channel groups; must evenly divide num_channels
        num_channels: number of input channels/features
        eps: small constant added to the variance for numerical stability

        Raises ValueError if num_channels is not divisible by num_groups.
        """
        if num_channels % num_groups != 0:
            # Fail fast with a clear message instead of an opaque reshape
            # error the first time forward() is called.
            raise ValueError(
                f"num_channels ({num_channels}) must be divisible by "
                f"num_groups ({num_groups})"
            )
        self.num_groups = num_groups
        self.eps = eps
        # Per-channel learnable affine parameters
        self.gamma = np.ones(num_channels)
        self.beta = np.zeros(num_channels)

    def forward(self, x):
        """Normalize x of shape (batch_size, channels) within each channel group."""
        N, C = x.shape
        G = self.num_groups
        # Group the channels, normalize within each group, restore the shape.
        grouped = x.reshape(N, G, C // G)
        mean = np.mean(grouped, axis=2, keepdims=True)
        var = np.var(grouped, axis=2, keepdims=True)
        x_norm = ((grouped - mean) / np.sqrt(var + self.eps)).reshape(N, C)
        # Per-channel affine transform.
        return self.gamma * x_norm + self.beta
# Smoke test: GroupNorm output should be roughly zero-mean, unit-std.
sample = np.random.randn(32, 64)
group_norm = GroupNorm(num_groups=8, num_channels=64)
y_gn = group_norm.forward(sample)
print(f"GroupNorm输出: 均值={y_gn.mean():.4f}, 标准差={y_gn.std():.4f}")
归一化方法比较
# Simulated accuracy of each normalization method as batch size varies.
# (The original code also built a `methods` list that was never used.)
fig, ax = plt.subplots(figsize=(10, 6))
batch_sizes = [1, 4, 8, 16, 32, 64]
# (label, marker/linestyle, color, simulated performance).  BatchNorm
# degrades at small batch sizes because its batch statistics become noisy;
# the per-sample methods are flat by construction.
curves = [
    ('BatchNorm', 'o-', '#1f77b4', [0.7, 0.8, 0.85, 0.9, 0.92, 0.93]),
    ('LayerNorm', 's-', '#ff7f0e', [0.88, 0.88, 0.88, 0.88, 0.88, 0.88]),
    ('InstanceNorm', '^-', '#2ca02c', [0.85, 0.85, 0.85, 0.85, 0.85, 0.85]),
    ('GroupNorm', 'd-', '#d62728', [0.87, 0.88, 0.88, 0.89, 0.89, 0.89]),
]
for label, marker, color, perf in curves:
    ax.plot(batch_sizes, perf, marker, label=label, color=color, linewidth=2)
ax.set_xlabel('Batch Size')
ax.set_ylabel('性能(模拟)')
ax.set_title('不同归一化方法在不同Batch Size下的表现')
ax.legend()
ax.grid(True, alpha=0.3)
ax.set_xscale('log', base=2)
plt.show()
BatchNorm的好处
加速训练
# Demonstrate BatchNorm's effect on training a tiny network.
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler

# Generate a standardized binary-classification dataset.
X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
X = StandardScaler().fit_transform(X)
y = y.reshape(-1, 1)  # column vector so it aligns with the network output

def train_network(use_bn=False, n_epochs=100):
    """Train a 20->50->1 network (ReLU + sigmoid) with full-batch gradient
    descent on the module-level X, y, optionally inserting BatchNorm before
    the ReLU.  Returns the per-epoch binary cross-entropy losses.

    NOTE(review): biases are omitted and the whole dataset is used as one
    batch each epoch — a deliberate simplification for the demo.
    """
    np.random.seed(42)
    # Weight initialization (no bias terms in this simplified network).
    W1 = np.random.randn(20, 50) * 0.1
    W2 = np.random.randn(50, 1) * 0.1
    if use_bn:
        bn = BatchNorm1D(50)
    losses = []
    lr = 0.1
    for epoch in range(n_epochs):
        # Forward pass: linear -> (optional BN) -> ReLU -> linear -> sigmoid.
        z1 = X @ W1
        if use_bn:
            z1 = bn.forward(z1, training=True)
        a1 = np.maximum(0, z1)  # ReLU
        z2 = a1 @ W2
        a2 = 1 / (1 + np.exp(-z2))  # Sigmoid
        # Binary cross-entropy; 1e-8 guards against log(0).
        loss = -np.mean(y * np.log(a2 + 1e-8) + (1-y) * np.log(1-a2 + 1e-8))
        losses.append(loss)
        # Backward pass (simplified): sigmoid+BCE collapses to (a2 - y).
        dz2 = a2 - y
        dW2 = a1.T @ dz2 / len(y)
        da1 = dz2 @ W2.T
        dz1 = da1 * (z1 > 0)  # ReLU mask (z1 is the post-BN pre-activation)
        if use_bn:
            dz1 = bn.backward(dz1)  # also fills bn.dgamma / bn.dbeta
        dW1 = X.T @ dz1 / len(y)
        # Plain gradient-descent updates.
        W2 -= lr * dW2
        W1 -= lr * dW1
        if use_bn:
            bn.gamma -= lr * bn.dgamma
            bn.beta -= lr * bn.dbeta
    return losses

losses_no_bn = train_network(use_bn=False)
losses_with_bn = train_network(use_bn=True)

# Plot the two loss curves for comparison.
plt.figure(figsize=(10, 6))
plt.plot(losses_no_bn, label='Without BatchNorm', linewidth=2)
plt.plot(losses_with_bn, label='With BatchNorm', linewidth=2)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('BatchNorm对训练的影响')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
其他好处
| 好处 | 说明 |
|---|---|
| 允许更大学习率 | 梯度更稳定 |
| 减少对初始化敏感 | 自动调整分布 |
| 正则化效果 | 批次噪声 |
| 加速收敛 | 减少梯度消失 |
PyTorch实现
try:
    import torch
    import torch.nn as nn

    # The same normalization layers as provided by PyTorch.
    # BatchNorm
    bn_layer = nn.BatchNorm1d(64)
    # LayerNorm
    ln_layer = nn.LayerNorm(64)
    # GroupNorm
    gn_layer = nn.GroupNorm(num_groups=8, num_channels=64)
    # InstanceNorm (typically applied to 2D feature maps)
    in_layer = nn.InstanceNorm1d(64)

    # Using BatchNorm inside a network: one BN layer after each hidden linear
    # layer, replaced by Identity when use_bn is False.
    class SimpleNet(nn.Module):
        def __init__(self, use_bn=True):
            super().__init__()
            self.fc1 = nn.Linear(20, 64)
            self.bn1 = nn.BatchNorm1d(64) if use_bn else nn.Identity()
            self.fc2 = nn.Linear(64, 32)
            self.bn2 = nn.BatchNorm1d(32) if use_bn else nn.Identity()
            self.fc3 = nn.Linear(32, 1)
            self.relu = nn.ReLU()

        def forward(self, x):
            # linear -> BN -> ReLU, twice, then a sigmoid output head.
            x = self.relu(self.bn1(self.fc1(x)))
            x = self.relu(self.bn2(self.fc2(x)))
            return torch.sigmoid(self.fc3(x))

    model = SimpleNet(use_bn=True)
    print("网络结构:")
    print(model)

    # Run the same batch in train mode (batch statistics) and eval mode
    # (running statistics) — the outputs differ slightly.
    x = torch.randn(32, 20)
    model.train()
    y_train = model(x)
    model.eval()
    y_eval = model(x)
    print(f"\n训练模式输出: {y_train.mean().item():.4f}")
    print(f"评估模式输出: {y_eval.mean().item():.4f}")
except ImportError:
    print("PyTorch未安装")
常见问题
Q1: BatchNorm应该放在激活函数前还是后?
两种都可以:原论文将BN放在激活函数之前,但实践中放在激活函数之后的做法也很常见,且在某些任务上效果更好。
Q2: 小batch size怎么办?
使用LayerNorm或GroupNorm。
Q3: RNN中为什么不用BatchNorm?
- 序列长度不固定
- 时间步间统计量不一致
- LayerNorm更适合
Q4: BatchNorm的momentum含义?
控制running_mean/var的更新速度,默认0.1。
总结
| 方法 | 归一化维度 | 适用场景 |
|---|---|---|
| BatchNorm | 批次 | CNN、大batch |
| LayerNorm | 特征 | Transformer、RNN |
| InstanceNorm | 单样本单通道 | 风格迁移 |
| GroupNorm | 通道组 | 小batch |
参考资料
- Ioffe, S. & Szegedy, C. (2015). “Batch Normalization: Accelerating Deep Network Training”
- Ba, J. et al. (2016). “Layer Normalization”
- Wu, Y. & He, K. (2018). “Group Normalization”
- Ulyanov, D. et al. (2016). “Instance Normalization”
版权声明: 如无特别声明,本文版权归 sshipanoo 所有,转载请注明本文链接。
(采用 CC BY-NC-SA 4.0 许可协议进行授权)
本文标题:《 机器学习基础系列——批归一化 》
本文链接:http://localhost:3015/ai/%E6%89%B9%E5%BD%92%E4%B8%80%E5%8C%96.html
本文最后一次更新距今已有一段时间,文章中的某些内容可能已过时!