BatchNorm、LayerNorm与归一化技术

前言

批归一化(Batch Normalization)是深度学习中最重要的技术之一,它通过规范化层输入来加速训练并提高模型稳定性。


内部协变量偏移

问题描述

训练过程中,每层输入的分布会随着前层参数更新而变化。

import numpy as np
import matplotlib.pyplot as plt

np.random.seed(42)

# 模拟分布偏移
def simulate_distribution_shift():
    """Plot simulated activation distributions at three training epochs.

    Illustrates internal covariate shift: as (simulated) training
    progresses, the activation mean drifts away from 0 and the spread
    grows, so each layer keeps seeing a different input distribution.
    """
    fig, axes = plt.subplots(1, 3, figsize=(15, 4))

    # Note: the original wrapped this zip in enumerate() but never used
    # the index, so the counter is dropped here.
    for ax, epoch in zip(axes, [1, 50, 100]):
        # Simulated drift: mean and std both grow linearly with the epoch
        mean_shift = (epoch - 1) * 0.05
        std_change = 1 + (epoch - 1) * 0.02

        data = np.random.randn(1000) * std_change + mean_shift

        ax.hist(data, bins=50, density=True, alpha=0.7)
        ax.axvline(x=0, color='r', linestyle='--')
        ax.set_title(f'Epoch {epoch}\n均值={data.mean():.2f}, 标准差={data.std():.2f}')
        ax.set_xlabel('激活值')
        ax.set_ylabel('密度')

    plt.tight_layout()
    plt.suptitle('内部协变量偏移', y=1.02)
    plt.show()

simulate_distribution_shift()

Batch Normalization

算法

对于每个小批量:

$$\hat{x}_i = \frac{x_i - \mu_B}{\sqrt{\sigma_B^2 + \epsilon}}, \qquad y_i = \gamma \hat{x}_i + \beta$$

其中 $\gamma$ 和 $\beta$ 是可学习参数。

class BatchNorm1D:
    """Batch normalization for 2-D inputs (batch_size, num_features).

    At training time each feature is standardized with mini-batch
    statistics, which are also folded into exponential running averages;
    at inference time the running averages are used instead. A learnable
    affine transform (gamma, beta) is applied after standardization.
    """

    def __init__(self, num_features, eps=1e-5, momentum=0.1):
        self.eps = eps
        self.momentum = momentum

        # Learnable affine parameters
        self.gamma = np.ones(num_features)
        self.beta = np.zeros(num_features)

        # Running statistics, consumed in inference mode
        self.running_mean = np.zeros(num_features)
        self.running_var = np.ones(num_features)

        # Parameter gradients, filled in by backward()
        self.dgamma = None
        self.dbeta = None

    def forward(self, x, training=True):
        """Normalize x of shape (batch_size, num_features).

        In training mode, batch statistics are used and blended into the
        running averages; in eval mode, the running averages are used.
        """
        if training:
            self.batch_mean = x.mean(axis=0)
            self.batch_var = x.var(axis=0)

            # EMA update: new = (1 - m) * old + m * batch
            m = self.momentum
            self.running_mean = (1 - m) * self.running_mean + m * self.batch_mean
            self.running_var = (1 - m) * self.running_var + m * self.batch_var

            mean, var = self.batch_mean, self.batch_var
        else:
            mean, var = self.running_mean, self.running_var

        # Standardize
        self.x_centered = x - mean
        self.std = np.sqrt(var + self.eps)
        self.x_norm = self.x_centered / self.std

        # Cached for backward()
        self.x = x

        # Learnable scale and shift
        return self.gamma * self.x_norm + self.beta

    def backward(self, dout):
        """Backpropagate dout through a training-mode forward pass.

        Returns the gradient w.r.t. the input; also stores dgamma/dbeta.
        """
        N = dout.shape[0]

        # Gradients of the affine parameters
        self.dbeta = dout.sum(axis=0)
        self.dgamma = (dout * self.x_norm).sum(axis=0)

        # Chain rule through the normalization: x_norm, then the batch
        # variance, then the batch mean each contribute to dx.
        dx_norm = dout * self.gamma

        inv_std_cubed = (self.batch_var + self.eps) ** (-1.5)
        dvar = (dx_norm * self.x_centered * -0.5 * inv_std_cubed).sum(axis=0)
        dmean = (dx_norm * -1 / self.std).sum(axis=0) \
            + dvar * (-2 * self.x_centered).mean(axis=0)

        return dx_norm / self.std + dvar * 2 * self.x_centered / N + dmean / N

# Quick check: BatchNorm should re-center/re-scale a shifted input
x = np.random.randn(32, 64) * 5 + 10  # input deliberately shifted away from 0
bn = BatchNorm1D(64)

# Forward pass in training mode (uses batch statistics)
y = bn.forward(x, training=True)

print(f"输入统计: 均值={x.mean():.4f}, 标准差={x.std():.4f}")
print(f"输出统计: 均值={y.mean():.4f}, 标准差={y.std():.4f}")

训练 vs 推理

# Demonstrate the difference between training and inference modes
np.random.seed(42)

bn = BatchNorm1D(10)

# Simulate training over several batches whose statistics drift
print("训练阶段:")
for i in range(5):
    x_train = np.random.randn(32, 10) * (i + 1) + i * 2
    y_train = bn.forward(x_train, training=True)
    print(f"  Batch {i+1}: 输入均值={x_train.mean():.2f}, 输出均值={y_train.mean():.4f}")

print(f"\n运行时统计:")
print(f"  running_mean: {bn.running_mean[:3]}...")
print(f"  running_var: {bn.running_var[:3]}...")

# Inference: normalize with the accumulated running statistics
print("\n推理阶段:")
x_test = np.random.randn(1, 10) * 3 + 5
y_test = bn.forward(x_test, training=False)
print(f"  测试输入均值={x_test.mean():.2f}, 输出均值={y_test.mean():.4f}")

Layer Normalization

与BatchNorm的区别

| 特性 | BatchNorm | LayerNorm |
| --- | --- | --- |
| 归一化维度 | 批次维度 | 特征维度 |
| 适用场景 | CNN、batch size大 | RNN、Transformer |
| 对batch size依赖 | 强 | 无 |
class LayerNorm:
    """Layer normalization: standardize each sample over its features.

    Unlike BatchNorm, statistics are computed per sample (along the last
    axis), so the result is independent of batch size and batch content.
    """

    def __init__(self, normalized_shape, eps=1e-5):
        self.eps = eps
        self.gamma = np.ones(normalized_shape)
        self.beta = np.zeros(normalized_shape)

    def forward(self, x):
        """Normalize x of shape (batch_size, num_features), row by row."""
        # Per-sample statistics along the feature axis
        self.mean = x.mean(axis=-1, keepdims=True)
        self.var = x.var(axis=-1, keepdims=True)

        self.x_norm = (x - self.mean) / np.sqrt(self.var + self.eps)

        # Learnable scale and shift
        return self.gamma * self.x_norm + self.beta

# Compare BatchNorm and LayerNorm on the same shifted input:
# BN drives each *column* (feature) mean to ~0, LN each *row* (sample) mean.
x = np.random.randn(32, 64) * 5 + 10

bn = BatchNorm1D(64)
ln = LayerNorm(64)

y_bn = bn.forward(x, training=True)
y_ln = ln.forward(x)

print("BatchNorm:")
print(f"  输出均值={y_bn.mean():.4f}, 标准差={y_bn.std():.4f}")
print(f"  每列均值范围: [{y_bn.mean(axis=0).min():.4f}, {y_bn.mean(axis=0).max():.4f}]")

print("\nLayerNorm:")
print(f"  输出均值={y_ln.mean():.4f}, 标准差={y_ln.std():.4f}")
print(f"  每行均值范围: [{y_ln.mean(axis=1).min():.4f}, {y_ln.mean(axis=1).max():.4f}]")

可视化区别

# Visualize which axis each method normalizes along
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# Raw (unnormalized) data
ax = axes[0]
ax.imshow(x[:10, :10], cmap='viridis', aspect='auto')
ax.set_title('原始数据')
ax.set_xlabel('特征')
ax.set_ylabel('样本')
plt.colorbar(ax.images[0], ax=ax)

# BatchNorm output (normalized along the sample axis)
ax = axes[1]
ax.imshow(y_bn[:10, :10], cmap='viridis', aspect='auto')
ax.set_title('BatchNorm\n(沿样本维度归一化)')
ax.set_xlabel('特征')
ax.set_ylabel('样本')
plt.colorbar(ax.images[0], ax=ax)

# LayerNorm output (normalized along the feature axis)
ax = axes[2]
ax.imshow(y_ln[:10, :10], cmap='viridis', aspect='auto')
ax.set_title('LayerNorm\n(沿特征维度归一化)')
ax.set_xlabel('特征')
ax.set_ylabel('样本')
plt.colorbar(ax.images[0], ax=ax)

plt.tight_layout()
plt.show()

其他归一化方法

Instance Normalization

用于风格迁移等任务:

class InstanceNorm:
    """Instance normalization: per-sample (and per-channel) statistics.

    Commonly used in style transfer, where each image's own channel
    statistics should be normalized independently of the batch.
    """

    def __init__(self, num_features, eps=1e-5):
        self.eps = eps
        # Per-channel (4-D input) / per-feature (2-D input) affine params
        self.gamma = np.ones(num_features)
        self.beta = np.zeros(num_features)

    def forward(self, x):
        """
        x: (batch_size, channels, height, width) or (batch_size, features)
        Each sample (and, for 4-D input, each channel) is normalized
        independently.
        """
        if x.ndim == 2:
            # Simplified case: normalize each sample over its feature axis
            mean = np.mean(x, axis=1, keepdims=True)
            var = np.var(x, axis=1, keepdims=True)
            gamma, beta = self.gamma, self.beta
        else:
            # 4-D case: statistics over each channel's spatial dims.
            # gamma/beta are per-channel, so reshape them to (1, C, 1, 1);
            # the original broadcast the flat (C,) vector against the
            # *width* axis, which raised whenever channels != width.
            mean = np.mean(x, axis=(2, 3), keepdims=True)
            var = np.var(x, axis=(2, 3), keepdims=True)
            gamma = self.gamma.reshape(1, -1, 1, 1)
            beta = self.beta.reshape(1, -1, 1, 1)

        x_norm = (x - mean) / np.sqrt(var + self.eps)
        return gamma * x_norm + beta

Group Normalization

class GroupNorm:
    """Group normalization: per-sample statistics over channel groups.

    Splits the channels of each sample into `num_groups` equal groups and
    standardizes within each group, then applies a per-channel affine
    transform. Independent of batch size.
    """

    def __init__(self, num_groups, num_channels, eps=1e-5):
        """
        Raises:
            ValueError: if num_channels is not divisible by num_groups
                (the channels cannot be split into equal-size groups;
                the original reshape would otherwise fail obscurely).
        """
        if num_channels % num_groups != 0:
            raise ValueError(
                f"num_channels ({num_channels}) must be divisible by "
                f"num_groups ({num_groups})"
            )
        self.num_groups = num_groups
        self.eps = eps
        self.gamma = np.ones(num_channels)
        self.beta = np.zeros(num_channels)

    def forward(self, x):
        """
        x: (batch_size, channels)
        """
        N, C = x.shape
        G = self.num_groups

        # View each sample's channels as G equal-size groups
        grouped = x.reshape(N, G, C // G)

        # Normalize within each group
        mean = np.mean(grouped, axis=2, keepdims=True)
        var = np.var(grouped, axis=2, keepdims=True)

        x_norm = ((grouped - mean) / np.sqrt(var + self.eps)).reshape(N, C)

        # Per-channel learnable scale and shift
        return self.gamma * x_norm + self.beta

# GroupNorm smoke test: output should be roughly zero-mean, unit-std
x = np.random.randn(32, 64)
gn = GroupNorm(num_groups=8, num_channels=64)
y_gn = gn.forward(x)

print(f"GroupNorm输出: 均值={y_gn.mean():.4f}, 标准差={y_gn.std():.4f}")

归一化方法比较

# Compare the normalization methods across batch sizes
fig, ax = plt.subplots(figsize=(10, 6))

methods = ['BatchNorm', 'LayerNorm', 'InstanceNorm', 'GroupNorm']
colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728']

# Behavior at different batch sizes (simulated, not measured)
batch_sizes = [1, 4, 8, 16, 32, 64]

# Simulated scores (BatchNorm degrades at small batch sizes because its
# batch statistics become noisy; the others are batch-size independent)
bn_perf = [0.7, 0.8, 0.85, 0.9, 0.92, 0.93]
ln_perf = [0.88, 0.88, 0.88, 0.88, 0.88, 0.88]
in_perf = [0.85, 0.85, 0.85, 0.85, 0.85, 0.85]
gn_perf = [0.87, 0.88, 0.88, 0.89, 0.89, 0.89]

ax.plot(batch_sizes, bn_perf, 'o-', label='BatchNorm', color=colors[0], linewidth=2)
ax.plot(batch_sizes, ln_perf, 's-', label='LayerNorm', color=colors[1], linewidth=2)
ax.plot(batch_sizes, in_perf, '^-', label='InstanceNorm', color=colors[2], linewidth=2)
ax.plot(batch_sizes, gn_perf, 'd-', label='GroupNorm', color=colors[3], linewidth=2)

ax.set_xlabel('Batch Size')
ax.set_ylabel('性能(模拟)')
ax.set_title('不同归一化方法在不同Batch Size下的表现')
ax.legend()
ax.grid(True, alpha=0.3)
ax.set_xscale('log', base=2)

plt.show()

BatchNorm的好处

加速训练

# Demonstrate the effect of BatchNorm on training speed
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler

# Generate a standardized toy binary-classification dataset
X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
X = StandardScaler().fit_transform(X)
y = y.reshape(-1, 1)

def train_network(use_bn=False, n_epochs=100):
    """Train a tiny 2-layer MLP on the module-level (X, y) dataset.

    Returns the list of per-epoch binary cross-entropy losses, with an
    optional BatchNorm1D layer after the first linear transform.
    """
    np.random.seed(42)

    # Small random init (the order of the two randn calls is part of the
    # reproducible behavior under the fixed seed above)
    W1 = np.random.randn(20, 50) * 0.1
    W2 = np.random.randn(50, 1) * 0.1

    bn = BatchNorm1D(50) if use_bn else None

    losses = []
    lr = 0.1
    n = len(y)

    for _ in range(n_epochs):
        # ---- forward ----
        z1 = X @ W1
        if bn is not None:
            z1 = bn.forward(z1, training=True)
        a1 = np.maximum(0, z1)                 # ReLU
        a2 = 1 / (1 + np.exp(-(a1 @ W2)))      # Sigmoid output

        # Binary cross-entropy, clamped away from log(0)
        losses.append(-np.mean(y * np.log(a2 + 1e-8) + (1 - y) * np.log(1 - a2 + 1e-8)))

        # ---- backward (simplified) ----
        dz2 = a2 - y
        dW2 = a1.T @ dz2 / n
        dz1 = (dz2 @ W2.T) * (z1 > 0)          # ReLU mask on (post-BN) z1
        if bn is not None:
            dz1 = bn.backward(dz1)
        dW1 = X.T @ dz1 / n

        # ---- SGD step ----
        W2 -= lr * dW2
        W1 -= lr * dW1
        if bn is not None:
            bn.gamma -= lr * bn.dgamma
            bn.beta -= lr * bn.dbeta

    return losses

losses_no_bn = train_network(use_bn=False)
losses_with_bn = train_network(use_bn=True)

# Visualize the two loss curves
plt.figure(figsize=(10, 6))
plt.plot(losses_no_bn, label='Without BatchNorm', linewidth=2)
plt.plot(losses_with_bn, label='With BatchNorm', linewidth=2)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('BatchNorm对训练的影响')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

其他好处

好处 说明
允许更大学习率 梯度更稳定
减少对初始化敏感 自动调整分布
正则化效果 批次噪声
加速收敛 减少梯度消失

PyTorch实现

try:
    import torch
    import torch.nn as nn
    
    # BatchNorm
    bn_layer = nn.BatchNorm1d(64)
    
    # LayerNorm
    ln_layer = nn.LayerNorm(64)
    
    # GroupNorm
    gn_layer = nn.GroupNorm(num_groups=8, num_channels=64)
    
    # InstanceNorm (usually applied to spatial feature maps)
    in_layer = nn.InstanceNorm1d(64)
    
    # Using normalization layers inside a network
    class SimpleNet(nn.Module):
        """Tiny MLP with optional BatchNorm after each hidden layer."""
        def __init__(self, use_bn=True):
            super().__init__()
            self.fc1 = nn.Linear(20, 64)
            self.bn1 = nn.BatchNorm1d(64) if use_bn else nn.Identity()
            self.fc2 = nn.Linear(64, 32)
            self.bn2 = nn.BatchNorm1d(32) if use_bn else nn.Identity()
            self.fc3 = nn.Linear(32, 1)
            self.relu = nn.ReLU()
        
        def forward(self, x):
            x = self.relu(self.bn1(self.fc1(x)))
            x = self.relu(self.bn2(self.fc2(x)))
            return torch.sigmoid(self.fc3(x))
    
    model = SimpleNet(use_bn=True)
    print("网络结构:")
    print(model)
    
    # Same input, train vs eval mode: BN switches from batch statistics
    # to running statistics, so the outputs differ
    x = torch.randn(32, 20)
    model.train()
    y_train = model(x)
    
    model.eval()
    y_eval = model(x)
    
    print(f"\n训练模式输出: {y_train.mean().item():.4f}")
    print(f"评估模式输出: {y_eval.mean().item():.4f}")
    
except ImportError:
    print("PyTorch未安装")

常见问题

Q1: BatchNorm应该放在激活函数前还是后?

两种都可以,原论文是前,但实践中后也常用。

Q2: 小batch size怎么办?

使用LayerNorm或GroupNorm。

Q3: RNN中为什么不用BatchNorm?

  • 序列长度不固定
  • 时间步间统计量不一致
  • LayerNorm更适合

Q4: BatchNorm的momentum含义?

控制running_mean/var的更新速度,默认0.1。


总结

方法 归一化维度 适用场景
BatchNorm 批次 CNN、大batch
LayerNorm 特征 Transformer、RNN
InstanceNorm 单样本单通道 风格迁移
GroupNorm 通道组 小batch

参考资料

  • Ioffe, S. & Szegedy, C. (2015). “Batch Normalization: Accelerating Deep Network Training”
  • Ba, J. et al. (2016). “Layer Normalization”
  • Wu, Y. & He, K. (2018). “Group Normalization”
  • Ulyanov, D. et al. (2016). “Instance Normalization”

版权声明: 如无特别声明,本文版权归 sshipanoo 所有,转载请注明本文链接。

(采用 CC BY-NC-SA 4.0 许可协议进行授权)

本文标题:《 机器学习基础系列——批归一化 》

本文链接:http://localhost:3015/ai/%E6%89%B9%E5%BD%92%E4%B8%80%E5%8C%96.html

本文最近一次更新距今已有一段时间,文章中的某些内容可能已过时!