随机失活、正则化与模型集成

前言

Dropout是一种简单而有效的正则化技术,通过在训练时随机“丢弃”一部分神经元来防止过拟合。它可以被视为一种隐式的模型集成。


Dropout原理

基本思想

训练时以概率 $p$ 随机将某些神经元的输出置为0:

\[h' = h \cdot m, \quad m_i \sim \text{Bernoulli}(1-p)\]
import numpy as np
import matplotlib.pyplot as plt

# Fix the global RNG seed so every stochastic demo below is reproducible.
np.random.seed(42)

# Visualize how Dropout deactivates a random subset of hidden units
# on each mini-batch, while input and output layers are left intact.
def visualize_dropout():
    """Draw the full network next to two independently dropped-out versions.

    Left panel: the original network. Middle and right panels: the same
    network during training with p=0.5 Dropout applied to the hidden
    layers, using two different random masks to illustrate that a fresh
    sub-network is sampled for every mini-batch.
    """
    fig, axes = plt.subplots(1, 3, figsize=(15, 5))
    layers = [4, 6, 6, 2]  # units per layer: input, two hidden, output

    def draw_network(ax, layers, dropped=None):
        """Draw a layered network; nodes listed in `dropped` are grayed out."""
        positions = []
        for i, n in enumerate(layers):
            x = i
            for j in range(n):
                y = j - n / 2 + 0.5  # center each layer vertically
                positions.append((x, y, i, j))

        # Draw connections, skipping any edge that touches a dropped node.
        # (Parentheses make the original and/or precedence explicit.)
        for i in range(len(layers) - 1):
            for j in range(layers[i]):
                for k in range(layers[i + 1]):
                    if dropped is None or ((i, j) not in dropped and (i + 1, k) not in dropped):
                        ax.plot([i, i + 1],
                                [j - layers[i] / 2 + 0.5, k - layers[i + 1] / 2 + 0.5],
                                'gray', alpha=0.3, linewidth=0.5)

        # Draw nodes; dropped ones are rendered faint and gray.
        for x, y, layer, node in positions:
            if dropped and (layer, node) in dropped:
                color = 'lightgray'
                alpha = 0.3
            else:
                color = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728'][layer]
                alpha = 1.0
            ax.scatter(x, y, s=200, c=color, alpha=alpha, edgecolors='black')

    def sample_dropped(seed, p=0.5):
        """Sample the set of hidden-layer nodes dropped with probability p.

        Deduplicates the two copy-pasted sampling loops from the original.
        """
        np.random.seed(seed)
        dropped = set()
        for i in range(1, len(layers) - 1):  # never drop input/output layers
            for j in range(layers[i]):
                if np.random.random() < p:
                    dropped.add((i, j))
        return dropped

    # Panel 1: the intact network.
    draw_network(axes[0], layers)
    axes[0].set_title('原始网络')
    axes[0].axis('off')

    # Panel 2: one random dropout mask (training time).
    draw_network(axes[1], layers, sample_dropped(42))
    axes[1].set_title('训练时(Dropout p=0.5)')
    axes[1].axis('off')

    # Panel 3: a different mask, as sampled for another mini-batch.
    draw_network(axes[2], layers, sample_dropped(123))
    axes[2].set_title('训练时(另一个mini-batch)')
    axes[2].axis('off')

    plt.tight_layout()
    plt.show()

visualize_dropout()

从零实现

class Dropout:
    """Inverted-dropout layer.

    During training, each activation is zeroed independently with
    probability ``p`` and the survivors are scaled by 1/(1-p), so the
    expected activation matches inference and no test-time rescaling
    is needed.
    """

    def __init__(self, p=0.5):
        """
        p: probability that a unit is dropped; must satisfy 0 <= p < 1
           (p == 1 would drop everything and divide by zero).
        """
        if not 0.0 <= p < 1.0:
            raise ValueError(f"dropout probability must be in [0, 1), got {p}")
        self.p = p
        self.mask = None  # keep-mask from the most recent training forward pass

    def forward(self, x, training=True):
        """Apply dropout to ``x``; identity at inference time."""
        if training:
            # Sample a fresh keep-mask (1 = keep) and rescale survivors.
            self.mask = (np.random.rand(*x.shape) > self.p).astype(float)
            return x * self.mask / (1 - self.p)
        # Inference: activations are already correctly scaled.
        return x

    def backward(self, dout):
        """Route gradients only through units kept in the last forward pass."""
        if self.mask is None:
            raise RuntimeError("backward() called before a training forward pass")
        return dout * self.mask / (1 - self.p)

# Exercise the Dropout layer in both modes and report the survival rate.
layer = Dropout(p=0.5)
ones = np.ones((3, 5))

y_train = layer.forward(ones, training=True)
y_test = layer.forward(ones, training=False)  # consumes no randomness

print("输入:")
print(ones)

print("\n训练时(forward):")
print(y_train)
print(f"非零比例: {(y_train > 0).mean():.2%}")

print("\n推理时(forward):")
print(y_test)

Inverted Dropout

训练时缩放而非测试时缩放(现代实现):

# 比较两种实现方式
def standard_dropout(x, p, training=True):
    """Classic dropout: drop at train time, scale by (1-p) at test time."""
    if not training:
        # Inference: shrink activations by the keep probability.
        return x * (1 - p)
    keep = (np.random.rand(*x.shape) > p).astype(float)
    return x * keep

def inverted_dropout(x, p, training=True):
    """Inverted dropout: scale by 1/(1-p) at train time; inference is identity."""
    if not training:
        return x
    keep = (np.random.rand(*x.shape) > p).astype(float)
    return (x * keep) / (1 - p)

# 验证期望相同
# Empirically confirm both schemes produce the same expected activation.
np.random.seed(42)
x = np.ones((1000, 100))
p = 0.5

n_runs = 100  # average over repeated stochastic forward passes

def _avg_train_mean(dropout_fn):
    # Mean post-dropout activation, averaged over n_runs training passes.
    return np.mean([dropout_fn(x, p, True).mean() for _ in range(n_runs)])

standard_train_mean = _avg_train_mean(standard_dropout)
standard_test_mean = standard_dropout(x, p, False).mean()

inverted_train_mean = _avg_train_mean(inverted_dropout)
inverted_test_mean = inverted_dropout(x, p, False).mean()

print("Standard Dropout:")
print(f"  训练时均值: {standard_train_mean:.4f}")
print(f"  测试时均值: {standard_test_mean:.4f}")

print("\nInverted Dropout:")
print(f"  训练时均值: {inverted_train_mean:.4f}")
print(f"  测试时均值: {inverted_test_mean:.4f}")

为什么Dropout有效

模型集成视角

Dropout可以看作训练了 $2^n$ 个子网络的集成:

# Enumerate every sub-network over 3 units: each unit is either active
# ('1') or dropped ('0'), giving 2**n distinct binary patterns.
n_neurons = 3
possibilities = [format(i, f'0{n_neurons}b') for i in range(2 ** n_neurons)]

print(f"3个神经元的Dropout共有 {len(possibilities)} 种可能的子网络:")
for pattern in possibilities:
    active = [f"神经元{j+1}" for j, b in enumerate(pattern) if b == '1']
    # Fix: join the names instead of printing the raw Python list repr,
    # and show an explicit marker when no unit is active.
    print(f"  {pattern}: 激活 {'、'.join(active) if active else '(无)'}")

减少神经元间的协同适应

# Illustrate co-adaptation: without Dropout, units can develop strong
# mutual dependencies; with Dropout, each unit must be useful on its own.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: without Dropout, a few units rely heavily on each other.
ax = axes[0]
ax.set_xlim(0, 4)
ax.set_ylim(0, 3)

# A 3x3 grid of neurons.
for col in range(3):
    for row in range(3):
        ax.add_patch(plt.Circle((col + 0.5, row + 0.5), 0.2, color='blue'))

# Red arrows mark strongly co-adapted connections.
connections = [((0.5, 0.5), (1.5, 1.5)), ((0.5, 1.5), (1.5, 1.5)),
               ((1.5, 0.5), (2.5, 1.5)), ((1.5, 1.5), (2.5, 1.5))]
for (x1, y1), (x2, y2) in connections:
    ax.annotate('', xy=(x2, y2), xytext=(x1, y1),
                arrowprops=dict(arrowstyle='->', color='red', lw=2))

ax.set_title('无Dropout: 强协同适应')
ax.axis('off')

# Right panel: with Dropout, some units are randomly silenced (gray).
ax = axes[1]
ax.set_xlim(0, 4)
ax.set_ylim(0, 3)

for col in range(3):
    for row in range(3):
        shade = 'blue' if np.random.rand() > 0.3 else 'gray'
        ax.add_patch(plt.Circle((col + 0.5, row + 0.5), 0.2, color=shade))

ax.set_title('有Dropout: 减少依赖')
ax.axis('off')

plt.tight_layout()
plt.show()

不同类型的Dropout

Spatial Dropout

用于CNN,丢弃整个特征图:

class SpatialDropout2D:
    """Spatial (channel-wise) dropout for CNN feature maps.

    Instead of dropping individual activations, an entire channel is
    zeroed at once — more effective when neighbouring pixels within a
    feature map are strongly correlated.
    """

    def __init__(self, p=0.5):
        """
        p: probability that a whole channel is dropped; must satisfy
           0 <= p < 1 (p == 1 would divide by zero).
        """
        if not 0.0 <= p < 1.0:
            raise ValueError(f"dropout probability must be in [0, 1), got {p}")
        self.p = p

    def forward(self, x, training=True):
        """
        x: (batch_size, channels, height, width)

        Returns x unchanged at inference. During training, zeroes whole
        channels independently per sample and rescales survivors by
        1/(1-p) (inverted dropout).
        """
        if not training:
            return x

        batch_size, channels, _, _ = x.shape
        # One Bernoulli draw per (sample, channel); broadcasts over H and W.
        mask = (np.random.rand(batch_size, channels, 1, 1) > self.p).astype(float)
        return x * mask / (1 - self.p)

# Smoke test: whole channels should be either fully kept or fully zeroed.
feature_maps = np.random.randn(2, 4, 8, 8)  # 2 samples, 4 channels, 8x8 maps
spatial_dropout = SpatialDropout2D(p=0.5)

dropped_maps = spatial_dropout.forward(feature_maps, training=True)
print(f"输入形状: {feature_maps.shape}")
print(f"输出形状: {dropped_maps.shape}")
print(f"非零通道比例: {(dropped_maps.sum(axis=(2,3)) != 0).mean():.2%}")

DropConnect

丢弃权重而非激活值:

class DropConnect:
    """Linear layer with DropConnect: individual weights — rather than
    activations — are randomly zeroed during training."""

    def __init__(self, input_size, output_size, p=0.5):
        # p: per-weight drop probability.
        self.p = p
        self.W = np.random.randn(input_size, output_size) * 0.1
        self.b = np.zeros(output_size)

    def forward(self, x, training=True):
        """Affine transform; a fresh weight mask is sampled when training."""
        if not training:
            return x @ self.W + self.b
        keep = (np.random.rand(*self.W.shape) > self.p).astype(float)
        # Inverted scaling keeps the expected pre-activation unchanged.
        return x @ (self.W * keep / (1 - self.p)) + self.b

# Smoke test: DropConnect output keeps the usual linear-layer shape.
dropconnect = DropConnect(10, 5, p=0.5)
inputs = np.random.randn(3, 10)

outputs = dropconnect.forward(inputs, training=True)
print(f"DropConnect输出形状: {outputs.shape}")

Variational Dropout

在整个序列中使用相同的mask(用于RNN):

class VariationalDropout:
    """Variational dropout for sequences: one mask is shared across all
    time steps of a sequence, which is the recommended way to apply
    dropout inside RNNs (Gal & Ghahramani, 2016)."""

    def __init__(self, p=0.5):
        """
        p: drop probability; must satisfy 0 <= p < 1.
        """
        if not 0.0 <= p < 1.0:
            raise ValueError(f"dropout probability must be in [0, 1), got {p}")
        self.p = p
        self.mask = None  # mask used by the most recent training pass

    def forward(self, x, training=True, same_mask=True):
        """
        x: (batch_size, seq_len, features)
        same_mask: if True (default), one mask per sequence is broadcast
            over all time steps; if False, an independent mask is drawn
            for every time step (ordinary element-wise dropout).
        """
        if not training:
            return x

        batch_size, seq_len, features = x.shape

        if same_mask:
            # Shape (batch, 1, features) broadcasts across time steps.
            self.mask = (np.random.rand(batch_size, 1, features) > self.p).astype(float)
        else:
            # Fix: the original reused a stale time-shared mask here;
            # draw an independent mask per time step instead.
            self.mask = (np.random.rand(batch_size, seq_len, features) > self.p).astype(float)

        return x * self.mask / (1 - self.p)

# Smoke test: the dropout pattern must be identical at every time step.
var_dropout = VariationalDropout(p=0.5)
seq = np.random.randn(2, 10, 5)  # 2 samples, 10 time steps, 5 features

dropped_seq = var_dropout.forward(seq, training=True)
print(f"序列Dropout输出形状: {dropped_seq.shape}")

# The zero pattern at step 0 should match the one at step 5.
print(f"时间步0和时间步5的非零位置相同: {np.allclose(dropped_seq[:, 0] != 0, dropped_seq[:, 5] != 0)}")

Dropout与正则化

与L2正则化的关系

# Dropout behaves like an adaptive form of L2 regularization; for
# comparison, fit an explicit L2-regularized (ridge) linear model.
from sklearn.linear_model import Ridge
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

X, y = make_regression(n_samples=500, n_features=100, noise=10, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Ridge regression penalizes the squared weight norm (alpha = strength).
ridge = Ridge(alpha=1.0)
ridge.fit(X_train, y_train)

print("L2正则化(Ridge):")
print(f"  训练R²: {ridge.score(X_train, y_train):.4f}")
print(f"  测试R²: {ridge.score(X_test, y_test):.4f}")
print(f"  权重范数: {np.linalg.norm(ridge.coef_):.4f}")

Dropout作为贝叶斯近似

# Monte Carlo Dropout用于不确定性估计
# Monte Carlo Dropout: keep dropout active at prediction time and sample
# repeatedly to approximate a Bayesian posterior over outputs.
class MCDropoutModel:
    """Estimate predictive uncertainty via MC Dropout on a linear map."""

    def __init__(self, W, p=0.5):
        # W: weight matrix of the linear model; p: drop probability.
        self.W = W
        self.p = p

    def predict_with_uncertainty(self, x, n_samples=100):
        """Return (mean, std) of predictions over `n_samples` dropout masks."""
        draws = []
        for _ in range(n_samples):
            keep = (np.random.rand(*self.W.shape) > self.p).astype(float)
            draws.append(x @ (self.W * keep / (1 - self.p)))

        draws = np.asarray(draws)
        return draws.mean(axis=0), draws.std(axis=0)

# Demonstrate per-sample uncertainty estimates from MC Dropout.
weights = np.random.randn(10, 1) * 0.5
model = MCDropoutModel(weights, p=0.3)

inputs = np.random.randn(5, 10)
mean, std = model.predict_with_uncertainty(inputs, n_samples=100)

print("MC Dropout不确定性估计:")
for i in range(5):
    print(f"  样本{i+1}: 均值={mean[i, 0]:.4f}, 标准差={std[i, 0]:.4f}")

实践中的Dropout

不同层的dropout率

# 典型的dropout配置
dropout_config = {
    '输入层': 0.2,      # 轻微dropout或不用
    '隐藏层': 0.5,      # 标准dropout
    '全连接层': 0.5,    # 通常较高
    'CNN卷积层': 0.25,  # 通常较低
    'RNN': 0.2,         # 使用variational dropout
}

print("典型Dropout配置:")
for layer, rate in dropout_config.items():
    print(f"  {layer}: p={rate}")

与BatchNorm的配合

# BatchNorm + Dropout的顺序
# 推荐: Conv -> BN -> ReLU -> Dropout

# Recommended ordering: Conv -> BN -> ReLU -> Dropout
class ConvBlock:
    """Toy block illustrating the Conv -> BN -> ReLU -> Dropout ordering.

    The convolution is a no-op placeholder and BatchNorm is simulated
    with whole-tensor statistics.
    """

    def __init__(self, use_dropout=True):
        # use_dropout: whether to apply dropout after the activation.
        self.use_dropout = use_dropout

    def forward(self, x, training=True):
        # 1. Convolution (placeholder: identity).
        # 2. BatchNorm (simulated). Fix: the original skipped normalization
        #    entirely at inference, creating a train/test mismatch. A real
        #    BN layer would use running statistics at inference; this toy
        #    version reuses batch statistics in both modes.
        x = (x - x.mean()) / (x.std() + 1e-5)

        # 3. ReLU activation.
        x = np.maximum(0, x)

        # 4. Dropout (training only, inverted scaling with p=0.25).
        if self.use_dropout and training:
            mask = (np.random.rand(*x.shape) > 0.25).astype(float)
            x = x * mask / 0.75

        return x

PyTorch实现

try:
    import torch
    import torch.nn as nn

    # The dropout family in PyTorch: element-wise, channel-wise (2D/3D),
    # and AlphaDropout, which preserves SELU's self-normalizing property.
    dropout = nn.Dropout(p=0.5)
    dropout2d = nn.Dropout2d(p=0.5)
    dropout3d = nn.Dropout3d(p=0.5)
    alpha_dropout = nn.AlphaDropout(p=0.5)

    class DropoutNet(nn.Module):
        """MLP with dropout after each hidden activation."""

        def __init__(self):
            super().__init__()
            self.fc1 = nn.Linear(100, 256)
            self.dropout1 = nn.Dropout(0.5)
            self.fc2 = nn.Linear(256, 128)
            self.dropout2 = nn.Dropout(0.5)
            self.fc3 = nn.Linear(128, 10)

        def forward(self, x):
            h = self.dropout1(torch.relu(self.fc1(x)))
            h = self.dropout2(torch.relu(self.fc2(h)))
            return self.fc3(h)

    model = DropoutNet()

    # train() enables dropout; eval() disables it.
    model.train()
    x = torch.randn(32, 100)
    y_train = model(x)

    model.eval()
    y_eval = model(x)

    print("PyTorch Dropout:")
    print(f"  训练模式输出变化: {(y_train != y_eval).any()}")

    # MC Dropout: keep the net in train mode so masks stay active,
    # then average many stochastic forward passes.
    model.train()
    with torch.no_grad():
        preds = torch.stack([model(x) for _ in range(100)])

    mean = preds.mean(dim=0)
    std = preds.std(dim=0)
    print(f"  MC Dropout均值: {mean.mean():.4f}")
    print(f"  MC Dropout标准差: {std.mean():.4f}")

except ImportError:
    print("PyTorch未安装")

常见问题

Q1: Dropout率如何选择?

层类型 推荐dropout率
全连接层 0.5
卷积层 0.1-0.25
输入层 0-0.2

Q2: 测试时为什么不用Dropout?

测试时使用全部神经元,但需要缩放(或使用inverted dropout)。

Q3: Dropout和BatchNorm能一起用吗?

可以,但需要注意顺序。有研究表明两者配合可能降低效果。

Q4: RNN中怎么用Dropout?

使用Variational Dropout,在时间步间共享mask。


总结

特性 描述
原理 随机丢弃神经元
效果 正则化、模型集成
典型值 0.5(全连接)、0.25(卷积)
变体 Spatial、DropConnect、Variational

参考资料

  • Srivastava, N. et al. (2014). “Dropout: A Simple Way to Prevent Neural Networks from Overfitting”
  • Gal, Y. & Ghahramani, Z. (2016). “Dropout as a Bayesian Approximation”
  • Wan, L. et al. (2013). “Regularization of Neural Networks using DropConnect”
  • CS231n: Dropout

版权声明: 如无特别声明,本文版权归 sshipanoo 所有,转载请注明本文链接。

(采用 CC BY-NC-SA 4.0 许可协议进行授权)

本文标题:《 机器学习基础系列——Dropout详解 》

本文链接:http://localhost:3015/ai/Dropout%E8%AF%A6%E8%A7%A3.html

本文最后一次更新时间距今已有一段时间,文章中的某些内容可能已过时!