RNN、LSTM、GRU与序列建模

前言

循环神经网络(RNN)是处理序列数据的专用架构,通过隐藏状态在时间步之间传递信息。本文介绍RNN及其变体LSTM和GRU。


为什么需要RNN

序列数据的特点

import numpy as np
import matplotlib.pyplot as plt

# Fix the RNG seed so the synthetic demo data is reproducible.
np.random.seed(42)

# Three flavours of sequential data used by the demo figure below.
sequences = {
    '文本': ['', '', '', '', '', ''],  # NOTE(review): the six characters appear to have been lost during extraction — restore if the original text is available
    '时间序列': [1.2, 1.5, 1.3, 1.8, 2.1, 2.0],
    '音频': np.sin(np.linspace(0, 4*np.pi, 100)),
}

fig, panels = plt.subplots(1, 3, figsize=(15, 4))
text_ax, series_ax, audio_ax = panels

# Panel 1: characters laid out left-to-right to suggest a token sequence.
for pos, token in enumerate(sequences['文本']):
    text_ax.text(pos, 0.5, token, fontsize=20, ha='center')
text_ax.set_xlim(-0.5, 5.5)
text_ax.set_ylim(0, 1)
text_ax.set_title('文本序列')
text_ax.axis('off')

# Panel 2: a short numeric time series.
series_ax.plot(sequences['时间序列'], 'bo-', markersize=10)
series_ax.set_title('时间序列')
series_ax.set_xlabel('时间步')
series_ax.set_ylabel('')
series_ax.grid(True, alpha=0.3)

# Panel 3: a sine wave standing in for an audio waveform.
audio_ax.plot(sequences['音频'])
audio_ax.set_title('音频波形')
audio_ax.set_xlabel('采样点')
audio_ax.set_ylabel('振幅')
audio_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

全连接网络的局限

问题 说明
固定长度输入 无法处理变长序列
无位置信息 忽略顺序关系
参数不共享 不同位置学习独立特征

基础RNN

结构

\(h_t = \tanh(W_{hh}h_{t-1} + W_{xh}x_t + b_h)\) \(y_t = W_{hy}h_t + b_y\)

class SimpleRNN:
    """A minimal vanilla (Elman) RNN with tanh hidden activations.

    h_t = tanh(x_t @ Wxh + h_{t-1} @ Whh + bh)
    y_t = h_t @ Why + by
    """

    def __init__(self, input_size, hidden_size, output_size):
        self.hidden_size = hidden_size

        # Xavier-style scaling keeps early activations in a sane range.
        scale = np.sqrt(2.0 / (input_size + hidden_size))
        self.Wxh = np.random.randn(input_size, hidden_size) * scale
        self.Whh = np.random.randn(hidden_size, hidden_size) * scale
        self.Why = np.random.randn(hidden_size, output_size) * scale

        self.bh = np.zeros(hidden_size)
        self.by = np.zeros(output_size)

    def forward(self, X, h0=None):
        """Run the RNN over a full sequence.

        Args:
            X: array of shape (seq_len, batch_size, input_size).
            h0: optional initial hidden state of shape
                (batch_size, hidden_size); defaults to zeros.

        Returns:
            (outputs, last_hidden, hidden_states) — the stacked per-step
            outputs of shape (seq_len, batch_size, output_size), the final
            hidden state, and the list of every hidden state including h0.
        """
        seq_len, batch_size, _ = X.shape

        h = np.zeros((batch_size, self.hidden_size)) if h0 is None else h0

        outputs = []
        hidden_states = [h]

        for step in range(seq_len):
            # New hidden state from the current input and previous state.
            h = np.tanh(X[step] @ self.Wxh + h @ self.Whh + self.bh)
            hidden_states.append(h)
            # Linear readout at every time step.
            outputs.append(h @ self.Why + self.by)

        return np.array(outputs), h, hidden_states

# Smoke test: push a random batch through the RNN and report shapes.
rnn = SimpleRNN(input_size=10, hidden_size=20, output_size=5)
x = np.random.randn(15, 3, 10)  # 15 time steps, batch of 3, 10-dim input

outputs, h_final, hidden_states = rnn.forward(x)

for label, shape in (('输入形状', x.shape),
                     ('输出形状', outputs.shape),
                     ('最终隐藏状态形状', h_final.shape)):
    print(f"{label}: {shape}")

可视化RNN展开

def visualize_rnn_unroll():
    """Draw an RNN unrolled over five time steps: inputs feeding hidden
    states, hidden states feeding outputs, and the recurrent links."""
    fig, ax = plt.subplots(figsize=(14, 6))

    n_steps = 5
    for step in range(n_steps):
        cx = step * 2  # x-coordinate of this time step's column

        # Input label plus the arrow into the hidden state.
        ax.annotate(f'$x_{step}$', xy=(cx, 0), fontsize=14, ha='center')
        ax.arrow(cx, 0.2, 0, 0.5, head_width=0.1, head_length=0.1, fc='blue', ec='blue')

        # Hidden state drawn as a circle.
        ax.add_patch(plt.Circle((cx, 1.2), 0.3, fill=False, color='green', linewidth=2))
        ax.annotate(f'$h_{step}$', xy=(cx, 1.2), fontsize=12, ha='center', va='center')

        # Output arrow and label above the state.
        ax.arrow(cx, 1.5, 0, 0.5, head_width=0.1, head_length=0.1, fc='red', ec='red')
        ax.annotate(f'$y_{step}$', xy=(cx, 2.3), fontsize=14, ha='center')

        # Recurrent connection to the next column (none after the last).
        if step < n_steps - 1:
            ax.arrow(cx + 0.35, 1.2, 1.3, 0, head_width=0.1, head_length=0.1, fc='green', ec='green')

    ax.set_xlim(-1, 10)
    ax.set_ylim(-0.5, 3)
    ax.set_aspect('equal')
    ax.axis('off')
    ax.set_title('RNN展开图', fontsize=14)

    plt.show()

visualize_rnn_unroll()

梯度问题

梯度消失/爆炸

# Demonstrate vanishing gradients in a linear recurrence.
def gradient_flow_rnn(seq_len, hidden_size=100):
    """Simulate backpropagating a gradient through `seq_len` steps of a
    linear recurrence with a random weight matrix.

    Args:
        seq_len: number of backward steps to simulate.
        hidden_size: dimension of the (square) recurrent weight matrix.

    Returns:
        (gradient_norms, max_eigenvalue): per-step gradient norms and the
        largest absolute eigenvalue of W, whose distance from 1 governs
        whether the gradient decays or explodes.
    """
    # Random recurrent weights, deliberately scaled down.
    W = np.random.randn(hidden_size, hidden_size) * 0.5

    # Spectral radius of W predicts vanishing (<1) vs exploding (>1).
    max_eigenvalue = np.max(np.abs(np.linalg.eigvals(W)))

    grad = np.ones(hidden_size)
    gradient_norms = []
    for _ in range(seq_len):
        grad = W.T @ grad  # one backward step through the recurrence
        gradient_norms.append(np.linalg.norm(grad))

    return gradient_norms, max_eigenvalue

# Compare gradient propagation for three recurrent-weight scales: below,
# at, and above the stability boundary.
fig, ax = plt.subplots(figsize=(10, 6))

for scale in [0.5, 1.0, 1.5]:
    np.random.seed(42)  # same base matrix for every scale
    W = np.random.randn(100, 100) * scale

    grad = np.ones(100)
    # The 0.9 factor approximates the tanh derivative on the backward pass.
    norms = [np.linalg.norm(grad := W.T @ grad * 0.9) for _ in range(50)]

    ax.plot(norms, label=f'W scale={scale}')

ax.set_xlabel('时间步')
ax.set_ylabel('梯度范数')
ax.set_title('RNN梯度消失/爆炸')
ax.legend()
ax.set_yscale('log')  # log scale makes exponential decay/growth linear
ax.grid(True, alpha=0.3)
plt.show()

LSTM

结构

        遗忘门      输入门      输出门
          ↓          ↓          ↓
    f_t = σ(...)  i_t = σ(...)  o_t = σ(...)
          ↓          ↓          ↓
    c_t = f_t⊙c_{t-1} + i_t⊙tanh(...)
                     ↓
               h_t = o_t⊙tanh(c_t)
class LSTM:
    """A from-scratch LSTM layer (forward pass only).

    Gate equations, with [x_t, h_{t-1}] the concatenated input:
        f_t = sigmoid(Wf [x,h] + bf)    forget gate
        i_t = sigmoid(Wi [x,h] + bi)    input gate
        g_t = tanh(Wc [x,h] + bc)       candidate cell
        o_t = sigmoid(Wo [x,h] + bo)    output gate
        c_t = f_t * c_{t-1} + i_t * g_t
        h_t = o_t * tanh(c_t)
    """

    def __init__(self, input_size, hidden_size):
        self.hidden_size = hidden_size

        # Each gate has its own weight matrix acting on [x_t, h_{t-1}].
        combined = input_size + hidden_size
        scale = np.sqrt(2.0 / combined)
        self.Wf = np.random.randn(combined, hidden_size) * scale
        self.Wi = np.random.randn(combined, hidden_size) * scale
        self.Wc = np.random.randn(combined, hidden_size) * scale
        self.Wo = np.random.randn(combined, hidden_size) * scale

        self.bf = np.zeros(hidden_size)
        self.bi = np.zeros(hidden_size)
        self.bc = np.zeros(hidden_size)
        self.bo = np.zeros(hidden_size)

    def sigmoid(self, x):
        # Clip the argument so exp() cannot overflow for extreme inputs.
        return 1 / (1 + np.exp(-np.clip(x, -500, 500)))

    def forward(self, X, h0=None, c0=None):
        """Run the LSTM over a sequence.

        Args:
            X: input of shape (seq_len, batch_size, input_size).
            h0, c0: optional initial hidden/cell states of shape
                (batch_size, hidden_size); default to zeros.

        Returns:
            (outputs, h, c): stacked hidden states of shape
            (seq_len, batch_size, hidden_size) plus the final h and c.
        """
        seq_len, batch_size, _ = X.shape

        h = np.zeros((batch_size, self.hidden_size)) if h0 is None else h0
        c = np.zeros((batch_size, self.hidden_size)) if c0 is None else c0

        outputs = []
        for step in range(seq_len):
            # Single concatenated vector drives all four gates.
            xh = np.concatenate([X[step], h], axis=1)

            f = self.sigmoid(xh @ self.Wf + self.bf)  # forget gate
            i = self.sigmoid(xh @ self.Wi + self.bi)  # input gate
            g = np.tanh(xh @ self.Wc + self.bc)       # candidate cell
            o = self.sigmoid(xh @ self.Wo + self.bo)  # output gate

            # Blend retained memory with the gated candidate, then expose
            # a squashed, gated view of the cell as the hidden state.
            c = f * c + i * g
            h = o * np.tanh(c)

            outputs.append(h)

        return np.array(outputs), h, c

# Smoke test: shapes of LSTM outputs for a random sequence batch.
lstm = LSTM(input_size=10, hidden_size=20)
x = np.random.randn(15, 3, 10)

outputs, h_final, c_final = lstm.forward(x)

for label, shape in (('LSTM输出形状', outputs.shape),
                     ('最终隐藏状态', h_final.shape),
                     ('最终细胞状态', c_final.shape)):
    print(f"{label}: {shape}")

门控机制可视化

def visualize_lstm_gates():
    """Plot hand-crafted gate activations to illustrate what each LSTM
    gate does: forgetting, writing, and exposing memory."""
    seq_len = 20
    t = np.arange(seq_len)

    # Forget gate: mostly keep memory, but drop it around steps 5-7.
    forget_gate = 0.9 * np.ones(seq_len)
    forget_gate[5:8] = 0.1

    # Input gate: mostly ignore input, but write around steps 10-12.
    input_gate = 0.1 * np.ones(seq_len)
    input_gate[10:13] = 0.9

    # Output gate: a smooth oscillation for visual variety.
    output_gate = 0.5 + 0.3 * np.sin(t / 3)

    fig, axes = plt.subplots(3, 1, figsize=(12, 8))

    # (values, line style, fill kwargs, y label) per panel.
    specs = [
        (forget_gate, 'b-', {}, '遗忘门 $f_t$'),
        (input_gate, 'g-', {'color': 'green'}, '输入门 $i_t$'),
        (output_gate, 'r-', {'color': 'red'}, '输出门 $o_t$'),
    ]
    for idx, (values, line_style, fill_kw, label) in enumerate(specs):
        ax = axes[idx]
        ax.plot(t, values, line_style, linewidth=2)
        ax.fill_between(t, 0, values, alpha=0.3, **fill_kw)
        ax.set_ylabel(label)
        ax.set_ylim(0, 1.1)
        ax.grid(True, alpha=0.3)
        if idx == 0:
            ax.set_title('LSTM门控值示例')
        if idx == 2:
            ax.set_xlabel('时间步')

    plt.tight_layout()
    plt.show()

visualize_lstm_gates()

GRU

结构

GRU是LSTM的简化版本,只有两个门:

\(z_t = \sigma(W_z [h_{t-1}, x_t])\) \(r_t = \sigma(W_r [h_{t-1}, x_t])\) \(\tilde{h}_t = \tanh(W [r_t \odot h_{t-1}, x_t])\) \(h_t = (1-z_t) \odot h_{t-1} + z_t \odot \tilde{h}_t\)

class GRU:
    """A from-scratch GRU layer (forward pass only).

    With [h_{t-1}, x_t] the concatenated state/input:
        z_t = sigmoid(Wz [h,x] + bz)        update gate
        r_t = sigmoid(Wr [h,x] + br)        reset gate
        g_t = tanh(Wh [r*h, x] + bh)        candidate state
        h_t = (1 - z_t) * h_{t-1} + z_t * g_t
    """

    def __init__(self, input_size, hidden_size):
        self.hidden_size = hidden_size

        combined = input_size + hidden_size
        scale = np.sqrt(2.0 / combined)

        # Update-gate parameters.
        self.Wz = np.random.randn(combined, hidden_size) * scale
        self.bz = np.zeros(hidden_size)

        # Reset-gate parameters.
        self.Wr = np.random.randn(combined, hidden_size) * scale
        self.br = np.zeros(hidden_size)

        # Candidate-state parameters.
        self.Wh = np.random.randn(combined, hidden_size) * scale
        self.bh = np.zeros(hidden_size)

    def sigmoid(self, x):
        # Clip the argument so exp() cannot overflow for extreme inputs.
        return 1 / (1 + np.exp(-np.clip(x, -500, 500)))

    def forward(self, X, h0=None):
        """Run the GRU over X of shape (seq_len, batch_size, input_size).

        Returns:
            (outputs, h): stacked hidden states of shape
            (seq_len, batch_size, hidden_size) and the final state.
        """
        seq_len, batch_size, _ = X.shape

        h = np.zeros((batch_size, self.hidden_size)) if h0 is None else h0

        outputs = []
        for step in range(seq_len):
            # NOTE: state comes FIRST in the concatenation here (unlike
            # the LSTM above, which puts the input first).
            hx = np.concatenate([h, X[step]], axis=1)

            z = self.sigmoid(hx @ self.Wz + self.bz)  # update gate
            r = self.sigmoid(hx @ self.Wr + self.br)  # reset gate

            # Candidate state uses the reset-scaled previous state.
            rx = np.concatenate([r * h, X[step]], axis=1)
            cand = np.tanh(rx @ self.Wh + self.bh)

            # Interpolate between the old state and the candidate.
            h = (1 - z) * h + z * cand
            outputs.append(h)

        return np.array(outputs), h

# Smoke test: shapes of GRU outputs for a random sequence batch.
gru = GRU(input_size=10, hidden_size=20)
x = np.random.randn(15, 3, 10)

outputs, h_final = gru.forward(x)

for label, shape in (('GRU输出形状', outputs.shape),
                     ('最终隐藏状态', h_final.shape)):
    print(f"{label}: {shape}")

RNN vs LSTM vs GRU

模型 参数量 长期依赖 训练速度
RNN 最少 弱 快
LSTM 最多 强 慢
GRU 中等 较强 较快

双向RNN

class BidirectionalRNN:
    """Two SimpleRNNs reading the sequence in opposite directions; their
    per-step outputs are concatenated along the feature axis."""

    def __init__(self, input_size, hidden_size):
        # Each direction has independent parameters; output dim == hidden dim.
        self.forward_rnn = SimpleRNN(input_size, hidden_size, hidden_size)
        self.backward_rnn = SimpleRNN(input_size, hidden_size, hidden_size)

    def forward(self, X):
        """X: (seq_len, batch, input) -> (seq_len, batch, 2 * hidden)."""
        fwd, _, _ = self.forward_rnn.forward(X)

        # Run the second RNN on the time-reversed sequence, then flip its
        # outputs back so both directions align step-for-step.
        bwd, _, _ = self.backward_rnn.forward(X[::-1])
        bwd = bwd[::-1]

        return np.concatenate([fwd, bwd], axis=-1)

# Smoke test: the feature dimension doubles (20 -> 40) for two directions.
bi_rnn = BidirectionalRNN(input_size=10, hidden_size=20)
x = np.random.randn(15, 3, 10)

output = bi_rnn.forward(x)
print(f"双向RNN输出形状: {output.shape}")  # (15, 3, 40)

PyTorch实现

try:
    import torch
    import torch.nn as nn

    # The same three recurrent layers via PyTorch's built-in modules.
    rnn = nn.RNN(input_size=10, hidden_size=20, num_layers=2, batch_first=True)
    lstm = nn.LSTM(input_size=10, hidden_size=20, num_layers=2,
                   batch_first=True, bidirectional=True)
    gru = nn.GRU(input_size=10, hidden_size=20, num_layers=2, batch_first=True)

    # batch_first=True means the layout is (batch, seq_len, features).
    x = torch.randn(3, 15, 10)

    out_rnn, h_rnn = rnn(x)
    out_lstm, (h_lstm, c_lstm) = lstm(x)  # LSTM also returns the cell state
    out_gru, h_gru = gru(x)

    print("PyTorch RNN:")
    print(f"  RNN输出: {out_rnn.shape}")
    print(f"  LSTM输出: {out_lstm.shape} (双向)")
    print(f"  GRU输出: {out_gru.shape}")

    class SequenceClassifier(nn.Module):
        """Text classifier: embedding -> BiLSTM -> dropout -> linear head."""

        def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes):
            super().__init__()
            self.embedding = nn.Embedding(vocab_size, embed_dim)
            self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True,
                                bidirectional=True)
            self.fc = nn.Linear(hidden_dim * 2, num_classes)
            self.dropout = nn.Dropout(0.5)

        def forward(self, x):
            embedded = self.embedding(x)
            lstm_out, (h_n, c_n) = self.lstm(embedded)
            # h_n[-2] / h_n[-1] hold the last forward / backward layer
            # states; concatenating them summarises the whole sequence.
            hidden = torch.cat([h_n[-2], h_n[-1]], dim=1)
            return self.fc(self.dropout(hidden))

    model = SequenceClassifier(vocab_size=10000, embed_dim=128,
                               hidden_dim=256, num_classes=2)
    print(f"\n序列分类模型参数: {sum(p.numel() for p in model.parameters()):,}")

except ImportError:
    print("PyTorch未安装")

常见问题

Q1: RNN和LSTM的主要区别?

LSTM通过门控机制解决了长期依赖问题,能学习更长的序列模式。

Q2: 什么时候用GRU而不是LSTM?

  • 数据量小:GRU参数少
  • 需要快速训练:GRU更快
  • 性能相近时:首选GRU

Q3: 如何处理变长序列?

使用padding + mask或pack_padded_sequence。

Q4: 为什么Transformer取代了RNN?

  • 并行计算能力
  • 更好的长程依赖建模
  • 更易于训练

总结

模型 门数 特点
RNN 0 简单、梯度问题
LSTM 3 长期记忆
GRU 2 LSTM简化版

参考资料

  • Hochreiter, S. & Schmidhuber, J. (1997). “Long Short-Term Memory”
  • Cho, K. et al. (2014). “Learning Phrase Representations using RNN Encoder-Decoder”
  • Goodfellow, I. et al. “Deep Learning” - Chapter 10
  • Understanding LSTM Networks (colah’s blog)

版权声明: 如无特别声明,本文版权归 sshipanoo 所有,转载请注明本文链接。

(采用 CC BY-NC-SA 4.0 许可协议进行授权)

本文标题:《 机器学习基础系列——循环神经网络 》

本文链接:http://localhost:3015/ai/%E5%BE%AA%E7%8E%AF%E7%A5%9E%E7%BB%8F%E7%BD%91%E7%BB%9C.html

本文最后一次更新距今已有一段时间,文章中的某些内容可能已过时!