RNN、LSTM、GRU与序列建模
前言
循环神经网络(RNN)是处理序列数据的专用架构,通过隐藏状态在时间步之间传递信息。本文介绍RNN及其变体LSTM和GRU。
为什么需要RNN
序列数据的特点
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(42)

# Three flavours of sequence data: text, a numeric time series, an audio wave.
sequences = {
    '文本': ['今', '天', '天', '气', '很', '好'],
    '时间序列': [1.2, 1.5, 1.3, 1.8, 2.1, 2.0],
    '音频': np.sin(np.linspace(0, 4*np.pi, 100)),
}

fig, (ax_text, ax_ts, ax_audio) = plt.subplots(1, 3, figsize=(15, 4))

# Text: place each character along the x axis.
for pos, ch in enumerate(sequences['文本']):
    ax_text.text(pos, 0.5, ch, fontsize=20, ha='center')
ax_text.set_xlim(-0.5, 5.5)
ax_text.set_ylim(0, 1)
ax_text.set_title('文本序列')
ax_text.axis('off')

# Time series: values against their step index.
ax_ts.plot(sequences['时间序列'], 'bo-', markersize=10)
ax_ts.set_title('时间序列')
ax_ts.set_xlabel('时间步')
ax_ts.set_ylabel('值')
ax_ts.grid(True, alpha=0.3)

# Audio: a sampled sine wave stands in for a waveform.
ax_audio.plot(sequences['音频'])
ax_audio.set_title('音频波形')
ax_audio.set_xlabel('采样点')
ax_audio.set_ylabel('振幅')
ax_audio.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()
全连接网络的局限
| 问题 | 说明 |
|---|---|
| 固定长度输入 | 无法处理变长序列 |
| 无位置信息 | 忽略顺序关系 |
| 参数不共享 | 不同位置学习独立特征 |
基础RNN
结构
\(h_t = \tanh(W_{hh}h_{t-1} + W_{xh}x_t + b_h)\) \(y_t = W_{hy}h_t + b_y\)
class SimpleRNN:
    """A minimal vanilla RNN (tanh activation) implemented with NumPy.

    Recurrence:
        h_t = tanh(x_t @ Wxh + h_{t-1} @ Whh + bh)
        y_t = h_t @ Why + by
    """

    def __init__(self, input_size, hidden_size, output_size):
        self.hidden_size = hidden_size
        # Xavier-style scaling keeps activations in a reasonable range.
        scale = np.sqrt(2.0 / (input_size + hidden_size))
        self.Wxh = np.random.randn(input_size, hidden_size) * scale
        self.Whh = np.random.randn(hidden_size, hidden_size) * scale
        self.Why = np.random.randn(hidden_size, output_size) * scale
        self.bh = np.zeros(hidden_size)
        self.by = np.zeros(output_size)

    def forward(self, X, h0=None):
        """Run the RNN over a whole sequence.

        X: array of shape (seq_len, batch_size, input_size).
        h0: optional initial hidden state of shape (batch_size, hidden_size);
            defaults to zeros.
        Returns (outputs, last_hidden, hidden_states): outputs has shape
        (seq_len, batch_size, output_size); hidden_states is a list of
        seq_len + 1 states, including the initial one.
        """
        seq_len, batch_size, _ = X.shape
        h = np.zeros((batch_size, self.hidden_size)) if h0 is None else h0
        outputs, hidden_states = [], [h]
        for step in range(seq_len):
            # One recurrence step followed by the readout projection.
            h = np.tanh(X[step] @ self.Wxh + h @ self.Whh + self.bh)
            hidden_states.append(h)
            outputs.append(h @ self.Why + self.by)
        return np.array(outputs), h, hidden_states
# Smoke test: check the forward pass produces the expected shapes.
rnn = SimpleRNN(input_size=10, hidden_size=20, output_size=5)
x = np.random.randn(15, 3, 10)  # 15 time steps, batch of 3, 10-dim input
seq_out, last_h, all_h = rnn.forward(x)
print(f"输入形状: {x.shape}")
print(f"输出形状: {seq_out.shape}")
print(f"最终隐藏状态形状: {last_h.shape}")
可视化RNN展开
def visualize_rnn_unroll():
    """Draw an RNN unrolled over five time steps: inputs, states, outputs."""
    fig, ax = plt.subplots(figsize=(14, 6))
    n_steps = 5
    for step in range(n_steps):
        col = step * 2  # x coordinate of this time step's column
        # Input label and the arrow feeding the hidden state.
        ax.annotate(f'$x_{step}$', xy=(col, 0), fontsize=14, ha='center')
        ax.arrow(col, 0.2, 0, 0.5, head_width=0.1, head_length=0.1, fc='blue', ec='blue')
        # Hidden state drawn as a circle.
        ax.add_patch(plt.Circle((col, 1.2), 0.3, fill=False, color='green', linewidth=2))
        ax.annotate(f'$h_{step}$', xy=(col, 1.2), fontsize=12, ha='center', va='center')
        # Output arrow and label.
        ax.arrow(col, 1.5, 0, 0.5, head_width=0.1, head_length=0.1, fc='red', ec='red')
        ax.annotate(f'$y_{step}$', xy=(col, 2.3), fontsize=14, ha='center')
        # Recurrent connection to the next step, except after the last one.
        if step < n_steps - 1:
            ax.arrow(col + 0.35, 1.2, 1.3, 0, head_width=0.1, head_length=0.1, fc='green', ec='green')
    ax.set_xlim(-1, 10)
    ax.set_ylim(-0.5, 3)
    ax.set_aspect('equal')
    ax.axis('off')
    ax.set_title('RNN展开图', fontsize=14)
    plt.show()

visualize_rnn_unroll()
梯度问题
梯度消失/爆炸
# 演示梯度消失
def gradient_flow_rnn(seq_len, hidden_size=100):
    """Simulate how a gradient's norm evolves under backprop through time.

    Repeatedly multiplies a gradient vector by W^T, mimicking the Jacobian
    chain in BPTT (the nonlinearity's derivative is ignored here).

    seq_len: number of time steps to propagate through.
    hidden_size: dimension of the square recurrent weight matrix.
    Returns (gradient_norms, max_eigenvalue): the norm after each step and
    the largest |eigenvalue| of W, which governs whether the norm shrinks
    (< 1, vanishing) or grows (> 1, exploding).
    """
    W = np.random.randn(hidden_size, hidden_size) * 0.5
    spectrum = np.abs(np.linalg.eigvals(W))
    largest = np.max(spectrum)
    grad = np.ones(hidden_size)
    norms = []
    for _ in range(seq_len):
        grad = W.T @ grad
        norms.append(np.linalg.norm(grad))
    return norms, largest
# Compare how the backpropagated gradient norm evolves for three weight
# scales; the 0.9 factor roughly stands in for the tanh derivative.
fig, ax = plt.subplots(figsize=(10, 6))
for scale in [0.5, 1.0, 1.5]:
    np.random.seed(42)
    W = np.random.randn(100, 100) * scale
    grad = np.ones(100)
    history = []
    for _ in range(50):
        grad = W.T @ grad * 0.9  # tanh导数约为0.9
        history.append(np.linalg.norm(grad))
    ax.plot(history, label=f'W scale={scale}')
ax.set_xlabel('时间步')
ax.set_ylabel('梯度范数')
ax.set_title('RNN梯度消失/爆炸')
ax.legend()
ax.set_yscale('log')
ax.grid(True, alpha=0.3)
plt.show()
LSTM
结构
遗忘门 输入门 输出门
↓ ↓ ↓
f_t = σ(...) i_t = σ(...) o_t = σ(...)
↓ ↓ ↓
c_t = f_t⊙c_{t-1} + i_t⊙tanh(...)
↓
h_t = o_t⊙tanh(c_t)
class LSTM:
    """A NumPy LSTM with forget/input/output gates and a cell state.

    All gate weights act on the concatenation [x_t, h_{t-1}].
    """

    def __init__(self, input_size, hidden_size):
        self.hidden_size = hidden_size
        scale = np.sqrt(2.0 / (input_size + hidden_size))
        concat = input_size + hidden_size
        self.Wf = np.random.randn(concat, hidden_size) * scale  # forget gate
        self.Wi = np.random.randn(concat, hidden_size) * scale  # input gate
        self.Wc = np.random.randn(concat, hidden_size) * scale  # candidate cell
        self.Wo = np.random.randn(concat, hidden_size) * scale  # output gate
        self.bf = np.zeros(hidden_size)
        self.bi = np.zeros(hidden_size)
        self.bc = np.zeros(hidden_size)
        self.bo = np.zeros(hidden_size)

    def sigmoid(self, x):
        # Clipping keeps exp() from overflowing on large-magnitude inputs.
        return 1 / (1 + np.exp(-np.clip(x, -500, 500)))

    def forward(self, X, h0=None, c0=None):
        """Process a sequence X of shape (seq_len, batch_size, input_size).

        h0, c0: optional initial hidden/cell states; default to zeros.
        Returns (outputs, h, c): the stacked per-step hidden states plus
        the final hidden and cell states.
        """
        seq_len, batch_size, _ = X.shape
        h = h0 if h0 is not None else np.zeros((batch_size, self.hidden_size))
        c = c0 if c0 is not None else np.zeros((batch_size, self.hidden_size))
        outputs = []
        for t in range(seq_len):
            zin = np.concatenate([X[t], h], axis=1)
            f = self.sigmoid(zin @ self.Wf + self.bf)     # how much of c to keep
            i = self.sigmoid(zin @ self.Wi + self.bi)     # how much to write
            candidate = np.tanh(zin @ self.Wc + self.bc)  # new cell content
            o = self.sigmoid(zin @ self.Wo + self.bo)     # how much to expose
            c = f * c + i * candidate
            h = o * np.tanh(c)
            outputs.append(h)
        return np.array(outputs), h, c
# Smoke test: verify LSTM output and state shapes.
lstm = LSTM(input_size=10, hidden_size=20)
x = np.random.randn(15, 3, 10)
seq_h, final_h, final_c = lstm.forward(x)
print(f"LSTM输出形状: {seq_h.shape}")
print(f"最终隐藏状态: {final_h.shape}")
print(f"最终细胞状态: {final_c.shape}")
门控机制可视化
def visualize_lstm_gates():
    """Plot hand-crafted gate activations to illustrate LSTM gating."""
    seq_len = 20
    t = np.arange(seq_len)
    # Synthetic traces: a mostly-open forget gate that briefly closes,
    # a mostly-closed input gate that briefly opens, and an oscillating
    # output gate.
    forget_gate = 0.9 * np.ones(seq_len)
    forget_gate[5:8] = 0.1
    input_gate = 0.1 * np.ones(seq_len)
    input_gate[10:13] = 0.9
    output_gate = 0.5 + 0.3 * np.sin(t / 3)
    fig, axes = plt.subplots(3, 1, figsize=(12, 8))
    traces = [
        (forget_gate, 'b-', None, '遗忘门 $f_t$'),
        (input_gate, 'g-', 'green', '输入门 $i_t$'),
        (output_gate, 'r-', 'red', '输出门 $o_t$'),
    ]
    for ax, (values, fmt, fill_color, label) in zip(axes, traces):
        ax.plot(t, values, fmt, linewidth=2)
        if fill_color is None:
            ax.fill_between(t, 0, values, alpha=0.3)
        else:
            ax.fill_between(t, 0, values, alpha=0.3, color=fill_color)
        ax.set_ylabel(label)
        ax.set_ylim(0, 1.1)
        ax.grid(True, alpha=0.3)
    axes[0].set_title('LSTM门控值示例')
    axes[2].set_xlabel('时间步')
    plt.tight_layout()
    plt.show()

visualize_lstm_gates()
GRU
结构
GRU是LSTM的简化版本,只有两个门:
\(z_t = \sigma(W_z [h_{t-1}, x_t])\) \(r_t = \sigma(W_r [h_{t-1}, x_t])\) \(\tilde{h}_t = \tanh(W [r_t \odot h_{t-1}, x_t])\) \(h_t = (1-z_t) \odot h_{t-1} + z_t \odot \tilde{h}_t\)
class GRU:
    """A NumPy GRU: update gate z and reset gate r over [h_{t-1}, x_t].

    Recurrence: h_t = (1 - z_t) * h_{t-1} + z_t * h~_t, where the
    candidate h~_t sees the history through r_t * h_{t-1}.
    """

    def __init__(self, input_size, hidden_size):
        self.hidden_size = hidden_size
        scale = np.sqrt(2.0 / (input_size + hidden_size))
        concat = input_size + hidden_size
        # Update gate: how much of the candidate replaces the old state.
        self.Wz = np.random.randn(concat, hidden_size) * scale
        self.bz = np.zeros(hidden_size)
        # Reset gate: how much history feeds the candidate.
        self.Wr = np.random.randn(concat, hidden_size) * scale
        self.br = np.zeros(hidden_size)
        # Candidate hidden state.
        self.Wh = np.random.randn(concat, hidden_size) * scale
        self.bh = np.zeros(hidden_size)

    def sigmoid(self, x):
        # Clipping keeps exp() from overflowing.
        return 1 / (1 + np.exp(-np.clip(x, -500, 500)))

    def forward(self, X, h0=None):
        """X: (seq_len, batch, input_size) → (stacked states, final h)."""
        seq_len, batch_size, _ = X.shape
        h = h0 if h0 is not None else np.zeros((batch_size, self.hidden_size))
        outputs = []
        for t in range(seq_len):
            joint = np.concatenate([h, X[t]], axis=1)
            z = self.sigmoid(joint @ self.Wz + self.bz)  # update gate
            r = self.sigmoid(joint @ self.Wr + self.br)  # reset gate
            gated = np.concatenate([r * h, X[t]], axis=1)
            h_hat = np.tanh(gated @ self.Wh + self.bh)   # candidate state
            h = (1 - z) * h + z * h_hat                  # interpolate old/new
            outputs.append(h)
        return np.array(outputs), h
# Smoke test: verify GRU output and state shapes.
gru = GRU(input_size=10, hidden_size=20)
x = np.random.randn(15, 3, 10)
states, final_state = gru.forward(x)
print(f"GRU输出形状: {states.shape}")
print(f"最终隐藏状态: {final_state.shape}")
RNN vs LSTM vs GRU
| 模型 | 参数量 | 长期依赖 | 训练速度 |
|---|---|---|---|
| RNN | 最少 | 差 | 快 |
| LSTM | 最多 | 好 | 慢 |
| GRU | 中等 | 好 | 中等 |
双向RNN
class BidirectionalRNN:
    """Runs one SimpleRNN forward in time and another backward, then
    concatenates their per-step outputs along the feature axis."""

    def __init__(self, input_size, hidden_size):
        self.forward_rnn = SimpleRNN(input_size, hidden_size, hidden_size)
        self.backward_rnn = SimpleRNN(input_size, hidden_size, hidden_size)

    def forward(self, X):
        """X: (seq_len, batch, input) → (seq_len, batch, 2 * hidden)."""
        fwd, _, _ = self.forward_rnn.forward(X)
        # Feed the time-reversed sequence, then flip the result back so
        # each step lines up with its forward counterpart.
        bwd, _, _ = self.backward_rnn.forward(X[::-1])
        return np.concatenate([fwd, bwd[::-1]], axis=-1)
# Smoke test: features from both directions are concatenated (20 + 20 = 40).
bi_rnn = BidirectionalRNN(input_size=10, hidden_size=20)
x = np.random.randn(15, 3, 10)
bi_out = bi_rnn.forward(x)
print(f"双向RNN输出形状: {bi_out.shape}")  # (15, 3, 40)
PyTorch实现
try:
    import torch
    import torch.nn as nn

    # Stock recurrent layers; batch_first puts the batch axis first.
    rnn = nn.RNN(input_size=10, hidden_size=20, num_layers=2, batch_first=True)
    lstm = nn.LSTM(input_size=10, hidden_size=20, num_layers=2,
                   batch_first=True, bidirectional=True)
    gru = nn.GRU(input_size=10, hidden_size=20, num_layers=2, batch_first=True)

    # Run all three on the same random batch and report output shapes.
    x = torch.randn(3, 15, 10)  # (batch, seq_len, features)
    out_rnn, h_rnn = rnn(x)
    out_lstm, (h_lstm, c_lstm) = lstm(x)
    out_gru, h_gru = gru(x)
    print("PyTorch RNN:")
    print(f" RNN输出: {out_rnn.shape}")
    print(f" LSTM输出: {out_lstm.shape} (双向)")
    print(f" GRU输出: {out_gru.shape}")

    # A small sequence classifier: embedding → BiLSTM → linear head.
    class SequenceClassifier(nn.Module):
        def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes):
            super().__init__()
            self.embedding = nn.Embedding(vocab_size, embed_dim)
            self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True,
                                bidirectional=True)
            self.fc = nn.Linear(hidden_dim * 2, num_classes)
            self.dropout = nn.Dropout(0.5)

        def forward(self, x):
            embedded = self.embedding(x)
            lstm_out, (h_n, c_n) = self.lstm(embedded)
            # Final hidden states of the two directions, concatenated.
            hidden = torch.cat([h_n[-2], h_n[-1]], dim=1)
            hidden = self.dropout(hidden)
            return self.fc(hidden)

    model = SequenceClassifier(vocab_size=10000, embed_dim=128,
                               hidden_dim=256, num_classes=2)
    print(f"\n序列分类模型参数: {sum(p.numel() for p in model.parameters()):,}")
except ImportError:
    print("PyTorch未安装")
常见问题
Q1: RNN和LSTM的主要区别?
LSTM通过门控机制解决了长期依赖问题,能学习更长的序列模式。
Q2: 什么时候用GRU而不是LSTM?
- 数据量小:GRU参数少
- 需要快速训练:GRU更快
- 性能相近时:首选GRU
Q3: 如何处理变长序列?
使用padding + mask或pack_padded_sequence。
Q4: 为什么Transformer取代了RNN?
- 并行计算能力
- 更好的长程依赖建模
- 更易于训练
总结
| 模型 | 门数 | 特点 |
|---|---|---|
| RNN | 0 | 简单、梯度问题 |
| LSTM | 3 | 长期记忆 |
| GRU | 2 | LSTM简化版 |
参考资料
- Hochreiter, S. & Schmidhuber, J. (1997). “Long Short-Term Memory”
- Cho, K. et al. (2014). “Learning Phrase Representations using RNN Encoder-Decoder”
- Goodfellow, I. et al. “Deep Learning” - Chapter 10
- Understanding LSTM Networks (colah’s blog)
版权声明: 如无特别声明,本文版权归 sshipanoo 所有,转载请注明本文链接。
(采用 CC BY-NC-SA 4.0 许可协议进行授权)
本文标题:《 机器学习基础系列——循环神经网络 》
本文链接:http://localhost:3015/ai/%E5%BE%AA%E7%8E%AF%E7%A5%9E%E7%BB%8F%E7%BD%91%E7%BB%9C.html
本文最后一次更新时间较早,文章中的某些内容可能已过时!