The Encoder-Decoder Architecture in Detail
Introduction
The Transformer is a landmark architecture in modern deep learning: it discards the recurrent structure of RNNs and is built entirely on attention mechanisms, and it forms the foundation of large language models such as GPT and BERT.
Transformer Overview
Overall Architecture
Input (source sequence)                 → [Encoder] → encoded representation
                                                           ↓
Output (target sequence, shifted right) → [Decoder] → predictions
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(42)
def visualize_transformer_architecture():
"""可视化Transformer架构"""
fig, ax = plt.subplots(figsize=(14, 10))
# Encoder
encoder_x = 3
ax.add_patch(plt.Rectangle((encoder_x-1, 0), 2, 8, fill=False, edgecolor='blue', linewidth=2))
ax.text(encoder_x, 8.3, 'Encoder', ha='center', fontsize=12, fontweight='bold')
# Encoder components
components = ['Input\nEmbedding', 'Positional\nEncoding', 'Multi-Head\nAttention',
'Add & Norm', 'Feed\nForward', 'Add & Norm']
for i, comp in enumerate(components):
y = 0.5 + i * 1.2
ax.add_patch(plt.Rectangle((encoder_x-0.8, y), 1.6, 0.9,
facecolor='lightblue', edgecolor='blue'))
ax.text(encoder_x, y+0.45, comp, ha='center', va='center', fontsize=8)
# Decoder
decoder_x = 8
ax.add_patch(plt.Rectangle((decoder_x-1, 0), 2, 10, fill=False, edgecolor='green', linewidth=2))
ax.text(decoder_x, 10.3, 'Decoder', ha='center', fontsize=12, fontweight='bold')
# Decoder components
dec_components = ['Output\nEmbedding', 'Positional\nEncoding', 'Masked\nMulti-Head Attn',
'Add & Norm', 'Cross\nAttention', 'Add & Norm',
'Feed\nForward', 'Add & Norm']
for i, comp in enumerate(dec_components):
y = 0.5 + i * 1.2
ax.add_patch(plt.Rectangle((decoder_x-0.8, y), 1.6, 0.9,
facecolor='lightgreen', edgecolor='green'))
ax.text(decoder_x, y+0.45, comp, ha='center', va='center', fontsize=8)
# Cross attention arrow
ax.annotate('', xy=(decoder_x-0.8, 5.5), xytext=(encoder_x+0.8, 5.5),
arrowprops=dict(arrowstyle='->', color='red', lw=2))
ax.set_xlim(0, 12)
ax.set_ylim(-1, 12)
ax.axis('off')
    ax.set_title('Transformer Architecture', fontsize=14)
plt.tight_layout()
plt.show()
visualize_transformer_architecture()
Core Components
Positional Encoding
def positional_encoding(max_len, d_model):
"""正弦位置编码"""
pe = np.zeros((max_len, d_model))
position = np.arange(max_len)[:, np.newaxis]
div_term = np.exp(np.arange(0, d_model, 2) * -(np.log(10000.0) / d_model))
pe[:, 0::2] = np.sin(position * div_term)
pe[:, 1::2] = np.cos(position * div_term)
return pe
# Generate the positional encoding matrix
pe = positional_encoding(100, 512)
print(f"位置编码形状: {pe.shape}")
Multi-Head Attention
def softmax(x, axis=-1):
exp_x = np.exp(x - np.max(x, axis=axis, keepdims=True))
return exp_x / np.sum(exp_x, axis=axis, keepdims=True)
class MultiHeadAttention:
"""多头注意力"""
def __init__(self, d_model, n_heads):
self.d_model = d_model
self.n_heads = n_heads
self.d_k = d_model // n_heads
scale = np.sqrt(2.0 / d_model)
self.W_q = np.random.randn(d_model, d_model) * scale
self.W_k = np.random.randn(d_model, d_model) * scale
self.W_v = np.random.randn(d_model, d_model) * scale
self.W_o = np.random.randn(d_model, d_model) * scale
def split_heads(self, x, batch_size):
x = x.reshape(batch_size, -1, self.n_heads, self.d_k)
return x.transpose(0, 2, 1, 3)
def forward(self, Q, K, V, mask=None):
batch_size = Q.shape[0]
Q = Q @ self.W_q
K = K @ self.W_k
V = V @ self.W_v
Q = self.split_heads(Q, batch_size)
K = self.split_heads(K, batch_size)
V = self.split_heads(V, batch_size)
scores = np.matmul(Q, K.transpose(0, 1, 3, 2)) / np.sqrt(self.d_k)
if mask is not None:
scores = scores + (mask * -1e9)
attn_weights = softmax(scores, axis=-1)
context = np.matmul(attn_weights, V)
context = context.transpose(0, 2, 1, 3).reshape(batch_size, -1, self.d_model)
return context @ self.W_o, attn_weights
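A minimal shape check for the attention module defined above (random inputs, so only the tensor plumbing is verified):
# Test: self-attention on a random batch
mha = MultiHeadAttention(d_model=512, n_heads=8)
q_in = np.random.randn(2, 10, 512)
out, weights = mha.forward(q_in, q_in, q_in)
print(f"Multi-head attention output: {out.shape}")   # (2, 10, 512)
print(f"Attention weights: {weights.shape}")          # (2, 8, 10, 10)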
Feed-Forward Network
class FeedForward:
"""位置前馈网络"""
def __init__(self, d_model, d_ff):
scale1 = np.sqrt(2.0 / d_model)
scale2 = np.sqrt(2.0 / d_ff)
self.W1 = np.random.randn(d_model, d_ff) * scale1
self.b1 = np.zeros(d_ff)
self.W2 = np.random.randn(d_ff, d_model) * scale2
self.b2 = np.zeros(d_model)
def forward(self, x):
# FFN(x) = max(0, xW1 + b1)W2 + b2
hidden = np.maximum(0, x @ self.W1 + self.b1) # ReLU
return hidden @ self.W2 + self.b2
# Test
ff = FeedForward(d_model=512, d_ff=2048)
x = np.random.randn(2, 10, 512)
output = ff.forward(x)
print(f"前馈网络输入: {x.shape}, 输出: {output.shape}")
Layer Normalization
class LayerNorm:
"""层归一化"""
def __init__(self, d_model, eps=1e-6):
self.gamma = np.ones(d_model)
self.beta = np.zeros(d_model)
self.eps = eps
def forward(self, x):
mean = np.mean(x, axis=-1, keepdims=True)
std = np.std(x, axis=-1, keepdims=True)
return self.gamma * (x - mean) / (std + self.eps) + self.beta
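A small sanity check: after LayerNorm each token's feature vector should have roughly zero mean and unit standard deviation (gamma=1, beta=0 at initialization).
# Test: per-token statistics before and after LayerNorm
ln = LayerNorm(d_model=512)
x_ln = np.random.randn(2, 10, 512) * 3 + 5
out_ln = ln.forward(x_ln)
print(f"Before: mean={x_ln[0, 0].mean():.3f}, std={x_ln[0, 0].std():.3f}")
print(f"After:  mean={out_ln[0, 0].mean():.3f}, std={out_ln[0, 0].std():.3f}")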
Encoder Implementation
class EncoderLayer:
"""Encoder层"""
def __init__(self, d_model, n_heads, d_ff, dropout=0.1):
self.mha = MultiHeadAttention(d_model, n_heads)
self.ff = FeedForward(d_model, d_ff)
self.norm1 = LayerNorm(d_model)
self.norm2 = LayerNorm(d_model)
        self.dropout = dropout  # noted for completeness; dropout is not applied in this forward-only NumPy demo
def forward(self, x, mask=None):
        # Self-attention + residual connection + layer normalization
attn_output, _ = self.mha.forward(x, x, x, mask)
x = self.norm1.forward(x + attn_output)
        # Feed-forward network + residual connection + layer normalization
ff_output = self.ff.forward(x)
x = self.norm2.forward(x + ff_output)
return x
class Encoder:
"""Transformer Encoder"""
def __init__(self, n_layers, d_model, n_heads, d_ff, max_len=5000):
self.layers = [EncoderLayer(d_model, n_heads, d_ff) for _ in range(n_layers)]
self.pe = positional_encoding(max_len, d_model)
def forward(self, x, mask=None):
seq_len = x.shape[1]
        # Add positional encoding
x = x + self.pe[:seq_len]
for layer in self.layers:
x = layer.forward(x, mask)
return x
# Test
encoder = Encoder(n_layers=6, d_model=512, n_heads=8, d_ff=2048)
x = np.random.randn(2, 20, 512) # batch=2, seq=20, d_model=512
encoded = encoder.forward(x)
print(f"Encoder输入: {x.shape}")
print(f"Encoder输出: {encoded.shape}")
Decoder Implementation
class DecoderLayer:
"""Decoder层"""
def __init__(self, d_model, n_heads, d_ff):
self.masked_mha = MultiHeadAttention(d_model, n_heads)
self.cross_mha = MultiHeadAttention(d_model, n_heads)
self.ff = FeedForward(d_model, d_ff)
self.norm1 = LayerNorm(d_model)
self.norm2 = LayerNorm(d_model)
self.norm3 = LayerNorm(d_model)
def forward(self, x, encoder_output, look_ahead_mask=None, padding_mask=None):
        # Masked self-attention
attn1, _ = self.masked_mha.forward(x, x, x, look_ahead_mask)
x = self.norm1.forward(x + attn1)
        # Cross-attention over the encoder output
attn2, _ = self.cross_mha.forward(x, encoder_output, encoder_output, padding_mask)
x = self.norm2.forward(x + attn2)
        # Feed-forward network
ff_output = self.ff.forward(x)
x = self.norm3.forward(x + ff_output)
return x
class Decoder:
"""Transformer Decoder"""
def __init__(self, n_layers, d_model, n_heads, d_ff, max_len=5000):
self.layers = [DecoderLayer(d_model, n_heads, d_ff) for _ in range(n_layers)]
self.pe = positional_encoding(max_len, d_model)
def forward(self, x, encoder_output, look_ahead_mask=None, padding_mask=None):
seq_len = x.shape[1]
x = x + self.pe[:seq_len]
for layer in self.layers:
x = layer.forward(x, encoder_output, look_ahead_mask, padding_mask)
return x
# Test
decoder = Decoder(n_layers=6, d_model=512, n_heads=8, d_ff=2048)
tgt = np.random.randn(2, 15, 512)
decoded = decoder.forward(tgt, encoded)
print(f"Decoder输入: {tgt.shape}")
print(f"Decoder输出: {decoded.shape}")
The Complete Transformer
class Transformer:
"""完整Transformer"""
def __init__(self, src_vocab_size, tgt_vocab_size, d_model=512,
n_heads=8, n_layers=6, d_ff=2048, max_len=5000):
self.d_model = d_model
# Embeddings
self.src_embedding = np.random.randn(src_vocab_size, d_model) * 0.02
self.tgt_embedding = np.random.randn(tgt_vocab_size, d_model) * 0.02
# Encoder & Decoder
self.encoder = Encoder(n_layers, d_model, n_heads, d_ff, max_len)
self.decoder = Decoder(n_layers, d_model, n_heads, d_ff, max_len)
# Output projection
self.output_projection = np.random.randn(d_model, tgt_vocab_size) * 0.02
def create_look_ahead_mask(self, seq_len):
"""创建因果mask"""
return np.triu(np.ones((seq_len, seq_len)), k=1)
def encode(self, src, src_mask=None):
# Embedding
x = self.src_embedding[src] * np.sqrt(self.d_model)
return self.encoder.forward(x, src_mask)
def decode(self, tgt, encoder_output, look_ahead_mask=None, padding_mask=None):
# Embedding
x = self.tgt_embedding[tgt] * np.sqrt(self.d_model)
return self.decoder.forward(x, encoder_output, look_ahead_mask, padding_mask)
def forward(self, src, tgt):
# Encode
encoder_output = self.encode(src)
# Create mask
look_ahead_mask = self.create_look_ahead_mask(tgt.shape[1])
# Decode
decoder_output = self.decode(tgt, encoder_output, look_ahead_mask)
# Project to vocab
logits = decoder_output @ self.output_projection
return logits
# Test
transformer = Transformer(
src_vocab_size=10000,
tgt_vocab_size=10000,
d_model=512,
n_heads=8,
n_layers=6
)
src = np.random.randint(0, 10000, (2, 20))  # source sequence
tgt = np.random.randint(0, 10000, (2, 15))  # target sequence
logits = transformer.forward(src, tgt)
print(f"源序列: {src.shape}")
print(f"目标序列: {tgt.shape}")
print(f"输出logits: {logits.shape}")
Training Techniques
Label Smoothing
def label_smoothing(targets, vocab_size, smoothing=0.1):
"""标签平滑"""
n_class = vocab_size
one_hot = np.zeros((targets.shape[0], n_class))
one_hot[np.arange(targets.shape[0]), targets] = 1
smooth_labels = one_hot * (1 - smoothing) + smoothing / n_class
return smooth_labels
# Example
targets = np.array([1, 5, 3])
smooth = label_smoothing(targets, vocab_size=10, smoothing=0.1)
print("原始标签: ", targets)
print("平滑后标签 (部分):")
print(smooth[:, :6])
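The smoothed distribution is then used as the target of a cross-entropy loss. A minimal sketch with placeholder logits (the log-softmax is computed by hand):
# Cross-entropy against the smoothed label distribution
demo_logits = np.random.randn(3, 10)
log_probs = demo_logits - np.log(np.exp(demo_logits).sum(axis=-1, keepdims=True))
loss = -(smooth * log_probs).sum(axis=-1).mean()
print(f"Smoothed cross-entropy loss: {loss:.4f}")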
Learning Rate Schedule
def transformer_lr_schedule(step, d_model, warmup_steps=4000):
"""Transformer学习率调度"""
arg1 = step ** -0.5
arg2 = step * (warmup_steps ** -1.5)
return (d_model ** -0.5) * min(arg1, arg2)
# Visualization
steps = np.arange(1, 100000)
lrs = [transformer_lr_schedule(s, 512, 4000) for s in steps]
plt.figure(figsize=(10, 5))
plt.plot(steps, lrs)
plt.xlabel('Training Step')
plt.ylabel('Learning Rate')
plt.title('Transformer Learning Rate Schedule')
plt.grid(True, alpha=0.3)
plt.show()
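The schedule rises linearly during warmup and then decays with the inverse square root of the step. A few concrete values for d_model=512 and warmup_steps=4000:
# Learning rate at selected steps
for s in [1, 1000, 4000, 20000, 100000]:
    print(f"step {s:>6}: lr = {transformer_lr_schedule(s, 512):.6f}")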
PyTorch Implementation
try:
import torch
import torch.nn as nn
class TransformerModel(nn.Module):
def __init__(self, src_vocab, tgt_vocab, d_model=512, n_heads=8,
n_enc_layers=6, n_dec_layers=6, d_ff=2048, dropout=0.1):
super().__init__()
self.d_model = d_model
# Embeddings
self.src_embed = nn.Embedding(src_vocab, d_model)
self.tgt_embed = nn.Embedding(tgt_vocab, d_model)
# Positional encoding
self.pos_encoder = PositionalEncoding(d_model, dropout)
# Transformer
self.transformer = nn.Transformer(
d_model=d_model,
nhead=n_heads,
num_encoder_layers=n_enc_layers,
num_decoder_layers=n_dec_layers,
dim_feedforward=d_ff,
dropout=dropout,
batch_first=True
)
# Output projection
self.fc_out = nn.Linear(d_model, tgt_vocab)
def forward(self, src, tgt, src_mask=None, tgt_mask=None):
src = self.pos_encoder(self.src_embed(src) * (self.d_model ** 0.5))
tgt = self.pos_encoder(self.tgt_embed(tgt) * (self.d_model ** 0.5))
output = self.transformer(src, tgt, src_mask=src_mask, tgt_mask=tgt_mask)
return self.fc_out(output)
class PositionalEncoding(nn.Module):
def __init__(self, d_model, dropout=0.1, max_len=5000):
super().__init__()
self.dropout = nn.Dropout(p=dropout)
pe = torch.zeros(max_len, d_model)
position = torch.arange(0, max_len).unsqueeze(1).float()
div_term = torch.exp(torch.arange(0, d_model, 2).float() *
-(np.log(10000.0) / d_model))
pe[:, 0::2] = torch.sin(position * div_term)
pe[:, 1::2] = torch.cos(position * div_term)
pe = pe.unsqueeze(0)
self.register_buffer('pe', pe)
def forward(self, x):
x = x + self.pe[:, :x.size(1)]
return self.dropout(x)
    # Test
model = TransformerModel(src_vocab=10000, tgt_vocab=10000)
src = torch.randint(0, 10000, (4, 20))
tgt = torch.randint(0, 10000, (4, 15))
output = model(src, tgt)
print("PyTorch Transformer:")
print(f" 源序列: {src.shape}")
print(f" 目标序列: {tgt.shape}")
print(f" 输出: {output.shape}")
print(f" 参数量: {sum(p.numel() for p in model.parameters()):,}")
except ImportError:
print("PyTorch未安装")
Transformer Variants
Encoder-Only (BERT-style)
# BERT architecture (encoder only)
bert_structure = """
BERT (Bidirectional Encoder Representations from Transformers)
├── Token Embedding
├── Position Embedding
├── Segment Embedding
└── N × Encoder Layers
├── Multi-Head Self-Attention
├── Add & Norm
├── Feed Forward
└── Add & Norm
"""
print(bert_structure)
Decoder-Only (GPT-style)
# GPT architecture (decoder only)
gpt_structure = """
GPT (Generative Pre-trained Transformer)
├── Token Embedding
├── Position Embedding
└── N × Decoder Layers (Masked)
├── Masked Multi-Head Self-Attention
├── Add & Norm
├── Feed Forward
└── Add & Norm
"""
print(gpt_structure)
FAQ
Q1: Why LayerNorm instead of BatchNorm?
LayerNorm normalizes each token's feature vector independently of the batch, so it does not rely on batch statistics and handles variable-length sequences well.
Q2: Why is positional encoding needed?
Self-attention is permutation-equivariant and has no built-in notion of token order, so position information must be injected explicitly.
Q3: What is the difference between cross-attention and self-attention?
- Self-attention: Q, K, and V all come from the same sequence
- Cross-attention: Q comes from the Decoder, while K and V come from the Encoder output (see the sketch below)
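A short sketch with the NumPy MultiHeadAttention class defined earlier (random weights, shapes only):
# Self-attention vs. cross-attention
attn = MultiHeadAttention(d_model=512, n_heads=8)
dec_states = np.random.randn(2, 15, 512)   # decoder-side hidden states
enc_states = np.random.randn(2, 20, 512)   # encoder output
self_out, _ = attn.forward(dec_states, dec_states, dec_states)    # Q = K = V from one sequence
cross_out, _ = attn.forward(dec_states, enc_states, enc_states)   # Q from decoder, K/V from encoder
print(f"Self-attention output:  {self_out.shape}")    # (2, 15, 512)
print(f"Cross-attention output: {cross_out.shape}")   # (2, 15, 512)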
Q4: Why does the Transformer outperform RNNs?
- Computation across the sequence is fully parallel (no sequential recurrence)
- Long-range dependencies are modeled directly, with a constant path length between any two positions
- Better gradient flow
Summary
| Component | Role |
|---|---|
| Self-attention | Captures dependencies within a sequence |
| Cross-attention | Connects the Encoder and Decoder |
| Positional encoding | Injects position information |
| Residual connections | Ease gradient flow in deep stacks |
| LayerNorm | Stabilizes training |
References
- Vaswani, A. et al. (2017). "Attention Is All You Need"
- The Illustrated Transformer (Jay Alammar)
- The Annotated Transformer (Harvard NLP)
- Hugging Face Transformers library