The Encoder-Decoder Architecture in Detail

Preface

The Transformer is a landmark architecture in modern deep learning. It drops the recurrent structure of RNNs and relies entirely on attention mechanisms, and it is the foundation of large language models such as GPT and BERT.


Transformer Overview

Overall Architecture

Source sequence        → [Encoder] → encoded representation
                                            ↓
Target (shifted right) → [Decoder] → predicted output

import numpy as np
import matplotlib.pyplot as plt

np.random.seed(42)

def visualize_transformer_architecture():
    """可视化Transformer架构"""
    
    fig, ax = plt.subplots(figsize=(14, 10))
    
    # Encoder
    encoder_x = 3
    ax.add_patch(plt.Rectangle((encoder_x-1, 0), 2, 8, fill=False, edgecolor='blue', linewidth=2))
    ax.text(encoder_x, 8.3, 'Encoder', ha='center', fontsize=12, fontweight='bold')
    
    # Encoder components
    components = ['Input\nEmbedding', 'Positional\nEncoding', 'Multi-Head\nAttention', 
                  'Add & Norm', 'Feed\nForward', 'Add & Norm']
    for i, comp in enumerate(components):
        y = 0.5 + i * 1.2
        ax.add_patch(plt.Rectangle((encoder_x-0.8, y), 1.6, 0.9, 
                                   facecolor='lightblue', edgecolor='blue'))
        ax.text(encoder_x, y+0.45, comp, ha='center', va='center', fontsize=8)
    
    # Decoder
    decoder_x = 8
    ax.add_patch(plt.Rectangle((decoder_x-1, 0), 2, 10, fill=False, edgecolor='green', linewidth=2))
    ax.text(decoder_x, 10.3, 'Decoder', ha='center', fontsize=12, fontweight='bold')
    
    # Decoder components
    dec_components = ['Output\nEmbedding', 'Positional\nEncoding', 'Masked\nMulti-Head Attn',
                      'Add & Norm', 'Cross\nAttention', 'Add & Norm', 
                      'Feed\nForward', 'Add & Norm']
    for i, comp in enumerate(dec_components):
        y = 0.5 + i * 1.2
        ax.add_patch(plt.Rectangle((decoder_x-0.8, y), 1.6, 0.9,
                                   facecolor='lightgreen', edgecolor='green'))
        ax.text(decoder_x, y+0.45, comp, ha='center', va='center', fontsize=8)
    
    # Cross attention arrow
    ax.annotate('', xy=(decoder_x-0.8, 5.5), xytext=(encoder_x+0.8, 5.5),
                arrowprops=dict(arrowstyle='->', color='red', lw=2))
    
    ax.set_xlim(0, 12)
    ax.set_ylim(-1, 12)
    ax.axis('off')
    ax.set_title('Transformer Architecture', fontsize=14)
    
    plt.tight_layout()
    plt.show()

visualize_transformer_architecture()

Core Components

Positional Encoding

def positional_encoding(max_len, d_model):
    """正弦位置编码"""
    pe = np.zeros((max_len, d_model))
    position = np.arange(max_len)[:, np.newaxis]
    div_term = np.exp(np.arange(0, d_model, 2) * -(np.log(10000.0) / d_model))
    
    pe[:, 0::2] = np.sin(position * div_term)
    pe[:, 1::2] = np.cos(position * div_term)
    
    return pe

# Generate positional encodings
pe = positional_encoding(100, 512)
print(f"位置编码形状: {pe.shape}")

Multi-Head Attention

def softmax(x, axis=-1):
    exp_x = np.exp(x - np.max(x, axis=axis, keepdims=True))
    return exp_x / np.sum(exp_x, axis=axis, keepdims=True)

class MultiHeadAttention:
    """多头注意力"""
    
    def __init__(self, d_model, n_heads):
        self.d_model = d_model
        self.n_heads = n_heads
        self.d_k = d_model // n_heads
        
        scale = np.sqrt(2.0 / d_model)
        self.W_q = np.random.randn(d_model, d_model) * scale
        self.W_k = np.random.randn(d_model, d_model) * scale
        self.W_v = np.random.randn(d_model, d_model) * scale
        self.W_o = np.random.randn(d_model, d_model) * scale
    
    def split_heads(self, x, batch_size):
        x = x.reshape(batch_size, -1, self.n_heads, self.d_k)
        return x.transpose(0, 2, 1, 3)
    
    def forward(self, Q, K, V, mask=None):
        batch_size = Q.shape[0]
        
        Q = Q @ self.W_q
        K = K @ self.W_k
        V = V @ self.W_v
        
        Q = self.split_heads(Q, batch_size)
        K = self.split_heads(K, batch_size)
        V = self.split_heads(V, batch_size)
        
        scores = np.matmul(Q, K.transpose(0, 1, 3, 2)) / np.sqrt(self.d_k)
        
        if mask is not None:
            scores = scores + (mask * -1e9)
        
        attn_weights = softmax(scores, axis=-1)
        context = np.matmul(attn_weights, V)
        
        context = context.transpose(0, 2, 1, 3).reshape(batch_size, -1, self.d_model)
        
        return context @ self.W_o, attn_weights
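
As with the other components, a quick shape check helps (a supplementary test; here Q, K, and V are the same tensor, i.e. self-attention):

# Test: self-attention (Q, K, V are all the same input)
mha = MultiHeadAttention(d_model=512, n_heads=8)
x = np.random.randn(2, 10, 512)
out, attn = mha.forward(x, x, x)
print(f"Multi-head attention input: {x.shape}, output: {out.shape}")
print(f"Attention weights: {attn.shape}")  # (batch, heads, seq, seq)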

Feed-Forward Network

class FeedForward:
    """位置前馈网络"""
    
    def __init__(self, d_model, d_ff):
        scale1 = np.sqrt(2.0 / d_model)
        scale2 = np.sqrt(2.0 / d_ff)
        
        self.W1 = np.random.randn(d_model, d_ff) * scale1
        self.b1 = np.zeros(d_ff)
        self.W2 = np.random.randn(d_ff, d_model) * scale2
        self.b2 = np.zeros(d_model)
    
    def forward(self, x):
        # FFN(x) = max(0, xW1 + b1)W2 + b2
        hidden = np.maximum(0, x @ self.W1 + self.b1)  # ReLU
        return hidden @ self.W2 + self.b2

# Test
ff = FeedForward(d_model=512, d_ff=2048)
x = np.random.randn(2, 10, 512)
output = ff.forward(x)
print(f"前馈网络输入: {x.shape}, 输出: {output.shape}")

Layer Normalization

class LayerNorm:
    """层归一化"""
    
    def __init__(self, d_model, eps=1e-6):
        self.gamma = np.ones(d_model)
        self.beta = np.zeros(d_model)
        self.eps = eps
    
    def forward(self, x):
        mean = np.mean(x, axis=-1, keepdims=True)
        std = np.std(x, axis=-1, keepdims=True)
        return self.gamma * (x - mean) / (std + self.eps) + self.beta
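
A quick supplementary check that each token ends up with roughly zero mean and unit variance:

# Test: per-token mean should be ~0 and std ~1 after normalization
ln = LayerNorm(d_model=512)
x = np.random.randn(2, 10, 512) * 3 + 5
normed = ln.forward(x)
print(f"LayerNorm output mean: {normed.mean(axis=-1)[0, 0]:.4f}, std: {normed.std(axis=-1)[0, 0]:.4f}")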

Encoder Implementation

class EncoderLayer:
    """Encoder层"""
    
    def __init__(self, d_model, n_heads, d_ff, dropout=0.1):
        self.mha = MultiHeadAttention(d_model, n_heads)
        self.ff = FeedForward(d_model, d_ff)
        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)
        self.dropout = dropout  # stored but not applied in this simplified NumPy version
    
    def forward(self, x, mask=None):
        # Self-attention + residual connection + layer normalization
        attn_output, _ = self.mha.forward(x, x, x, mask)
        x = self.norm1.forward(x + attn_output)
        
        # Feed-forward network + residual connection + layer normalization
        ff_output = self.ff.forward(x)
        x = self.norm2.forward(x + ff_output)
        
        return x


class Encoder:
    """Transformer Encoder"""
    
    def __init__(self, n_layers, d_model, n_heads, d_ff, max_len=5000):
        self.layers = [EncoderLayer(d_model, n_heads, d_ff) for _ in range(n_layers)]
        self.pe = positional_encoding(max_len, d_model)
    
    def forward(self, x, mask=None):
        seq_len = x.shape[1]
        
        # Add positional encodings
        x = x + self.pe[:seq_len]
        
        for layer in self.layers:
            x = layer.forward(x, mask)
        
        return x

# Test
encoder = Encoder(n_layers=6, d_model=512, n_heads=8, d_ff=2048)
x = np.random.randn(2, 20, 512)  # batch=2, seq=20, d_model=512

encoded = encoder.forward(x)
print(f"Encoder输入: {x.shape}")
print(f"Encoder输出: {encoded.shape}")

Decoder Implementation

class DecoderLayer:
    """Decoder层"""
    
    def __init__(self, d_model, n_heads, d_ff):
        self.masked_mha = MultiHeadAttention(d_model, n_heads)
        self.cross_mha = MultiHeadAttention(d_model, n_heads)
        self.ff = FeedForward(d_model, d_ff)
        
        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)
        self.norm3 = LayerNorm(d_model)
    
    def forward(self, x, encoder_output, look_ahead_mask=None, padding_mask=None):
        # Masked self-attention
        attn1, _ = self.masked_mha.forward(x, x, x, look_ahead_mask)
        x = self.norm1.forward(x + attn1)
        
        # Cross-attention over the encoder output
        attn2, _ = self.cross_mha.forward(x, encoder_output, encoder_output, padding_mask)
        x = self.norm2.forward(x + attn2)
        
        # Feed-forward network
        ff_output = self.ff.forward(x)
        x = self.norm3.forward(x + ff_output)
        
        return x


class Decoder:
    """Transformer Decoder"""
    
    def __init__(self, n_layers, d_model, n_heads, d_ff, max_len=5000):
        self.layers = [DecoderLayer(d_model, n_heads, d_ff) for _ in range(n_layers)]
        self.pe = positional_encoding(max_len, d_model)
    
    def forward(self, x, encoder_output, look_ahead_mask=None, padding_mask=None):
        seq_len = x.shape[1]
        x = x + self.pe[:seq_len]
        
        for layer in self.layers:
            x = layer.forward(x, encoder_output, look_ahead_mask, padding_mask)
        
        return x

# Test
decoder = Decoder(n_layers=6, d_model=512, n_heads=8, d_ff=2048)
tgt = np.random.randn(2, 15, 512)

decoded = decoder.forward(tgt, encoded)
print(f"Decoder输入: {tgt.shape}")
print(f"Decoder输出: {decoded.shape}")

The Full Transformer

class Transformer:
    """完整Transformer"""
    
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model=512, 
                 n_heads=8, n_layers=6, d_ff=2048, max_len=5000):
        
        self.d_model = d_model
        
        # Embeddings
        self.src_embedding = np.random.randn(src_vocab_size, d_model) * 0.02
        self.tgt_embedding = np.random.randn(tgt_vocab_size, d_model) * 0.02
        
        # Encoder & Decoder
        self.encoder = Encoder(n_layers, d_model, n_heads, d_ff, max_len)
        self.decoder = Decoder(n_layers, d_model, n_heads, d_ff, max_len)
        
        # Output projection
        self.output_projection = np.random.randn(d_model, tgt_vocab_size) * 0.02
    
    def create_look_ahead_mask(self, seq_len):
        """创建因果mask"""
        return np.triu(np.ones((seq_len, seq_len)), k=1)
    
    def encode(self, src, src_mask=None):
        # Embedding
        x = self.src_embedding[src] * np.sqrt(self.d_model)
        return self.encoder.forward(x, src_mask)
    
    def decode(self, tgt, encoder_output, look_ahead_mask=None, padding_mask=None):
        # Embedding
        x = self.tgt_embedding[tgt] * np.sqrt(self.d_model)
        return self.decoder.forward(x, encoder_output, look_ahead_mask, padding_mask)
    
    def forward(self, src, tgt):
        # Encode
        encoder_output = self.encode(src)
        
        # Create mask
        look_ahead_mask = self.create_look_ahead_mask(tgt.shape[1])
        
        # Decode
        decoder_output = self.decode(tgt, encoder_output, look_ahead_mask)
        
        # Project to vocab
        logits = decoder_output @ self.output_projection
        
        return logits

# Test
transformer = Transformer(
    src_vocab_size=10000,
    tgt_vocab_size=10000,
    d_model=512,
    n_heads=8,
    n_layers=6
)

src = np.random.randint(0, 10000, (2, 20))  # source sequence
tgt = np.random.randint(0, 10000, (2, 15))  # target sequence

logits = transformer.forward(src, tgt)
print(f"源序列: {src.shape}")
print(f"目标序列: {tgt.shape}")
print(f"输出logits: {logits.shape}")

Training Techniques

Label Smoothing

def label_smoothing(targets, vocab_size, smoothing=0.1):
    """标签平滑"""
    n_class = vocab_size
    one_hot = np.zeros((targets.shape[0], n_class))
    one_hot[np.arange(targets.shape[0]), targets] = 1
    
    smooth_labels = one_hot * (1 - smoothing) + smoothing / n_class
    return smooth_labels

# Example
targets = np.array([1, 5, 3])
smooth = label_smoothing(targets, vocab_size=10, smoothing=0.1)
print("原始标签: ", targets)
print("平滑后标签 (部分):")
print(smooth[:, :6])
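
The smoothed distribution is what the cross-entropy loss is computed against. Below is a small illustrative comparison with random logits, reusing the softmax defined earlier (not part of an actual training loop):

def cross_entropy(logits, target_dist):
    """Mean cross-entropy between the predicted distribution and a target distribution."""
    probs = softmax(logits, axis=-1)
    return -np.mean(np.sum(target_dist * np.log(probs + 1e-9), axis=-1))

logits = np.random.randn(3, 10)
one_hot = np.eye(10)[targets]
print(f"Cross-entropy vs hard labels:     {cross_entropy(logits, one_hot):.4f}")
print(f"Cross-entropy vs smoothed labels: {cross_entropy(logits, smooth):.4f}")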

Learning Rate Schedule

def transformer_lr_schedule(step, d_model, warmup_steps=4000):
    """Transformer学习率调度"""
    arg1 = step ** -0.5
    arg2 = step * (warmup_steps ** -1.5)
    return (d_model ** -0.5) * min(arg1, arg2)

# Visualize
steps = np.arange(1, 100000)
lrs = [transformer_lr_schedule(s, 512, 4000) for s in steps]

plt.figure(figsize=(10, 5))
plt.plot(steps, lrs)
plt.xlabel('Training Step')
plt.ylabel('Learning Rate')
plt.title('Transformer Learning Rate Schedule')
plt.grid(True, alpha=0.3)
plt.show()

PyTorch Implementation

try:
    import torch
    import torch.nn as nn
    
    class TransformerModel(nn.Module):
        def __init__(self, src_vocab, tgt_vocab, d_model=512, n_heads=8, 
                     n_enc_layers=6, n_dec_layers=6, d_ff=2048, dropout=0.1):
            super().__init__()
            
            self.d_model = d_model
            
            # Embeddings
            self.src_embed = nn.Embedding(src_vocab, d_model)
            self.tgt_embed = nn.Embedding(tgt_vocab, d_model)
            
            # Positional encoding
            self.pos_encoder = PositionalEncoding(d_model, dropout)
            
            # Transformer
            self.transformer = nn.Transformer(
                d_model=d_model,
                nhead=n_heads,
                num_encoder_layers=n_enc_layers,
                num_decoder_layers=n_dec_layers,
                dim_feedforward=d_ff,
                dropout=dropout,
                batch_first=True
            )
            
            # Output projection
            self.fc_out = nn.Linear(d_model, tgt_vocab)
        
        def forward(self, src, tgt, src_mask=None, tgt_mask=None):
            src = self.pos_encoder(self.src_embed(src) * (self.d_model ** 0.5))
            tgt = self.pos_encoder(self.tgt_embed(tgt) * (self.d_model ** 0.5))
            
            output = self.transformer(src, tgt, src_mask=src_mask, tgt_mask=tgt_mask)
            return self.fc_out(output)
    
    
    class PositionalEncoding(nn.Module):
        def __init__(self, d_model, dropout=0.1, max_len=5000):
            super().__init__()
            self.dropout = nn.Dropout(p=dropout)
            
            pe = torch.zeros(max_len, d_model)
            position = torch.arange(0, max_len).unsqueeze(1).float()
            div_term = torch.exp(torch.arange(0, d_model, 2).float() * 
                                -(np.log(10000.0) / d_model))
            
            pe[:, 0::2] = torch.sin(position * div_term)
            pe[:, 1::2] = torch.cos(position * div_term)
            pe = pe.unsqueeze(0)
            
            self.register_buffer('pe', pe)
        
        def forward(self, x):
            x = x + self.pe[:, :x.size(1)]
            return self.dropout(x)
    
    # Test
    model = TransformerModel(src_vocab=10000, tgt_vocab=10000)
    
    src = torch.randint(0, 10000, (4, 20))
    tgt = torch.randint(0, 10000, (4, 15))
    
    output = model(src, tgt)
    
    print("PyTorch Transformer:")
    print(f"  源序列: {src.shape}")
    print(f"  目标序列: {tgt.shape}")
    print(f"  输出: {output.shape}")
    print(f"  参数量: {sum(p.numel() for p in model.parameters()):,}")
    
except ImportError:
    print("PyTorch未安装")

Transformer Variants

Encoder-Only (BERT-style)

# BERT architecture (encoder only)
bert_structure = """
BERT (Bidirectional Encoder Representations)
├── Token Embedding
├── Position Embedding
├── Segment Embedding
└── N × Encoder Layers
    ├── Multi-Head Self-Attention
    ├── Add & Norm
    ├── Feed Forward
    └── Add & Norm
"""
print(bert_structure)

Decoder-Only (GPT-style)

# GPT architecture (decoder only)
gpt_structure = """
GPT (Generative Pre-trained Transformer)
├── Token Embedding
├── Position Embedding
└── N × Decoder Layers (Masked)
    ├── Masked Multi-Head Self-Attention
    ├── Add & Norm
    ├── Feed Forward
    └── Add & Norm
"""
print(gpt_structure)

Frequently Asked Questions

Q1: Why LayerNorm instead of BatchNorm?

LayerNorm normalizes each token independently over the feature dimension, so its statistics do not depend on the batch or on sequence length, which makes it a better fit for variable-length sequences.
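
The batch independence is easy to demonstrate numerically with the LayerNorm class defined above (a small supplementary check):

# LayerNorm statistics are computed per token (last axis), so adding or removing
# other sequences from the batch does not change a given sequence's normalized values.
ln = LayerNorm(d_model=8)
a = np.random.randn(1, 4, 8)
b = np.random.randn(1, 4, 8)
out_alone = ln.forward(a)
out_batched = ln.forward(np.concatenate([a, b], axis=0))[0:1]
print("Same result regardless of batch contents:", np.allclose(out_alone, out_batched))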

Q2: Why is positional encoding needed?

Self-attention is permutation-equivariant and has no built-in notion of token order, so position information must be injected explicitly.
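
This is easy to verify: without positional encodings, permuting the input tokens just permutes the attention output in exactly the same way (a small check reusing the MultiHeadAttention class from earlier):

# Shuffling the tokens shuffles the output identically, so word order carries
# no signal unless positional information is added to the inputs.
mha = MultiHeadAttention(d_model=64, n_heads=4)
x = np.random.randn(1, 6, 64)
perm = np.random.permutation(6)
out_original, _ = mha.forward(x, x, x)
out_permuted, _ = mha.forward(x[:, perm], x[:, perm], x[:, perm])
print("Permuted input gives permuted output:", np.allclose(out_original[:, perm], out_permuted))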

Q3: What is the difference between cross-attention and self-attention?

  • Self-attention: Q, K, and V all come from the same sequence
  • Cross-attention: Q comes from the Decoder, while K and V come from the Encoder output (see the shape check below)
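
A minimal shape check of cross-attention, reusing the MultiHeadAttention class above: queries come from a length-15 decoder sequence, while keys and values come from a length-20 encoder output, so the output length follows the queries.

# Cross-attention: output sequence length follows the decoder (query) side
cross = MultiHeadAttention(d_model=512, n_heads=8)
dec_states = np.random.randn(2, 15, 512)   # decoder hidden states (queries)
enc_states = np.random.randn(2, 20, 512)   # encoder output (keys and values)
out, weights = cross.forward(dec_states, enc_states, enc_states)
print(f"Cross-attention output: {out.shape}")   # (2, 15, 512)
print(f"Attention weights: {weights.shape}")    # (2, 8, 15, 20)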

Q4: Why does the Transformer outperform RNNs?

  • Fully parallel computation over the sequence (no step-by-step recurrence)
  • Direct modeling of long-range dependencies (constant path length between any two positions)
  • Better gradient flow, helped by residual connections and the shorter paths

Summary

Component              Role
Self-attention         Captures dependencies within a sequence
Cross-attention        Connects the Encoder and the Decoder
Positional encoding    Injects position information
Residual connections   Mitigate gradient problems
LayerNorm              Stabilizes training

References

  • Vaswani, A. et al. (2017). “Attention Is All You Need”
  • The Illustrated Transformer (Jay Alammar)
  • The Annotated Transformer (Harvard NLP)
  • The Hugging Face Transformers library
